HDF5#
import h5py
import tempfile
from pathlib import Path
import numpy as np
import pandas as pd
Datasets#
A dataset object in HDF5 behaves much like a NumPy array.
You can access its elements by index, just as you would with a NumPy array.
# Seeded generator so that the example is reproducible
rng = np.random.default_rng(42)
# Data to save
arr = rng.random((10, 3))
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path (the file is removed together with the temporary directory)
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file and store the array under the key "arr"
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)
    # Read the file
    with h5py.File(path, "r") as f:
        # Everything below runs while the file is still open
        print(f"attributes: {f.keys()}")
        # You can access each dataset by the key
        dataset = f["arr"]
        # Check the type of the dataset
        print(f"type: {type(dataset)}")
        # Check the data type of each element
        print(f"dtype: {dataset.dtype}")
        # You can get the element by index just like a NumPy array
        print(f"element at (0, 1): {dataset[0, 1]}")
        # The entire array
        print(f"array: {dataset[:]}")
attributes: <KeysViewHDF5 ['arr']>
type: <class 'h5py._hl.dataset.Dataset'>
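dtype: float64
element at (0, 1): 0.4388784397520523
array: [[0.77395605 0.43887844 0.85859792]
[0.69736803 0.09417735 0.97562235]
[0.7611397 0.78606431 0.12811363]
[0.45038594 0.37079802 0.92676499]
[0.64386512 0.82276161 0.4434142 ]
[0.22723872 0.55458479 0.06381726]
[0.82763117 0.6316644 0.75808774]
[0.35452597 0.97069802 0.89312112]
[0.7783835 0.19463871 0.466721 ]
[0.04380377 0.15428949 0.68304895]]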
However, you cannot access the elements of a dataset after the HDF5 file has been closed!
# Seeded generator so that the example is reproducible
rng = np.random.default_rng(42)
# Data to save
arr = rng.random((10, 3))
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)
    # Read the file, keeping only a handle to the dataset (no data is copied)
    with h5py.File(path, "r") as f:
        dataset = f["arr"]
    # ! You cannot access the element after the file is closed
    try:
        print(f"element at (0, 1): {dataset[0, 1]}")
    # Deliberately broad: this demo only prints whatever error is raised
    except Exception as e:
        print(e)
Invalid dataset identifier (invalid dataset identifier)
So, to keep the data after closing the file, you need to copy it into memory while the file is still open. Slicing the whole dataset with `[:]` does exactly that and gives you a NumPy array.
# Seeded generator so that the example is reproducible
rng = np.random.default_rng(42)
# Data to save
arr = rng.random((10, 3))
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)
    # Read the file
    with h5py.File(path, "r") as f:
        # Slice the whole dataset while the file is open so that the values
        # remain usable after the file is closed
        arr = f["arr"][:]
# The stored data
print(f"arr: {arr}")
arr: [[0.77395605 0.43887844 0.85859792]
[0.69736803 0.09417735 0.97562235]
[0.7611397 0.78606431 0.12811363]
[0.45038594 0.37079802 0.92676499]
[0.64386512 0.82276161 0.4434142 ]
[0.22723872 0.55458479 0.06381726]
[0.82763117 0.6316644 0.75808774]
[0.35452597 0.97069802 0.89312112]
[0.7783835 0.19463871 0.466721 ]
[0.04380377 0.15428949 0.68304895]]
Storing Data Frames#
# One (name, age) pair per person; each pair becomes one record below
people = (("Isaac", 24), ("John", 25))
records = [{"name": person, "age": years} for person, years in people]
# Create a data frame with one row per record
df = pd.DataFrame(records)
df
name | age | |
---|---|---|
0 | Isaac | 24 |
1 | John | 25 |
Convert the data frame to a dictionary of columns.
# Turn the data frame into a dict that maps each column name to a list
# holding that column's values
columns = df.to_dict(orient="list")
# Bare expression: shows the value when run as the last line of a notebook cell
columns
{'name': ['Isaac', 'John'], 'age': [24, 25]}
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file
    with h5py.File(path, "w") as f:
        # Create a dataset for each column
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("age", data=columns["age"])
    # Read the file
    with h5py.File(path, "r") as f:
        # Slice with [:] to copy the values out before the file is closed
        names = f["name"][:]
        ages = f["age"][:]
# The stored data
print(f"names: {names}")
print(f"ages: {ages}")
# Recover the data frame
df = pd.DataFrame(
    {
        # Note that the strings are stored as bytes
        # Decode the bytes into an explicit list rather than a one-shot iterator
        "name": [raw_name.decode() for raw_name in names],
        "age": ages,
    }
)
display(df)
names: [b'Isaac' b'John']
ages: [24 25]
name | age | |
---|---|---|
0 | Isaac | 24 |
1 | John | 25 |
The following example shows how to deal with a more complicated data frame in which one of the columns is a NumPy array.
# Seeded generator so that the example is reproducible
rng = np.random.default_rng(42)
# One record per person; the arrays are drawn in this order (Isaac, then John)
records = [
    {"name": person, "data": rng.random((10, 3))}
    for person in ("Isaac", "John")
]
# Each cell of the "data" column holds a whole 10 x 3 array
df = pd.DataFrame(records)
df
name | data | |
---|---|---|
0 | Isaac | [[0.7739560485559633, 0.4388784397520523, 0.85... |
1 | John | [[0.7447621559078171, 0.96750973243421, 0.3258... |
# Turn the data frame into a dict that maps each column name to a list
# holding that column's values (here the "data" list holds NumPy arrays)
columns = df.to_dict(orient="list")
# Bare expression: shows the value when run as the last line of a notebook cell
columns
{'name': ['Isaac', 'John'],
'data': [array([[0.77395605, 0.43887844, 0.85859792],
[0.69736803, 0.09417735, 0.97562235],
[0.7611397 , 0.78606431, 0.12811363],
[0.45038594, 0.37079802, 0.92676499],
[0.64386512, 0.82276161, 0.4434142 ],
[0.22723872, 0.55458479, 0.06381726],
[0.82763117, 0.6316644 , 0.75808774],
[0.35452597, 0.97069802, 0.89312112],
[0.7783835 , 0.19463871, 0.466721 ],
[0.04380377, 0.15428949, 0.68304895]]),
array([[0.74476216, 0.96750973, 0.32582536],
[0.37045971, 0.46955581, 0.18947136],
[0.12992151, 0.47570493, 0.22690935],
[0.66981399, 0.43715192, 0.8326782 ],
[0.7002651 , 0.31236664, 0.8322598 ],
[0.80476436, 0.38747838, 0.2883281 ],
[0.6824955 , 0.13975248, 0.1999082 ],
[0.00736227, 0.78692438, 0.66485086],
[0.70516538, 0.78072903, 0.45891578],
[0.5687412 , 0.139797 , 0.11453007]])]}
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file
    with h5py.File(path, "w") as f:
        # Create a dataset for each column
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("data", data=columns["data"])
    # Read the file
    with h5py.File(path, "r") as f:
        # Slice with [:] to copy the values out before the file is closed
        names = f["name"][:]
        data = f["data"][:]
# The stored data
print(f"names: {names}")
# Print the array that was just read (not a variable from another cell)
print(f"data: {data}")
# Recover the data frame
df = pd.DataFrame(
    {
        # Note that the strings are stored as bytes
        # Decode the bytes into an explicit list rather than a one-shot iterator
        "name": [raw_name.decode() for raw_name in names],
        # The data itself is an array
        # Convert it to a list of arrays
        "data": list(data),
    }
)
display(df)
names: [b'Isaac' b'John']
data: [[[0.77395605 0.43887844 0.85859792]
[0.69736803 0.09417735 0.97562235]
[0.7611397 0.78606431 0.12811363]
[0.45038594 0.37079802 0.92676499]
[0.64386512 0.82276161 0.4434142 ]
[0.22723872 0.55458479 0.06381726]
[0.82763117 0.6316644 0.75808774]
[0.35452597 0.97069802 0.89312112]
[0.7783835 0.19463871 0.466721 ]
[0.04380377 0.15428949 0.68304895]]
[[0.74476216 0.96750973 0.32582536]
[0.37045971 0.46955581 0.18947136]
[0.12992151 0.47570493 0.22690935]
[0.66981399 0.43715192 0.8326782 ]
[0.7002651 0.31236664 0.8322598 ]
[0.80476436 0.38747838 0.2883281 ]
[0.6824955 0.13975248 0.1999082 ]
[0.00736227 0.78692438 0.66485086]
[0.70516538 0.78072903 0.45891578]
[0.5687412 0.139797 0.11453007]]]
name | data | |
---|---|---|
0 | Isaac | [[0.7739560485559633, 0.4388784397520523, 0.85... |
1 | John | [[0.7447621559078171, 0.96750973243421, 0.3258... |
Groups#
# NOTE(review): this cell sits under the "Groups" heading but does not create
# any HDF5 group; it only writes two top-level datasets. Confirm the intended
# example.
with tempfile.TemporaryDirectory() as tmpdir:
    # H5 file path
    path = Path(tmpdir) / "test.h5"
    # Create an HDF5 file
    with h5py.File(path, "w") as f:
        # Create a dataset for each column
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("data", data=columns["data"])
    # Read the file
    with h5py.File(path, "r") as f:
        # Slice with [:] to copy the values out before the file is closed
        names = f["name"][:]
        data = f["data"][:]
# The stored data
print(f"names: {names}")
# Print the array that was just read (not a variable from another cell)
print(f"data: {data}")
# Recover the data frame
df = pd.DataFrame(
    {
        # Note that the strings are stored as bytes
        # Decode the bytes into an explicit list rather than a one-shot iterator
        "name": [raw_name.decode() for raw_name in names],
        # The data itself is an array
        # Convert it to a list of arrays
        "data": list(data),
    }
)
display(df)