HDF5#

import h5py
import tempfile
from pathlib import Path
import numpy as np
import pandas as pd

Datasets#

The dataset object in HDF5 is like a NumPy array.

You can get the element in the dataset by index just like a NumPy array.

# Seeded generator so the example is reproducible
rng = np.random.default_rng(42)

# A 10x3 array of random floats that will be written to disk
arr = rng.random((10, 3))

with tempfile.TemporaryDirectory() as tmpdir:
    # The HDF5 file lives inside the temporary directory
    path = Path(tmpdir) / "test.h5"

    # Write: store the array under the key "arr"
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)

    # Read the file back; the dataset is only inspected inside this block
    with h5py.File(path, "r") as f:
        # Keys of the objects stored at the top level of the file
        print(f"attributes: {f.keys()}")

        # Look the dataset up by its key
        dataset = f["arr"]

        # Type of the object returned by the lookup
        print(f"type: {type(dataset)}")

        # Data type of the individual elements
        print(f"dtype: {dataset.dtype}")

        # NumPy-style indexing of a single element
        print(f"element at (0, 1): {dataset[0, 1]}")

        # Slicing with [:] reads the whole dataset
        print(f"array: {dataset[:]}")
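attributes: <KeysViewHDF5 ['arr']>
type: <class 'h5py._hl.dataset.Dataset'>
dtype: float64
element at (0, 1): 0.4388784397520523
array: [[0.77395605 0.43887844 0.85859792]
 [0.69736803 0.09417735 0.97562235]
 [0.7611397  0.78606431 0.12811363]
 [0.45038594 0.37079802 0.92676499]
 [0.64386512 0.82276161 0.4434142 ]
 [0.22723872 0.55458479 0.06381726]
 [0.82763117 0.6316644  0.75808774]
 [0.35452597 0.97069802 0.89312112]
 [0.7783835  0.19463871 0.466721  ]
 [0.04380377 0.15428949 0.68304895]]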

However, you cannot access the element in the dataset after the HDF5 file is closed!

# Same seeded data as before
rng = np.random.default_rng(42)
arr = rng.random((10, 3))

with tempfile.TemporaryDirectory() as tmpdir:
    # The HDF5 file lives inside the temporary directory
    path = Path(tmpdir) / "test.h5"

    # Write the array to a fresh HDF5 file
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)

    # Read the file, but only keep a handle to the dataset
    with h5py.File(path, "r") as f:
        dataset = f["arr"]

    # ! The file is closed here, so indexing the dataset fails;
    # the error is caught and its message printed instead
    try:
        print(f"element at (0, 1): {dataset[0, 1]}")
    except Exception as err:
        print(err)
Invalid dataset identifier (invalid dataset identifier)

So, to keep the data after the file is closed, you need to copy it into memory while the file is still open, by slicing the whole dataset (for example, `f["arr"][:]`), which returns a NumPy array.

# Same seeded data as before
rng = np.random.default_rng(42)
arr = rng.random((10, 3))

with tempfile.TemporaryDirectory() as tmpdir:
    # The HDF5 file lives inside the temporary directory
    path = Path(tmpdir) / "test.h5"

    # Write the array to a fresh HDF5 file
    with h5py.File(path, "w") as f:
        f.create_dataset("arr", data=arr)

    # Read the file: slicing with [:] pulls the values out of the dataset
    # while the file is open, so `arr` can still be used afterwards
    with h5py.File(path, "r") as f:
        arr = f["arr"][:]

    # The file is closed, but the copied values are still available
    print(f"arr: {arr}")
arr: [[0.77395605 0.43887844 0.85859792]
 [0.69736803 0.09417735 0.97562235]
 [0.7611397  0.78606431 0.12811363]
 [0.45038594 0.37079802 0.92676499]
 [0.64386512 0.82276161 0.4434142 ]
 [0.22723872 0.55458479 0.06381726]
 [0.82763117 0.6316644  0.75808774]
 [0.35452597 0.97069802 0.89312112]
 [0.7783835  0.19463871 0.466721  ]
 [0.04380377 0.15428949 0.68304895]]

Storing Data Frames#

# One dictionary per row of the table
records = [
    {"name": "Isaac", "age": 24},
    {"name": "John", "age": 25},
]

# Build the data frame from the row records
df = pd.DataFrame(records)

df
name age
0 Isaac 24
1 John 25

Convert the data frame to a dictionary of columns.

# Column-wise export of the frame: each key is a column name and each value
# is the list of that column's entries (one per row)
columns = df.to_dict(orient="list")

columns
{'name': ['Isaac', 'John'], 'age': [24, 25]}
with tempfile.TemporaryDirectory() as tmpdir:
    # The HDF5 file lives inside the temporary directory
    path = Path(tmpdir) / "test.h5"

    # Write: one dataset per data frame column
    with h5py.File(path, "w") as f:
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("age", data=columns["age"])

    # Read the file, copying each column out before the file closes
    with h5py.File(path, "r") as f:
        names = f["name"][:]
        ages = f["age"][:]

    # Show what came back from the file
    print(f"names: {names}")
    print(f"ages: {ages}")

    # Rebuild the data frame from the columns.
    # The strings come back as bytes, so decode each one first.
    df = pd.DataFrame(
        {
            "name": [raw.decode() for raw in names],
            "age": ages,
        }
    )

    display(df)
names: [b'Isaac' b'John']
ages: [24 25]
name age
0 Isaac 24
1 John 25

The following example shows how to deal with a more complicated data frame in which each entry of one of the columns is itself a NumPy array.

# Seeded generator so the example is reproducible
rng = np.random.default_rng(42)

# One record per person; each gets its own 10x3 array of random floats,
# drawn in the order the names are listed
records = [
    {"name": person, "data": rng.random((10, 3))}
    for person in ("Isaac", "John")
]

# Build the data frame; the "data" column holds one array per row
df = pd.DataFrame(records)

df
name data
0 Isaac [[0.7739560485559633, 0.4388784397520523, 0.85...
1 John [[0.7447621559078171, 0.96750973243421, 0.3258...
# Column-wise export of the frame: each key is a column name and each value
# is the list of that column's entries; for "data" that is a list of arrays
columns = df.to_dict(orient="list")

columns
{'name': ['Isaac', 'John'],
 'data': [array([[0.77395605, 0.43887844, 0.85859792],
         [0.69736803, 0.09417735, 0.97562235],
         [0.7611397 , 0.78606431, 0.12811363],
         [0.45038594, 0.37079802, 0.92676499],
         [0.64386512, 0.82276161, 0.4434142 ],
         [0.22723872, 0.55458479, 0.06381726],
         [0.82763117, 0.6316644 , 0.75808774],
         [0.35452597, 0.97069802, 0.89312112],
         [0.7783835 , 0.19463871, 0.466721  ],
         [0.04380377, 0.15428949, 0.68304895]]),
  array([[0.74476216, 0.96750973, 0.32582536],
         [0.37045971, 0.46955581, 0.18947136],
         [0.12992151, 0.47570493, 0.22690935],
         [0.66981399, 0.43715192, 0.8326782 ],
         [0.7002651 , 0.31236664, 0.8322598 ],
         [0.80476436, 0.38747838, 0.2883281 ],
         [0.6824955 , 0.13975248, 0.1999082 ],
         [0.00736227, 0.78692438, 0.66485086],
         [0.70516538, 0.78072903, 0.45891578],
         [0.5687412 , 0.139797  , 0.11453007]])]}
with tempfile.TemporaryDirectory() as tmpdir:

    # H5 file path
    path = Path(tmpdir) / "test.h5"

    # Create an HDF5 file
    with h5py.File(path, "w") as f:

        # Create a dataset for each column
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("data", data=columns["data"])

    # Read the file, copying each column out before the file closes
    with h5py.File(path, "r") as f:
        names = f["name"][:]
        data = f["data"][:]

    # The stored data
    print(f"names: {names}")
    # Print the array column that was just read back (`data`), not the
    # `ages` variable left over from the earlier example
    print(f"data: {data}")

    # Recover the data frame
    df = pd.DataFrame(
        {
            # Note that the strings are stored as bytes
            # Decode the bytes
            "name": map(bytes.decode, names),
            # The data comes back as a single array
            # Convert it to a list of arrays, one per row
            "data": list(data),
        }
    )

    display(df)
names: [b'Isaac' b'John']
data: [[[0.77395605 0.43887844 0.85859792]
  [0.69736803 0.09417735 0.97562235]
  [0.7611397  0.78606431 0.12811363]
  [0.45038594 0.37079802 0.92676499]
  [0.64386512 0.82276161 0.4434142 ]
  [0.22723872 0.55458479 0.06381726]
  [0.82763117 0.6316644  0.75808774]
  [0.35452597 0.97069802 0.89312112]
  [0.7783835  0.19463871 0.466721  ]
  [0.04380377 0.15428949 0.68304895]]

 [[0.74476216 0.96750973 0.32582536]
  [0.37045971 0.46955581 0.18947136]
  [0.12992151 0.47570493 0.22690935]
  [0.66981399 0.43715192 0.8326782 ]
  [0.7002651  0.31236664 0.8322598 ]
  [0.80476436 0.38747838 0.2883281 ]
  [0.6824955  0.13975248 0.1999082 ]
  [0.00736227 0.78692438 0.66485086]
  [0.70516538 0.78072903 0.45891578]
  [0.5687412  0.139797   0.11453007]]]
name data
0 Isaac [[0.7739560485559633, 0.4388784397520523, 0.85...
1 John [[0.7447621559078171, 0.96750973243421, 0.3258...

Groups#

# NOTE(review): this cell sits under the "Groups" heading but only creates
# top-level datasets and never creates a group -- confirm whether a group
# example was intended here.
with tempfile.TemporaryDirectory() as tmpdir:

    # H5 file path
    path = Path(tmpdir) / "test.h5"

    # Create an HDF5 file
    with h5py.File(path, "w") as f:

        # Create a dataset for each column
        f.create_dataset("name", data=columns["name"])
        f.create_dataset("data", data=columns["data"])

    # Read the file, copying each column out before the file closes
    with h5py.File(path, "r") as f:
        names = f["name"][:]
        data = f["data"][:]

    # The stored data
    print(f"names: {names}")
    # Print the array column that was just read back (`data`), not the
    # `ages` variable left over from an earlier example
    print(f"data: {data}")

    # Recover the data frame
    df = pd.DataFrame(
        {
            # Note that the strings are stored as bytes
            # Decode the bytes
            "name": map(bytes.decode, names),
            # The data comes back as a single array
            # Convert it to a list of arrays, one per row
            "data": list(data),
        }
    )

    display(df)