HDF5 (h5py) - Hierarchical Data Format
A guide to the HDF5 file format and the h5py library for efficient storage and retrieval of large scientific datasets with hierarchical organization.
What is HDF5?
HDF5 (Hierarchical Data Format version 5) is a binary data format designed for managing large and complex datasets. It provides a hierarchical structure similar to a filesystem, allowing efficient storage and retrieval of multidimensional arrays and associated metadata.
Key Features of HDF5
Hierarchical Organization
- Groups act like directories to organize data
- Datasets contain the actual data arrays
- Attributes store metadata for groups and datasets
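A quick sketch of how these three pieces fit together (the file, group, and dataset names here are made up for illustration):
import h5py
import numpy as np

with h5py.File('example.h5', 'w') as f:
    grp = f.create_group('run_01')                            # group: like a directory
    dset = grp.create_dataset('signal', data=np.arange(10))   # dataset: the actual array
    dset.attrs['units'] = 'volts'                             # attribute: metadata attached to the dataset
    f.visititems(lambda name, obj: print(name, obj))          # walk the hierarchy like a filesystem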
Efficient Storage
- Compression reduces file size
- Chunking enables efficient partial access
- In-place modification without rewriting entire files
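As a minimal sketch of in-place modification (assuming 'data.h5' already holds the 'temperature_readings' dataset created later in this guide):
with h5py.File('data.h5', 'r+') as f:
    # Overwrite a small block of values; the rest of the file is untouched
    f['temperature_readings'][0:10, 0:10] = 0.0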
Cross-Language Support
- h5py for Python
- HDF5 libraries for C/C++, Java, MATLAB, R
- Standard format ensures interoperability
Installing and Importing h5py
pip install h5py
import h5py
import numpy as np
Basic HDF5 Operations
Creating HDF5 Files
# Create a new HDF5 file ('w' mode creates a new file or truncates an existing one)
with h5py.File('data.h5', 'w') as f:
    pass  # The file is automatically closed when the context manager exits

# File open modes:
# 'w'  - write (creates a new file or truncates an existing one)
# 'r'  - read only
# 'a'  - append (creates the file if it doesn't exist)
# 'r+' - read/write an existing file
Working with Groups
with h5py.File('data.h5', 'a') as f:
    # Create groups (like folders); intermediate groups are created automatically
    grp1 = f.create_group('experiment1')
    grp2 = f.create_group('/measurements/temperature')
    # Access groups
    temp_group = f['measurements']['temperature']
    # Check structure
    print(list(f.keys()))  # List top-level groups/datasets
Creating Datasets
with h5py.File('data.h5', 'a') as f:
    # Create a dataset from a NumPy array
    data = np.random.rand(100, 200)
    dset = f.create_dataset('temperature_readings', data=data)
    # Create with a specific data type
    int_array = np.array([1, 2, 3, 4], dtype='i4')
    dset_int = f.create_dataset('parameters', data=int_array)
    # Empty dataset (specify shape and dtype)
    empty_dset = f.create_dataset('results',
                                  shape=(1000,),
                                  dtype='f8')
Dataset Properties and Attributes
Dataset Properties
with h5py.File('data.h5', 'r') as f:
    dset = f['temperature_readings']
    print(f"Shape: {dset.shape}")              # (100, 200)
    print(f"Data type: {dset.dtype}")          # float64 etc.
    print(f"Size: {dset.size} elements")       # Total elements
    print(f"Chunks: {dset.chunks}")            # Chunking info
    print(f"Compression: {dset.compression}")  # Compression type
Attributes (Metadata)
with h5py.File('data.h5', 'a') as f:
    dset = f['temperature_readings']
    # Add metadata
    dset.attrs['units'] = 'Celsius'
    dset.attrs['sensor_type'] = 'thermocouple'
    dset.attrs['date_collected'] = '2025-01-15'
    # Read attributes
    print(f"Units: {dset.attrs['units']}")
    print(f"Available attrs: {list(dset.attrs.keys())}")
    # File-level attributes
    f.attrs['experiment_name'] = 'Heat Transfer Study'
    f.attrs['researcher'] = 'Dr. Jane Smith'
Advanced Features
Chunking and Compression
with h5py.File('large_data.h5', 'w') as f:
    # Chunked, compressed dataset (compression requires chunked storage)
    dset = f.create_dataset('large_array',
                            shape=(10000, 10000),
                            dtype='f4',
                            chunks=(1000, 1000),    # Chunk shape
                            compression='gzip',     # Compression algorithm
                            compression_opts=6)     # Compression level (0-9)
    # Fill with data
    data = np.random.rand(10000, 10000).astype('f4')
    dset[:] = data
Partial I/O Operations
with h5py.File('data.h5', 'r') as f:
    dset = f['large_matrix']
    # Read a subset of the data
    subset = dset[100:200, 50:150]  # NumPy-like slicing
    # Read a single element by index
    single_value = dset[10, 20]
    # Fancy/boolean indexing is more limited than in NumPy
    # (e.g. only one index array per selection); for complex masks,
    # slice first and apply NumPy operations to the result
    # Update a subset (requires the file to be opened with 'a' or 'r+')
    # dset[0:10, 0:10] = new_data
Fixed- and Variable-Length Strings
with h5py.File('text_data.h5', 'w') as f:
    # Fixed-length strings
    fixed_strings = ['apple', 'banana', 'cherry']
    dt = h5py.string_dtype(encoding='utf-8', length=10)
    dset_fixed = f.create_dataset('fixed_strings',
                                  (len(fixed_strings),),
                                  dtype=dt)
    dset_fixed[:] = fixed_strings
    # Variable-length strings
    variable_strings = ['short', 'medium length', 'very long string indeed']
    dt_var = h5py.string_dtype(encoding='utf-8')
    dset_var = f.create_dataset('var_strings',
                                (len(variable_strings),),
                                dtype=dt_var)
    dset_var[:] = variable_strings
Links and References
with h5py.File('linked.h5', 'w') as f:
    # Create datasets
    data1 = f.create_dataset('dataset1', data=[1, 2, 3])
    data2 = f.create_dataset('dataset2', data=[4, 5, 6])
    # Create a hard link (same data, different name)
    f['alias1'] = f['dataset1']
    # Create a soft link inside a group (a path that resolves when accessed)
    grp = f.create_group('experiments')
    grp['results'] = h5py.SoftLink('/dataset2')  # points at dataset2
    # Object references are obtained from the .ref property
    ref1 = data1.ref
    ref2 = data2.ref
    print(f"Reference to dataset1: {ref1}")
Working with NumPy Arrays
Direct HDF5 ↔ NumPy Conversion
# Save a NumPy array
arr = np.random.rand(1000, 1000)
with h5py.File('arrays.h5', 'w') as f:
    f['random_matrix'] = arr

# Load as a NumPy array
with h5py.File('arrays.h5', 'r') as f:
    loaded_arr = f['random_matrix'][:]      # The [:] reads the full dataset into memory
    print(type(loaded_arr))                 # <class 'numpy.ndarray'>
    print(np.array_equal(arr, loaded_arr))  # True
Out-of-Core Access to Large Arrays
# For very large arrays that don't fit in memory
with h5py.File('huge_data.h5', 'r') as f:
    # The dataset object reads from disk on demand, without loading everything
    dset = f['large_dataset']
    # Process in chunks
    chunk_size = 1000
    for i in range(0, dset.shape[0], chunk_size):
        chunk = dset[i:i+chunk_size, :]
        # Process the chunk (process_data is a placeholder for your own function)
        process_data(chunk)
Performance Optimization
Dataset Layout Optimization
# Choose a chunk shape that matches your typical access pattern
chunk_size = (1000, 1000)  # e.g. block access on a 2D array

# Create with an optimized layout
with h5py.File('optimized.h5', 'w') as f:
    dset = f.create_dataset('data',
                            shape=(10000, 10000),
                            dtype='f4',
                            chunks=chunk_size,
                            compression='lzf',  # Fast, lightweight compression
                            shuffle=True,       # Byte-shuffle filter improves compressibility
                            fletcher32=True)    # Checksum for error detection
Parallel I/O with MPI
# For cluster computing (requires h5py built with parallel HDF5/MPI support)
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

with h5py.File('parallel.h5', 'w', driver='mpio', comm=comm) as f:
    # Dataset creation is collective: every process must make the same call
    dset = f.create_dataset('shared_data', shape=(size * 1000,), dtype='i8')
    # Each process then writes its own slice independently
    start = rank * 1000
    end = (rank + 1) * 1000
    dset[start:end] = np.arange(start, end)
Comparison with Other Formats
HDF5 vs CSV
- HDF5: Fast access, compression, hierarchical
- CSV: Human-readable, simple, universal
HDF5 vs Parquet
- HDF5: General purpose, better for 2D/3D arrays
- Parquet: Optimized for tabular data, columnar storage
HDF5 vs NumPy .npy
- HDF5: Metadata, compression, partial access
- NumPy: Simple, fast, single array per file
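To make the partial-access and metadata points concrete, here is a rough sketch (file names and shapes are arbitrary) contrasting a plain .npy file with an HDF5 dataset:
arr = np.random.rand(2000, 2000).astype('f4')

# NumPy .npy: one array per file, no user metadata
np.save('matrix.npy', arr)
row_npy = np.load('matrix.npy', mmap_mode='r')[100]  # needs mmap_mode to avoid reading the whole file

# HDF5: attributes and partial reads are built in
with h5py.File('matrix.h5', 'w') as f:
    dset = f.create_dataset('matrix', data=arr, compression='gzip')
    dset.attrs['units'] = 'arbitrary'
with h5py.File('matrix.h5', 'r') as f:
    row_h5 = f['matrix'][100]  # reads only the chunks containing row 100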
Best Practices
File Organization
# Recommended structure
/
├── metadata/ # File-level attributes
│ └── description
├── datasets/
│ ├── sensor_data/
│ │ ├── temperature/
│ │ └── pressure/
│ └── computed/
│ └── results/
└── references/ # Links to related files
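A minimal sketch of building such a layout with h5py (the group names follow the diagram above and are illustrative, not required by HDF5):
with h5py.File('organized.h5', 'w') as f:
    # File-level description stored as attributes on a metadata group
    meta = f.create_group('metadata')
    meta.attrs['description'] = 'Heat Transfer Study, run 3'
    # Intermediate groups are created automatically from the path
    f.create_group('datasets/sensor_data/temperature')
    f.create_group('datasets/sensor_data/pressure')
    f.create_group('datasets/computed/results')
    f.create_group('references')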
Error Handling
# Robust file operations
try:
    with h5py.File('data.h5', 'r') as f:
        if 'dataset_name' in f:
            data = f['dataset_name'][:]
        else:
            print("Dataset not found")
except OSError as e:
    print(f"Cannot open file: {e}")
except KeyError as e:
    print(f"Missing dataset: {e}")
Data Validation
def validate_hdf5_file(filepath):
    """Validate HDF5 file integrity"""
    try:
        with h5py.File(filepath, 'r') as f:
            # Check required groups/datasets
            required_datasets = ['temperature', 'pressure']
            for dataset in required_datasets:
                if dataset not in f:
                    raise ValueError(f"Missing required dataset: {dataset}")
            # Validate data types and shapes
            temp = f['temperature']
            if temp.dtype != np.float32:
                print("Warning: temperature not float32")
            print("File validation passed")
            return True
    except Exception as e:
        print(f"File validation failed: {e}")
        return False
HDF5 and h5py provide essential tools for scientific data management. The hierarchical structure, compression capabilities, and cross-language support make it ideal for complex scientific applications where performance and metadata management are critical.
This guide covers the core functionality of HDF5 and h5py. For advanced parallel I/O and custom filters, consult the official documentation.
Updated: January 15, 2025
Author: Danial Pahlavan
Category: Data Formats