RoboMME / scripts /dev /inspect_hdf5.py
HongzeFu's picture
HF Space: code-only (no binary assets)
06c11b0
import h5py
import sys
import numpy as np
def _decode_h5_object(value):
    """Turn a value read from HDF5 into a Python ``str`` suitable for printing.

    ``None`` and empty arrays yield ``None``. A non-empty array is represented
    by its first element (in flattened order). Byte strings are decoded as
    UTF-8, falling back to their ``repr`` when decoding fails; anything else
    goes through ``str()``.
    """
    if value is None:
        return None

    item = value
    if isinstance(item, np.ndarray):
        if item.size == 0:
            return None
        # Only the leading element is used to represent the whole array.
        item = item.reshape(-1)[0]

    if isinstance(item, (bytes, np.bytes_)):
        try:
            text = item.decode("utf-8")
        except Exception:
            text = repr(item)
        return text

    return item if isinstance(item, str) else str(item)
def print_hdf5_structure(name, obj, indent=0):
    """Unimplemented placeholder; accepts its arguments and does nothing.

    The ``(name, obj)`` signature appears to have been intended as a
    ``visititems`` callback, but limiting the output to one ``episode_*`` /
    ``timestep_*`` entry per level is awkward in a flat traversal, so the
    actual printing is done by ``print_recursive`` instead. This function is
    retained only so existing references keep working.

    Args:
        name: Path of the visited item (unused).
        obj: The visited HDF5 group or dataset (unused).
        indent: Nesting depth (unused).

    Returns:
        Always ``None``.
    """
    return None
def _format_value(obj, max_elems=20, max_str_len=200, max_array_size=10000):
    """Read a dataset and render its value as a short, printable string.

    Args:
        obj: Dataset-like object exposing ``shape`` and ``[]`` indexing
            (``print_recursive`` passes an ``h5py.Dataset``).
        max_elems: Maximum number of array elements to include in the output.
        max_str_len: Scalar text longer than this is cut and suffixed
            with ``"..."``.
        max_array_size: Arrays with more elements than this are not read in
            full; only a leading window covering ``max_elems`` elements is read.

    Returns:
        The decoded scalar text, or ``"[a, b, ...]"`` for arrays (followed by
        ``" ... (N total)"`` when the array has more than ``max_elems``
        elements), ``"[]"`` when there is nothing to show, ``"None"`` when the
        read yields ``None``, or ``"(read error: ...)"`` when reading raises.
    """
    try:
        shape = obj.shape
        size = int(np.prod(shape)) if shape else 0
        if size > max_array_size:
            take = min(max_elems, size)
            if take <= 0:
                return "[]"
            # Multi-index of the last wanted element in C (row-major) order.
            last = np.unravel_index(take - 1, shape)
            # The first `take` C-order elements form a contiguous prefix, not a
            # corner box. Only the first axis with a non-zero index may be
            # clipped; every axis after it must be read in full, and every
            # axis before it has length-1 extent.
            pivot = next(
                (axis for axis, i in enumerate(last) if i > 0), len(last) - 1
            )
            window = tuple(
                slice(0, 1) if axis < pivot
                else slice(0, int(last[axis]) + 1) if axis == pivot
                else slice(None)
                for axis in range(len(last))
            )
            flat = np.asarray(obj[window]).reshape(-1)[:take]
            n = len(flat)
            total = size
        else:
            raw = obj[()]
            if raw is None:
                return "None"
            if obj.shape == () or np.isscalar(raw):
                out = _decode_h5_object(raw)
                if out is None:
                    out = str(raw)
                if isinstance(out, str) and len(out) > max_str_len:
                    out = out[:max_str_len] + "..."
                return out
            flat = np.asarray(raw).reshape(-1)
            # Clamp so a non-positive max_elems shows nothing, as in the
            # large-array branch.
            n = max(0, min(flat.size, max_elems))
            total = flat.size
    except Exception as e:
        # Best effort: a dataset that cannot be read must not abort the
        # inspection of the rest of the file.
        return f"(read error: {e})"

    if n == 0:
        return "[]"

    parts = []
    for v in flat[:n]:
        if isinstance(v, (bytes, np.bytes_)):
            try:
                v = v.decode("utf-8")
            except UnicodeDecodeError:
                v = repr(v)
        parts.append(str(v))

    s = "[" + ", ".join(parts) + "]"
    if total > max_elems:
        s += f" ... ({total} total)"
    return s
def print_recursive(obj, indent=0):
    """Print an HDF5 dataset or group, descending into groups.

    A dataset is printed with its shape, dtype and a formatted value. A group
    is printed by name and then each child is printed one level deeper, except
    that only the first child whose name starts with ``episode_`` and the first
    whose name starts with ``timestep_`` are shown; later ones are skipped.
    Objects that are neither a dataset nor a group produce no output.
    """
    pad = " " * indent

    if isinstance(obj, h5py.Dataset):
        leaf = (obj.name or "").split("/")[-1]
        print(f"{pad}- [Dataset] {leaf}: shape={obj.shape}, dtype={obj.dtype}")
        # Scalar, small array, or a truncated summary of a large array.
        rendered = _format_value(obj)
        if rendered:
            print(f"{pad} -> {rendered}")
        return

    if not isinstance(obj, h5py.Group):
        return

    leaf = (obj.name or '').split('/')[-1]
    print(f"{pad}+ [Group] {leaf}")

    # Prefixes of repeated children for which one representative was printed.
    seen_prefixes = set()
    for child_name, child in list(obj.items()):
        prefix = next(
            filter(child_name.startswith, ('episode_', 'timestep_')), None
        )
        if prefix is not None:
            if prefix in seen_prefixes:
                continue
            seen_prefixes.add(prefix)
        # Either a regular item (meta, obs, action, info, ...) or the first
        # representative of a repeated prefix.
        print_recursive(child, indent + 1)
# Fallback HDF5 file inspected by main() when no path is given on the command line.
DEFAULT_PATH = "/data/hongzefu/data_0226/record_dataset_SwingXtimes.h5"
def main():
    """Print the structure of an HDF5 file.

    The file path is taken from the first command-line argument, falling back
    to ``DEFAULT_PATH``. Every top-level item is printed via
    ``print_recursive``, but only the first ``episode_*`` entry is shown.
    Any exception raised while opening or walking the file is reported as a
    printed message rather than propagated.
    """
    target = DEFAULT_PATH if len(sys.argv) <= 1 else sys.argv[1]
    print(f"Inspecting HDF5 file: {target}")
    try:
        with h5py.File(target, 'r') as h5file:
            # The root group is shown as a bare slash.
            print("/")
            episode_seen = False
            for key, child in list(h5file.items()):
                if key.startswith('episode_'):
                    if episode_seen:
                        continue
                    episode_seen = True
                print_recursive(child, 1)
    except Exception as e:
        print(f"Error reading HDF5 file: {e}")
# Run the inspector only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()