# File size: 5,115 Bytes
# 06c11b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import h5py
import sys
import numpy as np

def _decode_h5_object(value):
    """Turn a value read from HDF5 into a printable ``str``.

    ``None`` and empty arrays yield ``None``. A non-empty array is represented
    by its first element in flattened order. Byte strings are decoded as
    UTF-8, falling back to their ``repr`` when decoding fails; ``str`` values
    are returned unchanged and anything else goes through ``str()``.
    """
    if value is None:
        return None

    candidate = value
    if isinstance(candidate, np.ndarray):
        if not candidate.size:
            return None
        # Only the first element is shown for array-valued input.
        candidate = candidate.reshape(-1)[0]

    if not isinstance(candidate, (bytes, np.bytes_)):
        return candidate if isinstance(candidate, str) else str(candidate)

    try:
        text = candidate.decode("utf-8")
    except Exception:
        text = repr(candidate)
    return text


def print_hdf5_structure(name, obj, indent=0):
    """Recursively print the structure of an HDF5 group or dataset.

    Thin wrapper around ``print_recursive``, which performs the traversal and
    expands only the first ``episode_*`` and the first ``timestep_*`` child of
    each group to avoid cluttering the output.

    Args:
        name: Path of ``obj`` within the file. Accepted for signature
            compatibility only; the printed label is derived from ``obj.name``.
        obj: The ``h5py.Group`` or ``h5py.Dataset`` to print.
        indent: Nesting depth of ``obj``; each level indents by two spaces.

    Note:
        Do not pass this function to ``Group.visititems``. That callback is
        invoked for every object in the file, and each call here already
        recurses into children, so subtrees would be printed repeatedly.
    """
    print_recursive(obj, indent)

def _prefix_slices(shape, count):
    """Return per-axis slices whose C-order flattening starts with the first ``count`` elements."""
    last = np.unravel_index(count - 1, shape)
    slices = []
    spilled = False
    for extent, stop in zip(shape, last):
        if spilled:
            # An earlier axis spans more than one index, so every complete
            # "row" along the remaining axes belongs to the prefix.
            slices.append(slice(0, int(extent)))
        else:
            slices.append(slice(0, int(stop) + 1))
            spilled = stop > 0
    return tuple(slices)


def _format_value(obj, max_elems=20, max_str_len=200, max_array_size=10000):
    """Read a dataset and render its value as a short display string.

    Args:
        obj: Dataset-like object exposing ``shape`` and ``__getitem__``.
        max_elems: Maximum number of array elements to list. A value of zero
            or less lists nothing.
        max_str_len: Scalar text longer than this is cut and suffixed "...".
        max_array_size: Arrays with more elements than this are not read in
            full; only a leading slab containing the first ``max_elems``
            elements (C order) is read.

    Returns:
        The scalar as text, ``"None"`` when the read yields ``None``,
        ``"[a, b, ...]"`` for arrays (followed by ``" ... (N total)"`` when the
        array holds more than ``max_elems`` elements), ``"[]"`` when nothing is
        listed, or ``"(read error: ...)"`` if reading raised.
    """
    try:
        shape = obj.shape
        size = int(np.prod(shape)) if shape else 0
        if size > max_array_size:
            # Large array: read only a leading slab instead of the whole dataset.
            take = max(0, min(max_elems, size))
            if take == 0:
                return "[]"
            raw = obj[_prefix_slices(shape, take)]
            flat = np.asarray(raw).reshape(-1)[:take]
            n = len(flat)
            total = size
        else:
            raw = obj[()]
            if raw is None:
                return "None"
            if obj.shape == () or np.isscalar(raw):
                out = _decode_h5_object(raw)
                if out is None:
                    out = str(raw)
                if isinstance(out, str) and len(out) > max_str_len:
                    out = out[:max_str_len] + "..."
                return out
            flat = np.reshape(np.asarray(raw), -1)
            n = max(0, min(flat.size, max_elems))
            total = flat.size
    except Exception as e:
        # Best-effort display helper: surface the failure inline instead of
        # aborting the whole tree print.
        return f"(read error: {e})"

    if n == 0:
        return "[]"

    parts = []
    for v in flat[:n]:
        if isinstance(v, (bytes, np.bytes_)):
            try:
                v = v.decode("utf-8")
            except UnicodeDecodeError:
                v = repr(v)
        parts.append(str(v))

    s = "[" + ", ".join(parts) + "]"
    if total > max_elems:
        s += f" ... ({total} total)"
    return s


def print_recursive(obj, indent=0):
    """Print an HDF5 object and, for groups, its children as an indented tree.

    Datasets are printed with their shape and dtype plus a value preview from
    ``_format_value``. Within each group only the first child whose name
    starts with ``episode_`` and the first whose name starts with
    ``timestep_`` are expanded; further children with those prefixes are
    skipped to keep the output short. All other children are always printed.

    Args:
        obj: An ``h5py.Dataset`` or ``h5py.Group``. Any other value (including
            ``None`` for a link that cannot be resolved) prints nothing.
        indent: Nesting depth of ``obj``; each level indents by two spaces.
    """
    tab = "  " * indent

    if isinstance(obj, h5py.Dataset):
        name = (obj.name or "").split("/")[-1]
        print(f"{tab}- [Dataset] {name}: shape={obj.shape}, dtype={obj.dtype}")
        # Print value: scalar, small array, or array summary
        value_str = _format_value(obj)
        if value_str:
            print(f"{tab}    -> {value_str}")
        return

    if not isinstance(obj, h5py.Group):
        return

    print(f"{tab}+ [Group] {(obj.name or '').split('/')[-1]}")

    collapsed_prefixes = ("episode_", "timestep_")
    shown_prefixes = set()

    # Iterate over names only and open a child just before printing it, so the
    # (potentially thousands of) skipped episodes/timesteps are never opened.
    for child_name in obj:
        prefix = next(
            (p for p in collapsed_prefixes if child_name.startswith(p)), None
        )
        if prefix is not None:
            if prefix in shown_prefixes:
                continue
            shown_prefixes.add(prefix)
        # Group.get returns None for an unresolvable link, which prints nothing.
        print_recursive(obj.get(child_name), indent + 1)

# Fallback HDF5 file inspected by main() when no path is given on the command line.
DEFAULT_PATH = "/data/hongzefu/data_0226/record_dataset_SwingXtimes.h5"

def main():
    """Inspect an HDF5 file and print its structure to stdout.

    The file path is taken from ``sys.argv[1]`` when present, otherwise
    ``DEFAULT_PATH`` is used. At the top level only the first ``episode_*``
    entry is expanded; every other entry is printed via ``print_recursive``.

    Returns:
        0 on success, 1 if opening or reading the file raised an exception
        (the error message is written to stderr).
    """
    filepath = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PATH
    print(f"Inspecting HDF5 file: {filepath}")

    try:
        with h5py.File(filepath, 'r') as f:
            # The root itself if it has a name (usually empty string or '/')
            print("/")

            shown_episode = False
            # Iterate over names and open each child only when it is printed,
            # so skipped episodes are never opened.
            for name in f:
                if name.startswith('episode_'):
                    if shown_episode:
                        continue
                    shown_episode = True
                print_recursive(f.get(name), 1)
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"Error reading HDF5 file: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())