griffingoodwin04 committed on
Commit
7a084be
·
1 Parent(s): 4c39a05

Fixed issue with extraction...

Browse files
Files changed (1) hide show
  1. download/parquet_to_npy.py +9 -9
download/parquet_to_npy.py CHANGED
@@ -99,20 +99,20 @@ def convert_split(parquet_dir: str, hf_split: str, aia_base: str, sxr_base: str,
99
  # Falls back to to_pylist() if the column type doesn't support it.
100
  def _to_numpy_bulk(col, n):
101
  chunk = col.combine_chunks()
102
- # Walk the PyArrow type to recover the per-element shape
 
 
 
103
  shape = []
104
- t = chunk.type
105
- while hasattr(t, "value_type"):
106
- if hasattr(t, "list_size"): # FixedSizeList
107
- shape.append(t.list_size)
108
- t = t.value_type
109
- # Walk the values to get the raw flat buffer
110
  vals = chunk
111
  while hasattr(vals, "values"):
 
 
 
 
112
  vals = vals.values
113
  arr = vals.to_numpy(zero_copy_only=False).astype(np.float32)
114
- elem_shape = tuple(shape) if shape else (arr.size // n,)
115
- return arr.reshape(n, *elem_shape)
116
 
117
  try:
118
  aia_bulk = _to_numpy_bulk(table.column("aia_stack"), n_rows)
 
99
  # Falls back to to_pylist() if the column type doesn't support it.
100
  def _to_numpy_bulk(col, n):
101
  chunk = col.combine_chunks()
102
+ # Walk the array levels to recover per-element shape.
103
+ # FixedSizeList exposes list_size directly; regular List
104
+ # (which HF uses even for fixed-size arrays) stores a uniform
105
+ # stride in its offsets buffer — both give the same answer.
106
  shape = []
 
 
 
 
 
 
107
  vals = chunk
108
  while hasattr(vals, "values"):
109
+ if hasattr(vals, "list_size"): # FixedSizeList
110
+ shape.append(vals.list_size)
111
+ elif hasattr(vals, "offsets"): # List — stride from offsets
112
+ shape.append(vals.offsets[1].as_py())
113
  vals = vals.values
114
  arr = vals.to_numpy(zero_copy_only=False).astype(np.float32)
115
+ return arr.reshape(n, *shape) if shape else arr.reshape(n, arr.size // n)
 
116
 
117
  try:
118
  aia_bulk = _to_numpy_bulk(table.column("aia_stack"), n_rows)