Commit ·
7a084be
1
Parent(s): 4c39a05
Fixed issue with extraction...
Browse files
download/parquet_to_npy.py
CHANGED
|
@@ -99,20 +99,20 @@ def convert_split(parquet_dir: str, hf_split: str, aia_base: str, sxr_base: str,
|
|
| 99 |
# Falls back to to_pylist() if the column type doesn't support it.
|
| 100 |
def _to_numpy_bulk(col, n):
|
| 101 |
chunk = col.combine_chunks()
|
| 102 |
-
# Walk the
|
|
|
|
|
|
|
|
|
|
| 103 |
shape = []
|
| 104 |
-
t = chunk.type
|
| 105 |
-
while hasattr(t, "value_type"):
|
| 106 |
-
if hasattr(t, "list_size"): # FixedSizeList
|
| 107 |
-
shape.append(t.list_size)
|
| 108 |
-
t = t.value_type
|
| 109 |
-
# Walk the values to get the raw flat buffer
|
| 110 |
vals = chunk
|
| 111 |
while hasattr(vals, "values"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
vals = vals.values
|
| 113 |
arr = vals.to_numpy(zero_copy_only=False).astype(np.float32)
|
| 114 |
-
|
| 115 |
-
return arr.reshape(n, *elem_shape)
|
| 116 |
|
| 117 |
try:
|
| 118 |
aia_bulk = _to_numpy_bulk(table.column("aia_stack"), n_rows)
|
|
|
|
| 99 |
# Falls back to to_pylist() if the column type doesn't support it.
|
| 100 |
def _to_numpy_bulk(col, n):
    """Flatten a nested-list Arrow column into one dense float32 ndarray.

    The list nesting is walked from the outside in, recording one dimension
    per level, and the innermost flat values are then reshaped once instead
    of converting row by row.

    Args:
        col: Column object exposing ``combine_chunks()`` (the caller passes
            ``table.column(...)``).
        n: Number of rows; becomes the leading dimension of the result.

    Returns:
        float32 ndarray shaped ``(n, *per_row_shape)``. A column with no
        list nesting is returned as ``(n, arr.size // n)``.

    Raises:
        ValueError: A variable-size list level holds lists of differing
            lengths, or the recovered shape does not match the number of
            stored values.
        IndexError: A variable-size list level has no rows, so no list
            length can be inferred.
    """
    chunk = col.combine_chunks()
    shape = []
    vals = chunk
    while hasattr(vals, "values"):
        # pyarrow documents list_size on the FixedSizeList *type*; the
        # array-level probe is kept only as a fallback for objects that
        # expose the attribute directly.
        list_size = getattr(getattr(vals, "type", None), "list_size", None)
        if list_size is None:
            list_size = getattr(vals, "list_size", None)

        if list_size is not None:  # FixedSizeList
            shape.append(int(list_size))
        elif hasattr(vals, "offsets"):  # List: per-row lengths from offsets
            lengths = np.diff(vals.offsets.to_numpy(zero_copy_only=False))
            if lengths.size == 0:
                # Same exception type the former offsets[1] lookup raised.
                raise IndexError(
                    "cannot infer list length from an empty column"
                )
            # Trusting only the first row would let a ragged column whose
            # total size happens to match be reshaped into wrong rows.
            if (lengths != lengths[0]).any():
                raise ValueError(
                    "list lengths are not uniform; cannot bulk-reshape"
                )
            shape.append(int(lengths[0]))
        vals = vals.values

    arr = vals.to_numpy(zero_copy_only=False).astype(np.float32)
    return arr.reshape(n, *shape) if shape else arr.reshape(n, arr.size // n)
|
|
|
|
| 116 |
|
| 117 |
try:
|
| 118 |
aia_bulk = _to_numpy_bulk(table.column("aia_stack"), n_rows)
|