Neon-tech committed on
Commit
ae57a5a
·
verified ·
1 Parent(s): 12780f6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Rewrite every parquet shard so its ``image`` column holds bytes/path structs.

Each value of the ``image`` column is wrapped as ``{"bytes": <value>,
"path": None}`` and the shard is rewritten in place with snappy compression.
The script is safe to re-run: shards whose ``image`` column is already a
struct are left untouched.
"""
import os
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

# Directory whose ``*.parquet`` files are rewritten in place.
OUT_DIR = "/data/image-shards"


def _fix_shard(shard):
    """Wrap the ``image`` column of one shard; return True if it was rewritten.

    Args:
        shard: ``pathlib.Path`` of the parquet file to fix.

    Returns:
        ``True`` when the shard was rewritten, ``False`` when its ``image``
        column was already a struct and the file was left as is.

    Raises:
        KeyError: if the shard has no ``image`` column.
    """
    table = pq.read_table(shard)

    # Idempotency guard: wrapping an already-wrapped column would nest the
    # dict inside another dict and corrupt the shard on a second run.
    if pa.types.is_struct(table.schema.field("image").type):
        return False

    df = table.to_pandas()
    # presumably the column holds raw encoded image bytes -- verify upstream
    df["image"] = df["image"].apply(lambda b: {"bytes": b, "path": None})

    # Write next to the shard and swap it in with os.replace so an
    # interrupted run never leaves a truncated file under the shard's name.
    # The ".tmp" suffix keeps the temp file out of the "*.parquet" glob.
    tmp_path = shard.with_name(shard.name + ".tmp")
    try:
        pq.write_table(pa.Table.from_pandas(df), tmp_path, compression="snappy")
        os.replace(tmp_path, shard)
    finally:
        # Only still present if the write or the replace failed.
        if tmp_path.exists():
            tmp_path.unlink()
    return True


# Materialize (and sort) the listing before touching the directory so the
# iteration order is deterministic and unaffected by the files we write.
for shard in sorted(Path(OUT_DIR).glob("*.parquet")):
    if _fix_shard(shard):
        print(f"Fixed {shard.name}")
    else:
        print(f"Skipped {shard.name} (image column already wrapped)")