Spaces:

Neon-tech
/

Tok-cor

Paused

App Files Files Community

Neon-tech committed about 1 month ago

Commit

c5bb1b8

verified ·

1 Parent(s): 991345e

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import json
+import re
+import gc
+from pathlib import Path
+import pyarrow.parquet as pq
+import pyarrow as pa
# ── Config ───────────────────────────────────────────────────────────────────
# Directory scanned for raw ``*.parquet`` files; chunk files are written here too.
RAW_DIR = "/data/raw"

# JSON registry of chunk shards produced by this script.
STATE_FILE = "/data/state.json"

# A chunk file is written once at least this many rows have been accumulated.
ROWS_PER_CHUNK = 50_000
# Matches "CC-MAIN-<yyyy>-<n>/<digits>_<digits>.parquet" anywhere in a path.
# The separator may be "/" or "\" so Windows-style path strings match as well.
_CC_SHARD_RE = re.compile(r"CC-MAIN-(\d{4}-\d+)[/\\]\d+_(\d+)\.parquet")


def friendly_name(path) -> str:
    """Return a short, flat file name for a Common Crawl shard path.

    A path containing ``CC-MAIN-<crawl>/<x>_<shard>.parquet`` is renamed to
    ``cc<crawl>_<shard>.parquet`` with the shard number zero-padded to six
    digits. Any other path is reduced to its final component.

    Args:
        path: A string or path-like object naming the shard file.

    Returns:
        The friendly file name (never a full path).
    """
    m = _CC_SHARD_RE.search(str(path))
    if m is None:
        return Path(path).name
    crawl, shard = m.groups()
    # int() drops the shard's existing zero padding before re-padding to 6.
    return f"cc{crawl}_{int(shard):06d}.parquet"
def _flush_chunk(batches, stem, chunk_names):
    """Write *batches* as the next chunk file of *stem* inside RAW_DIR.

    The chunk's file name is appended to *chunk_names* before the write
    starts, so a caller that cleans up after a failed write also removes the
    partially written file.
    """
    chunk_name = f"{stem}_chunk{len(chunk_names):03d}.parquet"
    chunk_names.append(chunk_name)
    table = pa.Table.from_batches(batches)
    pq.write_table(table, Path(RAW_DIR) / chunk_name)
    print(f"    ✓ {chunk_name} ({len(table):,} rows)")
    # Drop the materialised table before the next chunk is accumulated.
    del table
    gc.collect()


def split_file(raw_path):
    """Split one raw parquet file into chunk files written to RAW_DIR.

    Only the ``text`` column is read. Batches are accumulated until at least
    ROWS_PER_CHUNK rows are pending, then written out as
    ``<stem>_chunkNNN.parquet``; leftover rows form a final, smaller chunk.

    Args:
        raw_path: ``pathlib.Path`` of the parquet file to split.

    Returns:
        The chunk file names (not full paths) in write order. An empty list
        is returned when the file cannot be opened, when reading or writing
        fails part-way (chunks written so far are removed so the untouched
        original can simply be retried), or when the file holds no rows.
    """
    name = raw_path.name
    print(f"  Splitting: {name}")
    try:
        pf = pq.ParquetFile(raw_path)
    except Exception as e:
        print(f"  ✗ Corrupt, skipping: {name} — {e}")
        return []

    chunk_names = []
    pending = []
    pending_rows = 0  # running total, avoids re-summing the pending batches
    # The context manager closes the file handle; callers delete the file
    # right after a successful split.
    with pf:
        try:
            for batch in pf.iter_batches(batch_size=10_000, columns=["text"]):
                pending.append(batch)
                pending_rows += len(batch)
                if pending_rows >= ROWS_PER_CHUNK:
                    _flush_chunk(pending, raw_path.stem, chunk_names)
                    pending = []
                    pending_rows = 0
            if pending:
                _flush_chunk(pending, raw_path.stem, chunk_names)
        except Exception as e:
            # A file can open fine and still fail mid-read (e.g. truncated
            # data), and a write can fail too. Treat either like the
            # open-time failure: skip the file and leave no orphaned chunks.
            print(f"  ✗ Failed while splitting, skipping: {name} — {e}")
            for chunk_name in chunk_names:
                try:
                    (Path(RAW_DIR) / chunk_name).unlink()
                except FileNotFoundError:
                    pass
            return []
    return chunk_names
# ── Main ──────────────────────────────────────────────────────────────────────
def _load_state():
    """Return the registry stored in STATE_FILE, or an empty one if absent.

    Loading the existing registry keeps a re-run (for example after a
    restart, when every original has already been split and deleted) from
    overwriting previously registered shards with an empty state.
    """
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        loaded = {}
    loaded.setdefault("shards", {})
    loaded.setdefault("queue", [])
    return loaded


def _save_state(current):
    """Write *current* to STATE_FILE via a temp file and an atomic rename."""
    tmp = STATE_FILE + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(current, f, indent=2)
    os.replace(tmp, STATE_FILE)


raw_files = [f for f in Path(RAW_DIR).glob("*.parquet") if "_chunk" not in f.name]
print(f"Found {len(raw_files)} unsplit files in {RAW_DIR}")
state = _load_state()
for raw_path in sorted(raw_files):
    chunk_names = split_file(raw_path)
    if not chunk_names:
        continue
    for chunk_name in chunk_names:
        state["shards"][chunk_name] = {
            "status":     "pending",
            "hf_path":    raw_path.name,
            "worker":     None,
            "claimed_at": None,
            "error":      None,
            "retries":    0,
        }
    # Persist before deleting the original: chunk files are excluded from the
    # glob above, so chunks that were never registered could not be found
    # again after a crash.
    _save_state(state)
    # delete the big original
    raw_path.unlink()
    print(f"  🗑 Deleted original: {raw_path.name}")
# Always leave a state file behind, even when there was nothing to split.
_save_state(state)
total = len(state["shards"])
print(f"\n✓ Done — {total} chunks registered in state.json")