Spaces:

Neon-tech
/

Tok-cor

Paused

App Files Files Community

Neon-tech committed about 1 month ago

Commit

e80aacb

verified ·

1 Parent(s): c5bb1b8

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -42

app.py CHANGED Viewed

@@ -2,20 +2,48 @@ import os
 import json
 import re
 import gc
 from pathlib import Path
 import pyarrow.parquet as pq
 import pyarrow as pa
 # ── Config ───────────────────────────────────────────────────────────────────
-RAW_DIR       = "/data/raw"
-STATE_FILE    = "/data/state.json"
 ROWS_PER_CHUNK = 50_000
-def friendly_name(path):
-    m = re.search(r"CC-MAIN-(\d{4}-\d+)/\d+_(\d+)\.parquet", str(path))
     if m:
-        return f"cc{m.group(1)}_{int(m.group(2)):06d}.parquet"
-    return Path(path).name
 def split_file(raw_path):
     name = raw_path.name
@@ -24,26 +52,34 @@ def split_file(raw_path):
     try:
         pf = pq.ParquetFile(raw_path)
     except Exception as e:
-        print(f"  ✗ Corrupt, skipping: {name} — {e}")
-        return []
     chunk_paths = []
     chunk_idx   = 0
     current     = []
-    for batch in pf.iter_batches(batch_size=10_000, columns=["text"]):
-        current.append(batch)
-        if sum(len(b) for b in current) >= ROWS_PER_CHUNK:
-            chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
-            chunk_path = Path(RAW_DIR) / chunk_name
-            table = pa.Table.from_batches(current)
-            pq.write_table(table, chunk_path)
-            print(f"    ✓ {chunk_name} ({len(table):,} rows)")
-            chunk_paths.append(chunk_name)
-            chunk_idx += 1
-            current = []
-            del table
-            gc.collect()
     if current:
         chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
@@ -58,34 +94,105 @@ def split_file(raw_path):
     return chunk_paths
 # ── Main ──────────────────────────────────────────────────────────────────────
-raw_files = [f for f in Path(RAW_DIR).glob("*.parquet") if "_chunk" not in f.name]
 print(f"Found {len(raw_files)} unsplit files in /data/raw")
-state = {"shards": {}, "queue": []}
-for raw_path in sorted(raw_files):
-    chunk_names = split_file(raw_path)
-    if not chunk_names:
         continue
-    # delete the big original
-    raw_path.unlink()
-    print(f"  🗑 Deleted original: {raw_path.name}")
-    for chunk_name in chunk_names:
-        state["shards"][chunk_name] = {
-            "status":     "pending",
-            "hf_path":    str(raw_path.name),
-            "worker":     None,
-            "claimed_at": None,
-            "error":      None,
-            "retries":    0,
-        }
 tmp = STATE_FILE + ".tmp"
 with open(tmp, "w") as f:
     json.dump(state, f, indent=2)
 os.replace(tmp, STATE_FILE)
-total = len(state["shards"])
-print(f"\n✓ Done — {total} chunks registered in state.json")

 import json
 import re
 import gc
+import requests
 from pathlib import Path
 import pyarrow.parquet as pq
 import pyarrow as pa
 # ── Config ───────────────────────────────────────────────────────────────────
+HF_TOKEN       = os.environ.get("HF_TOKEN")
+DATASET_REPO   = "HuggingFaceFW/fineweb-edu"
+RAW_DIR        = "/data/raw"
+STATE_FILE     = "/data/state.json"
 ROWS_PER_CHUNK = 50_000
def hf_path_from_name(name):
    """Map a local raw-file name back to its path inside the dataset repo.

    Local names look like ``cc<YYYY>-<WW>_<index>.parquet``; the result has
    the form ``data/CC-MAIN-<YYYY>-<WW>/<index // 1000>_<index>.parquet``
    with the two numeric parts zero-padded to 3 and 5 digits.

    Args:
        name: Bare file name (no directory component).

    Returns:
        The repo-relative path as a string, or ``None`` when *name* does not
        follow the local naming scheme.
    """
    # fullmatch (not match): a name with trailing junk such as
    # "cc2024-10_000001.parquet.tmp" must be rejected, not mapped to a path.
    m = re.fullmatch(r"cc(\d{4})-(\d+)_(\d+)\.parquet", name)
    if m is None:
        return None
    year_week = f"{m.group(1)}-{m.group(2)}"
    idx = int(m.group(3))
    # NOTE(review): the leading "NNN_" component is reconstructed as
    # idx // 1000 — confirm this matches the repo's actual file layout.
    return f"data/CC-MAIN-{year_week}/{idx // 1000:03d}_{idx:05d}.parquet"
def download_file(hf_path, dest_path):
    """Download one file from the dataset repo to *dest_path*.

    The body is streamed into a sibling ``.tmp`` file and moved into place
    only after the transfer has finished, so a partially downloaded file is
    never visible under the final name.

    Args:
        hf_path: Path of the file inside ``DATASET_REPO``.
        dest_path: ``pathlib.Path`` of the final local file.

    Returns:
        ``True`` on success; ``False`` if the request or the write failed
        (the temporary file is removed in that case).
    """
    url      = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{hf_path}"
    tmp_path = dest_path.with_suffix(".tmp")
    # Only authenticate when a token is configured; otherwise the literal
    # header "Bearer None" would be sent.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    print(f"  Downloading: {hf_path}")
    try:
        # The context manager releases the streamed connection even when
        # raise_for_status() or the write loop fails.
        with requests.get(url, headers=headers, timeout=300, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8 * 1024 * 1024):
                    f.write(chunk)
        # replace() overwrites an existing destination on every platform.
        tmp_path.replace(dest_path)
        print(f"  ✓ Downloaded: {dest_path.name}")
        return True
    except Exception as e:
        # Deliberate catch-all boundary: any failure is reported and turned
        # into a False return so the caller can leave the file queued.
        print(f"  ✗ Download failed: {e}")
        tmp_path.unlink(missing_ok=True)
        return False
 def split_file(raw_path):
     name = raw_path.name
     try:
         pf = pq.ParquetFile(raw_path)
     except Exception as e:
+        print(f"  ✗ Corrupt (open): {name} — {e}")
+        raw_path.unlink(missing_ok=True)
+        return None
     chunk_paths = []
     chunk_idx   = 0
     current     = []
+    try:
+        for batch in pf.iter_batches(batch_size=10_000, columns=["text"]):
+            current.append(batch)
+            if sum(len(b) for b in current) >= ROWS_PER_CHUNK:
+                chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
+                chunk_path = Path(RAW_DIR) / chunk_name
+                table = pa.Table.from_batches(current)
+                pq.write_table(table, chunk_path)
+                print(f"    ✓ {chunk_name} ({len(table):,} rows)")
+                chunk_paths.append(chunk_name)
+                chunk_idx += 1
+                current = []
+                del table
+                gc.collect()
+    except Exception as e:
+        print(f"  ✗ Corrupt (read): {name} — {e}")
+        for c in chunk_paths:
+            Path(RAW_DIR, c).unlink(missing_ok=True)
+        raw_path.unlink(missing_ok=True)
+        return None
     if current:
         chunk_name = name.replace(".parquet", f"_chunk{chunk_idx:03d}.parquet")
     return chunk_paths
 # ── Main ──────────────────────────────────────────────────────────────────────
# Original (not yet split) parquet files; chunk outputs carry "_chunk" in their name.
raw_files = [f for f in sorted(Path(RAW_DIR).glob("*.parquet")) if "_chunk" not in f.name]
print(f"Found {len(raw_files)} unsplit files in /data/raw")
# ── Load or create state ──────────────────────────────────────────────────────
if os.path.exists(STATE_FILE):
    with open(STATE_FILE) as f:
        state = json.load(f)
    # A state file may lack one of the two top-level keys; normalise it here so
    # later direct accesses to state["shards"] / state["queue"] cannot raise
    # KeyError.
    state.setdefault("shards", {})
    state.setdefault("queue", [])
    print(f"Loaded existing state — {len(state['shards'])} shards, {len(state['queue'])} queued")
else:
    state = {"shards": {}, "queue": []}
    print("Fresh state")
# ── Step 1: write all unsplit files to queue and delete them ──────────────────
# Every unsplit raw file is recorded in the queue so that step 2 fetches it
# again, then removed from disk. The queue is persisted BEFORE anything is
# deleted, so a crash in between cannot lose track of a file.
print("\n── Step 1: queuing and deleting unsplit files ──")
queue = state.setdefault("queue", [])
queued = set(queue)  # O(1) membership checks instead of scanning the list
to_delete = []
for raw_path in raw_files:
    hf_path = hf_path_from_name(raw_path.name)
    if not hf_path:
        print(f"  ✗ Could not derive hf_path for {raw_path.name}, skipping")
        continue
    if hf_path not in queued:
        queue.append(hf_path)
        queued.add(hf_path)
        print(f"  ↺ Queued: {hf_path}")
    to_delete.append(raw_path)
# save state with queue populated (temp file + replace keeps the write atomic)
tmp = STATE_FILE + ".tmp"
with open(tmp, "w") as f:
    json.dump(state, f, indent=2)
os.replace(tmp, STATE_FILE)
print(f"\n✓ State saved — {len(queue)} in queue")
# Only now is it safe to remove the originals.
for raw_path in to_delete:
    raw_path.unlink(missing_ok=True)
    print(f"  🗑 Deleted: {raw_path.name}")
# ── Step 2: download, split, register ────────────────────────────────────────
def _save_state(state):
    """Persist *state* to STATE_FILE atomically (write a temp file, then replace)."""
    save_tmp = STATE_FILE + ".tmp"
    with open(save_tmp, "w") as f:
        json.dump(state, f, indent=2)
    os.replace(save_tmp, STATE_FILE)


def _register_chunks(state, chunk_names, hf_path):
    """Add a "pending" shard entry for each chunk name not already in the state."""
    shards = state.setdefault("shards", {})
    for chunk_name in chunk_names:
        if chunk_name not in shards:
            shards[chunk_name] = {
                "status":     "pending",
                "hf_path":    hf_path,
                "worker":     None,
                "claimed_at": None,
                "error":      None,
                "retries":    0,
            }


print("\n── Step 2: downloading and splitting ──")
queue = state.setdefault("queue", [])
# Iterate over a snapshot: entries are removed from the live queue as they finish.
for hf_path in list(queue):
    # derive name from hf_path
    m = re.search(r"CC-MAIN-(\d{4}-\d+)/\d+_(\d+)\.parquet", hf_path)
    if not m:
        print(f"  ✗ Could not parse hf_path: {hf_path}, skipping")
        continue
    name     = f"cc{m.group(1)}_{int(m.group(2)):06d}.parquet"
    raw_path = Path(RAW_DIR) / name

    # check if already split from a previous run
    # NOTE(review): any existing chunk file counts as "already split"; chunks
    # left behind by a run that died mid-split would be registered as complete.
    existing_chunks = sorted(Path(RAW_DIR).glob(f"{raw_path.stem}_chunk*.parquet"))
    if existing_chunks:
        print(f"  ✓ Already split: {name}")
        queue.remove(hf_path)
        _register_chunks(state, [chunk.name for chunk in existing_chunks], hf_path)
        _save_state(state)
        continue

    # download
    if not download_file(hf_path, raw_path):
        print(f"  ✗ Download failed, leaving in queue: {hf_path}")
        continue

    # split — None signals a corrupt file (an empty list is a valid result)
    chunk_names = split_file(raw_path)
    if chunk_names is None:
        print(f"  ✗ Still corrupt after download, leaving in queue: {hf_path}")
        continue

    # delete original
    raw_path.unlink(missing_ok=True)
    print(f"  🗑 Deleted original: {name}")

    # register chunks and pop from queue
    queue.remove(hf_path)
    _register_chunks(state, chunk_names, hf_path)

    # save after every file so restarts are safe
    _save_state(state)
    print(f"  ✓ Registered {len(chunk_names)} chunks for {name}")

print(f"\n✓ All done — {len(state.setdefault('shards', {}))} shards in state.json")