Spaces:

Neon-tech
/

Tok-cor

Paused

App Files Files Community

Neon-tech committed about 1 month ago

Commit

af01b43

verified ·

1 Parent(s): e80aacb

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -198

app.py DELETED Viewed

@@ -1,198 +0,0 @@
-import os
-import json
-import re
-import gc
-import requests
-from pathlib import Path
-import pyarrow.parquet as pq
-import pyarrow as pa
-# ── Config ───────────────────────────────────────────────────────────────────
-# Access token read from the environment; None when HF_TOKEN is not set.
-HF_TOKEN = os.getenv("HF_TOKEN")
-# Dataset repository the parquet files are downloaded from.
-DATASET_REPO = "HuggingFaceFW/fineweb-edu"
-# Directory holding downloaded parquet files and the chunks split from them.
-RAW_DIR = "/data/raw"
-# JSON file that persists the shard table and the download queue.
-STATE_FILE = "/data/state.json"
-# Buffered row count at which split_file flushes a chunk to disk.
-ROWS_PER_CHUNK = 50_000
-def hf_path_from_name(name):
-    """Translate a local raw-file name into its path inside the dataset repo.
-
-    A name shaped like ``cc<year>-<week>_<index>.parquet`` maps to
-    ``data/CC-MAIN-<year>-<week>/<index // 1000, 3 digits>_<index, 5 digits>.parquet``.
-    Returns None when the name does not start with that pattern.
-    """
-    parsed = re.match(r"cc(\d{4})-(\d+)_(\d+)\.parquet", name)
-    if parsed is None:
-        return None
-    year, week, index_text = parsed.groups()
-    index = int(index_text)
-    group = index // 1000
-    return f"data/CC-MAIN-{year}-{week}/{group:03d}_{index:05d}.parquet"
-def download_file(hf_path, dest_path):
-    """Download one dataset file to dest_path and report whether it succeeded.
-
-    The response body is streamed into a sibling ``.tmp`` file that is moved
-    over dest_path only after the whole body has been written, so dest_path
-    never holds a partial download.
-
-    Args:
-        hf_path: File path inside DATASET_REPO, appended to the resolve URL.
-        dest_path: pathlib.Path where the finished file should end up.
-
-    Returns:
-        True when the file was written and moved into place; False on any
-        error (the error is printed and the temp file is removed).
-    """
-    url      = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/{hf_path}"
-    tmp_path = dest_path.with_suffix(".tmp")
-    # Send credentials only when a token is configured; formatting an unset
-    # token would put the literal string "Bearer None" on the wire.
-    headers  = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
-    print(f"  Downloading: {hf_path}")
-    try:
-        # The context manager releases the streamed connection on every path.
-        with requests.get(url, headers=headers, timeout=300, stream=True) as resp:
-            resp.raise_for_status()
-            with open(tmp_path, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=8 * 1024 * 1024):
-                    f.write(chunk)
-        # replace() overwrites an existing destination on every platform.
-        tmp_path.replace(dest_path)
-        print(f"  ✓ Downloaded: {dest_path.name}")
-        return True
-    except Exception as e:
-        print(f"  ✗ Download failed: {e}")
-        tmp_path.unlink(missing_ok=True)
-        return False
-def split_file(raw_path):
-    """Split one raw parquet file into chunk files holding only the "text" column.
-
-    Batches of up to 10,000 rows are buffered and written to
-    ``<name>_chunkNNN.parquet`` in RAW_DIR as soon as at least ROWS_PER_CHUNK
-    rows are buffered; any rows left at the end go into one final chunk.
-
-    Args:
-        raw_path: pathlib.Path of the parquet file to split.
-
-    Returns:
-        The chunk file names (not full paths) in write order, or None when the
-        file could not be opened, read, or written. On failure the raw file
-        and every chunk file started for it are deleted, so a later run cannot
-        mistake a partial set of chunks for a finished split.
-    """
-    name = raw_path.name
-    print(f"  Splitting: {name}")
-    try:
-        pf = pq.ParquetFile(raw_path)
-    except Exception as e:
-        print(f"  ✗ Corrupt (open): {name} — {e}")
-        raw_path.unlink(missing_ok=True)
-        return None
-    chunk_names   = []  # names of chunks that were written completely
-    started_paths = []  # every chunk path we began writing, for cleanup
-    pending       = []  # batches buffered for the next chunk
-    pending_rows  = 0
-    def flush():
-        """Write the buffered batches as the next chunk and empty the buffer."""
-        chunk_name = name.replace(".parquet", f"_chunk{len(chunk_names):03d}.parquet")
-        chunk_path = Path(RAW_DIR) / chunk_name
-        # Record the path before writing so a half-written file is cleaned up.
-        started_paths.append(chunk_path)
-        table = pa.Table.from_batches(pending)
-        pq.write_table(table, chunk_path)
-        print(f"    ✓ {chunk_name} ({len(table):,} rows)")
-        chunk_names.append(chunk_name)
-        pending.clear()
-        del table
-        gc.collect()
-    try:
-        for batch in pf.iter_batches(batch_size=10_000, columns=["text"]):
-            pending.append(batch)
-            pending_rows += len(batch)
-            if pending_rows >= ROWS_PER_CHUNK:
-                flush()
-                pending_rows = 0
-        # The tail flush stays inside the try so its failure is cleaned up too.
-        if pending:
-            flush()
-    except Exception as e:
-        print(f"  ✗ Corrupt (read): {name} — {e}")
-        for chunk_path in started_paths:
-            chunk_path.unlink(missing_ok=True)
-        raw_path.unlink(missing_ok=True)
-        return None
-    return chunk_names
-# ── Main ──────────────────────────────────────────────────────────────────────
-# Unsplit files are the parquet files in RAW_DIR whose name has no "_chunk" part.
-raw_files = [f for f in sorted(Path(RAW_DIR).glob("*.parquet")) if "_chunk" not in f.name]
-print(f"Found {len(raw_files)} unsplit files in {RAW_DIR}")
-# ── Load or create state ──────────────────────────────────────────────────────
-if os.path.exists(STATE_FILE):
-    with open(STATE_FILE, encoding="utf-8") as f:
-        state = json.load(f)
-    # An existing state file may lack either key; fill both in once so every
-    # later state["shards"] / state["queue"] lookup is safe.
-    state.setdefault("shards", {})
-    state.setdefault("queue", [])
-    print(f"Loaded existing state — {len(state['shards'])} shards, {len(state['queue'])} queued")
-else:
-    state = {"shards": {}, "queue": []}
-    print("Fresh state")
-# ── Step 1: write all unsplit files to queue and delete them ──────────────────
-print("\n── Step 1: queuing and deleting unsplit files ──")
-queue          = state.setdefault("queue", [])  # also covers a state file without a queue
-already_queued = set(queue)                     # O(1) duplicate check
-to_delete      = []
-for raw_path in raw_files:
-    hf_path = hf_path_from_name(raw_path.name)
-    if not hf_path:
-        # Files we cannot map back to a repo path are left on disk untouched.
-        print(f"  ✗ Could not derive hf_path for {raw_path.name}, skipping")
-        continue
-    if hf_path not in already_queued:
-        queue.append(hf_path)
-        already_queued.add(hf_path)
-        print(f"  ↺ Queued: {hf_path}")
-    to_delete.append(raw_path)
-# Persist the queue BEFORE deleting anything: if the run dies part-way through
-# the deletions, every removed file is already recorded for re-download.
-tmp = STATE_FILE + ".tmp"
-with open(tmp, "w") as f:
-    json.dump(state, f, indent=2)
-os.replace(tmp, STATE_FILE)
-print(f"\n✓ State saved — {len(queue)} in queue")
-for raw_path in to_delete:
-    raw_path.unlink(missing_ok=True)
-    print(f"  🗑 Deleted: {raw_path.name}")
-# ── Step 2: download, split, register ─────────────────────────────────────────
-# Pulls the "<year>-<week>" crawl id and the file index back out of a repo path.
-HF_PATH_RE = re.compile(r"CC-MAIN-(\d{4}-\d+)/\d+_(\d+)\.parquet")
-def _register_chunks(chunk_names, hf_path):
-    """Add a "pending" shard entry for every chunk name not yet in the state."""
-    for chunk_name in chunk_names:
-        if chunk_name not in state["shards"]:
-            state["shards"][chunk_name] = {
-                "status":     "pending",
-                "hf_path":    hf_path,
-                "worker":     None,
-                "claimed_at": None,
-                "error":      None,
-                "retries":    0,
-            }
-def _save_state():
-    """Write the state to a temp file, then swap it in with os.replace."""
-    save_tmp = STATE_FILE + ".tmp"
-    with open(save_tmp, "w") as f:
-        json.dump(state, f, indent=2)
-    os.replace(save_tmp, STATE_FILE)
-print("\n── Step 2: downloading and splitting ──")
-# Tolerate a state dict that is missing either key.
-state.setdefault("shards", {})
-# Iterate over a copy because entries are removed from the queue as we go.
-for hf_path in list(state.setdefault("queue", [])):
-    # derive name from hf_path
-    m = HF_PATH_RE.search(hf_path)
-    if not m:
-        print(f"  ✗ Could not parse hf_path: {hf_path}, skipping")
-        continue
-    name     = f"cc{m.group(1)}_{int(m.group(2)):06d}.parquet"
-    raw_path = Path(RAW_DIR) / name
-    # check if already split from a previous run
-    existing_chunks = sorted(Path(RAW_DIR).glob(f"{name.replace('.parquet', '')}_chunk*.parquet"))
-    if existing_chunks:
-        print(f"  ✓ Already split: {name}")
-        state["queue"].remove(hf_path)
-        _register_chunks([chunk.name for chunk in existing_chunks], hf_path)
-        _save_state()
-        continue
-    # download
-    if not download_file(hf_path, raw_path):
-        print(f"  ✗ Download failed, leaving in queue: {hf_path}")
-        continue
-    # split
-    chunk_names = split_file(raw_path)
-    if chunk_names is None:
-        print(f"  ✗ Still corrupt after download, leaving in queue: {hf_path}")
-        continue
-    # delete original
-    raw_path.unlink(missing_ok=True)
-    print(f"  🗑 Deleted original: {name}")
-    # register chunks and pop from queue
-    state["queue"].remove(hf_path)
-    _register_chunks(chunk_names, hf_path)
-    # save after every file so restarts are safe
-    _save_state()
-    print(f"  ✓ Registered {len(chunk_names)} chunks for {name}")
-print(f"\n✓ All done — {len(state['shards'])} shards in state.json")