Spaces:

Neon-tech
/

Tok-cor

Paused

App Files Files Community

Neon-tech committed 17 days ago

Commit

a29dc9c

verified ·

1 Parent(s): 8d0f444

Create app.py

Browse files

Files changed (1) hide show

app.py +184 -0

app.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import os
+import json
+import time
+import socket
+import threading
+import io
+import requests
+from pathlib import Path
+from huggingface_hub import HfApi, list_repo_tree
+# ── Config ───────────────────────────────────────────────────────────────────
+HF_TOKEN      = os.environ.get("HF_TOKEN")  # Hugging Face access token read from the environment; None when unset
+DATASET_REPO  = "HuggingFaceFW/fineweb-edu"  # dataset repo whose files are listed and downloaded
+RAW_DIR       = "/data/raw"  # directory that downloaded parquet files are written to
+STATE_FILE    = "/data/state.json"  # JSON file holding each shard's status, worker and claim time
+WORKER_TIMEOUT = 600  # seconds (10 min) a shard may stay "claimed" before it is reset to "pending"
+# Only repo paths that start with this prefix (and end in ".parquet") are registered as shards
+CC_PREFIX = "data/CC-MAIN-2025"
+os.makedirs(RAW_DIR, exist_ok=True)  # runs at import time so the download directory exists before any loop starts
+api = HfApi(token=HF_TOKEN)  # shared Hub client, used to list the dataset's files
+# ── Keep-alive ────────────────────────────────────────────────────────────────
+def serve():
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    s.bind(("0.0.0.0", 7860))
+    s.listen(5)
+    print("✓ Listening on port 7860")
+    while True:
+        conn, _ = s.accept()
+        conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
+        conn.close()
+# ── State ─────────────────────────────────────────────────────────────────────
+def load_state():
+    if os.path.exists(STATE_FILE):
+        with open(STATE_FILE) as f:
+            state = json.load(f)
+        total   = len(state["shards"])
+        done    = sum(1 for s in state["shards"].values() if s["status"] == "done")
+        claimed = sum(1 for s in state["shards"].values() if s["status"] == "claimed")
+        pending = sum(1 for s in state["shards"].values() if s["status"] == "pending")
+        print(f"Resuming — {done} done / {claimed} claimed / {pending} pending / {total} total")
+    else:
+        state = {"shards": {}}
+        print("Starting fresh")
+    return state
+def save_state(state):
+    tmp = STATE_FILE + ".tmp"
+    with open(tmp, "w") as f:
+        json.dump(state, f, indent=2)
+    os.replace(tmp, STATE_FILE)
+# ── Discover all CC-MAIN-2025 parquet files ───────────────────────────────────
+def discover_shards(state):
+    print("Discovering shards from HF...")
+    files = api.list_repo_files(DATASET_REPO, repo_type="dataset")
+    new_count = 0
+    for f in files:
+        if f.startswith(CC_PREFIX) and f.endswith(".parquet"):
+            if f not in state["shards"]:
+                state["shards"][f] = {
+                    "status":     "pending",
+                    "worker":     None,
+                    "claimed_at": None,
+                }
+                new_count += 1
+    print(f"✓ {new_count} new shards discovered | {len(state['shards'])} total")
+    save_state(state)
+# ── Reclaim timed-out shards ──────────────────────────────────────────────────
+def reclaim_stale(state):
+    now = time.time()
+    reclaimed = 0
+    for shard, info in state["shards"].items():
+        if info["status"] == "claimed" and info["claimed_at"]:
+            if now - info["claimed_at"] > WORKER_TIMEOUT:
+                print(f"  ⚠ Reclaiming stale shard: {shard} (worker: {info['worker']})")
+                info["status"]     = "pending"
+                info["worker"]     = None
+                info["claimed_at"] = None
+                reclaimed += 1
+    if reclaimed:
+        save_state(state)
+    return reclaimed
+# ── Download pending shards to /data/raw ─────────────────────────────────────
+def download_loop(state):
+    base_url = f"https://huggingface.co/datasets/{DATASET_REPO}/resolve/main/"
+    while True:
+        # Reclaim stale first
+        reclaim_stale(state)
+        # Reload state to pick up worker updates
+        if os.path.exists(STATE_FILE):
+            with open(STATE_FILE) as f:
+                state["shards"] = json.load(f)["shards"]
+        # Count how many raw files already sitting in /data/raw (not yet claimed)
+        raw_files = list(Path(RAW_DIR).glob("*.parquet"))
+        pending_raw = len(raw_files)
+        # Keep at most 4 shards pre-downloaded to avoid filling disk
+        if pending_raw >= 4:
+            print(f"  Buffer full ({pending_raw} shards waiting) — sleeping...")
+            time.sleep(60)
+            continue
+        # Find next pending shard to download
+        to_download = None
+        for shard, info in state["shards"].items():
+            if info["status"] == "pending":
+                raw_name = shard.replace("/", "__") + ".parquet"
+                raw_path = Path(RAW_DIR) / raw_name
+                if not raw_path.exists():
+                    to_download = shard
+                    break
+        if not to_download:
+            done  = sum(1 for s in state["shards"].values() if s["status"] == "done")
+            total = len(state["shards"])
+            if done == total:
+                print("✓ All shards complete!")
+                break
+            print("  Nothing to download right now — sleeping...")
+            time.sleep(60)
+            continue
+        # Download it
+        url      = base_url + to_download
+        raw_name = to_download.replace("/", "__") + ".parquet"
+        raw_path = Path(RAW_DIR) / raw_name
+        print(f"  Downloading: {to_download}")
+        try:
+            resp = requests.get(
+                url,
+                headers={"Authorization": f"Bearer {HF_TOKEN}"},
+                timeout=300,
+                stream=True,
+            )
+            resp.raise_for_status()
+            with open(raw_path, "wb") as f:
+                for chunk in resp.iter_content(chunk_size=8 * 1024 * 1024):
+                    f.write(chunk)
+            print(f"  ✓ Downloaded: {raw_name}")
+        except Exception as e:
+            print(f"  ✗ Download failed: {e}")
+            time.sleep(30)
+            continue
+        time.sleep(5)
+# ── Monitor loop — prints progress ───────────────────────────────────────────
+def monitor_loop(state):
+    while True:
+        time.sleep(120)
+        if os.path.exists(STATE_FILE):
+            with open(STATE_FILE) as f:
+                s = json.load(f)["shards"]
+            done    = sum(1 for v in s.values() if v["status"] == "done")
+            claimed = sum(1 for v in s.values() if v["status"] == "claimed")
+            pending = sum(1 for v in s.values() if v["status"] == "pending")
+            total   = len(s)
+            pct     = (done / total * 100) if total else 0
+            print(f"[MONITOR] {done}/{total} done ({pct:.1f}%) | {claimed} active | {pending} pending")
+# ── Entry point ───────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    threading.Thread(target=serve, daemon=True).start()
+    state = load_state()
+    discover_shards(state)
+    threading.Thread(target=monitor_loop, args=(state,), daemon=True).start()
+    threading.Thread(target=download_loop, args=(state,), daemon=True).start()
+    while True:
+        time.sleep(60)