"""Entry point for the GraphTestbed scoring server on HF Spaces. On boot: 1. snapshot_download the companion dataset repo (lanczos/graphtestbed-gt by default) into /data: gt/*.csv, leaderboard.db, submissions/**/*.csv. 2. Spawn a daemon thread that every BACKUP_INTERVAL seconds: a. SELECT COUNT(*) FROM submissions; bail if unchanged. b. sqlite3.Connection.backup() into a temp file (atomic, lock-safe). c. upload_file the temp file → leaderboard.db in the dataset repo. d. upload_folder /data/submissions/ → submissions/ in the dataset repo (huggingface_hub diffs by content-hash; unchanged files don't transfer). 3. Hand off to server/api.py via Flask app.run(threaded=True). Env vars (all have sensible defaults baked into the Dockerfile): HF_TOKEN required write scope on GT_DATASET_REPO GT_DATASET_REPO optional default: lanczos/graphtestbed-gt GT_DATA_ROOT optional default: /data GT_BACKUP_INTERVAL optional default: 60 (seconds) PORT optional default: 7860 """ from __future__ import annotations import os import sqlite3 import sys import threading import time from pathlib import Path from huggingface_hub import snapshot_download, upload_file, upload_folder HF_TOKEN = os.environ.get("HF_TOKEN") HF_REPO = os.environ.get("GT_DATASET_REPO", "lanczos/graphtestbed-gt") DATA_DIR = Path(os.environ.get("GT_DATA_ROOT", "/data")) GT_DIR = DATA_DIR / "gt" DB_PATH = DATA_DIR / "leaderboard.db" ARCHIVE_DIR = DATA_DIR / "submissions" BACKUP_INTERVAL = int(os.environ.get("GT_BACKUP_INTERVAL", "60")) PORT = int(os.environ.get("PORT", "7860")) def _require_token() -> str: if not HF_TOKEN: raise SystemExit( "HF_TOKEN is unset. Set it as a Space secret with write scope on " f"{HF_REPO}." ) return HF_TOKEN def bootstrap() -> None: """Pull GT files, leaderboard, and submission archive from the dataset repo.""" token = _require_token() for d in (DATA_DIR, GT_DIR, ARCHIVE_DIR): d.mkdir(parents=True, exist_ok=True) print(f"snapshot_download {HF_REPO} → {DATA_DIR}", flush=True) try: snapshot_download( HF_REPO, repo_type="dataset", local_dir=str(DATA_DIR), allow_patterns=["gt/*.csv", "leaderboard.db", "submissions/**/*.csv"], token=token, ) except Exception as e: # First-deploy or empty repo: keep going with empty /data. print(f"snapshot_download warning ({type(e).__name__}): {e}", flush=True) n_gt = len(list(GT_DIR.glob("*.csv"))) print(f"GT files present: {n_gt}", flush=True) if DB_PATH.exists(): try: n = int(sqlite3.connect(DB_PATH).execute( "SELECT COUNT(*) FROM submissions" ).fetchone()[0]) print(f"restored leaderboard.db ({n} submissions)", flush=True) except sqlite3.OperationalError: print("leaderboard.db present but no submissions table yet", flush=True) else: print("no prior leaderboard.db; starting fresh", flush=True) def _submission_count() -> int: if not DB_PATH.exists(): return 0 try: conn = sqlite3.connect(DB_PATH) try: row = conn.execute("SELECT COUNT(*) FROM submissions").fetchone() return int(row[0]) if row else 0 finally: conn.close() except sqlite3.OperationalError: return 0 def _atomic_db_copy(dst: Path) -> None: """sqlite3.backup() is lock-safe — readers/writers stay consistent.""" src = sqlite3.connect(DB_PATH) try: target = sqlite3.connect(dst) try: src.backup(target) finally: target.close() finally: src.close() def backup_loop() -> None: token = _require_token() last_count = -1 print(f"backup_loop started (interval={BACKUP_INTERVAL}s)", flush=True) while True: time.sleep(BACKUP_INTERVAL) n = _submission_count() if n == last_count: continue try: tmp = DATA_DIR / "_leaderboard.db.tmp" _atomic_db_copy(tmp) upload_file( path_or_fileobj=str(tmp), path_in_repo="leaderboard.db", repo_id=HF_REPO, repo_type="dataset", token=token, commit_message=f"backup leaderboard ({n} submissions)", ) tmp.unlink() except Exception as e: print(f"leaderboard backup failed: {type(e).__name__}: {e}", flush=True) continue if ARCHIVE_DIR.exists() and any(ARCHIVE_DIR.rglob("*.csv")): try: upload_folder( folder_path=str(ARCHIVE_DIR), path_in_repo="submissions", repo_id=HF_REPO, repo_type="dataset", token=token, commit_message=f"archive submissions ({n} total)", allow_patterns=["**/*.csv"], ) except Exception as e: print(f"submission archive failed: {type(e).__name__}: {e}", flush=True) last_count = n print(f"backup pushed: {n} submissions", flush=True) def main() -> int: bootstrap() # Make sure server/api.py reads paths consistent with what we just bootstrapped. os.environ.setdefault("GT_DIR", str(GT_DIR)) os.environ.setdefault("GT_DB", str(DB_PATH)) os.environ.setdefault("GT_ARCHIVE_DIR", str(ARCHIVE_DIR)) threading.Thread(target=backup_loop, daemon=True).start() sys.path.insert(0, str(Path(__file__).resolve().parents[1])) from api import app # noqa: E402 — env vars must be set first print(f"serving on 0.0.0.0:{PORT}", flush=True) app.run(host="0.0.0.0", port=PORT, threaded=True, use_reloader=False) return 0 if __name__ == "__main__": raise SystemExit(main())