"""Entry point for the GraphTestbed scoring server on HF Spaces.
On boot:
1. snapshot_download the companion dataset repo (lanczos/graphtestbed-gt by
default) into /data: gt/*.csv, leaderboard.db, submissions/**/*.csv.
2. Spawn a daemon thread that every BACKUP_INTERVAL seconds:
a. SELECT COUNT(*) FROM submissions; bail if unchanged.
b. sqlite3.Connection.backup() into a temp file (atomic, lock-safe).
c. upload_file the temp file → leaderboard.db in the dataset repo.
d. upload_folder /data/submissions/ → submissions/ in the dataset repo
(huggingface_hub diffs by content-hash; unchanged files don't transfer).
3. Hand off to server/api.py via Flask app.run(threaded=True).
Env vars (all have sensible defaults baked into the Dockerfile):
HF_TOKEN required write scope on GT_DATASET_REPO
GT_DATASET_REPO optional default: lanczos/graphtestbed-gt
GT_DATA_ROOT optional default: /data
GT_BACKUP_INTERVAL optional default: 60 (seconds)
PORT optional default: 7860
"""
from __future__ import annotations
import os
import sqlite3
import sys
import threading
import time
from pathlib import Path
from huggingface_hub import snapshot_download, upload_file, upload_folder
# --- Configuration: env vars with Dockerfile-baked defaults (see module docstring) ---
HF_TOKEN = os.environ.get("HF_TOKEN")  # must have write scope on HF_REPO
HF_REPO = os.environ.get("GT_DATASET_REPO", "lanczos/graphtestbed-gt")  # backing dataset repo
DATA_DIR = Path(os.environ.get("GT_DATA_ROOT", "/data"))  # persistent data root
GT_DIR = DATA_DIR / "gt"  # ground-truth CSVs
DB_PATH = DATA_DIR / "leaderboard.db"  # SQLite leaderboard database
ARCHIVE_DIR = DATA_DIR / "submissions"  # archived submission CSVs
BACKUP_INTERVAL = int(os.environ.get("GT_BACKUP_INTERVAL", "60"))  # seconds between backup ticks
PORT = int(os.environ.get("PORT", "7860"))  # Flask listen port
def _require_token() -> str:
    """Return the HF API token, aborting the process with a clear message if unset."""
    token = HF_TOKEN
    if token:
        return token
    raise SystemExit(
        f"HF_TOKEN is unset. Set it as a Space secret with write scope on {HF_REPO}."
    )
def bootstrap() -> None:
    """Pull GT files, leaderboard, and submission archive from the dataset repo.

    Runs once at startup. A failed snapshot_download is tolerated (first
    deploy or empty repo) so the server can still come up with an empty /data.
    """
    token = _require_token()
    for d in (DATA_DIR, GT_DIR, ARCHIVE_DIR):
        d.mkdir(parents=True, exist_ok=True)
    print(f"snapshot_download {HF_REPO} -> {DATA_DIR}", flush=True)
    try:
        snapshot_download(
            HF_REPO,
            repo_type="dataset",
            local_dir=str(DATA_DIR),
            allow_patterns=["gt/*.csv", "leaderboard.db", "submissions/**/*.csv"],
            token=token,
        )
    except Exception as e:
        # First-deploy or empty repo: keep going with empty /data.
        print(f"snapshot_download warning ({type(e).__name__}): {e}", flush=True)
    n_gt = len(list(GT_DIR.glob("*.csv")))
    print(f"GT files present: {n_gt}", flush=True)
    if not DB_PATH.exists():
        print("no prior leaderboard.db; starting fresh", flush=True)
        return
    # FIX: the connection was previously opened inline and never closed
    # (connection leak); close it explicitly once the count is read.
    conn = sqlite3.connect(DB_PATH)
    try:
        n = int(conn.execute("SELECT COUNT(*) FROM submissions").fetchone()[0])
        print(f"restored leaderboard.db ({n} submissions)", flush=True)
    except sqlite3.OperationalError:
        # DB restored but schema not created yet (no submissions table).
        print("leaderboard.db present but no submissions table yet", flush=True)
    finally:
        conn.close()
def _submission_count() -> int:
    """Row count of the submissions table; 0 when the DB or table is missing."""
    if not DB_PATH.exists():
        return 0
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            row = conn.execute("SELECT COUNT(*) FROM submissions").fetchone()
        finally:
            conn.close()
    except sqlite3.OperationalError:
        # Schema not created yet.
        return 0
    return int(row[0]) if row else 0
def _atomic_db_copy(dst: Path) -> None:
    """Copy the live leaderboard DB to *dst* using sqlite3's backup API.

    Connection.backup() takes the proper locks, so concurrent readers and
    writers stay consistent while the copy is made.
    """
    open_conns: list[sqlite3.Connection] = []
    try:
        source = sqlite3.connect(DB_PATH)
        open_conns.append(source)
        target = sqlite3.connect(dst)
        open_conns.append(target)
        source.backup(target)
    finally:
        # Close in reverse order of opening, whatever succeeded so far.
        for conn in reversed(open_conns):
            conn.close()
def backup_loop() -> None:
    """Daemon loop: mirror leaderboard.db and the submission archive to the hub.

    Every BACKUP_INTERVAL seconds, compare the submissions row count to the
    last successfully pushed count and skip the tick when unchanged. Failures
    never advance last_count, so the next tick retries the full push.
    """
    token = _require_token()
    last_count = -1  # force one push on the first change after boot
    print(f"backup_loop started (interval={BACKUP_INTERVAL}s)", flush=True)
    while True:
        time.sleep(BACKUP_INTERVAL)
        n = _submission_count()
        if n == last_count:
            continue
        # 1) Leaderboard DB: lock-safe copy first, then upload the copy so the
        #    live DB is never read mid-write by the uploader.
        try:
            tmp = DATA_DIR / "_leaderboard.db.tmp"
            _atomic_db_copy(tmp)
            upload_file(
                path_or_fileobj=str(tmp),
                path_in_repo="leaderboard.db",
                repo_id=HF_REPO, repo_type="dataset",
                token=token,
                commit_message=f"backup leaderboard ({n} submissions)",
            )
            tmp.unlink()
        except Exception as e:
            print(f"leaderboard backup failed: {type(e).__name__}: {e}", flush=True)
            continue
        # 2) Submission CSVs: the hub diffs by content hash, so unchanged
        #    files transfer nothing.
        if ARCHIVE_DIR.exists() and any(ARCHIVE_DIR.rglob("*.csv")):
            try:
                upload_folder(
                    folder_path=str(ARCHIVE_DIR),
                    path_in_repo="submissions",
                    repo_id=HF_REPO, repo_type="dataset",
                    token=token,
                    commit_message=f"archive submissions ({n} total)",
                    allow_patterns=["**/*.csv"],
                )
            except Exception as e:
                print(f"submission archive failed: {type(e).__name__}: {e}", flush=True)
                # FIX: previously fell through and advanced last_count, so a
                # failed archive push was not retried until the submission
                # count changed again. Retry on the next tick instead.
                continue
        last_count = n
        print(f"backup pushed: {n} submissions", flush=True)
def main() -> int:
    """Bootstrap /data, start the backup daemon, and serve the Flask app."""
    bootstrap()
    # server/api.py must see paths consistent with what we just bootstrapped,
    # so export them before the `from api import app` below.
    for key, value in (
        ("GT_DIR", GT_DIR),
        ("GT_DB", DB_PATH),
        ("GT_ARCHIVE_DIR", ARCHIVE_DIR),
    ):
        os.environ.setdefault(key, str(value))
    backup_thread = threading.Thread(target=backup_loop, daemon=True)
    backup_thread.start()
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    from api import app  # noqa: E402 -- env vars must be set first
    print(f"serving on 0.0.0.0:{PORT}", flush=True)
    app.run(host="0.0.0.0", port=PORT, threaded=True, use_reloader=False)
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())