Spaces:

lanczos
/

graphtestbed

Running

Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on Apr 21

Commit

9cb903d

1 Parent(s): 00bf799

Async Kaggle scoring: submit + insert pending row + background poll

The HF Space reverse proxy kills any request that holds the connection
open past ~5 min, so synchronous Kaggle scoring (which can take 10+ min
end-to-end) was failing even when the underlying submit + scoring both
succeeded — the response just never reached the client.

New flow for kaggle backend:
1. Server submits to Kaggle synchronously (~30s upload).
2. Inserts a 'pending' row into submissions (NULL primary_metric).
3. Spawns a daemon thread that polls Kaggle every 15s for up to 30 min
and UPDATEs the row once scoring completes (or records secondary.error on failure).
4. Returns 200 immediately with pending=true and the run_id.
5. New GET /run/<run_id> lets clients poll for the resolved score.

Leaderboard queries now filter `WHERE primary_metric IS NOT NULL` so
pending rows don't pollute rankings.

Client (gtb submit) recognizes pending response and prints a follow-up
hint instead of trying to print scores it doesn't have yet.

Removed /admin/insert — the only honest path into the leaderboard is
through the scoring backend.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

graphtestbed/submit.py +6 -0
server/api.py +124 -106

graphtestbed/submit.py CHANGED Viewed

@@ -141,6 +141,12 @@ def submit(
     out = resp.json()
     print()
     print(f"✓ Scored  (run_id={out['run_id']})")
     print(f"  primary ({task_config(task)['metric']['primary']}): "
           f"{out['primary']}")

     out = resp.json()
     print()
+    if out.get("pending"):
+        print(f"✓ Submitted to Kaggle  (run_id={out['run_id']})")
+        print(f"  Scoring runs async (typically 5–15 min). Check back via:")
+        print(f"    curl {API_URL}/run/{out['run_id']}")
+        print(f"    gtb leaderboard {task}")
+        return
     print(f"✓ Scored  (run_id={out['run_id']})")
     print(f"  primary ({task_config(task)['metric']['primary']}): "
           f"{out['primary']}")

server/api.py CHANGED Viewed

@@ -130,28 +130,18 @@ def _score(task: str, sub_df: pd.DataFrame, cfg: dict) -> dict:
     }
-def _score_kaggle(competition: str, raw_csv: bytes, run_id: str,
-                  poll_interval: int = 15, timeout_s: int = 600) -> dict:
-    """Forward to Kaggle's grading API. Returns once Kaggle reports complete.
-    Submits via `kaggle competitions submit` with description=graphtestbed-<run_id>
-    so we can locate the entry in `kaggle competitions submissions`. Polls every
-    `poll_interval` seconds until the submission's status is `complete` or until
-    `timeout_s` elapses. Public/private scores both surface (private is what
-    counts for the historical Kaggle leaderboard).
     """
-    import csv
-    import io
     import subprocess
     import tempfile
-    import time
     description = f"graphtestbed-{run_id}"
     with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
         tmp.write(raw_csv)
         tmp_path = tmp.name
     try:
         sub = subprocess.run(
             ["kaggle", "competitions", "submit",
@@ -164,42 +154,62 @@ def _score_kaggle(competition: str, raw_csv: bytes, run_id: str,
                 f"stdout={sub.stdout.strip()[-500:]!r}; "
                 f"stderr={sub.stderr.strip()[-500:]!r}"
             )
-        deadline = time.monotonic() + timeout_s
-        while time.monotonic() < deadline:
-            time.sleep(poll_interval)
-            ls = subprocess.run(
-                ["kaggle", "competitions", "submissions",
-                 "-c", competition, "--csv"],
-                capture_output=True, text=True, timeout=60,
-            )
-            if ls.returncode != 0:
-                continue
-            for row in csv.DictReader(io.StringIO(ls.stdout)):
-                if row.get("description") != description:
-                    continue
-                status = (row.get("status") or "").lower()
-                if status == "complete":
-                    pub = row.get("publicScore") or ""
-                    priv = row.get("privateScore") or ""
-                    pub_f = float(pub) if pub else float("nan")
-                    return {
-                        "primary": round(pub_f, 3),
-                        "secondary": (
-                            {"private_score": round(float(priv), 3)} if priv
-                            else {}
-                        ),
-                        "n_rows": -1,  # Kaggle doesn't report row count
-                    }
-                if status in ("error", "failed"):
-                    err = row.get("errorDescription") or "unspecified"
-                    raise RuntimeError(f"kaggle scoring failed: {err}")
-                break  # found our row but pending — keep polling
-        raise TimeoutError(
-            f"kaggle scoring on {competition} did not complete within {timeout_s}s"
-        )
     finally:
         Path(tmp_path).unlink(missing_ok=True)
 def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None:
@@ -283,6 +293,10 @@ def submit():
         return jsonify({"error": f"schema check failed: {e}"}), 422
     backend = cfg.get("backend", "gt")
     try:
         if backend == "gt":
             scored = _score(task, sub_df, cfg)
@@ -293,7 +307,14 @@ def submit():
                     f"task '{task}' has backend=kaggle but no "
                     f"backend_config.competition"
                 )}), 500
-            scored = _score_kaggle(comp, raw, uuid.uuid4().hex[:12])
         else:
             return jsonify({"error": f"unknown backend '{backend}'"}), 500
     except FileNotFoundError:
@@ -301,8 +322,6 @@ def submit():
     except Exception as e:
         return jsonify({"error": f"{backend}-backend scoring failed: {e}"}), 500
-    run_id = uuid.uuid4().hex[:12]
-    now = dt.datetime.now(dt.timezone.utc).isoformat()
     conn = _db()
     if not dry:
         conn.execute(
@@ -321,19 +340,29 @@ def submit():
             out.parent.mkdir(parents=True, exist_ok=True)
             out.write_bytes(raw)
-    # Rank = how many distinct agents have a strictly better best-score on
-    # this task. The just-inserted row contributes to that count only if the
-    # SAME agent had a better prior submission (in which case rank doesn't
-    # change for them on this submission).
-    rank = conn.execute("""
-        SELECT COUNT(*) + 1 FROM (
-            SELECT agent, MAX(primary_metric) AS best
-            FROM submissions
-            WHERE task = ?
-            GROUP BY agent
-            HAVING best > ?
-        )
-    """, (task, scored["primary"])).fetchone()[0]
     conn.close()
     return jsonify({
@@ -347,6 +376,7 @@ def submit():
         "quota_remaining": "unlimited" if bypass else (quota - 1),
         "bypass": bypass,
         "dry": dry,
         "submitted_at": now,
     })
@@ -359,7 +389,7 @@ def leaderboard(task: str):
         SELECT agent, MAX(primary_metric) as best, COUNT(*) as n_subs,
                MIN(submitted_at) as first_seen
         FROM submissions
-        WHERE task = ?
         GROUP BY agent
         ORDER BY best DESC
     """, (task,)).fetchall()
@@ -381,6 +411,7 @@ def leaderboard_all():
     rows = conn.execute("""
         SELECT task, agent, MAX(primary_metric) as best
         FROM submissions
         GROUP BY task, agent
     """).fetchall()
     conn.close()
@@ -435,49 +466,35 @@ def admin_delete():
     })
-@app.post("/admin/insert")
-def admin_insert():
-    """Insert a leaderboard row directly. Bypass-key gated.
-    Use for backends we can't proxy server-side (e.g. when Kaggle creds are
-    only available on the maintainer's machine — they run the submit + poll
-    locally and POST the resulting score here).
-    Body: JSON {"task": "...", "agent": "...", "primary": float,
-                "secondary": {...}, "n_rows": int|null, "sha256": str|null}
     """
-    import datetime as dt
-    import json as _json
-    import uuid as _uuid
-    sent_key = request.headers.get("X-Bypass-Key", "").strip()
-    if not (BYPASS_KEY and sent_key
-            and __import__("hmac").compare_digest(sent_key, BYPASS_KEY)):
-        return jsonify({"error": "bypass key required"}), 403
-    payload = request.get_json(silent=True) or {}
-    task = payload.get("task")
-    agent = payload.get("agent")
-    primary = payload.get("primary")
-    if not (task and agent and isinstance(primary, (int, float))):
-        return jsonify({"error": "task, agent, primary required"}), 400
-    secondary = payload.get("secondary") or {}
-    n_rows = int(payload.get("n_rows") or -1)
-    sha = payload.get("sha256") or "manual_insert"
-    run_id = _uuid.uuid4().hex[:12]
-    now = dt.datetime.now(dt.timezone.utc).isoformat()
     conn = _db()
-    conn.execute(
-        "INSERT INTO submissions VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
-        (run_id, task, agent, float(primary), _json.dumps(secondary),
-         sha, n_rows, "admin", now),
-    )
-    conn.commit()
     return jsonify({
-        "run_id": run_id,
-        "task": task,
-        "agent": agent,
-        "primary": primary,
-        "secondary": secondary,
     })
@@ -1249,7 +1266,8 @@ def landing():
         rows = conn.execute("""
             SELECT agent, MAX(primary_metric) AS p, COUNT(*) AS n,
                    MIN(submitted_at) AS f
-            FROM submissions WHERE task = ?
             GROUP BY agent ORDER BY p DESC
         """, (name,)).fetchall()
         n_rows_cfg = s.get("n_rows")

     }
+def _kaggle_submit(competition: str, raw_csv: bytes, run_id: str) -> str:
+    """Synchronously submit a CSV to Kaggle. Returns the description string used
+    to identify the submission; the caller is responsible for polling for the
+    score later via `_kaggle_poll_loop`. Raises on submit failure.
     """
     import subprocess
     import tempfile
     description = f"graphtestbed-{run_id}"
     with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
         tmp.write(raw_csv)
         tmp_path = tmp.name
     try:
         sub = subprocess.run(
             ["kaggle", "competitions", "submit",
                 f"stdout={sub.stdout.strip()[-500:]!r}; "
                 f"stderr={sub.stderr.strip()[-500:]!r}"
             )
     finally:
         Path(tmp_path).unlink(missing_ok=True)
+    return description
def _kaggle_poll_loop(competition: str, description: str, run_id: str,
                      poll_interval: int = 15, timeout_s: int = 1800) -> None:
    """Poll Kaggle for a submission's score and UPDATE the matching DB row.

    Meant to be the target of a background thread, so it must never raise:
    an uncaught exception would silently kill the thread and leave the row
    without a result forever. Every failure mode (CLI error, unparseable
    score, Kaggle-side scoring failure, timeout) is therefore converted into
    a NULL ``primary_metric`` plus an ``error`` entry in the row's
    ``secondary`` JSON, and unexpected problems are logged.

    The row identified by ``run_id`` must already exist; this function only
    issues an UPDATE.

    Args:
        competition: Kaggle competition slug passed to the CLI via ``-c``.
        description: Submission description used to find our entry in the
            CLI's CSV listing.
        run_id: Key of the ``submissions`` row to update.
        poll_interval: Seconds to sleep before each poll.
        timeout_s: Total polling budget in seconds.
    """
    import csv
    import io
    import json as _json
    import logging
    import math
    import subprocess
    import time

    log = logging.getLogger(__name__)
    deadline = time.monotonic() + timeout_s
    final = None  # (primary | None, secondary_dict) once resolved

    while final is None and time.monotonic() < deadline:
        time.sleep(poll_interval)
        try:
            ls = subprocess.run(
                ["kaggle", "competitions", "submissions",
                 "-c", competition, "--csv"],
                capture_output=True, text=True, timeout=60,
            )
        except (subprocess.TimeoutExpired, OSError) as e:
            # A hung or missing CLI is transient from our point of view:
            # keep polling until the overall deadline instead of dying.
            log.warning("kaggle poll for run %s failed: %s", run_id, e)
            continue
        if ls.returncode != 0:
            continue

        for row in csv.DictReader(io.StringIO(ls.stdout)):
            if row.get("description") != description:
                continue
            # NOTE(review): some kaggle CLI versions appear to print the enum
            # form (e.g. "SubmissionStatus.COMPLETE"); keeping only the last
            # dotted component accepts both spellings. Confirm against the
            # CLI version deployed.
            status = (row.get("status") or "").lower().rsplit(".", 1)[-1]
            if status == "complete":
                pub = row.get("publicScore") or ""
                priv = row.get("privateScore") or ""
                try:
                    primary = round(float(pub), 3) if pub else None
                    secondary = (
                        {"private_score": round(float(priv), 3)} if priv
                        else {}
                    )
                except ValueError as e:
                    final = (None, {"error": f"unparseable kaggle score: {e}"})
                else:
                    if primary is None or math.isnan(primary):
                        # Without a usable public score the row stays NULL;
                        # record why so it is not mistaken for "still pending".
                        primary = None
                        secondary["error"] = (
                            "kaggle reported complete without a public score"
                        )
                    final = (primary, secondary)
            elif status in ("error", "failed"):
                err = row.get("errorDescription") or "unspecified"
                final = (None, {"error": f"kaggle scoring failed: {err}"})
            break  # found our row; if still pending, poll again

    if final is None:
        final = (None, {"error": f"polled {timeout_s}s without complete"})
    primary, secondary = final

    # Failed/unknown scores are stored as NULL (never NaN) in primary_metric.
    try:
        conn = _db()
        try:
            conn.execute(
                "UPDATE submissions SET primary_metric = ?, secondary = ? "
                "WHERE run_id = ?",
                (primary, _json.dumps(secondary), run_id),
            )
            conn.commit()
        finally:
            conn.close()
    except Exception:
        # Thread top-level boundary: log instead of propagating.
        log.exception("failed to record kaggle result for run %s", run_id)
 def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None:
         return jsonify({"error": f"schema check failed: {e}"}), 422
     backend = cfg.get("backend", "gt")
+    run_id = uuid.uuid4().hex[:12]
+    now = dt.datetime.now(dt.timezone.utc).isoformat()
+    pending = False
     try:
         if backend == "gt":
             scored = _score(task, sub_df, cfg)
                     f"task '{task}' has backend=kaggle but no "
                     f"backend_config.competition"
                 )}), 500
+            # Submit synchronously (fast, ~30s). Polling for the score happens
+            # in a background thread — we insert a 'pending' row immediately so
+            # the client never has to hold open a long-running connection
+            # (HF Space's reverse proxy kills these around the 5-min mark).
+            description = _kaggle_submit(comp, raw, run_id)
+            scored = {"primary": None, "secondary": {"status": "pending"},
+                      "n_rows": -1}
+            pending = True
         else:
             return jsonify({"error": f"unknown backend '{backend}'"}), 500
     except FileNotFoundError:
     except Exception as e:
         return jsonify({"error": f"{backend}-backend scoring failed: {e}"}), 500
     conn = _db()
     if not dry:
         conn.execute(
             out.parent.mkdir(parents=True, exist_ok=True)
             out.write_bytes(raw)
+    # For Kaggle backend, kick off the async poll AFTER inserting the row so
+    # the worker has a row to UPDATE.
+    if pending and not dry:
+        import threading
+        threading.Thread(
+            target=_kaggle_poll_loop,
+            args=(comp, description, run_id),
+            daemon=True,
+        ).start()
+    # Rank only meaningful for completed scores. Pending Kaggle entries skip it.
+    if pending:
+        rank = None
+    else:
+        rank = conn.execute("""
+            SELECT COUNT(*) + 1 FROM (
+                SELECT agent, MAX(primary_metric) AS best
+                FROM submissions
+                WHERE task = ?
+                GROUP BY agent
+                HAVING best > ?
+            )
+        """, (task, scored["primary"])).fetchone()[0]
     conn.close()
     return jsonify({
         "quota_remaining": "unlimited" if bypass else (quota - 1),
         "bypass": bypass,
         "dry": dry,
+        "pending": pending,
         "submitted_at": now,
     })
         SELECT agent, MAX(primary_metric) as best, COUNT(*) as n_subs,
                MIN(submitted_at) as first_seen
         FROM submissions
+        WHERE task = ? AND primary_metric IS NOT NULL
         GROUP BY agent
         ORDER BY best DESC
     """, (task,)).fetchall()
     rows = conn.execute("""
         SELECT task, agent, MAX(primary_metric) as best
         FROM submissions
+        WHERE primary_metric IS NOT NULL
         GROUP BY task, agent
     """).fetchall()
     conn.close()
     })
@app.get("/run/<run_id>")
def run_status(run_id: str):
    """Look up a single submission by ``run_id``.

    Intended for kaggle-backend submissions, where /submit answers with a
    'pending' record whose score is filled in later by the background poller.

    Returns:
        404 with an ``error`` message when no such row exists; otherwise a
        JSON object with the row's fields and a derived ``status``:

        * ``"failed"``   - ``secondary`` carries an ``error`` entry,
        * ``"pending"``  - no error and ``primary_metric`` is still NULL,
        * ``"complete"`` - a score is present.
    """
    # Local import, matching the function-scope imports used by sibling
    # helpers in this file.
    import json as _json

    conn = _db()
    try:
        # The timestamp column is `submitted_at`, as in the leaderboard
        # queries in this file.
        row = conn.execute("""
            SELECT run_id, task, agent, primary_metric, secondary, sha256,
                   n_rows, submitted_at
            FROM submissions WHERE run_id = ?
        """, (run_id,)).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({"error": f"no run '{run_id}'"}), 404

    rid, task, agent, primary, secondary, sha, n_rows, submitted_at = row
    sec = _json.loads(secondary) if secondary else {}

    # Check for an error BEFORE the NULL test: a failed Kaggle run is stored
    # with primary_metric = NULL plus secondary.error, so testing NULL first
    # would report it as "pending" forever.
    if sec.get("error"):
        status = "failed"
    elif primary is None:
        status = "pending"
    else:
        status = "complete"

    return jsonify({
        "run_id": rid, "task": task, "agent": agent,
        "primary": primary, "secondary": sec,
        "n_rows": n_rows, "submitted_at": submitted_at,
        "status": status,
    })
         rows = conn.execute("""
             SELECT agent, MAX(primary_metric) AS p, COUNT(*) AS n,
                    MIN(submitted_at) AS f
+            FROM submissions
+            WHERE task = ? AND primary_metric IS NOT NULL
             GROUP BY agent ORDER BY p DESC
         """, (name,)).fetchall()
         n_rows_cfg = s.get("n_rows")