Spaces:
Running
Running
| """GraphTestbed scoring API. | |
| Single-file Flask app. Holds ground_truth files locally, scores submissions, | |
| returns metrics, appends to leaderboard. No DB server (the leaderboard is a | |
| local SQLite file), no auth — submitter identity is just whatever string the | |
| client sends in `agent`. | |
| Deployment unit: the `server` branch (or this `server/` subdir on a deploy | |
| host). Ground-truth files live at $GT_DIR (default /var/graphtestbed/gt/), | |
| populated separately from git — they MUST NOT be committed. | |
| Endpoints: | |
| POST /submit | |
| form: task=<task>, agent=<name>, file=<csv> | |
| → 200 { primary, secondary, n_rows, leaderboard_rank, run_id, | |
| quota_remaining } | |
| → 4xx { error } | |
| GET /leaderboard/<task> | |
| → 200 [ { agent, primary, n_submissions, first_seen }, ... ] | |
| one row per agent (best score), sorted by primary descending | |
| GET /healthz | |
| → 200 { status: "ok", tasks: [...], gt_present: [...], | |
| quota_per_day, uptime_unix } | |
| """ | |
| from __future__ import annotations | |
| import datetime as dt | |
| import hashlib | |
| import json | |
| import os | |
| import sqlite3 | |
| import time | |
| import uuid | |
| from pathlib import Path | |
| import pandas as pd | |
| import yaml | |
| from flask import Flask, jsonify, render_template_string, request | |
| GT_DIR = Path(os.environ.get("GT_DIR", "/var/graphtestbed/gt")) | |
| DB_PATH = Path(os.environ.get("GT_DB", "/var/graphtestbed/leaderboard.db")) | |
| ARCHIVE_DIR = ( | |
| Path(os.environ["GT_ARCHIVE_DIR"]) | |
| if os.environ.get("GT_ARCHIVE_DIR") else None | |
| ) | |
| MANIFEST_PATH = Path(os.environ.get( | |
| "GT_MANIFEST", | |
| Path(__file__).resolve().parents[1] / "datasets" / "manifest.yaml", | |
| )) | |
| QUOTA_PER_DAY = int(os.environ.get("GT_QUOTA", "5")) | |
| BYPASS_KEY = os.environ.get("GT_BYPASS_KEY", "").strip() or None | |
| # Sentinel for kaggle-backend rows whose score is still being polled. The | |
| # submissions table has primary_metric NOT NULL so we can't store NULL — | |
| # leaderboard queries filter `primary_metric > -1`. | |
| _PENDING_SENTINEL = -1.0 | |
| MAX_UPLOAD_BYTES = 50 * 1024 * 1024 # 50 MB hard cap | |
# The WSGI application. MAX_CONTENT_LENGTH caps request bodies at
# MAX_UPLOAD_BYTES.
app = Flask(__name__)
app.config.update(MAX_CONTENT_LENGTH=MAX_UPLOAD_BYTES)
def _manifest() -> dict:
    """Parse and return the task manifest stored at ``MANIFEST_PATH``.

    The file is read from disk on every call (no caching), and parsed with
    ``yaml.safe_load``.
    """
    manifest_text = MANIFEST_PATH.read_text()
    return yaml.safe_load(manifest_text)
def _db() -> sqlite3.Connection:
    """Open a connection to the leaderboard database, creating it on demand.

    Ensures the parent directory of ``DB_PATH`` exists and that the
    ``submissions`` table is present, then returns the open connection. The
    caller owns the connection and is responsible for closing it.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS submissions (
            run_id TEXT PRIMARY KEY,
            task TEXT NOT NULL,
            agent TEXT NOT NULL,
            primary_metric REAL NOT NULL,
            secondary_json TEXT NOT NULL,
            submission_sha256 TEXT NOT NULL,
            n_rows INTEGER NOT NULL,
            submitter_ip TEXT,
            submitted_at TEXT NOT NULL
        )
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(DB_PATH)
    connection.execute(create_table_sql)
    return connection
def _quota_remaining(task: str, ip: str) -> int:
    """Return how many submissions ``ip`` may still make for ``task``.

    Counts the rows recorded for this (task, IP) pair during the last 24
    hours and subtracts that from ``QUOTA_PER_DAY``. The result is clamped
    at zero, so it is never negative.
    """
    # Timestamps are stored as UTC ISO-8601 strings, so a plain string
    # comparison against an ISO cutoff orders them chronologically.
    cutoff = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)).isoformat()
    conn = _db()
    try:
        (used,) = conn.execute(
            "SELECT COUNT(*) FROM submissions "
            "WHERE task = ? AND submitter_ip = ? AND submitted_at > ?",
            (task, ip, cutoff),
        ).fetchone()
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()
    return max(0, QUOTA_PER_DAY - used)
def _score(task: str, sub_df: pd.DataFrame, cfg: dict) -> dict:
    """Score a submission against the locally stored ground truth.

    Args:
        task: Task name; the ground truth is read from ``GT_DIR/<task>.csv``.
        sub_df: Parsed submission with the id and prediction columns named in
            ``cfg["submission_schema"]``.
        cfg: Manifest entry for the task. Reads ``submission_schema``
            (``id_col``, ``pred_col``, optional ``pred_dtype``) and ``metric``
            (``primary``, optional ``secondary`` list).

    Returns:
        ``{"primary": float, "secondary": {name: float}, "n_rows": int}``
        with every metric rounded to three decimals.

    Raises:
        FileNotFoundError: The ground-truth CSV is not deployed.
        ValueError: The submission does not cover every ground-truth row, or
            the manifest names a metric this server cannot compute.
    """
    from sklearn.metrics import (
        average_precision_score, f1_score, roc_auc_score,
    )

    schema = cfg["submission_schema"]
    metric = cfg["metric"]
    id_col = schema["id_col"]

    gt = pd.read_csv(GT_DIR / f"{task}.csv")[[id_col, "Label"]]
    sub_renamed = sub_df.rename(columns={schema["pred_col"]: "_pred"})
    merged = gt.merge(sub_renamed, on=id_col, how="inner")
    if len(merged) != len(gt):
        raise ValueError(
            f"Coverage mismatch: scored {len(merged)} / expected {len(gt)} rows"
        )

    y_true = merged["Label"].astype(int)
    if schema.get("pred_dtype") == "binary":
        # Hard labels: the score used for ranking metrics is the label itself.
        y_pred = merged["_pred"].astype(int)
        y_score = y_pred.astype(float)
    else:
        # Probabilities: threshold at 0.5 to derive the hard label for F1.
        y_score = merged["_pred"].astype(float)
        y_pred = (y_score >= 0.5).astype(int)

    # Lazily evaluated so only the metrics the manifest asks for are computed.
    funcs = {
        "auc_roc": lambda: roc_auc_score(y_true, y_score),
        "auc_pr": lambda: average_precision_score(y_true, y_score),
        "f1": lambda: f1_score(y_true, y_pred),
    }

    primary_name = metric["primary"]
    # A manifest entry without secondary metrics (missing key or empty value)
    # is treated as "no secondary metrics" instead of failing.
    secondary_names = list(metric.get("secondary") or [])
    unknown = [m for m in [primary_name, *secondary_names] if m not in funcs]
    if unknown:
        raise ValueError(
            f"unsupported metric(s) {unknown}; supported: {sorted(funcs)}"
        )

    return {
        "primary": round(float(funcs[primary_name]()), 3),
        "secondary": {
            name: round(float(funcs[name]()), 3) for name in secondary_names
        },
        "n_rows": len(merged),
    }
def _kaggle_submit(competition: str, raw_csv: bytes, run_id: str) -> str:
    """Upload ``raw_csv`` to a Kaggle competition through the ``kaggle`` CLI.

    The bytes are written to a temporary ``.csv`` file, handed to
    ``kaggle competitions submit`` (120 s timeout), and the temporary file is
    removed afterwards regardless of the outcome.

    Returns:
        The submission description (``graphtestbed-<run_id>``), which is what
        `_kaggle_poll_loop` later uses to find this submission's score.

    Raises:
        RuntimeError: The CLI exited with a non-zero return code.
    """
    import subprocess
    import tempfile

    description = f"graphtestbed-{run_id}"

    # delete=False: the file must outlive the `with` so the CLI can read it.
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as handle:
        handle.write(raw_csv)
        tmp_path = handle.name

    command = [
        "kaggle", "competitions", "submit",
        "-c", competition, "-f", tmp_path, "-m", description,
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=120,
        )
        if result.returncode == 0:
            return description
        # Only the tail of each stream is kept to bound the message size.
        raise RuntimeError(
            f"kaggle submit failed (rc={result.returncode}); "
            f"stdout={result.stdout.strip()[-500:]!r}; "
            f"stderr={result.stderr.strip()[-500:]!r}"
        )
    finally:
        Path(tmp_path).unlink(missing_ok=True)
def _kaggle_poll_loop(competition: str, description: str, run_id: str,
                      poll_interval: int = 15, timeout_s: int = 1800) -> None:
    """Poll Kaggle for the submission's score and UPDATE the matching DB row.

    Designed to run in a daemon thread, so it must never raise: CLI failures
    are logged and retried until the deadline, and the final outcome (score
    or error) is written to the row. The DB row must already exist (the
    caller inserted it as 'pending' before spawning).

    Args:
        competition: Kaggle competition slug passed to the CLI.
        description: Submission description used to find our row in the
            CLI's submission listing.
        run_id: Primary key of the ``submissions`` row to update.
        poll_interval: Seconds to sleep before each poll.
        timeout_s: Total seconds to keep polling before giving up.
    """
    import csv
    import io
    import json as _json
    import logging
    import math
    import subprocess
    import time

    log = logging.getLogger(__name__)
    deadline = time.monotonic() + timeout_s
    final = None  # tuple (primary, secondary_dict) or None on timeout/error

    while time.monotonic() < deadline and final is None:
        time.sleep(poll_interval)
        try:
            ls = subprocess.run(
                ["kaggle", "competitions", "submissions", "-c", competition, "--csv"],
                capture_output=True, text=True, timeout=60,
            )
        except (subprocess.SubprocessError, OSError) as exc:
            # A hung or missing CLI must not kill the thread; retry until the
            # overall deadline expires.
            log.warning("kaggle poll for run %s failed: %s", run_id, exc)
            continue
        if ls.returncode != 0:
            continue

        for row in csv.DictReader(io.StringIO(ls.stdout)):
            if row.get("description") != description:
                continue
            # Kaggle prints status as "SubmissionStatus.COMPLETE" (enum repr),
            # not just "complete" — match the suffix after the last dot.
            status_raw = (row.get("status") or "")
            status = status_raw.rsplit(".", 1)[-1].lower()
            if status == "complete":
                pub = row.get("publicScore") or ""
                priv = row.get("privateScore") or ""
                try:
                    final = (
                        round(float(pub), 3) if pub else float("nan"),
                        {"private_score": round(float(priv), 3)} if priv else {},
                    )
                except ValueError:
                    # Non-numeric score text: record it as a failure rather
                    # than crashing the poller.
                    final = (float("nan"), {
                        "error": (
                            f"unparseable kaggle score: "
                            f"public={pub!r} private={priv!r}"
                        ),
                    })
            elif status in ("error", "failed"):
                err = row.get("errorDescription") or "unspecified"
                final = (float("nan"), {"error": f"kaggle scoring failed: {err}"})
            break  # found our row; if still pending we poll again

    if final is None:
        final = (-1.0, {"error": f"polled {timeout_s}s without complete"})

    primary, secondary = final
    # On failure leave the sentinel so it stays out of the leaderboard.
    primary_db = -1.0 if math.isnan(primary) else primary

    try:
        conn = _db()
        try:
            conn.execute(
                "UPDATE submissions SET primary_metric = ?, secondary_json = ? "
                "WHERE run_id = ?",
                (primary_db, _json.dumps(secondary), run_id),
            )
            conn.commit()
        finally:
            conn.close()
    except (sqlite3.Error, OSError):
        log.exception("could not record kaggle result for run %s", run_id)
| def _validate_schema(sub_df: pd.DataFrame, cfg: dict) -> None: | |
| s = cfg["submission_schema"] | |
| if list(sub_df.columns) != [s["id_col"], s["pred_col"]]: | |
| raise ValueError( | |
| f"columns must be [{s['id_col']}, {s['pred_col']}], " | |
| f"got {list(sub_df.columns)}" | |
| ) | |
| if s.get("n_rows") not in ("TBD", None) and len(sub_df) != s["n_rows"]: | |
| raise ValueError( | |
| f"row count {len(sub_df)} != expected {s['n_rows']}" | |
| ) | |
| if sub_df[s["id_col"]].duplicated().any(): | |
| raise ValueError(f"duplicate IDs in {s['id_col']}") | |
| dtype = s.get("pred_dtype") | |
| if dtype == "float": | |
| try: | |
| preds = sub_df[s["pred_col"]].astype(float) | |
| except (TypeError, ValueError) as e: | |
| raise ValueError(f"pred_col not float-castable: {e}") | |
| if (preds < 0).any() or (preds > 1).any(): | |
| raise ValueError("predictions must lie in [0, 1]") | |
| elif dtype == "binary": | |
| try: | |
| preds = sub_df[s["pred_col"]].astype(float) | |
| except (TypeError, ValueError) as e: | |
| raise ValueError(f"pred_col not numeric: {e}") | |
| bad = ~preds.isin([0.0, 1.0]) | |
| if bad.any(): | |
| raise ValueError( | |
| f"binary submission must contain only 0 or 1 " | |
| f"(no probabilities); got {int(bad.sum())} other values" | |
| ) | |
def submit():
    """Score an uploaded submission and record it on the leaderboard.

    Reads the form fields ``task``, ``agent`` and ``file`` (plus ``dry`` when
    a valid ``X-Bypass-Key`` header is sent). Returns a JSON body describing
    the scored run, or ``{error}`` with a 4xx/5xx status:

    * 400 missing fields or unparseable CSV
    * 404 unknown task
    * 422 schema check failed
    * 429 per-IP daily quota exhausted
    * 500 scoring/backend failure or manifest misconfiguration
    * 503 ground truth not deployed
    """
    import hmac
    import io
    import threading

    task = request.form.get("task")
    agent = request.form.get("agent")
    file = request.files.get("file")
    # NOTE(review): the whole X-Forwarded-For value (possibly a
    # comma-separated proxy chain) is used as the quota key — confirm this is
    # the intended identity.
    ip = request.headers.get("X-Forwarded-For", request.remote_addr or "unknown")

    # Bypass: maintainer/CI key skips quota and (optionally with dry=1) the
    # leaderboard insert. Compared with hmac.compare_digest to avoid timing
    # leaks against the hex-string secret. Both sides are encoded to bytes
    # because compare_digest rejects non-ASCII str with a TypeError, which
    # would turn a garbage header into a 500.
    sent_key = request.headers.get("X-Bypass-Key", "").strip()
    bypass = bool(
        BYPASS_KEY and sent_key
        and hmac.compare_digest(sent_key.encode("utf-8"),
                                BYPASS_KEY.encode("utf-8"))
    )
    dry = bypass and request.form.get("dry") == "1"

    if not (task and agent and file):
        return jsonify({"error": "form fields required: task, agent, file"}), 400

    manifest = _manifest()
    if task not in manifest:
        return jsonify({"error": f"unknown task '{task}'", "known": sorted(manifest)}), 404
    cfg = manifest[task]

    if bypass:
        quota = -1
    else:
        quota = _quota_remaining(task, ip)
        if quota <= 0:
            return jsonify({
                "error": f"quota exceeded ({QUOTA_PER_DAY}/day per IP per task)",
                "task": task,
            }), 429

    raw = file.read()
    sub_sha = hashlib.sha256(raw).hexdigest()
    try:
        sub_df = pd.read_csv(io.BytesIO(raw))
    except Exception as e:
        return jsonify({"error": f"could not parse CSV: {e}"}), 400

    try:
        _validate_schema(sub_df, cfg)
    except ValueError as e:
        return jsonify({"error": f"schema check failed: {e}"}), 422

    backend = cfg.get("backend", "gt")
    run_id = uuid.uuid4().hex[:12]
    now = dt.datetime.now(dt.timezone.utc).isoformat()
    pending = False
    try:
        if backend == "gt":
            scored = _score(task, sub_df, cfg)
        elif backend == "kaggle":
            comp = cfg.get("backend_config", {}).get("competition")
            if not comp:
                return jsonify({"error": (
                    f"task '{task}' has backend=kaggle but no "
                    f"backend_config.competition"
                )}), 500
            # Submit synchronously (fast, ~30s). Polling for the score happens
            # in a background thread — we insert a 'pending' row immediately so
            # the client never has to hold open a long-running connection
            # (HF Space's reverse proxy kills these around the 5-min mark).
            description = _kaggle_submit(comp, raw, run_id)
            scored = {"primary": _PENDING_SENTINEL,
                      "secondary": {"status": "pending"},
                      "n_rows": -1}
            pending = True
        else:
            return jsonify({"error": f"unknown backend '{backend}'"}), 500
    except FileNotFoundError:
        return jsonify({"error": f"ground truth not deployed for task '{task}'"}), 503
    except Exception as e:
        return jsonify({"error": f"{backend}-backend scoring failed: {e}"}), 500

    conn = _db()
    try:
        if not dry:
            conn.execute(
                "INSERT INTO submissions VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (run_id, task, agent, scored["primary"],
                 json.dumps(scored["secondary"]), sub_sha, scored["n_rows"], ip, now),
            )
            conn.commit()
            # Archive the raw CSV when GT_ARCHIVE_DIR is configured, so the
            # deploy host can later prove what each scored entry was. Filename
            # embeds the agent + run_id so multiple submissions don't collide.
            # NOTE(review): assumed to belong to the non-dry path (source
            # indentation was lost) — confirm.
            if ARCHIVE_DIR is not None:
                safe_agent = "".join(
                    c if c.isalnum() or c in "-_." else "_" for c in agent
                )
                out = ARCHIVE_DIR / task / f"{safe_agent}-{run_id}.csv"
                out.parent.mkdir(parents=True, exist_ok=True)
                out.write_bytes(raw)

        # For Kaggle backend, kick off the async poll AFTER inserting the row
        # so the worker has a row to UPDATE.
        if pending and not dry:
            threading.Thread(
                target=_kaggle_poll_loop,
                args=(comp, description, run_id),
                daemon=True,
            ).start()

        # Rank only meaningful for completed scores. Pending Kaggle entries
        # skip it. Rank = 1 + number of agents whose best beats this score.
        if pending:
            rank = None
        else:
            rank = conn.execute("""
                SELECT COUNT(*) + 1 FROM (
                    SELECT agent, MAX(primary_metric) AS best
                    FROM submissions
                    WHERE task = ?
                    GROUP BY agent
                    HAVING best > ?
                )
            """, (task, scored["primary"])).fetchone()[0]
    finally:
        # Release the handle even if the insert, archive write or rank query
        # raises.
        conn.close()

    return jsonify({
        "run_id": run_id,
        "task": task,
        "agent": agent,
        "primary": scored["primary"],
        "secondary": scored["secondary"],
        "n_rows": scored["n_rows"],
        "leaderboard_rank": rank,
        "quota_remaining": "unlimited" if bypass else (quota - 1),
        "bypass": bypass,
        "dry": dry,
        "pending": pending,
        "submitted_at": now,
    })
def leaderboard(task: str):
    """Return each agent's best score for ``task``, highest first.

    Rows still carrying the pending sentinel (``primary_metric`` not greater
    than -1) are excluded. Each entry also reports how many qualifying
    submissions the agent has and the timestamp of the earliest one.
    """
    query = """
        SELECT agent, MAX(primary_metric) as best, COUNT(*) as n_subs,
               MIN(submitted_at) as first_seen
        FROM submissions
        WHERE task = ? AND primary_metric > -1
        GROUP BY agent
        ORDER BY best DESC
    """
    conn = _db()
    records = conn.execute(query, (task,)).fetchall()
    conn.close()

    entries = []
    for agent, best, n_subs, first_seen in records:
        entries.append({
            "agent": agent,
            "primary": best,
            "n_submissions": n_subs,
            "first_seen": first_seen,
        })
    return jsonify(entries)
def leaderboard_all():
    """Build the cross-task leaderboard: one row per agent.

    An agent gets an ``average`` only when it has a score on every task in
    the manifest; otherwise ``average`` is None (shown as '—') and the agent
    ranks below all complete ones. Agents with no score on any manifest task
    are omitted.
    """
    tasks = sorted(_manifest())

    conn = _db()
    best_rows = conn.execute("""
        SELECT task, agent, MAX(primary_metric) as best
        FROM submissions
        WHERE primary_metric > -1
        GROUP BY task, agent
    """).fetchall()
    conn.close()

    # agent -> {task -> best score}
    by_agent: dict[str, dict[str, float]] = {}
    for task_name, agent_name, best in best_rows:
        if agent_name not in by_agent:
            by_agent[agent_name] = {}
        by_agent[agent_name][task_name] = float(best)

    out = []
    for agent_name, scores in by_agent.items():
        covered = [t for t in tasks if t in scores]
        if len(covered) == 0:
            continue
        if len(covered) == len(tasks):
            avg = sum(scores[t] for t in covered) / len(covered)
            average = round(avg, 3)
        else:
            average = None
        out.append({
            "agent": agent_name,
            "average": average,
            "n_tasks": len(covered),
            "per_task": {t: scores.get(t) for t in tasks},
        })

    def _rank_key(row):
        # Complete agents first (by average desc), then incomplete ones
        # (by number of tasks covered desc), with the name as tie-breaker.
        has_average = row["average"] is not None
        return (
            0 if has_average else 1,
            -(row["average"] if has_average else 0),
            -row["n_tasks"],
            row["agent"],
        )

    out.sort(key=_rank_key)
    return jsonify({"tasks": tasks, "rows": out})
def admin_delete():
    """Delete leaderboard entries by (task, agent). Bypass-key gated.

    Body: JSON {"entries": [{"task": "...", "agent": "..."}, ...]}
    Returns count deleted per pair + total. Entries that are not objects or
    lack a task/agent are skipped. Responds 403 without a valid
    ``X-Bypass-Key`` and 400 when the body has no non-empty ``entries`` list.
    """
    import hmac

    sent_key = request.headers.get("X-Bypass-Key", "").strip()
    # Compare as bytes: compare_digest raises TypeError on non-ASCII str.
    if not (BYPASS_KEY and sent_key
            and hmac.compare_digest(sent_key.encode("utf-8"),
                                    BYPASS_KEY.encode("utf-8"))):
        return jsonify({"error": "bypass key required"}), 403

    payload = request.get_json(silent=True)
    # A JSON body that is not an object (e.g. a bare list) has no "entries".
    entries = payload.get("entries") if isinstance(payload, dict) else None
    if not isinstance(entries, list) or not entries:
        return jsonify({"error": "body must be {entries: [{task, agent}, ...]}"}), 400

    deleted = []
    conn = _db()
    try:
        for e in entries:
            if not isinstance(e, dict):
                continue
            t, a = e.get("task"), e.get("agent")
            if not (t and a):
                continue
            cur = conn.execute(
                "DELETE FROM submissions WHERE task = ? AND agent = ?", (t, a)
            )
            deleted.append({"task": t, "agent": a, "rows": cur.rowcount})
        conn.commit()
    finally:
        conn.close()

    return jsonify({
        "deleted": deleted,
        "total_rows": sum(d["rows"] for d in deleted),
    })
def admin_insert():
    """Insert a leaderboard row directly. Bypass-key gated; intended for
    maintainer corrections (e.g. backfilling a known score whose CSV is no
    longer available). For routine scoring, use POST /submit.

    Body: JSON {"task": "...", "agent": "...", "primary": float,
    "secondary": {...}, "n_rows": int|null, "sha256": str|null}

    Responds 403 without a valid ``X-Bypass-Key`` and 400 when task/agent are
    missing, ``primary`` is not a finite number, or ``n_rows`` is not
    integer-like.
    """
    import hmac
    import math

    sent_key = request.headers.get("X-Bypass-Key", "").strip()
    # Compare as bytes: compare_digest raises TypeError on non-ASCII str.
    if not (BYPASS_KEY and sent_key
            and hmac.compare_digest(sent_key.encode("utf-8"),
                                    BYPASS_KEY.encode("utf-8"))):
        return jsonify({"error": "bypass key required"}), 403

    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}
    task = payload.get("task")
    agent = payload.get("agent")
    primary = payload.get("primary")

    # bool is a subclass of int, and NaN/inf would be stored as NULL or break
    # ordering, so neither counts as a usable score.
    primary_ok = (
        isinstance(primary, (int, float))
        and not isinstance(primary, bool)
        and math.isfinite(primary)
    )
    if not (task and agent and primary_ok):
        return jsonify({"error": "task, agent, primary required"}), 400

    secondary = payload.get("secondary") or {}
    try:
        n_rows = int(payload.get("n_rows") or -1)
    except (TypeError, ValueError):
        return jsonify({"error": "n_rows must be an integer or null"}), 400
    sha = payload.get("sha256") or "manual_insert"
    run_id = uuid.uuid4().hex[:12]
    now = dt.datetime.now(dt.timezone.utc).isoformat()

    conn = _db()
    try:
        conn.execute(
            "INSERT INTO submissions VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (run_id, task, agent, float(primary), json.dumps(secondary),
             sha, n_rows, "admin", now),
        )
        conn.commit()
    finally:
        conn.close()

    return jsonify({"run_id": run_id, "task": task, "agent": agent,
                    "primary": primary, "secondary": secondary})
def admin_repoll(run_id: str):
    """Re-trigger the Kaggle poll loop for a stuck/failed pending row, without
    re-submitting to Kaggle. Useful after fixing a poller bug — the existing
    Kaggle submission still has its score, we just need to read it.

    Bypass-key gated (403). Responds 404 when ``run_id`` is unknown and 400
    when the run's task has no ``backend_config.competition`` in the manifest.
    On success a daemon thread is started and ``status: "repolling"`` is
    returned immediately.
    """
    import hmac
    import threading

    sent_key = request.headers.get("X-Bypass-Key", "").strip()
    # Compare as bytes: compare_digest raises TypeError on non-ASCII str.
    if not (BYPASS_KEY and sent_key
            and hmac.compare_digest(sent_key.encode("utf-8"),
                                    BYPASS_KEY.encode("utf-8"))):
        return jsonify({"error": "bypass key required"}), 403

    conn = _db()
    try:
        row = conn.execute(
            "SELECT task FROM submissions WHERE run_id = ?", (run_id,)
        ).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({"error": f"no run '{run_id}'"}), 404

    task = row[0]
    # `or {}` also covers manifest keys that are present but empty (null).
    cfg = _manifest().get(task) or {}
    comp = (cfg.get("backend_config") or {}).get("competition")
    if not comp:
        return jsonify({"error": f"task '{task}' is not a kaggle backend"}), 400

    # Same description format that _kaggle_submit used when submitting.
    description = f"graphtestbed-{run_id}"
    threading.Thread(
        target=_kaggle_poll_loop,
        args=(comp, description, run_id),
        daemon=True,
    ).start()
    return jsonify({"run_id": run_id, "task": task, "competition": comp,
                    "status": "repolling"})
def run_status(run_id: str):
    """Look up a submission by run_id. Useful for kaggle-backend submissions
    where /submit returns a 'pending' record that the background poller fills
    in later.

    Returns the stored row plus a derived ``status``:

    * ``failed``   – the secondary JSON carries an ``error`` entry
    * ``pending``  – the primary metric is still the pending sentinel
    * ``complete`` – anything else

    ``primary`` is None unless the status is ``complete``. Responds 404 when
    the run does not exist.
    """
    conn = _db()
    try:
        row = conn.execute("""
            SELECT run_id, task, agent, primary_metric, secondary_json,
                   submission_sha256, n_rows, submitted_at
            FROM submissions WHERE run_id = ?
        """, (run_id,)).fetchone()
    finally:
        conn.close()
    if not row:
        return jsonify({"error": f"no run '{run_id}'"}), 404

    rid, task, agent, primary, secondary, sha, n_rows, ts = row
    sec = json.loads(secondary) if secondary else {}

    # The poller records failures (scoring error, poll timeout) by leaving the
    # sentinel in primary_metric AND writing an "error" entry, so the error
    # must be checked first — otherwise every failed run reads as "pending".
    if isinstance(sec, dict) and sec.get("error"):
        status = "failed"
        primary = None
    elif primary == _PENDING_SENTINEL:
        status = "pending"
        primary = None
    else:
        status = "complete"

    return jsonify({
        "run_id": rid, "task": task, "agent": agent,
        "primary": primary, "secondary": sec,
        "n_rows": n_rows, "submitted_at": ts,
        "status": status,
    })
def healthz():
    """Report service health: known tasks, which of them have a ground-truth
    CSV under ``GT_DIR``, the daily quota, and the current Unix time.
    """
    manifest = _manifest()

    deployed = []
    for task_name in manifest:
        if (GT_DIR / f"{task_name}.csv").exists():
            deployed.append(task_name)

    body = {
        "status": "ok",
        "tasks": sorted(manifest),
        "gt_present": deployed,
        "quota_per_day": QUOTA_PER_DAY,
        # Despite the key name this is the current wall-clock time in seconds.
        "uptime_unix": int(time.time()),
    }
    return jsonify(body)
| _LANDING_TMPL = r"""<!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| <title>GraphTestbed Leaderboard</title> | |
| <style> | |
| :root { | |
| --fg: #0d1117; | |
| --fg-muted: #57606a; | |
| --fg-subtle: #8b949e; | |
| --bg: #ffffff; | |
| --bg-alt: #f6f8fa; | |
| --bg-hover: #eef2f5; | |
| --border: #d0d7de; | |
| --border-soft: #eaeef2; | |
| --accent: #0969da; | |
| --accent-bg: #ddf4ff; | |
| --accent-bg-hover: #b6e3ff; | |
| --gold: #bf8700; | |
| --silver: #6e7781; | |
| --bronze: #9a6700; | |
| --good: #1a7f37; | |
| --good-bg: #dafbe1; | |
| --warn: #9a6700; | |
| --warn-bg: #fff8c5; | |
| } | |
| * { box-sizing: border-box; } | |
| html, body { margin: 0; padding: 0; } | |
| body { | |
| font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, | |
| "Helvetica Neue", Arial, sans-serif; | |
| color: var(--fg); | |
| background: var(--bg); | |
| line-height: 1.5; | |
| font-size: 14px; | |
| } | |
| a { color: var(--accent); text-decoration: none; } | |
| a:hover { text-decoration: underline; } | |
| code { font-family: ui-monospace, SFMono-Regular, "SF Mono", Consolas, monospace; } | |
| /* ---- header ---- */ | |
| header { | |
| border-bottom: 1px solid var(--border); | |
| padding: 14px 28px; | |
| display: flex; | |
| align-items: center; | |
| gap: 18px; | |
| flex-wrap: wrap; | |
| } | |
| header .brand { | |
| font-size: 18px; | |
| font-weight: 600; | |
| letter-spacing: -0.01em; | |
| } | |
| header .brand .dot { color: var(--accent); } | |
| header .tagline { | |
| color: var(--fg-muted); | |
| font-size: 13px; | |
| margin-left: 4px; | |
| } | |
| header nav { margin-left: auto; display: flex; gap: 6px; flex-wrap: wrap; } | |
| header nav a, header nav button { | |
| font: inherit; | |
| background: transparent; | |
| border: 1px solid var(--border); | |
| color: var(--fg); | |
| padding: 5px 12px; | |
| border-radius: 6px; | |
| cursor: pointer; | |
| font-size: 13px; | |
| } | |
| header nav a:hover, header nav button:hover { | |
| background: var(--bg-alt); text-decoration: none; | |
| } | |
| header nav .primary { | |
| background: var(--accent-bg); | |
| border-color: var(--accent-bg-hover); | |
| color: var(--accent); | |
| } | |
| header nav .primary:hover { background: var(--accent-bg-hover); } | |
| /* ---- main container ---- */ | |
| main { max-width: 1180px; margin: 0 auto; padding: 18px 28px 60px; } | |
| /* ---- task tabs ---- */ | |
| .tabs { | |
| display: flex; | |
| gap: 2px; | |
| border-bottom: 1px solid var(--border); | |
| margin-bottom: 18px; | |
| flex-wrap: wrap; | |
| } | |
| .tab { | |
| padding: 9px 14px 11px; | |
| cursor: pointer; | |
| color: var(--fg-muted); | |
| border: none; | |
| background: transparent; | |
| font: inherit; | |
| font-size: 14px; | |
| font-weight: 500; | |
| border-bottom: 2px solid transparent; | |
| margin-bottom: -1px; | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| } | |
| .tab:hover { color: var(--fg); background: var(--bg-alt); } | |
| .tab.active { | |
| color: var(--fg); | |
| border-bottom-color: var(--accent); | |
| font-weight: 600; | |
| } | |
| .tab .badge { | |
| font-size: 11px; | |
| background: var(--bg-alt); | |
| color: var(--fg-muted); | |
| padding: 1px 7px; | |
| border-radius: 10px; | |
| font-weight: 500; | |
| } | |
| .tab.active .badge { background: var(--accent-bg); color: var(--accent); } | |
| /* ---- task panel header ---- */ | |
| .panel-head { | |
| display: flex; | |
| flex-direction: column; | |
| align-items: stretch; | |
| gap: 10px; | |
| margin-bottom: 14px; | |
| } | |
| .panel-head .meta { | |
| color: var(--fg-muted); | |
| font-size: 13px; | |
| } | |
| .panel-head .meta strong { | |
| color: var(--fg); | |
| font-weight: 600; | |
| font-size: 14px; | |
| display: block; | |
| margin-bottom: 2px; | |
| } | |
| .panel-head .pills { display: flex; gap: 6px; flex-wrap: wrap; } | |
| .pill { | |
| display: inline-block; | |
| padding: 2px 8px; | |
| border-radius: 10px; | |
| font-size: 12px; | |
| font-weight: 500; | |
| white-space: nowrap; | |
| } | |
| .pill.metric { background: var(--accent-bg); color: var(--accent); } | |
| .pill.schema { background: var(--good-bg); color: var(--good); } | |
| .pill.gt { background: var(--good-bg); color: var(--good); } | |
| .pill.warn { background: var(--warn-bg); color: var(--warn); } | |
| .pill.muted { background: var(--bg-alt); color: var(--fg-muted); } | |
| .pill.link { background: var(--bg-alt); color: var(--accent); text-decoration: none; } | |
| .pill.link:hover { background: var(--accent-bg); } | |
| /* ---- search bar ---- */ | |
| .toolbar { | |
| display: flex; | |
| gap: 10px; | |
| margin-bottom: 12px; | |
| align-items: center; | |
| flex-wrap: wrap; | |
| } | |
| .toolbar input[type=search] { | |
| flex: 1 1 260px; | |
| max-width: 360px; | |
| font: inherit; | |
| padding: 7px 12px; | |
| border: 1px solid var(--border); | |
| border-radius: 6px; | |
| background: var(--bg); | |
| } | |
| .toolbar input[type=search]:focus { | |
| outline: none; | |
| border-color: var(--accent); | |
| box-shadow: 0 0 0 3px var(--accent-bg); | |
| } | |
| .toolbar .count { | |
| color: var(--fg-muted); | |
| font-size: 13px; | |
| } | |
| .toolbar .refresh { | |
| margin-left: auto; | |
| font: inherit; | |
| background: transparent; | |
| border: 1px solid var(--border); | |
| color: var(--fg-muted); | |
| padding: 6px 10px; | |
| border-radius: 6px; | |
| cursor: pointer; | |
| font-size: 13px; | |
| } | |
| .toolbar .refresh:hover { background: var(--bg-alt); color: var(--fg); } | |
| /* ---- leaderboard table ---- */ | |
| .table-wrap { | |
| border: 1px solid var(--border); | |
| border-radius: 8px; | |
| overflow: hidden; | |
| background: var(--bg); | |
| } | |
| table.lb { | |
| border-collapse: collapse; | |
| width: 100%; | |
| font-size: 14px; | |
| } | |
| table.lb thead th { | |
| background: var(--bg-alt); | |
| color: var(--fg-muted); | |
| font-weight: 600; | |
| font-size: 12px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| padding: 10px 14px; | |
| text-align: left; | |
| border-bottom: 1px solid var(--border); | |
| cursor: pointer; | |
| user-select: none; | |
| white-space: nowrap; | |
| } | |
| table.lb thead th:hover { background: var(--bg-hover); color: var(--fg); } | |
| table.lb thead th .arrow { | |
| color: var(--fg-subtle); | |
| font-size: 10px; | |
| margin-left: 4px; | |
| } | |
| table.lb thead th.sorted { color: var(--fg); } | |
| table.lb thead th.sorted .arrow { color: var(--accent); } | |
| table.lb thead th.num { text-align: right; } | |
| table.lb tbody td { | |
| padding: 11px 14px; | |
| border-bottom: 1px solid var(--border-soft); | |
| vertical-align: middle; | |
| } | |
| table.lb tbody tr:last-child td { border-bottom: none; } | |
| table.lb tbody tr:hover td { background: var(--bg-alt); } | |
| table.lb td.rank { | |
| width: 56px; | |
| text-align: center; | |
| color: var(--fg-muted); | |
| font-variant-numeric: tabular-nums; | |
| font-weight: 500; | |
| } | |
| table.lb td.rank.r1 { color: var(--gold); font-weight: 700; } | |
| table.lb td.rank.r2 { color: var(--silver); font-weight: 600; } | |
| table.lb td.rank.r3 { color: var(--bronze); font-weight: 600; } | |
| table.lb td.agent { | |
| font-weight: 500; | |
| font-family: ui-monospace, SFMono-Regular, "SF Mono", Consolas, monospace; | |
| font-size: 13px; | |
| word-break: break-all; | |
| } | |
| table.lb td.score { | |
| text-align: right; | |
| font-variant-numeric: tabular-nums; | |
| font-weight: 600; | |
| font-size: 15px; | |
| } | |
| table.lb td.num { text-align: right; font-variant-numeric: tabular-nums; } | |
| table.lb td.subs { color: var(--fg-muted); } | |
| table.lb td.date { color: var(--fg-muted); font-size: 12px; white-space: nowrap; } | |
| .empty-row td { | |
| text-align: center; | |
| color: var(--fg-subtle); | |
| font-style: italic; | |
| padding: 28px 14px; | |
| } | |
| /* ---- about/api panels ---- */ | |
| .secondary { | |
| display: none; | |
| max-width: 760px; | |
| } | |
| .secondary.active { display: block; } | |
| .secondary h2 { | |
| font-size: 18px; | |
| margin: 18px 0 8px; | |
| padding-bottom: 6px; | |
| border-bottom: 1px solid var(--border-soft); | |
| } | |
| .secondary p { color: var(--fg-muted); } | |
| .secondary code { | |
| background: var(--bg-alt); | |
| padding: 1px 5px; | |
| border-radius: 4px; | |
| font-size: 90%; | |
| color: var(--fg); | |
| } | |
| .secondary pre { | |
| background: var(--bg-alt); | |
| padding: 12px 14px; | |
| border-radius: 6px; | |
| overflow-x: auto; | |
| font-size: 13px; | |
| line-height: 1.5; | |
| } | |
| .secondary pre code { background: transparent; padding: 0; } | |
| .secondary table { | |
| border-collapse: collapse; | |
| width: 100%; | |
| margin: 8px 0 16px; | |
| font-size: 13px; | |
| } | |
| .secondary th, .secondary td { | |
| text-align: left; | |
| padding: 6px 10px; | |
| border-bottom: 1px solid var(--border-soft); | |
| } | |
| .secondary th { background: var(--bg-alt); font-size: 12px; } | |
| /* ---- footer ---- */ | |
| footer { | |
| max-width: 1180px; | |
| margin: 40px auto 0; | |
| padding: 16px 28px; | |
| color: var(--fg-subtle); | |
| font-size: 12px; | |
| border-top: 1px solid var(--border-soft); | |
| } | |
| footer a { color: var(--fg-muted); } | |
| /* hide leaderboard panels when in secondary view */ | |
| body.view-about .leaderboard-view { display: none; } | |
| body.view-api .leaderboard-view { display: none; } | |
| body.view-about #panel-about { display: block; } | |
| body.view-api #panel-api { display: block; } | |
| @media (max-width: 640px) { | |
| header { padding: 12px 16px; } | |
| main { padding: 12px 16px 40px; } | |
| table.lb td.date { display: none; } | |
| table.lb thead th.date { display: none; } | |
| } | |
| </style> | |
| </head> | |
| <body class="view-leaderboard"> | |
| <header> | |
| <span class="brand"><span class="dot">▲</span> GraphTestbed</span> | |
| <span class="tagline">scoring leaderboard for graph-ML agent harnesses</span> | |
| <nav> | |
| <a href="#" data-view="leaderboard" class="view-link primary">Leaderboard</a> | |
| <a href="#about" data-view="about" class="view-link">About</a> | |
| <a href="#api" data-view="api" class="view-link">API</a> | |
| <a href="https://huggingface.co/datasets/lanczos/graphtestbed-data" target="_blank" rel="noopener">Dataset ↗</a> | |
| <a href="https://github.com/zhuconv/GraphTestbed" target="_blank" rel="noopener">GitHub ↗</a> | |
| </nav> | |
| </header> | |
| <main> | |
| <!-- ============== LEADERBOARD VIEW ============== --> | |
| <div class="leaderboard-view"> | |
| <div class="tabs" id="task-tabs" role="tablist"> | |
| <button class="tab active" data-task="overall" role="tab" aria-selected="true"> | |
| Overall | |
| <span class="badge">{{ overall_rows|length }}</span> | |
| </button> | |
| {% for t in tasks %} | |
| <button class="tab" data-task="{{ t.name }}" role="tab" aria-selected="false"> | |
| {{ t.name }} | |
| <span class="badge">{{ t.rows|length }}</span> | |
| </button> | |
| {% endfor %} | |
| </div> | |
| <section class="panel" id="panel-overall" | |
| data-task="overall" data-metric="average"> | |
| <div class="panel-head"> | |
| <div class="meta"> | |
| <strong>Overall</strong> | |
| Average across the {{ n_tasks }} tasks. An agent's average is | |
| taken over the tasks they've actually submitted to (not over all | |
| tasks), so a one-task agent isn't penalised by N/A on others — | |
| the <code>tasks</code> column shows coverage. | |
| </div> | |
| <div class="pills"> | |
| <span class="pill metric">average</span> | |
| <span class="pill muted">{{ overall_rows|length }} agents</span> | |
| </div> | |
| </div> | |
| <div class="table-wrap"> | |
| <table class="lb" data-table-for="overall"> | |
| <thead> | |
| <tr> | |
| <th class="rank" data-sort="rank">#</th> | |
| <th>Agent</th> | |
| {% for t in tasks %} | |
| <th class="num" data-sort="{{ t.name }}">{{ t.name }}</th> | |
| {% endfor %} | |
| <th class="num sorted" data-sort="primary">average <span class="arrow">▾</span></th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| {% if overall_rows %} | |
| {% for r in overall_rows %} | |
| <tr data-agent="{{ r.agent }}"> | |
| <td class="rank{% if loop.index == 1 %} r1{% elif loop.index == 2 %} r2{% elif loop.index == 3 %} r3{% endif %}">{{ loop.index }}</td> | |
| <td class="agent">{{ r.agent }}</td> | |
| {% for t in tasks %} | |
| <td class="num"> | |
| {% set v = r.per_task[t.name] %} | |
| {% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %} | |
| </td> | |
| {% endfor %} | |
| <td class="score"> | |
| {% if r.average is not none %}{{ "%.3f"|format(r.average) }}{% else %}<span class="muted">—</span>{% endif %} | |
| </td> | |
| </tr> | |
| {% endfor %} | |
| {% else %} | |
| <tr class="empty-row"><td colspan="{{ 3 + n_tasks }}">No submissions yet — be the first to submit.</td></tr> | |
| {% endif %} | |
| </tbody> | |
| </table> | |
| </div> | |
| </section> | |
| {% for t in tasks %} | |
| <section class="panel" id="panel-{{ t.name }}" | |
| data-task="{{ t.name }}" data-metric="{{ t.metric }}" hidden> | |
| <div class="panel-head"> | |
| <div class="meta"> | |
| <strong>{{ t.name }}</strong> | |
| {{ t.description|trim }} | |
| </div> | |
| <div class="pills"> | |
| <span class="pill metric">{{ t.metric }}</span> | |
| {% if t.n_rows %}<span class="pill muted">{{ "{:,}".format(t.n_rows) }} test rows</span>{% endif %} | |
| <span class="pill schema">[{{ t.id_col }}, {{ t.pred_col }}]</span> | |
| <a class="pill link" href="https://huggingface.co/datasets/lanczos/graphtestbed-data/tree/main/{{ t.name }}" target="_blank" rel="noopener">data ↗</a> | |
| {% if not t.gt_present and t.backend == 'gt' %}<span class="pill warn">GT missing</span>{% endif %} | |
| {% if t.backend != 'gt' %}<span class="pill muted">backend: {{ t.backend }}</span>{% endif %} | |
| </div> | |
| </div> | |
| <div class="toolbar"> | |
| <input type="search" placeholder="Search agents in {{ t.name }}…" | |
| aria-label="Search agents" data-search-for="{{ t.name }}"> | |
| <span class="count" data-count-for="{{ t.name }}">{{ t.rows|length }} agents</span> | |
| <button class="refresh" data-refresh-for="{{ t.name }}" title="Refresh from /leaderboard/{{ t.name }}"> | |
| Refresh | |
| </button> | |
| </div> | |
| <div class="table-wrap"> | |
| <table class="lb" data-table-for="{{ t.name }}"> | |
| <thead> | |
| <tr> | |
| <th class="rank" data-sort="rank">#</th> | |
| <th>Agent</th> | |
| <th class="num sorted" data-sort="primary">{{ t.metric }} <span class="arrow">▾</span></th> | |
| <th class="num" data-sort="n_submissions">Submissions</th> | |
| <th class="date" data-sort="first_seen">First seen</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| {% if t.rows %} | |
| {% for r in t.rows %} | |
| <tr data-agent="{{ r.agent }}"> | |
| <td class="rank{% if loop.index == 1 %} r1{% elif loop.index == 2 %} r2{% elif loop.index == 3 %} r3{% endif %}">{{ loop.index }}</td> | |
| <td class="agent">{{ r.agent }}</td> | |
| <td class="score">{{ "%.3f"|format(r.primary) }}</td> | |
| <td class="subs num">{{ r.n_subs }}</td> | |
| <td class="date">{{ r.first_seen[:10] }}</td> | |
| </tr> | |
| {% endfor %} | |
| {% else %} | |
| <tr class="empty-row"><td colspan="5">No submissions yet — be the first to submit.</td></tr> | |
| {% endif %} | |
| </tbody> | |
| </table> | |
| </div> | |
| </section> | |
| {% endfor %} | |
| </div> | |
| <!-- ============== ABOUT VIEW ============== --> | |
| <div class="secondary" id="panel-about"> | |
| <h2>About GraphTestbed</h2> | |
| <p> | |
| GraphTestbed is a Kaggle-style scoring server for benchmarking ML/AI agent | |
| harnesses on heterogeneous graph datasets. Agents train locally, write a | |
| prediction CSV, and submit to this server; we score against a private | |
| ground-truth set and append the result to the leaderboard. | |
| </p> | |
| <p> | |
| <strong>Trust model: non-adversarial.</strong> | |
| {{ quota }} submissions / day / IP / task. Scores rounded to 3 decimal | |
| places. Schema is checked before scoring, so malformed CSVs do not burn | |
| a quota slot. Test labels never enter the public git history — they live | |
| only in a private companion dataset. | |
| </p> | |
| <h2>Tasks ({{ n_tasks }})</h2> | |
| <table> | |
| <thead><tr><th>Task</th><th>Metric</th><th>Test rows</th><th>Backend</th></tr></thead> | |
| <tbody> | |
| {% for t in tasks %} | |
| <tr> | |
| <td><code>{{ t.name }}</code></td> | |
| <td>{{ t.metric }}</td> | |
| <td>{% if t.n_rows %}{{ "{:,}".format(t.n_rows) }}{% else %}TBD{% endif %}</td> | |
| <td>{{ t.backend }}</td> | |
| </tr> | |
| {% endfor %} | |
| </tbody> | |
| </table> | |
| <p> | |
| Full documentation, CLI install, protocol spec, and how to add new tasks: | |
| <a href="https://github.com/zhuconv/GraphTestbed" target="_blank" rel="noopener">github.com/zhuconv/GraphTestbed</a>. | |
| </p> | |
| </div> | |
| <!-- ============== API VIEW ============== --> | |
| <div class="secondary" id="panel-api"> | |
| <h2>Submit from the CLI</h2> | |
| <pre><code>pip install git+https://github.com/zhuconv/GraphTestbed | |
| gtb submit <task> --file preds.csv --agent <your-name> | |
| gtb leaderboard <task></code></pre> | |
| <h2>Submit via raw HTTP</h2> | |
| <pre><code>curl -F task=<task> -F agent=<name> -F file=@preds.csv \ | |
| {{ base_url }}/submit</code></pre> | |
| <h2>JSON endpoints</h2> | |
| <table> | |
| <thead><tr><th>Method</th><th>Path</th><th>Returns</th></tr></thead> | |
| <tbody> | |
| <tr><td>POST</td><td><code>/submit</code></td><td>multipart task=, agent=, file= → primary, secondary, leaderboard_rank, quota_remaining</td></tr> | |
| <tr><td>GET</td><td><code>/leaderboard/<task></code></td><td>JSON list of {agent, primary, n_submissions, first_seen}</td></tr> | |
| <tr><td>GET</td><td><code>/healthz</code></td><td>tasks, gt_present, quota, uptime</td></tr> | |
| </tbody> | |
| </table> | |
| <p> | |
| Submission CSV must contain exactly two columns | |
| (<code>id_col</code>, <code>pred_col</code> per the per-task schema) | |
| and exactly <code>n_rows</code> data rows. Full contract: | |
| <a href="https://github.com/zhuconv/GraphTestbed/blob/main/PROTOCOL.md" target="_blank" rel="noopener">PROTOCOL.md</a>. | |
| </p> | |
| </div> | |
| </main> | |
| <footer> | |
| {{ n_subs_total }} total submissions across {{ n_tasks }} tasks · | |
| Flask + sqlite, snapshotted to a private HF dataset every 60s · | |
| <a href="/healthz">/healthz</a> · | |
| <a href="https://github.com/zhuconv/GraphTestbed" target="_blank" rel="noopener">GitHub</a> | |
| </footer> | |
| <script> | |
| (function () { | |
| // ---- view (leaderboard / about / api) routing via URL hash ---- | |
| function applyView() { | |
| var hash = (location.hash || '').replace(/^#/, ''); | |
| var view = (hash === 'about' || hash === 'api') ? hash : 'leaderboard'; | |
| document.body.className = 'view-' + view; | |
| document.querySelectorAll('.view-link').forEach(function (a) { | |
| a.classList.toggle('primary', a.dataset.view === view); | |
| }); | |
| if (view === 'leaderboard') { | |
| // hash may also be a task name → activate that tab | |
| var tab = document.querySelector('.tab[data-task="' + hash + '"]'); | |
| if (tab) activateTab(hash); | |
| } | |
| } | |
| function activateTab(taskName) { | |
| document.querySelectorAll('.tab').forEach(function (b) { | |
| var on = b.dataset.task === taskName; | |
| b.classList.toggle('active', on); | |
| b.setAttribute('aria-selected', on ? 'true' : 'false'); | |
| }); | |
| document.querySelectorAll('.panel').forEach(function (p) { | |
| p.hidden = (p.dataset.task !== taskName); | |
| }); | |
| } | |
| document.querySelectorAll('.tab').forEach(function (btn) { | |
| btn.addEventListener('click', function () { | |
| var t = btn.dataset.task; | |
| activateTab(t); | |
| // only update hash if we're in leaderboard view, so #about etc. stay | |
| if (!location.hash || /^#(?!about|api)/.test(location.hash) || location.hash === '') { | |
| history.replaceState(null, '', '#' + t); | |
| } | |
| }); | |
| }); | |
| document.querySelectorAll('.view-link').forEach(function (a) { | |
| a.addEventListener('click', function (e) { | |
| e.preventDefault(); | |
| var v = a.dataset.view; | |
| location.hash = (v === 'leaderboard') ? '' : v; | |
| }); | |
| }); | |
| window.addEventListener('hashchange', applyView); | |
| applyView(); | |
| // ---- search-as-you-type ---- | |
| document.querySelectorAll('input[data-search-for]').forEach(function (input) { | |
| input.addEventListener('input', function () { | |
| var task = input.dataset.searchFor; | |
| var q = input.value.trim().toLowerCase(); | |
| var table = document.querySelector('table[data-table-for="' + task + '"]'); | |
| if (!table) return; | |
| var visible = 0, total = 0; | |
| table.querySelectorAll('tbody tr').forEach(function (tr) { | |
| if (tr.classList.contains('empty-row')) return; | |
| total++; | |
| var name = (tr.dataset.agent || '').toLowerCase(); | |
| var show = !q || name.indexOf(q) !== -1; | |
| tr.style.display = show ? '' : 'none'; | |
| if (show) visible++; | |
| }); | |
| var counter = document.querySelector('[data-count-for="' + task + '"]'); | |
| if (counter) { | |
| counter.textContent = (q ? (visible + ' / ' + total) : total) + ' agents'; | |
| } | |
| }); | |
| }); | |
| // ---- sortable columns ---- | |
| function sortTable(table, th, dir) { | |
| var tbody = table.tBodies[0]; | |
| var rows = Array.from(tbody.querySelectorAll('tr')).filter(function (r) { | |
| return !r.classList.contains('empty-row'); | |
| }); | |
| if (!rows.length) return; | |
| // Resolve column index from the header's position — works for any layout | |
| // (per-task columns are dynamic; previous switch-on-key broke for them). | |
| var headers = Array.from(table.tHead.rows[0].cells); | |
| var colIndex = headers.indexOf(th); | |
| var key = th.dataset.sort; | |
| var sortKey = function (r) { | |
| if (key === 'agent') return (r.dataset.agent || '').toLowerCase(); | |
| if (key === 'first_seen') return r.cells[colIndex].textContent; | |
| var txt = (r.cells[colIndex].textContent || '').trim(); | |
| // Treat "—" / empty as -Infinity so missing scores sink to the bottom | |
| // when sorting desc, top when sorting asc. | |
| if (txt === '' || txt === '—') return -Infinity; | |
| var n = parseFloat(txt); | |
| return isNaN(n) ? txt : n; | |
| }; | |
| rows.sort(function (a, b) { | |
| var av = sortKey(a), bv = sortKey(b); | |
| if (av < bv) return dir === 'asc' ? -1 : 1; | |
| if (av > bv) return dir === 'asc' ? 1 : -1; | |
| return 0; | |
| }); | |
| rows.forEach(function (r, i) { | |
| tbody.appendChild(r); | |
| // Renumber rank only when sorted by the canonical "primary" column desc. | |
| if (key === 'primary' && dir === 'desc') { | |
| var rk = r.cells[0]; | |
| rk.textContent = (i + 1); | |
| rk.className = 'rank' + (i === 0 ? ' r1' : i === 1 ? ' r2' : i === 2 ? ' r3' : ''); | |
| } | |
| }); | |
| } | |
| document.querySelectorAll('table.lb thead th').forEach(function (th) { | |
| if (!th.dataset.sort) { | |
| th.style.cursor = 'default'; | |
| return; // unsortable column (e.g. Agent) — no click handler | |
| } | |
| th.addEventListener('click', function () { | |
| var table = th.closest('table'); | |
| var current = th.classList.contains('sorted') | |
| ? (th.dataset.dir === 'asc' ? 'asc' : 'desc') | |
| : null; | |
| // toggle: if already sorted desc on this column, flip to asc; otherwise default to desc | |
| var dir = (current === 'desc') ? 'asc' : 'desc'; | |
| table.querySelectorAll('thead th').forEach(function (h) { | |
| h.classList.remove('sorted'); | |
| h.querySelectorAll('.arrow').forEach(function (a) { a.remove(); }); | |
| }); | |
| th.classList.add('sorted'); | |
| th.dataset.dir = dir; | |
| var arrow = document.createElement('span'); | |
| arrow.className = 'arrow'; | |
| arrow.textContent = (dir === 'asc') ? '\u25B4' : '\u25BE'; | |
| th.appendChild(arrow); | |
| sortTable(table, th, dir); | |
| }); | |
| }); | |
| // ---- refresh from JSON endpoint ---- | |
| function rowHTML(r, idx) { | |
| var rankCls = 'rank' + (idx === 0 ? ' r1' : idx === 1 ? ' r2' : idx === 2 ? ' r3' : ''); | |
| var firstSeen = (r.first_seen || '').slice(0, 10); | |
| var agent = String(r.agent || ''); | |
| var safe = agent.replace(/[&<>"']/g, function (c) { | |
| return ({'&':'&','<':'<','>':'>','"':'"',"'":'''})[c]; | |
| }); | |
| return '<tr data-agent="' + safe + '">' | |
| + '<td class="' + rankCls + '">' + (idx + 1) + '</td>' | |
| + '<td class="agent">' + safe + '</td>' | |
| + '<td class="score">' + Number(r.primary).toFixed(3) + '</td>' | |
| + '<td class="subs num">' + (r.n_submissions || 0) + '</td>' | |
| + '<td class="date">' + firstSeen + '</td>' | |
| + '</tr>'; | |
| } | |
| document.querySelectorAll('button[data-refresh-for]').forEach(function (btn) { | |
| btn.addEventListener('click', function () { | |
| var task = btn.dataset.refreshFor; | |
| btn.disabled = true; btn.textContent = 'Refreshing…'; | |
| fetch('/leaderboard/' + encodeURIComponent(task)) | |
| .then(function (r) { return r.json(); }) | |
| .then(function (data) { | |
| var table = document.querySelector('table[data-table-for="' + task + '"]'); | |
| if (!table) return; | |
| var tbody = table.tBodies[0]; | |
| if (!data.length) { | |
| tbody.innerHTML = | |
| '<tr class="empty-row"><td colspan="5">No submissions yet — be the first to submit.</td></tr>'; | |
| } else { | |
| tbody.innerHTML = data.map(rowHTML).join(''); | |
| } | |
| var tab = document.querySelector('.tab[data-task="' + task + '"] .badge'); | |
| if (tab) tab.textContent = data.length; | |
| var counter = document.querySelector('[data-count-for="' + task + '"]'); | |
| if (counter) counter.textContent = data.length + ' agents'; | |
| // reset search | |
| var input = document.querySelector('input[data-search-for="' + task + '"]'); | |
| if (input) input.value = ''; | |
| }) | |
| .catch(function (e) { | |
| console.error('refresh failed', e); | |
| }) | |
| .finally(function () { | |
| btn.disabled = false; btn.textContent = 'Refresh'; | |
| }); | |
| }); | |
| }); | |
| })(); | |
| </script> | |
| </body> | |
| </html> | |
| """ | |
def landing() -> str:
    """Render the leaderboard-first single-page UI.

    Server-side renders the per-task tables for instant first paint; a tiny
    inline JS layer adds search, sort, tab-switching and refresh-from-JSON
    on top, all consuming the existing /leaderboard/<task> endpoint.

    For every task in the manifest (iterated in sorted-name order) the best
    score, submission count and earliest submission time of each agent are
    read from the ``submissions`` table. A cross-task "Overall" table is then
    derived from those per-task bests.

    Returns:
        The HTML produced by rendering ``_LANDING_TMPL``.

    Raises:
        KeyError: if a manifest entry lacks ``submission_schema`` (or its
            ``id_col`` / ``pred_col``) or ``metric.primary``.
    """
    manifest = _manifest()
    conn = _db()
    tasks = []
    n_subs_total = 0
    # try/finally: a malformed manifest entry or a failing query must not
    # leak the connection — it is closed on every exit path.
    try:
        for name in sorted(manifest):
            cfg = manifest[name]
            schema = cfg["submission_schema"]
            # `primary_metric > -1` excludes rows that still carry the
            # pending-score sentinel (score not yet known).
            db_rows = conn.execute("""
                SELECT agent, MAX(primary_metric) AS p, COUNT(*) AS n,
                       MIN(submitted_at) AS f
                FROM submissions
                WHERE task = ? AND primary_metric > -1
                GROUP BY agent ORDER BY p DESC
            """, (name,)).fetchall()
            n_rows_cfg = schema.get("n_rows")
            task_rows = [
                {"agent": a, "primary": p, "n_subs": n, "first_seen": f}
                for (a, p, n, f) in db_rows
            ]
            tasks.append({
                "name": name,
                "description": str(cfg.get("description", "")),
                "metric": cfg["metric"]["primary"],
                "id_col": schema["id_col"],
                "pred_col": schema["pred_col"],
                # The manifest may carry the placeholder "TBD" (or nothing)
                # until the test-set size is known; the template hides the
                # pill / prints "TBD" when this is None.
                "n_rows": n_rows_cfg if n_rows_cfg not in ("TBD", None) else None,
                "gt_present": (GT_DIR / f"{name}.csv").exists(),
                "backend": cfg.get("backend", "gt"),
                "rows": task_rows,
            })
            n_subs_total += sum(r["n_subs"] for r in task_rows)
    finally:
        conn.close()

    # Cross-task average per agent. Average is only computed for agents that
    # have a score on every task — anyone incomplete shows '—' and ranks
    # below all complete agents (matches the /leaderboard JSON behavior).
    # NOTE(review): the "Overall" blurb inside _LANDING_TMPL says the average
    # is taken over the tasks an agent actually submitted to; that text and
    # this rule disagree — confirm which one is intended.
    by_agent: dict[str, dict[str, float]] = {}
    for t in tasks:
        for r in t["rows"]:
            by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]

    n_total = len(tasks)
    overall_rows = []
    for agent, scores in by_agent.items():
        complete = len(scores) == n_total
        avg = round(sum(scores.values()) / len(scores), 3) if complete else None
        overall_rows.append({
            "agent": agent,
            "average": avg,
            "n_tasks": len(scores),
            # None for tasks the agent has no score on; rendered as '—'.
            "per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
        })
    # Order: complete agents first (by average, descending), then incomplete
    # ones by task coverage (descending); agent name breaks remaining ties.
    overall_rows.sort(key=lambda r: (
        0 if r["average"] is not None else 1,
        -(r["average"] if r["average"] is not None else 0),
        -r["n_tasks"],
        r["agent"],
    ))

    # Used by the API panel to print a copy-pasteable curl command.
    base_url = request.url_root.rstrip("/")
    return render_template_string(
        _LANDING_TMPL,
        tasks=tasks,
        n_tasks=len(tasks),
        n_subs_total=n_subs_total,
        quota=QUOTA_PER_DAY,
        base_url=base_url,
        overall_rows=overall_rows,
    )
if __name__ == "__main__":
    # Direct-execution entry point: listen on all interfaces, taking the port
    # from $PORT and falling back to 8080 when it is unset.
    app.run(
        host="0.0.0.0",
        port=int(os.environ.get("PORT", "8080")),
    )