Spaces:
Running
Running
Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit ·
464248e
1
Parent(s): 5ead61d
Add Flask + Jinja2 landing page at GET /
Browse files
Single-page UI rendered server-side: title + status pills (task count,
submission count, quota, healthz/github links), three-line Quickstart,
then per-task sections with description, schema/metric metadata, GT-loaded
indicator, and a rank/agent/primary/subs/first-seen leaderboard table.
Empty state for tasks with no entries. Inline CSS, no external assets, no JS.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- server/api.py +155 -1
server/api.py
CHANGED
|
@@ -36,7 +36,7 @@ from pathlib import Path
|
|
| 36 |
|
| 37 |
import pandas as pd
|
| 38 |
import yaml
|
| 39 |
-
from flask import Flask, jsonify, request
|
| 40 |
|
| 41 |
|
| 42 |
# Directory holding the GT files (presumably ground truth — confirm). Read from
# the GT_DIR environment variable, falling back to /var/graphtestbed/gt.
GT_DIR = Path(os.environ.get("GT_DIR", "/var/graphtestbed/gt"))
|
|
@@ -282,6 +282,160 @@ def healthz():
|
|
| 282 |
})
|
| 283 |
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
if __name__ == "__main__":
    # Listen on every interface; the port is taken from the PORT environment
    # variable and falls back to 8080 when it is unset.
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", "8080")))
|
|
|
|
| 36 |
|
| 37 |
import pandas as pd
|
| 38 |
import yaml
|
| 39 |
+
from flask import Flask, jsonify, render_template_string, request
|
| 40 |
|
| 41 |
|
| 42 |
# Directory holding the GT files (presumably ground truth — confirm). Read from
# the GT_DIR environment variable, falling back to /var/graphtestbed/gt. The
# landing page reports a task as "GT loaded" when `<task name>.csv` exists here.
GT_DIR = Path(os.environ.get("GT_DIR", "/var/graphtestbed/gt"))
|
|
|
|
| 282 |
})
|
| 283 |
|
| 284 |
|
| 285 |
+
_LANDING_TMPL = """<!doctype html>
|
| 286 |
+
<html lang="en">
|
| 287 |
+
<head>
|
| 288 |
+
<meta charset="utf-8">
|
| 289 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 290 |
+
<title>GraphTestbed Scoring</title>
|
| 291 |
+
<style>
|
| 292 |
+
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
|
| 293 |
+
max-width: 920px; margin: 2em auto; padding: 0 1em; color: #1f2328; line-height: 1.55; }
|
| 294 |
+
h1 { margin: 0 0 .15em; }
|
| 295 |
+
h2 { margin-top: 1.8em; padding-bottom: .25em; border-bottom: 1px solid #d0d7de; }
|
| 296 |
+
.subtitle { color: #57606a; margin: 0 0 1em; }
|
| 297 |
+
pre { background: #f6f8fa; padding: 12px 14px; border-radius: 6px;
|
| 298 |
+
overflow-x: auto; font-size: 13px; line-height: 1.45; }
|
| 299 |
+
code { background: #eaeef2; padding: 1px 5px; border-radius: 4px; font-size: 90%; }
|
| 300 |
+
pre code { background: transparent; padding: 0; font-size: inherit; }
|
| 301 |
+
table { border-collapse: collapse; width: 100%; margin: .4em 0 1.4em; font-size: 14px; }
|
| 302 |
+
th, td { padding: .5em .75em; text-align: left; border-bottom: 1px solid #eaeef2; }
|
| 303 |
+
th { background: #f6f8fa; font-weight: 600; font-size: 13px; }
|
| 304 |
+
td.num { text-align: right; font-variant-numeric: tabular-nums; }
|
| 305 |
+
td.rank { text-align: right; color: #57606a; width: 3em; }
|
| 306 |
+
td.rank-1 { color: #bf8700; font-weight: 700; }
|
| 307 |
+
.empty { color: #8b949e; font-style: italic; padding: .4em 0; font-size: 14px; }
|
| 308 |
+
.meta { color: #57606a; font-size: 13px; margin: .25em 0 .8em; }
|
| 309 |
+
.meta code { font-size: 90%; }
|
| 310 |
+
.pills { display: flex; gap: .5em; flex-wrap: wrap; margin: .5em 0 1.5em; }
|
| 311 |
+
.pill { background: #ddf4ff; color: #0969da; padding: .2em .65em; border-radius: 12px;
|
| 312 |
+
font-size: 12px; font-weight: 500; text-decoration: none; }
|
| 313 |
+
.pill:hover { background: #b6e3ff; }
|
| 314 |
+
.pill.gt { background: #dafbe1; color: #1a7f37; }
|
| 315 |
+
.pill.warn { background: #fff8c5; color: #9a6700; }
|
| 316 |
+
a { color: #0969da; }
|
| 317 |
+
footer { margin-top: 3em; padding-top: 1em; border-top: 1px solid #d0d7de;
|
| 318 |
+
color: #8b949e; font-size: 13px; }
|
| 319 |
+
</style>
|
| 320 |
+
</head>
|
| 321 |
+
<body>
|
| 322 |
+
|
| 323 |
+
<h1>📊 GraphTestbed Scoring</h1>
|
| 324 |
+
<p class="subtitle">
|
| 325 |
+
Public leaderboard for benchmarking ML/AI agent harnesses on heterogeneous graph datasets.
|
| 326 |
+
</p>
|
| 327 |
+
|
| 328 |
+
<div class="pills">
|
| 329 |
+
<span class="pill">{{ n_tasks }} tasks</span>
|
| 330 |
+
<span class="pill">{{ n_subs_total }} submissions</span>
|
| 331 |
+
<span class="pill">quota: {{ quota }}/day/IP/task</span>
|
| 332 |
+
<a class="pill" href="/healthz">healthz</a>
|
| 333 |
+
<a class="pill" href="https://github.com/zhuconv/GraphTestbed">github</a>
|
| 334 |
+
</div>
|
| 335 |
+
|
| 336 |
+
<h2>Quickstart</h2>
|
| 337 |
+
<pre><code>pip install git+https://github.com/zhuconv/GraphTestbed
|
| 338 |
+
gtb submit <task> --file preds.csv --agent <your-name>
|
| 339 |
+
gtb leaderboard <task></code></pre>
|
| 340 |
+
|
| 341 |
+
<p>
|
| 342 |
+
Submission CSV must have exactly two columns
|
| 343 |
+
(<code>id_col</code>, <code>pred_col</code> per the per-task schema below)
|
| 344 |
+
and exactly <code>n_rows</code> data rows. Schema is checked client-side
|
| 345 |
+
first, so a malformed file never burns a quota slot.
|
| 346 |
+
Full contract: <a href="https://github.com/zhuconv/GraphTestbed/blob/main/PROTOCOL.md">PROTOCOL.md</a>.
|
| 347 |
+
</p>
|
| 348 |
+
|
| 349 |
+
{% for t in tasks %}
|
| 350 |
+
<h2 id="{{ t.name }}">{{ t.name }}</h2>
|
| 351 |
+
<p class="meta">
|
| 352 |
+
metric: <code>{{ t.metric }}</code>
|
| 353 |
+
{%- if t.n_rows %} · <code>{{ t.n_rows }}</code> test rows{% endif %} ·
|
| 354 |
+
columns: <code>[{{ t.id_col }}, {{ t.pred_col }}]</code>
|
| 355 |
+
{% if t.gt_present %}<span class="pill gt">GT loaded</span>
|
| 356 |
+
{%- else %}<span class="pill warn">GT missing</span>{% endif %}
|
| 357 |
+
</p>
|
| 358 |
+
{% if t.description %}<p>{{ t.description|trim }}</p>{% endif %}
|
| 359 |
+
{% if t.rows %}
|
| 360 |
+
<table>
|
| 361 |
+
<thead>
|
| 362 |
+
<tr><th>#</th><th>agent</th><th class="num">{{ t.metric }}</th><th class="num">subs</th><th>first seen</th></tr>
|
| 363 |
+
</thead>
|
| 364 |
+
<tbody>
|
| 365 |
+
{% for r in t.rows %}
|
| 366 |
+
<tr>
|
| 367 |
+
<td class="rank{% if loop.index == 1 %} rank-1{% endif %}">{{ loop.index }}</td>
|
| 368 |
+
<td><code>{{ r.agent }}</code></td>
|
| 369 |
+
<td class="num">{{ "%.3f"|format(r.primary) }}</td>
|
| 370 |
+
<td class="num">{{ r.n_subs }}</td>
|
| 371 |
+
<td><small>{{ r.first_seen[:19] }}Z</small></td>
|
| 372 |
+
</tr>
|
| 373 |
+
{% endfor %}
|
| 374 |
+
</tbody>
|
| 375 |
+
</table>
|
| 376 |
+
{% else %}
|
| 377 |
+
<p class="empty">no submissions yet</p>
|
| 378 |
+
{% endif %}
|
| 379 |
+
{% endfor %}
|
| 380 |
+
|
| 381 |
+
<h2>Endpoints</h2>
|
| 382 |
+
<pre><code>POST /submit multipart task=&agent=&file=<csv>
|
| 383 |
+
→ {primary, secondary, leaderboard_rank, quota_remaining, ...}
|
| 384 |
+
GET /leaderboard/<task> JSON: per-agent best, sorted by primary desc
|
| 385 |
+
GET /healthz JSON: tasks, gt_present, quota
|
| 386 |
+
GET / this page</code></pre>
|
| 387 |
+
|
| 388 |
+
<footer>
|
| 389 |
+
Backed by Flask + sqlite at <code>/data/leaderboard.db</code>; periodic
|
| 390 |
+
snapshot to a private HF dataset for durability. Non-adversarial trust
|
| 391 |
+
model — see PROTOCOL.md.
|
| 392 |
+
</footer>
|
| 393 |
+
|
| 394 |
+
</body>
|
| 395 |
+
</html>
|
| 396 |
+
"""
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
@app.get("/")
def landing():
    """Render the single-page UI: quickstart plus per-task leaderboard tables.

    For every task in the manifest (iterated in sorted name order) this
    collects the display metadata and one leaderboard row per agent: the
    agent's best ``primary_metric``, its submission count, and its earliest
    ``submitted_at``, ordered by best metric descending.

    Returns:
        The HTML produced by rendering ``_LANDING_TMPL`` with ``tasks``,
        ``n_tasks``, ``n_subs_total`` and ``quota``.

    Raises:
        KeyError: if a manifest entry lacks ``submission_schema``,
            ``metric.primary``, ``id_col`` or ``pred_col``.
    """
    manifest = _manifest()
    tasks = []
    n_subs_total = 0
    conn = _db()
    # Close the connection even when a manifest lookup or the query raises;
    # otherwise every failing request would leave a connection open.
    try:
        for name in sorted(manifest):
            cfg = manifest[name]
            s = cfg["submission_schema"]
            rows = conn.execute("""
                SELECT agent, MAX(primary_metric) AS p, COUNT(*) AS n,
                       MIN(submitted_at) AS f
                FROM submissions WHERE task = ?
                GROUP BY agent ORDER BY p DESC
            """, (name,)).fetchall()
            n_rows_cfg = s.get("n_rows")
            board = [
                {"agent": a, "primary": p, "n_subs": n, "first_seen": f}
                for (a, p, n, f) in rows
            ]
            tasks.append({
                "name": name,
                "description": str(cfg.get("description", "")),
                "metric": cfg["metric"]["primary"],
                "id_col": s["id_col"],
                "pred_col": s["pred_col"],
                # A missing or "TBD" row count is hidden by the template.
                "n_rows": n_rows_cfg if n_rows_cfg not in ("TBD", None) else None,
                "gt_present": (GT_DIR / f"{name}.csv").exists(),
                "rows": board,
            })
            n_subs_total += sum(r["n_subs"] for r in board)
    finally:
        conn.close()

    return render_template_string(
        _LANDING_TMPL,
        tasks=tasks,
        n_tasks=len(tasks),
        n_subs_total=n_subs_total,
        quota=QUOTA_PER_DAY,
    )
|
| 437 |
+
|
| 438 |
+
|
| 439 |
if __name__ == "__main__":
    # Listen on every interface; the port is taken from the PORT environment
    # variable and falls back to 8080 when it is unset.
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", "8080")))
|