Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit
464248e
·
1 Parent(s): 5ead61d

Add Flask + Jinja2 landing page at GET /

Browse files

Single-page UI rendered server-side: title + status pills (task count,
submission count, quota, healthz/github links), three-line Quickstart,
then per-task sections with description, schema/metric metadata, GT-loaded
indicator, and a rank/agent/primary/subs/first-seen leaderboard table.
Empty state for tasks with no entries. Inline CSS, no external assets, no JS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. server/api.py +155 -1
server/api.py CHANGED
@@ -36,7 +36,7 @@ from pathlib import Path
36
 
37
  import pandas as pd
38
  import yaml
39
- from flask import Flask, jsonify, request
40
 
41
 
42
  GT_DIR = Path(os.environ.get("GT_DIR", "/var/graphtestbed/gt"))
@@ -282,6 +282,160 @@ def healthz():
282
  })
283
 
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  if __name__ == "__main__":
286
  port = int(os.environ.get("PORT", "8080"))
287
  app.run(host="0.0.0.0", port=port)
 
36
 
37
  import pandas as pd
38
  import yaml
39
+ from flask import Flask, jsonify, render_template_string, request
40
 
41
 
42
  GT_DIR = Path(os.environ.get("GT_DIR", "/var/graphtestbed/gt"))
 
282
  })
283
 
284
 
285
+ _LANDING_TMPL = """<!doctype html>
286
+ <html lang="en">
287
+ <head>
288
+ <meta charset="utf-8">
289
+ <meta name="viewport" content="width=device-width, initial-scale=1">
290
+ <title>GraphTestbed Scoring</title>
291
+ <style>
292
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", system-ui, sans-serif;
293
+ max-width: 920px; margin: 2em auto; padding: 0 1em; color: #1f2328; line-height: 1.55; }
294
+ h1 { margin: 0 0 .15em; }
295
+ h2 { margin-top: 1.8em; padding-bottom: .25em; border-bottom: 1px solid #d0d7de; }
296
+ .subtitle { color: #57606a; margin: 0 0 1em; }
297
+ pre { background: #f6f8fa; padding: 12px 14px; border-radius: 6px;
298
+ overflow-x: auto; font-size: 13px; line-height: 1.45; }
299
+ code { background: #eaeef2; padding: 1px 5px; border-radius: 4px; font-size: 90%; }
300
+ pre code { background: transparent; padding: 0; font-size: inherit; }
301
+ table { border-collapse: collapse; width: 100%; margin: .4em 0 1.4em; font-size: 14px; }
302
+ th, td { padding: .5em .75em; text-align: left; border-bottom: 1px solid #eaeef2; }
303
+ th { background: #f6f8fa; font-weight: 600; font-size: 13px; }
304
+ td.num { text-align: right; font-variant-numeric: tabular-nums; }
305
+ td.rank { text-align: right; color: #57606a; width: 3em; }
306
+ td.rank-1 { color: #bf8700; font-weight: 700; }
307
+ .empty { color: #8b949e; font-style: italic; padding: .4em 0; font-size: 14px; }
308
+ .meta { color: #57606a; font-size: 13px; margin: .25em 0 .8em; }
309
+ .meta code { font-size: 90%; }
310
+ .pills { display: flex; gap: .5em; flex-wrap: wrap; margin: .5em 0 1.5em; }
311
+ .pill { background: #ddf4ff; color: #0969da; padding: .2em .65em; border-radius: 12px;
312
+ font-size: 12px; font-weight: 500; text-decoration: none; }
313
+ .pill:hover { background: #b6e3ff; }
314
+ .pill.gt { background: #dafbe1; color: #1a7f37; }
315
+ .pill.warn { background: #fff8c5; color: #9a6700; }
316
+ a { color: #0969da; }
317
+ footer { margin-top: 3em; padding-top: 1em; border-top: 1px solid #d0d7de;
318
+ color: #8b949e; font-size: 13px; }
319
+ </style>
320
+ </head>
321
+ <body>
322
+
323
+ <h1>📊 GraphTestbed Scoring</h1>
324
+ <p class="subtitle">
325
+ Public leaderboard for benchmarking ML/AI agent harnesses on heterogeneous graph datasets.
326
+ </p>
327
+
328
+ <div class="pills">
329
+ <span class="pill">{{ n_tasks }} tasks</span>
330
+ <span class="pill">{{ n_subs_total }} submissions</span>
331
+ <span class="pill">quota: {{ quota }}/day/IP/task</span>
332
+ <a class="pill" href="/healthz">healthz</a>
333
+ <a class="pill" href="https://github.com/zhuconv/GraphTestbed">github</a>
334
+ </div>
335
+
336
+ <h2>Quickstart</h2>
337
+ <pre><code>pip install git+https://github.com/zhuconv/GraphTestbed
338
+ gtb submit &lt;task&gt; --file preds.csv --agent &lt;your-name&gt;
339
+ gtb leaderboard &lt;task&gt;</code></pre>
340
+
341
+ <p>
342
+ Submission CSV must have exactly two columns
343
+ (<code>id_col</code>, <code>pred_col</code> per the per-task schema below)
344
+ and exactly <code>n_rows</code> data rows. Schema is checked client-side
345
+ first, so a malformed file never burns a quota slot.
346
+ Full contract: <a href="https://github.com/zhuconv/GraphTestbed/blob/main/PROTOCOL.md">PROTOCOL.md</a>.
347
+ </p>
348
+
349
+ {% for t in tasks %}
350
+ <h2 id="{{ t.name }}">{{ t.name }}</h2>
351
+ <p class="meta">
352
+ metric: <code>{{ t.metric }}</code>
353
+ {%- if t.n_rows %} · <code>{{ t.n_rows }}</code> test rows{% endif %} ·
354
+ columns: <code>[{{ t.id_col }}, {{ t.pred_col }}]</code>
355
+ {% if t.gt_present %}<span class="pill gt">GT loaded</span>
356
+ {%- else %}<span class="pill warn">GT missing</span>{% endif %}
357
+ </p>
358
+ {% if t.description %}<p>{{ t.description|trim }}</p>{% endif %}
359
+ {% if t.rows %}
360
+ <table>
361
+ <thead>
362
+ <tr><th>#</th><th>agent</th><th class="num">{{ t.metric }}</th><th class="num">subs</th><th>first&nbsp;seen</th></tr>
363
+ </thead>
364
+ <tbody>
365
+ {% for r in t.rows %}
366
+ <tr>
367
+ <td class="rank{% if loop.index == 1 %} rank-1{% endif %}">{{ loop.index }}</td>
368
+ <td><code>{{ r.agent }}</code></td>
369
+ <td class="num">{{ "%.3f"|format(r.primary) }}</td>
370
+ <td class="num">{{ r.n_subs }}</td>
371
+ <td><small>{{ r.first_seen[:19] }}Z</small></td>
372
+ </tr>
373
+ {% endfor %}
374
+ </tbody>
375
+ </table>
376
+ {% else %}
377
+ <p class="empty">no submissions yet</p>
378
+ {% endif %}
379
+ {% endfor %}
380
+
381
+ <h2>Endpoints</h2>
382
+ <pre><code>POST /submit multipart task=&amp;agent=&amp;file=&lt;csv&gt;
383
+ → {primary, secondary, leaderboard_rank, quota_remaining, ...}
384
+ GET /leaderboard/&lt;task&gt; JSON: per-agent best, sorted by primary desc
385
+ GET /healthz JSON: tasks, gt_present, quota
386
+ GET / this page</code></pre>
387
+
388
+ <footer>
389
+ Backed by Flask + sqlite at <code>/data/leaderboard.db</code>; periodic
390
+ snapshot to a private HF dataset for durability. Non-adversarial trust
391
+ model — see PROTOCOL.md.
392
+ </footer>
393
+
394
+ </body>
395
+ </html>
396
+ """
397
+
398
+
399
@app.get("/")
def landing():
    """Render the single-page landing UI: quickstart plus per-task leaderboards.

    For every task in the manifest (iterated in sorted name order) this
    gathers the submission schema fields, the primary metric name, whether
    ``GT_DIR/<task>.csv`` exists, and a per-agent leaderboard read from the
    ``submissions`` table: best ``primary_metric``, number of submissions and
    the minimum ``submitted_at``, ordered by best metric descending.

    Returns:
        The HTML produced by rendering ``_LANDING_TMPL`` with the collected
        task data, the task count, the total submission count and the quota.
    """
    manifest = _manifest()
    conn = _db()
    tasks = []
    n_subs_total = 0
    # Release the connection even when a manifest entry is malformed
    # (KeyError below) or the query raises; otherwise each failing request
    # would leave a connection open.
    try:
        for name in sorted(manifest):
            cfg = manifest[name]
            s = cfg["submission_schema"]
            rows = conn.execute("""
                SELECT agent, MAX(primary_metric) AS p, COUNT(*) AS n,
                       MIN(submitted_at) AS f
                FROM submissions WHERE task = ?
                GROUP BY agent ORDER BY p DESC
            """, (name,)).fetchall()
            n_rows_cfg = s.get("n_rows")
            tasks.append({
                "name": name,
                "description": str(cfg.get("description", "")),
                "metric": cfg["metric"]["primary"],
                "id_col": s["id_col"],
                "pred_col": s["pred_col"],
                # "TBD" and a missing value both mean "row count unknown";
                # the template hides the row-count snippet when this is None.
                "n_rows": n_rows_cfg if n_rows_cfg not in ("TBD", None) else None,
                "gt_present": (GT_DIR / f"{name}.csv").exists(),
                "rows": [{"agent": a, "primary": p, "n_subs": n, "first_seen": f}
                         for (a, p, n, f) in rows],
            })
            # Accumulate the page-wide submission total from this task's rows.
            n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
    finally:
        conn.close()

    return render_template_string(
        _LANDING_TMPL,
        tasks=tasks,
        n_tasks=len(tasks),
        n_subs_total=n_subs_total,
        quota=QUOTA_PER_DAY,
    )
437
+
438
+
439
  if __name__ == "__main__":
440
  port = int(os.environ.get("PORT", "8080"))
441
  app.run(host="0.0.0.0", port=port)