Spaces:

lanczos
/

graphtestbed

Running

Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on Apr 20

Commit

5955024

1 Parent(s): 6741538

Add Overall tab + /leaderboard JSON; stage ibm-aml + arxiv data

The Overall tab is the new first tab on the landing page: a per-agent
average across the 4 tasks, computed only over tasks the agent has actually
submitted to (so a one-task agent isn't penalised by N/A on others).
Per-task columns show individual scores (or "—" when missing). Rows are
sorted by average, descending, with ties broken by task coverage.

New JSON endpoint GET /leaderboard (no slug) returns the same data as
JSON for client/CI consumption: {tasks: [...], rows: [{agent, average,
n_tasks, per_task: {...}}, ...]}.

Manifest `files:` blocks for ibm-aml + arxiv-citation are reduced to the
four files we actually stage (train/val/test_features + sample_submission);
the original blocks listed text/category aux tables we don't materialise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

datasets/manifest.yaml +6 -24
server/api.py +108 -5

datasets/manifest.yaml CHANGED Viewed

@@ -49,21 +49,6 @@ arxiv-citation:
     test_features:
       filename: test_features.csv
       sha256: TBD
-    train_text:
-      filename: train_text.csv
-      sha256: TBD
-    val_text:
-      filename: val_text.csv
-      sha256: TBD
-    test_text:
-      filename: test_text.csv
-      sha256: TBD
-    citations:
-      filename: 3Citation.csv
-      sha256: TBD
-    paper_author:
-      filename: 6Paper_Author.csv
-      sha256: TBD
     sample_submission:
       filename: sample_submission.csv
       sha256: TBD
@@ -133,17 +118,14 @@ ibm-aml:
   hf_repo: graphtestbed/ibm-aml
   hf_revision: main
   files:
-    train_transactions:
-      filename: train_transactions.csv
-      sha256: TBD
-    val_transactions:
-      filename: val_transactions.csv
       sha256: TBD
-    test_transactions:
-      filename: test_transactions.csv
       sha256: TBD
-    accounts:
-      filename: accounts.csv
       sha256: TBD
     sample_submission:
       filename: sample_submission.csv

     test_features:
       filename: test_features.csv
       sha256: TBD
     sample_submission:
       filename: sample_submission.csv
       sha256: TBD
   hf_repo: graphtestbed/ibm-aml
   hf_revision: main
   files:
+    train_features:
+      filename: train_features.csv
       sha256: TBD
+    val_features:
+      filename: val_features.csv
       sha256: TBD
+    test_features:
+      filename: test_features.csv
       sha256: TBD
     sample_submission:
       filename: sample_submission.csv

server/api.py CHANGED Viewed

@@ -350,6 +350,39 @@ def leaderboard(task: str):
     ])
 @app.get("/healthz")
 def healthz():
     manifest = _manifest()
@@ -718,20 +751,73 @@ _LANDING_TMPL = r"""<!doctype html>
   <!-- ============== LEADERBOARD VIEW ============== -->
   <div class="leaderboard-view">
     <div class="tabs" id="task-tabs" role="tablist">
       {% for t in tasks %}
-      <button class="tab{% if loop.first %} active{% endif %}"
-              data-task="{{ t.name }}"
-              role="tab" aria-selected="{{ 'true' if loop.first else 'false' }}">
         {{ t.name }}
         <span class="badge">{{ t.rows|length }}</span>
       </button>
       {% endfor %}
     </div>
     {% for t in tasks %}
     <section class="panel" id="panel-{{ t.name }}"
-             data-task="{{ t.name }}" data-metric="{{ t.metric }}"
-             {% if not loop.first %}hidden{% endif %}>
       <div class="panel-head">
         <div class="meta">
           <strong>{{ t.name }}</strong>
@@ -1085,6 +1171,22 @@ def landing():
         n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
     conn.close()
     base_url = request.url_root.rstrip("/")
     return render_template_string(
@@ -1094,6 +1196,7 @@ def landing():
         n_subs_total=n_subs_total,
         quota=QUOTA_PER_DAY,
         base_url=base_url,
     )

     ])
+@app.get("/leaderboard")
+def leaderboard_all():
+    """Return the cross-task leaderboard as JSON.
+
+    For every agent, takes its best ``primary_metric`` per task and averages
+    those bests over the manifest tasks the agent has actually submitted to
+    (not over all tasks), so a one-task agent isn't penalized by N/A on the
+    others. Scores recorded for tasks that are not in the manifest are
+    ignored, and agents with no score on any manifest task are omitted.
+
+    Returns:
+        A JSON response shaped as ``{"tasks": [...], "rows": [...]}``, where
+        ``tasks`` is the sorted list of task names from the manifest and each
+        row is ``{"agent", "average", "n_tasks", "per_task"}``. ``per_task``
+        maps every manifest task to the agent's best score, or ``None`` when
+        the agent has no score for it. Rows are sorted by (rounded) average
+        descending; ties are broken by number of tasks covered, then by
+        agent name.
+    """
+    manifest = _manifest()
+    tasks = sorted(manifest)
+    conn = _db()
+    try:
+        rows = conn.execute("""
+            SELECT task, agent, MAX(primary_metric) as best
+            FROM submissions
+            GROUP BY task, agent
+        """).fetchall()
+    finally:
+        # Release the connection even if the query raises.
+        conn.close()
+    by_agent: dict[str, dict[str, float]] = {}
+    for task, agent, best in rows:
+        if best is None:
+            # MAX() is NULL when every primary_metric in the group is NULL;
+            # treat that the same as "no score" instead of crashing in float().
+            continue
+        by_agent.setdefault(agent, {})[task] = float(best)
+    out = []
+    for agent, scores in by_agent.items():
+        # Only manifest tasks count toward the average and the coverage.
+        covered = [t for t in tasks if t in scores]
+        if not covered:
+            continue
+        avg = sum(scores[t] for t in covered) / len(covered)
+        out.append({
+            "agent": agent,
+            "average": round(avg, 3),
+            "n_tasks": len(covered),
+            "per_task": {t: scores.get(t) for t in tasks},
+        })
+    out.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
+    return jsonify({"tasks": tasks, "rows": out})
 @app.get("/healthz")
 def healthz():
     manifest = _manifest()
   <!-- ============== LEADERBOARD VIEW ============== -->
   <div class="leaderboard-view">
     <div class="tabs" id="task-tabs" role="tablist">
+      <button class="tab active" data-task="overall" role="tab" aria-selected="true">
+        Overall
+        <span class="badge">{{ overall_rows|length }}</span>
+      </button>
       {% for t in tasks %}
+      <button class="tab" data-task="{{ t.name }}" role="tab" aria-selected="false">
         {{ t.name }}
         <span class="badge">{{ t.rows|length }}</span>
       </button>
       {% endfor %}
     </div>
+    <section class="panel" id="panel-overall"
+             data-task="overall" data-metric="average">
+      <div class="panel-head">
+        <div class="meta">
+          <strong>Overall</strong>
+          Average across the {{ n_tasks }} tasks. An agent's average is
+          taken over the tasks they've actually submitted to (not over all
+          tasks), so a one-task agent isn't penalised by N/A on others —
+          the <code>tasks</code> column shows coverage.
+        </div>
+        <div class="pills">
+          <span class="pill metric">average</span>
+          <span class="pill muted">{{ overall_rows|length }} agents</span>
+        </div>
+      </div>
+      <div class="table-wrap">
+        <table class="lb" data-table-for="overall">
+          <thead>
+            <tr>
+              <th class="rank" data-sort="rank">#</th>
+              <th data-sort="agent">Agent</th>
+              <th class="num sorted" data-sort="primary">average <span class="arrow">&#9662;</span></th>
+              <th class="num" data-sort="n_submissions">tasks</th>
+              {% for t in tasks %}
+              <th class="num" data-sort="{{ t.name }}">{{ t.name }}</th>
+              {% endfor %}
+            </tr>
+          </thead>
+          <tbody>
+            {% if overall_rows %}
+              {% for r in overall_rows %}
+              <tr data-agent="{{ r.agent }}">
+                <td class="rank{% if loop.index == 1 %} r1{% elif loop.index == 2 %} r2{% elif loop.index == 3 %} r3{% endif %}">{{ loop.index }}</td>
+                <td class="agent">{{ r.agent }}</td>
+                <td class="score">{{ "%.3f"|format(r.average) }}</td>
+                <td class="num">{{ r.n_tasks }} / {{ n_tasks }}</td>
+                {% for t in tasks %}
+                <td class="num">
+                  {% set v = r.per_task[t.name] %}
+                  {% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %}
+                </td>
+                {% endfor %}
+              </tr>
+              {% endfor %}
+            {% else %}
+              <tr class="empty-row"><td colspan="{{ 4 + n_tasks }}">No submissions yet — be the first to submit.</td></tr>
+            {% endif %}
+          </tbody>
+        </table>
+      </div>
+    </section>
     {% for t in tasks %}
     <section class="panel" id="panel-{{ t.name }}"
+             data-task="{{ t.name }}" data-metric="{{ t.metric }}" hidden>
       <div class="panel-head">
         <div class="meta">
           <strong>{{ t.name }}</strong>
         n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
     conn.close()
+    # Cross-task average per agent (only over tasks they've submitted to).
+    by_agent: dict[str, dict[str, float]] = {}
+    for t in tasks:
+        for r in t["rows"]:
+            by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
+    overall_rows = []
+    for agent, scores in by_agent.items():
+        avg = sum(scores.values()) / len(scores)
+        overall_rows.append({
+            "agent": agent,
+            "average": round(avg, 3),
+            "n_tasks": len(scores),
+            "per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
+        })
+    overall_rows.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
     base_url = request.url_root.rstrip("/")
     return render_template_string(
         n_subs_total=n_subs_total,
         quota=QUOTA_PER_DAY,
         base_url=base_url,
+        overall_rows=overall_rows,
     )