Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit
5955024
·
1 Parent(s): 6741538

Add Overall tab + /leaderboard JSON; stage ibm-aml + arxiv data

Browse files

Overall tab is the new first tab on the landing: per-agent average across
the 4 tasks, computed only over tasks the agent has actually submitted to
(so a one-task agent isn't penalised by N/A on others). Per-task columns
show individual scores (or — when missing). Sorted by average desc, ties
broken by task coverage.

New JSON endpoint GET /leaderboard (no slug) returns rows of the same
shape as the Overall tab, as JSON for client/CI consumption: {tasks: [...],
rows: [{agent, average, n_tasks, per_task: {...}}, ...]}.

Manifest `files:` blocks for ibm-aml and arxiv-citation are reduced to the
four files we actually stage (train/val/test_features + sample_submission);
the original blocks listed text/category aux tables we don't materialise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show
  1. datasets/manifest.yaml +6 -24
  2. server/api.py +108 -5
datasets/manifest.yaml CHANGED
@@ -49,21 +49,6 @@ arxiv-citation:
49
  test_features:
50
  filename: test_features.csv
51
  sha256: TBD
52
- train_text:
53
- filename: train_text.csv
54
- sha256: TBD
55
- val_text:
56
- filename: val_text.csv
57
- sha256: TBD
58
- test_text:
59
- filename: test_text.csv
60
- sha256: TBD
61
- citations:
62
- filename: 3Citation.csv
63
- sha256: TBD
64
- paper_author:
65
- filename: 6Paper_Author.csv
66
- sha256: TBD
67
  sample_submission:
68
  filename: sample_submission.csv
69
  sha256: TBD
@@ -133,17 +118,14 @@ ibm-aml:
133
  hf_repo: graphtestbed/ibm-aml
134
  hf_revision: main
135
  files:
136
- train_transactions:
137
- filename: train_transactions.csv
138
- sha256: TBD
139
- val_transactions:
140
- filename: val_transactions.csv
141
  sha256: TBD
142
- test_transactions:
143
- filename: test_transactions.csv
144
  sha256: TBD
145
- accounts:
146
- filename: accounts.csv
147
  sha256: TBD
148
  sample_submission:
149
  filename: sample_submission.csv
 
49
  test_features:
50
  filename: test_features.csv
51
  sha256: TBD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  sample_submission:
53
  filename: sample_submission.csv
54
  sha256: TBD
 
118
  hf_repo: graphtestbed/ibm-aml
119
  hf_revision: main
120
  files:
121
+ train_features:
122
+ filename: train_features.csv
 
 
 
123
  sha256: TBD
124
+ val_features:
125
+ filename: val_features.csv
126
  sha256: TBD
127
+ test_features:
128
+ filename: test_features.csv
129
  sha256: TBD
130
  sample_submission:
131
  filename: sample_submission.csv
server/api.py CHANGED
@@ -350,6 +350,39 @@ def leaderboard(task: str):
350
  ])
351
 
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  @app.get("/healthz")
354
  def healthz():
355
  manifest = _manifest()
@@ -718,20 +751,73 @@ _LANDING_TMPL = r"""<!doctype html>
718
  <!-- ============== LEADERBOARD VIEW ============== -->
719
  <div class="leaderboard-view">
720
  <div class="tabs" id="task-tabs" role="tablist">
 
 
 
 
721
  {% for t in tasks %}
722
- <button class="tab{% if loop.first %} active{% endif %}"
723
- data-task="{{ t.name }}"
724
- role="tab" aria-selected="{{ 'true' if loop.first else 'false' }}">
725
  {{ t.name }}
726
  <span class="badge">{{ t.rows|length }}</span>
727
  </button>
728
  {% endfor %}
729
  </div>
730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  {% for t in tasks %}
732
  <section class="panel" id="panel-{{ t.name }}"
733
- data-task="{{ t.name }}" data-metric="{{ t.metric }}"
734
- {% if not loop.first %}hidden{% endif %}>
735
  <div class="panel-head">
736
  <div class="meta">
737
  <strong>{{ t.name }}</strong>
@@ -1085,6 +1171,22 @@ def landing():
1085
  n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
1086
  conn.close()
1087
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1088
  base_url = request.url_root.rstrip("/")
1089
 
1090
  return render_template_string(
@@ -1094,6 +1196,7 @@ def landing():
1094
  n_subs_total=n_subs_total,
1095
  quota=QUOTA_PER_DAY,
1096
  base_url=base_url,
 
1097
  )
1098
 
1099
 
 
350
  ])
351
 
352
 
353
@app.get("/leaderboard")
def leaderboard_all():
    """Return the cross-task leaderboard as JSON.

    For every agent, take its best ``primary_metric`` per task and average
    those scores over the tasks the agent has actually submitted to (not
    over all tasks), so a one-task agent isn't penalized by N/A on others.

    Response shape::

        {"tasks": [...],
         "rows": [{"agent", "average", "n_tasks", "per_task": {...}}, ...]}

    ``per_task`` has one key per manifest task; the value is ``None`` where
    the agent has no score. Rows are sorted by average (descending), then
    by number of tasks covered (descending), then by agent name.
    """
    manifest = _manifest()
    tasks = sorted(manifest)
    conn = _db()
    try:
        rows = conn.execute("""
            SELECT task, agent, MAX(primary_metric) as best
            FROM submissions
            GROUP BY task, agent
        """).fetchall()
    finally:
        # Release the connection even when the query itself raises.
        conn.close()
    by_agent: dict[str, dict[str, float]] = {}
    for task, agent, best in rows:
        # MAX() yields NULL when a group has no non-NULL metric; treat that
        # as "no score" instead of crashing in float(None).
        if best is None:
            continue
        by_agent.setdefault(agent, {})[task] = float(best)
    out = []
    for agent, scores in by_agent.items():
        # Only tasks still present in the manifest count toward the average.
        covered = [t for t in tasks if t in scores]
        if not covered:
            continue
        avg = sum(scores[t] for t in covered) / len(covered)
        out.append({
            "agent": agent,
            "average": round(avg, 3),
            "n_tasks": len(covered),
            "per_task": {t: scores.get(t) for t in tasks},
        })
    out.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
    return jsonify({"tasks": tasks, "rows": out})
384
+
385
+
386
  @app.get("/healthz")
387
  def healthz():
388
  manifest = _manifest()
 
751
  <!-- ============== LEADERBOARD VIEW ============== -->
752
  <div class="leaderboard-view">
753
  <div class="tabs" id="task-tabs" role="tablist">
754
+ <button class="tab active" data-task="overall" role="tab" aria-selected="true">
755
+ Overall
756
+ <span class="badge">{{ overall_rows|length }}</span>
757
+ </button>
758
  {% for t in tasks %}
759
+ <button class="tab" data-task="{{ t.name }}" role="tab" aria-selected="false">
 
 
760
  {{ t.name }}
761
  <span class="badge">{{ t.rows|length }}</span>
762
  </button>
763
  {% endfor %}
764
  </div>
765
 
766
+ <section class="panel" id="panel-overall"
767
+ data-task="overall" data-metric="average">
768
+ <div class="panel-head">
769
+ <div class="meta">
770
+ <strong>Overall</strong>
771
+ Average across the {{ n_tasks }} tasks. An agent's average is
772
+ taken over the tasks they've actually submitted to (not over all
773
+ tasks), so a one-task agent isn't penalised by N/A on others —
774
+ the <code>tasks</code> column shows coverage.
775
+ </div>
776
+ <div class="pills">
777
+ <span class="pill metric">average</span>
778
+ <span class="pill muted">{{ overall_rows|length }} agents</span>
779
+ </div>
780
+ </div>
781
+ <div class="table-wrap">
782
+ <table class="lb" data-table-for="overall">
783
+ <thead>
784
+ <tr>
785
+ <th class="rank" data-sort="rank">#</th>
786
+ <th data-sort="agent">Agent</th>
787
+ <th class="num sorted" data-sort="primary">average <span class="arrow">&#9662;</span></th>
788
+ <th class="num" data-sort="n_submissions">tasks</th>
789
+ {% for t in tasks %}
790
+ <th class="num" data-sort="{{ t.name }}">{{ t.name }}</th>
791
+ {% endfor %}
792
+ </tr>
793
+ </thead>
794
+ <tbody>
795
+ {% if overall_rows %}
796
+ {% for r in overall_rows %}
797
+ <tr data-agent="{{ r.agent }}">
798
+ <td class="rank{% if loop.index == 1 %} r1{% elif loop.index == 2 %} r2{% elif loop.index == 3 %} r3{% endif %}">{{ loop.index }}</td>
799
+ <td class="agent">{{ r.agent }}</td>
800
+ <td class="score">{{ "%.3f"|format(r.average) }}</td>
801
+ <td class="num">{{ r.n_tasks }} / {{ n_tasks }}</td>
802
+ {% for t in tasks %}
803
+ <td class="num">
804
+ {% set v = r.per_task[t.name] %}
805
+ {% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %}
806
+ </td>
807
+ {% endfor %}
808
+ </tr>
809
+ {% endfor %}
810
+ {% else %}
811
+ <tr class="empty-row"><td colspan="{{ 4 + n_tasks }}">No submissions yet — be the first to submit.</td></tr>
812
+ {% endif %}
813
+ </tbody>
814
+ </table>
815
+ </div>
816
+ </section>
817
+
818
  {% for t in tasks %}
819
  <section class="panel" id="panel-{{ t.name }}"
820
+ data-task="{{ t.name }}" data-metric="{{ t.metric }}" hidden>
 
821
  <div class="panel-head">
822
  <div class="meta">
823
  <strong>{{ t.name }}</strong>
 
1171
  n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
1172
  conn.close()
1173
 
1174
+ # Cross-task average per agent (only over tasks they've submitted to).
1175
+ by_agent: dict[str, dict[str, float]] = {}
1176
+ for t in tasks:
1177
+ for r in t["rows"]:
1178
+ by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
1179
+ overall_rows = []
1180
+ for agent, scores in by_agent.items():
1181
+ avg = sum(scores.values()) / len(scores)
1182
+ overall_rows.append({
1183
+ "agent": agent,
1184
+ "average": round(avg, 3),
1185
+ "n_tasks": len(scores),
1186
+ "per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
1187
+ })
1188
+ overall_rows.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
1189
+
1190
  base_url = request.url_root.rstrip("/")
1191
 
1192
  return render_template_string(
 
1196
  n_subs_total=n_subs_total,
1197
  quota=QUOTA_PER_DAY,
1198
  base_url=base_url,
1199
+ overall_rows=overall_rows,
1200
  )
1201
 
1202