Spaces:
Running
Add Overall tab + /leaderboard JSON; stage ibm-aml + arxiv data
Browse files
Overall tab is the new first tab on the landing page: per-agent average across
the 4 tasks, computed only over tasks the agent has actually submitted to
(so a one-task agent isn't penalised by N/A on others). Per-task columns
show individual scores (or — when missing). Sorted by average desc, ties
broken by task coverage.
New JSON endpoint GET /leaderboard (no slug) returns the same data as the
Overall tab, as JSON for client/CI consumption: {tasks: [...], rows: [{agent,
average, n_tasks, per_task: {...}}, ...]}.
Manifest `files:` blocks for ibm-aml + arxiv-citation are reduced to the four
files we actually stage (train/val/test_features + sample_submission);
the original blocks listed text/category aux tables we don't materialise.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- datasets/manifest.yaml +6 -24
- server/api.py +108 -5
|
@@ -49,21 +49,6 @@ arxiv-citation:
|
|
| 49 |
test_features:
|
| 50 |
filename: test_features.csv
|
| 51 |
sha256: TBD
|
| 52 |
-
train_text:
|
| 53 |
-
filename: train_text.csv
|
| 54 |
-
sha256: TBD
|
| 55 |
-
val_text:
|
| 56 |
-
filename: val_text.csv
|
| 57 |
-
sha256: TBD
|
| 58 |
-
test_text:
|
| 59 |
-
filename: test_text.csv
|
| 60 |
-
sha256: TBD
|
| 61 |
-
citations:
|
| 62 |
-
filename: 3Citation.csv
|
| 63 |
-
sha256: TBD
|
| 64 |
-
paper_author:
|
| 65 |
-
filename: 6Paper_Author.csv
|
| 66 |
-
sha256: TBD
|
| 67 |
sample_submission:
|
| 68 |
filename: sample_submission.csv
|
| 69 |
sha256: TBD
|
|
@@ -133,17 +118,14 @@ ibm-aml:
|
|
| 133 |
hf_repo: graphtestbed/ibm-aml
|
| 134 |
hf_revision: main
|
| 135 |
files:
|
| 136 |
-
|
| 137 |
-
filename:
|
| 138 |
-
sha256: TBD
|
| 139 |
-
val_transactions:
|
| 140 |
-
filename: val_transactions.csv
|
| 141 |
sha256: TBD
|
| 142 |
-
|
| 143 |
-
filename:
|
| 144 |
sha256: TBD
|
| 145 |
-
|
| 146 |
-
filename:
|
| 147 |
sha256: TBD
|
| 148 |
sample_submission:
|
| 149 |
filename: sample_submission.csv
|
|
|
|
| 49 |
test_features:
|
| 50 |
filename: test_features.csv
|
| 51 |
sha256: TBD
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
sample_submission:
|
| 53 |
filename: sample_submission.csv
|
| 54 |
sha256: TBD
|
|
|
|
| 118 |
hf_repo: graphtestbed/ibm-aml
|
| 119 |
hf_revision: main
|
| 120 |
files:
|
| 121 |
+
train_features:
|
| 122 |
+
filename: train_features.csv
|
|
|
|
|
|
|
|
|
|
| 123 |
sha256: TBD
|
| 124 |
+
val_features:
|
| 125 |
+
filename: val_features.csv
|
| 126 |
sha256: TBD
|
| 127 |
+
test_features:
|
| 128 |
+
filename: test_features.csv
|
| 129 |
sha256: TBD
|
| 130 |
sample_submission:
|
| 131 |
filename: sample_submission.csv
|
|
@@ -350,6 +350,39 @@ def leaderboard(task: str):
|
|
| 350 |
])
|
| 351 |
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
@app.get("/healthz")
|
| 354 |
def healthz():
|
| 355 |
manifest = _manifest()
|
|
@@ -718,20 +751,73 @@ _LANDING_TMPL = r"""<!doctype html>
|
|
| 718 |
<!-- ============== LEADERBOARD VIEW ============== -->
|
| 719 |
<div class="leaderboard-view">
|
| 720 |
<div class="tabs" id="task-tabs" role="tablist">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
{% for t in tasks %}
|
| 722 |
-
<button class="tab{
|
| 723 |
-
data-task="{{ t.name }}"
|
| 724 |
-
role="tab" aria-selected="{{ 'true' if loop.first else 'false' }}">
|
| 725 |
{{ t.name }}
|
| 726 |
<span class="badge">{{ t.rows|length }}</span>
|
| 727 |
</button>
|
| 728 |
{% endfor %}
|
| 729 |
</div>
|
| 730 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 731 |
{% for t in tasks %}
|
| 732 |
<section class="panel" id="panel-{{ t.name }}"
|
| 733 |
-
data-task="{{ t.name }}" data-metric="{{ t.metric }}"
|
| 734 |
-
{% if not loop.first %}hidden{% endif %}>
|
| 735 |
<div class="panel-head">
|
| 736 |
<div class="meta">
|
| 737 |
<strong>{{ t.name }}</strong>
|
|
@@ -1085,6 +1171,22 @@ def landing():
|
|
| 1085 |
n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
|
| 1086 |
conn.close()
|
| 1087 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
base_url = request.url_root.rstrip("/")
|
| 1089 |
|
| 1090 |
return render_template_string(
|
|
@@ -1094,6 +1196,7 @@ def landing():
|
|
| 1094 |
n_subs_total=n_subs_total,
|
| 1095 |
quota=QUOTA_PER_DAY,
|
| 1096 |
base_url=base_url,
|
|
|
|
| 1097 |
)
|
| 1098 |
|
| 1099 |
|
|
|
|
| 350 |
])
|
| 351 |
|
| 352 |
|
| 353 |
+
@app.get("/leaderboard")
def leaderboard_all():
    """Return the cross-task leaderboard as JSON.

    For every agent, the best ``primary_metric`` per task is averaged over
    the manifest tasks the agent has a score for (not over all tasks), so a
    one-task agent isn't penalized by N/A on others.

    Returns:
        A JSON response ``{"tasks": [...], "rows": [...]}``. ``tasks`` is the
        sorted list of manifest entries. Each row is ``{"agent", "average",
        "n_tasks", "per_task"}``, where ``per_task`` maps every task to the
        agent's best score, or ``None`` when the agent has no score for it.
        Rows are sorted by average descending, then by number of tasks
        covered descending, then by agent name.
    """
    manifest = _manifest()
    tasks = sorted(manifest)

    conn = _db()
    try:
        # NOTE(review): MAX() treats a higher primary_metric as better for
        # every task -- confirm no task uses a lower-is-better metric.
        rows = conn.execute("""
            SELECT task, agent, MAX(primary_metric) as best
            FROM submissions
            GROUP BY task, agent
        """).fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    by_agent: dict[str, dict[str, float]] = {}
    for task, agent, best in rows:
        if best is None:
            # MAX() yields NULL when every submission in the group has a NULL
            # primary_metric; float(None) would raise TypeError, so treat the
            # task as not covered by this agent instead.
            continue
        by_agent.setdefault(agent, {})[task] = float(best)

    out = []
    for agent, scores in by_agent.items():
        # Only tasks present in the manifest count toward the average; scores
        # for tasks that are not listed there are ignored.
        covered = [t for t in tasks if t in scores]
        if not covered:
            continue
        avg = sum(scores[t] for t in covered) / len(covered)
        out.append({
            "agent": agent,
            "average": round(avg, 3),
            "n_tasks": len(covered),
            "per_task": {t: scores.get(t) for t in tasks},
        })

    # Sort on the rounded average so that agents showing the same value fall
    # through to the coverage tie-break, then to the agent name for stability.
    out.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
    return jsonify({"tasks": tasks, "rows": out})
|
| 384 |
+
|
| 385 |
+
|
| 386 |
@app.get("/healthz")
|
| 387 |
def healthz():
|
| 388 |
manifest = _manifest()
|
|
|
|
| 751 |
<!-- ============== LEADERBOARD VIEW ============== -->
|
| 752 |
<div class="leaderboard-view">
|
| 753 |
<div class="tabs" id="task-tabs" role="tablist">
|
| 754 |
+
<button class="tab active" data-task="overall" role="tab" aria-selected="true">
|
| 755 |
+
Overall
|
| 756 |
+
<span class="badge">{{ overall_rows|length }}</span>
|
| 757 |
+
</button>
|
| 758 |
{% for t in tasks %}
|
| 759 |
+
<button class="tab" data-task="{{ t.name }}" role="tab" aria-selected="false">
|
|
|
|
|
|
|
| 760 |
{{ t.name }}
|
| 761 |
<span class="badge">{{ t.rows|length }}</span>
|
| 762 |
</button>
|
| 763 |
{% endfor %}
|
| 764 |
</div>
|
| 765 |
|
| 766 |
+
<section class="panel" id="panel-overall"
|
| 767 |
+
data-task="overall" data-metric="average">
|
| 768 |
+
<div class="panel-head">
|
| 769 |
+
<div class="meta">
|
| 770 |
+
<strong>Overall</strong>
|
| 771 |
+
Average across the {{ n_tasks }} tasks. An agent's average is
|
| 772 |
+
taken over the tasks they've actually submitted to (not over all
|
| 773 |
+
tasks), so a one-task agent isn't penalised by N/A on others —
|
| 774 |
+
the <code>tasks</code> column shows coverage.
|
| 775 |
+
</div>
|
| 776 |
+
<div class="pills">
|
| 777 |
+
<span class="pill metric">average</span>
|
| 778 |
+
<span class="pill muted">{{ overall_rows|length }} agents</span>
|
| 779 |
+
</div>
|
| 780 |
+
</div>
|
| 781 |
+
<div class="table-wrap">
|
| 782 |
+
<table class="lb" data-table-for="overall">
|
| 783 |
+
<thead>
|
| 784 |
+
<tr>
|
| 785 |
+
<th class="rank" data-sort="rank">#</th>
|
| 786 |
+
<th data-sort="agent">Agent</th>
|
| 787 |
+
<th class="num sorted" data-sort="primary">average <span class="arrow">▾</span></th>
|
| 788 |
+
<th class="num" data-sort="n_submissions">tasks</th>
|
| 789 |
+
{% for t in tasks %}
|
| 790 |
+
<th class="num" data-sort="{{ t.name }}">{{ t.name }}</th>
|
| 791 |
+
{% endfor %}
|
| 792 |
+
</tr>
|
| 793 |
+
</thead>
|
| 794 |
+
<tbody>
|
| 795 |
+
{% if overall_rows %}
|
| 796 |
+
{% for r in overall_rows %}
|
| 797 |
+
<tr data-agent="{{ r.agent }}">
|
| 798 |
+
<td class="rank{% if loop.index == 1 %} r1{% elif loop.index == 2 %} r2{% elif loop.index == 3 %} r3{% endif %}">{{ loop.index }}</td>
|
| 799 |
+
<td class="agent">{{ r.agent }}</td>
|
| 800 |
+
<td class="score">{{ "%.3f"|format(r.average) }}</td>
|
| 801 |
+
<td class="num">{{ r.n_tasks }} / {{ n_tasks }}</td>
|
| 802 |
+
{% for t in tasks %}
|
| 803 |
+
<td class="num">
|
| 804 |
+
{% set v = r.per_task[t.name] %}
|
| 805 |
+
{% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %}
|
| 806 |
+
</td>
|
| 807 |
+
{% endfor %}
|
| 808 |
+
</tr>
|
| 809 |
+
{% endfor %}
|
| 810 |
+
{% else %}
|
| 811 |
+
<tr class="empty-row"><td colspan="{{ 4 + n_tasks }}">No submissions yet — be the first to submit.</td></tr>
|
| 812 |
+
{% endif %}
|
| 813 |
+
</tbody>
|
| 814 |
+
</table>
|
| 815 |
+
</div>
|
| 816 |
+
</section>
|
| 817 |
+
|
| 818 |
{% for t in tasks %}
|
| 819 |
<section class="panel" id="panel-{{ t.name }}"
|
| 820 |
+
data-task="{{ t.name }}" data-metric="{{ t.metric }}" hidden>
|
|
|
|
| 821 |
<div class="panel-head">
|
| 822 |
<div class="meta">
|
| 823 |
<strong>{{ t.name }}</strong>
|
|
|
|
| 1171 |
n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
|
| 1172 |
conn.close()
|
| 1173 |
|
| 1174 |
+
# Cross-task average per agent (only over tasks they've submitted to).
|
| 1175 |
+
by_agent: dict[str, dict[str, float]] = {}
|
| 1176 |
+
for t in tasks:
|
| 1177 |
+
for r in t["rows"]:
|
| 1178 |
+
by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
|
| 1179 |
+
overall_rows = []
|
| 1180 |
+
for agent, scores in by_agent.items():
|
| 1181 |
+
avg = sum(scores.values()) / len(scores)
|
| 1182 |
+
overall_rows.append({
|
| 1183 |
+
"agent": agent,
|
| 1184 |
+
"average": round(avg, 3),
|
| 1185 |
+
"n_tasks": len(scores),
|
| 1186 |
+
"per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
|
| 1187 |
+
})
|
| 1188 |
+
overall_rows.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
|
| 1189 |
+
|
| 1190 |
base_url = request.url_root.rstrip("/")
|
| 1191 |
|
| 1192 |
return render_template_string(
|
|
|
|
| 1196 |
n_subs_total=n_subs_total,
|
| 1197 |
quota=QUOTA_PER_DAY,
|
| 1198 |
base_url=base_url,
|
| 1199 |
+
overall_rows=overall_rows,
|
| 1200 |
)
|
| 1201 |
|
| 1202 |
|