Spaces:
Running
Running
Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit ·
ab28b31
1
Parent(s): bf48fd7
Overall: only complete agents get an average; rest sink to bottom
Browse files
Previously, an agent with one task scored had its single score shown as
the average — making 'autopipe-v2 — figraph 0.824' appear ranked above
fully-covered agents with averages in the 0.5s. That hides the real
ranking from anyone scanning the page.
New rule: average is computed only when the agent has scored on every
task in the manifest. Incomplete agents render '—' and sort to the
bottom (still ordered among themselves by # tasks covered, then name).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- server/api.py +30 -11
server/api.py
CHANGED
|
@@ -411,9 +411,9 @@ def leaderboard(task: str):
|
|
| 411 |
|
| 412 |
@app.get("/leaderboard")
|
| 413 |
def leaderboard_all():
|
| 414 |
-
"""Cross-task average per agent.
|
| 415 |
-
|
| 416 |
-
|
| 417 |
manifest = _manifest()
|
| 418 |
tasks = sorted(manifest)
|
| 419 |
conn = _db()
|
|
@@ -432,14 +432,22 @@ def leaderboard_all():
|
|
| 432 |
covered = [t for t in tasks if t in scores]
|
| 433 |
if not covered:
|
| 434 |
continue
|
| 435 |
-
|
|
|
|
| 436 |
out.append({
|
| 437 |
"agent": agent,
|
| 438 |
-
"average": round(avg, 3),
|
| 439 |
"n_tasks": len(covered),
|
| 440 |
"per_task": {t: scores.get(t) for t in tasks},
|
| 441 |
})
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
return jsonify({"tasks": tasks, "rows": out})
|
| 444 |
|
| 445 |
|
|
@@ -960,7 +968,9 @@ _LANDING_TMPL = r"""<!doctype html>
|
|
| 960 |
{% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %}
|
| 961 |
</td>
|
| 962 |
{% endfor %}
|
| 963 |
-
<td class="score">
|
|
|
|
|
|
|
| 964 |
</tr>
|
| 965 |
{% endfor %}
|
| 966 |
{% else %}
|
|
@@ -1332,21 +1342,30 @@ def landing():
|
|
| 1332 |
n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
|
| 1333 |
conn.close()
|
| 1334 |
|
| 1335 |
-
# Cross-task average per agent
|
|
|
|
|
|
|
| 1336 |
by_agent: dict[str, dict[str, float]] = {}
|
| 1337 |
for t in tasks:
|
| 1338 |
for r in t["rows"]:
|
| 1339 |
by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
|
| 1340 |
overall_rows = []
|
|
|
|
| 1341 |
for agent, scores in by_agent.items():
|
| 1342 |
-
|
|
|
|
| 1343 |
overall_rows.append({
|
| 1344 |
"agent": agent,
|
| 1345 |
-
"average":
|
| 1346 |
"n_tasks": len(scores),
|
| 1347 |
"per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
|
| 1348 |
})
|
| 1349 |
-
overall_rows.sort(key=lambda r: (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1350 |
|
| 1351 |
base_url = request.url_root.rstrip("/")
|
| 1352 |
|
|
|
|
| 411 |
|
| 412 |
@app.get("/leaderboard")
|
| 413 |
def leaderboard_all():
|
| 414 |
+
"""Cross-task average per agent. The average is only computed for agents
|
| 415 |
+
that have a score on every task — an incomplete agent shows '—' and ranks
|
| 416 |
+
below all complete ones (ties broken by agent name for stability)."""
|
| 417 |
manifest = _manifest()
|
| 418 |
tasks = sorted(manifest)
|
| 419 |
conn = _db()
|
|
|
|
| 432 |
covered = [t for t in tasks if t in scores]
|
| 433 |
if not covered:
|
| 434 |
continue
|
| 435 |
+
complete = len(covered) == len(tasks)
|
| 436 |
+
avg = sum(scores[t] for t in covered) / len(covered) if complete else None
|
| 437 |
out.append({
|
| 438 |
"agent": agent,
|
| 439 |
+
"average": round(avg, 3) if avg is not None else None,
|
| 440 |
"n_tasks": len(covered),
|
| 441 |
"per_task": {t: scores.get(t) for t in tasks},
|
| 442 |
})
|
| 443 |
+
# Complete agents first (sorted by average desc), then incomplete ones at
|
| 444 |
+
# the bottom (sorted by # tasks covered desc, then name).
|
| 445 |
+
out.sort(key=lambda r: (
|
| 446 |
+
0 if r["average"] is not None else 1,
|
| 447 |
+
-(r["average"] if r["average"] is not None else 0),
|
| 448 |
+
-r["n_tasks"],
|
| 449 |
+
r["agent"],
|
| 450 |
+
))
|
| 451 |
return jsonify({"tasks": tasks, "rows": out})
|
| 452 |
|
| 453 |
|
|
|
|
| 968 |
{% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">—</span>{% endif %}
|
| 969 |
</td>
|
| 970 |
{% endfor %}
|
| 971 |
+
<td class="score">
|
| 972 |
+
{% if r.average is not none %}{{ "%.3f"|format(r.average) }}{% else %}<span class="muted">—</span>{% endif %}
|
| 973 |
+
</td>
|
| 974 |
</tr>
|
| 975 |
{% endfor %}
|
| 976 |
{% else %}
|
|
|
|
| 1342 |
n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
|
| 1343 |
conn.close()
|
| 1344 |
|
| 1345 |
+
# Cross-task average per agent. Average is only computed for agents that
|
| 1346 |
+
# have a score on every task — anyone incomplete shows '—' and ranks
|
| 1347 |
+
# below all complete agents (matches the /leaderboard JSON behavior).
|
| 1348 |
by_agent: dict[str, dict[str, float]] = {}
|
| 1349 |
for t in tasks:
|
| 1350 |
for r in t["rows"]:
|
| 1351 |
by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
|
| 1352 |
overall_rows = []
|
| 1353 |
+
n_total = len(tasks)
|
| 1354 |
for agent, scores in by_agent.items():
|
| 1355 |
+
complete = len(scores) == n_total
|
| 1356 |
+
avg = round(sum(scores.values()) / len(scores), 3) if complete else None
|
| 1357 |
overall_rows.append({
|
| 1358 |
"agent": agent,
|
| 1359 |
+
"average": avg,
|
| 1360 |
"n_tasks": len(scores),
|
| 1361 |
"per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
|
| 1362 |
})
|
| 1363 |
+
overall_rows.sort(key=lambda r: (
|
| 1364 |
+
0 if r["average"] is not None else 1,
|
| 1365 |
+
-(r["average"] if r["average"] is not None else 0),
|
| 1366 |
+
-r["n_tasks"],
|
| 1367 |
+
r["agent"],
|
| 1368 |
+
))
|
| 1369 |
|
| 1370 |
base_url = request.url_root.rstrip("/")
|
| 1371 |
|