Zhu Jiajun (jz28583) Claude Opus 4.7 (1M context) committed on
Commit
ab28b31
·
1 Parent(s): bf48fd7

Overall: only complete agents get an average; rest sink to bottom

Browse files

Previously, an agent with one task scored had its single score shown as
the average — making 'autopipe-v2 → figraph 0.824' appear ranked above
fully-covered agents with averages in the 0.5s. That hides the real
ranking from anyone scanning the page.

New rule: average is computed only when the agent has scored on every
task in the manifest. Incomplete agents render '—' and sort to the
bottom (still ordered among themselves by # tasks covered, then name).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. server/api.py +30 -11
server/api.py CHANGED
@@ -411,9 +411,9 @@ def leaderboard(task: str):
411
 
412
  @app.get("/leaderboard")
413
  def leaderboard_all():
414
- """Cross-task average per agent. Average is over tasks the agent has
415
- submitted to (not over all tasks), so a one-task agent isn't penalized
416
- by N/A on others. Sorted by average desc; ties broken by # tasks covered."""
417
  manifest = _manifest()
418
  tasks = sorted(manifest)
419
  conn = _db()
@@ -432,14 +432,22 @@ def leaderboard_all():
432
  covered = [t for t in tasks if t in scores]
433
  if not covered:
434
  continue
435
- avg = sum(scores[t] for t in covered) / len(covered)
 
436
  out.append({
437
  "agent": agent,
438
- "average": round(avg, 3),
439
  "n_tasks": len(covered),
440
  "per_task": {t: scores.get(t) for t in tasks},
441
  })
442
- out.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
 
 
 
 
 
 
 
443
  return jsonify({"tasks": tasks, "rows": out})
444
 
445
 
@@ -960,7 +968,9 @@ _LANDING_TMPL = r"""<!doctype html>
960
  {% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">β€”</span>{% endif %}
961
  </td>
962
  {% endfor %}
963
- <td class="score">{{ "%.3f"|format(r.average) }}</td>
 
 
964
  </tr>
965
  {% endfor %}
966
  {% else %}
@@ -1332,21 +1342,30 @@ def landing():
1332
  n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
1333
  conn.close()
1334
 
1335
- # Cross-task average per agent (only over tasks they've submitted to).
 
 
1336
  by_agent: dict[str, dict[str, float]] = {}
1337
  for t in tasks:
1338
  for r in t["rows"]:
1339
  by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
1340
  overall_rows = []
 
1341
  for agent, scores in by_agent.items():
1342
- avg = sum(scores.values()) / len(scores)
 
1343
  overall_rows.append({
1344
  "agent": agent,
1345
- "average": round(avg, 3),
1346
  "n_tasks": len(scores),
1347
  "per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
1348
  })
1349
- overall_rows.sort(key=lambda r: (-r["average"], -r["n_tasks"], r["agent"]))
 
 
 
 
 
1350
 
1351
  base_url = request.url_root.rstrip("/")
1352
 
 
411
 
412
  @app.get("/leaderboard")
413
  def leaderboard_all():
414
+ """Cross-task average per agent. The average is only computed for agents
415
+ that have a score on every task — an incomplete agent shows '—' and ranks
416
+ below all complete ones (ties broken by agent name for stability)."""
417
  manifest = _manifest()
418
  tasks = sorted(manifest)
419
  conn = _db()
 
432
  covered = [t for t in tasks if t in scores]
433
  if not covered:
434
  continue
435
+ complete = len(covered) == len(tasks)
436
+ avg = sum(scores[t] for t in covered) / len(covered) if complete else None
437
  out.append({
438
  "agent": agent,
439
+ "average": round(avg, 3) if avg is not None else None,
440
  "n_tasks": len(covered),
441
  "per_task": {t: scores.get(t) for t in tasks},
442
  })
443
+ # Complete agents first (sorted by average desc), then incomplete ones at
444
+ # the bottom (sorted by # tasks covered desc, then name).
445
+ out.sort(key=lambda r: (
446
+ 0 if r["average"] is not None else 1,
447
+ -(r["average"] if r["average"] is not None else 0),
448
+ -r["n_tasks"],
449
+ r["agent"],
450
+ ))
451
  return jsonify({"tasks": tasks, "rows": out})
452
 
453
 
 
968
  {% if v is not none %}{{ "%.3f"|format(v) }}{% else %}<span class="muted">β€”</span>{% endif %}
969
  </td>
970
  {% endfor %}
971
+ <td class="score">
972
+ {% if r.average is not none %}{{ "%.3f"|format(r.average) }}{% else %}<span class="muted">β€”</span>{% endif %}
973
+ </td>
974
  </tr>
975
  {% endfor %}
976
  {% else %}
 
1342
  n_subs_total += sum(r["n_subs"] for r in tasks[-1]["rows"])
1343
  conn.close()
1344
 
1345
+ # Cross-task average per agent. Average is only computed for agents that
1346
+ # have a score on every task — anyone incomplete shows '—' and ranks
1347
+ # below all complete agents (matches the /leaderboard JSON behavior).
1348
  by_agent: dict[str, dict[str, float]] = {}
1349
  for t in tasks:
1350
  for r in t["rows"]:
1351
  by_agent.setdefault(r["agent"], {})[t["name"]] = r["primary"]
1352
  overall_rows = []
1353
+ n_total = len(tasks)
1354
  for agent, scores in by_agent.items():
1355
+ complete = len(scores) == n_total
1356
+ avg = round(sum(scores.values()) / len(scores), 3) if complete else None
1357
  overall_rows.append({
1358
  "agent": agent,
1359
+ "average": avg,
1360
  "n_tasks": len(scores),
1361
  "per_task": {t["name"]: scores.get(t["name"]) for t in tasks},
1362
  })
1363
+ overall_rows.sort(key=lambda r: (
1364
+ 0 if r["average"] is not None else 1,
1365
+ -(r["average"] if r["average"] is not None else 0),
1366
+ -r["n_tasks"],
1367
+ r["agent"],
1368
+ ))
1369
 
1370
  base_url = request.url_root.rstrip("/")
1371