multi-agent-lab / src /core /leaderboard.py
agharsallah
feat: Add Hall of Fame leaderboard backed by a dedicated results table
c952d77
Raw
History Blame Contribute Delete
15.4 kB
"""Leaderboard — competitive-results aggregations over the dedicated scoreboard table.
This module is the *read model* of the Hall of Fame. It is deliberately **detached from
the event ledger**: it folds a list of :class:`~src.core.leaderboard_store.LeaderboardEntry`
rows — the materialised scoreboard persisted in the ``leaderboard_entries`` table — into
the model / agent / fairness tables the UI renders. It never touches the ``events`` log.
The split, in one line: :mod:`src.core.leaderboard_store` owns *persistence* (one durable
row per decided run, written at finish), and this module owns *aggregation* (cheap folds
over those rows). Because a row is only ever written for a finished, won, competitive run
(see ``build_entry`` and ``FishbowlSession.finalize``), the functions here can trust every
entry they receive is already "ranked" — they re-check ``winner`` only defensively.
Each entry is self-describing: it carries the cast→model bindings *and* the competition
shape (``competition_kind`` / ``teams`` / ``symmetric_seats``), so the per-seat fairness
rollup needs no registry lookup and no event replay.
Schema is additive only; ``schema_version`` is unaffected (this reads a separate table).
"""
from __future__ import annotations
from collections import defaultdict
from datetime import datetime
from typing import Iterable, Sequence
from pydantic import BaseModel, ConfigDict, Field
from src.core.leaderboard_store import LeaderboardEntry
# ── competition shape (built per entry for seat mapping) ─────────────────────────
class CompetitionBlock(BaseModel):
    """The competition shape of one run (built from a :class:`LeaderboardEntry`).

    ``kind`` is ``"versus"`` / ``"judged"`` / ``"none"``; ``teams`` (versus only) maps a
    team label → member agent names; ``symmetric_seats`` (versus only) lists identical
    seats that differ only by model.
    """

    # extra="ignore": keyword arguments that are not declared fields are dropped, not rejected.
    model_config = ConfigDict(extra="ignore")
    # Defaults describe a run with no competition shape at all.
    kind: str = "none"
    teams: dict[str, list[str]] | None = None
    symmetric_seats: list[str] | None = None
def _block(entry: LeaderboardEntry) -> CompetitionBlock:
    """Rebuild the competition shape of *entry* purely from fields stored on the row.

    No registry or ledger lookup is involved: the three shape fields are copied
    straight across into a :class:`CompetitionBlock`.
    """
    shape = {
        "kind": entry.competition_kind,
        "teams": entry.teams,
        "symmetric_seats": entry.symmetric_seats,
    }
    return CompetitionBlock(**shape)
# ── row models ─────────────────────────────────────────────────────────────────
class ModelRow(BaseModel):
    """A model endpoint's record across *all* competitive decided runs.

    One row per distinct endpoint; produced by :func:`model_table`.
    """

    # extra="forbid": an undeclared keyword fails validation instead of being dropped.
    model_config = ConfigDict(extra="forbid")
    # The model endpoint this row aggregates (the only required field).
    model: str
    plays: int = 0
    """Decided competitive runs whose cast contained this endpoint."""
    wins: int = 0
    """Of those, how many this endpoint was credited with winning."""
    win_rate: float = 0.0
    """``wins / plays`` (0.0 when ``plays == 0``)."""
    scenarios: list[str] = Field(default_factory=list)
    """Sorted distinct scenario names this endpoint appeared in."""
class AgentRow(BaseModel):
    """A persona's (cast seat *name*) record within a single scenario.

    One row per cast member name; produced by :func:`agent_table`.
    """

    # extra="forbid": an undeclared keyword fails validation instead of being dropped.
    model_config = ConfigDict(extra="forbid")
    agent: str
    """The cast member name — the seat, not the model."""
    seat_type: str = ""
    """Team label, symmetric-seat name, or ``""`` when the seat maps to neither."""
    plays: int = 0
    """Decided runs of the scenario whose cast contained this seat name."""
    wins: int = 0
    """Of those, how many credited this seat (winning agent or winning-team member)."""
    win_rate: float = 0.0
    """``wins / plays`` (0.0 when ``plays == 0``)."""
    model_endpoints: list[str] = Field(default_factory=list)
    """Sorted distinct model endpoints that filled this seat."""
class SeatRow(BaseModel):
    """Win rate per *seat type* within a scenario (the 6.3 fairness footnote).

    Surfaces structural asymmetry: spy vs herd, debater-a vs debater-b, the judge that
    never wins. ``seat_type`` is a team label (versus-teams) or a symmetric-seat name.
    """

    # extra="forbid": an undeclared keyword fails validation instead of being dropped.
    model_config = ConfigDict(extra="forbid")
    # The seat type this row aggregates (the only required field).
    seat_type: str
    # Runs of the scenario in which this seat type was present in the cast.
    plays: int = 0
    # Runs in which the winner mapped to this seat type.
    wins: int = 0
    # wins / plays, 0.0 when plays == 0 (as computed by fairness_table).
    win_rate: float = 0.0
# ── internal helpers ───────────────────────────────────────────────────────────
def _win_rate(wins: int, plays: int) -> float:
    """Return the fraction of *plays* that were *wins*.

    A zero *plays* yields ``0.0`` rather than raising ``ZeroDivisionError``.
    """
    if not plays:
        return 0.0
    return wins / plays
def _ranked(entries: Iterable[LeaderboardEntry]) -> list[LeaderboardEntry]:
    """Return only the entries that actually name a winner, in their original order.

    This is a defensive gate: falsy rows and rows with an empty / missing ``winner``
    are dropped so that no aggregation ever credits a phantom win.
    """
    kept: list[LeaderboardEntry] = []
    for row in entries:
        if not row or not row.winner:
            continue
        kept.append(row)
    return kept
def _seat_type_for(agent: str, block: CompetitionBlock) -> str:
    """Resolve the seat type of cast member *agent* within *block*.

    Teams are consulted first: the label of the first team listing the agent wins.
    Failing that, an agent named in ``symmetric_seats`` is its own seat type. Anyone
    else (e.g. a judge or narrator) resolves to ``""``.
    """
    for label, members in (block.teams or {}).items():
        # A team with no member list (None / empty) can never contain the agent.
        if members and agent in members:
            return label
    seats = block.symmetric_seats or ()
    return agent if agent in seats else ""
def _winning_seat_types(entry: LeaderboardEntry, block: CompetitionBlock) -> set[str]:
    """Return the seat type(s) credited with the win, for fairness accounting.

    A winner that is itself a team label is credited directly; otherwise the winner
    is treated as an agent name and mapped to its seat type. The result is empty when
    there is no winner or the winner maps to no declared seat.
    """
    winner = entry.winner
    if not winner:
        return set()
    if block.teams and winner in block.teams:
        credited = winner
    else:
        credited = _seat_type_for(winner, block)
    return {credited} if credited else set()
def _winning_agents(entry: LeaderboardEntry, block: CompetitionBlock) -> set[str]:
    """Return the cast member name(s) credited with the win, for per-seat accounting.

    When the win belongs to a team (``winner_kind == "team"``, or the winner matches a
    team label) every listed member of that team is credited; an unknown team yields an
    empty set. In every other case ``winner`` is taken to be a single agent name.
    """
    winner = entry.winner
    if not winner:
        return set()
    teams = block.teams or {}
    is_team_win = entry.winner_kind == "team" or winner in teams
    if not is_team_win:
        return {winner}
    return set(teams.get(winner) or ())
def _credited_models(entry: LeaderboardEntry) -> set[str]:
    """Return the model endpoint(s) credited with this run's win.

    The result is the union (``winning_models`` ∪ ``winning_model``) with empty / missing
    values discarded.
    """
    candidates = [*entry.winning_models, entry.winning_model]
    return {endpoint for endpoint in candidates if endpoint}
# ── public aggregations (fold the scoreboard rows) ───────────────────────────────
def scenario_sessions(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[LeaderboardEntry]:
    """Return the decided sessions of *scenario_name*, newest first.

    The stored rows are filtered to the scenario and returned as-is (no copying or
    reshaping). Ordering is deterministic: ``finished_at`` descending, rows without a
    finish time after all dated rows, and ``run_id`` ascending as the tie-break.
    """

    def newest_first(row: LeaderboardEntry) -> tuple[bool, float, str]:
        finished = row.finished_at
        # The leading flag pushes undated rows behind every dated one.
        return (finished is None, _neg_ts(finished), row.run_id)

    matching = (row for row in _ranked(entries) if row.scenario == scenario_name)
    return sorted(matching, key=newest_first)
def model_table(entries: Sequence[LeaderboardEntry]) -> list[ModelRow]:
    """Build one :class:`ModelRow` per model endpoint across all decided runs.

    An endpoint *plays* a run when it appears anywhere in that run's cast (counted once
    per run, however many seats it fills) and *wins* when it is among the run's credited
    winners. ``scenarios`` holds the sorted distinct scenario names it appeared in.

    Rows are ordered by ``win_rate`` desc, ``wins`` desc, ``plays`` desc, ``model`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)
    appeared_in: dict[str, set[str]] = defaultdict(set)

    for run in _ranked(entries):
        victors = _credited_models(run)
        # A set collapses an endpoint that fills several seats into a single play.
        in_cast = {seat.model_endpoint for seat in run.cast.values() if seat.model_endpoint}
        for model in in_cast:
            played[model] += 1
            appeared_in[model].add(run.scenario)
            if model in victors:
                won[model] += 1

    table: list[ModelRow] = []
    for model, n_plays in played.items():
        n_wins = won[model]
        table.append(
            ModelRow(
                model=model,
                plays=n_plays,
                wins=n_wins,
                win_rate=_win_rate(n_wins, n_plays),
                scenarios=sorted(appeared_in[model]),
            )
        )

    def ranking(row: ModelRow) -> tuple[float, int, int, str]:
        return (-row.win_rate, -row.wins, -row.plays, row.model)

    return sorted(table, key=ranking)
def agent_table(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[AgentRow]:
    """Build per-persona (cast seat *name*) records for *scenario_name*.

    Every cast member name seen in a decided run of the scenario gets one
    :class:`AgentRow`. A seat *plays* each run whose cast lists it and *wins* when it is
    the winning agent or belongs to the winning team. ``seat_type`` is taken from the
    first run in which the name is seen; ``model_endpoints`` is the sorted set of models
    that filled the seat.

    Rows are ordered by ``win_rate`` desc, ``wins`` desc, ``plays`` desc, ``agent`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)
    seat_of: dict[str, str] = {}
    models_by_seat: dict[str, set[str]] = defaultdict(set)

    in_scenario = (run for run in _ranked(entries) if run.scenario == scenario_name)
    for run in in_scenario:
        shape = _block(run)
        victors = _winning_agents(run, shape)
        for seat_name, binding in run.cast.items():
            played[seat_name] += 1
            if seat_name not in seat_of:
                # First sighting wins; later runs never overwrite the recorded seat type.
                seat_of[seat_name] = _seat_type_for(seat_name, shape)
            model = binding.model_endpoint
            if model:
                models_by_seat[seat_name].add(model)
            if seat_name in victors:
                won[seat_name] += 1

    table: list[AgentRow] = []
    for seat_name, n_plays in played.items():
        n_wins = won[seat_name]
        table.append(
            AgentRow(
                agent=seat_name,
                seat_type=seat_of[seat_name],
                plays=n_plays,
                wins=n_wins,
                win_rate=_win_rate(n_wins, n_plays),
                model_endpoints=sorted(models_by_seat[seat_name]),
            )
        )

    def ranking(row: AgentRow) -> tuple[float, int, int, str]:
        return (-row.win_rate, -row.wins, -row.plays, row.agent)

    return sorted(table, key=ranking)
def fairness_table(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[SeatRow]:
    """Build the win rate per *seat type* for *scenario_name* (the 6.3 fairness footnote).

    This rolls the per-persona view up to seat types so structural asymmetry shows:
    spy vs herd, debater-a vs debater-b, a judge that never wins. Seat membership comes
    from each entry's own competition shape (``teams`` → label per member;
    ``symmetric_seats`` → each seat its own type). Each run adds one *play* to every seat
    type present in its cast and one *win* to the winner's seat type, provided that seat
    type is present. Cast members mapping to no seat (``""``) are ignored.

    Rows are ordered by ``win_rate`` desc, ``wins`` desc, ``plays`` desc, ``seat_type`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)

    for run in _ranked(entries):
        if run.scenario != scenario_name:
            continue
        shape = _block(run)

        declared: set[str] = set()
        for member in run.cast:
            seat = _seat_type_for(member, shape)
            if seat:
                declared.add(seat)

        for seat in declared:
            played[seat] += 1
        # Only credit a winning seat type that actually sat in this run's cast.
        for seat in _winning_seat_types(run, shape) & declared:
            won[seat] += 1

    table: list[SeatRow] = []
    for seat, n_plays in played.items():
        n_wins = won[seat]
        table.append(
            SeatRow(
                seat_type=seat,
                plays=n_plays,
                wins=n_wins,
                win_rate=_win_rate(n_wins, n_plays),
            )
        )

    def ranking(row: SeatRow) -> tuple[float, int, int, str]:
        return (-row.win_rate, -row.wins, -row.plays, row.seat_type)

    return sorted(table, key=ranking)
def headline(entries: Sequence[LeaderboardEntry]) -> str | None:
    """Return the killer demo line, or ``None`` when there isn't enough data.

    Only runs that declare ``symmetric_seats`` are considered (the "which model argues
    better" comparison). Within each such scenario the models are ranked by wins, and the
    top two form a candidate line. The scenario whose top pair has the most combined wins
    is chosen; ties fall to the larger winner tally, then to the alphabetically first
    scenario name.

    Renders e.g. ``"MiniCPM-8B beats Gemma-12B · 7-3 at Debate Duel"``.

    Returns:
        The headline string, or ``None`` when no symmetric scenario has at least two
        distinct models that have each won at least once (so the line is never a hollow
        "0-0").
    """
    by_scenario: dict[str, list[LeaderboardEntry]] = defaultdict(list)
    for entry in _ranked(entries):
        if entry.symmetric_seats:  # the model-vs-model comparison only
            by_scenario[entry.scenario].append(entry)

    best_line: str | None = None
    best_key: tuple[int, int] = (-1, -1)
    # Ascending scan plus a strict ">" below: a full tie keeps the alphabetically
    # earlier scenario.
    for scenario in sorted(by_scenario):
        wins: dict[str, int] = defaultdict(int)
        plays: dict[str, int] = defaultdict(int)
        for entry in by_scenario[scenario]:
            credited = _credited_models(entry)
            # Set comprehension: one play per endpoint per run, even across two seats.
            for endpoint in {b.model_endpoint for b in entry.cast.values() if b.model_endpoint}:
                plays[endpoint] += 1
                if endpoint in credited:
                    wins[endpoint] += 1

        # Only models with at least one win can appear in the line.
        winners = sorted(
            (m for m in plays if wins[m] > 0),
            key=lambda m: (-wins[m], -plays[m], m),
        )
        if len(winners) < 2:
            continue

        top, runner = winners[0], winners[1]
        candidate_key = (wins[top] + wins[runner], wins[top])
        if candidate_key > best_key:
            best_key = candidate_key
            # The separator is a real middle dot (U+00B7), not its mis-decoded bytes.
            best_line = f"{_short(top)} beats {_short(runner)} · {wins[top]}-{wins[runner]} at {scenario}"
    return best_line
# ── tiny formatting helpers ─────────────────────────────────────────────────────
def _neg_ts(value: datetime | None) -> float:
    """Return the negated POSIX timestamp of *value*, for descending sorts.

    ``None`` maps to ``0.0``; callers are expected to order undated rows separately.
    """
    if value is None:
        return 0.0
    return -value.timestamp()
def _short(endpoint: str) -> str:
    """Compact a model endpoint for the headline (``"openai/openbmb/X"`` → ``"X"``).

    Everything up to and including the last ``/`` is dropped. An empty or otherwise
    falsy *endpoint* is handed back untouched.
    """
    if not endpoint:
        return endpoint
    _, _, tail = endpoint.rpartition("/")
    return tail
# Public API of this module: the pydantic models and the aggregation entry points.
# The underscore-prefixed helpers are deliberately left out.
__all__ = [
    "AgentRow",
    "CompetitionBlock",
    "ModelRow",
    "SeatRow",
    "agent_table",
    "fairness_table",
    "headline",
    "model_table",
    "scenario_sessions",
]