Spaces:
Running on Zero
Running on Zero
"""Leaderboard — competitive-results aggregations over the dedicated scoreboard table.

This module is the *read model* of the Hall of Fame. It is deliberately **detached from
the event ledger**: it folds a list of :class:`~src.core.leaderboard_store.LeaderboardEntry`
rows — the materialised scoreboard persisted in the ``leaderboard_entries`` table — into
the model / agent / fairness tables the UI renders. It never touches the ``events`` log.

The split, in one line: :mod:`src.core.leaderboard_store` owns *persistence* (one durable
row per decided run, written at finish), and this module owns *aggregation* (cheap folds
over those rows). Because a row is only ever written for a finished, won, competitive run
(see ``build_entry`` and ``FishbowlSession.finalize``), the functions here can trust every
entry they receive is already "ranked" — they re-check ``winner`` only defensively.

Each entry is self-describing: it carries the cast→model bindings *and* the competition
shape (``competition_kind`` / ``teams`` / ``symmetric_seats``), so the per-seat fairness
rollup needs no registry lookup and no event replay.

Schema is additive only; ``schema_version`` is unaffected (this reads a separate table).
"""
| from __future__ import annotations | |
| from collections import defaultdict | |
| from datetime import datetime | |
| from typing import Iterable, Sequence | |
| from pydantic import BaseModel, ConfigDict, Field | |
| from src.core.leaderboard_store import LeaderboardEntry | |
# ── competition shape (built per entry for seat mapping) ─────────────────────────
class CompetitionBlock(BaseModel):
    """Competition shape of a single run, rebuilt from a :class:`LeaderboardEntry`.

    * ``kind`` is one of ``"versus"``, ``"judged"`` or ``"none"``.
    * ``teams`` (versus only) maps each team label to the names of its member agents.
    * ``symmetric_seats`` (versus only) lists the seats that are identical apart from
      the model filling them.

    Unknown fields are ignored on construction (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    kind: str = "none"
    teams: dict[str, list[str]] | None = None
    symmetric_seats: list[str] | None = None
def _block(entry: LeaderboardEntry) -> CompetitionBlock:
    """Rebuild the competition shape stored on *entry*.

    Every field is read straight off the row, so no registry or ledger lookup is needed.
    """
    shape = {
        "kind": entry.competition_kind,
        "teams": entry.teams,
        "symmetric_seats": entry.symmetric_seats,
    }
    return CompetitionBlock(**shape)
# ── row models ───────────────────────────────────────────────────────────────────
class ModelRow(BaseModel):
    """Aggregate record of one model endpoint over *all* decided competitive runs."""

    model_config = ConfigDict(extra="forbid")

    model: str
    plays: int = 0
    """Number of decided competitive runs that had this endpoint in the cast."""
    wins: int = 0
    """How many of those plays credited this endpoint with the win."""
    win_rate: float = 0.0
    """``wins / plays``; ``0.0`` when ``plays == 0``."""
    scenarios: list[str] = Field(default_factory=list)
    """Distinct scenario names the endpoint appeared in, sorted."""
class AgentRow(BaseModel):
    """A persona's (cast seat *name*) record within a single scenario."""

    # ``protected_namespaces=()``: the ``model_endpoints`` field starts with ``model_``,
    # which pydantic releases before 2.10 treat as a protected namespace and warn about
    # at class-definition time. Opting out silences that warning and changes nothing
    # else about validation or serialisation.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    agent: str
    """The cast member name — the seat, not the model."""
    seat_type: str = ""
    """Team label, symmetric-seat name, or ``""`` when the seat maps to neither."""
    plays: int = 0
    wins: int = 0
    win_rate: float = 0.0
    model_endpoints: list[str] = Field(default_factory=list)
    """Sorted distinct model endpoints that filled this seat."""
class SeatRow(BaseModel):
    """Per-*seat type* win rate inside one scenario (the 6.3 fairness footnote).

    Makes structural asymmetry visible: spy vs herd, debater-a vs debater-b, the judge
    that never wins. ``seat_type`` holds either a team label (versus-teams) or the name
    of a symmetric seat.
    """

    model_config = ConfigDict(extra="forbid")

    seat_type: str
    plays: int = 0
    wins: int = 0
    win_rate: float = 0.0
# ── internal helpers ─────────────────────────────────────────────────────────────
def _win_rate(wins: int, plays: int) -> float:
    """Return ``wins / plays``, substituting ``0.0`` when *plays* is zero."""
    if not plays:
        # Nothing played yet: report a flat zero instead of dividing by zero.
        return 0.0
    return wins / plays
def _ranked(entries: Iterable[LeaderboardEntry]) -> list[LeaderboardEntry]:
    """Keep only the entries that actually name a winner (defensive gate).

    The store is only supposed to persist finished, won, competitive runs, so this is a
    second line of defence: a malformed or empty row is dropped here rather than being
    credited with a phantom win downstream.
    """
    kept: list[LeaderboardEntry] = []
    for row in entries:
        if not row or not row.winner:
            continue
        kept.append(row)
    return kept
def _seat_type_for(agent: str, block: CompetitionBlock) -> str:
    """Resolve the seat type of cast member *agent* inside *block*.

    Team membership is checked first: an agent listed under a team gets that team's
    label. Otherwise, an agent named in ``symmetric_seats`` is its own seat type (the
    agent name is the seat name). Anything else (e.g. a judge or a narrator) resolves
    to ``""``.
    """
    teams = block.teams or {}
    for label, members in teams.items():
        # A team stored with a null/empty roster simply matches nobody.
        if members and agent in members:
            return label

    seats = block.symmetric_seats or []
    return agent if agent in seats else ""
def _winning_seat_types(entry: LeaderboardEntry, block: CompetitionBlock) -> set[str]:
    """Return the seat type(s) that get the win in the fairness accounting.

    A winner that is itself a team label counts as that seat type; any other winner is
    resolved through :func:`_seat_type_for`. The result is empty when the entry has no
    winner or the winner maps to no declared seat.
    """
    champion = entry.winner
    if not champion:
        return set()

    teams = block.teams
    if teams and champion in teams:
        seat = champion
    else:
        seat = _seat_type_for(champion, block)

    if seat:
        return {seat}
    return set()
def _winning_agents(entry: LeaderboardEntry, block: CompetitionBlock) -> set[str]:
    """Return the cast member name(s) credited with the win (per-seat accounting).

    The win is a *team* win when ``winner_kind`` is ``"team"`` or when the winner is one
    of the block's team labels; every member of that team is then credited (nobody, if
    the team is unknown or has no roster). Any other winner is taken as a bare agent
    name and credited alone. An entry without a winner credits no one.
    """
    name = entry.winner
    if not name:
        return set()

    teams = block.teams or {}
    team_win = entry.winner_kind == "team" or name in teams
    if not team_win:
        return {name}
    return set(teams.get(name) or ())
def _credited_models(entry: LeaderboardEntry) -> set[str]:
    """Return the model endpoint(s) credited with this run's win.

    The result is the union of ``entry.winning_models`` and the single
    ``entry.winning_model``, with empty/``None`` values discarded. A missing
    (``None``) ``winning_models`` is treated as empty, matching the defensive
    ``or []`` handling the other helpers in this module apply to optional fields.
    """
    credited = {model for model in (entry.winning_models or ()) if model}
    if entry.winning_model:
        credited.add(entry.winning_model)
    return credited
# ── public aggregations (fold the scoreboard rows) ───────────────────────────────
def scenario_sessions(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[LeaderboardEntry]:
    """Return the decided sessions of *scenario_name*, newest first.

    This is a plain filter plus a deterministic sort over the stored rows. Rows are
    ordered by ``finished_at`` descending; rows without a finish time go last, and
    ``run_id`` breaks ties. The :class:`LeaderboardEntry` rows themselves are returned,
    since they already carry the winner, the cast-to-model bindings and the cost that
    the sessions table renders.
    """

    def _newest_first(row: LeaderboardEntry) -> tuple:
        # (has no finish time, negated timestamp, run id) -> ascending sort = newest first.
        return (row.finished_at is None, _neg_ts(row.finished_at), row.run_id)

    matching = (row for row in _ranked(entries) if row.scenario == scenario_name)
    return sorted(matching, key=_newest_first)
def model_table(entries: Sequence[LeaderboardEntry]) -> list[ModelRow]:
    """Build one :class:`ModelRow` per model endpoint over *all* decided competitive runs.

    An endpoint *plays* a run when it appears anywhere in that run's cast, counted once
    per run even if it fills several seats. It *wins* the run when it is among the
    credited winners (``winning_models`` / ``winning_model``). ``scenarios`` is the
    sorted list of distinct scenario names the endpoint appeared in.

    Rows are ordered by ``win_rate`` desc, then ``wins`` desc, then ``plays`` desc, then
    ``model`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)
    seen_in: dict[str, set[str]] = defaultdict(set)

    for run in _ranked(entries):
        winners = _credited_models(run)
        # Collapsing to a set gives "one play per endpoint per run" for free.
        endpoints = {b.model_endpoint for b in run.cast.values() if b.model_endpoint}
        for endpoint in endpoints:
            played[endpoint] += 1
            seen_in[endpoint].add(run.scenario)
            if endpoint in winners:
                won[endpoint] += 1

    table: list[ModelRow] = []
    for endpoint, n_plays in played.items():
        n_wins = won[endpoint]
        table.append(
            ModelRow(
                model=endpoint,
                plays=n_plays,
                wins=n_wins,
                win_rate=_win_rate(n_wins, n_plays),
                scenarios=sorted(seen_in[endpoint]),
            )
        )
    # ``model`` is unique per row, so this key yields a total, deterministic order.
    return sorted(table, key=lambda row: (-row.win_rate, -row.wins, -row.plays, row.model))
def agent_table(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[AgentRow]:
    """Build the per-persona (cast seat *name*) win table for *scenario_name*.

    Every cast member name seen in a decided run of the scenario gets one
    :class:`AgentRow`. A seat *plays* each run whose cast contains its name, and *wins*
    when it is the run's winning agent or belongs to the winning team. ``seat_type`` is
    the team label / symmetric-seat name resolved from the first run the seat appears
    in (``""`` when it maps to neither), and ``model_endpoints`` lists the distinct
    models that filled the seat. Ordering mirrors :func:`model_table`: ``win_rate``
    desc, ``wins`` desc, ``plays`` desc, then ``agent`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)
    seat_of: dict[str, str] = {}
    filled_by: dict[str, set[str]] = defaultdict(set)

    runs = (run for run in _ranked(entries) if run.scenario == scenario_name)
    for run in runs:
        shape = _block(run)
        victors = _winning_agents(run, shape)
        for name, binding in run.cast.items():
            played[name] += 1
            if name not in seat_of:
                # First sighting decides the seat type; later runs never overwrite it.
                seat_of[name] = _seat_type_for(name, shape)
            endpoint = binding.model_endpoint
            if endpoint:
                filled_by[name].add(endpoint)
            if name in victors:
                won[name] += 1

    table = [
        AgentRow(
            agent=name,
            seat_type=seat_of.get(name, ""),
            plays=n_plays,
            wins=won[name],
            win_rate=_win_rate(won[name], n_plays),
            model_endpoints=sorted(filled_by[name]),
        )
        for name, n_plays in played.items()
    ]
    table.sort(key=lambda row: (-row.win_rate, -row.wins, -row.plays, row.agent))
    return table
def fairness_table(entries: Sequence[LeaderboardEntry], scenario_name: str) -> list[SeatRow]:
    """Build the per-*seat type* win table for *scenario_name* (the 6.3 fairness footnote).

    The per-persona view is rolled up to seat types so structural asymmetry shows: spy
    vs herd, debater-a vs debater-b, a judge that never wins. Seat membership is read
    from each entry's stored competition shape: ``teams`` gives one label per member,
    and each name in ``symmetric_seats`` is its own type. Every run adds one *play* to
    each seat type present in its cast and one *win* to the seat type its winner maps
    to, provided that seat type is present in the cast. Cast members that map to no
    seat (``seat_type == ""``) are left out, so only declared seats appear.

    Ordering: ``win_rate`` desc, ``wins`` desc, ``plays`` desc, ``seat_type`` asc.
    """
    played: dict[str, int] = defaultdict(int)
    won: dict[str, int] = defaultdict(int)

    for run in _ranked(entries):
        if run.scenario != scenario_name:
            continue
        shape = _block(run)

        present: set[str] = set()
        for member in run.cast:
            seat = _seat_type_for(member, shape)
            if seat:
                present.add(seat)

        for seat in present:
            played[seat] += 1
        # Only credit a winning seat type that actually sat in this run's cast.
        for seat in _winning_seat_types(run, shape) & present:
            won[seat] += 1

    table = [
        SeatRow(
            seat_type=seat,
            plays=n_plays,
            wins=won[seat],
            win_rate=_win_rate(won[seat], n_plays),
        )
        for seat, n_plays in played.items()
    ]
    table.sort(key=lambda row: (-row.win_rate, -row.wins, -row.plays, row.seat_type))
    return table
def headline(entries: Sequence[LeaderboardEntry]) -> str | None:
    """The killer demo line, or ``None`` when there isn't enough data.

    Only runs that declare ``symmetric_seats`` are considered (the "which model argues
    better" comparison), grouped by scenario. Within a scenario, models are ranked by
    credited wins (then plays, then name) and the top two are compared. A scenario
    qualifies only when at least two distinct models have each won at least once, so
    the line is never a hollow "0-0".

    Among qualifying scenarios, the one whose top pair has the most combined wins is
    chosen. Ties go to the higher leader win count, then to the alphabetically first
    scenario name.

    Renders e.g. ``"MiniCPM-8B beats Gemma-12B · 7-3 at Debate Duel"``.

    Returns:
        The rendered line, or ``None`` when no scenario qualifies.
    """
    # Written as an escape so the separator (U+00B7 MIDDLE DOT) survives any
    # re-encoding of this source file instead of degrading into mojibake.
    separator = "\u00b7"

    by_scenario: dict[str, list[LeaderboardEntry]] = defaultdict(list)
    for entry in _ranked(entries):
        if entry.symmetric_seats:  # the model-vs-model comparison only
            by_scenario[entry.scenario].append(entry)

    best_line: str | None = None
    best_key: tuple[int, int] = (-1, -1)
    # Ascending scan + strict ">" below: a full tie keeps the earlier (alphabetical) scenario.
    for scenario in sorted(by_scenario):
        wins: dict[str, int] = defaultdict(int)
        plays: dict[str, int] = defaultdict(int)
        for entry in by_scenario[scenario]:
            credited = _credited_models(entry)
            # Set comprehension: one play per endpoint per run, even across several seats.
            for endpoint in {b.model_endpoint for b in entry.cast.values() if b.model_endpoint}:
                plays[endpoint] += 1
                if endpoint in credited:
                    wins[endpoint] += 1

        winners = sorted(
            (m for m in plays if wins[m] > 0),
            key=lambda m: (-wins[m], -plays[m], m),
        )
        if len(winners) < 2:
            continue  # need two models that have each won at least once

        top, runner = winners[:2]
        candidate_key = (wins[top] + wins[runner], wins[top])
        if candidate_key > best_key:
            best_key = candidate_key
            # NOTE(review): when the two win counts are equal this still reads "beats";
            # confirm whether a tied pair should be worded differently or skipped.
            best_line = (
                f"{_short(top)} beats {_short(runner)} {separator} "
                f"{wins[top]}-{wins[runner]} at {scenario}"
            )
    return best_line
# ── tiny formatting helpers ──────────────────────────────────────────────────────
def _neg_ts(value: datetime | None) -> float:
    """Return the negated POSIX timestamp of *value*, for descending sorts.

    ``None`` yields ``0.0``; callers that need missing values sorted last pair this with
    a separate "is missing" sort key.
    """
    if value is None:
        return 0.0
    return -value.timestamp()
def _short(endpoint: str) -> str:
    """Compact a model endpoint for the headline (``"openai/openbmb/X"`` -> ``"X"``).

    Returns the last ``/``-separated segment. Trailing slashes are ignored, so
    ``"vendor/model/"`` yields ``"model"`` instead of an empty label. An empty
    endpoint is returned unchanged, and an endpoint made only of slashes falls back
    to itself.
    """
    if not endpoint:
        return endpoint
    tail = endpoint.rstrip("/").rsplit("/", 1)[-1]
    return tail or endpoint
# Public API of this module: the row models first, then the aggregation functions.
__all__ = [
    "AgentRow", "CompetitionBlock", "ModelRow", "SeatRow",
    "agent_table", "fairness_table", "headline", "model_table", "scenario_sessions",
]