Spaces:
Running
Running
| """Pairwise adversarial evaluation + Elo. | |
| The project goal includes evaluating models "on pairwise adversarial | |
| conditions", not only fixed-scenario absolute scores. True co-located | |
| 1v1 (both sides model-driven in one map) is the deeper engine feature | |
| (task #3); this module delivers the tractable, immediately-useful form: | |
| **comparative** pairwise ranking — every model plays the *same* | |
| controlled scenarios, and models are ranked by head-to-head composite | |
| on shared cells via Elo. Pure + deterministic (no engine in | |
| `pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it | |
| to the live evaluator. | |
| """ | |
| from __future__ import annotations | |
| import itertools | |
| from pathlib import Path | |
| from typing import Callable | |
| from .run_eval import AgentFactory, evaluate | |
# Per model: {cell -> mean composite over that cell's seeds}.
ModelCellScores = dict[str, dict[str, float]]

# Elo K-factor: scales the (actual - expected) match score into a rating
# change in `pairwise_elo`.
ELO_K = 32.0

# Rating every model holds before its first match.
ELO_START = 1000.0

# Two composites whose difference is within this tolerance count as a tie.
_TIE_EPS = 1e-6
def _cell_means(stats: dict) -> dict[str, float]:
    """Average the composite score of each cell over its public episodes.

    Reads ``stats["episodes"]`` (treated as empty when the key is absent)
    and ignores every episode whose ``"split"`` is something other than
    ``"public"``; an episode without a ``"split"`` key counts as public.

    Returns ``{cell: mean composite}`` with cells in first-seen order.
    """
    grouped: dict[str, list[float]] = {}
    for episode in stats.get("episodes", []):
        if episode.get("split", "public") == "public":
            cell = episode["cell"]
            if cell not in grouped:
                grouped[cell] = []
            grouped[cell].append(episode["composite"])

    # A list only exists once something was appended to it, so no cell
    # can reach the division with zero samples.
    means: dict[str, float] = {}
    for cell, composites in grouped.items():
        means[cell] = sum(composites) / len(composites)
    return means
def pairwise_elo(scores: ModelCellScores) -> dict:
    """Compute a deterministic Elo ranking from per-cell composites.

    Every unordered pair of models plays exactly one match. A's match
    score is the mean, over the cells both models have, of a per-cell
    outcome: 1.0 when A's composite exceeds B's by more than
    ``_TIE_EPS``, 0.0 when B's exceeds A's by more than ``_TIE_EPS``,
    and 0.5 otherwise. "Otherwise" covers genuine ties and differences
    that cannot be ordered at all (NaN), so an undefined comparison is
    never handed to one side merely because of its name. Pairs without a
    shared cell play no match and leave both ratings unchanged.

    Models are visited in sorted-name order, so the result does not
    depend on the insertion order of ``scores``. Ratings are updated
    sequentially, one pair after another, so each expected score is
    computed from the ratings left by the earlier pairs.

    Args:
        scores: ``{model: {cell: mean composite}}``.

    Returns:
        A dict with four entries:
        ``"elo"``: final rating per model, rounded to 1 decimal.
        ``"rank"``: 1-based rank per model; higher (unrounded) rating
        first, model name breaks ties.
        ``"matrix"``: ``matrix[a][b]`` is A's match score against B,
        rounded to 4 decimals; absent for pairs sharing no cell.
        ``"shared_cells"``: ``"a|b"`` -> number of shared cells, for
        every pair including those sharing none.
    """
    models = sorted(scores)
    elo = {m: ELO_START for m in models}
    matrix: dict[str, dict[str, float]] = {m: {} for m in models}
    shared_cells: dict[str, int] = {}

    for a, b in itertools.combinations(models, 2):
        cells_a, cells_b = scores[a], scores[b]
        # Computed once per pair and reused for the report. No sort is
        # needed: the per-cell outcomes (0, 0.5, 1) add up exactly in
        # floating point, whatever the order.
        shared = cells_a.keys() & cells_b.keys()
        shared_cells[f"{a}|{b}"] = len(shared)
        if not shared:
            continue

        wins = 0.0
        for cell in shared:
            diff = cells_a[cell] - cells_b[cell]
            if diff > _TIE_EPS:
                wins += 1.0
            elif diff < -_TIE_EPS:
                continue  # clear loss for A
            else:
                # Within tolerance, or NaN (both comparisons above are
                # False for NaN): score the cell as a tie.
                wins += 0.5

        sa = wins / len(shared)  # A's match score in [0, 1]
        matrix[a][b] = round(sa, 4)
        matrix[b][a] = round(1.0 - sa, 4)

        # Standard Elo expected score for A given the current ratings.
        ea = 1.0 / (1.0 + 10.0 ** ((elo[b] - elo[a]) / 400.0))
        elo[a] += ELO_K * (sa - ea)
        # B's actual and expected scores are the complements of A's.
        elo[b] += ELO_K * ((1.0 - sa) - (1.0 - ea))

    ranked = sorted(models, key=lambda m: (-elo[m], m))
    return {
        "elo": {m: round(elo[m], 1) for m in models},
        "rank": {m: i + 1 for i, m in enumerate(ranked)},
        "matrix": matrix,  # matrix[a][b] = A's match score vs B
        "shared_cells": shared_cells,
    }
def run_pairwise(
    packs: list[Path],
    levels: list[str],
    seeds: list[int],
    agents: dict[str, AgentFactory],
) -> dict:
    """Evaluate every named model on identical inputs and rank them.

    Each entry of ``agents`` is passed to ``evaluate`` as
    ``agent_factory`` together with the same ``packs``, ``levels`` and
    ``seeds``; the resulting stats are reduced to per-cell means and fed
    to ``pairwise_elo``.

    Returns:
        ``{"pairwise": <pairwise_elo result>,
        "cell_scores": {model: {cell: mean composite}},
        "per_model_overall": {model: stats["overall"] or {}}}``.

    Raises:
        ValueError: if fewer than two models are supplied.
    """
    if len(agents) < 2:
        raise ValueError("pairwise eval needs >= 2 models")

    cell_scores: ModelCellScores = {}
    overall_by_model: dict[str, dict] = {}
    for model_name in agents:
        model_stats = evaluate(
            packs, levels, seeds, agent_factory=agents[model_name]
        )
        cell_scores[model_name] = _cell_means(model_stats)
        overall_by_model[model_name] = model_stats.get("overall", {})

    ranking = pairwise_elo(cell_scores)
    return {
        "pairwise": ranking,
        "cell_scores": cell_scores,
        "per_model_overall": overall_by_model,
    }