# OpenRA-Bench / openra_bench/pairwise.py
# Commit b04adfc (yxc20098): Pairwise adversarial eval + Elo (the user's 'pairwise conditions')
# Size: 3.72 kB
"""Pairwise adversarial evaluation + Elo.
The project goal includes evaluating models "on pairwise adversarial
conditions", not only fixed-scenario absolute scores. True co-located
1v1 (both sides model-driven in one map) is the deeper engine feature
(task #3); this module delivers the tractable, immediately-useful form:
**comparative** pairwise ranking — every model plays the *same*
controlled scenarios, and models are ranked by head-to-head composite
on shared cells via Elo. Pure + deterministic (no engine in
`pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it
to the live evaluator.
"""
from __future__ import annotations
import itertools
from pathlib import Path
from typing import Callable
from .run_eval import AgentFactory, evaluate
# Per model: {cell -> mean composite over that cell's seeds}.
ModelCellScores = dict[str, dict[str, float]]
# Elo K-factor: multiplies (actual - expected) match score in each rating update.
ELO_K = 32.0
# Rating assigned to every model before any pairing is processed.
ELO_START = 1000.0
# Per-cell composite differences with magnitude <= this value count as a tie.
_TIE_EPS = 1e-6
def _cell_means(stats: dict) -> dict[str, float]:
"""Mean composite per cell from a run_eval stats dict (public split)."""
by: dict[str, list[float]] = {}
for e in stats.get("episodes", []):
if e.get("split", "public") != "public":
continue
by.setdefault(e["cell"], []).append(e["composite"])
return {c: sum(v) / len(v) for c, v in by.items() if v}
def pairwise_elo(scores: ModelCellScores) -> dict:
    """Compute a deterministic Elo ranking from per-cell composite scores.

    Model names are sorted and every unordered pair is played exactly once,
    in that sorted order, so the result does not depend on the insertion
    order of ``scores``. A pair's match score for the first model is the
    fraction of their shared cells it wins, with a tie (difference within
    ``_TIE_EPS``) worth half a win. Pairs with no shared cell trigger no
    rating update and get no ``matrix`` entry.

    Returns a dict with:
      * ``"elo"``: final rating per model, rounded to one decimal;
      * ``"rank"``: 1-based position per model, by descending unrounded
        rating with the model name as tie-breaker;
      * ``"matrix"``: ``matrix[a][b]`` is A's match score against B,
        rounded to four decimals;
      * ``"shared_cells"``: ``"a|b"`` -> number of cells both models have,
        for every pair (including pairs with zero overlap).
    """
    names = sorted(scores)
    ratings = dict.fromkeys(names, ELO_START)
    head_to_head: dict[str, dict[str, float]] = {name: {} for name in names}
    overlap_sizes: dict[str, int] = {}

    for first, second in itertools.combinations(names, 2):
        first_scores = scores[first]
        second_scores = scores[second]
        common = set(first_scores) & set(second_scores)
        overlap_sizes[f"{first}|{second}"] = len(common)

        cells = sorted(common)
        if not cells:
            continue

        # Tally the first model's points: 1 per win, 0.5 per tie, 0 per loss.
        points = 0.0
        for cell in cells:
            margin = first_scores[cell] - second_scores[cell]
            if abs(margin) <= _TIE_EPS:
                points += 0.5
            elif margin > _TIE_EPS:
                points += 1.0

        score_first = points / len(cells)  # in [0, 1]
        score_second = 1.0 - score_first
        head_to_head[first][second] = round(score_first, 4)
        head_to_head[second][first] = round(score_second, 4)

        # Standard Elo expectation from the ratings as they stand before
        # this pairing is applied.
        gap = (ratings[second] - ratings[first]) / 400.0
        expected_first = 1.0 / (1.0 + 10.0 ** gap)
        expected_second = 1.0 - expected_first
        ratings[first] += ELO_K * (score_first - expected_first)
        ratings[second] += ELO_K * (score_second - expected_second)

    standings = sorted(names, key=lambda name: (-ratings[name], name))
    return {
        "elo": {name: round(ratings[name], 1) for name in names},
        "rank": {name: place for place, name in enumerate(standings, start=1)},
        "matrix": head_to_head,  # matrix[a][b] = A's match score vs B
        "shared_cells": overlap_sizes,
    }
def run_pairwise(
    packs: list[Path],
    levels: list[str],
    seeds: list[int],
    agents: dict[str, AgentFactory],
) -> dict:
    """Evaluate every named agent on identical inputs and rank them pairwise.

    Each agent factory is passed to ``evaluate`` with the same ``packs``,
    ``levels`` and ``seeds``; its stats are reduced to per-cell means and
    all models are then compared with ``pairwise_elo``.

    Returns a dict with:
      * ``"pairwise"``: the ``pairwise_elo`` result;
      * ``"cell_scores"``: model name -> per-cell mean composite;
      * ``"per_model_overall"``: model name -> the ``"overall"`` entry of
        that model's stats (``{}`` when absent).

    Raises:
        ValueError: if fewer than two agents are supplied.
    """
    if len(agents) < 2:
        raise ValueError("pairwise eval needs >= 2 models")

    stats_by_model: dict[str, dict] = {}
    cell_scores: ModelCellScores = {}
    for model_name, make_agent in agents.items():
        model_stats = evaluate(packs, levels, seeds, agent_factory=make_agent)
        stats_by_model[model_name] = model_stats
        cell_scores[model_name] = _cell_means(model_stats)

    ranking = pairwise_elo(cell_scores)
    overall_by_model = {
        model_name: model_stats.get("overall", {})
        for model_name, model_stats in stats_by_model.items()
    }
    return {
        "pairwise": ranking,
        "cell_scores": cell_scores,
        "per_model_overall": overall_by_model,
    }