Spaces:
Running
Running
Pairwise adversarial eval + Elo (the user's 'pairwise conditions')
Browse files
openra_bench/pairwise.py: comparative pairwise ranking — every model
plays the same controlled scenarios and is ranked head-to-head on shared
cells by composite via deterministic, order-independent Elo (one
update per model pair, ties=0.5, pairs with no shared cells skipped). Pure
pairwise_elo() (engine-free, unit-tested) + run_pairwise() wiring it
to the live evaluator. Delivers the 'evaluate on pairwise adversarial
conditions' goal without an engine rewrite (true co-located 1v1 is the
deeper task #3). Tests (5): dominance ranking, determinism/order-
independence, tie handling, >=2-model guard, engine-backed
competent>=idle. Full bench suite green.
- openra_bench/pairwise.py +104 -0
- tests/test_pairwise.py +69 -0
openra_bench/pairwise.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pairwise adversarial evaluation + Elo.
|
| 2 |
+
|
| 3 |
+
The project goal includes evaluating models "on pairwise adversarial
|
| 4 |
+
conditions", not only fixed-scenario absolute scores. True co-located
|
| 5 |
+
1v1 (both sides model-driven in one map) is the deeper engine feature
|
| 6 |
+
(task #3); this module delivers the tractable, immediately-useful form:
|
| 7 |
+
**comparative** pairwise ranking — every model plays the *same*
|
| 8 |
+
controlled scenarios, and models are ranked by head-to-head composite
|
| 9 |
+
on shared cells via Elo. Pure + deterministic (no engine in
|
| 10 |
+
`pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it
|
| 11 |
+
to the live evaluator.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import itertools
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Callable
|
| 19 |
+
|
| 20 |
+
from .run_eval import AgentFactory, evaluate
|
| 21 |
+
|
| 22 |
+
# Per model: {cell -> mean composite over that cell's seeds}.
ModelCellScores = dict[str, dict[str, float]]

# Elo K-factor: scales how far a single pair result moves a rating.
ELO_K = 32.0
# Rating every model starts from before any pair is processed.
ELO_START = 1000.0
# Composite differences within this tolerance are scored as a tie.
_TIE_EPS = 1e-6
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _cell_means(stats: dict) -> dict[str, float]:
|
| 31 |
+
"""Mean composite per cell from a run_eval stats dict (public split)."""
|
| 32 |
+
by: dict[str, list[float]] = {}
|
| 33 |
+
for e in stats.get("episodes", []):
|
| 34 |
+
if e.get("split", "public") != "public":
|
| 35 |
+
continue
|
| 36 |
+
by.setdefault(e["cell"], []).append(e["composite"])
|
| 37 |
+
return {c: sum(v) / len(v) for c, v in by.items() if v}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def pairwise_elo(
    scores: ModelCellScores,
    *,
    k: float | None = None,
    start: float | None = None,
    tie_eps: float | None = None,
) -> dict:
    """Deterministic Elo from comparative per-cell composites.

    For each unordered model pair, the match score is the fraction of
    shared cells where A beats B by more than ``tie_eps`` (ties = 0.5).
    One Elo update is applied per pair, and pairs are processed in
    sorted-name order, so the result does not depend on the insertion
    order of ``scores``. Pairs with no shared cells are skipped (no
    matrix entry, no rating change) but still reported in
    ``shared_cells`` with a count of 0.

    Args:
        scores: ``{model -> {cell -> mean composite}}``.
        k: Elo K-factor; ``None`` uses the module default ``ELO_K``.
        start: initial rating; ``None`` uses ``ELO_START``.
        tie_eps: tie tolerance on the composite difference; ``None``
            uses ``_TIE_EPS``.

    Returns:
        A dict with:
        ``"elo"``          - ``{model: rating}`` rounded to 1 decimal;
        ``"rank"``         - ``{model: 1-based rank}``, by rating
                             descending, then name;
        ``"matrix"``       - ``matrix[a][b]`` = A's match score vs B,
                             rounded to 4 decimals;
        ``"shared_cells"`` - ``{"a|b": number of shared cells}``.
    """
    k = ELO_K if k is None else k
    start = ELO_START if start is None else start
    tie_eps = _TIE_EPS if tie_eps is None else tie_eps

    models = sorted(scores)
    elo = {m: start for m in models}
    matrix: dict[str, dict[str, float]] = {m: {} for m in models}
    shared_cells: dict[str, int] = {}

    for a, b in itertools.combinations(models, 2):
        shared = sorted(set(scores[a]) & set(scores[b]))
        shared_cells[f"{a}|{b}"] = len(shared)
        if not shared:
            continue

        wins = 0.0
        for cell in shared:
            diff = scores[a][cell] - scores[b][cell]
            if diff > tie_eps:
                wins += 1.0
            elif not diff < -tie_eps:
                # Within tolerance, or NaN (all comparisons false): score
                # as a tie so an invalid composite cannot favor one side.
                wins += 0.5

        sa = wins / len(shared)  # A's match score in [0,1]
        matrix[a][b] = round(sa, 4)
        matrix[b][a] = round(1.0 - sa, 4)

        # Standard Elo expectation for A given the current ratings.
        ea = 1.0 / (1.0 + 10.0 ** ((elo[b] - elo[a]) / 400.0))
        elo[a] += k * (sa - ea)
        elo[b] += k * ((1.0 - sa) - (1.0 - ea))

    ranked = sorted(models, key=lambda m: (-elo[m], m))
    return {
        "elo": {m: round(elo[m], 1) for m in models},
        "rank": {m: i + 1 for i, m in enumerate(ranked)},
        "matrix": matrix,  # matrix[a][b] = A's match score vs B
        "shared_cells": shared_cells,
    }
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def run_pairwise(
    packs: list[Path],
    levels: list[str],
    seeds: list[int],
    agents: dict[str, AgentFactory],
) -> dict:
    """Run each named model over the same packs/levels/seeds and return
    the pairwise Elo ranking + per-model cell scores.

    Args:
        packs: scenario pack paths, passed through to ``evaluate``.
        levels: difficulty levels, passed through to ``evaluate``.
        seeds: seeds, passed through to ``evaluate``.
        agents: ``{model name -> agent factory}``; at least two entries.

    Returns:
        A dict with:
        ``"pairwise"``          - the ``pairwise_elo`` result;
        ``"cell_scores"``       - ``{model: {cell: mean composite}}``;
        ``"per_model_overall"`` - ``{model: stats.get("overall", {})}``
                                  from each model's ``evaluate`` stats.

    Raises:
        ValueError: if fewer than two models are supplied.
    """
    if len(agents) < 2:
        raise ValueError("pairwise eval needs >= 2 models")

    scores: ModelCellScores = {}
    overall: dict[str, dict] = {}
    for name, factory in agents.items():
        # Every model sees the identical packs/levels/seeds so that the
        # per-cell composites are directly comparable.
        stats = evaluate(packs, levels, seeds, agent_factory=factory)
        scores[name] = _cell_means(stats)
        overall[name] = stats.get("overall", {})

    return {
        "pairwise": pairwise_elo(scores),
        "cell_scores": scores,
        "per_model_overall": overall,
    }
|
tests/test_pairwise.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pairwise adversarial Elo: pure determinism/correctness + one
|
| 2 |
+
engine-backed comparative run (the user's 'pairwise conditions')."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from openra_bench.pairwise import pairwise_elo, run_pairwise
|
| 11 |
+
|
| 12 |
+
# Scenario-pack directory inside the openra_bench package, resolved relative
# to this test file so the suite does not depend on the working directory.
PACKS = Path(__file__).parent.parent / "openra_bench" / "scenarios" / "packs"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_pairwise_elo_ranks_dominant_model_first():
    """A model that wins every shared cell ranks first with score 1.0."""
    scores = {
        "strong": {"c1:easy": 0.9, "c2:easy": 0.8, "c3:easy": 0.7},
        "weak": {"c1:easy": 0.2, "c2:easy": 0.3, "c3:easy": 0.1},
    }
    r = pairwise_elo(scores)
    assert r["rank"]["strong"] == 1 and r["rank"]["weak"] == 2
    assert r["elo"]["strong"] > r["elo"]["weak"]
    # strong beat weak on all 3 shared cells → match score 1.0
    assert r["matrix"]["strong"]["weak"] == 1.0
    assert r["matrix"]["weak"]["strong"] == 0.0
    assert r["shared_cells"]["strong|weak"] == 3
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_pairwise_elo_is_deterministic_and_order_independent():
    """Input dict insertion order must not affect the result."""
    s1 = {"a": {"x": 0.5}, "b": {"x": 0.6}, "c": {"x": 0.4}}
    s2 = {"c": {"x": 0.4}, "b": {"x": 0.6}, "a": {"x": 0.5}}  # reordered
    assert pairwise_elo(s1) == pairwise_elo(s2)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_ties_give_half_and_no_shared_cells_skipped():
    """Equal composites score 0.5; a pair with no shared cells is skipped."""
    r = pairwise_elo({"a": {"x": 0.5}, "b": {"x": 0.5}, "lonely": {"y": 0.9}})
    assert r["matrix"]["a"]["b"] == 0.5
    assert r["shared_cells"]["a|lonely"] == 0
    assert r["elo"]["a"] == r["elo"]["b"] == 1000.0  # tie → no rating change
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_needs_two_models():
    """run_pairwise rejects fewer than two models before evaluating."""
    with pytest.raises(ValueError):
        run_pairwise([], [], [], {"only": lambda c: None})
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_run_pairwise_competent_beats_idle_on_engine():
    """Engine-backed check: an exploring agent must not rank below an idle one."""
    # Skip (rather than error) when the engine module is unavailable.
    # importorskip avoids relying on `importlib.util` already being loaded
    # as an attribute of `importlib`, which a bare `__import__("importlib")`
    # does not guarantee.
    pytest.importorskip("openra_train", reason="Rust env wheel not installed")

    from openra_bench.eval_core import scripted_explore_agent

    agents = {
        "explorer": lambda c: scripted_explore_agent,
        "idle": lambda c: (lambda rs, Cmd: [Cmd.observe()]),
    }
    out = run_pairwise(
        [PACKS / "perception-frontier-reading.yaml"],
        ["easy"],
        [1, 2],
        agents,
    )
    pw = out["pairwise"]
    # The exploring agent should not rank below the idle one on a
    # perception/exploration scenario.
    assert pw["rank"]["explorer"] <= pw["rank"]["idle"]
    assert set(out["cell_scores"]) == {"explorer", "idle"}
    assert pw["elo"]["explorer"] >= pw["elo"]["idle"]
|