yxc20098 committed on
Commit
b04adfc
·
1 Parent(s): 03e4efa

Pairwise adversarial eval + Elo (the user's 'pairwise conditions')

Browse files

openra_bench/pairwise.py: comparative pairwise ranking — every model
plays the same controlled scenarios; ranked head-to-head on shared
cells by composite via deterministic, order-independent Elo (one
update per model pair, ties=0.5, no-shared-cells skipped). pure
pairwise_elo() (engine-free, unit-tested) + run_pairwise() wiring it
to the live evaluator. Delivers the 'evaluate on pairwise adversarial
conditions' goal without an engine rewrite (true co-located 1v1 is the
deeper task #3). tests (5): dominance ranking, determinism/order-
independence, tie handling, >=2-model guard, engine-backed
competent>=idle. Full bench suite green.

Files changed (2) hide show
  1. openra_bench/pairwise.py +104 -0
  2. tests/test_pairwise.py +69 -0
openra_bench/pairwise.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pairwise adversarial evaluation + Elo.
2
+
3
+ The project goal includes evaluating models "on pairwise adversarial
4
+ conditions", not only fixed-scenario absolute scores. True co-located
5
+ 1v1 (both sides model-driven in one map) is the deeper engine feature
6
+ (task #3); this module delivers the tractable, immediately-useful form:
7
+ **comparative** pairwise ranking — every model plays the *same*
8
+ controlled scenarios, and models are ranked by head-to-head composite
9
+ on shared cells via Elo. Pure + deterministic (no engine in
10
+ `pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it
11
+ to the live evaluator.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import itertools
17
+ from pathlib import Path
18
+ from typing import Callable
19
+
20
+ from .run_eval import AgentFactory, evaluate
21
+
22
# Per model: {cell -> mean composite over that cell's seeds}.
ModelCellScores = dict[str, dict[str, float]]

# Elo K-factor: each pair update moves a rating by K * (match score - expected score).
ELO_K = 32.0
# Rating assigned to every model before any pair is processed.
ELO_START = 1000.0
# Per-cell composite differences within this margin are scored as a tie (0.5).
_TIE_EPS = 1e-6
28
+
29
+
30
+ def _cell_means(stats: dict) -> dict[str, float]:
31
+ """Mean composite per cell from a run_eval stats dict (public split)."""
32
+ by: dict[str, list[float]] = {}
33
+ for e in stats.get("episodes", []):
34
+ if e.get("split", "public") != "public":
35
+ continue
36
+ by.setdefault(e["cell"], []).append(e["composite"])
37
+ return {c: sum(v) / len(v) for c, v in by.items() if v}
38
+
39
+
40
def pairwise_elo(scores: ModelCellScores) -> dict:
    """Rank models with exactly one Elo update per unordered model pair.

    Models are visited in sorted-name order and each pair is compared on
    the cells both models have a score for. The first model's match score
    is the mean of its per-cell outcomes: 1.0 where it leads by more than
    ``_TIE_EPS``, 0.5 where the two scores are within ``_TIE_EPS``, and
    0.0 otherwise. Pairs with no shared cell leave ratings and the matrix
    untouched. Since iteration follows sorted names, the insertion order
    of ``scores`` has no effect on the result.

    Returns a dict with:
      * ``"elo"``: final rating per model, rounded to 1 decimal.
      * ``"rank"``: 1-based position by descending rating (name breaks ties).
      * ``"matrix"``: ``matrix[a][b]`` = A's match score vs B (4 decimals).
      * ``"shared_cells"``: ``"a|b"`` -> number of cells both models scored.
    """
    names = sorted(scores)
    ratings = dict.fromkeys(names, ELO_START)
    head_to_head: dict[str, dict[str, float]] = {name: {} for name in names}
    overlap: dict[str, int] = {}

    for first, second in itertools.combinations(names, 2):
        first_cells = scores[first]
        second_cells = scores[second]
        common = sorted(set(first_cells) & set(second_cells))
        # Recorded for every pair, including those skipped below.
        overlap[f"{first}|{second}"] = len(common)
        if not common:
            continue

        points = 0.0
        for cell in common:
            margin = first_cells[cell] - second_cells[cell]
            if margin > _TIE_EPS:
                points += 1.0
            elif abs(margin) <= _TIE_EPS:
                points += 0.5
        first_score = points / len(common)  # first model's match score in [0,1]

        head_to_head[first][second] = round(first_score, 4)
        head_to_head[second][first] = round(1.0 - first_score, 4)

        # Standard Elo expectation for the first model given current ratings.
        expected = 1.0 / (
            1.0 + 10.0 ** ((ratings[second] - ratings[first]) / 400.0)
        )
        ratings[first] += ELO_K * (first_score - expected)
        ratings[second] += ELO_K * ((1.0 - first_score) - (1.0 - expected))

    # Highest rating first; the model name makes equal ratings deterministic.
    ordering = sorted(names, key=lambda name: (-ratings[name], name))
    rounded = {name: round(ratings[name], 1) for name in names}
    positions = {name: place for place, name in enumerate(ordering, start=1)}
    return {
        "elo": rounded,
        "rank": positions,
        "matrix": head_to_head,  # matrix[a][b] = A's match score vs B
        "shared_cells": overlap,
    }
80
+
81
+
82
def run_pairwise(
    packs: list[Path],
    levels: list[str],
    seeds: list[int],
    agents: dict[str, AgentFactory],
) -> dict:
    """Evaluate every named agent on identical packs/levels/seeds and rank them.

    Each factory in ``agents`` is passed to ``evaluate`` with the same
    ``packs``, ``levels`` and ``seeds``; the resulting stats are reduced to
    per-cell mean composites and fed to ``pairwise_elo``.

    Returns a dict with ``"pairwise"`` (the Elo result), ``"cell_scores"``
    (per-model cell means) and ``"per_model_overall"`` (each model's
    ``"overall"`` stats entry, or ``{}`` when absent).

    Raises:
        ValueError: if fewer than two agents are supplied.
    """
    if len(agents) < 2:
        raise ValueError("pairwise eval needs >= 2 models")

    cell_scores: ModelCellScores = {}
    stats_by_model: dict[str, dict] = {}
    for model_name, make_agent in agents.items():
        model_stats = evaluate(packs, levels, seeds, agent_factory=make_agent)
        stats_by_model[model_name] = model_stats
        cell_scores[model_name] = _cell_means(model_stats)

    ranking = pairwise_elo(cell_scores)
    overall = {
        model_name: model_stats.get("overall", {})
        for model_name, model_stats in stats_by_model.items()
    }
    return {
        "pairwise": ranking,
        "cell_scores": cell_scores,
        "per_model_overall": overall,
    }
tests/test_pairwise.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pairwise adversarial Elo: pure determinism/correctness + one
2
+ engine-backed comparative run (the user's 'pairwise conditions')."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ from openra_bench.pairwise import pairwise_elo, run_pairwise
11
+
12
# Directory of scenario pack YAML files, resolved relative to this test file.
PACKS = Path(__file__).parent.parent / "openra_bench" / "scenarios" / "packs"
13
+
14
+
15
def test_pairwise_elo_ranks_dominant_model_first():
    """A model that wins every shared cell ranks first with match score 1.0."""
    cells = ("c1:easy", "c2:easy", "c3:easy")
    result = pairwise_elo(
        {
            "strong": dict(zip(cells, (0.9, 0.8, 0.7))),
            "weak": dict(zip(cells, (0.2, 0.3, 0.1))),
        }
    )
    ranks = result["rank"]
    ratings = result["elo"]
    matrix = result["matrix"]

    assert ranks["strong"] == 1
    assert ranks["weak"] == 2
    assert ratings["strong"] > ratings["weak"]
    # strong beat weak on all 3 shared cells → match score 1.0
    assert matrix["strong"]["weak"] == 1.0
    assert matrix["weak"]["strong"] == 0.0
    assert result["shared_cells"]["strong|weak"] == 3
27
+
28
+
29
def test_pairwise_elo_is_deterministic_and_order_independent():
    """The insertion order of the input dict must not change the result."""
    forward = {"a": {"x": 0.5}, "b": {"x": 0.6}, "c": {"x": 0.4}}
    # Same content, inserted in the opposite order.
    backward = {"c": {"x": 0.4}, "b": {"x": 0.6}, "a": {"x": 0.5}}
    from_forward = pairwise_elo(forward)
    from_backward = pairwise_elo(backward)
    assert from_forward == from_backward
33
+
34
+
35
def test_ties_give_half_and_no_shared_cells_skipped():
    """Equal scores split the match 0.5/0.5; disjoint models share 0 cells."""
    scores = {"a": {"x": 0.5}, "b": {"x": 0.5}, "lonely": {"y": 0.9}}
    result = pairwise_elo(scores)
    ratings = result["elo"]

    assert result["matrix"]["a"]["b"] == 0.5
    assert result["shared_cells"]["a|lonely"] == 0
    # tie → no rating change
    assert ratings["a"] == ratings["b"]
    assert ratings["b"] == 1000.0
40
+
41
+
42
def test_needs_two_models():
    """run_pairwise rejects an agents mapping with fewer than two entries."""
    single_agent = {"only": lambda c: None}
    with pytest.raises(ValueError):
        run_pairwise([], [], [], single_agent)
45
+
46
+
47
@pytest.mark.skipif(
    # Import the ``importlib.util`` submodule explicitly: importing only the
    # ``importlib`` package does not guarantee the ``util`` attribute exists,
    # which would fail at collection time with AttributeError.
    # ``__import__("importlib.util")`` returns the top-level package with the
    # submodule loaded.
    not __import__("importlib.util").util.find_spec("openra_train"),
    reason="Rust env wheel not installed",
)
def test_run_pairwise_competent_beats_idle_on_engine():
    """Engine-backed check: a scripted explorer must not rank below an idle agent.

    Skipped when the ``openra_train`` module cannot be found.
    """
    from openra_bench.eval_core import scripted_explore_agent

    agents = {
        "explorer": lambda c: scripted_explore_agent,
        # Idle baseline: issues only an observe command each step.
        "idle": lambda c: (lambda rs, Cmd: [Cmd.observe()]),
    }
    out = run_pairwise(
        [PACKS / "perception-frontier-reading.yaml"],
        ["easy"],
        [1, 2],
        agents,
    )
    pw = out["pairwise"]
    # The exploring agent should not rank below the idle one on a
    # perception/exploration scenario.
    assert pw["rank"]["explorer"] <= pw["rank"]["idle"]
    assert set(out["cell_scores"]) == {"explorer", "idle"}
    assert pw["elo"]["explorer"] >= pw["elo"]["idle"]