Spaces:

qpluslab
/

OpenRA-Bench

Running

yxc20098 committed on May 17

Commit

b04adfc

1 Parent(s): 03e4efa

Pairwise adversarial eval + Elo (the user's 'pairwise conditions')

openra_bench/pairwise.py: comparative pairwise ranking — every model
plays the same controlled scenarios; ranked head-to-head on shared
cells by composite via deterministic, order-independent Elo (one
update per model pair, ties=0.5, no-shared-cells skipped). pure
pairwise_elo() (engine-free, unit-tested) + run_pairwise() wiring it
to the live evaluator. Delivers the 'evaluate on pairwise adversarial
conditions' goal without an engine rewrite (true co-located 1v1 is the
deeper task #3). tests (5): dominance ranking, determinism/order-
independence, tie handling, >=2-model guard, engine-backed
competent>=idle. Full bench suite green.

Files changed (2) hide show

openra_bench/pairwise.py +104 -0
tests/test_pairwise.py +69 -0

openra_bench/pairwise.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""Pairwise adversarial evaluation + Elo.
+The project goal includes evaluating models "on pairwise adversarial
+conditions", not only fixed-scenario absolute scores. True co-located
+1v1 (both sides model-driven in one map) is the deeper engine feature
+(task #3); this module delivers the tractable, immediately-useful form:
+**comparative** pairwise ranking — every model plays the *same*
+controlled scenarios, and models are ranked by head-to-head composite
+on shared cells via Elo. Pure + deterministic (no engine in
+`pairwise_elo`), so it is fully unit-testable; `run_pairwise` wires it
+to the live evaluator.
+"""
+from __future__ import annotations
+import itertools
+from pathlib import Path
+from typing import Callable
+from .run_eval import AgentFactory, evaluate
# Scoring input shape: model name -> {cell id -> mean composite over that
# cell's seeds}.
ModelCellScores = dict[str, dict[str, float]]

# Multiplier applied to (actual - expected) match score in each Elo update.
ELO_K = 32.0

# Rating every model holds before any pair has been processed.
ELO_START = 1000.0

# Two composites whose absolute difference is at most this value are
# scored as a tie on that cell.
_TIE_EPS = 1e-6
+def _cell_means(stats: dict) -> dict[str, float]:
+    """Mean composite per cell from a run_eval stats dict (public split)."""
+    by: dict[str, list[float]] = {}
+    for e in stats.get("episodes", []):
+        if e.get("split", "public") != "public":
+            continue
+        by.setdefault(e["cell"], []).append(e["composite"])
+    return {c: sum(v) / len(v) for c, v in by.items() if v}
def pairwise_elo(scores: ModelCellScores) -> dict:
    """Compute a deterministic Elo ranking from per-cell composite scores.

    Every unordered pair of models plays exactly one "match".  A's match
    score is the mean, over the cells both models have, of a per-cell
    result: 1.0 when A's composite exceeds B's by more than ``_TIE_EPS``,
    0.0 when B's exceeds A's by more than ``_TIE_EPS``, and 0.5 otherwise.
    "Otherwise" covers near-equal composites and composites that cannot be
    ordered at all (a NaN on either side), so an unorderable cell never
    favours whichever model name happens to sort first.

    Models and pairs are visited in sorted-name order, so the result does
    not depend on the insertion order of ``scores``.  Updates are applied
    sequentially: a later pair sees the ratings left by earlier pairs.
    Pairs with no shared cell are skipped and leave both ratings unchanged.

    Args:
        scores: ``{model: {cell: composite}}``.

    Returns:
        A dict with four keys:

        * ``"elo"``: final rating per model, rounded to one decimal.
        * ``"rank"``: 1-based rank per model, highest (unrounded) Elo
          first, ties broken by model name.
        * ``"matrix"``: ``matrix[a][b]`` is A's match score against B,
          rounded to four decimals; present only for pairs that share at
          least one cell.
        * ``"shared_cells"``: ``"a|b"`` -> number of shared cells for
          every pair, with ``a`` sorting before ``b``.
    """
    models = sorted(scores)
    elo = {m: ELO_START for m in models}
    matrix: dict[str, dict[str, float]] = {m: {} for m in models}
    shared_cells: dict[str, int] = {}

    for a, b in itertools.combinations(models, 2):
        # Computed once per pair and reused for both the match and the
        # ``shared_cells`` report.
        shared = set(scores[a]) & set(scores[b])
        shared_cells[f"{a}|{b}"] = len(shared)
        if not shared:
            continue

        # Per-cell results are multiples of 0.5, so the running sum is
        # exact and the set's iteration order cannot change it.
        points = 0.0
        for cell in shared:
            diff = scores[a][cell] - scores[b][cell]
            if diff > _TIE_EPS:
                points += 1.0
            elif diff < -_TIE_EPS:
                continue  # clear win for B: A earns nothing on this cell
            else:
                # Within tolerance, or NaN (every comparison above is
                # False for NaN): score the cell as a draw.
                points += 0.5

        sa = points / len(shared)  # A's match score in [0, 1]
        matrix[a][b] = round(sa, 4)
        matrix[b][a] = round(1.0 - sa, 4)

        # Standard Elo expectation for A given the current rating gap.
        ea = 1.0 / (1.0 + 10.0 ** ((elo[b] - elo[a]) / 400.0))
        elo[a] += ELO_K * (sa - ea)
        elo[b] += ELO_K * ((1.0 - sa) - (1.0 - ea))

    ranked = sorted(models, key=lambda m: (-elo[m], m))
    return {
        "elo": {m: round(elo[m], 1) for m in models},
        "rank": {m: i + 1 for i, m in enumerate(ranked)},
        "matrix": matrix,  # matrix[a][b] = A's match score vs B
        "shared_cells": shared_cells,
    }
def run_pairwise(
    packs: list[Path],
    levels: list[str],
    seeds: list[int],
    agents: dict[str, AgentFactory],
) -> dict:
    """Evaluate every named model on identical packs/levels/seeds and rank them.

    Each factory in ``agents`` is passed to ``evaluate`` with the same
    ``packs``, ``levels`` and ``seeds``; the resulting stats are reduced to
    per-cell means and fed to ``pairwise_elo``.

    Args:
        packs: Scenario pack paths forwarded to ``evaluate``.
        levels: Level names forwarded to ``evaluate``.
        seeds: Seeds forwarded to ``evaluate``.
        agents: Model name -> agent factory.

    Returns:
        A dict with ``"pairwise"`` (the ``pairwise_elo`` result),
        ``"cell_scores"`` (model -> cell -> mean composite) and
        ``"per_model_overall"`` (model -> the ``"overall"`` entry of its
        stats, or ``{}`` when absent).

    Raises:
        ValueError: If fewer than two models are supplied.
    """
    if len(agents) < 2:
        raise ValueError("pairwise eval needs >= 2 models")

    cell_scores: ModelCellScores = {}
    overall_by_model: dict[str, dict] = {}
    for model_name, make_agent in agents.items():
        model_stats = evaluate(packs, levels, seeds, agent_factory=make_agent)
        cell_scores[model_name] = _cell_means(model_stats)
        overall_by_model[model_name] = model_stats.get("overall", {})

    ranking = pairwise_elo(cell_scores)
    return {
        "pairwise": ranking,
        "cell_scores": cell_scores,
        "per_model_overall": overall_by_model,
    }

tests/test_pairwise.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""Pairwise adversarial Elo: pure determinism/correctness + one
+engine-backed comparative run (the user's 'pairwise conditions')."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from openra_bench.pairwise import pairwise_elo, run_pairwise
+PACKS = Path(__file__).parent.parent / "openra_bench" / "scenarios" / "packs"
def test_pairwise_elo_ranks_dominant_model_first():
    """A model that wins every shared cell gets rank 1 and the higher Elo."""
    result = pairwise_elo(
        {
            "strong": {"c1:easy": 0.9, "c2:easy": 0.8, "c3:easy": 0.7},
            "weak": {"c1:easy": 0.2, "c2:easy": 0.3, "c3:easy": 0.1},
        }
    )

    ranks = result["rank"]
    assert ranks["strong"] == 1
    assert ranks["weak"] == 2

    ratings = result["elo"]
    assert ratings["strong"] > ratings["weak"]

    # "strong" wins all three shared cells, so its match score is 1.0 and
    # the mirrored entry for "weak" is 0.0.
    head_to_head = result["matrix"]
    assert head_to_head["strong"]["weak"] == 1.0
    assert head_to_head["weak"]["strong"] == 0.0
    assert result["shared_cells"]["strong|weak"] == 3
def test_pairwise_elo_is_deterministic_and_order_independent():
    """Reordering the input mapping must not change any part of the result."""
    forward = {"a": {"x": 0.5}, "b": {"x": 0.6}, "c": {"x": 0.4}}
    # Same entries, inserted in the opposite order: c, b, a.
    backward = dict(reversed(list(forward.items())))

    first = pairwise_elo(forward)
    second = pairwise_elo(backward)
    assert first == second
def test_ties_give_half_and_no_shared_cells_skipped():
    """Equal composites score 0.5; a model sharing no cells plays no match."""
    table = {
        "a": {"x": 0.5},
        "b": {"x": 0.5},
        "lonely": {"y": 0.9},
    }
    result = pairwise_elo(table)

    assert result["matrix"]["a"]["b"] == 0.5
    assert result["shared_cells"]["a|lonely"] == 0

    # A tie between equally rated models leaves both at the start rating.
    ratings = result["elo"]
    assert ratings["a"] == 1000.0
    assert ratings["b"] == 1000.0
def test_needs_two_models():
    """run_pairwise rejects a roster with fewer than two models."""
    single_model = {"only": lambda c: None}
    with pytest.raises(ValueError):
        run_pairwise(packs=[], levels=[], seeds=[], agents=single_model)
@pytest.mark.skipif(
    # ``__import__("importlib.util")`` loads the submodule and returns the
    # top-level ``importlib`` package, so ``.util`` is guaranteed to be
    # bound.  ``__import__("importlib").util`` only works when some other
    # module has already imported ``importlib.util`` as a side effect.
    not __import__("importlib.util").util.find_spec("openra_train"),
    reason="Rust env wheel not installed",
)
def test_run_pairwise_competent_beats_idle_on_engine():
    """Engine-backed check: the scripted explorer must not rank below idle.

    Skipped when the ``openra_train`` module cannot be found.
    """
    from openra_bench.eval_core import scripted_explore_agent

    # Each factory takes one argument and returns the agent callable.
    agents = {
        "explorer": lambda c: scripted_explore_agent,
        # Idle baseline: issues a single observe command on every call.
        "idle": lambda c: (lambda rs, Cmd: [Cmd.observe()]),
    }
    out = run_pairwise(
        [PACKS / "perception-frontier-reading.yaml"],
        ["easy"],
        [1, 2],
        agents,
    )
    pw = out["pairwise"]
    # The exploring agent should not rank below the idle one on a
    # perception/exploration scenario.
    assert pw["rank"]["explorer"] <= pw["rank"]["idle"]
    assert set(out["cell_scores"]) == {"explorer", "idle"}
    assert pw["elo"]["explorer"] >= pw["elo"]["idle"]