"""Adversarial 1v1 spotlight: ladder rating + family integ. The ladder metric is pure logic (fast, exhaustive); the 3 packs are also compiled and smoke-run on the live Rust engine to prove the new `adversarial` capability flows end to end (schema → engine → score → leaderboard breakdown). """ from __future__ import annotations from pathlib import Path import pytest pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed") from openra_bench.adversarial import ( RUNGS, adversarial_summary, ladder_rating, ladder_ratings, ) from openra_bench.leaderboard import _capability_breakdown, ingest_run PACKS = Path(__file__).parent.parent / "openra_bench" / "scenarios" / "packs" # skirmish/siege consolidated into adversarial-duel (quarantined dups). ADV = ["adversarial-duel"] def test_ladder_rating_is_contiguous_from_easy(): assert ladder_rating({}) == 0 assert ladder_rating({"easy": "loss"}) == 0 assert ladder_rating({"easy": "win"}) == 1 assert ladder_rating({"easy": "win", "medium": "win"}) == 2 assert ladder_rating(dict.fromkeys(RUNGS, "win")) == 3 # non-contiguous: hard won but medium lost → still 1 assert ladder_rating( {"easy": "win", "medium": "loss", "hard": "win"} ) == 1 # draw does not clear a rung assert ladder_rating({"easy": "draw"}) == 0 def test_ladder_ratings_need_all_seeds_won(): stats = { "episodes": [ {"cell": "adversarial-duel:easy", "capability": "adversarial", "split": "public", "outcome": "win"}, {"cell": "adversarial-duel:easy", "capability": "adversarial", "split": "public", "outcome": "loss"}, # one seed lost {"cell": "adversarial-duel:medium", "capability": "adversarial", "split": "public", "outcome": "win"}, # non-adversarial + held-out ignored {"cell": "rush-hour:easy", "capability": "action", "split": "public", "outcome": "win"}, {"cell": "adversarial-duel:hard", "capability": "adversarial", "split": "held_out", "outcome": "win"}, ] } # easy not all-won → rating 0 (medium can't count, non-contiguous) assert ladder_ratings(stats) == {"adversarial-duel": 0} s = adversarial_summary(stats) assert s["packs"] == ["adversarial-duel"] assert s["mean_ladder_rating"] == 0.0 and s["max_rung"] == 3 def test_summary_and_leaderboard_carry_adversarial(tmp_path): stats = { "episodes": [ {"cell": "adversarial-duel:easy", "capability": "adversarial", "split": "public", "outcome": "win", "composite": 0.7}, {"cell": "adversarial-duel:medium", "capability": "adversarial", "split": "public", "outcome": "win", "composite": 0.6}, {"cell": "adversarial-duel:hard", "capability": "adversarial", "split": "public", "outcome": "loss", "composite": 0.2}, ], "overall": {"n": 3, "win_rate": 0.66, "composite_mean": 0.5}, "adversarial": None, } stats["adversarial"] = adversarial_summary(stats) assert stats["adversarial"]["ladder_ratings"] == {"adversarial-duel": 2} rec = ingest_run(stats, "m1", store=tmp_path / "lb.jsonl") assert rec["adversarial_rating"] == 2 / 1 # mean over 1 pack assert rec["adversarial_ladders"] == {"adversarial-duel": 2} cap = _capability_breakdown(stats["episodes"]) assert "adversarial" in cap and cap["adversarial"]["n"] == 3 @pytest.mark.parametrize("pid", ADV) def test_adversarial_pack_compiles_and_runs(pid): pytest.importorskip("openra_train") from openra_bench.eval_core import run_level from openra_bench.scenarios import load_pack from openra_bench.scenarios.loader import compile_level pack = load_pack(PACKS / f"{pid}.yaml") for lvl in RUNGS: c = compile_level(pack, lvl) assert c.meta.capability == "adversarial" assert c.map_supported, f"{pid}:{lvl} map must be Rust-loadable" c = compile_level(pack, "easy") res = run_level(c, lambda rs, C: [C.observe()], seed=1) assert res.outcome in {"win", "draw", "loss"} assert res.turns >= 1