OpenRA-Bench / tests /test_adversarial.py
Yiyu Tian
tests: module-level importorskip on all 80 engine-dependent test files
5cfed54
Raw
History Blame Contribute Delete
4.21 kB
"""Adversarial 1v1 spotlight: ladder rating + family integration.
The ladder metric is pure logic (fast, exhaustive); the adversarial
pack(s) listed in `ADV` are also compiled and smoke-run on the live Rust
engine to prove the new `adversarial` capability flows end to end
(schema → engine → score → leaderboard breakdown).
"""
from __future__ import annotations
from pathlib import Path
import pytest
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")
from openra_bench.adversarial import (
RUNGS,
adversarial_summary,
ladder_rating,
ladder_ratings,
)
from openra_bench.leaderboard import _capability_breakdown, ingest_run
# Directory holding the scenario pack YAML files, resolved relative to this
# test file (<repo>/openra_bench/scenarios/packs).
PACKS = Path(__file__).parent.parent / "openra_bench" / "scenarios" / "packs"
# Pack ids exercised by the parametrized engine smoke test below.
# The former skirmish/siege packs were consolidated into adversarial-duel
# (the duplicates are quarantined), so it is the only entry.
ADV = ["adversarial-duel"]
def test_ladder_rating_is_contiguous_from_easy():
    """Check that the rating counts rungs won in an unbroken run from easy."""
    expectations = [
        ({}, 0),
        ({"easy": "loss"}, 0),
        ({"easy": "win"}, 1),
        ({"easy": "win", "medium": "win"}, 2),
        (dict.fromkeys(RUNGS, "win"), 3),
        # Non-contiguous: hard is won but medium is lost, so the rating
        # stays at 1.
        ({"easy": "win", "medium": "loss", "hard": "win"}, 1),
        # A draw does not clear a rung.
        ({"easy": "draw"}, 0),
    ]
    for rung_outcomes, expected_rating in expectations:
        assert ladder_rating(rung_outcomes) == expected_rating
def test_ladder_ratings_need_all_seeds_won():
    """Check that a rung with any lost seed is not cleared.

    Also checks that non-adversarial and held-out episodes do not show up
    in the per-pack ratings or the summary.
    """

    def episode(cell, capability, split, outcome):
        # Build one episode record in the shape used throughout this module.
        return {
            "cell": cell,
            "capability": capability,
            "split": split,
            "outcome": outcome,
        }

    stats = {
        "episodes": [
            episode("adversarial-duel:easy", "adversarial", "public", "win"),
            # One seed on the easy rung is lost.
            episode("adversarial-duel:easy", "adversarial", "public", "loss"),
            episode("adversarial-duel:medium", "adversarial", "public", "win"),
            # Non-adversarial and held-out episodes are ignored.
            episode("rush-hour:easy", "action", "public", "win"),
            episode("adversarial-duel:hard", "adversarial", "held_out", "win"),
        ]
    }

    # Easy is not all-won, so the rating is 0; the medium win cannot count
    # because it would be non-contiguous.
    assert ladder_ratings(stats) == {"adversarial-duel": 0}

    summary = adversarial_summary(stats)
    assert summary["packs"] == ["adversarial-duel"]
    assert summary["mean_ladder_rating"] == 0.0
    assert summary["max_rung"] == 3
def test_summary_and_leaderboard_carry_adversarial(tmp_path):
    """Check the adversarial summary reaches the leaderboard record.

    Easy and medium are won and hard is lost, so the single pack rates 2;
    the ingested record and the capability breakdown must both reflect it.
    """
    rows = [
        ("adversarial-duel:easy", "win", 0.7),
        ("adversarial-duel:medium", "win", 0.6),
        ("adversarial-duel:hard", "loss", 0.2),
    ]
    episodes = [
        {
            "cell": cell,
            "capability": "adversarial",
            "split": "public",
            "outcome": outcome,
            "composite": composite,
        }
        for cell, outcome, composite in rows
    ]
    stats = {
        "episodes": episodes,
        "overall": {"n": 3, "win_rate": 0.66, "composite_mean": 0.5},
        "adversarial": None,
    }
    # The summary is computed from the stats and then stored back on them.
    stats["adversarial"] = adversarial_summary(stats)
    assert stats["adversarial"]["ladder_ratings"] == {"adversarial-duel": 2}

    store_path = tmp_path / "lb.jsonl"
    record = ingest_run(stats, "m1", store=store_path)
    assert record["adversarial_rating"] == 2 / 1  # mean over 1 pack
    assert record["adversarial_ladders"] == {"adversarial-duel": 2}

    breakdown = _capability_breakdown(stats["episodes"])
    assert "adversarial" in breakdown
    assert breakdown["adversarial"]["n"] == 3
@pytest.mark.parametrize("pid", ADV)
def test_adversarial_pack_compiles_and_runs(pid):
    """Compile every rung of the pack, then smoke-run the easy rung once."""
    # NOTE(review): this skips on "openra_train" while the module-level skip
    # uses "openra_rl_training" — confirm both names are intended.
    pytest.importorskip("openra_train")
    from openra_bench.scenarios.loader import compile_level
    from openra_bench.scenarios import load_pack
    from openra_bench.eval_core import run_level

    def observe_only(rs, C):
        # Minimal agent: issue a single observe command each time it is called.
        return [C.observe()]

    pack_path = PACKS / f"{pid}.yaml"
    pack = load_pack(pack_path)

    # Every rung must compile, be tagged adversarial, and use a supported map.
    for lvl in RUNGS:
        compiled = compile_level(pack, lvl)
        assert compiled.meta.capability == "adversarial"
        assert compiled.map_supported, f"{pid}:{lvl} map must be Rust-loadable"

    # Only the easy rung is actually run.
    easy_level = compile_level(pack, "easy")
    result = run_level(easy_level, observe_only, seed=1)
    assert result.outcome in {"win", "draw", "loss"}
    assert result.turns >= 1