OpenRA-Bench / tests /test_build_sequence_tech_fastest.py
yxc20098's picture
feat(scenario): build-sequence-tech-fastest — fastest weap-tech BO (PlanBench cost-optimal anchor)
34333cc
Raw
History Blame Contribute Delete
13.4 kB
"""build-sequence-tech-fastest pack โ€” full no-cheat validation on Rust.
Wave-7 REASONING โ€” cost-optimal build-order planning. The agent must
reach the war factory (`weap`) on the SHORTEST prerequisite chain:
powr โ†’ proc โ†’ weap
Any detour (build a barracks/tent first, or a redundant power plant,
or an early infantry queue) overruns the tight tick budget and loses.
The chain is enforced by the Wave-2 `then:` happened-before composite;
the deadline (`within_ticks`) is the cost-optimality teeth โ€” slack is
tuned so the OPTIMAL plan fits and the tent-detour plan does NOT.
Bar (CLAUDE.md): the intended cost-optimal policy WINS on every
(level, seed); stall and the tent-first wrong-path policy LOSE on
every (level, seed). Real LOSS not DRAW — `fail after_ticks:T+1`
inside max_turns is the bite.
Scenario shape:
  - rush-hour-arena, allies vs soviet (bot disabled).
  - easy: T=3000, max_turns=40 — generous (4-turn buffer).
  - medium: T=2800, max_turns=35 — tight (≈2-turn buffer).
  - hard: T=2800, max_turns=35 — same tight T + ≥2 spawn_point
    groups (NORTH y=14 / SOUTH y=26 base, round-robined).
Measured optimal timing (seed 1, scripted intended policy):
  powr completes ≈ tick 273 (turn 3)
  proc completes ≈ tick 1263 (turn 14)
  weap completes ≈ tick 2613 (turn 29)
Measured tent-first wrong-path timing:
  weap completes ≈ tick 3063 (turn 34) — beyond every level's T.
"""
from __future__ import annotations
import pytest
pytest.importorskip("openra_train", reason="Rust env wheel not installed")
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")
from openra_bench.eval_core import run_level
from openra_bench.scenarios import load_pack
from openra_bench.scenarios.loader import PACKS_DIR, compile_level
# Scenario pack file under test, resolved inside the packs directory.
PACK = PACKS_DIR / "build-sequence-tech-fastest.yaml"
# Difficulty levels iterated by the pack-shape and engine-bound tests.
LEVELS = ("easy", "medium", "hard")
# Seeds used to parametrise the engine-bound tests.
SEEDS = (1, 2, 3, 4)
# ── Policies ──────────────────────────────────────────────────────
def _stall_policy():
"""Do nothing โ€” must LOSE on the clock on every level/seed."""
def pol(obs, Cmd):
return [Cmd.observe()]
return pol
def _intended_policy():
"""Cost-optimal play: build powr โ†’ proc โ†’ weap, each one placed
relative to the agent's actual fact (so the policy generalises
across the hard-tier spawn variation). This is the policy the
pack is solvable by โ€” must WIN on every (level, seed)."""
milestone = {"powr": False, "proc": False, "weap": False}
def pol(obs, Cmd):
ob = obs.get("own_buildings", []) or []
own_b = {b["type"] for b in ob}
prod = obs.get("production", []) or []
for b in ("powr", "proc", "weap"):
if b in own_b:
milestone[b] = True
cmds = []
base = [b for b in ob if b["type"] == "fact"]
if not milestone["powr"]:
if "powr" not in prod:
cmds.append(Cmd.build("powr"))
if base:
cmds.append(Cmd.place_building(
"powr", base[0]["cell_x"] + 4, base[0]["cell_y"]
))
elif not milestone["proc"]:
if "proc" not in prod:
cmds.append(Cmd.build("proc"))
if base:
cmds.append(Cmd.place_building(
"proc", base[0]["cell_x"] + 6, base[0]["cell_y"] + 3
))
elif not milestone["weap"]:
if "weap" not in prod:
cmds.append(Cmd.build("weap"))
if base:
cmds.append(Cmd.place_building(
"weap", base[0]["cell_x"] + 8, base[0]["cell_y"]
))
if not cmds:
cmds.append(Cmd.observe())
return cmds
return pol
def _tent_first_policy():
"""Wrong cost-non-optimal play: powr โ†’ tent โ†’ proc โ†’ weap. The
tent is not on the prerequisite chain for weap (only proc is); it
bloats the BOM by 500 credits and ~5 turns. Must LOSE on the
clock on every level/seed."""
milestone = {"powr": False, "tent": False, "proc": False, "weap": False}
def pol(obs, Cmd):
ob = obs.get("own_buildings", []) or []
own_b = {b["type"] for b in ob}
prod = obs.get("production", []) or []
for b in ("powr", "tent", "proc", "weap"):
if b in own_b:
milestone[b] = True
cmds = []
base = [b for b in ob if b["type"] == "fact"]
if not milestone["powr"]:
if "powr" not in prod:
cmds.append(Cmd.build("powr"))
if base:
cmds.append(Cmd.place_building(
"powr", base[0]["cell_x"] + 4, base[0]["cell_y"]
))
elif not milestone["tent"]:
if "tent" not in prod:
cmds.append(Cmd.build("tent"))
if base:
cmds.append(Cmd.place_building(
"tent", base[0]["cell_x"] + 4, base[0]["cell_y"] + 3
))
elif not milestone["proc"]:
if "proc" not in prod:
cmds.append(Cmd.build("proc"))
if base:
cmds.append(Cmd.place_building(
"proc", base[0]["cell_x"] + 6, base[0]["cell_y"] + 3
))
elif not milestone["weap"]:
if "weap" not in prod:
cmds.append(Cmd.build("weap"))
if base:
cmds.append(Cmd.place_building(
"weap", base[0]["cell_x"] + 8, base[0]["cell_y"]
))
if not cmds:
cmds.append(Cmd.observe())
return cmds
return pol
# ── Pack-shape tests (cheap; do not run the engine) ──────────────
def test_pack_compiles_with_three_levels():
    """The pack loads and exposes the expected id, capability and levels."""
    loaded = load_pack(PACK)
    meta = loaded.meta
    assert meta.id == "build-sequence-tech-fastest"
    assert meta.capability == "reasoning"
    # Exactly these three difficulty levels, no more and no fewer.
    assert set(loaded.levels) == {"easy", "medium", "hard"}
def test_meta_benchmark_anchor_set():
    """The pack metadata names both required benchmark anchors.

    At least one ``benchmark_anchor`` entry must mention "PlanBench" and
    at least one must mention "BOM"; a missing/empty anchor list fails.
    """
    anchors = load_pack(PACK).meta.benchmark_anchor or []
    for keyword in ("PlanBench", "BOM"):
        # The anchors list itself is the failure message, for diagnosis.
        assert any(keyword in a for a in anchors), anchors
def test_hard_tier_has_seed_driven_spawn_groups():
    """Hard must define ≥2 agent spawn_point groups so the seed varies
    the start base (tests/test_hard_tier.py::UPGRADED contract)."""
    c = compile_level(load_pack(PACK), "hard")
    # Distinct spawn_point values across all agent-owned actors.
    # NOTE(review): a None spawn_point would count as its own group
    # here — confirm agent actors on hard always carry one.
    sp = {a.spawn_point for a in c.scenario.actors if a.owner == "agent"}
    assert len(sp) >= 2, f"hard needs ≥2 spawn groups, got {sp}"
def test_every_level_has_fail_condition():
    """Every level compiles with a fail condition, so it can emit a LOSS
    instead of silently ending in a draw."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        fail = compile_level(pack, lvl).fail_condition
        assert fail is not None, f"{lvl} missing fail_condition"
def test_then_composite_used_in_win():
    """The compiled win condition carries the 3-step build-order chain.

    For every level, the dumped win condition's ``all_of`` list must
    contain at least one ``then`` composite, and each such composite
    must hold exactly three ``has_building`` clauses in the order
    powr, proc, weap.
    """
    expected_order = ["powr", "proc", "weap"]
    # Load once and reuse for every level, as the sibling shape tests do.
    pack = load_pack(PACK)
    for lvl in LEVELS:
        c = compile_level(pack, lvl)
        win = c.win_condition.model_dump(exclude_none=True)
        chains = [cl for cl in (win.get("all_of") or []) if "then" in cl]
        assert chains, f"{lvl} win missing then-chain: {win}"
        for cl in chains:
            clauses = (cl["then"] or {}).get("clauses") or []
            assert len(clauses) == 3, (
                f"{lvl} then-chain must be powr→proc→weap (3 clauses); "
                f"got {clauses}"
            )
            # And in the exact engine-enforced prereq order.
            actual_order = [clause.get("has_building") for clause in clauses]
            assert actual_order == expected_order, (
                f"{lvl} then-chain order must be {expected_order}; "
                f"got {actual_order}"
            )
def test_tick_budget_aligned_with_max_turns():
    """Every ``within_ticks`` deadline must be reachable inside max_turns.

    Engine advances ~90 ticks/turn → reachable max = 93 + 90·(N-1) for
    ``max_turns == N``. A deadline beyond that could never bite before
    the episode ends, so the level would draw instead of lose.
    """
    def _collect(node, key):
        """Yield each value stored under ``key`` anywhere inside the
        nested dict/list structure ``node`` (parent before children)."""
        if isinstance(node, dict):
            if key in node:
                yield node[key]
            for v in node.values():
                yield from _collect(v, key)
        elif isinstance(node, list):
            for v in node:
                yield from _collect(v, key)

    pack = load_pack(PACK)
    for lvl in LEVELS:
        max_turns = pack.levels[lvl].max_turns
        reachable = 93 + 90 * (max_turns - 1)
        win = compile_level(pack, lvl).win_condition.model_dump(exclude_none=True)
        wts = list(_collect(win, "within_ticks"))
        assert wts, f"{lvl} has no within_ticks leaf (no clock teeth)"
        for wt in wts:
            assert wt <= reachable, (
                f"{lvl} within_ticks={wt} > reachable={reachable} "
                f"(max_turns={max_turns}) — deadline never bites ⇒ draw"
            )
# ── Engine-bound tests (parameterised over seeds 1..4) ────────────
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_intended_cost_optimal_policy_wins(level, seed):
    """The scripted powr, proc, weap policy must WIN on every (level, seed).

    This is the solvability check for the pack: the advertised
    cost-optimal play has to fit inside each level's budget.
    """
    res = run_level(
        compile_level(load_pack(PACK), level),
        _intended_policy(),
        seed=seed,
    )
    # then_progress may be absent or empty; normalise for the message.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "win", (
        f"intended cost-optimal must WIN on {level} s={seed}; "
        f"got {res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, "
        f"own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_stall_loses(level, seed):
    """A do-nothing policy must end in a LOSS, never a draw, on every
    (level, seed)."""
    res = run_level(
        compile_level(load_pack(PACK), level),
        _stall_policy(),
        seed=seed,
    )
    assert res.outcome == "loss", (
        f"stall must LOSE on {level} s={seed}; got {res.outcome} "
        f"(tick={res.signals.game_tick})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_tent_first_wrong_path_loses(level, seed):
    """The tent-first detour policy must LOSE on every (level, seed).

    The pack measures COST-OPTIMAL planning, so a plan that merely
    arrives eventually must not win. This module's measurements put
    weap completion for the detour at about tick 3063, past every
    level's deadline.
    """
    res = run_level(
        compile_level(load_pack(PACK), level),
        _tent_first_policy(),
        seed=seed,
    )
    # then_progress may be absent or empty; normalise for the message.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "loss", (
        f"tent-first wrong-path must LOSE on {level} s={seed}; got "
        f"{res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
def test_hard_seeds_produce_distinct_starts(seed):
    """Per-seed smoke test of the hard level's starting base.

    For each seed, a do-nothing probe records ``own_buildings`` from the
    first observation. The test asserts that the stalled run is a LOSS
    and that at least one ``fact`` was visible in that first
    observation. It does NOT compare positions between seeds: each
    parametrised case sees only its own seed. The cross-seed
    "different cell" check lives in
    ``test_hard_spawns_round_robin_across_seeds``.
    """
    c = compile_level(load_pack(PACK), "hard")
    first_buildings = None

    def probe(obs, Cmd):
        nonlocal first_buildings
        # Snapshot only the very first observation of the episode.
        if first_buildings is None:
            first_buildings = list(obs.get("own_buildings", []) or [])
        return [Cmd.observe()]

    res = run_level(c, probe, seed=seed)
    assert res.outcome == "loss", (
        f"stall probe must LOSE on hard s={seed}; got {res.outcome}"
    )
    facts = [
        (b["cell_x"], b["cell_y"])
        for b in (first_buildings or [])
        if b["type"] == "fact"
    ]
    assert facts, f"no fact observed at turn 0 for seed={seed}"
def test_hard_spawns_round_robin_across_seeds():
    """Seeds 1 and 2 must start the agent's ``fact`` on different cells.

    Each seed is run with a do-nothing probe that records the cell of
    the first ``fact`` it ever observes. Both runs must yield a
    position, and the two positions must differ.
    """
    c = compile_level(load_pack(PACK), "hard")

    def first_fact_cell(seed):
        """Run the hard level with ``seed`` and return the (x, y) of the
        first observed fact, or None if no fact was ever observed."""
        seen = []

        def pol(obs, Cmd):
            # Keep looking until a fact has been recorded once.
            if not seen:
                bs = obs.get("own_buildings", []) or []
                cells = [
                    (b["cell_x"], b["cell_y"]) for b in bs if b["type"] == "fact"
                ]
                seen.extend(cells[:1])
            return [Cmd.observe()]

        run_level(c, pol, seed=seed)
        return seen[0] if seen else None

    pos1 = first_fact_cell(1)
    pos2 = first_fact_cell(2)
    assert pos1 and pos2, f"missing fact obs: s1={pos1} s2={pos2}"
    assert pos1 != pos2, (
        f"hard spawn round-robin is degenerate: seed 1 and 2 both "
        f"started at {pos1}"
    )