Spaces:
Running
Running
"""build-sequence-tech-fastest pack — full no-cheat validation on Rust.

Wave-7 REASONING — cost-optimal build-order planning. The agent must
reach the war factory (`weap`) on the SHORTEST prerequisite chain:

    powr → proc → weap

Any detour (build a barracks/tent first, or a redundant power plant,
or an early infantry queue) overruns the tight tick budget and loses.
The chain is enforced by the Wave-2 `then:` happened-before composite;
the deadline (`within_ticks`) is the cost-optimality teeth — slack is
tuned so the OPTIMAL plan fits and the tent-detour plan does NOT.

Bar (CLAUDE.md): the intended cost-optimal policy WINS on every
(level, seed); stall and the tent-first wrong-path policy LOSE on
every (level, seed). Real LOSS not DRAW — `fail after_ticks:T+1`
inside max_turns is the bite.

Scenario shape:
  - rush-hour-arena, allies vs soviet (bot disabled).
  - easy:   T=3000, max_turns=40 — generous (4-turn buffer).
  - medium: T=2800, max_turns=35 — tight (≈2-turn buffer).
  - hard:   T=2800, max_turns=35 — same tight T + ≥2 spawn_point
            groups (NORTH y=14 / SOUTH y=26 base, round-robined).

Measured optimal timing (seed 1, scripted intended policy):
    powr completes ≈ tick 273  (turn 3)
    proc completes ≈ tick 1263 (turn 14)
    weap completes ≈ tick 2613 (turn 29)

Measured tent-first wrong-path timing:
    weap completes ≈ tick 3063 (turn 34) — beyond every level's T.
"""
from __future__ import annotations

import pytest

# Skip this whole module (instead of failing at import time) when the
# Rust environment wheels are not installed. These calls deliberately sit
# before the openra_bench imports — presumably those imports need the
# wheels; confirm before reordering.
pytest.importorskip("openra_train", reason="Rust env wheel not installed")
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")

from openra_bench.eval_core import run_level  # noqa: E402
from openra_bench.scenarios import load_pack  # noqa: E402
from openra_bench.scenarios.loader import PACKS_DIR, compile_level  # noqa: E402

# Scenario pack under test.
PACK = PACKS_DIR / "build-sequence-tech-fastest.yaml"
# Difficulty tiers every engine-bound test is run against.
LEVELS = ("easy", "medium", "hard")
# Seeds every engine-bound test is run against.
SEEDS = (1, 2, 3, 4)
# ── Policies ─────────────────────────────────────────────────────────
def _stall_policy():
    """Return a do-nothing policy — must LOSE on the clock on every level/seed.

    The returned callable takes ``(obs, Cmd)`` and always answers with a
    single ``Cmd.observe()`` command, ignoring the observation.
    """
    def pol(obs, Cmd):
        return [Cmd.observe()]

    return pol
def _intended_policy():
    """Return the cost-optimal policy: build powr → proc → weap.

    Each building is placed relative to the agent's actual ``fact`` (so
    the policy generalises across the hard-tier spawn variation). This is
    the policy the pack is solvable by — it must WIN on every
    (level, seed).

    The returned callable takes ``(obs, Cmd)`` and returns a non-empty
    list of commands. It keeps state between calls: once a chain building
    has been seen in ``obs["own_buildings"]`` it stays marked as done.
    """
    # (building type, dx, dy): chain order plus the placement offset from
    # the first observed "fact" cell.
    plan = (("powr", 4, 0), ("proc", 6, 3), ("weap", 8, 0))
    milestone = {name: False for name, _dx, _dy in plan}

    def pol(obs, Cmd):
        ob = obs.get("own_buildings", []) or []
        own_b = {b["type"] for b in ob}
        # NOTE(review): the membership tests below assume "production" is
        # a list of building-type strings — confirm against the env.
        prod = obs.get("production", []) or []

        # Latch milestones so a step is never re-issued once achieved.
        for name in milestone:
            if name in own_b:
                milestone[name] = True

        base = [b for b in ob if b["type"] == "fact"]
        cmds = []
        # Act only on the first unfinished step of the chain.
        for name, dx, dy in plan:
            if milestone[name]:
                continue
            if name not in prod:
                cmds.append(Cmd.build(name))
            if base:
                cmds.append(Cmd.place_building(
                    name, base[0]["cell_x"] + dx, base[0]["cell_y"] + dy
                ))
            break

        # Nothing to do this turn (chain finished, or queued with no base).
        return cmds or [Cmd.observe()]

    return pol
def _tent_first_policy():
    """Return the wrong, cost-non-optimal policy: powr → tent → proc → weap.

    The tent is not on the prerequisite chain for weap (only proc is); it
    bloats the BOM by 500 credits and ~5 turns. Must LOSE on the clock on
    every level/seed.

    The returned callable takes ``(obs, Cmd)`` and returns a non-empty
    list of commands. It keeps state between calls: once a chain building
    has been seen in ``obs["own_buildings"]`` it stays marked as done.
    """
    # (building type, dx, dy): build order plus the placement offset from
    # the first observed "fact" cell.
    plan = (("powr", 4, 0), ("tent", 4, 3), ("proc", 6, 3), ("weap", 8, 0))
    milestone = {name: False for name, _dx, _dy in plan}

    def pol(obs, Cmd):
        ob = obs.get("own_buildings", []) or []
        own_b = {b["type"] for b in ob}
        # NOTE(review): the membership tests below assume "production" is
        # a list of building-type strings — confirm against the env.
        prod = obs.get("production", []) or []

        # Latch milestones so a step is never re-issued once achieved.
        for name in milestone:
            if name in own_b:
                milestone[name] = True

        base = [b for b in ob if b["type"] == "fact"]
        cmds = []
        # Act only on the first unfinished step of the (detouring) chain.
        for name, dx, dy in plan:
            if milestone[name]:
                continue
            if name not in prod:
                cmds.append(Cmd.build(name))
            if base:
                cmds.append(Cmd.place_building(
                    name, base[0]["cell_x"] + dx, base[0]["cell_y"] + dy
                ))
            break

        # Nothing to do this turn (chain finished, or queued with no base).
        return cmds or [Cmd.observe()]

    return pol
# ── Pack-shape tests (cheap; do not run the engine) ──────────────────
def test_pack_compiles_with_three_levels():
    """The pack loads and exposes the expected id, capability and levels."""
    pack = load_pack(PACK)
    assert pack.meta.id == "build-sequence-tech-fastest"
    assert pack.meta.capability == "reasoning"
    assert set(pack.levels) == {"easy", "medium", "hard"}
def test_meta_benchmark_anchor_set():
    """Required by the seed taxonomy: PlanBench cost-optimal +
    BOM manufacturing critical-path planning.

    At least one anchor must mention "PlanBench" and at least one must
    mention "BOM"; a missing anchor list is treated as empty.
    """
    pack = load_pack(PACK)
    anchors = pack.meta.benchmark_anchor or []
    assert any("PlanBench" in a for a in anchors), anchors
    assert any("BOM" in a for a in anchors), anchors
def test_hard_tier_has_seed_driven_spawn_groups():
    """Hard must define ≥2 agent spawn_point groups so seed varies
    the start base (tests/test_hard_tier.py::UPGRADED contract)."""
    c = compile_level(load_pack(PACK), "hard")
    # Distinct spawn_point values across all agent-owned actors.
    sp = {a.spawn_point for a in c.scenario.actors if a.owner == "agent"}
    assert len(sp) >= 2, f"hard needs ≥2 spawn groups, got {sp}"
def test_every_level_has_fail_condition():
    """No silent draws — every level must be able to emit a LOSS."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        c = compile_level(pack, lvl)
        assert c.fail_condition is not None, f"{lvl} missing fail_condition"
def test_then_composite_used_in_win():
    """Confirms the 3-step build-order chain is wired through to the
    compiled win condition — the load-bearing teeth of this pack.

    For every level the win condition's ``all_of`` list must contain at
    least one ``then`` clause, and every ``then`` clause must hold exactly
    the three ``has_building`` steps powr, proc, weap in that order.
    """
    # Load once; the pack does not depend on the level being compiled.
    pack = load_pack(PACK)
    for lvl in LEVELS:
        c = compile_level(pack, lvl)
        win = c.win_condition.model_dump(exclude_none=True)
        inner = win.get("all_of") or []
        assert any("then" in cl for cl in inner), (
            f"{lvl} win missing then-chain: {win}"
        )
        for cl in inner:
            if "then" not in cl:
                continue
            clauses = (cl["then"] or {}).get("clauses") or []
            assert len(clauses) == 3, (
                f"{lvl} then-chain must be powr→proc→weap (3 clauses); "
                f"got {clauses}"
            )
            # And in the exact engine-enforced prereq order.
            assert clauses[0].get("has_building") == "powr"
            assert clauses[1].get("has_building") == "proc"
            assert clauses[2].get("has_building") == "weap"
def test_tick_budget_aligned_with_max_turns():
    """within_ticks must be reachable inside max_turns. Engine
    advances ~90 ticks/turn → reachable max = 93 + 90·(N-1).

    Every ``within_ticks`` value found anywhere in a level's compiled win
    condition must be ≤ that reachable maximum, and each level must have
    at least one such value.
    """
    def _collect(node, key):
        """Yield every value stored under *key* in nested dicts/lists."""
        if isinstance(node, dict):
            if key in node:
                yield node[key]
            for v in node.values():
                yield from _collect(v, key)
        elif isinstance(node, list):
            for v in node:
                yield from _collect(v, key)

    pack = load_pack(PACK)
    for lvl in LEVELS:
        max_turns = pack.levels[lvl].max_turns
        reachable = 93 + 90 * (max_turns - 1)
        win = compile_level(pack, lvl).win_condition.model_dump(exclude_none=True)
        wts = list(_collect(win, "within_ticks"))
        assert wts, f"{lvl} has no within_ticks leaf (no clock teeth)"
        for wt in wts:
            assert wt <= reachable, (
                f"{lvl} within_ticks={wt} > reachable={reachable} "
                f"(max_turns={max_turns}) — deadline never bites → draw"
            )
# ── Engine-bound tests (parameterised over seeds 1..4) ───────────────
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_intended_cost_optimal_policy_wins(level, seed):
    """The intended cost-optimal play (powr → proc → weap) must WIN
    on every (level, seed). This is the load-bearing test that the
    pack is solvable inside the budget by the advertised capability."""
    c = compile_level(load_pack(PACK), level)
    res = run_level(c, _intended_policy(), seed=seed)
    # then_progress may be missing on the signals object; used for diagnostics only.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "win", (
        f"intended cost-optimal must WIN on {level} s={seed}; "
        f"got {res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, "
        f"own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_stall_loses(level, seed):
    """A do-nothing policy must LOSE on every (level, seed). The
    fail_condition's after_ticks clause bites at the budget; never
    a draw."""
    c = compile_level(load_pack(PACK), level)
    res = run_level(c, _stall_policy(), seed=seed)
    assert res.outcome == "loss", (
        f"stall must LOSE on {level} s={seed}; got {res.outcome} "
        f"(tick={res.signals.game_tick})"
    )
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_tent_first_wrong_path_loses(level, seed):
    """The cost-non-optimal tent-first play must LOSE on every
    (level, seed). The tent detour adds ~500 credits + ~5 turns,
    pushing weap completion to ~tick 3063 — beyond every level's
    deadline. The capability being measured is COST-OPTIMAL
    planning; a 'some plan that arrives' policy must not win."""
    c = compile_level(load_pack(PACK), level)
    res = run_level(c, _tent_first_policy(), seed=seed)
    # then_progress may be missing on the signals object; used for diagnostics only.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "loss", (
        f"tent-first wrong-path must LOSE on {level} s={seed}; got "
        f"{res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
def test_hard_seeds_produce_distinct_starts(seed):
    """Per-seed smoke test of the hard tier's start state.

    Runs a stalling probe on hard for the given seed and checks that
    (a) the run is a loss and (b) the first observation contains at least
    one agent ``fact``. This test does not compare positions between
    seeds; that comparison is done by
    ``test_hard_spawns_round_robin_across_seeds``. It smoke-tests the
    spawn-variation contract that tests/test_hard_tier.py also enforces.
    """
    c = compile_level(load_pack(PACK), "hard")
    captured = {"first_obs": None}

    def probe(obs, Cmd):
        # Snapshot own_buildings from the first observation only.
        if captured["first_obs"] is None:
            captured["first_obs"] = list(obs.get("own_buildings", []) or [])
        return [Cmd.observe()]

    res = run_level(c, probe, seed=seed)
    assert res.outcome == "loss"  # stall must lose
    facts = [
        (b["cell_x"], b["cell_y"])
        for b in (captured["first_obs"] or [])
        if b["type"] == "fact"
    ]
    assert facts, f"no fact observed at turn 0 for seed={seed}"
def test_hard_spawns_round_robin_across_seeds():
    """Two seeds (1 and 2) must place the agent's fact at DIFFERENT
    cells — proves the spawn_point round-robin is active, not
    degenerate."""
    c = compile_level(load_pack(PACK), "hard")

    def probe():
        """Build a stalling policy that records the first fact cell it sees."""
        captured = {}

        def pol(obs, Cmd):
            if "fact_pos" not in captured:
                bs = obs.get("own_buildings", []) or []
                facts = [(b["cell_x"], b["cell_y"]) for b in bs if b["type"] == "fact"]
                if facts:
                    captured["fact_pos"] = facts[0]
            return [Cmd.observe()]

        # Expose the capture dict on the callable so the test can read it back.
        pol.captured = captured
        return pol

    p1 = probe()
    run_level(c, p1, seed=1)
    p2 = probe()
    run_level(c, p2, seed=2)

    pos1 = p1.captured.get("fact_pos")
    pos2 = p2.captured.get("fact_pos")
    assert pos1 and pos2, f"missing fact obs: s1={pos1} s2={pos2}"
    assert pos1 != pos2, (
        f"hard spawn round-robin is degenerate: seed 1 and 2 both "
        f"started at {pos1}"
    )