Spaces:
Running
Running
"""build-tech-skip-decision — REASONING capability validation.

Skip an unnecessary tech tier (PlanBench unnecessary-step pruning /
lean process / YAGNI). The objective only needs BASIC infantry: the
agent starts with a pre-placed Construction Yard (fact) + Allied
barracks (tent), so rifle infantry (e1, $100) are trainable from turn
1 with no prior tech step. A light enemy garrison of rifle infantry
(stance 3, AttackAnything) advances on the base. Training e1 and
rallying them at the base front clears the kill bar comfortably.

The trap is to climb the full tech chain to medium tanks
(powr -> proc -> weap + fix -> 2tnk) — a whole tech tier the
objective never asked for. It burns the clock: by the deadline the
tech play has only powr+proc standing and ZERO army.

Bar (CLAUDE.md "no defect, no cheat, no draw"):

* stall (observe-only) LOSES every tier / every hard seed —
  0 kills, the `after_ticks` deadline bites.
* tech-full-chain (build powr -> proc -> weap -> fix, then 2tnk)
  LOSES every tier / every hard seed — the unneeded tier burns the
  clock; no tank fields before the deadline, kill bar unmet.
* intended skip-to-e1 (train e1 from the pre-placed tent from turn
  1, rally them at the base front, let them auto-fire on the
  closing garrison) WINS every tier / every hard seed — the kill
  bar is met AND the fact survives, well inside the deadline.
* hard tier defines >=2 agent spawn_point groups (NORTH base y=14
  / SOUTH base y=26) round-robined by seed; the garrison is
  duplicated at both latitudes (enemy actors don't honour
  spawn_point — CLAUDE.md), so a memorised base-latitude opening
  cannot generalise.
"""
| from __future__ import annotations | |
| from pathlib import Path | |
| import pytest | |
| pytest.importorskip("openra_train", reason="Rust env wheel not installed") | |
| pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed") | |
| from openra_bench.eval_core import run_level | |
| from openra_bench.scenarios import load_pack | |
| from openra_bench.scenarios.loader import PACKS_DIR, compile_level | |
| from openra_bench.scenarios.win_conditions import WinContext, evaluate | |
# Scenario pack exercised by every test in this module.
PACK = PACKS_DIR / "build-tech-skip-decision.yaml"
# Difficulty tiers the structural tests loop over.
LEVELS = ("easy", "medium", "hard")
# Seed values for engine-driven runs (the module docstring states that the
# hard tier round-robins its agent spawn groups by seed).
SEEDS = (1, 2, 3, 4)
# ── scripted policies ───────────────────────────────────────────────
| def _stall(rs, C): | |
| """Observe-only โ no production. 0 kills, the after_ticks | |
| deadline bites -> LOSS.""" | |
| return [C.observe()] | |
| def _tech_full_chain(rs, C): | |
| """The wrong call: climb the full tech chain (powr -> proc -> | |
| weap -> fix -> 2tnk) instead of training basic infantry. The | |
| unneeded tier burns the clock โ no tank fields before the | |
| deadline -> the kill bar is unmet -> LOSS.""" | |
| ob = rs.get("own_buildings") or [] | |
| bt = {b.get("type") for b in ob} | |
| fy = 20 | |
| for b in ob: | |
| if b.get("type") == "fact": | |
| fy = int(b["cell_y"]) | |
| prod = [p.get("item") for p in (rs.get("production") or []) if isinstance(p, dict)] | |
| cmds = [] | |
| if "powr" not in bt: | |
| if "powr" not in prod: | |
| cmds.append(C.build("powr")) | |
| cmds.append(C.place_building("powr", 17, fy)) | |
| elif "proc" not in bt: | |
| if "proc" not in prod: | |
| cmds.append(C.build("proc")) | |
| cmds.append(C.place_building("proc", 20, fy)) | |
| elif "weap" not in bt: | |
| if "weap" not in prod: | |
| cmds.append(C.build("weap")) | |
| cmds.append(C.place_building("weap", 23, fy)) | |
| elif "fix" not in bt: | |
| if "fix" not in prod: | |
| cmds.append(C.build("fix")) | |
| cmds.append(C.place_building("fix", 23, fy + 4)) | |
| else: | |
| cmds.append(C.build("2tnk")) | |
| return cmds or [C.observe()] | |
| def _skip_to_e1(rs, C): | |
| """The intended capability โ skip the tech tier. Train e1 from | |
| the pre-placed tent from turn 1 and rally them at the base front; | |
| they auto-fire on the closing garrison and clear the kill bar.""" | |
| units = rs.get("units_summary") or [] | |
| ob = rs.get("own_buildings") or [] | |
| fy = 20 | |
| for b in ob: | |
| if b.get("type") == "tent": | |
| fy = int(b["cell_y"]) | |
| ids = [u.get("id") for u in units if u.get("type") == "e1"] | |
| cmds = [C.build("e1")] | |
| if ids: | |
| cmds.append(C.move_units(ids, 18, fy)) | |
| return cmds | |
# ── structural tests ────────────────────────────────────────────────
def test_pack_loads_and_meta_active():
    """The pack loads and its metadata carries the expected identity,
    capability tag, non-empty descriptive fields and benchmark anchors."""
    meta = load_pack(PACK).meta
    assert meta.id == "build-tech-skip-decision"
    assert meta.capability == "reasoning"
    assert meta.real_world_meaning
    assert meta.robotics_analogue
    # Anchors are matched case-insensitively on their space-joined text.
    anchors = " ".join(meta.benchmark_anchor).lower()
    for keyword in ("planbench", "pruning", "yagni", "lean process"):
        assert keyword in anchors, anchors
def test_tools_include_build_and_combat_surface():
    """The pack's base section must list the full build-and-engage tool
    surface: observe, build, place_building, move_units, attack_move
    and stop."""
    base = load_pack(PACK).base
    # A non-dict base is treated as exposing no tools at all.
    exposed = set(base.get("tools", [])) if isinstance(base, dict) else set()
    needed = (
        "observe",
        "build",
        "place_building",
        "move_units",
        "attack_move",
        "stop",
    )
    for required in needed:
        assert required in exposed, f"missing tool: {required!r}"
def test_preplaced_base_is_fact_plus_tent_no_higher_tech():
    """On every level the agent owns a fact and a tent and nothing from
    the higher tech tier.

    The tent makes the cheap e1 path actionable immediately; handing
    the agent powr / proc / weap / fix / dome / apwr would defeat the
    point, since building that tier is what it must avoid. The only
    other agent actor type tolerated is e1 (the hard tier's inert
    spawn-witness units).
    """
    pack = load_pack(PACK)
    forbidden = {"powr", "proc", "weap", "fix", "dome", "apwr"}
    permitted = {"fact", "tent", "e1"}
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        owned = {
            actor.type
            for actor in compiled.scenario.actors
            if actor.owner == "agent"
        }
        assert "fact" in owned, f"{lvl}: agent base missing fact"
        assert "tent" in owned, f"{lvl}: agent base missing tent"
        assert not (owned & forbidden), (
            f"{lvl}: agent must NOT start with a higher-tech building; "
            f"got {sorted(owned & forbidden)}"
        )
        # Nothing beyond fact / tent / (hard) the inert e1 witness.
        assert owned.issubset(permitted), (
            f"{lvl}: unexpected agent actor types {sorted(owned)}"
        )
def test_garrison_is_light_infantry_stance3():
    """On every level the enemy fields rifle infantry (e1) at stance 3
    (AttackAnything) and, apart from its fact, nothing else: no tanks
    and no higher-tech units."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        actors = compile_level(pack, lvl).scenario.actors
        hostile = [actor for actor in actors if actor.owner == "enemy"]

        riflemen = [actor for actor in hostile if actor.type == "e1"]
        assert riflemen, f"{lvl}: must have an e1 garrison"
        for soldier in riflemen:
            assert soldier.stance == 3, (
                f"{lvl}: garrison e1 must be stance 3 (AttackAnything); "
                f"got {soldier.stance}"
            )

        # The enemy side is exactly the garrison plus the fact marker.
        enemy_types = {actor.type for actor in hostile}
        assert enemy_types == {"e1", "fact"}, (
            f"{lvl}: enemy actors must be {{e1, fact}}; got {sorted(enemy_types)}"
        )
def test_far_enemy_fact_marker_present():
    """Every level places an enemy fact at x >= 100.

    Per the scenario notes, this marker keeps the episode alive after
    the last garrison unit dies, so a non-winner runs into the deadline
    and records a real LOSS instead of an early auto-done.
    """
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        has_marker = any(
            actor.owner == "enemy"
            and actor.type == "fact"
            and actor.position[0] >= 100
            for actor in compiled.scenario.actors
        )
        assert has_marker, f"{lvl}: missing far enemy fact marker (anti-DRAW)"
def test_every_level_has_reachable_timeout_fail():
    """The `after_ticks` fail clause must be reachable within max_turns.

    For every level the win window (`within_ticks`) has to sit strictly
    below the tick ceiling, the fail deadline (`after_ticks`) at or
    below it, and the deadline must be exactly one tick past the
    window, so a non-finisher loses rather than draws.
    """
    pack = load_pack(PACK)
    for lvl in LEVELS:
        spec = pack.levels[lvl]
        # NOTE(review): presumably the first turn ends at tick 93 and each
        # later turn adds 90 ticks -- confirm against the engine stepping.
        ceiling = 93 + 90 * (spec.max_turns - 1)

        win_clauses = spec.win_condition.model_dump()["all_of"]
        within = next(
            int(clause["within_ticks"])
            for clause in win_clauses
            if "within_ticks" in clause
        )
        fail_clauses = spec.fail_condition.model_dump()["any_of"]
        after = next(
            int(clause["after_ticks"])
            for clause in fail_clauses
            if "after_ticks" in clause
        )

        assert within < ceiling, (
            f"{lvl}: within_ticks {within} >= ceiling {ceiling}"
        )
        assert after <= ceiling, (
            f"{lvl}: after_ticks {after} > ceiling {ceiling}"
        )
        assert within + 1 == after, (
            f"{lvl}: within_ticks {within} / after_ticks {after} mismatch "
            "(boundary non-finisher must LOSE, not draw)"
        )
def test_every_level_has_a_fail_condition():
    """Each compiled level must carry a fail_condition."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        has_fail = compiled.fail_condition is not None
        assert has_fail, f"{lvl} needs a fail_condition"
def test_hard_has_two_seed_driven_spawn_groups():
    """The hard tier's agent actors must span exactly spawn groups 0
    and 1; an actor with no spawn_point counts as group 0."""
    compiled = compile_level(load_pack(PACK), "hard")
    groups = set()
    for actor in compiled.scenario.actors:
        if actor.owner != "agent":
            continue
        groups.add(0 if actor.spawn_point is None else actor.spawn_point)
    assert groups == {0, 1}, (
        f"hard must define exactly {{0, 1}}; got {sorted(groups)}"
    )
def test_in_bounds_actors_on_every_level():
    """Every actor on every level sits inside x:2..126, y:2..38 (the
    approximate playable bounds of the rush-hour-arena map)."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        for actor in compiled.scenario.actors:
            x, y = actor.position
            inside = 2 <= x <= 126 and 2 <= y <= 38
            assert inside, (
                f"{lvl}: actor {actor.type} at ({x},{y}) out of bounds"
            )
# ── predicate-level (no engine) ─────────────────────────────────────
def _ctx(*, tick=0, kills=0, own_buildings=()):
    """Build a WinContext around a hand-made signals object.

    ``tick`` and ``kills`` populate ``game_tick`` / ``units_killed``.
    ``own_buildings`` is an iterable of 3-tuples whose first element is
    the building type; it is stored as a list, and the lower-cased
    types are also exposed as the ``own_building_types`` set. All other
    signals are zero or empty, and the render state contains only an
    empty ``units_summary``.
    """
    from types import SimpleNamespace

    fields = {
        "game_tick": tick,
        "units_killed": kills,
        "units_lost": 0,
        "cash": 0,
        "resources": 0,
        "own_buildings": list(own_buildings),
        "own_building_types": {
            str(kind).lower() for kind, _x, _y in own_buildings
        },
        "enemies_seen_ids": set(),
        "enemy_buildings_seen_ids": set(),
    }
    signals = SimpleNamespace(**fields)
    return WinContext(signals=signals, render_state={"units_summary": []})
def test_predicates_enforce_capability():
    """Exercise the easy level's win / fail predicates without the engine.

    Winning needs the kill bar, a living fact and a tick inside the
    window; failing is triggered by the deadline passing or by the fact
    being destroyed.
    """
    level = compile_level(load_pack(PACK), "easy")
    intact = [("fact", 10, 20), ("tent", 13, 20)]
    fact_lost = intact[1:]  # only the tent remains

    def wins(**signals):
        return evaluate(level.win_condition, _ctx(**signals))

    def fails(**signals):
        return evaluate(level.fail_condition, _ctx(**signals))

    # Intended: 4 kills, fact alive, in time -> WIN
    assert wins(tick=900, kills=4, own_buildings=intact)
    # 3 kills (under bar) -> not win
    assert not wins(tick=900, kills=3, own_buildings=intact)
    # 4 kills but past within_ticks (easy within_ticks 1600) -> not win
    assert not wins(tick=1601, kills=4, own_buildings=intact)
    # 4 kills but fact destroyed -> not win
    assert not wins(tick=900, kills=4, own_buildings=fact_lost)

    # Past after_ticks deadline (easy after_ticks 1601) -> fail
    assert fails(tick=1700, kills=0, own_buildings=intact)
    # Fact destroyed -> fail
    assert fails(tick=900, kills=4, own_buildings=fact_lost)
    # Within deadline, fact alive -> not fail
    assert not fails(tick=900, kills=0, own_buildings=intact)
# ── engine-driven: every lazy / wrong policy LOSES, intended WINS ───
# Without parametrization pytest would try to resolve `level` / `seed` as
# fixtures; none are defined in the visible part of this module, so the
# tier x seed matrix is spelled out explicitly here.
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_stall_loses_every_tier_and_seed(level, seed):
    """Observe-only -> 0 kills + the after_ticks deadline bites ->
    a real reachable LOSS, not a draw.

    Runs once for every (level, seed) pair of LEVELS x SEEDS.
    """
    c = compile_level(load_pack(PACK), level)
    r = run_level(c, _stall, seed=seed)
    assert r.outcome == "loss", (
        f"{level}/seed{seed}: stall must LOSE; got {r.outcome} "
        f"tick={r.signals.game_tick} kills={r.signals.units_killed}"
    )
# Without parametrization pytest would try to resolve `level` / `seed` as
# fixtures; none are defined in the visible part of this module, so the
# tier x seed matrix is spelled out explicitly here.
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_tech_full_chain_loses_every_tier_and_seed(level, seed):
    """Climbing the full tech chain (powr -> proc -> weap -> fix ->
    2tnk) burns the clock on a tier the objective never required -> no
    tank fields before the deadline -> kill bar unmet -> LOSS.

    Runs once for every (level, seed) pair of LEVELS x SEEDS.
    """
    c = compile_level(load_pack(PACK), level)
    r = run_level(c, _tech_full_chain, seed=seed)
    assert r.outcome == "loss", (
        f"{level}/seed{seed}: tech-full-chain must LOSE (clock); "
        f"got {r.outcome} tick={r.signals.game_tick} "
        f"kills={r.signals.units_killed}"
    )
# Without parametrization pytest would try to resolve `level` / `seed` as
# fixtures; none are defined in the visible part of this module, so the
# tier x seed matrix is spelled out explicitly here.
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_skip_to_e1_wins_every_tier_and_seed(level, seed):
    """The intended capability: skip the unneeded tech tier, train e1
    from the pre-placed tent and rally them at the base front. Must win
    every tier and every seed, well inside the deadline.

    Runs once for every (level, seed) pair of LEVELS x SEEDS.
    """
    c = compile_level(load_pack(PACK), level)
    r = run_level(c, _skip_to_e1, seed=seed)
    assert r.outcome == "win", (
        f"{level}/seed{seed}: skip-to-e1 must WIN; got {r.outcome} "
        f"tick={r.signals.game_tick} kills={r.signals.units_killed}"
    )
# ── determinism ─────────────────────────────────────────────────────
def test_skip_to_e1_run_is_deterministic_per_seed():
    """Two medium-tier runs of the skip-to-e1 policy with seed 3 must
    agree on outcome, turn count and kill count."""
    compiled = compile_level(load_pack(PACK), "medium")
    first = run_level(compiled, _skip_to_e1, seed=3)
    second = run_level(compiled, _skip_to_e1, seed=3)

    def fingerprint(result):
        return (result.outcome, result.turns, result.signals.units_killed)

    assert fingerprint(first) == fingerprint(second)