OpenRA-Bench / tests /test_build_tech_skip_decision.py
yxc20098's picture
fix(scenario): build-tech-skip-decision — recalibrate after engine movement fixes
5fd1df2
Raw
History Blame Contribute Delete
14.6 kB
"""build-tech-skip-decision โ€” REASONING capability validation.
Skip an unnecessary tech tier (PlanBench unnecessary-step pruning /
lean process / YAGNI). The objective only needs BASIC infantry: the
agent starts with a pre-placed Construction Yard (fact) + Allied
barracks (tent), so rifle infantry (e1, $100) are trainable from turn
1 with no prior tech step. A light enemy garrison of rifle infantry
(stance 3, AttackAnything) advances on the base. Training e1 and
rallying them at the base front clears the kill bar comfortably.
The trap is to climb the full tech chain to medium tanks
(powr -> proc -> weap + fix -> 2tnk) — a whole tech tier the
objective never asked for. It burns the clock: by the deadline the
tech play has only powr+proc standing and ZERO army.
Bar (CLAUDE.md "no defect, no cheat, no draw"):
* stall (observe-only) LOSES every tier / every hard seed —
0 kills, the `after_ticks` deadline bites.
* tech-full-chain (build powr -> proc -> weap -> fix, then 2tnk)
LOSES every tier / every hard seed — the unneeded tier burns the
clock; no tank fields before the deadline, kill bar unmet.
* intended skip-to-e1 (train e1 from the pre-placed tent from turn
1, rally them at the base front, let them auto-fire on the
closing garrison) WINS every tier / every hard seed — the kill
bar is met AND the fact survives, well inside the deadline.
* hard tier defines >=2 agent spawn_point groups (NORTH base y=14
/ SOUTH base y=26) round-robined by seed; the garrison is
duplicated at both latitudes (enemy actors don't honour
spawn_point — CLAUDE.md), so a memorised base-latitude opening
cannot generalise.
"""
from __future__ import annotations
from pathlib import Path
import pytest
pytest.importorskip("openra_train", reason="Rust env wheel not installed")
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")
from openra_bench.eval_core import run_level
from openra_bench.scenarios import load_pack
from openra_bench.scenarios.loader import PACKS_DIR, compile_level
from openra_bench.scenarios.win_conditions import WinContext, evaluate
# Scenario pack under test, resolved inside the packs directory.
PACK = PACKS_DIR / "build-tech-skip-decision.yaml"
# Difficulty tiers the structural and engine-driven tests iterate over.
LEVELS = ("easy", "medium", "hard")
# Seeds the engine-driven tests are parametrized with.
SEEDS = (1, 2, 3, 4)
# ── scripted policies ───────────────────────────────────────────────
def _stall(rs, C):
"""Observe-only โ€” no production. 0 kills, the after_ticks
deadline bites -> LOSS."""
return [C.observe()]
def _tech_full_chain(rs, C):
"""The wrong call: climb the full tech chain (powr -> proc ->
weap -> fix -> 2tnk) instead of training basic infantry. The
unneeded tier burns the clock โ€” no tank fields before the
deadline -> the kill bar is unmet -> LOSS."""
ob = rs.get("own_buildings") or []
bt = {b.get("type") for b in ob}
fy = 20
for b in ob:
if b.get("type") == "fact":
fy = int(b["cell_y"])
prod = [p.get("item") for p in (rs.get("production") or []) if isinstance(p, dict)]
cmds = []
if "powr" not in bt:
if "powr" not in prod:
cmds.append(C.build("powr"))
cmds.append(C.place_building("powr", 17, fy))
elif "proc" not in bt:
if "proc" not in prod:
cmds.append(C.build("proc"))
cmds.append(C.place_building("proc", 20, fy))
elif "weap" not in bt:
if "weap" not in prod:
cmds.append(C.build("weap"))
cmds.append(C.place_building("weap", 23, fy))
elif "fix" not in bt:
if "fix" not in prod:
cmds.append(C.build("fix"))
cmds.append(C.place_building("fix", 23, fy + 4))
else:
cmds.append(C.build("2tnk"))
return cmds or [C.observe()]
def _skip_to_e1(rs, C):
"""The intended capability โ€” skip the tech tier. Train e1 from
the pre-placed tent from turn 1 and rally them at the base front;
they auto-fire on the closing garrison and clear the kill bar."""
units = rs.get("units_summary") or []
ob = rs.get("own_buildings") or []
fy = 20
for b in ob:
if b.get("type") == "tent":
fy = int(b["cell_y"])
ids = [u.get("id") for u in units if u.get("type") == "e1"]
cmds = [C.build("e1")]
if ids:
cmds.append(C.move_units(ids, 18, fy))
return cmds
# ── structural tests ────────────────────────────────────────────────
def test_pack_loads_and_meta_active():
    """The pack loads and its metadata names the reasoning capability,
    carries the real-world / robotics descriptions, and cites every
    expected benchmark anchor keyword."""
    meta = load_pack(PACK).meta
    assert meta.id == "build-tech-skip-decision"
    assert meta.capability == "reasoning"
    assert meta.real_world_meaning
    assert meta.robotics_analogue
    anchors = " ".join(meta.benchmark_anchor).lower()
    for keyword in ("planbench", "pruning", "yagni", "lean process"):
        assert keyword in anchors, anchors
def test_tools_include_build_and_combat_surface():
    """The pack's base config must list every tool of the
    build-and-engage surface: observe, build, place_building,
    move_units, attack_move and stop."""
    base = load_pack(PACK).base
    # A non-dict base exposes no tools at all, so every check fails.
    tools = set(base.get("tools", [])) if isinstance(base, dict) else set()
    needed = (
        "observe",
        "build",
        "place_building",
        "move_units",
        "attack_move",
        "stop",
    )
    for required in needed:
        assert required in tools, f"missing tool: {required!r}"
def test_preplaced_base_is_fact_plus_tent_no_higher_tech():
    """On every level the agent starts with fact + tent and nothing
    from the higher tech tier.

    The tent makes e1 trainable on turn 1; powr / proc / weap / fix
    (and dome / apwr) must not be handed out, since building that tier
    is exactly what the agent should avoid. The only other agent actor
    type tolerated is e1 (the hard tier's inert spawn witness)."""
    pack = load_pack(PACK)
    higher_tech = {"powr", "proc", "weap", "fix", "dome", "apwr"}
    permitted = {"fact", "tent", "e1"}
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        agent_types = set()
        for actor in compiled.scenario.actors:
            if actor.owner == "agent":
                agent_types.add(actor.type)
        assert "fact" in agent_types, f"{lvl}: agent base missing fact"
        assert "tent" in agent_types, f"{lvl}: agent base missing tent"
        assert agent_types.isdisjoint(higher_tech), (
            f"{lvl}: agent must NOT start with a higher-tech building; "
            f"got {sorted(agent_types & higher_tech)}"
        )
        # Nothing beyond fact / tent / (hard) the inert e1 witness.
        assert agent_types.issubset(permitted), (
            f"{lvl}: unexpected agent actor types {sorted(agent_types)}"
        )
def test_garrison_is_light_infantry_stance3():
    """Every level fields an enemy garrison of plain rifle infantry
    (e1) at stance 3 (AttackAnything), and the enemy side holds no
    actor types other than e1 and fact."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        hostile = [
            actor
            for actor in compile_level(pack, lvl).scenario.actors
            if actor.owner == "enemy"
        ]
        garrison = [actor for actor in hostile if actor.type == "e1"]
        assert garrison, f"{lvl}: must have an e1 garrison"
        for a in garrison:
            assert a.stance == 3, (
                f"{lvl}: garrison e1 must be stance 3 (AttackAnything); "
                f"got {a.stance}"
            )
        # Only e1 plus the far fact marker may exist on the enemy side.
        enemy_types = {actor.type for actor in hostile}
        assert enemy_types == {"e1", "fact"}, (
            f"{lvl}: enemy actors must be {{e1, fact}}; got {sorted(enemy_types)}"
        )
def test_far_enemy_fact_marker_present():
    """Each level keeps an enemy fact at x >= 100.

    The marker keeps the episode alive after the last garrison unit
    dies, so a non-winner runs into the deadline as a real LOSS
    instead of a draw."""

    def is_far_marker(actor):
        return (
            actor.owner == "enemy"
            and actor.type == "fact"
            and actor.position[0] >= 100
        )

    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        far = list(filter(is_far_marker, compiled.scenario.actors))
        assert far, f"{lvl}: missing far enemy fact marker (anti-DRAW)"
def test_every_level_has_reachable_timeout_fail():
    """The timeout loss must be reachable on every level.

    For each level the win condition's ``within_ticks`` and the fail
    condition's ``after_ticks`` must both fall inside the tick ceiling
    implied by ``max_turns`` (so stall / tech plays hit a real LOSS,
    not a draw), and ``after_ticks`` must be exactly
    ``within_ticks + 1`` so a non-finisher loses one tick past the win
    window.

    A level that lacks either clause fails with an explicit message
    rather than an opaque ``StopIteration``.
    """
    pack = load_pack(PACK)
    for lvl in LEVELS:
        level_cfg = pack.levels[lvl]
        # Highest tick reachable within max_turns. The 93 / 90 constants
        # presumably mirror the engine's ticks-per-turn pacing
        # -- TODO confirm against the env.
        ceiling = 93 + 90 * (level_cfg.max_turns - 1)
        win_clauses = level_cfg.win_condition.model_dump()["all_of"]
        fail_clauses = level_cfg.fail_condition.model_dump()["any_of"]
        # First clause carrying the key, or None when no clause has it.
        wt = next(
            (int(c["within_ticks"]) for c in win_clauses if "within_ticks" in c),
            None,
        )
        ft = next(
            (int(c["after_ticks"]) for c in fail_clauses if "after_ticks" in c),
            None,
        )
        assert wt is not None, f"{lvl}: win_condition has no within_ticks clause"
        assert ft is not None, f"{lvl}: fail_condition has no after_ticks clause"
        assert wt < ceiling, f"{lvl}: within_ticks {wt} >= ceiling {ceiling}"
        assert ft <= ceiling, f"{lvl}: after_ticks {ft} > ceiling {ceiling}"
        assert wt + 1 == ft, (
            f"{lvl}: within_ticks {wt} / after_ticks {ft} mismatch "
            "(boundary non-finisher must LOSE, not draw)"
        )
def test_every_level_has_a_fail_condition():
    """Every compiled level must carry a fail condition."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        has_fail = compiled.fail_condition is not None
        assert has_fail, f"{lvl} needs a fail_condition"
def test_hard_has_two_seed_driven_spawn_groups():
    """The hard tier's agent actors must span exactly spawn groups 0
    and 1; an actor without a spawn_point counts as group 0."""
    hard = compile_level(load_pack(PACK), "hard")
    sp = set()
    for actor in hard.scenario.actors:
        if actor.owner != "agent":
            continue
        sp.add(0 if actor.spawn_point is None else actor.spawn_point)
    assert sp == {0, 1}, f"hard must define exactly {{0, 1}}; got {sorted(sp)}"
def test_in_bounds_actors_on_every_level():
    """Every actor on every level sits inside the rush-hour-arena
    playable bounds (roughly x in 2..126, y in 2..38)."""
    pack = load_pack(PACK)
    for lvl in LEVELS:
        actors = compile_level(pack, lvl).scenario.actors
        for a in actors:
            x, y = a.position
            assert 2 <= x <= 126 and 2 <= y <= 38, (
                f"{lvl}: actor {a.type} at ({x},{y}) out of bounds"
            )
# ── predicate-level (no engine) ─────────────────────────────────────
def _ctx(*, tick=0, kills=0, own_buildings=()):
    """Build a minimal WinContext for predicate tests (no engine).

    Args:
        tick: value exposed as ``signals.game_tick``.
        kills: value exposed as ``signals.units_killed``.
        own_buildings: iterable of 3-tuples whose first element is the
            building type; also used to derive the lower-cased
            ``own_building_types`` set.

    Returns:
        A WinContext whose signals carry the values above (all other
        counters zeroed / empty) and whose render state has an empty
        ``units_summary``.
    """
    import types

    fields = {
        "game_tick": tick,
        "units_killed": kills,
        "units_lost": 0,
        "cash": 0,
        "resources": 0,
        "own_buildings": list(own_buildings),
        "own_building_types": {
            str(kind).lower() for kind, _, _ in own_buildings
        },
        "enemies_seen_ids": set(),
        "enemy_buildings_seen_ids": set(),
    }
    signals = types.SimpleNamespace(**fields)
    return WinContext(signals=signals, render_state={"units_summary": []})
def test_predicates_enforce_capability():
    """Easy-tier predicates: a win needs the kill bar AND a living
    fact AND being in time; a fail fires on timeout OR a destroyed
    fact."""
    compiled = compile_level(load_pack(PACK), "easy")
    base_b = [("fact", 10, 20), ("tent", 13, 20)]
    tent_only = base_b[1:]  # same base with the fact destroyed

    # (tick, kills, buildings, expected win verdict)
    win_cases = (
        (900, 4, base_b, True),      # intended: 4 kills, fact alive, in time
        (900, 3, base_b, False),     # one kill under the bar
        (1601, 4, base_b, False),    # past within_ticks (easy: 1600)
        (900, 4, tent_only, False),  # fact destroyed
    )
    for tick, kills, buildings, expected in win_cases:
        verdict = evaluate(
            compiled.win_condition,
            _ctx(tick=tick, kills=kills, own_buildings=buildings),
        )
        assert bool(verdict) is expected

    # (tick, kills, buildings, expected fail verdict)
    fail_cases = (
        (1700, 0, base_b, True),    # past after_ticks (easy: 1601)
        (900, 4, tent_only, True),  # fact destroyed
        (900, 0, base_b, False),    # within deadline, fact alive
    )
    for tick, kills, buildings, expected in fail_cases:
        verdict = evaluate(
            compiled.fail_condition,
            _ctx(tick=tick, kills=kills, own_buildings=buildings),
        )
        assert bool(verdict) is expected
# ── engine-driven: every lazy / wrong policy LOSES, intended WINS ───
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_stall_loses_every_tier_and_seed(level, seed):
    """The observe-only policy scores no kills and runs into the
    after_ticks deadline: a genuine LOSS on every tier and seed,
    never a draw."""
    r = run_level(compile_level(load_pack(PACK), level), _stall, seed=seed)
    assert r.outcome == "loss", (
        f"{level}/seed{seed}: stall must LOSE; got {r.outcome} "
        f"tick={r.signals.game_tick} kills={r.signals.units_killed}"
    )
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_tech_full_chain_loses_every_tier_and_seed(level, seed):
    """Teching up powr -> proc -> weap -> fix -> 2tnk spends the clock
    on a tier the objective never needed: no tank is fielded before
    the deadline, the kill bar stays unmet, and the run is a LOSS."""
    r = run_level(
        compile_level(load_pack(PACK), level), _tech_full_chain, seed=seed
    )
    assert r.outcome == "loss", (
        f"{level}/seed{seed}: tech-full-chain must LOSE (clock); "
        f"got {r.outcome} tick={r.signals.game_tick} "
        f"kills={r.signals.units_killed}"
    )
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("seed", SEEDS)
def test_skip_to_e1_wins_every_tier_and_seed(level, seed):
    """The intended play (skip the tech tier, train e1 from the
    pre-placed tent, rally at the base front) must WIN on every tier
    and every seed."""
    r = run_level(
        compile_level(load_pack(PACK), level), _skip_to_e1, seed=seed
    )
    assert r.outcome == "win", (
        f"{level}/seed{seed}: skip-to-e1 must WIN; got {r.outcome} "
        f"tick={r.signals.game_tick} kills={r.signals.units_killed}"
    )
# ── determinism ─────────────────────────────────────────────────────
def test_skip_to_e1_run_is_deterministic_per_seed():
    """Running the same policy twice on the medium tier with the same
    seed must give an identical outcome, turn count and kill count."""

    def fingerprint(run):
        return (run.outcome, run.turns, run.signals.units_killed)

    compiled = compile_level(load_pack(PACK), "medium")
    first = run_level(compiled, _skip_to_e1, seed=3)
    second = run_level(compiled, _skip_to_e1, seed=3)
    assert fingerprint(first) == fingerprint(second)