"""build-sequence-tech-cheapest pack — full no-cheat validation on Rust.
Wave-11 REASONING — cost-MINIMAL build-order planning. Sibling of
build-sequence-tech-fastest (the time-optimal axis); here the binding
constraint is MONEY. The agent must reach the war factory (`weap`) on
the ONLY affordable prerequisite chain:
    powr → proc → weap
There is NO ore on the map and NO harvester income — the starting cash
is the entire, non-replenishing budget, tuned to just cover the cost of
the minimal path (powr $300 + proc $1400 + weap $2000 = $3700) plus at
most a few tens of credits of slack. Any
detour (build a barracks/tent or a pillbox first) bloats the bill of
materials, exhausts the fixed budget, and weap can then never be
funded — the `then:` chain never completes and the episode times out.
The clock budget is GENEROUS: a policy loses by being WASTEFUL, not
by being slow.
Bar (CLAUDE.md): the intended cost-minimal policy WINS on every
(level, seed); the stall and wasteful-spend policies LOSE on every
(level, seed). A real LOSS, not a DRAW — `fail after_ticks:T+1` inside
max_turns is the bite.
Scenario shape:
  - rush-hour-arena, allies vs soviet (bot disabled).
  - easy:   budget $3750, T=3200, max_turns=40 — 50-credit slack.
  - medium: budget $3720, T=3200, max_turns=40 — 20-credit slack.
  - hard:   budget $3720, T=3200, max_turns=40 — same tight budget
            + ≥2 spawn_point groups (NORTH y=14 / SOUTH y=26 base,
            round-robined by seed).
Measured (seed 1, scripted policies):
  intended powr→proc→weap completes ≈ tick 2613 (well under T=3200)
  tent-first wasteful: cash hits $0 ≈ tick 2703, weap stuck in queue
    forever (no income → no recovery) → after_ticks LOSS at T+1.
"""
from __future__ import annotations
import pytest
pytest.importorskip("openra_train", reason="Rust env wheel not installed")
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")
from openra_bench.eval_core import run_level
from openra_bench.scenarios import load_pack
from openra_bench.scenarios.loader import PACKS_DIR, compile_level
# Path of the pack definition (a YAML file under PACKS_DIR) that every
# test in this module loads.
PACK = PACKS_DIR / "build-sequence-tech-cheapest.yaml"
# Difficulty tiers the tests iterate / parametrise over.
LEVELS = ("easy", "medium", "hard")
# Seeds the engine-bound tests are parametrised over.
SEEDS = (1, 2, 3, 4)
# ── Policies ──────────────────────────────────────────────────────
def _stall_policy():
    """Return a policy that idles forever.

    The returned callable ignores the observation and answers every turn
    with a single ``Cmd.observe()``. It is expected to LOSE on the clock
    on every level/seed.
    """
    def idle(obs, Cmd):
        # Never build anything; just let the turn pass.
        noop = Cmd.observe()
        return [noop]

    return idle
def _intended_policy():
    """Return the cost-minimal scripted policy: powr → proc → weap.

    Every structure is placed at a fixed offset from the agent's own
    ``fact`` building as reported in the observation, so the same script
    works across the hard-tier spawn variation. This is the policy the
    pack is meant to be solvable by — it must WIN on every (level, seed).
    """
    # (structure, dx, dy): build order plus placement offset from the fact.
    plan = (("powr", 4, 0), ("proc", 6, 3), ("weap", 8, 0))
    # Latched flags: once a structure has been observed it stays "done",
    # even if a later observation no longer lists it.
    done = {name: False for name, _dx, _dy in plan}

    def pol(obs, Cmd):
        buildings = obs.get("own_buildings", []) or []
        present = {b["type"] for b in buildings}
        queue = obs.get("production", []) or []
        for name in done:
            if name in present:
                done[name] = True

        yards = [b for b in buildings if b["type"] == "fact"]
        orders = []
        for name, dx, dy in plan:
            if done[name]:
                continue
            # Only the first unfinished step is acted on in a given turn.
            if name not in queue:
                orders.append(Cmd.build(name))
            if yards:
                anchor = yards[0]
                orders.append(
                    Cmd.place_building(
                        name, anchor["cell_x"] + dx, anchor["cell_y"] + dy
                    )
                )
            break
        # Nothing to order this turn: fall back to a plain observe.
        return orders or [Cmd.observe()]

    return pol
def _wasteful_policy(extra: str):
    """Return a cost-non-minimal scripted policy: powr → <extra> → proc → weap.

    ``extra`` ('tent' $500 or 'pbox' $600) is NOT on weap's prerequisite
    chain. The detour pushes the bill of materials past the fixed budget,
    so weap can never be funded — cash hits $0 mid-queue and, with no
    ore/income, never recovers. Must LOSE on every (level, seed): the
    capability measured is COST-MINIMAL planning, so a "some plan that
    arrives" policy must not win.
    """
    # (structure, dx, dy): build order plus placement offset from the fact.
    plan = (("powr", 4, 0), (extra, 4, 3), ("proc", 6, 3), ("weap", 8, 0))
    # Latched flags: once a structure has been observed it stays "done".
    done = {name: False for name, _dx, _dy in plan}

    def pol(obs, Cmd):
        buildings = obs.get("own_buildings", []) or []
        present = {b["type"] for b in buildings}
        queue = obs.get("production", []) or []
        for name in done:
            if name in present:
                done[name] = True

        yards = [b for b in buildings if b["type"] == "fact"]
        orders = []
        for name, dx, dy in plan:
            if done[name]:
                continue
            # Only the first unfinished step is acted on in a given turn.
            if name not in queue:
                orders.append(Cmd.build(name))
            if yards:
                anchor = yards[0]
                orders.append(
                    Cmd.place_building(
                        name, anchor["cell_x"] + dx, anchor["cell_y"] + dy
                    )
                )
            break
        # Nothing to order this turn: fall back to a plain observe.
        return orders or [Cmd.observe()]

    return pol
# ── Pack-shape tests (cheap; do not run the engine) ──────────────
def test_pack_compiles_with_three_levels():
    """The pack loads and exposes the expected id, capability and tiers."""
    loaded = load_pack(PACK)
    meta = loaded.meta
    assert meta.id == "build-sequence-tech-cheapest"
    assert meta.capability == "reasoning"
    expected_levels = {"easy", "medium", "hard"}
    assert set(loaded.levels) == expected_levels
def test_meta_benchmark_anchor_set():
    """The pack's benchmark anchors mention each required reference.

    Required by the seed taxonomy: PlanBench cost-optimal, BOM cost
    minimization and budget-constrained planning.
    """
    anchors = load_pack(PACK).meta.benchmark_anchor or []
    # Each keyword must appear as a substring of at least one anchor.
    for keyword in ("PlanBench", "BOM", "budget"):
        assert any(keyword in anchor for anchor in anchors), anchors
def test_budget_is_near_minimal_path_cost():
    """Each level's starting cash must sit just above the minimal-path cost.

    The whole pack hinges on ``starting_cash`` being tuned to the cost of
    the minimal chain (powr 300 + proc 1400 + weap 2000 = 3700) with
    near-zero slack: enough to fund the minimal chain, never enough to
    also afford a non-load-bearing structure.
    """
    # Per-structure costs on the only prerequisite chain to the war factory.
    chain_costs = {"powr": 300, "proc": 1400, "weap": 2000}
    minimal = sum(chain_costs.values())
    # Largest tolerated surplus over the minimal bill of materials.
    max_slack = 100

    pack = load_pack(PACK)
    for lvl in LEVELS:
        cash = pack.levels[lvl].starting_cash
        assert minimal <= cash <= minimal + max_slack, (
            f"{lvl} starting_cash={cash} not near-minimal "
            f"({minimal} + ≤{max_slack} slack); a wasteful detour must "
            f"overrun the budget"
        )
def test_no_ore_patches_placed():
    """No level may place ``mine`` actors.

    The budget must be the entire, non-replenishing money supply: no
    `mine` actors → no harvester income → a wasteful spend can never be
    recovered, no matter how generous the clock is.
    """
    # Load once; the pack does not change between levels.
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        mines = [a for a in compiled.scenario.actors if a.type == "mine"]
        assert not mines, (
            f"{lvl} has ore patches {mines} — income would "
            "let a wasteful policy recover; budget must be fixed"
        )
def test_hard_tier_has_seed_driven_spawn_groups():
    """Hard must define at least two agent spawn_point groups.

    With ≥2 groups the seed can vary the start base
    (tests/test_hard_tier.py::UPGRADED contract).
    """
    compiled = compile_level(load_pack(PACK), "hard")
    # Distinct spawn_point values among agent-owned actors.
    spawn_groups = {
        actor.spawn_point
        for actor in compiled.scenario.actors
        if actor.owner == "agent"
    }
    assert len(spawn_groups) >= 2, (
        f"hard needs ≥2 spawn groups, got {spawn_groups}"
    )
def test_every_level_has_fail_condition():
    """Every compiled level carries a fail condition.

    No silent draws: each level must be able to emit a LOSS.
    """
    pack = load_pack(PACK)
    # Lazy pairing keeps compilation interleaved with the per-level check.
    compiled_levels = ((lvl, compile_level(pack, lvl)) for lvl in LEVELS)
    for lvl, compiled in compiled_levels:
        assert compiled.fail_condition is not None, f"{lvl} missing fail_condition"
def test_then_composite_used_in_win():
    """The compiled win condition contains the powr→proc→weap ``then`` chain.

    Confirms the 3-step build-order chain is wired through to the
    compiled win condition — the load-bearing teeth of this pack.
    """
    expected_order = ["powr", "proc", "weap"]
    # Load once; the pack does not change between levels.
    pack = load_pack(PACK)
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        win = compiled.win_condition.model_dump(exclude_none=True)
        inner = win.get("all_of") or []
        then_nodes = [clause for clause in inner if "then" in clause]
        assert then_nodes, f"{lvl} win missing then-chain: {win}"
        for node in then_nodes:
            clauses = (node["then"] or {}).get("clauses") or []
            assert len(clauses) == 3, (
                f"{lvl} then-chain must be powr→proc→weap (3 clauses); "
                f"got {clauses}"
            )
            # And in the exact engine-enforced prereq order.
            actual_order = [clause.get("has_building") for clause in clauses]
            assert actual_order == expected_order, (
                f"{lvl} then-chain order is {actual_order}, "
                f"expected {expected_order}"
            )
def test_tick_budget_aligned_with_max_turns():
    """Every ``within_ticks`` deadline must be reachable inside max_turns.

    The engine advances ~90 ticks/turn, so the highest tick reachable in
    N turns is taken to be 93 + 90·(N-1). A deadline beyond that can
    never bite and the episode would end in a draw.
    """
    # Constants of the reachable-tick formula above: 93 + 90·(N-1).
    base_ticks = 93
    ticks_per_turn = 90

    def _collect(node, key, out):
        """Append to ``out`` every value stored under ``key`` in a nested dict/list."""
        if isinstance(node, dict):
            if key in node:
                out.append(node[key])
            for value in node.values():
                _collect(value, key, out)
        elif isinstance(node, list):
            for value in node:
                _collect(value, key, out)

    pack = load_pack(PACK)
    for lvl in LEVELS:
        max_turns = pack.levels[lvl].max_turns
        reachable = base_ticks + ticks_per_turn * (max_turns - 1)
        win = compile_level(pack, lvl).win_condition.model_dump(exclude_none=True)
        deadlines = []
        _collect(win, "within_ticks", deadlines)
        assert deadlines, f"{lvl} has no within_ticks leaf (no clock teeth)"
        for deadline in deadlines:
            assert deadline <= reachable, (
                f"{lvl} within_ticks={deadline} > reachable={reachable} "
                f"(max_turns={max_turns}) — deadline never bites → draw"
            )
# ── Engine-bound tests (parameterised over seeds 1..4) ────────────
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_intended_cost_minimal_policy_wins(level, seed):
    """The intended powr → proc → weap play WINS on every (level, seed).

    This is the load-bearing check that the pack is solvable inside the
    budget by the advertised capability.
    """
    compiled = compile_level(load_pack(PACK), level)
    result = run_level(compiled, _intended_policy(), seed=seed)
    # then_progress may be absent or empty; it is only used for diagnostics.
    progress = getattr(result.signals, "then_progress", {}) or {}
    assert result.outcome == "win", (
        f"intended cost-minimal must WIN on {level} s={seed}; "
        f"got {result.outcome} (tick={result.signals.game_tick}, "
        f"then_progress={progress}, "
        f"own_buildings={result.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_stall_loses(level, seed):
    """A do-nothing policy LOSES on every (level, seed).

    The fail_condition's after_ticks clause is expected to bite at the
    budget, so the outcome is a loss and never a draw.
    """
    compiled = compile_level(load_pack(PACK), level)
    idle_policy = _stall_policy()
    result = run_level(compiled, idle_policy, seed=seed)
    assert result.outcome == "loss", (
        f"stall must LOSE on {level} s={seed}; got {result.outcome} "
        f"(tick={result.signals.game_tick})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
@pytest.mark.parametrize("extra", ("tent", "pbox"))
def test_wasteful_spend_loses(level, seed, extra):
    """The wasteful powr → <extra> → proc → weap play LOSES everywhere.

    The <extra> detour ('tent' $500 / 'pbox' $600) bloats the bill of
    materials past the fixed budget; weap can never be funded (cash hits
    $0 mid-queue, no income → no recovery) and the `then:` chain never
    completes. The capability measured is COST-MINIMAL planning.
    """
    compiled = compile_level(load_pack(PACK), level)
    res = run_level(compiled, _wasteful_policy(extra), seed=seed)
    # then_progress may be absent or empty; it is only used for diagnostics.
    tp = getattr(res.signals, "then_progress", {}) or {}
    built = res.signals.own_building_types
    assert res.outcome == "loss", (
        f"wasteful {extra}-first must LOSE on {level} s={seed}; got "
        f"{res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, own_buildings={built})"
    )
    # weap must NOT have been built — the budget could not fund it.
    assert "weap" not in (built or []), (
        f"wasteful {extra}-first built weap on {level} s={seed} — the "
        f"budget trap leaked (own_buildings={built})"
    )
@pytest.mark.parametrize("seed", SEEDS)
def test_hard_seeds_produce_distinct_starts(seed):
    """Per-seed smoke test of the hard tier's starting observation.

    For the given seed, a do-nothing probe must LOSE and the very first
    observation must already list at least one agent ``fact``.

    NOTE: despite its name, this test does not compare positions across
    seeds. That comparison is made by
    test_hard_spawns_round_robin_across_seeds, and
    tests/test_hard_tier.py enforces the same spawn-variation contract.
    """
    compiled = compile_level(load_pack(PACK), "hard")
    captured = {}

    def probe(obs, Cmd):
        # Snapshot own_buildings from the first observation only, even if
        # that snapshot turns out to be empty.
        if "first_obs" not in captured:
            captured["first_obs"] = list(obs.get("own_buildings", []) or [])
        return [Cmd.observe()]

    res = run_level(compiled, probe, seed=seed)
    # The probe never builds anything, so it is a stall and must lose.
    assert res.outcome == "loss", (
        f"idle probe must LOSE on hard s={seed}; got {res.outcome}"
    )
    facts = [
        (b["cell_x"], b["cell_y"])
        for b in captured.get("first_obs") or []
        if b["type"] == "fact"
    ]
    assert facts, f"no fact observed at turn 0 for seed={seed}"
def test_hard_spawns_round_robin_across_seeds():
    """Seeds 1 and 2 must start the agent's fact on DIFFERENT cells.

    This proves the spawn_point round-robin on the hard tier is active
    rather than degenerate.
    """
    compiled = compile_level(load_pack(PACK), "hard")

    def first_fact_cell(seed):
        """Run an idle episode and return the first observed fact cell, or None."""
        seen = []

        def pol(obs, Cmd):
            # Keep looking until some observation lists a fact, then stop.
            if not seen:
                buildings = obs.get("own_buildings", []) or []
                cells = [
                    (b["cell_x"], b["cell_y"])
                    for b in buildings
                    if b["type"] == "fact"
                ]
                seen.extend(cells[:1])
            return [Cmd.observe()]

        run_level(compiled, pol, seed=seed)
        return seen[0] if seen else None

    pos1 = first_fact_cell(1)
    pos2 = first_fact_cell(2)
    assert pos1 and pos2, f"missing fact obs: s1={pos1} s2={pos2}"
    assert pos1 != pos2, (
        f"hard spawn round-robin is degenerate: seed 1 and 2 both "
        f"started at {pos1}"
    )
|