Spaces:
Running
Running
File size: 13,408 Bytes
34333cc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | """build-sequence-tech-fastest pack โ full no-cheat validation on Rust.
Wave-7 REASONING — cost-optimal build-order planning. The agent must
reach the war factory (`weap`) on the SHORTEST prerequisite chain:
    powr → proc → weap
Any detour (build a barracks/tent first, or a redundant power plant,
or an early infantry queue) overruns the tight tick budget and loses.
The chain is enforced by the Wave-2 `then:` happened-before composite;
the deadline (`within_ticks`) is the cost-optimality teeth — slack is
tuned so the OPTIMAL plan fits and the tent-detour plan does NOT.
Bar (CLAUDE.md): the intended cost-optimal policy WINS on every
(level, seed); stall and the tent-first wrong-path policy LOSE on
every (level, seed). Real LOSS not DRAW — `fail after_ticks:T+1`
inside max_turns is the bite.
Scenario shape:
- rush-hour-arena, allies vs soviet (bot disabled).
- easy: T=3000, max_turns=40 — generous (4-turn buffer).
- medium: T=2800, max_turns=35 — tight (≈2-turn buffer).
- hard: T=2800, max_turns=35 — same tight T + ≥2 spawn_point
  groups (NORTH y=14 / SOUTH y=26 base, round-robined).
Measured optimal timing (seed 1, scripted intended policy):
    powr completes ≈ tick 273 (turn 3)
    proc completes ≈ tick 1263 (turn 14)
    weap completes ≈ tick 2613 (turn 29)
Measured tent-first wrong-path timing:
    weap completes ≈ tick 3063 (turn 34) — beyond every level's T.
"""
from __future__ import annotations
import pytest
pytest.importorskip("openra_train", reason="Rust env wheel not installed")
pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")
from openra_bench.eval_core import run_level
from openra_bench.scenarios import load_pack
from openra_bench.scenarios.loader import PACKS_DIR, compile_level
# Location of the scenario pack exercised by every test in this module.
PACK = PACKS_DIR / "build-sequence-tech-fastest.yaml"
# Level names the tests iterate / parametrise over.
LEVELS = ("easy", "medium", "hard")
# Seeds the engine-bound tests are parametrised over.
SEEDS = (1, 2, 3, 4)
# ── Policies ──────────────────────────────────────────────────────
def _stall_policy():
    """Return the idle baseline policy.

    The returned callable ignores the observation and answers every
    turn with a single ``Cmd.observe()``; the tests expect it to LOSE
    on the clock on every level/seed.
    """
    return lambda obs, Cmd: [Cmd.observe()]
def _chain_policy(chain):
    """Build a scripted policy that works through ``chain`` in order.

    ``chain`` is a sequence of ``(building, dx, dy)`` triples. Each turn
    the policy targets the first building in the chain that it has not
    yet seen among ``obs["own_buildings"]``:

    * if that building is not in ``obs["production"]`` it issues
      ``Cmd.build(building)``;
    * if a ``fact`` is among the own buildings it also issues
      ``Cmd.place_building(building, fact_x + dx, fact_y + dy)``, so
      placement follows whichever start position the agent was given.

    When nothing was issued (chain finished, or target queued and no
    ``fact`` visible) the policy returns a single ``Cmd.observe()``.

    NOTE(review): the membership test against ``obs["production"]``
    assumes the queue holds building-type strings -- confirm against
    the observation schema.
    """
    # Sticky per-policy progress: once a building has been observed it
    # stays "done" even if it later disappears from the observation.
    done = {name: False for name, _, _ in chain}

    def pol(obs, Cmd):
        buildings = obs.get("own_buildings", []) or []
        owned = {b["type"] for b in buildings}
        queue = obs.get("production", []) or []
        for name in done:
            if name in owned:
                done[name] = True

        facts = [b for b in buildings if b["type"] == "fact"]
        cmds = []
        for name, dx, dy in chain:
            if done[name]:
                continue
            if name not in queue:
                cmds.append(Cmd.build(name))
            if facts:
                # Placement is re-issued every turn while the target is
                # outstanding, independent of the queue check above.
                anchor = facts[0]
                cmds.append(Cmd.place_building(
                    name, anchor["cell_x"] + dx, anchor["cell_y"] + dy
                ))
            break  # only the first outstanding building is acted on
        return cmds or [Cmd.observe()]

    return pol


def _intended_policy():
    """Cost-optimal play: powr -> proc -> weap.

    Each building is placed relative to the agent's actual ``fact`` so
    the policy generalises across the hard-tier spawn variation. This
    is the policy the pack is meant to be solvable by; the tests
    require it to WIN on every (level, seed).
    """
    return _chain_policy((
        ("powr", 4, 0),
        ("proc", 6, 3),
        ("weap", 8, 0),
    ))


def _tent_first_policy():
    """Wrong, cost-non-optimal play: powr -> tent -> proc -> weap.

    Identical to the intended policy except for the extra ``tent`` step
    inserted after ``powr``. Per the pack notes the tent is not a
    prerequisite of ``weap`` and costs roughly 500 credits and ~5 turns;
    the tests require this policy to LOSE on every (level, seed).
    """
    return _chain_policy((
        ("powr", 4, 0),
        ("tent", 4, 3),
        ("proc", 6, 3),
        ("weap", 8, 0),
    ))
# ── Pack-shape tests (cheap; do not run the engine) ──────────────
def test_pack_compiles_with_three_levels():
    """The pack loads and carries the expected id, capability and levels."""
    loaded = load_pack(PACK)
    meta = loaded.meta
    assert meta.id == "build-sequence-tech-fastest"
    assert meta.capability == "reasoning"
    level_names = set(loaded.levels)
    assert level_names == {"easy", "medium", "hard"}
def test_meta_benchmark_anchor_set():
    """The pack's benchmark anchors mention both PlanBench and BOM.

    Required by the seed taxonomy: PlanBench cost-optimal + BOM
    manufacturing critical-path planning.
    """
    anchors = load_pack(PACK).meta.benchmark_anchor or []
    # Checked in this order so a failure reports the first missing anchor.
    for keyword in ("PlanBench", "BOM"):
        assert any(keyword in a for a in anchors), anchors
def test_hard_tier_has_seed_driven_spawn_groups():
    """Hard must define at least two agent spawn_point groups.

    The seed is meant to vary the start base across those groups
    (tests/test_hard_tier.py::UPGRADED contract). This test only counts
    the distinct ``spawn_point`` values on agent-owned actors in the
    compiled scenario; it does not run the engine.
    """
    compiled = compile_level(load_pack(PACK), "hard")
    spawn_groups = {
        actor.spawn_point
        for actor in compiled.scenario.actors
        if actor.owner == "agent"
    }
    assert len(spawn_groups) >= 2, (
        f"hard needs ≥2 spawn groups, got {spawn_groups}"
    )
def test_every_level_has_fail_condition():
    """Each level compiles with a fail condition, so a LOSS can be
    emitted instead of a silent draw."""
    loaded = load_pack(PACK)
    for lvl in LEVELS:
        fail = compile_level(loaded, lvl).fail_condition
        assert fail is not None, f"{lvl} missing fail_condition"
def test_then_composite_used_in_win():
    """The 3-step build-order chain reaches the compiled win condition.

    For every level the win condition's ``all_of`` list must contain at
    least one ``then`` clause, and every such clause must hold exactly
    three ``has_building`` sub-clauses in the order powr, proc, weap.
    """
    pack = load_pack(PACK)  # loaded once; it does not vary per level
    for lvl in LEVELS:
        compiled = compile_level(pack, lvl)
        win = compiled.win_condition.model_dump(exclude_none=True)
        inner = win.get("all_of") or []
        then_nodes = [cl["then"] for cl in inner if "then" in cl]
        assert then_nodes, f"{lvl} win missing then-chain: {win}"
        for node in then_nodes:
            clauses = (node or {}).get("clauses") or []
            assert len(clauses) == 3, (
                f"{lvl} then-chain must be powr→proc→weap (3 clauses); "
                f"got {clauses}"
            )
            # And in the exact engine-enforced prereq order.
            assert clauses[0].get("has_building") == "powr"
            assert clauses[1].get("has_building") == "proc"
            assert clauses[2].get("has_building") == "weap"
def test_tick_budget_aligned_with_max_turns():
    """Every ``within_ticks`` deadline must be reachable inside max_turns.

    Engine advances ~90 ticks/turn, so the highest tick reachable in N
    turns is taken as 93 + 90·(N-1). A deadline above that could never
    fire, which would turn an intended LOSS into a draw.
    """
    # Constants of the reachable-tick formula above.
    # NOTE(review): 93 reads as the tick reached after the first turn --
    # confirm against the engine.
    first_turn_tick = 93
    ticks_per_turn = 90

    def _find_all(node, key):
        """Yield every value stored under ``key`` anywhere in ``node``."""
        if isinstance(node, dict):
            if key in node:
                yield node[key]
            for child in node.values():
                yield from _find_all(child, key)
        elif isinstance(node, list):
            for child in node:
                yield from _find_all(child, key)

    pack = load_pack(PACK)
    for lvl in LEVELS:
        max_turns = pack.levels[lvl].max_turns
        reachable = first_turn_tick + ticks_per_turn * (max_turns - 1)
        win = compile_level(pack, lvl).win_condition.model_dump(exclude_none=True)
        wts = list(_find_all(win, "within_ticks"))
        assert wts, f"{lvl} has no within_ticks leaf (no clock teeth)"
        for wt in wts:
            assert wt <= reachable, (
                f"{lvl} within_ticks={wt} > reachable={reachable} "
                f"(max_turns={max_turns}) — deadline never bites → draw"
            )
# ── Engine-bound tests (parameterised over seeds 1..4) ────────────
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_intended_cost_optimal_policy_wins(level, seed):
    """Running the intended powr -> proc -> weap policy yields a WIN.

    Checked for every (level, seed) pair; this is the evidence that the
    pack is solvable inside its budget by the advertised capability.
    """
    compiled = compile_level(load_pack(PACK), level)
    policy = _intended_policy()
    res = run_level(compiled, policy, seed=seed)
    # Chain progress is reported in the failure message only; it may be
    # absent on the signals object, hence the getattr fallback.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "win", (
        f"intended cost-optimal must WIN on {level} s={seed}; "
        f"got {res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, "
        f"own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_stall_loses(level, seed):
    """Running the do-nothing policy yields a LOSS, never a draw.

    Checked for every (level, seed) pair; the level's fail condition is
    expected to fire once the tick budget is exhausted.
    """
    compiled = compile_level(load_pack(PACK), level)
    policy = _stall_policy()
    res = run_level(compiled, policy, seed=seed)
    assert res.outcome == "loss", (
        f"stall must LOSE on {level} s={seed}; got {res.outcome} "
        f"(tick={res.signals.game_tick})"
    )
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("level", LEVELS)
def test_tent_first_wrong_path_loses(level, seed):
    """Running the tent-first detour policy yields a LOSS.

    Checked for every (level, seed) pair. Per the pack notes the tent
    detour adds ~500 credits and ~5 turns, pushing weap completion to
    about tick 3063, past every level's deadline. The capability being
    measured is COST-OPTIMAL planning, so a plan that merely arrives
    eventually must not win.
    """
    compiled = compile_level(load_pack(PACK), level)
    policy = _tent_first_policy()
    res = run_level(compiled, policy, seed=seed)
    # Only used for the failure message; may be missing on the signals.
    tp = getattr(res.signals, "then_progress", {}) or {}
    assert res.outcome == "loss", (
        f"tent-first wrong-path must LOSE on {level} s={seed}; got "
        f"{res.outcome} (tick={res.signals.game_tick}, "
        f"then_progress={tp}, own_buildings={res.signals.own_building_types})"
    )
@pytest.mark.parametrize("seed", SEEDS)
def test_hard_seeds_produce_distinct_starts(seed):
    """On hard, each seed shows the agent a ``fact`` on its first turn.

    Despite its name, this per-seed test does not compare positions
    across seeds: it runs an idle probe, requires the run to end in a
    LOSS, and requires at least one ``fact`` among the own buildings in
    the first observation. The cross-seed comparison lives in
    ``test_hard_spawns_round_robin_across_seeds``.
    """
    compiled = compile_level(load_pack(PACK), "hard")
    first_buildings = None

    def probe(obs, Cmd):
        # Snapshot the own-buildings list on the first call only.
        nonlocal first_buildings
        if first_buildings is None:
            first_buildings = list(obs.get("own_buildings", []) or [])
        return [Cmd.observe()]

    res = run_level(compiled, probe, seed=seed)
    # The probe never builds anything, so it must lose like a stall.
    assert res.outcome == "loss", (
        f"idle probe must LOSE on hard s={seed}; got {res.outcome}"
    )
    facts = [
        (b["cell_x"], b["cell_y"])
        for b in (first_buildings or [])
        if b["type"] == "fact"
    ]
    assert facts, f"no fact observed at turn 0 for seed={seed}"
def test_hard_spawns_round_robin_across_seeds():
    """Seeds 1 and 2 on hard start the agent's ``fact`` on different cells.

    Each seed is run with an idle probe that remembers the first
    ``fact`` cell it ever observes; equal cells would mean the
    spawn_point round-robin is degenerate.
    """
    compiled = compile_level(load_pack(PACK), "hard")

    def first_fact_cell(seed):
        """Run one idle episode and return the first fact cell seen, or None."""
        seen = []

        def pol(obs, Cmd):
            if not seen:
                for b in obs.get("own_buildings", []) or []:
                    if b["type"] == "fact":
                        seen.append((b["cell_x"], b["cell_y"]))
                        break
            return [Cmd.observe()]

        run_level(compiled, pol, seed=seed)
        return seen[0] if seen else None

    pos1 = first_fact_cell(1)
    pos2 = first_fact_cell(2)
    assert pos1 and pos2, f"missing fact obs: s1={pos1} s2={pos2}"
    assert pos1 != pos2, (
        f"hard spawn round-robin is degenerate: seed 1 and 2 both "
        f"started at {pos1}"
    )
|