Spaces:

qpluslab
/

OpenRA-Bench

Running

App Files Files Community

OpenRA-Bench / tests /test_build_sequence_tech_fastest.py

yxc20098

feat(scenario): build-sequence-tech-fastest — fastest weap-tech BO (PlanBench cost-optimal anchor)

34333cc about 1 month ago

Raw

History Blame Contribute Delete

13.4 kB

	"""build-sequence-tech-fastest pack — full no-cheat validation on Rust.

	Wave-7 REASONING — cost-optimal build-order planning. The agent must
	reach the war factory (`weap`) on the SHORTEST prerequisite chain:

	powr → proc → weap

	Any detour (build a barracks/tent first, or a redundant power plant,
	or an early infantry queue) overruns the tight tick budget and loses.
	The chain is enforced by the Wave-2 `then:` happened-before composite;
	the deadline (`within_ticks`) is the cost-optimality teeth — slack is
	tuned so the OPTIMAL plan fits and the tent-detour plan does NOT.

	Bar (CLAUDE.md): the intended cost-optimal policy WINS on every
	(level, seed); stall and the tent-first wrong-path policy LOSE on
	every (level, seed). Real LOSS not DRAW — `fail after_ticks:T+1`
	inside max_turns is the bite.

	Scenario shape:
	- rush-hour-arena, allies vs soviet (bot disabled).
	- easy: T=3000, max_turns=40 — generous (4-turn buffer).
	- medium: T=2800, max_turns=35 — tight (≈2-turn buffer).
	- hard: T=2800, max_turns=35 — same tight T + ≥2 spawn_point
	groups (NORTH y=14 / SOUTH y=26 base, round-robined).

	Measured optimal timing (seed 1, scripted intended policy):
	powr completes ≈ tick 273 (turn 3)
	proc completes ≈ tick 1263 (turn 14)
	weap completes ≈ tick 2613 (turn 29)
	Measured tent-first wrong-path timing:
	weap completes ≈ tick 3063 (turn 34) — beyond every level's T.
	"""

	from __future__ import annotations

	import pytest

	pytest.importorskip("openra_train", reason="Rust env wheel not installed")
	pytest.importorskip("openra_rl_training", reason="Rust env wheel not installed")

	from openra_bench.eval_core import run_level
	from openra_bench.scenarios import load_pack
	from openra_bench.scenarios.loader import PACKS_DIR, compile_level

	# Path of the scenario pack YAML that the tests in this module load.
	PACK = PACKS_DIR / "build-sequence-tech-fastest.yaml"
	# Level names iterated by the pack-shape loops and the parametrized tests.
	LEVELS = ("easy", "medium", "hard")
	# Seeds the engine-bound tests are parametrized over.
	SEEDS = (1, 2, 3, 4)


	# ── Policies ──────────────────────────────────────────────────────


	def _stall_policy():
	    """Build the idle baseline policy.

	    The returned callable ignores the observation and answers every turn
	    with a single ``Cmd.observe()`` command. The tests expect this policy
	    to LOSE on the clock on every level/seed.
	    """

	    def _idle(obs, Cmd):
	        # Exactly one no-op command per turn, whatever the observation is.
	        noop = Cmd.observe()
	        return [noop]

	    return _idle


	def _intended_policy():
	    """Build the cost-optimal scripted policy: powr → proc → weap.

	    Each building is placed at a fixed offset from the first ``fact``
	    found in the observation's ``own_buildings``, so the policy follows
	    the agent's actual base rather than a hard-coded cell. The tests
	    expect this policy to WIN on every (level, seed).

	    Returns:
	        A ``pol(obs, Cmd)`` callable that returns a list of commands. It
	        keeps state across calls: once a building type has been seen in
	        ``own_buildings`` it is treated as done for the rest of the run.
	    """
	    # (building, dx, dy) in build order; offsets are relative to the fact cell.
	    build_order = (
	        ("powr", 4, 0),
	        ("proc", 6, 3),
	        ("weap", 8, 0),
	    )
	    completed = set()  # building types observed so far (sticky)

	    def pol(obs, Cmd):
	        buildings = obs.get("own_buildings", []) or []
	        queued = obs.get("production", []) or []
	        completed.update(b["type"] for b in buildings)
	        facts = [b for b in buildings if b["type"] == "fact"]

	        cmds = []
	        for name, dx, dy in build_order:
	            if name in completed:
	                continue
	            # Only the first unfinished step is acted on each turn.
	            if name not in queued:
	                cmds.append(Cmd.build(name))
	            if facts:
	                anchor = facts[0]
	                cmds.append(Cmd.place_building(
	                    name, anchor["cell_x"] + dx, anchor["cell_y"] + dy
	                ))
	            break
	        # Nothing to do this turn (chain finished, or queued with no fact).
	        return cmds or [Cmd.observe()]

	    return pol


	def _tent_first_policy():
	    """Build the wrong-path scripted policy: powr → tent → proc → weap.

	    Identical in shape to the cost-optimal policy except for the extra
	    ``tent`` step inserted before ``proc``. The tests expect this policy
	    to LOSE on the clock on every level/seed.

	    Returns:
	        A ``pol(obs, Cmd)`` callable that returns a list of commands. It
	        keeps state across calls: once a building type has been seen in
	        ``own_buildings`` it is treated as done for the rest of the run.
	    """
	    # (building, dx, dy) in build order; offsets are relative to the fact cell.
	    build_order = (
	        ("powr", 4, 0),
	        ("tent", 4, 3),  # the detour that is not on the weap chain
	        ("proc", 6, 3),
	        ("weap", 8, 0),
	    )
	    completed = set()  # building types observed so far (sticky)

	    def pol(obs, Cmd):
	        buildings = obs.get("own_buildings", []) or []
	        queued = obs.get("production", []) or []
	        completed.update(b["type"] for b in buildings)
	        facts = [b for b in buildings if b["type"] == "fact"]

	        cmds = []
	        for name, dx, dy in build_order:
	            if name in completed:
	                continue
	            # Only the first unfinished step is acted on each turn.
	            if name not in queued:
	                cmds.append(Cmd.build(name))
	            if facts:
	                anchor = facts[0]
	                cmds.append(Cmd.place_building(
	                    name, anchor["cell_x"] + dx, anchor["cell_y"] + dy
	                ))
	            break
	        # Nothing to do this turn (chain finished, or queued with no fact).
	        return cmds or [Cmd.observe()]

	    return pol


	# ── Pack-shape tests (cheap; do not run the engine) ──────────────


	def test_pack_compiles_with_three_levels():
	    """The pack loads and exposes the expected id, capability and levels."""
	    loaded = load_pack(PACK)
	    meta = loaded.meta
	    level_names = set(loaded.levels)
	    assert meta.id == "build-sequence-tech-fastest"
	    assert meta.capability == "reasoning"
	    assert level_names == {"easy", "medium", "hard"}


	def test_meta_benchmark_anchor_set():
	    """The pack metadata names both benchmark anchors.

	    At least one ``benchmark_anchor`` entry must mention "PlanBench" and
	    at least one must mention "BOM".
	    """
	    anchors = load_pack(PACK).meta.benchmark_anchor or []
	    for token in ("PlanBench", "BOM"):
	        # The anchors list itself is the failure message, as a debugging aid.
	        assert any(token in entry for entry in anchors), anchors


	def test_hard_tier_has_seed_driven_spawn_groups():
	    """The hard level's agent-owned actors span at least two spawn_point values.

	    This is the shape the seed-driven start variation relies on (see the
	    UPGRADED contract referenced in tests/test_hard_tier.py).
	    """
	    compiled = compile_level(load_pack(PACK), "hard")
	    sp = set()
	    for actor in compiled.scenario.actors:
	        if actor.owner == "agent":
	            sp.add(actor.spawn_point)
	    assert len(sp) >= 2, f"hard needs ≥2 spawn groups, got {sp}"


	def test_every_level_has_fail_condition():
	    """Each level compiles to a non-None fail_condition, so a LOSS is possible."""
	    loaded = load_pack(PACK)
	    for lvl in LEVELS:
	        fail_condition = compile_level(loaded, lvl).fail_condition
	        assert fail_condition is not None, f"{lvl} missing fail_condition"


	def test_then_composite_used_in_win():
	    """Every level's compiled win condition carries the powr→proc→weap chain.

	    For each level, the dumped win condition must have at least one
	    ``then`` entry under ``all_of``, and every such entry must hold
	    exactly three clauses whose ``has_building`` values are, in order,
	    ``powr``, ``proc`` and ``weap``.
	    """
	    expected_chain = ("powr", "proc", "weap")
	    pack = load_pack(PACK)  # load once; only the compile step is per level
	    for lvl in LEVELS:
	        c = compile_level(pack, lvl)
	        win = c.win_condition.model_dump(exclude_none=True)
	        inner = win.get("all_of") or []
	        then_nodes = [cl for cl in inner if "then" in cl]
	        assert then_nodes, f"{lvl} win missing then-chain: {win}"
	        for cl in then_nodes:
	            clauses = (cl["then"] or {}).get("clauses") or []
	            assert len(clauses) == 3, (
	                f"{lvl} then-chain must be powr→proc→weap (3 clauses); "
	                f"got {clauses}"
	            )
	            # And in the exact prerequisite order.
	            for idx, building in enumerate(expected_chain):
	                assert clauses[idx].get("has_building") == building, (
	                    f"{lvl} then-chain clause {idx} must be has_building="
	                    f"{building}; got {clauses[idx]}"
	                )


	def test_tick_budget_aligned_with_max_turns():
	    """Every ``within_ticks`` deadline is reachable inside ``max_turns``.

	    The reachable tick ceiling for N turns is modelled as
	    ``93 + 90 * (N - 1)`` (engine advances ~90 ticks per turn). Each
	    level must have at least one ``within_ticks`` value in its compiled
	    win condition, and none may exceed that ceiling — otherwise the
	    deadline could never bite and the run would end as a draw.
	    """
	    # Constants of the reachability formula above.
	    first_turn_ticks = 93
	    ticks_per_turn = 90

	    def _collect(node, key, out):
	        """Append to *out* every value stored under *key* in nested dicts/lists."""
	        if isinstance(node, dict):
	            if key in node:
	                out.append(node[key])
	            for v in node.values():
	                _collect(v, key, out)
	        elif isinstance(node, list):
	            for v in node:
	                _collect(v, key, out)

	    pack = load_pack(PACK)
	    for lvl in LEVELS:
	        max_turns = pack.levels[lvl].max_turns
	        reachable = first_turn_ticks + ticks_per_turn * (max_turns - 1)
	        win = compile_level(pack, lvl).win_condition.model_dump(exclude_none=True)

	        wts = []
	        _collect(win, "within_ticks", wts)
	        assert wts, f"{lvl} has no within_ticks leaf (no clock teeth)"
	        for wt in wts:
	            assert wt <= reachable, (
	                f"{lvl} within_ticks={wt} > reachable={reachable} "
	                f"(max_turns={max_turns}) — deadline never bites ⇒ draw"
	            )


	# ── Engine-bound tests (parameterised over seeds 1..4) ────────────


	@pytest.mark.parametrize("seed", SEEDS)
	@pytest.mark.parametrize("level", LEVELS)
	def test_intended_cost_optimal_policy_wins(level, seed):
	    """The scripted powr → proc → weap policy wins on this (level, seed).

	    On failure the message reports the final game tick, the then-chain
	    progress (when the signals expose it) and the owned building types.
	    """
	    compiled = compile_level(load_pack(PACK), level)
	    policy = _intended_policy()
	    res = run_level(compiled, policy, seed=seed)
	    # then_progress is read defensively; it may be absent or falsy.
	    tp = getattr(res.signals, "then_progress", {}) or {}
	    assert res.outcome == "win", (
	        f"intended cost-optimal must WIN on {level} s={seed}; "
	        f"got {res.outcome} (tick={res.signals.game_tick}, "
	        f"then_progress={tp}, "
	        f"own_buildings={res.signals.own_building_types})"
	    )


	@pytest.mark.parametrize("seed", SEEDS)
	@pytest.mark.parametrize("level", LEVELS)
	def test_stall_loses(level, seed):
	    """The do-nothing policy ends in a LOSS (not a draw) on this (level, seed)."""
	    compiled = compile_level(load_pack(PACK), level)
	    idle = _stall_policy()
	    res = run_level(compiled, idle, seed=seed)
	    assert res.outcome == "loss", (
	        f"stall must LOSE on {level} s={seed}; got {res.outcome} "
	        f"(tick={res.signals.game_tick})"
	    )


	@pytest.mark.parametrize("seed", SEEDS)
	@pytest.mark.parametrize("level", LEVELS)
	def test_tent_first_wrong_path_loses(level, seed):
	    """The tent-first detour policy ends in a LOSS on this (level, seed).

	    The pack measures cost-optimal planning, so a plan that reaches weap
	    only after an off-chain tent must not win. On failure the message
	    reports the final tick, then-chain progress and owned building types.
	    """
	    compiled = compile_level(load_pack(PACK), level)
	    detour = _tent_first_policy()
	    res = run_level(compiled, detour, seed=seed)
	    # then_progress is read defensively; it may be absent or falsy.
	    tp = getattr(res.signals, "then_progress", {}) or {}
	    assert res.outcome == "loss", (
	        f"tent-first wrong-path must LOSE on {level} s={seed}; got "
	        f"{res.outcome} (tick={res.signals.game_tick}, "
	        f"then_progress={tp}, own_buildings={res.signals.own_building_types})"
	    )


	@pytest.mark.parametrize("seed", SEEDS)
	def test_hard_seeds_produce_distinct_starts(seed):
	    """Per-seed smoke test of the hard level's starting observation.

	    For the given seed, an observe-only probe must LOSE and the very
	    first observation must list at least one ``fact`` in
	    ``own_buildings``. This test runs one seed at a time, so it does not
	    itself compare positions between seeds.
	    """
	    c = compile_level(load_pack(PACK), "hard")
	    first_obs = []  # holds at most one item: the first turn's building list

	    def probe(obs, Cmd):
	        if not first_obs:
	            # Snapshot a copy so later engine-side mutation cannot alter it.
	            first_obs.append(list(obs.get("own_buildings", []) or []))
	        return [Cmd.observe()]

	    res = run_level(c, probe, seed=seed)
	    assert res.outcome == "loss", (
	        f"observe-only probe must LOSE on hard s={seed}; got {res.outcome}"
	    )
	    buildings = first_obs[0] if first_obs else []
	    facts = [
	        (b["cell_x"], b["cell_y"])
	        for b in buildings
	        if b["type"] == "fact"
	    ]
	    assert facts, f"no fact observed at turn 0 for seed={seed}"


	def test_hard_spawns_round_robin_across_seeds():
	    """Seeds 1 and 2 start the agent's fact at different cells on hard.

	    Each seed is run with an observe-only probe that records the cell of
	    the first ``fact`` it sees. Both runs must record a position and the
	    two positions must differ.
	    """
	    c = compile_level(load_pack(PACK), "hard")

	    def first_fact_cell(seed):
	        """Run the level with an observe-only probe; return the recorded fact cell or None."""
	        seen = {}

	        def pol(obs, Cmd):
	            if "fact_pos" not in seen:
	                bs = obs.get("own_buildings", []) or []
	                cells = [(b["cell_x"], b["cell_y"]) for b in bs if b["type"] == "fact"]
	                if cells:
	                    seen["fact_pos"] = cells[0]
	            return [Cmd.observe()]

	        pol.captured = seen
	        run_level(c, pol, seed=seed)
	        return seen.get("fact_pos")

	    pos1 = first_fact_cell(1)
	    pos2 = first_fact_cell(2)
	    assert pos1 and pos2, f"missing fact obs: s1={pos1} s2={pos2}"
	    assert pos1 != pos2, (
	        f"hard spawn round-robin is degenerate: seed 1 and 2 both "
	        f"started at {pos1}"
	    )