Spaces:

irregular6612
/

AgentnessBench

Sleeping

App Files Files Community

AgentnessBench / tests /runtime /test_interactive_equivalence.py

irregular6612

refactor(scenario): delete predator_evade; template is the canonical scenario

93cd78f 29 days ago

Raw

History Blame Contribute Delete

1.91 kB

	"""Golden: the same action sequence produces an identical SessionTrace whether
	driven through InteractiveSession (HTTP path) or SessionRunner + a scripted
	HumanAgent (CLI path). This pins the two paths together so they cannot drift."""
	from __future__ import annotations

	import proteus.game.scenarios # noqa: F401
	from proteus.game.agents.human import HumanAgent
	from proteus.game.engine.difficulty import Difficulty
	from proteus.game.runtime.interactive import InteractiveSession
	from proteus.game.runtime.session import SessionRunner

	# Fixed move script replayed through both session paths; its length is also
	# passed as ``play_turns`` so each run is given exactly one turn per action.
	ACTIONS = ["up", "up", "left", "stay", "right", "up"]


	def _scripted_human():
	    """Return a HumanAgent whose input is replayed from ACTIONS.

	    Every call to the agent's ``input_fn`` yields the next entry of ACTIONS,
	    in order, regardless of the prompt text. Text handed to ``output_fn`` is
	    dropped. A fresh iterator is created per call, so each agent starts the
	    script from the beginning.
	    """
	    def _mute(_text: str) -> None:
	        # Nothing is written anywhere; the text is simply discarded.
	        return None

	    script = iter(ACTIONS)

	    def _replay(_prompt: str) -> str:
	        # The prompt is ignored: the script alone decides the reply.
	        return next(script)

	    return HumanAgent(input_fn=_replay, output_fn=_mute)


	def test_interactive_matches_session_runner():
	    """Replay ACTIONS through SessionRunner and InteractiveSession and
	    require the two resulting traces to be identical."""
	    # Single source of truth for the settings both sessions are built with.
	    settings = dict(
	        difficulty=Difficulty.EASY,
	        seed=42,
	        play_turns=len(ACTIONS),
	        use_probe=False,
	    )

	    # Runner-driven session: moves are pulled from the scripted HumanAgent.
	    cli_trace = SessionRunner("template", _scripted_human(), **settings).run()

	    # Step-driven session: the same moves are pushed in one at a time,
	    # stopping as soon as the session reports an outcome.
	    session = InteractiveSession("template", **settings)
	    for action in ACTIONS:
	        if session.state()["outcome"] is not None:
	            break
	        session.step(action)
	    web_trace = session.finish()

	    # Field-by-field checks first, so a mismatch points at the field that
	    # diverged; both traces must also report the "human" model.
	    assert web_trace.model == cli_trace.model == "human"
	    assert web_trace.cut_frames == cli_trace.cut_frames
	    assert web_trace.outcome == cli_trace.outcome
	    assert web_trace.metrics == cli_trace.metrics

	    web_turns, cli_turns = web_trace.turns, cli_trace.turns
	    assert len(web_turns) == len(cli_turns)
	    for web_turn, cli_turn in zip(web_turns, cli_turns):
	        assert web_turn.model_dump() == cli_turn.model_dump()

	    # Whole-trace equality is the strongest pin and catches anything the
	    # per-field checks above did not name.
	    assert web_trace.model_dump() == cli_trace.model_dump()