AgentnessBench / tests /runtime /test_interactive_equivalence.py
irregular6612's picture
refactor(scenario): delete predator_evade; template is the canonical scenario
93cd78f
Raw
History Blame Contribute Delete
1.91 kB
"""Golden: the same action sequence produces an identical SessionTrace whether
driven through InteractiveSession (HTTP path) or SessionRunner + a scripted
HumanAgent (CLI path). This pins the two paths together so they cannot drift."""
from __future__ import annotations
import proteus.game.scenarios # noqa: F401
from proteus.game.agents.human import HumanAgent
from proteus.game.engine.difficulty import Difficulty
from proteus.game.runtime.interactive import InteractiveSession
from proteus.game.runtime.session import SessionRunner
# Scripted move sequence: fed to the HumanAgent's input callback on the CLI
# path and passed one-by-one to InteractiveSession.step() on the HTTP path.
# Its length also sets play_turns for both sessions.
ACTIONS = ["up", "up", "left", "stay", "right", "up"]
def _scripted_human():
    """Return a HumanAgent that replays ``ACTIONS`` instead of reading a terminal.

    Each call to the agent's input callback yields the next entry of
    ``ACTIONS`` (the prompt text is ignored); once the script is exhausted the
    callback raises ``StopIteration``. The output callback swallows whatever
    text it is given.
    """
    script = iter(ACTIONS)

    def _replay(_prompt: str) -> str:
        # One scripted action per prompt, in order.
        return next(script)

    def _discard(_text: str) -> None:
        # Nothing is shown anywhere; output is dropped.
        return None

    return HumanAgent(input_fn=_replay, output_fn=_discard)
def test_interactive_matches_session_runner():
    """Check that the CLI path and the interactive path yield the same trace.

    The "template" scenario is run twice with identical settings (EASY
    difficulty, seed 42, one play turn per scripted action, probe disabled):
    once through ``SessionRunner`` driven by the scripted ``HumanAgent``, and
    once through ``InteractiveSession`` by stepping the same ``ACTIONS``
    manually. The two resulting traces are then compared, from individual
    fields up to a full ``model_dump()`` equality.
    """
    # Identical keyword settings for both drivers.
    settings = {
        "difficulty": Difficulty.EASY,
        "seed": 42,
        "play_turns": len(ACTIONS),
        "use_probe": False,
    }

    # CLI path: the runner pulls actions from the scripted agent.
    cli_trace = SessionRunner("template", _scripted_human(), **settings).run()

    # Interactive path: push the same actions one step at a time, stopping
    # early if the session already reports an outcome.
    session = InteractiveSession("template", **settings)
    for action in ACTIONS:
        if session.state()["outcome"] is not None:
            break
        session.step(action)
    web_trace = session.finish()

    # Both are human; everything must match field-for-field.
    assert web_trace.model == cli_trace.model == "human"
    assert web_trace.cut_frames == cli_trace.cut_frames
    assert web_trace.outcome == cli_trace.outcome
    assert web_trace.metrics == cli_trace.metrics

    # Turn-by-turn comparison; the length check makes the zip exhaustive.
    assert len(web_trace.turns) == len(cli_trace.turns)
    for web_turn, cli_turn in zip(web_trace.turns, cli_trace.turns):
        assert web_turn.model_dump() == cli_turn.model_dump()

    # Full-trace equality is the strongest pin.
    assert web_trace.model_dump() == cli_trace.model_dump()