Spaces:

irregular6612
/

AgentnessBench

Sleeping

AgentnessBench / tests /runtime /test_human_comparability.py

refactor(scenario): delete predator_evade; template is the canonical scenario

93cd78f 27 days ago

1.29 kB

	from proteus.game.agents import HumanAgent, VanillaAgent
	from proteus.providers import FakeProvider
	from proteus.game.runtime.session import SessionRunner


	def _scripted(seq):
	it = iter(seq)
	return lambda prompt="": next(it)


	def test_human_and_llm_traces_share_schema_and_answer_keys():
	    """Human and LLM sessions must produce comparable traces.

	    Both players commit "up" every turn under the same deterministic world,
	    so cut frames and per-turn answer keys must be identical; only ``model``
	    differs. This is the human-baseline comparability foundation (spec §10).
	    """
	    # Identical runner settings for both players; only the agent changes.
	    session_kwargs = dict(seed=42, play_turns=5, use_probe=False)

	    human_player = HumanAgent(
	        input_fn=_scripted(["up"] * 20),
	        output_fn=lambda s: None,
	    )
	    human_trace = SessionRunner("template", human_player, **session_kwargs).run()

	    llm_player = VanillaAgent(FakeProvider(["ACTION: up"]))
	    llm_trace = SessionRunner("template", llm_player, **session_kwargs).run()

	    assert human_trace.cut_frames == llm_trace.cut_frames

	    # Per-turn columns are compared in the same order as the original checks.
	    for key in ("action", "motive_action", "habit_action"):
	        human_column = [getattr(turn, key) for turn in human_trace.turns]
	        llm_column = [getattr(turn, key) for turn in llm_trace.turns]
	        assert human_column == llm_column

	    assert human_trace.outcome == llm_trace.outcome
	    # Only the metric names need to agree, not their values.
	    assert set(human_trace.metrics) == set(llm_trace.metrics)

	    assert human_trace.model == "human"
	    assert llm_trace.model == "fake"