Spaces:

irregular6612
/

AgentnessBench

Sleeping

irregular6612 committed on Jun 1

Commit

4d98bbb

1 Parent(s): 71926e3

test(cp5): lock human↔LLM trace comparability invariant (spec §10)

Files changed (1) hide show

tests/runtime/test_human_comparability.py ADDED Viewed

+from proteus.agents import HumanAgent, VanillaAgent
+from proteus.providers import FakeProvider
+from proteus.runtime.session import SessionRunner
+def _scripted(seq):
+    """Return a callable that replays the items of ``seq``, one per call.
+
+    The returned callable accepts an optional ``prompt`` argument, which it
+    ignores, and returns the next item of ``seq``. ``next`` is called without
+    a default, so calling it after ``seq`` is exhausted raises
+    ``StopIteration``.
+    """
+    # A single iterator is captured by the lambda so successive calls advance
+    # through ``seq`` instead of restarting from the first item.
+    it = iter(seq)
+    return lambda prompt="": next(it)
+def test_human_and_llm_traces_share_schema_and_answer_keys():
+    """Run one scripted human session and one fake-LLM session and compare them.
+
+    Both sessions use the same scenario name, seed, ``play_turns`` and
+    ``use_probe`` arguments; the results are then compared field by field.
+    """
+    # Both players commit "up" every turn under the same deterministic world,
+    # so cut frames and per-turn answer keys must be identical; only `model`
+    # differs. This is the human-baseline comparability foundation (spec §10).
+    # 20 scripted inputs are supplied for a 5-turn session; presumably the
+    # surplus keeps the script from running dry -- TODO confirm how many
+    # inputs HumanAgent consumes per turn. output_fn discards whatever it is given.
+    human = HumanAgent(input_fn=_scripted(["up"] * 20), output_fn=lambda s: None)
+    h = SessionRunner(
+        "predator_evade", human, seed=42, play_turns=5, use_probe=False,
+    ).run()
+    # NOTE(review): FakeProvider receives a single canned response; this test
+    # presumably relies on it being reused for every turn -- verify against
+    # FakeProvider's implementation.
+    llm = VanillaAgent(FakeProvider(["ACTION: up"]))
+    v = SessionRunner(
+        "predator_evade", llm, seed=42, play_turns=5, use_probe=False,
+    ).run()
+    # Same frames and the same per-turn action / motive / habit sequences.
+    assert h.cut_frames == v.cut_frames
+    assert [t.action for t in h.turns] == [t.action for t in v.turns]
+    assert [t.motive_action for t in h.turns] == [t.motive_action for t in v.turns]
+    assert [t.habit_action for t in h.turns] == [t.habit_action for t in v.turns]
+    assert h.outcome == v.outcome
+    # Compares the sets obtained by iterating `metrics`; assuming `metrics` is
+    # a mapping, this checks metric names only, not their values.
+    assert set(h.metrics) == set(v.metrics)
+    # The `model` field is the one place the two results are expected to differ.
+    assert h.model == "human"
+    assert v.model == "fake"