irregular6612 committed on
Commit
4d98bbb
·
1 Parent(s): 71926e3

test(cp5): lock human↔LLM trace comparability invariant (spec §10)

Browse files
tests/runtime/test_human_comparability.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from proteus.agents import HumanAgent, VanillaAgent
2
+ from proteus.providers import FakeProvider
3
+ from proteus.runtime.session import SessionRunner
4
+
5
+
6
+ def _scripted(seq):
7
+ it = iter(seq)
8
+ return lambda prompt="": next(it)
9
+
10
+
11
def test_human_and_llm_traces_share_schema_and_answer_keys():
    """Lock the human/LLM trace comparability invariant (spec §10).

    A scripted human player and a fake-provider LLM player both answer "up"
    on every turn, and both sessions run with the same scenario, seed and
    turn budget. The test expects identical cut frames, per-turn answer keys
    and outcome, the same set of metric names, and a difference only in the
    reported ``model``.
    """
    scenario = "predator_evade"
    session_kwargs = dict(seed=42, play_turns=5, use_probe=False)

    def per_turn(trace, field):
        # Collect one attribute across all turns of a session result.
        return [getattr(turn, field) for turn in trace.turns]

    # Human side: scripted input, output discarded.
    human_agent = HumanAgent(
        input_fn=_scripted(["up"] * 20),
        output_fn=lambda s: None,
    )
    human_trace = SessionRunner(scenario, human_agent, **session_kwargs).run()

    # LLM side: fake provider configured with a single canned "up" reply.
    llm_agent = VanillaAgent(FakeProvider(["ACTION: up"]))
    llm_trace = SessionRunner(scenario, llm_agent, **session_kwargs).run()

    # Same world, same actions => same frames and per-turn answer keys.
    assert human_trace.cut_frames == llm_trace.cut_frames
    assert per_turn(human_trace, "action") == per_turn(llm_trace, "action")
    assert per_turn(human_trace, "motive_action") == per_turn(llm_trace, "motive_action")
    assert per_turn(human_trace, "habit_action") == per_turn(llm_trace, "habit_action")
    assert human_trace.outcome == llm_trace.outcome

    # Metric names must match; metric values are not compared here.
    assert set(human_trace.metrics) == set(llm_trace.metrics)

    # The player identity is the only field expected to differ.
    assert human_trace.model == "human"
    assert llm_trace.model == "fake"