Spaces:
Sleeping
Sleeping
Commit ·
4d98bbb
1
Parent(s): 71926e3
test(cp5): lock human↔LLM trace comparability invariant (spec §10)
Browse files
tests/runtime/test_human_comparability.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from proteus.agents import HumanAgent, VanillaAgent
|
| 2 |
+
from proteus.providers import FakeProvider
|
| 3 |
+
from proteus.runtime.session import SessionRunner
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _scripted(seq):
|
| 7 |
+
it = iter(seq)
|
| 8 |
+
return lambda prompt="": next(it)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_human_and_llm_traces_share_schema_and_answer_keys():
    """Lock the human/LLM trace comparability invariant (spec §10).

    Both players commit "up" every turn under the same deterministic world,
    so cut frames and per-turn answer keys must be identical; only ``model``
    differs. This is the human-baseline comparability foundation.
    """
    # One shared configuration for both runs: the comparison is only
    # meaningful when scenario, seed and turn budget match exactly.
    scenario = "predator_evade"
    run_config = dict(seed=42, play_turns=5, use_probe=False)

    # Human side: scripted keyboard input, display output discarded.
    # NOTE(review): 20 scripted replies for 5 play turns — presumably slack
    # for any extra prompts the agent issues; confirm against HumanAgent.
    human = HumanAgent(input_fn=_scripted(["up"] * 20), output_fn=lambda s: None)
    h = SessionRunner(scenario, human, **run_config).run()

    # LLM side: a fake provider with a single canned completion.
    # NOTE(review): presumably FakeProvider reuses this reply on every turn —
    # verify against its implementation.
    llm = VanillaAgent(FakeProvider(["ACTION: up"]))
    v = SessionRunner(scenario, llm, **run_config).run()

    # Same world: identical frames and identical committed actions.
    assert h.cut_frames == v.cut_frames
    assert [t.action for t in h.turns] == [t.action for t in v.turns]
    # Per-turn reference actions recorded on each turn must line up as well.
    assert [t.motive_action for t in h.turns] == [t.motive_action for t in v.turns]
    assert [t.habit_action for t in h.turns] == [t.habit_action for t in v.turns]
    assert h.outcome == v.outcome
    # Schema check: the same metric names are present (values not compared).
    assert set(h.metrics) == set(v.metrics)
    # The player identity is the only field expected to differ.
    assert h.model == "human"
    assert v.model == "fake"
|