Spaces:

irregular6612
/

AgentnessBench

Sleeping

File size: 1,285 Bytes

426093b
4d98bbb
426093b
4d98bbb
 
 
 
 
 
 
 
 
 
 
 
 
93cd78f
4d98bbb
 
 
 
93cd78f
4d98bbb

from proteus.game.agents import HumanAgent, VanillaAgent
from proteus.providers import FakeProvider
from proteus.game.runtime.session import SessionRunner


def _scripted(seq):
    it = iter(seq)
    return lambda prompt="": next(it)


def test_human_and_llm_traces_share_schema_and_answer_keys():
    """Human and LLM sessions on the same seeded world yield matching traces.

    Both players commit "up" on every turn, so the cut frames and the
    per-turn answer keys are expected to be identical, and only ``model``
    should differ between the two results. This is the foundation for
    human-baseline comparability (spec §10).
    """
    # Identical world/session configuration for both players.
    session_kwargs = dict(seed=42, play_turns=5, use_probe=False)

    human_agent = HumanAgent(
        input_fn=_scripted(["up"] * 20),
        output_fn=lambda s: None,
    )
    human_result = SessionRunner("template", human_agent, **session_kwargs).run()

    llm_agent = VanillaAgent(FakeProvider(["ACTION: up"]))
    llm_result = SessionRunner("template", llm_agent, **session_kwargs).run()

    assert human_result.cut_frames == llm_result.cut_frames

    # Per-turn fields must line up turn by turn across the two traces.
    for field in ("action", "motive_action", "habit_action"):
        human_values = [getattr(turn, field) for turn in human_result.turns]
        llm_values = [getattr(turn, field) for turn in llm_result.turns]
        assert human_values == llm_values

    assert human_result.outcome == llm_result.outcome
    # Same metric names on both sides; the values are not compared here.
    assert set(human_result.metrics) == set(llm_result.metrics)

    # The model label is the only field expected to differ.
    assert human_result.model == "human"
    assert llm_result.model == "fake"