AgentnessBench / tests /cli /test_cli_persona.py
irregular6612's picture
refactor(scenario): delete predator_evade; template is the canonical scenario
93cd78f
Raw
History Blame Contribute Delete
3.3 kB
from proteus.cli import main
from proteus.game.runtime import read_traces
def test_run_with_persona_records_id_and_metrics(tmp_path):
    """`run --persona risk_averse` tags the trace and adds agreement metrics.

    Invokes the CLI `run` command against the `template` scenario with the
    `fake:demo` model, then reads back the first trace written to the output
    file and checks that:

    * the CLI returned exit code 0,
    * the trace carries `persona_weight_id == "risk_averse"`,
    * the `action_agreement` metric is present,
    * the string `risk_cost` does not appear in the serialized trace.
    """
    trace_file = tmp_path / "p.jsonl"
    argv = [
        "run", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--play-turns", "3",
        "--no-probe", "--no-gif", "--persona", "risk_averse",
        "--out", str(trace_file),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    first_trace = next(iter(read_traces(str(trace_file))))
    assert first_trace.persona_weight_id == "risk_averse"
    assert "action_agreement" in first_trace.metrics

    # The hidden persona weights must never reach the serialized trace.
    serialized = first_trace.model_dump_json()
    assert "risk_cost" not in serialized
def test_run_without_persona_has_no_persona_metrics(tmp_path):
    """A `run` without `--persona` leaves the trace free of persona data.

    Runs the same CLI invocation as the persona test but omits `--persona`,
    then checks on the first trace that `persona_weight_id` is `None` and
    that no `action_agreement` metric was recorded.
    """
    trace_file = tmp_path / "np.jsonl"
    argv = [
        "run", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--play-turns", "3",
        "--no-probe", "--no-gif",
        "--out", str(trace_file),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    first_trace = next(iter(read_traces(str(trace_file))))
    assert first_trace.persona_weight_id is None
    assert "action_agreement" not in first_trace.metrics
def test_run_unknown_persona_errors(tmp_path):
    """`run` with the persona name `nope` is expected to return exit code 2."""
    trace_file = tmp_path / "bad.jsonl"
    argv = [
        "run", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--play-turns", "3",
        "--no-probe", "--no-gif", "--persona", "nope",
        "--out", str(trace_file),
    ]
    exit_code = main(argv)
    assert exit_code == 2
def test_memory_with_persona_tags_checkpoint(tmp_path):
    """`memory --persona risk_averse` tags the written checkpoint.

    Runs the CLI `memory` command, loads the resulting checkpoint file, and
    checks that it records `persona_weight_id == "risk_averse"` while the
    string `risk_cost` is absent from its JSON serialization.
    """
    from proteus.game.runtime.memory import load_checkpoint

    checkpoint_file = tmp_path / "mem.json"
    argv = [
        "memory", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--memory-turns", "5",
        "--persona", "risk_averse",
        "--out", str(checkpoint_file),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    checkpoint = load_checkpoint(str(checkpoint_file))
    assert checkpoint.persona_weight_id == "risk_averse"

    serialized = checkpoint.model_dump_json()
    assert "risk_cost" not in serialized
def test_memory_unknown_persona_errors(tmp_path):
    """`memory` with the persona name `nope` is expected to return exit code 2."""
    checkpoint_file = tmp_path / "bad.json"
    argv = [
        "memory", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--memory-turns", "5",
        "--persona", "nope",
        "--out", str(checkpoint_file),
    ]
    exit_code = main(argv)
    assert exit_code == 2
def test_persona_memory_then_scored_run(tmp_path):
    """Stage 4 acceptance: persona demonstration memory followed by a scored run.

    First writes a persona demonstration memory with the `memory` command,
    then performs a scored `run` that is handed that memory via `--memory`
    and the same persona via `--persona`. The scored run is meant to measure
    whether the model continues that persona (same hidden weights).

    The first trace of the scored run must carry the persona id, reference
    the memory file by its string path, and include `action_agreement`.
    """
    # Step 1: produce the persona demonstration memory.
    memory_file = tmp_path / "demo.json"
    memory_argv = [
        "memory", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--memory-turns", "5",
        "--persona", "risk_averse",
        "--out", str(memory_file),
    ]
    memory_exit = main(memory_argv)
    assert memory_exit == 0

    # Step 2: scored run that consumes the memory under the same persona.
    trace_file = tmp_path / "scored.jsonl"
    run_argv = [
        "run", "--scenario", "template", "--model", "fake:demo",
        "--difficulty", "easy", "--seed", "42", "--play-turns", "3",
        "--no-probe", "--no-gif", "--memory", str(memory_file),
        "--persona", "risk_averse",
        "--out", str(trace_file),
    ]
    run_exit = main(run_argv)
    assert run_exit == 0

    first_trace = next(iter(read_traces(str(trace_file))))
    assert first_trace.persona_weight_id == "risk_averse"
    assert first_trace.memory_ref == str(memory_file)
    assert "action_agreement" in first_trace.metrics