Spaces:

irregular6612
/

AgentnessBench

Sleeping

App Files Files Community

AgentnessBench / tests /cli /test_cli_persona.py

irregular6612

refactor(scenario): delete predator_evade; template is the canonical scenario

93cd78f 27 days ago

Raw

History Blame Contribute Delete

3.3 kB

	from proteus.cli import main
	from proteus.game.runtime import read_traces


	def test_run_with_persona_records_id_and_metrics(tmp_path):
	    """Check that a ``run`` with ``--persona risk_averse`` tags the trace.

	    The CLI must return 0, and the first trace read back from the output
	    file must carry the persona id and an ``action_agreement`` metric,
	    while its JSON dump must not contain the string ``risk_cost``.
	    """
	    trace_file = tmp_path / "p.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	        "--play-turns", "3",
	        "--no-probe",
	        "--no-gif",
	        "--persona", "risk_averse",
	        "--out", str(trace_file),
	    ]
	    exit_code = main(argv)
	    assert exit_code == 0

	    first_trace = next(iter(read_traces(str(trace_file))))
	    assert first_trace.persona_weight_id == "risk_averse"
	    assert "action_agreement" in first_trace.metrics

	    # The hidden weights must never reach the serialized trace.
	    serialized = first_trace.model_dump_json()
	    assert "risk_cost" not in serialized


	def test_run_without_persona_has_no_persona_metrics(tmp_path):
	    """Check that a ``run`` without ``--persona`` leaves the trace untagged.

	    The CLI must return 0, and the first trace read back must have
	    ``persona_weight_id`` set to ``None`` and no ``action_agreement``
	    entry in its metrics.
	    """
	    trace_file = tmp_path / "np.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	        "--play-turns", "3",
	        "--no-probe",
	        "--no-gif",
	        "--out", str(trace_file),
	    ]
	    exit_code = main(argv)
	    assert exit_code == 0

	    first_trace = next(iter(read_traces(str(trace_file))))
	    assert first_trace.persona_weight_id is None
	    assert "action_agreement" not in first_trace.metrics


	def test_run_unknown_persona_errors(tmp_path):
	    """Check that ``run`` with the persona name ``nope`` returns code 2."""
	    trace_file = tmp_path / "bad.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	        "--play-turns", "3",
	        "--no-probe",
	        "--no-gif",
	        "--persona", "nope",
	        "--out", str(trace_file),
	    ]
	    assert main(argv) == 2


	def test_memory_with_persona_tags_checkpoint(tmp_path):
	    """Check that ``memory`` with ``--persona risk_averse`` tags the checkpoint.

	    The CLI must return 0, and the checkpoint loaded from the output file
	    must carry the persona id while its JSON dump must not contain the
	    string ``risk_cost``.
	    """
	    from proteus.game.runtime.memory import load_checkpoint

	    checkpoint_file = tmp_path / "mem.json"
	    argv = [
	        "memory",
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	        "--memory-turns", "5",
	        "--persona", "risk_averse",
	        "--out", str(checkpoint_file),
	    ]
	    exit_code = main(argv)
	    assert exit_code == 0

	    checkpoint = load_checkpoint(str(checkpoint_file))
	    assert checkpoint.persona_weight_id == "risk_averse"
	    serialized = checkpoint.model_dump_json()
	    assert "risk_cost" not in serialized


	def test_memory_unknown_persona_errors(tmp_path):
	    """Check that ``memory`` with the persona name ``nope`` returns code 2."""
	    checkpoint_file = tmp_path / "bad.json"
	    argv = [
	        "memory",
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	        "--memory-turns", "5",
	        "--persona", "nope",
	        "--out", str(checkpoint_file),
	    ]
	    assert main(argv) == 2


	def test_persona_memory_then_scored_run(tmp_path):
	    """Stage 4 acceptance: persona demonstration memory, then a scored run.

	    First writes a memory file with ``--persona risk_averse``, then feeds
	    it to a ``run`` with the same persona. The scored run measures whether
	    the model continues that persona (same hidden weights). Both calls
	    must return 0, and the first trace must record the persona id, the
	    memory path, and an ``action_agreement`` metric.
	    """
	    # Flags shared verbatim by both CLI invocations below.
	    shared_flags = [
	        "--scenario", "template",
	        "--model", "fake:demo",
	        "--difficulty", "easy",
	        "--seed", "42",
	    ]

	    memory_file = tmp_path / "demo.json"
	    memory_argv = [
	        "memory",
	        *shared_flags,
	        "--memory-turns", "5",
	        "--persona", "risk_averse",
	        "--out", str(memory_file),
	    ]
	    assert main(memory_argv) == 0

	    trace_file = tmp_path / "scored.jsonl"
	    run_argv = [
	        "run",
	        *shared_flags,
	        "--play-turns", "3",
	        "--no-probe",
	        "--no-gif",
	        "--memory", str(memory_file),
	        "--persona", "risk_averse",
	        "--out", str(trace_file),
	    ]
	    assert main(run_argv) == 0

	    first_trace = next(iter(read_traces(str(trace_file))))
	    assert first_trace.persona_weight_id == "risk_averse"
	    assert first_trace.memory_ref == str(memory_file)
	    assert "action_agreement" in first_trace.metrics