Spaces:

irregular6612
/

AgentnessBench

Sleeping

App Files Files Community

AgentnessBench / tests /cli /test_cli.py

irregular6612

refactor(scenario): delete predator_evade; template is the canonical scenario

93cd78f 22 days ago

Raw

History Blame Contribute Delete

6.15 kB

	from proteus.cli import main
	from proteus.game.runtime import read_traces


	def test_list_scenarios_prints_template(capsys):
	    """`list-scenarios` exits with 0 and mentions "template" on stdout."""
	    exit_code = main(["list-scenarios"])
	    captured = capsys.readouterr()
	    assert exit_code == 0
	    assert "template" in captured.out


	def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
	    """`run` with a `fake:` model writes a trace file that `read_traces` can load."""
	    trace_path = tmp_path / "runs" / "smoke.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:cli-test",
	        "--seed", "42",
	        "--play-turns", "5",
	        "--no-probe",
	        "--out", str(trace_path),
	    ]
	    exit_code = main(argv)
	    assert exit_code == 0
	    assert trace_path.exists()

	    # Exactly one trace is expected back, carrying the scenario name and the
	    # model id without its "fake:" provider prefix.
	    loaded = read_traces(trace_path)
	    assert len(loaded) == 1
	    first = loaded[0]
	    assert first.scenario == "template"
	    assert first.model == "cli-test"

	    # The stdout printed by the run mentions the scenario.
	    stdout = capsys.readouterr().out
	    assert "template" in stdout


	def test_replay_prints_turns_and_outcome(tmp_path, capsys):
	    """`replay` of a freshly written trace prints the scenario and "turn 1"."""
	    trace_path = tmp_path / "r.jsonl"
	    run_argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:x",
	        "--seed", "42",
	        "--play-turns", "5",
	        "--no-probe",
	        "--out", str(trace_path),
	    ]
	    main(run_argv)
	    # Discard whatever the run printed so only replay output is inspected.
	    capsys.readouterr()

	    exit_code = main(["replay", str(trace_path)])
	    replay_out = capsys.readouterr().out
	    assert exit_code == 0
	    assert "template" in replay_out
	    assert "turn 1" in replay_out


	def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
	    """An unrecognised provider prefix makes `run` return 2 and report on stderr."""
	    target = tmp_path / "x.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "bogus:x",
	        "--seed", "1",
	        "--out", str(target),
	    ]
	    exit_code = main(argv)
	    stderr = capsys.readouterr().err
	    assert exit_code == 2
	    assert "Unknown provider" in stderr


	def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
	    """An unknown scenario makes `run` return 2, report on stderr, and write nothing."""
	    target = tmp_path / "x.jsonl"
	    argv = [
	        "run",
	        "--scenario", "no_such_scenario",
	        "--model", "fake:x",
	        "--seed", "1",
	        "--out", str(target),
	    ]
	    exit_code = main(argv)
	    stderr = capsys.readouterr().err
	    assert exit_code == 2
	    assert "Unknown scenario" in stderr
	    # The failed run must leave no trace file behind.
	    assert not target.exists()


	def test_replay_missing_file_returns_nonzero(capsys):
	    """`replay` on a nonexistent path returns 2 and says "not found" on stderr."""
	    missing = "/no/such/trace.jsonl"
	    exit_code = main(["replay", missing])
	    stderr = capsys.readouterr().err
	    assert exit_code == 2
	    assert "not found" in stderr


	def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
	    """`play` driven by scripted stdin writes one trace attributed to "human".

	    `builtins.input` is replaced with a stub that hands out a fixed sequence
	    of "up" moves, so the session runs without a real terminal.
	    """
	    # Feed scripted moves through builtins.input (HumanAgent resolves it lazily).
	    inputs = iter(["up"] * 20)
	    # Accept any call shape: input() takes an optional prompt, so the stub
	    # must also work when called with no arguments at all.
	    monkeypatch.setattr("builtins.input", lambda *_args, **_kwargs: next(inputs))
	    out = tmp_path / "runs" / "human.jsonl"
	    rc = main([
	        "play",
	        "--scenario", "template",
	        "--seed", "42",
	        "--play-turns", "5",
	        "--out", str(out),
	    ])
	    assert rc == 0
	    traces = read_traces(out)
	    assert len(traces) == 1
	    assert traces[0].model == "human"
	    assert traces[0].scenario == "template"
	    # The run summary names the scenario.
	    assert "template" in capsys.readouterr().out


	def test_play_unknown_scenario_errors(capsys):
	    """`play` with an unknown scenario returns 2 and reports it on stderr."""
	    argv = ["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"]
	    exit_code = main(argv)
	    stderr = capsys.readouterr().err
	    assert exit_code == 2
	    assert "Unknown scenario" in stderr


	def _write_fake_trace(tmp_path):
	    """Run a 4-turn `fake:x` template session and return the trace path.

	    The trace is written to ``tmp_path / "r.jsonl"``. The exit code of the
	    run is not checked here.
	    """
	    trace_path = tmp_path / "r.jsonl"
	    argv = [
	        "run",
	        "--scenario", "template",
	        "--model", "fake:x",
	        "--seed", "42",
	        "--play-turns", "4",
	        "--no-probe",
	        "--out", str(trace_path),
	    ]
	    main(argv)
	    return trace_path


	def test_replay_text_mode_unchanged(tmp_path, capsys):
	    """Plain `replay` (no extra flags) still prints per-turn text output."""
	    trace_path = _write_fake_trace(tmp_path)
	    # Drop the output produced while writing the trace.
	    capsys.readouterr()

	    exit_code = main(["replay", str(trace_path)])
	    replay_out = capsys.readouterr().out
	    assert exit_code == 0
	    # Legacy text behavior preserved.
	    assert "turn 1" in replay_out


	def test_replay_visual_emits_truecolor(tmp_path, capsys):
	    """`replay --visual --fps 0` writes 24-bit colour escape sequences to stdout."""
	    trace_path = _write_fake_trace(tmp_path)
	    # Drop the output produced while writing the trace.
	    capsys.readouterr()

	    argv = ["replay", str(trace_path), "--visual", "--fps", "0"]
	    exit_code = main(argv)
	    rendered = capsys.readouterr().out
	    assert exit_code == 0
	    # Truecolor escape present.
	    assert "\033[38;2;" in rendered


	def test_replay_png_writes_frames(tmp_path, capsys):
	    """`replay --png DIR` writes non-empty ``frame_*.png`` files into DIR.

	    Skipped when matplotlib cannot be imported.
	    """
	    import pytest

	    pytest.importorskip("matplotlib")
	    trace_path = _write_fake_trace(tmp_path)
	    png_dir = tmp_path / "png"

	    exit_code = main(["replay", str(trace_path), "--png", str(png_dir)])
	    assert exit_code == 0

	    written = list(png_dir.glob("frame_*.png"))
	    assert written
	    for frame in written:
	        assert frame.stat().st_size > 0

	    stdout = capsys.readouterr().out
	    assert "PNG" in stdout


	def test_play_handles_stdin_eof(monkeypatch, capsys):
	    """`play` returns 2 and mentions stdin on stderr when input hits EOF."""
	    # stdin closed/exhausted before the session finishes -> clean rc=2, no traceback.
	    def _eof(*args, **kwargs):
	        # Accept any call shape so that even a bare input() raises EOFError
	        # rather than a TypeError about a missing positional argument.
	        raise EOFError

	    monkeypatch.setattr("builtins.input", _eof)
	    rc = main([
	        "play", "--scenario", "template", "--seed", "42", "--play-turns", "5",
	    ])
	    assert rc == 2
	    assert "stdin" in capsys.readouterr().err.lower()


	def test_compare_aggregates_traces(tmp_path, capsys):
	    """`compare` over a file holding two runs prints one aggregated report."""
	    runs_path = tmp_path / "runs.jsonl"
	    # Two fake-model traces at the same difficulty (model id "demo").
	    for seed in (1, 2):
	        argv = [
	            "run",
	            "--scenario", "template",
	            "--model", "fake:demo",
	            "--seed", str(seed),
	            "--play-turns", "4",
	            "--no-probe",
	            "--out", str(runs_path),
	        ]
	        main(argv)
	    # Drop the run output so only the compare report is inspected.
	    capsys.readouterr()

	    exit_code = main(["compare", str(runs_path)])
	    report = capsys.readouterr().out
	    assert exit_code == 0
	    assert "demo" in report
	    assert "easy" in report
	    assert "n=2" in report
	    assert "motive_reading_accuracy" in report


	def test_compare_writes_summary_json(tmp_path):
	    """`compare --out FILE` writes a JSON summary keyed by "model|difficulty"."""
	    import json

	    out = tmp_path / "runs.jsonl"
	    main([
	        "run", "--scenario", "template", "--model", "fake:demo",
	        "--seed", "1", "--play-turns", "4", "--no-probe", "--out", str(out),
	    ])
	    summary = tmp_path / "summary.json"
	    rc = main(["compare", str(out), "--out", str(summary)])
	    assert rc == 0
	    data = json.loads(summary.read_text())
	    assert data  # non-empty aggregate
	    # Pin the documented "model|difficulty" key format + nested shape.
	    # The separator is a plain pipe: "\|" is not a valid Python escape and
	    # would put a literal backslash into the key being looked up.
	    key = "demo|easy"
	    assert key in data
	    assert data[key]["n"] == 1
	    assert "motive_reading_accuracy" in data[key]["metrics"]


	def test_compare_missing_file_errors(capsys):
	    """`compare` on a nonexistent path returns 2 and says "not found" on stderr."""
	    missing = "/no/such/file.jsonl"
	    exit_code = main(["compare", missing])
	    stderr = capsys.readouterr().err
	    assert exit_code == 2
	    assert "not found" in stderr