AgentnessBench / tests /cli /test_cli.py
irregular6612's picture
refactor(scenario): delete predator_evade; template is the canonical scenario
93cd78f
Raw
History Blame Contribute Delete
6.15 kB
from proteus.cli import main
from proteus.game.runtime import read_traces
def test_list_scenarios_prints_template(capsys):
    """`list-scenarios` exits with 0 and names the `template` scenario on stdout."""
    exit_code = main(["list-scenarios"])
    captured = capsys.readouterr()

    assert exit_code == 0
    assert "template" in captured.out
def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """A `run` with the fake provider writes one trace that `read_traces` can load."""
    trace_path = tmp_path / "runs" / "smoke.jsonl"
    argv = [
        "run",
        "--scenario", "template",
        "--model", "fake:cli-test",
        "--seed", "42",
        "--play-turns", "5",
        "--no-probe",
        "--out", str(trace_path),
    ]

    exit_code = main(argv)

    assert exit_code == 0
    assert trace_path.exists()

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    only_trace = loaded[0]
    assert only_trace.scenario == "template"
    # The trace is expected to carry the model id without the "fake:" prefix.
    assert only_trace.model == "cli-test"

    # The run line summarizes the outcome.
    stdout = capsys.readouterr().out
    assert "template" in stdout
def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """`replay` of a freshly recorded trace prints the scenario name and turn lines."""
    out = tmp_path / "r.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "5", "--no-probe", "--out", str(out),
    ])
    # Fail here, not in the replay assertions below, if the setup run breaks.
    assert run_rc == 0, f"setup run exited with {run_rc}"
    capsys.readouterr()  # drain the run's output so only replay text is inspected

    rc = main(["replay", str(out)])
    text = capsys.readouterr().out

    assert rc == 0
    assert "template" in text
    assert "turn 1" in text
def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """An unrecognized provider prefix makes `run` exit with 2 and report on stderr."""
    target = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "template",
        "--model", "bogus:x",
        "--seed", "1",
        "--out", str(target),
    ]

    exit_code = main(argv)

    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown provider" in stderr
def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """An unknown scenario makes `run` exit with 2, report on stderr, and write no file."""
    target = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "no_such_scenario",
        "--model", "fake:x",
        "--seed", "1",
        "--out", str(target),
    ]

    exit_code = main(argv)

    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown scenario" in stderr
    # A bad scenario must not have written a trace file.
    assert not target.exists()
def test_replay_missing_file_returns_nonzero(capsys):
    """`replay` on a path that does not exist exits with 2 and says "not found"."""
    missing = "/no/such/trace.jsonl"

    exit_code = main(["replay", missing])

    assert exit_code == 2
    captured = capsys.readouterr()
    assert "not found" in captured.err
def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """`play` with scripted keyboard input writes one trace whose model is "human"."""
    # Feed scripted moves through builtins.input (HumanAgent resolves it lazily).
    scripted_moves = iter(["up"] * 20)

    def _scripted_input(*args, **kwargs):
        # Prompt arguments are ignored; each call yields the next scripted move.
        return next(scripted_moves)

    monkeypatch.setattr("builtins.input", _scripted_input)

    trace_path = tmp_path / "runs" / "human.jsonl"
    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
        "--out", str(trace_path),
    ]

    exit_code = main(argv)

    assert exit_code == 0
    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    human_trace = loaded[0]
    assert human_trace.model == "human"
    assert human_trace.scenario == "template"

    # The run summary names the scenario.
    stdout = capsys.readouterr().out
    assert "template" in stdout
def test_play_unknown_scenario_errors(capsys):
    """`play` with an unknown scenario exits with 2 and reports it on stderr."""
    argv = ["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"]

    exit_code = main(argv)

    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown scenario" in stderr
def _write_fake_trace(tmp_path):
    """Record a short fake-provider run of `template` and return the trace path.

    Args:
        tmp_path: Directory in which ``r.jsonl`` is written.

    Returns:
        The path that was passed to ``run --out``.
    """
    out = tmp_path / "r.jsonl"
    rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Surface a broken setup run here rather than as a confusing failure in
    # the caller's replay assertions.
    assert rc == 0, f"setup run exited with {rc}"
    return out
def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain `replay` (no flags) still prints per-turn text lines."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # drain

    exit_code = main(["replay", str(trace_path)])
    stdout = capsys.readouterr().out

    assert exit_code == 0
    # legacy text behavior preserved
    assert "turn 1" in stdout
def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """`replay --visual --fps 0` prints 24-bit foreground colour escape sequences."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard whatever the setup run printed

    argv = ["replay", str(trace_path), "--visual", "--fps", "0"]
    exit_code = main(argv)
    stdout = capsys.readouterr().out

    assert exit_code == 0
    # truecolor escape present
    assert "\033[38;2;" in stdout
def test_replay_png_writes_frames(tmp_path, capsys):
    """`replay --png DIR` writes non-empty ``frame_*.png`` files and mentions PNG.

    Skipped when matplotlib cannot be imported.
    """
    import pytest

    pytest.importorskip("matplotlib")
    out = _write_fake_trace(tmp_path)
    # Drain the setup run's output (as the other replay tests do) so the "PNG"
    # check below only sees what `replay` itself printed.
    capsys.readouterr()
    pdir = tmp_path / "png"

    rc = main(["replay", str(out), "--png", str(pdir)])

    assert rc == 0
    frames = list(pdir.glob("frame_*.png"))
    assert frames
    assert all(p.stat().st_size > 0 for p in frames)
    assert "PNG" in capsys.readouterr().out
def test_play_handles_stdin_eof(monkeypatch, capsys):
    """`play` exits with 2 and mentions stdin on stderr when input() hits EOF."""
    # stdin closed/exhausted before the session finishes -> clean rc=2, no traceback.
    def _closed_stdin(*args, **kwargs):
        raise EOFError

    monkeypatch.setattr("builtins.input", _closed_stdin)
    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
    ]

    exit_code = main(argv)

    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "stdin" in stderr.lower()
def test_compare_aggregates_traces(tmp_path, capsys):
    """`compare` groups traces from one file and prints model, difficulty, n and metrics."""
    out = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo").
    for seed in (1, 2):
        run_rc = main([
            "run", "--scenario", "template", "--model", "fake:demo",
            "--seed", str(seed), "--play-turns", "4", "--no-probe", "--out", str(out),
        ])
        # A failed setup run should be reported as such, not as a wrong "n=".
        assert run_rc == 0, f"setup run (seed={seed}) exited with {run_rc}"
    capsys.readouterr()  # drain

    rc = main(["compare", str(out)])
    text = capsys.readouterr().out

    assert rc == 0
    # Separate assertions so a failure shows which token is missing.
    assert "demo" in text
    assert "easy" in text
    assert "n=2" in text
    assert "motive_reading_accuracy" in text
def test_compare_writes_summary_json(tmp_path):
    """`compare --out FILE` writes a JSON aggregate keyed by "model|difficulty"."""
    import json

    out = tmp_path / "runs.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:demo",
        "--seed", "1", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Fail at the setup step if the trace could not be recorded.
    assert run_rc == 0, f"setup run exited with {run_rc}"
    summary = tmp_path / "summary.json"

    rc = main(["compare", str(out), "--out", str(summary)])

    assert rc == 0
    data = json.loads(summary.read_text())
    assert data  # non-empty aggregate
    # Pin the documented "model|difficulty" key format + nested shape.
    assert "demo|easy" in data
    assert data["demo|easy"]["n"] == 1
    assert "motive_reading_accuracy" in data["demo|easy"]["metrics"]
def test_compare_missing_file_errors(capsys):
    """`compare` on a path that does not exist exits with 2 and says "not found"."""
    missing = "/no/such/file.jsonl"

    exit_code = main(["compare", missing])

    assert exit_code == 2
    captured = capsys.readouterr()
    assert "not found" in captured.err