"""End-to-end tests for the ``proteus`` command-line entry point.

Every test calls ``proteus.cli.main`` with an argv list and checks the return
code, the captured stdout/stderr, and any files the command wrote.
"""

import json

import pytest

from proteus.cli import main
from proteus.game.runtime import read_traces


def _run_fake(out, *, model="fake:x", seed=42, play_turns=4):
    """Run the ``template`` scenario with a fake provider, writing to *out*.

    This is fixture setup for replay/compare tests. The return code is
    asserted here so that a broken run fails at its source instead of
    surfacing later as a confusing replay or compare failure.

    Args:
        out: Path of the JSONL trace file to write.
        model: ``--model`` value passed to the CLI.
        seed: ``--seed`` value passed to the CLI.
        play_turns: ``--play-turns`` value passed to the CLI.

    Returns:
        *out*, unchanged, for convenient chaining.
    """
    rc = main([
        "run", "--scenario", "template", "--model", model,
        "--seed", str(seed), "--play-turns", str(play_turns), "--no-probe",
        "--out", str(out),
    ])
    assert rc == 0, f"fixture run failed with rc={rc}"
    return out


def _write_fake_trace(tmp_path):
    """Write a 4-turn fake-model trace to ``tmp_path / "r.jsonl"`` and return its path."""
    return _run_fake(tmp_path / "r.jsonl")


def test_list_scenarios_prints_template(capsys):
    """``list-scenarios`` succeeds and mentions the ``template`` scenario."""
    rc = main(["list-scenarios"])
    out = capsys.readouterr().out
    assert rc == 0
    assert "template" in out


def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """``run`` writes a single trace that ``read_traces`` can load back."""
    out = tmp_path / "runs" / "smoke.jsonl"
    rc = main([
        "run", "--scenario", "template", "--model", "fake:cli-test",
        "--seed", "42", "--play-turns", "5", "--no-probe",
        "--out", str(out),
    ])
    assert rc == 0
    assert out.exists()
    traces = read_traces(out)
    assert len(traces) == 1
    assert traces[0].scenario == "template"
    assert traces[0].model == "cli-test"
    # The run line summarizes the outcome.
    assert "template" in capsys.readouterr().out


def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """``replay`` prints the scenario name and per-turn lines."""
    out = _run_fake(tmp_path / "r.jsonl", play_turns=5)
    capsys.readouterr()  # drain
    rc = main(["replay", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "template" in text
    assert "turn 1" in text


def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """An unrecognized provider prefix yields rc=2 and an error on stderr."""
    rc = main([
        "run", "--scenario", "template", "--model", "bogus:x",
        "--seed", "1", "--out", str(tmp_path / "x.jsonl"),
    ])
    assert rc == 2
    assert "Unknown provider" in capsys.readouterr().err


def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """An unrecognized scenario yields rc=2, an error on stderr, and no file."""
    rc = main([
        "run", "--scenario", "no_such_scenario", "--model", "fake:x",
        "--seed", "1", "--out", str(tmp_path / "x.jsonl"),
    ])
    assert rc == 2
    assert "Unknown scenario" in capsys.readouterr().err
    # A bad scenario must not have written a trace file.
    assert not (tmp_path / "x.jsonl").exists()


def test_replay_missing_file_returns_nonzero(tmp_path, capsys):
    """``replay`` on a nonexistent trace yields rc=2 and a "not found" error."""
    # A path under tmp_path is guaranteed absent, unlike a fixed absolute path.
    missing = tmp_path / "no_such_trace.jsonl"
    rc = main(["replay", str(missing)])
    assert rc == 2
    assert "not found" in capsys.readouterr().err


def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """``play`` with scripted input writes one trace tagged with model "human"."""
    # Feed scripted moves through builtins.input (HumanAgent resolves it lazily).
    inputs = iter(["up"] * 20)
    monkeypatch.setattr("builtins.input", lambda *a, **k: next(inputs))
    out = tmp_path / "runs" / "human.jsonl"
    rc = main([
        "play", "--scenario", "template", "--seed", "42",
        "--play-turns", "5", "--out", str(out),
    ])
    assert rc == 0
    traces = read_traces(out)
    assert len(traces) == 1
    assert traces[0].model == "human"
    assert traces[0].scenario == "template"
    # The run summary names the scenario.
    assert "template" in capsys.readouterr().out


def test_play_unknown_scenario_errors(capsys):
    """``play`` with an unrecognized scenario yields rc=2 and an error on stderr."""
    rc = main(["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"])
    assert rc == 2
    assert "Unknown scenario" in capsys.readouterr().err


def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain ``replay`` (no flags) still prints per-turn text lines."""
    out = _write_fake_trace(tmp_path)
    capsys.readouterr()  # drain
    rc = main(["replay", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "turn 1" in text  # legacy text behavior preserved


def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """``replay --visual`` emits 24-bit color escape sequences."""
    out = _write_fake_trace(tmp_path)
    capsys.readouterr()  # drain
    rc = main(["replay", str(out), "--visual", "--fps", "0"])
    text = capsys.readouterr().out
    assert rc == 0
    assert "\033[38;2;" in text  # truecolor escape present


def test_replay_png_writes_frames(tmp_path, capsys):
    """``replay --png DIR`` writes non-empty ``frame_*.png`` files into DIR."""
    pytest.importorskip("matplotlib")
    out = _write_fake_trace(tmp_path)
    capsys.readouterr()  # drain so the "PNG" check sees only replay output
    pdir = tmp_path / "png"
    rc = main(["replay", str(out), "--png", str(pdir)])
    assert rc == 0
    frames = list(pdir.glob("frame_*.png"))
    assert frames
    assert all(p.stat().st_size > 0 for p in frames)
    assert "PNG" in capsys.readouterr().out


def test_play_handles_stdin_eof(monkeypatch, capsys):
    """``play`` turns an ``EOFError`` from ``input`` into rc=2 with a stdin message."""
    # stdin closed/exhausted before the session finishes -> clean rc=2, no traceback.
    def _eof(*args, **kwargs):
        raise EOFError

    monkeypatch.setattr("builtins.input", _eof)
    rc = main([
        "play", "--scenario", "template", "--seed", "42", "--play-turns", "5",
    ])
    assert rc == 2
    assert "stdin" in capsys.readouterr().err.lower()


def test_compare_aggregates_traces(tmp_path, capsys):
    """``compare`` groups traces from one file and reports count and metrics."""
    out = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo").
    for seed in (1, 2):
        _run_fake(out, model="fake:demo", seed=seed)
    capsys.readouterr()  # drain
    rc = main(["compare", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "demo" in text and "easy" in text
    assert "n=2" in text
    assert "motive_reading_accuracy" in text


def test_compare_writes_summary_json(tmp_path):
    """``compare --out`` writes a JSON summary keyed by "model|difficulty"."""
    out = _run_fake(tmp_path / "runs.jsonl", model="fake:demo", seed=1)
    summary = tmp_path / "summary.json"
    rc = main(["compare", str(out), "--out", str(summary)])
    assert rc == 0
    data = json.loads(summary.read_text())
    assert data  # non-empty aggregate
    # Pin the documented "model|difficulty" key format + nested shape.
    assert "demo|easy" in data
    assert data["demo|easy"]["n"] == 1
    assert "motive_reading_accuracy" in data["demo|easy"]["metrics"]


def test_compare_missing_file_errors(tmp_path, capsys):
    """``compare`` on a nonexistent file yields rc=2 and a "not found" error."""
    # A path under tmp_path is guaranteed absent, unlike a fixed absolute path.
    missing = tmp_path / "no_such_file.jsonl"
    rc = main(["compare", str(missing)])
    assert rc == 2
    assert "not found" in capsys.readouterr().err