Spaces:
Sleeping
Sleeping
| from proteus.cli import main | |
| from proteus.game.runtime import read_traces | |
def test_list_scenarios_prints_template(capsys):
    """`list-scenarios` exits 0 and mentions the `template` scenario on stdout."""
    exit_code = main(["list-scenarios"])
    captured = capsys.readouterr()
    assert exit_code == 0
    assert "template" in captured.out
def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """A `run` with the fake provider writes one trace that `read_traces` reloads."""
    trace_path = tmp_path / "runs" / "smoke.jsonl"
    argv = [
        "run",
        "--scenario", "template",
        "--model", "fake:cli-test",
        "--seed", "42",
        "--play-turns", "5",
        "--no-probe",
        "--out", str(trace_path),
    ]

    exit_code = main(argv)
    assert exit_code == 0
    assert trace_path.exists()

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    trace = loaded[0]
    assert trace.scenario == "template"
    # The recorded model id is the part after the "fake:" provider prefix.
    assert trace.model == "cli-test"

    # The run's stdout names the scenario.
    captured = capsys.readouterr()
    assert "template" in captured.out
def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """`replay` of a freshly recorded trace prints the scenario and `turn 1`."""
    out = tmp_path / "r.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "5", "--no-probe", "--out", str(out),
    ])
    # Fail at the recording step rather than at the replay assertions below
    # if the setup run itself did not succeed.
    assert run_rc == 0
    capsys.readouterr()  # drain the run's output so only replay text is checked

    rc = main(["replay", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "template" in text
    assert "turn 1" in text
def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """An unrecognised provider prefix makes `run` exit 2 with a stderr message."""
    argv = [
        "run",
        "--scenario", "template",
        "--model", "bogus:x",
        "--seed", "1",
        "--out", str(tmp_path / "x.jsonl"),
    ]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "Unknown provider" in captured.err
def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """An unknown scenario makes `run` exit 2, report on stderr, and write no trace."""
    trace_path = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "no_such_scenario",
        "--model", "fake:x",
        "--seed", "1",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "Unknown scenario" in captured.err
    # The rejected scenario must leave no trace file behind.
    assert not trace_path.exists()
def test_replay_missing_file_returns_nonzero(capsys):
    """`replay` on a path that does not exist exits 2 and says "not found" on stderr."""
    exit_code = main(["replay", "/no/such/trace.jsonl"])
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "not found" in captured.err
def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """`play` driven by scripted input writes one trace whose model is "human"."""
    # Scripted moves are served through builtins.input (HumanAgent resolves it lazily).
    scripted_moves = iter(["up"] * 20)

    def fake_input(*args, **kwargs):
        return next(scripted_moves)

    monkeypatch.setattr("builtins.input", fake_input)

    trace_path = tmp_path / "runs" / "human.jsonl"
    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    trace = loaded[0]
    assert trace.model == "human"
    assert trace.scenario == "template"

    # The session's stdout names the scenario.
    captured = capsys.readouterr()
    assert "template" in captured.out
def test_play_unknown_scenario_errors(capsys):
    """`play` with an unknown scenario exits 2 and reports it on stderr."""
    argv = ["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "Unknown scenario" in captured.err
def _write_fake_trace(tmp_path):
    """Record a short fake-provider run into `tmp_path / "r.jsonl"` and return that path.

    The run uses the `template` scenario, model `fake:x`, seed 42, four play
    turns and `--no-probe`. Output printed by the run is left in the capture
    buffer; callers that inspect stdout should drain it first.
    """
    out = tmp_path / "r.jsonl"
    rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Fail fast with a clear message: without the trace file every caller
    # would otherwise fail later inside its own replay assertions.
    assert out.exists(), f"fixture run did not write {out} (rc={rc})"
    return out
def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain `replay` (no flags) still exits 0 and prints `turn 1`."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard output produced while recording

    exit_code = main(["replay", str(trace_path)])
    captured = capsys.readouterr()
    assert exit_code == 0
    # Legacy text behavior preserved.
    assert "turn 1" in captured.out
def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """`replay --visual --fps 0` exits 0 and emits a 24-bit colour escape."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard output produced while recording

    argv = ["replay", str(trace_path), "--visual", "--fps", "0"]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 0
    # Truecolor foreground escape prefix must be present.
    assert "\033[38;2;" in captured.out
def test_replay_png_writes_frames(tmp_path, capsys):
    """`replay --png DIR` exits 0, writes non-empty `frame_*.png` files, and mentions PNG.

    Skipped when matplotlib is not installed.
    """
    import pytest

    pytest.importorskip("matplotlib")
    out = _write_fake_trace(tmp_path)
    # Drain the recording run's output so the "PNG" check below only sees
    # what `replay` printed (matches the other replay tests).
    capsys.readouterr()

    pdir = tmp_path / "png"
    rc = main(["replay", str(out), "--png", str(pdir)])
    assert rc == 0

    frames = list(pdir.glob("frame_*.png"))
    assert frames
    assert all(p.stat().st_size > 0 for p in frames)
    assert "PNG" in capsys.readouterr().out
def test_play_handles_stdin_eof(monkeypatch, capsys):
    """`play` exits 2 and mentions stdin on stderr when input hits EOF."""
    # Simulate stdin being closed/exhausted before the session finishes:
    # the CLI should return rc=2 cleanly instead of raising.
    def _raise_eof(*args, **kwargs):
        raise EOFError

    monkeypatch.setattr("builtins.input", _raise_eof)

    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
    ]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "stdin" in captured.err.lower()
def test_compare_aggregates_traces(tmp_path, capsys):
    """`compare` over two runs in one file reports model, difficulty, n=2 and a metric."""
    runs_path = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo"),
    # both written to the same output file.
    for seed in (1, 2):
        argv = [
            "run",
            "--scenario", "template",
            "--model", "fake:demo",
            "--seed", str(seed),
            "--play-turns", "4",
            "--no-probe",
            "--out", str(runs_path),
        ]
        main(argv)
    capsys.readouterr()  # discard output produced while recording

    exit_code = main(["compare", str(runs_path)])
    report = capsys.readouterr().out
    assert exit_code == 0
    assert "demo" in report
    assert "easy" in report
    assert "n=2" in report
    assert "motive_reading_accuracy" in report
def test_compare_writes_summary_json(tmp_path):
    """`compare --out FILE` writes a JSON summary keyed by "model|difficulty"."""
    import json

    runs_path = tmp_path / "runs.jsonl"
    run_argv = [
        "run",
        "--scenario", "template",
        "--model", "fake:demo",
        "--seed", "1",
        "--play-turns", "4",
        "--no-probe",
        "--out", str(runs_path),
    ]
    main(run_argv)

    summary_path = tmp_path / "summary.json"
    exit_code = main(["compare", str(runs_path), "--out", str(summary_path)])
    assert exit_code == 0

    summary = json.loads(summary_path.read_text())
    assert summary  # non-empty aggregate
    # Pin the documented "model|difficulty" key format and the nested shape.
    assert "demo|easy" in summary
    entry = summary["demo|easy"]
    assert entry["n"] == 1
    assert "motive_reading_accuracy" in entry["metrics"]
def test_compare_missing_file_errors(capsys):
    """`compare` on a path that does not exist exits 2 and says "not found" on stderr."""
    exit_code = main(["compare", "/no/such/file.jsonl"])
    captured = capsys.readouterr()
    assert exit_code == 2
    assert "not found" in captured.err