Spaces:
Sleeping
Sleeping
File size: 6,154 Bytes
c318527 426093b c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 93cd78f c318527 3f6f600 93cd78f 3f6f600 93cd78f 3f6f600 93cd78f 3f6f600 ff9d5a9 93cd78f ff9d5a9 c817950 93cd78f c817950 c8beea5 93cd78f c8beea5 93cd78f c8beea5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | from proteus.cli import main
from proteus.game.runtime import read_traces
def test_list_scenarios_prints_template(capsys):
    """`list-scenarios` exits 0 and mentions "template" on stdout."""
    exit_code = main(["list-scenarios"])
    captured = capsys.readouterr()
    assert exit_code == 0
    assert "template" in captured.out
def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """A `run` with model "fake:cli-test" writes one trace that `read_traces` loads back.

    Checks the exit code, that the output file exists, the scenario/model
    recorded on the single trace, and that stdout mentions the scenario.
    """
    trace_path = tmp_path / "runs" / "smoke.jsonl"
    argv = [
        "run",
        "--scenario", "template",
        "--model", "fake:cli-test",
        "--seed", "42",
        "--play-turns", "5",
        "--no-probe",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 0
    assert trace_path.exists()

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    only_trace = loaded[0]
    assert only_trace.scenario == "template"
    # The "fake:" prefix is not part of the recorded model id.
    assert only_trace.model == "cli-test"

    # The run's stdout summary names the scenario.
    stdout_text = capsys.readouterr().out
    assert "template" in stdout_text
def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """`replay` of a freshly recorded trace exits 0 and prints "template" and "turn 1".

    The trace is recorded first with a fake-model `run`; that setup step is
    checked too, so a broken recording is reported as such instead of as a
    misleading replay failure.
    """
    out = tmp_path / "r.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "5", "--no-probe", "--out", str(out),
    ])
    assert run_rc == 0, "setup `run` failed; no trace to replay"
    capsys.readouterr()  # drain the run's output so only replay output is inspected

    rc = main(["replay", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "template" in text
    assert "turn 1" in text
def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """A `run` with model "bogus:x" exits 2 and reports "Unknown provider" on stderr."""
    target = tmp_path / "x.jsonl"
    argv = [
        "run", "--scenario", "template", "--model", "bogus:x",
        "--seed", "1", "--out", str(target),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "Unknown provider" in stderr_text
def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """A `run` with an unregistered scenario exits 2, reports it on stderr, and writes nothing."""
    target = tmp_path / "x.jsonl"
    argv = [
        "run", "--scenario", "no_such_scenario", "--model", "fake:x",
        "--seed", "1", "--out", str(target),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "Unknown scenario" in stderr_text
    # Rejecting the scenario must not leave a trace file behind.
    assert not (tmp_path / "x.jsonl").exists()
def test_replay_missing_file_returns_nonzero(capsys):
    """`replay` on a nonexistent path exits 2 with "not found" on stderr."""
    missing_path = "/no/such/trace.jsonl"
    exit_code = main(["replay", missing_path])
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "not found" in stderr_text
def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """`play` driven by scripted input writes one trace whose model is "human".

    `builtins.input` is replaced so every prompt is answered with "up".
    """
    # Feed scripted moves through builtins.input (HumanAgent resolves it lazily).
    scripted_moves = iter(["up"] * 20)

    def _next_move(*_args, **_kwargs):
        return next(scripted_moves)

    monkeypatch.setattr("builtins.input", _next_move)

    trace_path = tmp_path / "runs" / "human.jsonl"
    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    human_trace = loaded[0]
    assert human_trace.model == "human"
    assert human_trace.scenario == "template"

    # The run summary names the scenario.
    stdout_text = capsys.readouterr().out
    assert "template" in stdout_text
def test_play_unknown_scenario_errors(capsys):
    """`play` with scenario "nope" exits 2 and reports "Unknown scenario" on stderr."""
    argv = ["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"]
    exit_code = main(argv)
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "Unknown scenario" in stderr_text
def _write_fake_trace(tmp_path):
    """Record a 4-turn fake-model trace of the "template" scenario and return its path.

    Args:
        tmp_path: Directory in which ``r.jsonl`` is written.

    Returns:
        Path of the written trace file (``tmp_path / "r.jsonl"``).

    Raises:
        AssertionError: If the `run` command does not exit with 0, so callers
            fail at the setup step rather than on a missing or partial trace.
    """
    out = tmp_path / "r.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Previously the exit code was discarded, which let a failed run surface
    # later as an unrelated-looking replay error.
    assert run_rc == 0, f"fixture `run` exited with {run_rc}"
    return out
def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain `replay` (no flags) still exits 0 and prints a "turn 1" line."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard output produced while recording the trace
    exit_code = main(["replay", str(trace_path)])
    replay_stdout = capsys.readouterr().out
    assert exit_code == 0
    # Legacy text behavior preserved.
    assert "turn 1" in replay_stdout
def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """`replay --visual --fps 0` exits 0 and writes a 24-bit colour escape to stdout."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard output produced while recording the trace
    argv = ["replay", str(trace_path), "--visual", "--fps", "0"]
    exit_code = main(argv)
    replay_stdout = capsys.readouterr().out
    assert exit_code == 0
    # "\033[38;2;" is the prefix of a truecolor foreground escape sequence.
    assert "\033[38;2;" in replay_stdout
def test_replay_png_writes_frames(tmp_path, capsys):
    """`replay --png DIR` exits 0, writes non-empty ``frame_*.png`` files, and prints "PNG".

    Skipped when matplotlib is not installed.
    """
    import pytest

    pytest.importorskip("matplotlib")
    out = _write_fake_trace(tmp_path)
    # Drain the recording step's output (as the other replay tests do) so the
    # "PNG" check below can only be satisfied by what `replay` itself printed.
    capsys.readouterr()
    pdir = tmp_path / "png"
    rc = main(["replay", str(out), "--png", str(pdir)])
    assert rc == 0
    frames = list(pdir.glob("frame_*.png"))
    assert frames
    assert all(p.stat().st_size > 0 for p in frames)
    assert "PNG" in capsys.readouterr().out
def test_play_handles_stdin_eof(monkeypatch, capsys):
    """`play` exits 2 and mentions stdin on stderr when `input()` raises EOFError."""

    def _raise_eof(*_args, **_kwargs):
        # Stand-in for stdin being closed/exhausted before the session finishes.
        raise EOFError

    monkeypatch.setattr("builtins.input", _raise_eof)
    argv = ["play", "--scenario", "template", "--seed", "42", "--play-turns", "5"]
    exit_code = main(argv)
    # A clean rc=2 is expected; an escaping exception would fail the test here.
    assert exit_code == 2
    stderr_lowered = capsys.readouterr().err.lower()
    assert "stdin" in stderr_lowered
def test_compare_aggregates_traces(tmp_path, capsys):
    """`compare` over a file holding two fake-model runs reports them as one group of n=2.

    Both setup runs target the same output file; the ``n=2`` expectation
    relies on them accumulating there. Each setup run's exit code is checked
    so a failed recording is reported at its source.
    """
    out = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo").
    for seed in (1, 2):
        run_rc = main([
            "run", "--scenario", "template", "--model", "fake:demo",
            "--seed", str(seed), "--play-turns", "4", "--no-probe", "--out", str(out),
        ])
        assert run_rc == 0, f"setup `run` failed for seed {seed}"
    capsys.readouterr()  # drain

    rc = main(["compare", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    # Separate assertions so a failure names the token that is missing.
    assert "demo" in text
    assert "easy" in text
    assert "n=2" in text
    assert "motive_reading_accuracy" in text
def test_compare_writes_summary_json(tmp_path):
    """`compare --out FILE` writes a JSON summary keyed by "model|difficulty".

    One fake-model run is recorded first (and verified to succeed), then the
    summary for it is checked for the ``"demo|easy"`` entry with ``n == 1``
    and a ``motive_reading_accuracy`` metric.
    """
    import json

    out = tmp_path / "runs.jsonl"
    run_rc = main([
        "run", "--scenario", "template", "--model", "fake:demo",
        "--seed", "1", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Surface a broken recording step here instead of as a JSON/key error below.
    assert run_rc == 0, "setup `run` failed; nothing to compare"

    summary = tmp_path / "summary.json"
    rc = main(["compare", str(out), "--out", str(summary)])
    assert rc == 0
    data = json.loads(summary.read_text())
    assert data  # non-empty aggregate
    # Pin the documented "model|difficulty" key format + nested shape.
    assert "demo|easy" in data
    assert data["demo|easy"]["n"] == 1
    assert "motive_reading_accuracy" in data["demo|easy"]["metrics"]
def test_compare_missing_file_errors(capsys):
    """`compare` on a nonexistent path exits 2 with "not found" on stderr."""
    missing_path = "/no/such/file.jsonl"
    exit_code = main(["compare", missing_path])
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "not found" in stderr_text
|