File size: 6,154 Bytes
c318527
426093b
c318527
 
93cd78f
c318527
 
 
93cd78f
c318527
 
 
 
 
 
93cd78f
c318527
 
 
 
 
 
 
 
 
 
93cd78f
c318527
 
93cd78f
c318527
 
 
 
 
93cd78f
c318527
 
 
 
 
 
93cd78f
c318527
 
 
 
 
93cd78f
c318527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f6f600
 
 
 
 
 
 
 
 
93cd78f
3f6f600
 
 
 
 
 
 
 
93cd78f
3f6f600
93cd78f
3f6f600
 
 
 
 
 
ff9d5a9
 
 
 
 
93cd78f
ff9d5a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c817950
 
 
 
 
 
 
 
 
93cd78f
c817950
 
 
c8beea5
 
 
 
 
 
 
93cd78f
c8beea5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93cd78f
c8beea5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
from proteus.cli import main
from proteus.game.runtime import read_traces


def test_list_scenarios_prints_template(capsys):
    """`list-scenarios` exits with 0 and mentions "template" on stdout."""
    exit_code = main(["list-scenarios"])
    captured = capsys.readouterr()
    assert exit_code == 0
    assert "template" in captured.out


def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """A `run` with a `fake:` model writes one trace that `read_traces` loads back.

    The output path sits in a `runs/` sub-directory of `tmp_path` that this
    test does not create itself.
    """
    trace_path = tmp_path / "runs" / "smoke.jsonl"
    argv = ["run"]
    argv += ["--scenario", "template"]
    argv += ["--model", "fake:cli-test"]
    argv += ["--seed", "42"]
    argv += ["--play-turns", "5"]
    argv += ["--no-probe"]
    argv += ["--out", str(trace_path)]

    exit_code = main(argv)
    assert exit_code == 0
    assert trace_path.exists()

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    trace = loaded[0]
    assert trace.scenario == "template"
    # The "fake:" prefix is not part of the recorded model id.
    assert trace.model == "cli-test"
    # Whatever `run` printed to stdout must mention the scenario name.
    assert "template" in capsys.readouterr().out


def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """`replay` of a freshly written trace exits 0 and prints scenario and turns.

    A trace is first produced with a `run` invocation; its exit code is
    asserted so that a broken setup is reported as such instead of surfacing
    as a confusing `replay` failure further down.
    """
    out = tmp_path / "r.jsonl"
    setup_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "5", "--no-probe", "--out", str(out),
    ])
    assert setup_rc == 0, f"setup run failed with exit code {setup_rc}"
    capsys.readouterr()  # drain the run's output so only replay text is checked
    rc = main(["replay", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    assert "template" in text
    assert "turn 1" in text


def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """A `bogus:` model spec makes `run` exit with 2 and complain on stderr."""
    argv = [
        "run",
        "--scenario", "template",
        "--model", "bogus:x",
        "--seed", "1",
        "--out", str(tmp_path / "x.jsonl"),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "Unknown provider" in captured.err


def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """An unknown scenario name makes `run` exit with 2 and write no trace."""
    trace_path = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "no_such_scenario",
        "--model", "fake:x",
        "--seed", "1",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "Unknown scenario" in captured.err
    # The failed invocation must leave no trace file behind.
    assert not trace_path.exists()


def test_replay_missing_file_returns_nonzero(capsys):
    """`replay` on a path that does not exist exits with 2 and says "not found"."""
    missing_path = "/no/such/trace.jsonl"
    exit_code = main(["replay", missing_path])
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "not found" in captured.err


def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """`play` with scripted keyboard input writes one trace whose model is "human"."""
    # Script the human's moves by patching builtins.input; the original author
    # notes that HumanAgent resolves `input` lazily, so the patch takes effect.
    scripted_moves = iter(["up" for _ in range(20)])

    def _scripted_input(*args, **kwargs):
        return next(scripted_moves)

    monkeypatch.setattr("builtins.input", _scripted_input)

    trace_path = tmp_path / "runs" / "human.jsonl"
    argv = ["play"]
    argv += ["--scenario", "template"]
    argv += ["--seed", "42"]
    argv += ["--play-turns", "5"]
    argv += ["--out", str(trace_path)]

    exit_code = main(argv)
    assert exit_code == 0

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    trace = loaded[0]
    assert trace.model == "human"
    assert trace.scenario == "template"
    # stdout must mention the scenario that was played.
    assert "template" in capsys.readouterr().out


def test_play_unknown_scenario_errors(capsys):
    """`play` with an unknown scenario name exits with 2 and reports it on stderr."""
    argv = ["play"]
    argv += ["--scenario", "nope"]
    argv += ["--seed", "1"]
    argv += ["--play-turns", "1"]
    exit_code = main(argv)
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "Unknown scenario" in captured.err


def _write_fake_trace(tmp_path):
    """Produce a trace file with a `fake:` model run and return its path.

    Runs the "template" scenario with seed 42, four play turns and
    ``--no-probe``, writing to ``tmp_path / "r.jsonl"``.

    Args:
        tmp_path: directory (pytest's ``tmp_path`` fixture) to write into.

    Returns:
        The path of the written trace file.

    Raises:
        AssertionError: if the setup run exits non-zero or leaves no file, so
            callers fail at the real cause instead of in a later `replay`.
    """
    out = tmp_path / "r.jsonl"
    rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    assert rc == 0, f"setup run failed with exit code {rc}"
    assert out.exists(), f"setup run did not write {out}"
    return out


def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain `replay` (no extra flags) still prints the "turn 1" text line."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard what the setup run printed
    exit_code = main(["replay", str(trace_path)])
    captured = capsys.readouterr()
    assert exit_code == 0
    # Legacy text output is expected to stay as it was.
    assert "turn 1" in captured.out


def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """`replay --visual --fps 0` exits 0 and emits a 24-bit colour escape."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # discard what the setup run printed
    argv = ["replay", str(trace_path)]
    argv += ["--visual"]
    argv += ["--fps", "0"]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 0
    # "ESC[38;2;" is the prefix of a truecolor foreground sequence.
    assert "\033[38;2;" in captured.out


def test_replay_png_writes_frames(tmp_path, capsys):
    """`replay --png DIR` writes non-empty `frame_*.png` files and mentions "PNG".

    Skipped when matplotlib cannot be imported.
    """
    import pytest

    pytest.importorskip("matplotlib")
    trace_path = _write_fake_trace(tmp_path)
    png_dir = tmp_path / "png"
    exit_code = main(["replay", str(trace_path), "--png", str(png_dir)])
    assert exit_code == 0

    frames = [*png_dir.glob("frame_*.png")]
    assert frames
    # Every frame that was written must contain data.
    for frame in frames:
        assert frame.stat().st_size > 0
    assert "PNG" in capsys.readouterr().out


def test_play_handles_stdin_eof(monkeypatch, capsys):
    """When `input` raises EOFError, `play` exits with 2 and names stdin on stderr."""

    # Simulate stdin being closed or exhausted before the session ends.
    def _closed_stdin(*args, **kwargs):
        raise EOFError

    monkeypatch.setattr("builtins.input", _closed_stdin)
    argv = ["play"]
    argv += ["--scenario", "template"]
    argv += ["--seed", "42"]
    argv += ["--play-turns", "5"]
    exit_code = main(argv)
    assert exit_code == 2
    stderr_text = capsys.readouterr().err
    assert "stdin" in stderr_text.lower()


def test_compare_aggregates_traces(tmp_path, capsys):
    """`compare` over a file holding two traces reports one aggregated group.

    Two `run` invocations (seeds 1 and 2, model id "demo") target the same
    output file. Each setup exit code is asserted so that a failing run is
    reported directly rather than as a wrong `n=` count later on.
    """
    out = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo").
    for seed in (1, 2):
        setup_rc = main([
            "run", "--scenario", "template", "--model", "fake:demo",
            "--seed", str(seed), "--play-turns", "4", "--no-probe", "--out", str(out),
        ])
        assert setup_rc == 0, f"setup run (seed {seed}) failed with exit code {setup_rc}"
    capsys.readouterr()  # drain the runs' output so only compare text is checked
    rc = main(["compare", str(out)])
    text = capsys.readouterr().out
    assert rc == 0
    # Checked separately so a failure names the missing substring.
    assert "demo" in text
    assert "easy" in text
    assert "n=2" in text
    assert "motive_reading_accuracy" in text


def test_compare_writes_summary_json(tmp_path):
    """`compare --out FILE` writes a JSON summary keyed by "model|difficulty".

    The setup run's exit code is asserted so a broken run is reported as such.
    The summary is decoded explicitly as UTF-8 so the test does not depend on
    the platform's default locale encoding.
    """
    import json

    out = tmp_path / "runs.jsonl"
    setup_rc = main([
        "run", "--scenario", "template", "--model", "fake:demo",
        "--seed", "1", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    assert setup_rc == 0, f"setup run failed with exit code {setup_rc}"
    summary = tmp_path / "summary.json"
    rc = main(["compare", str(out), "--out", str(summary)])
    assert rc == 0
    data = json.loads(summary.read_text(encoding="utf-8"))
    assert data  # non-empty aggregate
    # Pin the documented "model|difficulty" key format + nested shape.
    assert "demo|easy" in data
    assert data["demo|easy"]["n"] == 1
    assert "motive_reading_accuracy" in data["demo|easy"]["metrics"]


def test_compare_missing_file_errors(capsys):
    """`compare` on a path that does not exist exits with 2 and says "not found"."""
    missing_path = "/no/such/file.jsonl"
    exit_code = main(["compare", missing_path])
    assert exit_code == 2
    captured = capsys.readouterr()
    assert "not found" in captured.err