# Spaces: irregular6612 / AgentnessBench (Sleeping)
# File size: 6,154 Bytes

from proteus.cli import main
from proteus.game.runtime import read_traces


def test_list_scenarios_prints_template(capsys):
    """`list-scenarios` exits with 0 and mentions `template` on stdout."""
    exit_code = main(["list-scenarios"])
    captured = capsys.readouterr()
    assert exit_code == 0
    assert "template" in captured.out


def test_run_with_fake_provider_writes_reloadable_trace(tmp_path, capsys):
    """A `run` with the `fake:` provider writes one trace that `read_traces` loads."""
    trace_path = tmp_path / "runs" / "smoke.jsonl"
    argv = [
        "run", "--scenario", "template", "--model", "fake:cli-test",
        "--seed", "42", "--play-turns", "5", "--no-probe",
        "--out", str(trace_path),
    ]

    exit_code = main(argv)

    assert exit_code == 0
    assert trace_path.exists()

    # The written file must round-trip through the trace reader.
    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    first = loaded[0]
    assert first.scenario == "template"
    assert first.model == "cli-test"

    # The run line summarizes the outcome.
    stdout = capsys.readouterr().out
    assert "template" in stdout


def test_replay_prints_turns_and_outcome(tmp_path, capsys):
    """`replay` on a freshly recorded trace prints the scenario name and turn lines.

    A trace is first recorded with the `run` subcommand; its captured output is
    discarded so the assertions only see what `replay` printed.
    """
    out = tmp_path / "r.jsonl"
    setup_rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "5", "--no-probe", "--out", str(out),
    ])
    # Fail here, not in the replay assertions, if the fixture trace was never written.
    assert setup_rc == 0, "setup `run` failed; no trace available to replay"
    capsys.readouterr()  # drain the output produced by the setup run

    rc = main(["replay", str(out)])
    text = capsys.readouterr().out

    assert rc == 0
    assert "template" in text
    assert "turn 1" in text


def test_run_unknown_provider_returns_nonzero(tmp_path, capsys):
    """`run` with an unrecognised provider prefix exits with 2 and reports it on stderr."""
    trace_path = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "template",
        "--model", "bogus:x",
        "--seed", "1",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown provider" in stderr


def test_run_unknown_scenario_returns_nonzero(tmp_path, capsys):
    """`run` with an unknown scenario exits with 2, reports it, and writes no trace."""
    trace_path = tmp_path / "x.jsonl"
    argv = [
        "run",
        "--scenario", "no_such_scenario",
        "--model", "fake:x",
        "--seed", "1",
        "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown scenario" in stderr
    # A bad scenario must not have written a trace file.
    assert not trace_path.exists()


def test_replay_missing_file_returns_nonzero(capsys):
    """`replay` on a path that does not exist exits with 2 and says "not found"."""
    missing_path = "/no/such/trace.jsonl"
    exit_code = main(["replay", missing_path])
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "not found" in stderr


def test_play_human_writes_comparable_trace(tmp_path, monkeypatch, capsys):
    """`play` driven by scripted input writes a single trace tagged with model "human"."""
    # Feed scripted moves through builtins.input (HumanAgent resolves it lazily).
    scripted_moves = iter(["up"] * 20)

    def _scripted_input(*_args, **_kwargs):
        return next(scripted_moves)

    monkeypatch.setattr("builtins.input", _scripted_input)

    trace_path = tmp_path / "runs" / "human.jsonl"
    argv = [
        "play", "--scenario", "template", "--seed", "42",
        "--play-turns", "5", "--out", str(trace_path),
    ]
    exit_code = main(argv)
    assert exit_code == 0

    loaded = read_traces(trace_path)
    assert len(loaded) == 1
    first = loaded[0]
    assert first.model == "human"
    assert first.scenario == "template"

    # The run summary names the scenario.
    stdout = capsys.readouterr().out
    assert "template" in stdout


def test_play_unknown_scenario_errors(capsys):
    """`play` with an unknown scenario exits with 2 and reports it on stderr."""
    argv = ["play", "--scenario", "nope", "--seed", "1", "--play-turns", "1"]
    exit_code = main(argv)
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Unknown scenario" in stderr


def _write_fake_trace(tmp_path):
    """Record a short `template` trace with the `fake:` provider and return its path.

    Args:
        tmp_path: Directory in which ``r.jsonl`` is written.

    Returns:
        Path of the trace file passed to ``run --out``.

    Raises:
        AssertionError: If the `run` subcommand does not exit with 0, so that
            dependent tests fail at the setup step rather than on a missing
            or incomplete trace file.
    """
    out = tmp_path / "r.jsonl"
    rc = main([
        "run", "--scenario", "template", "--model", "fake:x",
        "--seed", "42", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    assert rc == 0, f"setup `run` exited with {rc}; fixture trace not written"
    return out


def test_replay_text_mode_unchanged(tmp_path, capsys):
    """Plain `replay` (no extra flags) still prints per-turn text lines."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()  # drain
    exit_code = main(["replay", str(trace_path)])
    captured = capsys.readouterr()
    assert exit_code == 0
    # legacy text behavior preserved
    assert "turn 1" in captured.out


def test_replay_visual_emits_truecolor(tmp_path, capsys):
    """`replay --visual` writes 24-bit colour escape sequences to stdout."""
    trace_path = _write_fake_trace(tmp_path)
    capsys.readouterr()
    argv = ["replay", str(trace_path), "--visual", "--fps", "0"]
    exit_code = main(argv)
    captured = capsys.readouterr()
    assert exit_code == 0
    # truecolor escape present
    assert "\033[38;2;" in captured.out


def test_replay_png_writes_frames(tmp_path, capsys):
    """`replay --png DIR` writes non-empty ``frame_*.png`` files and mentions PNG.

    Skipped when matplotlib cannot be imported.
    """
    import pytest

    pytest.importorskip("matplotlib")

    trace_path = _write_fake_trace(tmp_path)
    png_dir = tmp_path / "png"
    exit_code = main(["replay", str(trace_path), "--png", str(png_dir)])
    assert exit_code == 0

    frame_files = list(png_dir.glob("frame_*.png"))
    assert frame_files
    assert all(frame.stat().st_size > 0 for frame in frame_files)

    stdout = capsys.readouterr().out
    assert "PNG" in stdout


def test_play_handles_stdin_eof(monkeypatch, capsys):
    """`play` exits with 2 and mentions stdin when `input` raises EOFError."""
    # stdin closed/exhausted before the session finishes -> clean rc=2, no traceback.
    def _raise_eof(*_args, **_kwargs):
        raise EOFError

    monkeypatch.setattr("builtins.input", _raise_eof)

    argv = [
        "play",
        "--scenario", "template",
        "--seed", "42",
        "--play-turns", "5",
    ]
    exit_code = main(argv)
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "stdin" in stderr.lower()


def test_compare_aggregates_traces(tmp_path, capsys):
    """`compare` aggregates two traces from one file into a single n=2 summary.

    Both setup runs target the same ``--out`` file; the test then checks that
    the printed comparison names the model id, the difficulty, the sample
    count, and the ``motive_reading_accuracy`` metric.
    """
    out = tmp_path / "runs.jsonl"
    # Two fake-model traces at the same difficulty (model id "demo").
    for seed in (1, 2):
        setup_rc = main([
            "run", "--scenario", "template", "--model", "fake:demo",
            "--seed", str(seed), "--play-turns", "4", "--no-probe", "--out", str(out),
        ])
        # Fail here if a trace is missing, not later on the "n=2" check.
        assert setup_rc == 0, f"setup `run` with seed {seed} exited with {setup_rc}"
    capsys.readouterr()  # drain the output produced by the setup runs

    rc = main(["compare", str(out)])
    text = capsys.readouterr().out

    assert rc == 0
    # Separate assertions so a failure names the missing substring.
    assert "demo" in text
    assert "easy" in text
    assert "n=2" in text
    assert "motive_reading_accuracy" in text


def test_compare_writes_summary_json(tmp_path):
    """`compare --out FILE` writes a JSON summary keyed by "model|difficulty".

    One trace is recorded first; the summary must then contain a ``demo|easy``
    entry with ``n == 1`` and a ``metrics`` mapping that includes
    ``motive_reading_accuracy``.
    """
    import json

    out = tmp_path / "runs.jsonl"
    setup_rc = main([
        "run", "--scenario", "template", "--model", "fake:demo",
        "--seed", "1", "--play-turns", "4", "--no-probe", "--out", str(out),
    ])
    # Fail here, not in the summary assertions, if no trace was recorded.
    assert setup_rc == 0, f"setup `run` exited with {setup_rc}"

    summary = tmp_path / "summary.json"
    rc = main(["compare", str(out), "--out", str(summary)])
    assert rc == 0

    data = json.loads(summary.read_text())
    assert data  # non-empty aggregate
    # Pin the documented "model|difficulty" key format + nested shape.
    assert "demo|easy" in data
    assert data["demo|easy"]["n"] == 1
    assert "motive_reading_accuracy" in data["demo|easy"]["metrics"]


def test_compare_missing_file_errors(capsys):
    """`compare` on a path that does not exist exits with 2 and says "not found"."""
    missing_path = "/no/such/file.jsonl"
    exit_code = main(["compare", missing_path])
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "not found" in stderr