File size: 5,693 Bytes
2d0ff84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45ef63c
 
 
 
 
 
 
 
 
 
 
 
 
 
2d0ff84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c07d9e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""Tests for the evaluation harness."""

import csv
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch

import pytest

# Add parent directory to path for direct import
sys.path.insert(0, str(Path(__file__).parent.parent))

from evaluate import (
    RESULTS_COLUMNS,
    append_results,
    get_agent_fn,
    parse_args,
)


class TestParseArgs:
    """Exercise command-line argument parsing in evaluate.py."""

    def test_minimal_args(self):
        # Only the required flag; everything else should take its default.
        argv = ["evaluate.py", "--agent-name", "TestBot"]
        with patch("sys.argv", argv):
            args = parse_args()
        assert args.agent_name == "TestBot"
        assert args.agent == "scripted"
        assert args.agent_type == "Scripted"
        assert args.opponent == "Normal"
        assert args.games == 10

    def test_all_args(self):
        # Every supported flag set explicitly.
        argv = [
            "evaluate.py",
            "--agent", "llm",
            "--agent-name", "MyLLM",
            "--agent-type", "LLM",
            "--opponent", "Hard",
            "--games", "5",
            "--server", "http://example.com:8000",
            "--max-steps", "3000",
            "--dry-run",
        ]
        with patch("sys.argv", argv):
            args = parse_args()
        assert args.agent == "llm"
        assert args.agent_name == "MyLLM"
        assert args.agent_type == "LLM"
        assert args.opponent == "Hard"
        assert args.games == 5
        assert args.server == "http://example.com:8000"
        assert args.max_steps == 3000
        assert args.dry_run is True

    def test_auto_detect_agent_type(self):
        # When --agent-type is omitted, it is derived from --agent.
        cases = (
            ("scripted", "Scripted"),
            ("llm", "LLM"),
            ("mcp", "Scripted"),
            ("custom", "RL"),
        )
        for agent, expected_type in cases:
            argv = ["evaluate.py", "--agent", agent, "--agent-name", "T"]
            with patch("sys.argv", argv):
                parsed = parse_args()
            assert parsed.agent_type == expected_type, f"{agent} -> {expected_type}"

    def test_explicit_type_overrides_auto(self):
        # An explicit --agent-type must win over auto-detection.
        argv = [
            "evaluate.py", "--agent", "scripted",
            "--agent-name", "T", "--agent-type", "RL",
        ]
        with patch("sys.argv", argv):
            args = parse_args()
        assert args.agent_type == "RL"

    def test_beginner_opponent_accepted(self):
        argv = ["evaluate.py", "--agent-name", "T", "--opponent", "Beginner"]
        with patch("sys.argv", argv):
            args = parse_args()
        assert args.opponent == "Beginner"

    def test_medium_opponent_accepted(self):
        argv = ["evaluate.py", "--agent-name", "T", "--opponent", "Medium"]
        with patch("sys.argv", argv):
            args = parse_args()
        assert args.opponent == "Medium"


class TestGetAgentFn:
    """Exercise the agent factory."""

    def test_scripted_returns_callable(self):
        # The factory must hand back something invocable for the scripted agent.
        assert callable(get_agent_fn("scripted"))

    def test_llm_returns_callable(self):
        # Likewise for the LLM agent.
        assert callable(get_agent_fn("llm"))


class TestAppendResults:
    """Test CSV output written by append_results.

    Fixes over the previous version:
    - temp files are removed in ``finally`` blocks, so a failing assertion
      no longer leaks files on disk;
    - ``test_appends_to_existing`` now deletes the zero-byte file that
      NamedTemporaryFile creates before the first append (matching
      ``test_creates_new_file``), so the "new file + header" code path is
      exercised consistently in both tests.
    """

    @staticmethod
    def _tmp_csv_path() -> Path:
        """Return a fresh .csv path that does NOT exist on disk."""
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        # NamedTemporaryFile created an empty file; we only wanted the name.
        path.unlink()
        return path

    def test_creates_new_file(self):
        path = self._tmp_csv_path()
        try:
            results = {col: "" for col in RESULTS_COLUMNS}
            results["agent_name"] = "TestBot"
            results["games"] = 5
            results["score"] = 85.0

            append_results(results, path)

            assert path.exists()
            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 1
            assert rows[0]["agent_name"] == "TestBot"
        finally:
            # Clean up even if an assertion above fails.
            path.unlink(missing_ok=True)

    def test_appends_to_existing(self):
        path = self._tmp_csv_path()
        try:
            # First write creates the file (with header row).
            results1 = {col: "" for col in RESULTS_COLUMNS}
            results1["agent_name"] = "Bot1"
            append_results(results1, path)

            # Second write must append a data row, not a second header.
            results2 = {col: "" for col in RESULTS_COLUMNS}
            results2["agent_name"] = "Bot2"
            append_results(results2, path)

            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 2
            assert rows[0]["agent_name"] == "Bot1"
            assert rows[1]["agent_name"] == "Bot2"
        finally:
            path.unlink(missing_ok=True)

    def test_columns_match_expected(self):
        # Spot-check a few required columns and pin the total count.
        assert "agent_name" in RESULTS_COLUMNS
        assert "score" in RESULTS_COLUMNS
        assert "win_rate" in RESULTS_COLUMNS
        assert "replay_url" in RESULTS_COLUMNS
        assert len(RESULTS_COLUMNS) == 13


class TestScoringUsesUtil:
    """Verify scoring uses the single source of truth from openra-rl-util."""

    def test_rubrics_re_exports_util(self):
        """rubrics.py must re-export the util implementation, not copy it."""
        from openra_rl_util.rubrics import (
            compute_composite_score_from_games as util_fn,
        )
        from rubrics import compute_composite_score_from_games as local_fn

        # Identity, not equality: same function object, no duplicated logic.
        assert local_fn is util_fn

    def test_evaluate_uses_util_scoring(self):
        """evaluate.py must not define its own compute_composite_score."""
        import evaluate

        assert not hasattr(evaluate, "compute_composite_score"), (
            "evaluate.py should use compute_composite_score_from_games from Util"
        )

    def test_compute_game_metrics_re_exported(self):
        from openra_rl_util.rubrics import compute_game_metrics as util_fn
        from rubrics import compute_game_metrics as local_fn

        assert local_fn is util_fn