"""Tests for the evaluation harness.""" import csv import sys import tempfile from pathlib import Path from unittest.mock import patch import pytest # Add parent directory to path for direct import sys.path.insert(0, str(Path(__file__).parent.parent)) from evaluate import ( RESULTS_COLUMNS, append_results, get_agent_fn, parse_args, ) class TestParseArgs: """Test argument parsing.""" def test_minimal_args(self): with patch("sys.argv", ["evaluate.py", "--agent-name", "TestBot"]): args = parse_args() assert args.agent_name == "TestBot" assert args.agent == "scripted" assert args.agent_type == "Scripted" assert args.opponent == "Normal" assert args.games == 10 def test_all_args(self): with patch("sys.argv", [ "evaluate.py", "--agent", "llm", "--agent-name", "MyLLM", "--agent-type", "LLM", "--opponent", "Hard", "--games", "5", "--server", "http://example.com:8000", "--max-steps", "3000", "--dry-run", ]): args = parse_args() assert args.agent == "llm" assert args.agent_name == "MyLLM" assert args.agent_type == "LLM" assert args.opponent == "Hard" assert args.games == 5 assert args.server == "http://example.com:8000" assert args.max_steps == 3000 assert args.dry_run is True def test_auto_detect_agent_type(self): for agent, expected_type in [ ("scripted", "Scripted"), ("llm", "LLM"), ("mcp", "Scripted"), ("custom", "RL"), ]: with patch("sys.argv", ["evaluate.py", "--agent", agent, "--agent-name", "T"]): args = parse_args() assert args.agent_type == expected_type, f"{agent} -> {expected_type}" def test_explicit_type_overrides_auto(self): with patch("sys.argv", [ "evaluate.py", "--agent", "scripted", "--agent-name", "T", "--agent-type", "RL", ]): args = parse_args() assert args.agent_type == "RL" def test_beginner_opponent_accepted(self): with patch("sys.argv", [ "evaluate.py", "--agent-name", "T", "--opponent", "Beginner", ]): args = parse_args() assert args.opponent == "Beginner" def test_medium_opponent_accepted(self): with patch("sys.argv", [ "evaluate.py", "--agent-name", "T", "--opponent", "Medium", ]): args = parse_args() assert args.opponent == "Medium" class TestGetAgentFn: """Test agent factory.""" def test_scripted_returns_callable(self): fn = get_agent_fn("scripted") assert callable(fn) def test_llm_returns_callable(self): fn = get_agent_fn("llm") assert callable(fn) class TestAppendResults: """Test CSV output.""" def test_creates_new_file(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: path = Path(f.name) path.unlink() # ensure it doesn't exist results = {col: "" for col in RESULTS_COLUMNS} results["agent_name"] = "TestBot" results["games"] = 5 results["score"] = 85.0 append_results(results, path) assert path.exists() with open(path) as f: reader = csv.DictReader(f) rows = list(reader) assert len(rows) == 1 assert rows[0]["agent_name"] == "TestBot" path.unlink() def test_appends_to_existing(self): with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: path = Path(f.name) # Write first result results1 = {col: "" for col in RESULTS_COLUMNS} results1["agent_name"] = "Bot1" append_results(results1, path) # Write second result results2 = {col: "" for col in RESULTS_COLUMNS} results2["agent_name"] = "Bot2" append_results(results2, path) with open(path) as f: reader = csv.DictReader(f) rows = list(reader) assert len(rows) == 2 assert rows[0]["agent_name"] == "Bot1" assert rows[1]["agent_name"] == "Bot2" path.unlink() def test_columns_match_expected(self): assert "agent_name" in RESULTS_COLUMNS assert "score" in RESULTS_COLUMNS assert "win_rate" in RESULTS_COLUMNS assert "replay_url" in RESULTS_COLUMNS assert len(RESULTS_COLUMNS) == 13 class TestScoringUsesUtil: """Verify scoring uses the single source of truth from openra-rl-util.""" def test_rubrics_re_exports_util(self): """rubrics.py should re-export from openra_rl_util.""" from rubrics import compute_composite_score_from_games from openra_rl_util.rubrics import ( compute_composite_score_from_games as util_fn, ) assert compute_composite_score_from_games is util_fn def test_evaluate_uses_util_scoring(self): """evaluate.py should not have its own compute_composite_score.""" import evaluate assert not hasattr(evaluate, "compute_composite_score"), \ "evaluate.py should use compute_composite_score_from_games from Util" def test_compute_game_metrics_re_exported(self): from rubrics import compute_game_metrics from openra_rl_util.rubrics import compute_game_metrics as util_fn assert compute_game_metrics is util_fn