Spaces:
Running
Running
| """Tests for the evaluation harness.""" | |
| import csv | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from unittest.mock import patch | |
| import pytest | |
| # Add parent directory to path for direct import | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from evaluate import ( | |
| RESULTS_COLUMNS, | |
| append_results, | |
| get_agent_fn, | |
| parse_args, | |
| ) | |
class TestParseArgs:
    """Test argument parsing."""

    def test_minimal_args(self):
        # Only the required flag is given; everything else must fall back
        # to its documented default.
        argv = ["evaluate.py", "--agent-name", "TestBot"]
        with patch("sys.argv", argv):
            parsed = parse_args()
        assert parsed.agent_name == "TestBot"
        assert parsed.agent == "scripted"
        assert parsed.agent_type == "Scripted"
        assert parsed.opponent == "Normal"
        assert parsed.games == 10

    def test_all_args(self):
        # Every supported flag set explicitly round-trips into the namespace.
        argv = [
            "evaluate.py",
            "--agent", "llm",
            "--agent-name", "MyLLM",
            "--agent-type", "LLM",
            "--opponent", "Hard",
            "--games", "5",
            "--server", "http://example.com:8000",
            "--max-steps", "3000",
            "--dry-run",
        ]
        with patch("sys.argv", argv):
            parsed = parse_args()
        assert parsed.agent == "llm"
        assert parsed.agent_name == "MyLLM"
        assert parsed.agent_type == "LLM"
        assert parsed.opponent == "Hard"
        assert parsed.games == 5
        assert parsed.server == "http://example.com:8000"
        assert parsed.max_steps == 3000
        assert parsed.dry_run is True

    def test_auto_detect_agent_type(self):
        # When --agent-type is omitted, each agent kind maps to a
        # well-known default type.
        expected_types = {
            "scripted": "Scripted",
            "llm": "LLM",
            "mcp": "Scripted",
            "custom": "RL",
        }
        for agent_kind, want in expected_types.items():
            argv = ["evaluate.py", "--agent", agent_kind, "--agent-name", "T"]
            with patch("sys.argv", argv):
                parsed = parse_args()
            assert parsed.agent_type == want, f"{agent_kind} -> {want}"

    def test_explicit_type_overrides_auto(self):
        # An explicit --agent-type wins over the auto-detected value.
        argv = [
            "evaluate.py", "--agent", "scripted",
            "--agent-name", "T", "--agent-type", "RL",
        ]
        with patch("sys.argv", argv):
            parsed = parse_args()
        assert parsed.agent_type == "RL"

    def test_beginner_opponent_accepted(self):
        argv = [
            "evaluate.py", "--agent-name", "T", "--opponent", "Beginner",
        ]
        with patch("sys.argv", argv):
            parsed = parse_args()
        assert parsed.opponent == "Beginner"

    def test_medium_opponent_accepted(self):
        argv = [
            "evaluate.py", "--agent-name", "T", "--opponent", "Medium",
        ]
        with patch("sys.argv", argv):
            parsed = parse_args()
        assert parsed.opponent == "Medium"
class TestGetAgentFn:
    """Test agent factory."""

    def test_scripted_returns_callable(self):
        # The factory must hand back something invocable for the
        # scripted agent kind.
        agent_factory = get_agent_fn("scripted")
        assert callable(agent_factory)

    def test_llm_returns_callable(self):
        # Same contract for the LLM agent kind.
        agent_factory = get_agent_fn("llm")
        assert callable(agent_factory)
class TestAppendResults:
    """Test CSV output.

    Each test creates its own temporary CSV path and removes it in a
    ``finally`` block, so a failing assertion no longer leaks the file
    (the original unlink ran only on the success path).
    """

    def test_creates_new_file(self):
        # Reserve a unique path, then delete it so append_results must
        # create the file (and write the header row) itself.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        path.unlink()  # ensure it doesn't exist
        try:
            results = {col: "" for col in RESULTS_COLUMNS}
            results["agent_name"] = "TestBot"
            results["games"] = 5
            results["score"] = 85.0
            append_results(results, path)
            assert path.exists()
            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 1
            assert rows[0]["agent_name"] == "TestBot"
        finally:
            # missing_ok guards against the create-step itself failing.
            path.unlink(missing_ok=True)

    def test_appends_to_existing(self):
        # Two sequential calls must yield two data rows with a single header.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        try:
            # Write first result
            results1 = {col: "" for col in RESULTS_COLUMNS}
            results1["agent_name"] = "Bot1"
            append_results(results1, path)
            # Write second result
            results2 = {col: "" for col in RESULTS_COLUMNS}
            results2["agent_name"] = "Bot2"
            append_results(results2, path)
            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 2
            assert rows[0]["agent_name"] == "Bot1"
            assert rows[1]["agent_name"] == "Bot2"
        finally:
            path.unlink(missing_ok=True)

    def test_columns_match_expected(self):
        # Sanity-check the schema the CSV rows are keyed on.
        assert "agent_name" in RESULTS_COLUMNS
        assert "score" in RESULTS_COLUMNS
        assert "win_rate" in RESULTS_COLUMNS
        assert "replay_url" in RESULTS_COLUMNS
        assert len(RESULTS_COLUMNS) == 13
class TestScoringUsesUtil:
    """Verify scoring uses the single source of truth from openra-rl-util."""

    def test_rubrics_re_exports_util(self):
        """rubrics.py should re-export from openra_rl_util."""
        from openra_rl_util.rubrics import (
            compute_composite_score_from_games as canonical,
        )
        from rubrics import compute_composite_score_from_games as re_exported

        # Identity (not mere equality) proves it is the same object.
        assert re_exported is canonical

    def test_evaluate_uses_util_scoring(self):
        """evaluate.py should not have its own compute_composite_score."""
        import evaluate

        has_local_impl = hasattr(evaluate, "compute_composite_score")
        assert not has_local_impl, (
            "evaluate.py should use compute_composite_score_from_games from Util"
        )

    def test_compute_game_metrics_re_exported(self):
        from openra_rl_util.rubrics import compute_game_metrics as canonical
        from rubrics import compute_game_metrics as re_exported

        assert re_exported is canonical