# OpenRA-Bench / tests/test_evaluate.py
# Commit c07d9e8: Revert "Fix CI: import scoring from evaluate_runner
# instead of openra_rl_util"
"""Tests for the evaluation harness."""
import csv
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
# Add parent directory to path for direct import
sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluate import (
RESULTS_COLUMNS,
append_results,
get_agent_fn,
parse_args,
)
class TestParseArgs:
    """Test argument parsing."""

    @staticmethod
    def _parse(argv):
        # Run parse_args() against a fake command line; keeps each test
        # free of the patch("sys.argv", ...) boilerplate.
        with patch("sys.argv", ["evaluate.py", *argv]):
            return parse_args()

    def test_minimal_args(self):
        parsed = self._parse(["--agent-name", "TestBot"])
        assert parsed.agent_name == "TestBot"
        # Everything else should fall back to its documented default.
        assert parsed.agent == "scripted"
        assert parsed.agent_type == "Scripted"
        assert parsed.opponent == "Normal"
        assert parsed.games == 10

    def test_all_args(self):
        parsed = self._parse([
            "--agent", "llm",
            "--agent-name", "MyLLM",
            "--agent-type", "LLM",
            "--opponent", "Hard",
            "--games", "5",
            "--server", "http://example.com:8000",
            "--max-steps", "3000",
            "--dry-run",
        ])
        assert parsed.agent == "llm"
        assert parsed.agent_name == "MyLLM"
        assert parsed.agent_type == "LLM"
        assert parsed.opponent == "Hard"
        assert parsed.games == 5
        assert parsed.server == "http://example.com:8000"
        assert parsed.max_steps == 3000
        assert parsed.dry_run is True

    def test_auto_detect_agent_type(self):
        # Each agent flavor maps to a default agent_type when none is given.
        expectations = {
            "scripted": "Scripted",
            "llm": "LLM",
            "mcp": "Scripted",
            "custom": "RL",
        }
        for agent, expected_type in expectations.items():
            parsed = self._parse(["--agent", agent, "--agent-name", "T"])
            assert parsed.agent_type == expected_type, f"{agent} -> {expected_type}"

    def test_explicit_type_overrides_auto(self):
        parsed = self._parse([
            "--agent", "scripted", "--agent-name", "T", "--agent-type", "RL",
        ])
        assert parsed.agent_type == "RL"

    def test_beginner_opponent_accepted(self):
        parsed = self._parse(["--agent-name", "T", "--opponent", "Beginner"])
        assert parsed.opponent == "Beginner"

    def test_medium_opponent_accepted(self):
        parsed = self._parse(["--agent-name", "T", "--opponent", "Medium"])
        assert parsed.opponent == "Medium"
class TestGetAgentFn:
    """Test agent factory."""

    def test_scripted_returns_callable(self):
        # The factory must hand back something invocable for scripted agents.
        agent_fn = get_agent_fn("scripted")
        assert callable(agent_fn)

    def test_llm_returns_callable(self):
        # Same contract for the LLM-backed agent.
        agent_fn = get_agent_fn("llm")
        assert callable(agent_fn)
class TestAppendResults:
    """Test CSV output.

    Fix: temp-file cleanup previously ran only on the success path, so a
    failing assertion leaked the temporary CSV on disk. Cleanup now happens
    in ``finally`` via ``unlink(missing_ok=True)``.
    """

    def test_creates_new_file(self):
        # Grab a unique path, then remove the file so append_results must
        # create it (header row included) from scratch.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        path.unlink()  # ensure it doesn't exist
        try:
            results = {col: "" for col in RESULTS_COLUMNS}
            results["agent_name"] = "TestBot"
            results["games"] = 5
            results["score"] = 85.0
            append_results(results, path)
            assert path.exists()
            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 1
            assert rows[0]["agent_name"] == "TestBot"
        finally:
            # Clean up even when an assertion fails so temp files don't leak.
            path.unlink(missing_ok=True)

    def test_appends_to_existing(self):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        try:
            # Write first result
            results1 = {col: "" for col in RESULTS_COLUMNS}
            results1["agent_name"] = "Bot1"
            append_results(results1, path)
            # Write second result
            results2 = {col: "" for col in RESULTS_COLUMNS}
            results2["agent_name"] = "Bot2"
            append_results(results2, path)
            with open(path) as f:
                rows = list(csv.DictReader(f))
            # Two data rows, one header: appending must not rewrite the file.
            assert len(rows) == 2
            assert rows[0]["agent_name"] == "Bot1"
            assert rows[1]["agent_name"] == "Bot2"
        finally:
            # Clean up even when an assertion fails so temp files don't leak.
            path.unlink(missing_ok=True)

    def test_columns_match_expected(self):
        # Pin the schema: key columns present and total count unchanged.
        assert "agent_name" in RESULTS_COLUMNS
        assert "score" in RESULTS_COLUMNS
        assert "win_rate" in RESULTS_COLUMNS
        assert "replay_url" in RESULTS_COLUMNS
        assert len(RESULTS_COLUMNS) == 13
class TestScoringUsesUtil:
    """Verify scoring uses the single source of truth from openra-rl-util."""

    def test_rubrics_re_exports_util(self):
        """rubrics.py should re-export from openra_rl_util."""
        import rubrics
        import openra_rl_util.rubrics as util_rubrics
        # Identity (not mere equality): the re-export must be the same object.
        assert (
            rubrics.compute_composite_score_from_games
            is util_rubrics.compute_composite_score_from_games
        )

    def test_evaluate_uses_util_scoring(self):
        """evaluate.py should not have its own compute_composite_score."""
        import evaluate
        assert not hasattr(evaluate, "compute_composite_score"), \
            "evaluate.py should use compute_composite_score_from_games from Util"

    def test_compute_game_metrics_re_exported(self):
        # Same identity check for the per-game metrics helper.
        import rubrics
        import openra_rl_util.rubrics as util_rubrics
        assert rubrics.compute_game_metrics is util_rubrics.compute_game_metrics