# OpenRA-Bench / tests/test_evaluate.py
# Commit c07d9e8: Revert "Fix CI: import scoring from evaluate_runner
# instead of openra_rl_util"
"""Tests for the evaluation harness."""
import csv
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
# Add parent directory to path for direct import
sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluate import (
RESULTS_COLUMNS,
append_results,
get_agent_fn,
parse_args,
)
class TestParseArgs:
    """Test argument parsing."""

    @staticmethod
    def _parse(argv):
        # Run parse_args() against a fake command line; keeps each test
        # free of the patch("sys.argv", ...) boilerplate.
        with patch("sys.argv", ["evaluate.py", *argv]):
            return parse_args()

    def test_minimal_args(self):
        parsed = self._parse(["--agent-name", "TestBot"])
        assert parsed.agent_name == "TestBot"
        # Everything else should fall back to its documented default.
        assert parsed.agent == "scripted"
        assert parsed.agent_type == "Scripted"
        assert parsed.opponent == "Normal"
        assert parsed.games == 10

    def test_all_args(self):
        parsed = self._parse([
            "--agent", "llm",
            "--agent-name", "MyLLM",
            "--agent-type", "LLM",
            "--opponent", "Hard",
            "--games", "5",
            "--server", "http://example.com:8000",
            "--max-steps", "3000",
            "--dry-run",
        ])
        assert parsed.agent == "llm"
        assert parsed.agent_name == "MyLLM"
        assert parsed.agent_type == "LLM"
        assert parsed.opponent == "Hard"
        assert parsed.games == 5
        assert parsed.server == "http://example.com:8000"
        assert parsed.max_steps == 3000
        assert parsed.dry_run is True

    def test_auto_detect_agent_type(self):
        # Each agent flavor maps to a default agent_type when none is given.
        expectations = {
            "scripted": "Scripted",
            "llm": "LLM",
            "mcp": "Scripted",
            "custom": "RL",
        }
        for agent, expected_type in expectations.items():
            parsed = self._parse(["--agent", agent, "--agent-name", "T"])
            assert parsed.agent_type == expected_type, f"{agent} -> {expected_type}"

    def test_explicit_type_overrides_auto(self):
        parsed = self._parse([
            "--agent", "scripted", "--agent-name", "T", "--agent-type", "RL",
        ])
        assert parsed.agent_type == "RL"

    def test_beginner_opponent_accepted(self):
        parsed = self._parse(["--agent-name", "T", "--opponent", "Beginner"])
        assert parsed.opponent == "Beginner"

    def test_medium_opponent_accepted(self):
        parsed = self._parse(["--agent-name", "T", "--opponent", "Medium"])
        assert parsed.opponent == "Medium"
class TestGetAgentFn:
    """Test agent factory."""

    def test_scripted_returns_callable(self):
        # The factory must hand back something invocable for scripted agents.
        agent_fn = get_agent_fn("scripted")
        assert callable(agent_fn)

    def test_llm_returns_callable(self):
        # Same contract for the LLM-backed agent.
        agent_fn = get_agent_fn("llm")
        assert callable(agent_fn)
class TestAppendResults:
    """Test CSV output.

    Fix: temp-file cleanup previously ran only on the success path, so a
    failing assertion leaked the temporary CSV on disk. Cleanup now happens
    in ``finally`` via ``unlink(missing_ok=True)``.
    """

    def test_creates_new_file(self):
        # Grab a unique path, then remove the file so append_results must
        # create it (header row included) from scratch.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        path.unlink()  # ensure it doesn't exist
        try:
            results = {col: "" for col in RESULTS_COLUMNS}
            results["agent_name"] = "TestBot"
            results["games"] = 5
            results["score"] = 85.0
            append_results(results, path)
            assert path.exists()
            with open(path) as f:
                rows = list(csv.DictReader(f))
            assert len(rows) == 1
            assert rows[0]["agent_name"] == "TestBot"
        finally:
            # Clean up even when an assertion fails so temp files don't leak.
            path.unlink(missing_ok=True)

    def test_appends_to_existing(self):
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            path = Path(f.name)
        try:
            # Write first result
            results1 = {col: "" for col in RESULTS_COLUMNS}
            results1["agent_name"] = "Bot1"
            append_results(results1, path)
            # Write second result
            results2 = {col: "" for col in RESULTS_COLUMNS}
            results2["agent_name"] = "Bot2"
            append_results(results2, path)
            with open(path) as f:
                rows = list(csv.DictReader(f))
            # Two data rows, one header: appending must not rewrite the file.
            assert len(rows) == 2
            assert rows[0]["agent_name"] == "Bot1"
            assert rows[1]["agent_name"] == "Bot2"
        finally:
            # Clean up even when an assertion fails so temp files don't leak.
            path.unlink(missing_ok=True)

    def test_columns_match_expected(self):
        # Pin the schema: key columns present and total count unchanged.
        assert "agent_name" in RESULTS_COLUMNS
        assert "score" in RESULTS_COLUMNS
        assert "win_rate" in RESULTS_COLUMNS
        assert "replay_url" in RESULTS_COLUMNS
        assert len(RESULTS_COLUMNS) == 13
class TestScoringUsesUtil:
    """Verify scoring uses the single source of truth from openra-rl-util."""

    def test_rubrics_re_exports_util(self):
        """rubrics.py should re-export from openra_rl_util."""
        import rubrics
        import openra_rl_util.rubrics as util_rubrics
        # Identity (not mere equality): the re-export must be the same object.
        assert (
            rubrics.compute_composite_score_from_games
            is util_rubrics.compute_composite_score_from_games
        )

    def test_evaluate_uses_util_scoring(self):
        """evaluate.py should not have its own compute_composite_score."""
        import evaluate
        assert not hasattr(evaluate, "compute_composite_score"), \
            "evaluate.py should use compute_composite_score_from_games from Util"

    def test_compute_game_metrics_re_exported(self):
        # Same identity check for the per-game metrics helper.
        import rubrics
        import openra_rl_util.rubrics as util_rubrics
        assert rubrics.compute_game_metrics is util_rubrics.compute_game_metrics