Fix scoring dedup, add test suite, wire openra-rl-util
- Replace duplicate compute_composite_score() with single source of truth from openra-rl-util (compute_composite_score_from_games)
- Re-export rubrics from openra-rl-util instead of duplicating
- Add test suite: 25 tests covering evaluate.py and app.py
- Update requirements to include openra-rl-util and openra-rl deps

Files changed:
- evaluate.py +6 -29
- requirements.txt +2 -0
- rubrics.py +10 -148
- tests/__init__.py +0 -0
- tests/test_app.py +100 -0
- tests/test_evaluate.py +163 -0
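For context, a minimal sketch of the deduplicated scoring flow this commit wires up (assuming only the two helpers imported in the evaluate.py diff below; final_observations is a hypothetical list of terminal observations collected by run_game):

    from openra_rl_util.rubrics import compute_composite_score_from_games, compute_game_metrics

    # One metrics dict per finished game, extracted from the terminal observation
    game_results = [compute_game_metrics(obs) for obs in final_observations]

    # Single source of truth for the leaderboard composite score (0-100)
    score = round(compute_composite_score_from_games(game_results), 1)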
evaluate.py
CHANGED
@@ -29,6 +29,8 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List

+from openra_rl_util.rubrics import compute_composite_score_from_games, compute_game_metrics
+
 # Evaluation results file
 RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"

@@ -122,10 +124,8 @@ async def run_game(env: Any, agent_fn: Any, max_steps: int) -> Dict[str, Any]:
         max_steps: Maximum steps before timeout.

     Returns:
-        Dict with game metrics (from
+        Dict with game metrics (from compute_game_metrics).
     """
-    from rubrics import compute_game_metrics
-
     obs = await env.reset()
     steps = 0

@@ -143,10 +143,9 @@ def get_agent_fn(agent_type: str) -> Any:
     Returns a callable that takes an observation and returns an action.
     """
     if agent_type == "scripted":
-        # Import inline to avoid hard dependency
         from openra_env.models import OpenRAAction
         # Simple no-op agent for evaluation framework testing
-        #
+        # TODO: Wire to openra_env.agents.scripted.ScriptedBot when extracted
         return lambda obs: OpenRAAction(commands=[])
     else:
         from openra_env.models import OpenRAAction

@@ -168,7 +167,7 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
         result_str = metrics["result"] or "timeout"
         print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")

-    # Aggregate results
+    # Aggregate results using single source of truth from openra-rl-util
     wins = sum(1 for g in game_results if g["win"])
     total = len(game_results)

@@ -178,7 +177,7 @@
         "opponent": args.opponent,
         "games": total,
         "win_rate": round(100.0 * wins / max(total, 1), 1),
-        "score": round(compute_composite_score(game_results), 1),
+        "score": round(compute_composite_score_from_games(game_results), 1),
         "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
         "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
         "kd_ratio": round(

@@ -195,28 +194,6 @@
     }


-def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
-    """Compute the OpenRA-Bench composite score.
-
-    Score = 50% win_rate + 25% avg_kd_normalized + 25% avg_economy_normalized
-    """
-    total = len(game_results)
-    if total == 0:
-        return 0.0
-
-    win_rate = sum(1 for g in game_results if g["win"]) / total
-
-    # K/D ratio normalized: kd / (kd + 1) maps [0, inf) -> [0, 1)
-    avg_kd = sum(g["kd_ratio"] for g in game_results) / total
-    kd_norm = avg_kd / (avg_kd + 1)
-
-    # Economy normalized: assets / (assets + 10000)
-    avg_assets = sum(g["assets_value"] for g in game_results) / total
-    econ_norm = avg_assets / (avg_assets + 10000) if avg_assets >= 0 else 0.0
-
-    return 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)
-
-
 def append_results(results: Dict[str, Any], output_path: Path) -> None:
     """Append evaluation results to CSV file."""
     file_exists = output_path.exists() and output_path.stat().st_size > 0
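For reference, a worked example of the composite formula from the removed helper above (assuming the shared openra-rl-util implementation keeps the same 50/25/25 weights and normalizations):

    win_rate = 0.6                         # 6 wins in 10 games
    kd_norm = 2.0 / (2.0 + 1)              # average K/D of 2.0 -> ~0.667
    econ_norm = 10000 / (10000 + 10000)    # average assets of 10000 -> 0.5
    score = 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)  # ~59.2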
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
 gradio>=4.44.0
 pandas>=2.0.0
 openenv-core>=0.2.0
+openra-rl-util>=0.1.0
+openra-rl>=0.3.0
rubrics.py
CHANGED
@@ -1,152 +1,14 @@
-"""OpenRA-Bench rubrics
+"""OpenRA-Bench rubrics — re-exported from openra-rl-util.

-
-
-and economic performance.
-
-Usage:
-    rubric = OpenRABenchRubric()
-    rubric.reset()
-    for action, obs in episode:
-        reward = rubric(action, obs)  # 0.0 until done
-    step_rewards = rubric.win_loss.compute_step_rewards()
+All scoring logic lives in the shared utility library.
+This module re-exports for backward compatibility.
 """

-from
-
-
-
-
-
+from openra_rl_util.rubrics import (  # noqa: F401
+    EconomyRubric,
+    MilitaryEfficiencyRubric,
+    OpenRABenchRubric,
+    OpenRAWinLossRubric,
+    compute_composite_score_from_games,
+    compute_game_metrics,
 )
-
-
-class OpenRAWinLossRubric(ExponentialDiscountingTrajectoryRubric):
-    """Score game based on win/loss/draw outcome with temporal discounting.
-
-    Terminal rewards:
-    - Win: +1.0
-    - Loss: -1.0
-    - Draw: 0.0
-    """
-
-    def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
-        if not trajectory:
-            return 0.0
-        _, final_obs = trajectory[-1]
-        result = getattr(final_obs, "result", "")
-        if result == "win":
-            return 1.0
-        elif result == "lose":
-            return -1.0
-        return 0.0
-
-
-class MilitaryEfficiencyRubric(TrajectoryRubric):
-    """Score based on kill/death cost ratio from final observation.
-
-    Score = kills_cost / max(kills_cost + deaths_cost, 1)
-    Normalized to 0.0-1.0 range.
-    """
-
-    def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
-        if not trajectory:
-            return 0.0
-        _, final_obs = trajectory[-1]
-        military = getattr(final_obs, "military", None)
-        if military is None:
-            return 0.0
-        kills = getattr(military, "kills_cost", 0)
-        deaths = getattr(military, "deaths_cost", 0)
-        total = kills + deaths
-        if total == 0:
-            return 0.5  # No combat occurred
-        return kills / total
-
-    def compute_step_rewards(self) -> List[float]:
-        if not self._trajectory:
-            return []
-        score = self.score_trajectory(self._trajectory)
-        return [score] * len(self._trajectory)
-
-
-class EconomyRubric(TrajectoryRubric):
-    """Score based on final economic state.
-
-    Score = assets_value / (assets_value + 10000)
-    Sigmoid-like normalization to 0.0-1.0 range.
-    """
-
-    def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
-        if not trajectory:
-            return 0.0
-        _, final_obs = trajectory[-1]
-        military = getattr(final_obs, "military", None)
-        if military is None:
-            return 0.0
-        assets = getattr(military, "assets_value", 0)
-        # Sigmoid normalization: maps [0, inf) -> [0, 1)
-        return assets / (assets + 10000) if assets >= 0 else 0.0
-
-    def compute_step_rewards(self) -> List[float]:
-        if not self._trajectory:
-            return []
-        score = self.score_trajectory(self._trajectory)
-        return [score] * len(self._trajectory)
-
-
-class OpenRABenchRubric(WeightedSum):
-    """Composite benchmark score combining win/loss, military, and economy.
-
-    Weights: 50% win/loss, 25% military efficiency, 25% economy.
-    """
-
-    def __init__(self, gamma: float = 0.99):
-        win_loss = OpenRAWinLossRubric(gamma=gamma)
-        military = MilitaryEfficiencyRubric()
-        economy = EconomyRubric()
-        super().__init__(
-            rubrics=[win_loss, military, economy],
-            weights=[0.5, 0.25, 0.25],
-        )
-        # Keep named references for direct access
-        self.win_loss = win_loss
-        self.military = military
-        self.economy = economy
-
-    def reset(self) -> None:
-        self.win_loss.reset()
-        self.military.reset()
-        self.economy.reset()
-
-
-def compute_game_metrics(final_obs: Any) -> Dict[str, Any]:
-    """Extract benchmark metrics from a final game observation.
-
-    Args:
-        final_obs: The terminal GameObservation (where done=True).
-
-    Returns:
-        Dict with keys: result, ticks, kills_cost, deaths_cost,
-        kd_ratio, assets_value, cash, win (bool).
-    """
-    military = getattr(final_obs, "military", None)
-    economy = getattr(final_obs, "economy", None)
-
-    kills = getattr(military, "kills_cost", 0) if military else 0
-    deaths = getattr(military, "deaths_cost", 0) if military else 0
-    assets = getattr(military, "assets_value", 0) if military else 0
-    cash = getattr(economy, "cash", 0) if economy else 0
-    result = getattr(final_obs, "result", "")
-    tick = getattr(final_obs, "tick", 0)
-
-    return {
-        "result": result,
-        "win": result == "win",
-        "ticks": tick,
-        "kills_cost": kills,
-        "deaths_cost": deaths,
-        "kd_ratio": kills / max(deaths, 1),
-        "assets_value": assets,
-        "cash": cash,
-    }
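Since rubrics.py now only re-exports, existing imports in the Space keep resolving to the shared objects. A quick sketch of the unchanged call pattern, lifted from the removed docstring above (assuming the library classes keep the same interface; episode stands in for an iterable of (action, observation) pairs):

    from rubrics import OpenRABenchRubric  # same object as openra_rl_util.rubrics.OpenRABenchRubric

    rubric = OpenRABenchRubric()
    rubric.reset()
    for action, obs in episode:
        reward = rubric(action, obs)  # 0.0 until the final step
    step_rewards = rubric.win_loss.compute_step_rewards()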
tests/__init__.py
ADDED
(empty file)
tests/test_app.py
ADDED
@@ -0,0 +1,100 @@
+"""Tests for the Gradio leaderboard app."""
+
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from app import (
+    AGENT_TYPE_COLORS,
+    DISPLAY_COLUMNS,
+    add_type_badges,
+    build_app,
+    filter_leaderboard,
+    load_data,
+)
+
+
+class TestLoadData:
+    """Test data loading."""
+
+    def test_returns_dataframe(self):
+        df = load_data()
+        assert isinstance(df, pd.DataFrame)
+
+    def test_has_display_columns(self):
+        df = load_data()
+        for col in DISPLAY_COLUMNS:
+            assert col in df.columns, f"Missing column: {col}"
+
+    def test_has_rank_column(self):
+        df = load_data()
+        if len(df) > 0:
+            assert df["Rank"].iloc[0] == 1
+
+    def test_sorted_by_score_descending(self):
+        df = load_data()
+        if len(df) > 1:
+            scores = df["Score"].tolist()
+            assert scores == sorted(scores, reverse=True)
+
+    def test_handles_missing_file(self):
+        with patch("app.DATA_PATH", Path("/nonexistent/data.csv")):
+            df = load_data()
+            assert isinstance(df, pd.DataFrame)
+            assert len(df) == 0
+
+
+class TestBadges:
+    """Test type badge rendering."""
+
+    def test_scripted_badge_has_gold(self):
+        df = pd.DataFrame({"Type": ["Scripted"]})
+        result = add_type_badges(df)
+        assert "#ffcd75" in result["Type"].iloc[0]
+
+    def test_llm_badge_has_blue(self):
+        df = pd.DataFrame({"Type": ["LLM"]})
+        result = add_type_badges(df)
+        assert "#7497db" in result["Type"].iloc[0]
+
+    def test_rl_badge_has_gray(self):
+        df = pd.DataFrame({"Type": ["RL"]})
+        result = add_type_badges(df)
+        assert "#75809c" in result["Type"].iloc[0]
+
+    def test_all_types_have_colors(self):
+        for t in ["Scripted", "LLM", "RL"]:
+            assert t in AGENT_TYPE_COLORS
+
+
+class TestFilter:
+    """Test leaderboard filtering."""
+
+    def test_returns_dataframe(self):
+        df = filter_leaderboard("", [], "All")
+        assert isinstance(df, pd.DataFrame)
+
+    def test_search_filters_by_name(self):
+        df = filter_leaderboard("ScriptedBot", [], "All")
+        # If there are results, they should contain "ScriptedBot"
+        if len(df) > 0:
+            # Badges are in the Type column, not Agent
+            assert all("ScriptedBot" in str(row) for row in df["Agent"])
+
+    def test_opponent_filter(self):
+        df = filter_leaderboard("", [], "Hard")
+        if len(df) > 0:
+            assert all(df["Opponent"] == "Hard")
+
+
+class TestBuildApp:
+    """Test app construction."""
+
+    def test_builds_without_error(self):
+        app = build_app()
+        assert app is not None
tests/test_evaluate.py
ADDED
@@ -0,0 +1,163 @@
+"""Tests for the evaluation harness."""
+
+import csv
+import sys
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+# Add parent directory to path for direct import
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from evaluate import (
+    RESULTS_COLUMNS,
+    append_results,
+    get_agent_fn,
+    parse_args,
+)
+
+
+class TestParseArgs:
+    """Test argument parsing."""
+
+    def test_minimal_args(self):
+        with patch("sys.argv", ["evaluate.py", "--agent-name", "TestBot"]):
+            args = parse_args()
+            assert args.agent_name == "TestBot"
+            assert args.agent == "scripted"
+            assert args.agent_type == "Scripted"
+            assert args.opponent == "Normal"
+            assert args.games == 10
+
+    def test_all_args(self):
+        with patch("sys.argv", [
+            "evaluate.py",
+            "--agent", "llm",
+            "--agent-name", "MyLLM",
+            "--agent-type", "LLM",
+            "--opponent", "Hard",
+            "--games", "5",
+            "--server", "http://example.com:8000",
+            "--max-steps", "3000",
+            "--dry-run",
+        ]):
+            args = parse_args()
+            assert args.agent == "llm"
+            assert args.agent_name == "MyLLM"
+            assert args.agent_type == "LLM"
+            assert args.opponent == "Hard"
+            assert args.games == 5
+            assert args.server == "http://example.com:8000"
+            assert args.max_steps == 3000
+            assert args.dry_run is True
+
+    def test_auto_detect_agent_type(self):
+        for agent, expected_type in [
+            ("scripted", "Scripted"),
+            ("llm", "LLM"),
+            ("mcp", "Scripted"),
+            ("custom", "RL"),
+        ]:
+            with patch("sys.argv", ["evaluate.py", "--agent", agent, "--agent-name", "T"]):
+                args = parse_args()
+                assert args.agent_type == expected_type, f"{agent} -> {expected_type}"
+
+    def test_explicit_type_overrides_auto(self):
+        with patch("sys.argv", [
+            "evaluate.py", "--agent", "scripted",
+            "--agent-name", "T", "--agent-type", "RL",
+        ]):
+            args = parse_args()
+            assert args.agent_type == "RL"
+
+
+class TestGetAgentFn:
+    """Test agent factory."""
+
+    def test_scripted_returns_callable(self):
+        fn = get_agent_fn("scripted")
+        assert callable(fn)
+
+    def test_llm_returns_callable(self):
+        fn = get_agent_fn("llm")
+        assert callable(fn)
+
+
+class TestAppendResults:
+    """Test CSV output."""
+
+    def test_creates_new_file(self):
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+            path = Path(f.name)
+
+        path.unlink()  # ensure it doesn't exist
+        results = {col: "" for col in RESULTS_COLUMNS}
+        results["agent_name"] = "TestBot"
+        results["games"] = 5
+        results["score"] = 85.0
+
+        append_results(results, path)
+
+        assert path.exists()
+        with open(path) as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+        assert len(rows) == 1
+        assert rows[0]["agent_name"] == "TestBot"
+
+        path.unlink()
+
+    def test_appends_to_existing(self):
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+            path = Path(f.name)
+
+        # Write first result
+        results1 = {col: "" for col in RESULTS_COLUMNS}
+        results1["agent_name"] = "Bot1"
+        append_results(results1, path)
+
+        # Write second result
+        results2 = {col: "" for col in RESULTS_COLUMNS}
+        results2["agent_name"] = "Bot2"
+        append_results(results2, path)
+
+        with open(path) as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+        assert len(rows) == 2
+        assert rows[0]["agent_name"] == "Bot1"
+        assert rows[1]["agent_name"] == "Bot2"
+
+        path.unlink()
+
+    def test_columns_match_expected(self):
+        assert "agent_name" in RESULTS_COLUMNS
+        assert "score" in RESULTS_COLUMNS
+        assert "win_rate" in RESULTS_COLUMNS
+        assert "replay_url" in RESULTS_COLUMNS
+        assert len(RESULTS_COLUMNS) == 13
+
+
+class TestScoringUsesUtil:
+    """Verify scoring uses the single source of truth from openra-rl-util."""
+
+    def test_rubrics_re_exports_util(self):
+        """rubrics.py should re-export from openra_rl_util."""
+        from rubrics import compute_composite_score_from_games
+        from openra_rl_util.rubrics import (
+            compute_composite_score_from_games as util_fn,
+        )
+        assert compute_composite_score_from_games is util_fn
+
+    def test_evaluate_uses_util_scoring(self):
+        """evaluate.py should not have its own compute_composite_score."""
+        import evaluate
+        assert not hasattr(evaluate, "compute_composite_score"), \
+            "evaluate.py should use compute_composite_score_from_games from Util"
+
+    def test_compute_game_metrics_re_exported(self):
+        from rubrics import compute_game_metrics
+        from openra_rl_util.rubrics import compute_game_metrics as util_fn
+        assert compute_game_metrics is util_fn