yxc20098 committed on
Commit
d184363
·
1 Parent(s): b473b95

Revert "Fix scoring dedup, add test suite, wire openra-rl-util"

Browse files

This reverts commit 2673c6a98140b7d62dbf20be4fc27f38c1b937cd.

Files changed (6) hide show
  1. evaluate.py +29 -6
  2. requirements.txt +0 -2
  3. rubrics.py +148 -10
  4. tests/__init__.py +0 -0
  5. tests/test_app.py +0 -100
  6. tests/test_evaluate.py +0 -163
evaluate.py CHANGED
@@ -29,8 +29,6 @@ from datetime import datetime, timezone
29
  from pathlib import Path
30
  from typing import Any, Dict, List
31
 
32
- from openra_rl_util.rubrics import compute_composite_score_from_games, compute_game_metrics
33
-
34
  # Evaluation results file
35
  RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"
36
 
@@ -124,8 +122,10 @@ async def run_game(env: Any, agent_fn: Any, max_steps: int) -> Dict[str, Any]:
124
  max_steps: Maximum steps before timeout.
125
 
126
  Returns:
127
- Dict with game metrics (from compute_game_metrics).
128
  """
 
 
129
  obs = await env.reset()
130
  steps = 0
131
 
@@ -143,9 +143,10 @@ def get_agent_fn(agent_type: str) -> Any:
143
  Returns a callable that takes an observation and returns an action.
144
  """
145
  if agent_type == "scripted":
 
146
  from openra_env.models import OpenRAAction
147
  # Simple no-op agent for evaluation framework testing
148
- # TODO: Wire to openra_env.agents.scripted.ScriptedBot when extracted
149
  return lambda obs: OpenRAAction(commands=[])
150
  else:
151
  from openra_env.models import OpenRAAction
@@ -167,7 +168,7 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
167
  result_str = metrics["result"] or "timeout"
168
  print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")
169
 
170
- # Aggregate results using single source of truth from openra-rl-util
171
  wins = sum(1 for g in game_results if g["win"])
172
  total = len(game_results)
173
 
@@ -177,7 +178,7 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
177
  "opponent": args.opponent,
178
  "games": total,
179
  "win_rate": round(100.0 * wins / max(total, 1), 1),
180
- "score": round(compute_composite_score_from_games(game_results), 1),
181
  "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
182
  "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
183
  "kd_ratio": round(
@@ -194,6 +195,28 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
194
  }
195
 
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  def append_results(results: Dict[str, Any], output_path: Path) -> None:
198
  """Append evaluation results to CSV file."""
199
  file_exists = output_path.exists() and output_path.stat().st_size > 0
 
29
  from pathlib import Path
30
  from typing import Any, Dict, List
31
 
 
 
32
  # Evaluation results file
33
  RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"
34
 
 
122
  max_steps: Maximum steps before timeout.
123
 
124
  Returns:
125
+ Dict with game metrics (from rubrics.compute_game_metrics).
126
  """
127
+ from rubrics import compute_game_metrics
128
+
129
  obs = await env.reset()
130
  steps = 0
131
 
 
143
  Returns a callable that takes an observation and returns an action.
144
  """
145
  if agent_type == "scripted":
146
+ # Import inline to avoid hard dependency
147
  from openra_env.models import OpenRAAction
148
  # Simple no-op agent for evaluation framework testing
149
+ # Replace with actual ScriptedBot integration
150
  return lambda obs: OpenRAAction(commands=[])
151
  else:
152
  from openra_env.models import OpenRAAction
 
168
  result_str = metrics["result"] or "timeout"
169
  print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")
170
 
171
+ # Aggregate results
172
  wins = sum(1 for g in game_results if g["win"])
173
  total = len(game_results)
174
 
 
178
  "opponent": args.opponent,
179
  "games": total,
180
  "win_rate": round(100.0 * wins / max(total, 1), 1),
181
+ "score": round(compute_composite_score(game_results), 1),
182
  "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
183
  "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
184
  "kd_ratio": round(
 
195
  }
196
 
197
 
198
+ def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
199
+ """Compute the OpenRA-Bench composite score.
200
+
201
+ Score = 50% win_rate + 25% avg_kd_normalized + 25% avg_economy_normalized
202
+ """
203
+ total = len(game_results)
204
+ if total == 0:
205
+ return 0.0
206
+
207
+ win_rate = sum(1 for g in game_results if g["win"]) / total
208
+
209
+ # K/D ratio normalized: kd / (kd + 1) maps [0, inf) -> [0, 1)
210
+ avg_kd = sum(g["kd_ratio"] for g in game_results) / total
211
+ kd_norm = avg_kd / (avg_kd + 1)
212
+
213
+ # Economy normalized: assets / (assets + 10000)
214
+ avg_assets = sum(g["assets_value"] for g in game_results) / total
215
+ econ_norm = avg_assets / (avg_assets + 10000) if avg_assets >= 0 else 0.0
216
+
217
+ return 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)
218
+
219
+
220
  def append_results(results: Dict[str, Any], output_path: Path) -> None:
221
  """Append evaluation results to CSV file."""
222
  file_exists = output_path.exists() and output_path.stat().st_size > 0
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  gradio>=4.44.0
2
  pandas>=2.0.0
3
  openenv-core>=0.2.0
4
- openra-rl-util>=0.1.0
5
- openra-rl>=0.3.0
 
1
  gradio>=4.44.0
2
  pandas>=2.0.0
3
  openenv-core>=0.2.0
 
 
rubrics.py CHANGED
@@ -1,14 +1,152 @@
1
- """OpenRA-Bench rubrics re-exported from openra-rl-util.
2
 
3
- All scoring logic lives in the shared utility library.
4
- This module re-exports for backward compatibility.
 
 
 
 
 
 
 
 
5
  """
6
 
7
- from openra_rl_util.rubrics import ( # noqa: F401
8
- EconomyRubric,
9
- MilitaryEfficiencyRubric,
10
- OpenRABenchRubric,
11
- OpenRAWinLossRubric,
12
- compute_composite_score_from_games,
13
- compute_game_metrics,
14
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenRA-Bench rubrics for agent evaluation.
2
 
3
+ Follows the OpenEnv rubric pattern (see openenv.core.rubrics).
4
+ These rubrics score game episodes based on win/loss, military efficiency,
5
+ and economic performance.
6
+
7
+ Usage:
8
+ rubric = OpenRABenchRubric()
9
+ rubric.reset()
10
+ for action, obs in episode:
11
+ reward = rubric(action, obs) # 0.0 until done
12
+ step_rewards = rubric.win_loss.compute_step_rewards()
13
  """
14
 
15
+ from typing import Any, Dict, List, Tuple
16
+
17
+ from openenv.core.rubrics import (
18
+ ExponentialDiscountingTrajectoryRubric,
19
+ TrajectoryRubric,
20
+ WeightedSum,
 
21
  )
22
+
23
+
24
+ class OpenRAWinLossRubric(ExponentialDiscountingTrajectoryRubric):
25
+ """Score game based on win/loss/draw outcome with temporal discounting.
26
+
27
+ Terminal rewards:
28
+ - Win: +1.0
29
+ - Loss: -1.0
30
+ - Draw: 0.0
31
+ """
32
+
33
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
34
+ if not trajectory:
35
+ return 0.0
36
+ _, final_obs = trajectory[-1]
37
+ result = getattr(final_obs, "result", "")
38
+ if result == "win":
39
+ return 1.0
40
+ elif result == "lose":
41
+ return -1.0
42
+ return 0.0
43
+
44
+
45
+ class MilitaryEfficiencyRubric(TrajectoryRubric):
46
+ """Score based on kill/death cost ratio from final observation.
47
+
48
+ Score = kills_cost / max(kills_cost + deaths_cost, 1)
49
+ Normalized to 0.0-1.0 range.
50
+ """
51
+
52
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
53
+ if not trajectory:
54
+ return 0.0
55
+ _, final_obs = trajectory[-1]
56
+ military = getattr(final_obs, "military", None)
57
+ if military is None:
58
+ return 0.0
59
+ kills = getattr(military, "kills_cost", 0)
60
+ deaths = getattr(military, "deaths_cost", 0)
61
+ total = kills + deaths
62
+ if total == 0:
63
+ return 0.5 # No combat occurred
64
+ return kills / total
65
+
66
+ def compute_step_rewards(self) -> List[float]:
67
+ if not self._trajectory:
68
+ return []
69
+ score = self.score_trajectory(self._trajectory)
70
+ return [score] * len(self._trajectory)
71
+
72
+
73
+ class EconomyRubric(TrajectoryRubric):
74
+ """Score based on final economic state.
75
+
76
+ Score = assets_value / (assets_value + 10000)
77
+ Sigmoid-like normalization to 0.0-1.0 range.
78
+ """
79
+
80
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
81
+ if not trajectory:
82
+ return 0.0
83
+ _, final_obs = trajectory[-1]
84
+ military = getattr(final_obs, "military", None)
85
+ if military is None:
86
+ return 0.0
87
+ assets = getattr(military, "assets_value", 0)
88
+ # Sigmoid normalization: maps [0, inf) -> [0, 1)
89
+ return assets / (assets + 10000) if assets >= 0 else 0.0
90
+
91
+ def compute_step_rewards(self) -> List[float]:
92
+ if not self._trajectory:
93
+ return []
94
+ score = self.score_trajectory(self._trajectory)
95
+ return [score] * len(self._trajectory)
96
+
97
+
98
+ class OpenRABenchRubric(WeightedSum):
99
+ """Composite benchmark score combining win/loss, military, and economy.
100
+
101
+ Weights: 50% win/loss, 25% military efficiency, 25% economy.
102
+ """
103
+
104
+ def __init__(self, gamma: float = 0.99):
105
+ win_loss = OpenRAWinLossRubric(gamma=gamma)
106
+ military = MilitaryEfficiencyRubric()
107
+ economy = EconomyRubric()
108
+ super().__init__(
109
+ rubrics=[win_loss, military, economy],
110
+ weights=[0.5, 0.25, 0.25],
111
+ )
112
+ # Keep named references for direct access
113
+ self.win_loss = win_loss
114
+ self.military = military
115
+ self.economy = economy
116
+
117
+ def reset(self) -> None:
118
+ self.win_loss.reset()
119
+ self.military.reset()
120
+ self.economy.reset()
121
+
122
+
123
+ def compute_game_metrics(final_obs: Any) -> Dict[str, Any]:
124
+ """Extract benchmark metrics from a final game observation.
125
+
126
+ Args:
127
+ final_obs: The terminal GameObservation (where done=True).
128
+
129
+ Returns:
130
+ Dict with keys: result, ticks, kills_cost, deaths_cost,
131
+ kd_ratio, assets_value, cash, win (bool).
132
+ """
133
+ military = getattr(final_obs, "military", None)
134
+ economy = getattr(final_obs, "economy", None)
135
+
136
+ kills = getattr(military, "kills_cost", 0) if military else 0
137
+ deaths = getattr(military, "deaths_cost", 0) if military else 0
138
+ assets = getattr(military, "assets_value", 0) if military else 0
139
+ cash = getattr(economy, "cash", 0) if economy else 0
140
+ result = getattr(final_obs, "result", "")
141
+ tick = getattr(final_obs, "tick", 0)
142
+
143
+ return {
144
+ "result": result,
145
+ "win": result == "win",
146
+ "ticks": tick,
147
+ "kills_cost": kills,
148
+ "deaths_cost": deaths,
149
+ "kd_ratio": kills / max(deaths, 1),
150
+ "assets_value": assets,
151
+ "cash": cash,
152
+ }
tests/__init__.py DELETED
File without changes
tests/test_app.py DELETED
@@ -1,100 +0,0 @@
1
- """Tests for the Gradio leaderboard app."""
2
-
3
- import sys
4
- from pathlib import Path
5
- from unittest.mock import patch
6
-
7
- import pandas as pd
8
- import pytest
9
-
10
- sys.path.insert(0, str(Path(__file__).parent.parent))
11
-
12
- from app import (
13
- AGENT_TYPE_COLORS,
14
- DISPLAY_COLUMNS,
15
- add_type_badges,
16
- build_app,
17
- filter_leaderboard,
18
- load_data,
19
- )
20
-
21
-
22
- class TestLoadData:
23
- """Test data loading."""
24
-
25
- def test_returns_dataframe(self):
26
- df = load_data()
27
- assert isinstance(df, pd.DataFrame)
28
-
29
- def test_has_display_columns(self):
30
- df = load_data()
31
- for col in DISPLAY_COLUMNS:
32
- assert col in df.columns, f"Missing column: {col}"
33
-
34
- def test_has_rank_column(self):
35
- df = load_data()
36
- if len(df) > 0:
37
- assert df["Rank"].iloc[0] == 1
38
-
39
- def test_sorted_by_score_descending(self):
40
- df = load_data()
41
- if len(df) > 1:
42
- scores = df["Score"].tolist()
43
- assert scores == sorted(scores, reverse=True)
44
-
45
- def test_handles_missing_file(self):
46
- with patch("app.DATA_PATH", Path("/nonexistent/data.csv")):
47
- df = load_data()
48
- assert isinstance(df, pd.DataFrame)
49
- assert len(df) == 0
50
-
51
-
52
- class TestBadges:
53
- """Test type badge rendering."""
54
-
55
- def test_scripted_badge_has_gold(self):
56
- df = pd.DataFrame({"Type": ["Scripted"]})
57
- result = add_type_badges(df)
58
- assert "#ffcd75" in result["Type"].iloc[0]
59
-
60
- def test_llm_badge_has_blue(self):
61
- df = pd.DataFrame({"Type": ["LLM"]})
62
- result = add_type_badges(df)
63
- assert "#7497db" in result["Type"].iloc[0]
64
-
65
- def test_rl_badge_has_gray(self):
66
- df = pd.DataFrame({"Type": ["RL"]})
67
- result = add_type_badges(df)
68
- assert "#75809c" in result["Type"].iloc[0]
69
-
70
- def test_all_types_have_colors(self):
71
- for t in ["Scripted", "LLM", "RL"]:
72
- assert t in AGENT_TYPE_COLORS
73
-
74
-
75
- class TestFilter:
76
- """Test leaderboard filtering."""
77
-
78
- def test_returns_dataframe(self):
79
- df = filter_leaderboard("", [], "All")
80
- assert isinstance(df, pd.DataFrame)
81
-
82
- def test_search_filters_by_name(self):
83
- df = filter_leaderboard("ScriptedBot", [], "All")
84
- # If there are results, they should contain "ScriptedBot"
85
- if len(df) > 0:
86
- # Badges are in the Type column, not Agent
87
- assert all("ScriptedBot" in str(row) for row in df["Agent"])
88
-
89
- def test_opponent_filter(self):
90
- df = filter_leaderboard("", [], "Hard")
91
- if len(df) > 0:
92
- assert all(df["Opponent"] == "Hard")
93
-
94
-
95
- class TestBuildApp:
96
- """Test app construction."""
97
-
98
- def test_builds_without_error(self):
99
- app = build_app()
100
- assert app is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_evaluate.py DELETED
@@ -1,163 +0,0 @@
1
- """Tests for the evaluation harness."""
2
-
3
- import csv
4
- import sys
5
- import tempfile
6
- from pathlib import Path
7
- from unittest.mock import patch
8
-
9
- import pytest
10
-
11
- # Add parent directory to path for direct import
12
- sys.path.insert(0, str(Path(__file__).parent.parent))
13
-
14
- from evaluate import (
15
- RESULTS_COLUMNS,
16
- append_results,
17
- get_agent_fn,
18
- parse_args,
19
- )
20
-
21
-
22
- class TestParseArgs:
23
- """Test argument parsing."""
24
-
25
- def test_minimal_args(self):
26
- with patch("sys.argv", ["evaluate.py", "--agent-name", "TestBot"]):
27
- args = parse_args()
28
- assert args.agent_name == "TestBot"
29
- assert args.agent == "scripted"
30
- assert args.agent_type == "Scripted"
31
- assert args.opponent == "Normal"
32
- assert args.games == 10
33
-
34
- def test_all_args(self):
35
- with patch("sys.argv", [
36
- "evaluate.py",
37
- "--agent", "llm",
38
- "--agent-name", "MyLLM",
39
- "--agent-type", "LLM",
40
- "--opponent", "Hard",
41
- "--games", "5",
42
- "--server", "http://example.com:8000",
43
- "--max-steps", "3000",
44
- "--dry-run",
45
- ]):
46
- args = parse_args()
47
- assert args.agent == "llm"
48
- assert args.agent_name == "MyLLM"
49
- assert args.agent_type == "LLM"
50
- assert args.opponent == "Hard"
51
- assert args.games == 5
52
- assert args.server == "http://example.com:8000"
53
- assert args.max_steps == 3000
54
- assert args.dry_run is True
55
-
56
- def test_auto_detect_agent_type(self):
57
- for agent, expected_type in [
58
- ("scripted", "Scripted"),
59
- ("llm", "LLM"),
60
- ("mcp", "Scripted"),
61
- ("custom", "RL"),
62
- ]:
63
- with patch("sys.argv", ["evaluate.py", "--agent", agent, "--agent-name", "T"]):
64
- args = parse_args()
65
- assert args.agent_type == expected_type, f"{agent} -> {expected_type}"
66
-
67
- def test_explicit_type_overrides_auto(self):
68
- with patch("sys.argv", [
69
- "evaluate.py", "--agent", "scripted",
70
- "--agent-name", "T", "--agent-type", "RL",
71
- ]):
72
- args = parse_args()
73
- assert args.agent_type == "RL"
74
-
75
-
76
- class TestGetAgentFn:
77
- """Test agent factory."""
78
-
79
- def test_scripted_returns_callable(self):
80
- fn = get_agent_fn("scripted")
81
- assert callable(fn)
82
-
83
- def test_llm_returns_callable(self):
84
- fn = get_agent_fn("llm")
85
- assert callable(fn)
86
-
87
-
88
- class TestAppendResults:
89
- """Test CSV output."""
90
-
91
- def test_creates_new_file(self):
92
- with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
93
- path = Path(f.name)
94
-
95
- path.unlink() # ensure it doesn't exist
96
- results = {col: "" for col in RESULTS_COLUMNS}
97
- results["agent_name"] = "TestBot"
98
- results["games"] = 5
99
- results["score"] = 85.0
100
-
101
- append_results(results, path)
102
-
103
- assert path.exists()
104
- with open(path) as f:
105
- reader = csv.DictReader(f)
106
- rows = list(reader)
107
- assert len(rows) == 1
108
- assert rows[0]["agent_name"] == "TestBot"
109
-
110
- path.unlink()
111
-
112
- def test_appends_to_existing(self):
113
- with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
114
- path = Path(f.name)
115
-
116
- # Write first result
117
- results1 = {col: "" for col in RESULTS_COLUMNS}
118
- results1["agent_name"] = "Bot1"
119
- append_results(results1, path)
120
-
121
- # Write second result
122
- results2 = {col: "" for col in RESULTS_COLUMNS}
123
- results2["agent_name"] = "Bot2"
124
- append_results(results2, path)
125
-
126
- with open(path) as f:
127
- reader = csv.DictReader(f)
128
- rows = list(reader)
129
- assert len(rows) == 2
130
- assert rows[0]["agent_name"] == "Bot1"
131
- assert rows[1]["agent_name"] == "Bot2"
132
-
133
- path.unlink()
134
-
135
- def test_columns_match_expected(self):
136
- assert "agent_name" in RESULTS_COLUMNS
137
- assert "score" in RESULTS_COLUMNS
138
- assert "win_rate" in RESULTS_COLUMNS
139
- assert "replay_url" in RESULTS_COLUMNS
140
- assert len(RESULTS_COLUMNS) == 13
141
-
142
-
143
- class TestScoringUsesUtil:
144
- """Verify scoring uses the single source of truth from openra-rl-util."""
145
-
146
- def test_rubrics_re_exports_util(self):
147
- """rubrics.py should re-export from openra_rl_util."""
148
- from rubrics import compute_composite_score_from_games
149
- from openra_rl_util.rubrics import (
150
- compute_composite_score_from_games as util_fn,
151
- )
152
- assert compute_composite_score_from_games is util_fn
153
-
154
- def test_evaluate_uses_util_scoring(self):
155
- """evaluate.py should not have its own compute_composite_score."""
156
- import evaluate
157
- assert not hasattr(evaluate, "compute_composite_score"), \
158
- "evaluate.py should use compute_composite_score_from_games from Util"
159
-
160
- def test_compute_game_metrics_re_exported(self):
161
- from rubrics import compute_game_metrics
162
- from openra_rl_util.rubrics import compute_game_metrics as util_fn
163
- assert compute_game_metrics is util_fn