yxc20098 committed on
Commit
d184363
·
1 Parent(s): b473b95

Revert "Fix scoring dedup, add test suite, wire openra-rl-util"

Browse files

This reverts commit 2673c6a98140b7d62dbf20be4fc27f38c1b937cd.

Files changed (6) hide show
  1. evaluate.py +29 -6
  2. requirements.txt +0 -2
  3. rubrics.py +148 -10
  4. tests/__init__.py +0 -0
  5. tests/test_app.py +0 -100
  6. tests/test_evaluate.py +0 -163
evaluate.py CHANGED
@@ -29,8 +29,6 @@ from datetime import datetime, timezone
29
  from pathlib import Path
30
  from typing import Any, Dict, List
31
 
32
- from openra_rl_util.rubrics import compute_composite_score_from_games, compute_game_metrics
33
-
34
  # Evaluation results file
35
  RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"
36
 
@@ -124,8 +122,10 @@ async def run_game(env: Any, agent_fn: Any, max_steps: int) -> Dict[str, Any]:
124
  max_steps: Maximum steps before timeout.
125
 
126
  Returns:
127
- Dict with game metrics (from compute_game_metrics).
128
  """
 
 
129
  obs = await env.reset()
130
  steps = 0
131
 
@@ -143,9 +143,10 @@ def get_agent_fn(agent_type: str) -> Any:
143
  Returns a callable that takes an observation and returns an action.
144
  """
145
  if agent_type == "scripted":
 
146
  from openra_env.models import OpenRAAction
147
  # Simple no-op agent for evaluation framework testing
148
- # TODO: Wire to openra_env.agents.scripted.ScriptedBot when extracted
149
  return lambda obs: OpenRAAction(commands=[])
150
  else:
151
  from openra_env.models import OpenRAAction
@@ -167,7 +168,7 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
167
  result_str = metrics["result"] or "timeout"
168
  print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")
169
 
170
- # Aggregate results using single source of truth from openra-rl-util
171
  wins = sum(1 for g in game_results if g["win"])
172
  total = len(game_results)
173
 
@@ -177,7 +178,7 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
177
  "opponent": args.opponent,
178
  "games": total,
179
  "win_rate": round(100.0 * wins / max(total, 1), 1),
180
- "score": round(compute_composite_score_from_games(game_results), 1),
181
  "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
182
  "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
183
  "kd_ratio": round(
@@ -194,6 +195,28 @@ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
194
  }
195
 
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  def append_results(results: Dict[str, Any], output_path: Path) -> None:
198
  """Append evaluation results to CSV file."""
199
  file_exists = output_path.exists() and output_path.stat().st_size > 0
 
29
  from pathlib import Path
30
  from typing import Any, Dict, List
31
 
 
 
32
  # Evaluation results file
33
  RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"
34
 
 
122
  max_steps: Maximum steps before timeout.
123
 
124
  Returns:
125
+ Dict with game metrics (from rubrics.compute_game_metrics).
126
  """
127
+ from rubrics import compute_game_metrics
128
+
129
  obs = await env.reset()
130
  steps = 0
131
 
 
143
  Returns a callable that takes an observation and returns an action.
144
  """
145
  if agent_type == "scripted":
146
+ # Import inline to avoid hard dependency
147
  from openra_env.models import OpenRAAction
148
  # Simple no-op agent for evaluation framework testing
149
+ # Replace with actual ScriptedBot integration
150
  return lambda obs: OpenRAAction(commands=[])
151
  else:
152
  from openra_env.models import OpenRAAction
 
168
  result_str = metrics["result"] or "timeout"
169
  print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")
170
 
171
+ # Aggregate results
172
  wins = sum(1 for g in game_results if g["win"])
173
  total = len(game_results)
174
 
 
178
  "opponent": args.opponent,
179
  "games": total,
180
  "win_rate": round(100.0 * wins / max(total, 1), 1),
181
+ "score": round(compute_composite_score(game_results), 1),
182
  "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
183
  "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
184
  "kd_ratio": round(
 
195
  }
196
 
197
 
198
+ def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
199
+ """Compute the OpenRA-Bench composite score.
200
+
201
+ Score = 50% win_rate + 25% avg_kd_normalized + 25% avg_economy_normalized
202
+ """
203
+ total = len(game_results)
204
+ if total == 0:
205
+ return 0.0
206
+
207
+ win_rate = sum(1 for g in game_results if g["win"]) / total
208
+
209
+ # K/D ratio normalized: kd / (kd + 1) maps [0, inf) -> [0, 1)
210
+ avg_kd = sum(g["kd_ratio"] for g in game_results) / total
211
+ kd_norm = avg_kd / (avg_kd + 1)
212
+
213
+ # Economy normalized: assets / (assets + 10000)
214
+ avg_assets = sum(g["assets_value"] for g in game_results) / total
215
+ econ_norm = avg_assets / (avg_assets + 10000) if avg_assets >= 0 else 0.0
216
+
217
+ return 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)
218
+
219
+
220
  def append_results(results: Dict[str, Any], output_path: Path) -> None:
221
  """Append evaluation results to CSV file."""
222
  file_exists = output_path.exists() and output_path.stat().st_size > 0
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  gradio>=4.44.0
2
  pandas>=2.0.0
3
  openenv-core>=0.2.0
4
- openra-rl-util>=0.1.0
5
- openra-rl>=0.3.0
 
1
  gradio>=4.44.0
2
  pandas>=2.0.0
3
  openenv-core>=0.2.0
 
 
rubrics.py CHANGED
@@ -1,14 +1,152 @@
1
- """OpenRA-Bench rubrics re-exported from openra-rl-util.
2
 
3
- All scoring logic lives in the shared utility library.
4
- This module re-exports for backward compatibility.
 
 
 
 
 
 
 
 
5
  """
6
 
7
- from openra_rl_util.rubrics import ( # noqa: F401
8
- EconomyRubric,
9
- MilitaryEfficiencyRubric,
10
- OpenRABenchRubric,
11
- OpenRAWinLossRubric,
12
- compute_composite_score_from_games,
13
- compute_game_metrics,
14
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenRA-Bench rubrics for agent evaluation.
2
 
3
+ Follows the OpenEnv rubric pattern (see openenv.core.rubrics).
4
+ These rubrics score game episodes based on win/loss, military efficiency,
5
+ and economic performance.
6
+
7
+ Usage:
8
+ rubric = OpenRABenchRubric()
9
+ rubric.reset()
10
+ for action, obs in episode:
11
+ reward = rubric(action, obs) # 0.0 until done
12
+ step_rewards = rubric.win_loss.compute_step_rewards()
13
  """
14
 
15
+ from typing import Any, Dict, List, Tuple
16
+
17
+ from openenv.core.rubrics import (
18
+ ExponentialDiscountingTrajectoryRubric,
19
+ TrajectoryRubric,
20
+ WeightedSum,
 
21
  )
22
+
23
+
24
+ class OpenRAWinLossRubric(ExponentialDiscountingTrajectoryRubric):
25
+ """Score game based on win/loss/draw outcome with temporal discounting.
26
+
27
+ Terminal rewards:
28
+ - Win: +1.0
29
+ - Loss: -1.0
30
+ - Draw: 0.0
31
+ """
32
+
33
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
34
+ if not trajectory:
35
+ return 0.0
36
+ _, final_obs = trajectory[-1]
37
+ result = getattr(final_obs, "result", "")
38
+ if result == "win":
39
+ return 1.0
40
+ elif result == "lose":
41
+ return -1.0
42
+ return 0.0
43
+
44
+
45
+ class MilitaryEfficiencyRubric(TrajectoryRubric):
46
+ """Score based on kill/death cost ratio from final observation.
47
+
48
+ Score = kills_cost / max(kills_cost + deaths_cost, 1)
49
+ Normalized to 0.0-1.0 range.
50
+ """
51
+
52
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
53
+ if not trajectory:
54
+ return 0.0
55
+ _, final_obs = trajectory[-1]
56
+ military = getattr(final_obs, "military", None)
57
+ if military is None:
58
+ return 0.0
59
+ kills = getattr(military, "kills_cost", 0)
60
+ deaths = getattr(military, "deaths_cost", 0)
61
+ total = kills + deaths
62
+ if total == 0:
63
+ return 0.5 # No combat occurred
64
+ return kills / total
65
+
66
+ def compute_step_rewards(self) -> List[float]:
67
+ if not self._trajectory:
68
+ return []
69
+ score = self.score_trajectory(self._trajectory)
70
+ return [score] * len(self._trajectory)
71
+
72
+
73
+ class EconomyRubric(TrajectoryRubric):
74
+ """Score based on final economic state.
75
+
76
+ Score = assets_value / (assets_value + 10000)
77
+ Sigmoid-like normalization to 0.0-1.0 range.
78
+ """
79
+
80
+ def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
81
+ if not trajectory:
82
+ return 0.0
83
+ _, final_obs = trajectory[-1]
84
+ military = getattr(final_obs, "military", None)
85
+ if military is None:
86
+ return 0.0
87
+ assets = getattr(military, "assets_value", 0)
88
+ # Sigmoid normalization: maps [0, inf) -> [0, 1)
89
+ return assets / (assets + 10000) if assets >= 0 else 0.0
90
+
91
+ def compute_step_rewards(self) -> List[float]:
92
+ if not self._trajectory:
93
+ return []
94
+ score = self.score_trajectory(self._trajectory)
95
+ return [score] * len(self._trajectory)
96
+
97
+
98
+ class OpenRABenchRubric(WeightedSum):
99
+ """Composite benchmark score combining win/loss, military, and economy.
100
+
101
+ Weights: 50% win/loss, 25% military efficiency, 25% economy.
102
+ """
103
+
104
+ def __init__(self, gamma: float = 0.99):
105
+ win_loss = OpenRAWinLossRubric(gamma=gamma)
106
+ military = MilitaryEfficiencyRubric()
107
+ economy = EconomyRubric()
108
+ super().__init__(
109
+ rubrics=[win_loss, military, economy],
110
+ weights=[0.5, 0.25, 0.25],
111
+ )
112
+ # Keep named references for direct access
113
+ self.win_loss = win_loss
114
+ self.military = military
115
+ self.economy = economy
116
+
117
+ def reset(self) -> None:
118
+ self.win_loss.reset()
119
+ self.military.reset()
120
+ self.economy.reset()
121
+
122
+
123
+ def compute_game_metrics(final_obs: Any) -> Dict[str, Any]:
124
+ """Extract benchmark metrics from a final game observation.
125
+
126
+ Args:
127
+ final_obs: The terminal GameObservation (where done=True).
128
+
129
+ Returns:
130
+ Dict with keys: result, ticks, kills_cost, deaths_cost,
131
+ kd_ratio, assets_value, cash, win (bool).
132
+ """
133
+ military = getattr(final_obs, "military", None)
134
+ economy = getattr(final_obs, "economy", None)
135
+
136
+ kills = getattr(military, "kills_cost", 0) if military else 0
137
+ deaths = getattr(military, "deaths_cost", 0) if military else 0
138
+ assets = getattr(military, "assets_value", 0) if military else 0
139
+ cash = getattr(economy, "cash", 0) if economy else 0
140
+ result = getattr(final_obs, "result", "")
141
+ tick = getattr(final_obs, "tick", 0)
142
+
143
+ return {
144
+ "result": result,
145
+ "win": result == "win",
146
+ "ticks": tick,
147
+ "kills_cost": kills,
148
+ "deaths_cost": deaths,
149
+ "kd_ratio": kills / max(deaths, 1),
150
+ "assets_value": assets,
151
+ "cash": cash,
152
+ }
tests/__init__.py DELETED
File without changes
tests/test_app.py DELETED
@@ -1,100 +0,0 @@
1
- """Tests for the Gradio leaderboard app."""
2
-
3
- import sys
4
- from pathlib import Path
5
- from unittest.mock import patch
6
-
7
- import pandas as pd
8
- import pytest
9
-
10
- sys.path.insert(0, str(Path(__file__).parent.parent))
11
-
12
- from app import (
13
- AGENT_TYPE_COLORS,
14
- DISPLAY_COLUMNS,
15
- add_type_badges,
16
- build_app,
17
- filter_leaderboard,
18
- load_data,
19
- )
20
-
21
-
22
- class TestLoadData:
23
- """Test data loading."""
24
-
25
- def test_returns_dataframe(self):
26
- df = load_data()
27
- assert isinstance(df, pd.DataFrame)
28
-
29
- def test_has_display_columns(self):
30
- df = load_data()
31
- for col in DISPLAY_COLUMNS:
32
- assert col in df.columns, f"Missing column: {col}"
33
-
34
- def test_has_rank_column(self):
35
- df = load_data()
36
- if len(df) > 0:
37
- assert df["Rank"].iloc[0] == 1
38
-
39
- def test_sorted_by_score_descending(self):
40
- df = load_data()
41
- if len(df) > 1:
42
- scores = df["Score"].tolist()
43
- assert scores == sorted(scores, reverse=True)
44
-
45
- def test_handles_missing_file(self):
46
- with patch("app.DATA_PATH", Path("/nonexistent/data.csv")):
47
- df = load_data()
48
- assert isinstance(df, pd.DataFrame)
49
- assert len(df) == 0
50
-
51
-
52
- class TestBadges:
53
- """Test type badge rendering."""
54
-
55
- def test_scripted_badge_has_gold(self):
56
- df = pd.DataFrame({"Type": ["Scripted"]})
57
- result = add_type_badges(df)
58
- assert "#ffcd75" in result["Type"].iloc[0]
59
-
60
- def test_llm_badge_has_blue(self):
61
- df = pd.DataFrame({"Type": ["LLM"]})
62
- result = add_type_badges(df)
63
- assert "#7497db" in result["Type"].iloc[0]
64
-
65
- def test_rl_badge_has_gray(self):
66
- df = pd.DataFrame({"Type": ["RL"]})
67
- result = add_type_badges(df)
68
- assert "#75809c" in result["Type"].iloc[0]
69
-
70
- def test_all_types_have_colors(self):
71
- for t in ["Scripted", "LLM", "RL"]:
72
- assert t in AGENT_TYPE_COLORS
73
-
74
-
75
- class TestFilter:
76
- """Test leaderboard filtering."""
77
-
78
- def test_returns_dataframe(self):
79
- df = filter_leaderboard("", [], "All")
80
- assert isinstance(df, pd.DataFrame)
81
-
82
- def test_search_filters_by_name(self):
83
- df = filter_leaderboard("ScriptedBot", [], "All")
84
- # If there are results, they should contain "ScriptedBot"
85
- if len(df) > 0:
86
- # Badges are in the Type column, not Agent
87
- assert all("ScriptedBot" in str(row) for row in df["Agent"])
88
-
89
- def test_opponent_filter(self):
90
- df = filter_leaderboard("", [], "Hard")
91
- if len(df) > 0:
92
- assert all(df["Opponent"] == "Hard")
93
-
94
-
95
- class TestBuildApp:
96
- """Test app construction."""
97
-
98
- def test_builds_without_error(self):
99
- app = build_app()
100
- assert app is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_evaluate.py DELETED
@@ -1,163 +0,0 @@
1
- """Tests for the evaluation harness."""
2
-
3
- import csv
4
- import sys
5
- import tempfile
6
- from pathlib import Path
7
- from unittest.mock import patch
8
-
9
- import pytest
10
-
11
- # Add parent directory to path for direct import
12
- sys.path.insert(0, str(Path(__file__).parent.parent))
13
-
14
- from evaluate import (
15
- RESULTS_COLUMNS,
16
- append_results,
17
- get_agent_fn,
18
- parse_args,
19
- )
20
-
21
-
22
- class TestParseArgs:
23
- """Test argument parsing."""
24
-
25
- def test_minimal_args(self):
26
- with patch("sys.argv", ["evaluate.py", "--agent-name", "TestBot"]):
27
- args = parse_args()
28
- assert args.agent_name == "TestBot"
29
- assert args.agent == "scripted"
30
- assert args.agent_type == "Scripted"
31
- assert args.opponent == "Normal"
32
- assert args.games == 10
33
-
34
- def test_all_args(self):
35
- with patch("sys.argv", [
36
- "evaluate.py",
37
- "--agent", "llm",
38
- "--agent-name", "MyLLM",
39
- "--agent-type", "LLM",
40
- "--opponent", "Hard",
41
- "--games", "5",
42
- "--server", "http://example.com:8000",
43
- "--max-steps", "3000",
44
- "--dry-run",
45
- ]):
46
- args = parse_args()
47
- assert args.agent == "llm"
48
- assert args.agent_name == "MyLLM"
49
- assert args.agent_type == "LLM"
50
- assert args.opponent == "Hard"
51
- assert args.games == 5
52
- assert args.server == "http://example.com:8000"
53
- assert args.max_steps == 3000
54
- assert args.dry_run is True
55
-
56
- def test_auto_detect_agent_type(self):
57
- for agent, expected_type in [
58
- ("scripted", "Scripted"),
59
- ("llm", "LLM"),
60
- ("mcp", "Scripted"),
61
- ("custom", "RL"),
62
- ]:
63
- with patch("sys.argv", ["evaluate.py", "--agent", agent, "--agent-name", "T"]):
64
- args = parse_args()
65
- assert args.agent_type == expected_type, f"{agent} -> {expected_type}"
66
-
67
- def test_explicit_type_overrides_auto(self):
68
- with patch("sys.argv", [
69
- "evaluate.py", "--agent", "scripted",
70
- "--agent-name", "T", "--agent-type", "RL",
71
- ]):
72
- args = parse_args()
73
- assert args.agent_type == "RL"
74
-
75
-
76
- class TestGetAgentFn:
77
- """Test agent factory."""
78
-
79
- def test_scripted_returns_callable(self):
80
- fn = get_agent_fn("scripted")
81
- assert callable(fn)
82
-
83
- def test_llm_returns_callable(self):
84
- fn = get_agent_fn("llm")
85
- assert callable(fn)
86
-
87
-
88
- class TestAppendResults:
89
- """Test CSV output."""
90
-
91
- def test_creates_new_file(self):
92
- with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
93
- path = Path(f.name)
94
-
95
- path.unlink() # ensure it doesn't exist
96
- results = {col: "" for col in RESULTS_COLUMNS}
97
- results["agent_name"] = "TestBot"
98
- results["games"] = 5
99
- results["score"] = 85.0
100
-
101
- append_results(results, path)
102
-
103
- assert path.exists()
104
- with open(path) as f:
105
- reader = csv.DictReader(f)
106
- rows = list(reader)
107
- assert len(rows) == 1
108
- assert rows[0]["agent_name"] == "TestBot"
109
-
110
- path.unlink()
111
-
112
- def test_appends_to_existing(self):
113
- with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
114
- path = Path(f.name)
115
-
116
- # Write first result
117
- results1 = {col: "" for col in RESULTS_COLUMNS}
118
- results1["agent_name"] = "Bot1"
119
- append_results(results1, path)
120
-
121
- # Write second result
122
- results2 = {col: "" for col in RESULTS_COLUMNS}
123
- results2["agent_name"] = "Bot2"
124
- append_results(results2, path)
125
-
126
- with open(path) as f:
127
- reader = csv.DictReader(f)
128
- rows = list(reader)
129
- assert len(rows) == 2
130
- assert rows[0]["agent_name"] == "Bot1"
131
- assert rows[1]["agent_name"] == "Bot2"
132
-
133
- path.unlink()
134
-
135
- def test_columns_match_expected(self):
136
- assert "agent_name" in RESULTS_COLUMNS
137
- assert "score" in RESULTS_COLUMNS
138
- assert "win_rate" in RESULTS_COLUMNS
139
- assert "replay_url" in RESULTS_COLUMNS
140
- assert len(RESULTS_COLUMNS) == 13
141
-
142
-
143
- class TestScoringUsesUtil:
144
- """Verify scoring uses the single source of truth from openra-rl-util."""
145
-
146
- def test_rubrics_re_exports_util(self):
147
- """rubrics.py should re-export from openra_rl_util."""
148
- from rubrics import compute_composite_score_from_games
149
- from openra_rl_util.rubrics import (
150
- compute_composite_score_from_games as util_fn,
151
- )
152
- assert compute_composite_score_from_games is util_fn
153
-
154
- def test_evaluate_uses_util_scoring(self):
155
- """evaluate.py should not have its own compute_composite_score."""
156
- import evaluate
157
- assert not hasattr(evaluate, "compute_composite_score"), \
158
- "evaluate.py should use compute_composite_score_from_games from Util"
159
-
160
- def test_compute_game_metrics_re_exported(self):
161
- from rubrics import compute_game_metrics
162
- from openra_rl_util.rubrics import compute_game_metrics as util_fn
163
- assert compute_game_metrics is util_fn