yxc20098 committed
Commit f96ea53 · 1 Parent(s): 8475b1b

Add OpenRA-Bench leaderboard, evaluation harness, and rubrics


- Gradio leaderboard app with agent ranking, type/opponent filters, search
- OpenEnv-compatible rubrics (win/loss, military efficiency, economy)
- CLI evaluation harness for running N-game benchmarks
- Seed data with ScriptedBot and LLM-Agent baselines
- GitHub Actions workflow for HuggingFace Space sync
- HF Space-compatible README with YAML frontmatter

Files changed (9)
  1. .github/workflows/sync-to-hf.yml +40 -0
  2. .gitignore +9 -0
  3. README.md +69 -0
  4. app.py +305 -0
  5. data/results.csv +6 -0
  6. data/schema.md +17 -0
  7. evaluate.py +261 -0
  8. requirements.txt +3 -0
  9. rubrics.py +152 -0
.github/workflows/sync-to-hf.yml ADDED
@@ -0,0 +1,40 @@
+ name: Sync to HuggingFace Space
+
+ on:
+   push:
+     branches:
+       - main
+     paths:
+       - 'app.py'
+       - 'requirements.txt'
+       - 'data/**'
+       - 'README.md'
+
+ jobs:
+   sync:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       - name: Push to HuggingFace Hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           # Only sync if HF_TOKEN is configured
+           if [ -z "$HF_TOKEN" ]; then
+             echo "HF_TOKEN not set, skipping sync"
+             exit 0
+           fi
+
+           git config user.name "GitHub Actions"
+           git config user.email "actions@github.com"
+
+           # Add HF remote and push
+           git remote add hf https://huggingface.co/spaces/yxc20089/OpenRA-Bench || true
+           git remote set-url hf https://x-access-token:${HF_TOKEN}@huggingface.co/spaces/yxc20089/OpenRA-Bench
+
+           # Push main branch to HF
+           git push hf main --force
.gitignore ADDED
@@ -0,0 +1,9 @@
+ __pycache__/
+ *.pyc
+ *.egg-info/
+ dist/
+ build/
+ .env
+ .venv/
+ *.orarep
+ flagged/
README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+ title: OpenRA-Bench
+ emoji: 🎮
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: "5.12.0"
+ app_file: app.py
+ pinned: true
+ license: gpl-3.0
+ ---
+
+ # OpenRA-Bench
+
+ Standardized benchmark and leaderboard for AI agents playing Red Alert through [OpenRA-RL](https://openra-rl.dev).
+
+ ## Features
+
+ - **Leaderboard**: Ranked agent comparison with composite scoring
+ - **Filtering**: By agent type (Scripted/LLM/RL) and opponent difficulty
+ - **Evaluation harness**: Automated N-game benchmarking with metrics collection
+ - **OpenEnv rubrics**: Composable scoring (win/loss, military efficiency, economy)
+ - **Replay verification**: Replay files linked to leaderboard entries
+
+ ## Quick Start
+
+ ### View the leaderboard
+
+ ```bash
+ pip install -r requirements.txt
+ python app.py
+ # Opens at http://localhost:7860
+ ```
+
+ ### Run an evaluation
+
+ ```bash
+ # Start OpenRA-RL server
+ cd /path/to/OpenRA-RL
+ docker compose up openra-rl
+
+ # Run benchmark
+ cd /path/to/OpenRA-Bench
+ python evaluate.py \
+     --agent scripted \
+     --agent-name "MyBot-v1" \
+     --opponent Normal \
+     --games 10
+ ```
+
+ ### Submit results
+
+ 1. Fork this repo
+ 2. Run evaluation (appends to `data/results.csv`)
+ 3. Open a PR with your results
+
+ ## Scoring
+
+ | Component | Weight | Description |
+ |-----------|--------|-------------|
+ | Win Rate | 50% | Games won / total games |
+ | Military Efficiency | 25% | Kill/death cost ratio (normalized) |
+ | Economy | 25% | Final asset value (normalized) |
+
+ ## Links
+
+ - [OpenRA-RL Documentation](https://openra-rl.dev)
+ - [OpenRA-RL GitHub](https://github.com/yxc20089/OpenRA-RL)
+ - [OpenEnv Framework](https://huggingface.co/openenv)
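The two "(normalized)" entries in the Scoring table are bounded transforms rather than raw values; a minimal sketch of the composite formula, with the weights from the table above and the normalizations matching `compute_composite_score` in `evaluate.py`:

```python
def composite_score(win_rate: float, kd_ratio: float, economy: float) -> float:
    """win_rate in [0, 1]; kd_ratio and economy are non-negative raw values."""
    # kd / (kd + 1) and assets / (assets + 10000) both map [0, inf) -> [0, 1)
    kd_norm = kd_ratio / (kd_ratio + 1)
    econ_norm = economy / (economy + 10_000)
    return 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)

print(round(composite_score(1.0, 1.0, 10_000), 1))  # → 75.0
```

With a perfect win rate, even trades (K/D = 1), and 10,000 economy, the score is 75.0, since each normalized term sits at exactly 0.5.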
app.py ADDED
@@ -0,0 +1,305 @@
+ """OpenRA-Bench: Agent Leaderboard for OpenRA-RL.
+
+ A Gradio app that displays agent rankings, supports filtering by type
+ and opponent difficulty, and provides submission instructions.
+
+ Run locally:
+     python app.py
+
+ Deploy on HuggingFace Spaces:
+     Push app.py, requirements.txt, data/, and README.md to your HF Space.
+ """
+
+ import os
+ from pathlib import Path
+
+ import gradio as gr
+ import pandas as pd
+
+ # ── Data Loading ──────────────────────────────────────────────────────────────
+
+ DATA_PATH = Path(__file__).parent / "data" / "results.csv"
+
+ AGENT_TYPE_COLORS = {
+     "Scripted": "#ffcd75",  # Gold
+     "LLM": "#7497db",       # Blue
+     "RL": "#75809c",        # Gray-blue
+ }
+
+ DISPLAY_COLUMNS = [
+     "Rank",
+     "Agent",
+     "Type",
+     "Opponent",
+     "Games",
+     "Win Rate (%)",
+     "Score",
+     "K/D Ratio",
+     "Avg Kills",
+     "Avg Deaths",
+     "Avg Economy",
+     "Avg Game Length",
+     "Date",
+ ]
+
+
+ def load_data() -> pd.DataFrame:
+     """Load leaderboard data from CSV."""
+     if not DATA_PATH.exists():
+         return pd.DataFrame(columns=DISPLAY_COLUMNS)
+
+     df = pd.read_csv(DATA_PATH)
+     df = df.sort_values("score", ascending=False).reset_index(drop=True)
+     df.insert(0, "Rank", range(1, len(df) + 1))
+
+     # Rename for display
+     df = df.rename(columns={
+         "agent_name": "Agent",
+         "agent_type": "Type",
+         "opponent": "Opponent",
+         "games": "Games",
+         "win_rate": "Win Rate (%)",
+         "score": "Score",
+         "kd_ratio": "K/D Ratio",
+         "avg_kills": "Avg Kills",
+         "avg_deaths": "Avg Deaths",
+         "avg_economy": "Avg Economy",
+         "avg_game_length": "Avg Game Length",
+         "timestamp": "Date",
+     })
+
+     return df[DISPLAY_COLUMNS]
+
+
+ def add_type_badges(df: pd.DataFrame) -> pd.DataFrame:
+     """Add color-coded HTML badges to the Type column."""
+     def badge(agent_type: str) -> str:
+         color = AGENT_TYPE_COLORS.get(agent_type, "#ccc")
+         text_color = "#fff" if agent_type != "Scripted" else "#333"
+         return (
+             f'<span style="background:{color};color:{text_color};'
+             f'padding:2px 8px;border-radius:4px;font-size:0.85em">'
+             f"{agent_type}</span>"
+         )
+
+     df = df.copy()
+     df["Type"] = df["Type"].apply(badge)
+     return df
+
+
+ # ── Filtering ─────────────────────────────────────────────────────────────────
+
+
+ def filter_leaderboard(
+     search: str,
+     agent_types: list[str],
+     opponent: str,
+ ) -> pd.DataFrame:
+     """Filter leaderboard by search, agent type, and opponent."""
+     df = load_data()
+
+     # Filter by agent type
+     if agent_types:
+         df = df[df["Type"].isin(agent_types)]
+
+     # Filter by opponent
+     if opponent and opponent != "All":
+         df = df[df["Opponent"] == opponent]
+
+     # Search by agent name (regex)
+     if search and search.strip():
+         patterns = [p.strip() for p in search.split(",") if p.strip()]
+         mask = pd.Series([False] * len(df), index=df.index)
+         for pattern in patterns:
+             mask |= df["Agent"].str.contains(pattern, case=False, regex=True, na=False)
+         df = df[mask]
+
+     # Re-rank after filtering
+     df = df.reset_index(drop=True)
+     df["Rank"] = range(1, len(df) + 1)
+
+     return add_type_badges(df)
+
+
+ # ── UI ────────────────────────────────────────────────────────────────────────
+
+ ABOUT_MD = """
+ ## What is OpenRA-Bench?
+
+ **OpenRA-Bench** is a standardized benchmark for evaluating AI agents playing
+ [Red Alert](https://www.openra.net/) through the
+ [OpenRA-RL](https://openra-rl.dev) environment.
+
+ ### Evaluation Protocol
+
+ - **Game**: Red Alert (OpenRA engine)
+ - **Format**: 1v1 agent vs built-in AI
+ - **Opponents**: Easy, Normal, Hard difficulty
+ - **Games per entry**: Minimum 10 games per configuration
+ - **Metrics**: Win rate, composite score, K/D ratio, economy
+
+ ### Composite Score
+
+ The benchmark score combines three components:
+
+ | Component | Weight | Description |
+ |-----------|--------|-------------|
+ | Win Rate | 50% | Percentage of games won |
+ | Military Efficiency | 25% | Kill/death cost ratio (normalized) |
+ | Economy | 25% | Final asset value (normalized) |
+
+ ### Agent Types
+
+ - **Scripted**: Rule-based bots with hardcoded strategies
+ - **LLM**: Language model agents (Claude, GPT, etc.)
+ - **RL**: Reinforcement learning policies (PPO, SAC, etc.)
+
+ ### Links
+
+ - [OpenRA-RL Documentation](https://openra-rl.dev)
+ - [GitHub Repository](https://github.com/yxc20089/OpenRA-RL)
+ - [OpenRA-Bench Source](https://github.com/yxc20089/OpenRA-Bench)
+ - [OpenEnv Framework](https://huggingface.co/openenv)
+ """
+
+ SUBMIT_MD = """
+ ## How to Submit Results
+
+ ### 1. Set up the environment
+
+ ```bash
+ git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
+ cd OpenRA-RL
+ pip install -e .
+ docker compose up openra-rl
+ ```
+
+ ### 2. Run the evaluation
+
+ ```bash
+ cd /path/to/OpenRA-Bench
+
+ python evaluate.py \\
+     --agent scripted \\
+     --agent-name "MyBot-v1" \\
+     --agent-type Scripted \\
+     --opponent Normal \\
+     --games 10 \\
+     --server http://localhost:8000
+ ```
+
+ ### 3. Submit via Pull Request
+
+ 1. Fork [OpenRA-Bench](https://github.com/yxc20089/OpenRA-Bench)
+ 2. Run the evaluation (results append to `data/results.csv`)
+ 3. Commit and open a PR with:
+    - Your updated `data/results.csv`
+    - A description of your agent
+    - (Optional) Replay files in `replays/`
+
+ ### Evaluation Parameters
+
+ | Parameter | Description |
+ |-----------|-------------|
+ | `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
+ | `--agent-name` | Display name on the leaderboard |
+ | `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
+ | `--opponent` | AI difficulty: `Easy`, `Normal`, `Hard` |
+ | `--games` | Number of games (minimum 10) |
+ | `--server` | OpenRA-RL server URL |
+
+ ### Custom Agents
+
+ For custom agents, implement the standard `reset/step` loop:
+
+ ```python
+ from openra_env.client import OpenRAEnv
+ from openra_env.models import OpenRAAction
+
+ async with OpenRAEnv("http://localhost:8000") as env:
+     obs = await env.reset()
+     while not obs.done:
+         action = your_agent.decide(obs)
+         obs = await env.step(action)
+ ```
+
+ Then run `evaluate.py --agent custom` with your agent integrated.
+ """
+
+
+ def build_app() -> gr.Blocks:
+     """Build the Gradio leaderboard app."""
+     initial_df = add_type_badges(load_data())
+
+     with gr.Blocks(title="OpenRA-Bench") as app:
+         gr.Markdown(
+             "# OpenRA-Bench\n"
+             "**Agent Leaderboard for OpenRA-RL** — "
+             "Train AI to Play Real-Time Strategy"
+         )
+
+         with gr.Tabs():
+             # ── Leaderboard Tab ───────────────────────────────────────────
+             with gr.Tab("Leaderboard"):
+                 with gr.Row():
+                     search_box = gr.Textbox(
+                         label="Search agents",
+                         placeholder="Search by name (supports regex, comma-separated)...",
+                         scale=3,
+                     )
+                     type_filter = gr.CheckboxGroup(
+                         choices=["Scripted", "LLM", "RL"],
+                         value=["Scripted", "LLM", "RL"],
+                         label="Agent Type",
+                         scale=2,
+                     )
+                     opponent_filter = gr.Dropdown(
+                         choices=["All", "Easy", "Normal", "Hard"],
+                         value="All",
+                         label="Opponent",
+                         scale=1,
+                     )
+
+                 leaderboard = gr.Dataframe(
+                     value=initial_df,
+                     datatype=[
+                         "number",  # Rank
+                         "str",     # Agent
+                         "html",    # Type (badge)
+                         "str",     # Opponent
+                         "number",  # Games
+                         "number",  # Win Rate
+                         "number",  # Score
+                         "number",  # K/D Ratio
+                         "number",  # Avg Kills
+                         "number",  # Avg Deaths
+                         "number",  # Avg Economy
+                         "number",  # Avg Game Length
+                         "str",     # Date
+                     ],
+                     interactive=False,
+                     show_label=False,
+                 )
+
+                 # Wire up filters
+                 for component in [search_box, type_filter, opponent_filter]:
+                     component.change(
+                         fn=filter_leaderboard,
+                         inputs=[search_box, type_filter, opponent_filter],
+                         outputs=leaderboard,
+                     )
+
+             # ── About Tab ─────────────────────────────────────────────────
+             with gr.Tab("About"):
+                 gr.Markdown(ABOUT_MD)
+
+             # ── Submit Tab ────────────────────────────────────────────────
+             with gr.Tab("Submit"):
+                 gr.Markdown(SUBMIT_MD)
+
+     return app
+
+
+ if __name__ == "__main__":
+     app = build_app()
+     app.launch()
data/results.csv ADDED
@@ -0,0 +1,6 @@
+ agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
+ ScriptedBot-v1,Scripted,Easy,10,90.0,72.5,8450,2100,4.02,12500,1850,2026-02-19,
+ ScriptedBot-v1,Scripted,Normal,10,60.0,52.3,6200,4800,1.29,8200,2400,2026-02-19,
+ ScriptedBot-v1,Scripted,Hard,10,20.0,28.1,3100,7200,0.43,4500,1600,2026-02-19,
+ LLM-Agent-v1,LLM,Easy,10,80.0,65.8,7200,3400,2.12,11000,2200,2026-02-19,
+ LLM-Agent-v1,LLM,Normal,10,50.0,48.7,5800,5200,1.12,7800,2800,2026-02-19,
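The seed rows above are ranked by `score` exactly as `load_data()` in `app.py` does it; a trimmed-column sketch, with the rows inlined so it runs without the `data/` directory:

```python
import io

import pandas as pd

# Seed rows from data/results.csv, reduced to the columns that matter for ranking
CSV = """agent_name,agent_type,opponent,score
ScriptedBot-v1,Scripted,Easy,72.5
ScriptedBot-v1,Scripted,Normal,52.3
ScriptedBot-v1,Scripted,Hard,28.1
LLM-Agent-v1,LLM,Easy,65.8
LLM-Agent-v1,LLM,Normal,48.7
"""

df = pd.read_csv(io.StringIO(CSV))
# Sort by composite score, highest first, then assign 1-based ranks
df = df.sort_values("score", ascending=False).reset_index(drop=True)
df.insert(0, "Rank", range(1, len(df) + 1))
print(df.iloc[0]["agent_name"])  # → ScriptedBot-v1
```

The top-ranked entry is the ScriptedBot-v1 Easy run (score 72.5); note that rows for the same agent at different difficulties rank independently.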
data/schema.md ADDED
@@ -0,0 +1,17 @@
+ # Results CSV Schema
+
+ | Column | Type | Description |
+ |--------|------|-------------|
+ | `agent_name` | str | Agent identifier displayed on leaderboard |
+ | `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
+ | `opponent` | str | AI difficulty: "Easy", "Normal", or "Hard" |
+ | `games` | int | Number of games played (minimum 10) |
+ | `win_rate` | float | Win percentage (0.0 - 100.0) |
+ | `score` | float | Composite benchmark score (0.0 - 100.0) |
+ | `avg_kills` | float | Average enemy cost destroyed per game |
+ | `avg_deaths` | float | Average own cost lost per game |
+ | `kd_ratio` | float | Average kills_cost / deaths_cost ratio |
+ | `avg_economy` | float | Average final assets_value per game |
+ | `avg_game_length` | int | Average game duration in ticks |
+ | `timestamp` | str | Evaluation date (ISO 8601, YYYY-MM-DD) |
+ | `replay_url` | str | URL to replay file(s), empty if none |
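A hypothetical row validator (not part of this commit) makes the schema's constraints concrete; the column names, categories, and ranges come from the table above:

```python
def validate_row(row: dict) -> list[str]:
    """Return a list of schema violations for one results.csv row (empty = valid)."""
    errors = []
    if row.get("agent_type") not in {"Scripted", "LLM", "RL"}:
        errors.append("agent_type must be Scripted, LLM, or RL")
    if row.get("opponent") not in {"Easy", "Normal", "Hard"}:
        errors.append("opponent must be Easy, Normal, or Hard")
    try:
        if int(row.get("games", 0)) < 10:
            errors.append("games must be at least 10")
    except ValueError:
        errors.append("games must be an integer")
    for col in ("win_rate", "score"):
        try:
            if not 0.0 <= float(row.get(col, -1)) <= 100.0:
                errors.append(f"{col} must be in [0, 100]")
        except ValueError:
            errors.append(f"{col} must be a float")
    return errors

# A well-formed row (values taken from the seed data) passes cleanly
row = {"agent_type": "LLM", "opponent": "Normal", "games": "10",
       "win_rate": "50.0", "score": "48.7"}
print(validate_row(row))  # → []
```

A CI check along these lines could gate leaderboard PRs before they are merged.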
evaluate.py ADDED
@@ -0,0 +1,261 @@
+ #!/usr/bin/env python3
+ r"""OpenRA-Bench evaluation harness.
+
+ Runs N games of an agent against a built-in AI opponent, collects metrics,
+ and appends aggregate results to data/results.csv.
+
+ Usage:
+     # Start the OpenRA-RL server first:
+     docker compose up openra-rl
+
+     # Run evaluation:
+     python evaluate.py \
+         --agent scripted \
+         --agent-name "ScriptedBot-v1" \
+         --opponent Hard \
+         --games 10 \
+         --server http://localhost:8000
+
+     # Dry run (validate args without connecting):
+     python evaluate.py --dry-run --agent-name "Test" --games 5
+ """
+
+ import argparse
+ import asyncio
+ import csv
+ import os
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ # Evaluation results file
+ RESULTS_FILE = Path(__file__).parent / "data" / "results.csv"
+
+ RESULTS_COLUMNS = [
+     "agent_name",
+     "agent_type",
+     "opponent",
+     "games",
+     "win_rate",
+     "score",
+     "avg_kills",
+     "avg_deaths",
+     "kd_ratio",
+     "avg_economy",
+     "avg_game_length",
+     "timestamp",
+     "replay_url",
+ ]
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(
+         description="OpenRA-Bench: Evaluate agents against AI opponents"
+     )
+     parser.add_argument(
+         "--agent",
+         choices=["scripted", "llm", "mcp", "custom"],
+         default="scripted",
+         help="Agent type to run (default: scripted)",
+     )
+     parser.add_argument(
+         "--agent-name",
+         required=True,
+         help="Name for this agent on the leaderboard",
+     )
+     parser.add_argument(
+         "--agent-type",
+         choices=["Scripted", "LLM", "RL"],
+         help="Leaderboard category (auto-detected from --agent if not set)",
+     )
+     parser.add_argument(
+         "--opponent",
+         choices=["Easy", "Normal", "Hard"],
+         default="Normal",
+         help="AI opponent difficulty (default: Normal)",
+     )
+     parser.add_argument(
+         "--games",
+         type=int,
+         default=10,
+         help="Number of games to play (default: 10)",
+     )
+     parser.add_argument(
+         "--server",
+         default="http://localhost:8000",
+         help="OpenRA-RL server URL (default: http://localhost:8000)",
+     )
+     parser.add_argument(
+         "--max-steps",
+         type=int,
+         default=5000,
+         help="Max steps per game before timeout (default: 5000)",
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Validate arguments and show what would run, without connecting",
+     )
+     parser.add_argument(
+         "--output",
+         type=Path,
+         default=RESULTS_FILE,
+         help=f"Output CSV path (default: {RESULTS_FILE})",
+     )
+     args = parser.parse_args()
+
+     # Auto-detect agent type
+     if args.agent_type is None:
+         type_map = {"scripted": "Scripted", "llm": "LLM", "mcp": "Scripted", "custom": "RL"}
+         args.agent_type = type_map[args.agent]
+
+     return args
+
+
+ async def run_game(env: Any, agent_fn: Any, max_steps: int) -> Dict[str, Any]:
+     """Run a single game and return metrics.
+
+     Args:
+         env: OpenRAEnv client instance.
+         agent_fn: Callable(obs) -> action.
+         max_steps: Maximum steps before timeout.
+
+     Returns:
+         Dict with game metrics (from rubrics.compute_game_metrics).
+     """
+     from rubrics import compute_game_metrics
+
+     obs = await env.reset()
+     steps = 0
+
+     while not obs.done and steps < max_steps:
+         action = agent_fn(obs)
+         obs = await env.step(action)
+         steps += 1
+
+     return compute_game_metrics(obs)
+
+
+ def get_agent_fn(agent_type: str) -> Any:
+     """Get the agent decision function for the specified type.
+
+     Returns a callable that takes an observation and returns an action.
+     """
+     if agent_type == "scripted":
+         # Import inline to avoid hard dependency
+         from openra_env.models import OpenRAAction
+         # Simple no-op agent for evaluation framework testing
+         # Replace with actual ScriptedBot integration
+         return lambda obs: OpenRAAction(commands=[])
+     else:
+         from openra_env.models import OpenRAAction
+         return lambda obs: OpenRAAction(commands=[])
+
+
+ async def run_evaluation(args: argparse.Namespace) -> Dict[str, Any]:
+     """Run the full evaluation: N games, collect metrics, compute aggregates."""
+     from openra_env.client import OpenRAEnv
+
+     agent_fn = get_agent_fn(args.agent)
+     game_results: List[Dict[str, Any]] = []
+
+     async with OpenRAEnv(args.server) as env:
+         for i in range(args.games):
+             print(f"  Game {i + 1}/{args.games}...", end=" ", flush=True)
+             metrics = await run_game(env, agent_fn, args.max_steps)
+             game_results.append(metrics)
+             result_str = metrics["result"] or "timeout"
+             print(f"{result_str} (ticks: {metrics['ticks']}, K/D: {metrics['kd_ratio']:.1f})")
+
+     # Aggregate results
+     wins = sum(1 for g in game_results if g["win"])
+     total = len(game_results)
+
+     return {
+         "agent_name": args.agent_name,
+         "agent_type": args.agent_type,
+         "opponent": args.opponent,
+         "games": total,
+         "win_rate": round(100.0 * wins / max(total, 1), 1),
+         "score": round(compute_composite_score(game_results), 1),
+         "avg_kills": round(sum(g["kills_cost"] for g in game_results) / max(total, 1)),
+         "avg_deaths": round(sum(g["deaths_cost"] for g in game_results) / max(total, 1)),
+         "kd_ratio": round(
+             sum(g["kd_ratio"] for g in game_results) / max(total, 1), 2
+         ),
+         "avg_economy": round(
+             sum(g["assets_value"] for g in game_results) / max(total, 1)
+         ),
+         "avg_game_length": round(
+             sum(g["ticks"] for g in game_results) / max(total, 1)
+         ),
+         "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
+         "replay_url": "",
+     }
+
+
+ def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
+     """Compute the OpenRA-Bench composite score.
+
+     Score = 50% win_rate + 25% avg_kd_normalized + 25% avg_economy_normalized
+     """
+     total = len(game_results)
+     if total == 0:
+         return 0.0
+
+     win_rate = sum(1 for g in game_results if g["win"]) / total
+
+     # K/D ratio normalized: kd / (kd + 1) maps [0, inf) -> [0, 1)
+     avg_kd = sum(g["kd_ratio"] for g in game_results) / total
+     kd_norm = avg_kd / (avg_kd + 1)
+
+     # Economy normalized: assets / (assets + 10000)
+     avg_assets = sum(g["assets_value"] for g in game_results) / total
+     econ_norm = avg_assets / (avg_assets + 10000) if avg_assets >= 0 else 0.0
+
+     return 100.0 * (0.5 * win_rate + 0.25 * kd_norm + 0.25 * econ_norm)
+
+
+ def append_results(results: Dict[str, Any], output_path: Path) -> None:
+     """Append evaluation results to CSV file."""
+     file_exists = output_path.exists() and output_path.stat().st_size > 0
+
+     with open(output_path, "a", newline="") as f:
+         writer = csv.DictWriter(f, fieldnames=RESULTS_COLUMNS)
+         if not file_exists:
+             writer.writeheader()
+         writer.writerow(results)
+
+
+ def main() -> None:
+     args = parse_args()
+
+     print("OpenRA-Bench Evaluation")
+     print(f"  Agent: {args.agent_name} ({args.agent_type})")
+     print(f"  Opponent: {args.opponent}")
+     print(f"  Games: {args.games}")
+     print(f"  Server: {args.server}")
+     print()
+
+     if args.dry_run:
+         print("[DRY RUN] Would run evaluation with the above settings.")
+         print(f"[DRY RUN] Results would be written to: {args.output}")
+         return
+
+     results = asyncio.run(run_evaluation(args))
+
+     print()
+     print("Results:")
+     print(f"  Win Rate: {results['win_rate']}%")
+     print(f"  Score: {results['score']}")
+     print(f"  K/D Ratio: {results['kd_ratio']}")
+     print(f"  Avg Economy: {results['avg_economy']}")
+     print(f"  Avg Game Length: {results['avg_game_length']} ticks")
+
+     append_results(results, args.output)
+     print(f"\nResults appended to {args.output}")
+
+
+ if __name__ == "__main__":
+     main()
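`append_results()` relies on a common `csv.DictWriter` pattern: write the header only when the file is missing or empty, so repeated benchmark runs accumulate rows under a single header. A self-contained sketch of that pattern against a temp file (the two-column layout is just for illustration):

```python
import csv
import tempfile
from pathlib import Path

COLUMNS = ["agent_name", "score"]

def append_row(path: Path, row: dict) -> None:
    # Header goes in only once: when the file doesn't exist or is empty
    has_data = path.exists() and path.stat().st_size > 0
    with open(path, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=COLUMNS)
        if not has_data:
            writer.writeheader()
        writer.writerow(row)

with tempfile.TemporaryDirectory() as d:
    out = Path(d) / "results.csv"
    append_row(out, {"agent_name": "A", "score": 50.0})
    append_row(out, {"agent_name": "B", "score": 60.0})
    lines = out.read_text().splitlines()
    print(lines)  # → ['agent_name,score', 'A,50.0', 'B,60.0']
```

The size check (rather than a bare existence check) also handles the case where an earlier run created the file but crashed before writing anything.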
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio>=4.44.0
+ pandas>=2.0.0
+ openenv-core>=0.2.0
rubrics.py ADDED
@@ -0,0 +1,152 @@
+ """OpenRA-Bench rubrics for agent evaluation.
+
+ Follows the OpenEnv rubric pattern (see openenv.core.rubrics).
+ These rubrics score game episodes based on win/loss, military efficiency,
+ and economic performance.
+
+ Usage:
+     rubric = OpenRABenchRubric()
+     rubric.reset()
+     for action, obs in episode:
+         reward = rubric(action, obs)  # 0.0 until done
+     step_rewards = rubric.win_loss.compute_step_rewards()
+ """
+
+ from typing import Any, Dict, List, Tuple
+
+ from openenv.core.rubrics import (
+     ExponentialDiscountingTrajectoryRubric,
+     TrajectoryRubric,
+     WeightedSum,
+ )
+
+
+ class OpenRAWinLossRubric(ExponentialDiscountingTrajectoryRubric):
+     """Score game based on win/loss/draw outcome with temporal discounting.
+
+     Terminal rewards:
+         - Win: +1.0
+         - Loss: -1.0
+         - Draw: 0.0
+     """
+
+     def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
+         if not trajectory:
+             return 0.0
+         _, final_obs = trajectory[-1]
+         result = getattr(final_obs, "result", "")
+         if result == "win":
+             return 1.0
+         elif result == "lose":
+             return -1.0
+         return 0.0
+
+
+ class MilitaryEfficiencyRubric(TrajectoryRubric):
+     """Score based on kill/death cost ratio from final observation.
+
+     Score = kills_cost / max(kills_cost + deaths_cost, 1)
+     Normalized to 0.0-1.0 range.
+     """
+
+     def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
+         if not trajectory:
+             return 0.0
+         _, final_obs = trajectory[-1]
+         military = getattr(final_obs, "military", None)
+         if military is None:
+             return 0.0
+         kills = getattr(military, "kills_cost", 0)
+         deaths = getattr(military, "deaths_cost", 0)
+         total = kills + deaths
+         if total == 0:
+             return 0.5  # No combat occurred
+         return kills / total
+
+     def compute_step_rewards(self) -> List[float]:
+         if not self._trajectory:
+             return []
+         score = self.score_trajectory(self._trajectory)
+         return [score] * len(self._trajectory)
+
+
+ class EconomyRubric(TrajectoryRubric):
+     """Score based on final economic state.
+
+     Score = assets_value / (assets_value + 10000)
+     Sigmoid-like normalization to 0.0-1.0 range.
+     """
+
+     def score_trajectory(self, trajectory: List[Tuple[Any, Any]]) -> float:
+         if not trajectory:
+             return 0.0
+         _, final_obs = trajectory[-1]
+         military = getattr(final_obs, "military", None)
+         if military is None:
+             return 0.0
+         assets = getattr(military, "assets_value", 0)
+         # Sigmoid normalization: maps [0, inf) -> [0, 1)
+         return assets / (assets + 10000) if assets >= 0 else 0.0
+
+     def compute_step_rewards(self) -> List[float]:
+         if not self._trajectory:
+             return []
+         score = self.score_trajectory(self._trajectory)
+         return [score] * len(self._trajectory)
+
+
+ class OpenRABenchRubric(WeightedSum):
+     """Composite benchmark score combining win/loss, military, and economy.
+
+     Weights: 50% win/loss, 25% military efficiency, 25% economy.
+     """
+
+     def __init__(self, gamma: float = 0.99):
+         win_loss = OpenRAWinLossRubric(gamma=gamma)
+         military = MilitaryEfficiencyRubric()
+         economy = EconomyRubric()
+         super().__init__(
+             rubrics=[win_loss, military, economy],
+             weights=[0.5, 0.25, 0.25],
+         )
+         # Keep named references for direct access
+         self.win_loss = win_loss
+         self.military = military
+         self.economy = economy
+
+     def reset(self) -> None:
+         self.win_loss.reset()
+         self.military.reset()
+         self.economy.reset()
+
+
+ def compute_game_metrics(final_obs: Any) -> Dict[str, Any]:
+     """Extract benchmark metrics from a final game observation.
+
+     Args:
+         final_obs: The terminal GameObservation (where done=True).
+
+     Returns:
+         Dict with keys: result, ticks, kills_cost, deaths_cost,
+         kd_ratio, assets_value, cash, win (bool).
+     """
+     military = getattr(final_obs, "military", None)
+     economy = getattr(final_obs, "economy", None)
+
+     kills = getattr(military, "kills_cost", 0) if military else 0
+     deaths = getattr(military, "deaths_cost", 0) if military else 0
+     assets = getattr(military, "assets_value", 0) if military else 0
+     cash = getattr(economy, "cash", 0) if economy else 0
+     result = getattr(final_obs, "result", "")
+     tick = getattr(final_obs, "tick", 0)
+
+     return {
+         "result": result,
+         "win": result == "win",
+         "ticks": tick,
+         "kills_cost": kills,
+         "deaths_cost": deaths,
+         "kd_ratio": kills / max(deaths, 1),
+         "assets_value": assets,
+         "cash": cash,
+     }
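Because `compute_game_metrics()` reads every field defensively via `getattr`, a mock observation is enough to exercise it. The sketch below restates the relevant logic standalone so it runs without `openenv` installed; the field names (`result`, `military.kills_cost`, `military.deaths_cost`) mirror those used in `rubrics.py`:

```python
from types import SimpleNamespace
from typing import Any, Dict

def game_metrics(final_obs: Any) -> Dict[str, Any]:
    # Standalone restatement of compute_game_metrics' defensive getattr reads
    military = getattr(final_obs, "military", None)
    kills = getattr(military, "kills_cost", 0) if military else 0
    deaths = getattr(military, "deaths_cost", 0) if military else 0
    result = getattr(final_obs, "result", "")
    return {
        "result": result,
        "win": result == "win",
        "kd_ratio": kills / max(deaths, 1),  # max(..., 1) guards a zero-death game
    }

# A terminal observation mocked with SimpleNamespace
obs = SimpleNamespace(
    result="win",
    military=SimpleNamespace(kills_cost=8000, deaths_cost=2000),
)
print(game_metrics(obs)["kd_ratio"])  # → 4.0
```

The same mocking approach works for a bare `SimpleNamespace()` with no fields at all, which falls back to the defaults (empty result, K/D of 0.0) instead of raising.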