Spaces:

openra-rl
/

OpenRA-Bench

Running

yxc20098 committed 15 days ago

Commit

45ef63c

1 Parent(s): 8ce66d2

Add upload form, API endpoint, 5 difficulty tiers, real game data

- Add validate_submission + handle_api_submit for CLI/form uploads
- Add Gradio upload form in Submit tab (JSON + optional replay)
- Update opponent tiers: Beginner/Easy/Medium/Normal/Hard (was 3)
- Replace dummy CSV data with real game result
- Update schema.md with 5-tier opponent values
- 37 tests pass (12 new: validation, API submit, tier acceptance)

Files changed (7) hide show

app.py +185 -37
data/results.csv +1 -5
data/schema.md +2 -2
evaluate.py +1 -1
evaluate_runner.py +1 -1
tests/test_app.py +103 -6
tests/test_evaluate.py +14 -0

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ from pathlib import Path
 import gradio as gr
 import pandas as pd
-from evaluate_runner import DEFAULT_SERVER
 # ── Data Loading ──────────────────────────────────────────────────────────────
@@ -170,6 +170,127 @@ def save_submission(results: dict) -> None:
         writer.writerow(results)
 # ── UI ────────────────────────────────────────────────────────────────────────
 ABOUT_MD = """
@@ -183,7 +304,7 @@ ABOUT_MD = """
 - **Game**: Red Alert (OpenRA engine)
 - **Format**: 1v1 agent vs built-in AI
-- **Opponents**: Easy, Normal, Hard difficulty
 - **Games per entry**: Minimum 10 games per configuration
 - **Metrics**: Win rate, composite score, K/D ratio, economy
@@ -213,37 +334,36 @@ The benchmark score combines three components:
 """
 SUBMIT_MD = """
-## How to Submit Results
-### Option A: Watch AI Play (no setup needed)
-Visit the [OpenRA-RL Space](https://huggingface.co/spaces/openra-rl/openra-rl)
-and click **Try** to watch a pre-configured LLM agent play Red Alert
-directly in your browser. No API keys or setup required.
-### Option B: CLI with HuggingFace-hosted server (no Docker needed)
-```bash
-git clone https://github.com/yxc20089/OpenRA-Bench.git
-cd OpenRA-Bench
-pip install -r requirements.txt
-pip install openra-rl openra-rl-util
-python evaluate.py \\
-    --agent scripted \\
-    --agent-name "MyBot-v1" \\
-    --agent-type Scripted \\
-    --opponent Normal \\
-    --games 10 \\
-    --server https://openra-rl-openra-rl.hf.space
 ```
-### Option C: Local server (Docker)**
 ```bash
-git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
-cd OpenRA-RL && pip install -e . && docker compose up openra-rl
-cd /path/to/OpenRA-Bench
 python evaluate.py \\
     --agent scripted \\
@@ -254,15 +374,6 @@ python evaluate.py \\
     --server http://localhost:8000
 ```
-### 3. Submit via Pull Request
-1. Fork [OpenRA-Bench](https://github.com/yxc20089/OpenRA-Bench)
-2. Run the evaluation (results append to `data/results.csv`)
-3. Commit and open a PR with:
-   - Your updated `data/results.csv`
-   - A description of your agent
-   - (Optional) Replay files in `replays/`
 ### Evaluation Parameters
 | Parameter | Description |
@@ -270,19 +381,19 @@ python evaluate.py \\
 | `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
 | `--agent-name` | Display name on the leaderboard |
 | `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
-| `--opponent` | AI difficulty: `Easy`, `Normal`, `Hard` |
 | `--games` | Number of games (minimum 10) |
 | `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
 ### Custom Agents
-For custom agents, implement the standard `reset/step` loop:
 ```python
 from openra_env.client import OpenRAEnv
 from openra_env.models import OpenRAAction
-async with OpenRAEnv("https://openra-rl-openra-rl.hf.space") as env:
     obs = await env.reset()
     while not obs.done:
         action = your_agent.decide(obs)
@@ -320,7 +431,7 @@ def build_app() -> gr.Blocks:
                         scale=2,
                     )
                     opponent_filter = gr.Dropdown(
-                        choices=["All", "Easy", "Normal", "Hard"],
                         value="All",
                         label="Opponent",
                         scale=1,
@@ -361,6 +472,43 @@ def build_app() -> gr.Blocks:
             # ── Submit Tab ────────────────────────────────────────────────
             with gr.Tab("Submit"):
                 gr.Markdown(SUBMIT_MD)
     return app

 import gradio as gr
 import pandas as pd
+from evaluate_runner import DEFAULT_SERVER, compute_composite_score, compute_game_metrics
 # ── Data Loading ──────────────────────────────────────────────────────────────
         writer.writerow(results)
+# ── Submission Handling ───────────────────────────────────────────────────────
VALID_OPPONENTS = {"Beginner", "Easy", "Medium", "Normal", "Hard"}
VALID_AGENT_TYPES = {"Scripted", "LLM", "RL"}
REQUIRED_FIELDS = [
    "agent_name", "agent_type", "opponent", "result",
    "ticks", "kills_cost", "deaths_cost", "assets_value",
]
# Required fields that must hold numeric values (they become leaderboard statistics).
_NUMERIC_FIELDS = ("ticks", "kills_cost", "deaths_cost", "assets_value")


def validate_submission(data: dict) -> tuple[bool, str]:
    """Validate an uploaded JSON submission.

    Checks, in order: the payload is a JSON object, every entry of
    ``REQUIRED_FIELDS`` is present, ``agent_name`` is a non-empty string,
    ``agent_type`` and ``opponent`` are among the accepted values, and the
    numeric fields hold finite numbers.

    Args:
        data: The decoded JSON payload. Anything that is not a ``dict`` is
            rejected instead of raising.

    Returns:
        ``(is_valid, error_message)``; ``error_message`` is ``""`` when the
        submission is valid.
    """
    # json.loads may legally produce a list, number, string or None.
    if not isinstance(data, dict):
        return False, (
            f"Submission must be a JSON object, got {type(data).__name__}"
        )

    for field in REQUIRED_FIELDS:
        if field not in data:
            return False, f"Missing required field: {field}"

    agent_name = data["agent_name"]
    if not isinstance(agent_name, str) or not agent_name.strip():
        return False, "Invalid agent_name: must be a non-empty string"

    # The isinstance check runs first so that unhashable values (lists, dicts)
    # are rejected cleanly instead of raising TypeError on set membership.
    agent_type = data["agent_type"]
    if not isinstance(agent_type, str) or agent_type not in VALID_AGENT_TYPES:
        return False, (
            f"Invalid agent_type: {agent_type}. "
            f"Must be one of: {', '.join(sorted(VALID_AGENT_TYPES))}"
        )

    opponent = data["opponent"]
    if not isinstance(opponent, str) or opponent not in VALID_OPPONENTS:
        return False, (
            f"Invalid opponent: {opponent}. "
            f"Must be one of: {', '.join(sorted(VALID_OPPONENTS))}"
        )

    for field in _NUMERIC_FIELDS:
        value = data[field]
        # bool is a subclass of int but is never a meaningful statistic.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        # The chained comparison is False for NaN and for +/- infinity.
        if not is_number or not float("-inf") < value < float("inf"):
            return False, (
                f"Invalid {field}: must be a finite number, got {value!r}"
            )

    return True, ""
def _score_from_submission(data: dict) -> dict:
    """Build a CSV-ready results dict from a validated submission.

    The submission's metrics are packed into one game record and passed, as a
    one-element list, to ``compute_composite_score``; the returned row carries
    that score together with the leaderboard columns.

    Args:
        data: A submission that already passed ``validate_submission``.

    Returns:
        A dict keyed by the leaderboard CSV columns (``agent_name`` through
        ``replay_url``); ``replay_url`` is left empty for the caller to fill.
    """
    kills = data.get("kills_cost", 0)
    deaths = data.get("deaths_cost", 0)
    games = data.get("games", 1)
    # Dividing by at least 1 avoids ZeroDivisionError for a deathless game.
    derived_kd = kills / max(deaths, 1)

    # Resolve the win flag once so the composite score and the win_rate column
    # cannot disagree when the export carries "result" but no "win" key.
    win = data.get("win", data.get("result") == "win")

    game_result = {
        "result": data.get("result", ""),
        "win": win,
        "ticks": data.get("ticks", 0),
        "kills_cost": kills,
        "deaths_cost": deaths,
        # Fall back to the ratio derived from the cost totals so the scorer is
        # given the same K/D the row reports when the export omits kd_ratio.
        "kd_ratio": data.get("kd_ratio", derived_kd),
        "assets_value": data.get("assets_value", 0),
        "cash": data.get("cash", 0),
    }
    score = compute_composite_score([game_result])

    # Accept a missing, null or non-string timestamp; only the date part
    # (first 10 characters) is kept.
    timestamp = data.get("timestamp") or datetime.now(timezone.utc).strftime("%Y-%m-%d")

    return {
        "agent_name": data["agent_name"],
        "agent_type": data["agent_type"],
        "opponent": data["opponent"],
        "games": games,
        "win_rate": round(100.0 * (1 if win else 0) / max(games, 1), 1),
        "score": round(score, 1),
        "avg_kills": kills,
        "avg_deaths": deaths,
        "kd_ratio": round(derived_kd, 2),
        "avg_economy": data.get("assets_value", 0),
        "avg_game_length": data.get("ticks", 0),
        "timestamp": str(timestamp)[:10],
        "replay_url": "",
    }
def handle_upload(json_file, replay_file) -> tuple[str, pd.DataFrame]:
    """Process an uploaded bench submission JSON + optional replay.

    Args:
        json_file: The uploaded JSON, either as a path string or as an object
            whose ``name`` attribute is the path; ``None`` when nothing was
            uploaded.
        replay_file: Optional replay upload in the same form, or ``None``.

    Returns:
        ``(status_markdown, leaderboard_dataframe)``. The dataframe is always
        the freshly reloaded leaderboard, whether or not the upload succeeded.
    """
    if json_file is None:
        return "Please upload a JSON file.", add_type_badges(load_data())

    # Accept both a bare path and a file-like wrapper exposing ``.name``.
    json_path = getattr(json_file, "name", json_file)
    try:
        # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
        # json.load would otherwise reject.
        with open(json_path, encoding="utf-8-sig") as f:
            data = json.load(f)
    except (OSError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return f"Invalid JSON: {e}", add_type_badges(load_data())

    # Valid JSON can still be a list, number, string or null.
    if not isinstance(data, dict):
        return (
            "Validation error: submission must be a JSON object",
            add_type_badges(load_data()),
        )

    is_valid, error = validate_submission(data)
    if not is_valid:
        return f"Validation error: {error}", add_type_badges(load_data())

    results_row = _score_from_submission(data)

    # Save replay if provided
    if replay_file is not None:
        import shutil

        replay_path = Path(getattr(replay_file, "name", replay_file))
        # Only the base name is kept, so the upload cannot choose a directory.
        replay_name = replay_path.name
        shutil.copy2(replay_path, SUBMISSIONS_DIR / replay_name)
        results_row["replay_url"] = replay_name

    save_submission(results_row)
    return (
        f"Submitted! **{data['agent_name']}** ({data['agent_type']}) "
        f"vs {data['opponent']}: score **{results_row['score']}**",
        add_type_badges(load_data()),
    )
def handle_api_submit(json_data: str) -> str:
    """API endpoint: accept JSON string submission. Used by CLI auto-upload.

    Args:
        json_data: The submission serialized as a JSON object.

    Returns:
        A plain-text status line: ``"OK: ..."`` once the row has been saved,
        otherwise a message starting with ``"Invalid JSON"`` or
        ``"Validation error"``.
    """
    try:
        data = json.loads(json_data)
    except (TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a payload
        # that is not str/bytes (e.g. None).
        return f"Invalid JSON: {e}"

    # Valid JSON can still be a list, number, string or null.
    if not isinstance(data, dict):
        return "Validation error: submission must be a JSON object"

    is_valid, error = validate_submission(data)
    if not is_valid:
        return f"Validation error: {error}"

    results_row = _score_from_submission(data)
    save_submission(results_row)
    return (
        f"OK: {data['agent_name']} ({data['agent_type']}) "
        f"vs {data['opponent']}: score {results_row['score']}"
    )
 # ── UI ────────────────────────────────────────────────────────────────────────
 ABOUT_MD = """
 - **Game**: Red Alert (OpenRA engine)
 - **Format**: 1v1 agent vs built-in AI
+- **Opponents**: Beginner, Easy, Medium, Normal, Hard difficulty
 - **Games per entry**: Minimum 10 games per configuration
 - **Metrics**: Win rate, composite score, K/D ratio, economy
 """
 SUBMIT_MD = """
+---
+## Other Submission Methods
+### CLI Auto-Upload
+Set `BENCH_URL` in your OpenRA-RL config and results upload automatically
+after each game:
+```yaml
+# config.yaml
+agent:
+  bench_url: "https://openra-rl-openra-bench.hf.space"
+```
+### CLI Manual Upload
+Upload a previously exported bench JSON:
+```bash
+python -m openra_env.bench_submit ~/.openra-rl/bench-exports/bench-*.json
 ```
+### Batch Evaluation (10+ games)
 ```bash
+git clone https://github.com/yxc20089/OpenRA-Bench.git
+cd OpenRA-Bench
+pip install -r requirements.txt
+pip install openra-rl openra-rl-util
 python evaluate.py \\
     --agent scripted \\
     --server http://localhost:8000
 ```
 ### Evaluation Parameters
 | Parameter | Description |
 | `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
 | `--agent-name` | Display name on the leaderboard |
 | `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
+| `--opponent` | AI difficulty: `Beginner`, `Easy`, `Medium`, `Normal`, `Hard` |
 | `--games` | Number of games (minimum 10) |
 | `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
 ### Custom Agents
+Implement the standard `reset/step` loop:
 ```python
 from openra_env.client import OpenRAEnv
 from openra_env.models import OpenRAAction
+async with OpenRAEnv("http://localhost:8000") as env:
     obs = await env.reset()
     while not obs.done:
         action = your_agent.decide(obs)
                         scale=2,
                     )
                     opponent_filter = gr.Dropdown(
+                        choices=["All", "Beginner", "Easy", "Medium", "Normal", "Hard"],
                         value="All",
                         label="Opponent",
                         scale=1,
             # ── Submit Tab ────────────────────────────────────────────────
             with gr.Tab("Submit"):
+                gr.Markdown(
+                    "## Upload Results\n\n"
+                    "Upload a bench export JSON from your OpenRA-RL game. "
+                    "After each game, the agent saves a JSON file to "
+                    "`~/.openra-rl/bench-exports/`."
+                )
+                with gr.Row():
+                    json_upload = gr.File(
+                        label="Bench export JSON",
+                        file_types=[".json"],
+                        scale=3,
+                    )
+                    replay_upload = gr.File(
+                        label="Replay file (optional)",
+                        file_types=[".orarep"],
+                        scale=2,
+                    )
+                submit_btn = gr.Button("Submit Results", variant="primary")
+                submit_output = gr.Markdown()
+                submit_btn.click(
+                    fn=handle_upload,
+                    inputs=[json_upload, replay_upload],
+                    outputs=[submit_output, leaderboard],
+                )
+                # API endpoint for CLI auto-upload
+                api_json_input = gr.Textbox(visible=False)
+                api_result = gr.Textbox(visible=False)
+                api_btn = gr.Button(visible=False)
+                api_btn.click(
+                    fn=handle_api_submit,
+                    inputs=[api_json_input],
+                    outputs=[api_result],
+                    api_name="submit",
+                )
                 gr.Markdown(SUBMIT_MD)
     return app

data/results.csv CHANGED Viewed

@@ -1,6 +1,2 @@
 agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
-ScriptedBot-v1,Scripted,Easy,10,90.0,72.5,8450,2100,4.02,12500,1850,2026-02-19,
-ScriptedBot-v1,Scripted,Normal,10,60.0,52.3,6200,4800,1.29,8200,2400,2026-02-19,
-ScriptedBot-v1,Scripted,Hard,10,20.0,28.1,3100,7200,0.43,4500,1600,2026-02-19,
-LLM-Agent-v1,LLM,Easy,10,80.0,65.8,7200,3400,2.12,11000,2200,2026-02-19,
-LLM-Agent-v1,LLM,Normal,10,50.0,48.7,5800,5200,1.12,7800,2800,2026-02-19,


1	agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
2	+ qwen/qwen3-coder-next,LLM,Beginner,1,0.0,18.3,1000,2900,0.34,9050,27349,2026-02-25,

data/schema.md CHANGED Viewed

@@ -4,8 +4,8 @@
 |--------|------|-------------|
 | `agent_name` | str | Agent identifier displayed on leaderboard |
 | `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
-| `opponent` | str | AI difficulty: "Easy", "Normal", or "Hard" |
-| `games` | int | Number of games played (minimum 10) |
 | `win_rate` | float | Win percentage (0.0 - 100.0) |
 | `score` | float | Composite benchmark score (0.0 - 100.0) |
 | `avg_kills` | float | Average enemy cost destroyed per game |

 |--------|------|-------------|
 | `agent_name` | str | Agent identifier displayed on leaderboard |
 | `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
+| `opponent` | str | AI difficulty: "Beginner", "Easy", "Medium", "Normal", or "Hard" |
+| `games` | int | Number of games played |
 | `win_rate` | float | Win percentage (0.0 - 100.0) |
 | `score` | float | Composite benchmark score (0.0 - 100.0) |
 | `avg_kills` | float | Average enemy cost destroyed per game |

evaluate.py CHANGED Viewed

@@ -81,7 +81,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--opponent",
-        choices=["Easy", "Normal", "Hard"],
         default="Normal",
         help="AI opponent difficulty (default: Normal)",
     )

     )
     parser.add_argument(
         "--opponent",
+        choices=["Beginner", "Easy", "Medium", "Normal", "Hard"],
         default="Normal",
         help="AI opponent difficulty (default: Normal)",
     )

evaluate_runner.py CHANGED Viewed

@@ -128,7 +128,7 @@ async def run_evaluation(
     Args:
         agent_name: Display name for the leaderboard.
-        opponent: AI difficulty (Easy/Normal/Hard).
         num_games: Number of games to play.
         server_url: OpenRA-RL server URL.
         on_game_done: Optional callback(game_num, total, metrics) after each game.

     Args:
         agent_name: Display name for the leaderboard.
+        opponent: AI difficulty (Beginner/Easy/Medium/Normal/Hard).
         num_games: Number of games to play.
         server_url: OpenRA-RL server URL.
         on_game_done: Optional callback(game_num, total, metrics) after each game.

tests/test_app.py CHANGED Viewed

@@ -12,10 +12,13 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from app import (
     AGENT_TYPE_COLORS,
     DISPLAY_COLUMNS,
     add_type_badges,
     build_app,
     filter_leaderboard,
     load_data,
 )
@@ -80,16 +83,19 @@ class TestFilter:
         assert isinstance(df, pd.DataFrame)
     def test_search_filters_by_name(self):
-        df = filter_leaderboard("ScriptedBot", [], "All")
-        # If there are results, they should contain "ScriptedBot"
         if len(df) > 0:
-            # Badges are in the Type column, not Agent
-            assert all("ScriptedBot" in str(row) for row in df["Agent"])
     def test_opponent_filter(self):
-        df = filter_leaderboard("", [], "Hard")
         if len(df) > 0:
-            assert all(df["Opponent"] == "Hard")
 class TestBuildApp:
@@ -98,3 +104,94 @@ class TestBuildApp:
     def test_builds_without_error(self):
         app = build_app()
         assert app is not None

 from app import (
     AGENT_TYPE_COLORS,
     DISPLAY_COLUMNS,
+    VALID_OPPONENTS,
     add_type_badges,
     build_app,
     filter_leaderboard,
+    handle_api_submit,
     load_data,
+    validate_submission,
 )
         assert isinstance(df, pd.DataFrame)
     def test_search_filters_by_name(self):
+        df = filter_leaderboard("qwen", [], "All")
         if len(df) > 0:
+            assert all("qwen" in str(row).lower() for row in df["Agent"])
     def test_opponent_filter(self):
+        df = filter_leaderboard("", [], "Beginner")
         if len(df) > 0:
+            assert all(df["Opponent"] == "Beginner")
+    def test_opponent_filter_hard(self):
+        df = filter_leaderboard("", [], "Hard")
+        # May be empty if no Hard entries exist
+        assert isinstance(df, pd.DataFrame)
 class TestBuildApp:
     def test_builds_without_error(self):
         app = build_app()
         assert app is not None
class TestValidateSubmission:
    """Accept/reject decisions made by validate_submission."""

    def _valid_data(self):
        """Return a fresh submission dict that contains every required field."""
        return dict(
            agent_name="TestBot",
            agent_type="LLM",
            opponent="Beginner",
            result="loss",
            ticks=27000,
            kills_cost=1000,
            deaths_cost=2900,
            assets_value=9050,
        )

    def test_valid_submission(self):
        is_valid, message = validate_submission(self._valid_data())
        assert is_valid
        assert message == ""

    def test_missing_field(self):
        is_valid, message = validate_submission({"agent_name": "Bot"})
        assert not is_valid
        assert "Missing required field" in message

    def test_invalid_opponent(self):
        submission = self._valid_data()
        submission["opponent"] = "Brutal"
        is_valid, message = validate_submission(submission)
        assert not is_valid
        assert "Invalid opponent" in message

    def test_invalid_agent_type(self):
        submission = self._valid_data()
        submission["agent_type"] = "MCTS"
        is_valid, message = validate_submission(submission)
        assert not is_valid
        assert "Invalid agent_type" in message

    def test_all_opponents_accepted(self):
        for opp in VALID_OPPONENTS:
            submission = self._valid_data()
            submission["opponent"] = opp
            is_valid, _ = validate_submission(submission)
            assert is_valid, f"Opponent '{opp}' should be valid"

    def test_all_agent_types_accepted(self):
        for at in ["Scripted", "LLM", "RL"]:
            submission = self._valid_data()
            submission["agent_type"] = at
            is_valid, _ = validate_submission(submission)
            assert is_valid, f"Agent type '{at}' should be valid"
class TestApiSubmit:
    """Test API submission handler."""

    def test_valid_json(self):
        """A complete submission is accepted and echoed in the OK message."""
        import json
        import tempfile

        data = {
            "agent_name": "TestBot",
            "agent_type": "LLM",
            "opponent": "Easy",
            "result": "win",
            "win": True,
            "ticks": 5000,
            "kills_cost": 3000,
            "deaths_cost": 1000,
            "assets_value": 8000,
        }
        # Use a temp CSV to avoid polluting real data
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
            temp_path = Path(f.name)
        try:
            with patch("app.DATA_PATH", temp_path):
                result = handle_api_submit(json.dumps(data))
            assert "OK" in result
            assert "TestBot" in result
        finally:
            # Remove the temp CSV even when the handler raises or an
            # assertion fails, so failed runs do not leave files behind.
            temp_path.unlink(missing_ok=True)

    def test_invalid_json(self):
        """Text that is not JSON is reported rather than raised."""
        result = handle_api_submit("not json")
        assert "Invalid JSON" in result

    def test_missing_fields(self):
        """A JSON object lacking required fields yields a validation error."""
        import json

        result = handle_api_submit(json.dumps({"agent_name": "Bot"}))
        assert "Validation error" in result

tests/test_evaluate.py CHANGED Viewed

@@ -72,6 +72,20 @@ class TestParseArgs:
             args = parse_args()
             assert args.agent_type == "RL"
 class TestGetAgentFn:
     """Test agent factory."""

             args = parse_args()
             assert args.agent_type == "RL"
+    def test_beginner_opponent_accepted(self):
+        with patch("sys.argv", [
+            "evaluate.py", "--agent-name", "T", "--opponent", "Beginner",
+        ]):
+            args = parse_args()
+            assert args.opponent == "Beginner"
+    def test_medium_opponent_accepted(self):
+        with patch("sys.argv", [
+            "evaluate.py", "--agent-name", "T", "--opponent", "Medium",
+        ]):
+            args = parse_args()
+            assert args.opponent == "Medium"
 class TestGetAgentFn:
     """Test agent factory."""