Spaces:

openra-rl
/

OpenRA-Bench

Running

yxc20098 committed 16 days ago

Commit

824262a

1 Parent(s): 44493a3

Add in-browser evaluation via Evaluate tab

Users can now run scripted agent evaluations directly from the
leaderboard UI against the HF-hosted OpenRA-RL environment.
No Docker or local setup needed.

- evaluate_runner.py: self-contained eval via HTTP REST (no
openra-rl/openenv imports, avoids websockets conflict)
- Evaluate tab: form with agent name, opponent, game count
- Results saved to CSV and JSONL (CommitScheduler on HF)
- Submit tab updated with in-browser option

Files changed (4) hide show

.gitignore +1 -0
app.py +188 -9
evaluate_runner.py +171 -0
requirements.txt +2 -0

.gitignore CHANGED Viewed

@@ -7,3 +7,4 @@ build/
 .venv/
 *.orarep
 flagged/

 .venv/
 *.orarep
 flagged/
+submissions/

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """OpenRA-Bench: Agent Leaderboard for OpenRA-RL.
 A Gradio app that displays agent rankings, supports filtering by type
-and opponent difficulty, and provides submission instructions.
 Run locally:
     python app.py
@@ -10,12 +10,18 @@ Deploy on HuggingFace Spaces:
     Push app.py, requirements.txt, data/, and README.md to your HF Space.
 """
 import os
 from pathlib import Path
 import gradio as gr
 import pandas as pd
 # ── Data Loading ──────────────────────────────────────────────────────────────
 DATA_PATH = Path(__file__).parent / "data" / "results.csv"
@@ -121,6 +127,134 @@ def filter_leaderboard(
     return add_type_badges(df)
 # ── UI ────────────────────────────────────────────────────────────────────────
 ABOUT_MD = """
@@ -166,20 +300,19 @@ The benchmark score combines three components:
 SUBMIT_MD = """
 ## How to Submit Results
-### 1. Set up the evaluation harness
 ```bash
 git clone https://github.com/yxc20089/OpenRA-Bench.git
 cd OpenRA-Bench
 pip install -r requirements.txt
 pip install openra-rl openra-rl-util
-```
-### 2. Run the evaluation
-**Option A: HuggingFace-hosted (no Docker needed)**
-```bash
 python evaluate.py \\
     --agent scripted \\
     --agent-name "MyBot-v1" \\
@@ -189,7 +322,7 @@ python evaluate.py \\
     --server https://openra-rl-openra-rl.hf.space
 ```
-**Option B: Local server (Docker)**
 ```bash
 git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
@@ -306,6 +439,52 @@ def build_app() -> gr.Blocks:
                         outputs=leaderboard,
                     )
             # ── About Tab ─────────────────────────────────────────────────
             with gr.Tab("About"):
                 gr.Markdown(ABOUT_MD)

 """OpenRA-Bench: Agent Leaderboard for OpenRA-RL.
 A Gradio app that displays agent rankings, supports filtering by type
+and opponent difficulty, and lets users run evaluations in-browser.
 Run locally:
     python app.py
     Push app.py, requirements.txt, data/, and README.md to your HF Space.
 """
+import asyncio
+import csv
+import json
 import os
+from datetime import datetime, timezone
 from pathlib import Path
 import gradio as gr
 import pandas as pd
+from evaluate_runner import DEFAULT_SERVER, run_evaluation, wake_hf_space
 # ── Data Loading ──────────────────────────────────────────────────────────────
 DATA_PATH = Path(__file__).parent / "data" / "results.csv"
     return add_type_badges(df)
+# ── Result Persistence ────────────────────────────────────────────────────────
+# Local folder that receives every in-browser submission.
+SUBMISSIONS_DIR = Path(__file__).parent / "submissions"
+SUBMISSIONS_DIR.mkdir(exist_ok=True)
+# CommitScheduler periodically pushes SUBMISSIONS_DIR to an HF dataset repo.
+# It is only enabled when both HF_TOKEN and SPACE_ID are set (i.e. on HF
+# Spaces); otherwise results stay on local disk and _scheduler stays None.
+_scheduler = None
+if os.environ.get("HF_TOKEN") and os.environ.get("SPACE_ID"):
+    try:
+        from huggingface_hub import CommitScheduler
+        _scheduler = CommitScheduler(
+            repo_id="openra-rl/bench-results",
+            repo_type="dataset",
+            folder_path=str(SUBMISSIONS_DIR),
+            every=5,  # minutes between background commits
+            token=os.environ["HF_TOKEN"],
+        )
+    except Exception as exc:
+        # Best effort: the app must still start if the Hub is unreachable or
+        # the token is rejected, but the failure should show up in the logs
+        # instead of silently disabling remote persistence.
+        import logging
+        logging.getLogger(__name__).warning(
+            "CommitScheduler disabled; submissions stay local: %s", exc
+        )
+def save_submission(results: dict) -> None:
+    """Persist one evaluation result to the JSONL log and the leaderboard CSV.
+    Args:
+        results: Flat mapping of leaderboard fields. Keys that are not CSV
+            columns are kept in the JSONL record but left out of the CSV row;
+            missing columns become empty CSV cells.
+    """
+    # Function-scope import keeps this block self-contained.
+    from contextlib import nullcontext
+    fieldnames = [
+        "agent_name", "agent_type", "opponent", "games", "win_rate",
+        "score", "avg_kills", "avg_deaths", "kd_ratio", "avg_economy",
+        "avg_game_length", "timestamp", "replay_url",
+    ]
+    # JSONL for CommitScheduler → HF dataset
+    jsonl_path = SUBMISSIONS_DIR / "results.jsonl"
+    # The scheduler uploads SUBMISSIONS_DIR from a background thread; holding
+    # its lock while appending keeps a half-written line from being pushed.
+    write_guard = _scheduler.lock if _scheduler is not None else nullcontext()
+    with write_guard:
+        with open(jsonl_path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(results) + "\n")
+    # Also append to data/results.csv for the leaderboard
+    csv_path = DATA_PATH
+    # A fresh checkout may not have the data/ directory yet.
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+    file_exists = csv_path.exists() and csv_path.stat().st_size > 0
+    with open(csv_path, "a", newline="", encoding="utf-8") as f:
+        # extrasaction="ignore": an unexpected key must not raise after the
+        # JSONL record has already been written.
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+        if not file_exists:
+            writer.writeheader()
+        writer.writerow(results)
+# ── Evaluation Handler ────────────────────────────────────────────────────────
+def run_eval_sync(agent_name: str, opponent: str, num_games: int):
+    """Run an evaluation and yield ``(log_text, results_df, summary_md)`` updates.
+    Written as a generator so the progress log can be streamed to the UI.
+    Per-game lines are collected by a callback and appended only after the
+    whole run has finished, because ``asyncio.run`` blocks until then.
+    Args:
+        agent_name: Display name for the leaderboard; must not be blank.
+        opponent: Opponent difficulty label recorded with the results.
+        num_games: Number of games to play; must be at least 1.
+    Yields:
+        Tuples of (progress log text, one-row results DataFrame or None,
+        Markdown summary or ""). The DataFrame and summary are only filled
+        in by the final update of a successful run.
+    """
+    if not agent_name or not agent_name.strip():
+        yield "Error: Please enter an agent name.", None, ""
+        return
+    agent_name = agent_name.strip()
+    try:
+        num_games = int(num_games)
+    except (TypeError, ValueError):
+        yield "Error: Number of games must be a whole number.", None, ""
+        return
+    if num_games < 1:
+        # Zero games would otherwise append an all-zero row to the leaderboard.
+        yield "Error: Number of games must be at least 1.", None, ""
+        return
+    log_lines = []
+    def log(msg: str):
+        """Append a line to the progress log and return the full log text."""
+        log_lines.append(msg)
+        return "\n".join(log_lines)
+    # Wake server
+    yield log(f"Connecting to {DEFAULT_SERVER}..."), None, ""
+    status = wake_hf_space(DEFAULT_SERVER)
+    yield log(status), None, ""
+    # Track per-game progress
+    game_log = []
+    def on_game_done(game_num, total, metrics):
+        """Record a finished game; an empty result is reported as a timeout."""
+        game_log.append({
+            "Game": game_num,
+            "Result": metrics["result"] or "timeout",
+            "K/D": round(metrics["kd_ratio"], 1),
+            "Ticks": metrics["ticks"],
+        })
+    yield log(f"Running {num_games} game(s) vs {opponent} AI..."), None, ""
+    try:
+        results = asyncio.run(
+            run_evaluation(
+                agent_name=agent_name,
+                opponent=opponent,
+                num_games=num_games,
+                server_url=DEFAULT_SERVER,
+                on_game_done=on_game_done,
+            )
+        )
+    except Exception as e:
+        # UI boundary: report any failure in the log instead of crashing.
+        yield log(f"Error: {e}"), None, ""
+        return
+    # Format output
+    for g in game_log:
+        log(f"  Game {g['Game']}: {g['Result']} (K/D: {g['K/D']}, ticks: {g['Ticks']})")
+    # Save results. A persistence failure must not hide results the user has
+    # already waited for, so it is downgraded to a warning in the log.
+    try:
+        save_submission(results)
+    except (OSError, TypeError, ValueError) as e:
+        log(f"Warning: results could not be saved: {e}")
+    log("\nEvaluation complete!")
+    summary = (
+        f"### Results: {agent_name}\n\n"
+        f"| Metric | Value |\n|--------|-------|\n"
+        f"| **Score** | **{results['score']}** |\n"
+        f"| Win Rate | {results['win_rate']}% |\n"
+        f"| K/D Ratio | {results['kd_ratio']} |\n"
+        f"| Avg Economy | {results['avg_economy']} |\n"
+        f"| Games | {results['games']} vs {results['opponent']} |\n"
+    )
+    results_df = pd.DataFrame([{
+        "Agent": results["agent_name"],
+        "Type": results["agent_type"],
+        "Opponent": results["opponent"],
+        "Games": results["games"],
+        "Win Rate (%)": results["win_rate"],
+        "Score": results["score"],
+        "K/D Ratio": results["kd_ratio"],
+    }])
+    yield "\n".join(log_lines), results_df, summary
 # ── UI ────────────────────────────────────────────────────────────────────────
 ABOUT_MD = """
 SUBMIT_MD = """
 ## How to Submit Results
+### Option A: In-Browser (no setup needed)
+Use the **Evaluate** tab to run a scripted agent directly from your browser.
+Results are saved to the leaderboard automatically.
+### Option B: CLI with HuggingFace-hosted server (no Docker needed)
 ```bash
 git clone https://github.com/yxc20089/OpenRA-Bench.git
 cd OpenRA-Bench
 pip install -r requirements.txt
 pip install openra-rl openra-rl-util
 python evaluate.py \\
     --agent scripted \\
     --agent-name "MyBot-v1" \\
     --server https://openra-rl-openra-rl.hf.space
 ```
+### Option C: Local server (Docker)
 ```bash
 git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
                         outputs=leaderboard,
                     )
+            # ── Evaluate Tab ──────────────────────────────────────────────
+            with gr.Tab("Evaluate"):
+                gr.Markdown(
+                    "## Run Evaluation\n\n"
+                    "Run a scripted agent against the HuggingFace-hosted "
+                    "OpenRA-RL environment. No Docker or local setup needed."
+                )
+                with gr.Row():
+                    eval_name = gr.Textbox(
+                        label="Agent Name",
+                        placeholder="e.g. MyBot-v1",
+                        scale=2,
+                    )
+                    eval_opponent = gr.Dropdown(
+                        choices=["Easy", "Normal", "Hard"],
+                        value="Normal",
+                        label="Opponent",
+                        scale=1,
+                    )
+                    eval_games = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=3,
+                        step=1,
+                        label="Games",
+                        scale=1,
+                    )
+                eval_btn = gr.Button("Run Evaluation", variant="primary")
+                eval_log = gr.Textbox(
+                    label="Progress",
+                    lines=10,
+                    interactive=False,
+                )
+                eval_results = gr.Dataframe(
+                    label="Game Results",
+                    interactive=False,
+                )
+                eval_summary = gr.Markdown()
+                eval_btn.click(
+                    fn=run_eval_sync,
+                    inputs=[eval_name, eval_opponent, eval_games],
+                    outputs=[eval_log, eval_results, eval_summary],
+                )
             # ── About Tab ─────────────────────────────────────────────────
             with gr.Tab("About"):
                 gr.Markdown(ABOUT_MD)

evaluate_runner.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""In-browser evaluation runner for OpenRA-Bench.
+Runs games against the OpenRA-RL server via HTTP REST API (POST /reset, /step).
+No openra-rl/openenv imports — avoids websockets version conflicts with Gradio.
+Scoring logic inlined from openra_rl_util.rubrics (which has zero dependencies).
+"""
+import time
+from datetime import datetime, timezone
+from typing import Any, Callable, Dict, List, Optional
+import httpx
+# HuggingFace-hosted OpenRA-RL environment
+DEFAULT_SERVER = "https://openra-rl-openra-rl.hf.space"
+# Upper bound on /step requests per game; a game still unfinished at that
+# point is scored from its last observation.
+MAX_STEPS_PER_GAME = 5000
+# Timeout handed to httpx.AsyncClient for every request (seconds, per httpx).
+STEP_TIMEOUT = 60.0
+# ── Scoring (inlined from openra_rl_util/rubrics.py) ────────────────────────
+def compute_game_metrics(obs: Dict[str, Any]) -> Dict[str, Any]:
+    """Summarise a final observation dict into per-game benchmark metrics.
+    Missing or empty ``military`` / ``economy`` sections are treated as empty
+    dicts, and every missing counter defaults to 0.
+    Args:
+        obs: Final observation of a game.
+    Returns:
+        Dict with ``result``, ``win``, ``ticks``, ``kills_cost``,
+        ``deaths_cost``, ``kd_ratio``, ``assets_value`` and ``cash``.
+    """
+    mil_stats = obs.get("military") or {}
+    econ_stats = obs.get("economy") or {}
+    outcome = obs.get("result", "")
+    kills_cost = mil_stats.get("kills_cost", 0)
+    deaths_cost = mil_stats.get("deaths_cost", 0)
+    metrics: Dict[str, Any] = {"result": outcome, "win": outcome == "win"}
+    metrics["ticks"] = obs.get("tick", 0)
+    metrics["kills_cost"] = kills_cost
+    metrics["deaths_cost"] = deaths_cost
+    # Denominator is clamped to 1 so a game without losses cannot divide by zero.
+    metrics["kd_ratio"] = kills_cost / max(deaths_cost, 1)
+    metrics["assets_value"] = mil_stats.get("assets_value", 0)
+    metrics["cash"] = econ_stats.get("cash", 0)
+    return metrics
+def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
+    """Return the OpenRA-Bench composite score on a 0-100 scale.
+    The score weights win rate at 50%, the mean military share at 25% and the
+    mean economy share at 25%. An empty list scores 0.0.
+    """
+    n_games = len(game_results)
+    if n_games == 0:
+        return 0.0
+    def military_share(game: Dict[str, Any]) -> float:
+        # Share of kills in all combat cost; neutral 0.5 when nothing was fought.
+        kills, deaths = game["kills_cost"], game["deaths_cost"]
+        combined = kills + deaths
+        return kills / combined if combined > 0 else 0.5
+    def economy_share(game: Dict[str, Any]) -> float:
+        # Saturating curve: 10000 in assets maps to 0.5; negatives score 0.0.
+        assets = game["assets_value"]
+        return assets / (assets + 10000) if assets >= 0 else 0.0
+    win_rate = sum(1 for game in game_results if game["win"]) / n_games
+    avg_mil = sum(military_share(game) for game in game_results) / n_games
+    avg_econ = sum(economy_share(game) for game in game_results) / n_games
+    return 100.0 * (0.5 * win_rate + 0.25 * avg_mil + 0.25 * avg_econ)
+# ── Server communication ────────────────────────────────────────────────────
+def wake_hf_space(server_url: str, max_wait: int = 120) -> str:
+    """Poll a HuggingFace Space until it answers, and describe the outcome.
+    URLs that do not contain ``.hf.space`` are treated as local servers and
+    are not polled.
+    Args:
+        server_url: Base URL of the environment server.
+        max_wait: Polling budget in seconds.
+    Returns:
+        A human-readable status message.
+    """
+    if ".hf.space" not in server_url:
+        return "Local server, skipping wake."
+    deadline = time.time() + max_wait
+    while time.time() < deadline:
+        try:
+            reply = httpx.get(server_url, timeout=10, follow_redirects=True)
+            is_up = reply.status_code == 200
+        except httpx.HTTPError:
+            # Connection errors and timeouts are expected while the Space boots.
+            is_up = False
+        if is_up:
+            return "Environment server is ready."
+        time.sleep(5)
+    return "Warning: server may still be starting."
+async def run_single_game(
+    client: httpx.AsyncClient,
+    server_url: str,
+    max_steps: int = MAX_STEPS_PER_GAME,
+) -> Dict[str, Any]:
+    """Play one game over the HTTP REST API and return its metrics.
+    The environment is reset, then stepped with empty command lists until the
+    observation carries a truthy ``result`` or ``max_steps`` steps were sent.
+    Raises:
+        httpx.HTTPStatusError: If the server answers with an error status.
+    """
+    async def fetch_observation(path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+        # POST to the server and unwrap the "observation" field of the reply.
+        reply = await client.post(f"{server_url}{path}", json=payload)
+        reply.raise_for_status()
+        return reply.json()["observation"]
+    # Reset environment
+    obs = await fetch_observation("/reset", {})
+    for _ in range(max_steps):
+        if obs.get("result"):
+            break
+        # Scripted no-op agent: send empty commands
+        obs = await fetch_observation("/step", {"action": {"commands": []}})
+    return compute_game_metrics(obs)
+async def run_evaluation(
+    agent_name: str,
+    opponent: str,
+    num_games: int,
+    server_url: str = DEFAULT_SERVER,
+    on_game_done: Optional[Callable[[int, int, Dict], None]] = None,
+) -> Dict[str, Any]:
+    """Play ``num_games`` games in sequence and aggregate them into one row.
+    Args:
+        agent_name: Display name for the leaderboard.
+        opponent: AI difficulty (Easy/Normal/Hard). It is only recorded in the
+            returned row; this function does not send it to the server.
+        num_games: Number of games to play.
+        server_url: OpenRA-RL server URL.
+        on_game_done: Optional callback(game_num, total, metrics) after each game.
+    Returns:
+        Dict with all fields needed for results.csv. ``agent_type`` is always
+        "Scripted" and ``timestamp`` is the current UTC date (YYYY-MM-DD).
+    """
+    per_game: List[Dict[str, Any]] = []
+    async with httpx.AsyncClient(timeout=STEP_TIMEOUT) as client:
+        for game_num in range(1, num_games + 1):
+            metrics = await run_single_game(client, server_url)
+            per_game.append(metrics)
+            if on_game_done:
+                on_game_done(game_num, num_games, metrics)
+    played = len(per_game)
+    # Clamp the divisor so a run with no games yields zeros instead of raising.
+    divisor = max(played, 1)
+    def mean_of(key: str) -> float:
+        return sum(game[key] for game in per_game) / divisor
+    wins = sum(1 for game in per_game if game["win"])
+    return {
+        "agent_name": agent_name,
+        "agent_type": "Scripted",
+        "opponent": opponent,
+        "games": played,
+        "win_rate": round(100.0 * wins / divisor, 1),
+        "score": round(compute_composite_score(per_game), 1),
+        "avg_kills": round(mean_of("kills_cost")),
+        "avg_deaths": round(mean_of("deaths_cost")),
+        "kd_ratio": round(mean_of("kd_ratio"), 2),
+        "avg_economy": round(mean_of("assets_value")),
+        "avg_game_length": round(mean_of("ticks")),
+        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
+        "replay_url": "",
+    }

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 gradio>=4.44.0
 pandas>=2.0.0

 gradio>=4.44.0
 pandas>=2.0.0
+httpx>=0.24.0
+huggingface_hub>=0.20.0