Spaces:

openra-rl
/

OpenRA-Bench

Running

File size: 5,737 Bytes

"""In-browser evaluation runner for OpenRA-Bench.

Runs games against the OpenRA-RL server via HTTP REST API (POST /reset, /step).
No openra-rl/openenv imports — avoids websockets version conflicts with Gradio.

Scoring logic inlined from openra_rl_util.rubrics (which has zero dependencies).
"""

import time
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional

import httpx

# HuggingFace-hosted OpenRA-RL environment; used as the default `server_url`
# for run_evaluation().
DEFAULT_SERVER = "https://openra-rl-openra-rl.hf.space"
# Default cap on /step requests per game (run_single_game's `max_steps`), so a
# game that never reports a result still terminates.
MAX_STEPS_PER_GAME = 5000
# Timeout handed to the httpx.AsyncClient that run_evaluation() creates
# (httpx interprets a bare number as seconds).
STEP_TIMEOUT = 60.0


# ── Scoring (inlined from openra_rl_util/rubrics.py) ────────────────────────


def compute_game_metrics(obs: Dict[str, Any]) -> Dict[str, Any]:
    """Build the per-game benchmark record from a final observation.

    Args:
        obs: Observation dict. The keys read are ``"military"`` (with
            ``kills_cost``, ``deaths_cost``, ``assets_value``), ``"economy"``
            (with ``cash``), ``"result"`` and ``"tick"``. Missing keys fall
            back to ``0`` / ``""``; a missing or falsy ``military`` /
            ``economy`` section is treated as empty.

    Returns:
        Dict with the keys ``result``, ``win``, ``ticks``, ``kills_cost``,
        ``deaths_cost``, ``kd_ratio``, ``assets_value`` and ``cash``.
    """
    # `or {}` also covers sections that are present but None/empty.
    military_stats = obs.get("military") or {}
    economy_stats = obs.get("economy") or {}

    outcome = obs.get("result", "")
    kills_cost = military_stats.get("kills_cost", 0)
    deaths_cost = military_stats.get("deaths_cost", 0)

    metrics: Dict[str, Any] = {
        "result": outcome,
        "win": outcome == "win",
        "ticks": obs.get("tick", 0),
        "kills_cost": kills_cost,
        "deaths_cost": deaths_cost,
        # Denominator is clamped to 1 so a game with no losses cannot divide
        # by zero.
        "kd_ratio": kills_cost / max(deaths_cost, 1),
        "assets_value": military_stats.get("assets_value", 0),
        "cash": economy_stats.get("cash", 0),
    }
    return metrics


def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
    """Return the OpenRA-Bench composite score for a set of games.

    The score is ``100 * (0.5 * win_rate + 0.25 * military + 0.25 * economy)``
    where each component is averaged over all games.

    Args:
        game_results: Per-game metric dicts providing ``win``, ``kills_cost``,
            ``deaths_cost`` and ``assets_value``.

    Returns:
        The composite score, or ``0.0`` when no games are given.
    """
    n_games = len(game_results)
    if not n_games:
        return 0.0

    def military_component(game: Dict[str, Any]) -> float:
        # Share of the total destroyed value that this side inflicted;
        # a game with no combat at all scores a neutral 0.5.
        inflicted = game["kills_cost"]
        exchanged = inflicted + game["deaths_cost"]
        if exchanged > 0:
            return inflicted / exchanged
        return 0.5

    def economy_component(game: Dict[str, Any]) -> float:
        # Saturating curve: 10000 in assets maps to 0.5; negative values
        # score zero.
        holdings = game["assets_value"]
        if holdings >= 0:
            return holdings / (holdings + 10000)
        return 0.0

    games_won = len([game for game in game_results if game["win"]])
    win_rate = games_won / n_games
    avg_mil = sum([military_component(game) for game in game_results]) / n_games
    avg_econ = sum([economy_component(game) for game in game_results]) / n_games

    return 100.0 * (0.5 * win_rate + 0.25 * avg_mil + 0.25 * avg_econ)


# ── Server communication ────────────────────────────────────────────────────


def wake_hf_space(server_url: str, max_wait: int = 120) -> str:
    """Poll a HuggingFace Space until it answers, waking it if it is asleep.

    URLs that do not contain ``.hf.space`` are treated as local servers and
    are not contacted at all.

    Args:
        server_url: Base URL of the environment server.
        max_wait: Upper bound, in seconds, on how long to keep polling.

    Returns:
        A human-readable status message: the skip notice for local servers,
        a "ready" message once a GET returns HTTP 200, or a warning if the
        deadline passes first.
    """
    if ".hf.space" not in server_url:
        return "Local server, skipping wake."

    # Monotonic clock: the wait budget must not be affected by wall-clock
    # adjustments (NTP sync, DST, manual changes).
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            resp = httpx.get(server_url, timeout=10, follow_redirects=True)
        except httpx.HTTPError:
            # Best effort: connection errors and timeouts are expected while
            # the Space is still booting, so just try again.
            pass
        else:
            if resp.status_code == 200:
                return "Environment server is ready."

        # Never sleep past the deadline; stop right away once it has passed.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(5, remaining))
    return "Warning: server may still be starting."


async def run_single_game(
    client: httpx.AsyncClient,
    server_url: str,
    max_steps: int = MAX_STEPS_PER_GAME,
) -> Dict[str, Any]:
    """Play one game over the HTTP REST API and return its metrics.

    The environment is reset with ``POST /reset``, then ``POST /step`` is
    called with an empty command list until the observation carries a truthy
    ``"result"`` or ``max_steps`` steps have been sent.

    Args:
        client: Async HTTP client used for all requests.
        server_url: Base URL of the OpenRA-RL server (no trailing slash is
            added or removed here).
        max_steps: Maximum number of ``/step`` requests before giving up.

    Returns:
        The output of ``compute_game_metrics`` for the last observation seen.
        If the step cap was hit, that observation has no result and the game
        therefore does not count as a win.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful HTTP
        response, and ``KeyError`` if a reply lacks ``"observation"``.
    """
    reset_url = f"{server_url}/reset"
    step_url = f"{server_url}/step"

    # Start a fresh episode.
    reply = await client.post(reset_url, json={})
    reply.raise_for_status()
    observation = reply.json()["observation"]

    steps_taken = 0
    while True:
        # Stop once the game reports an outcome or the step budget is spent.
        if observation.get("result") or steps_taken >= max_steps:
            break

        # Scripted no-op agent: every step submits an empty command list.
        reply = await client.post(step_url, json={"action": {"commands": []}})
        reply.raise_for_status()
        observation = reply.json()["observation"]
        steps_taken += 1

    return compute_game_metrics(observation)


async def run_evaluation(
    agent_name: str,
    opponent: str,
    num_games: int,
    server_url: str = DEFAULT_SERVER,
    on_game_done: Optional[Callable[[int, int, Dict], None]] = None,
) -> Dict[str, Any]:
    """Play ``num_games`` games sequentially and aggregate the results.

    Args:
        agent_name: Display name for the leaderboard.
        opponent: AI difficulty label (Beginner/Easy/Medium/Normal/Hard).
            It is only copied into the returned record; this function does
            not send it to the server.
        num_games: Number of games to play.
        server_url: OpenRA-RL server URL.
        on_game_done: Optional callback invoked synchronously as
            ``on_game_done(game_num, total, metrics)`` after each game, with
            ``game_num`` starting at 1.

    Returns:
        Dict with all fields needed for results.csv: identification fields,
        rounded win rate / composite score / per-game averages, a UTC date
        stamp and an empty ``replay_url``.
    """
    completed: List[Dict[str, Any]] = []

    # One shared client for the whole run; it is closed when the block exits.
    async with httpx.AsyncClient(timeout=STEP_TIMEOUT) as session:
        for index in range(num_games):
            game_metrics = await run_single_game(session, server_url)
            completed.append(game_metrics)
            if on_game_done:
                on_game_done(index + 1, num_games, game_metrics)

    played = len(completed)
    # Clamp so that a run with zero games yields zeros instead of dividing
    # by zero.
    divisor = max(played, 1)

    def mean_of(field: str) -> float:
        # Average of one metric across all completed games.
        return sum(game[field] for game in completed) / divisor

    win_count = sum(1 for game in completed if game["win"])
    today_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    summary: Dict[str, Any] = {
        "agent_name": agent_name,
        "agent_type": "Scripted",
        "opponent": opponent,
        "games": played,
        "win_rate": round(100.0 * win_count / divisor, 1),
        "score": round(compute_composite_score(completed), 1),
        "avg_kills": round(mean_of("kills_cost")),
        "avg_deaths": round(mean_of("deaths_cost")),
        "kd_ratio": round(mean_of("kd_ratio"), 2),
        "avg_economy": round(mean_of("assets_value")),
        "avg_game_length": round(mean_of("ticks")),
        "timestamp": today_utc,
        "replay_url": "",
    }
    return summary