Spaces:
Running
Running
| """In-browser evaluation runner for OpenRA-Bench. | |
| Runs games against the OpenRA-RL server via HTTP REST API (POST /reset, /step). | |
| No openra-rl/openenv imports — avoids websockets version conflicts with Gradio. | |
| Scoring logic inlined from openra_rl_util.rubrics (which has zero dependencies). | |
| """ | |
| import time | |
| from datetime import datetime, timezone | |
| from typing import Any, Callable, Dict, List, Optional | |
| import httpx | |
| # HuggingFace-hosted OpenRA-RL environment | |
| DEFAULT_SERVER = "https://openra-rl-openra-rl.hf.space" | |
| MAX_STEPS_PER_GAME = 5000 | |
| STEP_TIMEOUT = 60.0 | |
| # ── Scoring (inlined from openra_rl_util/rubrics.py) ──────────────────────── | |
def compute_game_metrics(obs: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten a final game observation into benchmark metrics.

    A missing (or ``None``) ``military`` / ``economy`` section is treated as
    empty, and any absent numeric field defaults to ``0``.

    Args:
        obs: Observation dict for the last step of a game.

    Returns:
        Dict with the keys ``result``, ``win``, ``ticks``, ``kills_cost``,
        ``deaths_cost``, ``kd_ratio``, ``assets_value`` and ``cash``.
    """
    mil_section = obs.get("military") or {}
    econ_section = obs.get("economy") or {}

    kills_cost = mil_section.get("kills_cost", 0)
    deaths_cost = mil_section.get("deaths_cost", 0)
    outcome = obs.get("result", "")

    metrics: Dict[str, Any] = {
        "result": outcome,
        "win": outcome == "win",
        "ticks": obs.get("tick", 0),
        "kills_cost": kills_cost,
        "deaths_cost": deaths_cost,
    }
    # Clamp the denominator to 1 so a game without losses does not divide by zero.
    metrics["kd_ratio"] = kills_cost / max(deaths_cost, 1)
    metrics["assets_value"] = mil_section.get("assets_value", 0)
    metrics["cash"] = econ_section.get("cash", 0)
    return metrics
def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
    """Return the OpenRA-Bench composite score on a 0-100 scale.

    The score is a weighted blend of three per-run averages:
    50% win rate, 25% military efficiency and 25% economy.

    Args:
        game_results: Per-game metric dicts; each must provide ``win``,
            ``kills_cost``, ``deaths_cost`` and ``assets_value``.

    Returns:
        The composite score, or ``0.0`` when no games were played.
    """
    n_games = len(game_results)
    if not n_games:
        return 0.0

    def _military_share(game: Dict[str, Any]) -> float:
        # Share of total combat cost that was inflicted on the enemy;
        # a game with no combat at all counts as neutral (0.5).
        inflicted, suffered = game["kills_cost"], game["deaths_cost"]
        combined = inflicted + suffered
        if combined > 0:
            return inflicted / combined
        return 0.5

    def _economy_share(game: Dict[str, Any]) -> float:
        # Saturating curve: 10000 in assets maps to 0.5; negatives map to 0.
        value = game["assets_value"]
        if value >= 0:
            return value / (value + 10000)
        return 0.0

    win_rate = len([g for g in game_results if g["win"]]) / n_games
    avg_mil = sum([_military_share(g) for g in game_results]) / n_games
    avg_econ = sum([_economy_share(g) for g in game_results]) / n_games

    return 100.0 * (0.5 * win_rate + 0.25 * avg_mil + 0.25 * avg_econ)
| # ── Server communication ──────────────────────────────────────────────────── | |
def wake_hf_space(server_url: str, max_wait: int = 120) -> str:
    """Wake a sleeping HuggingFace Space and report its status.

    Polls ``server_url`` with GET requests until it answers with HTTP 200 or
    ``max_wait`` seconds have elapsed. URLs that do not contain ``.hf.space``
    are treated as local servers and are not polled at all.

    Args:
        server_url: Base URL of the environment server.
        max_wait: Maximum number of seconds to keep polling.

    Returns:
        A human-readable status message.
    """
    if ".hf.space" not in server_url:
        return "Local server, skipping wake."

    poll_interval = 5.0
    # Use the monotonic clock: wall-clock time can jump (NTP, DST, manual
    # changes) and would make the timeout fire early or never.
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            resp = httpx.get(server_url, timeout=10, follow_redirects=True)
            if resp.status_code == 200:
                return "Environment server is ready."
        except httpx.HTTPError:
            # Best effort: failed requests are expected while the Space is
            # still booting, so just retry.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))
    return "Warning: server may still be starting."
async def run_single_game(
    client: httpx.AsyncClient,
    server_url: str,
    max_steps: int = MAX_STEPS_PER_GAME,
) -> Dict[str, Any]:
    """Run one game via HTTP REST and return its metrics.

    Resets the environment with ``POST /reset``, then repeatedly sends an
    empty command list to ``POST /step`` until the observation carries a
    truthy ``result`` or ``max_steps`` steps have been taken.

    Args:
        client: Async HTTP client used for all requests.
        server_url: Base URL of the environment server; a trailing ``/`` is
            tolerated.
        max_steps: Upper bound on the number of ``/step`` calls.

    Returns:
        Metrics computed by ``compute_game_metrics`` from the last observation.
        If the step limit is reached first, that observation has no result.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        KeyError: If a response body has no ``observation`` field.
    """
    # Strip trailing slashes so "http://host/" does not become
    # "http://host//reset".
    base_url = server_url.rstrip("/")
    reset_url = f"{base_url}/reset"
    step_url = f"{base_url}/step"

    # Scripted no-op agent: the payload never changes, so build it once.
    noop_payload = {"action": {"commands": []}}

    # Reset environment
    resp = await client.post(reset_url, json={})
    resp.raise_for_status()
    obs = resp.json()["observation"]

    steps = 0
    while not obs.get("result") and steps < max_steps:
        resp = await client.post(step_url, json=noop_payload)
        resp.raise_for_status()
        obs = resp.json()["observation"]
        steps += 1

    return compute_game_metrics(obs)
async def run_evaluation(
    agent_name: str,
    opponent: str,
    num_games: int,
    server_url: str = DEFAULT_SERVER,
    on_game_done: Optional[Callable[[int, int, Dict], None]] = None,
) -> Dict[str, Any]:
    """Play ``num_games`` games one after another and aggregate the results.

    Args:
        agent_name: Display name for the leaderboard.
        opponent: AI difficulty (Beginner/Easy/Medium/Normal/Hard). It is only
            recorded in the returned row; this function does not send it to
            the server.
        num_games: Number of games to play.
        server_url: OpenRA-RL server URL.
        on_game_done: Optional callback(game_num, total, metrics) invoked
            after each finished game, with ``game_num`` starting at 1.

    Returns:
        Dict with all fields needed for results.csv.
    """
    per_game: List[Dict[str, Any]] = []

    async with httpx.AsyncClient(timeout=STEP_TIMEOUT) as http:
        for game_no in range(1, num_games + 1):
            outcome = await run_single_game(http, server_url)
            per_game.append(outcome)
            if on_game_done:
                on_game_done(game_no, num_games, outcome)

    played = len(per_game)
    # Guard against division by zero when no games were played.
    divisor = max(played, 1)
    won = sum(1 for g in per_game if g["win"])

    def _mean(key: str) -> float:
        # Average of one metric across all games played.
        return sum(g[key] for g in per_game) / divisor

    summary: Dict[str, Any] = {
        "agent_name": agent_name,
        "agent_type": "Scripted",
        "opponent": opponent,
        "games": played,
        "win_rate": round(100.0 * won / divisor, 1),
        "score": round(compute_composite_score(per_game), 1),
        "avg_kills": round(_mean("kills_cost")),
        "avg_deaths": round(_mean("deaths_cost")),
        "kd_ratio": round(_mean("kd_ratio"), 2),
        "avg_economy": round(_mean("assets_value")),
        "avg_game_length": round(_mean("ticks")),
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "replay_url": "",
    }
    return summary