File size: 5,737 Bytes
824262a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45ef63c
824262a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""In-browser evaluation runner for OpenRA-Bench.

Runs games against the OpenRA-RL server via HTTP REST API (POST /reset, /step).
No openra-rl/openenv imports — avoids websockets version conflicts with Gradio.

Scoring logic inlined from openra_rl_util.rubrics (which has zero dependencies).
"""

import time
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional

import httpx

# HuggingFace-hosted OpenRA-RL environment; default ``server_url`` of run_evaluation().
DEFAULT_SERVER = "https://openra-rl-openra-rl.hf.space"
# Default cap on /step requests per game (``max_steps`` of run_single_game()).
MAX_STEPS_PER_GAME = 5000
# Timeout handed to httpx.AsyncClient in run_evaluation(); HTTPX timeouts are in seconds.
STEP_TIMEOUT = 60.0


# ── Scoring (inlined from openra_rl_util/rubrics.py) ────────────────────────


def compute_game_metrics(obs: Dict[str, Any]) -> Dict[str, Any]:
    """Extract benchmark metrics from a final game observation dict.

    Missing sections, missing fields, and numeric fields explicitly set to
    ``None`` (e.g. JSON ``null``) are all treated as zero, so a sparse
    observation never makes the K/D division fail.

    Args:
        obs: Observation dict. The optional keys ``military``, ``economy``,
            ``result`` and ``tick`` are read; everything else is ignored.

    Returns:
        Dict with the keys ``result``, ``win``, ``ticks``, ``kills_cost``,
        ``deaths_cost``, ``kd_ratio``, ``assets_value`` and ``cash``.
    """

    def _number(section: Dict[str, Any], key: str) -> Any:
        # A key that is present but null must count as 0; otherwise the
        # kd_ratio division below raises TypeError on max(None, 1).
        value = section.get(key)
        return 0 if value is None else value

    # ``or {}`` also covers a section that is present but null/empty.
    military = obs.get("military") or {}
    economy = obs.get("economy") or {}

    kills = _number(military, "kills_cost")
    deaths = _number(military, "deaths_cost")
    assets = _number(military, "assets_value")
    cash = _number(economy, "cash")
    result = obs.get("result", "")
    tick = _number(obs, "tick")

    return {
        "result": result,
        "win": result == "win",
        "ticks": tick,
        "kills_cost": kills,
        "deaths_cost": deaths,
        # Clamp the denominator so a game without losses does not divide by zero.
        "kd_ratio": kills / max(deaths, 1),
        "assets_value": assets,
        "cash": cash,
    }


def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
    """Return the OpenRA-Bench composite score on a 0-100 scale.

    The score is weighted 50% win rate, 25% military efficiency and
    25% economy, each averaged over all games.

    Args:
        game_results: Per-game metric dicts; the keys ``win``, ``kills_cost``,
            ``deaths_cost`` and ``assets_value`` are read from each entry.

    Returns:
        The weighted score, or ``0.0`` when there are no games.
    """
    game_count = len(game_results)
    if not game_count:
        return 0.0

    def military_share(game):
        # Fraction of the total combat cost that was inflicted on the enemy;
        # a game with no combat at all is scored as neutral (0.5).
        destroyed, lost = game["kills_cost"], game["deaths_cost"]
        combined = destroyed + lost
        if combined > 0:
            return destroyed / combined
        return 0.5

    def economy_share(game):
        # Saturating curve: 10000 in assets maps to 0.5; negative values score 0.
        value = game["assets_value"]
        if value >= 0:
            return value / (value + 10000)
        return 0.0

    victories = len([game for game in game_results if game["win"]])
    win_rate = victories / game_count
    avg_mil = sum(map(military_share, game_results)) / game_count
    avg_econ = sum(map(economy_share, game_results)) / game_count

    return 100.0 * (0.5 * win_rate + 0.25 * avg_mil + 0.25 * avg_econ)


# ── Server communication ────────────────────────────────────────────────────


def wake_hf_space(server_url: str, max_wait: int = 120) -> str:
    """Wake a sleeping HuggingFace Space and return a status message.

    Polls ``server_url`` with GET requests until it answers with HTTP 200 or
    ``max_wait`` seconds have elapsed. URLs that do not contain ``.hf.space``
    are treated as local servers and are not polled at all.

    Args:
        server_url: Base URL of the environment server.
        max_wait: Maximum time to keep polling, in seconds.

    Returns:
        A human-readable status string; this function never raises for an
        unreachable server, it reports a warning instead.
    """
    if ".hf.space" not in server_url:
        return "Local server, skipping wake."

    # Monotonic clock: elapsed-time measurement must not be affected by
    # wall-clock adjustments (NTP sync, manual changes).
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            resp = httpx.get(server_url, timeout=10, follow_redirects=True)
        except httpx.HTTPError:
            # Deliberate best effort: a Space that is still booting refuses
            # or times out connections, so just retry.
            pass
        else:
            if resp.status_code == 200:
                return "Environment server is ready."

        # Back off for up to 5 seconds, but never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(5, remaining))
    return "Warning: server may still be starting."


async def run_single_game(
    client: httpx.AsyncClient,
    server_url: str,
    max_steps: int = MAX_STEPS_PER_GAME,
) -> Dict[str, Any]:
    """Play one game over the HTTP REST API and return its metrics.

    The environment is reset with ``POST /reset`` and then advanced with
    ``POST /step`` until the observation carries a truthy ``result`` or
    ``max_steps`` steps have been sent.

    Args:
        client: Async HTTP client used for every request.
        server_url: Base URL of the environment server.
        max_steps: Upper bound on the number of ``/step`` requests.

    Returns:
        The output of ``compute_game_metrics`` for the last observation.

    Raises:
        Whatever ``raise_for_status()`` raises for a non-success response,
        and ``KeyError`` if a response body has no ``"observation"`` key.
    """

    async def fetch_observation(endpoint: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        # One round trip: POST, fail on HTTP errors, unwrap the observation.
        reply = await client.post(f"{server_url}/{endpoint}", json=payload)
        reply.raise_for_status()
        return reply.json()["observation"]

    # Start a fresh episode.
    observation = await fetch_observation("reset", {})

    steps_taken = 0
    while True:
        game_over = bool(observation.get("result"))
        if game_over or steps_taken >= max_steps:
            break
        # Scripted no-op agent: every step submits an empty command list.
        observation = await fetch_observation("step", {"action": {"commands": []}})
        steps_taken += 1

    return compute_game_metrics(observation)


async def run_evaluation(
    agent_name: str,
    opponent: str,
    num_games: int,
    server_url: str = DEFAULT_SERVER,
    on_game_done: Optional[Callable[[int, int, Dict], None]] = None,
) -> Dict[str, Any]:
    """Play ``num_games`` games sequentially and aggregate the results.

    Args:
        agent_name: Display name for the leaderboard.
        opponent: AI difficulty (Beginner/Easy/Medium/Normal/Hard).
        num_games: Number of games to play.
        server_url: OpenRA-RL server URL.
        on_game_done: Optional callback(game_num, total, metrics) invoked
            after each game; ``game_num`` is 1-based.

    Returns:
        Dict with all fields needed for results.csv.
    """
    per_game: List[Dict[str, Any]] = []

    # A single client is shared by all games of this run.
    async with httpx.AsyncClient(timeout=STEP_TIMEOUT) as http:
        for game_number in range(1, num_games + 1):
            outcome = await run_single_game(http, server_url)
            per_game.append(outcome)
            if on_game_done:
                on_game_done(game_number, num_games, outcome)

    played = len(per_game)
    # Clamp so that a run with zero games yields zeros instead of dividing by zero.
    divisor = max(played, 1)
    won = sum(1 for game in per_game if game["win"])

    def mean_of(field: str) -> float:
        # Average of one metric over all played games.
        return sum(game[field] for game in per_game) / divisor

    # Key order is kept stable because the dict feeds a CSV row.
    return {
        "agent_name": agent_name,
        "agent_type": "Scripted",
        "opponent": opponent,
        "games": played,
        "win_rate": round(100.0 * won / divisor, 1),
        "score": round(compute_composite_score(per_game), 1),
        "avg_kills": round(mean_of("kills_cost")),
        "avg_deaths": round(mean_of("deaths_cost")),
        "kd_ratio": round(mean_of("kd_ratio"), 2),
        "avg_economy": round(mean_of("assets_value")),
        "avg_game_length": round(mean_of("ticks")),
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "replay_url": "",
    }