Spaces:
Running
Running
Add in-browser evaluation via Evaluate tab
Browse files
Users can now run scripted agent evaluations directly from the
leaderboard UI against the HF-hosted OpenRA-RL environment.
No Docker or local setup needed.
- evaluate_runner.py: self-contained eval via HTTP REST (no
openra-rl/openenv imports, avoids websockets conflict)
- Evaluate tab: form with agent name, opponent, game count
- Results saved to CSV and JSONL (CommitScheduler on HF)
- Submit tab updated with in-browser option
- .gitignore +1 -0
- app.py +188 -9
- evaluate_runner.py +171 -0
- requirements.txt +2 -0
.gitignore
CHANGED
|
@@ -7,3 +7,4 @@ build/
|
|
| 7 |
.venv/
|
| 8 |
*.orarep
|
| 9 |
flagged/
|
|
|
|
|
|
| 7 |
.venv/
|
| 8 |
*.orarep
|
| 9 |
flagged/
|
| 10 |
+
submissions/
|
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""OpenRA-Bench: Agent Leaderboard for OpenRA-RL.
|
| 2 |
|
| 3 |
A Gradio app that displays agent rankings, supports filtering by type
|
| 4 |
-
and opponent difficulty, and
|
| 5 |
|
| 6 |
Run locally:
|
| 7 |
python app.py
|
|
@@ -10,12 +10,18 @@ Deploy on HuggingFace Spaces:
|
|
| 10 |
Push app.py, requirements.txt, data/, and README.md to your HF Space.
|
| 11 |
"""
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
import os
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
import pandas as pd
|
| 18 |
|
|
|
|
|
|
|
| 19 |
# ── Data Loading ─────────────────────────────────────────────────────────────
|
| 20 |
|
| 21 |
DATA_PATH = Path(__file__).parent / "data" / "results.csv"
|
|
@@ -121,6 +127,134 @@ def filter_leaderboard(
|
|
| 121 |
return add_type_badges(df)
|
| 122 |
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# ── UI ───────────────────────────────────────────────────────────────────────
|
| 125 |
|
| 126 |
ABOUT_MD = """
|
|
@@ -166,20 +300,19 @@ The benchmark score combines three components:
|
|
| 166 |
SUBMIT_MD = """
|
| 167 |
## How to Submit Results
|
| 168 |
|
| 169 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
```bash
|
| 172 |
git clone https://github.com/yxc20089/OpenRA-Bench.git
|
| 173 |
cd OpenRA-Bench
|
| 174 |
pip install -r requirements.txt
|
| 175 |
pip install openra-rl openra-rl-util
|
| 176 |
-
```
|
| 177 |
-
|
| 178 |
-
### 2. Run the evaluation
|
| 179 |
|
| 180 |
-
**Option A: HuggingFace-hosted (no Docker needed)**
|
| 181 |
-
|
| 182 |
-
```bash
|
| 183 |
python evaluate.py \\
|
| 184 |
--agent scripted \\
|
| 185 |
--agent-name "MyBot-v1" \\
|
|
@@ -189,7 +322,7 @@ python evaluate.py \\
|
|
| 189 |
--server https://openra-rl-openra-rl.hf.space
|
| 190 |
```
|
| 191 |
|
| 192 |
-
|
| 193 |
|
| 194 |
```bash
|
| 195 |
git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
|
|
@@ -306,6 +439,52 @@ def build_app() -> gr.Blocks:
|
|
| 306 |
outputs=leaderboard,
|
| 307 |
)
|
| 308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
# ── About Tab ─────────────────────────────────────────────────
|
| 310 |
with gr.Tab("About"):
|
| 311 |
gr.Markdown(ABOUT_MD)
|
|
|
|
| 1 |
"""OpenRA-Bench: Agent Leaderboard for OpenRA-RL.
|
| 2 |
|
| 3 |
A Gradio app that displays agent rankings, supports filtering by type
|
| 4 |
+
and opponent difficulty, and lets users run evaluations in-browser.
|
| 5 |
|
| 6 |
Run locally:
|
| 7 |
python app.py
|
|
|
|
| 10 |
Push app.py, requirements.txt, data/, and README.md to your HF Space.
|
| 11 |
"""
|
| 12 |
|
| 13 |
+
import asyncio
|
| 14 |
+
import csv
|
| 15 |
+
import json
|
| 16 |
import os
|
| 17 |
+
from datetime import datetime, timezone
|
| 18 |
from pathlib import Path
|
| 19 |
|
| 20 |
import gradio as gr
|
| 21 |
import pandas as pd
|
| 22 |
|
| 23 |
+
from evaluate_runner import DEFAULT_SERVER, run_evaluation, wake_hf_space
|
| 24 |
+
|
| 25 |
# ── Data Loading ─────────────────────────────────────────────────────────────
|
| 26 |
|
| 27 |
DATA_PATH = Path(__file__).parent / "data" / "results.csv"
|
|
|
|
| 127 |
return add_type_badges(df)
|
| 128 |
|
| 129 |
|
| 130 |
+
# ── Result Persistence ───────────────────────────────────────────────────────
|
| 131 |
+
|
| 132 |
+
# Local folder that collects submissions; created on import if missing.
SUBMISSIONS_DIR = Path(__file__).parent / "submissions"
SUBMISSIONS_DIR.mkdir(exist_ok=True)

# On HuggingFace Spaces (HF_TOKEN and SPACE_ID both set) a CommitScheduler
# periodically pushes the contents of SUBMISSIONS_DIR to the results dataset.
# Everywhere else the scheduler stays None and submissions are only kept
# on local disk.
_scheduler = None
if os.environ.get("HF_TOKEN") and os.environ.get("SPACE_ID"):
    try:
        from huggingface_hub import CommitScheduler

        _scheduler = CommitScheduler(
            repo_id="openra-rl/bench-results",
            repo_type="dataset",
            folder_path=str(SUBMISSIONS_DIR),
            every=5,  # minutes between background commits
            token=os.environ["HF_TOKEN"],
        )
    except Exception as exc:
        # Best effort: the app must still start if the scheduler cannot be
        # created (missing package, bad token, unreachable Hub). Report it
        # instead of swallowing it, otherwise results silently stop being
        # pushed to the dataset.
        print(
            "Warning: CommitScheduler unavailable; submissions will only be "
            f"stored locally ({exc!r})"
        )
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def save_submission(results: dict) -> None:
    """Append one evaluation result to the submissions JSONL and the leaderboard CSV.

    Args:
        results: Aggregate result row. Its keys must be a subset of the CSV
            columns listed below; ``csv.DictWriter`` raises ``ValueError``
            on unknown keys.

    Raises:
        OSError: If either file cannot be opened or written.
    """
    from contextlib import nullcontext

    # The CommitScheduler uploads SUBMISSIONS_DIR from a background thread.
    # Holding its lock while appending keeps a half-written line from being
    # committed. Without a scheduler (local runs) no locking is needed.
    upload_lock = _scheduler.lock if _scheduler is not None else nullcontext()

    # JSONL for CommitScheduler -> HF dataset
    jsonl_path = SUBMISSIONS_DIR / "results.jsonl"
    with upload_lock, open(jsonl_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(results) + "\n")

    # Also append to data/results.csv for the leaderboard
    csv_path = DATA_PATH
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # Only a missing or empty file needs a header row.
    file_exists = csv_path.exists() and csv_path.stat().st_size > 0
    fieldnames = [
        "agent_name", "agent_type", "opponent", "games", "win_rate",
        "score", "avg_kills", "avg_deaths", "kd_ratio", "avg_economy",
        "avg_game_length", "timestamp", "replay_url",
    ]
    # Explicit UTF-8 so non-ASCII agent names are written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerow(results)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# ── Evaluation Handler ───────────────────────────────────────────────────────
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def run_eval_sync(agent_name: str, opponent: str, num_games: int):
    """Run an evaluation and yield progress updates for the Evaluate tab.

    This is a generator. Every yielded item is a 3-tuple
    ``(log_text, results_df_or_None, summary_markdown)`` matching the three
    outputs wired to the "Run Evaluation" button.

    Args:
        agent_name: Display name entered by the user; must not be blank.
        opponent: Opponent difficulty label, passed through to
            ``run_evaluation``.
        num_games: Number of games to play; anything ``int()`` accepts
            (the slider may deliver a float). Must be at least 1.

    Yields:
        Progress tuples while running, then one final tuple carrying the
        aggregate results table and a markdown summary. On invalid input or
        an evaluation error a single error tuple is yielded and the
        generator stops.
    """
    if not agent_name or not agent_name.strip():
        yield "Error: Please enter an agent name.", None, ""
        return
    agent_name = agent_name.strip()

    # Validate the game count up front: a zero-game run would otherwise be
    # saved to the leaderboard as an all-zero row.
    try:
        num_games = int(num_games)
    except (TypeError, ValueError):
        yield "Error: Number of games must be a whole number.", None, ""
        return
    if num_games < 1:
        yield "Error: Number of games must be at least 1.", None, ""
        return

    log_lines = []

    def log(msg: str):
        """Append a line to the progress log and return the full log text."""
        log_lines.append(msg)
        return "\n".join(log_lines)

    def log_finished_games():
        """Add one log line for every game that completed."""
        for g in game_log:
            log(f"  Game {g['Game']}: {g['Result']} (K/D: {g['K/D']}, ticks: {g['Ticks']})")

    # Wake server
    yield log(f"Connecting to {DEFAULT_SERVER}..."), None, ""
    status = wake_hf_space(DEFAULT_SERVER)
    yield log(status), None, ""

    # Track per-game progress
    game_log = []

    def on_game_done(game_num, total, metrics):
        # An empty result means the game hit the step limit without ending.
        result = metrics["result"] or "timeout"
        kd = metrics["kd_ratio"]
        game_log.append({
            "Game": game_num,
            "Result": result,
            "K/D": round(kd, 1),
            "Ticks": metrics["ticks"],
        })

    yield log(f"Running {num_games} game(s) vs {opponent} AI..."), None, ""

    try:
        results = asyncio.run(
            run_evaluation(
                agent_name=agent_name,
                opponent=opponent,
                num_games=num_games,
                server_url=DEFAULT_SERVER,
                on_game_done=on_game_done,
            )
        )
    except Exception as e:
        # UI boundary: show whatever finished before the failure, then the
        # error, instead of letting the handler crash.
        log_finished_games()
        yield log(f"Error: {e}"), None, ""
        return

    log_finished_games()

    # Save results. A persistence failure must not hide the scores of a run
    # that completed, so it is reported in the log rather than raised.
    try:
        save_submission(results)
    except Exception as e:
        log(f"Warning: results could not be saved: {e}")

    log("\nEvaluation complete!")

    summary = (
        f"### Results: {agent_name}\n\n"
        "| Metric | Value |\n|--------|-------|\n"
        f"| **Score** | **{results['score']}** |\n"
        f"| Win Rate | {results['win_rate']}% |\n"
        f"| K/D Ratio | {results['kd_ratio']} |\n"
        f"| Avg Economy | {results['avg_economy']} |\n"
        f"| Games | {results['games']} vs {results['opponent']} |\n"
    )

    results_df = pd.DataFrame([{
        "Agent": results["agent_name"],
        "Type": results["agent_type"],
        "Opponent": results["opponent"],
        "Games": results["games"],
        "Win Rate (%)": results["win_rate"],
        "Score": results["score"],
        "K/D Ratio": results["kd_ratio"],
    }])

    yield "\n".join(log_lines), results_df, summary
|
| 256 |
+
|
| 257 |
+
|
| 258 |
# ── UI ───────────────────────────────────────────────────────────────────────
|
| 259 |
|
| 260 |
ABOUT_MD = """
|
|
|
|
| 300 |
SUBMIT_MD = """
|
| 301 |
## How to Submit Results
|
| 302 |
|
| 303 |
+
### Option A: In-Browser (no setup needed)
|
| 304 |
+
|
| 305 |
+
Use the **Evaluate** tab to run a scripted agent directly from your browser.
|
| 306 |
+
Results are saved to the leaderboard automatically.
|
| 307 |
+
|
| 308 |
+
### Option B: CLI with HuggingFace-hosted server (no Docker needed)
|
| 309 |
|
| 310 |
```bash
|
| 311 |
git clone https://github.com/yxc20089/OpenRA-Bench.git
|
| 312 |
cd OpenRA-Bench
|
| 313 |
pip install -r requirements.txt
|
| 314 |
pip install openra-rl openra-rl-util
|
|
|
|
|
|
|
|
|
|
| 315 |
|
|
|
|
|
|
|
|
|
|
| 316 |
python evaluate.py \\
|
| 317 |
--agent scripted \\
|
| 318 |
--agent-name "MyBot-v1" \\
|
|
|
|
| 322 |
--server https://openra-rl-openra-rl.hf.space
|
| 323 |
```
|
| 324 |
|
| 325 |
+
### Option C: Local server (Docker)
|
| 326 |
|
| 327 |
```bash
|
| 328 |
git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
|
|
|
|
| 439 |
outputs=leaderboard,
|
| 440 |
)
|
| 441 |
|
| 442 |
+
# ── Evaluate Tab ──────────────────────────────────────────────
|
| 443 |
+
with gr.Tab("Evaluate"):
|
| 444 |
+
gr.Markdown(
|
| 445 |
+
"## Run Evaluation\n\n"
|
| 446 |
+
"Run a scripted agent against the HuggingFace-hosted "
|
| 447 |
+
"OpenRA-RL environment. No Docker or local setup needed."
|
| 448 |
+
)
|
| 449 |
+
with gr.Row():
|
| 450 |
+
eval_name = gr.Textbox(
|
| 451 |
+
label="Agent Name",
|
| 452 |
+
placeholder="e.g. MyBot-v1",
|
| 453 |
+
scale=2,
|
| 454 |
+
)
|
| 455 |
+
eval_opponent = gr.Dropdown(
|
| 456 |
+
choices=["Easy", "Normal", "Hard"],
|
| 457 |
+
value="Normal",
|
| 458 |
+
label="Opponent",
|
| 459 |
+
scale=1,
|
| 460 |
+
)
|
| 461 |
+
eval_games = gr.Slider(
|
| 462 |
+
minimum=1,
|
| 463 |
+
maximum=20,
|
| 464 |
+
value=3,
|
| 465 |
+
step=1,
|
| 466 |
+
label="Games",
|
| 467 |
+
scale=1,
|
| 468 |
+
)
|
| 469 |
+
eval_btn = gr.Button("Run Evaluation", variant="primary")
|
| 470 |
+
|
| 471 |
+
eval_log = gr.Textbox(
|
| 472 |
+
label="Progress",
|
| 473 |
+
lines=10,
|
| 474 |
+
interactive=False,
|
| 475 |
+
)
|
| 476 |
+
eval_results = gr.Dataframe(
|
| 477 |
+
label="Game Results",
|
| 478 |
+
interactive=False,
|
| 479 |
+
)
|
| 480 |
+
eval_summary = gr.Markdown()
|
| 481 |
+
|
| 482 |
+
eval_btn.click(
|
| 483 |
+
fn=run_eval_sync,
|
| 484 |
+
inputs=[eval_name, eval_opponent, eval_games],
|
| 485 |
+
outputs=[eval_log, eval_results, eval_summary],
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
# ── About Tab ─────────────────────────────────────────────────
|
| 489 |
with gr.Tab("About"):
|
| 490 |
gr.Markdown(ABOUT_MD)
|
evaluate_runner.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""In-browser evaluation runner for OpenRA-Bench.
|
| 2 |
+
|
| 3 |
+
Runs games against the OpenRA-RL server via HTTP REST API (POST /reset, /step).
|
| 4 |
+
No openra-rl/openenv imports — avoids websockets version conflicts with Gradio.
|
| 5 |
+
|
| 6 |
+
Scoring logic inlined from openra_rl_util.rubrics (which has zero dependencies).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import time
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from typing import Any, Callable, Dict, List, Optional
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
|
| 15 |
+
# HuggingFace-hosted OpenRA-RL environment
DEFAULT_SERVER = "https://openra-rl-openra-rl.hf.space"
# Default for run_single_game's max_steps: the most /step requests sent per game.
MAX_STEPS_PER_GAME = 5000
# Timeout handed to httpx.AsyncClient in run_evaluation.
STEP_TIMEOUT = 60.0
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ── Scoring (inlined from openra_rl_util/rubrics.py) ─────────────────────────
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def compute_game_metrics(obs: Dict[str, Any]) -> Dict[str, Any]:
    """Extract benchmark metrics from a final game observation dict.

    Missing keys and explicit ``None`` values (JSON ``null``) are treated
    alike: numeric fields fall back to 0 and ``result`` to an empty string.

    Args:
        obs: Observation dict. Reads ``military`` (``kills_cost``,
            ``deaths_cost``, ``assets_value``), ``economy`` (``cash``),
            ``result`` and ``tick``.

    Returns:
        Dict with keys ``result``, ``win``, ``ticks``, ``kills_cost``,
        ``deaths_cost``, ``kd_ratio``, ``assets_value`` and ``cash``.
    """
    military = obs.get("military") or {}
    economy = obs.get("economy") or {}

    # `or 0` rather than a .get() default: a key that is present but null
    # would otherwise reach max() or the division below as None.
    kills = military.get("kills_cost") or 0
    deaths = military.get("deaths_cost") or 0
    assets = military.get("assets_value") or 0
    cash = economy.get("cash") or 0
    result = obs.get("result") or ""
    tick = obs.get("tick") or 0

    return {
        "result": result,
        "win": result == "win",
        "ticks": tick,
        "kills_cost": kills,
        "deaths_cost": deaths,
        # Clamp the denominator so a game with no losses does not divide by zero.
        "kd_ratio": kills / max(deaths, 1),
        "assets_value": assets,
        "cash": cash,
    }
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def compute_composite_score(game_results: List[Dict[str, Any]]) -> float:
    """Return the OpenRA-Bench composite score on a 0-100 scale.

    The score is weighted 50% win rate, 25% military and 25% economy, each
    averaged over all games. An empty list scores 0.0.

    Args:
        game_results: Per-game metric dicts; reads ``win``, ``kills_cost``,
            ``deaths_cost`` and ``assets_value`` from each.
    """
    if not game_results:
        return 0.0
    n_games = len(game_results)

    def military_share(game: Dict[str, Any]) -> float:
        # Fraction of the combined kill/death cost that was kills;
        # neutral 0.5 when nothing was killed or lost.
        destroyed, lost = game["kills_cost"], game["deaths_cost"]
        combined = destroyed + lost
        if combined > 0:
            return destroyed / combined
        return 0.5

    def economy_share(game: Dict[str, Any]) -> float:
        # Saturating curve: 10000 in assets maps to 0.5; negative assets score 0.
        value = game["assets_value"]
        if value >= 0:
            return value / (value + 10000)
        return 0.0

    games_won = len([game for game in game_results if game["win"]])
    win_rate = games_won / n_games
    avg_mil = sum(military_share(game) for game in game_results) / n_games
    avg_econ = sum(economy_share(game) for game in game_results) / n_games

    return 100.0 * (0.5 * win_rate + 0.25 * avg_mil + 0.25 * avg_econ)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ── Server communication ─────────────────────────────────────────────────────
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def wake_hf_space(server_url: str, max_wait: int = 120) -> str:
    """Poll a HuggingFace Space until it answers, and return a status message.

    URLs that do not contain ``.hf.space`` are treated as local servers and
    are not polled.

    Args:
        server_url: Base URL of the environment server.
        max_wait: Maximum number of seconds to keep polling.

    Returns:
        A human-readable status line: ready, skipped (local server), or a
        warning that the server may still be starting.
    """
    if ".hf.space" not in server_url:
        return "Local server, skipping wake."

    # Monotonic clock: the wait budget must not stretch or shrink when the
    # wall clock is adjusted while we are polling.
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        try:
            resp = httpx.get(server_url, timeout=10, follow_redirects=True)
            if resp.status_code == 200:
                return "Environment server is ready."
        except httpx.HTTPError:
            # Connection errors and timeouts are expected while the Space boots.
            pass
        # Pause between polls, but never sleep past the deadline.
        time.sleep(max(0.0, min(5.0, deadline - time.monotonic())))
    return "Warning: server may still be starting."
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def run_single_game(
    client: httpx.AsyncClient,
    server_url: str,
    max_steps: int = MAX_STEPS_PER_GAME,
) -> Dict[str, Any]:
    """Play one game over the HTTP REST API and return its metrics.

    Resets the environment with ``POST /reset``, then sends empty-command
    actions to ``POST /step`` until the observation carries a truthy
    ``result`` or ``max_steps`` steps have been sent.

    Args:
        client: Async HTTP client used for every request.
        server_url: Base URL of the environment server.
        max_steps: Upper bound on the number of ``/step`` requests.

    Returns:
        The dict produced by ``compute_game_metrics`` for the last
        observation received.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    reply = await client.post(f"{server_url}/reset", json={})
    reply.raise_for_status()
    observation = reply.json()["observation"]

    step_url = f"{server_url}/step"
    for _ in range(max_steps):
        if observation.get("result"):
            # The game has ended; no further steps are needed.
            break
        # Scripted no-op agent: send empty commands
        reply = await client.post(step_url, json={"action": {"commands": []}})
        reply.raise_for_status()
        observation = reply.json()["observation"]

    return compute_game_metrics(observation)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
async def run_evaluation(
    agent_name: str,
    opponent: str,
    num_games: int,
    server_url: str = DEFAULT_SERVER,
    on_game_done: Optional[Callable[[int, int, Dict], None]] = None,
) -> Dict[str, Any]:
    """Play ``num_games`` games in sequence and aggregate their metrics.

    Args:
        agent_name: Display name for the leaderboard.
        opponent: AI difficulty label (Easy/Normal/Hard), copied into the
            returned row.
        num_games: Number of games to play.
        server_url: OpenRA-RL server URL.
        on_game_done: Optional callback(game_num, total, metrics) invoked
            after each game; game_num starts at 1.

    Returns:
        Dict with all fields needed for results.csv.
    """
    # NOTE(review): `opponent` is only recorded in the result row; it is not
    # passed to run_single_game, so the server presumably plays its default
    # difficulty. Confirm against the server's /reset API.
    game_results: List[Dict[str, Any]] = []

    async with httpx.AsyncClient(timeout=STEP_TIMEOUT) as client:
        for game_number in range(1, num_games + 1):
            metrics = await run_single_game(client, server_url)
            game_results.append(metrics)
            if on_game_done:
                on_game_done(game_number, num_games, metrics)

    total = len(game_results)
    # Guard the averages against division by zero when no games were played.
    divisor = max(total, 1)

    def mean_of(key: str) -> float:
        """Average one per-game metric over all games played."""
        return sum(game[key] for game in game_results) / divisor

    wins = sum(1 for game in game_results if game["win"])

    return {
        "agent_name": agent_name,
        "agent_type": "Scripted",
        "opponent": opponent,
        "games": total,
        "win_rate": round(100.0 * wins / divisor, 1),
        "score": round(compute_composite_score(game_results), 1),
        "avg_kills": round(mean_of("kills_cost")),
        "avg_deaths": round(mean_of("deaths_cost")),
        "kd_ratio": round(mean_of("kd_ratio"), 2),
        "avg_economy": round(mean_of("assets_value")),
        "avg_game_length": round(mean_of("ticks")),
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "replay_url": "",
    }
|
requirements.txt
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
gradio>=4.44.0
|
| 2 |
pandas>=2.0.0
|
|
|
|
|
|
|
|
|
| 1 |
gradio>=4.44.0
|
| 2 |
pandas>=2.0.0
|
| 3 |
+
httpx>=0.24.0
|
| 4 |
+
huggingface_hub>=0.20.0
|