Spaces:
Running
Running
Add upload form, API endpoint, 5 difficulty tiers, real game data
Browse files
- Add validate_submission + handle_api_submit for CLI/form uploads
- Add Gradio upload form in Submit tab (JSON + optional replay)
- Update opponent tiers: Beginner/Easy/Medium/Normal/Hard (was 3)
- Replace dummy CSV data with real game result
- Update schema.md with 5-tier opponent values
- 37 tests pass (12 new: validation, API submit, tier acceptance)
- app.py +185 -37
- data/results.csv +1 -5
- data/schema.md +2 -2
- evaluate.py +1 -1
- evaluate_runner.py +1 -1
- tests/test_app.py +103 -6
- tests/test_evaluate.py +14 -0
app.py
CHANGED
|
@@ -19,7 +19,7 @@ from pathlib import Path
|
|
| 19 |
import gradio as gr
|
| 20 |
import pandas as pd
|
| 21 |
|
| 22 |
-
from evaluate_runner import DEFAULT_SERVER
|
| 23 |
|
| 24 |
# ── Data Loading ──────────────────────────────────────────────────────────────
|
| 25 |
|
|
@@ -170,6 +170,127 @@ def save_submission(results: dict) -> None:
|
|
| 170 |
writer.writerow(results)
|
| 171 |
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
# ── UI ────────────────────────────────────────────────────────────────────────
|
| 174 |
|
| 175 |
ABOUT_MD = """
|
|
@@ -183,7 +304,7 @@ ABOUT_MD = """
|
|
| 183 |
|
| 184 |
- **Game**: Red Alert (OpenRA engine)
|
| 185 |
- **Format**: 1v1 agent vs built-in AI
|
| 186 |
-
- **Opponents**: Easy, Normal, Hard difficulty
|
| 187 |
- **Games per entry**: Minimum 10 games per configuration
|
| 188 |
- **Metrics**: Win rate, composite score, K/D ratio, economy
|
| 189 |
|
|
@@ -213,37 +334,36 @@ The benchmark score combines three components:
|
|
| 213 |
"""
|
| 214 |
|
| 215 |
SUBMIT_MD = """
|
| 216 |
-
|
| 217 |
|
| 218 |
-
##
|
| 219 |
|
| 220 |
-
|
| 221 |
-
and click **Try** to watch a pre-configured LLM agent play Red Alert
|
| 222 |
-
directly in your browser. No API keys or setup required.
|
| 223 |
|
| 224 |
-
|
|
|
|
| 225 |
|
| 226 |
-
```
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
--server https://openra-rl-openra-rl.hf.space
|
| 239 |
```
|
| 240 |
|
| 241 |
-
###
|
| 242 |
|
| 243 |
```bash
|
| 244 |
-
git clone
|
| 245 |
-
cd OpenRA-
|
| 246 |
-
|
|
|
|
| 247 |
|
| 248 |
python evaluate.py \\
|
| 249 |
--agent scripted \\
|
|
@@ -254,15 +374,6 @@ python evaluate.py \\
|
|
| 254 |
--server http://localhost:8000
|
| 255 |
```
|
| 256 |
|
| 257 |
-
### 3. Submit via Pull Request
|
| 258 |
-
|
| 259 |
-
1. Fork [OpenRA-Bench](https://github.com/yxc20089/OpenRA-Bench)
|
| 260 |
-
2. Run the evaluation (results append to `data/results.csv`)
|
| 261 |
-
3. Commit and open a PR with:
|
| 262 |
-
- Your updated `data/results.csv`
|
| 263 |
-
- A description of your agent
|
| 264 |
-
- (Optional) Replay files in `replays/`
|
| 265 |
-
|
| 266 |
### Evaluation Parameters
|
| 267 |
|
| 268 |
| Parameter | Description |
|
|
@@ -270,19 +381,19 @@ python evaluate.py \\
|
|
| 270 |
| `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
|
| 271 |
| `--agent-name` | Display name on the leaderboard |
|
| 272 |
| `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
|
| 273 |
-
| `--opponent` | AI difficulty: `Easy`, `Normal`, `Hard` |
|
| 274 |
| `--games` | Number of games (minimum 10) |
|
| 275 |
| `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
|
| 276 |
|
| 277 |
### Custom Agents
|
| 278 |
|
| 279 |
-
|
| 280 |
|
| 281 |
```python
|
| 282 |
from openra_env.client import OpenRAEnv
|
| 283 |
from openra_env.models import OpenRAAction
|
| 284 |
|
| 285 |
-
async with OpenRAEnv("
|
| 286 |
obs = await env.reset()
|
| 287 |
while not obs.done:
|
| 288 |
action = your_agent.decide(obs)
|
|
@@ -320,7 +431,7 @@ def build_app() -> gr.Blocks:
|
|
| 320 |
scale=2,
|
| 321 |
)
|
| 322 |
opponent_filter = gr.Dropdown(
|
| 323 |
-
choices=["All", "Easy", "Normal", "Hard"],
|
| 324 |
value="All",
|
| 325 |
label="Opponent",
|
| 326 |
scale=1,
|
|
@@ -361,6 +472,43 @@ def build_app() -> gr.Blocks:
|
|
| 361 |
|
| 362 |
# ── Submit Tab ────────────────────────────────────────────────
|
| 363 |
with gr.Tab("Submit"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
gr.Markdown(SUBMIT_MD)
|
| 365 |
|
| 366 |
return app
|
|
|
|
| 19 |
import gradio as gr
|
| 20 |
import pandas as pd
|
| 21 |
|
| 22 |
+
from evaluate_runner import DEFAULT_SERVER, compute_composite_score, compute_game_metrics
|
| 23 |
|
| 24 |
# ── Data Loading ──────────────────────────────────────────────────────────────
|
| 25 |
|
|
|
|
| 170 |
writer.writerow(results)
|
| 171 |
|
| 172 |
|
| 173 |
+
# ── Submission Handling ───────────────────────────────────────────────────────
|
| 174 |
+
|
| 175 |
+
# Opponent difficulty tiers that a submission may declare.
VALID_OPPONENTS = {
    "Beginner",
    "Easy",
    "Medium",
    "Normal",
    "Hard",
}

# Agent categories that a submission may declare.
VALID_AGENT_TYPES = {
    "Scripted",
    "LLM",
    "RL",
}

# Keys that validate_submission requires in every uploaded payload.
REQUIRED_FIELDS = [
    "agent_name",
    "agent_type",
    "opponent",
    "result",
    "ticks",
    "kills_cost",
    "deaths_cost",
    "assets_value",
]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def validate_submission(data: dict) -> tuple[bool, str]:
    """Validate an uploaded JSON submission.

    Checks, in order: the payload is a JSON object, every name in
    ``REQUIRED_FIELDS`` is present, ``agent_type`` and ``opponent`` hold one
    of the accepted values, and the per-game statistics are numbers.

    Args:
        data: Decoded JSON payload. Anything that is not a ``dict`` is
            rejected rather than raising.

    Returns:
        ``(is_valid, error_message)``; the message is empty on success.
    """
    # json.load/json.loads may yield a list, number, string or None. Reject
    # those up front instead of failing with a TypeError on the lookups below.
    if not isinstance(data, dict):
        return False, (
            "Submission must be a JSON object, "
            f"got {type(data).__name__}"
        )

    for field in REQUIRED_FIELDS:
        if field not in data:
            return False, f"Missing required field: {field}"

    # The isinstance guards keep unhashable values (lists, dicts) from raising
    # during the set-membership tests.
    agent_type = data["agent_type"]
    if not isinstance(agent_type, str) or agent_type not in VALID_AGENT_TYPES:
        return False, (
            f"Invalid agent_type: {agent_type}. "
            f"Must be one of: {', '.join(sorted(VALID_AGENT_TYPES))}"
        )

    opponent = data["opponent"]
    if not isinstance(opponent, str) or opponent not in VALID_OPPONENTS:
        return False, (
            f"Invalid opponent: {opponent}. "
            f"Must be one of: {', '.join(sorted(VALID_OPPONENTS))}"
        )

    # These values are used in arithmetic when the leaderboard row is built,
    # so a string or null here would crash scoring after validation passed.
    for field in ("ticks", "kills_cost", "deaths_cost", "assets_value"):
        value = data[field]
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return False, (
                f"Invalid {field}: expected a number, "
                f"got {type(value).__name__}"
            )

    return True, ""
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def _score_from_submission(data: dict) -> dict:
    """Build a CSV-ready results dict from a validated submission.

    Args:
        data: Submission payload that has already passed
            ``validate_submission``.

    Returns:
        A dict with the leaderboard columns ``agent_name``, ``agent_type``,
        ``opponent``, ``games``, ``win_rate``, ``score``, ``avg_kills``,
        ``avg_deaths``, ``kd_ratio``, ``avg_economy``, ``avg_game_length``,
        ``timestamp`` and ``replay_url`` (left empty here).
    """
    # Resolve the win flag once. An explicit "win" key takes precedence,
    # otherwise it is derived from result == "win". The same value feeds both
    # the composite score and the win rate so the two cannot disagree.
    won = data.get("win", data.get("result") == "win")

    kills = data.get("kills_cost", 0)
    deaths = data.get("deaths_cost", 0)

    # "games" is optional and not covered by validation; fall back to a single
    # game when it is missing, non-integer or below one.
    games = data.get("games", 1)
    if isinstance(games, bool) or not isinstance(games, int) or games < 1:
        games = 1

    game_result = {
        "result": data.get("result", ""),
        "win": won,
        "ticks": data.get("ticks", 0),
        "kills_cost": kills,
        "deaths_cost": deaths,
        "kd_ratio": data.get("kd_ratio", 0),
        "assets_value": data.get("assets_value", 0),
        "cash": data.get("cash", 0),
    }
    score = compute_composite_score([game_result])

    # Keep only the leading YYYY-MM-DD part. str() guards against a
    # non-string timestamp, and "or" covers an explicit null.
    timestamp = data.get("timestamp") or datetime.now(timezone.utc).strftime("%Y-%m-%d")

    return {
        "agent_name": data["agent_name"],
        "agent_type": data["agent_type"],
        "opponent": data["opponent"],
        "games": games,
        "win_rate": round(100.0 * (1 if won else 0) / games, 1),
        "score": round(score, 1),
        "avg_kills": kills,
        "avg_deaths": deaths,
        # max(..., 1) avoids division by zero when nothing was lost.
        "kd_ratio": round(kills / max(deaths, 1), 2),
        "avg_economy": data.get("assets_value", 0),
        "avg_game_length": data.get("ticks", 0),
        "timestamp": str(timestamp)[:10],
        "replay_url": "",
    }
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def handle_upload(json_file, replay_file) -> tuple[str, pd.DataFrame]:
    """Process an uploaded bench submission JSON + optional replay.

    Args:
        json_file: Upload from the "Bench export JSON" file input, or ``None``
            when nothing was selected.
        replay_file: Optional upload from the replay file input, or ``None``.

    Returns:
        ``(status_markdown, leaderboard_dataframe)``. The dataframe is always
        the current leaderboard, reloaded after a successful save.
    """
    if json_file is None:
        return "Please upload a JSON file.", add_type_badges(load_data())

    # NOTE(review): depending on the Gradio version/config the upload may be
    # an object exposing ``.name`` or a plain path string; accept both.
    json_path = getattr(json_file, "name", json_file)
    try:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return f"Invalid JSON: {e}", add_type_badges(load_data())

    is_valid, error = validate_submission(data)
    if not is_valid:
        return f"Validation error: {error}", add_type_badges(load_data())

    results_row = _score_from_submission(data)

    # Save replay if provided
    if replay_file is not None:
        import shutil

        replay_path = Path(getattr(replay_file, "name", replay_file))
        # Path(...).name strips any directory part of the uploaded path.
        replay_name = replay_path.name
        try:
            SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
            shutil.copy2(replay_path, SUBMISSIONS_DIR / replay_name)
        except OSError as e:
            # Report the failure instead of crashing the handler; nothing has
            # been written to the leaderboard yet.
            return f"Could not save replay: {e}", add_type_badges(load_data())
        results_row["replay_url"] = replay_name

    save_submission(results_row)

    return (
        f"Submitted! **{data['agent_name']}** ({data['agent_type']}) "
        f"vs {data['opponent']}: score **{results_row['score']}**",
        add_type_badges(load_data()),
    )
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def handle_api_submit(json_data: str) -> str:
    """API endpoint: accept JSON string submission. Used by CLI auto-upload.

    Args:
        json_data: The submission encoded as a JSON string.

    Returns:
        ``"OK: ..."`` with the computed score on success, otherwise a message
        starting with ``"Invalid JSON:"`` or ``"Validation error:"``.
    """
    try:
        data = json.loads(json_data)
    except (TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a payload
        # that is not str/bytes (e.g. None).
        return f"Invalid JSON: {e}"

    is_valid, error = validate_submission(data)
    if not is_valid:
        return f"Validation error: {error}"

    results_row = _score_from_submission(data)
    save_submission(results_row)

    return (
        f"OK: {data['agent_name']} ({data['agent_type']}) "
        f"vs {data['opponent']}: score {results_row['score']}"
    )
|
| 292 |
+
|
| 293 |
+
|
| 294 |
# ── UI ────────────────────────────────────────────────────────────────────────
|
| 295 |
|
| 296 |
ABOUT_MD = """
|
|
|
|
| 304 |
|
| 305 |
- **Game**: Red Alert (OpenRA engine)
|
| 306 |
- **Format**: 1v1 agent vs built-in AI
|
| 307 |
+
- **Opponents**: Beginner, Easy, Medium, Normal, Hard difficulty
|
| 308 |
- **Games per entry**: Minimum 10 games per configuration
|
| 309 |
- **Metrics**: Win rate, composite score, K/D ratio, economy
|
| 310 |
|
|
|
|
| 334 |
"""
|
| 335 |
|
| 336 |
SUBMIT_MD = """
|
| 337 |
+
---
|
| 338 |
|
| 339 |
+
## Other Submission Methods
|
| 340 |
|
| 341 |
+
### CLI Auto-Upload
|
|
|
|
|
|
|
| 342 |
|
| 343 |
+
Set `BENCH_URL` in your OpenRA-RL config and results upload automatically
|
| 344 |
+
after each game:
|
| 345 |
|
| 346 |
+
```yaml
|
| 347 |
+
# config.yaml
|
| 348 |
+
agent:
|
| 349 |
+
bench_url: "https://openra-rl-openra-bench.hf.space"
|
| 350 |
+
```
|
| 351 |
|
| 352 |
+
### CLI Manual Upload
|
| 353 |
+
|
| 354 |
+
Upload a previously exported bench JSON:
|
| 355 |
+
|
| 356 |
+
```bash
|
| 357 |
+
python -m openra_env.bench_submit ~/.openra-rl/bench-exports/bench-*.json
|
|
|
|
| 358 |
```
|
| 359 |
|
| 360 |
+
### Batch Evaluation (10+ games)
|
| 361 |
|
| 362 |
```bash
|
| 363 |
+
git clone https://github.com/yxc20089/OpenRA-Bench.git
|
| 364 |
+
cd OpenRA-Bench
|
| 365 |
+
pip install -r requirements.txt
|
| 366 |
+
pip install openra-rl openra-rl-util
|
| 367 |
|
| 368 |
python evaluate.py \\
|
| 369 |
--agent scripted \\
|
|
|
|
| 374 |
--server http://localhost:8000
|
| 375 |
```
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
### Evaluation Parameters
|
| 378 |
|
| 379 |
| Parameter | Description |
|
|
|
|
| 381 |
| `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
|
| 382 |
| `--agent-name` | Display name on the leaderboard |
|
| 383 |
| `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
|
| 384 |
+
| `--opponent` | AI difficulty: `Beginner`, `Easy`, `Medium`, `Normal`, `Hard` |
|
| 385 |
| `--games` | Number of games (minimum 10) |
|
| 386 |
| `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
|
| 387 |
|
| 388 |
### Custom Agents
|
| 389 |
|
| 390 |
+
Implement the standard `reset/step` loop:
|
| 391 |
|
| 392 |
```python
|
| 393 |
from openra_env.client import OpenRAEnv
|
| 394 |
from openra_env.models import OpenRAAction
|
| 395 |
|
| 396 |
+
async with OpenRAEnv("http://localhost:8000") as env:
|
| 397 |
obs = await env.reset()
|
| 398 |
while not obs.done:
|
| 399 |
action = your_agent.decide(obs)
|
|
|
|
| 431 |
scale=2,
|
| 432 |
)
|
| 433 |
opponent_filter = gr.Dropdown(
|
| 434 |
+
choices=["All", "Beginner", "Easy", "Medium", "Normal", "Hard"],
|
| 435 |
value="All",
|
| 436 |
label="Opponent",
|
| 437 |
scale=1,
|
|
|
|
| 472 |
|
| 473 |
# ── Submit Tab ────────────────────────────────────────────────
|
| 474 |
with gr.Tab("Submit"):
|
| 475 |
+
gr.Markdown(
|
| 476 |
+
"## Upload Results\n\n"
|
| 477 |
+
"Upload a bench export JSON from your OpenRA-RL game. "
|
| 478 |
+
"After each game, the agent saves a JSON file to "
|
| 479 |
+
"`~/.openra-rl/bench-exports/`."
|
| 480 |
+
)
|
| 481 |
+
with gr.Row():
|
| 482 |
+
json_upload = gr.File(
|
| 483 |
+
label="Bench export JSON",
|
| 484 |
+
file_types=[".json"],
|
| 485 |
+
scale=3,
|
| 486 |
+
)
|
| 487 |
+
replay_upload = gr.File(
|
| 488 |
+
label="Replay file (optional)",
|
| 489 |
+
file_types=[".orarep"],
|
| 490 |
+
scale=2,
|
| 491 |
+
)
|
| 492 |
+
submit_btn = gr.Button("Submit Results", variant="primary")
|
| 493 |
+
submit_output = gr.Markdown()
|
| 494 |
+
|
| 495 |
+
submit_btn.click(
|
| 496 |
+
fn=handle_upload,
|
| 497 |
+
inputs=[json_upload, replay_upload],
|
| 498 |
+
outputs=[submit_output, leaderboard],
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
# API endpoint for CLI auto-upload
|
| 502 |
+
api_json_input = gr.Textbox(visible=False)
|
| 503 |
+
api_result = gr.Textbox(visible=False)
|
| 504 |
+
api_btn = gr.Button(visible=False)
|
| 505 |
+
api_btn.click(
|
| 506 |
+
fn=handle_api_submit,
|
| 507 |
+
inputs=[api_json_input],
|
| 508 |
+
outputs=[api_result],
|
| 509 |
+
api_name="submit",
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
gr.Markdown(SUBMIT_MD)
|
| 513 |
|
| 514 |
return app
|
data/results.csv
CHANGED
|
@@ -1,6 +1,2 @@
|
|
| 1 |
agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
|
| 2 |
-
|
| 3 |
-
ScriptedBot-v1,Scripted,Normal,10,60.0,52.3,6200,4800,1.29,8200,2400,2026-02-19,
|
| 4 |
-
ScriptedBot-v1,Scripted,Hard,10,20.0,28.1,3100,7200,0.43,4500,1600,2026-02-19,
|
| 5 |
-
LLM-Agent-v1,LLM,Easy,10,80.0,65.8,7200,3400,2.12,11000,2200,2026-02-19,
|
| 6 |
-
LLM-Agent-v1,LLM,Normal,10,50.0,48.7,5800,5200,1.12,7800,2800,2026-02-19,
|
|
|
|
| 1 |
agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
|
| 2 |
+
qwen/qwen3-coder-next,LLM,Beginner,1,0.0,18.3,1000,2900,0.34,9050,27349,2026-02-25,
|
|
|
|
|
|
|
|
|
|
|
|
data/schema.md
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
|--------|------|-------------|
|
| 5 |
| `agent_name` | str | Agent identifier displayed on leaderboard |
|
| 6 |
| `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
|
| 7 |
-
| `opponent` | str | AI difficulty: "Easy", "Normal", or "Hard" |
|
| 8 |
-
| `games` | int | Number of games played
|
| 9 |
| `win_rate` | float | Win percentage (0.0 - 100.0) |
|
| 10 |
| `score` | float | Composite benchmark score (0.0 - 100.0) |
|
| 11 |
| `avg_kills` | float | Average enemy cost destroyed per game |
|
|
|
|
| 4 |
|--------|------|-------------|
|
| 5 |
| `agent_name` | str | Agent identifier displayed on leaderboard |
|
| 6 |
| `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
|
| 7 |
+
| `opponent` | str | AI difficulty: "Beginner", "Easy", "Medium", "Normal", or "Hard" |
|
| 8 |
+
| `games` | int | Number of games played |
|
| 9 |
| `win_rate` | float | Win percentage (0.0 - 100.0) |
|
| 10 |
| `score` | float | Composite benchmark score (0.0 - 100.0) |
|
| 11 |
| `avg_kills` | float | Average enemy cost destroyed per game |
|
evaluate.py
CHANGED
|
@@ -81,7 +81,7 @@ def parse_args() -> argparse.Namespace:
|
|
| 81 |
)
|
| 82 |
parser.add_argument(
|
| 83 |
"--opponent",
|
| 84 |
-
choices=["Easy", "Normal", "Hard"],
|
| 85 |
default="Normal",
|
| 86 |
help="AI opponent difficulty (default: Normal)",
|
| 87 |
)
|
|
|
|
| 81 |
)
|
| 82 |
parser.add_argument(
|
| 83 |
"--opponent",
|
| 84 |
+
choices=["Beginner", "Easy", "Medium", "Normal", "Hard"],
|
| 85 |
default="Normal",
|
| 86 |
help="AI opponent difficulty (default: Normal)",
|
| 87 |
)
|
evaluate_runner.py
CHANGED
|
@@ -128,7 +128,7 @@ async def run_evaluation(
|
|
| 128 |
|
| 129 |
Args:
|
| 130 |
agent_name: Display name for the leaderboard.
|
| 131 |
-
opponent: AI difficulty (Easy/Normal/Hard).
|
| 132 |
num_games: Number of games to play.
|
| 133 |
server_url: OpenRA-RL server URL.
|
| 134 |
on_game_done: Optional callback(game_num, total, metrics) after each game.
|
|
|
|
| 128 |
|
| 129 |
Args:
|
| 130 |
agent_name: Display name for the leaderboard.
|
| 131 |
+
opponent: AI difficulty (Beginner/Easy/Medium/Normal/Hard).
|
| 132 |
num_games: Number of games to play.
|
| 133 |
server_url: OpenRA-RL server URL.
|
| 134 |
on_game_done: Optional callback(game_num, total, metrics) after each game.
|
tests/test_app.py
CHANGED
|
@@ -12,10 +12,13 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
| 12 |
from app import (
|
| 13 |
AGENT_TYPE_COLORS,
|
| 14 |
DISPLAY_COLUMNS,
|
|
|
|
| 15 |
add_type_badges,
|
| 16 |
build_app,
|
| 17 |
filter_leaderboard,
|
|
|
|
| 18 |
load_data,
|
|
|
|
| 19 |
)
|
| 20 |
|
| 21 |
|
|
@@ -80,16 +83,19 @@ class TestFilter:
|
|
| 80 |
assert isinstance(df, pd.DataFrame)
|
| 81 |
|
| 82 |
def test_search_filters_by_name(self):
|
| 83 |
-
df = filter_leaderboard("
|
| 84 |
-
# If there are results, they should contain "ScriptedBot"
|
| 85 |
if len(df) > 0:
|
| 86 |
-
|
| 87 |
-
assert all("ScriptedBot" in str(row) for row in df["Agent"])
|
| 88 |
|
| 89 |
def test_opponent_filter(self):
|
| 90 |
-
df = filter_leaderboard("", [], "
|
| 91 |
if len(df) > 0:
|
| 92 |
-
assert all(df["Opponent"] == "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
|
| 95 |
class TestBuildApp:
|
|
@@ -98,3 +104,94 @@ class TestBuildApp:
|
|
| 98 |
def test_builds_without_error(self):
|
| 99 |
app = build_app()
|
| 100 |
assert app is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from app import (
|
| 13 |
AGENT_TYPE_COLORS,
|
| 14 |
DISPLAY_COLUMNS,
|
| 15 |
+
VALID_OPPONENTS,
|
| 16 |
add_type_badges,
|
| 17 |
build_app,
|
| 18 |
filter_leaderboard,
|
| 19 |
+
handle_api_submit,
|
| 20 |
load_data,
|
| 21 |
+
validate_submission,
|
| 22 |
)
|
| 23 |
|
| 24 |
|
|
|
|
| 83 |
assert isinstance(df, pd.DataFrame)
|
| 84 |
|
| 85 |
def test_search_filters_by_name(self):
|
| 86 |
+
df = filter_leaderboard("qwen", [], "All")
|
|
|
|
| 87 |
if len(df) > 0:
|
| 88 |
+
assert all("qwen" in str(row).lower() for row in df["Agent"])
|
|
|
|
| 89 |
|
| 90 |
def test_opponent_filter(self):
|
| 91 |
+
df = filter_leaderboard("", [], "Beginner")
|
| 92 |
if len(df) > 0:
|
| 93 |
+
assert all(df["Opponent"] == "Beginner")
|
| 94 |
+
|
| 95 |
+
def test_opponent_filter_hard(self):
|
| 96 |
+
df = filter_leaderboard("", [], "Hard")
|
| 97 |
+
# May be empty if no Hard entries exist
|
| 98 |
+
assert isinstance(df, pd.DataFrame)
|
| 99 |
|
| 100 |
|
| 101 |
class TestBuildApp:
|
|
|
|
| 104 |
def test_builds_without_error(self):
|
| 105 |
app = build_app()
|
| 106 |
assert app is not None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class TestValidateSubmission:
|
| 110 |
+
"""Test submission validation."""
|
| 111 |
+
|
| 112 |
+
def _valid_data(self):
|
| 113 |
+
return {
|
| 114 |
+
"agent_name": "TestBot",
|
| 115 |
+
"agent_type": "LLM",
|
| 116 |
+
"opponent": "Beginner",
|
| 117 |
+
"result": "loss",
|
| 118 |
+
"ticks": 27000,
|
| 119 |
+
"kills_cost": 1000,
|
| 120 |
+
"deaths_cost": 2900,
|
| 121 |
+
"assets_value": 9050,
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
def test_valid_submission(self):
|
| 125 |
+
valid, err = validate_submission(self._valid_data())
|
| 126 |
+
assert valid
|
| 127 |
+
assert err == ""
|
| 128 |
+
|
| 129 |
+
def test_missing_field(self):
|
| 130 |
+
data = {"agent_name": "Bot"}
|
| 131 |
+
valid, err = validate_submission(data)
|
| 132 |
+
assert not valid
|
| 133 |
+
assert "Missing required field" in err
|
| 134 |
+
|
| 135 |
+
def test_invalid_opponent(self):
|
| 136 |
+
data = self._valid_data()
|
| 137 |
+
data["opponent"] = "Brutal"
|
| 138 |
+
valid, err = validate_submission(data)
|
| 139 |
+
assert not valid
|
| 140 |
+
assert "Invalid opponent" in err
|
| 141 |
+
|
| 142 |
+
def test_invalid_agent_type(self):
|
| 143 |
+
data = self._valid_data()
|
| 144 |
+
data["agent_type"] = "MCTS"
|
| 145 |
+
valid, err = validate_submission(data)
|
| 146 |
+
assert not valid
|
| 147 |
+
assert "Invalid agent_type" in err
|
| 148 |
+
|
| 149 |
+
def test_all_opponents_accepted(self):
|
| 150 |
+
for opp in VALID_OPPONENTS:
|
| 151 |
+
data = self._valid_data()
|
| 152 |
+
data["opponent"] = opp
|
| 153 |
+
valid, _ = validate_submission(data)
|
| 154 |
+
assert valid, f"Opponent '{opp}' should be valid"
|
| 155 |
+
|
| 156 |
+
def test_all_agent_types_accepted(self):
|
| 157 |
+
for at in ["Scripted", "LLM", "RL"]:
|
| 158 |
+
data = self._valid_data()
|
| 159 |
+
data["agent_type"] = at
|
| 160 |
+
valid, _ = validate_submission(data)
|
| 161 |
+
assert valid, f"Agent type '{at}' should be valid"
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class TestApiSubmit:
    """Test API submission handler."""

    def test_valid_json(self):
        import json
        import tempfile

        data = {
            "agent_name": "TestBot",
            "agent_type": "LLM",
            "opponent": "Easy",
            "result": "win",
            "win": True,
            "ticks": 5000,
            "kills_cost": 3000,
            "deaths_cost": 1000,
            "assets_value": 8000,
        }
        # Use a temp CSV to avoid polluting real data
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
            temp_path = Path(f.name)
        try:
            with patch("app.DATA_PATH", temp_path):
                result = handle_api_submit(json.dumps(data))
            assert "OK" in result
            assert "TestBot" in result
        finally:
            # Remove the temp file even when an assertion above fails.
            temp_path.unlink(missing_ok=True)

    def test_invalid_json(self):
        result = handle_api_submit("not json")
        assert "Invalid JSON" in result

    def test_missing_fields(self):
        import json

        result = handle_api_submit(json.dumps({"agent_name": "Bot"}))
        assert "Validation error" in result
|
tests/test_evaluate.py
CHANGED
|
@@ -72,6 +72,20 @@ class TestParseArgs:
|
|
| 72 |
args = parse_args()
|
| 73 |
assert args.agent_type == "RL"
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
class TestGetAgentFn:
|
| 77 |
"""Test agent factory."""
|
|
|
|
| 72 |
args = parse_args()
|
| 73 |
assert args.agent_type == "RL"
|
| 74 |
|
| 75 |
+
def test_beginner_opponent_accepted(self):
|
| 76 |
+
with patch("sys.argv", [
|
| 77 |
+
"evaluate.py", "--agent-name", "T", "--opponent", "Beginner",
|
| 78 |
+
]):
|
| 79 |
+
args = parse_args()
|
| 80 |
+
assert args.opponent == "Beginner"
|
| 81 |
+
|
| 82 |
+
def test_medium_opponent_accepted(self):
|
| 83 |
+
with patch("sys.argv", [
|
| 84 |
+
"evaluate.py", "--agent-name", "T", "--opponent", "Medium",
|
| 85 |
+
]):
|
| 86 |
+
args = parse_args()
|
| 87 |
+
assert args.opponent == "Medium"
|
| 88 |
+
|
| 89 |
|
| 90 |
class TestGetAgentFn:
|
| 91 |
"""Test agent factory."""
|