yxc20098 committed on
Commit
45ef63c
·
1 Parent(s): 8ce66d2

Add upload form, API endpoint, 5 difficulty tiers, real game data

Browse files

- Add validate_submission + handle_api_submit for CLI/form uploads
- Add Gradio upload form in Submit tab (JSON + optional replay)
- Update opponent tiers: Beginner/Easy/Medium/Normal/Hard (was 3)
- Replace dummy CSV data with real game result
- Update schema.md with 5-tier opponent values
- 37 tests pass (12 new: validation, API submit, tier acceptance)

Files changed (7) hide show
  1. app.py +185 -37
  2. data/results.csv +1 -5
  3. data/schema.md +2 -2
  4. evaluate.py +1 -1
  5. evaluate_runner.py +1 -1
  6. tests/test_app.py +103 -6
  7. tests/test_evaluate.py +14 -0
app.py CHANGED
@@ -19,7 +19,7 @@ from pathlib import Path
19
  import gradio as gr
20
  import pandas as pd
21
 
22
- from evaluate_runner import DEFAULT_SERVER
23
 
24
  # ── Data Loading ────────────────────────────────────────────
25
 
@@ -170,6 +170,127 @@ def save_submission(results: dict) -> None:
170
  writer.writerow(results)
171
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  # ── UI ──────────────────────────────────────────────────────────────
174
 
175
  ABOUT_MD = """
@@ -183,7 +304,7 @@ ABOUT_MD = """
183
 
184
  - **Game**: Red Alert (OpenRA engine)
185
  - **Format**: 1v1 agent vs built-in AI
186
- - **Opponents**: Easy, Normal, Hard difficulty
187
  - **Games per entry**: Minimum 10 games per configuration
188
  - **Metrics**: Win rate, composite score, K/D ratio, economy
189
 
@@ -213,37 +334,36 @@ The benchmark score combines three components:
213
  """
214
 
215
  SUBMIT_MD = """
216
- ## How to Submit Results
217
 
218
- ### Option A: Watch AI Play (no setup needed)
219
 
220
- Visit the [OpenRA-RL Space](https://huggingface.co/spaces/openra-rl/openra-rl)
221
- and click **Try** to watch a pre-configured LLM agent play Red Alert
222
- directly in your browser. No API keys or setup required.
223
 
224
- ### Option B: CLI with HuggingFace-hosted server (no Docker needed)
 
225
 
226
- ```bash
227
- git clone https://github.com/yxc20089/OpenRA-Bench.git
228
- cd OpenRA-Bench
229
- pip install -r requirements.txt
230
- pip install openra-rl openra-rl-util
231
 
232
- python evaluate.py \\
233
- --agent scripted \\
234
- --agent-name "MyBot-v1" \\
235
- --agent-type Scripted \\
236
- --opponent Normal \\
237
- --games 10 \\
238
- --server https://openra-rl-openra-rl.hf.space
239
  ```
240
 
241
- ### Option C: Local server (Docker)**
242
 
243
  ```bash
244
- git clone --recursive https://github.com/yxc20089/OpenRA-RL.git
245
- cd OpenRA-RL && pip install -e . && docker compose up openra-rl
246
- cd /path/to/OpenRA-Bench
 
247
 
248
  python evaluate.py \\
249
  --agent scripted \\
@@ -254,15 +374,6 @@ python evaluate.py \\
254
  --server http://localhost:8000
255
  ```
256
 
257
- ### 3. Submit via Pull Request
258
-
259
- 1. Fork [OpenRA-Bench](https://github.com/yxc20089/OpenRA-Bench)
260
- 2. Run the evaluation (results append to `data/results.csv`)
261
- 3. Commit and open a PR with:
262
- - Your updated `data/results.csv`
263
- - A description of your agent
264
- - (Optional) Replay files in `replays/`
265
-
266
  ### Evaluation Parameters
267
 
268
  | Parameter | Description |
@@ -270,19 +381,19 @@ python evaluate.py \\
270
  | `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
271
  | `--agent-name` | Display name on the leaderboard |
272
  | `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
273
- | `--opponent` | AI difficulty: `Easy`, `Normal`, `Hard` |
274
  | `--games` | Number of games (minimum 10) |
275
  | `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
276
 
277
  ### Custom Agents
278
 
279
- For custom agents, implement the standard `reset/step` loop:
280
 
281
  ```python
282
  from openra_env.client import OpenRAEnv
283
  from openra_env.models import OpenRAAction
284
 
285
- async with OpenRAEnv("https://openra-rl-openra-rl.hf.space") as env:
286
  obs = await env.reset()
287
  while not obs.done:
288
  action = your_agent.decide(obs)
@@ -320,7 +431,7 @@ def build_app() -> gr.Blocks:
320
  scale=2,
321
  )
322
  opponent_filter = gr.Dropdown(
323
- choices=["All", "Easy", "Normal", "Hard"],
324
  value="All",
325
  label="Opponent",
326
  scale=1,
@@ -361,6 +472,43 @@ def build_app() -> gr.Blocks:
361
 
362
  # ── Submit Tab ──────────────────────────────────
363
  with gr.Tab("Submit"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  gr.Markdown(SUBMIT_MD)
365
 
366
  return app
 
19
  import gradio as gr
20
  import pandas as pd
21
 
22
+ from evaluate_runner import DEFAULT_SERVER, compute_composite_score, compute_game_metrics
23
 
24
  # ── Data Loading ────────────────────────────────────────────
25
 
 
170
  writer.writerow(results)
171
 
172
 
173
+ # ── Submission Handling ─────────────────────────────────────
174
+
175
+ VALID_OPPONENTS = {"Beginner", "Easy", "Medium", "Normal", "Hard"}
176
+ VALID_AGENT_TYPES = {"Scripted", "LLM", "RL"}
177
+ REQUIRED_FIELDS = [
178
+ "agent_name", "agent_type", "opponent", "result",
179
+ "ticks", "kills_cost", "deaths_cost", "assets_value",
180
+ ]
181
+
182
+
183
+ def validate_submission(data: dict) -> tuple[bool, str]:
184
+ """Validate an uploaded JSON submission.
185
+
186
+ Returns (is_valid, error_message).
187
+ """
188
+ for field in REQUIRED_FIELDS:
189
+ if field not in data:
190
+ return False, f"Missing required field: {field}"
191
+
192
+ if data["agent_type"] not in VALID_AGENT_TYPES:
193
+ return False, (
194
+ f"Invalid agent_type: {data['agent_type']}. "
195
+ f"Must be one of: {', '.join(sorted(VALID_AGENT_TYPES))}"
196
+ )
197
+
198
+ if data["opponent"] not in VALID_OPPONENTS:
199
+ return False, (
200
+ f"Invalid opponent: {data['opponent']}. "
201
+ f"Must be one of: {', '.join(sorted(VALID_OPPONENTS))}"
202
+ )
203
+
204
+ return True, ""
205
+
206
+
207
+ def _score_from_submission(data: dict) -> dict:
208
+ """Build a CSV-ready results dict from a validated submission."""
209
+ game_result = {
210
+ "result": data.get("result", ""),
211
+ "win": data.get("win", data.get("result") == "win"),
212
+ "ticks": data.get("ticks", 0),
213
+ "kills_cost": data.get("kills_cost", 0),
214
+ "deaths_cost": data.get("deaths_cost", 0),
215
+ "kd_ratio": data.get("kd_ratio", 0),
216
+ "assets_value": data.get("assets_value", 0),
217
+ "cash": data.get("cash", 0),
218
+ }
219
+ score = compute_composite_score([game_result])
220
+ kills = data.get("kills_cost", 0)
221
+ deaths = data.get("deaths_cost", 0)
222
+ games = data.get("games", 1)
223
+
224
+ return {
225
+ "agent_name": data["agent_name"],
226
+ "agent_type": data["agent_type"],
227
+ "opponent": data["opponent"],
228
+ "games": games,
229
+ "win_rate": round(100.0 * (1 if data.get("win") else 0) / max(games, 1), 1),
230
+ "score": round(score, 1),
231
+ "avg_kills": kills,
232
+ "avg_deaths": deaths,
233
+ "kd_ratio": round(kills / max(deaths, 1), 2),
234
+ "avg_economy": data.get("assets_value", 0),
235
+ "avg_game_length": data.get("ticks", 0),
236
+ "timestamp": data.get("timestamp", datetime.now(timezone.utc).strftime("%Y-%m-%d"))[:10],
237
+ "replay_url": "",
238
+ }
239
+
240
+
241
+ def handle_upload(json_file, replay_file) -> tuple[str, pd.DataFrame]:
242
+ """Process an uploaded bench submission JSON + optional replay."""
243
+ if json_file is None:
244
+ return "Please upload a JSON file.", add_type_badges(load_data())
245
+
246
+ try:
247
+ with open(json_file.name) as f:
248
+ data = json.load(f)
249
+ except (json.JSONDecodeError, Exception) as e:
250
+ return f"Invalid JSON: {e}", add_type_badges(load_data())
251
+
252
+ is_valid, error = validate_submission(data)
253
+ if not is_valid:
254
+ return f"Validation error: {error}", add_type_badges(load_data())
255
+
256
+ results_row = _score_from_submission(data)
257
+
258
+ # Save replay if provided
259
+ if replay_file is not None:
260
+ import shutil
261
+ replay_name = Path(replay_file.name).name
262
+ shutil.copy2(replay_file.name, SUBMISSIONS_DIR / replay_name)
263
+ results_row["replay_url"] = replay_name
264
+
265
+ save_submission(results_row)
266
+
267
+ return (
268
+ f"Submitted! **{data['agent_name']}** ({data['agent_type']}) "
269
+ f"vs {data['opponent']}: score **{results_row['score']}**",
270
+ add_type_badges(load_data()),
271
+ )
272
+
273
+
274
+ def handle_api_submit(json_data: str) -> str:
275
+ """API endpoint: accept JSON string submission. Used by CLI auto-upload."""
276
+ try:
277
+ data = json.loads(json_data)
278
+ except (json.JSONDecodeError, Exception) as e:
279
+ return f"Invalid JSON: {e}"
280
+
281
+ is_valid, error = validate_submission(data)
282
+ if not is_valid:
283
+ return f"Validation error: {error}"
284
+
285
+ results_row = _score_from_submission(data)
286
+ save_submission(results_row)
287
+
288
+ return (
289
+ f"OK: {data['agent_name']} ({data['agent_type']}) "
290
+ f"vs {data['opponent']}: score {results_row['score']}"
291
+ )
292
+
293
+
294
  # ── UI ──────────────────────────────────────────────────────────────
295
 
296
  ABOUT_MD = """
 
304
 
305
  - **Game**: Red Alert (OpenRA engine)
306
  - **Format**: 1v1 agent vs built-in AI
307
+ - **Opponents**: Beginner, Easy, Medium, Normal, Hard difficulty
308
  - **Games per entry**: Minimum 10 games per configuration
309
  - **Metrics**: Win rate, composite score, K/D ratio, economy
310
 
 
334
  """
335
 
336
  SUBMIT_MD = """
337
+ ---
338
 
339
+ ## Other Submission Methods
340
 
341
+ ### CLI Auto-Upload
 
 
342
 
343
+ Set `BENCH_URL` in your OpenRA-RL config and results upload automatically
344
+ after each game:
345
 
346
+ ```yaml
347
+ # config.yaml
348
+ agent:
349
+ bench_url: "https://openra-rl-openra-bench.hf.space"
350
+ ```
351
 
352
+ ### CLI Manual Upload
353
+
354
+ Upload a previously exported bench JSON:
355
+
356
+ ```bash
357
+ python -m openra_env.bench_submit ~/.openra-rl/bench-exports/bench-*.json
 
358
  ```
359
 
360
+ ### Batch Evaluation (10+ games)
361
 
362
  ```bash
363
+ git clone https://github.com/yxc20089/OpenRA-Bench.git
364
+ cd OpenRA-Bench
365
+ pip install -r requirements.txt
366
+ pip install openra-rl openra-rl-util
367
 
368
  python evaluate.py \\
369
  --agent scripted \\
 
374
  --server http://localhost:8000
375
  ```
376
 
 
 
 
 
 
 
 
 
 
377
  ### Evaluation Parameters
378
 
379
  | Parameter | Description |
 
381
  | `--agent` | Agent type: `scripted`, `llm`, `mcp`, `custom` |
382
  | `--agent-name` | Display name on the leaderboard |
383
  | `--agent-type` | Category: `Scripted`, `LLM`, `RL` |
384
+ | `--opponent` | AI difficulty: `Beginner`, `Easy`, `Medium`, `Normal`, `Hard` |
385
  | `--games` | Number of games (minimum 10) |
386
  | `--server` | OpenRA-RL server URL (local or HuggingFace-hosted) |
387
 
388
  ### Custom Agents
389
 
390
+ Implement the standard `reset/step` loop:
391
 
392
  ```python
393
  from openra_env.client import OpenRAEnv
394
  from openra_env.models import OpenRAAction
395
 
396
+ async with OpenRAEnv("http://localhost:8000") as env:
397
  obs = await env.reset()
398
  while not obs.done:
399
  action = your_agent.decide(obs)
 
431
  scale=2,
432
  )
433
  opponent_filter = gr.Dropdown(
434
+ choices=["All", "Beginner", "Easy", "Medium", "Normal", "Hard"],
435
  value="All",
436
  label="Opponent",
437
  scale=1,
 
472
 
473
  # ── Submit Tab ──────────────────────────────────
474
  with gr.Tab("Submit"):
475
+ gr.Markdown(
476
+ "## Upload Results\n\n"
477
+ "Upload a bench export JSON from your OpenRA-RL game. "
478
+ "After each game, the agent saves a JSON file to "
479
+ "`~/.openra-rl/bench-exports/`."
480
+ )
481
+ with gr.Row():
482
+ json_upload = gr.File(
483
+ label="Bench export JSON",
484
+ file_types=[".json"],
485
+ scale=3,
486
+ )
487
+ replay_upload = gr.File(
488
+ label="Replay file (optional)",
489
+ file_types=[".orarep"],
490
+ scale=2,
491
+ )
492
+ submit_btn = gr.Button("Submit Results", variant="primary")
493
+ submit_output = gr.Markdown()
494
+
495
+ submit_btn.click(
496
+ fn=handle_upload,
497
+ inputs=[json_upload, replay_upload],
498
+ outputs=[submit_output, leaderboard],
499
+ )
500
+
501
+ # API endpoint for CLI auto-upload
502
+ api_json_input = gr.Textbox(visible=False)
503
+ api_result = gr.Textbox(visible=False)
504
+ api_btn = gr.Button(visible=False)
505
+ api_btn.click(
506
+ fn=handle_api_submit,
507
+ inputs=[api_json_input],
508
+ outputs=[api_result],
509
+ api_name="submit",
510
+ )
511
+
512
  gr.Markdown(SUBMIT_MD)
513
 
514
  return app
data/results.csv CHANGED
@@ -1,6 +1,2 @@
1
  agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
2
- ScriptedBot-v1,Scripted,Easy,10,90.0,72.5,8450,2100,4.02,12500,1850,2026-02-19,
3
- ScriptedBot-v1,Scripted,Normal,10,60.0,52.3,6200,4800,1.29,8200,2400,2026-02-19,
4
- ScriptedBot-v1,Scripted,Hard,10,20.0,28.1,3100,7200,0.43,4500,1600,2026-02-19,
5
- LLM-Agent-v1,LLM,Easy,10,80.0,65.8,7200,3400,2.12,11000,2200,2026-02-19,
6
- LLM-Agent-v1,LLM,Normal,10,50.0,48.7,5800,5200,1.12,7800,2800,2026-02-19,
 
1
  agent_name,agent_type,opponent,games,win_rate,score,avg_kills,avg_deaths,kd_ratio,avg_economy,avg_game_length,timestamp,replay_url
2
+ qwen/qwen3-coder-next,LLM,Beginner,1,0.0,18.3,1000,2900,0.34,9050,27349,2026-02-25,
 
 
 
 
data/schema.md CHANGED
@@ -4,8 +4,8 @@
4
  |--------|------|-------------|
5
  | `agent_name` | str | Agent identifier displayed on leaderboard |
6
  | `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
7
- | `opponent` | str | AI difficulty: "Easy", "Normal", or "Hard" |
8
- | `games` | int | Number of games played (minimum 10) |
9
  | `win_rate` | float | Win percentage (0.0 - 100.0) |
10
  | `score` | float | Composite benchmark score (0.0 - 100.0) |
11
  | `avg_kills` | float | Average enemy cost destroyed per game |
 
4
  |--------|------|-------------|
5
  | `agent_name` | str | Agent identifier displayed on leaderboard |
6
  | `agent_type` | str | Category: "Scripted", "LLM", or "RL" |
7
+ | `opponent` | str | AI difficulty: "Beginner", "Easy", "Medium", "Normal", or "Hard" |
8
+ | `games` | int | Number of games played |
9
  | `win_rate` | float | Win percentage (0.0 - 100.0) |
10
  | `score` | float | Composite benchmark score (0.0 - 100.0) |
11
  | `avg_kills` | float | Average enemy cost destroyed per game |
evaluate.py CHANGED
@@ -81,7 +81,7 @@ def parse_args() -> argparse.Namespace:
81
  )
82
  parser.add_argument(
83
  "--opponent",
84
- choices=["Easy", "Normal", "Hard"],
85
  default="Normal",
86
  help="AI opponent difficulty (default: Normal)",
87
  )
 
81
  )
82
  parser.add_argument(
83
  "--opponent",
84
+ choices=["Beginner", "Easy", "Medium", "Normal", "Hard"],
85
  default="Normal",
86
  help="AI opponent difficulty (default: Normal)",
87
  )
evaluate_runner.py CHANGED
@@ -128,7 +128,7 @@ async def run_evaluation(
128
 
129
  Args:
130
  agent_name: Display name for the leaderboard.
131
- opponent: AI difficulty (Easy/Normal/Hard).
132
  num_games: Number of games to play.
133
  server_url: OpenRA-RL server URL.
134
  on_game_done: Optional callback(game_num, total, metrics) after each game.
 
128
 
129
  Args:
130
  agent_name: Display name for the leaderboard.
131
+ opponent: AI difficulty (Beginner/Easy/Medium/Normal/Hard).
132
  num_games: Number of games to play.
133
  server_url: OpenRA-RL server URL.
134
  on_game_done: Optional callback(game_num, total, metrics) after each game.
tests/test_app.py CHANGED
@@ -12,10 +12,13 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
12
  from app import (
13
  AGENT_TYPE_COLORS,
14
  DISPLAY_COLUMNS,
 
15
  add_type_badges,
16
  build_app,
17
  filter_leaderboard,
 
18
  load_data,
 
19
  )
20
 
21
 
@@ -80,16 +83,19 @@ class TestFilter:
80
  assert isinstance(df, pd.DataFrame)
81
 
82
  def test_search_filters_by_name(self):
83
- df = filter_leaderboard("ScriptedBot", [], "All")
84
- # If there are results, they should contain "ScriptedBot"
85
  if len(df) > 0:
86
- # Badges are in the Type column, not Agent
87
- assert all("ScriptedBot" in str(row) for row in df["Agent"])
88
 
89
  def test_opponent_filter(self):
90
- df = filter_leaderboard("", [], "Hard")
91
  if len(df) > 0:
92
- assert all(df["Opponent"] == "Hard")
 
 
 
 
 
93
 
94
 
95
  class TestBuildApp:
@@ -98,3 +104,94 @@ class TestBuildApp:
98
  def test_builds_without_error(self):
99
  app = build_app()
100
  assert app is not None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from app import (
13
  AGENT_TYPE_COLORS,
14
  DISPLAY_COLUMNS,
15
+ VALID_OPPONENTS,
16
  add_type_badges,
17
  build_app,
18
  filter_leaderboard,
19
+ handle_api_submit,
20
  load_data,
21
+ validate_submission,
22
  )
23
 
24
 
 
83
  assert isinstance(df, pd.DataFrame)
84
 
85
  def test_search_filters_by_name(self):
86
+ df = filter_leaderboard("qwen", [], "All")
 
87
  if len(df) > 0:
88
+ assert all("qwen" in str(row).lower() for row in df["Agent"])
 
89
 
90
  def test_opponent_filter(self):
91
+ df = filter_leaderboard("", [], "Beginner")
92
  if len(df) > 0:
93
+ assert all(df["Opponent"] == "Beginner")
94
+
95
+ def test_opponent_filter_hard(self):
96
+ df = filter_leaderboard("", [], "Hard")
97
+ # May be empty if no Hard entries exist
98
+ assert isinstance(df, pd.DataFrame)
99
 
100
 
101
  class TestBuildApp:
 
104
  def test_builds_without_error(self):
105
  app = build_app()
106
  assert app is not None
107
+
108
+
109
+ class TestValidateSubmission:
110
+ """Test submission validation."""
111
+
112
+ def _valid_data(self):
113
+ return {
114
+ "agent_name": "TestBot",
115
+ "agent_type": "LLM",
116
+ "opponent": "Beginner",
117
+ "result": "loss",
118
+ "ticks": 27000,
119
+ "kills_cost": 1000,
120
+ "deaths_cost": 2900,
121
+ "assets_value": 9050,
122
+ }
123
+
124
+ def test_valid_submission(self):
125
+ valid, err = validate_submission(self._valid_data())
126
+ assert valid
127
+ assert err == ""
128
+
129
+ def test_missing_field(self):
130
+ data = {"agent_name": "Bot"}
131
+ valid, err = validate_submission(data)
132
+ assert not valid
133
+ assert "Missing required field" in err
134
+
135
+ def test_invalid_opponent(self):
136
+ data = self._valid_data()
137
+ data["opponent"] = "Brutal"
138
+ valid, err = validate_submission(data)
139
+ assert not valid
140
+ assert "Invalid opponent" in err
141
+
142
+ def test_invalid_agent_type(self):
143
+ data = self._valid_data()
144
+ data["agent_type"] = "MCTS"
145
+ valid, err = validate_submission(data)
146
+ assert not valid
147
+ assert "Invalid agent_type" in err
148
+
149
+ def test_all_opponents_accepted(self):
150
+ for opp in VALID_OPPONENTS:
151
+ data = self._valid_data()
152
+ data["opponent"] = opp
153
+ valid, _ = validate_submission(data)
154
+ assert valid, f"Opponent '{opp}' should be valid"
155
+
156
+ def test_all_agent_types_accepted(self):
157
+ for at in ["Scripted", "LLM", "RL"]:
158
+ data = self._valid_data()
159
+ data["agent_type"] = at
160
+ valid, _ = validate_submission(data)
161
+ assert valid, f"Agent type '{at}' should be valid"
162
+
163
+
164
+ class TestApiSubmit:
165
+ """Test API submission handler."""
166
+
167
+ def test_valid_json(self):
168
+ import json
169
+ data = {
170
+ "agent_name": "TestBot",
171
+ "agent_type": "LLM",
172
+ "opponent": "Easy",
173
+ "result": "win",
174
+ "win": True,
175
+ "ticks": 5000,
176
+ "kills_cost": 3000,
177
+ "deaths_cost": 1000,
178
+ "assets_value": 8000,
179
+ }
180
+ # Use a temp CSV to avoid polluting real data
181
+ import tempfile
182
+ with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
183
+ temp_path = Path(f.name)
184
+ with patch("app.DATA_PATH", temp_path):
185
+ result = handle_api_submit(json.dumps(data))
186
+ assert "OK" in result
187
+ assert "TestBot" in result
188
+ temp_path.unlink(missing_ok=True)
189
+
190
+ def test_invalid_json(self):
191
+ result = handle_api_submit("not json")
192
+ assert "Invalid JSON" in result
193
+
194
+ def test_missing_fields(self):
195
+ import json
196
+ result = handle_api_submit(json.dumps({"agent_name": "Bot"}))
197
+ assert "Validation error" in result
tests/test_evaluate.py CHANGED
@@ -72,6 +72,20 @@ class TestParseArgs:
72
  args = parse_args()
73
  assert args.agent_type == "RL"
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  class TestGetAgentFn:
77
  """Test agent factory."""
 
72
  args = parse_args()
73
  assert args.agent_type == "RL"
74
 
75
+ def test_beginner_opponent_accepted(self):
76
+ with patch("sys.argv", [
77
+ "evaluate.py", "--agent-name", "T", "--opponent", "Beginner",
78
+ ]):
79
+ args = parse_args()
80
+ assert args.opponent == "Beginner"
81
+
82
+ def test_medium_opponent_accepted(self):
83
+ with patch("sys.argv", [
84
+ "evaluate.py", "--agent-name", "T", "--opponent", "Medium",
85
+ ]):
86
+ args = parse_args()
87
+ assert args.opponent == "Medium"
88
+
89
 
90
  class TestGetAgentFn:
91
  """Test agent factory."""