Spaces:
Sleeping
Sleeping
Commit ·
5b25e42
1
Parent(s): 14170d7
Make openenv validate pass
Browse files- changes.md +164 -483
- pyproject.toml +3 -0
- server/__init__.py +0 -0
- server/app.py +18 -0
- tests/test_inference.py +7 -0
changes.md
CHANGED
|
@@ -1,19 +1,15 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
---
|
| 8 |
-
|
| 9 |
-
## SECTION 1 — CRITICAL BUGS (
|
| 10 |
-
|
| 11 |
-
### 1.1 Fix `openenv.yaml` —
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
**Action:** Rewrite `openenv.yaml` using 2-space indentation throughout. Use exactly this content:
|
| 16 |
-
|
| 17 |
```yaml
|
| 18 |
name: citywide-dispatch-supervisor
|
| 19 |
version: "0.1.0"
|
|
@@ -36,132 +32,76 @@ tasks:
|
|
| 36 |
name: Shift Surge
|
| 37 |
description: Incident waves combined with units going out of service; maintain coverage over time.
|
| 38 |
```
|
| 39 |
-
|
| 40 |
-
Verify with: `python -c "import yaml; yaml.safe_load(open('openenv.yaml'))
|
| 41 |
-
|
| 42 |
---
|
| 43 |
-
|
| 44 |
-
### 1.2 Fix `src/server/app.py` — Server never starts
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
**Action:** Add the following two lines at the very bottom of `src/server/app.py`, after the `def main()` block:
|
| 49 |
-
|
| 50 |
```python
|
| 51 |
if __name__ == "__main__":
|
| 52 |
main()
|
| 53 |
```
|
| 54 |
-
|
| 55 |
-
Also update the `main()` function to
|
| 56 |
-
|
| 57 |
```python
|
| 58 |
def main():
|
| 59 |
import uvicorn
|
| 60 |
uvicorn.run("src.server.app:app", host="0.0.0.0", port=8000, reload=False)
|
| 61 |
```
|
| 62 |
-
|
| 63 |
-
**Verify:** `docker build -t citywide-dispatch-supervisor . && docker run -p 8000:8000 citywide-dispatch-supervisor` must keep running (the container must not exit), and `curl http://localhost:8000/health` must return `{"status":"ok"}`.
|
| 64 |
-
|
| 65 |
---
|
| 66 |
-
|
| 67 |
-
### 1.3 Fix `src/server/app.py`
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
**Action:** In `src/server/app.py`, change `ResetRequest` to give `task_id` a sensible default:
|
| 72 |
-
|
| 73 |
```python
|
| 74 |
class ResetRequest(BaseModel):
|
| 75 |
task_id: str = "single_incident"
|
| 76 |
seed: int | None = None
|
| 77 |
```
|
| 78 |
-
|
| 79 |
-
**Verify:** `curl -s -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'` must return HTTP 200 with a valid observation JSON.
|
| 80 |
-
|
| 81 |
---
|
| 82 |
-
|
| 83 |
-
### 1.4 Fix `Dockerfile` — Use
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
**Action:** Replace the `CMD` in the root `Dockerfile` with:
|
| 88 |
-
|
| 89 |
```dockerfile
|
| 90 |
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 91 |
```
|
| 92 |
-
|
| 93 |
-
The full updated `Dockerfile` should be:
|
| 94 |
-
|
| 95 |
-
```dockerfile
|
| 96 |
-
FROM python:3.11-slim
|
| 97 |
-
LABEL org.opencontainers.image.title="911 City-Wide Emergency Dispatch Supervisor"
|
| 98 |
-
LABEL org.opencontainers.image.description="City-wide 911 dispatch supervisor RL environment"
|
| 99 |
-
WORKDIR /app
|
| 100 |
-
COPY . /app
|
| 101 |
-
RUN pip install uv && uv sync --frozen
|
| 102 |
-
EXPOSE 8000
|
| 103 |
-
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 104 |
-
```
|
| 105 |
-
|
| 106 |
---
|
| 107 |
-
|
| 108 |
-
## SECTION 2 — HIGH PRIORITY BUGS
|
| 109 |
-
|
| 110 |
### 2.1 Fix `validate_local.py` — `check_inference()` never uses random mode
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
**Action:** In `validate_local.py`, inside `check_inference()`, add `env["USE_RANDOM"] = "true"` before the `subprocess.run` call:
|
| 115 |
-
|
| 116 |
```python
|
| 117 |
-
|
| 118 |
-
import os
|
| 119 |
-
|
| 120 |
-
env = os.environ.copy()
|
| 121 |
-
env["API_BASE_URL"] = "https://api.openai.com/v1"
|
| 122 |
-
env["MODEL_NAME"] = "gpt-4"
|
| 123 |
-
env["HF_TOKEN"] = "dummy-token-for-local-validation"
|
| 124 |
-
env["USE_RANDOM"] = "true" # <-- ADD THIS LINE
|
| 125 |
-
|
| 126 |
-
print("\nNOTE: Running inference.py in random-agent mode for local validation")
|
| 127 |
-
result = subprocess.run(
|
| 128 |
-
["uv", "run", "python", "inference.py"],
|
| 129 |
-
capture_output=True,
|
| 130 |
-
text=True,
|
| 131 |
-
env=env,
|
| 132 |
-
timeout=300, # also increase timeout; 4 tasks can take time
|
| 133 |
-
)
|
| 134 |
-
# ... rest of function unchanged
|
| 135 |
```
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
---
|
| 138 |
-
|
| 139 |
-
### 2.2 Fix `pyproject.toml` —
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
**Action:** Add the following to the `[tool.pytest.ini_options]` section in `pyproject.toml`:
|
| 144 |
-
|
| 145 |
```toml
|
| 146 |
-
[tool.pytest.ini_options]
|
| 147 |
-
testpaths = ["tests"]
|
| 148 |
-
python_files = ["test_*.py"]
|
| 149 |
-
python_classes = ["Test*"]
|
| 150 |
-
python_functions = ["test_*"]
|
| 151 |
asyncio_mode = "auto"
|
| 152 |
```
|
| 153 |
-
|
| 154 |
---
|
| 155 |
-
|
| 156 |
-
### 2.3 Fix `inference.py` —
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
**Action:** In `inference.py`, inside the inner `except Exception as e` block within the step loop, normalize the error:
|
| 161 |
-
|
| 162 |
```python
|
| 163 |
except Exception as e:
|
| 164 |
-
error_msg =
|
| 165 |
print(
|
| 166 |
f"[STEP] step={step_count} action={action_str} "
|
| 167 |
f"reward=0.00 done=true error={error_msg}"
|
|
@@ -169,26 +109,32 @@ except Exception as e:
|
|
| 169 |
success = False
|
| 170 |
break
|
| 171 |
```
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
| 175 |
```python
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
```
|
| 178 |
-
|
| 179 |
---
|
| 180 |
-
|
| 181 |
-
### 2.
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
**Action:** Add a null-guard to `get_dashboard_state()`:
|
| 186 |
-
|
| 187 |
```python
|
| 188 |
@app.get("/dashboard/state")
|
| 189 |
async def get_dashboard_state() -> dict[str, Any]:
|
| 190 |
if _env is None:
|
| 191 |
-
# Return an empty but valid structure before /reset is called
|
| 192 |
return {
|
| 193 |
"units": {},
|
| 194 |
"incidents": {},
|
|
@@ -201,337 +147,153 @@ async def get_dashboard_state() -> dict[str, Any]:
|
|
| 201 |
"issues": [],
|
| 202 |
"observation": None,
|
| 203 |
}
|
| 204 |
-
|
| 205 |
-
legal_actions = [a.model_dump() for a in _env.legal_actions()]
|
| 206 |
-
last_obs = _env.last_observation()
|
| 207 |
-
issues = list(last_obs.issues) if last_obs is not None else []
|
| 208 |
-
obs_dict = last_obs.model_dump() if last_obs is not None else None
|
| 209 |
-
return {
|
| 210 |
-
**state_dict,
|
| 211 |
-
"legal_actions": legal_actions,
|
| 212 |
-
"issues": issues,
|
| 213 |
-
"observation": obs_dict,
|
| 214 |
-
}
|
| 215 |
-
```
|
| 216 |
-
|
| 217 |
-
---
|
| 218 |
-
|
| 219 |
-
### 2.5 Fix `inference.py` — Score computation is not normalized to competition spec
|
| 220 |
-
|
| 221 |
-
**Problem:** `total_score = sum(rewards) / len(rewards)` computes the average step reward. Since each step reward is already in [0, 1], this is a valid value but it weights the reset-time reward (score=0.0 from `obs.score=0.0` in `reset()`) equally with step rewards. This deflates the score.
|
| 222 |
-
|
| 223 |
-
**Action:** Change score computation in `run_episode()` to exclude the initial zero from reset:
|
| 224 |
-
|
| 225 |
-
```python
|
| 226 |
-
# Separate reset reward from step rewards
|
| 227 |
-
step_rewards = rewards[1:] # index 0 is the reset observation score (always 0.0)
|
| 228 |
-
if step_rewards:
|
| 229 |
-
total_score = sum(step_rewards) / len(step_rewards)
|
| 230 |
-
else:
|
| 231 |
-
total_score = 0.0
|
| 232 |
-
total_score = max(0.0, min(1.0, total_score))
|
| 233 |
-
```
|
| 234 |
-
|
| 235 |
-
Also update the `rewards_str` to only include step rewards so the `[END]` line is meaningful:
|
| 236 |
-
|
| 237 |
-
```python
|
| 238 |
-
rewards_str = ",".join(f"{r:.2f}" for r in rewards[1:]) if len(rewards) > 1 else "0.00"
|
| 239 |
```
|
| 240 |
-
|
| 241 |
---
|
| 242 |
-
|
| 243 |
-
## SECTION 3 — ENVIRONMENT DESIGN IMPROVEMENTS
|
| 244 |
-
|
| 245 |
-
### 3.1 Improve
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
**Action — `src/tasks/single_incident.py`:** Replace `SingleIncidentGrader.grade()` with:
|
| 250 |
-
|
| 251 |
```python
|
| 252 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 253 |
-
"""Grade based on: correct unit dispatched, fast response, incident resolved."""
|
| 254 |
if not rewards:
|
| 255 |
return 0.0
|
| 256 |
-
|
| 257 |
incident = state.incidents.get("INC-001")
|
| 258 |
if incident is None:
|
| 259 |
return 0.0
|
| 260 |
-
|
| 261 |
score = 0.0
|
| 262 |
-
|
| 263 |
-
# Component 1: Was the incident resolved? (50% weight)
|
| 264 |
if incident.status.value == "RESOLVED":
|
| 265 |
score += 0.50
|
| 266 |
-
|
| 267 |
-
# Component 2: Correct unit type dispatched? (30% weight)
|
| 268 |
medic_dispatched = any(
|
| 269 |
-
u.unit_type.value == "MEDIC"
|
| 270 |
-
|
| 271 |
-
u.
|
|
|
|
| 272 |
)
|
| 273 |
for u in state.units.values()
|
| 274 |
)
|
| 275 |
if medic_dispatched:
|
| 276 |
score += 0.30
|
| 277 |
-
|
| 278 |
-
# Component 3: Speed — resolved within first 10 steps (20% weight)
|
| 279 |
if incident.status.value == "RESOLVED" and state.step_count <= 10:
|
| 280 |
score += 0.20
|
| 281 |
-
|
| 282 |
return max(0.0, min(1.0, score))
|
| 283 |
```
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
| 287 |
```python
|
| 288 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 289 |
-
"""Grade based on: P1 incidents resolved, triage correctness, coverage."""
|
| 290 |
if not rewards:
|
| 291 |
return 0.0
|
| 292 |
-
|
| 293 |
total = len(state.incidents)
|
| 294 |
if total == 0:
|
| 295 |
return 0.0
|
| 296 |
-
|
| 297 |
-
resolved = sum(
|
| 298 |
-
|
| 299 |
-
if i.status.value == "RESOLVED"
|
| 300 |
-
)
|
| 301 |
-
failed = sum(
|
| 302 |
-
1 for i in state.incidents.values()
|
| 303 |
-
if i.status.value == "ESCALATED"
|
| 304 |
-
)
|
| 305 |
p1_total = sum(1 for i in state.incidents.values() if i.severity.value == "PRIORITY_1")
|
| 306 |
p1_resolved = sum(
|
| 307 |
-
1
|
| 308 |
-
|
|
|
|
|
|
|
| 309 |
)
|
| 310 |
-
|
| 311 |
resolution_score = resolved / total
|
| 312 |
p1_score = (p1_resolved / p1_total) if p1_total > 0 else 1.0
|
| 313 |
failure_penalty = failed / total
|
| 314 |
-
|
| 315 |
score = 0.5 * p1_score + 0.3 * resolution_score - 0.2 * failure_penalty
|
| 316 |
return max(0.0, min(1.0, score))
|
| 317 |
```
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
| 321 |
```python
|
| 322 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 323 |
if not rewards:
|
| 324 |
return 0.0
|
| 325 |
-
|
| 326 |
p1_seen = list(state.metadata.get("p1_seen", []))
|
| 327 |
p1_resolved = [
|
| 328 |
-
iid
|
|
|
|
| 329 |
if iid in p1_seen and iid not in state.metadata.get("failed_incidents", [])
|
| 330 |
]
|
| 331 |
p1_failed = list(state.metadata.get("failed_incidents", []))
|
| 332 |
-
|
| 333 |
survival_score = len(p1_resolved) / max(len(p1_seen), 1)
|
| 334 |
failure_penalty = len(p1_failed) / max(len(p1_seen), 1) * 0.5
|
| 335 |
-
|
| 336 |
mean_reward = sum(rewards) / len(rewards)
|
| 337 |
score = 0.6 * survival_score + 0.3 * mean_reward - failure_penalty
|
| 338 |
return max(0.0, min(1.0, score))
|
| 339 |
```
|
| 340 |
-
|
| 341 |
---
|
| 342 |
-
|
| 343 |
-
### 3.
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
```markdown
|
| 348 |
-
| `/tasks` | GET | List all available tasks with metadata |
|
| 349 |
-
```
|
| 350 |
-
|
| 351 |
-
---
|
| 352 |
-
|
| 353 |
-
### 3.3 Improve reward signal in `src/rewards.py` — triage scoring uses wrong key format
|
| 354 |
-
|
| 355 |
-
**Problem:** In `_compute_triage()`, the lookup is:
|
| 356 |
-
|
| 357 |
```python
|
| 358 |
-
|
| 359 |
-
|
|
|
|
|
|
|
| 360 |
```
|
| 361 |
-
|
| 362 |
-
But `str(incident.incident_type)` for a `StrEnum` returns `"CARDIAC_ARREST"` (the value), while the metadata stores types like `"IncidentType.CARDIAC_ARREST"` (the `ClassName.MEMBER` form). Because of this mismatch the lookup never finds a match, so triage always returns 0.5 (the neutral value), undermining the reward signal.
|
| 363 |
-
|
| 364 |
-
**Action:** In `src/rewards.py`, change `_compute_triage()` to use the value directly:
|
| 365 |
-
|
| 366 |
-
```python
|
| 367 |
-
def _compute_triage(self, state: State, action: Action) -> float:
|
| 368 |
-
if action.action_type != DispatchAction.DISPATCH:
|
| 369 |
-
return 0.5
|
| 370 |
-
unit = state.units.get(action.unit_id)
|
| 371 |
-
incident = state.incidents.get(action.incident_id)
|
| 372 |
-
if unit is None or incident is None:
|
| 373 |
-
return 0.0
|
| 374 |
-
required_map = state.metadata.get("default_required_units", {})
|
| 375 |
-
# Try both formats: plain value and StrEnum repr
|
| 376 |
-
required_types = (
|
| 377 |
-
required_map.get(incident.incident_type.value, []) or
|
| 378 |
-
required_map.get(str(incident.incident_type), [])
|
| 379 |
-
)
|
| 380 |
-
if not required_types:
|
| 381 |
-
return 0.5
|
| 382 |
-
return 1.0 if unit.unit_type.value in required_types else 0.0
|
| 383 |
-
```
|
| 384 |
-
|
| 385 |
-
Also fix the metadata population in `src/state_machine.py`. In `reset()`, when enriching metadata, convert the `default_required_units` schema data to use plain string values:
|
| 386 |
-
|
| 387 |
-
```python
|
| 388 |
-
# Convert unit type values to plain strings for consistent lookup
|
| 389 |
-
raw_required = schema_dump.get("default_required_units", {})
|
| 390 |
-
converted_required = {
|
| 391 |
-
str(inc_type): [str(u) for u in unit_types]
|
| 392 |
-
for inc_type, unit_types in raw_required.items()
|
| 393 |
-
}
|
| 394 |
-
state.metadata.setdefault("default_required_units", converted_required)
|
| 395 |
-
```
|
| 396 |
-
|
| 397 |
---
|
| 398 |
-
|
| 399 |
-
### 3.
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
```python
|
| 404 |
-
dist = _distance(unit.location_x, unit.location_y, incident.location_x, incident.location_y)
|
| 405 |
-
```
|
| 406 |
-
|
| 407 |
-
Here `_distance` uses `math.hypot` (Euclidean distance), but `move_unit_toward` uses Manhattan movement. Because the Euclidean distance is never longer than the Manhattan distance, the computed ETA says units arrive "early", while the actual Manhattan movement takes longer.
|
| 408 |
-
|
| 409 |
-
**Action:** Replace `_distance` usage in `_apply_dispatch()` to use Manhattan distance:
|
| 410 |
-
|
| 411 |
```python
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
speed = float(self._schema.unit_speeds.get(unit.unit_type, 1.0))
|
| 417 |
-
# Use Manhattan distance to match move_unit_toward physics
|
| 418 |
-
dx = abs(unit.location_x - incident.location_x)
|
| 419 |
-
dy = abs(unit.location_y - incident.location_y)
|
| 420 |
-
manhattan_dist = dx + dy
|
| 421 |
-
eta = manhattan_dist / max(speed, 1e-6)
|
| 422 |
-
|
| 423 |
-
unit.status = UnitStatus.DISPATCHED
|
| 424 |
-
unit.assigned_incident_id = incident.incident_id
|
| 425 |
-
unit.eta_seconds = max(0.0, float(eta))
|
| 426 |
-
|
| 427 |
-
if unit.unit_id not in incident.units_assigned:
|
| 428 |
-
incident.units_assigned.append(unit.unit_id)
|
| 429 |
-
if incident.status == IncidentStatus.PENDING:
|
| 430 |
-
incident.status = IncidentStatus.RESPONDING
|
| 431 |
-
```
|
| 432 |
-
|
| 433 |
-
---
|
| 434 |
-
|
| 435 |
-
## SECTION 4 — README IMPROVEMENTS (required by competition)
|
| 436 |
-
|
| 437 |
-
### 4.1 Add missing required README sections
|
| 438 |
-
|
| 439 |
-
The current README is good but is missing:
|
| 440 |
-
|
| 441 |
-
1. **Baseline scores table with instructions to reproduce** — The README mentions scores but doesn't show how to generate them with a single command.
|
| 442 |
-
2. **Full action space table** — Currently only shows key fields, needs all fields.
|
| 443 |
-
3. **Setup instructions** — Missing explicit `uv sync` + server start commands.
|
| 444 |
-
|
| 445 |
-
**Action:** Add the following sections to `README.md`:
|
| 446 |
-
|
| 447 |
-
After the existing "Quick Start" section, add:
|
| 448 |
-
|
| 449 |
-
```markdown
|
| 450 |
-
## Reproducing Baseline Scores
|
| 451 |
-
|
| 452 |
-
Run the random baseline agent against all 4 tasks:
|
| 453 |
-
|
| 454 |
-
```bash
|
| 455 |
-
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x python inference.py
|
| 456 |
-
```
|
| 457 |
-
|
| 458 |
-
Expected output (approximate):
|
| 459 |
-
|
| 460 |
-
| Task | Difficulty | Random Baseline Score |
|
| 461 |
-
|------|-----------|----------------------|
|
| 462 |
-
| `single_incident` | Easy | ~0.55 |
|
| 463 |
-
| `multi_incident` | Medium | ~0.48 |
|
| 464 |
-
| `mass_casualty` | Hard | ~0.32 |
|
| 465 |
-
| `shift_surge` | Hard | ~0.38 |
|
| 466 |
-
|
| 467 |
-
*Scores vary slightly from seed to seed. Run with `seed=42` for exact reproduction.*
|
| 468 |
-
```
|
| 469 |
-
|
| 470 |
-
Also add an explicit environment variable table near the top:
|
| 471 |
-
|
| 472 |
-
```markdown
|
| 473 |
-
## Environment Variables
|
| 474 |
-
|
| 475 |
-
| Variable | Required | Description |
|
| 476 |
-
|----------|----------|-------------|
|
| 477 |
-
| `API_BASE_URL` | Yes | OpenAI-compatible endpoint base URL |
|
| 478 |
-
| `MODEL_NAME` | Yes | Model identifier string |
|
| 479 |
-
| `HF_TOKEN` | Yes (unless `USE_RANDOM=true`) | API key / HF token |
|
| 480 |
-
| `USE_RANDOM` | No | Set to `true` to use the deterministic random agent (no LLM) |
|
| 481 |
-
```
|
| 482 |
-
|
| 483 |
-
---
|
| 484 |
-
|
| 485 |
-
### 4.2 Add task difficulty descriptions to README
|
| 486 |
-
|
| 487 |
-
Under the "Tasks" section, expand each task to include expected agent behaviors:
|
| 488 |
-
|
| 489 |
-
```markdown
|
| 490 |
-
### Task Difficulty Guide
|
| 491 |
-
|
| 492 |
-
| Task | Difficulty | Key Challenge | Success Criteria |
|
| 493 |
-
|------|-----------|---------------|-----------------|
|
| 494 |
-
| `single_incident` | Easy | Dispatch the right unit type (MEDIC) quickly | Incident resolved, correct unit, ETA < 300s |
|
| 495 |
-
| `multi_incident` | Medium | Triage 3 simultaneous incidents, prioritize P1 | All P1 incidents responded to, no ESCALATED |
|
| 496 |
-
| `mass_casualty` | Hard | Manage wave-based surge with limited resources | Maximize P1 survival rate across waves |
|
| 497 |
-
| `shift_surge` | Hard | Adapt as units go out of service over time | Maintain coverage and resolve incidents despite attrition |
|
| 498 |
```
|
| 499 |
-
|
| 500 |
---
|
| 501 |
-
|
| 502 |
-
## SECTION
|
| 503 |
-
|
| 504 |
-
###
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
```python
|
| 509 |
valid_errors = {"null", "max_steps_exceeded", "illegal_transition", "step_error"}
|
| 510 |
```
|
| 511 |
-
|
| 512 |
---
|
| 513 |
-
|
| 514 |
-
###
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
```python
|
| 519 |
def test_reset_with_empty_body_returns_200(self) -> None:
|
| 520 |
-
"""Verify prevalidation.sh compatible: POST /reset with {} returns 200."""
|
| 521 |
c = TestClient(server_app.app)
|
| 522 |
response = c.post("/reset", json={})
|
| 523 |
assert response.status_code == 200
|
| 524 |
data = response.json()
|
| 525 |
assert data["result"] == "dispatch center online"
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
---
|
| 529 |
-
|
| 530 |
-
### 5.3 Add a test for the `/tasks` endpoint
|
| 531 |
-
|
| 532 |
-
Add to `tests/test_openenv_integration.py`:
|
| 533 |
-
|
| 534 |
-
```python
|
| 535 |
def test_tasks_endpoint_returns_four_tasks(self) -> None:
|
| 536 |
c = TestClient(server_app.app)
|
| 537 |
response = c.get("/tasks")
|
|
@@ -541,97 +303,43 @@ def test_tasks_endpoint_returns_four_tasks(self) -> None:
|
|
| 541 |
task_ids = {t["task_id"] for t in tasks}
|
| 542 |
assert task_ids == {"single_incident", "multi_incident", "mass_casualty", "shift_surge"}
|
| 543 |
```
|
| 544 |
-
|
|
|
|
|
|
|
| 545 |
---
|
| 546 |
-
|
| 547 |
-
## SECTION
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
The `src/server/Dockerfile` is a separate server-only Dockerfile. Ensure it also starts the server properly. Replace its CMD with:
|
| 552 |
-
|
| 553 |
-
```dockerfile
|
| 554 |
-
CMD ["uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 555 |
-
```
|
| 556 |
-
|
| 557 |
-
---
|
| 558 |
-
|
| 559 |
-
### 6.2 Add `.dockerignore` to speed up builds
|
| 560 |
-
|
| 561 |
-
Create `.dockerignore` at the repo root:
|
| 562 |
-
|
| 563 |
-
```
|
| 564 |
-
.git
|
| 565 |
-
.venv
|
| 566 |
-
.uv
|
| 567 |
-
__pycache__
|
| 568 |
-
*.pyc
|
| 569 |
-
*.pyo
|
| 570 |
-
.pytest_cache
|
| 571 |
-
.coverage
|
| 572 |
-
htmlcov
|
| 573 |
-
.sisyphus/evidence/
|
| 574 |
-
*.log
|
| 575 |
-
tmp/
|
| 576 |
-
dashboard.html
|
| 577 |
-
*.png
|
| 578 |
-
*.jpg
|
| 579 |
-
.env
|
| 580 |
-
.env.*
|
| 581 |
-
```
|
| 582 |
-
|
| 583 |
-
---
|
| 584 |
-
|
| 585 |
-
### 6.3 Verify `requirements.txt` is complete
|
| 586 |
-
|
| 587 |
-
The current `requirements.txt` is missing `groq`, which is listed in `pyproject.toml`. Add it:
|
| 588 |
-
|
| 589 |
-
```
|
| 590 |
-
pydantic>=2.7
|
| 591 |
-
openenv-core>=0.2.0
|
| 592 |
-
fastapi>=0.110
|
| 593 |
-
uvicorn[standard]>=0.29
|
| 594 |
-
openai>=1.12
|
| 595 |
-
httpx>=0.27
|
| 596 |
-
matplotlib>=3.8
|
| 597 |
-
numpy>=1.26
|
| 598 |
-
groq>=1.1.2
|
| 599 |
-
```
|
| 600 |
-
|
| 601 |
-
---
|
| 602 |
-
|
| 603 |
-
## SECTION 7 — FINAL VALIDATION CHECKLIST
|
| 604 |
-
|
| 605 |
-
After making all changes, run these commands in order and confirm each passes:
|
| 606 |
-
|
| 607 |
```bash
|
| 608 |
# 1. YAML parse check
|
| 609 |
python -c "import yaml; yaml.safe_load(open('openenv.yaml')); print('YAML OK')"
|
| 610 |
-
|
| 611 |
# 2. Full test suite
|
| 612 |
uv run python -m pytest tests/ -v --tb=short
|
| 613 |
-
|
| 614 |
# 3. Inference script with random agent
|
| 615 |
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x \
|
| 616 |
uv run python inference.py 2>&1 | grep -E '^\[(START|STEP|END)\]' | head -20
|
| 617 |
-
|
| 618 |
# 4. Demo script
|
| 619 |
uv run python demo.py
|
| 620 |
-
|
| 621 |
# 5. OpenEnv validate
|
| 622 |
uv run openenv validate
|
| 623 |
-
|
| 624 |
# 6. Docker build
|
| 625 |
docker build -t citywide-dispatch-supervisor .
|
| 626 |
-
|
| 627 |
# 7. Docker run + health check + empty reset
|
| 628 |
docker run -d -p 8000:8000 --name test-dispatch citywide-dispatch-supervisor
|
| 629 |
sleep 5
|
| 630 |
curl -s http://localhost:8000/health
|
| 631 |
-
curl -s -X POST http://localhost:8000/reset
|
|
|
|
| 632 |
docker stop test-dispatch && docker rm test-dispatch
|
| 633 |
-
|
| 634 |
-
# 8. Benchmark scores
|
| 635 |
uv run python -c "
|
| 636 |
from src.benchmark import run_all
|
| 637 |
scores = run_all()
|
|
@@ -641,32 +349,5 @@ for task_id, score in scores.items():
|
|
| 641 |
print('All scores in [0.0, 1.0] — PASS')
|
| 642 |
"
|
| 643 |
```
|
| 644 |
-
|
| 645 |
-
All 8 checks must pass before submission.
|
| 646 |
-
|
| 647 |
-
---
|
| 648 |
-
|
| 649 |
-
## SECTION 8 — PRIORITY ORDER SUMMARY
|
| 650 |
-
|
| 651 |
-
Work through issues in this exact order:
|
| 652 |
-
|
| 653 |
-
| # | File | Change | Severity |
|
| 654 |
-
|---|------|--------|----------|
|
| 655 |
-
| 1 | `openenv.yaml` | Fix tab → space indentation | CRITICAL |
|
| 656 |
-
| 2 | `src/server/app.py` | Add `if __name__ == "__main__": main()` | CRITICAL |
|
| 657 |
-
| 3 | `src/server/app.py` | Make `task_id` optional in `ResetRequest` | CRITICAL |
|
| 658 |
-
| 4 | `Dockerfile` | Use uvicorn directly in CMD | CRITICAL |
|
| 659 |
-
| 5 | `validate_local.py` | Add `USE_RANDOM=true` in `check_inference` | HIGH |
|
| 660 |
-
| 6 | `pyproject.toml` | Add `asyncio_mode = "auto"` | HIGH |
|
| 661 |
-
| 7 | `inference.py` | Normalize exception error messages | HIGH |
|
| 662 |
-
| 8 | `inference.py` | Fix score computation (exclude reset reward) | HIGH |
|
| 663 |
-
| 9 | `src/server/app.py` | Guard `get_dashboard_state` against None env | MEDIUM |
|
| 664 |
-
| 10 | `src/rewards.py` | Fix triage key format mismatch | MEDIUM |
|
| 665 |
-
| 11 | `src/state_machine.py` | Use Manhattan distance for ETA | MEDIUM |
|
| 666 |
-
| 12 | `src/tasks/*.py` | Improve grader logic | MEDIUM |
|
| 667 |
-
| 13 | `tests/test_openenv_integration.py` | Add empty-body reset test | MEDIUM |
|
| 668 |
-
| 14 | `tests/test_openenv_integration.py` | Add /tasks endpoint test | LOW |
|
| 669 |
-
| 15 | `tests/test_inference.py` | Add `step_error` to valid errors set | LOW |
|
| 670 |
-
| 16 | `requirements.txt` | Add `groq>=1.1.2` | LOW |
|
| 671 |
-
| 17 | `.dockerignore` | Create file | LOW |
|
| 672 |
-
| 18 | `README.md` | Add baseline scores table + env var table + difficulty guide | LOW |
|
|
|
|
| 1 |
+
# 911 Dispatch Supervisor — Fix & Polish for OpenEnv Submission
|
| 2 |
+
|
| 3 |
+
You are working on the repo at the current directory. Apply ALL fixes below in order.
|
| 4 |
+
Do not skip any item. After all fixes, run the final validation checklist.
|
| 5 |
+
|
|
|
|
| 6 |
---
|
| 7 |
+
|
| 8 |
+
## SECTION 1 — CRITICAL BUGS (fix these first)
|
| 9 |
+
|
| 10 |
+
### 1.1 Fix `openenv.yaml` — Replace entire file content
|
| 11 |
+
|
| 12 |
+
The file uses hard tab characters for indentation, which break YAML parsing (YAML does not allow tabs as indentation). Replace the entire file with:
|
|
|
|
|
|
|
|
|
|
| 13 |
```yaml
|
| 14 |
name: citywide-dispatch-supervisor
|
| 15 |
version: "0.1.0"
|
|
|
|
| 32 |
name: Shift Surge
|
| 33 |
description: Incident waves combined with units going out of service; maintain coverage over time.
|
| 34 |
```
|
| 35 |
+
|
| 36 |
+
Verify with: `python -c "import yaml; yaml.safe_load(open('openenv.yaml')); print('YAML OK')"`
|
| 37 |
+
|
| 38 |
---
|
| 39 |
+
|
| 40 |
+
### 1.2 Fix `src/server/app.py` — Server never starts
|
| 41 |
+
|
| 42 |
+
Add these two lines at the very bottom of `src/server/app.py`, after the `def main()` block:
|
|
|
|
|
|
|
|
|
|
| 43 |
```python
|
| 44 |
if __name__ == "__main__":
|
| 45 |
main()
|
| 46 |
```
|
| 47 |
+
|
| 48 |
+
Also update the `main()` function to:
|
|
|
|
| 49 |
```python
|
| 50 |
def main():
|
| 51 |
import uvicorn
|
| 52 |
uvicorn.run("src.server.app:app", host="0.0.0.0", port=8000, reload=False)
|
| 53 |
```
|
| 54 |
+
|
|
|
|
|
|
|
| 55 |
---
|
| 56 |
+
|
| 57 |
+
### 1.3 Fix `src/server/app.py` — `/reset` rejects empty body
|
| 58 |
+
|
| 59 |
+
Change `ResetRequest` so `task_id` has a default:
|
|
|
|
|
|
|
|
|
|
| 60 |
```python
|
| 61 |
class ResetRequest(BaseModel):
|
| 62 |
task_id: str = "single_incident"
|
| 63 |
seed: int | None = None
|
| 64 |
```
|
| 65 |
+
|
|
|
|
|
|
|
| 66 |
---
|
| 67 |
+
|
| 68 |
+
### 1.4 Fix `Dockerfile` — Use uvicorn directly in CMD
|
| 69 |
+
|
| 70 |
+
Replace the CMD line in the root `Dockerfile` with:
|
|
|
|
|
|
|
|
|
|
| 71 |
```dockerfile
|
| 72 |
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 73 |
```
|
| 74 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
---
|
| 76 |
+
|
| 77 |
+
## SECTION 2 — HIGH PRIORITY BUGS
|
| 78 |
+
|
| 79 |
### 2.1 Fix `validate_local.py` — `check_inference()` never uses random mode
|
| 80 |
+
|
| 81 |
+
In `validate_local.py`, inside `check_inference()`, add `env["USE_RANDOM"] = "true"` before the `subprocess.run` call:
|
|
|
|
|
|
|
|
|
|
| 82 |
```python
|
| 83 |
+
env["USE_RANDOM"] = "true"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
```
|
| 85 |
+
|
| 86 |
+
Also increase the timeout to 300 seconds if not already set.
|
| 87 |
+
|
| 88 |
---
|
| 89 |
+
|
| 90 |
+
### 2.2 Fix `pyproject.toml` — Add `asyncio_mode`
|
| 91 |
+
|
| 92 |
+
In `[tool.pytest.ini_options]`, add:
|
|
|
|
|
|
|
|
|
|
| 93 |
```toml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
asyncio_mode = "auto"
|
| 95 |
```
|
| 96 |
+
|
| 97 |
---
|
| 98 |
+
|
| 99 |
+
### 2.3 Fix `inference.py` — Normalize exception error token
|
| 100 |
+
|
| 101 |
+
In `inference.py`, inside the inner `except Exception as e` block within the step loop, change the error string:
|
|
|
|
|
|
|
|
|
|
| 102 |
```python
|
| 103 |
except Exception as e:
|
| 104 |
+
error_msg = "step_error"
|
| 105 |
print(
|
| 106 |
f"[STEP] step={step_count} action={action_str} "
|
| 107 |
f"reward=0.00 done=true error={error_msg}"
|
|
|
|
| 109 |
success = False
|
| 110 |
break
|
| 111 |
```
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
### 2.4 Fix `inference.py` — Score computation excludes reset reward
|
| 116 |
+
|
| 117 |
+
Change score computation to exclude the initial reset observation score:
|
| 118 |
```python
|
| 119 |
+
step_rewards = rewards[1:]
|
| 120 |
+
if step_rewards:
|
| 121 |
+
total_score = sum(step_rewards) / len(step_rewards)
|
| 122 |
+
else:
|
| 123 |
+
total_score = 0.0
|
| 124 |
+
total_score = max(0.0, min(1.0, total_score))
|
| 125 |
+
|
| 126 |
+
rewards_str = ",".join(f"{r:.2f}" for r in step_rewards) if step_rewards else "0.00"
|
| 127 |
```
|
| 128 |
+
|
| 129 |
---
|
| 130 |
+
|
| 131 |
+
### 2.5 Fix `src/server/app.py` — Guard `get_dashboard_state` against None env
|
| 132 |
+
|
| 133 |
+
The `/dashboard/state` endpoint should return a safe empty structure before `/reset` is called. It already does this in the current code — verify it matches:
|
|
|
|
|
|
|
|
|
|
| 134 |
```python
|
| 135 |
@app.get("/dashboard/state")
|
| 136 |
async def get_dashboard_state() -> dict[str, Any]:
|
| 137 |
if _env is None:
|
|
|
|
| 138 |
return {
|
| 139 |
"units": {},
|
| 140 |
"incidents": {},
|
|
|
|
| 147 |
"issues": [],
|
| 148 |
"observation": None,
|
| 149 |
}
|
| 150 |
+
# ... rest unchanged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
```
|
| 152 |
+
|
| 153 |
---
|
| 154 |
+
|
| 155 |
+
## SECTION 3 — ENVIRONMENT DESIGN IMPROVEMENTS
|
| 156 |
+
|
| 157 |
+
### 3.1 Improve `src/tasks/single_incident.py` grader
|
| 158 |
+
|
| 159 |
+
Replace `SingleIncidentGrader.grade()` with:
|
|
|
|
|
|
|
|
|
|
| 160 |
```python
|
| 161 |
def grade(self, state: State, rewards: list[float]) -> float:
|
|
|
|
| 162 |
if not rewards:
|
| 163 |
return 0.0
|
| 164 |
+
|
| 165 |
incident = state.incidents.get("INC-001")
|
| 166 |
if incident is None:
|
| 167 |
return 0.0
|
| 168 |
+
|
| 169 |
score = 0.0
|
| 170 |
+
|
|
|
|
| 171 |
if incident.status.value == "RESOLVED":
|
| 172 |
score += 0.50
|
| 173 |
+
|
|
|
|
| 174 |
medic_dispatched = any(
|
| 175 |
+
u.unit_type.value == "MEDIC"
|
| 176 |
+
and (
|
| 177 |
+
u.assigned_incident_id == "INC-001"
|
| 178 |
+
or u.status.value in {"ON_SCENE", "DISPATCHED"}
|
| 179 |
)
|
| 180 |
for u in state.units.values()
|
| 181 |
)
|
| 182 |
if medic_dispatched:
|
| 183 |
score += 0.30
|
| 184 |
+
|
|
|
|
| 185 |
if incident.status.value == "RESOLVED" and state.step_count <= 10:
|
| 186 |
score += 0.20
|
| 187 |
+
|
| 188 |
return max(0.0, min(1.0, score))
|
| 189 |
```
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
### 3.2 Improve `src/tasks/multi_incident.py` grader
|
| 194 |
+
|
| 195 |
+
Replace `MultiIncidentGrader.grade()` with:
|
| 196 |
```python
|
| 197 |
def grade(self, state: State, rewards: list[float]) -> float:
|
|
|
|
| 198 |
if not rewards:
|
| 199 |
return 0.0
|
| 200 |
+
|
| 201 |
total = len(state.incidents)
|
| 202 |
if total == 0:
|
| 203 |
return 0.0
|
| 204 |
+
|
| 205 |
+
resolved = sum(1 for i in state.incidents.values() if i.status.value == "RESOLVED")
|
| 206 |
+
failed = sum(1 for i in state.incidents.values() if i.status.value == "ESCALATED")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
p1_total = sum(1 for i in state.incidents.values() if i.severity.value == "PRIORITY_1")
|
| 208 |
p1_resolved = sum(
|
| 209 |
+
1
|
| 210 |
+
for iid in state.metadata.get("resolved_incidents", [])
|
| 211 |
+
if state.incidents.get(iid)
|
| 212 |
+
and state.incidents[iid].severity.value == "PRIORITY_1"
|
| 213 |
)
|
| 214 |
+
|
| 215 |
resolution_score = resolved / total
|
| 216 |
p1_score = (p1_resolved / p1_total) if p1_total > 0 else 1.0
|
| 217 |
failure_penalty = failed / total
|
| 218 |
+
|
| 219 |
score = 0.5 * p1_score + 0.3 * resolution_score - 0.2 * failure_penalty
|
| 220 |
return max(0.0, min(1.0, score))
|
| 221 |
```
|
| 222 |
+
|
| 223 |
+
---
|
| 224 |
+
|
| 225 |
+
### 3.3 Improve `src/tasks/mass_casualty.py` grader
|
| 226 |
+
|
| 227 |
+
Replace `MassCasualtyGrader.grade()` with:
|
| 228 |
```python
|
| 229 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 230 |
if not rewards:
|
| 231 |
return 0.0
|
| 232 |
+
|
| 233 |
p1_seen = list(state.metadata.get("p1_seen", []))
|
| 234 |
p1_resolved = [
|
| 235 |
+
iid
|
| 236 |
+
for iid in state.metadata.get("resolved_incidents", [])
|
| 237 |
if iid in p1_seen and iid not in state.metadata.get("failed_incidents", [])
|
| 238 |
]
|
| 239 |
p1_failed = list(state.metadata.get("failed_incidents", []))
|
| 240 |
+
|
| 241 |
survival_score = len(p1_resolved) / max(len(p1_seen), 1)
|
| 242 |
failure_penalty = len(p1_failed) / max(len(p1_seen), 1) * 0.5
|
| 243 |
+
|
| 244 |
mean_reward = sum(rewards) / len(rewards)
|
| 245 |
score = 0.6 * survival_score + 0.3 * mean_reward - failure_penalty
|
| 246 |
return max(0.0, min(1.0, score))
|
| 247 |
```
|
| 248 |
+
|
| 249 |
---
|
| 250 |
+
|
| 251 |
+
### 3.4 Fix `src/rewards.py` — Triage key format mismatch
|
| 252 |
+
|
| 253 |
+
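In `_compute_triage()`, the metadata lookup of required unit types uses inconsistent key formats (the enum's `.value` versus its `str()` form). Ensure it tries both, checking the `.value` key first and falling back to the `str()` key: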
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
```python
|
| 255 |
+
required_types = (
|
| 256 |
+
required_map.get(incident.incident_type.value, [])
|
| 257 |
+
or required_map.get(str(incident.incident_type), [])
|
| 258 |
+
)
|
| 259 |
```
|
| 260 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
---
|
| 262 |
+
|
| 263 |
+
### 3.5 Fix `src/state_machine.py` — Use Manhattan distance for ETA
|
| 264 |
+
|
| 265 |
+
In `_apply_dispatch()`, compute the ETA from the Manhattan distance (`|dx| + |dy|`) instead of the Euclidean distance:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
```python
|
| 267 |
+
dx = abs(unit.location_x - incident.location_x)
|
| 268 |
+
dy = abs(unit.location_y - incident.location_y)
|
| 269 |
+
manhattan_dist = dx + dy
|
| 270 |
+
eta = manhattan_dist / max(speed, 1e-6)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
```
|
| 272 |
+
|
| 273 |
---
|
| 274 |
+
|
| 275 |
+
## SECTION 4 — TEST FIXES
|
| 276 |
+
|
| 277 |
+
### 4.1 Update `tests/test_inference.py` — Add `step_error` to valid error tokens
|
| 278 |
+
|
| 279 |
+
Find `valid_errors` in `test_step_line_error_format` and add `"step_error"`:
|
|
|
|
| 280 |
```python
|
| 281 |
valid_errors = {"null", "max_steps_exceeded", "illegal_transition", "step_error"}
|
| 282 |
```
|
| 283 |
+
|
| 284 |
---
|
| 285 |
+
|
| 286 |
+
### 4.2 Verify `tests/test_openenv_integration.py` has these two tests
|
| 287 |
+
|
| 288 |
+
Confirm that the following two tests exist (they appear to be present in the file already):
|
|
|
|
| 289 |
```python
|
| 290 |
def test_reset_with_empty_body_returns_200(self) -> None:
|
|
|
|
| 291 |
c = TestClient(server_app.app)
|
| 292 |
response = c.post("/reset", json={})
|
| 293 |
assert response.status_code == 200
|
| 294 |
data = response.json()
|
| 295 |
assert data["result"] == "dispatch center online"
|
| 296 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
def test_tasks_endpoint_returns_four_tasks(self) -> None:
|
| 298 |
c = TestClient(server_app.app)
|
| 299 |
response = c.get("/tasks")
|
|
|
|
| 303 |
task_ids = {t["task_id"] for t in tasks}
|
| 304 |
assert task_ids == {"single_incident", "multi_incident", "mass_casualty", "shift_surge"}
|
| 305 |
```
|
| 306 |
+
|
| 307 |
+
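If either test is missing, add `test_reset_with_empty_body_returns_200` to the `TestResetEndpoint` class and `test_tasks_endpoint_returns_four_tasks` to the `TestTasksEndpoint` class.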
|
| 308 |
+
|
| 309 |
---
|
| 310 |
+
|
| 311 |
+
## SECTION 5 — FINAL VALIDATION CHECKLIST
|
| 312 |
+
|
| 313 |
+
Run these commands in order and confirm each passes:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
```bash
|
| 315 |
# 1. YAML parse check
|
| 316 |
python -c "import yaml; yaml.safe_load(open('openenv.yaml')); print('YAML OK')"
|
| 317 |
+
|
| 318 |
# 2. Full test suite
|
| 319 |
uv run python -m pytest tests/ -v --tb=short
|
| 320 |
+
|
| 321 |
# 3. Inference script with random agent
|
| 322 |
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x \
|
| 323 |
uv run python inference.py 2>&1 | grep -E '^\[(START|STEP|END)\]' | head -20
|
| 324 |
+
|
| 325 |
# 4. Demo script
|
| 326 |
uv run python demo.py
|
| 327 |
+
|
| 328 |
# 5. OpenEnv validate
|
| 329 |
uv run openenv validate
|
| 330 |
+
|
| 331 |
# 6. Docker build
|
| 332 |
docker build -t citywide-dispatch-supervisor .
|
| 333 |
+
|
| 334 |
# 7. Docker run + health check + empty reset
|
| 335 |
docker run -d -p 8000:8000 --name test-dispatch citywide-dispatch-supervisor
|
| 336 |
sleep 5
|
| 337 |
curl -s http://localhost:8000/health
|
| 338 |
+
curl -s -X POST http://localhost:8000/reset \
|
| 339 |
+
-H "Content-Type: application/json" -d '{}'
|
| 340 |
docker stop test-dispatch && docker rm test-dispatch
|
| 341 |
+
|
| 342 |
+
# 8. Benchmark scores all in [0.0, 1.0]
|
| 343 |
uv run python -c "
|
| 344 |
from src.benchmark import run_all
|
| 345 |
scores = run_all()
|
|
|
|
| 349 |
print('All scores in [0.0, 1.0] — PASS')
|
| 350 |
"
|
| 351 |
```
|
| 352 |
+
|
| 353 |
+
All 8 checks must pass before the submission is ready.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
|
@@ -19,6 +19,9 @@ dependencies = [
|
|
| 19 |
"pyyaml>=6.0.1",
|
| 20 |
]
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
[project.optional-dependencies]
|
| 23 |
dev = [
|
| 24 |
"pytest>=8.0",
|
|
|
|
| 19 |
"pyyaml>=6.0.1",
|
| 20 |
]
|
| 21 |
|
| 22 |
+
[project.scripts]
|
| 23 |
+
server = "server.app:main"
|
| 24 |
+
|
| 25 |
[project.optional-dependencies]
|
| 26 |
dev = [
|
| 27 |
"pytest>=8.0",
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv server shim for validators.
|
| 2 |
+
|
| 3 |
+
The canonical FastAPI app lives in src.server.app.
|
| 4 |
+
This module exists to satisfy OpenEnv's expected multi-mode layout.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from src.server.app import app
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main() -> None:
|
| 11 |
+
"""Run the OpenEnv FastAPI server."""
|
| 12 |
+
from src.server.app import main as _main
|
| 13 |
+
|
| 14 |
+
_main()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
if __name__ == "__main__":
|
| 18 |
+
main()
|
tests/test_inference.py
CHANGED
|
@@ -77,6 +77,13 @@ class TestEnvVarValidation:
|
|
| 77 |
cmd = [sys.executable, "inference.py"]
|
| 78 |
merged_env = os.environ.copy()
|
| 79 |
merged_env.update(env)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
result = subprocess.run(
|
| 81 |
cmd,
|
| 82 |
capture_output=True,
|
|
|
|
| 77 |
cmd = [sys.executable, "inference.py"]
|
| 78 |
merged_env = os.environ.copy()
|
| 79 |
merged_env.update(env)
|
| 80 |
+
|
| 81 |
+
# Ensure tests are not affected by host environment variables.
|
| 82 |
+
# If the test doesn't provide a required var, explicitly remove it.
|
| 83 |
+
if "API_BASE_URL" not in env:
|
| 84 |
+
merged_env.pop("API_BASE_URL", None)
|
| 85 |
+
if "MODEL_NAME" not in env:
|
| 86 |
+
merged_env.pop("MODEL_NAME", None)
|
| 87 |
result = subprocess.run(
|
| 88 |
cmd,
|
| 89 |
capture_output=True,
|