Spaces:
Sleeping
Sleeping
Commit ·
14170d7
1
Parent(s): e259b96
Apply changes.md compliance fixes
Browse files- .dockerignore +17 -0
- Dockerfile +1 -1
- README.md +43 -23
- changes.md +672 -0
- inference.py +9 -5
- openenv.yaml +15 -15
- pyproject.toml +2 -0
- requirements.txt +2 -0
- src/rewards.py +6 -2
- src/server/Dockerfile +1 -1
- src/server/app.py +19 -3
- src/state_machine.py +14 -10
- src/tasks/mass_casualty.py +13 -10
- src/tasks/multi_incident.py +18 -11
- src/tasks/single_incident.py +24 -23
- tests/test_inference.py +1 -1
- tests/test_openenv_integration.py +24 -0
- validate_local.py +3 -4
.dockerignore
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.venv
|
| 3 |
+
.uv
|
| 4 |
+
__pycache__
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
.pytest_cache
|
| 8 |
+
.coverage
|
| 9 |
+
htmlcov
|
| 10 |
+
.sisyphus/evidence/
|
| 11 |
+
*.log
|
| 12 |
+
tmp/
|
| 13 |
+
dashboard.html
|
| 14 |
+
*.png
|
| 15 |
+
*.jpg
|
| 16 |
+
.env
|
| 17 |
+
.env.*
|
Dockerfile
CHANGED
|
@@ -5,4 +5,4 @@ WORKDIR /app
|
|
| 5 |
COPY . /app
|
| 6 |
RUN pip install uv && uv sync --frozen
|
| 7 |
EXPOSE 8000
|
| 8 |
-
CMD ["uv", "run", "python", "-m", "src.server.app"]
|
|
|
|
| 5 |
COPY . /app
|
| 6 |
RUN pip install uv && uv sync --frozen
|
| 7 |
EXPOSE 8000
|
| 8 |
+
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -6,10 +6,11 @@ colorTo: orange
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
tags:
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
---
|
| 14 |
|
| 15 |
# 911 City-Wide Emergency Dispatch Supervisor
|
|
@@ -28,6 +29,15 @@ This project implements a benchmark environment for training and evaluating LLM
|
|
| 28 |
- **OpenEnv compatible**: Standard RL environment interface
|
| 29 |
- **Read-only 2D visualization**: Synchronized unit/incident visualization
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
## Tasks
|
| 32 |
|
| 33 |
### 1. `single_incident`
|
|
@@ -46,6 +56,15 @@ High severity surge (Priority-1 heavy). Focus: survival outcomes and rapid alloc
|
|
| 46 |
|
| 47 |
Longer horizon with incident waves and unit availability changes. Focus: coverage and strategic staging.
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
## Contracts
|
| 50 |
|
| 51 |
### Action
|
|
@@ -120,6 +139,25 @@ python demo.py
|
|
| 120 |
python inference.py
|
| 121 |
```
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
## Reward Function
|
| 124 |
|
| 125 |
The reward signal is a weighted combination of five components:
|
|
@@ -134,18 +172,7 @@ The reward signal is a weighted combination of five components:
|
|
| 134 |
|
| 135 |
**Safety gate:** If any Priority-1 incident was seen and `survival=0.0`, the total episode score is capped at `0.2` regardless of other components.
|
| 136 |
|
| 137 |
-
## Baseline Scores
|
| 138 |
-
|
| 139 |
-
Scores from the random baseline agent (`USE_RANDOM=true`):
|
| 140 |
-
|
| 141 |
-
| Task | Difficulty | Baseline Score |
|
| 142 |
-
|------|-----------|---------------|
|
| 143 |
-
| `single_incident` | Easy | ~0.55 |
|
| 144 |
-
| `multi_incident` | Medium | ~0.48 |
|
| 145 |
-
| `mass_casualty` | Hard | ~0.32 |
|
| 146 |
-
| `shift_surge` | Hard | ~0.38 |
|
| 147 |
|
| 148 |
-
*Run `USE_RANDOM=true python inference.py` to reproduce.*
|
| 149 |
|
| 150 |
## Project Structure
|
| 151 |
|
|
@@ -203,14 +230,6 @@ curl http://localhost:8000/health
|
|
| 203 |
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{"task_id": "single_incident", "seed": 42}'
|
| 204 |
```
|
| 205 |
|
| 206 |
-
### Environment Variables
|
| 207 |
-
|
| 208 |
-
| Variable | Description | Default |
|
| 209 |
-
|----------|-------------|---------|
|
| 210 |
-
| `API_BASE_URL` | OpenAI API base URL | `https://api.openai.com/v1` |
|
| 211 |
-
| `MODEL_NAME` | Model to use | `gpt-4` |
|
| 212 |
-
| `HF_TOKEN` | HuggingFace token | None |
|
| 213 |
-
|
| 214 |
## API Endpoints
|
| 215 |
|
| 216 |
| Endpoint | Method | Description |
|
|
@@ -220,6 +239,7 @@ curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d
|
|
| 220 |
| `/step` | POST | Execute an action |
|
| 221 |
| `/state` | GET | Get current environment state |
|
| 222 |
| `/dashboard/state` | GET | Extended state for `live_dashboard.html` |
|
|
|
|
| 223 |
|
| 224 |
## HF Space
|
| 225 |
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
tags:
|
| 9 |
+
|
| 10 |
+
- openenv
|
| 11 |
+
- reinforcement-learning
|
| 12 |
+
- llm-agent
|
| 13 |
+
- emergency-dispatch
|
| 14 |
---
|
| 15 |
|
| 16 |
# 911 City-Wide Emergency Dispatch Supervisor
|
|
|
|
| 29 |
- **OpenEnv compatible**: Standard RL environment interface
|
| 30 |
- **Read-only 2D visualization**: Synchronized unit/incident visualization
|
| 31 |
|
| 32 |
+
## Environment Variables
|
| 33 |
+
|
| 34 |
+
| Variable | Required | Description |
|
| 35 |
+
|----------|----------|-------------|
|
| 36 |
+
| `API_BASE_URL` | Yes | OpenAI-compatible endpoint base URL |
|
| 37 |
+
| `MODEL_NAME` | Yes | Model identifier string |
|
| 38 |
+
| `HF_TOKEN` | Yes (unless `USE_RANDOM=true`) | API key / HF token |
|
| 39 |
+
| `USE_RANDOM` | No | Set to `true` to use the deterministic random agent (no LLM) |
|
| 40 |
+
|
| 41 |
## Tasks
|
| 42 |
|
| 43 |
### 1. `single_incident`
|
|
|
|
| 56 |
|
| 57 |
Longer horizon with incident waves and unit availability changes. Focus: coverage and strategic staging.
|
| 58 |
|
| 59 |
+
### Task Difficulty Guide
|
| 60 |
+
|
| 61 |
+
| Task | Difficulty | Key Challenge | Success Criteria |
|
| 62 |
+
|------|-----------|---------------|-----------------|
|
| 63 |
+
| `single_incident` | Easy | Dispatch the right unit type (MEDIC) quickly | Incident resolved, correct unit, ETA < 300s |
|
| 64 |
+
| `multi_incident` | Medium | Triage 3 simultaneous incidents, prioritize P1 | All P1 incidents responded to, no ESCALATED |
|
| 65 |
+
| `mass_casualty` | Hard | Manage wave-based surge with limited resources | Maximize P1 survival rate across waves |
|
| 66 |
+
| `shift_surge` | Hard | Adapt as units go out of service over time | Maintain coverage and resolve incidents despite attrition |
|
| 67 |
+
|
| 68 |
## Contracts
|
| 69 |
|
| 70 |
### Action
|
|
|
|
| 139 |
python inference.py
|
| 140 |
```
|
| 141 |
|
| 142 |
+
## Reproducing Baseline Scores
|
| 143 |
+
|
| 144 |
+
Run the random baseline agent against all 4 tasks:
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x python inference.py
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
Expected output (approximate):
|
| 151 |
+
|
| 152 |
+
| Task | Difficulty | Random Baseline Score |
|
| 153 |
+
|------|-----------|----------------------|
|
| 154 |
+
| `single_incident` | Easy | ~0.55 |
|
| 155 |
+
| `multi_incident` | Medium | ~0.48 |
|
| 156 |
+
| `mass_casualty` | Hard | ~0.32 |
|
| 157 |
+
| `shift_surge` | Hard | ~0.38 |
|
| 158 |
+
|
| 159 |
+
*Scores vary slightly with the random seed. Run with `seed=42` for exact reproduction.*
|
| 160 |
+
|
| 161 |
## Reward Function
|
| 162 |
|
| 163 |
The reward signal is a weighted combination of five components:
|
|
|
|
| 172 |
|
| 173 |
**Safety gate:** If any Priority-1 incident was seen and `survival=0.0`, the total episode score is capped at `0.2` regardless of other components.
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
|
|
|
| 176 |
|
| 177 |
## Project Structure
|
| 178 |
|
|
|
|
| 230 |
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{"task_id": "single_incident", "seed": 42}'
|
| 231 |
```
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
## API Endpoints
|
| 234 |
|
| 235 |
| Endpoint | Method | Description |
|
|
|
|
| 239 |
| `/step` | POST | Execute an action |
|
| 240 |
| `/state` | GET | Get current environment state |
|
| 241 |
| `/dashboard/state` | GET | Extended state for `live_dashboard.html` |
|
| 242 |
+
| `/tasks` | GET | List all available tasks with metadata |
|
| 243 |
|
| 244 |
## HF Space
|
| 245 |
|
changes.md
ADDED
|
@@ -0,0 +1,672 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copilot Agent Instructions: 911 Dispatch Supervisor RL Environment — Fix & Polish
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
These are ordered, actionable instructions to bring the `citywide-dispatch-supervisor` repository fully into compliance with the OpenEnv hackathon requirements and fix every discovered bug. Work through each section in order. **Do not skip any item.**
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## SECTION 1 — CRITICAL BUGS (will cause validation failure)
|
| 10 |
+
|
| 11 |
+
### 1.1 Fix `openenv.yaml` — Tab indentation is invalid YAML
|
| 12 |
+
|
| 13 |
+
**Problem:** The file uses hard `\t` tab characters for indentation. YAML forbids tabs; `openenv validate` will crash with a parse error.
|
| 14 |
+
|
| 15 |
+
**Action:** Rewrite `openenv.yaml` using 2-space indentation throughout. Use exactly this content:
|
| 16 |
+
|
| 17 |
+
```yaml
|
| 18 |
+
name: citywide-dispatch-supervisor
|
| 19 |
+
version: "0.1.0"
|
| 20 |
+
description: >
|
| 21 |
+
City-wide 911 emergency dispatch supervisor RL environment.
|
| 22 |
+
An LLM agent learns to manage simultaneous incidents by dispatching
|
| 23 |
+
police, fire, and EMS units across a city grid under realistic constraints.
|
| 24 |
+
entrypoint: src.openenv_environment:OpenEnvEnvironment
|
| 25 |
+
tasks:
|
| 26 |
+
- id: single_incident
|
| 27 |
+
name: Single Incident Response
|
| 28 |
+
description: One incident with a small unit pool; learn basic dispatch, correct unit type, and response time.
|
| 29 |
+
- id: multi_incident
|
| 30 |
+
name: Simultaneous Multi-Incident
|
| 31 |
+
description: Multiple concurrent incidents requiring triage, prioritization, and correct unit matching.
|
| 32 |
+
- id: mass_casualty
|
| 33 |
+
name: Mass Casualty Event
|
| 34 |
+
description: Wave-based Priority-1 surge with resource conflict; maximize survival outcomes.
|
| 35 |
+
- id: shift_surge
|
| 36 |
+
name: Shift Surge
|
| 37 |
+
description: Incident waves combined with units going out of service; maintain coverage over time.
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
Verify with: `python -c "import yaml; yaml.safe_load(open('openenv.yaml'))"` — must not raise any error.
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
### 1.2 Fix `src/server/app.py` — Server never starts inside Docker
|
| 45 |
+
|
| 46 |
+
**Problem:** The file defines `def main()` but never calls it. Running `python -m src.server.app` executes the module top-level code (which only defines routes) but never invokes `uvicorn.run`. The Docker container starts but immediately exits or hangs silently without binding to port 8000.
|
| 47 |
+
|
| 48 |
+
**Action:** Add the following two lines at the very bottom of `src/server/app.py`, after the `def main()` block:
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
if __name__ == "__main__":
|
| 52 |
+
main()
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Also update the `main()` function to be more robust:
|
| 56 |
+
|
| 57 |
+
```python
|
| 58 |
+
def main():
|
| 59 |
+
import uvicorn
|
| 60 |
+
uvicorn.run("src.server.app:app", host="0.0.0.0", port=8000, reload=False)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**Verify:** `docker build -t citywide-dispatch-supervisor . && docker run -p 8000:8000 citywide-dispatch-supervisor` must hold open and `curl http://localhost:8000/health` must return `{"status":"ok"}`.
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
### 1.3 Fix `src/server/app.py` ResetRequest — `/reset` rejects empty body
|
| 68 |
+
|
| 69 |
+
**Problem:** The prevalidation script calls `POST /reset` with an empty JSON body `{}`. The current `ResetRequest` model has `task_id: str` as a required field with no default. This produces HTTP 422 Unprocessable Entity, causing the prevalidation check to fail at Step 1.
|
| 70 |
+
|
| 71 |
+
**Action:** In `src/server/app.py`, change `ResetRequest` to give `task_id` a sensible default:
|
| 72 |
+
|
| 73 |
+
```python
|
| 74 |
+
class ResetRequest(BaseModel):
|
| 75 |
+
task_id: str = "single_incident"
|
| 76 |
+
seed: int | None = None
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
**Verify:** `curl -s -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'` must return HTTP 200 with a valid observation JSON.
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
### 1.4 Fix `Dockerfile` — Invoke uvicorn directly with the module string for reliable startup and port binding
|
| 84 |
+
|
| 85 |
+
**Problem:** `CMD ["uv", "run", "python", "-m", "src.server.app"]` relies on `__main__` execution. Once the `if __name__ == "__main__"` guard from 1.2 is added, this will work — but it is more reliable and production-correct to invoke uvicorn directly as the CMD.
|
| 86 |
+
|
| 87 |
+
**Action:** Replace the `CMD` in the root `Dockerfile` with:
|
| 88 |
+
|
| 89 |
+
```dockerfile
|
| 90 |
+
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
The full updated `Dockerfile` should be:
|
| 94 |
+
|
| 95 |
+
```dockerfile
|
| 96 |
+
FROM python:3.11-slim
|
| 97 |
+
LABEL org.opencontainers.image.title="911 City-Wide Emergency Dispatch Supervisor"
|
| 98 |
+
LABEL org.opencontainers.image.description="City-wide 911 dispatch supervisor RL environment"
|
| 99 |
+
WORKDIR /app
|
| 100 |
+
COPY . /app
|
| 101 |
+
RUN pip install uv && uv sync --frozen
|
| 102 |
+
EXPOSE 8000
|
| 103 |
+
CMD ["uv", "run", "uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
---
|
| 107 |
+
|
| 108 |
+
## SECTION 2 — HIGH PRIORITY BUGS (cause test failures or incorrect behavior)
|
| 109 |
+
|
| 110 |
+
### 2.1 Fix `validate_local.py` — `check_inference()` never uses random mode
|
| 111 |
+
|
| 112 |
+
**Problem:** `check_inference()` sets real-looking credentials but does NOT set `USE_RANDOM=true`. The inference script will attempt a live API call with the dummy token and fail with an authentication error, making the local validation always report `FAILED: inference`.
|
| 113 |
+
|
| 114 |
+
**Action:** In `validate_local.py`, inside `check_inference()`, add `env["USE_RANDOM"] = "true"` before the `subprocess.run` call:
|
| 115 |
+
|
| 116 |
+
```python
|
| 117 |
+
def check_inference() -> bool:
|
| 118 |
+
import os
|
| 119 |
+
|
| 120 |
+
env = os.environ.copy()
|
| 121 |
+
env["API_BASE_URL"] = "https://api.openai.com/v1"
|
| 122 |
+
env["MODEL_NAME"] = "gpt-4"
|
| 123 |
+
env["HF_TOKEN"] = "dummy-token-for-local-validation"
|
| 124 |
+
env["USE_RANDOM"] = "true" # <-- ADD THIS LINE
|
| 125 |
+
|
| 126 |
+
print("\nNOTE: Running inference.py in random-agent mode for local validation")
|
| 127 |
+
result = subprocess.run(
|
| 128 |
+
["uv", "run", "python", "inference.py"],
|
| 129 |
+
capture_output=True,
|
| 130 |
+
text=True,
|
| 131 |
+
env=env,
|
| 132 |
+
timeout=300, # also increase timeout; 4 tasks can take time
|
| 133 |
+
)
|
| 134 |
+
# ... rest of function unchanged
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
---
|
| 138 |
+
|
| 139 |
+
### 2.2 Fix `pyproject.toml` — Missing `asyncio_mode` for `pytest-asyncio`
|
| 140 |
+
|
| 141 |
+
**Problem:** The test suite uses `asyncio.run()` inline rather than `@pytest.mark.asyncio` decorators. With `pytest-asyncio >= 0.21`, the default mode is `strict`, which requires explicit markers. This can cause silent test collection warnings or failures.
|
| 142 |
+
|
| 143 |
+
**Action:** Add the following to the `[tool.pytest.ini_options]` section in `pyproject.toml`:
|
| 144 |
+
|
| 145 |
+
```toml
|
| 146 |
+
[tool.pytest.ini_options]
|
| 147 |
+
testpaths = ["tests"]
|
| 148 |
+
python_files = ["test_*.py"]
|
| 149 |
+
python_classes = ["Test*"]
|
| 150 |
+
python_functions = ["test_*"]
|
| 151 |
+
asyncio_mode = "auto"
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
---
|
| 155 |
+
|
| 156 |
+
### 2.3 Fix `inference.py` — Exception error messages break the format compliance test
|
| 157 |
+
|
| 158 |
+
**Problem:** The `except Exception as e` block in `run_episode()` outputs `error={str(e)}` which can be any arbitrary string. The test `test_step_line_error_format` only allows `{"null", "max_steps_exceeded", "illegal_transition"}`. Any real exception will produce a string outside this set.
|
| 159 |
+
|
| 160 |
+
**Action:** In `inference.py`, inside the inner `except Exception as e` block within the step loop, normalize the error:
|
| 161 |
+
|
| 162 |
+
```python
|
| 163 |
+
except Exception as e:
|
| 164 |
+
error_msg = f"step_error" # normalize to a fixed token
|
| 165 |
+
print(
|
| 166 |
+
f"[STEP] step={step_count} action={action_str} "
|
| 167 |
+
f"reward=0.00 done=true error={error_msg}"
|
| 168 |
+
)
|
| 169 |
+
success = False
|
| 170 |
+
break
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
Also update `test_inference.py` to include `"step_error"` in `valid_errors`:
|
| 174 |
+
|
| 175 |
+
```python
|
| 176 |
+
valid_errors = {"null", "max_steps_exceeded", "illegal_transition", "step_error"}
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
---
|
| 180 |
+
|
| 181 |
+
### 2.4 Fix `src/server/app.py` — `/dashboard/state` is not safe to call before `/reset`
|
| 182 |
+
|
| 183 |
+
**Problem:** After `POST /reset`, calling `GET /state` returns a state with the correct `task_id`. But the dashboard endpoint `GET /dashboard/state` may return `None` for metadata fields if `reset()` hasn't been called. The health check and dashboard should be safe to call at any time.
|
| 184 |
+
|
| 185 |
+
**Action:** Add a null-guard to `get_dashboard_state()`:
|
| 186 |
+
|
| 187 |
+
```python
|
| 188 |
+
@app.get("/dashboard/state")
|
| 189 |
+
async def get_dashboard_state() -> dict[str, Any]:
|
| 190 |
+
if _env is None:
|
| 191 |
+
# Return an empty but valid structure before /reset is called
|
| 192 |
+
return {
|
| 193 |
+
"units": {},
|
| 194 |
+
"incidents": {},
|
| 195 |
+
"episode_id": "not-initialized",
|
| 196 |
+
"step_count": 0,
|
| 197 |
+
"task_id": "none",
|
| 198 |
+
"city_time": 0.0,
|
| 199 |
+
"metadata": {},
|
| 200 |
+
"legal_actions": [],
|
| 201 |
+
"issues": [],
|
| 202 |
+
"observation": None,
|
| 203 |
+
}
|
| 204 |
+
state_dict = _env.state().model_dump()
|
| 205 |
+
legal_actions = [a.model_dump() for a in _env.legal_actions()]
|
| 206 |
+
last_obs = _env.last_observation()
|
| 207 |
+
issues = list(last_obs.issues) if last_obs is not None else []
|
| 208 |
+
obs_dict = last_obs.model_dump() if last_obs is not None else None
|
| 209 |
+
return {
|
| 210 |
+
**state_dict,
|
| 211 |
+
"legal_actions": legal_actions,
|
| 212 |
+
"issues": issues,
|
| 213 |
+
"observation": obs_dict,
|
| 214 |
+
}
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
---
|
| 218 |
+
|
| 219 |
+
### 2.5 Fix `inference.py` — Score computation is not normalized to competition spec
|
| 220 |
+
|
| 221 |
+
**Problem:** `total_score = sum(rewards) / len(rewards)` computes the average over every recorded reward. Each step reward is already in [0, 1], so the result is a valid value, but the average also includes the reset-time entry (`obs.score = 0.0` from `reset()`) and weights it equally with the step rewards. This deflates the score.
|
| 222 |
+
|
| 223 |
+
**Action:** Change score computation in `run_episode()` to exclude the initial zero from reset:
|
| 224 |
+
|
| 225 |
+
```python
|
| 226 |
+
# Separate reset reward from step rewards
|
| 227 |
+
step_rewards = rewards[1:] # index 0 is the reset observation score (always 0.0)
|
| 228 |
+
if step_rewards:
|
| 229 |
+
total_score = sum(step_rewards) / len(step_rewards)
|
| 230 |
+
else:
|
| 231 |
+
total_score = 0.0
|
| 232 |
+
total_score = max(0.0, min(1.0, total_score))
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
Also update the `rewards_str` to only include step rewards so the `[END]` line is meaningful:
|
| 236 |
+
|
| 237 |
+
```python
|
| 238 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards[1:]) if len(rewards) > 1 else "0.00"
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
---
|
| 242 |
+
|
| 243 |
+
## SECTION 3 — ENVIRONMENT DESIGN IMPROVEMENTS (affect scoring)
|
| 244 |
+
|
| 245 |
+
### 3.1 Improve task graders — current graders are too simple
|
| 246 |
+
|
| 247 |
+
**Problem:** The graders (`SingleIncidentGrader`, `MultiIncidentGrader`, etc.) compute very simple scores that don't fully capture task success. Judges will look at whether hard tasks genuinely challenge frontier models with clear, deterministic success criteria.
|
| 248 |
+
|
| 249 |
+
**Action — `src/tasks/single_incident.py`:** Replace `SingleIncidentGrader.grade()` with:
|
| 250 |
+
|
| 251 |
+
```python
|
| 252 |
+
def grade(self, state: State, rewards: list[float]) -> float:
|
| 253 |
+
"""Grade based on: correct unit dispatched, fast response, incident resolved."""
|
| 254 |
+
if not rewards:
|
| 255 |
+
return 0.0
|
| 256 |
+
|
| 257 |
+
incident = state.incidents.get("INC-001")
|
| 258 |
+
if incident is None:
|
| 259 |
+
return 0.0
|
| 260 |
+
|
| 261 |
+
score = 0.0
|
| 262 |
+
|
| 263 |
+
# Component 1: Was the incident resolved? (50% weight)
|
| 264 |
+
if incident.status.value == "RESOLVED":
|
| 265 |
+
score += 0.50
|
| 266 |
+
|
| 267 |
+
# Component 2: Correct unit type dispatched? (30% weight)
|
| 268 |
+
medic_dispatched = any(
|
| 269 |
+
u.unit_type.value == "MEDIC" and (
|
| 270 |
+
u.assigned_incident_id == "INC-001" or
|
| 271 |
+
u.status.value in {"ON_SCENE", "DISPATCHED"}
|
| 272 |
+
)
|
| 273 |
+
for u in state.units.values()
|
| 274 |
+
)
|
| 275 |
+
if medic_dispatched:
|
| 276 |
+
score += 0.30
|
| 277 |
+
|
| 278 |
+
# Component 3: Speed — resolved within first 10 steps (20% weight)
|
| 279 |
+
if incident.status.value == "RESOLVED" and state.step_count <= 10:
|
| 280 |
+
score += 0.20
|
| 281 |
+
|
| 282 |
+
return max(0.0, min(1.0, score))
|
| 283 |
+
```
|
| 284 |
+
|
| 285 |
+
**Action — `src/tasks/multi_incident.py`:** Replace `MultiIncidentGrader.grade()` with:
|
| 286 |
+
|
| 287 |
+
```python
|
| 288 |
+
def grade(self, state: State, rewards: list[float]) -> float:
|
| 289 |
+
"""Grade based on: P1 incidents resolved, triage correctness, coverage."""
|
| 290 |
+
if not rewards:
|
| 291 |
+
return 0.0
|
| 292 |
+
|
| 293 |
+
total = len(state.incidents)
|
| 294 |
+
if total == 0:
|
| 295 |
+
return 0.0
|
| 296 |
+
|
| 297 |
+
resolved = sum(
|
| 298 |
+
1 for i in state.incidents.values()
|
| 299 |
+
if i.status.value == "RESOLVED"
|
| 300 |
+
)
|
| 301 |
+
failed = sum(
|
| 302 |
+
1 for i in state.incidents.values()
|
| 303 |
+
if i.status.value == "ESCALATED"
|
| 304 |
+
)
|
| 305 |
+
p1_total = sum(1 for i in state.incidents.values() if i.severity.value == "PRIORITY_1")
|
| 306 |
+
p1_resolved = sum(
|
| 307 |
+
1 for iid in state.metadata.get("resolved_incidents", [])
|
| 308 |
+
if state.incidents.get(iid) and state.incidents[iid].severity.value == "PRIORITY_1"
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
resolution_score = resolved / total
|
| 312 |
+
p1_score = (p1_resolved / p1_total) if p1_total > 0 else 1.0
|
| 313 |
+
failure_penalty = failed / total
|
| 314 |
+
|
| 315 |
+
score = 0.5 * p1_score + 0.3 * resolution_score - 0.2 * failure_penalty
|
| 316 |
+
return max(0.0, min(1.0, score))
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
**Action — `src/tasks/mass_casualty.py`:** The existing `MassCasualtyGrader` is reasonable. Improve it slightly:
|
| 320 |
+
|
| 321 |
+
```python
|
| 322 |
+
def grade(self, state: State, rewards: list[float]) -> float:
|
| 323 |
+
if not rewards:
|
| 324 |
+
return 0.0
|
| 325 |
+
|
| 326 |
+
p1_seen = list(state.metadata.get("p1_seen", []))
|
| 327 |
+
p1_resolved = [
|
| 328 |
+
iid for iid in state.metadata.get("resolved_incidents", [])
|
| 329 |
+
if iid in p1_seen and iid not in state.metadata.get("failed_incidents", [])
|
| 330 |
+
]
|
| 331 |
+
p1_failed = list(state.metadata.get("failed_incidents", []))
|
| 332 |
+
|
| 333 |
+
survival_score = len(p1_resolved) / max(len(p1_seen), 1)
|
| 334 |
+
failure_penalty = len(p1_failed) / max(len(p1_seen), 1) * 0.5
|
| 335 |
+
|
| 336 |
+
mean_reward = sum(rewards) / len(rewards)
|
| 337 |
+
score = 0.6 * survival_score + 0.3 * mean_reward - failure_penalty
|
| 338 |
+
return max(0.0, min(1.0, score))
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
---
|
| 342 |
+
|
| 343 |
+
### 3.2 Add `GET /tasks` documentation to README (the endpoint already exists)
|
| 344 |
+
|
| 345 |
+
The server already has `GET /tasks` but it's not documented in the README API table. Add it to the README's API Endpoints section:
|
| 346 |
+
|
| 347 |
+
```markdown
|
| 348 |
+
| `/tasks` | GET | List all available tasks with metadata |
|
| 349 |
+
```
|
| 350 |
+
|
| 351 |
+
---
|
| 352 |
+
|
| 353 |
+
### 3.3 Improve reward signal in `src/rewards.py` — triage scoring uses wrong key format
|
| 354 |
+
|
| 355 |
+
**Problem:** In `_compute_triage()`, the lookup is:
|
| 356 |
+
|
| 357 |
+
```python
|
| 358 |
+
required_map = state.metadata.get("default_required_units", {})
|
| 359 |
+
required_types = required_map.get(str(incident.incident_type), [])
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
But `str(incident.incident_type)` for a `StrEnum` returns `"CARDIAC_ARREST"` (the value), while the metadata stores types like `"IncidentType.CARDIAC_ARREST"` (the qualified `Class.MEMBER` form that `str()` produces for a plain `Enum`). This mismatch means triage always returns 0.5 (the neutral value), undermining the reward signal.
|
| 363 |
+
|
| 364 |
+
**Action:** In `src/rewards.py`, change `_compute_triage()` to use the value directly:
|
| 365 |
+
|
| 366 |
+
```python
|
| 367 |
+
def _compute_triage(self, state: State, action: Action) -> float:
|
| 368 |
+
if action.action_type != DispatchAction.DISPATCH:
|
| 369 |
+
return 0.5
|
| 370 |
+
unit = state.units.get(action.unit_id)
|
| 371 |
+
incident = state.incidents.get(action.incident_id)
|
| 372 |
+
if unit is None or incident is None:
|
| 373 |
+
return 0.0
|
| 374 |
+
required_map = state.metadata.get("default_required_units", {})
|
| 375 |
+
# Try both formats: plain value and StrEnum repr
|
| 376 |
+
required_types = (
|
| 377 |
+
required_map.get(incident.incident_type.value, []) or
|
| 378 |
+
required_map.get(str(incident.incident_type), [])
|
| 379 |
+
)
|
| 380 |
+
if not required_types:
|
| 381 |
+
return 0.5
|
| 382 |
+
return 1.0 if unit.unit_type.value in required_types else 0.0
|
| 383 |
+
```
|
| 384 |
+
|
| 385 |
+
Also fix the metadata population in `src/state_machine.py`. In `reset()`, when enriching metadata, convert the `default_required_units` schema data to use plain string values:
|
| 386 |
+
|
| 387 |
+
```python
|
| 388 |
+
# Convert unit type values to plain strings for consistent lookup
|
| 389 |
+
raw_required = schema_dump.get("default_required_units", {})
|
| 390 |
+
converted_required = {
|
| 391 |
+
str(inc_type): [str(u) for u in unit_types]
|
| 392 |
+
for inc_type, unit_types in raw_required.items()
|
| 393 |
+
}
|
| 394 |
+
state.metadata.setdefault("default_required_units", converted_required)
|
| 395 |
+
```
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
### 3.4 Fix `src/state_machine.py` — ETA computation uses wrong distance formula
|
| 400 |
+
|
| 401 |
+
**Problem:** In `_apply_dispatch()`, the ETA is computed using Euclidean distance (`math.hypot`) but the `physics.py` module uses Manhattan distance (`dx + dy`). The physics module is used for movement, so the ETA should match it. The current code is:
|
| 402 |
+
|
| 403 |
+
```python
|
| 404 |
+
dist = _distance(unit.location_x, unit.location_y, incident.location_x, incident.location_y)
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
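Here `_distance` uses `math.hypot`, but `move_unit_toward` moves units along Manhattan paths. Because Euclidean distance never exceeds Manhattan distance, the computed ETA underestimates the actual travel time: units appear to arrive "early" by Euclidean measurement but take longer under Manhattan movement.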
|
| 408 |
+
|
| 409 |
+
**Action:** Replace `_distance` usage in `_apply_dispatch()` to use Manhattan distance:
|
| 410 |
+
|
| 411 |
+
```python
|
| 412 |
+
def _apply_dispatch(self, state: State, action: Action) -> None:
|
| 413 |
+
unit = state.units[action.unit_id]
|
| 414 |
+
incident = state.incidents[action.incident_id]
|
| 415 |
+
|
| 416 |
+
speed = float(self._schema.unit_speeds.get(unit.unit_type, 1.0))
|
| 417 |
+
# Use Manhattan distance to match move_unit_toward physics
|
| 418 |
+
dx = abs(unit.location_x - incident.location_x)
|
| 419 |
+
dy = abs(unit.location_y - incident.location_y)
|
| 420 |
+
manhattan_dist = dx + dy
|
| 421 |
+
eta = manhattan_dist / max(speed, 1e-6)
|
| 422 |
+
|
| 423 |
+
unit.status = UnitStatus.DISPATCHED
|
| 424 |
+
unit.assigned_incident_id = incident.incident_id
|
| 425 |
+
unit.eta_seconds = max(0.0, float(eta))
|
| 426 |
+
|
| 427 |
+
if unit.unit_id not in incident.units_assigned:
|
| 428 |
+
incident.units_assigned.append(unit.unit_id)
|
| 429 |
+
if incident.status == IncidentStatus.PENDING:
|
| 430 |
+
incident.status = IncidentStatus.RESPONDING
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
---
|
| 434 |
+
|
| 435 |
+
## SECTION 4 — README IMPROVEMENTS (required by competition)
|
| 436 |
+
|
| 437 |
+
### 4.1 Add missing required README sections
|
| 438 |
+
|
| 439 |
+
The current README is good but is missing:
|
| 440 |
+
|
| 441 |
+
1. **Baseline scores table with instructions to reproduce** — The README mentions scores but doesn't show how to generate them with a single command.
|
| 442 |
+
2. **Full action space table** — Currently only shows key fields, needs all fields.
|
| 443 |
+
3. **Setup instructions** — Missing explicit `uv sync` + server start commands.
|
| 444 |
+
|
| 445 |
+
**Action:** Add the following sections to `README.md`:
|
| 446 |
+
|
| 447 |
+
After the existing "Quick Start" section, add:
|
| 448 |
+
|
| 449 |
+
```markdown
|
| 450 |
+
## Reproducing Baseline Scores
|
| 451 |
+
|
| 452 |
+
Run the random baseline agent against all 4 tasks:
|
| 453 |
+
|
| 454 |
+
```bash
|
| 455 |
+
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x python inference.py
|
| 456 |
+
```
|
| 457 |
+
|
| 458 |
+
Expected output (approximate):
|
| 459 |
+
|
| 460 |
+
| Task | Difficulty | Random Baseline Score |
|
| 461 |
+
|------|-----------|----------------------|
|
| 462 |
+
| `single_incident` | Easy | ~0.55 |
|
| 463 |
+
| `multi_incident` | Medium | ~0.48 |
|
| 464 |
+
| `mass_casualty` | Hard | ~0.32 |
|
| 465 |
+
| `shift_surge` | Hard | ~0.38 |
|
| 466 |
+
|
| 467 |
+
*Scores vary with the random seed. Run with `seed=42` for exact reproduction.*
|
| 468 |
+
```
|
| 469 |
+
|
| 470 |
+
Also add an explicit environment variable table near the top:
|
| 471 |
+
|
| 472 |
+
```markdown
|
| 473 |
+
## Environment Variables
|
| 474 |
+
|
| 475 |
+
| Variable | Required | Description |
|
| 476 |
+
|----------|----------|-------------|
|
| 477 |
+
| `API_BASE_URL` | Yes | OpenAI-compatible endpoint base URL |
|
| 478 |
+
| `MODEL_NAME` | Yes | Model identifier string |
|
| 479 |
+
| `HF_TOKEN` | Yes (unless `USE_RANDOM=true`) | API key / HF token |
|
| 480 |
+
| `USE_RANDOM` | No | Set to `true` to use the deterministic random agent (no LLM) |
|
| 481 |
+
```
|
| 482 |
+
|
| 483 |
+
---
|
| 484 |
+
|
| 485 |
+
### 4.2 Add task difficulty descriptions to README
|
| 486 |
+
|
| 487 |
+
Under the "Tasks" section, expand each task to include expected agent behaviors:
|
| 488 |
+
|
| 489 |
+
```markdown
|
| 490 |
+
### Task Difficulty Guide
|
| 491 |
+
|
| 492 |
+
| Task | Difficulty | Key Challenge | Success Criteria |
|
| 493 |
+
|------|-----------|---------------|-----------------|
|
| 494 |
+
| `single_incident` | Easy | Dispatch the right unit type (MEDIC) quickly | Incident resolved, correct unit, ETA < 300s |
|
| 495 |
+
| `multi_incident` | Medium | Triage 3 simultaneous incidents, prioritize P1 | All P1 incidents responded to, no ESCALATED |
|
| 496 |
+
| `mass_casualty` | Hard | Manage wave-based surge with limited resources | Maximize P1 survival rate across waves |
|
| 497 |
+
| `shift_surge` | Hard | Adapt as units go out of service over time | Maintain coverage and resolve incidents despite attrition |
|
| 498 |
+
```
|
| 499 |
+
|
| 500 |
+
---
|
| 501 |
+
|
| 502 |
+
## SECTION 5 — TEST FIXES
|
| 503 |
+
|
| 504 |
+
### 5.1 Update `tests/test_inference.py` to reflect valid error tokens
|
| 505 |
+
|
| 506 |
+
After the fix in Section 2.3, update the valid error set in `test_step_line_error_format`:
|
| 507 |
+
|
| 508 |
+
```python
|
| 509 |
+
valid_errors = {"null", "max_steps_exceeded", "illegal_transition", "step_error"}
|
| 510 |
+
```
|
| 511 |
+
|
| 512 |
+
---
|
| 513 |
+
|
| 514 |
+
### 5.2 Add a test for `/reset` with empty body
|
| 515 |
+
|
| 516 |
+
Add this test to `tests/test_openenv_integration.py`:
|
| 517 |
+
|
| 518 |
+
```python
|
| 519 |
+
def test_reset_with_empty_body_returns_200(self) -> None:
|
| 520 |
+
"""Verify prevalidation.sh compatible: POST /reset with {} returns 200."""
|
| 521 |
+
c = TestClient(server_app.app)
|
| 522 |
+
response = c.post("/reset", json={})
|
| 523 |
+
assert response.status_code == 200
|
| 524 |
+
data = response.json()
|
| 525 |
+
assert data["result"] == "dispatch center online"
|
| 526 |
+
```
|
| 527 |
+
|
| 528 |
+
---
|
| 529 |
+
|
| 530 |
+
### 5.3 Add a test for the `/tasks` endpoint
|
| 531 |
+
|
| 532 |
+
Add to `tests/test_openenv_integration.py`:
|
| 533 |
+
|
| 534 |
+
```python
|
| 535 |
+
def test_tasks_endpoint_returns_four_tasks(self) -> None:
|
| 536 |
+
c = TestClient(server_app.app)
|
| 537 |
+
response = c.get("/tasks")
|
| 538 |
+
assert response.status_code == 200
|
| 539 |
+
tasks = response.json()
|
| 540 |
+
assert len(tasks) == 4
|
| 541 |
+
task_ids = {t["task_id"] for t in tasks}
|
| 542 |
+
assert task_ids == {"single_incident", "multi_incident", "mass_casualty", "shift_surge"}
|
| 543 |
+
```
|
| 544 |
+
|
| 545 |
+
---
|
| 546 |
+
|
| 547 |
+
## SECTION 6 — DOCKER AND DEPLOYMENT CHECKS
|
| 548 |
+
|
| 549 |
+
### 6.1 Verify `src/server/Dockerfile` is consistent
|
| 550 |
+
|
| 551 |
+
The `src/server/Dockerfile` is a separate server-only Dockerfile. Ensure it also starts the server properly. Replace its CMD with:
|
| 552 |
+
|
| 553 |
+
```dockerfile
|
| 554 |
+
CMD ["uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 555 |
+
```
|
| 556 |
+
|
| 557 |
+
---
|
| 558 |
+
|
| 559 |
+
### 6.2 Add `.dockerignore` to speed up builds
|
| 560 |
+
|
| 561 |
+
Create `.dockerignore` at the repo root:
|
| 562 |
+
|
| 563 |
+
```
|
| 564 |
+
.git
|
| 565 |
+
.venv
|
| 566 |
+
.uv
|
| 567 |
+
__pycache__
|
| 568 |
+
*.pyc
|
| 569 |
+
*.pyo
|
| 570 |
+
.pytest_cache
|
| 571 |
+
.coverage
|
| 572 |
+
htmlcov
|
| 573 |
+
.sisyphus/evidence/
|
| 574 |
+
*.log
|
| 575 |
+
tmp/
|
| 576 |
+
dashboard.html
|
| 577 |
+
*.png
|
| 578 |
+
*.jpg
|
| 579 |
+
.env
|
| 580 |
+
.env.*
|
| 581 |
+
```
|
| 582 |
+
|
| 583 |
+
---
|
| 584 |
+
|
| 585 |
+
### 6.3 Verify `requirements.txt` is complete
|
| 586 |
+
|
| 587 |
+
The current `requirements.txt` is missing `groq`, which is already declared in `pyproject.toml`. Add it:
|
| 588 |
+
|
| 589 |
+
```
|
| 590 |
+
pydantic>=2.7
|
| 591 |
+
openenv-core>=0.2.0
|
| 592 |
+
fastapi>=0.110
|
| 593 |
+
uvicorn[standard]>=0.29
|
| 594 |
+
openai>=1.12
|
| 595 |
+
httpx>=0.27
|
| 596 |
+
matplotlib>=3.8
|
| 597 |
+
numpy>=1.26
|
| 598 |
+
groq>=1.1.2
|
| 599 |
+
```
|
| 600 |
+
|
| 601 |
+
---
|
| 602 |
+
|
| 603 |
+
## SECTION 7 — FINAL VALIDATION CHECKLIST
|
| 604 |
+
|
| 605 |
+
After making all changes, run these commands in order and confirm each passes:
|
| 606 |
+
|
| 607 |
+
```bash
|
| 608 |
+
# 1. YAML parse check
|
| 609 |
+
python -c "import yaml; yaml.safe_load(open('openenv.yaml')); print('YAML OK')"
|
| 610 |
+
|
| 611 |
+
# 2. Full test suite
|
| 612 |
+
uv run python -m pytest tests/ -v --tb=short
|
| 613 |
+
|
| 614 |
+
# 3. Inference script with random agent
|
| 615 |
+
USE_RANDOM=true API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4 HF_TOKEN=x \
|
| 616 |
+
uv run python inference.py 2>&1 | grep -E '^\[(START|STEP|END)\]' | head -20
|
| 617 |
+
|
| 618 |
+
# 4. Demo script
|
| 619 |
+
uv run python demo.py
|
| 620 |
+
|
| 621 |
+
# 5. OpenEnv validate
|
| 622 |
+
uv run openenv validate
|
| 623 |
+
|
| 624 |
+
# 6. Docker build
|
| 625 |
+
docker build -t citywide-dispatch-supervisor .
|
| 626 |
+
|
| 627 |
+
# 7. Docker run + health check + empty reset
|
| 628 |
+
docker run -d -p 8000:8000 --name test-dispatch citywide-dispatch-supervisor
|
| 629 |
+
sleep 5
|
| 630 |
+
curl -s http://localhost:8000/health
|
| 631 |
+
curl -s -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'
|
| 632 |
+
docker stop test-dispatch && docker rm test-dispatch
|
| 633 |
+
|
| 634 |
+
# 8. Benchmark scores
|
| 635 |
+
uv run python -c "
|
| 636 |
+
from src.benchmark import run_all
|
| 637 |
+
scores = run_all()
|
| 638 |
+
for task_id, score in scores.items():
|
| 639 |
+
assert 0.0 <= score <= 1.0, f'{task_id}: score {score} out of range'
|
| 640 |
+
print(f'{task_id}: {score:.3f}')
|
| 641 |
+
print('All scores in [0.0, 1.0] — PASS')
|
| 642 |
+
"
|
| 643 |
+
```
|
| 644 |
+
|
| 645 |
+
All 8 checks must pass before submission.
|
| 646 |
+
|
| 647 |
+
---
|
| 648 |
+
|
| 649 |
+
## SECTION 8 — PRIORITY ORDER SUMMARY
|
| 650 |
+
|
| 651 |
+
Work through issues in this exact order:
|
| 652 |
+
|
| 653 |
+
| # | File | Change | Severity |
|
| 654 |
+
|---|------|--------|----------|
|
| 655 |
+
| 1 | `openenv.yaml` | Fix tab → space indentation | CRITICAL |
|
| 656 |
+
| 2 | `src/server/app.py` | Add `if __name__ == "__main__": main()` | CRITICAL |
|
| 657 |
+
| 3 | `src/server/app.py` | Make `task_id` optional in `ResetRequest` | CRITICAL |
|
| 658 |
+
| 4 | `Dockerfile` | Use uvicorn directly in CMD | CRITICAL |
|
| 659 |
+
| 5 | `validate_local.py` | Add `USE_RANDOM=true` in `check_inference` | HIGH |
|
| 660 |
+
| 6 | `pyproject.toml` | Add `asyncio_mode = "auto"` | HIGH |
|
| 661 |
+
| 7 | `inference.py` | Normalize exception error messages | HIGH |
|
| 662 |
+
| 8 | `inference.py` | Fix score computation (exclude reset reward) | HIGH |
|
| 663 |
+
| 9 | `src/server/app.py` | Guard `get_dashboard_state` against None env | MEDIUM |
|
| 664 |
+
| 10 | `src/rewards.py` | Fix triage key format mismatch | MEDIUM |
|
| 665 |
+
| 11 | `src/state_machine.py` | Use Manhattan distance for ETA | MEDIUM |
|
| 666 |
+
| 12 | `src/tasks/*.py` | Improve grader logic | MEDIUM |
|
| 667 |
+
| 13 | `tests/test_openenv_integration.py` | Add empty-body reset test | MEDIUM |
|
| 668 |
+
| 14 | `tests/test_openenv_integration.py` | Add /tasks endpoint test | LOW |
|
| 669 |
+
| 15 | `tests/test_inference.py` | Add `step_error` to valid errors set | LOW |
|
| 670 |
+
| 16 | `requirements.txt` | Add `groq>=1.1.2` | LOW |
|
| 671 |
+
| 17 | `.dockerignore` | Create file | LOW |
|
| 672 |
+
| 18 | `README.md` | Add baseline scores table + env var table + difficulty guide | LOW |
|
inference.py
CHANGED
|
@@ -259,7 +259,7 @@ async def run_episode(
|
|
| 259 |
break
|
| 260 |
|
| 261 |
except Exception as e:
|
| 262 |
-
error_msg =
|
| 263 |
print(
|
| 264 |
f"[STEP] step={step_count} action={action_str} "
|
| 265 |
f"reward=0.00 done=true error={error_msg}"
|
|
@@ -273,12 +273,16 @@ async def run_episode(
|
|
| 273 |
finally:
|
| 274 |
env.close()
|
| 275 |
|
| 276 |
-
#
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
# Format rewards list as comma-separated with 2 decimal places
|
| 281 |
-
rewards_str = ",".join(f"{r:.2f}" for r in
|
| 282 |
|
| 283 |
print(
|
| 284 |
f"[END] success={str(success).lower()} steps={step_count} "
|
|
|
|
| 259 |
break
|
| 260 |
|
| 261 |
except Exception as e:
|
| 262 |
+
error_msg = "step_error" # normalize to a fixed token
|
| 263 |
print(
|
| 264 |
f"[STEP] step={step_count} action={action_str} "
|
| 265 |
f"reward=0.00 done=true error={error_msg}"
|
|
|
|
| 273 |
finally:
|
| 274 |
env.close()
|
| 275 |
|
| 276 |
+
# Separate reset reward from step rewards
|
| 277 |
+
step_rewards = rewards[1:]
|
| 278 |
+
if step_rewards:
|
| 279 |
+
total_score = sum(step_rewards) / len(step_rewards)
|
| 280 |
+
else:
|
| 281 |
+
total_score = 0.0
|
| 282 |
+
total_score = max(0.0, min(1.0, total_score))
|
| 283 |
|
| 284 |
# Format rewards list as comma-separated with 2 decimal places
|
| 285 |
+
rewards_str = ",".join(f"{r:.2f}" for r in step_rewards) if step_rewards else "0.00"
|
| 286 |
|
| 287 |
print(
|
| 288 |
f"[END] success={str(success).lower()} steps={step_count} "
|
openenv.yaml
CHANGED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
name: citywide-dispatch-supervisor
|
| 2 |
version: "0.1.0"
|
| 3 |
description: >
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
entrypoint: src.openenv_environment:OpenEnvEnvironment
|
| 8 |
tasks:
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 1 |
name: citywide-dispatch-supervisor
|
| 2 |
version: "0.1.0"
|
| 3 |
description: >
|
| 4 |
+
City-wide 911 emergency dispatch supervisor RL environment.
|
| 5 |
+
An LLM agent learns to manage simultaneous incidents by dispatching
|
| 6 |
+
police, fire, and EMS units across a city grid under realistic constraints.
|
| 7 |
entrypoint: src.openenv_environment:OpenEnvEnvironment
|
| 8 |
tasks:
|
| 9 |
+
- id: single_incident
|
| 10 |
+
name: Single Incident Response
|
| 11 |
+
description: One incident with a small unit pool; learn basic dispatch, correct unit type, and response time.
|
| 12 |
+
- id: multi_incident
|
| 13 |
+
name: Simultaneous Multi-Incident
|
| 14 |
+
description: Multiple concurrent incidents requiring triage, prioritization, and correct unit matching.
|
| 15 |
+
- id: mass_casualty
|
| 16 |
+
name: Mass Casualty Event
|
| 17 |
+
description: Wave-based Priority-1 surge with resource conflict; maximize survival outcomes.
|
| 18 |
+
- id: shift_surge
|
| 19 |
+
name: Shift Surge
|
| 20 |
+
description: Incident waves combined with units going out of service; maintain coverage over time.
|
pyproject.toml
CHANGED
|
@@ -16,6 +16,7 @@ dependencies = [
|
|
| 16 |
"httpx>=0.27",
|
| 17 |
"matplotlib>=3.7",
|
| 18 |
"groq>=1.1.2",
|
|
|
|
| 19 |
]
|
| 20 |
|
| 21 |
[project.optional-dependencies]
|
|
@@ -28,6 +29,7 @@ testpaths = ["tests"]
|
|
| 28 |
python_files = ["test_*.py"]
|
| 29 |
python_classes = ["Test*"]
|
| 30 |
python_functions = ["test_*"]
|
|
|
|
| 31 |
|
| 32 |
[tool.hatch.build.targets.wheel]
|
| 33 |
packages = ["src"]
|
|
|
|
| 16 |
"httpx>=0.27",
|
| 17 |
"matplotlib>=3.7",
|
| 18 |
"groq>=1.1.2",
|
| 19 |
+
"pyyaml>=6.0.1",
|
| 20 |
]
|
| 21 |
|
| 22 |
[project.optional-dependencies]
|
|
|
|
| 29 |
python_files = ["test_*.py"]
|
| 30 |
python_classes = ["Test*"]
|
| 31 |
python_functions = ["test_*"]
|
| 32 |
+
asyncio_mode = "auto"
|
| 33 |
|
| 34 |
[tool.hatch.build.targets.wheel]
|
| 35 |
packages = ["src"]
|
requirements.txt
CHANGED
|
@@ -6,3 +6,5 @@ openai>=1.12
|
|
| 6 |
httpx>=0.27
|
| 7 |
matplotlib>=3.8
|
| 8 |
numpy>=1.26
|
|
|
|
|
|
|
|
|
| 6 |
httpx>=0.27
|
| 7 |
matplotlib>=3.8
|
| 8 |
numpy>=1.26
|
| 9 |
+
groq>=1.1.2
|
| 10 |
+
pyyaml>=6.0.1
|
src/rewards.py
CHANGED
|
@@ -103,12 +103,16 @@ class RewardCalculator:
|
|
| 103 |
return 0.0
|
| 104 |
|
| 105 |
required_map = state.metadata.get("default_required_units", {})
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
if not required_types:
|
| 108 |
return 0.5
|
| 109 |
|
| 110 |
# required_types are stored as strings in metadata.
|
| 111 |
-
if
|
| 112 |
return 1.0
|
| 113 |
return 0.0
|
| 114 |
|
|
|
|
| 103 |
return 0.0
|
| 104 |
|
| 105 |
required_map = state.metadata.get("default_required_units", {})
|
| 106 |
+
# Try both formats: plain value and StrEnum repr
|
| 107 |
+
required_types = (
|
| 108 |
+
required_map.get(incident.incident_type.value, [])
|
| 109 |
+
or required_map.get(str(incident.incident_type), [])
|
| 110 |
+
)
|
| 111 |
if not required_types:
|
| 112 |
return 0.5
|
| 113 |
|
| 114 |
# required_types are stored as strings in metadata.
|
| 115 |
+
if unit.unit_type.value in set(required_types):
|
| 116 |
return 1.0
|
| 117 |
return 0.0
|
| 118 |
|
src/server/Dockerfile
CHANGED
|
@@ -10,4 +10,4 @@ COPY data/ /app/data/
|
|
| 10 |
|
| 11 |
EXPOSE 8000
|
| 12 |
|
| 13 |
-
CMD ["
|
|
|
|
| 10 |
|
| 11 |
EXPOSE 8000
|
| 12 |
|
| 13 |
+
CMD ["uvicorn", "src.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
src/server/app.py
CHANGED
|
@@ -22,7 +22,7 @@ _env: OpenEnvEnvironment | None = None
|
|
| 22 |
|
| 23 |
|
| 24 |
class ResetRequest(BaseModel):
|
| 25 |
-
task_id: str
|
| 26 |
seed: int | None = None
|
| 27 |
|
| 28 |
|
|
@@ -103,7 +103,19 @@ async def get_dashboard_state() -> dict[str, Any]:
|
|
| 103 |
Keeps the existing /state response stable for typed clients.
|
| 104 |
"""
|
| 105 |
if _env is None:
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
state_dict = _env.state().model_dump()
|
| 109 |
legal_actions = [a.model_dump() for a in _env.legal_actions()]
|
|
@@ -122,4 +134,8 @@ async def get_dashboard_state() -> dict[str, Any]:
|
|
| 122 |
def main():
|
| 123 |
import uvicorn
|
| 124 |
|
| 125 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
class ResetRequest(BaseModel):
|
| 25 |
+
task_id: str = "single_incident"
|
| 26 |
seed: int | None = None
|
| 27 |
|
| 28 |
|
|
|
|
| 103 |
Keeps the existing /state response stable for typed clients.
|
| 104 |
"""
|
| 105 |
if _env is None:
|
| 106 |
+
# Return an empty but valid structure before /reset is called
|
| 107 |
+
return {
|
| 108 |
+
"units": {},
|
| 109 |
+
"incidents": {},
|
| 110 |
+
"episode_id": "not-initialized",
|
| 111 |
+
"step_count": 0,
|
| 112 |
+
"task_id": "none",
|
| 113 |
+
"city_time": 0.0,
|
| 114 |
+
"metadata": {},
|
| 115 |
+
"legal_actions": [],
|
| 116 |
+
"issues": [],
|
| 117 |
+
"observation": None,
|
| 118 |
+
}
|
| 119 |
|
| 120 |
state_dict = _env.state().model_dump()
|
| 121 |
legal_actions = [a.model_dump() for a in _env.legal_actions()]
|
|
|
|
| 134 |
def main():
|
| 135 |
import uvicorn
|
| 136 |
|
| 137 |
+
uvicorn.run("src.server.app:app", host="0.0.0.0", port=8000, reload=False)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
|
| 141 |
+
main()
|
src/state_machine.py
CHANGED
|
@@ -78,9 +78,15 @@ class DispatchStateMachine:
|
|
| 78 |
state.metadata.setdefault("districts", meta.get("districts", schema_dump.get("districts", [])))
|
| 79 |
state.metadata.setdefault("grid_size", meta.get("grid_size", schema_dump.get("grid_size", [])))
|
| 80 |
state.metadata.setdefault("unit_speeds", schema_dump.get("unit_speeds", {}))
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
state.metadata["max_steps"] = int(meta.get("max_steps", MAX_STEPS))
|
| 86 |
state.metadata["waves"] = list(meta.get("waves", []))
|
|
@@ -217,13 +223,11 @@ class DispatchStateMachine:
|
|
| 217 |
incident = state.incidents[action.incident_id]
|
| 218 |
|
| 219 |
speed = float(self._schema.unit_speeds.get(unit.unit_type, 1.0))
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
)
|
| 226 |
-
eta = dist / max(speed, 1e-6)
|
| 227 |
|
| 228 |
unit.status = UnitStatus.DISPATCHED
|
| 229 |
unit.assigned_incident_id = incident.incident_id
|
|
|
|
| 78 |
state.metadata.setdefault("districts", meta.get("districts", schema_dump.get("districts", [])))
|
| 79 |
state.metadata.setdefault("grid_size", meta.get("grid_size", schema_dump.get("grid_size", [])))
|
| 80 |
state.metadata.setdefault("unit_speeds", schema_dump.get("unit_speeds", {}))
|
| 81 |
+
# Convert unit type values to plain strings for consistent lookup
|
| 82 |
+
raw_required = schema_dump.get("default_required_units", {})
|
| 83 |
+
converted_required: dict[str, list[str]] = {}
|
| 84 |
+
for inc_type, unit_types in raw_required.items():
|
| 85 |
+
inc_key = getattr(inc_type, "value", None) or str(inc_type)
|
| 86 |
+
converted_required[str(inc_key)] = [
|
| 87 |
+
str(getattr(u, "value", None) or str(u)) for u in list(unit_types)
|
| 88 |
+
]
|
| 89 |
+
state.metadata.setdefault("default_required_units", converted_required)
|
| 90 |
|
| 91 |
state.metadata["max_steps"] = int(meta.get("max_steps", MAX_STEPS))
|
| 92 |
state.metadata["waves"] = list(meta.get("waves", []))
|
|
|
|
| 223 |
incident = state.incidents[action.incident_id]
|
| 224 |
|
| 225 |
speed = float(self._schema.unit_speeds.get(unit.unit_type, 1.0))
|
| 226 |
+
# Use Manhattan distance to match move_unit_toward physics
|
| 227 |
+
dx = abs(unit.location_x - incident.location_x)
|
| 228 |
+
dy = abs(unit.location_y - incident.location_y)
|
| 229 |
+
manhattan_dist = dx + dy
|
| 230 |
+
eta = manhattan_dist / max(speed, 1e-6)
|
|
|
|
|
|
|
| 231 |
|
| 232 |
unit.status = UnitStatus.DISPATCHED
|
| 233 |
unit.assigned_incident_id = incident.incident_id
|
src/tasks/mass_casualty.py
CHANGED
|
@@ -43,14 +43,17 @@ class MassCasualtyGrader:
|
|
| 43 |
if not rewards:
|
| 44 |
return 0.0
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
)
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if not rewards:
|
| 44 |
return 0.0
|
| 45 |
|
| 46 |
+
p1_seen = list(state.metadata.get("p1_seen", []))
|
| 47 |
+
p1_resolved = [
|
| 48 |
+
iid
|
| 49 |
+
for iid in state.metadata.get("resolved_incidents", [])
|
| 50 |
+
if iid in p1_seen and iid not in state.metadata.get("failed_incidents", [])
|
| 51 |
+
]
|
| 52 |
+
p1_failed = list(state.metadata.get("failed_incidents", []))
|
|
|
|
| 53 |
|
| 54 |
+
survival_score = len(p1_resolved) / max(len(p1_seen), 1)
|
| 55 |
+
failure_penalty = len(p1_failed) / max(len(p1_seen), 1) * 0.5
|
| 56 |
+
|
| 57 |
+
mean_reward = sum(rewards) / len(rewards)
|
| 58 |
+
score = 0.6 * survival_score + 0.3 * mean_reward - failure_penalty
|
| 59 |
+
return max(0.0, min(1.0, score))
|
src/tasks/multi_incident.py
CHANGED
|
@@ -40,20 +40,27 @@ class MultiIncidentGrader:
|
|
| 40 |
self.reward_calculator = RewardCalculator()
|
| 41 |
|
| 42 |
def grade(self, state: State, rewards: list[float]) -> float:
|
|
|
|
| 43 |
if not rewards:
|
| 44 |
return 0.0
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
cardiac = next((i for i in state.incidents.values() if i.incident_type == IncidentType.CARDIAC_ARREST), None)
|
| 50 |
-
if cardiac is None:
|
| 51 |
-
return max(0.0, min(1.0, mean_reward))
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
self.reward_calculator = RewardCalculator()
|
| 41 |
|
| 42 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 43 |
+
"""Grade based on: P1 incidents resolved, triage correctness, coverage."""
|
| 44 |
if not rewards:
|
| 45 |
return 0.0
|
| 46 |
|
| 47 |
+
total = len(state.incidents)
|
| 48 |
+
if total == 0:
|
| 49 |
+
return 0.0
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
resolved = sum(1 for i in state.incidents.values() if i.status.value == "RESOLVED")
|
| 52 |
+
failed = sum(1 for i in state.incidents.values() if i.status.value == "ESCALATED")
|
| 53 |
+
p1_total = sum(1 for i in state.incidents.values() if i.severity.value == "PRIORITY_1")
|
| 54 |
+
p1_resolved = sum(
|
| 55 |
+
1
|
| 56 |
+
for iid in state.metadata.get("resolved_incidents", [])
|
| 57 |
+
if state.incidents.get(iid)
|
| 58 |
+
and state.incidents[iid].severity.value == "PRIORITY_1"
|
| 59 |
)
|
| 60 |
|
| 61 |
+
resolution_score = resolved / total
|
| 62 |
+
p1_score = (p1_resolved / p1_total) if p1_total > 0 else 1.0
|
| 63 |
+
failure_penalty = failed / total
|
| 64 |
+
|
| 65 |
+
score = 0.5 * p1_score + 0.3 * resolution_score - 0.2 * failure_penalty
|
| 66 |
+
return max(0.0, min(1.0, score))
|
src/tasks/single_incident.py
CHANGED
|
@@ -40,33 +40,34 @@ class SingleIncidentGrader:
|
|
| 40 |
self.reward_calculator = RewardCalculator()
|
| 41 |
|
| 42 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 43 |
-
"""
|
| 44 |
if not rewards:
|
| 45 |
return 0.0
|
| 46 |
|
| 47 |
-
mean_reward = sum(rewards) / len(rewards)
|
| 48 |
-
|
| 49 |
incident = state.incidents.get("INC-001")
|
| 50 |
if incident is None:
|
| 51 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
u
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
for u in state.units.values()
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
if unit.eta_seconds < 300.0:
|
| 67 |
-
bonus += 0.2
|
| 68 |
-
|
| 69 |
-
# Base per spec: 0.6 + bonuses, but bounded and blended with mean reward.
|
| 70 |
-
target = 0.6 + bonus
|
| 71 |
-
total = 0.5 * mean_reward + 0.5 * target
|
| 72 |
-
return max(0.0, min(1.0, total))
|
|
|
|
| 40 |
self.reward_calculator = RewardCalculator()
|
| 41 |
|
| 42 |
def grade(self, state: State, rewards: list[float]) -> float:
|
| 43 |
+
"""Grade based on: correct unit dispatched, fast response, incident resolved."""
|
| 44 |
if not rewards:
|
| 45 |
return 0.0
|
| 46 |
|
|
|
|
|
|
|
| 47 |
incident = state.incidents.get("INC-001")
|
| 48 |
if incident is None:
|
| 49 |
+
return 0.0
|
| 50 |
+
|
| 51 |
+
score = 0.0
|
| 52 |
+
|
| 53 |
+
# Component 1: Was the incident resolved? (50% weight)
|
| 54 |
+
if incident.status.value == "RESOLVED":
|
| 55 |
+
score += 0.50
|
| 56 |
|
| 57 |
+
# Component 2: Correct unit type dispatched? (30% weight)
|
| 58 |
+
medic_dispatched = any(
|
| 59 |
+
u.unit_type.value == "MEDIC"
|
| 60 |
+
and (
|
| 61 |
+
u.assigned_incident_id == "INC-001"
|
| 62 |
+
or u.status.value in {"ON_SCENE", "DISPATCHED"}
|
| 63 |
+
)
|
| 64 |
for u in state.units.values()
|
| 65 |
+
)
|
| 66 |
+
if medic_dispatched:
|
| 67 |
+
score += 0.30
|
| 68 |
+
|
| 69 |
+
# Component 3: Speed — resolved within first 10 steps (20% weight)
|
| 70 |
+
if incident.status.value == "RESOLVED" and state.step_count <= 10:
|
| 71 |
+
score += 0.20
|
| 72 |
+
|
| 73 |
+
return max(0.0, min(1.0, score))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_inference.py
CHANGED
|
@@ -63,7 +63,7 @@ class TestInferenceFormatCompliance:
|
|
| 63 |
"USE_RANDOM": "true",
|
| 64 |
}
|
| 65 |
_, stdout, _ = self._run_inference_capture(env)
|
| 66 |
-
valid_errors = {"null", "max_steps_exceeded", "illegal_transition"}
|
| 67 |
for line in stdout.split("\n"):
|
| 68 |
if not line.startswith("[STEP]"):
|
| 69 |
continue
|
|
|
|
| 63 |
"USE_RANDOM": "true",
|
| 64 |
}
|
| 65 |
_, stdout, _ = self._run_inference_capture(env)
|
| 66 |
+
valid_errors = {"null", "max_steps_exceeded", "illegal_transition", "step_error"}
|
| 67 |
for line in stdout.split("\n"):
|
| 68 |
if not line.startswith("[STEP]"):
|
| 69 |
continue
|
tests/test_openenv_integration.py
CHANGED
|
@@ -53,6 +53,14 @@ class TestResetEndpoint:
|
|
| 53 |
assert data["result"] == "dispatch center online"
|
| 54 |
assert data["protocol_ok"] is True
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
class TestStepEndpoint:
|
| 58 |
def test_step_requires_reset_first(self) -> None:
|
|
@@ -117,3 +125,19 @@ class TestHealthEndpoint:
|
|
| 117 |
response = c.get("/health")
|
| 118 |
assert response.status_code == 200
|
| 119 |
assert response.json() == {"status": "ok"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
assert data["result"] == "dispatch center online"
|
| 54 |
assert data["protocol_ok"] is True
|
| 55 |
|
| 56 |
+
def test_reset_with_empty_body_returns_200(self) -> None:
|
| 57 |
+
"""Verify prevalidation.sh compatible: POST /reset with {} returns 200."""
|
| 58 |
+
c = TestClient(server_app.app)
|
| 59 |
+
response = c.post("/reset", json={})
|
| 60 |
+
assert response.status_code == 200
|
| 61 |
+
data = response.json()
|
| 62 |
+
assert data["result"] == "dispatch center online"
|
| 63 |
+
|
| 64 |
|
| 65 |
class TestStepEndpoint:
|
| 66 |
def test_step_requires_reset_first(self) -> None:
|
|
|
|
| 125 |
response = c.get("/health")
|
| 126 |
assert response.status_code == 200
|
| 127 |
assert response.json() == {"status": "ok"}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class TestTasksEndpoint:
|
| 131 |
+
def test_tasks_endpoint_returns_four_tasks(self) -> None:
|
| 132 |
+
c = TestClient(server_app.app)
|
| 133 |
+
response = c.get("/tasks")
|
| 134 |
+
assert response.status_code == 200
|
| 135 |
+
tasks = response.json()
|
| 136 |
+
assert len(tasks) == 4
|
| 137 |
+
task_ids = {t["task_id"] for t in tasks}
|
| 138 |
+
assert task_ids == {
|
| 139 |
+
"single_incident",
|
| 140 |
+
"multi_incident",
|
| 141 |
+
"mass_casualty",
|
| 142 |
+
"shift_surge",
|
| 143 |
+
}
|
validate_local.py
CHANGED
|
@@ -40,16 +40,15 @@ def check_inference() -> bool:
|
|
| 40 |
env["API_BASE_URL"] = "https://api.openai.com/v1"
|
| 41 |
env["MODEL_NAME"] = "gpt-4"
|
| 42 |
env["HF_TOKEN"] = "dummy-token-for-local-validation"
|
|
|
|
| 43 |
|
| 44 |
-
print(
|
| 45 |
-
"\nNOTE: Running inference.py in dummy mode - will emit [START]/[STEP]/[END] lines"
|
| 46 |
-
)
|
| 47 |
result = subprocess.run(
|
| 48 |
["uv", "run", "python", "inference.py"],
|
| 49 |
capture_output=True,
|
| 50 |
text=True,
|
| 51 |
env=env,
|
| 52 |
-
timeout=
|
| 53 |
)
|
| 54 |
|
| 55 |
if result.stdout:
|
|
|
|
| 40 |
env["API_BASE_URL"] = "https://api.openai.com/v1"
|
| 41 |
env["MODEL_NAME"] = "gpt-4"
|
| 42 |
env["HF_TOKEN"] = "dummy-token-for-local-validation"
|
| 43 |
+
env["USE_RANDOM"] = "true"
|
| 44 |
|
| 45 |
+
print("\nNOTE: Running inference.py in random-agent mode for local validation")
|
|
|
|
|
|
|
| 46 |
result = subprocess.run(
|
| 47 |
["uv", "run", "python", "inference.py"],
|
| 48 |
capture_output=True,
|
| 49 |
text=True,
|
| 50 |
env=env,
|
| 51 |
+
timeout=300,
|
| 52 |
)
|
| 53 |
|
| 54 |
if result.stdout:
|