Spaces:
Sleeping
Sleeping
Commit ·
775befb
1
Parent(s): e493c92
Submission polish: compliance hardening, baseline matrix, dashboard UX, tests, and docs
Browse files- .gitignore +1 -0
- README.md +26 -6
- inference.py +11 -15
- live_dashboard.html +87 -6
- samplematerial/prevalidation.sh +29 -3
- samplematerial/sampleinference.py +16 -174
- scripts/run_baseline_matrix.py +235 -0
- scripts/run_nemotron_baseline.ps1 +53 -0
- tests/test_baseline_matrix.py +98 -0
- tests/test_openenv_integration.py +29 -0
- uv.lock +0 -0
- validate_local.py +61 -5
.gitignore
CHANGED
|
@@ -28,6 +28,7 @@ htmlcov/
|
|
| 28 |
.sisyphus/notepads/
|
| 29 |
*.log
|
| 30 |
tmp/
|
|
|
|
| 31 |
|
| 32 |
# Do not commit architecture notes
|
| 33 |
architecture.md
|
|
|
|
| 28 |
.sisyphus/notepads/
|
| 29 |
*.log
|
| 30 |
tmp/
|
| 31 |
+
baseline_*_report.json
|
| 32 |
|
| 33 |
# Do not commit architecture notes
|
| 34 |
architecture.md
|
README.md
CHANGED
|
@@ -421,14 +421,31 @@ USE_RANDOM=true \
|
|
| 421 |
uv run python inference.py
|
| 422 |
```
|
| 423 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
| Task | Difficulty | Random Baseline Score |
|
| 425 |
|---|---|---|
|
| 426 |
-
| `single_incident` | Easy | ~0.
|
| 427 |
-
| `multi_incident` | Medium | ~0.
|
| 428 |
-
| `mass_casualty` | Hard | ~0.
|
| 429 |
-
| `shift_surge` | Hard | ~0.
|
| 430 |
|
| 431 |
-
*Scores
|
|
|
|
| 432 |
|
| 433 |
---
|
| 434 |
|
|
@@ -557,10 +574,13 @@ uv run pytest tests/test_inference.py -v
|
|
| 557 |
uv run python validate_local.py
|
| 558 |
|
| 559 |
# OpenEnv spec validation
|
| 560 |
-
|
| 561 |
|
| 562 |
# HF Space validation (requires deployed space)
|
| 563 |
bash samplematerial/prevalidation.sh https://your-space.hf.space .
|
|
|
|
|
|
|
|
|
|
| 564 |
```
|
| 565 |
|
| 566 |
---
|
|
|
|
| 421 |
uv run python inference.py
|
| 422 |
```
|
| 423 |
|
| 424 |
+
Run the baseline matrix (random + Open LLM reruns) and emit a JSON report:
|
| 425 |
+
|
| 426 |
+
```bash
|
| 427 |
+
API_BASE_URL=https://api.openai.com/v1 \
|
| 428 |
+
MODEL_NAME=nvidia/Nemotron-3-Super-49B-v1 \
|
| 429 |
+
OPENAI_API_KEY=your_token \
|
| 430 |
+
uv run python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_nemotron_report.json
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
Windows PowerShell shortcut:
|
| 434 |
+
|
| 435 |
+
```powershell
|
| 436 |
+
$env:OPENAI_API_KEY="your_token"
|
| 437 |
+
powershell -ExecutionPolicy Bypass -File scripts/run_nemotron_baseline.ps1 -RandomRuns 1 -LlmRuns 3
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
| Task | Difficulty | Random Baseline Score |
|
| 441 |
|---|---|---|
|
| 442 |
+
| `single_incident` | Easy | ~0.30 |
|
| 443 |
+
| `multi_incident` | Medium | ~0.70 |
|
| 444 |
+
| `mass_casualty` | Hard | ~0.74 |
|
| 445 |
+
| `shift_surge` | Hard | ~0.56 |
|
| 446 |
|
| 447 |
+
*Scores above are from deterministic random-agent inference with `seed=42`.*
|
| 448 |
+
*For Open LLM evaluation, use Nemotron 3 Super as the primary baseline and report mean/std across reruns.*
|
| 449 |
|
| 450 |
---
|
| 451 |
|
|
|
|
| 574 |
uv run python validate_local.py
|
| 575 |
|
| 576 |
# OpenEnv spec validation
|
| 577 |
+
openenv validate
|
| 578 |
|
| 579 |
# HF Space validation (requires deployed space)
|
| 580 |
bash samplematerial/prevalidation.sh https://your-space.hf.space .
|
| 581 |
+
|
| 582 |
+
# Windows (explicit Git Bash)
|
| 583 |
+
"C:/Program Files/Git/bin/bash.exe" samplematerial/prevalidation.sh https://your-space.hf.space .
|
| 584 |
```
|
| 585 |
|
| 586 |
---
|
inference.py
CHANGED
|
@@ -70,6 +70,7 @@ class LLMAgent:
|
|
| 70 |
self.api_key = api_key
|
| 71 |
self.base_url = base_url.rstrip("/")
|
| 72 |
self.model = model
|
|
|
|
| 73 |
|
| 74 |
# Official OpenAI Python client for OpenAI-compatible endpoints.
|
| 75 |
self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
|
|
@@ -142,7 +143,7 @@ Respond with ONLY the exact action string from the legal actions list. No explan
|
|
| 142 |
return action
|
| 143 |
|
| 144 |
# Fallback to random if LLM response doesn't match
|
| 145 |
-
return
|
| 146 |
|
| 147 |
|
| 148 |
def _format_action(action: Action) -> str:
|
|
@@ -198,11 +199,11 @@ async def run_episode(
|
|
| 198 |
step_count = 0
|
| 199 |
rewards: list[float] = []
|
| 200 |
success = False
|
| 201 |
-
|
| 202 |
|
| 203 |
try:
|
| 204 |
observation = await env.reset()
|
| 205 |
-
|
| 206 |
prev_obs = observation
|
| 207 |
|
| 208 |
while True:
|
|
@@ -230,6 +231,7 @@ async def run_episode(
|
|
| 230 |
obs, reward, done = await env.step(action)
|
| 231 |
prev_obs = obs
|
| 232 |
rewards.append(reward)
|
|
|
|
| 233 |
|
| 234 |
# Terminal conditions: done flag OR any protocol-invalid transition.
|
| 235 |
has_illegal_transition = any(
|
|
@@ -264,7 +266,6 @@ async def run_episode(
|
|
| 264 |
|
| 265 |
# Safety check for runaway episodes
|
| 266 |
if step_count >= 1000:
|
| 267 |
-
error_msg = "max_steps_exceeded"
|
| 268 |
print(
|
| 269 |
f"[STEP] step={step_count} action={action_str} "
|
| 270 |
f"reward={reward_str} done=true error=max_steps_exceeded"
|
|
@@ -273,26 +274,21 @@ async def run_episode(
|
|
| 273 |
break
|
| 274 |
|
| 275 |
except Exception as e:
|
| 276 |
-
error_msg = "step_error" # normalize to a fixed token
|
| 277 |
print(
|
| 278 |
f"[STEP] step={step_count} action={action_str} "
|
| 279 |
-
f"reward=0.00 done=true error=
|
| 280 |
)
|
| 281 |
success = False
|
| 282 |
break
|
| 283 |
|
| 284 |
except Exception as e:
|
| 285 |
-
error_msg = str(e)
|
| 286 |
success = False
|
| 287 |
finally:
|
| 288 |
env.close()
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
step_rewards = rewards
|
| 292 |
-
|
| 293 |
-
total_score = sum(step_rewards) / len(step_rewards)
|
| 294 |
-
else:
|
| 295 |
-
total_score = 0.0
|
| 296 |
total_score = max(0.0, min(1.0, total_score))
|
| 297 |
|
| 298 |
# Format rewards list as comma-separated with 2 decimal places
|
|
@@ -331,12 +327,12 @@ async def main() -> int:
|
|
| 331 |
|
| 332 |
except EnvironmentError as e:
|
| 333 |
# Emit [END] to stdout for failure case
|
| 334 |
-
print("[END] success=false steps=0 score=0.000 rewards=")
|
| 335 |
print(f"ERROR: {e}", file=sys.stderr)
|
| 336 |
return 1
|
| 337 |
except Exception as e:
|
| 338 |
# Emit [END] to stdout for failure case
|
| 339 |
-
print("[END] success=false steps=0 score=0.000 rewards=")
|
| 340 |
print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
|
| 341 |
return 1
|
| 342 |
|
|
|
|
| 70 |
self.api_key = api_key
|
| 71 |
self.base_url = base_url.rstrip("/")
|
| 72 |
self.model = model
|
| 73 |
+
self._rng = random.Random(42)
|
| 74 |
|
| 75 |
# Official OpenAI Python client for OpenAI-compatible endpoints.
|
| 76 |
self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
|
|
|
|
| 143 |
return action
|
| 144 |
|
| 145 |
# Fallback to random if LLM response doesn't match
|
| 146 |
+
return self._rng.choice(legal_actions)
|
| 147 |
|
| 148 |
|
| 149 |
def _format_action(action: Action) -> str:
|
|
|
|
| 199 |
step_count = 0
|
| 200 |
rewards: list[float] = []
|
| 201 |
success = False
|
| 202 |
+
episode_score = 0.0
|
| 203 |
|
| 204 |
try:
|
| 205 |
observation = await env.reset()
|
| 206 |
+
episode_score = float(observation.score)
|
| 207 |
prev_obs = observation
|
| 208 |
|
| 209 |
while True:
|
|
|
|
| 231 |
obs, reward, done = await env.step(action)
|
| 232 |
prev_obs = obs
|
| 233 |
rewards.append(reward)
|
| 234 |
+
episode_score = float(obs.score)
|
| 235 |
|
| 236 |
# Terminal conditions: done flag OR any protocol-invalid transition.
|
| 237 |
has_illegal_transition = any(
|
|
|
|
| 266 |
|
| 267 |
# Safety check for runaway episodes
|
| 268 |
if step_count >= 1000:
|
|
|
|
| 269 |
print(
|
| 270 |
f"[STEP] step={step_count} action={action_str} "
|
| 271 |
f"reward={reward_str} done=true error=max_steps_exceeded"
|
|
|
|
| 274 |
break
|
| 275 |
|
| 276 |
except Exception as e:
|
|
|
|
| 277 |
print(
|
| 278 |
f"[STEP] step={step_count} action={action_str} "
|
| 279 |
+
f"reward=0.00 done=true error=step_error"
|
| 280 |
)
|
| 281 |
success = False
|
| 282 |
break
|
| 283 |
|
| 284 |
except Exception as e:
|
|
|
|
| 285 |
success = False
|
| 286 |
finally:
|
| 287 |
env.close()
|
| 288 |
|
| 289 |
+
# OpenEnv publishes episode score in observation.score; keep this for [END].
|
| 290 |
+
step_rewards = rewards
|
| 291 |
+
total_score = episode_score
|
|
|
|
|
|
|
|
|
|
| 292 |
total_score = max(0.0, min(1.0, total_score))
|
| 293 |
|
| 294 |
# Format rewards list as comma-separated with 2 decimal places
|
|
|
|
| 327 |
|
| 328 |
except EnvironmentError as e:
|
| 329 |
# Emit [END] to stdout for failure case
|
| 330 |
+
print("[END] success=false steps=0 score=0.000 rewards=0.00")
|
| 331 |
print(f"ERROR: {e}", file=sys.stderr)
|
| 332 |
return 1
|
| 333 |
except Exception as e:
|
| 334 |
# Emit [END] to stdout for failure case
|
| 335 |
+
print("[END] success=false steps=0 score=0.000 rewards=0.00")
|
| 336 |
print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
|
| 337 |
return 1
|
| 338 |
|
live_dashboard.html
CHANGED
|
@@ -36,6 +36,18 @@
|
|
| 36 |
.fill { height: 100%; width: 0%; background: #38bdf8; }
|
| 37 |
.kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
|
| 38 |
.status { margin-top: 10px; font-size: 11px; color: #64748b; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
</style>
|
| 40 |
</head>
|
| 41 |
<body>
|
|
@@ -47,7 +59,8 @@
|
|
| 47 |
<div class="pill">Task: <strong id="hdr-task">—</strong></div>
|
| 48 |
<div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
|
| 49 |
<div class="pill">Step: <strong id="hdr-step">—</strong></div>
|
| 50 |
-
<div class="pill">Score: <strong id="hdr-score">—</strong></div>
|
|
|
|
| 51 |
</div>
|
| 52 |
</div>
|
| 53 |
|
|
@@ -77,11 +90,13 @@
|
|
| 77 |
<div class="col">
|
| 78 |
<div class="panel-title">Incidents</div>
|
| 79 |
<div id="incidents"></div>
|
|
|
|
|
|
|
| 80 |
</div>
|
| 81 |
</div>
|
| 82 |
|
| 83 |
<div class="bottom">
|
| 84 |
-
<div class="panel-title">
|
| 85 |
<div class="breakdown">
|
| 86 |
<div>
|
| 87 |
<div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
|
|
@@ -111,6 +126,11 @@
|
|
| 111 |
const API = 'http://localhost:8000';
|
| 112 |
const DASHBOARD_STATE = `${API}/dashboard/state`;
|
| 113 |
const REFRESH_MS = 500;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
const STATUS_COLORS = {
|
| 116 |
AVAILABLE: '#10b981',
|
|
@@ -137,6 +157,15 @@ function setText(id, text) {
|
|
| 137 |
if (el) el.textContent = text;
|
| 138 |
}
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
function renderUnits(state) {
|
| 141 |
const root = document.getElementById('units');
|
| 142 |
root.innerHTML = '';
|
|
@@ -175,11 +204,17 @@ function renderIncidents(state) {
|
|
| 175 |
const sev = String(i.severity || '');
|
| 176 |
const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
|
| 177 |
const units = (i.units_assigned || []).join(', ') || '—';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
const card = document.createElement('div');
|
| 179 |
card.className = 'card';
|
| 180 |
card.innerHTML = `
|
| 181 |
<div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
|
| 182 |
<div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
|
|
|
|
| 183 |
<div class="row"><div class="muted">assigned</div><div>${units}</div></div>
|
| 184 |
<div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
|
| 185 |
`;
|
|
@@ -239,8 +274,10 @@ function renderMap(state) {
|
|
| 239 |
const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
|
| 240 |
const x = Number(i.location_x || 0);
|
| 241 |
const y = Number(i.location_y || 0);
|
|
|
|
|
|
|
| 242 |
svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
|
| 243 |
-
svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = i.incident_id;
|
| 244 |
}
|
| 245 |
|
| 246 |
// units
|
|
@@ -271,8 +308,49 @@ function updateHeader(state) {
|
|
| 271 |
setText('hdr-task', state.task_id || '—');
|
| 272 |
setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
|
| 273 |
setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
|
|
|
|
| 274 |
const cum = state.metadata && state.metadata.cumulative_reward;
|
| 275 |
-
setText('hdr-score', (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
async function tick() {
|
|
@@ -292,9 +370,12 @@ async function tick() {
|
|
| 292 |
renderMap(state);
|
| 293 |
renderIncidents(state);
|
| 294 |
renderBreakdown(state);
|
|
|
|
|
|
|
| 295 |
|
| 296 |
-
const
|
| 297 |
-
|
|
|
|
| 298 |
} catch (e) {
|
| 299 |
status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
|
| 300 |
}
|
|
|
|
| 36 |
.fill { height: 100%; width: 0%; background: #38bdf8; }
|
| 37 |
.kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
|
| 38 |
.status { margin-top: 10px; font-size: 11px; color: #64748b; }
|
| 39 |
+
.history-item { border-bottom: 1px solid #1e293b; padding: 8px 0; font-size: 11px; color: #cbd5e1; }
|
| 40 |
+
.history-item:last-child { border-bottom: 0; }
|
| 41 |
+
.history-step { color: #94a3b8; margin-right: 8px; }
|
| 42 |
+
.history-issues { color: #fbbf24; display: block; margin-top: 4px; }
|
| 43 |
+
|
| 44 |
+
@media (max-width: 1200px) {
|
| 45 |
+
.layout { grid-template-rows: auto auto; height: auto; min-height: calc(100vh - 52px); }
|
| 46 |
+
.main { grid-template-columns: 1fr; }
|
| 47 |
+
.col + .col { border-left: 0; border-top: 1px solid #1e293b; }
|
| 48 |
+
.map { min-height: 260px; }
|
| 49 |
+
.breakdown { grid-template-columns: repeat(2, 1fr); }
|
| 50 |
+
}
|
| 51 |
</style>
|
| 52 |
</head>
|
| 53 |
<body>
|
|
|
|
| 59 |
<div class="pill">Task: <strong id="hdr-task">—</strong></div>
|
| 60 |
<div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
|
| 61 |
<div class="pill">Step: <strong id="hdr-step">—</strong></div>
|
| 62 |
+
<div class="pill">Episode Score: <strong id="hdr-episode-score">—</strong></div>
|
| 63 |
+
<div class="pill">Cumulative Reward: <strong id="hdr-cum-reward">—</strong></div>
|
| 64 |
</div>
|
| 65 |
</div>
|
| 66 |
|
|
|
|
| 90 |
<div class="col">
|
| 91 |
<div class="panel-title">Incidents</div>
|
| 92 |
<div id="incidents"></div>
|
| 93 |
+
<div class="panel-title" style="margin-top:14px;">Recent Events</div>
|
| 94 |
+
<div id="history"></div>
|
| 95 |
</div>
|
| 96 |
</div>
|
| 97 |
|
| 98 |
<div class="bottom">
|
| 99 |
+
<div class="panel-title">Step Reward Breakdown (latest observation)</div>
|
| 100 |
<div class="breakdown">
|
| 101 |
<div>
|
| 102 |
<div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
|
|
|
|
| 126 |
const API = 'http://localhost:8000';
|
| 127 |
const DASHBOARD_STATE = `${API}/dashboard/state`;
|
| 128 |
const REFRESH_MS = 500;
|
| 129 |
+
const HISTORY_LIMIT = 12;
|
| 130 |
+
|
| 131 |
+
let lastHistoryEpisode = null;
|
| 132 |
+
let lastHistoryStep = -1;
|
| 133 |
+
let eventHistory = [];
|
| 134 |
|
| 135 |
const STATUS_COLORS = {
|
| 136 |
AVAILABLE: '#10b981',
|
|
|
|
| 157 |
if (el) el.textContent = text;
|
| 158 |
}
|
| 159 |
|
| 160 |
+
function escapeHtml(value) {
|
| 161 |
+
return String(value)
|
| 162 |
+
.replaceAll('&', '&amp;')
|
| 163 |
+
.replaceAll('<', '&lt;')
|
| 164 |
+
.replaceAll('>', '&gt;')
|
| 165 |
+
.replaceAll('"', '&quot;')
|
| 166 |
+
.replaceAll("'", '&#39;');
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
function renderUnits(state) {
|
| 170 |
const root = document.getElementById('units');
|
| 171 |
root.innerHTML = '';
|
|
|
|
| 204 |
const sev = String(i.severity || '');
|
| 205 |
const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
|
| 206 |
const units = (i.units_assigned || []).join(', ') || '—';
|
| 207 |
+
const survival = Number(i.survival_clock);
|
| 208 |
+
const survivalStr = Number.isFinite(survival) ? `${survival.toFixed(0)}s` : '—';
|
| 209 |
+
const p1ClockRow = sev === 'PRIORITY_1'
|
| 210 |
+
? `<div class="row"><div class="muted">p1 clock</div><div>${survivalStr}</div></div>`
|
| 211 |
+
: '';
|
| 212 |
const card = document.createElement('div');
|
| 213 |
card.className = 'card';
|
| 214 |
card.innerHTML = `
|
| 215 |
<div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
|
| 216 |
<div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
|
| 217 |
+
${p1ClockRow}
|
| 218 |
<div class="row"><div class="muted">assigned</div><div>${units}</div></div>
|
| 219 |
<div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
|
| 220 |
`;
|
|
|
|
| 274 |
const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
|
| 275 |
const x = Number(i.location_x || 0);
|
| 276 |
const y = Number(i.location_y || 0);
|
| 277 |
+
const p1Clock = Number(i.survival_clock);
|
| 278 |
+
const p1Suffix = (sev === 'PRIORITY_1' && Number.isFinite(p1Clock)) ? ` (${p1Clock.toFixed(0)}s)` : '';
|
| 279 |
svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
|
| 280 |
+
svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = `${i.incident_id}${p1Suffix}`;
|
| 281 |
}
|
| 282 |
|
| 283 |
// units
|
|
|
|
| 308 |
setText('hdr-task', state.task_id || '—');
|
| 309 |
setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
|
| 310 |
setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
|
| 311 |
+
const episodeScore = state.metadata && state.metadata.episode_score;
|
| 312 |
const cum = state.metadata && state.metadata.cumulative_reward;
|
| 313 |
+
setText('hdr-episode-score', (episodeScore !== undefined) ? fmt(episodeScore, 3) : '—');
|
| 314 |
+
setText('hdr-cum-reward', (cum !== undefined) ? fmt(cum, 3) : '—');
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
function updateHistory(state) {
|
| 318 |
+
const obs = state.observation;
|
| 319 |
+
if (!obs) return;
|
| 320 |
+
|
| 321 |
+
if (lastHistoryEpisode !== state.episode_id) {
|
| 322 |
+
lastHistoryEpisode = state.episode_id;
|
| 323 |
+
lastHistoryStep = -1;
|
| 324 |
+
eventHistory = [];
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
if (state.step_count === lastHistoryStep) return;
|
| 328 |
+
|
| 329 |
+
const issues = Array.isArray(state.issues) ? state.issues : [];
|
| 330 |
+
eventHistory.unshift({
|
| 331 |
+
step: state.step_count,
|
| 332 |
+
result: obs.result || 'state updated',
|
| 333 |
+
issues,
|
| 334 |
+
});
|
| 335 |
+
eventHistory = eventHistory.slice(0, HISTORY_LIMIT);
|
| 336 |
+
lastHistoryStep = state.step_count;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
function renderHistory() {
|
| 340 |
+
const root = document.getElementById('history');
|
| 341 |
+
if (!root) return;
|
| 342 |
+
|
| 343 |
+
if (eventHistory.length === 0) {
|
| 344 |
+
root.innerHTML = '<div class="muted">No events yet</div>';
|
| 345 |
+
return;
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
root.innerHTML = eventHistory.map((item) => {
|
| 349 |
+
const issueText = item.issues.length > 0
|
| 350 |
+
? `<span class="history-issues">issues: ${escapeHtml(item.issues.join(', '))}</span>`
|
| 351 |
+
: '';
|
| 352 |
+
return `<div class="history-item"><span class="history-step">step ${item.step}</span>${escapeHtml(item.result)}${issueText}</div>`;
|
| 353 |
+
}).join('');
|
| 354 |
}
|
| 355 |
|
| 356 |
async function tick() {
|
|
|
|
| 370 |
renderMap(state);
|
| 371 |
renderIncidents(state);
|
| 372 |
renderBreakdown(state);
|
| 373 |
+
updateHistory(state);
|
| 374 |
+
renderHistory();
|
| 375 |
|
| 376 |
+
const issueList = Array.isArray(state.issues) ? state.issues : [];
|
| 377 |
+
const issuePreview = issueList.length > 0 ? issueList.slice(0, 2).join(', ') : 'none';
|
| 378 |
+
status.textContent = `Connected · issues=${issueList.length} (${issuePreview}) · refresh=${REFRESH_MS}ms`;
|
| 379 |
} catch (e) {
|
| 380 |
status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
|
| 381 |
}
|
samplematerial/prevalidation.sh
CHANGED
|
@@ -62,6 +62,30 @@ portable_mktemp() {
|
|
| 62 |
mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
|
| 63 |
}
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
CLEANUP_FILES=()
|
| 66 |
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 67 |
trap cleanup EXIT
|
|
@@ -157,14 +181,16 @@ fi
|
|
| 157 |
|
| 158 |
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 159 |
|
| 160 |
-
|
|
|
|
| 161 |
fail "openenv command not found"
|
| 162 |
-
hint "Install it: pip install openenv-core"
|
|
|
|
| 163 |
stop_at "Step 3"
|
| 164 |
fi
|
| 165 |
|
| 166 |
VALIDATE_OK=false
|
| 167 |
-
VALIDATE_OUTPUT=$(cd "$REPO_DIR" &&
|
| 168 |
|
| 169 |
if [ "$VALIDATE_OK" = true ]; then
|
| 170 |
pass "openenv validate passed"
|
|
|
|
| 62 |
mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
|
| 63 |
}
|
| 64 |
|
| 65 |
+
resolve_openenv() {
|
| 66 |
+
if command -v openenv &>/dev/null; then
|
| 67 |
+
command -v openenv
|
| 68 |
+
return 0
|
| 69 |
+
fi
|
| 70 |
+
|
| 71 |
+
local candidates=(
|
| 72 |
+
"$REPO_DIR/../.venv/Scripts/openenv.exe"
|
| 73 |
+
"$REPO_DIR/.venv/Scripts/openenv.exe"
|
| 74 |
+
"$REPO_DIR/../.venv/bin/openenv"
|
| 75 |
+
"$REPO_DIR/.venv/bin/openenv"
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
local candidate
|
| 79 |
+
for candidate in "${candidates[@]}"; do
|
| 80 |
+
if [ -x "$candidate" ]; then
|
| 81 |
+
printf "%s\n" "$candidate"
|
| 82 |
+
return 0
|
| 83 |
+
fi
|
| 84 |
+
done
|
| 85 |
+
|
| 86 |
+
return 1
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
CLEANUP_FILES=()
|
| 90 |
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 91 |
trap cleanup EXIT
|
|
|
|
| 181 |
|
| 182 |
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 183 |
|
| 184 |
+
OPENENV_BIN=""
|
| 185 |
+
if ! OPENENV_BIN="$(resolve_openenv)"; then
|
| 186 |
fail "openenv command not found"
|
| 187 |
+
hint "Install it in your active env: pip install openenv-core"
|
| 188 |
+
hint "Or activate your project venv before running this script."
|
| 189 |
stop_at "Step 3"
|
| 190 |
fi
|
| 191 |
|
| 192 |
VALIDATE_OK=false
|
| 193 |
+
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$OPENENV_BIN" validate 2>&1) && VALIDATE_OK=true
|
| 194 |
|
| 195 |
if [ "$VALIDATE_OK" = true ]; then
|
| 196 |
pass "openenv validate passed"
|
samplematerial/sampleinference.py
CHANGED
|
@@ -1,187 +1,29 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Inference Script Example
|
| 3 |
-
===================================
|
| 4 |
-
MANDATORY
|
| 5 |
-
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 6 |
-
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
-
MODEL_NAME The model identifier to use for inference.
|
| 8 |
-
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
-
LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
|
| 10 |
-
method
|
| 11 |
-
|
| 12 |
-
- Defaults are set only for API_BASE_URL and MODEL_NAME
|
| 13 |
-
(and should reflect your active inference setup):
|
| 14 |
-
API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
|
| 15 |
-
MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
|
| 16 |
-
|
| 17 |
-
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 18 |
-
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 19 |
-
|
| 20 |
-
STDOUT FORMAT
|
| 21 |
-
- The script must emit exactly three line types to stdout, in this order:
|
| 22 |
-
|
| 23 |
-
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 24 |
-
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 25 |
-
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
- One [STEP] line per step, immediately after env.step() returns.
|
| 30 |
-
- One [END] line after env.close(), always emitted (even on exception).
|
| 31 |
-
- reward and rewards are formatted to 2 decimal places.
|
| 32 |
-
- done and success are lowercase booleans: true or false.
|
| 33 |
-
- error is the raw last_action_error string, or null if none.
|
| 34 |
-
- All fields on a single line with no newlines within a line.
|
| 35 |
-
|
| 36 |
-
Example:
|
| 37 |
-
[START] task=click-test env=miniwob model=Qwen3-VL-30B
|
| 38 |
-
[STEP] step=1 action=click('123') reward=0.00 done=false error=null
|
| 39 |
-
[STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
|
| 40 |
-
[STEP] step=3 action=click('789') reward=1.00 done=true error=null
|
| 41 |
-
[END] success=true steps=3 rewards=0.00,0.00,1.00
|
| 42 |
"""
|
| 43 |
|
| 44 |
-
import
|
| 45 |
-
import os
|
| 46 |
-
import textwrap
|
| 47 |
-
from typing import List, Optional
|
| 48 |
-
|
| 49 |
-
from openai import OpenAI
|
| 50 |
-
|
| 51 |
-
from my_env_v4 import MyEnvV4Action, MyEnvV4Env
|
| 52 |
-
IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
|
| 53 |
-
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 54 |
-
|
| 55 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 56 |
-
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 57 |
-
TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
|
| 58 |
-
BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
|
| 59 |
-
MAX_STEPS = 8
|
| 60 |
-
TEMPERATURE = 0.7
|
| 61 |
-
MAX_TOKENS = 150
|
| 62 |
-
SUCCESS_SCORE_THRESHOLD = 0.1 # normalized score in [0, 1]
|
| 63 |
-
|
| 64 |
-
# Max possible reward: each token contributes 0.1, across all steps
|
| 65 |
-
_MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
|
| 66 |
-
MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
|
| 67 |
-
|
| 68 |
-
SYSTEM_PROMPT = textwrap.dedent(
|
| 69 |
-
"""
|
| 70 |
-
You are interacting with a simple echo environment.
|
| 71 |
-
Each turn you must send a message. The environment will echo it back.
|
| 72 |
-
Reward is proportional to message length: reward = len(message) * 0.1
|
| 73 |
-
Your goal is to maximize total reward by sending meaningful, substantive messages.
|
| 74 |
-
Reply with exactly one message string — no quotes, no prefixes, just the message text.
|
| 75 |
-
"""
|
| 76 |
-
).strip()
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def log_start(task: str, env: str, model: str) -> None:
|
| 80 |
-
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 84 |
-
error_val = error if error else "null"
|
| 85 |
-
done_val = str(done).lower()
|
| 86 |
-
print(
|
| 87 |
-
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
|
| 88 |
-
flush=True,
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
|
| 98 |
-
history_block = "\n".join(history[-4:]) if history else "None"
|
| 99 |
-
return textwrap.dedent(
|
| 100 |
-
f"""
|
| 101 |
-
Step: {step}
|
| 102 |
-
Last echoed message: {last_echoed!r}
|
| 103 |
-
Last reward: {last_reward:.2f}
|
| 104 |
-
Previous steps:
|
| 105 |
-
{history_block}
|
| 106 |
-
Send your next message.
|
| 107 |
-
"""
|
| 108 |
-
).strip()
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
|
| 112 |
-
user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
|
| 113 |
-
try:
|
| 114 |
-
completion = client.chat.completions.create(
|
| 115 |
-
model=MODEL_NAME,
|
| 116 |
-
messages=[
|
| 117 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 118 |
-
{"role": "user", "content": user_prompt},
|
| 119 |
-
],
|
| 120 |
-
temperature=TEMPERATURE,
|
| 121 |
-
max_tokens=MAX_TOKENS,
|
| 122 |
-
stream=False,
|
| 123 |
-
)
|
| 124 |
-
text = (completion.choices[0].message.content or "").strip()
|
| 125 |
-
return text if text else "hello"
|
| 126 |
-
except Exception as exc:
|
| 127 |
-
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 128 |
-
return "hello"
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
async def main() -> None:
|
| 132 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 133 |
-
|
| 134 |
-
env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
|
| 135 |
-
|
| 136 |
-
history: List[str] = []
|
| 137 |
-
rewards: List[float] = []
|
| 138 |
-
steps_taken = 0
|
| 139 |
-
score = 0.0
|
| 140 |
-
success = False
|
| 141 |
-
|
| 142 |
-
log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
|
| 143 |
-
|
| 144 |
-
try:
|
| 145 |
-
result = await env.reset() # OpenENV.reset()
|
| 146 |
-
last_echoed = result.observation.echoed_message
|
| 147 |
-
last_reward = 0.0
|
| 148 |
-
|
| 149 |
-
for step in range(1, MAX_STEPS + 1):
|
| 150 |
-
if result.done:
|
| 151 |
-
break
|
| 152 |
-
|
| 153 |
-
message = get_model_message(client, step, last_echoed, last_reward, history)
|
| 154 |
-
|
| 155 |
-
result = await env.step(MyEnvV4Action(message=message))
|
| 156 |
-
obs = result.observation
|
| 157 |
-
|
| 158 |
-
reward = result.reward or 0.0
|
| 159 |
-
done = result.done
|
| 160 |
-
error = None
|
| 161 |
|
| 162 |
-
rewards.append(reward)
|
| 163 |
-
steps_taken = step
|
| 164 |
-
last_echoed = obs.echoed_message
|
| 165 |
-
last_reward = reward
|
| 166 |
|
| 167 |
-
|
|
|
|
| 168 |
|
| 169 |
-
history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
| 173 |
|
| 174 |
-
|
| 175 |
-
score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
|
| 176 |
-
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 177 |
|
| 178 |
-
|
| 179 |
-
try:
|
| 180 |
-
await env.close()
|
| 181 |
-
except Exception as e:
|
| 182 |
-
print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
|
| 183 |
-
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 184 |
|
| 185 |
|
| 186 |
if __name__ == "__main__":
|
| 187 |
-
|
|
|
|
| 1 |
+
"""Sample inference launcher for the 911 dispatch project.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
Use this file as a runnable reference from samplematerial.
|
| 4 |
+
For submission, the authoritative script is the root-level inference.py.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
import asyncio
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
def _project_root() -> Path:
    """Return the directory one level above the folder that holds this file."""
    this_file = Path(__file__).resolve()
    return this_file.parents[1]
|
| 16 |
|
|
|
|
| 17 |
|
| 18 |
+
def main() -> int:
    """Make the project root importable, then run the root-level inference entry point.

    Returns:
        Whatever ``asyncio.run`` yields from the coroutine returned by
        ``inference.main()``.
    """
    root_dir = str(_project_root())
    if root_dir not in sys.path:
        sys.path.insert(0, root_dir)

    # Imported late: the root-level module is resolved only after the
    # sys.path adjustment above.
    from inference import main as run_inference

    exit_status = asyncio.run(run_inference())
    return exit_status
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
if __name__ == "__main__":
|
| 29 |
+
raise SystemExit(main())
|
scripts/run_baseline_matrix.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run baseline inference matrix (random + Open LLM) and summarize variance.
|
| 2 |
+
|
| 3 |
+
Usage examples:
|
| 4 |
+
python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 0
|
| 5 |
+
python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_report.json
|
| 6 |
+
|
| 7 |
+
Environment variables:
|
| 8 |
+
API_BASE_URL, MODEL_NAME
|
| 9 |
+
OPENAI_API_KEY or HF_TOKEN (required when --llm-runs > 0)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import statistics
|
| 19 |
+
import subprocess
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from dataclasses import asdict, dataclass
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
START_RE = re.compile(r"^\[START\]\s+task=(\S+)\s+env=(\S+)\s+model=(\S+)$")
|
| 26 |
+
END_RE = re.compile(
|
| 27 |
+
r"^\[END\]\s+success=(true|false)\s+steps=(\d+)\s+score=([0-9]*\.?[0-9]+)\s+rewards=(.*)$"
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class TaskEpisode:
    """Outcome of one task episode, parsed from a [START]/[END] log pair."""

    # Task name taken from the [START] line, or an "unknown-N" placeholder.
    task_id: str
    # True when the [END] line reported success=true.
    success: bool
    # Step count reported on the [END] line.
    steps: int
    # Score reported on the [END] line.
    score: float
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
class RunResult:
    """Captured result of one inference.py subprocess invocation."""

    # Baseline lane label; main() passes "random" or "llm".
    lane: str
    # 1-based position of this run within its lane.
    run_index: int
    # Wall-clock duration of the subprocess, in seconds.
    runtime_seconds: float
    # Episodes parsed from the subprocess stdout.
    tasks: list[TaskEpisode]
    # Exit status of the subprocess.
    return_code: int
    # Stripped stderr text of the subprocess.
    stderr: str
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _project_root() -> Path:
    """Return the directory one level above the folder that holds this script."""
    script_file = Path(__file__).resolve()
    return script_file.parents[1]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _required_var(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or set to an empty string.
    """
    value = os.environ.get(name)
    if value:
        return value
    raise RuntimeError(f"Missing required environment variable: {name}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _extract_task_episodes(stdout: str) -> list[TaskEpisode]:
    """Parse [START]/[END] log lines from *stdout* into TaskEpisode records.

    Each line matching END_RE yields one episode. Its task id is the task
    named by the most recent START_RE line not yet consumed by an earlier
    [END]; when there is none, the id is "unknown-N", where N is the
    episode's 1-based position in the result.
    """
    parsed: list[TaskEpisode] = []
    pending_task: str | None = None

    for raw_line in stdout.splitlines():
        opened = START_RE.match(raw_line)
        if opened is not None:
            pending_task = opened.group(1)
            continue

        closed = END_RE.match(raw_line)
        if closed is None:
            continue

        outcome, step_text, score_text = closed.group(1, 2, 3)
        parsed.append(
            TaskEpisode(
                task_id=pending_task or f"unknown-{len(parsed) + 1}",
                success=outcome == "true",
                steps=int(step_text),
                score=float(score_text),
            )
        )
        # One [START] labels at most one [END].
        pending_task = None

    return parsed
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _run_inference(lane: str, run_index: int, timeout_seconds: int) -> RunResult:
    """Run inference.py once in a subprocess and capture its outcome.

    Args:
        lane: "random" forces USE_RANDOM=true and supplies a dummy
            OPENAI_API_KEY when none is set; any other value sets
            USE_RANDOM=false and requires OPENAI_API_KEY or HF_TOKEN.
        run_index: Position of this run within its lane; stored on the result.
        timeout_seconds: Maximum time the subprocess may run.

    Returns:
        A RunResult holding the episodes parsed from stdout, the exit status
        and the stripped stderr. A run that exceeds the timeout is reported
        as a failed RunResult (return_code 124) carrying whatever episodes
        had been printed so far, instead of raising.

    Raises:
        RuntimeError: for a non-random lane when neither OPENAI_API_KEY nor
            HF_TOKEN is set.
    """

    def _as_text(captured: bytes | str | None) -> str:
        # TimeoutExpired may carry bytes even when text=True was requested,
        # and None when nothing was captured.
        if captured is None:
            return ""
        if isinstance(captured, bytes):
            return captured.decode("utf-8", errors="replace")
        return captured

    env = os.environ.copy()
    env.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    env.setdefault("MODEL_NAME", "baseline-model")

    if lane == "random":
        env["USE_RANDOM"] = "true"
        env.setdefault("OPENAI_API_KEY", "dummy-token")
    else:
        env["USE_RANDOM"] = "false"
        if not (env.get("OPENAI_API_KEY") or env.get("HF_TOKEN")):
            raise RuntimeError(
                "OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
            )

    cmd = [sys.executable, "inference.py"]
    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            cwd=str(_project_root()),
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            env=env,
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired as exc:
        # Record the timeout as a failed run so the remaining runs and the
        # final report are not lost to an uncaught exception.
        runtime = time.monotonic() - started
        partial_stderr = _as_text(exc.stderr).strip()
        note = f"inference.py timed out after {timeout_seconds} seconds"
        return RunResult(
            lane=lane,
            run_index=run_index,
            runtime_seconds=runtime,
            tasks=_extract_task_episodes(_as_text(exc.stdout)),
            return_code=124,
            stderr=f"{note}\n{partial_stderr}".strip(),
        )
    runtime = time.monotonic() - started

    tasks = _extract_task_episodes(proc.stdout)

    return RunResult(
        lane=lane,
        run_index=run_index,
        runtime_seconds=runtime,
        tasks=tasks,
        return_code=proc.returncode,
        stderr=proc.stderr.strip(),
    )
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _summarize(runs: list[RunResult]) -> dict[str, dict[str, float]]:
    """Aggregate episode scores per task across *runs*.

    Returns:
        A mapping, ordered by task id, from task id to its "runs" (sample
        count as a float), "mean", "std" (population standard deviation,
        0.0 for a single sample), "min" and "max". All values except
        "runs" are rounded to six decimals.
    """
    scores_by_task: dict[str, list[float]] = {}
    for run in runs:
        for episode in run.tasks:
            bucket = scores_by_task.get(episode.task_id)
            if bucket is None:
                scores_by_task[episode.task_id] = [episode.score]
            else:
                bucket.append(episode.score)

    return {
        task_id: {
            "runs": float(len(values)),
            "mean": round(statistics.mean(values), 6),
            "std": round(statistics.pstdev(values) if len(values) > 1 else 0.0, 6),
            "min": round(min(values), 6),
            "max": round(max(values), 6),
        }
        for task_id, values in sorted(scores_by_task.items())
    }
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _print_summary(title: str, runs: list[RunResult]) -> None:
    """Print a titled per-task score table plus runtime and failure totals.

    Prints "No runs executed" under the title when *runs* is empty.
    """
    print(f"\n=== {title} ===")
    if len(runs) == 0:
        print("No runs executed")
        return

    for task_id, metrics in _summarize(runs).items():
        run_count = int(metrics["runs"])
        row = (
            f"{task_id:16s} runs={run_count} "
            f"mean={metrics['mean']:.3f} std={metrics['std']:.3f} "
            f"min={metrics['min']:.3f} max={metrics['max']:.3f}"
        )
        print(row)

    elapsed = sum(run.runtime_seconds for run in runs)
    failed_count = sum(1 for run in runs if run.return_code != 0)
    print(f"total_runtime_seconds={elapsed:.2f}")
    print(f"failed_runs={failed_count}")
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _to_jsonable(runs: list[RunResult]) -> list[dict]:
    """Convert *runs* into plain dictionaries suitable for json.dumps.

    Each run becomes the dict produced by ``asdict``, with its "tasks"
    entry holding one ``asdict`` dict per episode.
    """
    return [
        {**asdict(run), "tasks": [asdict(episode) for episode in run.tasks]}
        for run in runs
    ]
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def main() -> int:
    """Run the random and Open LLM baseline lanes and report the results.

    Command-line options:
        --random-runs: number of random-agent runs (default 1).
        --llm-runs: number of Open LLM runs (default 3).
        --timeout-seconds: per-run subprocess timeout (default 1200).
        --output-json: optional path for a JSON report.

    Returns:
        0 when every run exited with status 0; 1 when configuration is
        invalid or at least one run failed.
    """
    parser = argparse.ArgumentParser(description="Run baseline matrix for inference.py")
    parser.add_argument("--random-runs", type=int, default=1)
    parser.add_argument("--llm-runs", type=int, default=3)
    parser.add_argument("--timeout-seconds", type=int, default=1200)
    parser.add_argument("--output-json", type=str, default="")
    args = parser.parse_args()

    os.environ.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    os.environ.setdefault("MODEL_NAME", "nvidia/Nemotron-3-Super-49B-v1")

    try:
        # setdefault does not replace a variable that is set but empty, so
        # these checks can still fail; report that like any other
        # configuration error instead of letting a traceback escape.
        _required_var("API_BASE_URL")
        _required_var("MODEL_NAME")
        # Fail before any run starts: otherwise the random lane would run to
        # completion and its results would be discarded when the LLM lane
        # rejects the missing credentials.
        if args.llm_runs > 0 and not (
            os.environ.get("OPENAI_API_KEY") or os.environ.get("HF_TOKEN")
        ):
            raise RuntimeError(
                "OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
            )
    except RuntimeError as exc:
        print(f"ERROR: {exc}")
        return 1

    random_runs: list[RunResult] = []
    llm_runs: list[RunResult] = []

    try:
        for idx in range(1, args.random_runs + 1):
            print(f"Running random baseline {idx}/{args.random_runs}...")
            random_runs.append(_run_inference("random", idx, args.timeout_seconds))

        for idx in range(1, args.llm_runs + 1):
            print(f"Running Open LLM baseline {idx}/{args.llm_runs}...")
            llm_runs.append(_run_inference("llm", idx, args.timeout_seconds))
    except RuntimeError as exc:
        print(f"ERROR: {exc}")
        return 1

    _print_summary("Random Baseline", random_runs)
    _print_summary("Open LLM Baseline", llm_runs)

    all_runs = random_runs + llm_runs

    if args.output_json:
        report = {
            "api_base_url": os.environ.get("API_BASE_URL", ""),
            "model_name": os.environ.get("MODEL_NAME", ""),
            "random_summary": _summarize(random_runs),
            "llm_summary": _summarize(llm_runs),
            "runs": _to_jsonable(all_runs),
        }
        out_path = Path(args.output_json)
        # Allow report paths inside directories that do not exist yet.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f"Wrote report to {out_path}")

    failures = [r for r in all_runs if r.return_code != 0]
    if failures:
        print("\nOne or more runs failed:")
        for run in failures:
            print(f"- lane={run.lane} run={run.run_index} rc={run.return_code}")
            if run.stderr:
                print(run.stderr)
        return 1

    return 0
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
if __name__ == "__main__":
|
| 235 |
+
raise SystemExit(main())
|
scripts/run_nemotron_baseline.ps1
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runs scripts/run_baseline_matrix.py from the repository root with the
# requested API endpoint and model, writing a JSON report to $OutputJson.
param(
    [int]$RandomRuns = 1,
    [int]$LlmRuns = 3,
    [int]$TimeoutSeconds = 1200,
    # An endpoint/model already exported in the environment is used as the
    # default, so running the script without arguments does not overwrite it.
    [string]$ApiBaseUrl = $(if ($env:API_BASE_URL) { $env:API_BASE_URL } else { "https://api.openai.com/v1" }),
    [string]$ModelName = $(if ($env:MODEL_NAME) { $env:MODEL_NAME } else { "nvidia/Nemotron-3-Super-49B-v1" }),
    [string]$OutputJson = "baseline_nemotron_report.json"
)

$ErrorActionPreference = "Stop"

$repoRoot = Split-Path -Parent $PSScriptRoot
Set-Location $repoRoot

# With ErrorActionPreference=Stop, Write-Error terminates the script here.
if ($LlmRuns -gt 0 -and -not $env:OPENAI_API_KEY -and -not $env:HF_TOKEN) {
    Write-Error "Set OPENAI_API_KEY or HF_TOKEN before running this script."
}

$env:API_BASE_URL = $ApiBaseUrl
$env:MODEL_NAME = $ModelName

Write-Host "Running baseline matrix in $repoRoot"
Write-Host "API_BASE_URL=$($env:API_BASE_URL)"
Write-Host "MODEL_NAME=$($env:MODEL_NAME)"
Write-Host "RandomRuns=$RandomRuns LlmRuns=$LlmRuns TimeoutSeconds=$TimeoutSeconds"

# Prefer a virtualenv interpreter in the repo or its parent directory.
# Both the Windows (Scripts/python.exe) and POSIX (bin/python) layouts are tried.
$parentRoot = Split-Path -Parent $repoRoot
$candidatePython = @(
    (Join-Path $repoRoot ".venv/Scripts/python.exe"),
    (Join-Path $parentRoot ".venv/Scripts/python.exe"),
    (Join-Path $repoRoot ".venv/bin/python"),
    (Join-Path $parentRoot ".venv/bin/python")
)

$python = $null
foreach ($candidate in $candidatePython) {
    if (Test-Path $candidate) {
        $python = $candidate
        break
    }
}
if (-not $python) {
    # Fall back to whatever "python" resolves to on PATH.
    $python = "python"
}

& $python scripts/run_baseline_matrix.py `
    --random-runs $RandomRuns `
    --llm-runs $LlmRuns `
    --timeout-seconds $TimeoutSeconds `
    --output-json $OutputJson

if ($LASTEXITCODE -ne 0) {
    Write-Error "Baseline matrix run failed"
}

Write-Host "Done. Report written to $OutputJson"
|
tests/test_baseline_matrix.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for scripts/run_baseline_matrix.py helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import importlib.util
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
SCRIPT_PATH = Path(__file__).resolve().parents[1] / "scripts" / "run_baseline_matrix.py"
|
| 13 |
+
SPEC = importlib.util.spec_from_file_location("run_baseline_matrix", SCRIPT_PATH)
|
| 14 |
+
assert SPEC and SPEC.loader
|
| 15 |
+
baseline = importlib.util.module_from_spec(SPEC)
|
| 16 |
+
sys.modules[SPEC.name] = baseline
|
| 17 |
+
SPEC.loader.exec_module(baseline)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_extract_task_episodes_parses_start_end_pairs() -> None:
    """Two [START]/[END] pairs yield two episodes labelled with their tasks."""
    log_lines = [
        "[START] task=single_incident env=citywide-dispatch-supervisor model=test-model",
        "[STEP] step=1 action=WAIT reward=0.00 done=false error=null",
        "[END] success=true steps=20 score=0.300 rewards=0.00,0.10",
        "[START] task=multi_incident env=citywide-dispatch-supervisor model=test-model",
        "[END] success=true steps=40 score=0.700 rewards=0.10,0.20",
    ]

    episodes = baseline._extract_task_episodes("\n".join(log_lines))

    assert len(episodes) == 2
    first, second = episodes

    assert first.task_id == "single_incident"
    assert first.success is True
    assert first.steps == 20
    assert first.score == pytest.approx(0.3)

    assert second.task_id == "multi_incident"
    assert second.steps == 40
    assert second.score == pytest.approx(0.7)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_extract_task_episodes_falls_back_to_unknown_task() -> None:
    """An [END] line with no preceding [START] gets a placeholder task id."""
    orphan_end = "[END] success=false steps=0 score=0.000 rewards=0.00"

    episodes = baseline._extract_task_episodes(orphan_end)

    assert len(episodes) == 1
    only = episodes[0]
    assert only.task_id == "unknown-1"
    assert only.success is False
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_summarize_computes_mean_and_std() -> None:
    """Scores 0.2 and 0.4 for one task summarize to mean 0.3 and std 0.1."""
    scored_runs = [(1, 1.0, 0.2), (2, 1.1, 0.4)]
    runs = [
        baseline.RunResult(
            lane="random",
            run_index=index,
            runtime_seconds=runtime,
            tasks=[baseline.TaskEpisode("single_incident", True, 20, score)],
            return_code=0,
            stderr="",
        )
        for index, runtime, score in scored_runs
    ]

    stats = baseline._summarize(runs)["single_incident"]

    assert stats["runs"] == 2.0
    assert stats["mean"] == pytest.approx(0.3)
    assert stats["std"] == pytest.approx(0.1)
    assert stats["min"] == pytest.approx(0.2)
    assert stats["max"] == pytest.approx(0.4)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_to_jsonable_serializes_runs() -> None:
    """A run and its nested episode are exposed as plain dictionaries."""
    episode = baseline.TaskEpisode("mass_casualty", True, 59, 0.742)
    run = baseline.RunResult(
        lane="llm",
        run_index=1,
        runtime_seconds=3.2,
        tasks=[episode],
        return_code=0,
        stderr="",
    )

    payload = baseline._to_jsonable([run])

    serialized_run = payload[0]
    serialized_task = serialized_run["tasks"][0]
    assert serialized_run["lane"] == "llm"
    assert serialized_task["task_id"] == "mass_casualty"
    assert serialized_task["score"] == pytest.approx(0.742)
|
tests/test_openenv_integration.py
CHANGED
|
@@ -141,3 +141,32 @@ class TestTasksEndpoint:
|
|
| 141 |
"mass_casualty",
|
| 142 |
"shift_surge",
|
| 143 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
"mass_casualty",
|
| 142 |
"shift_surge",
|
| 143 |
}
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class TestDashboardEndpoint:
    """Checks for the GET /dashboard/state endpoint."""

    def test_dashboard_state_before_reset_returns_valid_shape(self) -> None:
        """Without a prior reset, the endpoint returns an empty, well-typed state."""
        client = TestClient(server_app.app)

        response = client.get("/dashboard/state")
        assert response.status_code == 200

        state = response.json()
        assert state["task_id"] == "none"
        assert state["step_count"] == 0
        assert isinstance(state["units"], dict)
        assert isinstance(state["incidents"], dict)
        assert isinstance(state["legal_actions"], list)
        assert isinstance(state["issues"], list)
        assert state["observation"] is None

    def test_dashboard_state_after_reset_exposes_legal_actions(self) -> None:
        """After /reset, the state names the task and carries an observation."""
        client = TestClient(server_app.app)

        reset_payload = {"task_id": "single_incident", "seed": 42}
        reset_response = client.post("/reset", json=reset_payload)
        assert reset_response.status_code == 200

        response = client.get("/dashboard/state")
        assert response.status_code == 200

        state = response.json()
        assert state["task_id"] == "single_incident"
        assert isinstance(state["legal_actions"], list)
        assert state["observation"] is not None
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validate_local.py
CHANGED
|
@@ -4,7 +4,9 @@
|
|
| 4 |
from __future__ import annotations
|
| 5 |
|
| 6 |
import subprocess
|
|
|
|
| 7 |
import sys
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def run_command(
|
|
@@ -14,7 +16,18 @@ def run_command(
|
|
| 14 |
print(f"CHECK: {description}")
|
| 15 |
print(f"CMD: {' '.join(cmd)}")
|
| 16 |
print(f"{'=' * 60}")
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if result.stdout:
|
| 19 |
print(result.stdout)
|
| 20 |
if result.stderr:
|
|
@@ -26,10 +39,33 @@ def run_command(
|
|
| 26 |
return result
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def check_pytest() -> bool:
|
| 30 |
-
result = run_command(
|
| 31 |
-
["uv", "run", "python", "-m", "pytest", "tests/", "-q"], "All tests pass"
|
| 32 |
-
)
|
| 33 |
return result.returncode == 0
|
| 34 |
|
| 35 |
|
|
@@ -44,9 +80,11 @@ def check_inference() -> bool:
|
|
| 44 |
|
| 45 |
print("\nNOTE: Running inference.py in random-agent mode for local validation")
|
| 46 |
result = subprocess.run(
|
| 47 |
-
|
| 48 |
capture_output=True,
|
| 49 |
text=True,
|
|
|
|
|
|
|
| 50 |
env=env,
|
| 51 |
timeout=300,
|
| 52 |
)
|
|
@@ -68,6 +106,11 @@ def check_inference() -> bool:
|
|
| 68 |
|
| 69 |
|
| 70 |
def check_docker_build() -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
result = run_command(
|
| 72 |
["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
|
| 73 |
"Docker build succeeds",
|
|
@@ -76,6 +119,18 @@ def check_docker_build() -> bool:
|
|
| 76 |
return result.returncode == 0
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def check_benchmark_scores() -> bool:
|
| 80 |
from src.benchmark import list_tasks, run_task
|
| 81 |
|
|
@@ -109,6 +164,7 @@ def main() -> int:
|
|
| 109 |
("pytest", check_pytest),
|
| 110 |
("inference", check_inference),
|
| 111 |
("docker_build", check_docker_build),
|
|
|
|
| 112 |
("benchmark_scores", check_benchmark_scores),
|
| 113 |
]
|
| 114 |
|
|
|
|
| 4 |
from __future__ import annotations
|
| 5 |
|
| 6 |
import subprocess
|
| 7 |
+
import shutil
|
| 8 |
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
|
| 11 |
|
| 12 |
def run_command(
|
|
|
|
| 16 |
print(f"CHECK: {description}")
|
| 17 |
print(f"CMD: {' '.join(cmd)}")
|
| 18 |
print(f"{'=' * 60}")
|
| 19 |
+
try:
|
| 20 |
+
result = subprocess.run(
|
| 21 |
+
cmd,
|
| 22 |
+
capture_output=True,
|
| 23 |
+
text=True,
|
| 24 |
+
encoding="utf-8",
|
| 25 |
+
errors="replace",
|
| 26 |
+
)
|
| 27 |
+
except FileNotFoundError as exc:
|
| 28 |
+
print(f"FAILED: {description}")
|
| 29 |
+
print(f"ERROR: command not found: {cmd[0]}")
|
| 30 |
+
return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
|
| 31 |
if result.stdout:
|
| 32 |
print(result.stdout)
|
| 33 |
if result.stderr:
|
|
|
|
| 39 |
return result
|
| 40 |
|
| 41 |
|
| 42 |
+
def _tool_path(name: str) -> str | None:
    """Resolve a tool's path from PATH or the current interpreter's directory.

    Args:
        name: Executable name without extension, e.g. "uv".

    Returns:
        The path reported by ``shutil.which`` when the tool is on PATH;
        otherwise the path of a file called *name* or *name*.exe located
        beside ``sys.executable``; otherwise None.
    """
    found = shutil.which(name)
    if found:
        return found

    # Deliberately not resolve(): inside a virtualenv the interpreter is
    # commonly a symlink to the base Python, and following it would search
    # the base installation's directory instead of the virtualenv's.
    scripts_dir = Path(sys.executable).absolute().parent
    for candidate in (scripts_dir / name, scripts_dir / f"{name}.exe"):
        # is_file() so a directory that happens to share the name is ignored.
        if candidate.is_file():
            return str(candidate)
    return None
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _python_cmd(*args: str) -> list[str]:
    """Build a Python command line for *args*, going through ``uv run`` when uv is found.

    Falls back to the current interpreter when uv cannot be located.
    """
    uv_path = _tool_path("uv")
    launcher = [uv_path, "run", "python"] if uv_path else [sys.executable]
    return launcher + list(args)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
def check_pytest() -> bool:
    """Run pytest over tests/ and report whether it exited with status 0."""
    pytest_cmd = _python_cmd("-m", "pytest", "tests/", "-q")
    outcome = run_command(pytest_cmd, "All tests pass")
    return outcome.returncode == 0
|
| 70 |
|
| 71 |
|
|
|
|
| 80 |
|
| 81 |
print("\nNOTE: Running inference.py in random-agent mode for local validation")
|
| 82 |
result = subprocess.run(
|
| 83 |
+
_python_cmd("inference.py"),
|
| 84 |
capture_output=True,
|
| 85 |
text=True,
|
| 86 |
+
encoding="utf-8",
|
| 87 |
+
errors="replace",
|
| 88 |
env=env,
|
| 89 |
timeout=300,
|
| 90 |
)
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
def check_docker_build() -> bool:
|
| 109 |
+
if not shutil.which("docker"):
|
| 110 |
+
print("FAILED: Docker build succeeds")
|
| 111 |
+
print("ERROR: docker command not found")
|
| 112 |
+
return False
|
| 113 |
+
|
| 114 |
result = run_command(
|
| 115 |
["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
|
| 116 |
"Docker build succeeds",
|
|
|
|
| 119 |
return result.returncode == 0
|
| 120 |
|
| 121 |
|
| 122 |
+
def check_openenv_validate() -> bool:
    """Run ``openenv validate`` and report whether it exited with status 0.

    Prints a failure notice with an install hint and returns False when the
    openenv executable cannot be located.
    """
    openenv = _tool_path("openenv")
    if openenv:
        outcome = run_command(
            [openenv, "validate"], "openenv validate passes", check=False
        )
        return outcome.returncode == 0

    for notice in (
        "FAILED: openenv validate passes",
        "ERROR: openenv command not found",
        "HINT: Install with: pip install openenv-core",
    ):
        print(notice)
    return False
|
| 132 |
+
|
| 133 |
+
|
| 134 |
def check_benchmark_scores() -> bool:
|
| 135 |
from src.benchmark import list_tasks, run_task
|
| 136 |
|
|
|
|
| 164 |
("pytest", check_pytest),
|
| 165 |
("inference", check_inference),
|
| 166 |
("docker_build", check_docker_build),
|
| 167 |
+
("openenv_validate", check_openenv_validate),
|
| 168 |
("benchmark_scores", check_benchmark_scores),
|
| 169 |
]
|
| 170 |
|