garvitsachdeva committed on
Commit
775befb
·
1 Parent(s): e493c92

Submission polish: compliance hardening, baseline matrix, dashboard UX, tests, and docs

Browse files
.gitignore CHANGED
@@ -28,6 +28,7 @@ htmlcov/
28
  .sisyphus/notepads/
29
  *.log
30
  tmp/
 
31
 
32
  # Do not commit architecture notes
33
  architecture.md
 
28
  .sisyphus/notepads/
29
  *.log
30
  tmp/
31
+ baseline_*_report.json
32
 
33
  # Do not commit architecture notes
34
  architecture.md
README.md CHANGED
@@ -421,14 +421,31 @@ USE_RANDOM=true \
421
  uv run python inference.py
422
  ```
423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  | Task | Difficulty | Random Baseline Score |
425
  |---|---|---|
426
- | `single_incident` | Easy | ~0.55 |
427
- | `multi_incident` | Medium | ~0.48 |
428
- | `mass_casualty` | Hard | ~0.32 |
429
- | `shift_surge` | Hard | ~0.38 |
430
 
431
- *Scores use `seed=42` for reproducibility. Variance is low across runs due to deterministic state machine.*
 
432
 
433
  ---
434
 
@@ -557,10 +574,13 @@ uv run pytest tests/test_inference.py -v
557
  uv run python validate_local.py
558
 
559
  # OpenEnv spec validation
560
- uv run openenv validate
561
 
562
  # HF Space validation (requires deployed space)
563
  bash samplematerial/prevalidation.sh https://your-space.hf.space .
 
 
 
564
  ```
565
 
566
  ---
 
421
  uv run python inference.py
422
  ```
423
 
424
+ Run the baseline matrix (random + Open LLM reruns) and emit a JSON report:
425
+
426
+ ```bash
427
+ API_BASE_URL=https://api.openai.com/v1 \
428
+ MODEL_NAME=nvidia/Nemotron-3-Super-49B-v1 \
429
+ OPENAI_API_KEY=your_token \
430
+ uv run python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_nemotron_report.json
431
+ ```
432
+
433
+ Windows PowerShell shortcut:
434
+
435
+ ```powershell
436
+ $env:OPENAI_API_KEY="your_token"
437
+ powershell -ExecutionPolicy Bypass -File scripts/run_nemotron_baseline.ps1 -RandomRuns 1 -LlmRuns 3
438
+ ```
439
+
440
  | Task | Difficulty | Random Baseline Score |
441
  |---|---|---|
442
+ | `single_incident` | Easy | ~0.30 |
443
+ | `multi_incident` | Medium | ~0.70 |
444
+ | `mass_casualty` | Hard | ~0.74 |
445
+ | `shift_surge` | Hard | ~0.56 |
446
 
447
+ *Scores above are from deterministic random-agent inference with `seed=42`.*
448
+ *For Open LLM evaluation, use Nemotron 3 Super as the primary baseline and report mean/std across reruns.*
449
 
450
  ---
451
 
 
574
  uv run python validate_local.py
575
 
576
  # OpenEnv spec validation
577
+ openenv validate
578
 
579
  # HF Space validation (requires deployed space)
580
  bash samplematerial/prevalidation.sh https://your-space.hf.space .
581
+
582
+ # Windows (explicit Git Bash)
583
+ "C:/Program Files/Git/bin/bash.exe" samplematerial/prevalidation.sh https://your-space.hf.space .
584
  ```
585
 
586
  ---
inference.py CHANGED
@@ -70,6 +70,7 @@ class LLMAgent:
70
  self.api_key = api_key
71
  self.base_url = base_url.rstrip("/")
72
  self.model = model
 
73
 
74
  # Official OpenAI Python client for OpenAI-compatible endpoints.
75
  self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
@@ -142,7 +143,7 @@ Respond with ONLY the exact action string from the legal actions list. No explan
142
  return action
143
 
144
  # Fallback to random if LLM response doesn't match
145
- return random.choice(legal_actions)
146
 
147
 
148
  def _format_action(action: Action) -> str:
@@ -198,11 +199,11 @@ async def run_episode(
198
  step_count = 0
199
  rewards: list[float] = []
200
  success = False
201
- error_msg: str | None = None
202
 
203
  try:
204
  observation = await env.reset()
205
- rewards.append(observation.score)
206
  prev_obs = observation
207
 
208
  while True:
@@ -230,6 +231,7 @@ async def run_episode(
230
  obs, reward, done = await env.step(action)
231
  prev_obs = obs
232
  rewards.append(reward)
 
233
 
234
  # Terminal conditions: done flag OR any protocol-invalid transition.
235
  has_illegal_transition = any(
@@ -264,7 +266,6 @@ async def run_episode(
264
 
265
  # Safety check for runaway episodes
266
  if step_count >= 1000:
267
- error_msg = "max_steps_exceeded"
268
  print(
269
  f"[STEP] step={step_count} action={action_str} "
270
  f"reward={reward_str} done=true error=max_steps_exceeded"
@@ -273,26 +274,21 @@ async def run_episode(
273
  break
274
 
275
  except Exception as e:
276
- error_msg = "step_error" # normalize to a fixed token
277
  print(
278
  f"[STEP] step={step_count} action={action_str} "
279
- f"reward=0.00 done=true error={error_msg}"
280
  )
281
  success = False
282
  break
283
 
284
  except Exception as e:
285
- error_msg = str(e)
286
  success = False
287
  finally:
288
  env.close()
289
 
290
- # Separate reset reward from step rewards
291
- step_rewards = rewards[1:]
292
- if step_rewards:
293
- total_score = sum(step_rewards) / len(step_rewards)
294
- else:
295
- total_score = 0.0
296
  total_score = max(0.0, min(1.0, total_score))
297
 
298
  # Format rewards list as comma-separated with 2 decimal places
@@ -331,12 +327,12 @@ async def main() -> int:
331
 
332
  except EnvironmentError as e:
333
  # Emit [END] to stdout for failure case
334
- print("[END] success=false steps=0 score=0.000 rewards=")
335
  print(f"ERROR: {e}", file=sys.stderr)
336
  return 1
337
  except Exception as e:
338
  # Emit [END] to stdout for failure case
339
- print("[END] success=false steps=0 score=0.000 rewards=")
340
  print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
341
  return 1
342
 
 
70
  self.api_key = api_key
71
  self.base_url = base_url.rstrip("/")
72
  self.model = model
73
+ self._rng = random.Random(42)
74
 
75
  # Official OpenAI Python client for OpenAI-compatible endpoints.
76
  self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
 
143
  return action
144
 
145
  # Fallback to random if LLM response doesn't match
146
+ return self._rng.choice(legal_actions)
147
 
148
 
149
  def _format_action(action: Action) -> str:
 
199
  step_count = 0
200
  rewards: list[float] = []
201
  success = False
202
+ episode_score = 0.0
203
 
204
  try:
205
  observation = await env.reset()
206
+ episode_score = float(observation.score)
207
  prev_obs = observation
208
 
209
  while True:
 
231
  obs, reward, done = await env.step(action)
232
  prev_obs = obs
233
  rewards.append(reward)
234
+ episode_score = float(obs.score)
235
 
236
  # Terminal conditions: done flag OR any protocol-invalid transition.
237
  has_illegal_transition = any(
 
266
 
267
  # Safety check for runaway episodes
268
  if step_count >= 1000:
 
269
  print(
270
  f"[STEP] step={step_count} action={action_str} "
271
  f"reward={reward_str} done=true error=max_steps_exceeded"
 
274
  break
275
 
276
  except Exception as e:
 
277
  print(
278
  f"[STEP] step={step_count} action={action_str} "
279
+ f"reward=0.00 done=true error=step_error"
280
  )
281
  success = False
282
  break
283
 
284
  except Exception as e:
 
285
  success = False
286
  finally:
287
  env.close()
288
 
289
+ # OpenEnv publishes episode score in observation.score; keep this for [END].
290
+ step_rewards = rewards
291
+ total_score = episode_score
 
 
 
292
  total_score = max(0.0, min(1.0, total_score))
293
 
294
  # Format rewards list as comma-separated with 2 decimal places
 
327
 
328
  except EnvironmentError as e:
329
  # Emit [END] to stdout for failure case
330
+ print("[END] success=false steps=0 score=0.000 rewards=0.00")
331
  print(f"ERROR: {e}", file=sys.stderr)
332
  return 1
333
  except Exception as e:
334
  # Emit [END] to stdout for failure case
335
+ print("[END] success=false steps=0 score=0.000 rewards=0.00")
336
  print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
337
  return 1
338
 
live_dashboard.html CHANGED
@@ -36,6 +36,18 @@
36
  .fill { height: 100%; width: 0%; background: #38bdf8; }
37
  .kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
38
  .status { margin-top: 10px; font-size: 11px; color: #64748b; }
 
 
 
 
 
 
 
 
 
 
 
 
39
  </style>
40
  </head>
41
  <body>
@@ -47,7 +59,8 @@
47
  <div class="pill">Task: <strong id="hdr-task">—</strong></div>
48
  <div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
49
  <div class="pill">Step: <strong id="hdr-step">—</strong></div>
50
- <div class="pill">Score: <strong id="hdr-score">—</strong></div>
 
51
  </div>
52
  </div>
53
 
@@ -77,11 +90,13 @@
77
  <div class="col">
78
  <div class="panel-title">Incidents</div>
79
  <div id="incidents"></div>
 
 
80
  </div>
81
  </div>
82
 
83
  <div class="bottom">
84
- <div class="panel-title">Episode Score Breakdown</div>
85
  <div class="breakdown">
86
  <div>
87
  <div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
@@ -111,6 +126,11 @@
111
  const API = 'http://localhost:8000';
112
  const DASHBOARD_STATE = `${API}/dashboard/state`;
113
  const REFRESH_MS = 500;
 
 
 
 
 
114
 
115
  const STATUS_COLORS = {
116
  AVAILABLE: '#10b981',
@@ -137,6 +157,15 @@ function setText(id, text) {
137
  if (el) el.textContent = text;
138
  }
139
 
 
 
 
 
 
 
 
 
 
140
  function renderUnits(state) {
141
  const root = document.getElementById('units');
142
  root.innerHTML = '';
@@ -175,11 +204,17 @@ function renderIncidents(state) {
175
  const sev = String(i.severity || '');
176
  const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
177
  const units = (i.units_assigned || []).join(', ') || '—';
 
 
 
 
 
178
  const card = document.createElement('div');
179
  card.className = 'card';
180
  card.innerHTML = `
181
  <div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
182
  <div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
 
183
  <div class="row"><div class="muted">assigned</div><div>${units}</div></div>
184
  <div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
185
  `;
@@ -239,8 +274,10 @@ function renderMap(state) {
239
  const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
240
  const x = Number(i.location_x || 0);
241
  const y = Number(i.location_y || 0);
 
 
242
  svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
243
- svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = i.incident_id;
244
  }
245
 
246
  // units
@@ -271,8 +308,49 @@ function updateHeader(state) {
271
  setText('hdr-task', state.task_id || '—');
272
  setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
273
  setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
 
274
  const cum = state.metadata && state.metadata.cumulative_reward;
275
- setText('hdr-score', (cum !== undefined) ? fmt(cum, 3) : '—');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  }
277
 
278
  async function tick() {
@@ -292,9 +370,12 @@ async function tick() {
292
  renderMap(state);
293
  renderIncidents(state);
294
  renderBreakdown(state);
 
 
295
 
296
- const issues = Array.isArray(state.issues) ? state.issues.length : 0;
297
- status.textContent = `Connected · issues=${issues} · refresh=${REFRESH_MS}ms`;
 
298
  } catch (e) {
299
  status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
300
  }
 
36
  .fill { height: 100%; width: 0%; background: #38bdf8; }
37
  .kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
38
  .status { margin-top: 10px; font-size: 11px; color: #64748b; }
39
+ .history-item { border-bottom: 1px solid #1e293b; padding: 8px 0; font-size: 11px; color: #cbd5e1; }
40
+ .history-item:last-child { border-bottom: 0; }
41
+ .history-step { color: #94a3b8; margin-right: 8px; }
42
+ .history-issues { color: #fbbf24; display: block; margin-top: 4px; }
43
+
44
+ @media (max-width: 1200px) {
45
+ .layout { grid-template-rows: auto auto; height: auto; min-height: calc(100vh - 52px); }
46
+ .main { grid-template-columns: 1fr; }
47
+ .col + .col { border-left: 0; border-top: 1px solid #1e293b; }
48
+ .map { min-height: 260px; }
49
+ .breakdown { grid-template-columns: repeat(2, 1fr); }
50
+ }
51
  </style>
52
  </head>
53
  <body>
 
59
  <div class="pill">Task: <strong id="hdr-task">—</strong></div>
60
  <div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
61
  <div class="pill">Step: <strong id="hdr-step">—</strong></div>
62
+ <div class="pill">Episode Score: <strong id="hdr-episode-score">—</strong></div>
63
+ <div class="pill">Cumulative Reward: <strong id="hdr-cum-reward">—</strong></div>
64
  </div>
65
  </div>
66
 
 
90
  <div class="col">
91
  <div class="panel-title">Incidents</div>
92
  <div id="incidents"></div>
93
+ <div class="panel-title" style="margin-top:14px;">Recent Events</div>
94
+ <div id="history"></div>
95
  </div>
96
  </div>
97
 
98
  <div class="bottom">
99
+ <div class="panel-title">Step Reward Breakdown (latest observation)</div>
100
  <div class="breakdown">
101
  <div>
102
  <div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
 
126
  const API = 'http://localhost:8000';
127
  const DASHBOARD_STATE = `${API}/dashboard/state`;
128
  const REFRESH_MS = 500;
129
+ const HISTORY_LIMIT = 12;
130
+
131
+ let lastHistoryEpisode = null;
132
+ let lastHistoryStep = -1;
133
+ let eventHistory = [];
134
 
135
  const STATUS_COLORS = {
136
  AVAILABLE: '#10b981',
 
157
  if (el) el.textContent = text;
158
  }
159
 
160
// Convert any value to a string and replace the five HTML-significant
// characters (& < > " ') with their entity forms so the result can be placed
// inside markup assigned to innerHTML.
function escapeHtml(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  // One pass over the text: each special character maps to exactly one entity.
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}
168
+
169
  function renderUnits(state) {
170
  const root = document.getElementById('units');
171
  root.innerHTML = '';
 
204
  const sev = String(i.severity || '');
205
  const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
206
  const units = (i.units_assigned || []).join(', ') || '—';
207
+ const survival = Number(i.survival_clock);
208
+ const survivalStr = Number.isFinite(survival) ? `${survival.toFixed(0)}s` : '—';
209
+ const p1ClockRow = sev === 'PRIORITY_1'
210
+ ? `<div class="row"><div class="muted">p1 clock</div><div>${survivalStr}</div></div>`
211
+ : '';
212
  const card = document.createElement('div');
213
  card.className = 'card';
214
  card.innerHTML = `
215
  <div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
216
  <div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
217
+ ${p1ClockRow}
218
  <div class="row"><div class="muted">assigned</div><div>${units}</div></div>
219
  <div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
220
  `;
 
274
  const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
275
  const x = Number(i.location_x || 0);
276
  const y = Number(i.location_y || 0);
277
+ const p1Clock = Number(i.survival_clock);
278
+ const p1Suffix = (sev === 'PRIORITY_1' && Number.isFinite(p1Clock)) ? ` (${p1Clock.toFixed(0)}s)` : '';
279
  svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
280
+ svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = `${i.incident_id}${p1Suffix}`;
281
  }
282
 
283
  // units
 
308
  setText('hdr-task', state.task_id || '—');
309
  setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
310
  setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
311
+ const episodeScore = state.metadata && state.metadata.episode_score;
312
  const cum = state.metadata && state.metadata.cumulative_reward;
313
+ setText('hdr-episode-score', (episodeScore !== undefined) ? fmt(episodeScore, 3) : '—');
314
+ setText('hdr-cum-reward', (cum !== undefined) ? fmt(cum, 3) : '—');
315
+ }
316
+
317
// Record one history entry per new step of the current episode.
// - Does nothing when the state carries no observation.
// - Clears the history whenever the episode id changes.
// - Skips polls whose step_count was already recorded.
// The newest entry is kept first and the list is capped at HISTORY_LIMIT.
function updateHistory(state) {
  const { observation, episode_id: episodeId, step_count: stepCount } = state;
  if (!observation) return;

  const episodeChanged = lastHistoryEpisode !== episodeId;
  if (episodeChanged) {
    lastHistoryEpisode = episodeId;
    lastHistoryStep = -1;
    eventHistory = [];
  }

  // Same step as the previous poll: nothing new to record.
  if (stepCount === lastHistoryStep) return;

  const entry = {
    step: stepCount,
    result: observation.result || 'state updated',
    issues: Array.isArray(state.issues) ? state.issues : [],
  };
  eventHistory = [entry, ...eventHistory].slice(0, HISTORY_LIMIT);
  lastHistoryStep = stepCount;
}
338
+
339
// Render the recorded event history into the #history element.
// Shows a placeholder when there are no events. Every value that originates
// from the polled dashboard state is passed through escapeHtml before being
// placed into innerHTML.
function renderHistory() {
  const root = document.getElementById('history');
  if (!root) return;

  if (eventHistory.length === 0) {
    root.innerHTML = '<div class="muted">No events yet</div>';
    return;
  }

  root.innerHTML = eventHistory.map((item) => {
    const issueText = item.issues.length > 0
      ? `<span class="history-issues">issues: ${escapeHtml(item.issues.join(', '))}</span>`
      : '';
    // The step value comes from the same payload as result/issues, so it is
    // escaped as well rather than interpolated raw into the markup.
    return `<div class="history-item"><span class="history-step">step ${escapeHtml(item.step)}</span>${escapeHtml(item.result)}${issueText}</div>`;
  }).join('');
}
355
 
356
  async function tick() {
 
370
  renderMap(state);
371
  renderIncidents(state);
372
  renderBreakdown(state);
373
+ updateHistory(state);
374
+ renderHistory();
375
 
376
+ const issueList = Array.isArray(state.issues) ? state.issues : [];
377
+ const issuePreview = issueList.length > 0 ? issueList.slice(0, 2).join(', ') : 'none';
378
+ status.textContent = `Connected · issues=${issueList.length} (${issuePreview}) · refresh=${REFRESH_MS}ms`;
379
  } catch (e) {
380
  status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
381
  }
samplematerial/prevalidation.sh CHANGED
@@ -62,6 +62,30 @@ portable_mktemp() {
62
  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
  }
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  CLEANUP_FILES=()
66
  cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
  trap cleanup EXIT
@@ -157,14 +181,16 @@ fi
157
 
158
  log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
 
160
- if ! command -v openenv &>/dev/null; then
 
161
  fail "openenv command not found"
162
- hint "Install it: pip install openenv-core"
 
163
  stop_at "Step 3"
164
  fi
165
 
166
  VALIDATE_OK=false
167
- VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
 
169
  if [ "$VALIDATE_OK" = true ]; then
170
  pass "openenv validate passed"
 
62
  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
  }
64
 
65
# Print the openenv executable to use and return 0, or return 1 if none is found.
# Lookup order: whatever `command -v openenv` resolves to, then the virtualenv
# locations next to / inside $REPO_DIR (Windows "Scripts" layout first, then "bin").
resolve_openenv() {
  local on_path
  if on_path="$(command -v openenv 2>/dev/null)"; then
    printf "%s\n" "$on_path"
    return 0
  fi

  local venv_bin
  for venv_bin in \
    "$REPO_DIR/../.venv/Scripts/openenv.exe" \
    "$REPO_DIR/.venv/Scripts/openenv.exe" \
    "$REPO_DIR/../.venv/bin/openenv" \
    "$REPO_DIR/.venv/bin/openenv"; do
    if [ -x "$venv_bin" ]; then
      printf "%s\n" "$venv_bin"
      return 0
    fi
  done

  return 1
}
88
+
89
  CLEANUP_FILES=()
90
  cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
91
  trap cleanup EXIT
 
181
 
182
  log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
183
 
184
+ OPENENV_BIN=""
185
+ if ! OPENENV_BIN="$(resolve_openenv)"; then
186
  fail "openenv command not found"
187
+ hint "Install it in your active env: pip install openenv-core"
188
+ hint "Or activate your project venv before running this script."
189
  stop_at "Step 3"
190
  fi
191
 
192
  VALIDATE_OK=false
193
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$OPENENV_BIN" validate 2>&1) && VALIDATE_OK=true
194
 
195
  if [ "$VALIDATE_OK" = true ]; then
196
  pass "openenv validate passed"
samplematerial/sampleinference.py CHANGED
@@ -1,187 +1,29 @@
1
- """
2
- Inference Script Example
3
- ===================================
4
- MANDATORY
5
- - Before submitting, ensure the following variables are defined in your environment configuration:
6
- API_BASE_URL The API endpoint for the LLM.
7
- MODEL_NAME The model identifier to use for inference.
8
- HF_TOKEN Your Hugging Face / API key.
9
- LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
10
- method
11
-
12
- - Defaults are set only for API_BASE_URL and MODEL_NAME
13
- (and should reflect your active inference setup):
14
- API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
15
- MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
16
-
17
- - The inference script must be named `inference.py` and placed in the root directory of the project
18
- - Participants must use OpenAI Client for all LLM calls using above variables
19
-
20
- STDOUT FORMAT
21
- - The script must emit exactly three line types to stdout, in this order:
22
-
23
- [START] task=<task_name> env=<benchmark> model=<model_name>
24
- [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
25
- [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
26
 
27
- Rules:
28
- - One [START] line at episode begin.
29
- - One [STEP] line per step, immediately after env.step() returns.
30
- - One [END] line after env.close(), always emitted (even on exception).
31
- - reward and rewards are formatted to 2 decimal places.
32
- - done and success are lowercase booleans: true or false.
33
- - error is the raw last_action_error string, or null if none.
34
- - All fields on a single line with no newlines within a line.
35
-
36
- Example:
37
- [START] task=click-test env=miniwob model=Qwen3-VL-30B
38
- [STEP] step=1 action=click('123') reward=0.00 done=false error=null
39
- [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
40
- [STEP] step=3 action=click('789') reward=1.00 done=true error=null
41
- [END] success=true steps=3 rewards=0.00,0.00,1.00
42
  """
43
 
44
- import asyncio
45
- import os
46
- import textwrap
47
- from typing import List, Optional
48
-
49
- from openai import OpenAI
50
-
51
- from my_env_v4 import MyEnvV4Action, MyEnvV4Env
52
- IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
53
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
54
-
55
- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
56
- MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
57
- TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
58
- BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
59
- MAX_STEPS = 8
60
- TEMPERATURE = 0.7
61
- MAX_TOKENS = 150
62
- SUCCESS_SCORE_THRESHOLD = 0.1 # normalized score in [0, 1]
63
-
64
- # Max possible reward: each token contributes 0.1, across all steps
65
- _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
66
- MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
67
-
68
- SYSTEM_PROMPT = textwrap.dedent(
69
- """
70
- You are interacting with a simple echo environment.
71
- Each turn you must send a message. The environment will echo it back.
72
- Reward is proportional to message length: reward = len(message) * 0.1
73
- Your goal is to maximize total reward by sending meaningful, substantive messages.
74
- Reply with exactly one message string — no quotes, no prefixes, just the message text.
75
- """
76
- ).strip()
77
-
78
-
79
- def log_start(task: str, env: str, model: str) -> None:
80
- print(f"[START] task={task} env={env} model={model}", flush=True)
81
-
82
-
83
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
84
- error_val = error if error else "null"
85
- done_val = str(done).lower()
86
- print(
87
- f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
88
- flush=True,
89
- )
90
-
91
 
92
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
93
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
94
- print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
95
-
96
-
97
- def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
98
- history_block = "\n".join(history[-4:]) if history else "None"
99
- return textwrap.dedent(
100
- f"""
101
- Step: {step}
102
- Last echoed message: {last_echoed!r}
103
- Last reward: {last_reward:.2f}
104
- Previous steps:
105
- {history_block}
106
- Send your next message.
107
- """
108
- ).strip()
109
-
110
-
111
- def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
112
- user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
113
- try:
114
- completion = client.chat.completions.create(
115
- model=MODEL_NAME,
116
- messages=[
117
- {"role": "system", "content": SYSTEM_PROMPT},
118
- {"role": "user", "content": user_prompt},
119
- ],
120
- temperature=TEMPERATURE,
121
- max_tokens=MAX_TOKENS,
122
- stream=False,
123
- )
124
- text = (completion.choices[0].message.content or "").strip()
125
- return text if text else "hello"
126
- except Exception as exc:
127
- print(f"[DEBUG] Model request failed: {exc}", flush=True)
128
- return "hello"
129
-
130
-
131
- async def main() -> None:
132
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
133
-
134
- env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
135
-
136
- history: List[str] = []
137
- rewards: List[float] = []
138
- steps_taken = 0
139
- score = 0.0
140
- success = False
141
-
142
- log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
143
-
144
- try:
145
- result = await env.reset() # OpenENV.reset()
146
- last_echoed = result.observation.echoed_message
147
- last_reward = 0.0
148
-
149
- for step in range(1, MAX_STEPS + 1):
150
- if result.done:
151
- break
152
-
153
- message = get_model_message(client, step, last_echoed, last_reward, history)
154
-
155
- result = await env.step(MyEnvV4Action(message=message))
156
- obs = result.observation
157
-
158
- reward = result.reward or 0.0
159
- done = result.done
160
- error = None
161
 
162
- rewards.append(reward)
163
- steps_taken = step
164
- last_echoed = obs.echoed_message
165
- last_reward = reward
166
 
167
- log_step(step=step, action=message, reward=reward, done=done, error=error)
 
168
 
169
- history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
170
 
171
- if done:
172
- break
 
 
173
 
174
- score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
175
- score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
176
- success = score >= SUCCESS_SCORE_THRESHOLD
177
 
178
- finally:
179
- try:
180
- await env.close()
181
- except Exception as e:
182
- print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
183
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
184
 
185
 
186
  if __name__ == "__main__":
187
- asyncio.run(main())
 
1
+ """Sample inference launcher for the 911 dispatch project.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ Use this file as a runnable reference from samplematerial.
4
+ For submission, the authoritative script is the root-level inference.py.
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
+ from __future__ import annotations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ import asyncio
10
+ import sys
11
+ from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
 
 
 
 
13
 
14
+ def _project_root() -> Path:
15
+ return Path(__file__).resolve().parents[1]
16
 
 
17
 
18
def main() -> int:
    """Run the root-level ``inference.main`` coroutine and return its result.

    The project root is put at the front of ``sys.path`` (if not already
    present) so that the root-level ``inference`` module can be imported.
    """
    root_str = str(_project_root())
    if root_str not in sys.path:
        sys.path.insert(0, root_str)

    # Imported here, after the sys.path adjustment above, not at module top.
    from inference import main as run_inference

    return asyncio.run(run_inference())
 
 
 
 
 
26
 
27
 
28
  if __name__ == "__main__":
29
+ raise SystemExit(main())
scripts/run_baseline_matrix.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run baseline inference matrix (random + Open LLM) and summarize variance.
2
+
3
+ Usage examples:
4
+ python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 0
5
+ python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_report.json
6
+
7
+ Environment variables:
8
+ API_BASE_URL, MODEL_NAME
9
+ OPENAI_API_KEY or HF_TOKEN (required when --llm-runs > 0)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import re
18
+ import statistics
19
+ import subprocess
20
+ import sys
21
+ import time
22
+ from dataclasses import asdict, dataclass
23
+ from pathlib import Path
24
+
25
+ START_RE = re.compile(r"^\[START\]\s+task=(\S+)\s+env=(\S+)\s+model=(\S+)$")
26
+ END_RE = re.compile(
27
+ r"^\[END\]\s+success=(true|false)\s+steps=(\d+)\s+score=([0-9]*\.?[0-9]+)\s+rewards=(.*)$"
28
+ )
29
+
30
+
31
@dataclass
class TaskEpisode:
    """Outcome of one task episode as reported on an ``[END]`` log line."""

    # Task name taken from the preceding [START] line, or an "unknown-N"
    # placeholder when no [START] line was seen.
    task_id: str
    # True when the [END] line carried success=true.
    success: bool
    # Value of the steps= field.
    steps: int
    # Value of the score= field.
    score: float
37
+
38
+
39
@dataclass
class RunResult:
    """Captured result of one ``inference.py`` subprocess run."""

    # "random" selects the random agent; any other value runs the LLM agent.
    lane: str
    # Caller-supplied index of this run.
    run_index: int
    # Elapsed time of the subprocess call, measured with time.monotonic().
    runtime_seconds: float
    # Episodes parsed from the subprocess stdout.
    tasks: list[TaskEpisode]
    # Exit status of the subprocess.
    return_code: int
    # Stripped stderr text of the subprocess.
    stderr: str
47
+
48
+
49
+ def _project_root() -> Path:
50
+ return Path(__file__).resolve().parents[1]
51
+
52
+
53
+ def _required_var(name: str) -> str:
54
+ value = os.environ.get(name)
55
+ if not value:
56
+ raise RuntimeError(f"Missing required environment variable: {name}")
57
+ return value
58
+
59
+
60
def _extract_task_episodes(stdout: str) -> list[TaskEpisode]:
    """Parse ``[START]``/``[END]`` log lines from *stdout* into episodes.

    Each ``[END]`` line yields one ``TaskEpisode``. Its task id is the task
    named on the most recent unconsumed ``[START]`` line; when there is none,
    an ``unknown-<n>`` placeholder is used, where ``n`` is the 1-based position
    of the episode in the result. Lines matching neither pattern are ignored.
    """
    parsed: list[TaskEpisode] = []
    pending_task: str | None = None

    for raw_line in stdout.splitlines():
        started = START_RE.match(raw_line)
        if started is not None:
            pending_task = started.group(1)
            continue

        ended = END_RE.match(raw_line)
        if ended is None:
            continue

        success_text, steps_text, score_text = ended.group(1, 2, 3)
        parsed.append(
            TaskEpisode(
                task_id=pending_task or f"unknown-{len(parsed) + 1}",
                success=success_text == "true",
                steps=int(steps_text),
                score=float(score_text),
            )
        )
        # A [START] line is consumed by the first [END] that follows it.
        pending_task = None

    return parsed
84
+
85
+
86
def _run_inference(lane: str, run_index: int, timeout_seconds: int) -> RunResult:
    """Run ``inference.py`` once in a subprocess and collect its result.

    Args:
        lane: ``"random"`` runs with ``USE_RANDOM=true`` (a placeholder API key
            is supplied if none is set); any other value runs with
            ``USE_RANDOM=false`` and requires a real key.
        run_index: Index recorded on the returned result.
        timeout_seconds: Passed to ``subprocess.run`` as ``timeout``.

    Returns:
        A ``RunResult`` with the parsed episodes, exit code, stripped stderr
        and the elapsed time of the subprocess call.

    Raises:
        RuntimeError: for a non-random lane when neither ``OPENAI_API_KEY`` nor
            ``HF_TOKEN`` is set.
        subprocess.TimeoutExpired: propagated from ``subprocess.run`` when the
            timeout elapses.
    """
    child_env = os.environ.copy()
    child_env.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    child_env.setdefault("MODEL_NAME", "baseline-model")

    use_random = lane == "random"
    child_env["USE_RANDOM"] = "true" if use_random else "false"
    if use_random:
        child_env.setdefault("OPENAI_API_KEY", "dummy-token")
    elif not (child_env.get("OPENAI_API_KEY") or child_env.get("HF_TOKEN")):
        raise RuntimeError(
            "OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
        )

    t0 = time.monotonic()
    completed = subprocess.run(
        [sys.executable, "inference.py"],
        cwd=str(_project_root()),
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        env=child_env,
        timeout=timeout_seconds,
    )
    elapsed = time.monotonic() - t0

    return RunResult(
        lane=lane,
        run_index=run_index,
        runtime_seconds=elapsed,
        tasks=_extract_task_episodes(completed.stdout),
        return_code=completed.returncode,
        stderr=completed.stderr.strip(),
    )
125
+
126
+
127
def _summarize(runs: list[RunResult]) -> dict[str, dict[str, float]]:
    """Aggregate episode scores per task across *runs*.

    Returns:
        A dict keyed by task id (in sorted order). Each value holds ``runs``
        (the number of scores, as a float), ``mean``, ``std`` (population
        standard deviation, 0.0 for a single score), ``min`` and ``max``,
        with everything except ``runs`` rounded to six decimal places.
    """
    scores_by_task: dict[str, list[float]] = {}
    for result in runs:
        for episode in result.tasks:
            scores_by_task.setdefault(episode.task_id, []).append(episode.score)

    def _stats(values: list[float]) -> dict[str, float]:
        average = statistics.mean(values)
        spread = statistics.pstdev(values) if len(values) > 1 else 0.0
        return {
            "runs": float(len(values)),
            "mean": round(average, 6),
            "std": round(spread, 6),
            "min": round(min(values), 6),
            "max": round(max(values), 6),
        }

    return {task: _stats(scores_by_task[task]) for task in sorted(scores_by_task)}
145
+
146
+
147
def _print_summary(title: str, runs: list[RunResult]) -> None:
    """Print a titled per-task score table followed by runtime and failure totals.

    Prints "No runs executed" under the title and returns early when *runs*
    is empty. A run counts as failed when its return code is non-zero.
    """
    print(f"\n=== {title} ===")
    if not runs:
        print("No runs executed")
        return

    for task_id, metrics in _summarize(runs).items():
        print(
            f"{task_id:16s} runs={int(metrics['runs'])} "
            f"mean={metrics['mean']:.3f} std={metrics['std']:.3f} "
            f"min={metrics['min']:.3f} max={metrics['max']:.3f}"
        )

    total_runtime = sum(result.runtime_seconds for result in runs)
    failures = [result for result in runs if result.return_code != 0]
    print(f"total_runtime_seconds={total_runtime:.2f}")
    print(f"failed_runs={len(failures)}")
165
+
166
+
167
def _to_jsonable(runs: list[RunResult]) -> list[dict]:
    """Convert each run, including its nested episodes, into plain dicts."""
    return [
        {**asdict(result), "tasks": [asdict(episode) for episode in result.tasks]}
        for result in runs
    ]
174
+
175
+
176
def main() -> int:
    """Run the random and Open LLM baseline lanes and report the results.

    Command-line options: ``--random-runs``, ``--llm-runs``,
    ``--timeout-seconds`` and ``--output-json`` (empty string disables the
    JSON report).

    Returns:
        0 when every run exited with status 0. 1 on a configuration error, a
        run that timed out, or any run with a non-zero exit status.
    """
    parser = argparse.ArgumentParser(description="Run baseline matrix for inference.py")
    parser.add_argument("--random-runs", type=int, default=1)
    parser.add_argument("--llm-runs", type=int, default=3)
    parser.add_argument("--timeout-seconds", type=int, default=1200)
    parser.add_argument("--output-json", type=str, default="")
    args = parser.parse_args()

    os.environ.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    os.environ.setdefault("MODEL_NAME", "nvidia/Nemotron-3-Super-49B-v1")

    random_runs: list[RunResult] = []
    llm_runs: list[RunResult] = []

    try:
        # setdefault() leaves an explicitly empty value in place, so these checks
        # can still fail; keeping them inside the try gives a clean error exit.
        _required_var("API_BASE_URL")
        _required_var("MODEL_NAME")

        for idx in range(1, args.random_runs + 1):
            print(f"Running random baseline {idx}/{args.random_runs}...")
            random_runs.append(_run_inference("random", idx, args.timeout_seconds))

        for idx in range(1, args.llm_runs + 1):
            print(f"Running Open LLM baseline {idx}/{args.llm_runs}...")
            llm_runs.append(_run_inference("llm", idx, args.timeout_seconds))
    except RuntimeError as exc:
        print(f"ERROR: {exc}")
        return 1
    except subprocess.TimeoutExpired as exc:
        # subprocess.run raises this when the child exceeds its timeout.
        print(f"ERROR: inference.py timed out after {exc.timeout} seconds")
        return 1

    _print_summary("Random Baseline", random_runs)
    _print_summary("Open LLM Baseline", llm_runs)

    all_runs = random_runs + llm_runs

    if args.output_json:
        report = {
            "api_base_url": os.environ.get("API_BASE_URL", ""),
            "model_name": os.environ.get("MODEL_NAME", ""),
            "random_summary": _summarize(random_runs),
            "llm_summary": _summarize(llm_runs),
            "runs": _to_jsonable(all_runs),
        }
        out_path = Path(args.output_json)
        # Allow report paths whose directory does not exist yet.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f"Wrote report to {out_path}")

    failures = [r for r in all_runs if r.return_code != 0]
    if failures:
        print("\nOne or more runs failed:")
        for run in failures:
            print(f"- lane={run.lane} run={run.run_index} rc={run.return_code}")
            if run.stderr:
                print(run.stderr)
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/run_nemotron_baseline.ps1 ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runs scripts/run_baseline_matrix.py from the repository root with the given
# run counts, timeout, API endpoint, model name and report path.
param(
    [int]$RandomRuns = 1,
    [int]$LlmRuns = 3,
    [int]$TimeoutSeconds = 1200,
    [string]$ApiBaseUrl = "https://api.openai.com/v1",
    [string]$ModelName = "nvidia/Nemotron-3-Super-49B-v1",
    [string]$OutputJson = "baseline_nemotron_report.json"
)

# With "Stop", every Write-Error below terminates the script.
$ErrorActionPreference = "Stop"

$repoRoot = Split-Path -Parent $PSScriptRoot

if ($LlmRuns -gt 0 -and -not $env:OPENAI_API_KEY -and -not $env:HF_TOKEN) {
    Write-Error "Set OPENAI_API_KEY or HF_TOKEN before running this script."
}

$env:API_BASE_URL = $ApiBaseUrl
$env:MODEL_NAME = $ModelName

# Push-Location with Pop-Location in finally restores the caller's working
# directory even when the run fails; a plain Set-Location would leave the
# caller's session in the repository root.
Push-Location $repoRoot
try {
    Write-Host "Running baseline matrix in $repoRoot"
    Write-Host "API_BASE_URL=$($env:API_BASE_URL)"
    Write-Host "MODEL_NAME=$($env:MODEL_NAME)"
    Write-Host "RandomRuns=$RandomRuns LlmRuns=$LlmRuns TimeoutSeconds=$TimeoutSeconds"

    # Prefer a virtualenv interpreter in the repo or its parent directory;
    # otherwise fall back to whatever "python" resolves to.
    $candidatePython = @(
        (Join-Path $repoRoot ".venv/Scripts/python.exe"),
        (Join-Path (Split-Path -Parent $repoRoot) ".venv/Scripts/python.exe")
    )

    $python = $null
    foreach ($candidate in $candidatePython) {
        if (Test-Path $candidate) {
            $python = $candidate
            break
        }
    }
    if (-not $python) {
        $python = "python"
    }

    & $python scripts/run_baseline_matrix.py `
        --random-runs $RandomRuns `
        --llm-runs $LlmRuns `
        --timeout-seconds $TimeoutSeconds `
        --output-json $OutputJson

    if ($LASTEXITCODE -ne 0) {
        Write-Error "Baseline matrix run failed"
    }

    Write-Host "Done. Report written to $OutputJson"
}
finally {
    Pop-Location
}
tests/test_baseline_matrix.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for scripts/run_baseline_matrix.py helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib.util
6
+ from pathlib import Path
7
+ import sys
8
+
9
+ import pytest
10
+
11
+
12
# Load scripts/run_baseline_matrix.py by file path under the module name
# "run_baseline_matrix" and expose it to the tests as `baseline`.
SCRIPT_PATH = Path(__file__).resolve().parents[1].joinpath("scripts", "run_baseline_matrix.py")
SPEC = importlib.util.spec_from_file_location("run_baseline_matrix", SCRIPT_PATH)
assert SPEC and SPEC.loader
baseline = importlib.util.module_from_spec(SPEC)
# Register the module before executing it.
# NOTE(review): presumably needed so the script's dataclasses can resolve their
# defining module through sys.modules while the module body runs - confirm.
sys.modules[SPEC.name] = baseline
SPEC.loader.exec_module(baseline)
18
+
19
+
20
def test_extract_task_episodes_parses_start_end_pairs() -> None:
    """Two START/END pairs produce two episodes with the right task, steps and score."""
    log_lines = [
        "[START] task=single_incident env=citywide-dispatch-supervisor model=test-model",
        "[STEP] step=1 action=WAIT reward=0.00 done=false error=null",
        "[END] success=true steps=20 score=0.300 rewards=0.00,0.10",
        "[START] task=multi_incident env=citywide-dispatch-supervisor model=test-model",
        "[END] success=true steps=40 score=0.700 rewards=0.10,0.20",
    ]

    episodes = baseline._extract_task_episodes("\n".join(log_lines))

    assert len(episodes) == 2
    first, second = episodes
    assert first.task_id == "single_incident"
    assert first.success is True
    assert first.steps == 20
    assert first.score == pytest.approx(0.3)
    assert second.task_id == "multi_incident"
    assert second.steps == 40
    assert second.score == pytest.approx(0.7)
41
+
42
+
43
def test_extract_task_episodes_falls_back_to_unknown_task() -> None:
    """An [END] line with no preceding [START] is labelled "unknown-1"."""
    orphan_end_line = "[END] success=false steps=0 score=0.000 rewards=0.00"

    episodes = baseline._extract_task_episodes(orphan_end_line)

    assert len(episodes) == 1
    only_episode = episodes[0]
    assert only_episode.task_id == "unknown-1"
    assert only_episode.success is False
51
+
52
+
53
def test_summarize_computes_mean_and_std() -> None:
    """Scores 0.2 and 0.4 for one task summarize to mean 0.3, std 0.1, min 0.2, max 0.4."""
    runs = [
        baseline.RunResult(
            lane="random",
            run_index=index,
            runtime_seconds=runtime,
            tasks=[baseline.TaskEpisode("single_incident", True, 20, score)],
            return_code=0,
            stderr="",
        )
        for index, runtime, score in ((1, 1.0, 0.2), (2, 1.1, 0.4))
    ]

    stats = baseline._summarize(runs)["single_incident"]

    assert stats["runs"] == 2.0
    assert stats["mean"] == pytest.approx(0.3)
    assert stats["std"] == pytest.approx(0.1)
    assert stats["min"] == pytest.approx(0.2)
    assert stats["max"] == pytest.approx(0.4)
80
+
81
+
82
def test_to_jsonable_serializes_runs() -> None:
    """A run and its nested episode come back as plain dicts with the original values."""
    episode = baseline.TaskEpisode("mass_casualty", True, 59, 0.742)
    run = baseline.RunResult(
        lane="llm",
        run_index=1,
        runtime_seconds=3.2,
        tasks=[episode],
        return_code=0,
        stderr="",
    )

    payload = baseline._to_jsonable([run])

    first_entry = payload[0]
    assert first_entry["lane"] == "llm"
    assert first_entry["tasks"][0]["task_id"] == "mass_casualty"
    assert first_entry["tasks"][0]["score"] == pytest.approx(0.742)
tests/test_openenv_integration.py CHANGED
@@ -141,3 +141,32 @@ class TestTasksEndpoint:
141
  "mass_casualty",
142
  "shift_surge",
143
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  "mass_casualty",
142
  "shift_surge",
143
  }
144
+
145
+
146
class TestDashboardEndpoint:
    """Checks the JSON payload returned by ``GET /dashboard/state``."""

    def test_dashboard_state_before_reset_returns_valid_shape(self) -> None:
        """Before any reset the payload has placeholder values and correctly typed containers."""
        client = TestClient(server_app.app)
        resp = client.get("/dashboard/state")
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["task_id"] == "none"
        assert payload["step_count"] == 0
        expected_types = (
            ("units", dict),
            ("incidents", dict),
            ("legal_actions", list),
            ("issues", list),
        )
        for key, expected_type in expected_types:
            assert isinstance(payload[key], expected_type)
        assert payload["observation"] is None

    def test_dashboard_state_after_reset_exposes_legal_actions(self) -> None:
        """After resetting to single_incident the payload names the task and carries an observation."""
        client = TestClient(server_app.app)
        reset_resp = client.post("/reset", json={"task_id": "single_incident", "seed": 42})
        assert reset_resp.status_code == 200

        state_resp = client.get("/dashboard/state")
        assert state_resp.status_code == 200

        payload = state_resp.json()
        assert payload["task_id"] == "single_incident"
        assert isinstance(payload["legal_actions"], list)
        assert payload["observation"] is not None
uv.lock CHANGED
The diff for this file is too large to render. See raw diff
 
validate_local.py CHANGED
@@ -4,7 +4,9 @@
4
  from __future__ import annotations
5
 
6
  import subprocess
 
7
  import sys
 
8
 
9
 
10
  def run_command(
@@ -14,7 +16,18 @@ def run_command(
14
  print(f"CHECK: {description}")
15
  print(f"CMD: {' '.join(cmd)}")
16
  print(f"{'=' * 60}")
17
- result = subprocess.run(cmd, capture_output=True, text=True)
 
 
 
 
 
 
 
 
 
 
 
18
  if result.stdout:
19
  print(result.stdout)
20
  if result.stderr:
@@ -26,10 +39,33 @@ def run_command(
26
  return result
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def check_pytest() -> bool:
30
- result = run_command(
31
- ["uv", "run", "python", "-m", "pytest", "tests/", "-q"], "All tests pass"
32
- )
33
  return result.returncode == 0
34
 
35
 
@@ -44,9 +80,11 @@ def check_inference() -> bool:
44
 
45
  print("\nNOTE: Running inference.py in random-agent mode for local validation")
46
  result = subprocess.run(
47
- ["uv", "run", "python", "inference.py"],
48
  capture_output=True,
49
  text=True,
 
 
50
  env=env,
51
  timeout=300,
52
  )
@@ -68,6 +106,11 @@ def check_inference() -> bool:
68
 
69
 
70
  def check_docker_build() -> bool:
 
 
 
 
 
71
  result = run_command(
72
  ["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
73
  "Docker build succeeds",
@@ -76,6 +119,18 @@ def check_docker_build() -> bool:
76
  return result.returncode == 0
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def check_benchmark_scores() -> bool:
80
  from src.benchmark import list_tasks, run_task
81
 
@@ -109,6 +164,7 @@ def main() -> int:
109
  ("pytest", check_pytest),
110
  ("inference", check_inference),
111
  ("docker_build", check_docker_build),
 
112
  ("benchmark_scores", check_benchmark_scores),
113
  ]
114
 
 
4
  from __future__ import annotations
5
 
6
  import subprocess
7
+ import shutil
8
  import sys
9
+ from pathlib import Path
10
 
11
 
12
  def run_command(
 
16
  print(f"CHECK: {description}")
17
  print(f"CMD: {' '.join(cmd)}")
18
  print(f"{'=' * 60}")
19
+ try:
20
+ result = subprocess.run(
21
+ cmd,
22
+ capture_output=True,
23
+ text=True,
24
+ encoding="utf-8",
25
+ errors="replace",
26
+ )
27
+ except FileNotFoundError as exc:
28
+ print(f"FAILED: {description}")
29
+ print(f"ERROR: command not found: {cmd[0]}")
30
+ return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
31
  if result.stdout:
32
  print(result.stdout)
33
  if result.stderr:
 
39
  return result
40
 
41
 
42
def _tool_path(name: str) -> str | None:
    """Locate executable *name* on PATH, else next to the running interpreter.

    Returns:
        The path found by ``shutil.which``; otherwise the first existing entry
        among ``<interpreter dir>/<name>`` and ``<interpreter dir>/<name>.exe``;
        otherwise ``None``.
    """
    on_path = shutil.which(name)
    if on_path:
        return on_path

    interpreter_dir = Path(sys.executable).resolve().parent
    for filename in (name, f"{name}.exe"):
        candidate = interpreter_dir / filename
        if candidate.exists():
            return str(candidate)
    return None
57
+
58
+
59
def _python_cmd(*args: str) -> list[str]:
    """Build an argv list that runs Python with *args*.

    Uses ``<uv> run python`` when a ``uv`` executable is found, otherwise the
    interpreter running this script.
    """
    uv_path = _tool_path("uv")
    launcher = [uv_path, "run", "python"] if uv_path else [sys.executable]
    return launcher + list(args)
65
+
66
+
67
  def check_pytest() -> bool:
68
+ result = run_command(_python_cmd("-m", "pytest", "tests/", "-q"), "All tests pass")
 
 
69
  return result.returncode == 0
70
 
71
 
 
80
 
81
  print("\nNOTE: Running inference.py in random-agent mode for local validation")
82
  result = subprocess.run(
83
+ _python_cmd("inference.py"),
84
  capture_output=True,
85
  text=True,
86
+ encoding="utf-8",
87
+ errors="replace",
88
  env=env,
89
  timeout=300,
90
  )
 
106
 
107
 
108
  def check_docker_build() -> bool:
109
+ if not shutil.which("docker"):
110
+ print("FAILED: Docker build succeeds")
111
+ print("ERROR: docker command not found")
112
+ return False
113
+
114
  result = run_command(
115
  ["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
116
  "Docker build succeeds",
 
119
  return result.returncode == 0
120
 
121
 
122
def check_openenv_validate() -> bool:
    """Run ``openenv validate`` and report whether it exited with status 0.

    Prints a failure message with an install hint and returns False when no
    ``openenv`` executable can be located.
    """
    openenv_path = _tool_path("openenv")
    if openenv_path:
        outcome = run_command(
            [openenv_path, "validate"], "openenv validate passes", check=False
        )
        return outcome.returncode == 0

    print("FAILED: openenv validate passes")
    print("ERROR: openenv command not found")
    print("HINT: Install with: pip install openenv-core")
    return False
132
+
133
+
134
  def check_benchmark_scores() -> bool:
135
  from src.benchmark import list_tasks, run_task
136
 
 
164
  ("pytest", check_pytest),
165
  ("inference", check_inference),
166
  ("docker_build", check_docker_build),
167
+ ("openenv_validate", check_openenv_validate),
168
  ("benchmark_scores", check_benchmark_scores),
169
  ]
170