Spaces:

garvitsachdeva
/

911

Sleeping

App Files Files Community

garvitsachdeva committed on Apr 6

Commit

775befb

1 Parent(s): e493c92

Submission polish: compliance hardening, baseline matrix, dashboard UX, tests, and docs

Browse files

Files changed (12) hide show

.gitignore +1 -0
README.md +26 -6
inference.py +11 -15
live_dashboard.html +87 -6
samplematerial/prevalidation.sh +29 -3
samplematerial/sampleinference.py +16 -174
scripts/run_baseline_matrix.py +235 -0
scripts/run_nemotron_baseline.ps1 +53 -0
tests/test_baseline_matrix.py +98 -0
tests/test_openenv_integration.py +29 -0
uv.lock +0 -0
validate_local.py +61 -5

.gitignore CHANGED Viewed

@@ -28,6 +28,7 @@ htmlcov/
 .sisyphus/notepads/
 *.log
 tmp/
 # Do not commit architecture notes
 architecture.md

 .sisyphus/notepads/
 *.log
 tmp/
+baseline_*_report.json
 # Do not commit architecture notes
 architecture.md

README.md CHANGED Viewed

@@ -421,14 +421,31 @@ USE_RANDOM=true \
   uv run python inference.py
 ```
 | Task | Difficulty | Random Baseline Score |
 |---|---|---|
-| `single_incident` | Easy | ~0.55 |
-| `multi_incident` | Medium | ~0.48 |
-| `mass_casualty` | Hard | ~0.32 |
-| `shift_surge` | Hard | ~0.38 |
-*Scores use `seed=42` for reproducibility. Variance is low across runs due to deterministic state machine.*
 ---
@@ -557,10 +574,13 @@ uv run pytest tests/test_inference.py -v
 uv run python validate_local.py
 # OpenEnv spec validation
-uv run openenv validate
 # HF Space validation (requires deployed space)
 bash samplematerial/prevalidation.sh https://your-space.hf.space .
 ```
 ---

   uv run python inference.py
 ```
+Run the baseline matrix (random + Open LLM reruns) and emit a JSON report:
+```bash
+API_BASE_URL=https://api.openai.com/v1 \
+MODEL_NAME=nvidia/Nemotron-3-Super-49B-v1 \
+OPENAI_API_KEY=your_token \
+uv run python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_nemotron_report.json
+```
+Windows PowerShell shortcut:
+```powershell
+$env:OPENAI_API_KEY="your_token"
+powershell -ExecutionPolicy Bypass -File scripts/run_nemotron_baseline.ps1 -RandomRuns 1 -LlmRuns 3
+```
 | Task | Difficulty | Random Baseline Score |
 |---|---|---|
+| `single_incident` | Easy | ~0.30 |
+| `multi_incident` | Medium | ~0.70 |
+| `mass_casualty` | Hard | ~0.74 |
+| `shift_surge` | Hard | ~0.56 |
+*Scores above are from deterministic random-agent inference with `seed=42`.*
+*For Open LLM evaluation, use Nemotron 3 Super as the primary baseline and report mean/std across reruns.*
 ---
 uv run python validate_local.py
 # OpenEnv spec validation
+openenv validate
 # HF Space validation (requires deployed space)
 bash samplematerial/prevalidation.sh https://your-space.hf.space .
+# Windows (explicit Git Bash)
+"C:/Program Files/Git/bin/bash.exe" samplematerial/prevalidation.sh https://your-space.hf.space .
 ```
 ---

inference.py CHANGED Viewed

@@ -70,6 +70,7 @@ class LLMAgent:
         self.api_key = api_key
         self.base_url = base_url.rstrip("/")
         self.model = model
         # Official OpenAI Python client for OpenAI-compatible endpoints.
         self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
@@ -142,7 +143,7 @@ Respond with ONLY the exact action string from the legal actions list. No explan
                 return action
         # Fallback to random if LLM response doesn't match
-        return random.choice(legal_actions)
 def _format_action(action: Action) -> str:
@@ -198,11 +199,11 @@ async def run_episode(
     step_count = 0
     rewards: list[float] = []
     success = False
-    error_msg: str | None = None
     try:
         observation = await env.reset()
-        rewards.append(observation.score)
         prev_obs = observation
         while True:
@@ -230,6 +231,7 @@ async def run_episode(
                 obs, reward, done = await env.step(action)
                 prev_obs = obs
                 rewards.append(reward)
                 # Terminal conditions: done flag OR any protocol-invalid transition.
                 has_illegal_transition = any(
@@ -264,7 +266,6 @@ async def run_episode(
                 # Safety check for runaway episodes
                 if step_count >= 1000:
-                    error_msg = "max_steps_exceeded"
                     print(
                         f"[STEP] step={step_count} action={action_str} "
                         f"reward={reward_str} done=true error=max_steps_exceeded"
@@ -273,26 +274,21 @@ async def run_episode(
                     break
             except Exception as e:
-                error_msg = "step_error"  # normalize to a fixed token
                 print(
                     f"[STEP] step={step_count} action={action_str} "
-                    f"reward=0.00 done=true error={error_msg}"
                 )
                 success = False
                 break
     except Exception as e:
-        error_msg = str(e)
         success = False
     finally:
         env.close()
-    # Separate reset reward from step rewards
-    step_rewards = rewards[1:]
-    if step_rewards:
-        total_score = sum(step_rewards) / len(step_rewards)
-    else:
-        total_score = 0.0
     total_score = max(0.0, min(1.0, total_score))
     # Format rewards list as comma-separated with 2 decimal places
@@ -331,12 +327,12 @@ async def main() -> int:
     except EnvironmentError as e:
         # Emit [END] to stdout for failure case
-        print("[END] success=false steps=0 score=0.000 rewards=")
         print(f"ERROR: {e}", file=sys.stderr)
         return 1
     except Exception as e:
         # Emit [END] to stdout for failure case
-        print("[END] success=false steps=0 score=0.000 rewards=")
         print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
         return 1

         self.api_key = api_key
         self.base_url = base_url.rstrip("/")
         self.model = model
+        self._rng = random.Random(42)
         # Official OpenAI Python client for OpenAI-compatible endpoints.
         self._client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
                 return action
         # Fallback to random if LLM response doesn't match
+        return self._rng.choice(legal_actions)
 def _format_action(action: Action) -> str:
     step_count = 0
     rewards: list[float] = []
     success = False
+    episode_score = 0.0
     try:
         observation = await env.reset()
+        episode_score = float(observation.score)
         prev_obs = observation
         while True:
                 obs, reward, done = await env.step(action)
                 prev_obs = obs
                 rewards.append(reward)
+                episode_score = float(obs.score)
                 # Terminal conditions: done flag OR any protocol-invalid transition.
                 has_illegal_transition = any(
                 # Safety check for runaway episodes
                 if step_count >= 1000:
                     print(
                         f"[STEP] step={step_count} action={action_str} "
                         f"reward={reward_str} done=true error=max_steps_exceeded"
                     break
             except Exception as e:
                 print(
                     f"[STEP] step={step_count} action={action_str} "
+                    f"reward=0.00 done=true error=step_error"
                 )
                 success = False
                 break
     except Exception as e:
         success = False
     finally:
         env.close()
+    # OpenEnv publishes episode score in observation.score; keep this for [END].
+    step_rewards = rewards
+    total_score = episode_score
     total_score = max(0.0, min(1.0, total_score))
     # Format rewards list as comma-separated with 2 decimal places
     except EnvironmentError as e:
         # Emit [END] to stdout for failure case
+        print("[END] success=false steps=0 score=0.000 rewards=0.00")
         print(f"ERROR: {e}", file=sys.stderr)
         return 1
     except Exception as e:
         # Emit [END] to stdout for failure case
+        print("[END] success=false steps=0 score=0.000 rewards=0.00")
         print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
         return 1

live_dashboard.html CHANGED Viewed

@@ -36,6 +36,18 @@
   .fill { height: 100%; width: 0%; background: #38bdf8; }
   .kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
   .status { margin-top: 10px; font-size: 11px; color: #64748b; }
 </style>
 </head>
 <body>
@@ -47,7 +59,8 @@
     <div class="pill">Task: <strong id="hdr-task">—</strong></div>
     <div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
     <div class="pill">Step: <strong id="hdr-step">—</strong></div>
-    <div class="pill">Score: <strong id="hdr-score">—</strong></div>
   </div>
 </div>
@@ -77,11 +90,13 @@
     <div class="col">
       <div class="panel-title">Incidents</div>
       <div id="incidents"></div>
     </div>
   </div>
   <div class="bottom">
-    <div class="panel-title">Episode Score Breakdown</div>
     <div class="breakdown">
       <div>
         <div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
@@ -111,6 +126,11 @@
 const API = 'http://localhost:8000';
 const DASHBOARD_STATE = `${API}/dashboard/state`;
 const REFRESH_MS = 500;
 const STATUS_COLORS = {
   AVAILABLE: '#10b981',
@@ -137,6 +157,15 @@ function setText(id, text) {
   if (el) el.textContent = text;
 }
 function renderUnits(state) {
   const root = document.getElementById('units');
   root.innerHTML = '';
@@ -175,11 +204,17 @@ function renderIncidents(state) {
     const sev = String(i.severity || '');
     const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
     const units = (i.units_assigned || []).join(', ') || '—';
     const card = document.createElement('div');
     card.className = 'card';
     card.innerHTML = `
       <div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
       <div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
       <div class="row"><div class="muted">assigned</div><div>${units}</div></div>
       <div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
     `;
@@ -239,8 +274,10 @@ function renderMap(state) {
     const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
     const x = Number(i.location_x || 0);
     const y = Number(i.location_y || 0);
     svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
-    svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = i.incident_id;
   }
   // units
@@ -271,8 +308,49 @@ function updateHeader(state) {
   setText('hdr-task', state.task_id || '—');
   setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
   setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
   const cum = state.metadata && state.metadata.cumulative_reward;
-  setText('hdr-score', (cum !== undefined) ? fmt(cum, 3) : '—');
 }
 async function tick() {
@@ -292,9 +370,12 @@ async function tick() {
     renderMap(state);
     renderIncidents(state);
     renderBreakdown(state);
-    const issues = Array.isArray(state.issues) ? state.issues.length : 0;
-    status.textContent = `Connected · issues=${issues} · refresh=${REFRESH_MS}ms`;
   } catch (e) {
     status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
   }

   .fill { height: 100%; width: 0%; background: #38bdf8; }
   .kpi { display: flex; justify-content: space-between; font-size: 11px; color: #cbd5e1; margin-bottom: 6px; }
   .status { margin-top: 10px; font-size: 11px; color: #64748b; }
+  .history-item { border-bottom: 1px solid #1e293b; padding: 8px 0; font-size: 11px; color: #cbd5e1; }
+  .history-item:last-child { border-bottom: 0; }
+  .history-step { color: #94a3b8; margin-right: 8px; }
+  .history-issues { color: #fbbf24; display: block; margin-top: 4px; }
+  @media (max-width: 1200px) {
+    .layout { grid-template-rows: auto auto; height: auto; min-height: calc(100vh - 52px); }
+    .main { grid-template-columns: 1fr; }
+    .col + .col { border-left: 0; border-top: 1px solid #1e293b; }
+    .map { min-height: 260px; }
+    .breakdown { grid-template-columns: repeat(2, 1fr); }
+  }
 </style>
 </head>
 <body>
     <div class="pill">Task: <strong id="hdr-task">—</strong></div>
     <div class="pill">Episode: <strong id="hdr-episode">—</strong></div>
     <div class="pill">Step: <strong id="hdr-step">—</strong></div>
+    <div class="pill">Episode Score: <strong id="hdr-episode-score">—</strong></div>
+    <div class="pill">Cumulative Reward: <strong id="hdr-cum-reward">—</strong></div>
   </div>
 </div>
     <div class="col">
       <div class="panel-title">Incidents</div>
       <div id="incidents"></div>
+      <div class="panel-title" style="margin-top:14px;">Recent Events</div>
+      <div id="history"></div>
     </div>
   </div>
   <div class="bottom">
+    <div class="panel-title">Step Reward Breakdown (latest observation)</div>
     <div class="breakdown">
       <div>
         <div class="kpi"><span>response_time</span><span id="v-response_time">0.00</span></div>
 const API = 'http://localhost:8000';
 const DASHBOARD_STATE = `${API}/dashboard/state`;
 const REFRESH_MS = 500;
+const HISTORY_LIMIT = 12;
+let lastHistoryEpisode = null;
+let lastHistoryStep = -1;
+let eventHistory = [];
 const STATUS_COLORS = {
   AVAILABLE: '#10b981',
   if (el) el.textContent = text;
 }
+function escapeHtml(value) {
+  // Coerce to string, then swap each HTML-significant character for its entity in one pass.
+  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
+  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
+}
 function renderUnits(state) {
   const root = document.getElementById('units');
   root.innerHTML = '';
     const sev = String(i.severity || '');
     const sevColor = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
     const units = (i.units_assigned || []).join(', ') || '—';
+    const survival = Number(i.survival_clock);
+    const survivalStr = Number.isFinite(survival) ? `${survival.toFixed(0)}s` : '—';
+    const p1ClockRow = sev === 'PRIORITY_1'
+      ? `<div class="row"><div class="muted">p1 clock</div><div>${survivalStr}</div></div>`
+      : '';
     const card = document.createElement('div');
     card.className = 'card';
     card.innerHTML = `
       <div class="row"><div><strong style="color:${sevColor}">${i.incident_id}</strong> <span class="muted">(${i.incident_type})</span></div><div class="muted">${i.status}</div></div>
       <div class="row" style="margin-top:8px"><div class="muted">severity</div><div>${i.severity}</div></div>
+      ${p1ClockRow}
       <div class="row"><div class="muted">assigned</div><div>${units}</div></div>
       <div class="row"><div class="muted">pos</div><div>${fmt(i.location_x,0)}, ${fmt(i.location_y,0)}</div></div>
     `;
     const color = sev === 'PRIORITY_1' ? '#ef4444' : (sev === 'PRIORITY_2' ? '#fbbf24' : '#38bdf8');
     const x = Number(i.location_x || 0);
     const y = Number(i.location_y || 0);
+    const p1Clock = Number(i.survival_clock);
+    const p1Suffix = (sev === 'PRIORITY_1' && Number.isFinite(p1Clock)) ? ` (${p1Clock.toFixed(0)}s)` : '';
     svg.appendChild(svgEl('circle', { cx: x, cy: y, r: Math.max(0.7, Math.min(w,h) * 0.012), fill: color, stroke: '#0f172a', 'stroke-width': 0.3 }));
+    svg.appendChild(svgEl('text', { x: x + 1, y: y - 1, fill: '#e2e8f0', 'font-size': 2.8, 'font-family': 'monospace' })).textContent = `${i.incident_id}${p1Suffix}`;
   }
   // units
   setText('hdr-task', state.task_id || '—');
   setText('hdr-episode', state.episode_id ? String(state.episode_id).slice(0, 8) : '—');
   setText('hdr-step', (state.step_count !== undefined) ? String(state.step_count) : '—');
+  const episodeScore = state.metadata && state.metadata.episode_score;
   const cum = state.metadata && state.metadata.cumulative_reward;
+  setText('hdr-episode-score', (episodeScore !== undefined) ? fmt(episodeScore, 3) : '—');
+  setText('hdr-cum-reward', (cum !== undefined) ? fmt(cum, 3) : '—');
+}
+function updateHistory(state) {
+  // Record at most one history entry per step; start over when the episode id changes.
+  const observation = state.observation;
+  if (!observation) return;
+  const episodeChanged = lastHistoryEpisode !== state.episode_id;
+  if (episodeChanged) {
+    eventHistory = [];
+    lastHistoryStep = -1;
+    lastHistoryEpisode = state.episode_id;
+  }
+  const step = state.step_count;
+  // Same step as the last recorded one: nothing new to log on this poll.
+  if (step === lastHistoryStep) return;
+  const entry = {
+    step,
+    result: observation.result || 'state updated',
+    issues: Array.isArray(state.issues) ? state.issues : [],
+  };
+  // Newest entry first, capped at HISTORY_LIMIT entries.
+  eventHistory = [entry, ...eventHistory].slice(0, HISTORY_LIMIT);
+  lastHistoryStep = step;
+}
+function renderHistory() {
+  // Render the recorded events (newest first) into the #history container.
+  const root = document.getElementById('history');
+  if (!root) return;
+  if (eventHistory.length === 0) {
+    root.innerHTML = '<div class="muted">No events yet</div>';
+    return;
+  }
+  // The markup is assigned via innerHTML, so every interpolated value is escaped,
+  // including the step counter, which comes from the polled state like the other fields.
+  root.innerHTML = eventHistory.map((item) => {
+    const issueText = item.issues.length > 0
+      ? `<span class="history-issues">issues: ${escapeHtml(item.issues.join(', '))}</span>`
+      : '';
+    return `<div class="history-item"><span class="history-step">step ${escapeHtml(item.step)}</span>${escapeHtml(item.result)}${issueText}</div>`;
+  }).join('');
 }
 async function tick() {
     renderMap(state);
     renderIncidents(state);
     renderBreakdown(state);
+    updateHistory(state);
+    renderHistory();
+    const issueList = Array.isArray(state.issues) ? state.issues : [];
+    const issuePreview = issueList.length > 0 ? issueList.slice(0, 2).join(', ') : 'none';
+    status.textContent = `Connected · issues=${issueList.length} (${issuePreview}) · refresh=${REFRESH_MS}ms`;
   } catch (e) {
     status.textContent = `Disconnected · start server on :8000 (${String(e.message || e)})`;
   }

samplematerial/prevalidation.sh CHANGED Viewed

@@ -62,6 +62,30 @@ portable_mktemp() {
   mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
 }
 CLEANUP_FILES=()
 cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
 trap cleanup EXIT
@@ -157,14 +181,16 @@ fi
 log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
-if ! command -v openenv &>/dev/null; then
   fail "openenv command not found"
-  hint "Install it: pip install openenv-core"
   stop_at "Step 3"
 fi
 VALIDATE_OK=false
-VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
 if [ "$VALIDATE_OK" = true ]; then
   pass "openenv validate passed"

   mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
 }
+resolve_openenv() {
+  # Print the path of an openenv executable and return 0, or return 1 if none is found.
+  # PATH is consulted first; otherwise the venv locations under and next to REPO_DIR are tried in order.
+  if command -v openenv &>/dev/null; then
+    command -v openenv
+    return 0
+  fi
+  local candidate
+  for candidate in \
+    "$REPO_DIR/../.venv/Scripts/openenv.exe" \
+    "$REPO_DIR/.venv/Scripts/openenv.exe" \
+    "$REPO_DIR/../.venv/bin/openenv" \
+    "$REPO_DIR/.venv/bin/openenv"; do
+    [ -x "$candidate" ] || continue
+    printf "%s\n" "$candidate"
+    return 0
+  done
+  return 1
+}
 CLEANUP_FILES=()
 cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
 trap cleanup EXIT
 log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+OPENENV_BIN=""
+if ! OPENENV_BIN="$(resolve_openenv)"; then
   fail "openenv command not found"
+  hint "Install it in your active env: pip install openenv-core"
+  hint "Or activate your project venv before running this script."
   stop_at "Step 3"
 fi
 VALIDATE_OK=false
+VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$OPENENV_BIN" validate 2>&1) && VALIDATE_OK=true
 if [ "$VALIDATE_OK" = true ]; then
   pass "openenv validate passed"

samplematerial/sampleinference.py CHANGED Viewed

@@ -1,187 +1,29 @@
-"""
-Inference Script Example
-===================================
-MANDATORY
-- Before submitting, ensure the following variables are defined in your environment configuration:
-    API_BASE_URL   The API endpoint for the LLM.
-    MODEL_NAME     The model identifier to use for inference.
-    HF_TOKEN       Your Hugging Face / API key.
-    LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
-                     method
-- Defaults are set only for API_BASE_URL and MODEL_NAME
-    (and should reflect your active inference setup):
-    API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
-    MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
-- The inference script must be named `inference.py` and placed in the root directory of the project
-- Participants must use OpenAI Client for all LLM calls using above variables
-STDOUT FORMAT
-- The script must emit exactly three line types to stdout, in this order:
-    [START] task=<task_name> env=<benchmark> model=<model_name>
-    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
-    [END]   success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
-  Rules:
-    - One [START] line at episode begin.
-    - One [STEP] line per step, immediately after env.step() returns.
-    - One [END] line after env.close(), always emitted (even on exception).
-    - reward and rewards are formatted to 2 decimal places.
-    - done and success are lowercase booleans: true or false.
-    - error is the raw last_action_error string, or null if none.
-    - All fields on a single line with no newlines within a line.
-  Example:
-    [START] task=click-test env=miniwob model=Qwen3-VL-30B
-    [STEP] step=1 action=click('123') reward=0.00 done=false error=null
-    [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
-    [STEP] step=3 action=click('789') reward=1.00 done=true error=null
-    [END] success=true steps=3 rewards=0.00,0.00,1.00
 """
-import asyncio
-import os
-import textwrap
-from typing import List, Optional
-from openai import OpenAI
-from my_env_v4 import MyEnvV4Action, MyEnvV4Env
-IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
-API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
-API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
-MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
-TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
-BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
-MAX_STEPS = 8
-TEMPERATURE = 0.7
-MAX_TOKENS = 150
-SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
-# Max possible reward: each token contributes 0.1, across all steps
-_MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
-MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
-SYSTEM_PROMPT = textwrap.dedent(
-    """
-    You are interacting with a simple echo environment.
-    Each turn you must send a message. The environment will echo it back.
-    Reward is proportional to message length: reward = len(message) * 0.1
-    Your goal is to maximize total reward by sending meaningful, substantive messages.
-    Reply with exactly one message string — no quotes, no prefixes, just the message text.
-    """
-).strip()
-def log_start(task: str, env: str, model: str) -> None:
-    print(f"[START] task={task} env={env} model={model}", flush=True)
-def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
-    error_val = error if error else "null"
-    done_val = str(done).lower()
-    print(
-        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
-        flush=True,
-    )
-def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
-    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
-def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
-    history_block = "\n".join(history[-4:]) if history else "None"
-    return textwrap.dedent(
-        f"""
-        Step: {step}
-        Last echoed message: {last_echoed!r}
-        Last reward: {last_reward:.2f}
-        Previous steps:
-        {history_block}
-        Send your next message.
-        """
-    ).strip()
-def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
-    user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
-    try:
-        completion = client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": user_prompt},
-            ],
-            temperature=TEMPERATURE,
-            max_tokens=MAX_TOKENS,
-            stream=False,
-        )
-        text = (completion.choices[0].message.content or "").strip()
-        return text if text else "hello"
-    except Exception as exc:
-        print(f"[DEBUG] Model request failed: {exc}", flush=True)
-        return "hello"
-async def main() -> None:
-    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
-    env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
-    history: List[str] = []
-    rewards: List[float] = []
-    steps_taken = 0
-    score = 0.0
-    success = False
-    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
-    try:
-        result = await env.reset() # OpenENV.reset()
-        last_echoed = result.observation.echoed_message
-        last_reward = 0.0
-        for step in range(1, MAX_STEPS + 1):
-            if result.done:
-                break
-            message = get_model_message(client, step, last_echoed, last_reward, history)
-            result = await env.step(MyEnvV4Action(message=message))
-            obs = result.observation
-            reward = result.reward or 0.0
-            done = result.done
-            error = None
-            rewards.append(reward)
-            steps_taken = step
-            last_echoed = obs.echoed_message
-            last_reward = reward
-            log_step(step=step, action=message, reward=reward, done=done, error=error)
-            history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
-            if done:
-                break
-        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
-        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
-        success = score >= SUCCESS_SCORE_THRESHOLD
-    finally:
-        try:
-            await env.close()
-        except Exception as e:
-            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
-        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 if __name__ == "__main__":
-    asyncio.run(main())

+"""Sample inference launcher for the 911 dispatch project.
+Use this file as a runnable reference from samplematerial.
+For submission, the authoritative script is the root-level inference.py.
 """
+from __future__ import annotations
+import asyncio
+import sys
+from pathlib import Path
+def _project_root() -> Path:
+    """Return the directory one level above the folder containing this file."""
+    here = Path(__file__).resolve()
+    return here.parent.parent
+def main() -> int:
+    """Put the project root on ``sys.path`` and run the root-level ``inference.main``.
+    Returns whatever the awaited ``inference.main()`` coroutine returns.
+    """
+    root_entry = str(_project_root())
+    if root_entry not in sys.path:
+        sys.path.insert(0, root_entry)
+    # Imported here, after the path tweak above, rather than at module import time.
+    from inference import main as run_root_inference
+    return asyncio.run(run_root_inference())
 if __name__ == "__main__":
+    raise SystemExit(main())

scripts/run_baseline_matrix.py ADDED Viewed

	@@ -0,0 +1,235 @@

+"""Run baseline inference matrix (random + Open LLM) and summarize variance.
+Usage examples:
+  python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 0
+  python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_report.json
+Environment variables:
+  API_BASE_URL, MODEL_NAME
+  OPENAI_API_KEY or HF_TOKEN (required when --llm-runs > 0)
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import re
+import statistics
+import subprocess
+import sys
+import time
+from dataclasses import asdict, dataclass
+from pathlib import Path
+# Matches a whole "[START] task=<t> env=<e> model=<m>" line; groups are task, env, model.
+START_RE = re.compile(r"^\[START\]\s+task=(\S+)\s+env=(\S+)\s+model=(\S+)$")
+# Matches a whole "[END] success=<true|false> steps=<n> score=<x> rewards=<...>" line;
+# groups are success, steps, score (unsigned decimal only) and the raw rewards text.
+END_RE = re.compile(
+    r"^\[END\]\s+success=(true|false)\s+steps=(\d+)\s+score=([0-9]*\.?[0-9]+)\s+rewards=(.*)$"
+)
+@dataclass
+class TaskEpisode:
+    """Outcome of one episode as parsed from an ``[END]`` line of inference stdout."""
+    # Task name from the preceding [START] line, or "unknown-<n>" when none was seen.
+    task_id: str
+    # True when the [END] line reported success=true.
+    success: bool
+    # Value of the steps= field.
+    steps: int
+    # Value of the score= field.
+    score: float
+@dataclass
+class RunResult:
+    """Result of one ``inference.py`` subprocess invocation."""
+    # Which baseline produced the run; callers in this file pass "random" or "llm".
+    lane: str
+    # 1-based index of the run within its lane.
+    run_index: int
+    # Elapsed time of the subprocess call, measured with time.monotonic().
+    runtime_seconds: float
+    # Episodes parsed from the subprocess stdout.
+    tasks: list[TaskEpisode]
+    # Exit status of the subprocess; non-zero is counted as a failed run.
+    return_code: int
+    # Captured stderr with surrounding whitespace stripped.
+    stderr: str
+def _project_root() -> Path:
+    """Return the directory one level above the folder containing this script."""
+    script_path = Path(__file__).resolve()
+    return script_path.parent.parent
+def _required_var(name: str) -> str:
+    """Return environment variable *name*; raise RuntimeError when it is unset or empty."""
+    value = os.environ.get(name)
+    if value:
+        return value
+    raise RuntimeError(f"Missing required environment variable: {name}")
+def _extract_task_episodes(stdout: str) -> list[TaskEpisode]:
+    """Parse inference stdout into one TaskEpisode per ``[END]`` line.
+    A ``[START]`` line names the task for the next ``[END]`` line. An ``[END]`` line
+    with no preceding ``[START]`` is labelled ``unknown-<n>``, where n is its 1-based
+    position among the parsed episodes. All other lines are ignored.
+    """
+    parsed: list[TaskEpisode] = []
+    pending_task: str | None = None
+    for raw_line in stdout.splitlines():
+        started = START_RE.match(raw_line)
+        if started is not None:
+            pending_task = started.group(1)
+            continue
+        ended = END_RE.match(raw_line)
+        if ended is None:
+            continue
+        success_text, steps_text, score_text = ended.group(1, 2, 3)
+        parsed.append(
+            TaskEpisode(
+                task_id=pending_task or f"unknown-{len(parsed) + 1}",
+                success=success_text == "true",
+                steps=int(steps_text),
+                score=float(score_text),
+            )
+        )
+        # Each [START] labels only the next [END].
+        pending_task = None
+    return parsed
+def _run_inference(lane: str, run_index: int, timeout_seconds: int) -> RunResult:
+    """Run ``inference.py`` once in a subprocess and collect its per-task results.
+    Args:
+        lane: ``"random"`` forces ``USE_RANDOM=true``; any other value runs the LLM agent.
+        run_index: Index of this run within its lane, copied into the result.
+        timeout_seconds: Wall-clock limit for the subprocess.
+    Returns:
+        A RunResult with the parsed episodes, exit status and stripped stderr. If the
+        subprocess exceeds the timeout, a RunResult with ``return_code=124`` is returned
+        containing whatever episodes were captured before the timeout.
+    Raises:
+        RuntimeError: If an LLM run is requested without OPENAI_API_KEY or HF_TOKEN.
+    """
+    env = os.environ.copy()
+    env.setdefault("API_BASE_URL", "https://api.openai.com/v1")
+    env.setdefault("MODEL_NAME", "baseline-model")
+    if lane == "random":
+        env["USE_RANDOM"] = "true"
+        # Placeholder key for the random lane when no real key is configured.
+        env.setdefault("OPENAI_API_KEY", "dummy-token")
+    else:
+        env["USE_RANDOM"] = "false"
+        if not (env.get("OPENAI_API_KEY") or env.get("HF_TOKEN")):
+            raise RuntimeError(
+                "OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
+            )
+    cmd = [sys.executable, "inference.py"]
+    started = time.monotonic()
+    try:
+        proc = subprocess.run(
+            cmd,
+            cwd=str(_project_root()),
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+            env=env,
+            timeout=timeout_seconds,
+        )
+    except subprocess.TimeoutExpired as exc:
+        # Report a timeout as a failed run instead of letting the exception escape
+        # and discard the results of the runs that already finished.
+        partial = exc.stdout or ""
+        if isinstance(partial, bytes):
+            # Captured output on TimeoutExpired may be bytes even with text=True.
+            partial = partial.decode("utf-8", errors="replace")
+        return RunResult(
+            lane=lane,
+            run_index=run_index,
+            runtime_seconds=time.monotonic() - started,
+            tasks=_extract_task_episodes(partial),
+            return_code=124,  # conventional "timed out" exit status
+            stderr=f"inference.py timed out after {timeout_seconds}s",
+        )
+    runtime = time.monotonic() - started
+    tasks = _extract_task_episodes(proc.stdout)
+    return RunResult(
+        lane=lane,
+        run_index=run_index,
+        runtime_seconds=runtime,
+        tasks=tasks,
+        return_code=proc.returncode,
+        stderr=proc.stderr.strip(),
+    )
+def _summarize(runs: list[RunResult]) -> dict[str, dict[str, float]]:
+    """Aggregate episode scores per task across *runs*.
+    Returns a dict keyed by task id (in sorted order) whose values hold ``runs``,
+    ``mean``, ``std`` (population standard deviation, 0.0 for a single score),
+    ``min`` and ``max``; all but ``runs`` are rounded to six decimals.
+    """
+    scores_by_task: dict[str, list[float]] = {}
+    for episode in (ep for run in runs for ep in run.tasks):
+        scores_by_task.setdefault(episode.task_id, []).append(episode.score)
+    return {
+        task_id: {
+            "runs": float(len(scores)),
+            "mean": round(statistics.mean(scores), 6),
+            "std": round(statistics.pstdev(scores), 6) if len(scores) > 1 else 0.0,
+            "min": round(min(scores), 6),
+            "max": round(max(scores), 6),
+        }
+        for task_id, scores in sorted(scores_by_task.items())
+    }
+def _print_summary(title: str, runs: list[RunResult]) -> None:
+    """Print a titled per-task score table, then total runtime and failed-run count."""
+    print(f"\n=== {title} ===")
+    if not runs:
+        print("No runs executed")
+        return
+    for task_id, stats in _summarize(runs).items():
+        print(
+            f"{task_id:16s} runs={int(stats['runs'])} "
+            f"mean={stats['mean']:.3f} std={stats['std']:.3f} "
+            f"min={stats['min']:.3f} max={stats['max']:.3f}"
+        )
+    total_runtime = sum(run.runtime_seconds for run in runs)
+    # A run counts as failed when its subprocess exit status is non-zero.
+    failed_count = sum(1 for run in runs if run.return_code != 0)
+    print(f"total_runtime_seconds={total_runtime:.2f}")
+    print(f"failed_runs={failed_count}")
+def _to_jsonable(runs: list[RunResult]) -> list[dict]:
+    serialized: list[dict] = []
+    for run in runs:
+        entry = asdict(run)
+        entry["tasks"] = [asdict(t) for t in run.tasks]
+        serialized.append(entry)
+    return serialized
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run baseline matrix for inference.py")
+    parser.add_argument("--random-runs", type=int, default=1)
+    parser.add_argument("--llm-runs", type=int, default=3)
+    parser.add_argument("--timeout-seconds", type=int, default=1200)
+    parser.add_argument("--output-json", type=str, default="")
+    args = parser.parse_args()
+    os.environ.setdefault("API_BASE_URL", "https://api.openai.com/v1")
+    os.environ.setdefault("MODEL_NAME", "nvidia/Nemotron-3-Super-49B-v1")
+    _required_var("API_BASE_URL")
+    _required_var("MODEL_NAME")
+    random_runs: list[RunResult] = []
+    llm_runs: list[RunResult] = []
+    try:
+        for idx in range(1, args.random_runs + 1):
+            print(f"Running random baseline {idx}/{args.random_runs}...")
+            random_runs.append(_run_inference("random", idx, args.timeout_seconds))
+        for idx in range(1, args.llm_runs + 1):
+            print(f"Running Open LLM baseline {idx}/{args.llm_runs}...")
+            llm_runs.append(_run_inference("llm", idx, args.timeout_seconds))
+    except RuntimeError as exc:
+        print(f"ERROR: {exc}")
+        return 1
+    _print_summary("Random Baseline", random_runs)
+    _print_summary("Open LLM Baseline", llm_runs)
+    all_runs = random_runs + llm_runs
+    if args.output_json:
+        report = {
+            "api_base_url": os.environ.get("API_BASE_URL", ""),
+            "model_name": os.environ.get("MODEL_NAME", ""),
+            "random_summary": _summarize(random_runs),
+            "llm_summary": _summarize(llm_runs),
+            "runs": _to_jsonable(all_runs),
+        }
+        out_path = Path(args.output_json)
+        out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
+        print(f"Wrote report to {out_path}")
+    failures = [r for r in all_runs if r.return_code != 0]
+    if failures:
+        print("\nOne or more runs failed:")
+        for run in failures:
+            print(f"- lane={run.lane} run={run.run_index} rc={run.return_code}")
+            if run.stderr:
+                print(run.stderr)
+        return 1
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/run_nemotron_baseline.ps1 ADDED Viewed

	@@ -0,0 +1,53 @@

+<#
+.SYNOPSIS
+Runs scripts/run_baseline_matrix.py from the repository root.
+.DESCRIPTION
+Exports API_BASE_URL and MODEL_NAME, picks a virtual-environment Python when
+one exists (otherwise "python" from PATH), and forwards the run counts,
+timeout and report path to the baseline matrix script. The caller's working
+directory is restored when the script finishes or fails.
+#>
+param(
+    [int]$RandomRuns = 1,
+    [int]$LlmRuns = 3,
+    [int]$TimeoutSeconds = 1200,
+    [string]$ApiBaseUrl = "https://api.openai.com/v1",
+    [string]$ModelName = "nvidia/Nemotron-3-Super-49B-v1",
+    [string]$OutputJson = "baseline_nemotron_report.json"
+)
+# With "Stop", every Write-Error below terminates the script.
+$ErrorActionPreference = "Stop"
+$repoRoot = Split-Path -Parent $PSScriptRoot
+# LLM runs need a credential; fail before doing any work.
+if ($LlmRuns -gt 0 -and -not $env:OPENAI_API_KEY -and -not $env:HF_TOKEN) {
+    Write-Error "Set OPENAI_API_KEY or HF_TOKEN before running this script."
+}
+$env:API_BASE_URL = $ApiBaseUrl
+$env:MODEL_NAME = $ModelName
+Write-Host "Running baseline matrix in $repoRoot"
+Write-Host "API_BASE_URL=$($env:API_BASE_URL)"
+Write-Host "MODEL_NAME=$($env:MODEL_NAME)"
+Write-Host "RandomRuns=$RandomRuns LlmRuns=$LlmRuns TimeoutSeconds=$TimeoutSeconds"
+# Prefer a virtual environment in the repo, then one in its parent directory.
+$candidatePython = @(
+    (Join-Path $repoRoot ".venv/Scripts/python.exe"),
+    (Join-Path (Split-Path -Parent $repoRoot) ".venv/Scripts/python.exe")
+)
+$python = $null
+foreach ($candidate in $candidatePython) {
+    if (Test-Path $candidate) {
+        $python = $candidate
+        break
+    }
+}
+if (-not $python) {
+    $python = "python"
+}
+# Push/Pop instead of Set-Location so the caller's working directory is
+# restored even when the run fails.
+Push-Location $repoRoot
+try {
+    & $python scripts/run_baseline_matrix.py `
+        --random-runs $RandomRuns `
+        --llm-runs $LlmRuns `
+        --timeout-seconds $TimeoutSeconds `
+        --output-json $OutputJson
+    if ($LASTEXITCODE -ne 0) {
+        Write-Error "Baseline matrix run failed"
+    }
+    Write-Host "Done. Report written to $OutputJson"
+}
+finally {
+    Pop-Location
+}

tests/test_baseline_matrix.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""Unit tests for scripts/run_baseline_matrix.py helpers."""
+from __future__ import annotations
+import importlib.util
+from pathlib import Path
+import sys
+import pytest
+# Load scripts/run_baseline_matrix.py directly from its file path and expose it
+# to the tests below as ``baseline``.
+SCRIPT_PATH = Path(__file__).resolve().parents[1] / "scripts" / "run_baseline_matrix.py"
+SPEC = importlib.util.spec_from_file_location("run_baseline_matrix", SCRIPT_PATH)
+assert SPEC and SPEC.loader
+baseline = importlib.util.module_from_spec(SPEC)
+# Registered in sys.modules before execution; presumably so code that looks up
+# its own module during import (e.g. dataclass processing) can find it - confirm.
+sys.modules[SPEC.name] = baseline
+SPEC.loader.exec_module(baseline)
+def test_extract_task_episodes_parses_start_end_pairs() -> None:
+    stdout = "\n".join(
+        [
+            "[START] task=single_incident env=citywide-dispatch-supervisor model=test-model",
+            "[STEP] step=1 action=WAIT reward=0.00 done=false error=null",
+            "[END] success=true steps=20 score=0.300 rewards=0.00,0.10",
+            "[START] task=multi_incident env=citywide-dispatch-supervisor model=test-model",
+            "[END] success=true steps=40 score=0.700 rewards=0.10,0.20",
+        ]
+    )
+    episodes = baseline._extract_task_episodes(stdout)
+    assert len(episodes) == 2
+    assert episodes[0].task_id == "single_incident"
+    assert episodes[0].success is True
+    assert episodes[0].steps == 20
+    assert episodes[0].score == pytest.approx(0.3)
+    assert episodes[1].task_id == "multi_incident"
+    assert episodes[1].steps == 40
+    assert episodes[1].score == pytest.approx(0.7)
+def test_extract_task_episodes_falls_back_to_unknown_task() -> None:
+    stdout = "[END] success=false steps=0 score=0.000 rewards=0.00"
+    episodes = baseline._extract_task_episodes(stdout)
+    assert len(episodes) == 1
+    assert episodes[0].task_id == "unknown-1"
+    assert episodes[0].success is False
+def test_summarize_computes_mean_and_std() -> None:
+    runs = [
+        baseline.RunResult(
+            lane="random",
+            run_index=1,
+            runtime_seconds=1.0,
+            tasks=[baseline.TaskEpisode("single_incident", True, 20, 0.2)],
+            return_code=0,
+            stderr="",
+        ),
+        baseline.RunResult(
+            lane="random",
+            run_index=2,
+            runtime_seconds=1.1,
+            tasks=[baseline.TaskEpisode("single_incident", True, 20, 0.4)],
+            return_code=0,
+            stderr="",
+        ),
+    ]
+    summary = baseline._summarize(runs)
+    assert summary["single_incident"]["runs"] == 2.0
+    assert summary["single_incident"]["mean"] == pytest.approx(0.3)
+    assert summary["single_incident"]["std"] == pytest.approx(0.1)
+    assert summary["single_incident"]["min"] == pytest.approx(0.2)
+    assert summary["single_incident"]["max"] == pytest.approx(0.4)
+def test_to_jsonable_serializes_runs() -> None:
+    runs = [
+        baseline.RunResult(
+            lane="llm",
+            run_index=1,
+            runtime_seconds=3.2,
+            tasks=[baseline.TaskEpisode("mass_casualty", True, 59, 0.742)],
+            return_code=0,
+            stderr="",
+        )
+    ]
+    payload = baseline._to_jsonable(runs)
+    assert payload[0]["lane"] == "llm"
+    assert payload[0]["tasks"][0]["task_id"] == "mass_casualty"
+    assert payload[0]["tasks"][0]["score"] == pytest.approx(0.742)

tests/test_openenv_integration.py CHANGED Viewed

@@ -141,3 +141,32 @@ class TestTasksEndpoint:
             "mass_casualty",
             "shift_surge",
         }

             "mass_casualty",
             "shift_surge",
         }
+class TestDashboardEndpoint:
+    def test_dashboard_state_before_reset_returns_valid_shape(self) -> None:
+        c = TestClient(server_app.app)
+        response = c.get("/dashboard/state")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["task_id"] == "none"
+        assert data["step_count"] == 0
+        assert isinstance(data["units"], dict)
+        assert isinstance(data["incidents"], dict)
+        assert isinstance(data["legal_actions"], list)
+        assert isinstance(data["issues"], list)
+        assert data["observation"] is None
+    def test_dashboard_state_after_reset_exposes_legal_actions(self) -> None:
+        c = TestClient(server_app.app)
+        reset_response = c.post("/reset", json={"task_id": "single_incident", "seed": 42})
+        assert reset_response.status_code == 200
+        response = c.get("/dashboard/state")
+        assert response.status_code == 200
+        data = response.json()
+        assert data["task_id"] == "single_incident"
+        assert isinstance(data["legal_actions"], list)
+        assert data["observation"] is not None

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff

validate_local.py CHANGED Viewed

@@ -4,7 +4,9 @@
 from __future__ import annotations
 import subprocess
 import sys
 def run_command(
@@ -14,7 +16,18 @@ def run_command(
     print(f"CHECK: {description}")
     print(f"CMD: {' '.join(cmd)}")
     print(f"{'=' * 60}")
-    result = subprocess.run(cmd, capture_output=True, text=True)
     if result.stdout:
         print(result.stdout)
     if result.stderr:
@@ -26,10 +39,33 @@ def run_command(
     return result
 def check_pytest() -> bool:
-    result = run_command(
-        ["uv", "run", "python", "-m", "pytest", "tests/", "-q"], "All tests pass"
-    )
     return result.returncode == 0
@@ -44,9 +80,11 @@ def check_inference() -> bool:
     print("\nNOTE: Running inference.py in random-agent mode for local validation")
     result = subprocess.run(
-        ["uv", "run", "python", "inference.py"],
         capture_output=True,
         text=True,
         env=env,
         timeout=300,
     )
@@ -68,6 +106,11 @@ def check_inference() -> bool:
 def check_docker_build() -> bool:
     result = run_command(
         ["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
         "Docker build succeeds",
@@ -76,6 +119,18 @@ def check_docker_build() -> bool:
     return result.returncode == 0
 def check_benchmark_scores() -> bool:
     from src.benchmark import list_tasks, run_task
@@ -109,6 +164,7 @@ def main() -> int:
         ("pytest", check_pytest),
         ("inference", check_inference),
         ("docker_build", check_docker_build),
         ("benchmark_scores", check_benchmark_scores),
     ]

 from __future__ import annotations
 import subprocess
+import shutil
 import sys
+from pathlib import Path
 def run_command(
     print(f"CHECK: {description}")
     print(f"CMD: {' '.join(cmd)}")
     print(f"{'=' * 60}")
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except FileNotFoundError as exc:
+        print(f"FAILED: {description}")
+        print(f"ERROR: command not found: {cmd[0]}")
+        return subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(exc))
     if result.stdout:
         print(result.stdout)
     if result.stderr:
     return result
+def _tool_path(name: str) -> str | None:
+    """Resolve tool path from PATH or current interpreter's Scripts directory."""
+    found = shutil.which(name)
+    if found:
+        return found
+    scripts_dir = Path(sys.executable).resolve().parent
+    candidates = [
+        scripts_dir / name,
+        scripts_dir / f"{name}.exe",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            return str(candidate)
+    return None
+def _python_cmd(*args: str) -> list[str]:
+    """Build a Python command, preferring uv when available."""
+    uv = _tool_path("uv")
+    if uv:
+        return [uv, "run", "python", *args]
+    return [sys.executable, *args]
 def check_pytest() -> bool:
+    """Run pytest quietly over ``tests/`` and return True when the command exits with 0."""
+    result = run_command(_python_cmd("-m", "pytest", "tests/", "-q"), "All tests pass")
     return result.returncode == 0
     print("\nNOTE: Running inference.py in random-agent mode for local validation")
     result = subprocess.run(
+        _python_cmd("inference.py"),
         capture_output=True,
         text=True,
+        encoding="utf-8",
+        errors="replace",
         env=env,
         timeout=300,
     )
 def check_docker_build() -> bool:
+    if not shutil.which("docker"):
+        print("FAILED: Docker build succeeds")
+        print("ERROR: docker command not found")
+        return False
     result = run_command(
         ["docker", "build", "-t", "citywide-dispatch-supervisor", "."],
         "Docker build succeeds",
     return result.returncode == 0
+def check_openenv_validate() -> bool:
+    openenv = _tool_path("openenv")
+    if not openenv:
+        print("FAILED: openenv validate passes")
+        print("ERROR: openenv command not found")
+        print("HINT: Install with: pip install openenv-core")
+        return False
+    result = run_command([openenv, "validate"], "openenv validate passes", check=False)
+    return result.returncode == 0
 def check_benchmark_scores() -> bool:
     from src.benchmark import list_tasks, run_task
         ("pytest", check_pytest),
         ("inference", check_inference),
         ("docker_build", check_docker_build),
+        ("openenv_validate", check_openenv_validate),
         ("benchmark_scores", check_benchmark_scores),
     ]