Spaces:

Eshit
/

Wildfire-Containment-Simulator

Sleeping

App Files Files Community

Eshit committed on Apr 25

Commit

ad92ece

1 Parent(s): 363abf3

Improve wildfire metrics and training assets

Browse files

Files changed (14) hide show

env/models.py +12 -2
env/serialization.py +26 -6
env/wildfire_env.py +75 -17
frontend/app.js +120 -24
frontend/index.html +26 -6
frontend/style.css +14 -2
graders/grader_easy.py +1 -1
graders/grader_hard.py +1 -1
graders/grader_medium.py +1 -1
scripts/generate_sft_data.py +183 -0
scripts/results.json +65 -95
training/grpo_v2_colab.ipynb +678 -0
training/sft_colab.ipynb +387 -0
training/sft_data.jsonl +0 -0

env/models.py CHANGED Viewed

@@ -288,7 +288,13 @@ class ClusterStats(BaseModel):
     cells_saved: int = 0
     population_threatened: int = 0
     population_lost: int = 0
     containment_pct: float = Field(ge=0.0, le=100.0, default=0.0)
     current_step: int = 0
     max_steps: int = 100
     firebreaks_built: int = 0
@@ -342,6 +348,7 @@ class TierConfig(BaseModel):
     crew_loss_step: Optional[int] = None
     crew_loss_id: Optional[str] = None
     tanker_cooldown: int = 5
     wind_speed_init: float = 10.0
     wind_dir_init: float = 0.0
     humidity_init: float = 40.0
@@ -367,11 +374,12 @@ TIER_EASY = TierConfig(
     firebreak_budget=15,
     recon_budget=0,
     episode_length=80,
-    num_ignition_points=1,
     enable_smoke_occlusion=False,
     enable_sensor_noise=False,
     enable_fog_of_war=False,
     enable_wind_shifts=False,
     wind_speed_init=10.0,
     wind_dir_init=0.0,
     humidity_init=40.0,
@@ -391,11 +399,12 @@ TIER_MEDIUM = TierConfig(
     firebreak_budget=20,
     recon_budget=1,
     episode_length=150,
-    num_ignition_points=2,
     enable_smoke_occlusion=True,
     enable_sensor_noise=True,
     enable_fog_of_war=False,
     enable_wind_shifts=True,
     wind_speed_init=15.0,
     wind_dir_init=45.0,
     humidity_init=35.0,
@@ -417,6 +426,7 @@ TIER_HARD = TierConfig(
     episode_length=300,
     num_ignition_points=3,
     staggered_ignition_step=30,
     enable_smoke_occlusion=True,
     enable_sensor_noise=True,
     enable_fog_of_war=True,

     cells_saved: int = 0
     population_threatened: int = 0
     population_lost: int = 0
+    total_population: int = Field(ge=0, default=0, description="Initial population (for UI % civ safe)")
     containment_pct: float = Field(ge=0.0, le=100.0, default=0.0)
+    # Meaningful progress metrics shown to agent and display
+    area_saved_pct: float = Field(ge=0.0, le=100.0, default=100.0,
+        description="Percentage of burnable land not yet burned")
+    civilians_saved_pct: float = Field(ge=0.0, le=100.0, default=100.0,
+        description="Percentage of civilians in unburned zones")
     current_step: int = 0
     max_steps: int = 100
     firebreaks_built: int = 0
     crew_loss_step: Optional[int] = None
     crew_loss_id: Optional[str] = None
     tanker_cooldown: int = 5
+    min_active_steps: int = 5   # episode cannot end via fire-out before this step
     wind_speed_init: float = 10.0
     wind_dir_init: float = 0.0
     humidity_init: float = 40.0
     firebreak_budget=15,
     recon_budget=0,
     episode_length=80,
+    num_ignition_points=2,
     enable_smoke_occlusion=False,
     enable_sensor_noise=False,
     enable_fog_of_war=False,
     enable_wind_shifts=False,
+    min_active_steps=25,
     wind_speed_init=10.0,
     wind_dir_init=0.0,
     humidity_init=40.0,
     firebreak_budget=20,
     recon_budget=1,
     episode_length=150,
+    num_ignition_points=3,
     enable_smoke_occlusion=True,
     enable_sensor_noise=True,
     enable_fog_of_war=False,
     enable_wind_shifts=True,
+    min_active_steps=45,
     wind_speed_init=15.0,
     wind_dir_init=45.0,
     humidity_init=35.0,
     episode_length=300,
     num_ignition_points=3,
     staggered_ignition_step=30,
+    min_active_steps=80,
     enable_smoke_occlusion=True,
     enable_sensor_noise=True,
     enable_fog_of_war=True,

env/serialization.py CHANGED Viewed

@@ -12,8 +12,14 @@ if TYPE_CHECKING:
 from .models import FireState, IntensityBin
-def serialize_observation(obs: "Observation", step_num: int, max_steps: int) -> str:
-    situation = _format_situation(obs)
     grid_summary = _summarize_grid_regions(obs.grid)
     resources = _format_resources(obs.resources)
     events = _format_events(obs.recent_events)
@@ -29,8 +35,9 @@ def serialize_observation(obs: "Observation", step_num: int, max_steps: int) ->
         parts.append(obs._briefing_reminder)
         parts.append("")
     parts += [
-        f"=== WILDFIRE INCIDENT COMMAND — STEP {step_num}/{max_steps} ===",
         "",
         "SITUATION:",
         situation,
@@ -52,12 +59,14 @@ def serialize_observation(obs: "Observation", step_num: int, max_steps: int) ->
 # ── Situation block ──────────────────────────────────────────
-def _format_situation(obs: "Observation") -> str:
     stats = obs.stats
     w = obs.weather
     burning = stats.cells_burning
-    containment = round(stats.containment_pct, 1)
     pop_at_risk = stats.population_threatened
     wind_dir = _deg_to_compass(w.wind_direction_deg)
@@ -65,8 +74,19 @@ def _format_situation(obs: "Observation") -> str:
     last_event = obs.recent_events[-1] if obs.recent_events else "None"
     lines = [
-        f"- Fire active on {burning} cells. Containment: {containment}%. Population at risk: {pop_at_risk} zones.",
         f"- Wind: {w.wind_speed_kmh:.0f} km/h {wind_dir} (±5 km/h noise). Humidity: {w.humidity_pct:.0f}%. Rain: {rain}.",
         f"- Last event: {last_event}",
     ]

 from .models import FireState, IntensityBin
+def serialize_observation(
+    obs: "Observation",
+    step_num: int,
+    max_steps: int,
+    tier: str = "",
+    prev_cells_burning: int = 0,
+) -> str:
+    situation = _format_situation(obs, prev_cells_burning)
     grid_summary = _summarize_grid_regions(obs.grid)
     resources = _format_resources(obs.resources)
     events = _format_events(obs.recent_events)
         parts.append(obs._briefing_reminder)
         parts.append("")
+    tier_str = f" [{tier.upper()}]" if tier else ""
     parts += [
+        f"=== WILDFIRE INCIDENT COMMAND{tier_str} — STEP {step_num}/{max_steps} ===",
         "",
         "SITUATION:",
         situation,
 # ── Situation block ──────────────────────────────────────────
+def _format_situation(obs: "Observation", prev_cells_burning: int = 0) -> str:
     stats = obs.stats
     w = obs.weather
     burning = stats.cells_burning
+    land_saved = round(stats.area_saved_pct, 1)
+    civ_safe = round(stats.civilians_saved_pct, 1)
+    cells_burned = stats.cells_burned
     pop_at_risk = stats.population_threatened
     wind_dir = _deg_to_compass(w.wind_direction_deg)
     last_event = obs.recent_events[-1] if obs.recent_events else "None"
+    # Spread delta — positive means fire is growing, negative means shrinking
+    delta = burning - prev_cells_burning
+    if delta > 0:
+        spread_str = f" (+{delta} spreading)"
+    elif delta < 0:
+        spread_str = f" ({delta} shrinking)"
+    else:
+        spread_str = " (stable)"
     lines = [
+        f"- Fire active on {burning} cells{spread_str}. Land saved: {land_saved}% of burnable area "
+        f"({cells_burned} cells burned out). Civilians safe: {civ_safe}%. "
+        f"Population at risk: {pop_at_risk} zones.",
         f"- Wind: {w.wind_speed_kmh:.0f} km/h {wind_dir} (±5 km/h noise). Humidity: {w.humidity_pct:.0f}%. Rain: {rain}.",
         f"- Last event: {last_event}",
     ]

env/wildfire_env.py CHANGED Viewed

@@ -212,6 +212,16 @@ class WildfireEnv:
         self.current_step += 1
         # ── Step 9: Compute reward ──
         legacy_reward = self.reward_calc.compute_reward(self.grid, self.resources, self.current_step)
@@ -324,19 +334,39 @@ class WildfireEnv:
         }
     def _is_redundant(self, action: Action) -> bool:
-        """True if action repeats the same type + target coords as the previous action."""
         if self._prev_action is None:
             return False
         prev = self._prev_action
         if action.action_type != prev.action_type:
             return False
-        return action.target_row == prev.target_row and action.target_col == prev.target_col
     def _ignite_initial_fires(self) -> None:
         """Place initial fire ignition points based on tier config.
         Ignition candidates are shifted away from populated cells to ensure
         a minimum survivable distance, reducing unwinnable-scenario variance.
         """
         rows, cols = self.config.grid_rows, self.config.grid_cols
@@ -344,19 +374,25 @@ class WildfireEnv:
         min_pop_dist = {"easy": 4, "medium": 6, "hard": 7}.get(self.config.tier_name, 5)
         if self.config.tier_name == "easy":
-            r, c = self._find_ignition_candidate(rows // 2, cols // 2, min_pop_dist)
-            self.grid.ignite_cell(r, c, intensity=0.3)
         elif self.config.tier_name == "medium":
-            r1, c1 = self._find_ignition_candidate(rows // 3, cols // 3, min_pop_dist)
-            self.grid.ignite_cell(r1, c1, intensity=0.3)
             r2, c2 = self._find_ignition_candidate(2 * rows // 3, 2 * cols // 3, min_pop_dist)
-            self.grid.ignite_cell(r2, c2, intensity=0.3)
         else:
-            # Two initial points (third comes later via staggered ignition)
             r1, c1 = self._find_ignition_candidate(rows // 4, cols // 4, min_pop_dist)
-            self.grid.ignite_cell(r1, c1, intensity=0.3)
             r2, c2 = self._find_ignition_candidate(rows // 2, 3 * cols // 4, min_pop_dist)
-            self.grid.ignite_cell(r2, c2, intensity=0.3)
     def _find_ignition_candidate(self, target_r: int, target_c: int, min_pop_dist: int) -> tuple[int, int]:
         """Return the nearest valid ignition cell to (target_r, target_c) that is at
@@ -482,11 +518,17 @@ class WildfireEnv:
         # Fire fully contained (no burning cells)
         burning = self.grid.count_by_state(FireState.BURNING)
         ember = self.grid.count_by_state(FireState.EMBER)
-        if burning == 0 and ember == 0 and self.current_step > 1:
-            # Don't end on step 0-1 (fire just started)
-            if not (self.config.staggered_ignition_step
                     and self.current_step < self.config.staggered_ignition_step):
-                return True
         # All populated zones burned (catastrophic failure)
         total_pop = self.grid.get_total_population()
@@ -514,13 +556,29 @@ class WildfireEnv:
         resource_state = self.resources.get_resource_state()
         # Stats
         stats = ClusterStats(
-            cells_burned=self.grid.get_burned_count(),
             cells_burning=self.grid.count_by_state(FireState.BURNING),
-            cells_saved=self.grid.get_total_burnable() - self.grid.get_burned_count() - self.grid.count_by_state(FireState.BURNING),
             population_threatened=self._count_threatened_population(),
-            population_lost=self.grid.get_population_lost(),
             containment_pct=self._compute_containment_pct(),
             current_step=self.current_step,
             max_steps=self.config.episode_length,
             firebreaks_built=self.resources.total_firebreaks_built,

         self.current_step += 1
+        # Log a hold-message when fire is extinguished before min_active_steps so
+        # agents (and the LLM) understand the episode must continue for monitoring.
+        burning_now = (self.grid.count_by_state(FireState.BURNING)
+                       + self.grid.count_by_state(FireState.EMBER))
+        if burning_now == 0 and self.current_step < self.config.min_active_steps:
+            step_events.append(
+                f"All fires contained. Holding perimeter until step "
+                f"{self.config.min_active_steps} (min_active_steps)."
+            )
         # ── Step 9: Compute reward ──
         legacy_reward = self.reward_calc.compute_reward(self.grid, self.resources, self.current_step)
         }
     def _is_redundant(self, action: Action) -> bool:
+        """True if action is a meaningless repeat of the previous action.
+        Actions that use target coordinates (DROP_RETARDANT, DEPLOY_CREW, RECON_FLIGHT)
+        are redundant when the type + target cell match.  Directional actions (MOVE_CREW,
+        BUILD_FIREBREAK) require the same crew_id AND direction to be redundant — two
+        consecutive MOVE_CREW steps by different crews, or in different directions, are
+        valid patrol behaviour and must not be penalised.
+        """
         # No previous action has been recorded, so nothing can be a repeat.
         if self._prev_action is None:
             return False
         prev = self._prev_action
         # A change of action type is never treated as redundant.
         if action.action_type != prev.action_type:
             return False
+        # Coordinate-targeted actions: redundant when same cell is targeted again
+        if action.target_row is not None or prev.target_row is not None:
+            return (action.target_row == prev.target_row
+                    and action.target_col == prev.target_col)
+        # Crew directional actions: redundant only when same crew moves same direction
+        if action.crew_id is not None:
+            return (action.crew_id == prev.crew_id
+                    and action.direction == prev.direction)
         # Same type, but no target row on either action and no crew_id on this
         # one: such repeats are not flagged.
+        return False
     def _ignite_initial_fires(self) -> None:
         """Place initial fire ignition points based on tier config.
         Ignition candidates are shifted away from populated cells to ensure
         a minimum survivable distance, reducing unwinnable-scenario variance.
+        Intensity is set high enough (0.65) that a single tanker drop (-0.4)
+        leaves residual fire (0.25) so the episode cannot be solved in 1-2
+        steps. The fire must spread, be actively managed, and burn for at
+        least min_active_steps before the episode can end.
         """
         rows, cols = self.config.grid_rows, self.config.grid_cols
         # Per-tier distance argument handed to _find_ignition_candidate;
         # tier names not listed in the mapping fall back to 5.
         min_pop_dist = {"easy": 4, "medium": 6, "hard": 7}.get(self.config.tier_name, 5)
         # Each ignition is a "find candidate near an anchor, then ignite" pair;
         # the anchors are fixed fractions of the grid dimensions.
         if self.config.tier_name == "easy":
+            # Two ignition points spread across the grid so crews must split
+            r1, c1 = self._find_ignition_candidate(rows // 2, cols // 3, min_pop_dist)
+            self.grid.ignite_cell(r1, c1, intensity=0.65)
+            r2, c2 = self._find_ignition_candidate(rows // 2, 2 * cols // 3, min_pop_dist)
+            self.grid.ignite_cell(r2, c2, intensity=0.65)
         elif self.config.tier_name == "medium":
+            # Three ignition points: forces genuine multi-front management
+            r1, c1 = self._find_ignition_candidate(rows // 4, cols // 3, min_pop_dist)
+            self.grid.ignite_cell(r1, c1, intensity=0.65)
             r2, c2 = self._find_ignition_candidate(2 * rows // 3, 2 * cols // 3, min_pop_dist)
+            self.grid.ignite_cell(r2, c2, intensity=0.65)
+            r3, c3 = self._find_ignition_candidate(rows // 2, cols // 2, min_pop_dist)
+            self.grid.ignite_cell(r3, c3, intensity=0.65)
         else:
             # Any tier name other than "easy" or "medium" takes this branch.
+            # Two initial points (third comes later via staggered ignition at step 30)
             r1, c1 = self._find_ignition_candidate(rows // 4, cols // 4, min_pop_dist)
+            self.grid.ignite_cell(r1, c1, intensity=0.65)
             r2, c2 = self._find_ignition_candidate(rows // 2, 3 * cols // 4, min_pop_dist)
+            self.grid.ignite_cell(r2, c2, intensity=0.65)
     def _find_ignition_candidate(self, target_r: int, target_c: int, min_pop_dist: int) -> tuple[int, int]:
         """Return the nearest valid ignition cell to (target_r, target_c) that is at
         # Fire fully contained (no burning cells)
         burning = self.grid.count_by_state(FireState.BURNING)
         ember = self.grid.count_by_state(FireState.EMBER)
+        if burning == 0 and ember == 0:
+            # Enforce minimum active steps — prevents trivial 1-2 step episodes
+            # where a single tanker drop or natural burnout ends the episode
+            # before the agent has taken any meaningful sequence of actions.
+            if self.current_step < self.config.min_active_steps:
+                return False
+            # Don't terminate before staggered ignition fires (hard tier)
+            if (self.config.staggered_ignition_step
                     and self.current_step < self.config.staggered_ignition_step):
+                return False
+            return True
         # All populated zones burned (catastrophic failure)
         total_pop = self.grid.get_total_population()
         resource_state = self.resources.get_resource_state()
         # Stats
+        total_burnable = self.grid.get_total_burnable()
+        cells_burned = self.grid.get_burned_count()
+        total_pop = self.grid.get_total_population()
+        pop_lost = self.grid.get_population_lost()
+        area_saved_pct = round(
+            100.0 * (total_burnable - cells_burned) / total_burnable, 1
+        ) if total_burnable > 0 else 100.0
+        civilians_saved_pct = round(
+            100.0 * (total_pop - pop_lost) / total_pop, 1
+        ) if total_pop > 0 else 100.0
         stats = ClusterStats(
+            cells_burned=cells_burned,
             cells_burning=self.grid.count_by_state(FireState.BURNING),
+            cells_saved=total_burnable - cells_burned - self.grid.count_by_state(FireState.BURNING),
             population_threatened=self._count_threatened_population(),
+            population_lost=pop_lost,
+            total_population=total_pop,
             containment_pct=self._compute_containment_pct(),
+            area_saved_pct=area_saved_pct,
+            civilians_saved_pct=civilians_saved_pct,
             current_step=self.current_step,
             max_steps=self.config.episode_length,
             firebreaks_built=self.resources.total_firebreaks_built,

frontend/app.js CHANGED Viewed

@@ -11,6 +11,69 @@
 "use strict";
 // ── Simulation state ──────────────────────────────────────────────────────────
 const sim = {
   obs: null,            // current Observation (agent's view)
@@ -161,17 +224,28 @@ function renderCanvas(obs, groundTruth = null) {
 }
 // ── Stats panel ───────────────────────────────────────────────────────────────
-function updateStats(stats, cumulativeReward, lastStepReward) {
-  if (!stats) return;
-  const cur = stats.current_step ?? 0;
-  const max = stats.max_steps ?? 1;
-  setText("stat-step",             `${cur} / ${max}`);
-  setText("stat-containment-val",  `${(stats.containment_pct ?? 0).toFixed(1)}%`);
-  setText("stat-burning-val",      stats.cells_burning ?? 0);
-  setText("stat-pop-threat-val",   stats.population_threatened ?? 0);
-  setText("stat-pop-lost-val",     stats.population_lost ?? 0);
   // Cumulative reward
   setText("reward-total", cumulativeReward.toFixed(3));
@@ -298,31 +372,53 @@ function updateActionLog(action) {
 }
 // ── Terminal overlay ──────────────────────────────────────────────────────────
-function showTerminal(obs) {
   const overlay = document.getElementById("terminal-overlay");
   if (!overlay) return;
-  const stats = obs?.stats ?? {};
-  const popLost = stats.population_lost ?? 0;
-  const containment = stats.containment_pct ?? 0;
   const card = document.getElementById("terminal-card");
   const title = card.querySelector("h2");
-  if (popLost === 0) {
-    title.textContent = "✅ FIRE CONTAINED";
     title.className = "win";
   } else {
     title.textContent = "⚠ EPISODE ENDED";
     title.className = "loss";
   }
-  setText("terminal-containment", `${containment.toFixed(1)}%`);
-  setText("terminal-pop-lost",    popLost);
-  setText("terminal-reward",      sim.cumulativeReward.toFixed(3));
-  setText("terminal-step",        stats.current_step ?? "—");
   overlay.classList.add("show");
 }
 function hideTerminal() {
@@ -356,7 +452,7 @@ async function apiGet(path) {
 function applyObservation(obs) {
   sim.obs = obs;
   renderCanvas(obs, sim.groundTruthData);
-  updateStats(obs.stats, sim.cumulativeReward, sim.lastStepReward);
   updateResources(obs.resources);
   updateWeather(obs.weather);
   updateEvents(obs.recent_events ?? []);
@@ -417,7 +513,7 @@ async function doAutoStep() {
       if (snap.done) {
         stopPlay();
-        showTerminal(snap.observation);
         break;
       }
     }

 "use strict";
+// ── API field helpers (snake_case from Python; tolerate camelCase if ever used) ─
 /**
  * Return the value stored under the first of `keys` that is an own property
  * of `obj` and is neither null nor undefined.
  *
  * @param {object|null|undefined} obj - Object to read from.
  * @param {...string} keys - Candidate property names, tried in order.
  * @returns {*} The first matching value, or undefined when `obj` is falsy
  *   or no candidate key holds a non-null value.
  */
+function pickStat(obj, ...keys) {
+  if (!obj) return undefined;
+  for (const k of keys) {
     // Own-property check via the prototype method; `!= null` rejects both
     // null and undefined so the next candidate key is tried.
+    if (Object.prototype.hasOwnProperty.call(obj, k) && obj[k] != null) {
+      return obj[k];
+    }
+  }
+  return undefined;
+}
+/**
+ * Build display-ready episode metrics from the latest observation.
+ * Falls back to grid-visible cells for land % only when server omits area_saved_pct.
+ */
+function normalizeEpisodeStats(obs) {
   // A missing observation or missing stats degrades to an empty object, so
   // every lookup below yields its default.
+  const st = obs?.stats ?? {};
+  const cellsBurned = pickStat(st, "cells_burned", "cellsBurned") ?? 0;
+  const popLost = pickStat(st, "population_lost", "populationLost") ?? 0;
+  const totalPop = pickStat(st, "total_population", "totalPopulation") ?? 0;
+  let areaSaved = pickStat(st, "area_saved_pct", "areaSavedPct");
+  let civSafe = pickStat(st, "civilians_saved_pct", "civiliansSavedPct");
   // Fallback 1 for land %: derive it from the cells present in obs.grid.
+  if (areaSaved == null && obs?.grid?.length) {
+    let burnable = 0;
+    let burnedVis = 0;
+    for (const row of obs.grid) {
+      for (const cell of row) {
+        const f = cell.fuel_type;
         // Cells with no fuel type, water and road are not counted as burnable.
+        if (!f || f === "water" || f === "road") continue;
         // Cells whose fire state is "unknown" are left out of both counts.
+        if (cell.fire_state === "unknown") continue;
+        burnable++;
+        if (cell.fire_state === "burned_out") burnedVis++;
+      }
+    }
+    if (burnable > 0) {
       // Percentage rounded to one decimal place.
+      areaSaved = Math.round(1000 * (burnable - burnedVis) / burnable) / 10;
+    }
+  }
   // Civilians %: computed from the population totals when the stat is absent;
   // with no total population and nothing lost it is reported as 100.
+  if (civSafe == null && totalPop > 0) {
+    civSafe = Math.round(1000 * (totalPop - popLost) / totalPop) / 10;
+  } else if (civSafe == null && popLost === 0) {
+    civSafe = 100.0;
+  }
   // Fallback 2 for land %: reuse containment_pct when nothing else was available.
+  const containment = pickStat(st, "containment_pct", "containmentPct");
+  if (areaSaved == null && containment != null) {
+    areaSaved = containment;
+  }
   // areaSaved and civSafe may still be undefined here; callers must handle that.
+  return {
+    areaSaved,
+    civSafe,
+    cellsBurned,
+    popLost,
+    totalPop,
+    currentStep: pickStat(st, "current_step", "currentStep"),
+    raw: st,
+  };
+}
 // ── Simulation state ──────────────────────────────────────────────────────────
 const sim = {
   obs: null,            // current Observation (agent's view)
 }
 // ── Stats panel ───────────────────────────────────────────────────────────────
+function updateStats(obs, cumulativeReward, lastStepReward) {
+  if (!obs?.stats) return;
+  const stats = obs.stats;
+  const cur = pickStat(stats, "current_step", "currentStep") ?? 0;
+  const max = pickStat(stats, "max_steps", "maxSteps") ?? 1;
+  setText("stat-step", `${cur} / ${max}`);
+  const n = normalizeEpisodeStats(obs);
+  setText(
+    "stat-land-saved-val",
+    n.areaSaved != null ? `${Number(n.areaSaved).toFixed(1)}%` : "—"
+  );
+  setText(
+    "stat-civilians-safe-val",
+    n.civSafe != null ? `${Number(n.civSafe).toFixed(1)}%` : "—"
+  );
+  setText("stat-cells-burned-val", n.cellsBurned);
+  setText("stat-burning-val", pickStat(stats, "cells_burning", "cellsBurning") ?? 0);
+  setText("stat-pop-threat-val", pickStat(stats, "population_threatened", "populationThreatened") ?? 0);
+  setText("stat-pop-lost-val", n.popLost);
   // Cumulative reward
   setText("reward-total", cumulativeReward.toFixed(3));
 }
 // ── Terminal overlay ──────────────────────────────────────────────────────────
+async function showTerminal() {
   const overlay = document.getElementById("terminal-overlay");
   if (!overlay) return;
   const card = document.getElementById("terminal-card");
+  if (!card) return;
+  const n = normalizeEpisodeStats(sim.obs);
   const title = card.querySelector("h2");
+  if (n.popLost === 0) {
+    title.textContent = "✅ EPISODE COMPLETE";
     title.className = "win";
   } else {
     title.textContent = "⚠ EPISODE ENDED";
     title.className = "loss";
   }
+  const landStr = n.areaSaved != null ? `${Number(n.areaSaved).toFixed(1)}%` : "—";
+  const civStr = n.civSafe != null ? `${Number(n.civSafe).toFixed(1)}%` : "—";
+  setText("terminal-land-saved", landStr);
+  setText("terminal-civilians-safe", civStr);
+  setText("terminal-cells-burned", String(n.cellsBurned));
+  setText("terminal-pop-lost", n.popLost);
+  setText("terminal-reward", sim.cumulativeReward.toFixed(3));
+  setText("terminal-step", n.currentStep ?? "—");
   overlay.classList.add("show");
+  // Authoritative end-game numbers (ground truth — fixes blank UI if observation JSON differed)
+  try {
+    const st = await apiGet("/state");
+    if (st.error) return;
+    const tb = st.total_burnable ?? 0;
+    const burned = st.cells_burned ?? 0;
+    const landPct = tb > 0 ? Math.round(1000 * (tb - burned) / tb) / 10 : 100;
+    const tp = st.total_population ?? 0;
+    const lost = st.population_lost ?? 0;
+    const civPct = tp > 0 ? Math.round(1000 * (tp - lost) / tp) / 10 : 100;
+    setText("terminal-land-saved", `${landPct}%`);
+    setText("terminal-civilians-safe", `${civPct}%`);
+    setText("terminal-cells-burned", String(burned));
+    setText("terminal-pop-lost", String(lost));
+    setText("terminal-step", st.current_step ?? "—");
+  } catch (e) {
+    console.warn("Could not refresh end-game stats from /state", e);
+  }
 }
 function hideTerminal() {
 function applyObservation(obs) {
   sim.obs = obs;
   renderCanvas(obs, sim.groundTruthData);
+  updateStats(obs, sim.cumulativeReward, sim.lastStepReward);
   updateResources(obs.resources);
   updateWeather(obs.weather);
   updateEvents(obs.recent_events ?? []);
       if (snap.done) {
         stopPlay();
+        await showTerminal();
         break;
       }
     }

frontend/index.html CHANGED Viewed

@@ -83,8 +83,16 @@
         <div id="terminal-card">
           <h2 class="win">✅ FIRE CONTAINED</h2>
           <div class="stat-row">
-            <span>Containment</span>
-            <span id="terminal-containment">—</span>
           </div>
           <div class="stat-row">
             <span>Population lost</span>
@@ -104,6 +112,10 @@
         </div>
       </div>
     </div>
   </main>
   <!-- Sidebar -->
@@ -117,9 +129,17 @@
           <span class="stat-label">STEP</span>
           <span class="stat-value" id="stat-step">— / —</span>
         </div>
-        <div class="stat-item" id="stat-containment">
-          <span class="stat-label">CONTAINMENT</span>
-          <span class="stat-value" id="stat-containment-val">—</span>
         </div>
         <div class="stat-item" id="stat-burning">
           <span class="stat-label">BURNING</span>
@@ -274,6 +294,6 @@
   </span>
 </footer>
-<script src="app.js"></script>
 </body>
 </html>

         <div id="terminal-card">
           <h2 class="win">✅ FIRE CONTAINED</h2>
           <div class="stat-row">
+            <span>Land saved (unburned)</span>
+            <span id="terminal-land-saved">—</span>
+          </div>
+          <div class="stat-row">
+            <span>Civilians safe</span>
+            <span id="terminal-civilians-safe">—</span>
+          </div>
+          <div class="stat-row">
+            <span>Cells burned (total)</span>
+            <span id="terminal-cells-burned">—</span>
           </div>
           <div class="stat-row">
             <span>Population lost</span>
         </div>
       </div>
     </div>
+    <p id="map-legend" class="map-legend">
+      <strong>Map:</strong> green dot / circle = ground crew · blue outline = populated zone ·
+      bright blue cells = water · grey = roads
+    </p>
   </main>
   <!-- Sidebar -->
           <span class="stat-label">STEP</span>
           <span class="stat-value" id="stat-step">— / —</span>
         </div>
+        <div class="stat-item" id="stat-land-saved">
+          <span class="stat-label">LAND SAVED</span>
+          <span class="stat-value" id="stat-land-saved-val">—</span>
+        </div>
+        <div class="stat-item" id="stat-civilians-safe">
+          <span class="stat-label">CIVILIANS SAFE</span>
+          <span class="stat-value" id="stat-civilians-safe-val">—</span>
+        </div>
+        <div class="stat-item" id="stat-cells-burned">
+          <span class="stat-label">CELLS BURNED</span>
+          <span class="stat-value" id="stat-cells-burned-val">—</span>
         </div>
         <div class="stat-item" id="stat-burning">
           <span class="stat-label">BURNING</span>
   </span>
 </footer>
+<script src="app.js?v=4"></script>
 </body>
 </html>

frontend/style.css CHANGED Viewed

@@ -250,6 +250,16 @@ input[type="range"]::-webkit-slider-thumb {
 #grid-canvas { display: block; image-rendering: pixelated; }
 /* Tooltip overlay (shows cell info on hover) */
 #cell-tooltip {
   position: absolute;
@@ -356,8 +366,10 @@ input[type="range"]::-webkit-slider-thumb {
 .stat-item.step-item { grid-column: 1 / -1; }
 .stat-item.step-item .stat-value { font-size: 14px; }
-#stat-containment .stat-value { color: var(--safe); }
-#stat-burning     .stat-value { color: var(--fire); }
 #stat-pop-threat  .stat-value { color: var(--warn); }
 #stat-pop-lost    .stat-value { color: var(--crit); }

 #grid-canvas { display: block; image-rendering: pixelated; }
+.map-legend {
+  margin: 8px 0 0;
+  padding: 6px 10px;
+  font-size: 11px;
+  color: var(--text-muted);
+  line-height: 1.45;
+  max-width: 100%;
+}
+.map-legend strong { color: var(--text); }
 /* Tooltip overlay (shows cell info on hover) */
 #cell-tooltip {
   position: absolute;
 .stat-item.step-item { grid-column: 1 / -1; }
 .stat-item.step-item .stat-value { font-size: 14px; }
+#stat-land-saved     .stat-value { color: var(--safe); }
+#stat-civilians-safe .stat-value { color: var(--safe); }
+#stat-cells-burned   .stat-value { color: var(--warn); }
+#stat-burning        .stat-value { color: var(--fire); }
 #stat-pop-threat  .stat-value { color: var(--warn); }
 #stat-pop-lost    .stat-value { color: var(--crit); }

graders/grader_easy.py CHANGED Viewed

@@ -27,7 +27,7 @@ def grade(agent, seed: int = 42):
     details = {
         "total_reward": round(total_reward, 4),
-        "containment_pct": round(final.get("containment_pct", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

     details = {
         "total_reward": round(total_reward, 4),
+        "containment_pct": round(final.get("reward_breakdown", {}).get("containment", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

graders/grader_hard.py CHANGED Viewed

@@ -27,7 +27,7 @@ def grade(agent, seed: int = 42):
     details = {
         "total_reward": round(total_reward, 4),
-        "containment_pct": round(final.get("containment_pct", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

     details = {
         "total_reward": round(total_reward, 4),
+        "containment_pct": round(final.get("reward_breakdown", {}).get("containment", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

graders/grader_medium.py CHANGED Viewed

@@ -27,7 +27,7 @@ def grade(agent, seed: int = 42):
     details = {
         "total_reward": round(total_reward, 4),
-        "containment_pct": round(final.get("containment_pct", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

     details = {
         "total_reward": round(total_reward, 4),
+        "containment_pct": round(final.get("reward_breakdown", {}).get("containment", 0.0), 4),
         "pop_saved_pct": round(1.0 - pop_lost / total_pop, 4),
         "steps": env.current_step,
         "crew_casualty": env._crew_casualty_occurred,

scripts/generate_sft_data.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""
+Generate supervised fine-tuning (SFT) training examples by running the
+HeuristicAgent through episodes and recording (prompt, action) pairs.
+Usage:
+    python scripts/generate_sft_data.py
+    python scripts/generate_sft_data.py --output training/sft_data.jsonl --easy-seeds 500
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import random
+import sys
+from pathlib import Path
+PROJECT_ROOT = str(Path(__file__).resolve().parent.parent)
+sys.path.insert(0, PROJECT_ROOT)
+from env.wildfire_env import WildfireEnv
+from env.serialization import serialize_observation
+from env.models import TIER_EASY, TIER_MEDIUM, TIER_HARD, ActionType
+from agents.heuristic_agent import HeuristicAgent
+# System message placed first in the "messages" list of every recorded example.
+SYSTEM_PROMPT = (
+    "You are an AI Incident Commander managing wildfire containment. "
+    "You will receive a situation briefing each step. "
+    "Respond with ONLY a valid JSON action object and nothing else. "
+    'Example: {"action_type": "idle"}'
+)
+# Per-tier generation settings:
+#   max_steps - the tier's configured episode length; passed to
+#               serialize_observation and used to bound the random recording
+#               offset in run_episode.
+#   target    - number of examples generate() tries to collect for the tier.
+TIER_CONFIGS = {
+    "easy":   {"max_steps": TIER_EASY.episode_length,   "target": 2000},
+    "medium": {"max_steps": TIER_MEDIUM.episode_length, "target": 1500},
+    "hard":   {"max_steps": TIER_HARD.episode_length,   "target": 800},
+}
+def run_episode(tier: str, seed: int) -> list[dict] | None:
+    """Roll out one HeuristicAgent episode and collect training examples.
+    Recording starts at a randomly drawn step offset, so the earliest steps of
+    an episode may be skipped. Each example holds the chat messages (system
+    prompt + serialized observation), the agent's action as JSON, and the
+    tier/seed/step/action_type bookkeeping fields.
+    Returns the list of examples, or None when the final env state reports a
+    non-zero "population_lost".
+    """
+    step_limit = TIER_CONFIGS[tier]["max_steps"]
+    env = WildfireEnv()
+    obs = env.reset(task_id=tier, seed=seed)
+    agent = HeuristicAgent()
+    # Offset is capped at 30 steps or a quarter of the episode, whichever is smaller.
+    first_recorded_step = random.randint(0, min(30, step_limit // 4))
+    burning_last_step = 0
+    examples: list[dict] = []
+    step_idx = 0
+    while not env.done:
+        action = agent.act(obs)
+        if step_idx >= first_recorded_step:
+            briefing = serialize_observation(
+                obs, step_idx, step_limit,
+                tier=tier, prev_cells_burning=burning_last_step,
+            )
+            chat = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": briefing},
+            ]
+            examples.append({
+                "messages": chat,
+                "completion": action.model_dump_json(exclude_none=True),
+                "tier": tier,
+                "seed": seed,
+                "step": step_idx,
+                "action_type": action.action_type.value,
+            })
+        # Remember the pre-step burning count for the next step's briefing.
+        burning_last_step = obs.stats.cells_burning
+        obs = env.step(action).observation
+        step_idx += 1
+    lost = env.state()["population_lost"]
+    return None if lost != 0 else examples
+def filter_idle(records: list[dict], idle_ratio: float = 0.25) -> list[dict]:
+    """Keep every non-IDLE record and down-sample the IDLE ones.
+    At most ``max(1, int(len(non_idle) * idle_ratio))`` IDLE records are kept,
+    chosen at random. With the default ratio of 0.25 (one IDLE per four
+    non-IDLE records) IDLE examples make up at most 20% of the output once an
+    episode has four or more non-IDLE records.
+    Args:
+        records: Episode records carrying "action_type" and "step" keys.
+        idle_ratio: Maximum number of IDLE records kept per non-IDLE record.
+    Returns:
+        A new list ordered by "step". If the episode contains no non-IDLE
+        records, the IDLE records are returned uncapped.
+    Raises:
+        ValueError: If idle_ratio is negative.
+    """
+    if idle_ratio < 0:
+        raise ValueError(f"idle_ratio must be non-negative, got {idle_ratio}")
+    non_idle = [r for r in records if r["action_type"] != "idle"]
+    idle = [r for r in records if r["action_type"] == "idle"]
+    if not non_idle:
+        # Nothing to balance against: keep the episode's IDLE steps as they are.
+        return idle
+    max_idle = max(1, int(len(non_idle) * idle_ratio))
+    if len(idle) > max_idle:
+        # sample() draws without replacement and leaves the source list untouched.
+        idle = random.sample(idle, max_idle)
+    return sorted(non_idle + idle, key=lambda r: r["step"])
+def strip_internal_fields(records: list[dict]) -> list[dict]:
+    """Delete the "action_type" bookkeeping key from each record in place.
+    Records without the key are left untouched; the same list object is returned.
+    """
+    for record in records:
+        if "action_type" in record:
+            del record["action_type"]
+    return records
+def generate(output_path: str, max_seeds: dict[str, int]) -> None:
+    """Collect examples tier by tier and write them to a JSONL file.
+    For each tier, seeds 0, 1, 2, ... are played until the tier's "target"
+    example count is reached or max_seeds[tier] seeds have been tried.
+    Episodes for which run_episode returns None are skipped; the others go
+    through filter_idle, are truncated to the remaining quota, and have their
+    helper fields stripped. Everything is written at the end, one JSON object
+    per line, followed by a printed per-tier summary.
+    """
+    out_dir = os.path.dirname(output_path) or "."
+    os.makedirs(out_dir, exist_ok=True)
+    banner = "=" * 50
+    collected: list[dict] = []
+    tier_counts = dict.fromkeys(TIER_CONFIGS, 0)
+    for tier in ["easy", "medium", "hard"]:
+        target = TIER_CONFIGS[tier]["target"]
+        seed_limit = max_seeds[tier]
+        seed = 0
+        print(f"\n{banner}")
+        print(f"Generating {tier} tier  (target={target}, max_seeds={seed_limit})")
+        print(banner)
+        while tier_counts[tier] < target and seed < seed_limit:
+            episode = run_episode(tier, seed)
+            if episode is not None:
+                # Never exceed the tier quota: keep only as many as still fit.
+                quota_left = target - tier_counts[tier]
+                kept = filter_idle(episode)[:quota_left]
+                collected.extend(strip_internal_fields(kept))
+                tier_counts[tier] += len(kept)
+            seed += 1
+            if seed % 50 == 0:
+                print(f"  [{tier}] seed={seed}, examples={tier_counts[tier]}/{target}")
+        print(f"  [{tier}] DONE — {tier_counts[tier]} examples from {seed} seeds")
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in collected)
+    total = len(collected)
+    print(f"\n{banner}")
+    print(f"SFT data saved to {output_path}")
+    print(f"Total examples: {total}")
+    print("Tier distribution:")
+    for tier in ["easy", "medium", "hard"]:
+        print(f"  {tier}: {tier_counts[tier]}")
+    print(banner)
+def main():
+    """Parse command-line options and generate the SFT dataset.
+    Options:
+        --output: JSONL path to write.
+        --easy-seeds / --medium-seeds / --hard-seeds: max seeds tried per tier.
+        --seed: seed for Python's global ``random`` module, applied once
+            before generation so repeated runs produce the same dataset.
+    """
+    parser = argparse.ArgumentParser(description="Generate SFT training data from HeuristicAgent episodes")
+    parser.add_argument("--output", default="training/sft_data.jsonl",
+                        help="Output JSONL file path (default: training/sft_data.jsonl)")
+    parser.add_argument("--easy-seeds", type=int, default=500,
+                        help="Max seeds to try for easy tier")
+    parser.add_argument("--medium-seeds", type=int, default=500,
+                        help="Max seeds to try for medium tier")
+    parser.add_argument("--hard-seeds", type=int, default=500,
+                        help="Max seeds to try for hard tier")
+    parser.add_argument("--seed", type=int, default=42,
+                        help="Seed for Python's global RNG (default: 42)")
+    args = parser.parse_args()
+    # run_episode (recording offset) and filter_idle (IDLE down-sampling) both
+    # draw from the module-level RNG; seed it so the output is reproducible.
+    # NOTE(review): if WildfireEnv.reset() reseeds the global RNG itself, this
+    # only governs draws made before the first reset - confirm.
+    random.seed(args.seed)
+    max_seeds = {
+        "easy": args.easy_seeds,
+        "medium": args.medium_seeds,
+        "hard": args.hard_seeds,
+    }
+    generate(args.output, max_seeds)
+if __name__ == "__main__":
+    main()

scripts/results.json CHANGED Viewed

@@ -2,131 +2,101 @@
   "random": {
     "easy": {
       "scores": [
-        8.225,
-        8.35,
-        0.39,
-        8.35,
-        7.875,
-        8.25,
-        0.36,
-        8.35,
-        6.8251,
-        5.825
       ],
-      "mean": 6.28,
-      "std": 3.0546,
-      "mean_containment_pct": 0.0,
-      "mean_pop_saved_pct": 0.975,
-      "mean_steps": 16.1,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 0.097
     },
     "medium": {
       "scores": [
-        -1.1475,
-        8.3067,
-        8.0667,
-        7.84,
-        0.2919,
-        7.2,
-        8.3733,
-        8.3333,
-        -1.024,
-        -3.6238
       ],
-      "mean": 4.2617,
-      "std": 4.7,
-      "mean_containment_pct": 0.0,
-      "mean_pop_saved_pct": 0.9587,
-      "mean_steps": 32.2,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 0.468
     },
     "hard": {
       "scores": [
-        -7.6189,
-        -3.9186,
-        5.3,
-        5.2999,
-        -2.8187,
-        -2.9395,
-        -5.5375,
-        -1.5395,
-        5.3,
-        5.3
       ],
-      "mean": -0.3173,
-      "std": 4.8412,
-      "mean_containment_pct": 0.0,
-      "mean_pop_saved_pct": 0.9802,
-      "mean_steps": 44.7,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 1.298
     }
   },
   "heuristic": {
     "easy": {
       "scores": [
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35,
-        8.35
       ],
-      "mean": 8.35,
-      "std": 0.0,
-      "mean_containment_pct": 0.0,
       "mean_pop_saved_pct": 1.0,
-      "mean_steps": 2.0,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 0.021
     },
     "medium": {
       "scores": [
-        5.5,
-        8.3733,
-        8.3733,
-        8.3733,
-        8.3067,
-        7.94,
-        8.3733,
-        8.3733,
-        7.8467,
-        7.2933
       ],
-      "mean": 7.8753,
-      "std": 0.8609,
-      "mean_containment_pct": 0.0,
-      "mean_pop_saved_pct": 1.0,
-      "mean_steps": 11.6,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 0.214
     },
     "hard": {
       "scores": [
-        6.5001,
-        -5.5396,
-        6.0,
-        4.6468,
-        6.8,
-        5.8,
-        5.8,
-        4.8001,
-        5.6,
-        5.9
       ],
-      "mean": 4.6307,
-      "std": 3.4471,
-      "mean_containment_pct": 0.0,
-      "mean_pop_saved_pct": 0.9988,
-      "mean_steps": 41.4,
       "crew_casualty_rate": 0.0,
-      "mean_time_s": 1.384
     }
   }
 }

   "random": {
     "easy": {
       "scores": [
+        7.7749,
+        7.7751,
+        7.775,
+        7.775,
+        0.04
       ],
+      "mean": 6.228,
+      "std": 3.094,
+      "mean_containment_pct": 1.0,
+      "mean_pop_saved_pct": 0.92,
+      "mean_steps": 25.8,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 0.067
     },
     "medium": {
       "scores": [
+        -1.7044,
+        -1.0029,
+        1.0762,
+        0.7527,
+        7.4403
       ],
+      "mean": 1.3124,
+      "std": 3.2367,
+      "mean_containment_pct": 1.0,
+      "mean_pop_saved_pct": 0.7365,
+      "mean_steps": 72.0,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 0.676
     },
     "hard": {
       "scores": [
+        7.8668,
+        1.3602,
+        -0.7466,
+        1.0443,
+        1.2813
       ],
+      "mean": 2.1612,
+      "std": 2.9554,
+      "mean_containment_pct": 1.0,
+      "mean_pop_saved_pct": 0.9023,
+      "mean_steps": 84.6,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 1.301
     }
   },
   "heuristic": {
     "easy": {
       "scores": [
+        7.6749,
+        7.575,
+        7.475,
+        7.475,
+        7.4749
       ],
+      "mean": 7.535,
+      "std": 0.08,
+      "mean_containment_pct": 1.0,
       "mean_pop_saved_pct": 1.0,
+      "mean_steps": 26.6,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 0.118
     },
     "medium": {
       "scores": [
+        7.6001,
+        7.7001,
+        7.8,
+        7.7,
+        0.7683
       ],
+      "mean": 6.3137,
+      "std": 2.7734,
+      "mean_containment_pct": 1.0,
+      "mean_pop_saved_pct": 0.9746,
+      "mean_steps": 46.2,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 0.48
     },
     "hard": {
       "scores": [
+        7.8668,
+        7.867,
+        0.9443,
+        7.6667,
+        -0.6696
       ],
+      "mean": 4.735,
+      "std": 3.7892,
+      "mean_containment_pct": 1.0,
+      "mean_pop_saved_pct": 0.9279,
+      "mean_steps": 83.2,
       "crew_casualty_rate": 0.0,
+      "mean_time_s": 1.487
     }
   }
 }

training/grpo_v2_colab.ipynb ADDED Viewed

	@@ -0,0 +1,678 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Wildfire Incident Command - GRPO Training (v2)\n",
+        "\n",
+        "GRPO reinforcement learning on a wildfire incident command model, starting from the SFT checkpoint.\n",
+        "\n",
+        "**Five critical issues fixed in this version:**\n",
+        "1. Prompt/reward state mismatch - dataset uses step-0 prompts only; reward replays the exact (tier, seed)\n",
+        "2. Truncated rollout - reward runs full episode to completion (heuristic continuation), terminal reward always included\n",
+        "3. Wasted inner model generations - MODEL_STEPS=1, only the sampled completion is applied\n",
+        "4. GRPO loop too slow - consequence of fix 3\n",
+        "5. parse_action(text, None) crash - standalone check_json_format() for format reward\n",
+        "\n",
+        "**Hardware:** A100 40GB on Colab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 1 - Install and Assert GPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
+        "!pip install trl==0.15.2 datasets==3.4.1 wandb"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "assert torch.cuda.is_available(), \"GPU not available - switch to a GPU runtime\"\n",
+        "gpu_name = torch.cuda.get_device_name(0)\n",
+        "gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
+        "print(f\"GPU: {gpu_name}  |  VRAM: {gpu_mem:.1f} GB\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 2 - Load SFT Checkpoint"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from unsloth import FastLanguageModel\n",
+        "\n",
+        "# Option A: Load from HuggingFace Hub\n",
+        "SFT_MODEL = \"Eshit/wildfire-sft-7b\"\n",
+        "# Option B: Load from local zip (uncomment and adjust if needed)\n",
+        "# !unzip sft_final.zip -d sft_final_dir\n",
+        "# SFT_MODEL = \"./sft_final_dir/sft_final\"\n",
+        "\n",
+        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+        "    model_name=SFT_MODEL,\n",
+        "    max_seq_length=2048,\n",
+        "    load_in_4bit=True,\n",
+        ")\n",
+        "\n",
+        "if tokenizer.pad_token is None:\n",
+        "    tokenizer.pad_token = tokenizer.eos_token\n",
+        "\n",
+        "print(f\"Loaded SFT checkpoint: {SFT_MODEL}\")\n",
+        "model.print_trainable_parameters()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 3 - Constants and Controller Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os, random, json, sys\n",
+        "import torch\n",
+        "\n",
+        "REPO_ROOT = \".\"  # Adjust to repo root in Colab\n",
+        "if REPO_ROOT not in sys.path:\n",
+        "    sys.path.insert(0, REPO_ROOT)\n",
+        "\n",
+        "from env.wildfire_env import WildfireEnv\n",
+        "from env.serialization import serialize_observation\n",
+        "from env.action_parser import parse_action\n",
+        "from agents.heuristic_agent import HeuristicAgent\n",
+        "from env.curriculum import CurriculumController\n",
+        "from datasets import Dataset\n",
+        "\n",
+        "SEED_POOL = list(range(100))\n",
+        "TIER_MAX_STEPS = {'easy': 80, 'medium': 150, 'hard': 300}\n",
+        "SYSTEM_PROMPT = (\n",
+        "    'You are an AI Incident Commander managing wildfire containment. '\n",
+        "    'You will receive a situation briefing each step. '\n",
+        "    'Respond with ONLY a valid JSON action object and nothing else. '\n",
+        "    'Example: {\"action_type\": \"idle\"}'\n",
+        ")\n",
+        "\n",
+        "# Thresholds calibrated to full-episode reward with heuristic continuation.\n",
+        "# Promote easy->medium once model's first action consistently beats random (+6.23).\n",
+        "# Promote medium->hard once model demonstrates meaningful improvement over random (+1.31).\n",
+        "controller = CurriculumController(\n",
+        "    start_tier='easy',\n",
+        "    thresholds={'easy': 6.5, 'medium': 3.5},\n",
+        ")\n",
+        "\n",
+        "os.makedirs('training/samples', exist_ok=True)\n",
+        "_reward_call_count = 0\n",
+        "\n",
+        "print(f\"Start tier: {controller.get_tier()}\")\n",
+        "print(f\"Seed pool: {len(SEED_POOL)} seeds\")\n",
+        "print(\"Env imports OK\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 4 - Standalone JSON Format Checker\n",
+        "\n",
+        "Replaces parse_action for format reward - no obs object needed (Issue 5 fix)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import json as _json\n",
+        "import re as _re\n",
+        "from env.models import ActionType as _AT\n",
+        "\n",
+        "_VALID_ACTION_TYPES = {a.value for a in _AT}\n",
+        "\n",
+        "\n",
+        "def check_json_format(text: str) -> str:\n",
+        "    \"\"\"\n",
+        "    Validate LLM output format without needing an obs object.\n",
+        "    Returns 'json_success', 'regex_fallback', or 'safe_idle'.\n",
+        "    Does NOT use parse_action - avoids the obs.grid dependency.\n",
+        "    \"\"\"\n",
+        "    text = _re.sub(r'```(?:json)?\\s*', '', text).replace('```', '')\n",
+        "    start = text.find('{')\n",
+        "    if start == -1:\n",
+        "        return 'safe_idle'\n",
+        "    depth = 0\n",
+        "    end = -1\n",
+        "    for i, ch in enumerate(text[start:], start=start):\n",
+        "        if ch == '{':\n",
+        "            depth += 1\n",
+        "        elif ch == '}':\n",
+        "            depth -= 1\n",
+        "            if depth == 0:\n",
+        "                end = i\n",
+        "                break\n",
+        "    if end == -1:\n",
+        "        return 'safe_idle'\n",
+        "    try:\n",
+        "        obj = _json.loads(text[start:end+1])\n",
+        "        if not isinstance(obj, dict):\n",
+        "            return 'safe_idle'\n",
+        "        at = str(obj.get('action_type', '')).lower()\n",
+        "        if at in _VALID_ACTION_TYPES:\n",
+        "            return 'json_success'\n",
+        "        return 'regex_fallback'\n",
+        "    except Exception:\n",
+        "        return 'regex_fallback'\n",
+        "\n",
+        "\n",
+        "assert check_json_format('{\"action_type\": \"idle\"}') == 'json_success'\n",
+        "assert check_json_format('{\"action_type\": \"bogus\"}') == 'regex_fallback'\n",
+        "assert check_json_format('no json here') == 'safe_idle'\n",
+        "print('check_json_format OK')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 5 - Reward Functions\n",
+        "\n",
+        "Two reward signals for GRPO:\n",
+        "- **reward_fn_outcome** - full-episode env reward (1 model step + heuristic continuation)\n",
+        "- **reward_fn_format** - JSON formatting quality (fast, no env needed)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def reward_fn_outcome(completions, prompts, tier=None, seed=None, **kwargs):\n",
+        "    \"\"\"\n",
+        "    Score each GRPO completion by:\n",
+        "      1. Resetting the env to the EXACT (tier, seed) that generated the prompt (Issue 1 fix).\n",
+        "      2. Applying the sampled completion as the single first action (MODEL_STEPS=1, Issue 3/4 fix).\n",
+        "      3. Running HeuristicAgent until episode completion (Issue 2 fix - captures terminal reward).\n",
+        "\n",
+        "    tier and seed are dataset columns forwarded by GRPOTrainer.\n",
+        "    \"\"\"\n",
+        "    global _reward_call_count\n",
+        "    _reward_call_count += 1\n",
+        "    rewards = []\n",
+        "\n",
+        "    for i, completion in enumerate(completions):\n",
+        "        ep_tier = tier[i] if tier is not None else controller.get_tier()\n",
+        "        ep_seed = seed[i] if seed is not None else random.choice(SEED_POOL)\n",
+        "\n",
+        "        env = WildfireEnv()\n",
+        "        obs = env.reset(task_id=ep_tier, seed=ep_seed)\n",
+        "        total_reward = 0.0\n",
+        "\n",
+        "        # Apply the sampled completion as step 0\n",
+        "        text = completion if isinstance(completion, str) else completion[0]['content']\n",
+        "        action, _ = parse_action(text, obs)\n",
+        "        result = env.step(action)\n",
+        "        total_reward += result.reward\n",
+        "        obs = result.observation\n",
+        "\n",
+        "        # Heuristic drives everything after (full episode to capture terminal reward)\n",
+        "        heuristic = HeuristicAgent()\n",
+        "        while not env.done:\n",
+        "            action = heuristic.act(obs)\n",
+        "            result = env.step(action)\n",
+        "            total_reward += result.reward\n",
+        "            obs = result.observation\n",
+        "\n",
+        "        rewards.append(total_reward)\n",
+        "\n",
+        "    # Update curriculum (once per batch, not per completion)\n",
+        "    mean_r = sum(rewards) / len(rewards)\n",
+        "    promoted = controller.after_episode(mean_r)\n",
+        "    if promoted:\n",
+        "        print(f'  *** Curriculum promoted to: {promoted} (mean batch reward={mean_r:.2f}) ***')\n",
+        "\n",
+        "    # Sample completions to disk for inspection\n",
+        "    if _reward_call_count % 10 == 0:\n",
+        "        sample_path = f'training/samples/call_{_reward_call_count}.txt'\n",
+        "        with open(sample_path, 'w') as f:\n",
+        "            f.write(f'call={_reward_call_count}  tier={tier[0] if tier else \"?\"}  reward={rewards[0]:.3f}\\n')\n",
+        "            f.write('---\\n')\n",
+        "            c = completions[0]\n",
+        "            f.write(c if isinstance(c, str) else c[0]['content'])\n",
+        "            f.write('\\n')\n",
+        "\n",
+        "    return rewards\n",
+        "\n",
+        "\n",
+        "def reward_fn_format(completions, prompts, **kwargs):\n",
+        "    \"\"\"\n",
+        "    Scores JSON formatting quality using check_json_format() (no obs needed).\n",
+        "    Runs independently of the env - fast and always well-defined.\n",
+        "    \"\"\"\n",
+        "    rewards = []\n",
+        "    for completion in completions:\n",
+        "        text = completion if isinstance(completion, str) else completion[0]['content']\n",
+        "        status = check_json_format(text)\n",
+        "        if status == 'json_success':\n",
+        "            r = 0.15\n",
+        "        elif status == 'regex_fallback':\n",
+        "            r = 0.0\n",
+        "        else:\n",
+        "            r = -0.20\n",
+        "        rewards.append(r)\n",
+        "    return rewards\n",
+        "\n",
+        "\n",
+        "print('Reward functions defined.')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 6 - Dataset Builder (Step-0 Only)\n",
+        "\n",
+        "Each row stores the seed so reward_fn_outcome can replay the exact same env state.\n",
+        "No mid-episode offset - GRPO prompt and reward state are always step-0."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def build_prompt_dataset(n=200):\n",
+        "    \"\"\"\n",
+        "    Build step-0 prompts for the current curriculum tier.\n",
+        "    Stores the seed in each row so reward_fn can replay the exact same env state.\n",
+        "    \"\"\"\n",
+        "    rows = []\n",
+        "    env_tmp = WildfireEnv()\n",
+        "    tier = controller.get_tier()\n",
+        "    max_steps = TIER_MAX_STEPS[tier]\n",
+        "\n",
+        "    for i in range(n):\n",
+        "        seed = SEED_POOL[i % len(SEED_POOL)]\n",
+        "        obs = env_tmp.reset(task_id=tier, seed=seed)\n",
+        "        prompt = serialize_observation(obs, 0, max_steps, tier=tier, prev_cells_burning=0)\n",
+        "        rows.append({\n",
+        "            'prompt': [\n",
+        "                {'role': 'system', 'content': SYSTEM_PROMPT},\n",
+        "                {'role': 'user',   'content': prompt},\n",
+        "            ],\n",
+        "            'tier': tier,\n",
+        "            'seed': seed,\n",
+        "        })\n",
+        "    return rows\n",
+        "\n",
+        "\n",
+        "_test_ds = build_prompt_dataset(3)\n",
+        "print(f\"Sample dataset row keys: {list(_test_ds[0].keys())}\")\n",
+        "print(f\"Tier: {_test_ds[0]['tier']}, Seed: {_test_ds[0]['seed']}\")\n",
+        "print(f\"Prompt roles: {[m['role'] for m in _test_ds[0]['prompt']]}\")\n",
+        "del _test_ds"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 7 - CurriculumDatasetCallback\n",
+        "\n",
+        "Rebuilds the training dataset whenever the curriculum controller promotes to a new tier."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from transformers import TrainerCallback\n",
+        "\n",
+        "\n",
+        "class CurriculumDatasetCallback(TrainerCallback):\n",
+        "    def __init__(self, trainer_ref):\n",
+        "        self._trainer = trainer_ref\n",
+        "        self._last_tier = controller.get_tier()\n",
+        "\n",
+        "    def on_step_end(self, args, state, control, **kwargs):\n",
+        "        current_tier = controller.get_tier()\n",
+        "        if current_tier != self._last_tier:\n",
+        "            print(f'  Rebuilding dataset for tier: {current_tier}')\n",
+        "            new_ds = Dataset.from_list(build_prompt_dataset(200))\n",
+        "            self._trainer.train_dataset = new_ds\n",
+        "            self._last_tier = current_tier\n",
+        "\n",
+        "\n",
+        "print('CurriculumDatasetCallback defined.')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 8 - GRPOTrainer Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from trl import GRPOTrainer, GRPOConfig\n",
+        "\n",
+        "grpo_config = GRPOConfig(\n",
+        "    output_dir='./grpo_checkpoints',\n",
+        "    num_generations=8,\n",
+        "    learning_rate=3e-6,\n",
+        "    max_steps=400,\n",
+        "    save_steps=20,\n",
+        "    per_device_train_batch_size=1,\n",
+        "    gradient_accumulation_steps=4,\n",
+        "    max_completion_length=192,\n",
+        "    logging_steps=1,\n",
+        "    report_to='wandb',\n",
+        ")\n",
+        "\n",
+        "FastLanguageModel.for_training(model)\n",
+        "\n",
+        "dataset = Dataset.from_list(build_prompt_dataset(200))\n",
+        "print(f'Initial dataset: {len(dataset)} rows, tier={controller.get_tier()}')\n",
+        "\n",
+        "trainer = GRPOTrainer(\n",
+        "    model=model,\n",
+        "    processing_class=tokenizer,\n",
+        "    reward_funcs=[reward_fn_outcome, reward_fn_format],\n",
+        "    args=grpo_config,\n",
+        "    train_dataset=dataset,\n",
+        ")\n",
+        "trainer.add_callback(CurriculumDatasetCallback(trainer))\n",
+        "\n",
+        "print('GRPOTrainer ready.')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 9 - Run Training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import wandb\n",
+        "wandb.init(project='wildfire-grpo', name='qwen7b-v2')\n",
+        "\n",
+        "print(f'Starting GRPO - {grpo_config.max_steps} steps, {grpo_config.num_generations} gen/prompt')\n",
+        "print(f'Reward: 1 model step at step-0, heuristic continuation to episode completion')\n",
+        "print(f'Start tier: {controller.get_tier()}')\n",
+        "\n",
+        "trainer.train()\n",
+        "print('Training complete.')\n",
+        "\n",
+        "history = controller.get_history()\n",
+        "stats = [{'step': ep, 'tier': t, 'mean_reward': r} for ep, t, r in history]\n",
+        "with open('./training_stats.json', 'w') as f:\n",
+        "    json.dump(stats, f, indent=2)\n",
+        "print('Stats saved -> training_stats.json')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 10 - Evaluate vs Baselines\n",
+        "\n",
+        "Run 15 full episodes per tier (seeds 42-56), compare with heuristic and random baselines."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "class LLMAgent:\n",
+        "    \"\"\"Wraps the trained model for evaluation. Must be re-instantiated per episode.\"\"\"\n",
+        "\n",
+        "    def __init__(self, model, tokenizer, tier, max_steps):\n",
+        "        self.model = model\n",
+        "        self.tokenizer = tokenizer\n",
+        "        self.tier = tier\n",
+        "        self.max_steps = max_steps\n",
+        "        self._step = 0\n",
+        "        self._prev_burning = 0\n",
+        "        self.json_success = self.regex_fallback = self.safe_idle = 0\n",
+        "\n",
+        "    def act(self, obs):\n",
+        "        prompt = serialize_observation(\n",
+        "            obs, self._step, self.max_steps,\n",
+        "            tier=self.tier,\n",
+        "            prev_cells_burning=self._prev_burning,\n",
+        "        )\n",
+        "        self._prev_burning = obs.stats.cells_burning\n",
+        "        messages = [\n",
+        "            {'role': 'system', 'content': SYSTEM_PROMPT},\n",
+        "            {'role': 'user',   'content': prompt},\n",
+        "        ]\n",
+        "        input_ids = self.tokenizer.apply_chat_template(\n",
+        "            messages, tokenize=True,\n",
+        "            add_generation_prompt=True, return_tensors='pt',\n",
+        "        ).to(self.model.device)\n",
+        "        with torch.no_grad():\n",
+        "            out = self.model.generate(\n",
+        "                input_ids, max_new_tokens=128,\n",
+        "                pad_token_id=self.tokenizer.eos_token_id,\n",
+        "            )\n",
+        "        text = self.tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)\n",
+        "        action, status = parse_action(text, obs)\n",
+        "        if status == 'json_success':\n",
+        "            self.json_success += 1\n",
+        "        elif status == 'regex_fallback':\n",
+        "            self.regex_fallback += 1\n",
+        "        else:\n",
+        "            self.safe_idle += 1\n",
+        "        self._step += 1\n",
+        "        return action\n",
+        "\n",
+        "\n",
+        "print('LLMAgent class defined.')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "with open('scripts/results.json', 'r') as f:\n",
+        "    baselines = json.load(f)\n",
+        "\n",
+        "FastLanguageModel.for_inference(model)\n",
+        "\n",
+        "EVAL_SEEDS = list(range(42, 57))\n",
+        "TIERS = ['easy', 'medium', 'hard']\n",
+        "\n",
+        "results = {}\n",
+        "\n",
+        "for tier in TIERS:\n",
+        "    max_steps = TIER_MAX_STEPS[tier]\n",
+        "    tier_rewards = []\n",
+        "    tier_pop_saved = []\n",
+        "    tier_json_success = 0\n",
+        "    tier_total_actions = 0\n",
+        "\n",
+        "    print(f'\\nEvaluating {tier} tier...')\n",
+        "\n",
+        "    for seed in EVAL_SEEDS:\n",
+        "        agent = LLMAgent(model, tokenizer, tier, max_steps)\n",
+        "        env = WildfireEnv()\n",
+        "        obs = env.reset(task_id=tier, seed=seed)\n",
+        "        total_reward = 0.0\n",
+        "\n",
+        "        while not env.done:\n",
+        "            action = agent.act(obs)\n",
+        "            result = env.step(action)\n",
+        "            total_reward += result.reward\n",
+        "            obs = result.observation\n",
+        "\n",
+        "        tier_rewards.append(total_reward)\n",
+        "\n",
+        "        state = env.state()\n",
+        "        total_pop = state['total_population']\n",
+        "        pop_lost = state['population_lost']\n",
+        "        pop_saved = 100.0 * (total_pop - pop_lost) / total_pop if total_pop > 0 else 100.0\n",
+        "        tier_pop_saved.append(pop_saved)\n",
+        "\n",
+        "        tier_json_success += agent.json_success\n",
+        "        tier_total_actions += agent.json_success + agent.regex_fallback + agent.safe_idle\n",
+        "\n",
+        "        print(f'  seed={seed}: reward={total_reward:+.2f}, pop_saved={pop_saved:.0f}%')\n",
+        "\n",
+        "    json_rate = 100.0 * tier_json_success / tier_total_actions if tier_total_actions > 0 else 0\n",
+        "    results[tier] = {\n",
+        "        'mean': float(np.mean(tier_rewards)),\n",
+        "        'std': float(np.std(tier_rewards)),\n",
+        "        'pop_saved_pct': float(np.mean(tier_pop_saved)),\n",
+        "        'json_success_rate': json_rate,\n",
+        "    }\n",
+        "\n",
+        "print()\n",
+        "print('=' * 65)\n",
+        "print('=== Evaluation: Trained Model vs Baselines ===')\n",
+        "print('Seeds: 42-56  (15 per tier)')\n",
+        "print('=' * 65)\n",
+        "header = f'{\"Tier\":<10} {\"Trained\":>12} {\"Heuristic\":>12} {\"Random\":>12} {\"vs Heur.\":>12}'\n",
+        "print(header)\n",
+        "print('-' * 65)\n",
+        "\n",
+        "any_tier_close = False\n",
+        "for tier in TIERS:\n",
+        "    t = results[tier]\n",
+        "    h_mean = baselines['heuristic'][tier]['mean']\n",
+        "    h_std = baselines['heuristic'][tier]['std']\n",
+        "    r_mean = baselines['random'][tier]['mean']\n",
+        "    r_std = baselines['random'][tier]['std']\n",
+        "    delta = t['mean'] - h_mean\n",
+        "    marker = ' OK' if delta >= -1.0 else ''\n",
+        "    if delta >= -1.0:\n",
+        "        any_tier_close = True\n",
+        "    print(\n",
+        "        f'{tier:<10} '\n",
+        "        f'{t[\"mean\"]:+.2f}+/-{t[\"std\"]:.1f}  '\n",
+        "        f'{h_mean:+.2f}+/-{h_std:.1f}  '\n",
+        "        f'{r_mean:+.2f}+/-{r_std:.1f}  '\n",
+        "        f'{delta:+.2f}{marker}'\n",
+        "    )\n",
+        "\n",
+        "print()\n",
+        "print('JSON success rate:  ', end='')\n",
+        "print('  '.join(f'{t}={results[t][\"json_success_rate\"]:.1f}%' for t in TIERS))\n",
+        "print('Pop saved rate:     ', end='')\n",
+        "print('  '.join(f'{t}={results[t][\"pop_saved_pct\"]:.0f}%' for t in TIERS))\n",
+        "\n",
+        "assert any_tier_close, (\n",
+        "    'Trained model did not come within 1.0 of heuristic on any tier. '\n",
+        "    'Check training logs and sample completions.'\n",
+        ")\n",
+        "print('\\nPASS: At least one tier within 1.0 of heuristic baseline.')\n",
+        "\n",
+        "FastLanguageModel.for_training(model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 11 - Save and Push"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "model.save_pretrained('./grpo_final')\n",
+        "tokenizer.save_pretrained('./grpo_final')\n",
+        "print('Saved to ./grpo_final')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "HF_USERNAME = 'Eshit'  # <-- CHANGE THIS\n",
+        "model.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
+        "tokenizer.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
+        "print(f'Pushed to hub: {HF_USERNAME}/wildfire-grpo-7b')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!zip -r grpo_final.zip ./grpo_final\n",
+        "from google.colab import files\n",
+        "files.download('grpo_final.zip')\n",
+        "print('Download started.')"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "A100",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.10.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}

training/sft_colab.ipynb ADDED Viewed

	@@ -0,0 +1,387 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Wildfire Incident Command — SFT Training\n",
+        "\n",
+        "Supervised fine-tuning of **Qwen2.5-7B-Instruct** on wildfire incident command data.\n",
+        "\n",
+        "- **Input:** `training/sft_data.jsonl` (generated by `scripts/generate_sft_data.py`)\n",
+        "- **Goal:** Teach the model to output valid JSON action objects given wildfire observations\n",
+        "- **Hardware:** A100 40GB on Colab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 1 — Install Dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
+        "!pip install trl==0.15.2 datasets==3.4.1"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "assert torch.cuda.is_available(), \"GPU not available — switch to a GPU runtime\"\n",
+        "gpu_name = torch.cuda.get_device_name(0)\n",
+        "gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
+        "print(f\"GPU: {gpu_name}  |  VRAM: {gpu_mem:.1f} GB\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 2 — Load Model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from unsloth import FastLanguageModel\n",
+        "\n",
+        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+        "    model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
+        "    max_seq_length=2048,\n",
+        "    load_in_4bit=True,\n",
+        ")\n",
+        "\n",
+        "if tokenizer.pad_token is None:\n",
+        "    tokenizer.pad_token = tokenizer.eos_token\n",
+        "\n",
+        "model = FastLanguageModel.get_peft_model(\n",
+        "    model,\n",
+        "    r=32,\n",
+        "    lora_alpha=64,\n",
+        "    lora_dropout=0.05,\n",
+        "    target_modules=[\n",
+        "        'q_proj', 'k_proj', 'v_proj', 'o_proj',\n",
+        "        'gate_proj', 'up_proj', 'down_proj',\n",
+        "    ],\n",
+        "    bias=\"none\",\n",
+        "    use_gradient_checkpointing=\"unsloth\",\n",
+        ")\n",
+        "\n",
+        "print(\"Model loaded. Trainable params:\")\n",
+        "model.print_trainable_parameters()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 3 — Load Data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import json\n",
+        "from datasets import Dataset\n",
+        "from collections import Counter\n",
+        "\n",
+        "SFT_DATA_PATH = \"training/sft_data.jsonl\"\n",
+        "\n",
+        "raw_examples = []\n",
+        "with open(SFT_DATA_PATH, \"r\", encoding=\"utf-8\") as f:\n",
+        "    for line in f:\n",
+        "        raw_examples.append(json.loads(line))\n",
+        "\n",
+        "print(f\"Loaded {len(raw_examples)} raw examples\")\n",
+        "\n",
+        "tier_dist = Counter(ex[\"tier\"] for ex in raw_examples)\n",
+        "print(f\"Tier distribution: {dict(tier_dist)}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def format_example(ex):\n",
+        "    \"\"\"Format a single SFT example into a full conversation string for causal LM loss.\"\"\"\n",
+        "    messages = ex[\"messages\"]\n",
+        "    completion = ex[\"completion\"]\n",
+        "\n",
+        "    prompt_str = tokenizer.apply_chat_template(\n",
+        "        messages,\n",
+        "        tokenize=False,\n",
+        "        add_generation_prompt=True,\n",
+        "    )\n",
+        "    full_text = prompt_str + completion + tokenizer.eos_token\n",
+        "    return {\"text\": full_text}\n",
+        "\n",
+        "\n",
+        "formatted = [format_example(ex) for ex in raw_examples]\n",
+        "dataset = Dataset.from_list(formatted)\n",
+        "\n",
+        "split = dataset.train_test_split(test_size=0.05, seed=42)\n",
+        "train_dataset = split[\"train\"]\n",
+        "val_dataset = split[\"test\"]\n",
+        "\n",
+        "print(f\"Train: {len(train_dataset)}  |  Val: {len(val_dataset)}\")\n",
+        "print(f\"\\nSample (first 500 chars):\\n{formatted[0]['text'][:500]}\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 4 — Train"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from trl import SFTTrainer\n",
+        "from transformers import TrainingArguments\n",
+        "\n",
+        "trainer = SFTTrainer(\n",
+        "    model=model,\n",
+        "    tokenizer=tokenizer,\n",
+        "    train_dataset=train_dataset,\n",
+        "    eval_dataset=val_dataset,\n",
+        "    dataset_text_field=\"text\",\n",
+        "    max_seq_length=2048,\n",
+        "    packing=True,\n",
+        "    args=TrainingArguments(\n",
+        "        output_dir=\"./sft_checkpoints\",\n",
+        "        per_device_train_batch_size=2,\n",
+        "        gradient_accumulation_steps=4,\n",
+        "        num_train_epochs=1,\n",
+        "        learning_rate=2e-4,\n",
+        "        warmup_ratio=0.05,\n",
+        "        lr_scheduler_type=\"cosine\",\n",
+        "        logging_steps=10,\n",
+        "        save_steps=100,\n",
+        "        save_total_limit=2,\n",
+        "        eval_strategy=\"steps\",\n",
+        "        eval_steps=100,\n",
+        "        fp16=not torch.cuda.is_bf16_supported(),\n",
+        "        bf16=torch.cuda.is_bf16_supported(),\n",
+        "        report_to=\"none\",\n",
+        "        optim=\"adamw_8bit\",\n",
+        "        seed=42,\n",
+        "    ),\n",
+        ")\n",
+        "\n",
+        "print(\"Starting SFT training...\")\n",
+        "trainer.train()\n",
+        "print(\"SFT training complete.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 5 — Quick Eval\n",
+        "\n",
+        "Run 10 full episodes on the easy tier with the trained model driving every step.\n",
+        "Requires the repo's `env` package to be importable — clone or upload the repo first, then point `REPO_ROOT` (next cell) at its root."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import sys, os\n",
+        "\n",
+        "# Adjust this path to wherever the repo root is in Colab\n",
+        "REPO_ROOT = \".\"  # or e.g. \"/content/Wildfire-Containment-Simulator-main\"\n",
+        "if REPO_ROOT not in sys.path:\n",
+        "    sys.path.insert(0, REPO_ROOT)\n",
+        "\n",
+        "from env.wildfire_env import WildfireEnv\n",
+        "from env.serialization import serialize_observation\n",
+        "from env.action_parser import parse_action\n",
+        "from env.models import TIER_EASY\n",
+        "\n",
+        "SYSTEM_PROMPT = (\n",
+        "    \"You are an AI Incident Commander managing wildfire containment. \"\n",
+        "    \"You will receive a situation briefing each step. \"\n",
+        "    \"Respond with ONLY a valid JSON action object and nothing else. \"\n",
+        "    'Example: {\"action_type\": \"idle\"}'\n",
+        ")\n",
+        "\n",
+        "print(\"Env imports OK\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "\n",
+        "FastLanguageModel.for_inference(model)\n",
+        "\n",
+        "EVAL_SEEDS = range(42, 52)\n",
+        "TIER = \"easy\"\n",
+        "MAX_STEPS = TIER_EASY.episode_length\n",
+        "\n",
+        "rewards = []\n",
+        "pop_saved_pcts = []\n",
+        "parse_counts = {\"json_success\": 0, \"regex_fallback\": 0, \"safe_idle\": 0}\n",
+        "total_steps = 0\n",
+        "\n",
+        "for seed in EVAL_SEEDS:\n",
+        "    env = WildfireEnv()\n",
+        "    obs = env.reset(task_id=TIER, seed=seed)\n",
+        "    episode_reward = 0.0\n",
+        "    step_num = 0\n",
+        "    prev_burning = 0\n",
+        "\n",
+        "    while not env.done:\n",
+        "        prompt = serialize_observation(\n",
+        "            obs, step_num, MAX_STEPS,\n",
+        "            tier=TIER, prev_cells_burning=prev_burning,\n",
+        "        )\n",
+        "        prev_burning = obs.stats.cells_burning\n",
+        "\n",
+        "        messages = [\n",
+        "            {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+        "            {\"role\": \"user\", \"content\": prompt},\n",
+        "        ]\n",
+        "        input_ids = tokenizer.apply_chat_template(\n",
+        "            messages, tokenize=True,\n",
+        "            add_generation_prompt=True, return_tensors=\"pt\",\n",
+        "        ).to(model.device)\n",
+        "\n",
+        "        with torch.no_grad():\n",
+        "            out = model.generate(\n",
+        "                input_ids,\n",
+        "                max_new_tokens=128,\n",
+        "                pad_token_id=tokenizer.eos_token_id,\n",
+        "            )\n",
+        "        text = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)\n",
+        "\n",
+        "        action, status = parse_action(text, obs)\n",
+        "        parse_counts[status] = parse_counts.get(status, 0) + 1\n",
+        "\n",
+        "        result = env.step(action)\n",
+        "        episode_reward += result.reward\n",
+        "        obs = result.observation\n",
+        "        step_num += 1\n",
+        "\n",
+        "    total_steps += step_num\n",
+        "    rewards.append(episode_reward)\n",
+        "\n",
+        "    state = env.state()\n",
+        "    total_pop = state[\"total_population\"]\n",
+        "    pop_lost = state[\"population_lost\"]\n",
+        "    pop_saved = 100.0 * (total_pop - pop_lost) / total_pop if total_pop > 0 else 100.0\n",
+        "    pop_saved_pcts.append(pop_saved)\n",
+        "\n",
+        "    print(f\"  Seed {seed}: reward={episode_reward:+.2f}, steps={step_num}, pop_saved={pop_saved:.0f}%\")\n",
+        "\n",
+        "mean_reward = np.mean(rewards)\n",
+        "std_reward = np.std(rewards)\n",
+        "total_parses = sum(parse_counts.values())\n",
+        "json_rate = 100.0 * parse_counts[\"json_success\"] / total_parses if total_parses > 0 else 0\n",
+        "mean_pop = np.mean(pop_saved_pcts)\n",
+        "\n",
+        "print(f\"\\n{'='*50}\")\n",
+        "print(f\"SFT Quick Eval — {TIER} tier, seeds {EVAL_SEEDS.start}-{EVAL_SEEDS.stop-1}\")\n",
+        "print(f\"Mean reward:       {mean_reward:+.2f} ± {std_reward:.2f}\")\n",
+        "print(f\"JSON success rate: {json_rate:.1f}%\")\n",
+        "print(f\"Mean pop saved:    {mean_pop:.1f}%\")\n",
+        "print(f\"Parse breakdown:   {dict(parse_counts)}\")\n",
+        "print(f\"{'='*50}\")\n",
+        "\n",
+        "assert mean_reward > 2.0, (\n",
+        "    f\"SFT warm-up insufficient (mean_reward={mean_reward:.2f}) — do not proceed to GRPO\"\n",
+        ")\n",
+        "print(\"\\n✓ SFT checkpoint passes warm-up gate. Safe to proceed to GRPO.\")\n",
+        "\n",
+        "FastLanguageModel.for_training(model)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Section 6 — Save & Export"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "model.save_pretrained(\"./sft_final\")\n",
+        "tokenizer.save_pretrained(\"./sft_final\")\n",
+        "print(\"Saved to ./sft_final\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Push to HuggingFace Hub — replace with your username\n",
+        "HF_USERNAME = \"Eshit\"\n",
+        "model.push_to_hub(f\"{HF_USERNAME}/wildfire-sft-7b\")\n",
+        "tokenizer.push_to_hub(f\"{HF_USERNAME}/wildfire-sft-7b\")\n",
+        "print(f\"Pushed to hub: {HF_USERNAME}/wildfire-sft-7b\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!zip -r sft_final.zip ./sft_final\n",
+        "from google.colab import files\n",
+        "files.download(\"sft_final.zip\")\n",
+        "print(\"Download started.\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "A100",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.10.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}

training/sft_data.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff