Spaces:

Torchflow1
/

Multi-Agent-Incident-Command-Center

Sleeping

App Files Files Community

SwapnilPatil28 committed on Apr 25

Commit

58af620

verified ·

1 Parent(s): a403b80

Add LLM policy, SFT saving & LLM evaluation

Browse files

Files changed (4) hide show

README.md +26 -4
inference.py +38 -9
llm_policy.py +184 -0
train_trl.py +189 -38

README.md CHANGED Viewed

@@ -236,9 +236,9 @@ Expected output: **21 passing** (domain rubric, incident catalog, environment in
 [`train_trl.py`](./train_trl.py) orchestrates the end-to-end training & evaluation pipeline:
 1. **Rollout** — the `HeuristicCoordinator` drives the live environment to collect `(prompt, completion)` pairs. Prompts include customer tier, revenue impact, visible signals and investigation targets; completions are structured JSON actions.
-2. **SFT** — the dataset is collapsed into a single `text` column (robust across TRL ≥ 0.20) and fed to `SFTTrainer`.
-3. **Evaluation** — the trained model is not yet wired as the acting policy (to stay CPU-friendly), but heuristic vs random are evaluated under identical seeds so the judges can see an observable gap.
-4. **Artifacts** — `artifacts/reward_curve.png` and `artifacts/summary_metrics.json` are written.
 ### Local run (small model)
@@ -274,7 +274,29 @@ Environment variables you can tune before running `train_trl.py`:
 | `TRAIN_EPOCHS` | `1` | SFT epochs |
 | `TRAIN_MAX_LENGTH` | `768` | Max sequence length |
 | `TRAIN_BATCH_SIZE` / `TRAIN_GRAD_ACCUM` | `1` / `2` | Effective batch size |
-| `MAX_ROLLOUT_STEPS` | `120` | Safety cap per episode |
 ---

 [`train_trl.py`](./train_trl.py) orchestrates the end-to-end training & evaluation pipeline:
 1. **Rollout** — the `HeuristicCoordinator` drives the live environment to collect `(prompt, completion)` pairs. Prompts include customer tier, revenue impact, visible signals and investigation targets; completions are structured JSON actions.
+2. **SFT** — the dataset is collapsed into a single `text` column (robust across TRL ≥ 0.20) and fed to `SFTTrainer`. The fine-tuned weights + tokenizer are saved to `artifacts/sft_model/`.
+3. **Evaluation** — four policies are rolled out under identical seeds: `random`, `heuristic`, `base_model` (raw `BASE_MODEL` HF checkpoint), and `sft_model` (the fine-tuned checkpoint just saved). LLM evaluation auto-enables on a CUDA GPU; force it with `EVAL_LLM_MODELS=true` or disable with `EVAL_LLM_MODELS=false`.
+4. **Artifacts** — `artifacts/reward_curve.png` (4 lines) and `artifacts/summary_metrics.json` (random / heuristic / base / SFT rewards + per-task SFT-over-base improvements) are written.
 ### Local run (small model)
 | `TRAIN_EPOCHS` | `1` | SFT epochs |
 | `TRAIN_MAX_LENGTH` | `768` | Max sequence length |
 | `TRAIN_BATCH_SIZE` / `TRAIN_GRAD_ACCUM` | `1` / `2` | Effective batch size |
+| `MAX_ROLLOUT_STEPS` | `120` | Safety cap per episode (data collection + baselines) |
+| `MAX_LLM_EVAL_STEPS` | `60` | Safety cap per episode when an LLM policy is acting |
+| `EVAL_LLM_MODELS` | `auto` | `auto` ⇒ eval LLMs only if CUDA is available; `true`/`false` to force |
+### Running a base vs fine-tuned comparison
+After `train_trl.py` finishes, the fine-tuned checkpoint lives at
+`artifacts/sft_model/`. You can re-run just the LLM rollouts against the
+running environment without retraining:
+```python
+# Colab / local Jupyter cell ("!" is the notebook shell escape, not Python syntax)
+import os
+os.environ["POLICY_MODEL"] = "Qwen/Qwen2.5-0.5B-Instruct"   # base model
+!python inference.py
+os.environ["POLICY_MODEL"] = "artifacts/sft_model"          # fine-tuned
+!python inference.py
+```
+`inference.py` picks up `POLICY_MODEL` and routes every step through the
+LLM via `llm_policy.LLMPolicy`, falling back to a safe action only when
+the model's output does not contain a JSON object that validates as an
+`IncidentAction`.
 ---

inference.py CHANGED Viewed

@@ -26,6 +26,9 @@ from models import IncidentAction, IncidentObservation
 ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000")
 BENCHMARK = "incident_command_center_env"
 RANDOM_BASELINE = os.getenv("RANDOM_BASELINE", "false").lower() == "true"
 # ---------------------------------------------------------------------------
@@ -297,9 +300,14 @@ def random_action(observation: IncidentObservation) -> IncidentAction:
 # ---------------------------------------------------------------------------
-async def run_task(task_name: str) -> None:
     env = IncidentCommandEnvClient(base_url=ENV_URL).sync()
-    policy_name = "random_baseline" if RANDOM_BASELINE else "heuristic_coordinator"
     coordinator = HeuristicCoordinator()
     log_start(task=task_name, env=BENCHMARK, policy=policy_name)
@@ -313,11 +321,12 @@ async def run_task(task_name: str) -> None:
         res = env.reset(task_name=task_name)
         while not res.done:
             steps_taken += 1
-            action = (
-                random_action(res.observation)
-                if RANDOM_BASELINE
-                else coordinator.select_action(res.observation)
-            )
             res = env.step(action)
             reward = float(res.reward or 0.0)
             rewards.append(reward)
@@ -340,19 +349,39 @@ async def run_task(task_name: str) -> None:
 def main() -> None:
     for task in ["easy", "medium", "hard"]:
-        asyncio.run(run_task(task))
     print(
         json.dumps(
             {
                 "benchmark": BENCHMARK,
-                "policy": "random_baseline" if RANDOM_BASELINE else "heuristic_coordinator",
                 "env_url": ENV_URL,
             },
             indent=2,
         )
     )
 if __name__ == "__main__":
     main()

 ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000")
 BENCHMARK = "incident_command_center_env"
 RANDOM_BASELINE = os.getenv("RANDOM_BASELINE", "false").lower() == "true"
+# When set, run an LLM-backed policy (base or fine-tuned checkpoint) instead
+# of the heuristic / random ones. Point this at a HF hub id or a local dir.
+POLICY_MODEL = os.getenv("POLICY_MODEL", "").strip()
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
+async def run_task(task_name: str, llm_policy=None) -> None:
     env = IncidentCommandEnvClient(base_url=ENV_URL).sync()
+    if llm_policy is not None:
+        policy_name = f"llm:{getattr(llm_policy, 'label', POLICY_MODEL)}"
+    elif RANDOM_BASELINE:
+        policy_name = "random_baseline"
+    else:
+        policy_name = "heuristic_coordinator"
     coordinator = HeuristicCoordinator()
     log_start(task=task_name, env=BENCHMARK, policy=policy_name)
         res = env.reset(task_name=task_name)
         while not res.done:
             steps_taken += 1
+            if llm_policy is not None:
+                action = llm_policy.select_action(res.observation)
+            elif RANDOM_BASELINE:
+                action = random_action(res.observation)
+            else:
+                action = coordinator.select_action(res.observation)
             res = env.step(action)
             reward = float(res.reward or 0.0)
             rewards.append(reward)
 def main() -> None:
+    # Entry point: run the easy/medium/hard tasks with the selected policy,
+    # then print a small JSON summary of the run configuration.
+    llm_policy = None
+    if POLICY_MODEL:
+        # Imported only when an LLM policy is requested, so heuristic/random
+        # runs never import llm_policy.
+        from llm_policy import LLMPolicy
+        # Loaded once here and shared by all three tasks below.
+        llm_policy = LLMPolicy(POLICY_MODEL, label=POLICY_MODEL)
     for task in ["easy", "medium", "hard"]:
+        asyncio.run(run_task(task, llm_policy=llm_policy))
+    # Same precedence as run_task: LLM policy, then random, then heuristic.
+    if llm_policy is not None:
+        policy_label = f"llm:{POLICY_MODEL}"
+    elif RANDOM_BASELINE:
+        policy_label = "random_baseline"
+    else:
+        policy_label = "heuristic_coordinator"
     print(
         json.dumps(
             {
                 "benchmark": BENCHMARK,
+                "policy": policy_label,
                 "env_url": ENV_URL,
             },
             indent=2,
         )
     )
+    # Best-effort cleanup: any error raised by release() is ignored.
+    if llm_policy is not None:
+        try:
+            llm_policy.release()
+        except Exception:
+            pass
 if __name__ == "__main__":
     main()

llm_policy.py ADDED Viewed

	@@ -0,0 +1,184 @@

+"""LLM-backed policy for the Incident Command Center environment.
+Wraps any Hugging Face causal-LM (a base model OR a fine-tuned checkpoint)
+into a callable that takes an ``IncidentObservation`` and returns a typed
+``IncidentAction``. This is what turns a raw language model into an agent
+that can act inside the environment.
+Usage::
+    from llm_policy import LLMPolicy
+    policy = LLMPolicy("Qwen/Qwen2.5-0.5B-Instruct")
+    action = policy.select_action(observation)
+If the model emits invalid JSON, the policy degrades gracefully to a safe
+default action (inspect the first log target) so one bad generation never
+crashes a whole rollout.
+"""
+from __future__ import annotations
+import json
+import logging
+import re
+from typing import Any, Dict, Optional
+from models import IncidentAction, IncidentObservation
+_LOG = logging.getLogger("icc.llm_policy")
+# Matches from the first "{" to the LAST "}" in the model output. `[\s\S]*`
+# is greedy, so nested braces stay intact, but any later "}" (a second
+# object, trailing commentary) is swallowed into the match as well.
+_JSON_RE = re.compile(r"\{[\s\S]*\}")
+class LLMPolicy:
+    """Policy that calls a HF causal-LM and parses its JSON action.
+    The tokenizer and model are loaded once in ``__init__``. Each
+    ``select_action`` call builds a prompt, generates a completion and turns
+    it into an ``IncidentAction``, falling back to a fixed safe action when
+    no usable action can be extracted from the completion.
+    """
+    def __init__(
+        self,
+        model_name_or_path: str,
+        *,
+        device: Optional[str] = None,
+        max_new_tokens: int = 160,
+        temperature: float = 0.0,
+        dtype: Optional[str] = None,
+        label: Optional[str] = None,
+    ) -> None:
+        """Load the tokenizer and model and move the model to the device.
+        Args:
+            model_name_or_path: Hub id or local directory handed to
+                ``from_pretrained``.
+            device: Target device. When omitted, ``"cuda"`` is used if
+                ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.
+            max_new_tokens: Generation budget for one action.
+            temperature: A value ``> 0`` enables sampling (``top_p=0.9``);
+                anything else selects greedy decoding.
+            dtype: Name of an attribute on ``torch`` (e.g. ``"bfloat16"``).
+                When omitted, float16 is used if the resolved device is
+                exactly ``"cuda"`` and float32 otherwise.
+            label: Display name for logs; defaults to ``model_name_or_path``.
+        Raises:
+            RuntimeError: If ``torch`` or ``transformers`` cannot be imported.
+        """
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+        except ImportError as exc:  # pragma: no cover - runtime dep
+            raise RuntimeError(
+                "LLMPolicy requires `transformers` and `torch` installed. "
+                "Run: pip install transformers torch"
+            ) from exc
+        # Keep a handle on torch so later methods need no second import.
+        self._torch = torch
+        self.label = label or model_name_or_path
+        self.max_new_tokens = max_new_tokens
+        self.temperature = temperature
+        resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        if dtype is None:
+            torch_dtype = torch.float16 if resolved_device == "cuda" else torch.float32
+        else:
+            torch_dtype = getattr(torch, dtype)
+        _LOG.info(
+            "Loading LLM policy %s on %s (dtype=%s)",
+            model_name_or_path,
+            resolved_device,
+            torch_dtype,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        # generate() is given pad_token_id explicitly, so make sure one exists.
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name_or_path,
+            torch_dtype=torch_dtype,
+        ).to(resolved_device)
+        self.model.eval()
+        self.device = resolved_device
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def select_action(self, observation: IncidentObservation) -> IncidentAction:
+        """Return the action the model proposes for ``observation``."""
+        prompt_text = self._build_prompt_text(observation)
+        response_text = self._generate(prompt_text)
+        return self._parse_action(response_text, observation)
+    # ------------------------------------------------------------------
+    # Internals
+    # ------------------------------------------------------------------
+    def _build_prompt_text(self, observation: IncidentObservation) -> str:
+        """Render the observation as the text prompt fed to the model."""
+        # Deferred import: `train_trl` imports matplotlib and `datasets` at
+        # module level, so it is only loaded once a prompt is actually built,
+        # not when this module is imported.
+        from train_trl import obs_to_prompt
+        user_prompt = obs_to_prompt(observation)
+        if getattr(self.tokenizer, "chat_template", None):
+            messages = [{"role": "user", "content": user_prompt}]
+            return self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+        # No chat template on this tokenizer: use a plain two-turn layout.
+        return f"User: {user_prompt}\n\nAssistant:"
+    def _generate(self, prompt_text: str) -> str:
+        """Generate a completion and return only the newly produced text."""
+        torch = self._torch
+        inputs = self.tokenizer(prompt_text, return_tensors="pt").to(self.device)
+        gen_kwargs: Dict[str, Any] = {
+            "max_new_tokens": self.max_new_tokens,
+            "pad_token_id": self.tokenizer.pad_token_id,
+        }
+        if self.temperature > 0:
+            gen_kwargs.update(
+                do_sample=True,
+                temperature=self.temperature,
+                top_p=0.9,
+            )
+        else:
+            gen_kwargs["do_sample"] = False
+        with torch.no_grad():
+            output = self.model.generate(**inputs, **gen_kwargs)
+        # The output starts with the prompt tokens; keep only what follows.
+        generated_ids = output[0][inputs["input_ids"].shape[1]:]
+        return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+    def _parse_action(
+        self,
+        response_text: str,
+        observation: IncidentObservation,
+    ) -> IncidentAction:
+        """Turn raw model text into an ``IncidentAction``.
+        Each ``{`` in the response is tried as the start of a JSON object
+        with ``JSONDecoder.raw_decode``, which stops at the end of the first
+        complete value. Trailing commentary or a second object after the
+        action therefore no longer breaks parsing. The first object that
+        validates as an ``IncidentAction`` wins; if none does, the safe
+        fallback action is returned.
+        """
+        decoder = json.JSONDecoder()
+        start = response_text.find("{")
+        while start != -1:
+            try:
+                data, end = decoder.raw_decode(response_text, start)
+            except json.JSONDecodeError:
+                # Not a complete object from this brace; try the next one.
+                start = response_text.find("{", start + 1)
+                continue
+            try:
+                return IncidentAction.model_validate(data)
+            except Exception as exc:
+                # Broad on purpose: one bad generation must never crash a
+                # rollout, whatever the validator raises.
+                _LOG.debug(
+                    "LLM JSON parse failed: %s :: raw=%s",
+                    exc,
+                    response_text[start:end][:200],
+                )
+            # Resume after the rejected object so its nested objects are not
+            # mistaken for standalone actions.
+            start = response_text.find("{", end)
+        return self._safe_fallback(observation)
+    def _safe_fallback(self, observation: IncidentObservation) -> IncidentAction:
+        """Build the default action used when the model output is unusable."""
+        logs = (observation.investigation_targets or {}).get("logs", []) or []
+        # Inspect the first advertised log target, or a fixed default if none.
+        target = logs[0] if logs else "payments-api"
+        return IncidentAction(
+            actor="triage_agent",
+            action_type="inspect_logs",
+            target=target,
+            reason="LLM output invalid; using safe fallback action.",
+        )
+    # ------------------------------------------------------------------
+    # Resource cleanup
+    # ------------------------------------------------------------------
+    def release(self) -> None:
+        """Free GPU memory so a second model can be loaded after this one.
+        Cleanup is best-effort and never raises; failures are logged at
+        DEBUG level. The policy cannot be used again after this call.
+        """
+        try:
+            import gc
+            self.model = None  # type: ignore[assignment]
+            self.tokenizer = None  # type: ignore[assignment]
+            gc.collect()
+            if self._torch.cuda.is_available():
+                self._torch.cuda.empty_cache()
+        except Exception as exc:
+            _LOG.debug("LLM policy cleanup failed: %s", exc)

train_trl.py CHANGED Viewed

@@ -1,17 +1,22 @@
 """Hugging Face TRL training + evaluation pipeline.
-What this script does end-to-end:
-1. Rolls out the `HeuristicCoordinator` against a running Incident Command
-   Center environment to produce `(prompt, completion)` training rows.
-2. Fine-tunes a small instruction-tuned LLM using TRL's `SFTTrainer` with a
-   single `text` column that works reliably across TRL >= 0.20.
-3. Evaluates the heuristic and random baseline policies post-training and
-   writes a reward curve + JSON metrics into `artifacts/` — exactly the
-   evidence the hackathon judges look for.
-Designed to run equally well on CPU (for smoke checks) and on a Colab T4 /
-HF Spaces GPU (for the real run).
 """
 from __future__ import annotations
@@ -19,9 +24,9 @@ from __future__ import annotations
 import json
 import os
 import random
-from dataclasses import dataclass, asdict
 from pathlib import Path
-from typing import Dict, List
 import matplotlib.pyplot as plt
 from datasets import Dataset
@@ -33,15 +38,18 @@ from models import IncidentAction, IncidentObservation
 ARTIFACT_DIR = Path("artifacts")
 ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
 ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000")
 BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_ROLLOUT_STEPS = int(os.getenv("MAX_ROLLOUT_STEPS", "120"))
 EPISODES_PER_TASK = int(os.getenv("EPISODES_PER_TASK", "3"))
 TRAIN_EPOCHS = float(os.getenv("TRAIN_EPOCHS", "1"))
 TRAIN_BATCH_SIZE = int(os.getenv("TRAIN_BATCH_SIZE", "1"))
 TRAIN_GRAD_ACCUM = int(os.getenv("TRAIN_GRAD_ACCUM", "2"))
 TRAIN_MAX_LENGTH = int(os.getenv("TRAIN_MAX_LENGTH", "768"))
 @dataclass
@@ -99,18 +107,28 @@ def rollout(
     policy_name: str,
     task_name: str,
     collect_dataset: bool = False,
 ):
     env = IncidentCommandEnvClient(base_url=ENV_URL).sync()
     coordinator = HeuristicCoordinator()
     records: List[Dict[str, str]] = []
     rewards: List[float] = []
     steps = 0
     try:
         result = env.reset(task_name=task_name)
-        while not result.done and steps < MAX_ROLLOUT_STEPS:
             steps += 1
-            if policy_name == "heuristic":
                 action = coordinator.select_action(result.observation)
             else:
                 action = random_action(result.observation)
@@ -157,11 +175,7 @@ def build_training_dataset(episodes_per_task: int = EPISODES_PER_TASK) -> Datase
 def _dataset_to_sft_text_column(dataset: Dataset, tokenizer) -> Dataset:
-    """Collapse (prompt, completion) pairs into a single `text` field.
-    The ``text`` column path in TRL 0.20+ is the most version-robust option,
-    side-stepping brittle prompt/completion tokenization across TRL releases.
-    """
     from transformers import PreTrainedTokenizerBase
     if not isinstance(tokenizer, PreTrainedTokenizerBase):
@@ -172,7 +186,8 @@ def _dataset_to_sft_text_column(dataset: Dataset, tokenizer) -> Dataset:
         dataset = dataset.rename_column("response", "completion")
     if "prompt" not in dataset.column_names or "completion" not in dataset.column_names:
         raise ValueError(
-            f"Expected columns 'prompt' and 'completion' (or 'response'). Got: {dataset.column_names}"
         )
     has_template = bool(getattr(tokenizer, "chat_template", None))
@@ -200,7 +215,11 @@ def _dataset_to_sft_text_column(dataset: Dataset, tokenizer) -> Dataset:
     return dataset.map(to_text_batched, batched=True, remove_columns=to_drop)
-def run_trl_sft(dataset: Dataset) -> None:
     try:
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from trl import SFTConfig, SFTTrainer
@@ -237,36 +256,161 @@ def run_trl_sft(dataset: Dataset) -> None:
     )
     trainer.train()
 # ---------------------------------------------------------------------------
 # Evaluation + reporting
 # ---------------------------------------------------------------------------
-def evaluate_policies(seed: int = 7) -> Dict[str, List[float]]:
     random.seed(seed)
-    random_scores: List[float] = []
-    heuristic_scores: List[float] = []
     for task in ["easy", "medium", "hard"]:
         random_stats, _, _ = rollout("random", task)
         heuristic_stats, _, _ = rollout("heuristic", task)
-        random_scores.append(random_stats.total_reward)
-        heuristic_scores.append(heuristic_stats.total_reward)
-    return {"random": random_scores, "heuristic": heuristic_scores}
 def plot_rewards(score_map: Dict[str, List[float]]) -> None:
     labels = ["easy", "medium", "hard"]
     x = list(range(len(labels)))
-    plt.figure(figsize=(8, 4.5))
-    plt.plot(x, score_map["random"], marker="o", label="Random baseline")
-    plt.plot(x, score_map["heuristic"], marker="o", label="Heuristic coordinator")
     plt.xticks(x, labels)
     plt.xlabel("Task difficulty")
     plt.ylabel("Episode total reward")
-    plt.title("Incident Command Center — baseline comparison")
     plt.grid(alpha=0.3)
     plt.legend()
     plt.tight_layout()
@@ -276,7 +420,7 @@ def plot_rewards(score_map: Dict[str, List[float]]) -> None:
 def main() -> None:
     dataset = build_training_dataset(episodes_per_task=EPISODES_PER_TASK)
-    dataset.save_to_disk("artifacts/trl_dataset")
     run_trl_sft(dataset)
     scores = evaluate_policies()
@@ -286,10 +430,17 @@ def main() -> None:
         "base_model": BASE_MODEL,
         "dataset_rows": len(dataset),
         "episodes_per_task": EPISODES_PER_TASK,
-        "random_rewards": scores["random"],
-        "heuristic_rewards": scores["heuristic"],
-        "improvement_absolute": [
-            round(h - r, 4) for h, r in zip(scores["heuristic"], scores["random"])
         ],
     }
     with open(ARTIFACT_DIR / "summary_metrics.json", "w", encoding="utf-8") as f:

 """Hugging Face TRL training + evaluation pipeline.
+Pipeline:
+1. **Rollout**: run the ``HeuristicCoordinator`` against the live Incident
+   Command Center environment to collect ``(prompt, completion)`` pairs.
+2. **SFT**: fine-tune a small instruction-tuned LLM on those pairs using
+   TRL's ``SFTTrainer`` with a single ``text`` column (robust across TRL
+   ≥ 0.20).
+3. **Save**: persist the fine-tuned weights + tokenizer to
+   ``artifacts/sft_model`` so the same script can later load them as an
+   agent policy.
+4. **Evaluate**: play the environment with four policies
+   ``random / heuristic / base_model / sft_model`` under identical seeds
+   and write a reward curve + metrics JSON into ``artifacts/``.
+Designed to work on CPU for smoke checks and on Colab T4 / HF Spaces GPUs
+for full runs. LLM evaluation auto-enables on CUDA and can be forced with
+``EVAL_LLM_MODELS=true``.
 """
 from __future__ import annotations
 import json
 import os
 import random
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Callable, Dict, List, Optional
 import matplotlib.pyplot as plt
 from datasets import Dataset
 ARTIFACT_DIR = Path("artifacts")
 ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
+SFT_MODEL_DIR = ARTIFACT_DIR / "sft_model"
 ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000")
 BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
 MAX_ROLLOUT_STEPS = int(os.getenv("MAX_ROLLOUT_STEPS", "120"))
+MAX_LLM_EVAL_STEPS = int(os.getenv("MAX_LLM_EVAL_STEPS", "60"))
 EPISODES_PER_TASK = int(os.getenv("EPISODES_PER_TASK", "3"))
 TRAIN_EPOCHS = float(os.getenv("TRAIN_EPOCHS", "1"))
 TRAIN_BATCH_SIZE = int(os.getenv("TRAIN_BATCH_SIZE", "1"))
 TRAIN_GRAD_ACCUM = int(os.getenv("TRAIN_GRAD_ACCUM", "2"))
 TRAIN_MAX_LENGTH = int(os.getenv("TRAIN_MAX_LENGTH", "768"))
+_EVAL_LLM_ENV = os.getenv("EVAL_LLM_MODELS", "auto").strip().lower()
 @dataclass
     policy_name: str,
     task_name: str,
     collect_dataset: bool = False,
+    policy_callable: Optional[Callable[[IncidentObservation], IncidentAction]] = None,
+    max_steps: Optional[int] = None,
 ):
+    """Play one episode and return (stats, rows, rewards).
+    If ``policy_callable`` is provided it takes precedence over
+    ``policy_name`` — this is how the LLM policies plug in.
+    """
     env = IncidentCommandEnvClient(base_url=ENV_URL).sync()
     coordinator = HeuristicCoordinator()
     records: List[Dict[str, str]] = []
     rewards: List[float] = []
     steps = 0
+    step_cap = max_steps if max_steps is not None else MAX_ROLLOUT_STEPS
     try:
         result = env.reset(task_name=task_name)
+        while not result.done and steps < step_cap:
             steps += 1
+            if policy_callable is not None:
+                action = policy_callable(result.observation)
+            elif policy_name == "heuristic":
                 action = coordinator.select_action(result.observation)
             else:
                 action = random_action(result.observation)
 def _dataset_to_sft_text_column(dataset: Dataset, tokenizer) -> Dataset:
+    """Collapse (prompt, completion) pairs into a single `text` field."""
     from transformers import PreTrainedTokenizerBase
     if not isinstance(tokenizer, PreTrainedTokenizerBase):
         dataset = dataset.rename_column("response", "completion")
     if "prompt" not in dataset.column_names or "completion" not in dataset.column_names:
         raise ValueError(
+            f"Expected columns 'prompt' and 'completion' (or 'response'). "
+            f"Got: {dataset.column_names}"
         )
     has_template = bool(getattr(tokenizer, "chat_template", None))
     return dataset.map(to_text_batched, batched=True, remove_columns=to_drop)
+def run_trl_sft(dataset: Dataset) -> Path:
+    """Fine-tune ``BASE_MODEL`` on the collected dataset and save the model.
+    Returns the directory of the saved SFT checkpoint (``artifacts/sft_model``).
+    """
     try:
         from transformers import AutoModelForCausalLM, AutoTokenizer
         from trl import SFTConfig, SFTTrainer
     )
     trainer.train()
+    SFT_MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    trainer.save_model(str(SFT_MODEL_DIR))
+    tokenizer.save_pretrained(str(SFT_MODEL_DIR))
+    print(f"[train] Saved SFT checkpoint to {SFT_MODEL_DIR}")
+    del trainer, model, tokenizer
+    _free_gpu_memory()
+    return SFT_MODEL_DIR
 # ---------------------------------------------------------------------------
 # Evaluation + reporting
 # ---------------------------------------------------------------------------
+def _free_gpu_memory() -> None:
+    """Best-effort cleanup: run the garbage collector, then empty the CUDA cache.
+    Any failure (for example torch not being importable) is ignored.
+    """
+    try:
+        from gc import collect as _collect
+        # Collect before importing torch so a missing torch does not skip it.
+        _collect()
+        import torch as _torch
+        cuda = _torch.cuda
+        if cuda.is_available():
+            cuda.empty_cache()
+    except Exception:
+        pass
+def _cuda_available() -> bool:
+    """Return torch's CUDA availability, or False when torch cannot be used."""
+    has_cuda = False
+    try:
+        import torch as _torch
+        has_cuda = _torch.cuda.is_available()
+    except Exception:
+        # torch missing or failing: report "no GPU".
+        pass
+    return has_cuda
+def _should_evaluate_llms() -> bool:
+    """Decide whether the LLM policies take part in evaluation.
+    An explicit on/off value in ``EVAL_LLM_MODELS`` wins. Any other value
+    ("auto", empty, unrecognised) enables LLM evaluation only when a CUDA
+    GPU is available, so CPU runs stay fast.
+    """
+    flag = _EVAL_LLM_ENV
+    forced_on = flag in ("1", "true", "yes", "on")
+    forced_off = flag in ("0", "false", "no", "off")
+    if forced_on or forced_off:
+        return forced_on
+    return _cuda_available()
+def _evaluate_single_policy(
+    policy_name: str,
+    select_fn: Callable[[IncidentObservation], IncidentAction],
+    max_steps: Optional[int] = None,
+) -> List[float]:
+    """Roll ``select_fn`` out once on each task and return the rounded rewards.
+    The result holds one total reward per task, in easy/medium/hard order,
+    each rounded to four decimals. A progress line is printed per task.
+    """
+    per_task_rewards: List[float] = []
+    for difficulty in ["easy", "medium", "hard"]:
+        episode_stats, _rows, _step_rewards = rollout(
+            policy_name=policy_name,
+            task_name=difficulty,
+            policy_callable=select_fn,
+            max_steps=max_steps,
+        )
+        total = episode_stats.total_reward
+        print(
+            f"[eval] policy={policy_name} task={difficulty} "
+            f"reward={total:+.2f} steps={episode_stats.steps}"
+        )
+        per_task_rewards.append(round(total, 4))
+    return per_task_rewards
+def evaluate_policies(
+    seed: int = 7,
+    evaluate_llms: Optional[bool] = None,
+) -> Dict[str, List[float]]:
+    """Run each policy once per task under the same seed.
+    The random policy is seeded for reproducibility. The heuristic policy is
+    deterministic already. LLM policies are evaluated with greedy decoding.
+    Args:
+        seed: Seed applied to ``random`` before the baseline rollouts.
+        evaluate_llms: Force LLM evaluation on or off; ``None`` defers to
+            ``_should_evaluate_llms()``.
+    Returns:
+        Per-task total rewards keyed by ``random``, ``heuristic``,
+        ``base_model`` and ``sft_model``. An LLM entry stays empty when that
+        policy was skipped or its evaluation failed.
+    """
     random.seed(seed)
+    scores: Dict[str, List[float]] = {
+        "random": [],
+        "heuristic": [],
+        "base_model": [],
+        "sft_model": [],
+    }
     for task in ["easy", "medium", "hard"]:
         random_stats, _, _ = rollout("random", task)
         heuristic_stats, _, _ = rollout("heuristic", task)
+        scores["random"].append(round(random_stats.total_reward, 4))
+        scores["heuristic"].append(round(heuristic_stats.total_reward, 4))
+    should_eval_llms = _should_evaluate_llms() if evaluate_llms is None else evaluate_llms
+    if not should_eval_llms:
+        print("[eval] Skipping LLM evaluation (no GPU or EVAL_LLM_MODELS=false).")
+        return scores
+    try:
+        from llm_policy import LLMPolicy
+    except Exception as exc:  # pragma: no cover - import-time safety
+        print(f"[eval] Could not import LLMPolicy ({exc}); skipping LLM eval.")
+        return scores
+    # Base model
+    try:
+        print(f"[eval] Loading BASE model: {BASE_MODEL}")
+        base = LLMPolicy(BASE_MODEL, label="base_model")
+        try:
+            scores["base_model"] = _evaluate_single_policy(
+                "base_model", base.select_action, max_steps=MAX_LLM_EVAL_STEPS
+            )
+        finally:
+            # Release even when the rollout fails, so the base weights are
+            # not still on the GPU when the SFT model is loaded below.
+            base.release()
+            _free_gpu_memory()
+    except Exception as exc:
+        print(f"[eval] Base-model evaluation failed: {exc}")
+    # SFT model
+    if SFT_MODEL_DIR.exists():
+        try:
+            print(f"[eval] Loading SFT model: {SFT_MODEL_DIR}")
+            sft = LLMPolicy(str(SFT_MODEL_DIR), label="sft_model")
+            try:
+                scores["sft_model"] = _evaluate_single_policy(
+                    "sft_model", sft.select_action, max_steps=MAX_LLM_EVAL_STEPS
+                )
+            finally:
+                # Same guarantee as above: always hand the memory back.
+                sft.release()
+                _free_gpu_memory()
+        except Exception as exc:
+            print(f"[eval] SFT-model evaluation failed: {exc}")
+    else:
+        print(f"[eval] No SFT checkpoint found at {SFT_MODEL_DIR}; skipping SFT eval.")
+    return scores
 def plot_rewards(score_map: Dict[str, List[float]]) -> None:
     labels = ["easy", "medium", "hard"]
     x = list(range(len(labels)))
+    plt.figure(figsize=(9, 5))
+    style = {
+        "random": ("x", "tab:red", "Random baseline"),
+        "heuristic": ("o", "tab:blue", "Heuristic coordinator"),
+        "base_model": ("^", "tab:orange", "Base LLM (untrained)"),
+        "sft_model": ("D", "tab:green", "Fine-tuned LLM (SFT)"),
+    }
+    for key, (marker, color, label) in style.items():
+        values = score_map.get(key) or []
+        if not values or len(values) != len(labels):
+            continue
+        plt.plot(x, values, marker=marker, color=color, label=label, linewidth=2)
     plt.xticks(x, labels)
     plt.xlabel("Task difficulty")
     plt.ylabel("Episode total reward")
+    plt.title("Incident Command Center — policy comparison")
+    plt.axhline(0, linestyle="--", color="gray", alpha=0.5)
     plt.grid(alpha=0.3)
     plt.legend()
     plt.tight_layout()
 def main() -> None:
     dataset = build_training_dataset(episodes_per_task=EPISODES_PER_TASK)
+    dataset.save_to_disk(str(ARTIFACT_DIR / "trl_dataset"))
     run_trl_sft(dataset)
     scores = evaluate_policies()
         "base_model": BASE_MODEL,
         "dataset_rows": len(dataset),
         "episodes_per_task": EPISODES_PER_TASK,
+        "random_rewards": scores.get("random", []),
+        "heuristic_rewards": scores.get("heuristic", []),
+        "base_model_rewards": scores.get("base_model", []),
+        "sft_model_rewards": scores.get("sft_model", []),
+        "improvement_sft_over_base": [
+            round(s - b, 4)
+            for s, b in zip(scores.get("sft_model", []), scores.get("base_model", []))
+        ] if scores.get("sft_model") and scores.get("base_model") else [],
+        "improvement_heuristic_over_random": [
+            round(h - r, 4)
+            for h, r in zip(scores.get("heuristic", []), scores.get("random", []))
         ],
     }
     with open(ARTIFACT_DIR / "summary_metrics.json", "w", encoding="utf-8") as f: