Spaces:

eptan
/

crisis-inbox

Sleeping

App Files Community

eptan committed on Mar 8

Commit

ccffbf6

verified ·

1 Parent(s): f25d4f0

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

Dockerfile.notebook +5 -7
README.md +10 -2
generate_episodes.py +9 -4
notebooks/crisisinbox_grpo_simple.ipynb +301 -20
server/app.py +11 -1
server/crisis_inbox_environment.py +3 -0
training/crisisinbox_training.py +44 -18

Dockerfile.notebook CHANGED Viewed

@@ -13,14 +13,12 @@ RUN pip install --no-cache-dir \
     peft \
     huggingface_hub
-# Copy everything needed for training
-COPY episodes.json .
-COPY generate_episodes.py .
-COPY models.py .
-COPY messages.py .
-COPY drift_events.py .
 COPY notebooks/ ./notebooks/
 EXPOSE 8888
-CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''"]

     peft \
     huggingface_hub
+# Copy source files needed for training
+COPY generate_episodes.py models.py messages.py drift_events.py ./
 COPY notebooks/ ./notebooks/
 EXPOSE 8888
+# Download training data at startup (build env may block outbound network), then launch Jupyter
+CMD python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='eptan/crisis-inbox-episodes', filename='episodes.json', repo_type='dataset', local_dir='/app'); import os; os.rename('/app/episodes.json', '/app/.episodes.json')" && \
+    jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=''

README.md CHANGED Viewed

@@ -166,8 +166,8 @@ crisis-inbox/
 │   ├── app.py                          # FastAPI app with MCPAction workaround
 │   └── Dockerfile                      # HF Spaces deployment
 ├── notebooks/
-│   └── crisisinbox_grpo.ipynb          # GRPO training notebook
-├── episodes.json                       # Pre-generated training episodes
 ├── generate_episodes.py                # Episode generator script
 ├── pyproject.toml                      # Package config
 ├── openenv.yaml                        # OpenEnv environment spec
@@ -182,6 +182,14 @@ crisis-inbox/
 - **Training:** Unsloth GRPO via Google Colab
 - **Model:** Qwen2.5-0.5B-Instruct
 ## Team
 Built at the OpenEnv Hackathon @ Shack15, SF — March 7-8, 2026

 │   ├── app.py                          # FastAPI app with MCPAction workaround
 │   └── Dockerfile                      # HF Spaces deployment
 ├── notebooks/
+│   └── crisisinbox_grpo_simple.ipynb   # GRPO training notebook (Colab)
+├── .episodes.json                      # Pre-generated training episodes (gitignored)
 ├── generate_episodes.py                # Episode generator script
 ├── pyproject.toml                      # Package config
 ├── openenv.yaml                        # OpenEnv environment spec
 - **Training:** Unsloth GRPO via Google Colab
 - **Model:** Qwen2.5-0.5B-Instruct
+### GRPO training (Colab)
+Open the notebook with the latest fixes (context length, reward signature, left-padding, batch size) in Google Colab (T4 GPU runtime):
+**[Open in Colab](https://colab.research.google.com/github/eptan/crisis-inbox/blob/main/notebooks/crisisinbox_grpo_simple.ipynb)**
+Push your local changes to the `main` branch so the link above serves the updated notebook.
 ## Team
 Built at the OpenEnv Hackathon @ Shack15, SF — March 7-8, 2026

generate_episodes.py CHANGED Viewed

@@ -270,7 +270,12 @@ def generate_episodes(num_episodes: int = 50, start_seed: int = 1000) -> list:
         seed = start_seed + i
         print(f"  Episode {i + 1}/{num_episodes} (seed={seed})...", end=" ")
         episode = build_episode(seed)
-        n_dp = len(episode["decision_points"])
         n_msg = episode["total_messages"]
         drifts = ", ".join(episode["drift_events"])
         print(f"{n_msg} messages, {n_dp} decision points, drifts: [{drifts}]")
@@ -282,7 +287,7 @@ def save_episodes(episodes: list, filename: str = "episodes.json"):
     """Save episodes to JSON file."""
     with open(filename, "w") as f:
         json.dump(episodes, f, indent=2)
-    total_prompts = sum(len(ep["decision_points"]) for ep in episodes)
     print(f"\nSaved {len(episodes)} episodes ({total_prompts} training prompts) to {filename}")
@@ -292,7 +297,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate CrisisInbox training episodes")
     parser.add_argument("-n", "--num-episodes", type=int, default=50, help="Number of episodes")
     parser.add_argument("-s", "--start-seed", type=int, default=1000, help="Starting seed")
-    parser.add_argument("-o", "--output", type=str, default="episodes.json", help="Output file")
     parser.add_argument("--sample", type=int, default=5, help="Also save N sample episodes")
     args = parser.parse_args()
@@ -301,5 +306,5 @@ if __name__ == "__main__":
     save_episodes(episodes, args.output)
     if args.sample > 0:
-        sample_file = "sample_episodes.json"
         save_episodes(episodes[:args.sample], sample_file)

         seed = start_seed + i
         print(f"  Episode {i + 1}/{num_episodes} (seed={seed})...", end=" ")
         episode = build_episode(seed)
+        # Some episodes may not have decision points; skip them.
+        decision_points = episode.get("decision_points")
+        if not decision_points:
+            print("skipped (no decision_points)")
+            continue
+        n_dp = len(decision_points)
         n_msg = episode["total_messages"]
         drifts = ", ".join(episode["drift_events"])
         print(f"{n_msg} messages, {n_dp} decision points, drifts: [{drifts}]")
     """Save episodes to JSON file."""
     with open(filename, "w") as f:
         json.dump(episodes, f, indent=2)
+    total_prompts = sum(len(ep.get("decision_points", [])) for ep in episodes)
     print(f"\nSaved {len(episodes)} episodes ({total_prompts} training prompts) to {filename}")
     parser = argparse.ArgumentParser(description="Generate CrisisInbox training episodes")
     parser.add_argument("-n", "--num-episodes", type=int, default=50, help="Number of episodes")
     parser.add_argument("-s", "--start-seed", type=int, default=1000, help="Starting seed")
+    parser.add_argument("-o", "--output", type=str, default=".episodes.json", help="Output file")
     parser.add_argument("--sample", type=int, default=5, help="Also save N sample episodes")
     args = parser.parse_args()
     save_episodes(episodes, args.output)
     if args.sample > 0:
+        sample_file = ".sample_episodes.json"
         save_episodes(episodes[:args.sample], sample_file)

notebooks/crisisinbox_grpo_simple.ipynb CHANGED Viewed

@@ -3,95 +3,376 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": "# CrisisInbox GRPO Training\n\nTrain a small LLM to triage crisis inbox messages using Group Relative Policy Optimization.\n\n**What this does:**\n1. Loads pre-generated episode data (inbox snapshots at decision points)\n2. For each prompt, the model generates an action (which message to handle + response)\n3. A reward function scores the action based on urgency, deadline, drift adaptation\n4. GRPO updates the model to prefer higher-reward actions\n\n**GPU profiles:**\n- **T4 / free Colab**: Qwen2.5-0.5B, 2048 ctx, 4-bit — runs in ~30 min\n- **H100 / A100**: Qwen2.5-3B, 4096 ctx, 4-bit — better quality, ~20 min"
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "metadata": {},
-   "source": "# Install dependencies\n!pip install unsloth trl transformers datasets accelerate peft -q\n!pip install huggingface_hub -q\n\n# Download episode data\n# Option 1: From HF dataset (recommended)\n# Option 2: From GitHub repo\n# Option 3: Generate locally with `python generate_episodes.py -n 100`\n\nimport os\nif not os.path.exists(\"episodes.json\"):\n    print(\"Downloading episodes.json from GitHub...\")\n    !wget -q --show-progress https://raw.githubusercontent.com/eptan/crisis-inbox/main/episodes.json\n    if not os.path.exists(\"episodes.json\"):\n        print(\"ERROR: Download failed. Upload episodes.json manually or generate with:\")\n        print(\"  !git clone https://github.com/eptan/crisis-inbox.git && cd crisis-inbox && python generate_episodes.py -n 100\")\nelse:\n    print(\"episodes.json already exists, skipping download\")\n\nprint(\"Setup complete\")",
    "outputs": []
   },
   {
    "cell_type": "code",
-   "source": "# === GPU PROFILE ===\n# Change this one variable to switch between T4 and H100 configs.\n# Everything else adapts automatically.\n\nimport torch\n\nif torch.cuda.is_available():\n    vram_gb = torch.cuda.get_device_properties(0).total_mem / 1e9\n    gpu_name = torch.cuda.get_device_name(0)\n    print(f\"GPU: {gpu_name} ({vram_gb:.0f} GB)\")\nelse:\n    vram_gb = 0\n    print(\"No GPU detected — config will default to smallest profile\")\n\n# Auto-select profile based on VRAM, or override manually\nif vram_gb >= 40:  # H100, A100\n    PROFILE = \"h100\"\n    MODEL_NAME = \"unsloth/Qwen2.5-3B-Instruct\"\n    MAX_SEQ_LENGTH = 4096\n    MAX_PROMPT_LENGTH = 3584\n    MAX_COMPLETION_LENGTH = 512\n    BATCH_SIZE = 4\n    GRAD_ACCUM = 2\n    NUM_GENERATIONS = 8\nelif vram_gb >= 14:  # T4, L4\n    PROFILE = \"t4\"\n    MODEL_NAME = \"unsloth/Qwen2.5-0.5B-Instruct\"\n    MAX_SEQ_LENGTH = 2048\n    MAX_PROMPT_LENGTH = 1792\n    MAX_COMPLETION_LENGTH = 256\n    BATCH_SIZE = 2\n    GRAD_ACCUM = 4\n    NUM_GENERATIONS = 4\nelse:\n    PROFILE = \"cpu\"\n    MODEL_NAME = \"unsloth/Qwen2.5-0.5B-Instruct\"\n    MAX_SEQ_LENGTH = 2048\n    MAX_PROMPT_LENGTH = 1792\n    MAX_COMPLETION_LENGTH = 256\n    BATCH_SIZE = 1\n    GRAD_ACCUM = 8\n    NUM_GENERATIONS = 2\n\nprint(f\"Profile: {PROFILE} | Model: {MODEL_NAME} | Context: {MAX_SEQ_LENGTH}\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "source": "import json\nimport re\nimport random\nfrom datasets import Dataset\n\n# Load episodes\nwith open(\"episodes.json\") as f:\n    episodes = json.load(f)\n\n# Check format — old format has 'messages'/'tasks', new format has 'decision_points'\nif episodes and \"decision_points\" not in episodes[0]:\n    old_keys = list(episodes[0].keys())\n    raise ValueError(\n        f\"episodes.json is in the old format (keys: {old_keys}).\\n\"\n        f\"Regenerate with: python generate_episodes.py -n 100\\n\"\n        f\"The old format used 'messages'/'tasks'/'schema_events'; \"\n        f\"the notebook requires 'decision_points' from generate_episodes.py.\"\n    )\n\n# Flatten to individual training prompts\nprompts = []\nfor ep in episodes:\n    for dp in ep[\"decision_points\"]:\n        prompts.append({\n            \"prompt\": dp[\"prompt\"],\n            \"hour\": dp[\"hour\"],\n            \"visible_count\": dp[\"visible_count\"],\n            \"episode_id\": ep[\"episode_id\"],\n            \"seed\": ep[\"seed\"],\n            \"drift_events\": ep[\"drift_events\"],\n            \"superseded\": ep.get(\"superseded_messages\", {}),\n            \"messages\": dp[\"visible_messages\"],\n        })\n\nif not prompts:\n    raise ValueError(\"No decision_points found in episodes; cannot train.\")\n\nprint(f\"Loaded {len(episodes)} episodes -> {len(prompts)} training prompts\")\nprint(f\"Average {len(prompts)/len(episodes):.1f} decision points per episode\")",
    "outputs": []
   },
   {
    "cell_type": "markdown",
-   "source": "## Reward Function\n\nScores agent actions based on:\n- **Urgency base** (critical=10, high=5, medium=3, low=1)\n- **Deadline timing** (early=bonus, late=penalty)\n- **Drift adaptation** (+50% for handling policy-change messages)\n- **Stale info penalty** (-50% for acting on superseded messages)\n- **Response quality** (penalty for short/empty responses)",
-   "metadata": {}
   },
   {
    "cell_type": "code",
-   "source": "def score_action(completion: str, prompt_data: dict) -> float:\n    \"\"\"\n    Score a model completion against the inbox state.\n    \n    The model should output: respond_to_message(msg_id, \"response text\")\n    We parse the message_id and response, then score based on the reward function.\n    \"\"\"\n    messages = prompt_data[\"messages\"]\n    hour = prompt_data[\"hour\"]\n    superseded = prompt_data.get(\"superseded\", {})\n    \n    # Parse the model output for message_id\n    msg_id = None\n    response_text = \"\"\n    \n    # Try to parse respond_to_message(msg_id, response)\n    match = re.search(r'respond_to_message\\s*\\(\\s*[\"\\']?(msg_\\d+)[\"\\']?\\s*,\\s*[\"\\'](.+?)[\"\\']', completion, re.DOTALL)\n    if match:\n        msg_id = match.group(1)\n        response_text = match.group(2)\n    else:\n        # Try simpler format: just a message ID mentioned\n        id_match = re.search(r'(msg_\\d+)', completion)\n        if id_match:\n            msg_id = id_match.group(1)\n            # No explicit response text — penalize via quality check below\n            response_text = \"\"\n    \n    if not msg_id:\n        return -1.0  # couldn't parse any action\n    \n    # Find the message in the inbox\n    target_msg = None\n    for msg in messages:\n        if msg[\"id\"] == msg_id:\n            target_msg = msg\n            break\n    \n    if target_msg is None:\n        return -0.5  # referenced a message not in inbox\n    \n    # Base reward by urgency\n    urgency_rewards = {\"critical\": 10.0, \"high\": 5.0, \"medium\": 3.0, \"low\": 1.0}\n    reward = urgency_rewards.get(target_msg[\"urgency\"], 1.0)\n    \n    # Deadline timing\n    deadline = target_msg.get(\"deadline_hours\")\n    if deadline is not None:\n        if hour <= deadline:\n            time_remaining_frac = (deadline - hour) / max(deadline, 1.0)\n            reward *= 1.0 + 0.5 * time_remaining_frac\n        else:\n            reward *= 0.25  # late penalty\n    
\n    # Response quality\n    if len(response_text.strip()) < 10:\n        reward *= 0.5\n    \n    # Drift adaptation bonus\n    if target_msg.get(\"drift_flag\"):\n        reward *= 1.5\n    \n    # Stale info penalty\n    if target_msg[\"id\"] in superseded:\n        reward *= 0.5\n    \n    # Penalize choosing low-urgency when unhandled critical messages exist\n    unhandled_critical = any(\n        m[\"urgency\"] == \"critical\" and not m.get(\"handled\") and not m.get(\"superseded\")\n        for m in messages\n    )\n    if unhandled_critical and target_msg[\"urgency\"] in (\"low\", \"medium\"):\n        reward *= 0.3\n    \n    return round(reward, 2)\n\n\n# Test the reward function\ntest_data = prompts[0]\nprint(\"Testing reward function on first decision point:\")\nprint(f\"  Hour: {test_data['hour']}, Messages: {test_data['visible_count']}\")\n\n# Simulate good action (pick critical message)\ncritical_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"critical\"]\nif critical_msgs:\n    good_action = f'respond_to_message(\"{critical_msgs[0][\"id\"]}\", \"Acknowledged. Evacuating immediately with documents and medication.\")'\n    good_score = score_action(good_action, test_data)\n    print(f\"  Good action (critical msg): {good_score:.2f} pts\")\n\n# Simulate bad action (pick low-urgency message)\nlow_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"low\"]\nif low_msgs:\n    bad_action = f'respond_to_message(\"{low_msgs[0][\"id\"]}\", \"ok\")'\n    bad_score = score_action(bad_action, test_data)\n    print(f\"  Bad action (low msg, short response): {bad_score:.2f} pts\")\n\n# Simulate unparseable action\njunk_score = score_action(\"I think we should do something\", test_data)\nprint(f\"  Unparseable action: {junk_score:.2f} pts\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
-   "source": "## Load Model & Configure GRPO",
-   "metadata": {}
   },
   {
    "cell_type": "code",
-   "source": "from unsloth import FastLanguageModel\n\nmodel, tokenizer = FastLanguageModel.from_pretrained(\n    model_name=MODEL_NAME,\n    max_seq_length=MAX_SEQ_LENGTH,\n    load_in_4bit=True,\n)\n\n# Add LoRA adapters — bigger r for bigger models\nlora_r = 32 if PROFILE == \"h100\" else 16\n\nmodel = FastLanguageModel.get_peft_model(\n    model,\n    r=lora_r,\n    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n                     \"gate_proj\", \"up_proj\", \"down_proj\"],\n    lora_alpha=lora_r,\n    lora_dropout=0,\n    bias=\"none\",\n    use_gradient_checkpointing=\"unsloth\",\n)\nprint(f\"Model loaded: {MODEL_NAME} | LoRA r={lora_r} | ctx={MAX_SEQ_LENGTH}\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
-   "source": "# Build the training dataset\n# Each row needs a \"prompt\" field formatted as chat messages\ntrain_data = []\nfor p in prompts:\n    train_data.append({\n        \"prompt\": [\n            {\"role\": \"user\", \"content\": p[\"prompt\"]},\n        ],\n        # Store metadata for reward calculation (not used by trainer directly)\n        \"_hour\": p[\"hour\"],\n        \"_episode_id\": p[\"episode_id\"],\n    })\n\n# Shuffle and split\nrandom.seed(42)\nrandom.shuffle(train_data)\n\ndataset = Dataset.from_list(train_data)\nprint(f\"Training dataset: {len(dataset)} prompts\")\nprint(f\"Sample prompt length: {len(train_data[0]['prompt'][0]['content'])} chars\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
-   "source": "## GRPO Training Loop\n\nThe reward function scores each completion by:\n1. Parsing which message the model chose to handle\n2. Checking urgency, deadline timing, drift flags\n3. Penalizing bad choices (low-urgency when critical exists, stale info)",
-   "metadata": {}
   },
   {
    "cell_type": "code",
-   "source": "from trl import GRPOConfig, GRPOTrainer\n\n# Build a lookup from (episode_id, hour) -> prompt metadata for reward scoring\nprompt_lookup = {}\nfor p in prompts:\n    key = (p[\"episode_id\"], p[\"hour\"])\n    prompt_lookup[key] = p\n\n\ndef reward_fn(prompts, completions, _episode_id, _hour, **kwargs):\n    \"\"\"\n    GRPO reward function. Scores each completion against its inbox state.\n\n    TRL passes extra dataset columns as keyword arguments, so _episode_id and\n    _hour come directly from the dataset — no need to reverse-lookup from text.\n    \"\"\"\n    rewards = []\n    for completion, ep_id, hour in zip(completions, _episode_id, _hour):\n        key = (ep_id, hour)\n        prompt_data = prompt_lookup.get(key)\n\n        if prompt_data is None:\n            rewards.append(0.0)\n            continue\n\n        if isinstance(completion, list):\n            comp_text = completion[-1][\"content\"] if completion else \"\"\n        else:\n            comp_text = str(completion)\n\n        score = score_action(comp_text, prompt_data)\n        rewards.append(score)\n\n    return rewards\n\n\nprint(f\"Prompt lookup: {len(prompt_lookup)} unique keys (expect {len(prompts)})\")\n\n# GRPO training config — all values from GPU profile\ntraining_args = GRPOConfig(\n    output_dir=\"crisisinbox-grpo-output\",\n    num_train_epochs=3,\n    per_device_train_batch_size=BATCH_SIZE,\n    gradient_accumulation_steps=GRAD_ACCUM,\n    learning_rate=5e-6,\n    max_completion_length=MAX_COMPLETION_LENGTH,\n    max_prompt_length=MAX_PROMPT_LENGTH,\n    num_generations=NUM_GENERATIONS,\n    logging_steps=10,\n    save_steps=100,\n    report_to=\"none\",\n    bf16=True,\n)\n\ntrainer = GRPOTrainer(\n    model=model,\n    processing_class=tokenizer,\n    reward_funcs=reward_fn,\n    args=training_args,\n    train_dataset=dataset,\n)\n\nprint(f\"Trainer configured — batch={BATCH_SIZE}, gen={NUM_GENERATIONS}, prompt≤{MAX_PROMPT_LENGTH}tok\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
-   "source": "# Train!\ntrainer.train()\nprint(\"Training complete\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
-   "source": "## Evaluate Trained Model\n\nSample prompts and check whether the model picks high-urgency messages and produces well-formatted actions.",
-   "metadata": {}
   },
   {
    "cell_type": "code",
-   "source": "# Evaluate on a few test prompts\nFastLanguageModel.for_inference(model)\n\neval_prompts = random.sample(prompts, min(10, len(prompts)))\ntotal_score = 0\n\nprint(f\"=== Trained Model Evaluation ({MODEL_NAME}) ===\\n\")\nfor p in eval_prompts:\n    messages = [{\"role\": \"user\", \"content\": p[\"prompt\"]}]\n    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\", add_generation_prompt=True).to(\"cuda\")\n\n    with torch.no_grad():\n        output = model.generate(inputs, max_new_tokens=MAX_COMPLETION_LENGTH, temperature=0.7, do_sample=True)\n\n    completion = tokenizer.decode(output[0][inputs.shape[1]:], skip_special_tokens=True)\n    score = score_action(completion, p)\n    total_score += score\n\n    # Show a summary\n    msg_match = re.search(r'(msg_\\d+)', completion)\n    chosen_id = msg_match.group(1) if msg_match else \"none\"\n    chosen_msg = next((m for m in p[\"messages\"] if m[\"id\"] == chosen_id), None)\n    urgency = chosen_msg[\"urgency\"] if chosen_msg else \"?\"\n\n    print(f\"Hour {p['hour']:5.1f} | Chose: {chosen_id} ({urgency:8s}) | Score: {score:+.1f}\")\n\nprint(f\"\\nAverage score: {total_score / len(eval_prompts):.2f}\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
-   "source": "# Save the trained model\nmodel.save_pretrained(\"crisisinbox-grpo-trained\")\ntokenizer.save_pretrained(\"crisisinbox-grpo-trained\")\nprint(\"Model saved to crisisinbox-grpo-trained/\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   }

   {
    "cell_type": "markdown",
    "metadata": {},
+   "source": [
+    "# CrisisInbox GRPO Training\n",
+    "\n",
+    "Train a small LLM (Qwen2.5-0.5B) to triage crisis inbox messages using Group Relative Policy Optimization.\n",
+    "\n",
+    "**What this does:**\n",
+    "1. Loads pre-generated episode data (inbox snapshots at decision points)\n",
+    "2. For each prompt, the model generates an action (which message to handle + response)\n",
+    "3. A reward function scores the action based on urgency, deadline, drift adaptation\n",
+    "4. GRPO updates the model to prefer higher-reward actions\n",
+    "\n",
+    "Open in Google Colab with **T4 GPU** runtime."
+   ]
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": "# Install dependencies\n!pip install unsloth trl transformers datasets accelerate peft -q\n!pip install huggingface_hub -q\nprint(\"Setup complete\")",
+   "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "# Avoid logging crash: transformers sometimes passes a Warning type to logger.warning(),\n",
+    "# which breaks %-style formatting. Patch so we don't pass that through.\n",
+    "import logging\n",
+    "import warnings\n",
+    "\n",
+    "def _patch_transformers_logging():\n",
+    "    try:\n",
+    "        import transformers.utils.logging as trans_log\n",
+    "        _orig = trans_log.logger.warning\n",
+    "        def _safe_warning(msg, *args, **kwargs):\n",
+    "            # If the first extra arg is a Warning class (e.g. FutureWarning), drop all extra args so %-formatting cannot fail\n",
+    "            if args and isinstance(args[0], type) and issubclass(args[0], Warning):\n",
+    "                args = ()\n",
+    "            return _orig(msg, *args, **kwargs)\n",
+    "        trans_log.logger.warning = _safe_warning\n",
+    "    except Exception:\n",
+    "        pass\n",
+    "    warnings.filterwarnings(\"ignore\", message=\".*attention mask API.*\", category=FutureWarning)\n",
+    "\n",
+    "_patch_transformers_logging()"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import torch\n",
+    "\n",
+    "# Print GPU info (PyTorch uses total_memory, not total_mem)\n",
+    "if torch.cuda.is_available():\n",
+    "    props = torch.cuda.get_device_properties(0)\n",
+    "    total_bytes = getattr(props, \"total_memory\", None) or getattr(props, \"total_mem\", 0)\n",
+    "    vram_gb = total_bytes / 1e9 if total_bytes else 0\n",
+    "    if vram_gb == 0 and hasattr(torch.cuda, \"mem_get_info\"):\n",
+    "        _, total_bytes = torch.cuda.mem_get_info(0)\n",
+    "        vram_gb = total_bytes / 1e9\n",
+    "    print(f\"GPU: {torch.cuda.get_device_name(0)} ({vram_gb:.1f} GB)\")\n",
+    "else:\n",
+    "    print(\"No GPU available.\")"
+   ],
    "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
    "metadata": {},
+   "source": "import json\nimport re\nimport random\nimport os\nfrom datasets import Dataset\nfrom huggingface_hub import hf_hub_download\n\n# Load episodes from HF dataset\nEPISODES_FILE = \".episodes.json\"\nif not os.path.exists(EPISODES_FILE):\n    print(\"Downloading episodes from HF...\")\n    hf_hub_download(\n        repo_id=\"eptan/crisis-inbox-episodes\",\n        filename=\"episodes.json\",\n        repo_type=\"dataset\",\n        local_dir=\".\",\n        local_dir_use_symlinks=False,\n    )\n    os.rename(\"episodes.json\", EPISODES_FILE)\n\nwith open(EPISODES_FILE) as f:\n    episodes = json.load(f)\n\n# Flatten to individual training prompts\nprompts = []\nfor ep in episodes:\n    for dp in ep[\"decision_points\"]:\n        prompts.append({\n            \"prompt\": dp[\"prompt\"],\n            \"hour\": dp[\"hour\"],\n            \"visible_count\": dp[\"visible_count\"],\n            \"episode_id\": ep[\"episode_id\"],\n            \"seed\": ep[\"seed\"],\n            \"drift_events\": ep[\"drift_events\"],\n            \"superseded\": ep.get(\"superseded_messages\", {}),\n            \"messages\": dp[\"visible_messages\"],\n        })\n\nprint(f\"Loaded {len(episodes)} episodes -> {len(prompts)} training prompts\")\nprint(f\"Average {len(prompts)/len(episodes):.1f} decision points per episode\")",
+   "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Reward Function\n",
+    "\n",
+    "Scores agent actions based on:\n",
+    "- **Urgency base** (critical=10, high=5, medium=3, low=1)\n",
+    "- **Deadline timing** (early=bonus, late=penalty)\n",
+    "- **Drift adaptation** (+50% for handling policy-change messages)\n",
+    "- **Stale info penalty** (-50% for acting on superseded messages)\n",
+    "- **Response quality** (penalty for short/empty responses)"
+   ]
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "def score_action(completion: str, prompt_data: dict) -> float:\n",
+    "    \"\"\"\n",
+    "    Score a model completion against the inbox state.\n",
+    "    \n",
+    "    The model should output: respond_to_message(msg_id, \"response text\")\n",
+    "    We parse the message_id and response, then score based on the reward function.\n",
+    "    \"\"\"\n",
+    "    messages = prompt_data[\"messages\"]\n",
+    "    hour = prompt_data[\"hour\"]\n",
+    "    superseded = prompt_data.get(\"superseded\", {})\n",
+    "    \n",
+    "    # Parse the model output for message_id\n",
+    "    msg_id = None\n",
+    "    response_text = \"\"\n",
+    "    \n",
+    "    # Try to parse respond_to_message(msg_id, response)\n",
+    "    match = re.search(r'respond_to_message\\s*\\(\\s*[\"\\']?(msg_\\d+)[\"\\']?\\s*,\\s*[\"\\'](.+?)[\"\\']', completion, re.DOTALL)\n",
+    "    if match:\n",
+    "        msg_id = match.group(1)\n",
+    "        response_text = match.group(2)\n",
+    "    else:\n",
+    "        # Try simpler format: just a message ID mentioned\n",
+    "        id_match = re.search(r'(msg_\\d+)', completion)\n",
+    "        if id_match:\n",
+    "            msg_id = id_match.group(1)\n",
+    "            response_text = completion\n",
+    "    \n",
+    "    if not msg_id:\n",
+    "        return -1.0  # couldn't parse any action\n",
+    "    \n",
+    "    # Find the message in the inbox\n",
+    "    target_msg = None\n",
+    "    for msg in messages:\n",
+    "        if msg[\"id\"] == msg_id:\n",
+    "            target_msg = msg\n",
+    "            break\n",
+    "    \n",
+    "    if target_msg is None:\n",
+    "        return -0.5  # referenced a message not in inbox\n",
+    "    \n",
+    "    # Base reward by urgency\n",
+    "    urgency_rewards = {\"critical\": 10.0, \"high\": 5.0, \"medium\": 3.0, \"low\": 1.0}\n",
+    "    reward = urgency_rewards.get(target_msg[\"urgency\"], 1.0)\n",
+    "    \n",
+    "    # Deadline timing\n",
+    "    deadline = target_msg.get(\"deadline_hours\")\n",
+    "    if deadline is not None:\n",
+    "        if hour <= deadline:\n",
+    "            time_remaining_frac = (deadline - hour) / max(deadline, 1.0)\n",
+    "            reward *= 1.0 + 0.5 * time_remaining_frac\n",
+    "        else:\n",
+    "            reward *= 0.25  # late penalty\n",
+    "    \n",
+    "    # Response quality\n",
+    "    if len(response_text.strip()) < 10:\n",
+    "        reward *= 0.5\n",
+    "    \n",
+    "    # Drift adaptation bonus\n",
+    "    if target_msg.get(\"drift_flag\"):\n",
+    "        reward *= 1.5\n",
+    "    \n",
+    "    # Stale info penalty\n",
+    "    if target_msg[\"id\"] in superseded:\n",
+    "        reward *= 0.5\n",
+    "    \n",
+    "    # Penalize choosing a low- or medium-urgency message when a critical one exists\n",
+    "    has_critical = any(m[\"urgency\"] == \"critical\" for m in messages)\n",
+    "    if has_critical and target_msg[\"urgency\"] in (\"low\", \"medium\"):\n",
+    "        reward *= 0.3  # strong penalty for ignoring critical messages\n",
+    "    \n",
+    "    return round(reward, 2)\n",
+    "\n",
+    "\n",
+    "# Test the reward function\n",
+    "test_data = prompts[0]\n",
+    "print(\"Testing reward function on first decision point:\")\n",
+    "print(f\"  Hour: {test_data['hour']}, Messages: {test_data['visible_count']}\")\n",
+    "\n",
+    "# Simulate good action (pick critical message)\n",
+    "critical_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"critical\"]\n",
+    "if critical_msgs:\n",
+    "    good_action = f'respond_to_message(\"{critical_msgs[0][\"id\"]}\", \"Acknowledged. Evacuating immediately with documents and medication.\")'\n",
+    "    good_score = score_action(good_action, test_data)\n",
+    "    print(f\"  Good action (critical msg): {good_score:.2f} pts\")\n",
+    "\n",
+    "# Simulate bad action (pick low-urgency message)\n",
+    "low_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"low\"]\n",
+    "if low_msgs:\n",
+    "    bad_action = f'respond_to_message(\"{low_msgs[0][\"id\"]}\", \"ok\")'\n",
+    "    bad_score = score_action(bad_action, test_data)\n",
+    "    print(f\"  Bad action (low msg, short response): {bad_score:.2f} pts\")\n",
+    "\n",
+    "# Simulate unparseable action\n",
+    "junk_score = score_action(\"I think we should do something\", test_data)\n",
+    "print(f\"  Unparseable action: {junk_score:.2f} pts\")"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Model & Configure GRPO"
+   ]
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "from unsloth import FastLanguageModel\n",
+    "import torch\n",
+    "\n",
+    "# Load Qwen2.5-0.5B — small enough for T4 GPU\n",
+    "# Use a longer context window so prompt + completion\n",
+    "# comfortably fit without attention mask shape issues.\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=\"unsloth/Qwen2.5-0.5B-Instruct\",\n",
+    "    max_seq_length=4096,\n",
+    "    dtype=None,\n",
+    "    load_in_4bit=True,\n",
+    ")\n",
+    "\n",
+    "# Add LoRA adapters for efficient fine-tuning\n",
+    "model = FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r=16,\n",
+    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+    "                     \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+    "    lora_alpha=16,\n",
+    "    lora_dropout=0,\n",
+    "    bias=\"none\",\n",
+    "    use_gradient_checkpointing=\"unsloth\",\n",
+    ")\n",
+    "\n",
+    "# GRPO expects left-padding so completion positions align across the batch\n",
+    "# (avoids completion_mask vs log-probs shape mismatch in masked_batch_mean).\n",
+    "if tokenizer.pad_token_id is None:\n",
+    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "tokenizer.padding_side = \"left\"\n",
+    "\n",
+    "print(\"Model loaded with LoRA adapters\")"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "# Build the training dataset\n",
+    "# Each row needs a \"prompt\" field formatted as chat messages.\n",
+    "# Prompts longer than MAX_PROMPT_LENGTH tokens are dropped up front so nothing\n",
+    "# has to be truncated by the trainer (truncation was a source of mask mismatches).\n",
+    "MAX_PROMPT_LENGTH = 1024  # must match GRPOConfig max_prompt_length below\n",
+    "\n",
+    "train_data = []\n",
+    "for p in prompts:\n",
+    "    msgs = [{\"role\": \"user\", \"content\": p[\"prompt\"]}]\n",
+    "    tok = tokenizer.apply_chat_template(msgs, return_tensors=\"pt\", add_generation_prompt=True)\n",
+    "    # apply_chat_template can return a bare tensor or a dict-like BatchEncoding.\n",
+    "    # Try the dict-style lookup first; a plain tensor rejects a string index\n",
+    "    # (IndexError or TypeError depending on the torch version), so catch both.\n",
+    "    try:\n",
+    "        ids = tok[\"input_ids\"]\n",
+    "    except (TypeError, KeyError, IndexError):\n",
+    "        ids = tok\n",
+    "    n_tokens = ids.shape[1] if ids.dim() > 1 else ids.shape[0]\n",
+    "    if n_tokens > MAX_PROMPT_LENGTH:\n",
+    "        continue  # skip overlong prompts instead of truncating them\n",
+    "    train_data.append({\n",
+    "        \"prompt\": msgs,\n",
+    "        \"_hour\": p[\"hour\"],\n",
+    "        \"_episode_id\": p[\"episode_id\"],\n",
+    "    })\n",
+    "\n",
+    "# Shuffle deterministically (no train/eval split is made here)\n",
+    "random.seed(42)\n",
+    "random.shuffle(train_data)\n",
+    "\n",
+    "dataset = Dataset.from_list(train_data)\n",
+    "print(f\"Training dataset: {len(dataset)} prompts (after dropping prompts > {MAX_PROMPT_LENGTH} tokens)\")\n",
+    "# Guard the sample print: every prompt may have been filtered out above.\n",
+    "if train_data:\n",
+    "    print(f\"Sample prompt length: {len(train_data[0]['prompt'][0]['content'])} chars\")"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## GRPO Training Loop\n",
+    "\n",
+    "The reward function scores each completion by:\n",
+    "1. Parsing which message the model chose to handle\n",
+    "2. Checking urgency, deadline timing, drift flags\n",
+    "3. Penalizing bad choices (low-urgency when critical exists, stale info)"
+   ]
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": "from trl import GRPOConfig, GRPOTrainer\n\n# Build a lookup from (episode_id, hour) -> prompt metadata for reward scoring\nprompt_lookup = {}\nfor p in prompts:\n    key = (p[\"episode_id\"], p[\"hour\"])\n    prompt_lookup[key] = p\n\n\ndef reward_fn(prompts, completions, _episode_id=None, _hour=None, **kwargs):\n    \"\"\"GRPO reward function. Scores each completion against its inbox state.\n    TRL passes extra dataset columns as keyword args.\"\"\"\n    rewards = []\n    for i, (prompt_msgs, completion) in enumerate(zip(prompts, completions)):\n        # Look up prompt data by (episode_id, hour) from dataset columns\n        prompt_data = None\n        if _episode_id is not None and _hour is not None:\n            ep_id = _episode_id[i] if hasattr(_episode_id, '__getitem__') else _episode_id\n            hour = _hour[i] if hasattr(_hour, '__getitem__') else _hour\n            # Convert tensor/numpy to Python scalar if needed\n            if hasattr(hour, 'item'):\n                hour = hour.item()\n            prompt_data = prompt_lookup.get((ep_id, hour))\n\n        if prompt_data is None:\n            rewards.append(0.0)\n            continue\n\n        # Extract completion text (trainer may pass token ids or message dicts)\n        if isinstance(completion, list):\n            if completion and isinstance(completion[0], (int, float)):\n                comp_text = tokenizer.decode(completion, skip_special_tokens=True)\n            else:\n                comp_text = completion[-1].get(\"content\", \"\") if completion else \"\"\n        else:\n            comp_text = str(completion)\n\n        score = score_action(comp_text, prompt_data)\n        rewards.append(score)\n\n    return rewards\n\n\n# GRPO training config: conservative batch/length to avoid mask shape mismatch.\ntraining_args = GRPOConfig(\n    output_dir=\"crisisinbox-grpo-output\",\n    num_train_epochs=3,\n    per_device_train_batch_size=1,\n    gradient_accumulation_steps=4,\n    
steps_per_generation=4,\n    learning_rate=5e-6,\n    max_completion_length=256,\n    max_prompt_length=1024,\n    num_generations=2,\n    logging_steps=10,\n    save_steps=100,\n    report_to=\"none\",\n    bf16=False,\n)\n\ntrainer = GRPOTrainer(\n    model=model,\n    processing_class=tokenizer,\n    reward_funcs=reward_fn,\n    args=training_args,\n    train_dataset=dataset,\n)\n\nprint(f\"Trainer configured — {len(prompt_lookup)} unique (episode_id, hour) keys\")\nprint(\"Ready to train\")",
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "# Train!\n",
+    "trainer.train()\n",
+    "print(\"Training complete\")"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluate: Before vs After\n",
+    "\n",
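+    "Score the trained model's action choices on a random sample of prompts. No base-model baseline is computed in the cells below; run the same loop before training to obtain a before/after comparison."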
+   ]
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "# Evaluate on a few test prompts\n",
+    "FastLanguageModel.for_inference(model)\n",
+    "\n",
+    "eval_prompts = random.sample(prompts, min(10, len(prompts)))\n",
+    "total_score = 0\n",
+    "\n",
+    "print(\"=== Trained Model Evaluation ===\\n\")\n",
+    "for p in eval_prompts:\n",
+    "    messages = [{\"role\": \"user\", \"content\": p[\"prompt\"]}]\n",
+    "    raw = tokenizer.apply_chat_template(messages, return_tensors=\"pt\", add_generation_prompt=True)\n",
+    "    # apply_chat_template may return a dict-like BatchEncoding or a bare tensor.\n",
+    "    # Normalize both to a kwargs dict so generate() always receives keyword tensors;\n",
+    "    # passing the dict itself positionally is not a valid generate() input.\n",
+    "    try:\n",
+    "        inputs = {k: v.to(\"cuda\") for k, v in raw.items()}\n",
+    "    except (TypeError, AttributeError):\n",
+    "        inputs = {\"input_ids\": raw.to(\"cuda\")}\n",
+    "    prompt_len = inputs[\"input_ids\"].shape[1]\n",
+    "\n",
+    "    with torch.no_grad():\n",
+    "        output = model.generate(**inputs, max_new_tokens=200, temperature=0.7, do_sample=True)\n",
+    "\n",
+    "    # Decode only the newly generated tokens (everything after the prompt).\n",
+    "    completion = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)\n",
+    "    score = score_action(completion, p)\n",
+    "    total_score += score\n",
+    "\n",
+    "    # Show a summary\n",
+    "    msg_match = re.search(r'(msg_\\d+)', completion)\n",
+    "    chosen_id = msg_match.group(1) if msg_match else \"none\"\n",
+    "    chosen_msg = next((m for m in p[\"messages\"] if m[\"id\"] == chosen_id), None)\n",
+    "    urgency = chosen_msg[\"urgency\"] if chosen_msg else \"?\"\n",
+    "\n",
+    "    print(f\"Hour {p['hour']:5.1f} | Chose: {chosen_id} ({urgency:8s}) | Score: {score:+.1f}\")\n",
+    "\n",
+    "# max(..., 1) avoids ZeroDivisionError when there are no prompts to evaluate.\n",
+    "print(f\"\\nAverage score: {total_score / max(len(eval_prompts), 1):.2f}\")"
+   ],
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "metadata": {},
+   "source": [
+    "# Save the trained model\n",
+    "model.save_pretrained(\"crisisinbox-grpo-trained\")\n",
+    "tokenizer.save_pretrained(\"crisisinbox-grpo-trained\")\n",
+    "print(\"Model saved to crisisinbox-grpo-trained/\")"
+   ],
    "execution_count": null,
    "outputs": []
   }

server/app.py CHANGED Viewed

@@ -23,7 +23,17 @@ except ImportError:
 class MCPAction(Action):
-    """Action class that deserializes both ListToolsAction and CallToolAction."""
     model_config = Action.model_config.copy()
     model_config["extra"] = "allow"

 class MCPAction(Action):
+    """Action class that deserializes both ListToolsAction and CallToolAction.
+    OpenEnv 0.2.1's WS handler passes a single action_cls to
+    deserialize_action(), but MCPToolClient sends both ListToolsAction
+    and CallToolAction through the "step" message path. Since the base
+    Action model uses extra="forbid", a fixed action_cls can't handle
+    both shapes. We override model_validate to route by the "type" field
+    so that MCPEnvironment.step() receives the correct Action subclass.
+    extra="allow" is needed because the two action types have different
+    field sets.
+    """
     model_config = Action.model_config.copy()
     model_config["extra"] = "allow"

server/crisis_inbox_environment.py CHANGED Viewed

@@ -34,6 +34,9 @@ class CrisisInboxEnvironment(MCPEnvironment):
     """
     Simulates a 48-hour post-disaster inbox triage scenario.
     The agent receives messages from family, employers, government agencies,
     insurance companies, and service providers. It must prioritize safety,
     meet deadlines, and adapt to changing rules (schema drift).

     """
     Simulates a 48-hour post-disaster inbox triage scenario.
+    Note: SUPPORTS_CONCURRENT_SESSIONS is False (default) because the
+    environment holds mutable per-episode state (_all_messages, _handled, etc.).
     The agent receives messages from family, employers, government agencies,
     insurance companies, and service providers. It must prioritize safety,
     meet deadlines, and adapt to changing rules (schema drift).

training/crisisinbox_training.py CHANGED Viewed

@@ -4,8 +4,7 @@ Person B: ML Pipeline
 Run this in Google Colab:
 1. Upload this file
-2. Upload episodes.json from repo
-3. Run: python crisisinbox_training.py
 """
 import torch
@@ -16,12 +15,14 @@ from datasets import Dataset
 from unsloth import FastLanguageModel
 from trl import GRPOConfig, GRPOTrainer
-# Download episodes from GitHub repo
 print("Loading episodes...")
-import urllib.request
-urllib.request.urlretrieve(
-    "https://raw.githubusercontent.com/eptan/crisis-inbox/main/episodes.json",
-    "episodes.json"
 )
 with open("episodes.json", "r") as f:
@@ -33,6 +34,9 @@ print(f"✓ Loaded {len(EPISODES)} episodes")
 # PROMPT BUILDING
 # =============================================================================
 CRISIS_SYSTEM_PROMPT = """
 You are an assistant helping a working parent during a wildfire.
 You must triage messages, act on safety-critical items first,
@@ -56,10 +60,17 @@ def build_crisis_prompt(episode):
         msgs_str.append(
             f"[t={m['time']}h] {urgency} From {m['sender']} via {m['channel']}: {m['content']}{deadline_info}"
         )
     drift_str = []
     for d in episode.get("schema_events", []):
         drift_str.append(f"[t={d['time']}h] POLICY UPDATE: {d['kind']} -> {d.get('new_value', 'changed')}")
     user_content = (
         "Here is your 48-hour message history:\n\n"
@@ -80,22 +91,34 @@ def build_crisis_prompt(episode):
 def parse_plan(model_output):
     """Parse <plan> tag output into list of action dicts."""
     actions = []
-    plan_match = re.search(r'<plan>(.*?)</plan>', model_output, re.DOTALL | re.IGNORECASE)
-    if not plan_match:
         return []
-    plan_content = plan_match.group(1).strip()
-    lines = plan_content.split('\n')
     for line in lines:
         line = line.strip()
         if not line or not line[0].isdigit():
             continue
         # Extract time: [time=min X]
-        time_match = re.search(r'time=min (\d+)', line)
         time_min = int(time_match.group(1)) if time_match else 0
         # Extract action description
@@ -292,7 +315,9 @@ MODEL_NAME = "unsloth/Qwen2.5-0.5B-Instruct"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=MODEL_NAME,
-    max_seq_length=2048,
     dtype=None,
     load_in_4bit=True,
 )
@@ -355,7 +380,8 @@ training_args = GRPOConfig(
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     num_generations=4,
-    max_completion_length=512,
     temperature=0.7,
     learning_rate=1e-5,
     logging_steps=10,

 Run this in Google Colab:
 1. Upload this file
+2. Run: python crisisinbox_training.py
 """
 import torch
 from unsloth import FastLanguageModel
 from trl import GRPOConfig, GRPOTrainer
+# Download episodes from HF dataset
 print("Loading episodes...")
+from huggingface_hub import hf_hub_download
+hf_hub_download(
+    repo_id="eptan/crisis-inbox-episodes",
+    filename="episodes.json",
+    repo_type="dataset",
+    local_dir=".",
 )
 with open("episodes.json", "r") as f:
 # PROMPT BUILDING
 # =============================================================================
+MAX_MESSAGES = 40
+MAX_DRIFT_EVENTS = 20
 CRISIS_SYSTEM_PROMPT = """
 You are an assistant helping a working parent during a wildfire.
 You must triage messages, act on safety-critical items first,
         msgs_str.append(
             f"[t={m['time']}h] {urgency} From {m['sender']} via {m['channel']}: {m['content']}{deadline_info}"
         )
+    # Keep only the most recent messages to avoid overlong sequences.
+    if len(msgs_str) > MAX_MESSAGES:
+        msgs_str = msgs_str[-MAX_MESSAGES:]
     drift_str = []
     for d in episode.get("schema_events", []):
         drift_str.append(f"[t={d['time']}h] POLICY UPDATE: {d['kind']} -> {d.get('new_value', 'changed')}")
+    if len(drift_str) > MAX_DRIFT_EVENTS:
+        drift_str = drift_str[-MAX_DRIFT_EVENTS:]
     user_content = (
         "Here is your 48-hour message history:\n\n"
 def parse_plan(model_output):
     """Parse <plan> tag output into list of action dicts."""
+    if model_output is None:
+        return []
+    # TRL/Unsloth can return completions as lists (token ids, strings, or message dicts).
+    # Normalize to a single string before regex parsing.
+    if isinstance(model_output, list):
+        if model_output and isinstance(model_output[0], dict) and "content" in model_output[0]:
+            model_output = "\n".join(str(m.get("content", "")) for m in model_output)
+        else:
+            model_output = "\n".join(map(str, model_output))
+    else:
+        model_output = str(model_output)
     actions = []
+    plan_match = re.search(r"<plan>(.*?)</plan>", model_output, re.DOTALL | re.IGNORECASE)
+    plan_content = plan_match.group(1).strip() if plan_match else model_output.strip()
+    if not plan_content:
         return []
+    lines = plan_content.split("\n")
     for line in lines:
         line = line.strip()
         if not line or not line[0].isdigit():
             continue
         # Extract time: [time=min X]
+        time_match = re.search(r"time=min (\d+)", line, re.IGNORECASE)
         time_min = int(time_match.group(1)) if time_match else 0
         # Extract action description
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=MODEL_NAME,
+    # Allow longer combined prompt + completion to avoid
+    # attention mask shape mismatches during training.
+    max_seq_length=4096,
     dtype=None,
     load_in_4bit=True,
 )
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     num_generations=4,
+    # Keep completions modest so prompt+completion stay well within max_seq_length.
+    max_completion_length=256,
     temperature=0.7,
     learning_rate=1e-5,
     logging_steps=10,