Spaces:

eptan
/

crisis-inbox

Sleeping

App Files Files Community

eptan committed on Mar 8

Commit

fdb5700

verified ·

1 Parent(s): d5fe40e

Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

messages.py +275 -0
models.py +16 -0
notebooks/crisisinbox_grpo_connected copy.ipynbd +435 -0
notebooks/crisisinbox_grpo_connected.ipynb +3 -1
server/crisis_inbox_environment.py +77 -1
server/rewards.py +4 -0

messages.py CHANGED Viewed

@@ -1158,4 +1158,279 @@ ALL_MESSAGES: list[Message] = [
         urgency=Urgency.LOW,
         timestamp_hours=47.5,
     ),
 ]

         urgency=Urgency.LOW,
         timestamp_hours=47.5,
     ),
+    # ========== CONFLICTING DEADLINES ==========
+    # These pairs have overlapping deadlines — the agent can only do one.
+    # Conflict pair 1: School pickup vs Insurance call (both at hour ~8)
+    Message(
+        id="msg_074",
+        sender="Oakwood Elementary",
+        channel=Channel.PHONE,
+        subject="URGENT: Early dismissal pickup required by 2pm",
+        content=(
+            "This is Oakwood Elementary calling about Emma and Jake. Due to the storm, "
+            "we are doing an emergency early dismissal at 2pm today. Your sister listed you "
+            "as emergency pickup. If no authorized adult arrives by 2pm, we will need to "
+            "contact Child Protective Services per district policy. Please confirm."
+        ),
+        urgency=Urgency.CRITICAL,
+        timestamp_hours=6.0,
+        deadline_hours=8.0,
+        conflicts_with="msg_075",
+    ),
+    Message(
+        id="msg_075",
+        sender="State Farm Insurance",
+        channel=Channel.PHONE,
+        subject="Scheduled damage assessment call - don't miss",
+        content=(
+            "This is your scheduled callback from State Farm. An adjuster is available to "
+            "do a phone assessment of your property damage between 1:30pm and 2:15pm today ONLY. "
+            "If you miss this window, the next available slot is in 12 days. Missing the initial "
+            "assessment may delay your claim payout by 4-6 weeks. Please be available."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=6.0,
+        deadline_hours=8.5,
+        dependencies=["msg_004"],
+        conflicts_with="msg_074",
+    ),
+    # Conflict pair 2: Boss presentation vs FEMA registration (both at hour ~14)
+    Message(
+        id="msg_076",
+        sender="Boss",
+        channel=Channel.SMS,
+        subject="Client pushed meeting to today - need you on Zoom at 2pm",
+        content=(
+            "Bad news, Meridian moved the meeting to today. I need you on the Zoom call at "
+            "2pm sharp to present your section. It's 30 minutes max. I already told them "
+            "you'd be there. This is the account we've been working on for 6 months. "
+            "Don't let me down."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=12.0,
+        deadline_hours=14.5,
+        conflicts_with="msg_077",
+    ),
+    Message(
+        id="msg_077",
+        sender="FEMA",
+        channel=Channel.GOVERNMENT_ALERT,
+        subject="In-person registration window: 1pm-3pm TODAY ONLY",
+        content=(
+            "FEMA Disaster Recovery Center at Sacramento Convention Center is open for "
+            "in-person registration TODAY ONLY from 1pm to 3pm. In-person registrations "
+            "receive priority processing (2-3 weeks vs 6-8 weeks online). Bring ID, proof "
+            "of residence, and damage documentation. This is the only in-person session "
+            "scheduled for your zip code."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=11.5,
+        deadline_hours=15.0,
+        conflicts_with="msg_076",
+    ),
+    # ========== ESCALATION CHAINS ==========
+    # These messages escalate (spawn angry follow-ups) if not handled in time.
+    Message(
+        id="msg_078",
+        sender="Neighbor Dave",
+        channel=Channel.SMS,
+        subject="Can you help me board up windows?",
+        content=(
+            "Hey man, the plywood I got is too big for me to handle alone. My wife's at her "
+            "mom's with the kids. Can you come over for like 20 minutes to help me board up "
+            "the front windows? I'll return the favor anytime. I'm at 422 Oak St."
+        ),
+        urgency=Urgency.MEDIUM,
+        timestamp_hours=3.0,
+        deadline_hours=6.0,
+        escalation_trigger="msg_078e",
+        escalation_delay_hours=1.0,
+    ),
+    # Escalation: Dave's follow-up (injected by environment if msg_078 unhandled by hour 7)
+    Message(
+        id="msg_078e",
+        sender="Neighbor Dave",
+        channel=Channel.SMS,
+        subject="Window broke. Thanks for nothing",
+        content=(
+            "Well the front window just blew in. Glass everywhere. Would've taken you 20 "
+            "minutes, man. Twenty minutes. Now I've got water pouring into my living room "
+            "and I'm trying to tape a tarp up by myself. I hope whatever you were doing "
+            "was worth it. Don't bother coming now."
+        ),
+        urgency=Urgency.LOW,
+        timestamp_hours=7.0,
+    ),
+    Message(
+        id="msg_079",
+        sender="Boss",
+        channel=Channel.EMAIL,
+        subject="Slides due by 5pm - FINAL warning",
+        content=(
+            "I haven't received your section of the Meridian slides. I need them by 5pm "
+            "today or I'm giving your section to Sarah and we'll discuss this when things "
+            "settle down. I understand the situation but the client doesn't care about hurricanes. "
+            "5pm. Final."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=15.0,
+        deadline_hours=17.0,
+        escalation_trigger="msg_079e",
+        escalation_delay_hours=2.0,
+    ),
+    # Escalation: Boss fires you from the project (injected if msg_079 unhandled by hour 19)
+    Message(
+        id="msg_079e",
+        sender="Boss",
+        channel=Channel.EMAIL,
+        subject="Re: Slides - Gave your section to Sarah",
+        content=(
+            "I waited. Nothing. Sarah's handling your section now. I covered for you with "
+            "the client but I'm not going to lie — this isn't a good look. We'll need to "
+            "have a conversation when you're back. I get it's a disaster but everyone else "
+            "managed to check in."
+        ),
+        urgency=Urgency.MEDIUM,
+        timestamp_hours=19.0,
+    ),
+    Message(
+        id="msg_080",
+        sender="Mom",
+        channel=Channel.SMS,
+        subject="WHY ARENT YOU ANSWERING",
+        content=(
+            "I've called you SEVEN times. Your father is in the car ready to drive down. "
+            "Please just send ONE TEXT so I know you're alive. I am losing my mind. "
+            "If I don't hear from you in the next hour I'm calling 911."
+        ),
+        urgency=Urgency.CRITICAL,
+        timestamp_hours=4.0,
+        deadline_hours=5.0,
+        escalation_trigger="msg_080e",
+        escalation_delay_hours=1.5,
+    ),
+    # Escalation: Mom actually calls 911 (injected if msg_080 unhandled by hour 6.5)
+    Message(
+        id="msg_080e",
+        sender="Mom",
+        channel=Channel.SMS,
+        subject="Called 911. Dad is driving down",
+        content=(
+            "That's it. I called 911 and filed a welfare check. Your father is on the highway. "
+            "I don't care if you're busy. I don't care if you think I'm overreacting. "
+            "You don't go SILENT during a hurricane. If you see this call me IMMEDIATELY. "
+            "I haven't slept."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=6.5,
+    ),
+    # ========== MULTI-TURN CONVERSATIONS ==========
+    # Responding to these messages triggers a follow-up requiring another action.
+    Message(
+        id="msg_081",
+        sender="State Farm Insurance",
+        channel=Channel.EMAIL,
+        subject="Claim received - additional photos needed",
+        content=(
+            "Thank you for filing your initial claim (#SF-2026-84721). However, our adjuster "
+            "needs additional documentation before we can proceed: (1) Close-up photos of roof "
+            "damage, (2) Water line marks on interior walls, (3) Serial numbers of damaged "
+            "electronics. Please reply with these within 12 hours to keep your claim in the "
+            "expedited queue."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=16.0,
+        deadline_hours=28.0,
+        dependencies=["msg_004"],
+        reply_trigger="msg_081r",
+    ),
+    # Reply: Adjuster confirms and asks one more thing
+    Message(
+        id="msg_081r",
+        sender="State Farm Insurance",
+        channel=Channel.EMAIL,
+        subject="Re: Claim #SF-2026-84721 - One more step",
+        content=(
+            "Got your photos, thank you. Your claim is being processed. One final step: "
+            "we need you to sign the digital authorization form I've attached. This authorizes "
+            "our contractor to begin repairs. Without your signature, repairs cannot start "
+            "even if the claim is approved. Please sign within 6 hours."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=0.0,  # Timestamp set dynamically when injected
+        deadline_hours=0.0,   # Deadline set dynamically (current_hour + 6)
+    ),
+    Message(
+        id="msg_082",
+        sender="Sister",
+        channel=Channel.SMS,
+        subject="Can you keep the kids overnight?",
+        content=(
+            "Hey so my boss is now saying we have to work through the night because of the storm "
+            "damage at the warehouse. Can you keep Emma and Jake overnight? I know it's a lot "
+            "to ask right now but I literally have no other option. Mom and Dad's power is out. "
+            "They have their backpacks with PJs and stuff."
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=18.0,
+        deadline_hours=20.0,
+        reply_trigger="msg_082r",
+    ),
+    # Reply: Sister responds with logistics
+    Message(
+        id="msg_082r",
+        sender="Sister",
+        channel=Channel.SMS,
+        subject="Re: Kids overnight - Emma's medication",
+        content=(
+            "OMG thank you, you're a lifesaver. One thing — Emma needs her allergy medication "
+            "at 8pm. It's the pink liquid in her backpack front pocket. 5mL. She knows but she'll "
+            "try to skip it because it tastes bad. Don't let her. Also Jake needs a nightlight "
+            "or he won't sleep. Sorry I'm the worst. I owe you forever."
+        ),
+        urgency=Urgency.MEDIUM,
+        timestamp_hours=0.0,  # Set dynamically
+        deadline_hours=0.0,   # Set dynamically (current_hour + 2)
+    ),
+    Message(
+        id="msg_083",
+        sender="Neighbor Dave",
+        channel=Channel.SMS,
+        subject="Found your dog!!",
+        content=(
+            "Dude your dog is in my backyard! Max must have gotten out through the fence that blew "
+            "down. He's soaking wet but seems ok. I put him in my garage with a towel. Come get "
+            "him when you can but he seems pretty stressed — keeps whining. Let me know."
+        ),
+        urgency=Urgency.MEDIUM,
+        timestamp_hours=9.0,
+        reply_trigger="msg_083r",
+    ),
+    # Reply: Dave found something concerning about the dog
+    Message(
+        id="msg_083r",
+        sender="Neighbor Dave",
+        channel=Channel.SMS,
+        subject="Re: Your dog - he's limping",
+        content=(
+            "Hey so Max is limping on his back left leg. I didn't notice at first because he was "
+            "just laying down but when I gave him water he got up and he's definitely favoring it. "
+            "Might want to get him to a vet. I think the emergency vet on J Street is still open "
+            "despite the storm. Want me to drive you two over there?"
+        ),
+        urgency=Urgency.HIGH,
+        timestamp_hours=0.0,  # Set dynamically
+        deadline_hours=0.0,   # Set dynamically (current_hour + 4)
+    ),
 ]

models.py CHANGED Viewed

@@ -66,3 +66,19 @@ class Message(BaseModel):
         default=None,
         description="ID of a previous message this one replaces (due to drift)",
     )

         default=None,
         description="ID of a previous message this one replaces (due to drift)",
     )
+    conflicts_with: Optional[str] = Field(
+        default=None,
+        description="ID of another message with an overlapping deadline — only one can be handled",
+    )
+    escalation_trigger: Optional[str] = Field(
+        default=None,
+        description="ID of a follow-up message that appears if THIS message is not handled in time",
+    )
+    escalation_delay_hours: Optional[float] = Field(
+        default=None,
+        description="Hours after this message's deadline before the escalation fires",
+    )
+    reply_trigger: Optional[str] = Field(
+        default=None,
+        description="ID of a follow-up message injected when THIS message is handled (multi-turn)",
+    )

notebooks/crisisinbox_grpo_connected copy.ipynbd ADDED Viewed

	@@ -0,0 +1,435 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ym4tunggrm",
+   "source": "# CrisisInbox GRPO Training (Connected to HF Space)\n\nTrain a small LLM (Qwen2.5-0.5B) to triage crisis inbox messages using Group Relative Policy Optimization.\n\n**This notebook connects to the live CrisisInbox environment** deployed on HuggingFace Spaces at `https://eptan-crisis-inbox.hf.space` to collect training episodes in real-time, then trains the model using GRPO.\n\n**Stack:** HF TRL + PEFT (LoRA on full bf16 model \u2014 no quantization needed for 0.5B)\n\n**What this does:**\n1. Connects to the deployed CrisisInbox environment via WebSocket\n2. Collects episodes by interacting with the environment (reset, list tools, call tools)\n3. Builds training prompts from live environment observations\n4. Trains the model with GRPO using a reward function\n5. Evaluates the trained model against the live environment\n\nOpen in Google Colab or Northflank with a GPU runtime.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "p0j1w7pr7ib",
+   "source": [
+    "# Install dependencies (pure HF TRL + PEFT, no quantization needed for 0.5B model)\n",
+    "!pip install trl transformers datasets accelerate peft -q\n",
+    "!pip install \"openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git\" -q\n",
+    "!pip install huggingface_hub matplotlib -q\n",
+    "print(\"Setup complete\")\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "sg4tfghxfgb",
+   "source": "# Patch transformers logging crash\nimport logging\nimport warnings\n\ndef _patch_transformers_logging():\n    try:\n        import transformers.utils.logging as trans_log\n        _orig = trans_log.logger.warning\n        def _safe_warning(msg, *args, **kwargs):\n            if args and isinstance(args[0], type) and issubclass(args[0], Warning):\n                args = ()\n            return _orig(msg, *args, **kwargs)\n        trans_log.logger.warning = _safe_warning\n    except Exception:\n        pass\n    warnings.filterwarnings(\"ignore\", message=\".*attention mask API.*\", category=FutureWarning)\n\n_patch_transformers_logging()",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "t0a3hdk1jqk",
+   "source": "import torch\n\nif torch.cuda.is_available():\n    props = torch.cuda.get_device_properties(0)\n    total_bytes = getattr(props, \"total_memory\", None) or getattr(props, \"total_mem\", 0)\n    vram_gb = total_bytes / 1e9 if total_bytes else 0\n    if vram_gb == 0 and hasattr(torch.cuda, \"mem_get_info\"):\n        _, total_bytes = torch.cuda.mem_get_info(0)\n        vram_gb = total_bytes / 1e9\n    print(f\"GPU: {torch.cuda.get_device_name(0)} ({vram_gb:.1f} GB)\")\nelse:\n    print(\"No GPU available.\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b4l2fp3jm5",
+   "source": "## Connect to CrisisInbox Environment\n\nConnect to the live environment on HuggingFace Spaces via WebSocket and collect episodes by running through the 48-hour simulation multiple times with different seeds.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "pmbt9gcp9hb",
+   "source": "import json\nimport time as _time\nfrom openenv.core.mcp_client import MCPToolClient\n\nBASE_URL = \"https://eptan-crisis-inbox.hf.space\"\n\n# Wake up the HF Space (may be sleeping) and verify connectivity\nprint(\"Connecting to HF Space (may take a moment if cold-starting)...\")\nfor attempt in range(3):\n    try:\n        with MCPToolClient(base_url=BASE_URL, connect_timeout_s=60.0).sync() as env:\n            env.reset(seed=0)\n            tools = env.list_tools()\n            print(f\"Connected! Available tools: {[t.name for t in tools]}\")\n            for t in tools:\n                print(f\"  - {t.name}: {t.description[:80]}...\")\n\n            status = json.loads(env.call_tool(\"get_status\"))\n            print(f\"\\nEnvironment ready \u2014 {status['messages_total_arrived']} messages at hour {status['current_hour']}\")\n        break\n    except Exception as e:\n        if attempt < 2:\n            print(f\"  Attempt {attempt + 1} failed ({e}), retrying in 10s...\")\n            _time.sleep(10)\n        else:\n            raise RuntimeError(f\"Could not connect to {BASE_URL} after 3 attempts: {e}\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "rfzviywy9od",
+   "source": [
+    "import random\n",
+    "\n",
+    "\n",
+    "def collect_episode(base_url, seed, time_steps=None):\n",
+    "    \"\"\"Replay one seeded episode on the live environment and snapshot the inbox.\n",
+    "\n",
+    "    Args:\n",
+    "        base_url: URL of the deployed environment, passed to MCPToolClient.\n",
+    "        seed: Seed forwarded to env.reset().\n",
+    "        time_steps: Simulation hours at which to snapshot the inbox; defaults to\n",
+    "            ten checkpoints between hour 0 and hour 47.\n",
+    "\n",
+    "    Returns:\n",
+    "        Dict with episode_id, seed, drift_events (always empty here),\n",
+    "        superseded_messages and the list of decision_points.\n",
+    "    \"\"\"\n",
+    "    checkpoints = [0, 2, 6, 12, 18, 24, 30, 36, 42, 47] if time_steps is None else time_steps\n",
+    "\n",
+    "    stale_ids = {}\n",
+    "    snapshots = []\n",
+    "\n",
+    "    client = MCPToolClient(\n",
+    "        base_url=base_url, connect_timeout_s=60.0, message_timeout_s=120.0,\n",
+    "    )\n",
+    "    with client.sync() as env:\n",
+    "        env.reset(seed=seed)\n",
+    "        now = 0.0\n",
+    "\n",
+    "        for target_hour in checkpoints:\n",
+    "            # Advance in jumps of at most 4 hours until the checkpoint is reached.\n",
+    "            while now < target_hour - 0.1:\n",
+    "                env.call_tool(\"advance_time\", hours=min(4.0, target_hour - now))\n",
+    "                now = json.loads(env.call_tool(\"get_status\"))[\"current_hour\"]\n",
+    "\n",
+    "            inbox = json.loads(env.call_tool(\"get_inbox\"))\n",
+    "            prompt = env.call_tool(\"get_prompt\")  # Server builds the prompt\n",
+    "\n",
+    "            # Remember every message the server has flagged as superseded so far.\n",
+    "            stale_ids.update((m[\"id\"], \"\") for m in inbox if m.get(\"superseded\"))\n",
+    "\n",
+    "            # Nothing left to decide at this checkpoint -> no training example.\n",
+    "            if all(m.get(\"handled\", False) for m in inbox):\n",
+    "                continue\n",
+    "\n",
+    "            snapshots.append({\n",
+    "                \"hour\": target_hour,\n",
+    "                \"visible_count\": len(inbox),\n",
+    "                \"prompt\": prompt,\n",
+    "                \"messages\": inbox,\n",
+    "                \"superseded\": dict(stale_ids),\n",
+    "            })\n",
+    "\n",
+    "    return {\n",
+    "        \"episode_id\": f\"ep_{seed}\",\n",
+    "        \"seed\": seed,\n",
+    "        \"drift_events\": [],\n",
+    "        \"superseded_messages\": stale_ids,\n",
+    "        \"decision_points\": snapshots,\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "# Test: collect one episode\n",
+    "print(\"Collecting test episode (seed=42)...\")\n",
+    "test_ep = collect_episode(BASE_URL, seed=42)\n",
+    "print(f\"Episode {test_ep['episode_id']}: {len(test_ep['decision_points'])} decision points\")\n",
+    "for dp in test_ep[\"decision_points\"]:\n",
+    "    print(f\"  Hour {dp['hour']:5.1f}: {dp['visible_count']} messages visible\")\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "nmmfb62s9ph",
+   "source": "# Collect multiple episodes from the live environment.\n# Each seed gets up to 3 attempts; a seed that keeps failing is skipped (best effort).\nNUM_EPISODES = 10\nSEEDS = list(range(NUM_EPISODES))\n\nepisodes = []\nfor seed in SEEDS:\n    print(f\"Collecting episode {seed + 1}/{NUM_EPISODES} (seed={seed})...\", end=\" \")\n    for attempt in range(3):\n        try:\n            ep = collect_episode(BASE_URL, seed=seed)\n            episodes.append(ep)\n            print(f\"{len(ep['decision_points'])} decision points\")\n            break\n        except Exception as e:\n            if attempt < 2:\n                print(f\"retry {attempt + 1}...\", end=\" \")\n                _time.sleep(5)\n            else:\n                print(f\"FAILED ({e}), skipping\")\n\n# Fail with a clear message instead of a ZeroDivisionError in the summary below.\nif not episodes:\n    raise RuntimeError(f\"No episodes could be collected from {BASE_URL}\")\n\n# Flatten to training prompts (one per decision point)\nprompts = []\nfor ep in episodes:\n    for dp in ep[\"decision_points\"]:\n        prompts.append({\n            \"prompt\": dp[\"prompt\"],\n            \"hour\": dp[\"hour\"],\n            \"visible_count\": dp[\"visible_count\"],\n            \"episode_id\": ep[\"episode_id\"],\n            \"seed\": ep[\"seed\"],\n            \"drift_events\": ep[\"drift_events\"],\n            \"superseded\": ep.get(\"superseded_messages\", {}),\n            # collect_episode() stores the inbox snapshot under the key \"messages\"\n            \"messages\": dp[\"messages\"],\n        })\n\nprint(f\"\\nCollected {len(episodes)} episodes -> {len(prompts)} training prompts\")\nprint(f\"Average {len(prompts)/len(episodes):.1f} decision points per episode\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "pg833f350e",
+   "source": "## Reward Function\n\nScores agent actions based on:\n- **Urgency base** (critical=10, high=5, medium=3, low=1)\n- **Deadline timing** (early=bonus, late=penalty)\n- **Drift adaptation** (+50% for handling policy-change messages)\n- **Stale info penalty** (-50% for acting on superseded messages)\n- **Response quality** (penalty for short/empty responses)",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "2xd2afp4g99",
+   "source": [
+    "import re\n",
+    "\n",
+    "\n",
+    "def score_action(completion, prompt_data):\n",
+    "    \"\"\"Score a model completion against the inbox snapshot in prompt_data.\n",
+    "\n",
+    "    Intended to mirror the server's calculate_reward() (not visible in this notebook).\n",
+    "\n",
+    "    Args:\n",
+    "        completion: Raw text generated by the model.\n",
+    "        prompt_data: Dict with the keys messages (inbox snapshot), hour and,\n",
+    "            optionally, superseded (ids of messages replaced by newer ones).\n",
+    "\n",
+    "    Returns:\n",
+    "        -1.0 if no message id can be found in the completion, -0.5 if the id is\n",
+    "        not part of the snapshot, otherwise the shaped reward rounded to 2 decimals.\n",
+    "    \"\"\"\n",
+    "    messages = prompt_data[\"messages\"]\n",
+    "    hour = prompt_data[\"hour\"]\n",
+    "    superseded = prompt_data.get(\"superseded\", {})\n",
+    "\n",
+    "    # Ids may carry a letter suffix (msg_078e, msg_081r). The response text must end\n",
+    "    # at the SAME quote character that opened it (backreference \\2), so an apostrophe\n",
+    "    # inside a double-quoted response (I'm on my way) no longer cuts it short.\n",
+    "    match = re.search(\n",
+    "        r'respond_to_message\\s*\\(\\s*[\"\\']?(msg_\\d+[a-z]*)[\"\\']?\\s*,\\s*([\"\\'])(.+?)\\2',\n",
+    "        completion, re.DOTALL,\n",
+    "    )\n",
+    "    if match:\n",
+    "        msg_id, response_text = match.group(1), match.group(3)\n",
+    "    else:\n",
+    "        # Fallback: any message id mentioned anywhere in the completion.\n",
+    "        id_match = re.search(r'(msg_\\d+[a-z]*)', completion)\n",
+    "        if id_match:\n",
+    "            msg_id, response_text = id_match.group(1), completion[:200]\n",
+    "        else:\n",
+    "            return -1.0\n",
+    "\n",
+    "    target = next((m for m in messages if m[\"id\"] == msg_id), None)\n",
+    "    if target is None:\n",
+    "        return -0.5\n",
+    "\n",
+    "    urgency_rewards = {\"critical\": 10.0, \"high\": 5.0, \"medium\": 3.0, \"low\": 1.0}\n",
+    "    reward = urgency_rewards.get(target[\"urgency\"], 1.0)\n",
+    "\n",
+    "    # Up to +50% for acting well before the deadline, 75% cut once it has passed.\n",
+    "    deadline = target.get(\"deadline_hours\")\n",
+    "    if deadline is not None:\n",
+    "        if hour <= deadline:\n",
+    "            reward *= 1.0 + 0.5 * ((deadline - hour) / max(deadline, 1.0))\n",
+    "        else:\n",
+    "            reward *= 0.25\n",
+    "\n",
+    "    # Near-empty responses are worth half.\n",
+    "    if len(response_text.strip()) < 10:\n",
+    "        reward *= 0.5\n",
+    "\n",
+    "    if target.get(\"drift_flag\"):\n",
+    "        reward *= 1.5\n",
+    "\n",
+    "    # Acting on information that has since been replaced is worth half.\n",
+    "    if target[\"id\"] in superseded:\n",
+    "        reward *= 0.5\n",
+    "\n",
+    "    # Picking a low/medium message while a critical one is still open is penalised.\n",
+    "    unhandled = [m for m in messages if not m.get(\"handled\") and m[\"id\"] != msg_id]\n",
+    "    if any(m[\"urgency\"] == \"critical\" for m in unhandled) and target[\"urgency\"] in (\"low\", \"medium\"):\n",
+    "        reward *= 0.3\n",
+    "\n",
+    "    return round(reward, 2)\n",
+    "\n",
+    "\n",
+    "# Test\n",
+    "test_data = prompts[0]\n",
+    "print(\"Testing reward function:\")\n",
+    "print(f\"  Hour: {test_data['hour']}, Messages: {test_data['visible_count']}\")\n",
+    "critical_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"critical\"]\n",
+    "if critical_msgs:\n",
+    "    good = f'respond_to_message(\"{critical_msgs[0][\"id\"]}\", \"Evacuating now with documents.\")'\n",
+    "    print(f\"  Good action (critical): {score_action(good, test_data):.2f}\")\n",
+    "low_msgs = [m for m in test_data[\"messages\"] if m[\"urgency\"] == \"low\"]\n",
+    "if low_msgs:\n",
+    "    bad = f'respond_to_message(\"{low_msgs[0][\"id\"]}\", \"ok\")'\n",
+    "    print(f\"  Bad action (low, short): {score_action(bad, test_data):.2f}\")\n",
+    "print(f\"  Unparseable: {score_action('do something', test_data):.2f}\")\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0fr2fzreorqr",
+   "source": "## Load Model & Baseline Evaluation\n\nLoad the model, run a **pre-training baseline** against the live environment, then configure GRPO training.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "zey499u5w1a",
+   "source": "from transformers import AutoModelForCausalLM, AutoTokenizer\nfrom peft import LoraConfig\nimport torch\n\n# Auto-detect precision\n_use_bf16 = torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False\n_compute_dtype = torch.bfloat16 if _use_bf16 else torch.float16\n\n# Load in full bf16/fp16 \u2014 no 4-bit quantization.\n# Qwen2.5-0.5B is ~1GB in bf16, fits easily on any GPU.\n# This avoids all bitsandbytes dtype mismatch issues with lm_head.\nmodel = AutoModelForCausalLM.from_pretrained(\n    \"Qwen/Qwen2.5-0.5B-Instruct\",\n    device_map=\"auto\",\n    torch_dtype=_compute_dtype,\n)\ntokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-0.5B-Instruct\")\n\n# Fix: TRL GRPOTrainer expects warnings_issued but newer transformers removed it.\nif not hasattr(model, \"warnings_issued\"):\n    model.warnings_issued = {}\n\n# GRPO requires left padding so completions align across the batch\ntokenizer.padding_side = \"left\"\nif tokenizer.pad_token_id is None:\n    tokenizer.pad_token = tokenizer.eos_token\n    tokenizer.pad_token_id = tokenizer.eos_token_id\n\n# LoRA config \u2014 passed to GRPOTrainer, not applied here\nlora_config = LoraConfig(\n    r=16,\n    lora_alpha=16,\n    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n                     \"gate_proj\", \"up_proj\", \"down_proj\"],\n    lora_dropout=0.0,\n    bias=\"none\",\n    task_type=\"CAUSAL_LM\",\n)\n\nprint(f\"Model loaded in {_compute_dtype} (no quantization)\")\nprint(f\"Precision: {'bf16' if _use_bf16 else 'fp16'}\")\nprint(f\"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M params\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8iigxyoxiks",
+   "source": "### Pre-Training Baseline\n\nEvaluate the **untrained** model against the live environment before any GRPO training. This gives us a baseline to measure improvement.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "6r2zhgg94fk",
+   "source": [
+    "# --- Pre-training baseline evaluation against live environment ---\n",
+    "from openenv.core.env_server.mcp_types import CallToolAction\n",
+    "\n",
+    "\n",
+    "def generate_action(model, tokenizer, prompt_text):\n",
+    "    \"\"\"Sample one completion for prompt_text and return only the newly generated text.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Causal LM used for generation.\n",
+    "        tokenizer: Tokenizer providing the chat template and pad token.\n",
+    "        prompt_text: Prompt sent as a single user turn.\n",
+    "\n",
+    "    Returns:\n",
+    "        Decoded completion with the prompt tokens and special tokens removed.\n",
+    "    \"\"\"\n",
+    "    msgs = [{\"role\": \"user\", \"content\": prompt_text}]\n",
+    "    input_ids = tokenizer.apply_chat_template(msgs, return_tensors=\"pt\", add_generation_prompt=True)\n",
+    "    # apply_chat_template may hand back a mapping instead of a bare tensor.\n",
+    "    if not isinstance(input_ids, torch.Tensor):\n",
+    "        input_ids = input_ids[\"input_ids\"]\n",
+    "    # Follow the model's device instead of hard-coding \"cuda\": the model is loaded\n",
+    "    # with device_map=\"auto\" and the notebook keeps running when no GPU is present.\n",
+    "    input_ids = input_ids.to(model.device)\n",
+    "    prompt_len = input_ids.shape[1]\n",
+    "    with torch.no_grad():\n",
+    "        output = model.generate(input_ids=input_ids, max_new_tokens=200, temperature=0.7,\n",
+    "                                pad_token_id=tokenizer.pad_token_id, do_sample=True)\n",
+    "    # Slice off the prompt so only the generated continuation is decoded.\n",
+    "    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)\n",
+    "\n",
+    "\n",
+    "def _extract_tool_result(obs):\n",
+    "    \"\"\"Unwrap the payload of a tool observation into a Python object.\n",
+    "\n",
+    "    Looks at obs.result, peels off a .data attribute and/or a data key if present,\n",
+    "    and JSON-decodes string payloads. Anything that cannot be unwrapped yields {}.\n",
+    "    \"\"\"\n",
+    "    payload = getattr(obs, \"result\", None)\n",
+    "    # Attribute-style wrapper: use .data when the result object exposes it.\n",
+    "    payload = getattr(payload, \"data\", payload)\n",
+    "    # Dict-style wrapper: use the value stored under the data key.\n",
+    "    if isinstance(payload, dict) and \"data\" in payload:\n",
+    "        payload = payload[\"data\"]\n",
+    "\n",
+    "    if isinstance(payload, dict):\n",
+    "        return payload\n",
+    "    if not isinstance(payload, str):\n",
+    "        return {}\n",
+    "    try:\n",
+    "        return json.loads(payload)\n",
+    "    except (json.JSONDecodeError, TypeError):\n",
+    "        return {}\n",
+    "\n",
+    "\n",
+    "def evaluate_on_live_env(model, tokenizer, base_url, seed, max_steps=20):\n",
+    "    \"\"\"Run the model against one seeded episode of the live environment.\n",
+    "\n",
+    "    Each step reads status and inbox, asks the model for an action using the\n",
+    "    server-built prompt, parses a respond_to_message(...) call out of the\n",
+    "    completion and submits it with env.step().\n",
+    "\n",
+    "    Args:\n",
+    "        model: Causal LM passed to generate_action().\n",
+    "        tokenizer: Tokenizer passed to generate_action().\n",
+    "        base_url: URL of the deployed environment.\n",
+    "        seed: Seed forwarded to env.reset().\n",
+    "        max_steps: Upper bound on loop iterations (including skipped ones).\n",
+    "\n",
+    "    Returns:\n",
+    "        Dict with seed, total_reward, the list of accepted actions and the\n",
+    "        final get_status payload.\n",
+    "    \"\"\"\n",
+    "    with MCPToolClient(base_url=base_url, connect_timeout_s=60.0, message_timeout_s=120.0).sync() as env:\n",
+    "        env.reset(seed=seed)\n",
+    "        total_reward = 0.0\n",
+    "        actions_taken = []\n",
+    "\n",
+    "        for step_i in range(max_steps):\n",
+    "            status = json.loads(env.call_tool(\"get_status\"))\n",
+    "            if status.get(\"done\"):\n",
+    "                break\n",
+    "\n",
+    "            inbox = json.loads(env.call_tool(\"get_inbox\"))\n",
+    "            current_hour = status[\"current_hour\"]\n",
+    "            unhandled = [m for m in inbox if not m.get(\"handled\", False)]\n",
+    "            if not unhandled:\n",
+    "                # Nothing to act on yet: let simulated time pass.\n",
+    "                env.call_tool(\"advance_time\", hours=2.0)\n",
+    "                continue\n",
+    "\n",
+    "            # Use server's get_prompt tool\n",
+    "            prompt = env.call_tool(\"get_prompt\")\n",
+    "            completion = generate_action(model, tokenizer, prompt)\n",
+    "\n",
+    "            # Ids may carry a letter suffix (msg_078e, msg_081r). The response text must\n",
+    "            # end at the SAME quote character that opened it (backreference \\2), so an\n",
+    "            # apostrophe inside a double-quoted response does not truncate what is sent.\n",
+    "            match = re.search(\n",
+    "                r'respond_to_message\\s*\\(\\s*[\"\\']?(msg_\\d+[a-z]*)[\"\\']?\\s*,\\s*([\"\\'])(.+?)\\2',\n",
+    "                completion, re.DOTALL,\n",
+    "            )\n",
+    "            if not match:\n",
+    "                id_match = re.search(r'(msg_\\d+[a-z]*)', completion)\n",
+    "                if id_match:\n",
+    "                    msg_id, response_text = id_match.group(1), completion[:200]\n",
+    "                else:\n",
+    "                    # Unparseable completion: skip ahead one hour and try again.\n",
+    "                    env.call_tool(\"advance_time\", hours=1.0)\n",
+    "                    continue\n",
+    "            else:\n",
+    "                msg_id, response_text = match.group(1), match.group(3)\n",
+    "\n",
+    "            action = CallToolAction(\n",
+    "                tool_name=\"respond_to_message\",\n",
+    "                arguments={\"message_id\": msg_id, \"response\": response_text},\n",
+    "            )\n",
+    "            step_result = env.step(action)\n",
+    "            obs = step_result.observation\n",
+    "            reward = obs.reward if obs.reward is not None else 0.0\n",
+    "            done = obs.done\n",
+    "\n",
+    "            # Rejected actions are not counted; time is advanced instead.\n",
+    "            result_data = _extract_tool_result(obs)\n",
+    "            if \"error\" in result_data:\n",
+    "                env.call_tool(\"advance_time\", hours=1.0)\n",
+    "                continue\n",
+    "\n",
+    "            total_reward += reward\n",
+    "            target_msg = next((m for m in inbox if m[\"id\"] == msg_id), None)\n",
+    "            urgency = target_msg[\"urgency\"] if target_msg else \"?\"\n",
+    "            actions_taken.append({\"step\": step_i, \"hour\": current_hour, \"msg_id\": msg_id, \"urgency\": urgency, \"reward\": reward})\n",
+    "            print(f\"  Step {step_i:2d} | Hour {current_hour:5.1f} | {msg_id} ({urgency:8s}) | Reward: {reward:+.1f} | Total: {total_reward:.1f}\")\n",
+    "\n",
+    "            if done:\n",
+    "                break\n",
+    "\n",
+    "        final_status = json.loads(env.call_tool(\"get_status\"))\n",
+    "\n",
+    "    return {\"seed\": seed, \"total_reward\": total_reward, \"actions\": actions_taken, \"final_status\": final_status}\n",
+    "\n",
+    "\n",
+    "# Run baseline on 3 seeds\n",
+    "print(\"=== PRE-TRAINING BASELINE (untrained model) ===\\n\")\n",
+    "baseline_results = []\n",
+    "for seed in [99, 42, 7]:\n",
+    "    print(f\"--- Seed {seed} ---\")\n",
+    "    res = evaluate_on_live_env(model, tokenizer, BASE_URL, seed=seed)\n",
+    "    baseline_results.append(res)\n",
+    "    print(f\"  Total: {res['total_reward']:.1f} | Actions: {len(res['actions'])}\\n\")\n",
+    "\n",
+    "baseline_avg = sum(r[\"total_reward\"] for r in baseline_results) / len(baseline_results)\n",
+    "print(f\"Baseline average reward: {baseline_avg:.1f}\")\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "97iryd40qzt",
+   "source": "from datasets import Dataset\n\nMAX_PROMPT_LENGTH = 1024\n\ntrain_data = []\nfor p in prompts:\n    msgs = [{\"role\": \"user\", \"content\": p[\"prompt\"]}]\n    tok = tokenizer.apply_chat_template(msgs, truncation=True, max_length=1024, return_tensors=\"pt\", add_generation_prompt=True)\n    try:\n        ids = tok[\"input_ids\"]\n    except (TypeError, KeyError):\n        ids = tok\n    n_tokens = ids.shape[1] if ids.dim() > 1 else ids.shape[0]\n    if n_tokens > MAX_PROMPT_LENGTH:\n        continue\n    train_data.append({\n        \"prompt\": msgs,\n        \"_prompt_key\": p[\"prompt\"][:200],\n    })\n\nrandom.seed(42)\nrandom.shuffle(train_data)\n\ndataset = Dataset.from_list(train_data)\nprint(f\"Training dataset: {len(dataset)} prompts (after dropping prompts > {MAX_PROMPT_LENGTH} tokens)\")\nprint(f\"Sample prompt length: {len(train_data[0]['prompt'][0]['content'])} chars\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2vcnc1dr4d1",
+   "source": "## GRPO Training Loop\n\nThe reward function scores each completion by:\n1. Parsing which message the model chose to handle\n2. Checking urgency, deadline timing, drift flags\n3. Penalizing bad choices (low-urgency when critical exists, stale info)",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "9liiw6eifdo",
+   "source": "import gc\ngc.collect()\ntorch.cuda.empty_cache()",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "arbp96a9wi",
+   "source": "from trl import GRPOConfig, GRPOTrainer\n\n# Build lookup from prompt text -> prompt metadata for reward scoring\n# Use first 200 chars as key (reliable \u2014 TRL may not pass custom dataset columns)\nprompt_lookup = {}\nfor p in prompts:\n    key = p[\"prompt\"][:200]\n    prompt_lookup[key] = p\n\n\ndef reward_fn(prompts, completions, **kwargs):\n    \"\"\"GRPO reward function. Scores each completion against its inbox state.\"\"\"\n    rewards = []\n    for prompt_msgs, completion in zip(prompts, completions):\n        # Extract prompt text to look up metadata\n        if isinstance(prompt_msgs, list):\n            prompt_text = prompt_msgs[-1][\"content\"] if prompt_msgs else \"\"\n        else:\n            prompt_text = str(prompt_msgs)\n\n        key = prompt_text[:200]\n        prompt_data = prompt_lookup.get(key)\n\n        if prompt_data is None:\n            rewards.append(0.0)\n            continue\n\n        if isinstance(completion, list):\n            if completion and isinstance(completion[0], (int, float)):\n                comp_text = tokenizer.decode(completion, skip_special_tokens=True)\n            else:\n                comp_text = completion[-1].get(\"content\", \"\") if completion else \"\"\n        else:\n            comp_text = str(completion)\n\n        score = score_action(comp_text, prompt_data)\n        rewards.append(score)\n\n    return rewards\n\n\ntraining_args = GRPOConfig(\n    output_dir=\"crisisinbox-grpo-output\",\n    num_train_epochs=3,\n    per_device_train_batch_size=2,\n    gradient_accumulation_steps=2,\n    learning_rate=1e-5,\n    max_completion_length=256,\n    max_prompt_length=1024,\n    num_generations=4,\n    logging_steps=1,\n    save_steps=100,\n    bf16=_use_bf16,\n    fp16=not _use_bf16,\n    sync_ref_model=True,\n)\n\n# Let GRPOTrainer handle PEFT wrapping (avoids dtype mismatches from manual setup)\ntrainer = GRPOTrainer(\n    model=model,\n    processing_class=tokenizer,\n    
reward_funcs=reward_fn,\n    args=training_args,\n    train_dataset=dataset,\n    peft_config=lora_config,\n)\n\n# After trainer init, update model ref to the PEFT-wrapped version\nmodel = trainer.model\n\nprint(f\"Trainer configured \u2014 {len(prompt_lookup)} unique prompt keys\")\nprint(f\"Precision: {'bf16' if _use_bf16 else 'fp16'}\")\nprint(f\"Training for {training_args.num_train_epochs} epochs\")\nprint(\"Ready to train\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "ni6dh0hkegm",
+   "source": "# Train!\ntrainer.train()\nprint(\"Training complete\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "gv86tb0swwl",
+   "source": "## Evaluate: Before/After + Training Curve\n\nRe-run the trained model against the live environment on the same seeds as the baseline, compare total rewards, and plot the training reward curve.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "it6zcy49jp9",
+   "source": [
+    "import matplotlib.pyplot as plt\nimport pandas as pd\n\nmodel.eval()\n\n# --- Post-training evaluation on same seeds as baseline ---\nprint(\"=== POST-TRAINING EVALUATION ===\\n\")\ntrained_results = []\nfor seed in [99, 42, 7]:\n    print(f\"--- Seed {seed} ---\")\n    res = evaluate_on_live_env(model, tokenizer, BASE_URL, seed=seed)\n    trained_results.append(res)\n    print(f\"  Total: {res['total_reward']:.1f} | Actions: {len(res['actions'])}\\n\")\n\ntrained_avg = sum(r[\"total_reward\"] for r in trained_results) / len(trained_results)\nprint(f\"Trained average reward: {trained_avg:.1f}\")\nprint(f\"Baseline average reward: {baseline_avg:.1f}\")\nimprovement = ((trained_avg - baseline_avg) / max(baseline_avg, 0.1)) * 100\nprint(f\"Improvement: {improvement:+.1f}%\")\n\n# --- Plot 1: Before/After Comparison Bar Chart ---\nfig, axes = plt.subplots(1, 3, figsize=(16, 5))\n\n# Bar chart: per-seed comparison\nseeds = [99, 42, 7]\nbaseline_scores = [r[\"total_reward\"] for r in baseline_results]\ntrained_scores = [r[\"total_reward\"] for r in trained_results]\n\nx = range(len(seeds))\nwidth = 0.35\nbars1 = axes[0].bar([i - width/2 for i in x], baseline_scores, width, label=\"Before Training\", color=\"#d62728\", alpha=0.8)\nbars2 = axes[0].bar([i + width/2 for i in x], trained_scores, width, label=\"After Training\", color=\"#2ca02c\", alpha=0.8)\naxes[0].set_xlabel(\"Episode Seed\")\naxes[0].set_ylabel(\"Total Reward\")\naxes[0].set_title(\"Before vs After GRPO Training\")\naxes[0].set_xticks(list(x))\naxes[0].set_xticklabels([f\"Seed {s}\" for s in seeds])\naxes[0].legend()\naxes[0].grid(axis=\"y\", linestyle=\"--\", alpha=0.6)\n# Add value labels on bars\nfor bar in bars1:\n    axes[0].text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.5,\n                 f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=9, color=\"#d62728\")\nfor bar in bars2:\n    axes[0].text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.5,\n               
  f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=9, color=\"#2ca02c\")\n\n# Bar chart: average comparison\naxes[1].bar([\"Untrained\\n(Baseline)\", \"GRPO\\nTrained\"], [baseline_avg, trained_avg],\n            color=[\"#d62728\", \"#2ca02c\"], alpha=0.8, width=0.5)\naxes[1].set_ylabel(\"Average Reward\")\naxes[1].set_title(f\"Average Reward ({improvement:+.1f}% improvement)\")\naxes[1].grid(axis=\"y\", linestyle=\"--\", alpha=0.6)\naxes[1].text(0, baseline_avg + 0.5, f\"{baseline_avg:.1f}\", ha=\"center\", va=\"bottom\", fontweight=\"bold\")\naxes[1].text(1, trained_avg + 0.5, f\"{trained_avg:.1f}\", ha=\"center\", va=\"bottom\", fontweight=\"bold\")\n\n# Plot 2: Training reward curve\nhistory = pd.DataFrame(trainer.state.log_history)\nif \"rewards/reward_fn/mean\" in history.columns:\n    reward_steps = history.dropna(subset=[\"rewards/reward_fn/mean\"])\n    axes[2].plot(reward_steps[\"step\"], reward_steps[\"rewards/reward_fn/mean\"],\n                 label=\"Mean Reward\", color=\"#2ca02c\", linewidth=2)\n    axes[2].fill_between(reward_steps[\"step\"],\n                         reward_steps[\"rewards/reward_fn/mean\"] - reward_steps[\"rewards/reward_fn/std\"],\n                         reward_steps[\"rewards/reward_fn/mean\"] + reward_steps[\"rewards/reward_fn/std\"],\n                         alpha=0.2, color=\"#2ca02c\")\n    # Add baseline reference line\n    axes[2].axhline(y=baseline_avg, color=\"#d62728\", linestyle=\"--\", linewidth=1.5, label=f\"Baseline ({baseline_avg:.1f})\")\n    axes[2].set_xlabel(\"Training Steps\")\n    axes[2].set_ylabel(\"Reward\")\n    axes[2].set_title(\"GRPO Training Curve\")\n    axes[2].legend()\n    axes[2].grid(True, linestyle=\"--\", alpha=0.6)\nelse:\n    axes[2].text(0.5, 0.5, \"No reward history\\n(run trainer.train() first)\",\n                 ha=\"center\", va=\"center\", transform=axes[2].transAxes, fontsize=12)\n    axes[2].set_title(\"GRPO Training 
Curve\")\n\nplt.tight_layout()\nplt.savefig(\"crisisinbox_grpo_results.png\", dpi=150, bbox_inches=\"tight\")\nplt.show()\nprint(\"Results saved to crisisinbox_grpo_results.png\")"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "wob0szg0fv",
+   "source": "## Evaluate Against Live Environment\n\nRun the trained model in a closed loop against the actual CrisisInbox environment to get real server-side rewards.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "a5ih8db6zz",
+   "source": [
+    "# Additional live eval on a fresh seed (not used in baseline comparison)\nprint(\"=== Extra Live Evaluation (seed=123) ===\\n\")\nextra_result = evaluate_on_live_env(model, tokenizer, BASE_URL, seed=123, max_steps=25)\nprint(f\"\\nTotal reward: {extra_result['total_reward']:.1f}\")\nprint(f\"Actions taken: {len(extra_result['actions'])}\")\nprint(f\"Messages handled: {extra_result['final_status']['messages_handled']}\")"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "g76r4wc8jx",
+   "source": "# Save the trained model\nmodel.save_pretrained(\"crisisinbox-grpo-trained\")\ntokenizer.save_pretrained(\"crisisinbox-grpo-trained\")\nprint(\"Model saved to crisisinbox-grpo-trained/\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

notebooks/crisisinbox_grpo_connected.ipynb CHANGED Viewed

@@ -120,7 +120,9 @@
   {
    "cell_type": "code",
    "id": "nmmfb62s9ph",
-   "source": "# Collect multiple episodes from the live environment\nNUM_EPISODES = 10\nSEEDS = list(range(NUM_EPISODES))\n\nepisodes = []\nfor seed in SEEDS:\n    print(f\"Collecting episode {seed + 1}/{NUM_EPISODES} (seed={seed})...\", end=\" \")\n    for attempt in range(3):\n        try:\n            ep = collect_episode(BASE_URL, seed=seed)\n            episodes.append(ep)\n            print(f\"{len(ep['decision_points'])} decision points\")\n            break\n        except Exception as e:\n            if attempt < 2:\n                print(f\"retry {attempt + 1}...\", end=\" \")\n                _time.sleep(5)\n            else:\n                print(f\"FAILED ({e}), skipping\")\n\n# Flatten to training prompts\nprompts = []\nfor ep in episodes:\n    for dp in ep[\"decision_points\"]:\n        prompts.append({\n            \"prompt\": dp[\"prompt\"],\n            \"hour\": dp[\"hour\"],\n            \"visible_count\": dp[\"visible_count\"],\n            \"episode_id\": ep[\"episode_id\"],\n            \"seed\": ep[\"seed\"],\n            \"drift_events\": ep[\"drift_events\"],\n            \"superseded\": ep.get(\"superseded_messages\", {}),\n            \"messages\": dp[\"visible_messages\"],\n        })\n\nprint(f\"\\nCollected {len(episodes)} episodes -> {len(prompts)} training prompts\")\nprint(f\"Average {len(prompts)/len(episodes):.1f} decision points per episode\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []

   {
    "cell_type": "code",
    "id": "nmmfb62s9ph",
+   "source": [
+    "# Collect multiple episodes from the live environment\nNUM_EPISODES = 10\nSEEDS = list(range(NUM_EPISODES))\n\nepisodes = []\nfor seed in SEEDS:\n    print(f\"Collecting episode {seed + 1}/{NUM_EPISODES} (seed={seed})...\", end=\" \")\n    for attempt in range(3):\n        try:\n            ep = collect_episode(BASE_URL, seed=seed)\n            episodes.append(ep)\n            print(f\"{len(ep['decision_points'])} decision points\")\n            break\n        except Exception as e:\n            if attempt < 2:\n                print(f\"retry {attempt + 1}...\", end=\" \")\n                _time.sleep(5)\n            else:\n                print(f\"FAILED ({e}), skipping\")\n\n# Flatten to training prompts\nprompts = []\nfor ep in episodes:\n    for dp in ep[\"decision_points\"]:\n        prompts.append({\n            \"prompt\": dp[\"prompt\"],\n            \"hour\": dp[\"hour\"],\n            \"visible_count\": dp[\"visible_count\"],\n            \"episode_id\": ep[\"episode_id\"],\n            \"seed\": ep[\"seed\"],\n            \"drift_events\": ep[\"drift_events\"],\n            \"superseded\": ep.get(\"superseded_messages\", {}),\n            \"messages\": dp[\"messages\"],\n        })\n\nprint(f\"\\nCollected {len(episodes)} episodes -> {len(prompts)} training prompts\")\nprint(f\"Average {len(prompts)/len(episodes):.1f} decision points per episode\")"
+   ],
    "metadata": {},
    "execution_count": null,
    "outputs": []

server/crisis_inbox_environment.py CHANGED Viewed

@@ -73,6 +73,9 @@ class CrisisInboxEnvironment(MCPEnvironment):
         self._drift_events: list[DriftEvent] = []
         self._fired_drifts: set[str] = set()
         self._superseded: dict[str, str] = {}  # old_msg_id -> new_msg_id
         self._rng = random.Random()
         @mcp.tool
@@ -99,6 +102,7 @@ class CrisisInboxEnvironment(MCPEnvironment):
                     "read": msg.id in self._read_msgs,
                     "drift_flag": msg.drift_flag,
                     "superseded": is_superseded,
                 })
             return json.dumps(summaries, indent=2)
@@ -170,6 +174,29 @@ class CrisisInboxEnvironment(MCPEnvironment):
             self._handled[message_id] = response
             self._score += reward
             # Advance time
             self._advance_clock(0.25)
@@ -243,11 +270,12 @@ class CrisisInboxEnvironment(MCPEnvironment):
                 status = "HANDLED" if msg.id in self._handled else "UNHANDLED"
                 drift = " [POLICY CHANGE]" if msg.drift_flag else ""
                 superseded = " [SUPERSEDED]" if msg.id in self._superseded else ""
                 deadline_str = f", deadline: hour {msg.deadline_hours}" if msg.deadline_hours else ""
                 lines.append(
                     f"[{status}] {msg.id} | {msg.urgency.value.upper()} | "
                     f"From: {msg.sender} via {msg.channel.value} | "
-                    f"\"{msg.subject}\"{deadline_str}{drift}{superseded}"
                 )
             lines.extend([
                 "=" * 60,
@@ -290,6 +318,7 @@ class CrisisInboxEnvironment(MCPEnvironment):
         self._current_hour = min(48.0, self._current_hour + hours)
         self._deliver_messages()
         self._fire_drift_events()
     def _deliver_messages(self):
         """Make messages visible if their timestamp has been reached."""
@@ -298,6 +327,22 @@ class CrisisInboxEnvironment(MCPEnvironment):
                 if not any(m.id == msg.id for m in self._visible_messages):
                     self._visible_messages.append(msg)
     def _fire_drift_events(self):
         """Fire any drift events whose trigger time has been reached."""
         for drift in self._drift_events:
@@ -354,6 +399,37 @@ class CrisisInboxEnvironment(MCPEnvironment):
                 )
             self._all_messages.append(m)
         self._visible_messages = []
         self._handled = {}
         self._read_msgs = set()

         self._drift_events: list[DriftEvent] = []
         self._fired_drifts: set[str] = set()
         self._superseded: dict[str, str] = {}  # old_msg_id -> new_msg_id
+        self._escalation_map: dict[str, Message] = {}  # parent_id -> escalation msg
+        self._reply_map: dict[str, Message] = {}  # reply_msg_id -> reply msg (looked up via parent.reply_trigger)
+        self._conflict_pairs: dict[str, str] = {}  # msg_id -> conflicting msg_id
         self._rng = random.Random()
         @mcp.tool
                     "read": msg.id in self._read_msgs,
                     "drift_flag": msg.drift_flag,
                     "superseded": is_superseded,
+                    "conflicts_with": msg.conflicts_with,
                 })
             return json.dumps(summaries, indent=2)
             self._handled[message_id] = response
             self._score += reward
+            # Conflict resolution: if this message conflicts with another,
+            # the conflicting message can no longer be handled (time conflict)
+            if msg.conflicts_with and msg.conflicts_with not in self._handled:
+                self._handled[msg.conflicts_with] = "[AUTO-EXPIRED: time conflict]"
+            # Multi-turn: if handling this message triggers a reply, inject it
+            if msg.reply_trigger and msg.reply_trigger in self._reply_map:
+                reply_msg = self._reply_map[msg.reply_trigger]
+                reply_msg.timestamp_hours = self._current_hour + 0.5
+                if reply_msg.deadline_hours is not None and reply_msg.deadline_hours == 0.0:
+                    # deadline_hours == 0.0 is treated as "unset": replace it with 6 hours from now
+                    reply_msg.deadline_hours = self._current_hour + 6.0
+                if not any(m.id == reply_msg.id for m in self._all_messages):
+                    self._all_messages.append(reply_msg)
+            # Escalation: if this message had an escalation, cancel it
+            # (handled in time, no need to escalate)
+            if message_id in self._escalation_map:
+                esc = self._escalation_map[message_id]
+                # Remove from all_messages so it never appears
+                self._all_messages = [m for m in self._all_messages if m.id != esc.id]
+                self._visible_messages = [m for m in self._visible_messages if m.id != esc.id]
             # Advance time
             self._advance_clock(0.25)
                 status = "HANDLED" if msg.id in self._handled else "UNHANDLED"
                 drift = " [POLICY CHANGE]" if msg.drift_flag else ""
                 superseded = " [SUPERSEDED]" if msg.id in self._superseded else ""
+                conflict = f" [CONFLICTS WITH {msg.conflicts_with}]" if msg.conflicts_with else ""
                 deadline_str = f", deadline: hour {msg.deadline_hours}" if msg.deadline_hours else ""
                 lines.append(
                     f"[{status}] {msg.id} | {msg.urgency.value.upper()} | "
                     f"From: {msg.sender} via {msg.channel.value} | "
+                    f"\"{msg.subject}\"{deadline_str}{drift}{superseded}{conflict}"
                 )
             lines.extend([
                 "=" * 60,
         self._current_hour = min(48.0, self._current_hour + hours)
         self._deliver_messages()
         self._fire_drift_events()
+        self._fire_escalations()
     def _deliver_messages(self):
         """Make messages visible if their timestamp has been reached."""
                 if not any(m.id == msg.id for m in self._visible_messages):
                     self._visible_messages.append(msg)
+    def _fire_escalations(self):
+        """Inject escalation messages for unhandled messages past their deadline + delay.
+
+        For each (parent_id, escalation message) pair in ``self._escalation_map``
+        whose parent is not in ``self._handled``, the escalation fires once
+        ``self._current_hour`` reaches ``parent.deadline_hours`` plus
+        ``parent.escalation_delay_hours`` (a falsy delay counts as 0.0). Firing
+        stamps the escalation with that trigger hour and appends it to
+        ``self._all_messages``; ``self._visible_messages`` is not modified here.
+        """
+        # Iterate over a snapshot of the map's items.
+        for parent_id, esc_msg in list(self._escalation_map.items()):
+            if parent_id in self._handled:
+                continue  # Parent has an entry in _handled, so no escalation
+            # Find the parent message to check deadline
+            parent = next((m for m in self._all_messages if m.id == parent_id), None)
+            # Parents missing from the pool or without a deadline never escalate.
+            if parent is None or parent.deadline_hours is None:
+                continue
+            trigger_hour = parent.deadline_hours + (parent.escalation_delay_hours or 0.0)
+            if self._current_hour >= trigger_hour:
+                # Inject escalation message if not already present
+                # (the id check keeps repeated calls from adding duplicates).
+                # NOTE(review): this only adds to _all_messages; presumably a later
+                # _deliver_messages() pass makes it visible — confirm the intended timing.
+                if not any(m.id == esc_msg.id for m in self._all_messages):
+                    esc_msg.timestamp_hours = trigger_hour
+                    self._all_messages.append(esc_msg)
     def _fire_drift_events(self):
         """Fire any drift events whose trigger time has been reached."""
         for drift in self._drift_events:
                 )
             self._all_messages.append(m)
+        # Build escalation, reply, and conflict maps from loaded messages.
+        # Escalation and reply messages start outside the pool — they're injected
+        # dynamically when triggered.
+        self._escalation_map = {}
+        self._reply_map = {}
+        self._conflict_pairs = {}
+        # Collect IDs of escalation/reply targets so we can remove them from the pool
+        deferred_ids: set[str] = set()
+        for m in self._all_messages:
+            if m.escalation_trigger:
+                deferred_ids.add(m.escalation_trigger)
+            if m.reply_trigger:
+                deferred_ids.add(m.reply_trigger)
+            if m.conflicts_with:
+                self._conflict_pairs[m.id] = m.conflicts_with
+        # Pull deferred messages out of the pool into their maps
+        kept: list[Message] = []
+        for m in self._all_messages:
+            if m.id in deferred_ids:
+                # Find which parent references this
+                for parent in self._all_messages:
+                    if parent.escalation_trigger == m.id:
+                        self._escalation_map[parent.id] = m
+                    if parent.reply_trigger == m.id:
+                        self._reply_map[m.id] = m
+            else:
+                kept.append(m)
+        self._all_messages = kept
         self._visible_messages = []
         self._handled = {}
         self._read_msgs = set()

server/rewards.py CHANGED Viewed

@@ -96,6 +96,10 @@ def calculate_reward(
     if msg.id in superseded:
         reward *= 0.5
     # Priority penalty: choosing low/medium when unhandled critical messages exist
     if visible_messages and handled is not None:
         has_unhandled_critical = any(

     if msg.id in superseded:
         reward *= 0.5
+    # Conflict-resolution bonus: handling a message that forces a trade-off
+    if msg.conflicts_with:
+        reward *= 1.25
     # Priority penalty: choosing low/medium when unhandled critical messages exist
     if visible_messages and handled is not None:
         has_unhandled_critical = any(