Spaces:

Uddiii
/

Multi-Agentic

Sleeping

Uddiii committed on Apr 25

Commit

2df5c63

1 Parent(s): 71a0a91

chore(kaggle): rebuild notebook v3 + clean dev-scratch files

* New `kaggle/build_notebook.py` - single source of truth that regenerates
the Kaggle notebook + KAGGLE_QUICKSTART.md from scratch (run once whenever
the layout drifts).
* New `kaggle/KAGGLE_QUICKSTART.md` - concise step-by-step run order with
troubleshooting table for every dependency-hell symptom we hit.
* `kaggle/train_ermap_grpo_kaggle.ipynb` rebuilt as a clean 20-cell layout:
- new constant per-phase reward thresholds (P1=+1.2 / P2=+1.1 / P3=+1.0)
- idempotent REPAIR cell (pins torch 2.10 cu128 + bnb + unsloth/zoo + trl,
verifies in a subprocess so the kernel never gets poisoned mid-install)
- pre-flight Groq routing + PING smoke test using router._models / _clients
- explicit dry-run + HF-push hook + per-phase dashboards + final push.
* `train_grpo.py`: add optional `phase_episode_budgets` (fixed-budget
curriculum mode) alongside the existing reward-threshold early-stop.
Fully backward compatible (default behaviour unchanged); CLI flags
--phase{1,2,3}-budget.
* Remove dev-scratch files: `_smoke_dead_keys.py`, `ER_MAP/_verify.py`,
`ER_MAP/_replot.py`, `kaggle/KAGGLE.md` (replaced by KAGGLE_QUICKSTART).

Made-with: Cursor

Files changed (8) hide show

ER_MAP/_replot.py +0 -11
ER_MAP/_verify.py +0 -15
ER_MAP/training/train_grpo.py +94 -1
_smoke_dead_keys.py +0 -333
kaggle/KAGGLE.md +0 -265
kaggle/KAGGLE_QUICKSTART.md +104 -0
kaggle/build_notebook.py +880 -0
kaggle/train_ermap_grpo_kaggle.ipynb +478 -271

ER_MAP/_replot.py DELETED Viewed

@@ -1,11 +0,0 @@
-"""Quick script to regenerate reward curve from saved eval_results.json"""
-import json
-import sys
-sys.path.insert(0, ".")
-from ER_MAP.evaluate import plot_reward_curve
-with open("d:/Meta_Finals/ER_MAP/eval_results.json") as f:
-    results = json.load(f)
-plot_reward_curve(results, "d:/Meta_Finals/ER_MAP/reward_curve.png")
-print("Done!")

ER_MAP/_verify.py DELETED Viewed

@@ -1,15 +0,0 @@
-from ER_MAP.envs.randomizer import DISEASE_POOL, DIFFICULTY_TIERS, generate_ground_truth
-print(f"=== {len(DISEASE_POOL)} DISEASES ===")
-for d in DISEASE_POOL:
-    print(f"  {d['true_disease']}")
-print()
-print("=== DIFFICULTY TIERS ===")
-for tier in ["easy", "medium", "hard"]:
-    gt = generate_ground_truth(difficulty=tier)
-    p = gt["patient"]
-    print(f"  {tier.upper():8s} | compliance: {p['compliance']:20s} | comm: {p['communication']:20s} | {gt['disease']['true_disease']}")
-combos = 3 * 4 * 4 * 4 * 4 * 3 * 3 * 3 * 3 * 15
-print(f"\nTotal unique scenario combinations: {combos:,}")

ER_MAP/training/train_grpo.py CHANGED Viewed

@@ -686,9 +686,47 @@ def train(
     phase_min_win_rate: float = 0.20,
     convergence_window: int = 3,
     early_stop: bool = True,
 ):
     if phase_reward_targets is None:
         phase_reward_targets = {1: 1.5, 2: 1.2, 3: 1.0}
     """
     Main GRPO training loop with curriculum scheduling.
@@ -758,7 +796,17 @@ def train(
     logger.info(f"\nStarting GRPO training for up to {num_episodes} episodes "
                 f"(={num_episodes // group_size} GRPO updates)")
-    if early_stop:
         logger.info(
             f"  Early-stop ON: per-phase reward thresholds (sustained for "
             f"{convergence_window} groups, win-rate >= {phase_min_win_rate:.0%}):"
@@ -955,6 +1003,32 @@ def train(
             f"Phase Episodes: {s['phase_episodes']}"
         )
         # --- Per-phase early-stop / promotion check ------------------------
         # Maintain a buffer of the last `convergence_window` GRPO groups
         # with their (phase, rolling_avg, rolling_win). When ALL N entries
@@ -1099,8 +1173,26 @@ if __name__ == "__main__":
     parser.add_argument("--no-early-stop", action="store_true",
                         help="Disable early-stop (always run all configured episodes)")
     args = parser.parse_args()
     train(
         num_episodes=args.episodes,
         group_size=args.group_size,
@@ -1119,4 +1211,5 @@ if __name__ == "__main__":
         phase_min_win_rate=args.phase_min_win_rate,
         convergence_window=args.convergence_window,
         early_stop=not args.no_early_stop,
     )

     phase_min_win_rate: float = 0.20,
     convergence_window: int = 3,
     early_stop: bool = True,
+    # ---------------- Fixed-budget curriculum (alternative mode) -----------
+    # When set, training advances phases at FIXED episode counts instead of
+    # via the reward-target early-stop. Useful when you want a clean
+    # reward-growth curve over a known wall-clock budget. Example:
+    #   phase_episode_budgets = {1: 20, 2: 30, 3: 50}  # 100 episodes total
+    # When this is provided, `early_stop` is forced to False (the reward
+    # thresholds become observational, logged for plots only) and
+    # `num_episodes` is auto-set to sum(phase_episode_budgets.values()) if
+    # the caller passed a smaller / inconsistent value.
+    phase_episode_budgets: Optional[Dict[int, int]] = None,
 ):
     if phase_reward_targets is None:
         phase_reward_targets = {1: 1.5, 2: 1.2, 3: 1.0}
+    # Fixed-budget mode overrides early-stop and aligns num_episodes.
+    fixed_budget_mode = phase_episode_budgets is not None and len(phase_episode_budgets) > 0
+    if fixed_budget_mode:
+        # Sanity: must have all 3 phases keyed, all positive ints
+        missing = [p for p in (1, 2, 3) if p not in phase_episode_budgets]
+        if missing:
+            raise ValueError(
+                f"phase_episode_budgets must include all phases (1,2,3); missing: {missing}"
+            )
+        for _p, _n in phase_episode_budgets.items():
+            if not isinstance(_n, int) or _n <= 0:
+                raise ValueError(
+                    f"phase_episode_budgets[{_p}] must be a positive int, got {_n!r}"
+                )
+        budget_sum = sum(phase_episode_budgets.values())
+        if num_episodes != budget_sum:
+            logger.info(
+                f"[Fixed-budget] num_episodes ({num_episodes}) overridden to "
+                f"sum(phase_episode_budgets) = {budget_sum}"
+            )
+            num_episodes = budget_sum
+        if early_stop:
+            logger.info(
+                "[Fixed-budget] early_stop=True is incompatible with fixed budgets; "
+                "disabling early_stop. Reward targets will still be logged for plots."
+            )
+            early_stop = False
     """
     Main GRPO training loop with curriculum scheduling.
     logger.info(f"\nStarting GRPO training for up to {num_episodes} episodes "
                 f"(={num_episodes // group_size} GRPO updates)")
+    if fixed_budget_mode:
+        logger.info(
+            "  Fixed-budget curriculum: phases advance at fixed episode counts."
+        )
+        for _pid in sorted(phase_episode_budgets.keys()):
+            logger.info(
+                f"    Phase {_pid}: {phase_episode_budgets[_pid]} episodes "
+                f"(target avg-reward {phase_reward_targets.get(_pid, float('nan')):+.2f}, "
+                f"observational only)"
+            )
+    elif early_stop:
         logger.info(
             f"  Early-stop ON: per-phase reward thresholds (sustained for "
             f"{convergence_window} groups, win-rate >= {phase_min_win_rate:.0%}):"
             f"Phase Episodes: {s['phase_episodes']}"
         )
+        # --- Fixed-budget phase transition ---------------------------------
+        # When the operator pre-allocates per-phase episode budgets (e.g.
+        # P1=20, P2=30, P3=50), advance at the boundaries regardless of
+        # reward. Phase 3 budget exhaustion lets the outer-loop
+        # `num_episodes` cap end training naturally.
+        if fixed_budget_mode:
+            current_phase = s["phase"]
+            budget = phase_episode_budgets.get(current_phase, 0)
+            if (
+                current_phase < 3
+                and s["phase_episodes"] >= budget
+            ):
+                promoted = scheduler.force_promote(
+                    reason=(
+                        f"fixed-budget: completed {s['phase_episodes']} episodes "
+                        f"in Phase {current_phase} (budget={budget})"
+                    )
+                )
+                if promoted:
+                    new_phase = scheduler.phase_id
+                    logger.info(
+                        f"  [Fixed-budget] Phase {current_phase} budget exhausted "
+                        f"-> Phase {new_phase}: {scheduler.current_phase.name} "
+                        f"({phase_episode_budgets.get(new_phase, '?')} episodes allocated)"
+                    )
         # --- Per-phase early-stop / promotion check ------------------------
         # Maintain a buffer of the last `convergence_window` GRPO groups
         # with their (phase, rolling_avg, rolling_win). When ALL N entries
     parser.add_argument("--no-early-stop", action="store_true",
                         help="Disable early-stop (always run all configured episodes)")
+    # Fixed-budget curriculum (mutually exclusive with early-stop)
+    parser.add_argument("--phase1-budget", type=int, default=None,
+                        help="Fixed episode budget for Phase 1 (Tool Mastery)")
+    parser.add_argument("--phase2-budget", type=int, default=None,
+                        help="Fixed episode budget for Phase 2 (Clinical Reasoning)")
+    parser.add_argument("--phase3-budget", type=int, default=None,
+                        help="Fixed episode budget for Phase 3 (Empathetic Negotiation)")
     args = parser.parse_args()
+    _budgets = None
+    if any(b is not None for b in (args.phase1_budget, args.phase2_budget, args.phase3_budget)):
+        if not all(b is not None for b in (args.phase1_budget, args.phase2_budget, args.phase3_budget)):
+            parser.error("--phase{1,2,3}-budget must all be set together")
+        _budgets = {
+            1: args.phase1_budget,
+            2: args.phase2_budget,
+            3: args.phase3_budget,
+        }
     train(
         num_episodes=args.episodes,
         group_size=args.group_size,
         phase_min_win_rate=args.phase_min_win_rate,
         convergence_window=args.convergence_window,
         early_stop=not args.no_early_stop,
+        phase_episode_budgets=_budgets,
     )

_smoke_dead_keys.py DELETED Viewed

@@ -1,333 +0,0 @@
-"""
-Smoke test: simulate the EXACT failure mode from the user's last log
-(Patient + Nurse keys revoked, Doctor + Judge keys alive) and verify
-that:
-1. AgentRouter.query falls back to a live judge client
-2. DoctorBrain's chain advances past the dead Doctor key (we'll also
-   simulate Doctor revocation)
-3. TTS emotion adapter gets disabled after first 401 (no spam)
-Runs from the repo root: ``python _smoke_dead_keys.py``
-"""
-from __future__ import annotations
-import os
-import sys
-import importlib
-REPO_ROOT = os.path.abspath(os.path.dirname(__file__))
-if REPO_ROOT not in sys.path:
-    sys.path.insert(0, REPO_ROOT)
-CHECKS_PASSED = 0
-CHECKS_FAILED = 0
-def check(label, ok, detail=""):
-    global CHECKS_PASSED, CHECKS_FAILED
-    tag = "PASS" if ok else "FAIL"
-    if ok:
-        CHECKS_PASSED += 1
-    else:
-        CHECKS_FAILED += 1
-    line = f"  [{tag}] {label}"
-    if detail:
-        line += f" -- {detail}"
-    print(line, flush=True)
-# ---------------------------------------------------------------------------
-# 1. AgentRouter fallback chain (Patient + Nurse dead, judges alive)
-# ---------------------------------------------------------------------------
-print("\n--- Test 1: AgentRouter.query with patient+nurse dead ---", flush=True)
-# Inject env vars BEFORE importing dashboard so demo defaults still get set.
-os.environ.setdefault("GROQ_DOCTOR_API_KEY", "gsk_dummy_doctor")
-os.environ.setdefault("GROQ_NURSE_API_KEY", "gsk_dummy_nurse")
-os.environ.setdefault("GROQ_PATIENT_API_KEY", "gsk_dummy_patient")
-os.environ.setdefault("GROQ_EMPATHY_JUDGE_API_KEY", "gsk_dummy_judge")
-os.environ.setdefault("GROQ_MEDICAL_JUDGE_API_KEY", "gsk_dummy_judge")
-from ER_MAP.envs import api_router as _api_router_mod  # noqa: E402
-class _MockResp:
-    def __init__(self, content):
-        self.choices = [type("C", (), {"message": type("M", (), {"content": content})()})()]
-class _MockClient:
-    """Mock Groq client that either succeeds or raises a 401."""
-    def __init__(self, name, dead=False, payload='{"action":"speak","content":"OK"}'):
-        self.name = name
-        self.dead = dead
-        self.payload = payload
-        self.calls = 0
-        self.chat = type("Chat", (), {"completions": self})()
-    def create(self, **kw):
-        self.calls += 1
-        if self.dead:
-            raise Exception(
-                f"Error code: 401 - {{'error': {{'message': 'Invalid API Key', "
-                f"'type': 'invalid_request_error', 'code': 'invalid_api_key'}}}}"
-            )
-        return _MockResp(self.payload)
-router = _api_router_mod.AgentRouter(
-    api_key="x",
-    nurse_api_key="x",
-    patient_api_key="x",
-    empathy_judge_api_key="x",
-    medical_judge_api_key="x",
-)
-# Patch all 4 clients with mocks: patient + nurse are dead, judges are alive.
-mock_clients = {
-    "nurse":         _MockClient("nurse",         dead=True),
-    "patient":       _MockClient("patient",       dead=True),
-    "empathy_judge": _MockClient("empathy_judge", dead=False),
-    "medical_judge": _MockClient("medical_judge", dead=False),
-}
-router._clients = mock_clients
-router._dead_clients = set()  # let runtime detect deadness through the cascade
-# Query as nurse — should walk nurse -> patient -> medical_judge and succeed.
-result = router.query("nurse", "system", [{"role": "user", "content": "hi"}])
-check(
-    "router.query('nurse') falls through to a live judge",
-    result.get("action") == "speak",
-    f"got {result}",
-)
-check(
-    "nurse client was attempted",
-    mock_clients["nurse"].calls == 1,
-    f"calls={mock_clients['nurse'].calls}",
-)
-check(
-    "patient client was attempted (next in chain)",
-    mock_clients["patient"].calls == 1,
-    f"calls={mock_clients['patient'].calls}",
-)
-check(
-    "medical_judge client served the request",
-    mock_clients["medical_judge"].calls == 1,
-    f"calls={mock_clients['medical_judge'].calls}",
-)
-check(
-    "empathy_judge NOT called once medical_judge succeeded",
-    mock_clients["empathy_judge"].calls == 0,
-    f"calls={mock_clients['empathy_judge'].calls}",
-)
-check(
-    "nurse marked dead in router state",
-    "nurse" in router._dead_clients,
-)
-check(
-    "patient marked dead in router state",
-    "patient" in router._dead_clients,
-)
-# Subsequent queries should skip dead clients entirely.
-mock_clients["medical_judge"].calls = 0
-mock_clients["empathy_judge"].calls = 0
-mock_clients["nurse"].calls = 0
-mock_clients["patient"].calls = 0
-result2 = router.query("patient", "system", [{"role": "user", "content": "hi"}])
-check(
-    "second query (patient) skips dead clients",
-    mock_clients["nurse"].calls == 0 and mock_clients["patient"].calls == 0,
-)
-check(
-    "second query reaches a live judge",
-    result2.get("action") == "speak",
-    f"got {result2}",
-)
-# ---------------------------------------------------------------------------
-# 2. DoctorBrain key chain
-# ---------------------------------------------------------------------------
-print("\n--- Test 2: DoctorBrain with primary key dead, fallback alive ---", flush=True)
-from ER_MAP import dashboard as _dash  # noqa: E402
-class _DoctorMockChat:
-    def __init__(self, owner):
-        self.owner = owner
-        self.completions = self
-    def create(self, **kw):
-        self.owner.calls += 1
-        if self.owner.dead:
-            raise Exception("Error code: 401 - {'error': {'code': 'invalid_api_key'}}")
-        return _MockResp('{"action":"read_soap","content":"check the chart first"}')
-class _DoctorMockGroq:
-    def __init__(self, dead=False):
-        self.dead = dead
-        self.calls = 0
-        self.chat = _DoctorMockChat(self)
-# Build a brain with 3 keys: primary dead, second dead, third alive.
-brain = _dash.DoctorBrain(
-    api_key="key1",
-    fallback_api_keys=["key2", "key3"],
-    model="llama-3.1-8b-instant",
-)
-# Replace each entry's client with our mock.
-brain._chain[0]["client"] = _DoctorMockGroq(dead=True)   # key1 dead
-brain._chain[1]["client"] = _DoctorMockGroq(dead=True)   # key2 dead
-brain._chain[2]["client"] = _DoctorMockGroq(dead=False)  # key3 alive
-reply = brain.decide("Patient is here. Vitals pending.")
-check(
-    "DoctorBrain walks past 2 dead keys and uses the 3rd",
-    '"action":"read_soap"' in reply or "'action': 'read_soap'" in reply,
-    f"reply={reply[:120]}",
-)
-check(
-    "key1 marked dead",
-    brain._chain[0]["dead"] is True,
-)
-check(
-    "key2 marked dead",
-    brain._chain[1]["dead"] is True,
-)
-check(
-    "key3 still alive",
-    brain._chain[2]["dead"] is False,
-)
-check(
-    "key3 actually answered (call count)",
-    brain._chain[2]["client"].calls == 1,
-    f"calls={brain._chain[2]['client'].calls}",
-)
-# Second decide() should jump straight to key3 — no retries on the dead ones.
-brain._chain[0]["client"].calls = 0
-brain._chain[1]["client"].calls = 0
-brain._chain[2]["client"].calls = 0
-brain.decide("Now consider next step.")
-check(
-    "second decide() skips dead keys (no extra calls on key1/key2)",
-    brain._chain[0]["client"].calls == 0 and brain._chain[1]["client"].calls == 0,
-)
-check(
-    "second decide() served by key3 again",
-    brain._chain[2]["client"].calls == 1,
-)
-# All 3 dead → falls back to _smart_fallback_action (no crash).
-brain2 = _dash.DoctorBrain(
-    api_key="k1",
-    fallback_api_keys=["k2"],
-    model="llama-3.1-8b-instant",
-)
-brain2._chain[0]["client"] = _DoctorMockGroq(dead=True)
-brain2._chain[1]["client"] = _DoctorMockGroq(dead=True)
-reply3 = brain2.decide("Patient is here.")
-check(
-    "all keys dead -> _smart_fallback_action returns valid JSON",
-    reply3.startswith("{") and ('"tool"' in reply3 or '"action"' in reply3),
-    f"reply={reply3[:120]}",
-)
-# ---------------------------------------------------------------------------
-# 3. TTS emotion adapter auto-disable on 401
-# ---------------------------------------------------------------------------
-print("\n--- Test 3: TTS emotion adapter shuts down after first 401 ---", flush=True)
-from ER_MAP import tts_engine as _tts  # noqa: E402
-# Make sure ElevenLabs is forced off so we don't hit network.
-os.environ["ERMAP_DISABLE_ELEVENLABS"] = "1"
-eng = _tts.TTSEngine(elevenlabs_api_key="", groq_api_key="dummy")
-# Replace its Groq client with a mock that always raises 401.
-class _AlwaysAuthFail:
-    def __init__(self):
-        self.calls = 0
-        self.chat = self
-        self.completions = self
-    def create(self, **kw):
-        self.calls += 1
-        raise Exception("Error code: 401 - {'error': {'code': 'invalid_api_key'}}")
-mock_groq = _AlwaysAuthFail()
-eng._groq_client = mock_groq
-# Trigger the adapter: status helper should report auth=True the first time.
-text1, auth1 = _tts._emotionalize_with_status(
-    "Patient please describe your symptoms in detail.",
-    "patient_anxious_panicked",
-    eng._groq_client,
-    eng._groq_model,
-)
-check(
-    "first call hits Groq and observes 401",
-    auth1 is True and mock_groq.calls == 1,
-    f"auth={auth1} calls={mock_groq.calls}",
-)
-check(
-    "first call still returns usable text via regex fallback",
-    isinstance(text1, str) and len(text1) > 5,
-    f"text={text1[:80]}",
-)
-# Simulate the engine setting its dead flag and verify subsequent passes
-# never hit Groq again.
-eng._emotion_adapter_dead = True
-mock_groq.calls = 0
-# Run the same code path the engine uses internally:
-if eng._emotion_adapter_dead:
-    # Engine bypasses the LLM call entirely → no Groq invocation.
-    fallback_only = _tts._fallback_emotion_transform(
-        "Patient please describe your symptoms in detail.",
-        "patient_anxious_panicked",
-    )
-    fallback_calls = mock_groq.calls
-else:
-    fallback_calls = -1
-check(
-    "engine bypasses Groq once emotion adapter marked dead",
-    fallback_calls == 0,
-    f"calls after mark-dead={fallback_calls}",
-)
-check(
-    "regex fallback still produces speech",
-    isinstance(fallback_only, str) and len(fallback_only) > 5,
-    f"text={fallback_only[:80]}",
-)
-# ---------------------------------------------------------------------------
-# 4. Health probe smoke (returns DEAD_AUTH on a junk key without crashing)
-# ---------------------------------------------------------------------------
-print("\n--- Test 4: _probe_groq_key handles an invalid key gracefully ---", flush=True)
-status, detail = _dash._probe_groq_key("gsk_definitely_invalid_key", "llama-3.1-8b-instant", timeout_s=4.0)
-check(
-    "probe returns DEAD_AUTH for invalid key",
-    status == "DEAD_AUTH",
-    f"status={status} detail={detail}",
-)
-status_missing, _ = _dash._probe_groq_key("", "llama-3.1-8b-instant")
-check(
-    "probe returns MISSING for empty key (no network call)",
-    status_missing == "MISSING",
-)
-# ---------------------------------------------------------------------------
-print("\n" + "=" * 60, flush=True)
-print(f"  RESULT: {CHECKS_PASSED} passed, {CHECKS_FAILED} failed", flush=True)
-print("=" * 60, flush=True)
-sys.exit(0 if CHECKS_FAILED == 0 else 1)

kaggle/KAGGLE.md DELETED Viewed

@@ -1,265 +0,0 @@
-# Training ER-MAP on Kaggle Free Tier
-This guide walks you through training the ER-MAP **Doctor agent** with GRPO + 3-phase curriculum learning on Kaggle's free GPU tier — **zero dollars**, **30 GPU-hours/week**, **single Tesla T4 16 GB**.
-## TL;DR — fastest path to a converged Doctor
-1. **Fork** this repo on GitHub (it must be reachable from inside the Kaggle kernel).
-2. Get **5 Groq API keys** from https://console.groq.com/keys (one per role gives you 5x the daily quota; you can also use one key for everything if you don't mind sharing the rate-limit budget).
-3. Get one **HF write token** from https://huggingface.co/settings/tokens (fine-grained, scope: `write` to your own repos) — needed so checkpoints survive the 12-hour Kaggle session limit.
-4. **New Notebook on Kaggle** → Settings → **Accelerator: GPU T4 x2** → **Internet: On**.
-5. Add the secrets in the right sidebar (Add-ons → Secrets):
-   - `GROQ_NURSE_API_KEY`, `GROQ_PATIENT_API_KEY`, `GROQ_EMPATHY_JUDGE_API_KEY`, `GROQ_MEDICAL_JUDGE_API_KEY`
-   - `HF_TOKEN`
-   - *(optional)* `WANDB_API_KEY`
-6. Open `kaggle/train_ermap_grpo_kaggle.ipynb` from your fork inside Kaggle (File → Import Notebook → URL).
-7. Edit the two URLs in cell 2 (`GIT_URL`) and cell 5 (`HF_PUSH_REPO`) to your fork / username.
-8. **Run All**.
-Training **stops automatically** the instant the Doctor sustains a phase-specific reward bar for **3 consecutive GRPO groups** — `+1.5` in Phase 1 (force-promote), `+1.2` in Phase 2 (force-promote), `+1.0` in Phase 3 (END). This is the "train until optimal rewards are constantly received" guarantee — see the *Train-until-optimal* section below. `NUM_EPISODES=120` is just a hard cap; healthy runs converge between episodes 70-130 (~6-11 h on T4 ×2).
-You'll see one full 6-panel dashboard PNG **per curriculum phase** land in `/kaggle/working/er_map_grpo_checkpoints/plots/` after training finishes (`phase1_dashboard.png`, `phase2_dashboard.png`, `phase3_dashboard.png`, plus `all_phases_overview.png` and `all_phases_comparison.png`), and your final LoRA adapter will be sitting on Hugging Face Hub at `<you>/ermap-doctor-lora`.
-**What each per-phase dashboard shows:**
-| Panel | What it tells you |
-| --- | --- |
-| Reward growth | raw episode reward + rolling mean + verified rolling mean |
-| Rolling win rate (w=20) | did the policy actually get better in this phase? |
-| Outcome distribution over time | stacked WIN/PARTIAL/INCORRECT/AMA_LOSS/FATAL_LOSS bars per ~5-episode bin |
-| Reward components | mean of every reward component (process / treatment / empathy / labs / etc.) |
-| GRPO update stats | per-group loss + KL — should *not* explode |
-| Episode length | histogram of step counts — should rise from Phase 1 to Phase 3 |
----
-## Hardware feasibility
-| Resource | Kaggle Free Tier | What we use | Headroom |
-|---|---|---|---|
-| GPU | Tesla T4 16 GB | Llama-3.1-8B-4bit + LoRA(r=16) ≈ 7-9 GB | ~50% free |
-| RAM | 13 GB system | base model + tokenizer + buffers ≈ 5 GB | OK |
-| Disk | 73 GB | repo + checkpoints + cache ≈ 10 GB | OK |
-| Session | 12 h max | typical full Phase-1+early-Phase-2 = 6-8 h | OK |
-| Weekly | 30 GPU-h | one full curriculum run + a re-run = ~15-20 h | OK |
-| Internet | allowed | Groq calls per env step | OK |
-**Why Llama-3.1-8B over Qwen-3-4B (the other train_grpo.py default)?**
-- 8B reasons noticeably better on multi-turn clinical dialogue
-- 4-bit quant brings it to 5 GB — still fits on T4 with LoRA
-- Groq hosts the same 8B (8B-instant) so the deployed inference path matches the training distribution exactly
-If you ever need to fall back to a smaller model (e.g. for a P100 session), edit `MODEL_NAME` in cell 5 of the notebook to `unsloth/Qwen2.5-3B-Instruct-bnb-4bit`. Everything else stays the same.
----
-## Two ways to get the source onto Kaggle
-### Option A — public GitHub fork (recommended)
-In cell 2 of the notebook:
-```python
-GIT_URL = "https://github.com/YOUR_USERNAME/Meta_Finals.git"
-BRANCH  = "main"
-```
-The cell does a shallow clone into `/kaggle/working/Meta_Finals` and you're done.
-### Option B — upload as a Kaggle Dataset (no GitHub needed)
-1. Locally:
-   ```bash
-   cd D:/Meta_Finals
-   # Exclude heavy/regenerable folders before zipping.
-   tar --exclude='.git' --exclude='__pycache__' --exclude='*.ipynb_checkpoints' \
-       --exclude='er_map_grpo_checkpoints' \
-       -czf ermap-source.tar.gz .
-   ```
-2. Kaggle → **Datasets** → **New Dataset** → upload `ermap-source.tar.gz` → name it **`ermap-source`** → save.
-3. In your training notebook → right sidebar → **+ Add Data** → search for **`ermap-source`** → Add.
-4. Cell 2 of the notebook detects `/kaggle/input/ermap-source/` and copies it into `/kaggle/working/Meta_Finals` automatically.
-Use Option B when:
-- Your fork is private and you don't want to expose the repo
-- You have local edits not yet pushed
-- Bandwidth from Kaggle to GitHub is flaky
----
-## What happens when the 12-hour session ends mid-training
-Without intervention you'd lose everything. The notebook prevents this:
-1. **Periodic HF Hub push.** Cell 7 monkey-patches `save_lora_adapters()` so every checkpoint saved by the GRPO loop also pushes to your `HF_PUSH_REPO`. The training loop checkpoints every `group_size × 5` episodes (so every 10 episodes when `GROUP_SIZE=2`).
-2. **Resume on the next session.** Set `HF_RESUME_REPO` in cell 4 of the notebook on the *new* Kaggle session. The latest LoRA adapter is downloaded to `/kaggle/working/checkpoints/resume/` before training starts — but **the current `train_grpo.py` doesn't auto-load this folder yet**; for now use it as a manual recovery (load the adapter and continue training in code). A future PR will wire the auto-resume into `load_model_and_tokenizer`.
-In practice: a single 12-hour session is usually enough to clear Phase 1 and produce publishable per-phase dashboards, so resume is the safety net rather than the main path.
-> **Re-render plots from any saved metrics file** (locally or in another Kaggle session):
-> ```bash
-> python -m ER_MAP.plotting \
->     --metrics er_map_grpo_checkpoints/training_metrics.json \
->     --out     er_map_grpo_checkpoints/plots
-> ```
-> This is the same call the notebook makes — handy if you want to regenerate the charts after training, or restyle them without re-running training.
----
-## Per-role Groq keys vs. one shared key
-The dashboard ships with 4 distinct Groq clients (Nurse, Patient, Empathy Judge, Medical Judge) and a fallback chain that walks across all four if any fails auth. Per-key budgets are *shared* on Groq's free tier (limits are per-account, not per-key) — but the model split below buys you real headroom because **each model has its own daily pool**.
-### Default model assignment (traffic-shaping)
-| Role | Model | Free-tier pool | Why |
-|---|---|---|---|
-| Nurse | `llama-3.1-8b-instant` | 14 400 RPD / 500K TPD | high-volume (every env step) |
-| Patient | `llama-3.1-8b-instant` | shared 8B pool | high-volume (every env step) |
-| Empathy Judge | `llama-3.3-70b-versatile` | 1 000 RPD / 100K TPD | grading quality directly shapes reward |
-| Medical Judge | `llama-3.3-70b-versatile` | shared 70B pool | grading quality directly shapes reward |
-Quick budget check for **one full 120-episode training run**:
-| Pool | Estimated calls/run | Daily ceiling | Headroom |
-|---|---|---|---|
-| 8B-instant (Nurse + Patient) | ~2 880 | 14 400 RPD | ~5x |
-| 70B-versatile (judges) | ~720 | 1 000 RPD | ~1.4x |
-You can do **one training run per day per account** comfortably. If you need to retry inside the same day, drop one of the two judges to 8B-instant temporarily — the reward signal degrades a little, but training keeps moving.
-If you only have **one** Groq key total, set just `GROQ_API_KEY` as a Kaggle Secret. Everything still works — the AgentRouter falls back to the same client for all roles, and the per-model budgets still split traffic across pools.
----
-## What the reward-growth curve should look like
-If training is healthy, after ~80 episodes you should see:
-- **Rolling avg reward** climbs from ≈ -0.4 (random baseline) toward +1.5+ (the early-stop target)
-- **Rolling win rate** climbs from ~10% to 40%+
-- A **vertical red dashed line** marks the Phase 1 → Phase 2 promotion (typically episode 30-60), and a second one marks Phase 2 → Phase 3 (typically episode 60-90)
-- KL divergence stays in `[0.005, 0.05]` — if it spikes above 0.5 the model is drifting (lower `LEARNING_RATE` and re-run)
-If the curve is flat or trending down:
-- Check that Groq is actually responding (look for `Groq API error` lines in the log)
-- Check that `rewards.std()` is non-zero across the group (cell logs print `adv_std=`; if it's < 1e-6 GRPO skips the update)
-- Drop `GROUP_SIZE` from 2 → 1? **Don't** — group size 1 = no advantage signal = no GRPO update. Keep G ≥ 2.
----
-## Train-until-optimal — per-phase reward thresholds
-> *"I want training until certain optimal rewards are constantly received."*
-After every GRPO update the loop maintains a **rolling buffer of the last `CONVERGENCE_WINDOW=3` groups**. When all 3 entries are in the *same* current phase AND each has `rolling_avg_reward >= PHASE_REWARD_TARGETS[current_phase]`, the loop reacts:
-| Current phase | When buffer qualifies | Effect |
-|---|---|---|
-| Phase 1 (Tool Mastery) | sustained `+1.5` for 3 groups | force-promote to Phase 2, clear buffer |
-| Phase 2 (Clinical Reasoning) | sustained `+1.2` for 3 groups | force-promote to Phase 3, clear buffer |
-| Phase 3 (Empathetic Negotiation) | sustained `+1.0` for 3 groups | **END TRAINING** |
-The buffer is cleared after each promotion so stale entries cannot pre-satisfy the next phase's bar. A soft `PHASE_MIN_WIN_RATE=0.20` floor prevents stopping on partial-credit-only runs.
-If even one group in the window slips below the bar, the counter resets — guaranteeing the policy is *constantly* hitting the target, not transiently. `NUM_EPISODES` becomes a hard safety cap, not a fixed budget.
-### Why the targets descend from `1.5` → `1.2` → `1.0`
-The phases are not equally easy to score on. Looking at the reward function:
-| Phase | Best-case reward (clean win) | Realistic clean-policy mean |
-|---|---|---|
-| 1 — easy patient, clean SOAP | `+2.0` (full terminal_win) | `+1.6 .. +1.8` |
-| 2 — mixed compliance + noisy SOAP | `+1.7` (terminal_win - some lab noise) | `+1.2 .. +1.4` |
-| 3 — full persona randomization + consent costs | `+1.4` (terminal_win - empathy/AMA penalties) | `+1.0 .. +1.2` |
-So requiring `+1.5` in Phase 1 demonstrates real tool mastery (not just floor-grazing), while requiring `+1.0` in Phase 3 is genuinely hard — no random policy ever sustains it.
-### Defaults (in the notebook, section 5)
-```python
-EARLY_STOP_ENABLED   = True
-PHASE_REWARD_TARGETS = {1: 1.5, 2: 1.2, 3: 1.0}
-PHASE_MIN_WIN_RATE   = 0.20  # soft floor
-CONVERGENCE_WINDOW   = 3
-```
-### Reading the per-phase telemetry
-After every GRPO group the log prints:
-```
-[Scheduler] Phase 2 (Clinical Reasoning) | Win Rate: 42.0% | Avg Reward: +1.18 | Phase Episodes: 14
-[EarlyStop] Phase 2 target avg-reward >= +1.20: qualified 2/3 recent groups (need all 3 -> promote)
-```
-When the Phase-2 buffer fills with 3 qualifying groups:
-```
-[Scheduler] force_promote() called: sustained rolling-avg-reward +1.20 for 3 consecutive groups in Phase 2
-************************************************************
-  CURRICULUM PROMOTION: Clinical Reasoning -> Empathetic Negotiation
-************************************************************
-```
-When Phase 3 finally converges:
-```
-************************************************************
-  EARLY STOP: Phase 3 convergence reached after 92 episodes
-  Last 3 groups all sustained:
-    rolling_avg_reward >= +1.00
-    rolling_win_rate   >= 20%
-  in Phase 3 (Empathetic Negotiation)
-************************************************************
-```
-…and the loop exits cleanly into the final-save / final-push / plotting cells.
-### Per-phase wall-clock estimates on Kaggle T4 ×2
-| Phase | Typical episodes to hit target | Wall-clock | Why |
-|---|---|---|---|
-| 1 | 16 – 30 episodes (8 – 15 groups) | **~1.5 – 2.5 h** | Easy patients + clean SOAP; tool format is the only real lift. |
-| 2 | 24 – 40 episodes (12 – 20 groups) | **~2.0 – 3.5 h** | Most policy improvement happens here. |
-| 3 | 30 – 60 episodes (15 – 30 groups) | **~2.5 – 5.0 h** | Empathy + consent costs make `+1.0` genuinely hard. |
-| **Total** | 70 – 130 episodes | **~6 – 11 h** | Fits the 12 h GPU session with ~1 h margin. |
-Per-group wall-clock ≈ 8 – 12 min on T4 (depending on episode length); per-episode ≈ 3 – 5 min for env rollout + ≈ 1 – 2 min amortized for the GRPO update.
-### Tuning suggestions
-| Goal | What to change |
-|---|---|
-| Smoke run (converge fast on a weak policy) | `PHASE_REWARD_TARGETS={1: 0.5, 2: 0.4, 3: 0.3}`, `CONVERGENCE_WINDOW=2` |
-| Hackathon-grade Doctor | keep defaults |
-| Aim for SOTA on this benchmark | `PHASE_REWARD_TARGETS={1: 1.7, 2: 1.5, 3: 1.3}`, `CONVERGENCE_WINDOW=5` |
-| Disable entirely (run full 120 episodes regardless) | `EARLY_STOP_ENABLED=False` |
-| Resuming from a partial run | targets are unchanged — the buffer is rebuilt from the new session's groups, so nothing weird happens |
-### What NOT to do
-- Don't set `CONVERGENCE_WINDOW=1`. A single lucky group can pass any bar; you'll promote out of Phase 1 the instant the first easy patient is correctly discharged.
-- Don't lower the Phase-1 target below `+0.8`. The built-in scheduler already promotes Phase 1 → Phase 2 at `win_rate >= 40% AND avg_reward >= +0.3`, so a Phase-1 reward bar below that is dead code.
-- Don't raise the Phase-3 target above `+1.5`. The reward ceiling on a Phase-3 episode (after empathy/consent costs) is around `+1.4 .. +1.6`; sustaining `+1.5+` for 3 consecutive groups is essentially unachievable in 12 h.
----
-## Common Kaggle gotchas
-| Symptom | Fix |
-|---|---|
-| `Groq API error: 401 invalid_api_key` | Regenerate the key (Groq auto-revokes keys posted publicly). Update the Kaggle Secret. |
-| `OutOfMemoryError` on T4 | Drop `MAX_SEQ_LENGTH` from 2048 to 1536 inside `load_model_and_tokenizer`, or switch to `unsloth/Qwen2.5-3B-Instruct-bnb-4bit`. |
-| `unsloth import` failed | Restart kernel after `pip install` — Unsloth pins `xformers` versions and the running kernel keeps the old import cached. |
-| Checkpoints not appearing on HF Hub | Verify `HF_PUSH_REPO` doesn't still contain the `<your-username>` placeholder, and that `HF_TOKEN` has `write` scope. |
-| "Internet off" warning | Right sidebar → Settings → toggle Internet to **On**. (Default is off for new accounts.) |
----
-## Cost summary
-- **Kaggle**: free
-- **Groq API (training)**: free (within free-tier daily quotas, ~3 600 calls per full run: ~2 880 on 8B-instant + ~720 on 70B-versatile)
-- **Hugging Face Hub**: free for the LoRA adapter (~50 MB) + free for the merged fp16 (~16 GB on a public repo, free up to 1 TB total)
-- **Wandb**: free for personal projects
-- **Total**: $0

kaggle/KAGGLE_QUICKSTART.md ADDED Viewed

	@@ -0,0 +1,104 @@

+# Kaggle Quickstart — ER-MAP GRPO Training (v3 stable)
+The Kaggle notebook is in `kaggle/train_ermap_grpo_kaggle.ipynb`. This file
+is the cheat sheet for running it end-to-end without the dependency hell
+that bit us in earlier attempts.
+## 0. Prerequisites (one-time)
+1. **GitHub fork** of this repo. The notebook clones from a public fork at
+   cell 6 — edit `GIT_URL`. Alternatively, upload the repo as a Kaggle
+   Dataset named `ermap-source` (Add Data → Upload).
+2. **Hugging Face write token** (`HF_TOKEN`) for pushing the trained
+   adapter. Create at https://huggingface.co/settings/tokens (fine-grained,
+   write access on a single model repo is enough).
+3. **Five Groq keys** (one each for Nurse / Patient / Empathy Judge /
+   Medical Judge / shared fallback). Free-tier accounts are fine; Groq
+   enforces rate limits per account, so the quota only multiplies when the
+   keys come from different accounts.
+## 1. Create the Kaggle notebook
+1. Sign in to https://www.kaggle.com/code → **New Notebook**.
+2. Right sidebar:
+   - Accelerator: **GPU T4 ×2** (or P100)
+   - Internet: **On**
+   - Persistence: Files only
+3. **File → Upload Notebook** → choose `kaggle/train_ermap_grpo_kaggle.ipynb`
+   from this repo.
+## 2. Add Kaggle Secrets
+Add-ons → Secrets → Add a new secret. Required labels (exactly):
+| Label | Value |
+|---|---|
+| `GROQ_NURSE_API_KEY` | your nurse Groq key |
+| `GROQ_PATIENT_API_KEY` | your patient Groq key |
+| `GROQ_EMPATHY_JUDGE_API_KEY` | your empathy-judge Groq key |
+| `GROQ_MEDICAL_JUDGE_API_KEY` | your medical-judge Groq key |
+| `HF_TOKEN` | your HF write token |
+| `WANDB_API_KEY` *(optional)* | your W&B key (skip — disabled by default) |
+The notebook reads them via `kaggle_helpers.load_kaggle_secrets()` and
+exports them as env vars.
+## 3. Edit two placeholders in the notebook
+- **Cell 6:** `GIT_URL = "https://github.com/<your-fork>/Meta_Finals.git"`
+- **Cell 8:** `HF_PUSH_REPO = "<your-username>/ermap-doctor-lora"`
+If you uploaded the repo as a Kaggle Dataset instead, leave `GIT_URL` as the
+placeholder — cell 6 will detect `/kaggle/input/ermap-source` and copy from
+there.
+## 4. Run order (the only sequence that works)
+| Cell | What it does | Expected output |
+|---|---|---|
+| 2 | GPU + disk + python + internet sanity check | GPU listed, disk free > 8 GB |
+| 3 | **REPAIR** — pin torch 2.10 cu128, reinstall bitsandbytes, upgrade unsloth | `REPAIR OK` (or `RESTART REQUIRED`) |
+| **(restart)** | If cell 3 said RESTART REQUIRED → Run → Restart kernel | — |
+| 5 | Post-restart import verify | All `OK`, GPUs listed |
+| 6 | Clone / mount the repo | `OK. Repo at /kaggle/working/Meta_Finals` |
+| 7 | Wire Kaggle Secrets → env vars | `OK — at least one Groq key is wired` |
+| 8 | HF Hub config | `Starting fresh — no resume.` |
+| 9 | Hyperparameters (P1=+1.2, P2=+1.1, P3=+1.0) | thresholds printed |
+| 10 | **Pre-flight** — Groq routing + 4× PING | 4× `[PASS]`, then `OK` |
+| 11 | Dry-run smoke test (no GPU) | `Dry-run OK` |
+| 12 | Wire HF push hook | `Hub-push hook installed.` |
+| 13 | **REAL TRAINING** (4–6 h) | per-group rolling stats, eventual `EARLY STOP` |
+| 14 | Final push to HF | `Final checkpoints pushed: https://huggingface.co/...` |
+| 15 | Per-phase plots | 5 PNGs displayed inline |
+| 16 | Push plots to HF | `Plots pushed: ...` |
+| 17 | Inference smoke-test (optional) | 3 sample Doctor actions printed |
+## 5. Common failures & fixes
+| Symptom | Root cause | Fix |
+|---|---|---|
+| `numpy was upgraded mid-session` | numpy import poisoned by a previous cell | Restart kernel, re-run from cell 3 |
+| `Pillow incompatible with torchvision` | Pillow ABI mismatch | Restart kernel, re-run from cell 3 |
+| `PyTorch and torchvision compiled with different CUDA major` | torch upgraded to cu13 by a transient resolve | Re-run cell 3 (it pins cu128) and restart |
+| `cannot import name 'create_gradient_checkpointing_buffer'` | unsloth ↔ unsloth_zoo version drift | Re-run cell 3 (upgrades both in lockstep) |
+| `libnvJitLink.so.13 missing` | bitsandbytes built against different CUDA | Re-run cell 3 (force-reinstalls bitsandbytes after torch pin) |
+| Disk usage > quota | Kaggle's 20 GB working partition fills up | Re-run cell 3 — its first step purges the pip cache and clears `/tmp` |
+| Pre-flight `[FAIL]` for a role | Groq key dead / quota exceeded | Generate a new key in console.groq.com → update Kaggle Secret → re-run cell 7+10 |
+| `[FAIL]` says `routing=WRONG` | env var not set when `AgentRouter()` was constructed | Re-run cell 9 BEFORE cell 10 |
+| Training freezes at episode 1 for >10 min | Doctor.generate hung; Unsloth import broke silently | Check cell 5 output for `unsloth` line; restart kernel and re-run cell 3 if missing |
+## 6. What the trained model gives you
+After cell 13 finishes (or hits the 12 h Kaggle session cap), you have:
+- `OUTPUT_DIR/final_lora/` — LoRA adapter weights (~50 MB), pushed to
+  `HF_PUSH_REPO`
+- `OUTPUT_DIR/final_merged_fp16/` — full Llama-3.1-8B fp16 merge with the
+  adapter applied (~16 GB), pushed to `HF_PUSH_REPO-merged`
+- `OUTPUT_DIR/training_metrics.json` — per-episode rewards, outcomes,
+  rolling stats — input for the per-phase plots
+- `OUTPUT_DIR/plots/*.png` — 5 dashboards (one per phase + cross-phase
+  overview + comparison bar)
+Use the LoRA adapter for the demo (quick to load, runs on an RTX 4050 6 GB at
+~30 tok/s); use the merged fp16 if you need to host it on Vercel or an HF
+Space without `peft`.

kaggle/build_notebook.py ADDED Viewed

	@@ -0,0 +1,880 @@

+"""
+kaggle/build_notebook.py
+========================
+Programmatically (re)builds `train_ermap_grpo_kaggle.ipynb` from scratch.
+Why a builder script?
+--------------------
+The hand-edited notebook drifted into a fragile state across many sessions:
+mixed early-stop / fixed-budget params, stale install snippets, dead pre-flight
+checks, etc. This script is the single source of truth — run it once and the
+notebook is regenerated as a clean, deterministic v3 layout.
+Run:
+    python kaggle/build_notebook.py
+Output:
+    kaggle/train_ermap_grpo_kaggle.ipynb     (overwritten)
+    kaggle/KAGGLE_QUICKSTART.md              (overwritten)
+"""
+from __future__ import annotations
+import json
+import textwrap
+from pathlib import Path
+# ---------------------------------------------------------------------------
+# Cell helpers
+# ---------------------------------------------------------------------------
+def md_cell(text: str) -> dict:
+    """Return an nbformat-style markdown cell whose source is built from *text*."""
+    source_lines = _split_keep_newlines(text)
+    return {"cell_type": "markdown", "metadata": {}, "source": source_lines}
+def code_cell(text: str) -> dict:
+    """Return an nbformat-style code cell for *text*: never executed, no outputs."""
+    cell = {"cell_type": "code", "execution_count": None}
+    cell["metadata"] = {}
+    cell["outputs"] = []
+    cell["source"] = _split_keep_newlines(text)
+    return cell
+def _split_keep_newlines(text: str) -> list[str]:
+    r"""Convert *text* into the list of lines stored in a cell's 'source'.
+
+    The text is dedented and its leading/trailing blank lines are dropped.
+    Every returned line keeps its terminating '\n' except the last one
+    (the Jupyter convention), which keeps `git diff` clean when the notebook
+    is regenerated. Empty or blank-only input returns an empty list.
+    """
+    # Strip first, split second: the final line then carries no terminator,
+    # so there is no trailing '' entry to clean up afterwards.
+    body = textwrap.dedent(text).strip("\n")
+    return body.splitlines(keepends=True)
+# ---------------------------------------------------------------------------
+# Cell sources
+# ---------------------------------------------------------------------------
+# Markdown text for notebook cell 1: title, run order and the per-phase
+# reward thresholds. The run-order steps must cover every cell up to the
+# training cell, including the HF push hook (cell 12).
+CELL_01_TITLE = """\
+# ER-MAP — Doctor Agent GRPO Training (Kaggle Free-Tier · v3 stable)
+Trains the **Doctor LLM** (Llama-3.1-8B-Instruct, 4-bit + LoRA r=16) via GRPO
+with a 3-phase curriculum on Kaggle's free GPU. Designed to survive Kaggle's
+pre-baked image quirks (numpy / Pillow ABI mismatches, torch + torchvision
+CUDA-major mismatches, transient `unsloth_zoo` upgrades).
+## TL;DR — How to run this notebook
+1. **Notebook settings (right sidebar):**
+   - Accelerator: **GPU T4 ×2** (or P100)
+   - Internet: **On**
+   - Persistence: Files only
+2. **Kaggle Secrets** (Add-ons → Secrets):
+   - **Required:** `GROQ_NURSE_API_KEY`, `GROQ_PATIENT_API_KEY`,
+     `GROQ_EMPATHY_JUDGE_API_KEY`, `GROQ_MEDICAL_JUDGE_API_KEY`, `HF_TOKEN`
+   - **Optional:** `WANDB_API_KEY`
+3. **Run cells 2 → 3 (sanity + REPAIR).** When cell 3 prints
+   `RESTART REQUIRED`, click **Run → Restart kernel**, then resume from cell 5.
+4. **Run cells 5 → 12 (verify + configure + pre-flight + dry-run + HF push
+   hook).** Each cell should finish without errors (most print an `OK` line)
+   before moving on.
+5. **Run cell 13 (the long training cell, 4–6 hours).**
+6. **Run cells 14 → 17 (final push + plots + inference smoke-test).**
+## Curriculum + reward thresholds (this run)
+Constant per-phase rolling-avg-reward bars; sustained for **3 consecutive
+GRPO groups** triggers either a phase promotion or end-of-training.
+| Phase | Reward target (sustained ×3 groups) | Action when met |
+|---|---|---|
+| 1 — Tool Mastery | `+1.2` | force-promote to Phase 2 |
+| 2 — Clinical Reasoning | `+1.1` | force-promote to Phase 3 |
+| 3 — Empathetic Negotiation | `+1.0` | END TRAINING |
+Why these numbers? The un-trained 8B Doctor's baseline on the same env is
+`P1=+0.76, P2=+0.59, P3=+0.39`. Targets of `+1.2 / +1.1 / +1.0` correspond
+to roughly `1.6× / 1.9× / 2.6×` improvement over baseline — a meaningful
+signal but reachable inside Kaggle's 12 h session limit.
+"""
+# Source text for notebook cell 2 (sanity check). The cell prints an
+# nvidia-smi GPU query, the disk usage of /kaggle/working (warning below
+# 8 GB free), the Python version, and whether a TCP connection to
+# api.groq.com:443 can be opened.
+CELL_02_SANITY = """\
+# === CELL 2 — Sanity check (GPU + disk + python + internet) ===
+# Run this FIRST. If any check fails, fix it before running the REPAIR cell.
+import os, shutil, subprocess, sys, socket
+print("--- GPU ---")
+try:
+    print(subprocess.check_output(
+        ["nvidia-smi", "--query-gpu=name,memory.total,memory.free", "--format=csv"],
+        timeout=10,
+    ).decode())
+except Exception as e:
+    print(f"nvidia-smi failed: {e}")
+    print("-> Set Accelerator to 'GPU T4 x2' in the right sidebar.")
+print("--- Disk (/kaggle/working) ---")
+total, used, free = shutil.disk_usage("/kaggle/working")
+print(f"  total={total/1e9:5.1f} GB | used={used/1e9:5.1f} GB | free={free/1e9:5.1f} GB")
+if free < 8 * 1e9:
+    print("  WARNING: free disk < 8 GB — repair cell may fail. "
+          "Consider 'Run > Restart and clear cell outputs' to reset /tmp.")
+print("--- Python ---")
+print(f"  python={sys.version.split()[0]} | exe={sys.executable}")
+print("--- Internet (api.groq.com:443) ---")
+try:
+    socket.create_connection(("api.groq.com", 443), timeout=5).close()
+    print("  reachable")
+except Exception as e:
+    print(f"  UNREACHABLE: {e}")
+    print("  -> Settings (right sidebar) -> Internet -> ON")
+"""
+# Source text for notebook cell 3 (REPAIR): cleans caches, pins the GPU
+# stack via pip, then verifies the install in a subprocess.
+# NOTE: this is a regular (non-raw) string, so any backslash escape that must
+# survive into the generated cell has to be doubled here ("\\n", not "\n");
+# a single "\n" would put a real line break inside the cell's string literal
+# and the cell would fail with a SyntaxError.
+CELL_03_REPAIR = """\
+# === CELL 3 — REPAIR CELL (idempotent full environment rebuild) ===
+# Single source of truth for ER-MAP's GPU stack. Safe to re-run. After it
+# finishes you'll see one of two final lines:
+#
+#   RESTART REQUIRED  -> Run -> Restart kernel, then resume from cell 5
+#   REPAIR OK         -> proceed directly to cell 5
+#
+# Note: this cell only runs shell commands and one isolated subprocess.
+# It deliberately does NOT `import torch / numpy / Pillow / unsloth` in the
+# kernel, so re-running it after a botched install does not poison further
+# attempts.
+print("=" * 72); print("  CELL 3 — REPAIR"); print("=" * 72)
+# 1. Clean caches (Kaggle's /kaggle/working is only 20 GB — installs
+#    routinely fill it after a few re-runs).
+print("[1/6] Cleaning pip + tmp + HF dataset caches...")
+get_ipython().system('pip cache purge -q || true')
+get_ipython().system('rm -rf /tmp/* /root/.cache/pip /root/.cache/huggingface/datasets 2>/dev/null || true')
+# 2. Pin torch + torchvision to the cu128 wheel (matches Kaggle's CUDA 12.8
+#    base image). DON'T let pip pull a generic CUDA-13 build — that breaks
+#    bitsandbytes (libnvJitLink.so.13 missing) and torchvision (CUDA-major
+#    mismatch RuntimeError at import time).
+print("[2/6] Installing torch==2.10.0 + torchvision==0.25.0 (cu128)...")
+get_ipython().system('pip install -q --no-cache-dir --force-reinstall '
+                     'torch==2.10.0 torchvision==0.25.0 '
+                     '--index-url https://download.pytorch.org/whl/cu128')
+# 3. Reinstall bitsandbytes against the now-pinned torch.
+print("[3/6] Reinstalling bitsandbytes...")
+get_ipython().system('pip install -q --no-cache-dir --force-reinstall bitsandbytes')
+# 4. Upgrade unsloth + unsloth_zoo + trl in lockstep. unsloth and
+#    unsloth_zoo are released as a matched pair; if pip pulls a fresh
+#    unsloth_zoo against an old unsloth you get
+#       ImportError: cannot import name 'create_gradient_checkpointing_buffer'
+print("[4/6] Upgrading unsloth + unsloth_zoo + trl...")
+get_ipython().system('pip install -q --upgrade --no-cache-dir '
+                     'unsloth unsloth_zoo "trl>=0.18.2"')
+# 5. ER-MAP runtime deps that aren't pre-installed on Kaggle.
+print("[5/6] Installing ER-MAP runtime deps...")
+get_ipython().system('pip install -q --no-cache-dir '
+                     '"groq>=0.18.0" "huggingface_hub>=0.25.0" '
+                     '"gymnasium>=0.29.0" "openenv-core>=0.1.0"')
+# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these
+#    while pip is mid-flight, which is what causes the
+#       'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError
+#    we kept hitting before).
+print("[6/6] Verifying via subprocess...")
+import subprocess, sys, json
+verify_script = r'''
+import json, sys
+out = {"ok": True, "details": {}, "errors": []}
+try:
+    import importlib.metadata as md
+    for pkg in ("torch", "torchvision", "bitsandbytes", "unsloth", "unsloth_zoo",
+                "trl", "transformers", "peft", "accelerate", "groq",
+                "huggingface_hub", "gymnasium", "numpy", "Pillow"):
+        try:
+            out["details"][pkg + "_installed"] = md.version(pkg)
+        except md.PackageNotFoundError:
+            out["details"][pkg + "_installed"] = None
+    import torch, torchvision, numpy as np, PIL, unsloth, unsloth_zoo, bitsandbytes, trl
+    out["details"]["torch_loaded"]        = torch.__version__
+    out["details"]["torch_cuda"]          = torch.version.cuda
+    out["details"]["cuda_available"]      = bool(torch.cuda.is_available())
+    out["details"]["gpu_count"]           = int(torch.cuda.device_count())
+    out["details"]["torchvision_loaded"]  = torchvision.__version__
+    out["details"]["numpy_loaded"]        = np.__version__
+    out["details"]["pillow_loaded"]       = PIL.__version__
+    out["details"]["unsloth_loaded"]      = unsloth.__version__
+    out["details"]["unsloth_zoo_loaded"]  = unsloth_zoo.__version__
+    out["details"]["bitsandbytes_loaded"] = bitsandbytes.__version__
+    out["details"]["trl_loaded"]          = trl.__version__
+    # Cross-check loaded-vs-installed for the C-extension libs that bit us
+    # on every previous run.
+    for pkg, loaded_key, installed_key in [
+        ("numpy",  "numpy_loaded",  "numpy_installed"),
+        ("Pillow", "pillow_loaded", "Pillow_installed"),
+        ("torch",  "torch_loaded",  "torch_installed"),
+    ]:
+        loaded = out["details"].get(loaded_key)
+        installed = out["details"].get(installed_key)
+        if loaded and installed and loaded != installed:
+            # Strip any local-version suffix (e.g. '+cu128') before compare.
+            if loaded.split("+")[0] != installed.split("+")[0]:
+                out["errors"].append(
+                    f"{pkg} mismatch: loaded={loaded} installed={installed}"
+                )
+except Exception as e:
+    out["ok"] = False
+    out["errors"].append(f"{type(e).__name__}: {e}")
+print(json.dumps(out, default=str))
+'''.lstrip()
+res = subprocess.run([sys.executable, "-c", verify_script],
+                     capture_output=True, text=True, timeout=180)
+print(res.stdout if res.stdout else "<no stdout>")
+if res.stderr:
+    print("---- subprocess stderr ----"); print(res.stderr)
+# Parse the LAST line of stdout (others are prints from package init).
+try:
+    last = res.stdout.strip().splitlines()[-1]
+    parsed = json.loads(last)
+except Exception:
+    parsed = {"ok": False, "errors": ["could not parse verification output"]}
+ok = parsed.get("ok") and not parsed.get("errors")
+d = parsed.get("details", {})
+print("\\n" + "=" * 72)
+if ok:
+    print("  REPAIR OK")
+    print(f"    torch       : {d.get('torch_loaded')}  (CUDA {d.get('torch_cuda')})")
+    print(f"    torchvision : {d.get('torchvision_loaded')}")
+    print(f"    bitsandbytes: {d.get('bitsandbytes_loaded')}")
+    print(f"    unsloth     : {d.get('unsloth_loaded')} | unsloth_zoo: {d.get('unsloth_zoo_loaded')}")
+    print(f"    trl         : {d.get('trl_loaded')}")
+    print(f"    numpy       : {d.get('numpy_loaded')} | Pillow: {d.get('pillow_loaded')}")
+    print(f"    GPUs        : {d.get('gpu_count')}  (cuda_available={d.get('cuda_available')})")
+    print()
+    print("  -> If this kernel previously imported torch/numpy/Pillow/unsloth,")
+    print("     RESTART NOW (Run -> Restart kernel) before continuing to cell 5.")
+    print("     If this is a fresh kernel, you can proceed directly.")
+else:
+    print("  RESTART REQUIRED — issues detected:")
+    for e in parsed.get("errors", []):
+        print(f"    - {e}")
+    print()
+    print("  Action: Run -> Restart kernel, then re-run from cell 2.")
+print("=" * 72)
+"""
+CELL_04_RESTART = """\
+## ⚠ Restart kernel here if cell 3 said `RESTART REQUIRED`
+Click **Run → Restart kernel** (or **Run → Restart & clear cell outputs**),
+then resume from **cell 5**. Skipping the restart will produce ABI mismatch
+errors at the first GPU op.
+If cell 3 said `REPAIR OK` AND this is a fresh kernel that hasn't imported
+torch/numpy/Pillow/unsloth yet, you can proceed to cell 5 directly.
+"""
+CELL_05_VERIFY = """\
+# === CELL 5 — Post-restart verify (this kernel can import everything) ===
+import importlib.metadata as md
+print("--- Loaded versions in this kernel ---")
+import torch, numpy, PIL, torchvision, unsloth, unsloth_zoo, bitsandbytes, trl, transformers, peft
+versions = {
+    "torch":          torch.__version__,
+    "torchvision":    torchvision.__version__,
+    "numpy":          numpy.__version__,
+    "Pillow":         PIL.__version__,
+    "unsloth":        unsloth.__version__,
+    "unsloth_zoo":    unsloth_zoo.__version__,
+    "bitsandbytes":   bitsandbytes.__version__,
+    "trl":            trl.__version__,
+    "transformers":   transformers.__version__,
+    "peft":           peft.__version__,
+}
+all_ok = True
+for k, v in versions.items():
+    try:
+        inst = md.version(k)
+    except md.PackageNotFoundError:
+        inst = "(not installed)"
+    # Tolerate local version suffixes like '+cu128'
+    flag = "OK" if inst.split("+")[0] == v.split("+")[0] else f"MISMATCH (installed={inst})"
+    if "MISMATCH" in flag:
+        all_ok = False
+    print(f"  {k:14s}: loaded={v:20s} [{flag}]")
+print()
+print(f"  CUDA available : {torch.cuda.is_available()}")
+print(f"  GPU count      : {torch.cuda.device_count()}")
+if torch.cuda.is_available():
+    for i in range(torch.cuda.device_count()):
+        p = torch.cuda.get_device_properties(i)
+        print(f"  GPU {i}          : {p.name} ({p.total_memory/1e9:.1f} GB)")
+print()
+print("OK" if all_ok else "NOT OK — re-run cell 3 and restart kernel.")
+"""
+# Source of notebook cell 6: makes the repo available at REPO_ROOT, either by
+# cloning GIT_URL (option A) or by copying an attached Kaggle Dataset
+# (option B). The two options are checked independently so that a failed or
+# skipped clone still falls back to the dataset copy.
+CELL_06_REPO = """\
+# === CELL 6 — Mount the ER-MAP repo into /kaggle/working ===
+import os, subprocess, sys
+# OPTION A: clone a public GitHub fork (preferred). Edit GIT_URL.
+GIT_URL    = "https://github.com/<your-fork>/Meta_Finals.git"
+BRANCH     = "main"
+REPO_ROOT  = "/kaggle/working/Meta_Finals"
+# OPTION B: Kaggle Dataset upload — set this if you uploaded the repo
+# as a Kaggle Dataset named "ermap-source" (Add Data -> Upload).
+DATASET_DIR = "/kaggle/input/ermap-source"
+if not os.path.isdir(f"{REPO_ROOT}/ER_MAP") and "<your-fork>" not in GIT_URL:
+    print(f"Cloning {GIT_URL}@{BRANCH} -> {REPO_ROOT}...")
+    out = subprocess.run(
+        ["git", "clone", "--depth", "1", "-b", BRANCH, GIT_URL, REPO_ROOT],
+        capture_output=True, text=True,
+    )
+    print(out.stdout); print(out.stderr)
+    if out.returncode != 0:
+        print(f"git clone failed (exit code {out.returncode}); "
+              "will try the Kaggle Dataset fallback if it is attached.")
+# Checked independently of the clone branch: if the clone was skipped OR
+# failed, fall back to the attached dataset instead of going straight to
+# the assertion below.
+if not os.path.isdir(f"{REPO_ROOT}/ER_MAP") and os.path.isdir(DATASET_DIR):
+    print(f"Copying {DATASET_DIR} -> {REPO_ROOT}...")
+    import shutil
+    shutil.copytree(DATASET_DIR, REPO_ROOT, dirs_exist_ok=True)
+assert os.path.isdir(f"{REPO_ROOT}/ER_MAP"), (
+    "Repo not found.\\n"
+    " - Edit GIT_URL above to your GitHub fork, OR\\n"
+    " - Upload the repo as a Kaggle Dataset named 'ermap-source' (Add Data -> Upload)."
+)
+sys.path.insert(0, REPO_ROOT)
+sys.path.insert(0, f"{REPO_ROOT}/kaggle")
+print(f"OK. Repo at {REPO_ROOT}")
+"""
+CELL_07_SECRETS = """\
+# === CELL 7 — Wire Kaggle Secrets into env vars ===
+import os
+from kaggle_helpers import load_kaggle_secrets, kaggle_env_summary
+load_kaggle_secrets()
+kaggle_env_summary()
+# Hard fail if no Groq key — training would silently use mock LLMs.
+assert any(os.environ.get(k) for k in (
+    "GROQ_NURSE_API_KEY", "GROQ_PATIENT_API_KEY",
+    "GROQ_EMPATHY_JUDGE_API_KEY", "GROQ_MEDICAL_JUDGE_API_KEY",
+    "GROQ_API_KEY",
+)), ("No Groq key found in Kaggle Secrets. "
+     "Add at least GROQ_NURSE_API_KEY in Add-ons -> Secrets.")
+print("OK — at least one Groq key is wired.")
+"""
+CELL_08_HF = """\
+# === CELL 8 — Hugging Face Hub config (for checkpoint backup) ===
+import os
+from kaggle_helpers import push_checkpoint_to_hub, download_checkpoint_from_hub
+# EDIT the line below to your HF model id (e.g. "udayd/ermap-doctor-lora").
+HF_PUSH_REPO   = "<your-username>/ermap-doctor-lora"
+# To resume from a previous run, paste the same repo id here. Empty = fresh.
+HF_RESUME_REPO = ""
+RESUME_DIR = "/kaggle/working/checkpoints/resume"
+if HF_RESUME_REPO:
+    download_checkpoint_from_hub(HF_RESUME_REPO, RESUME_DIR)
+    contents = os.listdir(RESUME_DIR) if os.path.isdir(RESUME_DIR) else []
+    print(f"Resume dir: {contents or '(empty)'}")
+else:
+    print("Starting fresh — no resume.")
+if "<your-username>" in HF_PUSH_REPO:
+    print("\\nWARNING: HF_PUSH_REPO still has <your-username> placeholder.")
+    print("         Checkpoints will NOT be pushed to HF Hub.")
+    print("         Edit the cell above and re-run before training if you want backups.")
+"""
+CELL_09_HYPERPARAMS = """\
+# === CELL 9 — GRPO hyperparameters ===
+import os
+MODEL_NAME       = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
+GROUP_SIZE       = 2
+LEARNING_RATE    = 5e-6
+KL_BETA          = 0.04
+OUTPUT_DIR       = "/kaggle/working/er_map_grpo_checkpoints"
+PUSH_EVERY_EPS   = 20
+USE_WANDB        = False  # WANDB conflicts with protobuf 7 on Kaggle base image
+NUM_EPISODES     = 200    # hard cap; early-stop usually finishes first
+# --- Per-phase reward thresholds (constant for this run) -------------------
+# After every GRPO update we look at the last CONVERGENCE_WINDOW groups; if
+# ALL of them belong to the same current phase AND each has
+# rolling_avg_reward >= PHASE_REWARD_TARGETS[current_phase] AND
+# rolling_win_rate >= PHASE_MIN_WIN_RATE, we either:
+#   - force-promote to the next phase (Phase 1 / Phase 2), OR
+#   - terminate training (Phase 3).
+EARLY_STOP_ENABLED   = True
+PHASE_REWARD_TARGETS = {1: 1.2, 2: 1.1, 3: 1.0}
+PHASE_MIN_WIN_RATE   = 0.20
+CONVERGENCE_WINDOW   = 3
+# --- Per-episode budget controls (read by triage_env) ----------------------
+os.environ["ERMAP_MAX_EPISODE_STEPS"]      = "20"
+os.environ["ERMAP_MAX_INTERNAL_EXCHANGES"] = "5"
+# --- Groq traffic-shaping (8B for actors, 70B for judges) ------------------
+# High-volume conversational roles (Nurse + Patient) on the 8B-instant pool
+# (500K TPD, 14,400 RPD); the two judges stay on 70B-versatile because their
+# grading quality directly shapes the reward signal.
+os.environ["ERMAP_NURSE_MODEL"]            = "llama-3.1-8b-instant"
+os.environ["ERMAP_PATIENT_MODEL"]          = "llama-3.1-8b-instant"
+os.environ["ERMAP_EMPATHY_JUDGE_MODEL"]    = "llama-3.3-70b-versatile"
+os.environ["ERMAP_MEDICAL_JUDGE_MODEL"]    = "llama-3.3-70b-versatile"
+print("Hyperparameters set:")
+print(f"  NUM_EPISODES         = {NUM_EPISODES}")
+print(f"  GROUP_SIZE           = {GROUP_SIZE}")
+print(f"  PHASE_REWARD_TARGETS = {PHASE_REWARD_TARGETS}")
+print(f"  PHASE_MIN_WIN_RATE   = {PHASE_MIN_WIN_RATE}")
+print(f"  CONVERGENCE_WINDOW   = {CONVERGENCE_WINDOW}")
+print(f"  Nurse / Patient      = llama-3.1-8b-instant (actors, high-volume)")
+print(f"  Empathy / Med Judge  = llama-3.3-70b-versatile (graders, quality)")
+"""
+CELL_10_PREFLIGHT = """\
+# === CELL 10 — Pre-flight: Groq routing + key liveness ===
+# Verifies that:
+#  - each role is routed to the model you set in cell 9, and
+#  - each role's Groq key actually answers a 1-token "PING" prompt.
+import os
+from ER_MAP.envs.api_router import AgentRouter
+router = AgentRouter()
+expected = {
+    "nurse":         "llama-3.1-8b-instant",
+    "patient":       "llama-3.1-8b-instant",
+    "empathy_judge": "llama-3.3-70b-versatile",
+    "medical_judge": "llama-3.3-70b-versatile",
+}
+print("=" * 60); print("  PRE-FLIGHT — Groq routing + smoke test"); print("=" * 60)
+all_pass = True
+for role, exp in expected.items():
+    actual = router._models.get(role, "?")
+    routing_ok = (actual == exp)
+    client = router._clients.get(role)
+    if client is None:
+        print(f"  [SKIP] {role:14s} -> no Groq client (key missing)")
+        all_pass = False
+        continue
+    try:
+        resp = client.chat.completions.create(
+            model=exp,
+            messages=[{"role": "user", "content": "Reply with exactly: PING"}],
+            max_tokens=4, temperature=0,
+        )
+        api_ok = "PING" in (resp.choices[0].message.content or "").upper()
+        err = ""
+    except Exception as e:
+        api_ok = False
+        err = f" ({type(e).__name__}: {str(e)[:80]})"
+    flag = "PASS" if (routing_ok and api_ok) else "FAIL"
+    if flag == "FAIL":
+        all_pass = False
+    print(f"  [{flag}] {role:14s} -> {actual:30s} "
+          f"routing={'ok' if routing_ok else 'WRONG'}, "
+          f"api={'ok' if api_ok else 'fail'}{err}")
+print("=" * 60)
+print("OK" if all_pass else "NOT OK — fix routing/keys before training.")
+print("=" * 60)
+assert all_pass, "Pre-flight failed; do not proceed to training."
+"""
+CELL_11_DRYRUN = """\
+# === CELL 11 — Dry-run smoke test (no GPU, no model load) ===
+# Verifies the curriculum scheduler + reward verifier + per-phase early-stop
+# wiring before we burn GPU minutes on the real run.
+from ER_MAP.training.train_grpo import train
+_ = train(
+    num_episodes=8,
+    group_size=2,
+    model_name=MODEL_NAME,
+    learning_rate=LEARNING_RATE,
+    kl_beta=KL_BETA,
+    output_dir="/kaggle/working/_dryrun",
+    dry_run=True,
+    phase_reward_targets=PHASE_REWARD_TARGETS,
+    phase_min_win_rate=PHASE_MIN_WIN_RATE,
+    convergence_window=CONVERGENCE_WINDOW,
+    early_stop=EARLY_STOP_ENABLED,
+)
+print("\\nDry-run OK — scheduler + verifier + per-phase early-stop wiring is healthy.")
+"""
+CELL_12_HOOK = """\
+# === CELL 12 — Wire periodic HF Hub push into training ===
+# We monkey-patch save_lora_adapters so every checkpoint dump also pushes
+# the LoRA adapter to HF Hub. Failures are non-fatal — training keeps
+# running even if a push fails (e.g. transient HF 502).
+from ER_MAP.training import train_grpo as _tg
+_original_save = _tg.save_lora_adapters
+def save_lora_adapters_with_push(model, tokenizer, output_dir):
+    _original_save(model, tokenizer, output_dir)
+    if HF_PUSH_REPO and "<your-username>" not in HF_PUSH_REPO:
+        try:
+            push_checkpoint_to_hub(
+                output_dir, HF_PUSH_REPO,
+                commit_message=f"checkpoint @ {os.path.basename(output_dir)}",
+            )
+        except Exception as e:
+            print(f"  [hub-push] non-fatal failure: {e}")
+_tg.save_lora_adapters = save_lora_adapters_with_push
+print("Hub-push hook installed.")
+"""
+CELL_13_TRAIN_MD = """\
+## 13 · Run real training (the 4–6 hour cell)
+**Estimated wall-clock on Kaggle T4 ×2:**
+- ~3–5 min per episode (6–14 env steps × Doctor.generate + 4–8 × Groq calls)
+- ~1–2 min amortized per GRPO update (G=2 trajectories × response-token log-probs)
+- **Per-group ≈ 8–12 min** (2 episodes + 1 update)
+| Phase | Typical episodes to reach target | Wall-clock |
+|---|---|---|
+| 1 (target `+1.2` × 3) | 12 – 24 episodes (6 – 12 groups) | ~1.0 – 2.0 h |
+| 2 (target `+1.1` × 3) | 16 – 32 episodes (8 – 16 groups) | ~1.5 – 2.5 h |
+| 3 (target `+1.0` × 3) | 20 – 50 episodes (10 – 25 groups) | ~2.0 – 4.0 h |
+| **Total** | 50 – 100 episodes | **~4.5 – 8.5 h** |
+If `NUM_EPISODES=200` is exhausted before Phase 3 converges, training
+stops at the cap and the latest LoRA checkpoint is on HF Hub already
+(we push every 20 episodes), so resume in a fresh session via
+`HF_RESUME_REPO` in cell 8.
+"""
+CELL_13_TRAIN = """\
+# === CELL 13 — REAL TRAINING (4-6 h cell) ===
+metrics = train(
+    num_episodes=NUM_EPISODES,
+    group_size=GROUP_SIZE,
+    model_name=MODEL_NAME,
+    groq_api_key=os.environ.get("GROQ_NURSE_API_KEY", "")
+                  or os.environ.get("GROQ_API_KEY", ""),
+    learning_rate=LEARNING_RATE,
+    kl_beta=KL_BETA,
+    use_wandb=USE_WANDB,
+    output_dir=OUTPUT_DIR,
+    dry_run=False,
+    phase_reward_targets=PHASE_REWARD_TARGETS,
+    phase_min_win_rate=PHASE_MIN_WIN_RATE,
+    convergence_window=CONVERGENCE_WINDOW,
+    early_stop=EARLY_STOP_ENABLED,
+)
+print(f"\\nTraining returned {len(metrics)} metric records.")
+"""
+CELL_14_FINAL_PUSH = """\
+# === CELL 14 — Final push: adapters + merged fp16 ===
+FINAL_LORA_DIR   = f"{OUTPUT_DIR}/final_lora"
+FINAL_MERGED_DIR = f"{OUTPUT_DIR}/final_merged_fp16"
+if HF_PUSH_REPO and "<your-username>" not in HF_PUSH_REPO:
+    push_checkpoint_to_hub(FINAL_LORA_DIR, HF_PUSH_REPO,
+                           commit_message="final LoRA adapter")
+    if os.path.isdir(FINAL_MERGED_DIR):
+        push_checkpoint_to_hub(FINAL_MERGED_DIR, f"{HF_PUSH_REPO}-merged",
+                               commit_message="final merged fp16")
+    print(f"Final checkpoints pushed: https://huggingface.co/{HF_PUSH_REPO}")
+else:
+    print("HF_PUSH_REPO not configured — skipping final push.")
+"""
+CELL_15_PLOTS_MD = """\
+## 15 · Per-phase training graphs (one dashboard per curriculum phase)
+We render a 6-panel dashboard for **every phase that contains episodes**,
+plus a cross-phase overview and a phase-comparison bar chart. All PNGs are
+written to `er_map_grpo_checkpoints/plots/` and uploaded to HF Hub in the
+next cell so they survive Kaggle session expiry.
+Each per-phase dashboard contains:
+1. **Reward growth** — raw scatter + rolling mean (w=10) + verified rolling mean
+2. **Rolling win rate** — w=20 win-rate evolution within the phase
+3. **Outcome distribution over time** — stacked bars (WIN/PARTIAL/INCORRECT/AMA_LOSS/FATAL_LOSS)
+4. **Reward components** — mean of each component (process / treatment / empathy / labs / etc.)
+5. **GRPO update stats** — loss + KL divergence per group update
+6. **Episode length distribution** — histogram of step counts
+"""
+CELL_15_PLOTS = """\
+# === CELL 15 — Per-phase training dashboards ===
+from ER_MAP.plotting import plot_per_phase_dashboards
+from IPython.display import Image, display, Markdown
+PLOTS_DIR = f"{OUTPUT_DIR}/plots"
+written = plot_per_phase_dashboards(
+    metrics_path=f"{OUTPUT_DIR}/training_metrics.json",
+    output_dir=PLOTS_DIR,
+)
+print(f"Saved {len(written)} chart(s) to {PLOTS_DIR}:")
+for name, path in written.items():
+    size_kb = os.path.getsize(path) / 1024
+    print(f"  {name:<28s} -> {path}  ({size_kb:.0f} KB)")
+# Display each chart inline so the operator sees them without leaving Kaggle.
+ordered = (sorted(k for k in written if k.startswith("phase"))
+           + ["all_phases_overview", "all_phases_comparison"])
+for key in ordered:
+    if key not in written:
+        continue
+    display(Markdown(f"### {key.replace('_', ' ').title()}"))
+    display(Image(filename=written[key]))
+"""
+CELL_16_PUSH_PLOTS = """\
+# === CELL 16 — Push plots to HF Hub ===
+if HF_PUSH_REPO and "<your-username>" not in HF_PUSH_REPO:
+    push_checkpoint_to_hub(PLOTS_DIR, HF_PUSH_REPO,
+                           commit_message="per-phase training plots")
+    print(f"Plots pushed: https://huggingface.co/{HF_PUSH_REPO}/tree/main")
+else:
+    print("HF_PUSH_REPO not configured — plots stay only in /kaggle/working/.")
+"""
+CELL_17_INFER_MD = """\
+## 17 · (Optional) Inference smoke-test on the trained model
+Catches the classic 'merge path looked OK but the saved model emits garbage'
+failure mode before the demo.
+"""
+CELL_17_INFER = """\
+# === CELL 17 — Inference smoke-test on the trained model ===
+from ER_MAP.training.train_grpo import generate_doctor_action, load_model_and_tokenizer
+from peft import PeftModel
+base_model, tok = load_model_and_tokenizer(model_name=MODEL_NAME)
+trained = PeftModel.from_pretrained(base_model, FINAL_LORA_DIR)
+test_obs = (
+    '{"event":"episode_start","nurse_experience":"veteran",'
+    '"message":"Patient with chest pain, HR 120, BP 90/60, vague history.",'
+    '"soap_summary":{}}'
+)
+for i in range(3):
+    print(f"\\n--- Sample {i+1} ---")
+    print(generate_doctor_action(trained, tok, test_obs, max_new_tokens=160))
+"""
+# ---------------------------------------------------------------------------
+# Quickstart markdown (sibling file)
+# ---------------------------------------------------------------------------
+QUICKSTART_MD = """\
+# Kaggle Quickstart — ER-MAP GRPO Training (v3 stable)
+The Kaggle notebook is in `kaggle/train_ermap_grpo_kaggle.ipynb`. This file
+is the cheat sheet for running it end-to-end without the dependency hell
+that bit us in earlier attempts.
+## 0. Prerequisites (one-time)
+1. **GitHub fork** of this repo. The notebook clones from a public fork at
+   cell 6 — edit `GIT_URL`. Alternatively, upload the repo as a Kaggle
+   Dataset named `ermap-source` (Add Data → Upload).
+2. **Hugging Face write token** (`HF_TOKEN`) for pushing the trained
+   adapter. Create at https://huggingface.co/settings/tokens (fine-grained,
+   write access on a single model repo is enough).
+3. **Five Groq keys** (one each for Nurse / Patient / Empathy Judge /
+   Medical Judge / shared fallback). Free-tier accounts are fine; the
+   per-account limits multiply across keys.
+## 1. Create the Kaggle notebook
+1. Sign in to https://www.kaggle.com/code → **New Notebook**.
+2. Right sidebar:
+   - Accelerator: **GPU T4 ×2** (or P100)
+   - Internet: **On**
+   - Persistence: Files only
+3. **File → Upload Notebook** → choose `kaggle/train_ermap_grpo_kaggle.ipynb`
+   from this repo.
+## 2. Add Kaggle Secrets
+Add-ons → Secrets → Add a new secret. Required labels (exactly):
+| Label | Value |
+|---|---|
+| `GROQ_NURSE_API_KEY` | your nurse Groq key |
+| `GROQ_PATIENT_API_KEY` | your patient Groq key |
+| `GROQ_EMPATHY_JUDGE_API_KEY` | your empathy-judge Groq key |
+| `GROQ_MEDICAL_JUDGE_API_KEY` | your medical-judge Groq key |
+| `HF_TOKEN` | your HF write token |
+| `WANDB_API_KEY` *(optional)* | your W&B key (skip — disabled by default) |
+The notebook reads them via `kaggle_helpers.load_kaggle_secrets()` and
+exports them as env vars.
+## 3. Edit two placeholders in the notebook
+- **Cell 6:** `GIT_URL = "https://github.com/<your-fork>/Meta_Finals.git"`
+- **Cell 8:** `HF_PUSH_REPO = "<your-username>/ermap-doctor-lora"`
+If you uploaded the repo as a Kaggle Dataset instead, leave `GIT_URL` as the
+placeholder — cell 6 will detect `/kaggle/input/ermap-source` and copy from
+there.
+## 4. Run order (the only sequence that works)
+| Cell | What it does | Expected output |
+|---|---|---|
+| 2 | GPU + disk + python + internet sanity check | GPU listed, disk free > 8 GB |
+| 3 | **REPAIR** — pin torch 2.10 cu128, reinstall bitsandbytes, upgrade unsloth | `REPAIR OK` (or `RESTART REQUIRED`) |
+| **(restart)** | If cell 3 said RESTART REQUIRED → Run → Restart kernel | — |
+| 5 | Post-restart import verify | All `OK`, GPUs listed |
+| 6 | Clone / mount the repo | `OK. Repo at /kaggle/working/Meta_Finals` |
+| 7 | Wire Kaggle Secrets → env vars | `OK — at least one Groq key is wired` |
+| 8 | HF Hub config | `Starting fresh — no resume.` |
+| 9 | Hyperparameters (P1=+1.2, P2=+1.1, P3=+1.0) | thresholds printed |
+| 10 | **Pre-flight** — Groq routing + 4× PING | 4× `[PASS]`, then `OK` |
+| 11 | Dry-run smoke test (no GPU) | `Dry-run OK` |
+| 12 | Wire HF push hook | `Hub-push hook installed.` |
+| 13 | **REAL TRAINING** (4–6 h) | per-group rolling stats, eventual `EARLY STOP` |
+| 14 | Final push to HF | `Final checkpoints pushed: https://huggingface.co/...` |
+| 15 | Per-phase plots | 5 PNGs displayed inline |
+| 16 | Push plots to HF | `Plots pushed: ...` |
+| 17 | Inference smoke-test (optional) | 3 sample Doctor actions printed |
+## 5. Common failures & fixes
+| Symptom | Root cause | Fix |
+|---|---|---|
+| `numpy was upgraded mid-session` | numpy import poisoned by a previous cell | Restart kernel, re-run from cell 3 |
+| `Pillow incompatible with torchvision` | Pillow ABI mismatch | Restart kernel, re-run from cell 3 |
+| `PyTorch and torchvision compiled with different CUDA major` | torch upgraded to cu13 by a transient resolve | Re-run cell 3 (it pins cu128) and restart |
+| `cannot import name 'create_gradient_checkpointing_buffer'` | unsloth ↔ unsloth_zoo version drift | Re-run cell 3 (upgrades both in lockstep) |
+| `libnvJitLink.so.13 missing` | bitsandbytes built against different CUDA | Re-run cell 3 (force-reinstalls bitsandbytes after torch pin) |
+| Disk usage > quota | Kaggle's 20 GB working partition fills up | First line of cell 3 cleans `/tmp` and pip cache |
+| Pre-flight `[FAIL]` for a role | Groq key dead / quota exceeded | Generate a new key in console.groq.com → update Kaggle Secret → re-run cell 7+10 |
+| `[FAIL]` says `routing=WRONG` | env var not set when `AgentRouter()` was constructed | Re-run cell 9 BEFORE cell 10 |
+| Training freezes at episode 1 for >10 min | Doctor.generate hung; Unsloth import broke silently | Check cell 5 output for `unsloth` line; restart kernel and re-run cell 3 if missing |
+## 6. What the trained model gives you
+After cell 13 finishes (or hits the 12 h Kaggle session cap), you have:
+- `OUTPUT_DIR/final_lora/` — LoRA adapter weights (~50 MB), pushed to
+  `HF_PUSH_REPO`
+- `OUTPUT_DIR/final_merged_fp16/` — full Llama-3.1-8B fp16 merge with the
+  adapter applied (~16 GB), pushed to `HF_PUSH_REPO-merged`
+- `OUTPUT_DIR/training_metrics.json` — per-episode rewards, outcomes,
+  rolling stats — input for the per-phase plots
+- `OUTPUT_DIR/plots/*.png` — 5 dashboards (one per phase + cross-phase
+  overview + comparison bar)
+Use the LoRA adapter for the demo (quick to load, runs on a 4050 6 GB at
+~30 tok/s); use the merged fp16 if you need to host on a Vercel/HF Space
+without `peft`.
+"""
+# ---------------------------------------------------------------------------
+# Build the notebook
+# ---------------------------------------------------------------------------
+def build_notebook() -> dict:
+    """Assemble the 20-cell Kaggle notebook and return it as a plain dict.
+    The result holds the ordered cell list, a Python 3 kernelspec plus
+    language_info block, and the nbformat version pair (4, 5). ``main``
+    serialises this dict to JSON.
+    """
+    # (factory, source) pairs in notebook order. The CELL_NN prefix of each
+    # constant is the number shown to the user, not the list position:
+    # a markdown intro and the code cell it introduces share one number
+    # (13, 15 and 17).
+    layout = [
+        (md_cell, CELL_01_TITLE),
+        (code_cell, CELL_02_SANITY),
+        (code_cell, CELL_03_REPAIR),
+        (md_cell, CELL_04_RESTART),
+        (code_cell, CELL_05_VERIFY),
+        (code_cell, CELL_06_REPO),
+        (code_cell, CELL_07_SECRETS),
+        (code_cell, CELL_08_HF),
+        (code_cell, CELL_09_HYPERPARAMS),
+        (code_cell, CELL_10_PREFLIGHT),
+        (code_cell, CELL_11_DRYRUN),
+        (code_cell, CELL_12_HOOK),
+        (md_cell, CELL_13_TRAIN_MD),
+        (code_cell, CELL_13_TRAIN),
+        (code_cell, CELL_14_FINAL_PUSH),
+        (md_cell, CELL_15_PLOTS_MD),
+        (code_cell, CELL_15_PLOTS),
+        (code_cell, CELL_16_PUSH_PLOTS),
+        (md_cell, CELL_17_INFER_MD),
+        (code_cell, CELL_17_INFER),
+    ]
+    cells = [make_cell(source) for make_cell, source in layout]
+    kernelspec = {
+        "display_name": "Python 3",
+        "language": "python",
+        "name": "python3",
+    }
+    language_info = {
+        "name": "python",
+        "version": "3.10",
+    }
+    # Keys are inserted in the same order as before so the serialised JSON
+    # keeps its field order.
+    notebook = {"cells": cells}
+    notebook["metadata"] = {
+        "kernelspec": kernelspec,
+        "language_info": language_info,
+    }
+    notebook["nbformat"] = 4
+    notebook["nbformat_minor"] = 5
+    return notebook
+def main() -> None:
+    """Regenerate the notebook and the quickstart file next to this script.
+    Writes ``train_ermap_grpo_kaggle.ipynb`` (JSON, indent=1, non-ASCII
+    characters kept verbatim) and ``KAGGLE_QUICKSTART.md`` as UTF-8 into the
+    directory containing this file, then prints a one-line summary of each.
+    """
+    target_dir = Path(__file__).parent
+    nb_path = target_dir / "train_ermap_grpo_kaggle.ipynb"
+    qs_path = target_dir / "KAGGLE_QUICKSTART.md"
+    nb = build_notebook()
+    payload = json.dumps(nb, indent=1, ensure_ascii=False)
+    nb_path.write_text(payload, encoding="utf-8")
+    qs_path.write_text(QUICKSTART_MD, encoding="utf-8")
+    # Tally cell kinds once for the summary line.
+    kinds = [cell["cell_type"] for cell in nb["cells"]]
+    n_md = kinds.count("markdown")
+    n_code = kinds.count("code")
+    print(f"Wrote {nb_path}  ({len(nb['cells'])} cells: {n_md} md / {n_code} code)")
+    print(f"Wrote {qs_path}  ({len(QUICKSTART_MD.splitlines())} lines)")
+if __name__ == "__main__":
+    main()

kaggle/train_ermap_grpo_kaggle.ipynb CHANGED Viewed

@@ -4,33 +4,45 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# ER-MAP — Doctor Agent GRPO Training (Kaggle Free-Tier)\n",
-    "\n",
-    "**Target hardware:** Tesla T4 16 GB (or P100 16 GB) — Kaggle's free GPU.\n",
-    "\n",
-    "**What this notebook does:**\n",
-    "1. Clones / mounts the ER-MAP repo\n",
-    "2. Installs the missing pieces (Unsloth, TRL, Groq, HF Hub) on top of Kaggle's pre-baked PyTorch image\n",
-    "3. Loads Llama-3.1-8B in 4-bit + LoRA(r=16) via Unsloth (~7 GB VRAM)\n",
-    "4. Runs the manual GRPO loop from `ER_MAP/training/train_grpo.py` with 3-phase curriculum learning\n",
-    "5. Pushes LoRA adapter checkpoints to a Hugging Face Hub repo every 20 episodes so the 12-hour Kaggle session limit doesn't lose progress\n",
-    "\n",
-    "**Required Kaggle Secrets** (Add-ons → Secrets):\n",
-    "- `GROQ_NURSE_API_KEY`, `GROQ_PATIENT_API_KEY`, `GROQ_EMPATHY_JUDGE_API_KEY`, `GROQ_MEDICAL_JUDGE_API_KEY` — for the multi-agent env actors and judges\n",
-    "- `HF_TOKEN` — to push checkpoints (use a fine-grained write token)\n",
-    "- `WANDB_API_KEY` — *optional*, for the reward-growth chart\n",
-    "\n",
-    "**Notebook settings (right sidebar):**\n",
-    "- Accelerator: **GPU T4 x2** (or P100)\n",
-    "- Internet: **On** *(Groq calls require this)*\n",
-    "- Persistence: Files only"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1 · Sanity check the GPU + clone the repo"
    ]
   },
   {
@@ -39,8 +51,38 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv\n",
-    "!python -c \"import torch; print('torch', torch.__version__, 'cuda', torch.cuda.is_available())\""
    ]
   },
   {
@@ -49,47 +91,159 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# --- OPTION A: clone the public repo (preferred) ----------------------\n",
-    "# Replace <your-github-fork> with your actual fork URL. Public fork\n",
-    "# works without any token. For a private repo, set HF_TOKEN OR pass\n",
-    "# a GH PAT via Kaggle Secrets.\n",
-    "GIT_URL = \"https://github.com/<your-github-fork>/Meta_Finals.git\"\n",
-    "BRANCH  = \"main\"\n",
-    "REPO_ROOT = \"/kaggle/working/Meta_Finals\"\n",
-    "\n",
-    "import os, subprocess\n",
-    "if not os.path.isdir(f\"{REPO_ROOT}/ER_MAP\") and \"<your-github-fork>\" not in GIT_URL:\n",
-    "    print(subprocess.check_output(\n",
-    "        [\"git\", \"clone\", \"--depth\", \"1\", \"-b\", BRANCH, GIT_URL, REPO_ROOT],\n",
-    "        stderr=subprocess.STDOUT,\n",
-    "    ).decode())\n",
-    "\n",
-    "# --- OPTION B: dataset upload (if you don't want to push to GitHub) ---\n",
-    "# 1. Locally: zip the repo (excluding .git, checkpoints, __pycache__).\n",
-    "# 2. Kaggle: New Dataset -> upload the zip -> name it `ermap-source`.\n",
-    "# 3. This notebook: Add Data -> ermap-source.\n",
-    "# 4. Run the next cell to copy /kaggle/input/ermap-source/ into\n",
-    "#    /kaggle/working/Meta_Finals (writeable).\n",
-    "DATASET_DIR = \"/kaggle/input/ermap-source\"\n",
-    "if not os.path.isdir(f\"{REPO_ROOT}/ER_MAP\") and os.path.isdir(DATASET_DIR):\n",
-    "    import shutil\n",
-    "    shutil.copytree(DATASET_DIR, REPO_ROOT, dirs_exist_ok=True)\n",
-    "    print(f\"Copied {DATASET_DIR} -> {REPO_ROOT}\")\n",
-    "\n",
-    "assert os.path.isdir(f\"{REPO_ROOT}/ER_MAP\"), (\n",
-    "    \"Repo not found. Either set GIT_URL above (Option A) or upload the \"\n",
-    "    \"repo as a Kaggle Dataset named 'ermap-source' (Option B).\"\n",
-    ")\n",
-    "print(\"Repo ready at\", REPO_ROOT)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 2 · Install the missing dependencies\n",
     "\n",
-    "Kaggle's GPU image already ships with PyTorch 2.x + CUDA 12 + transformers + accelerate + peft + bitsandbytes. We only add Unsloth (which pins matching xformers/triton), TRL, Gymnasium, Groq SDK, and the HF Hub client."
    ]
   },
   {
@@ -98,26 +252,46 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# --upgrade is critical: Kaggle's pre-baked layer often ships an\n",
-    "# OLD `unsloth` paired with whatever fresh `unsloth_zoo` pip pulled\n",
-    "# this morning, and the import then fails with:\n",
-    "#   ImportError: cannot import name 'create_gradient_checkpointing_buffer'\n",
-    "# Forcing both packages to upgrade in one resolve pass keeps them in lockstep.\n",
-    "!pip install -q --upgrade -r {REPO_ROOT}/kaggle/requirements_kaggle.txt\n",
-    "# Sanity check the unsloth import — it's the most fragile dep on Kaggle.\n",
-    "# If you see the gradient_checkpointing ImportError below, run:\n",
-    "#   !pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo\n",
-    "# in a NEW cell, then RESTART the kernel and re-run from cell 2.\n",
-    "!python -c \"import unsloth, unsloth_zoo; print('unsloth', unsloth.__version__, '| unsloth_zoo', unsloth_zoo.__version__)\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3 · Wire Kaggle Secrets into env vars\n",
-    "\n",
-    "ER-MAP reads `GROQ_NURSE_API_KEY` / `GROQ_PATIENT_API_KEY` / etc. directly from `os.environ`. The helper below copies your Kaggle Secrets into those env vars in one shot."
    ]
   },
   {
@@ -126,28 +300,63 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import sys, os\n",
-    "sys.path.insert(0, REPO_ROOT)               # so we can import ER_MAP\n",
-    "sys.path.insert(0, f\"{REPO_ROOT}/kaggle\")   # so we can import kaggle_helpers\n",
-    "\n",
-    "from kaggle_helpers import (\n",
-    "    load_kaggle_secrets,\n",
-    "    kaggle_env_summary,\n",
-    "    push_checkpoint_to_hub,\n",
-    "    download_checkpoint_from_hub,\n",
     ")\n",
     "\n",
-    "load_kaggle_secrets()\n",
-    "kaggle_env_summary()"
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4 · (Optional) Resume from a previous Kaggle session\n",
     "\n",
-    "If you've trained before and pushed an adapter to HF Hub, set `HF_RESUME_REPO` and run the cell to pull the latest LoRA adapter into `/kaggle/working/checkpoints/resume/`. The training cell will pick it up automatically."
    ]
   },
   {
@@ -156,68 +365,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "HF_PUSH_REPO   = \"<your-username>/ermap-doctor-lora\"   # where checkpoints will be pushed\n",
-    "HF_RESUME_REPO = \"\"  # e.g. \"<your-username>/ermap-doctor-lora\"; leave empty to start fresh\n",
     "\n",
     "RESUME_DIR = \"/kaggle/working/checkpoints/resume\"\n",
     "if HF_RESUME_REPO:\n",
     "    download_checkpoint_from_hub(HF_RESUME_REPO, RESUME_DIR)\n",
-    "    print(\"Resume dir contents:\", os.listdir(RESUME_DIR) if os.path.isdir(RESUME_DIR) else \"(empty)\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 5 · Configure the GRPO run\n",
-    "\n",
-    "The defaults below are tuned for **one 12-hour Kaggle session** on a single T4. They produce a clean upward reward-growth curve through Phase 1 + early Phase 2; if you have a second session, lower `--episodes` is fine because LoRA adapters resume cleanly via Step 4 above.\n",
-    "\n",
-    "| Parameter | Value | Reason |\n",
-    "|---|---|---|\n",
-    "| Model | `unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit` | Llama-3-family small-tier, 4-bit ~5 GB |\n",
-    "| LoRA rank | 16 | balances expressivity vs speed on T4 |\n",
-    "| Group size G | 2 | Kaggle's T4 fits G=2 comfortably; G=4 needs 30+ min/group |\n",
-    "| Episodes (cap) | 120 | hard cap; early-stop usually finishes first |\n",
-    "| LR | 5e-6 | conservative, prevents catastrophic forgetting on small group |\n",
-    "| KL beta | 0.04 | matches the paper's recipe; restrains drift from base policy |\n",
-    "| Max episode steps | 20 | matches `triage_env.py` default |\n",
-    "| Internal exchanges | 5 | shorter than default (8) to fit within 12 h budget |\n",
-    "\n",
-    "### Train-until-optimal (per-phase reward thresholds)\n",
-    "\n",
-    "Training **never** runs for a fixed episode budget. After every GRPO update we look at the last `CONVERGENCE_WINDOW=3` groups; if **all three** belong to the same current phase AND each has `rolling_avg_reward >= PHASE_REWARD_TARGETS[current_phase]`, we either:\n",
-    "\n",
-    "- **Phase 1 / Phase 2** → force-promote to the next curriculum phase (the buffer is then cleared so stale entries don't satisfy the next phase's check).\n",
-    "- **Phase 3** → terminate training (this is the 'optimal rewards constantly received' criterion).\n",
-    "\n",
-    "Why per-phase, not a single global bar? The phases are not equally difficult — Phase 1 wins are worth ~`+2.0` on the reward scale (full terminal_win on a clean SOAP) while Phase 3 wins routinely cost `~0.5` in consent / empathy friction even when the diagnosis is correct. A single global `+1.5` would either gate Phase 3 too aggressively or pass Phase 1 with garbage Phase-2 behavior.\n",
-    "\n",
-    "| Phase | Default target | Why this number | Action when met |\n",
-    "|---|---|---|---|\n",
-    "| 1 — Tool Mastery | `+1.5` | A Phase-1 episode that uses tools cleanly + discharges with the correct treatment lands at `+1.6 .. +2.0`. Sustaining `+1.5` means the model has tool-format down. | force-promote to Phase 2 |\n",
-    "| 2 — Clinical Reasoning | `+1.2` | Phase 2 adds noisy SOAP and mixed compliance. A solid clinician policy lands at `+1.2 .. +1.5`. | force-promote to Phase 3 |\n",
-    "| 3 — Empathetic Negotiation | `+1.0` | Phase 3 imposes empathy + consent costs (`-0.3..-0.6` per episode even on wins). Sustained `+1.0` here is genuinely hard and is the hackathon success criterion. | END TRAINING |\n",
-    "\n",
-    "| Knob | Default | Meaning |\n",
-    "|---|---|---|\n",
-    "| `PHASE_REWARD_TARGETS` | `{1: 1.5, 2: 1.2, 3: 1.0}` | per-phase sustained rolling-avg-reward bar |\n",
-    "| `PHASE_MIN_WIN_RATE` | `0.20` | soft floor on rolling win rate (sanity check) |\n",
-    "| `CONVERGENCE_WINDOW` | `3` | how many consecutive groups must hit the bar |\n",
-    "| `EARLY_STOP_ENABLED` | `True` | set `False` to always burn the full `NUM_EPISODES` budget |\n",
-    "\n",
-    "### Estimated wall-clock per phase on Kaggle T4 ×2\n",
-    "\n",
-    "Each episode = 6–14 env steps × (Doctor.generate ≈ 2–3 s) + 4–8 Groq calls (≈ 0.4–1.0 s each). One GRPO update over `G=2` trajectories = 1 forward + 1 backward over response tokens ≈ 60–120 s on T4. Net per-group wall-clock ≈ **8–12 minutes**.\n",
-    "\n",
-    "| Phase | Typical episodes to reach target | Wall-clock | Notes |\n",
-    "|---|---|---|---|\n",
-    "| 1 (target `+1.5` × 3) | 16 – 30 episodes (8 – 15 groups) | **~1.5 – 2.5 h** | Easy patients + clean SOAP — tool-format is the only thing the model has to learn. |\n",
-    "| 2 (target `+1.2` × 3) | 24 – 40 episodes (12 – 20 groups) | **~2.0 – 3.5 h** | Mixed-compliance patients + noisy SOAP. Bulk of the policy improvement happens here. |\n",
-    "| 3 (target `+1.0` × 3) | 30 – 60 episodes (15 – 30 groups) | **~2.5 – 5.0 h** | Hard patients + empathy/consent costs. May not converge in 12 h on a fresh base; that's why `NUM_EPISODES=120` is the hard cap. |\n",
-    "| **Total** | 70 – 130 episodes | **~6 – 11 h** | Fits inside Kaggle's 12 h GPU session with ~1 h margin. |\n",
     "\n",
-    "If Phase 3 doesn't converge before the 12 h limit, your latest LoRA checkpoint is already on HF Hub (we push every 20 episodes), so just resume in a fresh session via `HF_RESUME_REPO`."
    ]
   },
   {
@@ -226,65 +394,110 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# --- Training hyperparameters ---\n",
     "MODEL_NAME       = \"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit\"\n",
-    "NUM_EPISODES     = 120          # HARD CAP; early-stop usually finishes first\n",
     "GROUP_SIZE       = 2\n",
     "LEARNING_RATE    = 5e-6\n",
     "KL_BETA          = 0.04\n",
     "OUTPUT_DIR       = \"/kaggle/working/er_map_grpo_checkpoints\"\n",
     "PUSH_EVERY_EPS   = 20\n",
-    "USE_WANDB        = bool(os.environ.get(\"WANDB_API_KEY\"))\n",
-    "\n",
-    "# --- Early-stopping (per-phase reward thresholds) ---\n",
-    "# After every GRPO update, we check the last CONVERGENCE_WINDOW groups.\n",
-    "# If ALL of them are in the SAME current phase AND each has\n",
-    "# rolling_avg_reward >= PHASE_REWARD_TARGETS[current_phase], we either:\n",
-    "#   - force-promote to the next phase (Phase 1, Phase 2), OR\n",
     "#   - terminate training (Phase 3).\n",
-    "# Baseline (un-trained Groq Doctor) avg reward by phase:\n",
-    "#   P1=+0.76, P2=+0.59, P3=+0.39\n",
-    "# So the +1.5/+1.2/+1.0 bar = 2.0x / 2.0x / 2.6x improvement.\n",
-    "EARLY_STOP_ENABLED       = True\n",
-    "PHASE_REWARD_TARGETS     = {1: 1.5, 2: 1.2, 3: 1.0}\n",
-    "PHASE_MIN_WIN_RATE       = 0.20  # soft floor; +1.0 reward implies >=20% wins\n",
-    "CONVERGENCE_WINDOW       = 3     # 3 consecutive groups must qualify\n",
-    "\n",
-    "# --- Per-episode budget controls (passed via env vars) ---\n",
     "os.environ[\"ERMAP_MAX_EPISODE_STEPS\"]      = \"20\"\n",
     "os.environ[\"ERMAP_MAX_INTERNAL_EXCHANGES\"] = \"5\"\n",
-    "# Doctor-on-Kaggle is the LOCAL trained model, NOT a Groq call. The\n",
-    "# Doctor's Groq key is therefore unused here, but Nurse / Patient /\n",
-    "# Empathy Judge / Medical Judge all hit Groq once per env step.\n",
-    "# Traffic-shaping: high-volume roleplay agents (Nurse + Patient) on the\n",
-    "# 8B-instant pool (500K TPD, 14,400 RPD); the two judges stay on 70B-\n",
-    "# versatile because their grading quality directly shapes the reward.\n",
     "os.environ[\"ERMAP_NURSE_MODEL\"]            = \"llama-3.1-8b-instant\"\n",
     "os.environ[\"ERMAP_PATIENT_MODEL\"]          = \"llama-3.1-8b-instant\"\n",
     "os.environ[\"ERMAP_EMPATHY_JUDGE_MODEL\"]    = \"llama-3.3-70b-versatile\"\n",
     "os.environ[\"ERMAP_MEDICAL_JUDGE_MODEL\"]    = \"llama-3.3-70b-versatile\"\n",
     "\n",
-    "# Sanity: at least one Groq key must be present, otherwise the env\n",
-    "# falls back to mock responses and the trained model won't see\n",
-    "# realistic dialogue.\n",
-    "assert any(\n",
-    "    os.environ.get(k) for k in [\n",
-    "        \"GROQ_NURSE_API_KEY\", \"GROQ_PATIENT_API_KEY\",\n",
-    "        \"GROQ_EMPATHY_JUDGE_API_KEY\", \"GROQ_MEDICAL_JUDGE_API_KEY\",\n",
-    "        \"GROQ_API_KEY\",\n",
-    "    ]\n",
-    "), (\"No Groq key found in Kaggle Secrets — add at least \"\n",
-    "    \"GROQ_NURSE_API_KEY before running training.\")\n",
-    "print(\"Hyperparameters and env vars set.\")"
    ]
   },
   {
-   "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 6 · Dry-run smoke test (no GPU, no model load)\n",
     "\n",
-    "Verifies the curriculum scheduler + reward verifier + metrics logger are wired correctly **before** burning GPU minutes on a real run."
    ]
   },
   {
@@ -293,6 +506,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from ER_MAP.training.train_grpo import train\n",
     "\n",
     "_ = train(\n",
@@ -303,17 +520,12 @@
     "    kl_beta=KL_BETA,\n",
     "    output_dir=\"/kaggle/working/_dryrun\",\n",
     "    dry_run=True,\n",
     ")\n",
-    "print(\"Dry-run OK — scheduler + verifier wiring is healthy.\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 7 · Wire periodic HF-Hub push into the training loop\n",
-    "\n",
-    "We monkey-patch `save_lora_adapters` so every checkpoint dump also pushes to HF Hub. Failures are non-fatal — training keeps running even if the push fails."
    ]
   },
   {
@@ -322,18 +534,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from ER_MAP.training import train_grpo as _tg\n",
     "_original_save = _tg.save_lora_adapters\n",
-    "_episode_marker = {\"n\": 0}\n",
     "\n",
     "def save_lora_adapters_with_push(model, tokenizer, output_dir):\n",
     "    _original_save(model, tokenizer, output_dir)\n",
-    "    _episode_marker[\"n\"] += 1\n",
     "    if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
     "        try:\n",
     "            push_checkpoint_to_hub(\n",
-    "                output_dir,\n",
-    "                HF_PUSH_REPO,\n",
     "                commit_message=f\"checkpoint @ {os.path.basename(output_dir)}\",\n",
     "            )\n",
     "        except Exception as e:\n",
@@ -347,18 +561,25 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 8 · Run real training (the 6-11 hour cell)\n",
-    "\n",
-    "With the per-phase early-stop targets `{1: +1.5, 2: +1.2, 3: +1.0}` set above, expect:\n",
-    "\n",
-    "- ~3-5 minutes per episode (6-14 env steps × Doctor.generate + 4-8 × Groq calls)\n",
-    "- ~1-2 minutes amortized per GRPO update (G=2 trajectories × response-token log-probs)\n",
-    "- **Per-group wall-clock ≈ 8-12 min** (2 episodes + 1 update)\n",
-    "- **Phase 1 → Phase 2 force-promote** typically lands at **episode 16-30** (sustained `+1.5` × 3 groups)\n",
-    "- **Phase 2 → Phase 3 force-promote** typically lands at **episode 40-70**\n",
-    "- **Phase 3 EARLY STOP** typically lands at **episode 70-130** (sustained `+1.0` × 3 groups)\n",
-    "- Reward-growth signal (rolling avg) becomes visible after ~episode 20\n",
-    "- If `NUM_EPISODES=120` is exhausted before Phase 3 converges, training stops at the cap and the latest checkpoint is on HF Hub — resume in a fresh session via `HF_RESUME_REPO`."
    ]
   },
   {
@@ -367,31 +588,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "metrics = train(\n",
     "    num_episodes=NUM_EPISODES,\n",
     "    group_size=GROUP_SIZE,\n",
     "    model_name=MODEL_NAME,\n",
-    "    groq_api_key=os.environ.get(\"GROQ_NURSE_API_KEY\", \"\") or os.environ.get(\"GROQ_API_KEY\", \"\"),\n",
     "    learning_rate=LEARNING_RATE,\n",
     "    kl_beta=KL_BETA,\n",
     "    use_wandb=USE_WANDB,\n",
     "    output_dir=OUTPUT_DIR,\n",
     "    dry_run=False,\n",
-    "    # ----- Per-phase early-stop ('train until optimal rewards are constantly received') -----\n",
     "    phase_reward_targets=PHASE_REWARD_TARGETS,\n",
     "    phase_min_win_rate=PHASE_MIN_WIN_RATE,\n",
     "    convergence_window=CONVERGENCE_WINDOW,\n",
     "    early_stop=EARLY_STOP_ENABLED,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 9 · Final push: adapters + merged fp16 weights\n",
-    "\n",
-    "The training loop already wrote `final_lora/` and `final_merged_fp16/` to `OUTPUT_DIR`. We push both to HF Hub so you can serve them from Vercel / a HF Space without re-running training."
    ]
   },
   {
@@ -400,20 +614,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "FINAL_LORA_DIR   = f\"{OUTPUT_DIR}/final_lora\"\n",
     "FINAL_MERGED_DIR = f\"{OUTPUT_DIR}/final_merged_fp16\"\n",
     "\n",
     "if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
-    "    push_checkpoint_to_hub(\n",
-    "        FINAL_LORA_DIR, HF_PUSH_REPO,\n",
-    "        commit_message=\"final LoRA adapter\",\n",
-    "    )\n",
     "    if os.path.isdir(FINAL_MERGED_DIR):\n",
-    "        push_checkpoint_to_hub(\n",
-    "            FINAL_MERGED_DIR, f\"{HF_PUSH_REPO}-merged\",\n",
-    "            commit_message=\"final merged fp16\",\n",
-    "        )\n",
-    "    print(\"Final checkpoints pushed.\")\n",
     "else:\n",
     "    print(\"HF_PUSH_REPO not configured — skipping final push.\")"
    ]
@@ -422,16 +633,20 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 10 · Per-phase training graphs (one dashboard per curriculum phase)\n",
     "\n",
-    "We render a complete 6-panel dashboard for every phase that contains episodes, plus a cross-phase overview and a phase-comparison bar chart. All PNGs are written to `er_map_grpo_checkpoints/plots/` and uploaded to HF Hub at the end of the notebook so they survive session expiry.\n",
     "\n",
-    "**Each per-phase dashboard contains:**\n",
     "1. **Reward growth** — raw scatter + rolling mean (w=10) + verified rolling mean\n",
-    "2. **Rolling win rate** — w=20 win rate evolution within the phase\n",
-    "3. **Outcome distribution over time** — stacked bars (WIN/PARTIAL/INCORRECT/AMA_LOSS/FATAL_LOSS) per episode bin\n",
-    "4. **Reward components** — mean of each component (process / treatment / empathy / labs / etc.) within the phase\n",
-    "5. **GRPO update statistics** — loss + KL divergence per group update\n",
     "6. **Episode length distribution** — histogram of step counts"
    ]
   },
@@ -441,6 +656,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from ER_MAP.plotting import plot_per_phase_dashboards\n",
     "from IPython.display import Image, display, Markdown\n",
     "\n",
@@ -450,54 +666,44 @@
     "    output_dir=PLOTS_DIR,\n",
     ")\n",
     "\n",
-    "print(f\"Saved {len(written)} chart(s):\")\n",
     "for name, path in written.items():\n",
     "    size_kb = os.path.getsize(path) / 1024\n",
     "    print(f\"  {name:<28s} -> {path}  ({size_kb:.0f} KB)\")\n",
     "\n",
-    "# Display each chart inline in the notebook so the operator sees them\n",
-    "# without leaving Kaggle. Order: per-phase dashboards first (1, 2, 3),\n",
-    "# then the cross-phase overview, then the bar comparison.\n",
-    "ordered_keys = (\n",
-    "    sorted(k for k in written if k.startswith(\"phase\")) +\n",
-    "    [\"all_phases_overview\", \"all_phases_comparison\"]\n",
-    ")\n",
-    "for key in ordered_keys:\n",
     "    if key not in written:\n",
     "        continue\n",
     "    display(Markdown(f\"### {key.replace('_', ' ').title()}\"))\n",
     "    display(Image(filename=written[key]))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 10b · Push the plots to HF Hub (so they survive session expiry)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
-    "    push_checkpoint_to_hub(\n",
-    "        PLOTS_DIR, HF_PUSH_REPO,\n",
-    "        commit_message=\"per-phase training plots\",\n",
-    "    )\n",
     "else:\n",
-    "    print(\"HF_PUSH_REPO not configured \u2014 plots stay only in /kaggle/working/.\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 11 · (Optional) Inference smoke-test on the trained model\n",
     "\n",
-    "Catches the classic \"merge path looked OK but the saved model emits garbage\" failure mode before the demo."
    ]
   },
   {
@@ -506,6 +712,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from ER_MAP.training.train_grpo import generate_doctor_action, load_model_and_tokenizer\n",
     "from peft import PeftModel\n",
     "\n",
@@ -536,4 +743,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}

    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "# ER-MAP — Doctor Agent GRPO Training (Kaggle Free-Tier · v3 stable)\n",
+    "\n",
+    "Trains the **Doctor LLM** (Llama-3.1-8B-Instruct, 4-bit + LoRA r=16) via GRPO\n",
+    "with a 3-phase curriculum on Kaggle's free GPU. Designed to survive Kaggle's\n",
+    "pre-baked image quirks (numpy / Pillow ABI mismatches, torch + torchvision\n",
+    "CUDA-major mismatches, transient `unsloth_zoo` upgrades).\n",
+    "\n",
+    "## TL;DR — How to run this notebook\n",
+    "\n",
+    "1. **Notebook settings (right sidebar):**\n",
+    "   - Accelerator: **GPU T4 ×2** (or P100)\n",
+    "   - Internet: **On**\n",
+    "   - Persistence: Files only\n",
+    "2. **Kaggle Secrets** (Add-ons → Secrets):\n",
+    "   - **Required:** `GROQ_NURSE_API_KEY`, `GROQ_PATIENT_API_KEY`,\n",
+    "     `GROQ_EMPATHY_JUDGE_API_KEY`, `GROQ_MEDICAL_JUDGE_API_KEY`, `HF_TOKEN`\n",
+    "   - **Optional:** `WANDB_API_KEY`\n",
+    "3. **Run cells 2 → 3 (sanity + REPAIR).** When cell 3 prints\n",
+    "   `RESTART REQUIRED`, click **Run → Restart kernel**, then resume from cell 5.\n",
+    "4. **Run cells 5 → 11 (verify + configure + dry-run + pre-flight).** Each cell\n",
+    "   should print an `OK` line before moving on.\n",
+    "5. **Run cell 13 (the long training cell, 4–6 hours).**\n",
+    "6. **Run cells 14 → 17 (final push + plots + inference smoke-test).**\n",
+    "\n",
+    "## Curriculum + reward thresholds (this run)\n",
+    "\n",
+    "Each phase has a constant rolling-avg-reward bar; meeting it for **3 consecutive\n",
+    "GRPO groups** triggers either a phase promotion or end-of-training.\n",
+    "\n",
+    "| Phase | Reward target (sustained ×3 groups) | Action when met |\n",
+    "|---|---|---|\n",
+    "| 1 — Tool Mastery | `+1.2` | force-promote to Phase 2 |\n",
+    "| 2 — Clinical Reasoning | `+1.1` | force-promote to Phase 3 |\n",
+    "| 3 — Empathetic Negotiation | `+1.0` | END TRAINING |\n",
+    "\n",
+    "Why these numbers? The un-trained 8B Doctor's baseline on the same env is\n",
+    "`P1=+0.76, P2=+0.59, P3=+0.39`. Targets of `+1.2 / +1.1 / +1.0` correspond\n",
+    "to roughly `1.6× / 1.9× / 2.6×` improvement over baseline — a meaningful\n",
+    "signal but reachable inside Kaggle's 12 h session limit."
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 2 — Sanity check (GPU + disk + python + internet) ===\n",
+    "# Run this FIRST. If any check fails, fix it before running the REPAIR cell.\n",
+    "\n",
+    "import os, shutil, subprocess, sys, socket\n",
+    "\n",
+    "print(\"--- GPU ---\")\n",
+    "try:\n",
+    "    print(subprocess.check_output(\n",
+    "        [\"nvidia-smi\", \"--query-gpu=name,memory.total,memory.free\", \"--format=csv\"],\n",
+    "        timeout=10,\n",
+    "    ).decode())\n",
+    "except Exception as e:\n",
+    "    print(f\"nvidia-smi failed: {e}\")\n",
+    "    print(\"-> Set Accelerator to 'GPU T4 x2' in the right sidebar.\")\n",
+    "\n",
+    "print(\"--- Disk (/kaggle/working) ---\")\n",
+    "total, used, free = shutil.disk_usage(\"/kaggle/working\")\n",
+    "print(f\"  total={total/1e9:5.1f} GB | used={used/1e9:5.1f} GB | free={free/1e9:5.1f} GB\")\n",
+    "if free < 8 * 1e9:\n",
+    "    print(\"  WARNING: free disk < 8 GB — repair cell may fail. \"\n",
+    "          \"Consider 'Run > Restart and clear cell outputs' to reset /tmp.\")\n",
+    "\n",
+    "print(\"--- Python ---\")\n",
+    "print(f\"  python={sys.version.split()[0]} | exe={sys.executable}\")\n",
+    "\n",
+    "print(\"--- Internet (api.groq.com:443) ---\")\n",
+    "try:\n",
+    "    socket.create_connection((\"api.groq.com\", 443), timeout=5).close()\n",
+    "    print(\"  reachable\")\n",
+    "except Exception as e:\n",
+    "    print(f\"  UNREACHABLE: {e}\")\n",
+    "    print(\"  -> Settings (right sidebar) -> Internet -> ON\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 3 — REPAIR CELL (idempotent full environment rebuild) ===\n",
+    "# Single source of truth for ER-MAP's GPU stack. Safe to re-run. After it\n",
+    "# finishes you'll see one of two final lines:\n",
+    "#\n",
+    "#   RESTART REQUIRED  -> Run -> Restart kernel, then resume from cell 5\n",
+    "#   REPAIR OK         -> proceed directly to cell 5\n",
+    "#\n",
+    "# Note: this cell only runs shell commands and one isolated subprocess.\n",
+    "# It deliberately does NOT `import torch / numpy / Pillow / unsloth` in the\n",
+    "# kernel, so re-running it after a botched install does not poison further\n",
+    "# attempts.\n",
+    "\n",
+    "print(\"=\" * 72); print(\"  CELL 3 — REPAIR\"); print(\"=\" * 72)\n",
+    "\n",
+    "# 1. Clean caches (Kaggle's /kaggle/working is only 20 GB — installs\n",
+    "#    routinely fill it after a few re-runs).\n",
+    "print(\"[1/6] Cleaning pip + tmp + HF dataset caches...\")\n",
+    "get_ipython().system('pip cache purge -q || true')\n",
+    "get_ipython().system('rm -rf /tmp/* /root/.cache/pip /root/.cache/huggingface/datasets 2>/dev/null || true')\n",
+    "\n",
+    "# 2. Pin torch + torchvision to the cu128 wheel (matches Kaggle's CUDA 12.8\n",
+    "#    base image). DON'T let pip pull a generic CUDA-13 build — that breaks\n",
+    "#    bitsandbytes (libnvJitLink.so.13 missing) and torchvision (CUDA-major\n",
+    "#    mismatch RuntimeError at import time).\n",
+    "print(\"[2/6] Installing torch==2.10.0 + torchvision==0.25.0 (cu128)...\")\n",
+    "get_ipython().system('pip install -q --no-cache-dir --force-reinstall '\n",
+    "                     'torch==2.10.0 torchvision==0.25.0 '\n",
+    "                     '--index-url https://download.pytorch.org/whl/cu128')\n",
+    "\n",
+    "# 3. Reinstall bitsandbytes against the now-pinned torch.\n",
+    "print(\"[3/6] Reinstalling bitsandbytes...\")\n",
+    "get_ipython().system('pip install -q --no-cache-dir --force-reinstall bitsandbytes')\n",
+    "\n",
+    "# 4. Upgrade unsloth + unsloth_zoo + trl in lockstep. unsloth and\n",
+    "#    unsloth_zoo are released as a matched pair; if pip pulls a fresh\n",
+    "#    unsloth_zoo against an old unsloth you get\n",
+    "#       ImportError: cannot import name 'create_gradient_checkpointing_buffer'\n",
+    "print(\"[4/6] Upgrading unsloth + unsloth_zoo + trl...\")\n",
+    "get_ipython().system('pip install -q --upgrade --no-cache-dir '\n",
+    "                     'unsloth unsloth_zoo \"trl>=0.18.2\"')\n",
+    "\n",
+    "# 5. ER-MAP runtime deps that aren't pre-installed on Kaggle.\n",
+    "print(\"[5/6] Installing ER-MAP runtime deps...\")\n",
+    "get_ipython().system('pip install -q --no-cache-dir '\n",
+    "                     '\"groq>=0.18.0\" \"huggingface_hub>=0.25.0\" '\n",
+    "                     '\"gymnasium>=0.29.0\" \"openenv-core>=0.1.0\"')\n",
+    "\n",
+    "# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these\n",
+    "#    while pip is mid-flight, which is what causes the\n",
+    "#       'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError\n",
+    "#    we kept hitting before).\n",
+    "print(\"[6/6] Verifying via subprocess...\")\n",
+    "import subprocess, sys, json\n",
+    "\n",
+    "verify_script = r'''\n",
+    "import json, sys\n",
+    "out = {\"ok\": True, \"details\": {}, \"errors\": []}\n",
+    "try:\n",
+    "    import importlib.metadata as md\n",
+    "    for pkg in (\"torch\", \"torchvision\", \"bitsandbytes\", \"unsloth\", \"unsloth_zoo\",\n",
+    "                \"trl\", \"transformers\", \"peft\", \"accelerate\", \"groq\",\n",
+    "                \"huggingface_hub\", \"gymnasium\", \"numpy\", \"Pillow\"):\n",
+    "        try:\n",
+    "            out[\"details\"][pkg + \"_installed\"] = md.version(pkg)\n",
+    "        except md.PackageNotFoundError:\n",
+    "            out[\"details\"][pkg + \"_installed\"] = None\n",
+    "\n",
+    "    import torch, torchvision, numpy as np, PIL, unsloth, unsloth_zoo, bitsandbytes, trl\n",
+    "    out[\"details\"][\"torch_loaded\"]        = torch.__version__\n",
+    "    out[\"details\"][\"torch_cuda\"]          = torch.version.cuda\n",
+    "    out[\"details\"][\"cuda_available\"]      = bool(torch.cuda.is_available())\n",
+    "    out[\"details\"][\"gpu_count\"]           = int(torch.cuda.device_count())\n",
+    "    out[\"details\"][\"torchvision_loaded\"]  = torchvision.__version__\n",
+    "    out[\"details\"][\"numpy_loaded\"]        = np.__version__\n",
+    "    out[\"details\"][\"pillow_loaded\"]       = PIL.__version__\n",
+    "    out[\"details\"][\"unsloth_loaded\"]      = unsloth.__version__\n",
+    "    out[\"details\"][\"unsloth_zoo_loaded\"]  = unsloth_zoo.__version__\n",
+    "    out[\"details\"][\"bitsandbytes_loaded\"] = bitsandbytes.__version__\n",
+    "    out[\"details\"][\"trl_loaded\"]          = trl.__version__\n",
+    "\n",
+    "    # Cross-check loaded-vs-installed for the C-extension libs that bit us\n",
+    "    # on every previous run.\n",
+    "    for pkg, loaded_key, installed_key in [\n",
+    "        (\"numpy\",  \"numpy_loaded\",  \"numpy_installed\"),\n",
+    "        (\"Pillow\", \"pillow_loaded\", \"Pillow_installed\"),\n",
+    "        (\"torch\",  \"torch_loaded\",  \"torch_installed\"),\n",
+    "    ]:\n",
+    "        loaded = out[\"details\"].get(loaded_key)\n",
+    "        installed = out[\"details\"].get(installed_key)\n",
+    "        if loaded and installed and loaded != installed:\n",
+    "            # Strip any local-version suffix (e.g. '+cu128') before compare.\n",
+    "            if loaded.split(\"+\")[0] != installed.split(\"+\")[0]:\n",
+    "                out[\"errors\"].append(\n",
+    "                    f\"{pkg} mismatch: loaded={loaded} installed={installed}\"\n",
+    "                )\n",
+    "except Exception as e:\n",
+    "    out[\"ok\"] = False\n",
+    "    out[\"errors\"].append(f\"{type(e).__name__}: {e}\")\n",
+    "print(json.dumps(out, default=str))\n",
+    "'''.lstrip()\n",
+    "\n",
+    "res = subprocess.run([sys.executable, \"-c\", verify_script],\n",
+    "                     capture_output=True, text=True, timeout=180)\n",
+    "print(res.stdout if res.stdout else \"<no stdout>\")\n",
+    "if res.stderr:\n",
+    "    print(\"---- subprocess stderr ----\"); print(res.stderr)\n",
+    "\n",
+    "# Parse the LAST line of stdout (others are prints from package init).\n",
+    "try:\n",
+    "    last = res.stdout.strip().splitlines()[-1]\n",
+    "    parsed = json.loads(last)\n",
+    "except Exception:\n",
+    "    parsed = {\"ok\": False, \"errors\": [\"could not parse verification output\"]}\n",
+    "\n",
+    "ok = parsed.get(\"ok\") and not parsed.get(\"errors\")\n",
+    "d = parsed.get(\"details\", {})\n",
+    "\n",
+    "print(\"\\n\" + \"=\" * 72)\n",
+    "if ok:\n",
+    "    print(\"  REPAIR OK\")\n",
+    "    print(f\"    torch       : {d.get('torch_loaded')}  (CUDA {d.get('torch_cuda')})\")\n",
+    "    print(f\"    torchvision : {d.get('torchvision_loaded')}\")\n",
+    "    print(f\"    bitsandbytes: {d.get('bitsandbytes_loaded')}\")\n",
+    "    print(f\"    unsloth     : {d.get('unsloth_loaded')} | unsloth_zoo: {d.get('unsloth_zoo_loaded')}\")\n",
+    "    print(f\"    trl         : {d.get('trl_loaded')}\")\n",
+    "    print(f\"    numpy       : {d.get('numpy_loaded')} | Pillow: {d.get('pillow_loaded')}\")\n",
+    "    print(f\"    GPUs        : {d.get('gpu_count')}  (cuda_available={d.get('cuda_available')})\")\n",
+    "    print()\n",
+    "    print(\"  -> If this kernel previously imported torch/numpy/Pillow/unsloth,\")\n",
+    "    print(\"     RESTART NOW (Run -> Restart kernel) before continuing to cell 5.\")\n",
+    "    print(\"     If this is a fresh kernel, you can proceed directly.\")\n",
+    "else:\n",
+    "    print(\"  RESTART REQUIRED — issues detected:\")\n",
+    "    for e in parsed.get(\"errors\", []):\n",
+    "        print(f\"    - {e}\")\n",
+    "    print()\n",
+    "    print(\"  Action: Run -> Restart kernel, then re-run from cell 2.\")\n",
+    "print(\"=\" * 72)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## ⚠ Restart kernel here if cell 3 said `RESTART REQUIRED`\n",
+    "\n",
+    "Click **Run → Restart kernel** (or **Run → Restart & clear cell outputs**),\n",
+    "then resume from **cell 5**. Skipping the restart will produce ABI mismatch\n",
+    "errors at the first GPU op.\n",
     "\n",
+    "If cell 3 said `REPAIR OK` AND this is a fresh kernel that hasn't imported\n",
+    "torch/numpy/Pillow/unsloth yet, you can proceed to cell 5 directly."
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 5 — Post-restart verify (this kernel can import everything) ===\n",
+    "import importlib.metadata as md\n",
+    "\n",
+    "print(\"--- Loaded versions in this kernel ---\")\n",
+    "import torch, numpy, PIL, torchvision, unsloth, unsloth_zoo, bitsandbytes, trl, transformers, peft\n",
+    "\n",
+    "versions = {\n",
+    "    \"torch\":          torch.__version__,\n",
+    "    \"torchvision\":    torchvision.__version__,\n",
+    "    \"numpy\":          numpy.__version__,\n",
+    "    \"Pillow\":         PIL.__version__,\n",
+    "    \"unsloth\":        unsloth.__version__,\n",
+    "    \"unsloth_zoo\":    unsloth_zoo.__version__,\n",
+    "    \"bitsandbytes\":   bitsandbytes.__version__,\n",
+    "    \"trl\":            trl.__version__,\n",
+    "    \"transformers\":   transformers.__version__,\n",
+    "    \"peft\":           peft.__version__,\n",
+    "}\n",
+    "all_ok = True\n",
+    "for k, v in versions.items():\n",
+    "    try:\n",
+    "        inst = md.version(k)\n",
+    "    except md.PackageNotFoundError:\n",
+    "        inst = \"(not installed)\"\n",
+    "    # Tolerate local version suffixes like '+cu128'\n",
+    "    flag = \"OK\" if inst.split(\"+\")[0] == v.split(\"+\")[0] else f\"MISMATCH (installed={inst})\"\n",
+    "    if \"MISMATCH\" in flag:\n",
+    "        all_ok = False\n",
+    "    print(f\"  {k:14s}: loaded={v:20s} [{flag}]\")\n",
+    "\n",
+    "print()\n",
+    "print(f\"  CUDA available : {torch.cuda.is_available()}\")\n",
+    "print(f\"  GPU count      : {torch.cuda.device_count()}\")\n",
+    "if torch.cuda.is_available():\n",
+    "    for i in range(torch.cuda.device_count()):\n",
+    "        p = torch.cuda.get_device_properties(i)\n",
+    "        print(f\"  GPU {i}          : {p.name} ({p.total_memory/1e9:.1f} GB)\")\n",
+    "\n",
+    "print()\n",
+    "print(\"OK\" if all_ok else \"NOT OK — re-run cell 3 and restart kernel.\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 6 — Mount the ER-MAP repo into /kaggle/working ===\n",
+    "import os, subprocess, sys\n",
+    "\n",
+    "# OPTION A: clone a public GitHub fork (preferred). Edit GIT_URL.\n",
+    "GIT_URL    = \"https://github.com/<your-fork>/Meta_Finals.git\"\n",
+    "BRANCH     = \"main\"\n",
+    "REPO_ROOT  = \"/kaggle/working/Meta_Finals\"\n",
+    "\n",
+    "# OPTION B: Kaggle Dataset upload — set this if you uploaded the repo\n",
+    "# as a Kaggle Dataset named \"ermap-source\" (Add Data -> Upload).\n",
+    "DATASET_DIR = \"/kaggle/input/ermap-source\"\n",
+    "\n",
+    "if not os.path.isdir(f\"{REPO_ROOT}/ER_MAP\"):\n",
+    "    if \"<your-fork>\" not in GIT_URL:\n",
+    "        print(f\"Cloning {GIT_URL}@{BRANCH} -> {REPO_ROOT}...\")\n",
+    "        out = subprocess.run(\n",
+    "            [\"git\", \"clone\", \"--depth\", \"1\", \"-b\", BRANCH, GIT_URL, REPO_ROOT],\n",
+    "            capture_output=True, text=True,\n",
+    "        )\n",
+    "        print(out.stdout); print(out.stderr)\n",
+    "    elif os.path.isdir(DATASET_DIR):\n",
+    "        print(f\"Copying {DATASET_DIR} -> {REPO_ROOT}...\")\n",
+    "        import shutil\n",
+    "        shutil.copytree(DATASET_DIR, REPO_ROOT, dirs_exist_ok=True)\n",
+    "\n",
+    "assert os.path.isdir(f\"{REPO_ROOT}/ER_MAP\"), (\n",
+    "    \"Repo not found.\\n\"\n",
+    "    \" - Edit GIT_URL above to your GitHub fork, OR\\n\"\n",
+    "    \" - Upload the repo as a Kaggle Dataset named 'ermap-source' (Add Data -> Upload).\"\n",
     ")\n",
     "\n",
+    "sys.path.insert(0, REPO_ROOT)\n",
+    "sys.path.insert(0, f\"{REPO_ROOT}/kaggle\")\n",
+    "print(f\"OK. Repo at {REPO_ROOT}\")"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# === CELL 7 — Wire Kaggle Secrets into env vars ===\n",
+    "import os\n",
+    "from kaggle_helpers import load_kaggle_secrets, kaggle_env_summary\n",
     "\n",
+    "load_kaggle_secrets()\n",
+    "kaggle_env_summary()\n",
+    "\n",
+    "# Hard fail if no Groq key — training would silently use mock LLMs.\n",
+    "assert any(os.environ.get(k) for k in (\n",
+    "    \"GROQ_NURSE_API_KEY\", \"GROQ_PATIENT_API_KEY\",\n",
+    "    \"GROQ_EMPATHY_JUDGE_API_KEY\", \"GROQ_MEDICAL_JUDGE_API_KEY\",\n",
+    "    \"GROQ_API_KEY\",\n",
+    ")), (\"No Groq key found in Kaggle Secrets. \"\n",
+    "     \"Add at least GROQ_NURSE_API_KEY in Add-ons -> Secrets.\")\n",
+    "print(\"OK — at least one Groq key is wired.\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 8 — Hugging Face Hub config (for checkpoint backup) ===\n",
+    "import os\n",
+    "from kaggle_helpers import push_checkpoint_to_hub, download_checkpoint_from_hub\n",
+    "\n",
+    "# EDIT the line below to your HF model id (e.g. \"udayd/ermap-doctor-lora\").\n",
+    "HF_PUSH_REPO   = \"<your-username>/ermap-doctor-lora\"\n",
+    "# To resume from a previous run, paste the same repo id here. Empty = fresh.\n",
+    "HF_RESUME_REPO = \"\"\n",
     "\n",
     "RESUME_DIR = \"/kaggle/working/checkpoints/resume\"\n",
     "if HF_RESUME_REPO:\n",
     "    download_checkpoint_from_hub(HF_RESUME_REPO, RESUME_DIR)\n",
+    "    contents = os.listdir(RESUME_DIR) if os.path.isdir(RESUME_DIR) else []\n",
+    "    print(f\"Resume dir: {contents or '(empty)'}\")\n",
+    "else:\n",
+    "    print(\"Starting fresh — no resume.\")\n",
     "\n",
+    "if \"<your-username>\" in HF_PUSH_REPO:\n",
+    "    print(\"\\nWARNING: HF_PUSH_REPO still has <your-username> placeholder.\")\n",
+    "    print(\"         Checkpoints will NOT be pushed to HF Hub.\")\n",
+    "    print(\"         Edit the cell above and re-run before training if you want backups.\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 9 — GRPO hyperparameters ===\n",
+    "import os\n",
+    "\n",
     "MODEL_NAME       = \"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit\"\n",
     "GROUP_SIZE       = 2\n",
     "LEARNING_RATE    = 5e-6\n",
     "KL_BETA          = 0.04\n",
     "OUTPUT_DIR       = \"/kaggle/working/er_map_grpo_checkpoints\"\n",
     "PUSH_EVERY_EPS   = 20\n",
+    "USE_WANDB        = False  # WANDB conflicts with protobuf 7 on Kaggle base image\n",
+    "NUM_EPISODES     = 200    # hard cap; early-stop usually finishes first\n",
+    "\n",
+    "# --- Per-phase reward thresholds (constant for this run) -------------------\n",
+    "# After every GRPO update we look at the last CONVERGENCE_WINDOW groups; if\n",
+    "# ALL of them belong to the same current phase AND each has\n",
+    "# rolling_avg_reward >= PHASE_REWARD_TARGETS[current_phase] AND\n",
+    "# rolling_win_rate >= PHASE_MIN_WIN_RATE, we either:\n",
+    "#   - force-promote to the next phase (Phase 1 / Phase 2), OR\n",
     "#   - terminate training (Phase 3).\n",
+    "EARLY_STOP_ENABLED   = True\n",
+    "PHASE_REWARD_TARGETS = {1: 1.2, 2: 1.1, 3: 1.0}\n",
+    "PHASE_MIN_WIN_RATE   = 0.20\n",
+    "CONVERGENCE_WINDOW   = 3\n",
+    "\n",
+    "# --- Per-episode budget controls (read by triage_env) ----------------------\n",
     "os.environ[\"ERMAP_MAX_EPISODE_STEPS\"]      = \"20\"\n",
     "os.environ[\"ERMAP_MAX_INTERNAL_EXCHANGES\"] = \"5\"\n",
+    "\n",
+    "# --- Groq traffic-shaping (8B for actors, 70B for judges) ------------------\n",
+    "# High-volume conversational roles (Nurse + Patient) on the 8B-instant pool\n",
+    "# (500K TPD, 14,400 RPD); the two judges stay on 70B-versatile because their\n",
+    "# grading quality directly shapes the reward signal.\n",
     "os.environ[\"ERMAP_NURSE_MODEL\"]            = \"llama-3.1-8b-instant\"\n",
     "os.environ[\"ERMAP_PATIENT_MODEL\"]          = \"llama-3.1-8b-instant\"\n",
     "os.environ[\"ERMAP_EMPATHY_JUDGE_MODEL\"]    = \"llama-3.3-70b-versatile\"\n",
     "os.environ[\"ERMAP_MEDICAL_JUDGE_MODEL\"]    = \"llama-3.3-70b-versatile\"\n",
     "\n",
+    "print(\"Hyperparameters set:\")\n",
+    "print(f\"  NUM_EPISODES         = {NUM_EPISODES}\")\n",
+    "print(f\"  GROUP_SIZE           = {GROUP_SIZE}\")\n",
+    "print(f\"  PHASE_REWARD_TARGETS = {PHASE_REWARD_TARGETS}\")\n",
+    "print(f\"  PHASE_MIN_WIN_RATE   = {PHASE_MIN_WIN_RATE}\")\n",
+    "print(f\"  CONVERGENCE_WINDOW   = {CONVERGENCE_WINDOW}\")\n",
+    "print(f\"  Nurse / Patient      = llama-3.1-8b-instant (actors, high-volume)\")\n",
+    "print(f\"  Empathy / Med Judge  = llama-3.3-70b-versatile (graders, quality)\")"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
+    "# === CELL 10 — Pre-flight: Groq routing + key liveness ===\n",
+    "# Verifies that:\n",
+    "#  - each role is routed to the model you set in cell 9, and\n",
+    "#  - each role's Groq key actually answers a short \"PING\" prompt (reply capped at 4 tokens).\n",
+    "\n",
+    "import os\n",
+    "from ER_MAP.envs.api_router import AgentRouter\n",
+    "\n",
+    "router = AgentRouter()\n",
+    "expected = {\n",
+    "    \"nurse\":         \"llama-3.1-8b-instant\",\n",
+    "    \"patient\":       \"llama-3.1-8b-instant\",\n",
+    "    \"empathy_judge\": \"llama-3.3-70b-versatile\",\n",
+    "    \"medical_judge\": \"llama-3.3-70b-versatile\",\n",
+    "}\n",
+    "\n",
+    "print(\"=\" * 60); print(\"  PRE-FLIGHT — Groq routing + smoke test\"); print(\"=\" * 60)\n",
+    "all_pass = True\n",
+    "for role, exp in expected.items():\n",
+    "    actual = router._models.get(role, \"?\")\n",
+    "    routing_ok = (actual == exp)\n",
+    "    client = router._clients.get(role)\n",
+    "\n",
+    "    if client is None:\n",
+    "        print(f\"  [SKIP] {role:14s} -> no Groq client (key missing)\")\n",
+    "        all_pass = False\n",
+    "        continue\n",
     "\n",
+    "    try:\n",
+    "        resp = client.chat.completions.create(\n",
+    "            model=exp,\n",
+    "            messages=[{\"role\": \"user\", \"content\": \"Reply with exactly: PING\"}],\n",
+    "            max_tokens=4, temperature=0,\n",
+    "        )\n",
+    "        api_ok = \"PING\" in (resp.choices[0].message.content or \"\").upper()\n",
+    "        err = \"\"\n",
+    "    except Exception as e:\n",
+    "        api_ok = False\n",
+    "        err = f\" ({type(e).__name__}: {str(e)[:80]})\"\n",
+    "\n",
+    "    flag = \"PASS\" if (routing_ok and api_ok) else \"FAIL\"\n",
+    "    if flag == \"FAIL\":\n",
+    "        all_pass = False\n",
+    "    print(f\"  [{flag}] {role:14s} -> {actual:30s} \"\n",
+    "          f\"routing={'ok' if routing_ok else 'WRONG'}, \"\n",
+    "          f\"api={'ok' if api_ok else 'fail'}{err}\")\n",
+    "\n",
+    "print(\"=\" * 60)\n",
+    "print(\"OK\" if all_pass else \"NOT OK — fix routing/keys before training.\")\n",
+    "print(\"=\" * 60)\n",
+    "assert all_pass, \"Pre-flight failed; do not proceed to training.\""
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 11 — Dry-run smoke test (no GPU, no model load) ===\n",
+    "# Verifies the curriculum scheduler + reward verifier + per-phase early-stop\n",
+    "# wiring before we burn GPU minutes on the real run.\n",
+    "\n",
     "from ER_MAP.training.train_grpo import train\n",
     "\n",
     "_ = train(\n",
     "    kl_beta=KL_BETA,\n",
     "    output_dir=\"/kaggle/working/_dryrun\",\n",
     "    dry_run=True,\n",
+    "    phase_reward_targets=PHASE_REWARD_TARGETS,\n",
+    "    phase_min_win_rate=PHASE_MIN_WIN_RATE,\n",
+    "    convergence_window=CONVERGENCE_WINDOW,\n",
+    "    early_stop=EARLY_STOP_ENABLED,\n",
     ")\n",
+    "print(\"\\nDry-run OK — scheduler + verifier + per-phase early-stop wiring is healthy.\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 12 — Wire periodic HF Hub push into training ===\n",
+    "# We monkey-patch save_lora_adapters so every checkpoint dump also pushes\n",
+    "# the LoRA adapter to HF Hub. Failures are non-fatal — training keeps\n",
+    "# running even if a push fails (e.g. transient HF 502).\n",
+    "\n",
     "from ER_MAP.training import train_grpo as _tg\n",
     "_original_save = _tg.save_lora_adapters\n",
     "\n",
     "def save_lora_adapters_with_push(model, tokenizer, output_dir):\n",
     "    _original_save(model, tokenizer, output_dir)\n",
     "    if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
     "        try:\n",
     "            push_checkpoint_to_hub(\n",
+    "                output_dir, HF_PUSH_REPO,\n",
     "                commit_message=f\"checkpoint @ {os.path.basename(output_dir)}\",\n",
     "            )\n",
     "        except Exception as e:\n",
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## 13 · Run real training (the ~4.5–8.5 hour cell)\n",
+    "\n",
+    "**Estimated wall-clock on Kaggle T4 ×2:**\n",
+    "\n",
+    "- ~3–5 min per episode (6–14 env steps × Doctor.generate + 4–8 × Groq calls)\n",
+    "- ~1–2 min amortized per GRPO update (G=2 trajectories × response-token log-probs)\n",
+    "- **Per-group ≈ 8–12 min** (2 episodes + 1 update)\n",
+    "\n",
+    "| Phase | Typical episodes to reach target | Wall-clock |\n",
+    "|---|---|---|\n",
+    "| 1 (target `+1.2` × 3) | 12 – 24 episodes (6 – 12 groups) | ~1.0 – 2.0 h |\n",
+    "| 2 (target `+1.1` × 3) | 16 – 32 episodes (8 – 16 groups) | ~1.5 – 2.5 h |\n",
+    "| 3 (target `+1.0` × 3) | 20 – 50 episodes (10 – 25 groups) | ~2.0 – 4.0 h |\n",
+    "| **Total** | 50 – 100 episodes | **~4.5 – 8.5 h** |\n",
+    "\n",
+    "If `NUM_EPISODES=200` is exhausted before Phase 3 converges, training\n",
+    "stops at the cap. Provided `HF_PUSH_REPO` was configured in cell 8, the\n",
+    "latest LoRA checkpoint is already on HF Hub (we push every 20 episodes),\n",
+    "so you can resume in a fresh session via `HF_RESUME_REPO` in cell 8."
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 13 — REAL TRAINING (~4.5-8.5 h cell) ===\n",
     "metrics = train(\n",
     "    num_episodes=NUM_EPISODES,\n",
     "    group_size=GROUP_SIZE,\n",
     "    model_name=MODEL_NAME,\n",
+    "    groq_api_key=os.environ.get(\"GROQ_NURSE_API_KEY\", \"\")\n",
+    "                  or os.environ.get(\"GROQ_API_KEY\", \"\"),\n",
     "    learning_rate=LEARNING_RATE,\n",
     "    kl_beta=KL_BETA,\n",
     "    use_wandb=USE_WANDB,\n",
     "    output_dir=OUTPUT_DIR,\n",
     "    dry_run=False,\n",
     "    phase_reward_targets=PHASE_REWARD_TARGETS,\n",
     "    phase_min_win_rate=PHASE_MIN_WIN_RATE,\n",
     "    convergence_window=CONVERGENCE_WINDOW,\n",
     "    early_stop=EARLY_STOP_ENABLED,\n",
+    ")\n",
+    "print(f\"\\nTraining returned {len(metrics)} metric records.\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 14 — Final push: adapters + merged fp16 ===\n",
     "FINAL_LORA_DIR   = f\"{OUTPUT_DIR}/final_lora\"\n",
     "FINAL_MERGED_DIR = f\"{OUTPUT_DIR}/final_merged_fp16\"\n",
     "\n",
     "if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
+    "    push_checkpoint_to_hub(FINAL_LORA_DIR, HF_PUSH_REPO,\n",
+    "                           commit_message=\"final LoRA adapter\")\n",
     "    if os.path.isdir(FINAL_MERGED_DIR):\n",
+    "        push_checkpoint_to_hub(FINAL_MERGED_DIR, f\"{HF_PUSH_REPO}-merged\",\n",
+    "                               commit_message=\"final merged fp16\")\n",
+    "    print(f\"Final checkpoints pushed: https://huggingface.co/{HF_PUSH_REPO}\")\n",
     "else:\n",
     "    print(\"HF_PUSH_REPO not configured — skipping final push.\")"
    ]
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## 15 · Per-phase training graphs (one dashboard per curriculum phase)\n",
+    "\n",
+    "We render a 6-panel dashboard for **every phase that contains episodes**,\n",
+    "plus a cross-phase overview and a phase-comparison bar chart. All PNGs are\n",
+    "written to `er_map_grpo_checkpoints/plots/` and uploaded to HF Hub in\n",
+    "cell 16 (when `HF_PUSH_REPO` is configured) so they survive Kaggle session expiry.\n",
     "\n",
+    "Each per-phase dashboard contains:\n",
     "\n",
     "1. **Reward growth** — raw scatter + rolling mean (w=10) + verified rolling mean\n",
+    "2. **Rolling win rate** — w=20 win-rate evolution within the phase\n",
+    "3. **Outcome distribution over time** — stacked bars (WIN/PARTIAL/INCORRECT/AMA_LOSS/FATAL_LOSS)\n",
+    "4. **Reward components** — mean of each component (process / treatment / empathy / labs / etc.)\n",
+    "5. **GRPO update stats** — loss + KL divergence per group update\n",
     "6. **Episode length distribution** — histogram of step counts"
    ]
   },
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 15 — Per-phase training dashboards ===\n",
     "from ER_MAP.plotting import plot_per_phase_dashboards\n",
     "from IPython.display import Image, display, Markdown\n",
     "\n",
     "    output_dir=PLOTS_DIR,\n",
     ")\n",
     "\n",
+    "print(f\"Saved {len(written)} chart(s) to {PLOTS_DIR}:\")\n",
     "for name, path in written.items():\n",
     "    size_kb = os.path.getsize(path) / 1024\n",
     "    print(f\"  {name:<28s} -> {path}  ({size_kb:.0f} KB)\")\n",
     "\n",
+    "# Display each chart inline so the operator sees them without leaving Kaggle.\n",
+    "ordered = (sorted(k for k in written if k.startswith(\"phase\"))\n",
+    "           + [\"all_phases_overview\", \"all_phases_comparison\"])\n",
+    "for key in ordered:\n",
     "    if key not in written:\n",
     "        continue\n",
     "    display(Markdown(f\"### {key.replace('_', ' ').title()}\"))\n",
     "    display(Image(filename=written[key]))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 16 — Push plots to HF Hub ===\n",
     "if HF_PUSH_REPO and \"<your-username>\" not in HF_PUSH_REPO:\n",
+    "    push_checkpoint_to_hub(PLOTS_DIR, HF_PUSH_REPO,\n",
+    "                           commit_message=\"per-phase training plots\")\n",
+    "    print(f\"Plots pushed: https://huggingface.co/{HF_PUSH_REPO}/tree/main\")\n",
     "else:\n",
+    "    print(\"HF_PUSH_REPO not configured — plots stay only in /kaggle/working/.\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## 17 · (Optional) Inference smoke-test on the trained model\n",
     "\n",
+    "Catches the classic 'merge path looked OK but the saved model emits garbage'\n",
+    "failure mode before the demo."
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# === CELL 17 — Inference smoke-test on the trained model ===\n",
     "from ER_MAP.training.train_grpo import generate_doctor_action, load_model_and_tokenizer\n",
     "from peft import PeftModel\n",
     "\n",
  },
  "nbformat": 4,
  "nbformat_minor": 5
+}