Spaces:

garvitsachdeva
/

SpindleFlow-RL

Runtime error

App Files Files Community

garvitsachdeva Claude Sonnet 4.6 committed on Apr 25

Commit

c77f83c

1 Parent(s): 13dd91f

Fix Cell 1: skip audioop-lts on Python < 3.13 (Colab uses 3.12)

Browse files

Files changed (1) hide show

colab/SpindleFlow_RL_Training.ipynb +634 -670

colab/SpindleFlow_RL_Training.ipynb CHANGED Viewed

@@ -1,672 +1,636 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4",
-      "name": "SpindleFlow_RL_Training.ipynb"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
   },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "# SpindleFlow RL — Training Notebook\n",
-        "\n",
-        "**Hardware**: Runtime → Change runtime type → **T4 GPU**\n",
-        "\n",
-        "**Secrets** (key icon in left sidebar → Manage secrets):\n",
-        "\n",
-        "| Name | Required | Notes |\n",
-        "|---|---|---|\n",
-        "| `HF_TOKEN` | ✅ Yes | HuggingFace write token — hf.co/settings/tokens → New token (write) |\n",
-        "| `OPENAI_API_KEY` | ✅ Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
-        "\n",
-        "Run cells **top to bottom, one at a time**. Do NOT skip cells."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 1 — Install dependencies & clone repo\n",
-        "Run once. After it finishes, **do NOT restart the runtime** — continue to Cell 2."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import subprocess, os, sys\n",
-        "\n",
-        "subprocess.run([\n",
-        "    \"pip\", \"install\", \"-q\",\n",
-        "    \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n",
-        "    \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n",
-        "    \"transformers\", \"datasets\", \"torch\",\n",
-        "    \"matplotlib\", \"audioop-lts\", \"huggingface_hub\",\n",
-        "], check=True)\n",
-        "print(\"✅ Packages installed\")\n",
-        "\n",
-        "REPO = \"/content/kuchbhi/spindleflow-rl\"\n",
-        "if not os.path.isdir(REPO):\n",
-        "    subprocess.run(\n",
-        "        [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n",
-        "        cwd=\"/content\", check=True,\n",
-        "    )\n",
-        "    print(\"✅ Repo cloned\")\n",
-        "else:\n",
-        "    print(\"Repo already present — pulling latest\")\n",
-        "    subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n",
-        "\n",
-        "os.chdir(REPO)\n",
-        "sys.path.insert(0, \".\")\n",
-        "\n",
-        "import importlib.metadata\n",
-        "print(f\"OpenEnv version  : {importlib.metadata.version('openenv')}\")\n",
-        "\n",
-        "os.makedirs(\"/content/demo/assets\", exist_ok=True)\n",
-        "os.makedirs(\"/content/data\",         exist_ok=True)\n",
-        "os.makedirs(\"/content/checkpoints\",  exist_ok=True)\n",
-        "os.makedirs(\"/content/logs\",         exist_ok=True)\n",
-        "\n",
-        "print(f\"Working directory: {os.getcwd()}\")\n",
-        "print(\"✅ Setup complete\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 2 — Set secrets & verify\n",
-        "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets.  \n",
-        "**Both must show ✅ before continuing.**"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import os\n",
-        "from google.colab import userdata\n",
-        "\n",
-        "HF_TOKEN = userdata.get(\"HF_TOKEN\")\n",
-        "OPENAI_API_KEY = userdata.get(\"OPENAI_API_KEY\")\n",
-        "\n",
-        "if not HF_TOKEN:\n",
-        "    raise RuntimeError(\n",
-        "        \"HF_TOKEN not set.\\n\"\n",
-        "        \"Go to the key icon (left sidebar) → Add secret → Name: HF_TOKEN, \"\n",
-        "        \"Value: your write token from hf.co/settings/tokens → enable notebook access.\"\n",
-        "    )\n",
-        "\n",
-        "if not OPENAI_API_KEY:\n",
-        "    raise RuntimeError(\n",
-        "        \"OPENAI_API_KEY not set.\\n\"\n",
-        "        \"Go to the key icon (left sidebar) → Add secret → Name: OPENAI_API_KEY, \"\n",
-        "        \"Value: sk-... → enable notebook access.\"\n",
-        "    )\n",
-        "\n",
-        "# Inject into environment so all modules pick them up\n",
-        "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
-        "\n",
-        "print(f\"✅ HF_TOKEN      : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}\")\n",
-        "print(f\"✅ OPENAI_API_KEY: {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}\")\n",
-        "print(\"Both secrets loaded — proceeding.\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 3 — Patch env + smoke test\n",
-        "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import os as _os\n",
-        "import numpy as np\n",
-        "from env.spindleflow_env import SpindleFlowEnv\n",
-        "\n",
-        "# Monkey-patch: add simulate_specialists kwarg (fast per-step simulation)\n",
-        "if not getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
-        "    _orig_init = SpindleFlowEnv.__init__\n",
-        "\n",
-        "    def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
-        "        _orig_init(self, *args, **kwargs)\n",
-        "        self.simulate_specialists = simulate_specialists\n",
-        "\n",
-        "    SpindleFlowEnv.__init__ = _new_init\n",
-        "\n",
-        "    _orig_call = SpindleFlowEnv._call_specialist\n",
-        "\n",
-        "    def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
-        "        if getattr(self, \"simulate_specialists\", False):\n",
-        "            _key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
-        "            try:\n",
-        "                return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
-        "            finally:\n",
-        "                if _key:\n",
-        "                    _os.environ[\"OPENAI_API_KEY\"] = _key\n",
-        "        return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
-        "\n",
-        "    SpindleFlowEnv._call_specialist = _new_call\n",
-        "    SpindleFlowEnv._simulate_patched = True\n",
-        "    print(\"✅ SpindleFlowEnv patched\")\n",
-        "else:\n",
-        "    print(\"Already patched — skipping\")\n",
-        "\n",
-        "env = SpindleFlowEnv(\n",
-        "    config_path=\"configs/training_config.yaml\",\n",
-        "    catalog_path=\"configs/specialist_catalog.yaml\",\n",
-        "    use_real_spindleflow=False,\n",
-        "    phase=1,\n",
-        "    simulate_specialists=True,\n",
-        ")\n",
-        "obs, info = env.reset()\n",
-        "print(f\"Observation shape : {obs.shape}\")\n",
-        "print(f\"Task              : {info['task'][:80]}\")\n",
-        "\n",
-        "action = env.action_space.sample()\n",
-        "obs2, reward, terminated, truncated, info2 = env.step(action)\n",
-        "print(f\"Step reward       : {reward:.4f}\")\n",
-        "print(f\"Action name       : {info2['action_name']}\")\n",
-        "print(f\"Reward components : {info2['reward_components']}\")\n",
-        "env.close()\n",
-        "print(\"✅ Environment OK\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 4 — HuggingFace TRL check\n",
-        "Confirms TRL is importable (hackathon requirement)."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import trl, torch\n",
-        "\n",
-        "print(f\"TRL version   : {trl.__version__}\")\n",
-        "print(f\"Torch version : {torch.__version__}\")\n",
-        "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-        "if torch.cuda.is_available():\n",
-        "    print(f\"GPU           : {torch.cuda.get_device_name(0)}\")\n",
-        "\n",
-        "for _name in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\"):\n",
-        "    _cls = getattr(trl, _name, None)\n",
-        "    if _cls is not None:\n",
-        "        print(f\"TRL config class: {_name} ✅\")\n",
-        "        break\n",
-        "else:\n",
-        "    print(\"TRL imported ✅ (config uses TrainingArguments in this version)\")\n",
-        "\n",
-        "print(\"✅ TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 5 — RecurrentPPO training\n",
-        "\n",
-        "**What's happening:**\n",
-        "- Per-step specialist calls: local simulation (fast, no API cost)\n",
-        "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
-        "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
-        "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
-        "\n",
-        "**Expected runtime: 20–30 min on T4 GPU**"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import time, yaml\n",
-        "import torch\n",
-        "import numpy as np\n",
-        "from sb3_contrib import RecurrentPPO\n",
-        "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
-        "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
-        "from policy.lstm_policy import build_policy_kwargs\n",
-        "from training.curriculum import CurriculumManager\n",
-        "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
-        "\n",
-        "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
-        "\n",
-        "def _tlog(msg: str):\n",
-        "    ts = time.strftime(\"%H:%M:%S\")\n",
-        "    line = f\"[{ts}] {msg}\"\n",
-        "    print(line, flush=True)\n",
-        "    with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as _f:\n",
-        "        _f.write(line + \"\\n\")\n",
-        "\n",
-        "with open(\"configs/training_config.yaml\") as f:\n",
-        "    _cfg = yaml.safe_load(f)\n",
-        "\n",
-        "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
-        "\n",
-        "TOTAL_TIMESTEPS = 100_000   # ~10k episodes, ~20-25 min on T4\n",
-        "\n",
-        "\n",
-        "class RewardLogger(BaseCallback):\n",
-        "    def __init__(self, curriculum):\n",
-        "        super().__init__()\n",
-        "        self.episode_rewards = []\n",
-        "        self._running = 0.0\n",
-        "        self._curriculum = curriculum\n",
-        "\n",
-        "    def _on_step(self):\n",
-        "        for r, d in zip(\n",
-        "            self.locals.get(\"rewards\", []),\n",
-        "            self.locals.get(\"dones\",   []),\n",
-        "        ):\n",
-        "            self._running += float(r)\n",
-        "            if d:\n",
-        "                ep = self._running\n",
-        "                self.episode_rewards.append(ep)\n",
-        "                self._running = 0.0\n",
-        "                advanced = self._curriculum.on_episode_end(ep)\n",
-        "                n = len(self.episode_rewards)\n",
-        "                if advanced or n % 50 == 0:\n",
-        "                    _tlog(\n",
-        "                        f\"Ep {n:5d} | reward {ep:+.3f} | \"\n",
-        "                        f\"{self._curriculum.progress_str()}\"\n",
-        "                    )\n",
-        "        return True\n",
-        "\n",
-        "\n",
-        "def make_env():\n",
-        "    return SpindleFlowEnv(\n",
-        "        config_path=\"configs/training_config.yaml\",\n",
-        "        catalog_path=\"configs/specialist_catalog.yaml\",\n",
-        "        use_real_spindleflow=False,\n",
-        "        phase=1,\n",
-        "        simulate_specialists=True,\n",
-        "    )\n",
-        "\n",
-        "\n",
-        "vec_env = DummyVecEnv([make_env])\n",
-        "vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)\n",
-        "\n",
-        "_ppo  = _cfg.get(\"ppo\",  {})\n",
-        "_lstm = _cfg.get(\"lstm\", {})\n",
-        "\n",
-        "model = RecurrentPPO(\n",
-        "    policy=\"MlpLstmPolicy\",\n",
-        "    env=vec_env,\n",
-        "    learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
-        "    n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
-        "    batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
-        "    n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
-        "    gamma=float(_ppo.get(\"gamma\", 0.99)),\n",
-        "    gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
-        "    clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
-        "    ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
-        "    vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
-        "    max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
-        "    policy_kwargs=build_policy_kwargs(\n",
-        "        hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
-        "    ),\n",
-        "    verbose=0,\n",
-        "    seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
-        "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
-        ")\n",
-        "\n",
-        "_tlog(f\"Device          : {model.device}\")\n",
-        "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
-        "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}\")\n",
-        "_tlog(\"Training started...\")\n",
-        "\n",
-        "reward_logger = RewardLogger(curriculum=curriculum)\n",
-        "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
-        "improvement_cb = SpecialistImprovementCallback(\n",
-        "    improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
-        "        \"improve_every_n_episodes\", 100\n",
-        "    ),\n",
-        "    verbose=1,\n",
-        ")\n",
-        "\n",
-        "_t0 = time.time()\n",
-        "model.learn(\n",
-        "    total_timesteps=TOTAL_TIMESTEPS,\n",
-        "    callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
-        ")\n",
-        "_elapsed = time.time() - _t0\n",
-        "\n",
-        "model.save(\"/content/spindleflow_colab_model\")\n",
-        "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
-        "\n",
-        "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
-        "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
-        "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
-        "print(\"\\n✅ Model saved to /content/spindleflow_colab_model.zip\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 6 — Reward curve\n",
-        "Generates publication-quality plot and saves JSON for the HF Space demo."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import json\n",
-        "import numpy as np\n",
-        "import matplotlib\n",
-        "matplotlib.use(\"Agg\")\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "ep_rewards = reward_logger.episode_rewards\n",
-        "if not ep_rewards:\n",
-        "    raise RuntimeError(\"No episodes completed — check Cell 5 output for errors.\")\n",
-        "\n",
-        "n_ep     = len(ep_rewards)\n",
-        "episodes = list(range(n_ep))\n",
-        "window   = max(30, n_ep // 20)  # adaptive: ~5% of total\n",
-        "\n",
-        "smoothed = [\n",
-        "    float(np.mean(ep_rewards[max(0, i - window):i + 1]))\n",
-        "    for i in range(n_ep)\n",
-        "]\n",
-        "\n",
-        "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
-        "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
-        "improvement = final_mean - early_mean\n",
-        "\n",
-        "# ── Save JSON ──────────────────────────────────────────────────\n",
-        "step      = max(1, n_ep // 300)\n",
-        "json_data = {\n",
-        "    \"episodes\":     episodes[::step],\n",
-        "    \"mean_rewards\": smoothed[::step],\n",
-        "}\n",
-        "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
-        "    json.dump(json_data, f)\n",
-        "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
-        "\n",
-        "# ── Plot ───────────────────────────────────────────────────────\n",
-        "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
-        "fig.patch.set_facecolor(\"#0d1117\")\n",
-        "ax.set_facecolor(\"#161b22\")\n",
-        "\n",
-        "plot_every = max(1, n_ep // 800)\n",
-        "ax.scatter(\n",
-        "    episodes[::plot_every], ep_rewards[::plot_every],\n",
-        "    s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
-        ")\n",
-        "ax.plot(\n",
-        "    episodes[::plot_every], smoothed[::plot_every],\n",
-        "    linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
-        "    label=f\"Smoothed ({window}-ep mean)\",\n",
-        ")\n",
-        "ax.axhline(\n",
-        "    y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
-        "    label=f\"Early baseline  {early_mean:+.3f}\",\n",
-        ")\n",
-        "ax.axhline(\n",
-        "    y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
-        "    label=f\"Final mean  {final_mean:+.3f}\",\n",
-        ")\n",
-        "\n",
-        "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
-        "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
-        "ax.set_title(\n",
-        "    \"SpindleFlow RL — Delegation Policy Learning Curve\\n\"\n",
-        "    f\"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes\",\n",
-        "    color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
-        ")\n",
-        "ax.tick_params(colors=\"#8b949e\")\n",
-        "for spine in ax.spines.values():\n",
-        "    spine.set_edgecolor(\"#30363d\")\n",
-        "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
-        "ax.legend(\n",
-        "    fontsize=10, framealpha=0.85,\n",
-        "    facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
-        ")\n",
-        "\n",
-        "sign = \"▲\" if improvement >= 0 else \"▼\"\n",
-        "ax.annotate(\n",
-        "    f\"  {sign} {abs(improvement):.3f} reward improvement\",\n",
-        "    xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
-        "    color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
-        ")\n",
-        "\n",
-        "fig.tight_layout()\n",
-        "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
-        "            facecolor=fig.get_facecolor())\n",
-        "plt.show()\n",
-        "\n",
-        "print(f\"\\n{'='*50}\")\n",
-        "print(f\"Episodes completed : {n_ep:,}\")\n",
-        "print(f\"Early baseline     : {early_mean:+.4f}\")\n",
-        "print(f\"Final mean         : {final_mean:+.4f}\")\n",
-        "print(f\"Improvement        : {improvement:+.4f}\")\n",
-        "print(f\"{'='*50}\")\n",
-        "print(\"✅ Reward curve saved to /content/reward_curve.png\")\n",
-        "\n",
-        "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 7 — Learning features audit\n",
-        "Confirms each self-learning feature fired at least once during training."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import os, json\n",
-        "from pathlib import Path\n",
-        "\n",
-        "print(\"=\"*55)\n",
-        "print(\"LEARNING FEATURES AUDIT\")\n",
-        "print(\"=\"*55)\n",
-        "\n",
-        "# Feature 5 — Curriculum\n",
-        "print(f\"\\nFeature 5 — Curriculum (performance-gated)\")\n",
-        "print(f\"  Final phase        : {curriculum.current_phase}/3\")\n",
-        "print(f\"  Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
-        "print(f\"  {curriculum.progress_str()}\")\n",
-        "\n",
-        "# Feature 2 — Specialist memory\n",
-        "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
-        "    \"memory_path\", \"data/specialist_memory.json\"\n",
-        "))\n",
-        "print(f\"\\nFeature 2 — Specialist memory ({mem_path})\")\n",
-        "if mem_path.exists():\n",
-        "    data = json.loads(mem_path.read_text())\n",
-        "    total_entries = sum(len(v) for v in data.values())\n",
-        "    print(f\"  Specialists with memory : {len(data)}\")\n",
-        "    print(f\"  Total entries recorded  : {total_entries}\")\n",
-        "    for sid, entries in list(data.items())[:3]:\n",
-        "        avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
-        "        print(f\"    {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
-        "else:\n",
-        "    print(\"  No memory file yet (finetuner may not have fired — normal below 100 episodes)\")\n",
-        "\n",
-        "# Feature 3 — Spawn memory\n",
-        "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
-        "    \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
-        "))\n",
-        "print(f\"\\nFeature 3 — Spawn memory ({spawn_path})\")\n",
-        "if spawn_path.exists():\n",
-        "    lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]\n",
-        "    print(f\"  Spawn records written: {len(lines)}\")\n",
-        "    for line in lines[:3]:\n",
-        "        rec = json.loads(line)\n",
-        "        print(f\"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
-        "              f\"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}\")\n",
-        "else:\n",
-        "    print(\"  No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
-        "\n",
-        "# Feature 4 — Resolution bandit\n",
-        "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
-        "    \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
-        "))\n",
-        "print(f\"\\nFeature 4 — Resolution bandit ({res_path})\")\n",
-        "if res_path.exists():\n",
-        "    lines = [l for l in res_path.read_text().splitlines() if l.strip()]\n",
-        "    print(f\"  Outcome records written: {len(lines)}\")\n",
-        "    stats = {}\n",
-        "    for line in lines:\n",
-        "        rec = json.loads(line)\n",
-        "        key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
-        "        stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
-        "    for k, deltas in stats.items():\n",
-        "        print(f\"    {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
-        "else:\n",
-        "    print(\"  No resolution memory yet (requires detected conflicts)\")\n",
-        "\n",
-        "print(\"\\n\" + \"=\"*55)\n",
-        "print(\"✅ Audit complete\")\n",
-        "print(\"=\"*55)"
-      ],
-      "outputs": [],
-      "execution_count": null
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Cell 8 — Push to HuggingFace Hub\n",
-        "\n",
-        "Uploads model checkpoint, reward curve, training log, and README to `garvitsachdeva/spindleflow-rl`."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "metadata": {},
-      "source": [
-        "import os, json\n",
-        "import numpy as np\n",
-        "from huggingface_hub import HfApi, CommitOperationAdd\n",
-        "\n",
-        "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
-        "api = HfApi(token=HF_TOKEN)\n",
-        "\n",
-        "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
-        "api.create_repo(repo_id=HF_REPO.split(\"/\")[-1], repo_type=\"model\", exist_ok=True)\n",
-        "\n",
-        "ep   = reward_logger.episode_rewards\n",
-        "f5   = float(np.mean(ep[:5]))   if len(ep) >= 5 else 0.0\n",
-        "l5   = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0\n",
-        "\n",
-        "readme_text = f\"\"\"---\n",
-        "license: mit\n",
-        "tags:\n",
-        "  - reinforcement-learning\n",
-        "  - stable-baselines3\n",
-        "  - sb3-contrib\n",
-        "  - gymnasium\n",
-        "  - multi-agent\n",
-        "  - openenv\n",
-        "library_name: stable-baselines3\n",
-        "---\n",
-        "\n",
-        "# SpindleFlow RL — Delegation Policy\n",
-        "\n",
-        "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).  \n",
-        "Trained on Google Colab T4 GPU.\n",
-        "\n",
-        "## Training summary\n",
-        "| Metric | Value |\n",
-        "|---|---|\n",
-        "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
-        "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
-        "| Episodes completed | {len(ep):,} |\n",
-        "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
-        "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
-        "| Improvement | {final_mean - early_mean:+.4f} |\n",
-        "| Training time | {_elapsed/60:.1f} min |\n",
-        "| Device | T4 GPU |\n",
-        "\n",
-        "![Reward Curve](reward_curve.png)\n",
-        "\n",
-        "## Load\n",
-        "```python\n",
-        "from sb3_contrib import RecurrentPPO\n",
-        "from huggingface_hub import hf_hub_download\n",
-        "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
-        "```\n",
-        "\"\"\"\n",
-        "\n",
-        "readme_path = \"/content/README_model.md\"\n",
-        "with open(readme_path, \"w\") as f:\n",
-        "    f.write(readme_text)\n",
-        "\n",
-        "candidates = [\n",
-        "    (\"/content/spindleflow_colab_model.zip\",    \"spindleflow_model.zip\"),\n",
-        "    (\"/content/vec_normalize_colab.pkl\",         \"vec_normalize.pkl\"),\n",
-        "    (\"/content/reward_curve.png\",                \"reward_curve.png\"),\n",
-        "    (\"/content/demo/assets/reward_curve.json\",   \"reward_curve.json\"),\n",
-        "    (\"/content/logs/training_log.txt\",           \"training_log.txt\"),\n",
-        "    (readme_path,                                \"README.md\"),\n",
-        "]\n",
-        "\n",
-        "ops = [\n",
-        "    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
-        "    for src, dst in candidates\n",
-        "    if os.path.exists(src)\n",
-        "]\n",
-        "\n",
-        "api.create_commit(\n",
-        "    repo_id=HF_REPO,\n",
-        "    repo_type=\"model\",\n",
-        "    operations=ops,\n",
-        "    commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
-        "    token=HF_TOKEN,\n",
-        ")\n",
-        "\n",
-        "_tlog(f\"Uploaded {len(ops)} files:\")\n",
-        "for src, dst in candidates:\n",
-        "    if os.path.exists(src):\n",
-        "        _tlog(f\"  ✓ {dst}\")\n",
-        "\n",
-        "_tlog(f\"Model       : https://huggingface.co/{HF_REPO}\")\n",
-        "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
-        "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
-        "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
-        "print(\"\\n✅ All done!\")"
-      ],
-      "outputs": [],
-      "execution_count": null
-    }
-  ]
-}

 {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "provenance": [],
+   "gpuType": "T4",
+   "name": "SpindleFlow_RL_Training.ipynb"
   },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "language_info": {
+   "name": "python"
+  },
+  "accelerator": "GPU"
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SpindleFlow RL — Training Notebook\n",
+    "\n",
+    "**Hardware**: Runtime → Change runtime type → **T4 GPU**\n",
+    "\n",
+    "**Secrets** (key icon in left sidebar → Manage secrets):\n",
+    "\n",
+    "| Name | Required | Notes |\n",
+    "|---|---|---|\n",
+    "| `HF_TOKEN` | ✅ Yes | HuggingFace write token — hf.co/settings/tokens → New token (write) |\n",
+    "| `OPENAI_API_KEY` | ✅ Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
+    "\n",
+    "Run cells **top to bottom, one at a time**. Do NOT skip cells."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 1 — Install dependencies & clone repo\n",
+    "Run once. After it finishes, **do NOT restart the runtime** — continue to Cell 2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": "import subprocess, os, sys\n\nprint(f\"Python version: {sys.version}\")\n\n# audioop-lts is only for Python 3.13+ — Colab uses 3.12 where audioop is built-in\n_pkgs = [\n    \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n    \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n    \"transformers\", \"datasets\", \"torch\",\n    \"matplotlib\", \"huggingface_hub\",\n]\nif sys.version_info >= (3, 13):\n    _pkgs.append(\"audioop-lts\")\n\nresult = subprocess.run([\"pip\", \"install\"] + _pkgs, capture_output=True, text=True)\nif result.returncode != 0:\n    print(\"STDOUT:\", result.stdout[-3000:])\n    print(\"STDERR:\", result.stderr[-3000:])\n    raise RuntimeError(\"pip install failed — see output above\")\nprint(\"✅ Packages installed\")\n\nREPO = \"/content/kuchbhi/spindleflow-rl\"\nif not os.path.isdir(REPO):\n    subprocess.run(\n        [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n        cwd=\"/content\", check=True,\n    )\n    print(\"✅ Repo cloned\")\nelse:\n    print(\"Repo already present — pulling latest\")\n    subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n\nos.chdir(REPO)\nsys.path.insert(0, \".\")\n\nimport importlib.metadata\nprint(f\"OpenEnv version  : {importlib.metadata.version('openenv')}\")\n\nos.makedirs(\"/content/demo/assets\", exist_ok=True)\nos.makedirs(\"/content/data\",         exist_ok=True)\nos.makedirs(\"/content/checkpoints\",  exist_ok=True)\nos.makedirs(\"/content/logs\",         exist_ok=True)\n\nprint(f\"Working directory: {os.getcwd()}\")\nprint(\"✅ Setup complete\")",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 2 — Set secrets & verify\n",
+    "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets.  \n",
+    "**Both must show ✅ before continuing.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "from google.colab import userdata\n",
+    "\n",
+    "\n",
+    "def _read_secret(name):\n",
+    "    \"\"\"Return the Colab secret `name`, or None if it is missing or not shared with this notebook.\"\"\"\n",
+    "    try:\n",
+    "        return userdata.get(name)\n",
+    "    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):\n",
+    "        # userdata.get raises rather than returning None; map both cases to None so the\n",
+    "        # checks below can print setup instructions instead of a bare traceback.\n",
+    "        return None\n",
+    "\n",
+    "\n",
+    "HF_TOKEN = _read_secret(\"HF_TOKEN\")\n",
+    "OPENAI_API_KEY = _read_secret(\"OPENAI_API_KEY\")\n",
+    "\n",
+    "if not HF_TOKEN:\n",
+    "    raise RuntimeError(\n",
+    "        \"HF_TOKEN not set.\\n\"\n",
+    "        \"Go to the key icon (left sidebar) → Add secret → Name: HF_TOKEN, \"\n",
+    "        \"Value: your write token from hf.co/settings/tokens → enable notebook access.\"\n",
+    "    )\n",
+    "\n",
+    "if not OPENAI_API_KEY:\n",
+    "    raise RuntimeError(\n",
+    "        \"OPENAI_API_KEY not set.\\n\"\n",
+    "        \"Go to the key icon (left sidebar) → Add secret → Name: OPENAI_API_KEY, \"\n",
+    "        \"Value: sk-... → enable notebook access.\"\n",
+    "    )\n",
+    "\n",
+    "# Export only the OpenAI key so code that reads it from the environment picks it up.\n",
+    "# HF_TOKEN stays a notebook variable and is passed explicitly in Cell 8.\n",
+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
+    "\n",
+    "# Show only the last 4 characters: cell output is saved with the notebook and may be shared.\n",
+    "print(f\"✅ HF_TOKEN      : ...{HF_TOKEN[-4:]} ({len(HF_TOKEN)} chars)\")\n",
+    "print(f\"✅ OPENAI_API_KEY: ...{OPENAI_API_KEY[-4:]} ({len(OPENAI_API_KEY)} chars)\")\n",
+    "print(\"Both secrets loaded — proceeding.\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 3 — Patch env + smoke test\n",
+    "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import os as _os\n",
+    "import numpy as np\n",
+    "from env.spindleflow_env import SpindleFlowEnv\n",
+    "\n",
+    "# One-time monkey-patch (guarded by a class flag): accept a simulate_specialists kwarg\n",
+    "# and hide OPENAI_API_KEY from specialist calls while simulation is on.\n",
+    "if getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
+    "    print(\"Already patched — skipping\")\n",
+    "else:\n",
+    "    _orig_init = SpindleFlowEnv.__init__\n",
+    "    _orig_call = SpindleFlowEnv._call_specialist\n",
+    "\n",
+    "    def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
+    "        \"\"\"Run the original constructor, then record the simulation flag on the instance.\"\"\"\n",
+    "        _orig_init(self, *args, **kwargs)\n",
+    "        self.simulate_specialists = simulate_specialists\n",
+    "\n",
+    "    def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
+    "        \"\"\"Forward to the original call, with OPENAI_API_KEY removed for its duration when simulating.\"\"\"\n",
+    "        if not getattr(self, \"simulate_specialists\", False):\n",
+    "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
+    "        _hidden_key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
+    "        try:\n",
+    "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
+    "        finally:\n",
+    "            # Put the key back only if one was actually taken out\n",
+    "            if _hidden_key:\n",
+    "                _os.environ[\"OPENAI_API_KEY\"] = _hidden_key\n",
+    "\n",
+    "    SpindleFlowEnv.__init__ = _new_init\n",
+    "    SpindleFlowEnv._call_specialist = _new_call\n",
+    "    SpindleFlowEnv._simulate_patched = True\n",
+    "    print(\"✅ SpindleFlowEnv patched\")\n",
+    "\n",
+    "# Smoke test: build the env, reset, take one random action, print what came back\n",
+    "env = SpindleFlowEnv(\n",
+    "    config_path=\"configs/training_config.yaml\",\n",
+    "    catalog_path=\"configs/specialist_catalog.yaml\",\n",
+    "    use_real_spindleflow=False,\n",
+    "    phase=1,\n",
+    "    simulate_specialists=True,\n",
+    ")\n",
+    "try:\n",
+    "    obs, info = env.reset()\n",
+    "    print(f\"Observation shape : {obs.shape}\")\n",
+    "    print(f\"Task              : {info['task'][:80]}\")\n",
+    "\n",
+    "    action = env.action_space.sample()\n",
+    "    obs2, reward, terminated, truncated, info2 = env.step(action)\n",
+    "    print(f\"Step reward       : {reward:.4f}\")\n",
+    "    print(f\"Action name       : {info2['action_name']}\")\n",
+    "    print(f\"Reward components : {info2['reward_components']}\")\n",
+    "finally:\n",
+    "    # Close the env even when reset/step fails, so a re-run of this cell starts clean\n",
+    "    env.close()\n",
+    "print(\"✅ Environment OK\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 4 — HuggingFace TRL check\n",
+    "Confirms TRL is importable (hackathon requirement)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import trl, torch\n",
+    "\n",
+    "# Report library versions and the accelerator visible to torch\n",
+    "print(f\"TRL version   : {trl.__version__}\")\n",
+    "print(f\"Torch version : {torch.__version__}\")\n",
+    "_has_cuda = torch.cuda.is_available()\n",
+    "print(f\"CUDA available: {_has_cuda}\")\n",
+    "if _has_cuda:\n",
+    "    print(f\"GPU           : {torch.cuda.get_device_name(0)}\")\n",
+    "\n",
+    "# Name the first of these config classes that this TRL version exposes, if any\n",
+    "_found = next(\n",
+    "    (\n",
+    "        _candidate\n",
+    "        for _candidate in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\")\n",
+    "        if getattr(trl, _candidate, None) is not None\n",
+    "    ),\n",
+    "    None,\n",
+    ")\n",
+    "if _found is None:\n",
+    "    print(\"TRL imported ✅ (config uses TrainingArguments in this version)\")\n",
+    "else:\n",
+    "    print(f\"TRL config class: {_found} ✅\")\n",
+    "\n",
+    "print(\"✅ TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 5 — RecurrentPPO training\n",
+    "\n",
+    "**What's happening:**\n",
+    "- Per-step specialist calls: local simulation (fast, no API cost)\n",
+    "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
+    "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
+    "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
+    "\n",
+    "**Expected runtime: 20–30 min on T4 GPU**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import time, yaml\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from sb3_contrib import RecurrentPPO\n",
+    "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
+    "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
+    "from policy.lstm_policy import build_policy_kwargs\n",
+    "from training.curriculum import CurriculumManager\n",
+    "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
+    "\n",
+    "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
+    "\n",
+    "def _tlog(msg: str):\n",
+    "    \"\"\"Print `msg` with an HH:MM:SS prefix and append the same line to the training log file.\"\"\"\n",
+    "    stamped = f\"[{time.strftime('%H:%M:%S')}] {msg}\"\n",
+    "    print(stamped, flush=True)\n",
+    "    with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as _log:\n",
+    "        _log.write(stamped + \"\\n\")\n",
+    "\n",
+    "# Hyperparameters come from the repo's YAML; the curriculum reads the same file\n",
+    "_CONFIG_PATH = \"configs/training_config.yaml\"\n",
+    "with open(_CONFIG_PATH) as f:\n",
+    "    _cfg = yaml.safe_load(f)\n",
+    "curriculum = CurriculumManager(config_path=_CONFIG_PATH)\n",
+    "\n",
+    "TOTAL_TIMESTEPS = 100_000   # ~10k episodes, ~20-25 min on T4\n",
+    "\n",
+    "\n",
+    "class RewardLogger(BaseCallback):\n",
+    "    \"\"\"Track per-episode reward, feed it to the curriculum, and log progress.\n",
+    "\n",
+    "    Rewards are read un-normalized where possible. When the training env is a\n",
+    "    VecNormalize with norm_reward=True, ``self.locals[\"rewards\"]`` holds the rescaled\n",
+    "    values the optimizer sees, which are not meaningful as episode scores or as\n",
+    "    curriculum inputs.\n",
+    "\n",
+    "    Uses a single running total, so it assumes one environment in the vec env.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, curriculum):\n",
+    "        super().__init__()\n",
+    "        self.episode_rewards = []      # total reward of every finished episode, in order\n",
+    "        self._running = 0.0            # reward accumulated in the episode in progress\n",
+    "        self._curriculum = curriculum\n",
+    "\n",
+    "    def _step_rewards(self):\n",
+    "        \"\"\"Return this step's rewards, un-normalized when the env exposes them.\"\"\"\n",
+    "        get_raw = getattr(self.training_env, \"get_original_reward\", None)\n",
+    "        if callable(get_raw):\n",
+    "            return get_raw()\n",
+    "        return self.locals.get(\"rewards\", [])\n",
+    "\n",
+    "    def _on_step(self):\n",
+    "        \"\"\"Accumulate reward; on episode end record it, notify the curriculum, maybe log.\"\"\"\n",
+    "        for r, d in zip(self._step_rewards(), self.locals.get(\"dones\", [])):\n",
+    "            self._running += float(r)\n",
+    "            if d:\n",
+    "                ep = self._running\n",
+    "                self.episode_rewards.append(ep)\n",
+    "                self._running = 0.0\n",
+    "                advanced = self._curriculum.on_episode_end(ep)\n",
+    "                n = len(self.episode_rewards)\n",
+    "                # Log every 50th episode, and whenever on_episode_end returns truthy\n",
+    "                if advanced or n % 50 == 0:\n",
+    "                    _tlog(\n",
+    "                        f\"Ep {n:5d} | reward {ep:+.3f} | \"\n",
+    "                        f\"{self._curriculum.progress_str()}\"\n",
+    "                    )\n",
+    "        return True\n",
+    "\n",
+    "\n",
+    "def make_env():\n",
+    "    \"\"\"Build one phase-1 SpindleFlowEnv with simulated specialists.\"\"\"\n",
+    "    env_kwargs = dict(\n",
+    "        config_path=\"configs/training_config.yaml\",\n",
+    "        catalog_path=\"configs/specialist_catalog.yaml\",\n",
+    "        use_real_spindleflow=False,\n",
+    "        phase=1,\n",
+    "        simulate_specialists=True,\n",
+    "    )\n",
+    "    return SpindleFlowEnv(**env_kwargs)\n",
+    "\n",
+    "\n",
+    "# Single env, with observation and reward normalization on top\n",
+    "vec_env = VecNormalize(\n",
+    "    DummyVecEnv([make_env]),\n",
+    "    norm_obs=True, norm_reward=True, clip_obs=10.0,\n",
+    ")\n",
+    "\n",
+    "_ppo  = _cfg.get(\"ppo\",  {})\n",
+    "_lstm = _cfg.get(\"lstm\", {})\n",
+    "\n",
+    "# (config key == RecurrentPPO keyword, cast, fallback when the YAML omits the key)\n",
+    "_PPO_FIELDS = (\n",
+    "    (\"learning_rate\", float, 3e-4),\n",
+    "    (\"n_steps\",       int,   512),\n",
+    "    (\"batch_size\",    int,   64),\n",
+    "    (\"n_epochs\",      int,   10),\n",
+    "    (\"gamma\",         float, 0.99),\n",
+    "    (\"gae_lambda\",    float, 0.95),\n",
+    "    (\"clip_range\",    float, 0.2),\n",
+    "    (\"ent_coef\",      float, 0.01),\n",
+    "    (\"vf_coef\",       float, 0.5),\n",
+    "    (\"max_grad_norm\", float, 0.5),\n",
+    ")\n",
+    "_ppo_kwargs = {\n",
+    "    _key: _cast(_ppo.get(_key, _default))\n",
+    "    for _key, _cast, _default in _PPO_FIELDS\n",
+    "}\n",
+    "\n",
+    "model = RecurrentPPO(\n",
+    "    policy=\"MlpLstmPolicy\",\n",
+    "    env=vec_env,\n",
+    "    policy_kwargs=build_policy_kwargs(\n",
+    "        hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
+    "    ),\n",
+    "    verbose=0,\n",
+    "    seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
+    "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+    "    **_ppo_kwargs,\n",
+    ")\n",
+    "\n",
+    "_tlog(f\"Device          : {model.device}\")\n",
+    "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
+    "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}\")\n",
+    "_tlog(\"Training started...\")\n",
+    "\n",
+    "# Callbacks: episode logging + curriculum, periodic checkpoints, specialist improvement\n",
+    "_improve_every = _cfg.get(\"specialist_improvement\", {}).get(\n",
+    "    \"improve_every_n_episodes\", 100\n",
+    ")\n",
+    "reward_logger = RewardLogger(curriculum=curriculum)\n",
+    "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
+    "improvement_cb = SpecialistImprovementCallback(\n",
+    "    improve_every_n_episodes=_improve_every,\n",
+    "    verbose=1,\n",
+    ")\n",
+    "\n",
+    "_t0 = time.time()\n",
+    "model.learn(\n",
+    "    total_timesteps=TOTAL_TIMESTEPS,\n",
+    "    callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
+    ")\n",
+    "_elapsed = time.time() - _t0\n",
+    "\n",
+    "# Save the policy and the normalization statistics it was trained with\n",
+    "model.save(\"/content/spindleflow_colab_model\")\n",
+    "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
+    "\n",
+    "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
+    "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
+    "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
+    "print(\"\\n✅ Model saved to /content/spindleflow_colab_model.zip\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 6 — Reward curve\n",
+    "Generates publication-quality plot and saves JSON for the HF Space demo."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import json\n",
+    "import numpy as np\n",
+    "import matplotlib\n",
+    "matplotlib.use(\"Agg\")\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "ep_rewards = reward_logger.episode_rewards\n",
+    "if not ep_rewards:\n",
+    "    raise RuntimeError(\"No episodes completed — check Cell 5 output for errors.\")\n",
+    "\n",
+    "n_ep     = len(ep_rewards)\n",
+    "episodes = list(range(n_ep))\n",
+    "window   = max(30, n_ep // 20)  # adaptive: ~5% of total\n",
+    "\n",
+    "# Trailing mean over at most `window` episodes ending at episode i (inclusive), so the\n",
+    "# window size matches the \"{window}-ep mean\" label used in the plot legend.\n",
+    "smoothed = [\n",
+    "    float(np.mean(ep_rewards[max(0, i - window + 1):i + 1]))\n",
+    "    for i in range(n_ep)\n",
+    "]\n",
+    "\n",
+    "# Headline numbers: mean of the first <=50 episodes vs mean of the last <=200\n",
+    "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
+    "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
+    "improvement = final_mean - early_mean\n",
+    "\n",
+    "# ── Save JSON ──────────────────────────────────────────────────\n",
+    "# Subsample so the exported curve holds roughly 300 points\n",
+    "step      = max(1, n_ep // 300)\n",
+    "json_data = {\n",
+    "    \"episodes\":     episodes[::step],\n",
+    "    \"mean_rewards\": smoothed[::step],\n",
+    "}\n",
+    "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
+    "    json.dump(json_data, f)\n",
+    "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
+    "\n",
+    "# ── Plot ───────────────────────────────────────────────────────\n",
+    "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
+    "fig.patch.set_facecolor(\"#0d1117\")\n",
+    "ax.set_facecolor(\"#161b22\")\n",
+    "\n",
+    "# Subsample so at most ~800 points are drawn per series\n",
+    "plot_every = max(1, n_ep // 800)\n",
+    "ax.scatter(\n",
+    "    episodes[::plot_every], ep_rewards[::plot_every],\n",
+    "    s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
+    ")\n",
+    "ax.plot(\n",
+    "    episodes[::plot_every], smoothed[::plot_every],\n",
+    "    linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
+    "    label=f\"Smoothed ({window}-ep mean)\",\n",
+    ")\n",
+    "ax.axhline(\n",
+    "    y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
+    "    label=f\"Early baseline  {early_mean:+.3f}\",\n",
+    ")\n",
+    "ax.axhline(\n",
+    "    y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
+    "    label=f\"Final mean  {final_mean:+.3f}\",\n",
+    ")\n",
+    "\n",
+    "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
+    "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
+    "ax.set_title(\n",
+    "    \"SpindleFlow RL — Delegation Policy Learning Curve\\n\"\n",
+    "    f\"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes\",\n",
+    "    color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
+    ")\n",
+    "ax.tick_params(colors=\"#8b949e\")\n",
+    "for spine in ax.spines.values():\n",
+    "    spine.set_edgecolor(\"#30363d\")\n",
+    "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
+    "ax.legend(\n",
+    "    fontsize=10, framealpha=0.85,\n",
+    "    facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
+    ")\n",
+    "\n",
+    "# Annotate the early-to-final change midway between the two dashed lines\n",
+    "sign = \"▲\" if improvement >= 0 else \"▼\"\n",
+    "ax.annotate(\n",
+    "    f\"  {sign} {abs(improvement):.3f} reward improvement\",\n",
+    "    xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
+    "    color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
+    ")\n",
+    "\n",
+    "fig.tight_layout()\n",
+    "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
+    "            facecolor=fig.get_facecolor())\n",
+    "plt.close(fig)  # release the figure; the PNG on disk is the artifact\n",
+    "\n",
+    "# This cell selects the non-interactive Agg backend, under which plt.show() renders\n",
+    "# nothing in the notebook, so display the saved PNG instead.\n",
+    "from IPython.display import Image, display\n",
+    "display(Image(filename=\"/content/reward_curve.png\"))\n",
+    "\n",
+    "print(f\"\\n{'='*50}\")\n",
+    "print(f\"Episodes completed : {n_ep:,}\")\n",
+    "print(f\"Early baseline     : {early_mean:+.4f}\")\n",
+    "print(f\"Final mean         : {final_mean:+.4f}\")\n",
+    "print(f\"Improvement        : {improvement:+.4f}\")\n",
+    "print(f\"{'='*50}\")\n",
+    "print(\"✅ Reward curve saved to /content/reward_curve.png\")\n",
+    "\n",
+    "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 7 — Learning features audit\n",
+    "Reports the evidence each self-learning feature left during training (curriculum progress and memory files); a missing file means that feature did not fire."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import os, json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def _nonblank_lines(path):\n",
+    "    \"\"\"Return the non-blank lines of the text file at `path` (a pathlib.Path).\"\"\"\n",
+    "    return [l for l in path.read_text().splitlines() if l.strip()]\n",
+    "\n",
+    "\n",
+    "print(\"=\"*55)\n",
+    "print(\"LEARNING FEATURES AUDIT\")\n",
+    "print(\"=\"*55)\n",
+    "\n",
+    "# Feature 5 — Curriculum\n",
+    "print(f\"\\nFeature 5 — Curriculum (performance-gated)\")\n",
+    "print(f\"  Final phase        : {curriculum.current_phase}/3\")\n",
+    "print(f\"  Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
+    "print(f\"  {curriculum.progress_str()}\")\n",
+    "\n",
+    "# Feature 2 — Specialist memory\n",
+    "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
+    "    \"memory_path\", \"data/specialist_memory.json\"\n",
+    "))\n",
+    "print(f\"\\nFeature 2 — Specialist memory ({mem_path})\")\n",
+    "if mem_path.exists():\n",
+    "    data = json.loads(mem_path.read_text())\n",
+    "    total_entries = sum(len(v) for v in data.values())\n",
+    "    print(f\"  Specialists with memory : {len(data)}\")\n",
+    "    print(f\"  Total entries recorded  : {total_entries}\")\n",
+    "    # Preview the first three specialists only\n",
+    "    for sid, entries in list(data.items())[:3]:\n",
+    "        if not entries:\n",
+    "            # An empty list has no average; report it instead of dividing by zero\n",
+    "            print(f\"    {sid}: 0 entries\")\n",
+    "            continue\n",
+    "        avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
+    "        print(f\"    {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
+    "else:\n",
+    "    print(\"  No memory file yet (finetuner may not have fired — normal below 100 episodes)\")\n",
+    "\n",
+    "# Feature 3 — Spawn memory\n",
+    "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
+    "    \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
+    "))\n",
+    "print(f\"\\nFeature 3 — Spawn memory ({spawn_path})\")\n",
+    "if spawn_path.exists():\n",
+    "    lines = _nonblank_lines(spawn_path)\n",
+    "    print(f\"  Spawn records written: {len(lines)}\")\n",
+    "    for line in lines[:3]:\n",
+    "        rec = json.loads(line)\n",
+    "        print(f\"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
+    "              f\"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}\")\n",
+    "else:\n",
+    "    print(\"  No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
+    "\n",
+    "# Feature 4 — Resolution bandit\n",
+    "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
+    "    \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
+    "))\n",
+    "print(f\"\\nFeature 4 — Resolution bandit ({res_path})\")\n",
+    "if res_path.exists():\n",
+    "    lines = _nonblank_lines(res_path)\n",
+    "    print(f\"  Outcome records written: {len(lines)}\")\n",
+    "    # Group quality deltas by \"conflict_type/template_key\" and print the mean of each group\n",
+    "    stats = {}\n",
+    "    for line in lines:\n",
+    "        rec = json.loads(line)\n",
+    "        key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
+    "        stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
+    "    for k, deltas in stats.items():\n",
+    "        print(f\"    {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
+    "else:\n",
+    "    print(\"  No resolution memory yet (requires detected conflicts)\")\n",
+    "\n",
+    "print(\"\\n\" + \"=\"*55)\n",
+    "print(\"✅ Audit complete\")\n",
+    "print(\"=\"*55)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 8 — Push to HuggingFace Hub\n",
+    "\n",
+    "Uploads the model checkpoint, VecNormalize statistics, reward curve (PNG and JSON), training log, and README to `garvitsachdeva/spindleflow-rl`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "source": [
+    "import os, json\n",
+    "import numpy as np\n",
+    "from huggingface_hub import HfApi, CommitOperationAdd\n",
+    "\n",
+    "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
+    "api = HfApi(token=HF_TOKEN)\n",
+    "\n",
+    "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
+    "# Use the fully-qualified id so the repo that is created is the same one committed to\n",
+    "# below; a bare name would be created under whichever account owns the token.\n",
+    "api.create_repo(repo_id=HF_REPO, repo_type=\"model\", exist_ok=True)\n",
+    "\n",
+    "ep   = reward_logger.episode_rewards\n",
+    "\n",
+    "# Model card; early_mean / final_mean come from Cell 6, _elapsed from Cell 5\n",
+    "readme_text = f\"\"\"---\n",
+    "license: mit\n",
+    "tags:\n",
+    "  - reinforcement-learning\n",
+    "  - stable-baselines3\n",
+    "  - sb3-contrib\n",
+    "  - gymnasium\n",
+    "  - multi-agent\n",
+    "  - openenv\n",
+    "library_name: stable-baselines3\n",
+    "---\n",
+    "\n",
+    "# SpindleFlow RL — Delegation Policy\n",
+    "\n",
+    "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).  \n",
+    "Trained on Google Colab T4 GPU.\n",
+    "\n",
+    "## Training summary\n",
+    "| Metric | Value |\n",
+    "|---|---|\n",
+    "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
+    "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
+    "| Episodes completed | {len(ep):,} |\n",
+    "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
+    "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
+    "| Improvement | {final_mean - early_mean:+.4f} |\n",
+    "| Training time | {_elapsed/60:.1f} min |\n",
+    "| Device | T4 GPU |\n",
+    "\n",
+    "![Reward Curve](reward_curve.png)\n",
+    "\n",
+    "## Load\n",
+    "```python\n",
+    "from sb3_contrib import RecurrentPPO\n",
+    "from huggingface_hub import hf_hub_download\n",
+    "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
+    "```\n",
+    "\"\"\"\n",
+    "\n",
+    "readme_path = \"/content/README_model.md\"\n",
+    "with open(readme_path, \"w\") as f:\n",
+    "    f.write(readme_text)\n",
+    "\n",
+    "# (local path, path in the Hub repo); files missing locally are skipped\n",
+    "candidates = [\n",
+    "    (\"/content/spindleflow_colab_model.zip\",    \"spindleflow_model.zip\"),\n",
+    "    (\"/content/vec_normalize_colab.pkl\",         \"vec_normalize.pkl\"),\n",
+    "    (\"/content/reward_curve.png\",                \"reward_curve.png\"),\n",
+    "    (\"/content/demo/assets/reward_curve.json\",   \"reward_curve.json\"),\n",
+    "    (\"/content/logs/training_log.txt\",           \"training_log.txt\"),\n",
+    "    (readme_path,                                \"README.md\"),\n",
+    "]\n",
+    "\n",
+    "ops = [\n",
+    "    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
+    "    for src, dst in candidates\n",
+    "    if os.path.exists(src)\n",
+    "]\n",
+    "\n",
+    "# One commit for all files\n",
+    "api.create_commit(\n",
+    "    repo_id=HF_REPO,\n",
+    "    repo_type=\"model\",\n",
+    "    operations=ops,\n",
+    "    commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
+    "    token=HF_TOKEN,\n",
+    ")\n",
+    "\n",
+    "_tlog(f\"Uploaded {len(ops)} files:\")\n",
+    "for _op in ops:\n",
+    "    _tlog(f\"  ✓ {_op.path_in_repo}\")\n",
+    "\n",
+    "_tlog(f\"Model       : https://huggingface.co/{HF_REPO}\")\n",
+    "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
+    "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
+    "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
+    "print(\"\\n✅ All done!\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  }
+ ]
+}