Spaces:

garvitsachdeva
/

SpindleFlow-RL

Runtime error

App Files Files Community

garvitsachdeva Claude Sonnet 4.6 committed on Apr 25

Commit

13dd91f

1 Parent(s): c2b373f

Add Colab notebook: 8 runnable cells, both secrets, log + curve

Browse files

Files changed (1) hide show

colab/SpindleFlow_RL_Training.ipynb +672 -0

colab/SpindleFlow_RL_Training.ipynb ADDED Viewed

	@@ -0,0 +1,672 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4",
+      "name": "SpindleFlow_RL_Training.ipynb"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# SpindleFlow RL — Training Notebook\n",
+        "\n",
+        "**Hardware**: Runtime → Change runtime type → **T4 GPU**\n",
+        "\n",
+        "**Secrets** (key icon in left sidebar → Manage secrets):\n",
+        "\n",
+        "| Name | Required | Notes |\n",
+        "|---|---|---|\n",
+        "| `HF_TOKEN` | ✅ Yes | HuggingFace write token — hf.co/settings/tokens → New token (write) |\n",
+        "| `OPENAI_API_KEY` | ✅ Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
+        "\n",
+        "Run cells **top to bottom, one at a time**. Do NOT skip cells."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 1 — Install dependencies & clone repo\n",
+        "Run once. After it finishes, **do NOT restart the runtime** — continue to Cell 2."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import subprocess, os, sys\n",
+        "\n",
+        "# Packages required by the notebook. `audioop-lts` is only published for\n",
+        "# Python 3.13+ (it backports the removed stdlib `audioop`), so requesting it on\n",
+        "# an older interpreter makes the whole `pip install` fail.\n",
+        "_packages = [\n",
+        "    \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n",
+        "    \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n",
+        "    \"transformers\", \"datasets\", \"torch\",\n",
+        "    \"matplotlib\", \"huggingface_hub\",\n",
+        "]\n",
+        "if sys.version_info >= (3, 13):\n",
+        "    _packages.append(\"audioop-lts\")\n",
+        "\n",
+        "# Go through `sys.executable -m pip` so the install targets the interpreter\n",
+        "# running this kernel rather than whichever `pip` happens to be first on PATH.\n",
+        "subprocess.run(\n",
+        "    [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", *_packages],\n",
+        "    check=True,\n",
+        ")\n",
+        "print(\"✅ Packages installed\")\n",
+        "\n",
+        "# Clone on the first run; on re-runs just pull the existing checkout.\n",
+        "REPO = \"/content/kuchbhi/spindleflow-rl\"\n",
+        "if not os.path.isdir(REPO):\n",
+        "    subprocess.run(\n",
+        "        [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n",
+        "        cwd=\"/content\", check=True,\n",
+        "    )\n",
+        "    print(\"✅ Repo cloned\")\n",
+        "else:\n",
+        "    print(\"Repo already present — pulling latest\")\n",
+        "    subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n",
+        "\n",
+        "os.chdir(REPO)\n",
+        "# Register the absolute repo path: a relative \".\" entry changes meaning if the\n",
+        "# working directory changes later, and re-running the cell would stack duplicates.\n",
+        "if REPO not in sys.path:\n",
+        "    sys.path.insert(0, REPO)\n",
+        "\n",
+        "import importlib.metadata\n",
+        "print(f\"OpenEnv version  : {importlib.metadata.version('openenv')}\")\n",
+        "\n",
+        "# Output locations written by the later cells (plot JSON, checkpoints, log).\n",
+        "os.makedirs(\"/content/demo/assets\", exist_ok=True)\n",
+        "os.makedirs(\"/content/data\",         exist_ok=True)\n",
+        "os.makedirs(\"/content/checkpoints\",  exist_ok=True)\n",
+        "os.makedirs(\"/content/logs\",         exist_ok=True)\n",
+        "\n",
+        "print(f\"Working directory: {os.getcwd()}\")\n",
+        "print(\"✅ Setup complete\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 2 — Set secrets & verify\n",
+        "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets.  \n",
+        "**Both must show ✅ before continuing.**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import os\n",
+        "from google.colab import userdata\n",
+        "\n",
+        "def _read_secret(name, value_hint):\n",
+        "    \"\"\"Return Colab secret `name`, or raise RuntimeError with setup instructions.\n",
+        "\n",
+        "    `userdata.get` raises (rather than returning None) when the secret does not\n",
+        "    exist or notebook access is disabled, so those errors are translated here\n",
+        "    into the same actionable message as an empty value.\n",
+        "    \"\"\"\n",
+        "    try:\n",
+        "        value = userdata.get(name)\n",
+        "    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):\n",
+        "        value = None\n",
+        "    if not value:\n",
+        "        raise RuntimeError(\n",
+        "            f\"{name} not set.\\n\"\n",
+        "            f\"Go to the key icon (left sidebar) → Add secret → Name: {name}, \"\n",
+        "            f\"Value: {value_hint} → enable notebook access.\"\n",
+        "        )\n",
+        "    return value\n",
+        "\n",
+        "\n",
+        "HF_TOKEN = _read_secret(\"HF_TOKEN\", \"your write token from hf.co/settings/tokens\")\n",
+        "OPENAI_API_KEY = _read_secret(\"OPENAI_API_KEY\", \"sk-...\")\n",
+        "\n",
+        "# Export the OpenAI key so modules that read it from the environment pick it up.\n",
+        "# HF_TOKEN stays a notebook variable and is passed explicitly in Cell 8.\n",
+        "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
+        "\n",
+        "# Never echo any part of a secret: cell output is saved with the notebook.\n",
+        "print(f\"✅ HF_TOKEN      : loaded ({len(HF_TOKEN)} chars)\")\n",
+        "print(f\"✅ OPENAI_API_KEY: loaded ({len(OPENAI_API_KEY)} chars)\")\n",
+        "print(\"Both secrets loaded — proceeding.\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 3 — Patch env + smoke test\n",
+        "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import os as _os\n",
+        "import numpy as np\n",
+        "from env.spindleflow_env import SpindleFlowEnv\n",
+        "\n",
+        "# Install the `simulate_specialists` hooks on SpindleFlowEnv, once per kernel.\n",
+        "if getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
+        "    print(\"Already patched — skipping\")\n",
+        "else:\n",
+        "    _orig_init = SpindleFlowEnv.__init__\n",
+        "\n",
+        "    def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
+        "        # Swallow the extra keyword, delegate, then remember the flag on the instance.\n",
+        "        _orig_init(self, *args, **kwargs)\n",
+        "        self.simulate_specialists = simulate_specialists\n",
+        "\n",
+        "    SpindleFlowEnv.__init__ = _new_init\n",
+        "\n",
+        "    _orig_call = SpindleFlowEnv._call_specialist\n",
+        "\n",
+        "    def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
+        "        if not getattr(self, \"simulate_specialists\", False):\n",
+        "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
+        "        # Hide the API key for the duration of the call and restore it afterwards,\n",
+        "        # even if the call raises (presumably this makes the env take its local\n",
+        "        # simulation path — confirm in env/spindleflow_env.py).\n",
+        "        _hidden_key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
+        "        try:\n",
+        "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
+        "        finally:\n",
+        "            if _hidden_key:\n",
+        "                _os.environ[\"OPENAI_API_KEY\"] = _hidden_key\n",
+        "\n",
+        "    SpindleFlowEnv._call_specialist = _new_call\n",
+        "    SpindleFlowEnv._simulate_patched = True\n",
+        "    print(\"✅ SpindleFlowEnv patched\")\n",
+        "\n",
+        "# Smoke test: build one env, reset it, take a single random step, report.\n",
+        "env = SpindleFlowEnv(\n",
+        "    config_path=\"configs/training_config.yaml\",\n",
+        "    catalog_path=\"configs/specialist_catalog.yaml\",\n",
+        "    use_real_spindleflow=False,\n",
+        "    phase=1,\n",
+        "    simulate_specialists=True,\n",
+        ")\n",
+        "try:\n",
+        "    obs, info = env.reset()\n",
+        "    print(f\"Observation shape : {obs.shape}\")\n",
+        "    print(f\"Task              : {info['task'][:80]}\")\n",
+        "\n",
+        "    action = env.action_space.sample()\n",
+        "    obs2, reward, terminated, truncated, info2 = env.step(action)\n",
+        "    print(f\"Step reward       : {reward:.4f}\")\n",
+        "    print(f\"Action name       : {info2['action_name']}\")\n",
+        "    print(f\"Reward components : {info2['reward_components']}\")\n",
+        "finally:\n",
+        "    # Release the env even when reset/step fails, so a broken smoke test does\n",
+        "    # not leave a half-initialised environment alive in the kernel.\n",
+        "    env.close()\n",
+        "print(\"✅ Environment OK\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 4 — HuggingFace TRL check\n",
+        "Confirms TRL is importable (hackathon requirement)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import trl, torch\n",
+        "\n",
+        "# Report library versions and the accelerator visible to torch.\n",
+        "_has_cuda = torch.cuda.is_available()\n",
+        "print(f\"TRL version   : {trl.__version__}\")\n",
+        "print(f\"Torch version : {torch.__version__}\")\n",
+        "print(f\"CUDA available: {_has_cuda}\")\n",
+        "if _has_cuda:\n",
+        "    print(f\"GPU           : {torch.cuda.get_device_name(0)}\")\n",
+        "\n",
+        "# Name the first TRL config class this TRL version exposes, if any.\n",
+        "_found = next(\n",
+        "    (\n",
+        "        _candidate\n",
+        "        for _candidate in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\")\n",
+        "        if getattr(trl, _candidate, None) is not None\n",
+        "    ),\n",
+        "    None,\n",
+        ")\n",
+        "if _found is None:\n",
+        "    print(\"TRL imported ✅ (config uses TrainingArguments in this version)\")\n",
+        "else:\n",
+        "    print(f\"TRL config class: {_found} ✅\")\n",
+        "\n",
+        "print(\"✅ TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 5 — RecurrentPPO training\n",
+        "\n",
+        "**What's happening:**\n",
+        "- Per-step specialist calls: local simulation (fast, no API cost)\n",
+        "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
+        "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
+        "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
+        "\n",
+        "**Expected runtime: 20–30 min on T4 GPU**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import time, yaml\n",
+        "import torch\n",
+        "import numpy as np\n",
+        "from sb3_contrib import RecurrentPPO\n",
+        "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
+        "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
+        "from policy.lstm_policy import build_policy_kwargs\n",
+        "from training.curriculum import CurriculumManager\n",
+        "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
+        "\n",
+        "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
+        "\n",
+        "def _tlog(msg: str):\n",
+        "    \"\"\"Print `msg` with an HH:MM:SS prefix and append the same line to `_LOG_FILE`.\"\"\"\n",
+        "    stamped = \"[{}] {}\".format(time.strftime(\"%H:%M:%S\"), msg)\n",
+        "    print(stamped, flush=True)\n",
+        "    # Re-open in append mode on every call so the log on disk is always current.\n",
+        "    with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as log_fh:\n",
+        "        log_fh.write(f\"{stamped}\\n\")\n",
+        "\n",
+        "with open(\"configs/training_config.yaml\", encoding=\"utf-8\") as f:\n",
+        "    # safe_load returns None for an empty document; fall back to {} so the\n",
+        "    # `_cfg.get(...)` lookups further down still resolve to their defaults.\n",
+        "    _cfg = yaml.safe_load(f) or {}\n",
+        "\n",
+        "# Tracks curriculum phase; fed one episode reward at a time by RewardLogger.\n",
+        "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
+        "\n",
+        "TOTAL_TIMESTEPS = 100_000   # ~10k episodes, ~20-25 min on T4\n",
+        "\n",
+        "\n",
+        "class RewardLogger(BaseCallback):\n",
+        "    \"\"\"Accumulate per-episode rewards, feed the curriculum, and log progress.\n",
+        "\n",
+        "    Attributes:\n",
+        "        episode_rewards: total reward of every finished episode, in order.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, curriculum):\n",
+        "        super().__init__()\n",
+        "        self.episode_rewards = []\n",
+        "        # Running reward total per vectorised-env index, so episodes from\n",
+        "        # different envs are never summed together.\n",
+        "        self._running = {}\n",
+        "        self._curriculum = curriculum\n",
+        "\n",
+        "    def _step_rewards(self):\n",
+        "        \"\"\"Return this step's rewards, unnormalized when the env can provide them.\n",
+        "\n",
+        "        Under `VecNormalize(norm_reward=True)` the `rewards` local holds rescaled\n",
+        "        values whose scale drifts during training; `get_original_reward()` gives\n",
+        "        the raw environment rewards for the same step.\n",
+        "        \"\"\"\n",
+        "        get_raw = getattr(self.training_env, \"get_original_reward\", None)\n",
+        "        if callable(get_raw):\n",
+        "            return get_raw()\n",
+        "        return self.locals.get(\"rewards\", [])\n",
+        "\n",
+        "    def _on_step(self):\n",
+        "        dones = self.locals.get(\"dones\", [])\n",
+        "        for idx, (r, d) in enumerate(zip(self._step_rewards(), dones)):\n",
+        "            total = self._running.get(idx, 0.0) + float(r)\n",
+        "            if not d:\n",
+        "                self._running[idx] = total\n",
+        "                continue\n",
+        "            # Episode finished: record it, reset the accumulator, update curriculum.\n",
+        "            self._running[idx] = 0.0\n",
+        "            self.episode_rewards.append(total)\n",
+        "            advanced = self._curriculum.on_episode_end(total)\n",
+        "            n = len(self.episode_rewards)\n",
+        "            # Log on every curriculum advance and otherwise every 50th episode.\n",
+        "            if advanced or n % 50 == 0:\n",
+        "                _tlog(\n",
+        "                    f\"Ep {n:5d} | reward {total:+.3f} | \"\n",
+        "                    f\"{self._curriculum.progress_str()}\"\n",
+        "                )\n",
+        "        # Returning True tells SB3 to keep training.\n",
+        "        return True\n",
+        "\n",
+        "\n",
+        "def make_env():\n",
+        "    \"\"\"Factory for DummyVecEnv: one phase-1 SpindleFlowEnv with simulated specialists.\"\"\"\n",
+        "    env_kwargs = dict(\n",
+        "        config_path=\"configs/training_config.yaml\",\n",
+        "        catalog_path=\"configs/specialist_catalog.yaml\",\n",
+        "        use_real_spindleflow=False,\n",
+        "        phase=1,\n",
+        "        simulate_specialists=True,\n",
+        "    )\n",
+        "    return SpindleFlowEnv(**env_kwargs)\n",
+        "\n",
+        "\n",
+        "# Hyper-parameter sections of the YAML config; absent keys use the defaults below.\n",
+        "_ppo  = _cfg.get(\"ppo\",  {})\n",
+        "_lstm = _cfg.get(\"lstm\", {})\n",
+        "_gamma = float(_ppo.get(\"gamma\", 0.99))\n",
+        "\n",
+        "vec_env = DummyVecEnv([make_env])\n",
+        "# Reward normalization is based on a discounted return estimate, so it must use\n",
+        "# the same discount factor as the learner instead of VecNormalize's own default.\n",
+        "vec_env = VecNormalize(\n",
+        "    vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0, gamma=_gamma,\n",
+        ")\n",
+        "\n",
+        "model = RecurrentPPO(\n",
+        "    policy=\"MlpLstmPolicy\",\n",
+        "    env=vec_env,\n",
+        "    learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
+        "    n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
+        "    batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
+        "    n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
+        "    gamma=_gamma,\n",
+        "    gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
+        "    clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
+        "    ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
+        "    vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
+        "    max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
+        "    policy_kwargs=build_policy_kwargs(\n",
+        "        hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
+        "    ),\n",
+        "    verbose=0,\n",
+        "    seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
+        "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
+        ")\n",
+        "\n",
+        "_tlog(f\"Device          : {model.device}\")\n",
+        "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
+        "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}\")\n",
+        "_tlog(\"Training started...\")\n",
+        "\n",
+        "reward_logger = RewardLogger(curriculum=curriculum)\n",
+        "checkpoint_cb = CheckpointCallback(\n",
+        "    save_freq=10_000,\n",
+        "    save_path=\"/content/checkpoints/\",\n",
+        "    # The env is wrapped in VecNormalize: without its running statistics a\n",
+        "    # checkpointed policy cannot be resumed or evaluated on the same scale.\n",
+        "    save_vecnormalize=True,\n",
+        ")\n",
+        "improvement_cb = SpecialistImprovementCallback(\n",
+        "    improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
+        "        \"improve_every_n_episodes\", 100\n",
+        "    ),\n",
+        "    verbose=1,\n",
+        ")\n",
+        "\n",
+        "# Time only the learn() call; `_elapsed` is reused by the README in Cell 8.\n",
+        "_t0 = time.time()\n",
+        "model.learn(\n",
+        "    total_timesteps=TOTAL_TIMESTEPS,\n",
+        "    callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
+        ")\n",
+        "_elapsed = time.time() - _t0\n",
+        "\n",
+        "# Persist the final policy together with the normalization statistics it needs.\n",
+        "model.save(\"/content/spindleflow_colab_model\")\n",
+        "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
+        "\n",
+        "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
+        "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
+        "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
+        "print(\"\\n✅ Model saved to /content/spindleflow_colab_model.zip\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 6 — Reward curve\n",
+        "Generates a publication-quality plot and saves JSON for the HF Space demo."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import json\n",
+        "import numpy as np\n",
+        "import matplotlib\n",
+        "# Do not force the non-interactive Agg backend here: under Agg `plt.show()` is a\n",
+        "# no-op, so the curve would be written to disk but never shown in the notebook.\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "ep_rewards = reward_logger.episode_rewards\n",
+        "if not ep_rewards:\n",
+        "    raise RuntimeError(\"No episodes completed — check Cell 5 output for errors.\")\n",
+        "\n",
+        "n_ep     = len(ep_rewards)\n",
+        "episodes = list(range(n_ep))\n",
+        "window   = max(30, n_ep // 20)  # adaptive: ~5% of total\n",
+        "\n",
+        "# Trailing mean over exactly `window` episodes (fewer at the start), so the data\n",
+        "# matches the \"{window}-ep mean\" label used in the legend.\n",
+        "smoothed = [\n",
+        "    float(np.mean(ep_rewards[max(0, i - window + 1):i + 1]))\n",
+        "    for i in range(n_ep)\n",
+        "]\n",
+        "\n",
+        "# Baseline = first 50 episodes, final = last 200 (or all of them if fewer).\n",
+        "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
+        "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
+        "improvement = final_mean - early_mean\n",
+        "\n",
+        "# ── Save JSON ──────────────────────────────────────────────────\n",
+        "# Down-sample so the demo JSON stays at roughly 300 points.\n",
+        "step      = max(1, n_ep // 300)\n",
+        "json_data = {\n",
+        "    \"episodes\":     episodes[::step],\n",
+        "    \"mean_rewards\": smoothed[::step],\n",
+        "}\n",
+        "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
+        "    json.dump(json_data, f)\n",
+        "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
+        "\n",
+        "# ── Plot ───────────────────────────────────────────────────────\n",
+        "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
+        "fig.patch.set_facecolor(\"#0d1117\")\n",
+        "ax.set_facecolor(\"#161b22\")\n",
+        "\n",
+        "# Thin the series to at most ~800 drawn points.\n",
+        "plot_every = max(1, n_ep // 800)\n",
+        "ax.scatter(\n",
+        "    episodes[::plot_every], ep_rewards[::plot_every],\n",
+        "    s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
+        ")\n",
+        "ax.plot(\n",
+        "    episodes[::plot_every], smoothed[::plot_every],\n",
+        "    linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
+        "    label=f\"Smoothed ({window}-ep mean)\",\n",
+        ")\n",
+        "ax.axhline(\n",
+        "    y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
+        "    label=f\"Early baseline  {early_mean:+.3f}\",\n",
+        ")\n",
+        "ax.axhline(\n",
+        "    y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
+        "    label=f\"Final mean  {final_mean:+.3f}\",\n",
+        ")\n",
+        "\n",
+        "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
+        "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
+        "ax.set_title(\n",
+        "    \"SpindleFlow RL — Delegation Policy Learning Curve\\n\"\n",
+        "    f\"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes\",\n",
+        "    color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
+        ")\n",
+        "ax.tick_params(colors=\"#8b949e\")\n",
+        "for spine in ax.spines.values():\n",
+        "    spine.set_edgecolor(\"#30363d\")\n",
+        "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
+        "ax.legend(\n",
+        "    fontsize=10, framealpha=0.85,\n",
+        "    facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
+        ")\n",
+        "\n",
+        "sign = \"▲\" if improvement >= 0 else \"▼\"\n",
+        "ax.annotate(\n",
+        "    f\"  {sign} {abs(improvement):.3f} reward improvement\",\n",
+        "    xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
+        "    color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
+        ")\n",
+        "\n",
+        "fig.tight_layout()\n",
+        "# Save before showing: the inline backend closes the figure once it is displayed.\n",
+        "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
+        "            facecolor=fig.get_facecolor())\n",
+        "plt.show()\n",
+        "\n",
+        "print(f\"\\n{'='*50}\")\n",
+        "print(f\"Episodes completed : {n_ep:,}\")\n",
+        "print(f\"Early baseline     : {early_mean:+.4f}\")\n",
+        "print(f\"Final mean         : {final_mean:+.4f}\")\n",
+        "print(f\"Improvement        : {improvement:+.4f}\")\n",
+        "print(f\"{'='*50}\")\n",
+        "print(\"✅ Reward curve saved to /content/reward_curve.png\")\n",
+        "\n",
+        "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 7 — Learning features audit\n",
+        "Reports, for each self-learning feature, whether it fired during training and what it recorded."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import os, json\n",
+        "from pathlib import Path\n",
+        "\n",
+        "print(\"=\"*55)\n",
+        "print(\"LEARNING FEATURES AUDIT\")\n",
+        "print(\"=\"*55)\n",
+        "\n",
+        "# Feature 5 — Curriculum\n",
+        "print(f\"\\nFeature 5 — Curriculum (performance-gated)\")\n",
+        "print(f\"  Final phase        : {curriculum.current_phase}/3\")\n",
+        "print(f\"  Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
+        "print(f\"  {curriculum.progress_str()}\")\n",
+        "\n",
+        "# Feature 2 — Specialist memory\n",
+        "# Paths come from the training config and are relative to the repo working dir.\n",
+        "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
+        "    \"memory_path\", \"data/specialist_memory.json\"\n",
+        "))\n",
+        "print(f\"\\nFeature 2 — Specialist memory ({mem_path})\")\n",
+        "if mem_path.exists():\n",
+        "    data = json.loads(mem_path.read_text(encoding=\"utf-8\"))\n",
+        "    total_entries = sum(len(v) for v in data.values())\n",
+        "    print(f\"  Specialists with memory : {len(data)}\")\n",
+        "    print(f\"  Total entries recorded  : {total_entries}\")\n",
+        "    # Show at most the first three specialists as a sample.\n",
+        "    for sid, entries in list(data.items())[:3]:\n",
+        "        if not entries:\n",
+        "            # A specialist can be listed with no entries; avoid dividing by zero.\n",
+        "            print(f\"    {sid}: 0 entries\")\n",
+        "            continue\n",
+        "        avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
+        "        print(f\"    {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
+        "else:\n",
+        "    print(\"  No memory file yet (finetuner may not have fired — normal below 100 episodes)\")\n",
+        "\n",
+        "# Feature 3 — Spawn memory\n",
+        "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
+        "    \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
+        "))\n",
+        "print(f\"\\nFeature 3 — Spawn memory ({spawn_path})\")\n",
+        "if spawn_path.exists():\n",
+        "    # JSONL: one record per non-blank line.\n",
+        "    lines = [l for l in spawn_path.read_text(encoding=\"utf-8\").splitlines() if l.strip()]\n",
+        "    print(f\"  Spawn records written: {len(lines)}\")\n",
+        "    for line in lines[:3]:\n",
+        "        rec = json.loads(line)\n",
+        "        print(f\"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
+        "              f\"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}\")\n",
+        "else:\n",
+        "    print(\"  No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
+        "\n",
+        "# Feature 4 — Resolution bandit\n",
+        "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
+        "    \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
+        "))\n",
+        "print(f\"\\nFeature 4 — Resolution bandit ({res_path})\")\n",
+        "if res_path.exists():\n",
+        "    lines = [l for l in res_path.read_text(encoding=\"utf-8\").splitlines() if l.strip()]\n",
+        "    print(f\"  Outcome records written: {len(lines)}\")\n",
+        "    # Group quality deltas by \"conflict_type/template_key\" and report the mean of each.\n",
+        "    stats = {}\n",
+        "    for line in lines:\n",
+        "        rec = json.loads(line)\n",
+        "        key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
+        "        stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
+        "    for k, deltas in stats.items():\n",
+        "        print(f\"    {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
+        "else:\n",
+        "    print(\"  No resolution memory yet (requires detected conflicts)\")\n",
+        "\n",
+        "print(\"\\n\" + \"=\"*55)\n",
+        "print(\"✅ Audit complete\")\n",
+        "print(\"=\"*55)"
+      ],
+      "outputs": [],
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cell 8 — Push to HuggingFace Hub\n",
+        "\n",
+        "Uploads model checkpoint, reward curve, training log, and README to `garvitsachdeva/spindleflow-rl`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import os, json\n",
+        "import numpy as np\n",
+        "from huggingface_hub import HfApi, CommitOperationAdd\n",
+        "\n",
+        "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
+        "api = HfApi(token=HF_TOKEN)\n",
+        "\n",
+        "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
+        "# Create exactly the repo that is committed to below. Passing only the bare name\n",
+        "# would create it under the token owner's namespace, which may be a different repo.\n",
+        "api.create_repo(repo_id=HF_REPO, repo_type=\"model\", exist_ok=True)\n",
+        "\n",
+        "ep = reward_logger.episode_rewards\n",
+        "\n",
+        "# Model card; the summary numbers come from Cell 5 (`_elapsed`) and Cell 6\n",
+        "# (`early_mean`, `final_mean`), so those cells must have run first.\n",
+        "readme_text = f\"\"\"---\n",
+        "license: mit\n",
+        "tags:\n",
+        "  - reinforcement-learning\n",
+        "  - stable-baselines3\n",
+        "  - sb3-contrib\n",
+        "  - gymnasium\n",
+        "  - multi-agent\n",
+        "  - openenv\n",
+        "library_name: stable-baselines3\n",
+        "---\n",
+        "\n",
+        "# SpindleFlow RL — Delegation Policy\n",
+        "\n",
+        "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).  \n",
+        "Trained on Google Colab T4 GPU.\n",
+        "\n",
+        "## Training summary\n",
+        "| Metric | Value |\n",
+        "|---|---|\n",
+        "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
+        "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
+        "| Episodes completed | {len(ep):,} |\n",
+        "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
+        "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
+        "| Improvement | {final_mean - early_mean:+.4f} |\n",
+        "| Training time | {_elapsed/60:.1f} min |\n",
+        "| Device | T4 GPU |\n",
+        "\n",
+        "![Reward Curve](reward_curve.png)\n",
+        "\n",
+        "## Load\n",
+        "```python\n",
+        "from sb3_contrib import RecurrentPPO\n",
+        "from huggingface_hub import hf_hub_download\n",
+        "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
+        "```\n",
+        "\"\"\"\n",
+        "\n",
+        "readme_path = \"/content/README_model.md\"\n",
+        "with open(readme_path, \"w\") as f:\n",
+        "    f.write(readme_text)\n",
+        "\n",
+        "# (local file, path inside the Hub repo)\n",
+        "candidates = [\n",
+        "    (\"/content/spindleflow_colab_model.zip\",    \"spindleflow_model.zip\"),\n",
+        "    (\"/content/vec_normalize_colab.pkl\",         \"vec_normalize.pkl\"),\n",
+        "    (\"/content/reward_curve.png\",                \"reward_curve.png\"),\n",
+        "    (\"/content/demo/assets/reward_curve.json\",   \"reward_curve.json\"),\n",
+        "    (\"/content/logs/training_log.txt\",           \"training_log.txt\"),\n",
+        "    (readme_path,                                \"README.md\"),\n",
+        "]\n",
+        "\n",
+        "# Decide once which files exist, so the commit and the log below always agree.\n",
+        "uploads = [(src, dst) for src, dst in candidates if os.path.exists(src)]\n",
+        "ops = [\n",
+        "    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
+        "    for src, dst in uploads\n",
+        "]\n",
+        "\n",
+        "# Single commit containing every artifact that was found.\n",
+        "api.create_commit(\n",
+        "    repo_id=HF_REPO,\n",
+        "    repo_type=\"model\",\n",
+        "    operations=ops,\n",
+        "    commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
+        "    token=HF_TOKEN,\n",
+        ")\n",
+        "\n",
+        "_tlog(f\"Uploaded {len(ops)} files:\")\n",
+        "for _, dst in uploads:\n",
+        "    _tlog(f\"  ✓ {dst}\")\n",
+        "\n",
+        "_tlog(f\"Model       : https://huggingface.co/{HF_REPO}\")\n",
+        "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
+        "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
+        "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
+        "print(\"\\n✅ All done!\")"
+      ],
+      "outputs": [],
+      "execution_count": null
+    }
+  ]
+}