Spaces:

garvitsachdeva
/

SpindleFlow-RL

Runtime error

App Files Files Community

garvitsachdeva Claude Sonnet 4.6 committed on Apr 25

Commit

c8407b4

1 Parent(s): c77f83c

Remove Colab notebook — using plain .py script instead

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

colab/SpindleFlow_RL_Training.ipynb +0 -636

colab/SpindleFlow_RL_Training.ipynb DELETED Viewed

@@ -1,636 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
-  "colab": {
-   "provenance": [],
-   "gpuType": "T4",
-   "name": "SpindleFlow_RL_Training.ipynb"
-  },
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3"
-  },
-  "language_info": {
-   "name": "python"
-  },
-  "accelerator": "GPU"
- },
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# SpindleFlow RL — Training Notebook\n",
-    "\n",
-    "**Hardware**: Runtime → Change runtime type → **T4 GPU**\n",
-    "\n",
-    "**Secrets** (key icon in left sidebar → Manage secrets):\n",
-    "\n",
-    "| Name | Required | Notes |\n",
-    "|---|---|---|\n",
-    "| `HF_TOKEN` | ✅ Yes | HuggingFace write token — hf.co/settings/tokens → New token (write) |\n",
-    "| `OPENAI_API_KEY` | ✅ Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
-    "\n",
-    "Run cells **top to bottom, one at a time**. Do NOT skip cells."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 1 — Install dependencies & clone repo\n",
-    "Run once. After it finishes, **do NOT restart the runtime** — continue to Cell 2."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import importlib.metadata\n",
-    "import os\n",
-    "import subprocess\n",
-    "import sys\n",
-    "\n",
-    "print(f\"Python version: {sys.version}\")\n",
-    "\n",
-    "# audioop-lts is only for Python 3.13+ — Colab uses 3.12 where audioop is built-in\n",
-    "_pkgs = [\n",
-    "    \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n",
-    "    \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n",
-    "    \"transformers\", \"datasets\", \"torch\",\n",
-    "    \"matplotlib\", \"huggingface_hub\",\n",
-    "]\n",
-    "if sys.version_info >= (3, 13):\n",
-    "    _pkgs.append(\"audioop-lts\")\n",
-    "\n",
-    "# Run pip through the kernel's own interpreter so the packages are installed into\n",
-    "# the environment this notebook imports from, not whichever `pip` is first on PATH.\n",
-    "result = subprocess.run(\n",
-    "    [sys.executable, \"-m\", \"pip\", \"install\", *_pkgs],\n",
-    "    capture_output=True, text=True,\n",
-    ")\n",
-    "if result.returncode != 0:\n",
-    "    # Only the tail of each stream is shown; pip's error summary is at the end.\n",
-    "    print(\"STDOUT:\", result.stdout[-3000:])\n",
-    "    print(\"STDERR:\", result.stderr[-3000:])\n",
-    "    raise RuntimeError(\"pip install failed — see output above\")\n",
-    "print(\"✅ Packages installed\")\n",
-    "\n",
-    "REPO = \"/content/kuchbhi/spindleflow-rl\"\n",
-    "if not os.path.isdir(REPO):\n",
-    "    subprocess.run(\n",
-    "        [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n",
-    "        cwd=\"/content\", check=True,\n",
-    "    )\n",
-    "    print(\"✅ Repo cloned\")\n",
-    "else:\n",
-    "    print(\"Repo already present — pulling latest\")\n",
-    "    subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n",
-    "\n",
-    "# Later cells open configs/... by relative path, so the repo must be the cwd.\n",
-    "os.chdir(REPO)\n",
-    "# Absolute entry (not \".\") keeps project imports working if the cwd changes later;\n",
-    "# the membership check keeps sys.path clean when this cell is re-run.\n",
-    "if REPO not in sys.path:\n",
-    "    sys.path.insert(0, REPO)\n",
-    "\n",
-    "print(f\"OpenEnv version  : {importlib.metadata.version('openenv')}\")\n",
-    "\n",
-    "# Output locations written by the training, plotting and upload cells.\n",
-    "os.makedirs(\"/content/demo/assets\", exist_ok=True)\n",
-    "os.makedirs(\"/content/data\",         exist_ok=True)\n",
-    "os.makedirs(\"/content/checkpoints\",  exist_ok=True)\n",
-    "os.makedirs(\"/content/logs\",         exist_ok=True)\n",
-    "\n",
-    "print(f\"Working directory: {os.getcwd()}\")\n",
-    "print(\"✅ Setup complete\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 2 — Set secrets & verify\n",
-    "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets.  \n",
-    "**Both must show ✅ before continuing.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import os\n",
-    "from google.colab import userdata\n",
-    "\n",
-    "\n",
-    "def _require_secret(name: str, value_hint: str) -> str:\n",
-    "    \"\"\"Return the Colab secret `name`, or raise RuntimeError with setup steps.\n",
-    "\n",
-    "    userdata.get() raises instead of returning None when the secret is missing\n",
-    "    or notebook access is off, so those errors are mapped to the same message\n",
-    "    as an empty value.\n",
-    "    \"\"\"\n",
-    "    cause = None\n",
-    "    try:\n",
-    "        value = userdata.get(name)\n",
-    "    except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as exc:\n",
-    "        value, cause = None, exc\n",
-    "    if not value:\n",
-    "        raise RuntimeError(\n",
-    "            f\"{name} not set.\\n\"\n",
-    "            f\"Go to the key icon (left sidebar) → Add secret → Name: {name}, \"\n",
-    "            f\"Value: {value_hint} → enable notebook access.\"\n",
-    "        ) from cause\n",
-    "    return value\n",
-    "\n",
-    "\n",
-    "HF_TOKEN = _require_secret(\"HF_TOKEN\", \"your write token from hf.co/settings/tokens\")\n",
-    "OPENAI_API_KEY = _require_secret(\"OPENAI_API_KEY\", \"sk-...\")\n",
-    "\n",
-    "# Inject into environment so all modules pick them up\n",
-    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
-    "\n",
-    "# Show only the last 4 characters: cell output is saved with the notebook.\n",
-    "print(f\"✅ HF_TOKEN      : ...{HF_TOKEN[-4:]}\")\n",
-    "print(f\"✅ OPENAI_API_KEY: ...{OPENAI_API_KEY[-4:]}\")\n",
-    "print(\"Both secrets loaded — proceeding.\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 3 — Patch env + smoke test\n",
-    "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import os as _os\n",
-    "import numpy as np\n",
-    "from env.spindleflow_env import SpindleFlowEnv\n",
-    "\n",
-    "# Install the simulate_specialists kwarg once per kernel. The class-level flag\n",
-    "# makes re-running this cell a no-op instead of wrapping the wrappers again.\n",
-    "if getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
-    "    print(\"Already patched — skipping\")\n",
-    "else:\n",
-    "    _orig_init = SpindleFlowEnv.__init__\n",
-    "    _orig_call = SpindleFlowEnv._call_specialist\n",
-    "\n",
-    "    def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
-    "        \"\"\"Run the original constructor, then record the simulation flag.\"\"\"\n",
-    "        _orig_init(self, *args, **kwargs)\n",
-    "        self.simulate_specialists = simulate_specialists\n",
-    "\n",
-    "    def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
-    "        \"\"\"Delegate to the original call, hiding OPENAI_API_KEY while simulating.\"\"\"\n",
-    "        if not getattr(self, \"simulate_specialists\", False):\n",
-    "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
-    "        # Presumably the env falls back to local simulation when no key is set — TODO confirm.\n",
-    "        _key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
-    "        try:\n",
-    "            return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
-    "        finally:\n",
-    "            # Put the key back so code outside this call still sees it.\n",
-    "            if _key:\n",
-    "                _os.environ[\"OPENAI_API_KEY\"] = _key\n",
-    "\n",
-    "    SpindleFlowEnv.__init__ = _new_init\n",
-    "    SpindleFlowEnv._call_specialist = _new_call\n",
-    "    SpindleFlowEnv._simulate_patched = True\n",
-    "    print(\"✅ SpindleFlowEnv patched\")\n",
-    "\n",
-    "env = SpindleFlowEnv(\n",
-    "    config_path=\"configs/training_config.yaml\",\n",
-    "    catalog_path=\"configs/specialist_catalog.yaml\",\n",
-    "    use_real_spindleflow=False,\n",
-    "    phase=1,\n",
-    "    simulate_specialists=True,\n",
-    ")\n",
-    "obs, info = env.reset()\n",
-    "print(f\"Observation shape : {obs.shape}\")\n",
-    "print(f\"Task              : {info['task'][:80]}\")\n",
-    "\n",
-    "action = env.action_space.sample()\n",
-    "obs2, reward, terminated, truncated, info2 = env.step(action)\n",
-    "print(f\"Step reward       : {reward:.4f}\")\n",
-    "print(f\"Action name       : {info2['action_name']}\")\n",
-    "print(f\"Reward components : {info2['reward_components']}\")\n",
-    "env.close()\n",
-    "print(\"✅ Environment OK\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 4 — HuggingFace TRL check\n",
-    "Confirms TRL is importable (hackathon requirement)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import trl, torch\n",
-    "\n",
-    "print(f\"TRL version   : {trl.__version__}\")\n",
-    "print(f\"Torch version : {torch.__version__}\")\n",
-    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-    "if torch.cuda.is_available():\n",
-    "    print(f\"GPU           : {torch.cuda.get_device_name(0)}\")\n",
-    "\n",
-    "for _name in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\"):\n",
-    "    _cls = getattr(trl, _name, None)\n",
-    "    if _cls is not None:\n",
-    "        print(f\"TRL config class: {_name} ✅\")\n",
-    "        break\n",
-    "else:\n",
-    "    print(\"TRL imported ✅ (config uses TrainingArguments in this version)\")\n",
-    "\n",
-    "print(\"✅ TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 5 — RecurrentPPO training\n",
-    "\n",
-    "**What's happening:**\n",
-    "- Per-step specialist calls: local simulation (fast, no API cost)\n",
-    "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
-    "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
-    "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
-    "\n",
-    "**Expected runtime: 20–30 min on T4 GPU**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import time, yaml\n",
-    "import torch\n",
-    "import numpy as np\n",
-    "from sb3_contrib import RecurrentPPO\n",
-    "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
-    "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
-    "from policy.lstm_policy import build_policy_kwargs\n",
-    "from training.curriculum import CurriculumManager\n",
-    "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
-    "\n",
-    "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
-    "\n",
-    "def _tlog(msg: str):\n",
-    "    \"\"\"Print a timestamped message and append the same line to the training log file.\"\"\"\n",
-    "    stamped = \"[{}] {}\".format(time.strftime(\"%H:%M:%S\"), msg)\n",
-    "    print(stamped, flush=True)\n",
-    "    # Reopened in append mode on every call, so each line is written out as soon as it is logged.\n",
-    "    with open(_LOG_FILE, mode=\"a\", encoding=\"utf-8\") as log_file:\n",
-    "        log_file.write(stamped + \"\\n\")\n",
-    "\n",
-    "with open(\"configs/training_config.yaml\") as f:\n",
-    "    _cfg = yaml.safe_load(f)\n",
-    "\n",
-    "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
-    "\n",
-    "TOTAL_TIMESTEPS = 100_000   # ~10k episodes, ~20-25 min on T4\n",
-    "\n",
-    "\n",
-    "class RewardLogger(BaseCallback):\n",
-    "    \"\"\"Record un-normalized episode returns and report each one to the curriculum.\n",
-    "\n",
-    "    The training env is wrapped in VecNormalize(norm_reward=True), so\n",
-    "    ``self.locals[\"rewards\"]`` holds rescaled values whose scale drifts during\n",
-    "    training. Raw rewards are read from the wrapper instead, and partial\n",
-    "    returns are kept per env index so several envs never mix.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    def __init__(self, curriculum):\n",
-    "        super().__init__()\n",
-    "        self.episode_rewards = []   # raw return of each finished episode, in completion order\n",
-    "        self._running = {}          # env index -> return accumulated in its current episode\n",
-    "        self._curriculum = curriculum\n",
-    "\n",
-    "    def _step_rewards(self):\n",
-    "        \"\"\"Return the latest step's rewards, undoing VecNormalize scaling if present.\"\"\"\n",
-    "        vec_norm = self.model.get_vec_normalize_env()\n",
-    "        if vec_norm is not None:\n",
-    "            return vec_norm.get_original_reward()\n",
-    "        return self.locals.get(\"rewards\", [])\n",
-    "\n",
-    "    def _on_step(self):\n",
-    "        \"\"\"Accumulate rewards; when an episode ends, record it and notify the curriculum.\n",
-    "\n",
-    "        Always returns True, so this callback never stops training.\n",
-    "        \"\"\"\n",
-    "        dones = self.locals.get(\"dones\", [])\n",
-    "        for idx, (r, d) in enumerate(zip(self._step_rewards(), dones)):\n",
-    "            total = self._running.get(idx, 0.0) + float(r)\n",
-    "            if not d:\n",
-    "                self._running[idx] = total\n",
-    "                continue\n",
-    "            self._running[idx] = 0.0\n",
-    "            self.episode_rewards.append(total)\n",
-    "            advanced = self._curriculum.on_episode_end(total)\n",
-    "            n = len(self.episode_rewards)\n",
-    "            # Log on every curriculum advance and otherwise every 50th episode.\n",
-    "            if advanced or n % 50 == 0:\n",
-    "                _tlog(\n",
-    "                    f\"Ep {n:5d} | reward {total:+.3f} | \"\n",
-    "                    f\"{self._curriculum.progress_str()}\"\n",
-    "                )\n",
-    "        return True\n",
-    "\n",
-    "\n",
-    "def make_env():\n",
-    "    \"\"\"Build one phase-1 SpindleFlowEnv with simulated specialists (factory for DummyVecEnv).\"\"\"\n",
-    "    env_kwargs = dict(\n",
-    "        config_path=\"configs/training_config.yaml\",\n",
-    "        catalog_path=\"configs/specialist_catalog.yaml\",\n",
-    "        use_real_spindleflow=False,\n",
-    "        phase=1,\n",
-    "        simulate_specialists=True,\n",
-    "    )\n",
-    "    return SpindleFlowEnv(**env_kwargs)\n",
-    "\n",
-    "\n",
-    "vec_env = DummyVecEnv([make_env])\n",
-    "vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)\n",
-    "\n",
-    "_ppo  = _cfg.get(\"ppo\",  {})\n",
-    "_lstm = _cfg.get(\"lstm\", {})\n",
-    "\n",
-    "model = RecurrentPPO(\n",
-    "    policy=\"MlpLstmPolicy\",\n",
-    "    env=vec_env,\n",
-    "    learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
-    "    n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
-    "    batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
-    "    n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
-    "    gamma=float(_ppo.get(\"gamma\", 0.99)),\n",
-    "    gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
-    "    clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
-    "    ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
-    "    vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
-    "    max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
-    "    policy_kwargs=build_policy_kwargs(\n",
-    "        hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
-    "    ),\n",
-    "    verbose=0,\n",
-    "    seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
-    "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
-    ")\n",
-    "\n",
-    "_tlog(f\"Device          : {model.device}\")\n",
-    "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
-    "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}\")\n",
-    "_tlog(\"Training started...\")\n",
-    "\n",
-    "reward_logger = RewardLogger(curriculum=curriculum)\n",
-    "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
-    "improvement_cb = SpecialistImprovementCallback(\n",
-    "    improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
-    "        \"improve_every_n_episodes\", 100\n",
-    "    ),\n",
-    "    verbose=1,\n",
-    ")\n",
-    "\n",
-    "_t0 = time.time()\n",
-    "model.learn(\n",
-    "    total_timesteps=TOTAL_TIMESTEPS,\n",
-    "    callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
-    ")\n",
-    "_elapsed = time.time() - _t0\n",
-    "\n",
-    "model.save(\"/content/spindleflow_colab_model\")\n",
-    "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
-    "\n",
-    "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
-    "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
-    "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
-    "print(\"\\n✅ Model saved to /content/spindleflow_colab_model.zip\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 6 — Reward curve\n",
-    "Generates a publication-quality plot and saves JSON for the HF Space demo."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import json\n",
-    "import numpy as np\n",
-    "import matplotlib\n",
-    "# NOTE: no matplotlib.use(\"Agg\") here — forcing the non-interactive Agg backend\n",
-    "# makes plt.show() display nothing in the notebook; fig.savefig() works with any backend.\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "ep_rewards = reward_logger.episode_rewards\n",
-    "if not ep_rewards:\n",
-    "    raise RuntimeError(\"No episodes completed — check Cell 5 output for errors.\")\n",
-    "\n",
-    "n_ep     = len(ep_rewards)\n",
-    "episodes = list(range(n_ep))\n",
-    "window   = max(30, n_ep // 20)  # adaptive: ~5% of total\n",
-    "\n",
-    "# Trailing mean over at most `window` episodes. The slice starts at i - window + 1\n",
-    "# so the span matches the \"{window}-ep mean\" legend label (not window + 1 episodes).\n",
-    "smoothed = [\n",
-    "    float(np.mean(ep_rewards[max(0, i - window + 1):i + 1]))\n",
-    "    for i in range(n_ep)\n",
-    "]\n",
-    "\n",
-    "# Baselines: first 50 episodes vs. last 200, each capped at the number available.\n",
-    "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
-    "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
-    "improvement = final_mean - early_mean\n",
-    "\n",
-    "# ── Save JSON ──────────────────────────────────────────────────\n",
-    "step      = max(1, n_ep // 300)\n",
-    "json_data = {\n",
-    "    \"episodes\":     episodes[::step],\n",
-    "    \"mean_rewards\": smoothed[::step],\n",
-    "}\n",
-    "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
-    "    json.dump(json_data, f)\n",
-    "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
-    "\n",
-    "# ── Plot ───────────────────────────────────────────────────────\n",
-    "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
-    "fig.patch.set_facecolor(\"#0d1117\")\n",
-    "ax.set_facecolor(\"#161b22\")\n",
-    "\n",
-    "plot_every = max(1, n_ep // 800)\n",
-    "ax.scatter(\n",
-    "    episodes[::plot_every], ep_rewards[::plot_every],\n",
-    "    s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
-    ")\n",
-    "ax.plot(\n",
-    "    episodes[::plot_every], smoothed[::plot_every],\n",
-    "    linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
-    "    label=f\"Smoothed ({window}-ep mean)\",\n",
-    ")\n",
-    "ax.axhline(\n",
-    "    y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
-    "    label=f\"Early baseline  {early_mean:+.3f}\",\n",
-    ")\n",
-    "ax.axhline(\n",
-    "    y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
-    "    label=f\"Final mean  {final_mean:+.3f}\",\n",
-    ")\n",
-    "\n",
-    "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
-    "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
-    "ax.set_title(\n",
-    "    \"SpindleFlow RL — Delegation Policy Learning Curve\\n\"\n",
-    "    f\"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes\",\n",
-    "    color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
-    ")\n",
-    "ax.tick_params(colors=\"#8b949e\")\n",
-    "for spine in ax.spines.values():\n",
-    "    spine.set_edgecolor(\"#30363d\")\n",
-    "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
-    "ax.legend(\n",
-    "    fontsize=10, framealpha=0.85,\n",
-    "    facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
-    ")\n",
-    "\n",
-    "sign = \"▲\" if improvement >= 0 else \"▼\"\n",
-    "ax.annotate(\n",
-    "    f\"  {sign} {abs(improvement):.3f} reward improvement\",\n",
-    "    xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
-    "    color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
-    ")\n",
-    "\n",
-    "fig.tight_layout()\n",
-    "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
-    "            facecolor=fig.get_facecolor())\n",
-    "plt.show()\n",
-    "\n",
-    "print(f\"\\n{'='*50}\")\n",
-    "print(f\"Episodes completed : {n_ep:,}\")\n",
-    "print(f\"Early baseline     : {early_mean:+.4f}\")\n",
-    "print(f\"Final mean         : {final_mean:+.4f}\")\n",
-    "print(f\"Improvement        : {improvement:+.4f}\")\n",
-    "print(f\"{'='*50}\")\n",
-    "print(\"✅ Reward curve saved to /content/reward_curve.png\")\n",
-    "\n",
-    "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 7 — Learning features audit\n",
-    "Confirms each self-learning feature fired at least once during training."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import os, json\n",
-    "from pathlib import Path\n",
-    "\n",
-    "print(\"=\"*55)\n",
-    "print(\"LEARNING FEATURES AUDIT\")\n",
-    "print(\"=\"*55)\n",
-    "\n",
-    "# Feature 5 — Curriculum\n",
-    "print(f\"\\nFeature 5 — Curriculum (performance-gated)\")\n",
-    "print(f\"  Final phase        : {curriculum.current_phase}/3\")\n",
-    "print(f\"  Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
-    "print(f\"  {curriculum.progress_str()}\")\n",
-    "\n",
-    "# Feature 2 — Specialist memory\n",
-    "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
-    "    \"memory_path\", \"data/specialist_memory.json\"\n",
-    "))\n",
-    "print(f\"\\nFeature 2 — Specialist memory ({mem_path})\")\n",
-    "if mem_path.exists():\n",
-    "    data = json.loads(mem_path.read_text())\n",
-    "    total_entries = sum(len(v) for v in data.values())\n",
-    "    print(f\"  Specialists with memory : {len(data)}\")\n",
-    "    print(f\"  Total entries recorded  : {total_entries}\")\n",
-    "    # Preview the first three specialists only, to keep the audit output short.\n",
-    "    for sid, entries in list(data.items())[:3]:\n",
-    "        if not entries:\n",
-    "            # An empty list would otherwise divide by zero in the average below.\n",
-    "            print(f\"    {sid}: 0 entries\")\n",
-    "            continue\n",
-    "        avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
-    "        print(f\"    {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
-    "else:\n",
-    "    print(\"  No memory file yet (finetuner may not have fired — normal below 100 episodes)\")\n",
-    "\n",
-    "# Feature 3 — Spawn memory\n",
-    "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
-    "    \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
-    "))\n",
-    "print(f\"\\nFeature 3 — Spawn memory ({spawn_path})\")\n",
-    "if spawn_path.exists():\n",
-    "    lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]\n",
-    "    print(f\"  Spawn records written: {len(lines)}\")\n",
-    "    for line in lines[:3]:\n",
-    "        rec = json.loads(line)\n",
-    "        print(f\"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
-    "              f\"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}\")\n",
-    "else:\n",
-    "    print(\"  No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
-    "\n",
-    "# Feature 4 — Resolution bandit\n",
-    "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
-    "    \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
-    "))\n",
-    "print(f\"\\nFeature 4 — Resolution bandit ({res_path})\")\n",
-    "if res_path.exists():\n",
-    "    lines = [l for l in res_path.read_text().splitlines() if l.strip()]\n",
-    "    print(f\"  Outcome records written: {len(lines)}\")\n",
-    "    stats = {}\n",
-    "    for line in lines:\n",
-    "        rec = json.loads(line)\n",
-    "        key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
-    "        stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
-    "    for k, deltas in stats.items():\n",
-    "        print(f\"    {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
-    "else:\n",
-    "    print(\"  No resolution memory yet (requires detected conflicts)\")\n",
-    "\n",
-    "print(\"\\n\" + \"=\"*55)\n",
-    "print(\"✅ Audit complete\")\n",
-    "print(\"=\"*55)"
-   ],
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cell 8 — Push to HuggingFace Hub\n",
-    "\n",
-    "Uploads model checkpoint, reward curve, training log, and README to `garvitsachdeva/spindleflow-rl`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "metadata": {},
-   "source": [
-    "import os, json\n",
-    "import numpy as np\n",
-    "from huggingface_hub import HfApi, CommitOperationAdd\n",
-    "\n",
-    "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
-    "api = HfApi(token=HF_TOKEN)\n",
-    "\n",
-    "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
-    "# Pass the fully-qualified id: a bare repo name is created under the token owner's\n",
-    "# namespace, which may differ from the repo that create_commit targets below.\n",
-    "api.create_repo(repo_id=HF_REPO, repo_type=\"model\", exist_ok=True)\n",
-    "\n",
-    "# Episode returns from training; only the count is used in the README.\n",
-    "ep   = reward_logger.episode_rewards\n",
-    "\n",
-    "readme_text = f\"\"\"---\n",
-    "license: mit\n",
-    "tags:\n",
-    "  - reinforcement-learning\n",
-    "  - stable-baselines3\n",
-    "  - sb3-contrib\n",
-    "  - gymnasium\n",
-    "  - multi-agent\n",
-    "  - openenv\n",
-    "library_name: stable-baselines3\n",
-    "---\n",
-    "\n",
-    "# SpindleFlow RL — Delegation Policy\n",
-    "\n",
-    "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).  \n",
-    "Trained on Google Colab T4 GPU.\n",
-    "\n",
-    "## Training summary\n",
-    "| Metric | Value |\n",
-    "|---|---|\n",
-    "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
-    "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
-    "| Episodes completed | {len(ep):,} |\n",
-    "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
-    "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
-    "| Improvement | {final_mean - early_mean:+.4f} |\n",
-    "| Training time | {_elapsed/60:.1f} min |\n",
-    "| Device | T4 GPU |\n",
-    "\n",
-    "![Reward Curve](reward_curve.png)\n",
-    "\n",
-    "## Load\n",
-    "```python\n",
-    "from sb3_contrib import RecurrentPPO\n",
-    "from huggingface_hub import hf_hub_download\n",
-    "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
-    "```\n",
-    "\"\"\"\n",
-    "\n",
-    "readme_path = \"/content/README_model.md\"\n",
-    "with open(readme_path, \"w\") as f:\n",
-    "    f.write(readme_text)\n",
-    "\n",
-    "candidates = [\n",
-    "    (\"/content/spindleflow_colab_model.zip\",    \"spindleflow_model.zip\"),\n",
-    "    (\"/content/vec_normalize_colab.pkl\",         \"vec_normalize.pkl\"),\n",
-    "    (\"/content/reward_curve.png\",                \"reward_curve.png\"),\n",
-    "    (\"/content/demo/assets/reward_curve.json\",   \"reward_curve.json\"),\n",
-    "    (\"/content/logs/training_log.txt\",           \"training_log.txt\"),\n",
-    "    (readme_path,                                \"README.md\"),\n",
-    "]\n",
-    "\n",
-    "ops = [\n",
-    "    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
-    "    for src, dst in candidates\n",
-    "    if os.path.exists(src)\n",
-    "]\n",
-    "\n",
-    "api.create_commit(\n",
-    "    repo_id=HF_REPO,\n",
-    "    repo_type=\"model\",\n",
-    "    operations=ops,\n",
-    "    commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
-    "    token=HF_TOKEN,\n",
-    ")\n",
-    "\n",
-    "_tlog(f\"Uploaded {len(ops)} files:\")\n",
-    "for src, dst in candidates:\n",
-    "    if os.path.exists(src):\n",
-    "        _tlog(f\"  ✓ {dst}\")\n",
-    "\n",
-    "_tlog(f\"Model       : https://huggingface.co/{HF_REPO}\")\n",
-    "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
-    "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
-    "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
-    "print(\"\\n✅ All done!\")"
-   ],
-   "outputs": [],
-   "execution_count": null
-  }
- ]
-}