garvitsachdeva Claude Sonnet 4.6 commited on
Commit
c77f83c
Β·
1 Parent(s): 13dd91f

Fix Cell 1: skip audioop-lts on Python < 3.13 (Colab uses 3.12)

Browse files
Files changed (1) hide show
  1. colab/SpindleFlow_RL_Training.ipynb +634 -670
colab/SpindleFlow_RL_Training.ipynb CHANGED
@@ -1,672 +1,636 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "gpuType": "T4",
8
- "name": "SpindleFlow_RL_Training.ipynb"
9
- },
10
- "kernelspec": {
11
- "name": "python3",
12
- "display_name": "Python 3"
13
- },
14
- "language_info": {
15
- "name": "python"
16
- },
17
- "accelerator": "GPU"
18
  },
19
- "cells": [
20
- {
21
- "cell_type": "markdown",
22
- "metadata": {},
23
- "source": [
24
- "# SpindleFlow RL β€” Training Notebook\n",
25
- "\n",
26
- "**Hardware**: Runtime β†’ Change runtime type β†’ **T4 GPU**\n",
27
- "\n",
28
- "**Secrets** (key icon in left sidebar β†’ Manage secrets):\n",
29
- "\n",
30
- "| Name | Required | Notes |\n",
31
- "|---|---|---|\n",
32
- "| `HF_TOKEN` | βœ… Yes | HuggingFace write token β€” hf.co/settings/tokens β†’ New token (write) |\n",
33
- "| `OPENAI_API_KEY` | βœ… Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
34
- "\n",
35
- "Run cells **top to bottom, one at a time**. Do NOT skip cells."
36
- ]
37
- },
38
- {
39
- "cell_type": "markdown",
40
- "metadata": {},
41
- "source": [
42
- "## Cell 1 β€” Install dependencies & clone repo\n",
43
- "Run once. After it finishes, **do NOT restart the runtime** β€” continue to Cell 2."
44
- ]
45
- },
46
- {
47
- "cell_type": "code",
48
- "metadata": {},
49
- "source": [
50
- "import subprocess, os, sys\n",
51
- "\n",
52
- "subprocess.run([\n",
53
- " \"pip\", \"install\", \"-q\",\n",
54
- " \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n",
55
- " \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n",
56
- " \"transformers\", \"datasets\", \"torch\",\n",
57
- " \"matplotlib\", \"audioop-lts\", \"huggingface_hub\",\n",
58
- "], check=True)\n",
59
- "print(\"βœ… Packages installed\")\n",
60
- "\n",
61
- "REPO = \"/content/kuchbhi/spindleflow-rl\"\n",
62
- "if not os.path.isdir(REPO):\n",
63
- " subprocess.run(\n",
64
- " [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n",
65
- " cwd=\"/content\", check=True,\n",
66
- " )\n",
67
- " print(\"βœ… Repo cloned\")\n",
68
- "else:\n",
69
- " print(\"Repo already present β€” pulling latest\")\n",
70
- " subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n",
71
- "\n",
72
- "os.chdir(REPO)\n",
73
- "sys.path.insert(0, \".\")\n",
74
- "\n",
75
- "import importlib.metadata\n",
76
- "print(f\"OpenEnv version : {importlib.metadata.version('openenv')}\")\n",
77
- "\n",
78
- "os.makedirs(\"/content/demo/assets\", exist_ok=True)\n",
79
- "os.makedirs(\"/content/data\", exist_ok=True)\n",
80
- "os.makedirs(\"/content/checkpoints\", exist_ok=True)\n",
81
- "os.makedirs(\"/content/logs\", exist_ok=True)\n",
82
- "\n",
83
- "print(f\"Working directory: {os.getcwd()}\")\n",
84
- "print(\"βœ… Setup complete\")"
85
- ],
86
- "outputs": [],
87
- "execution_count": null
88
- },
89
- {
90
- "cell_type": "markdown",
91
- "metadata": {},
92
- "source": [
93
- "## Cell 2 β€” Set secrets & verify\n",
94
- "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets. \n",
95
- "**Both must show βœ… before continuing.**"
96
- ]
97
- },
98
- {
99
- "cell_type": "code",
100
- "metadata": {},
101
- "source": [
102
- "import os\n",
103
- "from google.colab import userdata\n",
104
- "\n",
105
- "HF_TOKEN = userdata.get(\"HF_TOKEN\")\n",
106
- "OPENAI_API_KEY = userdata.get(\"OPENAI_API_KEY\")\n",
107
- "\n",
108
- "if not HF_TOKEN:\n",
109
- " raise RuntimeError(\n",
110
- " \"HF_TOKEN not set.\\n\"\n",
111
- " \"Go to the key icon (left sidebar) β†’ Add secret β†’ Name: HF_TOKEN, \"\n",
112
- " \"Value: your write token from hf.co/settings/tokens β†’ enable notebook access.\"\n",
113
- " )\n",
114
- "\n",
115
- "if not OPENAI_API_KEY:\n",
116
- " raise RuntimeError(\n",
117
- " \"OPENAI_API_KEY not set.\\n\"\n",
118
- " \"Go to the key icon (left sidebar) β†’ Add secret β†’ Name: OPENAI_API_KEY, \"\n",
119
- " \"Value: sk-... β†’ enable notebook access.\"\n",
120
- " )\n",
121
- "\n",
122
- "# Inject into environment so all modules pick them up\n",
123
- "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
124
- "\n",
125
- "print(f\"βœ… HF_TOKEN : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}\")\n",
126
- "print(f\"βœ… OPENAI_API_KEY: {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}\")\n",
127
- "print(\"Both secrets loaded β€” proceeding.\")"
128
- ],
129
- "outputs": [],
130
- "execution_count": null
131
- },
132
- {
133
- "cell_type": "markdown",
134
- "metadata": {},
135
- "source": [
136
- "## Cell 3 β€” Patch env + smoke test\n",
137
- "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
138
- ]
139
- },
140
- {
141
- "cell_type": "code",
142
- "metadata": {},
143
- "source": [
144
- "import os as _os\n",
145
- "import numpy as np\n",
146
- "from env.spindleflow_env import SpindleFlowEnv\n",
147
- "\n",
148
- "# Monkey-patch: add simulate_specialists kwarg (fast per-step simulation)\n",
149
- "if not getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
150
- " _orig_init = SpindleFlowEnv.__init__\n",
151
- "\n",
152
- " def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
153
- " _orig_init(self, *args, **kwargs)\n",
154
- " self.simulate_specialists = simulate_specialists\n",
155
- "\n",
156
- " SpindleFlowEnv.__init__ = _new_init\n",
157
- "\n",
158
- " _orig_call = SpindleFlowEnv._call_specialist\n",
159
- "\n",
160
- " def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
161
- " if getattr(self, \"simulate_specialists\", False):\n",
162
- " _key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
163
- " try:\n",
164
- " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
165
- " finally:\n",
166
- " if _key:\n",
167
- " _os.environ[\"OPENAI_API_KEY\"] = _key\n",
168
- " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
169
- "\n",
170
- " SpindleFlowEnv._call_specialist = _new_call\n",
171
- " SpindleFlowEnv._simulate_patched = True\n",
172
- " print(\"βœ… SpindleFlowEnv patched\")\n",
173
- "else:\n",
174
- " print(\"Already patched β€” skipping\")\n",
175
- "\n",
176
- "env = SpindleFlowEnv(\n",
177
- " config_path=\"configs/training_config.yaml\",\n",
178
- " catalog_path=\"configs/specialist_catalog.yaml\",\n",
179
- " use_real_spindleflow=False,\n",
180
- " phase=1,\n",
181
- " simulate_specialists=True,\n",
182
- ")\n",
183
- "obs, info = env.reset()\n",
184
- "print(f\"Observation shape : {obs.shape}\")\n",
185
- "print(f\"Task : {info['task'][:80]}\")\n",
186
- "\n",
187
- "action = env.action_space.sample()\n",
188
- "obs2, reward, terminated, truncated, info2 = env.step(action)\n",
189
- "print(f\"Step reward : {reward:.4f}\")\n",
190
- "print(f\"Action name : {info2['action_name']}\")\n",
191
- "print(f\"Reward components : {info2['reward_components']}\")\n",
192
- "env.close()\n",
193
- "print(\"βœ… Environment OK\")"
194
- ],
195
- "outputs": [],
196
- "execution_count": null
197
- },
198
- {
199
- "cell_type": "markdown",
200
- "metadata": {},
201
- "source": [
202
- "## Cell 4 β€” HuggingFace TRL check\n",
203
- "Confirms TRL is importable (hackathon requirement)."
204
- ]
205
- },
206
- {
207
- "cell_type": "code",
208
- "metadata": {},
209
- "source": [
210
- "import trl, torch\n",
211
- "\n",
212
- "print(f\"TRL version : {trl.__version__}\")\n",
213
- "print(f\"Torch version : {torch.__version__}\")\n",
214
- "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
215
- "if torch.cuda.is_available():\n",
216
- " print(f\"GPU : {torch.cuda.get_device_name(0)}\")\n",
217
- "\n",
218
- "for _name in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\"):\n",
219
- " _cls = getattr(trl, _name, None)\n",
220
- " if _cls is not None:\n",
221
- " print(f\"TRL config class: {_name} βœ…\")\n",
222
- " break\n",
223
- "else:\n",
224
- " print(\"TRL imported βœ… (config uses TrainingArguments in this version)\")\n",
225
- "\n",
226
- "print(\"βœ… TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
227
- ],
228
- "outputs": [],
229
- "execution_count": null
230
- },
231
- {
232
- "cell_type": "markdown",
233
- "metadata": {},
234
- "source": [
235
- "## Cell 5 β€” RecurrentPPO training\n",
236
- "\n",
237
- "**What's happening:**\n",
238
- "- Per-step specialist calls: local simulation (fast, no API cost)\n",
239
- "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
240
- "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
241
- "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
242
- "\n",
243
- "**Expected runtime: 20–30 min on T4 GPU**"
244
- ]
245
- },
246
- {
247
- "cell_type": "code",
248
- "metadata": {},
249
- "source": [
250
- "import time, yaml\n",
251
- "import torch\n",
252
- "import numpy as np\n",
253
- "from sb3_contrib import RecurrentPPO\n",
254
- "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
255
- "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
256
- "from policy.lstm_policy import build_policy_kwargs\n",
257
- "from training.curriculum import CurriculumManager\n",
258
- "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
259
- "\n",
260
- "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
261
- "\n",
262
- "def _tlog(msg: str):\n",
263
- " ts = time.strftime(\"%H:%M:%S\")\n",
264
- " line = f\"[{ts}] {msg}\"\n",
265
- " print(line, flush=True)\n",
266
- " with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as _f:\n",
267
- " _f.write(line + \"\\n\")\n",
268
- "\n",
269
- "with open(\"configs/training_config.yaml\") as f:\n",
270
- " _cfg = yaml.safe_load(f)\n",
271
- "\n",
272
- "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
273
- "\n",
274
- "TOTAL_TIMESTEPS = 100_000 # ~10k episodes, ~20-25 min on T4\n",
275
- "\n",
276
- "\n",
277
- "class RewardLogger(BaseCallback):\n",
278
- " def __init__(self, curriculum):\n",
279
- " super().__init__()\n",
280
- " self.episode_rewards = []\n",
281
- " self._running = 0.0\n",
282
- " self._curriculum = curriculum\n",
283
- "\n",
284
- " def _on_step(self):\n",
285
- " for r, d in zip(\n",
286
- " self.locals.get(\"rewards\", []),\n",
287
- " self.locals.get(\"dones\", []),\n",
288
- " ):\n",
289
- " self._running += float(r)\n",
290
- " if d:\n",
291
- " ep = self._running\n",
292
- " self.episode_rewards.append(ep)\n",
293
- " self._running = 0.0\n",
294
- " advanced = self._curriculum.on_episode_end(ep)\n",
295
- " n = len(self.episode_rewards)\n",
296
- " if advanced or n % 50 == 0:\n",
297
- " _tlog(\n",
298
- " f\"Ep {n:5d} | reward {ep:+.3f} | \"\n",
299
- " f\"{self._curriculum.progress_str()}\"\n",
300
- " )\n",
301
- " return True\n",
302
- "\n",
303
- "\n",
304
- "def make_env():\n",
305
- " return SpindleFlowEnv(\n",
306
- " config_path=\"configs/training_config.yaml\",\n",
307
- " catalog_path=\"configs/specialist_catalog.yaml\",\n",
308
- " use_real_spindleflow=False,\n",
309
- " phase=1,\n",
310
- " simulate_specialists=True,\n",
311
- " )\n",
312
- "\n",
313
- "\n",
314
- "vec_env = DummyVecEnv([make_env])\n",
315
- "vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)\n",
316
- "\n",
317
- "_ppo = _cfg.get(\"ppo\", {})\n",
318
- "_lstm = _cfg.get(\"lstm\", {})\n",
319
- "\n",
320
- "model = RecurrentPPO(\n",
321
- " policy=\"MlpLstmPolicy\",\n",
322
- " env=vec_env,\n",
323
- " learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
324
- " n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
325
- " batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
326
- " n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
327
- " gamma=float(_ppo.get(\"gamma\", 0.99)),\n",
328
- " gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
329
- " clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
330
- " ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
331
- " vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
332
- " max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
333
- " policy_kwargs=build_policy_kwargs(\n",
334
- " hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
335
- " ),\n",
336
- " verbose=0,\n",
337
- " seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
338
- " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
339
- ")\n",
340
- "\n",
341
- "_tlog(f\"Device : {model.device}\")\n",
342
- "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
343
- "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} β€” {curriculum.progress_str()}\")\n",
344
- "_tlog(\"Training started...\")\n",
345
- "\n",
346
- "reward_logger = RewardLogger(curriculum=curriculum)\n",
347
- "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
348
- "improvement_cb = SpecialistImprovementCallback(\n",
349
- " improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
350
- " \"improve_every_n_episodes\", 100\n",
351
- " ),\n",
352
- " verbose=1,\n",
353
- ")\n",
354
- "\n",
355
- "_t0 = time.time()\n",
356
- "model.learn(\n",
357
- " total_timesteps=TOTAL_TIMESTEPS,\n",
358
- " callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
359
- ")\n",
360
- "_elapsed = time.time() - _t0\n",
361
- "\n",
362
- "model.save(\"/content/spindleflow_colab_model\")\n",
363
- "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
364
- "\n",
365
- "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
366
- "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
367
- "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
368
- "print(\"\\nβœ… Model saved to /content/spindleflow_colab_model.zip\")"
369
- ],
370
- "outputs": [],
371
- "execution_count": null
372
- },
373
- {
374
- "cell_type": "markdown",
375
- "metadata": {},
376
- "source": [
377
- "## Cell 6 β€” Reward curve\n",
378
- "Generates publication-quality plot and saves JSON for the HF Space demo."
379
- ]
380
- },
381
- {
382
- "cell_type": "code",
383
- "metadata": {},
384
- "source": [
385
- "import json\n",
386
- "import numpy as np\n",
387
- "import matplotlib\n",
388
- "matplotlib.use(\"Agg\")\n",
389
- "import matplotlib.pyplot as plt\n",
390
- "\n",
391
- "ep_rewards = reward_logger.episode_rewards\n",
392
- "if not ep_rewards:\n",
393
- " raise RuntimeError(\"No episodes completed β€” check Cell 5 output for errors.\")\n",
394
- "\n",
395
- "n_ep = len(ep_rewards)\n",
396
- "episodes = list(range(n_ep))\n",
397
- "window = max(30, n_ep // 20) # adaptive: ~5% of total\n",
398
- "\n",
399
- "smoothed = [\n",
400
- " float(np.mean(ep_rewards[max(0, i - window):i + 1]))\n",
401
- " for i in range(n_ep)\n",
402
- "]\n",
403
- "\n",
404
- "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
405
- "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
406
- "improvement = final_mean - early_mean\n",
407
- "\n",
408
- "# ── Save JSON ──────────────────────────────────────────────────\n",
409
- "step = max(1, n_ep // 300)\n",
410
- "json_data = {\n",
411
- " \"episodes\": episodes[::step],\n",
412
- " \"mean_rewards\": smoothed[::step],\n",
413
- "}\n",
414
- "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
415
- " json.dump(json_data, f)\n",
416
- "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
417
- "\n",
418
- "# ── Plot ───────────────────────────────────────────────────────\n",
419
- "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
420
- "fig.patch.set_facecolor(\"#0d1117\")\n",
421
- "ax.set_facecolor(\"#161b22\")\n",
422
- "\n",
423
- "plot_every = max(1, n_ep // 800)\n",
424
- "ax.scatter(\n",
425
- " episodes[::plot_every], ep_rewards[::plot_every],\n",
426
- " s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
427
- ")\n",
428
- "ax.plot(\n",
429
- " episodes[::plot_every], smoothed[::plot_every],\n",
430
- " linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
431
- " label=f\"Smoothed ({window}-ep mean)\",\n",
432
- ")\n",
433
- "ax.axhline(\n",
434
- " y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
435
- " label=f\"Early baseline {early_mean:+.3f}\",\n",
436
- ")\n",
437
- "ax.axhline(\n",
438
- " y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
439
- " label=f\"Final mean {final_mean:+.3f}\",\n",
440
- ")\n",
441
- "\n",
442
- "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
443
- "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
444
- "ax.set_title(\n",
445
- " \"SpindleFlow RL β€” Delegation Policy Learning Curve\\n\"\n",
446
- " f\"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes\",\n",
447
- " color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
448
- ")\n",
449
- "ax.tick_params(colors=\"#8b949e\")\n",
450
- "for spine in ax.spines.values():\n",
451
- " spine.set_edgecolor(\"#30363d\")\n",
452
- "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
453
- "ax.legend(\n",
454
- " fontsize=10, framealpha=0.85,\n",
455
- " facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
456
- ")\n",
457
- "\n",
458
- "sign = \"β–²\" if improvement >= 0 else \"β–Ό\"\n",
459
- "ax.annotate(\n",
460
- " f\" {sign} {abs(improvement):.3f} reward improvement\",\n",
461
- " xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
462
- " color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
463
- ")\n",
464
- "\n",
465
- "fig.tight_layout()\n",
466
- "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
467
- " facecolor=fig.get_facecolor())\n",
468
- "plt.show()\n",
469
- "\n",
470
- "print(f\"\\n{'='*50}\")\n",
471
- "print(f\"Episodes completed : {n_ep:,}\")\n",
472
- "print(f\"Early baseline : {early_mean:+.4f}\")\n",
473
- "print(f\"Final mean : {final_mean:+.4f}\")\n",
474
- "print(f\"Improvement : {improvement:+.4f}\")\n",
475
- "print(f\"{'='*50}\")\n",
476
- "print(\"βœ… Reward curve saved to /content/reward_curve.png\")\n",
477
- "\n",
478
- "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
479
- ],
480
- "outputs": [],
481
- "execution_count": null
482
- },
483
- {
484
- "cell_type": "markdown",
485
- "metadata": {},
486
- "source": [
487
- "## Cell 7 β€” Learning features audit\n",
488
- "Confirms each self-learning feature fired at least once during training."
489
- ]
490
- },
491
- {
492
- "cell_type": "code",
493
- "metadata": {},
494
- "source": [
495
- "import os, json\n",
496
- "from pathlib import Path\n",
497
- "\n",
498
- "print(\"=\"*55)\n",
499
- "print(\"LEARNING FEATURES AUDIT\")\n",
500
- "print(\"=\"*55)\n",
501
- "\n",
502
- "# Feature 5 β€” Curriculum\n",
503
- "print(f\"\\nFeature 5 β€” Curriculum (performance-gated)\")\n",
504
- "print(f\" Final phase : {curriculum.current_phase}/3\")\n",
505
- "print(f\" Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
506
- "print(f\" {curriculum.progress_str()}\")\n",
507
- "\n",
508
- "# Feature 2 β€” Specialist memory\n",
509
- "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
510
- " \"memory_path\", \"data/specialist_memory.json\"\n",
511
- "))\n",
512
- "print(f\"\\nFeature 2 β€” Specialist memory ({mem_path})\")\n",
513
- "if mem_path.exists():\n",
514
- " data = json.loads(mem_path.read_text())\n",
515
- " total_entries = sum(len(v) for v in data.values())\n",
516
- " print(f\" Specialists with memory : {len(data)}\")\n",
517
- " print(f\" Total entries recorded : {total_entries}\")\n",
518
- " for sid, entries in list(data.items())[:3]:\n",
519
- " avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
520
- " print(f\" {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
521
- "else:\n",
522
- " print(\" No memory file yet (finetuner may not have fired β€” normal below 100 episodes)\")\n",
523
- "\n",
524
- "# Feature 3 β€” Spawn memory\n",
525
- "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
526
- " \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
527
- "))\n",
528
- "print(f\"\\nFeature 3 β€” Spawn memory ({spawn_path})\")\n",
529
- "if spawn_path.exists():\n",
530
- " lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]\n",
531
- " print(f\" Spawn records written: {len(lines)}\")\n",
532
- " for line in lines[:3]:\n",
533
- " rec = json.loads(line)\n",
534
- " print(f\" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
535
- " f\"| sim {rec['pre_spawn_sim']:.2f}β†’{rec['post_spawn_sim']:.2f}\")\n",
536
- "else:\n",
537
- " print(\" No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
538
- "\n",
539
- "# Feature 4 β€” Resolution bandit\n",
540
- "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
541
- " \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
542
- "))\n",
543
- "print(f\"\\nFeature 4 β€” Resolution bandit ({res_path})\")\n",
544
- "if res_path.exists():\n",
545
- " lines = [l for l in res_path.read_text().splitlines() if l.strip()]\n",
546
- " print(f\" Outcome records written: {len(lines)}\")\n",
547
- " stats = {}\n",
548
- " for line in lines:\n",
549
- " rec = json.loads(line)\n",
550
- " key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
551
- " stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
552
- " for k, deltas in stats.items():\n",
553
- " print(f\" {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
554
- "else:\n",
555
- " print(\" No resolution memory yet (requires detected conflicts)\")\n",
556
- "\n",
557
- "print(\"\\n\" + \"=\"*55)\n",
558
- "print(\"βœ… Audit complete\")\n",
559
- "print(\"=\"*55)"
560
- ],
561
- "outputs": [],
562
- "execution_count": null
563
- },
564
- {
565
- "cell_type": "markdown",
566
- "metadata": {},
567
- "source": [
568
- "## Cell 8 β€” Push to HuggingFace Hub\n",
569
- "\n",
570
- "Uploads model checkpoint, reward curve, training log, and README to `garvitsachdeva/spindleflow-rl`."
571
- ]
572
- },
573
- {
574
- "cell_type": "code",
575
- "metadata": {},
576
- "source": [
577
- "import os, json\n",
578
- "import numpy as np\n",
579
- "from huggingface_hub import HfApi, CommitOperationAdd\n",
580
- "\n",
581
- "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
582
- "api = HfApi(token=HF_TOKEN)\n",
583
- "\n",
584
- "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
585
- "api.create_repo(repo_id=HF_REPO.split(\"/\")[-1], repo_type=\"model\", exist_ok=True)\n",
586
- "\n",
587
- "ep = reward_logger.episode_rewards\n",
588
- "f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0\n",
589
- "l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0\n",
590
- "\n",
591
- "readme_text = f\"\"\"---\n",
592
- "license: mit\n",
593
- "tags:\n",
594
- " - reinforcement-learning\n",
595
- " - stable-baselines3\n",
596
- " - sb3-contrib\n",
597
- " - gymnasium\n",
598
- " - multi-agent\n",
599
- " - openenv\n",
600
- "library_name: stable-baselines3\n",
601
- "---\n",
602
- "\n",
603
- "# SpindleFlow RL β€” Delegation Policy\n",
604
- "\n",
605
- "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv). \n",
606
- "Trained on Google Colab T4 GPU.\n",
607
- "\n",
608
- "## Training summary\n",
609
- "| Metric | Value |\n",
610
- "|---|---|\n",
611
- "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
612
- "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
613
- "| Episodes completed | {len(ep):,} |\n",
614
- "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
615
- "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
616
- "| Improvement | {final_mean - early_mean:+.4f} |\n",
617
- "| Training time | {_elapsed/60:.1f} min |\n",
618
- "| Device | T4 GPU |\n",
619
- "\n",
620
- "![Reward Curve](reward_curve.png)\n",
621
- "\n",
622
- "## Load\n",
623
- "```python\n",
624
- "from sb3_contrib import RecurrentPPO\n",
625
- "from huggingface_hub import hf_hub_download\n",
626
- "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
627
- "```\n",
628
- "\"\"\"\n",
629
- "\n",
630
- "readme_path = \"/content/README_model.md\"\n",
631
- "with open(readme_path, \"w\") as f:\n",
632
- " f.write(readme_text)\n",
633
- "\n",
634
- "candidates = [\n",
635
- " (\"/content/spindleflow_colab_model.zip\", \"spindleflow_model.zip\"),\n",
636
- " (\"/content/vec_normalize_colab.pkl\", \"vec_normalize.pkl\"),\n",
637
- " (\"/content/reward_curve.png\", \"reward_curve.png\"),\n",
638
- " (\"/content/demo/assets/reward_curve.json\", \"reward_curve.json\"),\n",
639
- " (\"/content/logs/training_log.txt\", \"training_log.txt\"),\n",
640
- " (readme_path, \"README.md\"),\n",
641
- "]\n",
642
- "\n",
643
- "ops = [\n",
644
- " CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
645
- " for src, dst in candidates\n",
646
- " if os.path.exists(src)\n",
647
- "]\n",
648
- "\n",
649
- "api.create_commit(\n",
650
- " repo_id=HF_REPO,\n",
651
- " repo_type=\"model\",\n",
652
- " operations=ops,\n",
653
- " commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
654
- " token=HF_TOKEN,\n",
655
- ")\n",
656
- "\n",
657
- "_tlog(f\"Uploaded {len(ops)} files:\")\n",
658
- "for src, dst in candidates:\n",
659
- " if os.path.exists(src):\n",
660
- " _tlog(f\" βœ“ {dst}\")\n",
661
- "\n",
662
- "_tlog(f\"Model : https://huggingface.co/{HF_REPO}\")\n",
663
- "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
664
- "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
665
- "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
666
- "print(\"\\nβœ… All done!\")"
667
- ],
668
- "outputs": [],
669
- "execution_count": null
670
- }
671
- ]
672
- }
 
1
  {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "name": "SpindleFlow_RL_Training.ipynb"
 
 
 
 
 
 
 
 
 
9
  },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ },
17
+ "accelerator": "GPU"
18
+ },
19
+ "cells": [
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "# SpindleFlow RL β€” Training Notebook\n",
25
+ "\n",
26
+ "**Hardware**: Runtime β†’ Change runtime type β†’ **T4 GPU**\n",
27
+ "\n",
28
+ "**Secrets** (key icon in left sidebar β†’ Manage secrets):\n",
29
+ "\n",
30
+ "| Name | Required | Notes |\n",
31
+ "|---|---|---|\n",
32
+ "| `HF_TOKEN` | βœ… Yes | HuggingFace write token β€” hf.co/settings/tokens β†’ New token (write) |\n",
33
+ "| `OPENAI_API_KEY` | βœ… Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
34
+ "\n",
35
+ "Run cells **top to bottom, one at a time**. Do NOT skip cells."
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "metadata": {},
41
+ "source": [
42
+ "## Cell 1 β€” Install dependencies & clone repo\n",
43
+ "Run once. After it finishes, **do NOT restart the runtime** β€” continue to Cell 2."
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "metadata": {},
49
+ "source": "import subprocess, os, sys\n\nprint(f\"Python version: {sys.version}\")\n\n# audioop-lts is only for Python 3.13+ β€” Colab uses 3.12 where audioop is built-in\n_pkgs = [\n \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n \"transformers\", \"datasets\", \"torch\",\n \"matplotlib\", \"huggingface_hub\",\n]\nif sys.version_info >= (3, 13):\n _pkgs.append(\"audioop-lts\")\n\nresult = subprocess.run([\"pip\", \"install\"] + _pkgs, capture_output=True, text=True)\nif result.returncode != 0:\n print(\"STDOUT:\", result.stdout[-3000:])\n print(\"STDERR:\", result.stderr[-3000:])\n raise RuntimeError(\"pip install failed β€” see output above\")\nprint(\"βœ… Packages installed\")\n\nREPO = \"/content/kuchbhi/spindleflow-rl\"\nif not os.path.isdir(REPO):\n subprocess.run(\n [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n cwd=\"/content\", check=True,\n )\n print(\"βœ… Repo cloned\")\nelse:\n print(\"Repo already present β€” pulling latest\")\n subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n\nos.chdir(REPO)\nsys.path.insert(0, \".\")\n\nimport importlib.metadata\nprint(f\"OpenEnv version : {importlib.metadata.version('openenv')}\")\n\nos.makedirs(\"/content/demo/assets\", exist_ok=True)\nos.makedirs(\"/content/data\", exist_ok=True)\nos.makedirs(\"/content/checkpoints\", exist_ok=True)\nos.makedirs(\"/content/logs\", exist_ok=True)\n\nprint(f\"Working directory: {os.getcwd()}\")\nprint(\"βœ… Setup complete\")",
50
+ "outputs": [],
51
+ "execution_count": null
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "metadata": {},
56
+ "source": [
57
+ "## Cell 2 β€” Set secrets & verify\n",
58
+ "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets. \n",
59
+ "**Both must show βœ… before continuing.**"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "metadata": {},
65
+ "source": [
66
+ "import os\n",
67
+ "from google.colab import userdata\n",
68
+ "\n",
69
+ "HF_TOKEN = userdata.get(\"HF_TOKEN\")\n",
70
+ "OPENAI_API_KEY = userdata.get(\"OPENAI_API_KEY\")\n",
71
+ "\n",
72
+ "if not HF_TOKEN:\n",
73
+ " raise RuntimeError(\n",
74
+ " \"HF_TOKEN not set.\\n\"\n",
75
+ " \"Go to the key icon (left sidebar) β†’ Add secret β†’ Name: HF_TOKEN, \"\n",
76
+ " \"Value: your write token from hf.co/settings/tokens β†’ enable notebook access.\"\n",
77
+ " )\n",
78
+ "\n",
79
+ "if not OPENAI_API_KEY:\n",
80
+ " raise RuntimeError(\n",
81
+ " \"OPENAI_API_KEY not set.\\n\"\n",
82
+ " \"Go to the key icon (left sidebar) β†’ Add secret β†’ Name: OPENAI_API_KEY, \"\n",
83
+ " \"Value: sk-... β†’ enable notebook access.\"\n",
84
+ " )\n",
85
+ "\n",
86
+ "# Inject into environment so all modules pick them up\n",
87
+ "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
88
+ "\n",
89
+ "print(f\"βœ… HF_TOKEN : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}\")\n",
90
+ "print(f\"βœ… OPENAI_API_KEY: {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}\")\n",
91
+ "print(\"Both secrets loaded β€” proceeding.\")"
92
+ ],
93
+ "outputs": [],
94
+ "execution_count": null
95
+ },
96
+ {
97
+ "cell_type": "markdown",
98
+ "metadata": {},
99
+ "source": [
100
+ "## Cell 3 β€” Patch env + smoke test\n",
101
+ "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "metadata": {},
107
+ "source": [
108
+ "import os as _os\n",
109
+ "import numpy as np\n",
110
+ "from env.spindleflow_env import SpindleFlowEnv\n",
111
+ "\n",
112
+ "# Monkey-patch: add simulate_specialists kwarg (fast per-step simulation)\n",
113
+ "if not getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
114
+ " _orig_init = SpindleFlowEnv.__init__\n",
115
+ "\n",
116
+ " def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
117
+ " _orig_init(self, *args, **kwargs)\n",
118
+ " self.simulate_specialists = simulate_specialists\n",
119
+ "\n",
120
+ " SpindleFlowEnv.__init__ = _new_init\n",
121
+ "\n",
122
+ " _orig_call = SpindleFlowEnv._call_specialist\n",
123
+ "\n",
124
+ " def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
125
+ " if getattr(self, \"simulate_specialists\", False):\n",
126
+ " _key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
127
+ " try:\n",
128
+ " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
129
+ " finally:\n",
130
+ " if _key:\n",
131
+ " _os.environ[\"OPENAI_API_KEY\"] = _key\n",
132
+ " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
133
+ "\n",
134
+ " SpindleFlowEnv._call_specialist = _new_call\n",
135
+ " SpindleFlowEnv._simulate_patched = True\n",
136
+ " print(\"βœ… SpindleFlowEnv patched\")\n",
137
+ "else:\n",
138
+ " print(\"Already patched β€” skipping\")\n",
139
+ "\n",
140
+ "env = SpindleFlowEnv(\n",
141
+ " config_path=\"configs/training_config.yaml\",\n",
142
+ " catalog_path=\"configs/specialist_catalog.yaml\",\n",
143
+ " use_real_spindleflow=False,\n",
144
+ " phase=1,\n",
145
+ " simulate_specialists=True,\n",
146
+ ")\n",
147
+ "obs, info = env.reset()\n",
148
+ "print(f\"Observation shape : {obs.shape}\")\n",
149
+ "print(f\"Task : {info['task'][:80]}\")\n",
150
+ "\n",
151
+ "action = env.action_space.sample()\n",
152
+ "obs2, reward, terminated, truncated, info2 = env.step(action)\n",
153
+ "print(f\"Step reward : {reward:.4f}\")\n",
154
+ "print(f\"Action name : {info2['action_name']}\")\n",
155
+ "print(f\"Reward components : {info2['reward_components']}\")\n",
156
+ "env.close()\n",
157
+ "print(\"βœ… Environment OK\")"
158
+ ],
159
+ "outputs": [],
160
+ "execution_count": null
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "## Cell 4 β€” HuggingFace TRL check\n",
167
+ "Confirms TRL is importable (hackathon requirement)."
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "metadata": {},
173
+ "source": [
174
+ "import trl, torch\n",
175
+ "\n",
176
+ "print(f\"TRL version : {trl.__version__}\")\n",
177
+ "print(f\"Torch version : {torch.__version__}\")\n",
178
+ "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
179
+ "if torch.cuda.is_available():\n",
180
+ " print(f\"GPU : {torch.cuda.get_device_name(0)}\")\n",
181
+ "\n",
182
+ "for _name in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\"):\n",
183
+ " _cls = getattr(trl, _name, None)\n",
184
+ " if _cls is not None:\n",
185
+ " print(f\"TRL config class: {_name} βœ…\")\n",
186
+ " break\n",
187
+ "else:\n",
188
+ " print(\"TRL imported βœ… (config uses TrainingArguments in this version)\")\n",
189
+ "\n",
190
+ "print(\"βœ… TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
191
+ ],
192
+ "outputs": [],
193
+ "execution_count": null
194
+ },
195
+ {
196
+ "cell_type": "markdown",
197
+ "metadata": {},
198
+ "source": [
199
+ "## Cell 5 β€” RecurrentPPO training\n",
200
+ "\n",
201
+ "**What's happening:**\n",
202
+ "- Per-step specialist calls: local simulation (fast, no API cost)\n",
203
+ "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
204
+ "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
205
+ "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
206
+ "\n",
207
+ "**Expected runtime: 20–30 min on T4 GPU**"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "metadata": {},
213
+ "source": [
214
+ "import time, yaml\n",
215
+ "import torch\n",
216
+ "import numpy as np\n",
217
+ "from sb3_contrib import RecurrentPPO\n",
218
+ "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
219
+ "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
220
+ "from policy.lstm_policy import build_policy_kwargs\n",
221
+ "from training.curriculum import CurriculumManager\n",
222
+ "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
223
+ "\n",
224
+ "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
225
+ "\n",
226
+ "def _tlog(msg: str):\n",
227
+ " ts = time.strftime(\"%H:%M:%S\")\n",
228
+ " line = f\"[{ts}] {msg}\"\n",
229
+ " print(line, flush=True)\n",
230
+ " with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as _f:\n",
231
+ " _f.write(line + \"\\n\")\n",
232
+ "\n",
233
+ "with open(\"configs/training_config.yaml\") as f:\n",
234
+ " _cfg = yaml.safe_load(f)\n",
235
+ "\n",
236
+ "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
237
+ "\n",
238
+ "TOTAL_TIMESTEPS = 100_000 # ~10k episodes, ~20-25 min on T4\n",
239
+ "\n",
240
+ "\n",
241
+ "class RewardLogger(BaseCallback):\n",
242
+ " def __init__(self, curriculum):\n",
243
+ " super().__init__()\n",
244
+ " self.episode_rewards = []\n",
245
+ " self._running = 0.0\n",
246
+ " self._curriculum = curriculum\n",
247
+ "\n",
248
+ " def _on_step(self):\n",
249
+ " for r, d in zip(\n",
250
+ " self.locals.get(\"rewards\", []),\n",
251
+ " self.locals.get(\"dones\", []),\n",
252
+ " ):\n",
253
+ " self._running += float(r)\n",
254
+ " if d:\n",
255
+ " ep = self._running\n",
256
+ " self.episode_rewards.append(ep)\n",
257
+ " self._running = 0.0\n",
258
+ " advanced = self._curriculum.on_episode_end(ep)\n",
259
+ " n = len(self.episode_rewards)\n",
260
+ " if advanced or n % 50 == 0:\n",
261
+ " _tlog(\n",
262
+ " f\"Ep {n:5d} | reward {ep:+.3f} | \"\n",
263
+ " f\"{self._curriculum.progress_str()}\"\n",
264
+ " )\n",
265
+ " return True\n",
266
+ "\n",
267
+ "\n",
268
+ "def make_env():\n",
269
+ " return SpindleFlowEnv(\n",
270
+ " config_path=\"configs/training_config.yaml\",\n",
271
+ " catalog_path=\"configs/specialist_catalog.yaml\",\n",
272
+ " use_real_spindleflow=False,\n",
273
+ " phase=1,\n",
274
+ " simulate_specialists=True,\n",
275
+ " )\n",
276
+ "\n",
277
+ "\n",
278
+ "vec_env = DummyVecEnv([make_env])\n",
279
+ "vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)\n",
280
+ "\n",
281
+ "_ppo = _cfg.get(\"ppo\", {})\n",
282
+ "_lstm = _cfg.get(\"lstm\", {})\n",
283
+ "\n",
284
+ "model = RecurrentPPO(\n",
285
+ " policy=\"MlpLstmPolicy\",\n",
286
+ " env=vec_env,\n",
287
+ " learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
288
+ " n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
289
+ " batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
290
+ " n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
291
+ " gamma=float(_ppo.get(\"gamma\", 0.99)),\n",
292
+ " gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
293
+ " clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
294
+ " ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
295
+ " vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
296
+ " max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
297
+ " policy_kwargs=build_policy_kwargs(\n",
298
+ " hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
299
+ " ),\n",
300
+ " verbose=0,\n",
301
+ " seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
302
+ " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
303
+ ")\n",
304
+ "\n",
305
+ "_tlog(f\"Device : {model.device}\")\n",
306
+ "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
307
+ "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} β€” {curriculum.progress_str()}\")\n",
308
+ "_tlog(\"Training started...\")\n",
309
+ "\n",
310
+ "reward_logger = RewardLogger(curriculum=curriculum)\n",
311
+ "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
312
+ "improvement_cb = SpecialistImprovementCallback(\n",
313
+ " improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
314
+ " \"improve_every_n_episodes\", 100\n",
315
+ " ),\n",
316
+ " verbose=1,\n",
317
+ ")\n",
318
+ "\n",
319
+ "_t0 = time.time()\n",
320
+ "model.learn(\n",
321
+ " total_timesteps=TOTAL_TIMESTEPS,\n",
322
+ " callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
323
+ ")\n",
324
+ "_elapsed = time.time() - _t0\n",
325
+ "\n",
326
+ "model.save(\"/content/spindleflow_colab_model\")\n",
327
+ "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
328
+ "\n",
329
+ "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
330
+ "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
331
+ "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
332
+ "print(\"\\nβœ… Model saved to /content/spindleflow_colab_model.zip\")"
333
+ ],
334
+ "outputs": [],
335
+ "execution_count": null
336
+ },
337
+ {
338
+ "cell_type": "markdown",
339
+ "metadata": {},
340
+ "source": [
341
+ "## Cell 6 β€” Reward curve\n",
342
+ "Generates publication-quality plot and saves JSON for the HF Space demo."
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "metadata": {},
348
+ "source": [
349
+ "import json\n",
350
+ "import numpy as np\n",
351
+ "import matplotlib\n",
352
+ "matplotlib.use(\"Agg\")\n",
353
+ "import matplotlib.pyplot as plt\n",
354
+ "\n",
355
+ "ep_rewards = reward_logger.episode_rewards\n",
356
+ "if not ep_rewards:\n",
357
+ " raise RuntimeError(\"No episodes completed β€” check Cell 5 output for errors.\")\n",
358
+ "\n",
359
+ "n_ep = len(ep_rewards)\n",
360
+ "episodes = list(range(n_ep))\n",
361
+ "window = max(30, n_ep // 20) # adaptive: ~5% of total\n",
362
+ "\n",
363
+ "smoothed = [\n",
364
+ " float(np.mean(ep_rewards[max(0, i - window):i + 1]))\n",
365
+ " for i in range(n_ep)\n",
366
+ "]\n",
367
+ "\n",
368
+ "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
369
+ "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
370
+ "improvement = final_mean - early_mean\n",
371
+ "\n",
372
+ "# ── Save JSON ──────────────────────────────────────────────────\n",
373
+ "step = max(1, n_ep // 300)\n",
374
+ "json_data = {\n",
375
+ " \"episodes\": episodes[::step],\n",
376
+ " \"mean_rewards\": smoothed[::step],\n",
377
+ "}\n",
378
+ "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
379
+ " json.dump(json_data, f)\n",
380
+ "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
381
+ "\n",
382
+ "# ── Plot ───────────────────────────────────────────────────────\n",
383
+ "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
384
+ "fig.patch.set_facecolor(\"#0d1117\")\n",
385
+ "ax.set_facecolor(\"#161b22\")\n",
386
+ "\n",
387
+ "plot_every = max(1, n_ep // 800)\n",
388
+ "ax.scatter(\n",
389
+ " episodes[::plot_every], ep_rewards[::plot_every],\n",
390
+ " s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
391
+ ")\n",
392
+ "ax.plot(\n",
393
+ " episodes[::plot_every], smoothed[::plot_every],\n",
394
+ " linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
395
+ " label=f\"Smoothed ({window}-ep mean)\",\n",
396
+ ")\n",
397
+ "ax.axhline(\n",
398
+ " y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
399
+ " label=f\"Early baseline {early_mean:+.3f}\",\n",
400
+ ")\n",
401
+ "ax.axhline(\n",
402
+ " y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
403
+ " label=f\"Final mean {final_mean:+.3f}\",\n",
404
+ ")\n",
405
+ "\n",
406
+ "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
407
+ "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
408
+ "ax.set_title(\n",
409
+ " \"SpindleFlow RL β€” Delegation Policy Learning Curve\\n\"\n",
410
+ " f\"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes\",\n",
411
+ " color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
412
+ ")\n",
413
+ "ax.tick_params(colors=\"#8b949e\")\n",
414
+ "for spine in ax.spines.values():\n",
415
+ " spine.set_edgecolor(\"#30363d\")\n",
416
+ "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
417
+ "ax.legend(\n",
418
+ " fontsize=10, framealpha=0.85,\n",
419
+ " facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
420
+ ")\n",
421
+ "\n",
422
+ "sign = \"β–²\" if improvement >= 0 else \"β–Ό\"\n",
423
+ "ax.annotate(\n",
424
+ " f\" {sign} {abs(improvement):.3f} reward improvement\",\n",
425
+ " xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
426
+ " color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
427
+ ")\n",
428
+ "\n",
429
+ "fig.tight_layout()\n",
430
+ "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
431
+ " facecolor=fig.get_facecolor())\n",
432
+ "plt.show()\n",
433
+ "\n",
434
+ "print(f\"\\n{'='*50}\")\n",
435
+ "print(f\"Episodes completed : {n_ep:,}\")\n",
436
+ "print(f\"Early baseline : {early_mean:+.4f}\")\n",
437
+ "print(f\"Final mean : {final_mean:+.4f}\")\n",
438
+ "print(f\"Improvement : {improvement:+.4f}\")\n",
439
+ "print(f\"{'='*50}\")\n",
440
+ "print(\"βœ… Reward curve saved to /content/reward_curve.png\")\n",
441
+ "\n",
442
+ "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
443
+ ],
444
+ "outputs": [],
445
+ "execution_count": null
446
+ },
447
+ {
448
+ "cell_type": "markdown",
449
+ "metadata": {},
450
+ "source": [
451
+ "## Cell 7 β€” Learning features audit\n",
452
+ "Confirms each self-learning feature fired at least once during training."
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "code",
457
+ "metadata": {},
458
+ "source": [
459
+ "import os, json\n",
460
+ "from pathlib import Path\n",
461
+ "\n",
462
+ "print(\"=\"*55)\n",
463
+ "print(\"LEARNING FEATURES AUDIT\")\n",
464
+ "print(\"=\"*55)\n",
465
+ "\n",
466
+ "# Feature 5 β€” Curriculum\n",
467
+ "print(f\"\\nFeature 5 β€” Curriculum (performance-gated)\")\n",
468
+ "print(f\" Final phase : {curriculum.current_phase}/3\")\n",
469
+ "print(f\" Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
470
+ "print(f\" {curriculum.progress_str()}\")\n",
471
+ "\n",
472
+ "# Feature 2 β€” Specialist memory\n",
473
+ "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
474
+ " \"memory_path\", \"data/specialist_memory.json\"\n",
475
+ "))\n",
476
+ "print(f\"\\nFeature 2 β€” Specialist memory ({mem_path})\")\n",
477
+ "if mem_path.exists():\n",
478
+ " data = json.loads(mem_path.read_text())\n",
479
+ " total_entries = sum(len(v) for v in data.values())\n",
480
+ " print(f\" Specialists with memory : {len(data)}\")\n",
481
+ " print(f\" Total entries recorded : {total_entries}\")\n",
482
+ " for sid, entries in list(data.items())[:3]:\n",
483
+ " avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
484
+ " print(f\" {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
485
+ "else:\n",
486
+ " print(\" No memory file yet (finetuner may not have fired β€” normal below 100 episodes)\")\n",
487
+ "\n",
488
+ "# Feature 3 β€” Spawn memory\n",
489
+ "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
490
+ " \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
491
+ "))\n",
492
+ "print(f\"\\nFeature 3 β€” Spawn memory ({spawn_path})\")\n",
493
+ "if spawn_path.exists():\n",
494
+ " lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]\n",
495
+ " print(f\" Spawn records written: {len(lines)}\")\n",
496
+ " for line in lines[:3]:\n",
497
+ " rec = json.loads(line)\n",
498
+ " print(f\" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
499
+ " f\"| sim {rec['pre_spawn_sim']:.2f}β†’{rec['post_spawn_sim']:.2f}\")\n",
500
+ "else:\n",
501
+ " print(\" No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
502
+ "\n",
503
+ "# Feature 4 β€” Resolution bandit\n",
504
+ "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
505
+ " \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
506
+ "))\n",
507
+ "print(f\"\\nFeature 4 β€” Resolution bandit ({res_path})\")\n",
508
+ "if res_path.exists():\n",
509
+ " lines = [l for l in res_path.read_text().splitlines() if l.strip()]\n",
510
+ " print(f\" Outcome records written: {len(lines)}\")\n",
511
+ " stats = {}\n",
512
+ " for line in lines:\n",
513
+ " rec = json.loads(line)\n",
514
+ " key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
515
+ " stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
516
+ " for k, deltas in stats.items():\n",
517
+ " print(f\" {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
518
+ "else:\n",
519
+ " print(\" No resolution memory yet (requires detected conflicts)\")\n",
520
+ "\n",
521
+ "print(\"\\n\" + \"=\"*55)\n",
522
+ "print(\"βœ… Audit complete\")\n",
523
+ "print(\"=\"*55)"
524
+ ],
525
+ "outputs": [],
526
+ "execution_count": null
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "metadata": {},
531
+ "source": [
532
+ "## Cell 8 β€” Push to HuggingFace Hub\n",
533
+ "\n",
534
+ "Uploads model checkpoint, reward curve, training log, and README to `garvitsachdeva/spindleflow-rl`."
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "metadata": {},
540
+ "source": [
541
+ "import os, json\n",
542
+ "import numpy as np\n",
543
+ "from huggingface_hub import HfApi, CommitOperationAdd\n",
544
+ "\n",
545
+ "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
546
+ "api = HfApi(token=HF_TOKEN)\n",
547
+ "\n",
548
+ "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
549
+ "api.create_repo(repo_id=HF_REPO.split(\"/\")[-1], repo_type=\"model\", exist_ok=True)\n",
550
+ "\n",
551
+ "ep = reward_logger.episode_rewards\n",
552
+ "f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0\n",
553
+ "l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0\n",
554
+ "\n",
555
+ "readme_text = f\"\"\"---\n",
556
+ "license: mit\n",
557
+ "tags:\n",
558
+ " - reinforcement-learning\n",
559
+ " - stable-baselines3\n",
560
+ " - sb3-contrib\n",
561
+ " - gymnasium\n",
562
+ " - multi-agent\n",
563
+ " - openenv\n",
564
+ "library_name: stable-baselines3\n",
565
+ "---\n",
566
+ "\n",
567
+ "# SpindleFlow RL β€” Delegation Policy\n",
568
+ "\n",
569
+ "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv). \n",
570
+ "Trained on Google Colab T4 GPU.\n",
571
+ "\n",
572
+ "## Training summary\n",
573
+ "| Metric | Value |\n",
574
+ "|---|---|\n",
575
+ "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
576
+ "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
577
+ "| Episodes completed | {len(ep):,} |\n",
578
+ "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
579
+ "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
580
+ "| Improvement | {final_mean - early_mean:+.4f} |\n",
581
+ "| Training time | {_elapsed/60:.1f} min |\n",
582
+ "| Device | T4 GPU |\n",
583
+ "\n",
584
+ "![Reward Curve](reward_curve.png)\n",
585
+ "\n",
586
+ "## Load\n",
587
+ "```python\n",
588
+ "from sb3_contrib import RecurrentPPO\n",
589
+ "from huggingface_hub import hf_hub_download\n",
590
+ "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
591
+ "```\n",
592
+ "\"\"\"\n",
593
+ "\n",
594
+ "readme_path = \"/content/README_model.md\"\n",
595
+ "with open(readme_path, \"w\") as f:\n",
596
+ " f.write(readme_text)\n",
597
+ "\n",
598
+ "candidates = [\n",
599
+ " (\"/content/spindleflow_colab_model.zip\", \"spindleflow_model.zip\"),\n",
600
+ " (\"/content/vec_normalize_colab.pkl\", \"vec_normalize.pkl\"),\n",
601
+ " (\"/content/reward_curve.png\", \"reward_curve.png\"),\n",
602
+ " (\"/content/demo/assets/reward_curve.json\", \"reward_curve.json\"),\n",
603
+ " (\"/content/logs/training_log.txt\", \"training_log.txt\"),\n",
604
+ " (readme_path, \"README.md\"),\n",
605
+ "]\n",
606
+ "\n",
607
+ "ops = [\n",
608
+ " CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
609
+ " for src, dst in candidates\n",
610
+ " if os.path.exists(src)\n",
611
+ "]\n",
612
+ "\n",
613
+ "api.create_commit(\n",
614
+ " repo_id=HF_REPO,\n",
615
+ " repo_type=\"model\",\n",
616
+ " operations=ops,\n",
617
+ " commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
618
+ " token=HF_TOKEN,\n",
619
+ ")\n",
620
+ "\n",
621
+ "_tlog(f\"Uploaded {len(ops)} files:\")\n",
622
+ "for src, dst in candidates:\n",
623
+ " if os.path.exists(src):\n",
624
+ " _tlog(f\" βœ“ {dst}\")\n",
625
+ "\n",
626
+ "_tlog(f\"Model : https://huggingface.co/{HF_REPO}\")\n",
627
+ "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
628
+ "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
629
+ "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
630
+ "print(\"\\nβœ… All done!\")"
631
+ ],
632
+ "outputs": [],
633
+ "execution_count": null
634
+ }
635
+ ]
636
+ }