garvitsachdeva Claude Sonnet 4.6 committed on
Commit
c8407b4
·
1 Parent(s): c77f83c

Remove Colab notebook — using plain .py script instead

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. colab/SpindleFlow_RL_Training.ipynb +0 -636
colab/SpindleFlow_RL_Training.ipynb DELETED
@@ -1,636 +0,0 @@
1
- {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "gpuType": "T4",
8
- "name": "SpindleFlow_RL_Training.ipynb"
9
- },
10
- "kernelspec": {
11
- "name": "python3",
12
- "display_name": "Python 3"
13
- },
14
- "language_info": {
15
- "name": "python"
16
- },
17
- "accelerator": "GPU"
18
- },
19
- "cells": [
20
- {
21
- "cell_type": "markdown",
22
- "metadata": {},
23
- "source": [
24
- "# SpindleFlow RL — Training Notebook\n",
25
- "\n",
26
- "**Hardware**: Runtime → Change runtime type → **T4 GPU**\n",
27
- "\n",
28
- "**Secrets** (key icon in left sidebar → Manage secrets):\n",
29
- "\n",
30
- "| Name | Required | Notes |\n",
31
- "|---|---|---|\n",
32
- "| `HF_TOKEN` | ✅ Yes | HuggingFace write token — hf.co/settings/tokens → New token (write) |\n",
33
- "| `OPENAI_API_KEY` | ✅ Yes | GPT-4o-mini for task generation, finetuner, reward baseline |\n",
34
- "\n",
35
- "Run cells **top to bottom, one at a time**. Do NOT skip cells."
36
- ]
37
- },
38
- {
39
- "cell_type": "markdown",
40
- "metadata": {},
41
- "source": [
42
- "## Cell 1 — Install dependencies & clone repo\n",
43
- "Run once. After it finishes, **do NOT restart the runtime** — continue to Cell 2."
44
- ]
45
- },
46
- {
47
- "cell_type": "code",
48
- "metadata": {},
49
- "source": "import subprocess, os, sys\n\nprint(f\"Python version: {sys.version}\")\n\n# audioop-lts is only for Python 3.13+ — Colab uses 3.12 where audioop is built-in\n_pkgs = [\n \"openenv\", \"stable-baselines3\", \"sb3-contrib\", \"gymnasium\",\n \"sentence-transformers\", \"openai\", \"pyyaml\", \"trl\",\n \"transformers\", \"datasets\", \"torch\",\n \"matplotlib\", \"huggingface_hub\",\n]\nif sys.version_info >= (3, 13):\n _pkgs.append(\"audioop-lts\")\n\nresult = subprocess.run([\"pip\", \"install\"] + _pkgs, capture_output=True, text=True)\nif result.returncode != 0:\n print(\"STDOUT:\", result.stdout[-3000:])\n print(\"STDERR:\", result.stderr[-3000:])\n raise RuntimeError(\"pip install failed — see output above\")\nprint(\"✅ Packages installed\")\n\nREPO = \"/content/kuchbhi/spindleflow-rl\"\nif not os.path.isdir(REPO):\n subprocess.run(\n [\"git\", \"clone\", \"https://github.com/garvitsachdevaa/kuchbhi.git\"],\n cwd=\"/content\", check=True,\n )\n print(\"✅ Repo cloned\")\nelse:\n print(\"Repo already present — pulling latest\")\n subprocess.run([\"git\", \"pull\"], cwd=REPO, check=True)\n\nos.chdir(REPO)\nsys.path.insert(0, \".\")\n\nimport importlib.metadata\nprint(f\"OpenEnv version : {importlib.metadata.version('openenv')}\")\n\nos.makedirs(\"/content/demo/assets\", exist_ok=True)\nos.makedirs(\"/content/data\", exist_ok=True)\nos.makedirs(\"/content/checkpoints\", exist_ok=True)\nos.makedirs(\"/content/logs\", exist_ok=True)\n\nprint(f\"Working directory: {os.getcwd()}\")\nprint(\"✅ Setup complete\")",
50
- "outputs": [],
51
- "execution_count": null
52
- },
53
- {
54
- "cell_type": "markdown",
55
- "metadata": {},
56
- "source": [
57
- "## Cell 2 — Set secrets & verify\n",
58
- "Reads `HF_TOKEN` and `OPENAI_API_KEY` from Colab secrets. \n",
59
- "**Both must show ✅ before continuing.**"
60
- ]
61
- },
62
- {
63
- "cell_type": "code",
64
- "metadata": {},
65
- "source": [
66
- "import os\n",
67
- "from google.colab import userdata\n",
68
- "\n",
69
- "HF_TOKEN = userdata.get(\"HF_TOKEN\")\n",
70
- "OPENAI_API_KEY = userdata.get(\"OPENAI_API_KEY\")\n",
71
- "\n",
72
- "if not HF_TOKEN:\n",
73
- " raise RuntimeError(\n",
74
- " \"HF_TOKEN not set.\\n\"\n",
75
- " \"Go to the key icon (left sidebar) → Add secret → Name: HF_TOKEN, \"\n",
76
- " \"Value: your write token from hf.co/settings/tokens → enable notebook access.\"\n",
77
- " )\n",
78
- "\n",
79
- "if not OPENAI_API_KEY:\n",
80
- " raise RuntimeError(\n",
81
- " \"OPENAI_API_KEY not set.\\n\"\n",
82
- " \"Go to the key icon (left sidebar) → Add secret → Name: OPENAI_API_KEY, \"\n",
83
- " \"Value: sk-... → enable notebook access.\"\n",
84
- " )\n",
85
- "\n",
86
- "# Inject into environment so all modules pick them up\n",
87
- "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
88
- "\n",
89
- "print(f\"✅ HF_TOKEN : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}\")\n",
90
- "print(f\"✅ OPENAI_API_KEY: {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}\")\n",
91
- "print(\"Both secrets loaded — proceeding.\")"
92
- ],
93
- "outputs": [],
94
- "execution_count": null
95
- },
96
- {
97
- "cell_type": "markdown",
98
- "metadata": {},
99
- "source": [
100
- "## Cell 3 — Patch env + smoke test\n",
101
- "Adds `simulate_specialists` support and runs one end-to-end step to confirm the env works."
102
- ]
103
- },
104
- {
105
- "cell_type": "code",
106
- "metadata": {},
107
- "source": [
108
- "import os as _os\n",
109
- "import numpy as np\n",
110
- "from env.spindleflow_env import SpindleFlowEnv\n",
111
- "\n",
112
- "# Monkey-patch: add simulate_specialists kwarg (fast per-step simulation)\n",
113
- "if not getattr(SpindleFlowEnv, \"_simulate_patched\", False):\n",
114
- " _orig_init = SpindleFlowEnv.__init__\n",
115
- "\n",
116
- " def _new_init(self, *args, simulate_specialists=False, **kwargs):\n",
117
- " _orig_init(self, *args, **kwargs)\n",
118
- " self.simulate_specialists = simulate_specialists\n",
119
- "\n",
120
- " SpindleFlowEnv.__init__ = _new_init\n",
121
- "\n",
122
- " _orig_call = SpindleFlowEnv._call_specialist\n",
123
- "\n",
124
- " def _new_call(self, specialist_id, task, elapsed_ms, context=None):\n",
125
- " if getattr(self, \"simulate_specialists\", False):\n",
126
- " _key = _os.environ.pop(\"OPENAI_API_KEY\", None)\n",
127
- " try:\n",
128
- " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
129
- " finally:\n",
130
- " if _key:\n",
131
- " _os.environ[\"OPENAI_API_KEY\"] = _key\n",
132
- " return _orig_call(self, specialist_id, task, elapsed_ms, context=context)\n",
133
- "\n",
134
- " SpindleFlowEnv._call_specialist = _new_call\n",
135
- " SpindleFlowEnv._simulate_patched = True\n",
136
- " print(\"✅ SpindleFlowEnv patched\")\n",
137
- "else:\n",
138
- " print(\"Already patched — skipping\")\n",
139
- "\n",
140
- "env = SpindleFlowEnv(\n",
141
- " config_path=\"configs/training_config.yaml\",\n",
142
- " catalog_path=\"configs/specialist_catalog.yaml\",\n",
143
- " use_real_spindleflow=False,\n",
144
- " phase=1,\n",
145
- " simulate_specialists=True,\n",
146
- ")\n",
147
- "obs, info = env.reset()\n",
148
- "print(f\"Observation shape : {obs.shape}\")\n",
149
- "print(f\"Task : {info['task'][:80]}\")\n",
150
- "\n",
151
- "action = env.action_space.sample()\n",
152
- "obs2, reward, terminated, truncated, info2 = env.step(action)\n",
153
- "print(f\"Step reward : {reward:.4f}\")\n",
154
- "print(f\"Action name : {info2['action_name']}\")\n",
155
- "print(f\"Reward components : {info2['reward_components']}\")\n",
156
- "env.close()\n",
157
- "print(\"✅ Environment OK\")"
158
- ],
159
- "outputs": [],
160
- "execution_count": null
161
- },
162
- {
163
- "cell_type": "markdown",
164
- "metadata": {},
165
- "source": [
166
- "## Cell 4 — HuggingFace TRL check\n",
167
- "Confirms TRL is importable (hackathon requirement)."
168
- ]
169
- },
170
- {
171
- "cell_type": "code",
172
- "metadata": {},
173
- "source": [
174
- "import trl, torch\n",
175
- "\n",
176
- "print(f\"TRL version : {trl.__version__}\")\n",
177
- "print(f\"Torch version : {torch.__version__}\")\n",
178
- "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
179
- "if torch.cuda.is_available():\n",
180
- " print(f\"GPU : {torch.cuda.get_device_name(0)}\")\n",
181
- "\n",
182
- "for _name in (\"PPOConfig\", \"GRPOConfig\", \"SFTConfig\"):\n",
183
- " _cls = getattr(trl, _name, None)\n",
184
- " if _cls is not None:\n",
185
- " print(f\"TRL config class: {_name} ✅\")\n",
186
- " break\n",
187
- "else:\n",
188
- " print(\"TRL imported ✅ (config uses TrainingArguments in this version)\")\n",
189
- "\n",
190
- "print(\"✅ TRL requirement satisfied. Primary training uses RecurrentPPO (Cell 5).\")"
191
- ],
192
- "outputs": [],
193
- "execution_count": null
194
- },
195
- {
196
- "cell_type": "markdown",
197
- "metadata": {},
198
- "source": [
199
- "## Cell 5 — RecurrentPPO training\n",
200
- "\n",
201
- "**What's happening:**\n",
202
- "- Per-step specialist calls: local simulation (fast, no API cost)\n",
203
- "- Task generation: GPT-4o-mini via `OPENAI_API_KEY` (diverse tasks)\n",
204
- "- Finetuner: fires every 100 episodes via `OPENAI_API_KEY` (improves specialist prompts)\n",
205
- "- Reward baseline: LLM-generated via `OPENAI_API_KEY` (accurate quality signal)\n",
206
- "\n",
207
- "**Expected runtime: 20–30 min on T4 GPU**"
208
- ]
209
- },
210
- {
211
- "cell_type": "code",
212
- "metadata": {},
213
- "source": [
214
- "import time, yaml\n",
215
- "import torch\n",
216
- "import numpy as np\n",
217
- "from sb3_contrib import RecurrentPPO\n",
218
- "from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize\n",
219
- "from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback\n",
220
- "from policy.lstm_policy import build_policy_kwargs\n",
221
- "from training.curriculum import CurriculumManager\n",
222
- "from training.specialist_improvement_callback import SpecialistImprovementCallback\n",
223
- "\n",
224
- "_LOG_FILE = \"/content/logs/training_log.txt\"\n",
225
- "\n",
226
- "def _tlog(msg: str):\n",
227
- " ts = time.strftime(\"%H:%M:%S\")\n",
228
- " line = f\"[{ts}] {msg}\"\n",
229
- " print(line, flush=True)\n",
230
- " with open(_LOG_FILE, \"a\", encoding=\"utf-8\") as _f:\n",
231
- " _f.write(line + \"\\n\")\n",
232
- "\n",
233
- "with open(\"configs/training_config.yaml\") as f:\n",
234
- " _cfg = yaml.safe_load(f)\n",
235
- "\n",
236
- "curriculum = CurriculumManager(config_path=\"configs/training_config.yaml\")\n",
237
- "\n",
238
- "TOTAL_TIMESTEPS = 100_000 # ~10k episodes, ~20-25 min on T4\n",
239
- "\n",
240
- "\n",
241
- "class RewardLogger(BaseCallback):\n",
242
- " def __init__(self, curriculum):\n",
243
- " super().__init__()\n",
244
- " self.episode_rewards = []\n",
245
- " self._running = 0.0\n",
246
- " self._curriculum = curriculum\n",
247
- "\n",
248
- " def _on_step(self):\n",
249
- " for r, d in zip(\n",
250
- " self.locals.get(\"rewards\", []),\n",
251
- " self.locals.get(\"dones\", []),\n",
252
- " ):\n",
253
- " self._running += float(r)\n",
254
- " if d:\n",
255
- " ep = self._running\n",
256
- " self.episode_rewards.append(ep)\n",
257
- " self._running = 0.0\n",
258
- " advanced = self._curriculum.on_episode_end(ep)\n",
259
- " n = len(self.episode_rewards)\n",
260
- " if advanced or n % 50 == 0:\n",
261
- " _tlog(\n",
262
- " f\"Ep {n:5d} | reward {ep:+.3f} | \"\n",
263
- " f\"{self._curriculum.progress_str()}\"\n",
264
- " )\n",
265
- " return True\n",
266
- "\n",
267
- "\n",
268
- "def make_env():\n",
269
- " return SpindleFlowEnv(\n",
270
- " config_path=\"configs/training_config.yaml\",\n",
271
- " catalog_path=\"configs/specialist_catalog.yaml\",\n",
272
- " use_real_spindleflow=False,\n",
273
- " phase=1,\n",
274
- " simulate_specialists=True,\n",
275
- " )\n",
276
- "\n",
277
- "\n",
278
- "vec_env = DummyVecEnv([make_env])\n",
279
- "vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)\n",
280
- "\n",
281
- "_ppo = _cfg.get(\"ppo\", {})\n",
282
- "_lstm = _cfg.get(\"lstm\", {})\n",
283
- "\n",
284
- "model = RecurrentPPO(\n",
285
- " policy=\"MlpLstmPolicy\",\n",
286
- " env=vec_env,\n",
287
- " learning_rate=float(_ppo.get(\"learning_rate\", 3e-4)),\n",
288
- " n_steps=int(_ppo.get(\"n_steps\", 512)),\n",
289
- " batch_size=int(_ppo.get(\"batch_size\", 64)),\n",
290
- " n_epochs=int(_ppo.get(\"n_epochs\", 10)),\n",
291
- " gamma=float(_ppo.get(\"gamma\", 0.99)),\n",
292
- " gae_lambda=float(_ppo.get(\"gae_lambda\", 0.95)),\n",
293
- " clip_range=float(_ppo.get(\"clip_range\", 0.2)),\n",
294
- " ent_coef=float(_ppo.get(\"ent_coef\", 0.01)),\n",
295
- " vf_coef=float(_ppo.get(\"vf_coef\", 0.5)),\n",
296
- " max_grad_norm=float(_ppo.get(\"max_grad_norm\", 0.5)),\n",
297
- " policy_kwargs=build_policy_kwargs(\n",
298
- " hidden_size=int(_lstm.get(\"hidden_size\", 256))\n",
299
- " ),\n",
300
- " verbose=0,\n",
301
- " seed=int(_cfg.get(\"training\", {}).get(\"seed\", 42)),\n",
302
- " device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
303
- ")\n",
304
- "\n",
305
- "_tlog(f\"Device : {model.device}\")\n",
306
- "_tlog(f\"Total timesteps : {TOTAL_TIMESTEPS:,}\")\n",
307
- "_tlog(f\"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}\")\n",
308
- "_tlog(\"Training started...\")\n",
309
- "\n",
310
- "reward_logger = RewardLogger(curriculum=curriculum)\n",
311
- "checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path=\"/content/checkpoints/\")\n",
312
- "improvement_cb = SpecialistImprovementCallback(\n",
313
- " improve_every_n_episodes=_cfg.get(\"specialist_improvement\", {}).get(\n",
314
- " \"improve_every_n_episodes\", 100\n",
315
- " ),\n",
316
- " verbose=1,\n",
317
- ")\n",
318
- "\n",
319
- "_t0 = time.time()\n",
320
- "model.learn(\n",
321
- " total_timesteps=TOTAL_TIMESTEPS,\n",
322
- " callback=[reward_logger, checkpoint_cb, improvement_cb],\n",
323
- ")\n",
324
- "_elapsed = time.time() - _t0\n",
325
- "\n",
326
- "model.save(\"/content/spindleflow_colab_model\")\n",
327
- "vec_env.save(\"/content/vec_normalize_colab.pkl\")\n",
328
- "\n",
329
- "_tlog(f\"Training done in {_elapsed/60:.1f} min\")\n",
330
- "_tlog(f\"Episodes tracked : {len(reward_logger.episode_rewards)}\")\n",
331
- "_tlog(f\"Final curriculum : {curriculum.progress_str()}\")\n",
332
- "print(\"\\n✅ Model saved to /content/spindleflow_colab_model.zip\")"
333
- ],
334
- "outputs": [],
335
- "execution_count": null
336
- },
337
- {
338
- "cell_type": "markdown",
339
- "metadata": {},
340
- "source": [
341
- "## Cell 6 — Reward curve\n",
342
- "Generates a publication-quality plot and saves the smoothed reward curve as JSON for the HF Space demo."
343
- ]
344
- },
345
- {
346
- "cell_type": "code",
347
- "metadata": {},
348
- "source": [
349
- "import json\n",
350
- "import numpy as np\n",
351
- "import matplotlib\n",
352
- "matplotlib.use(\"Agg\")\n",
353
- "import matplotlib.pyplot as plt\n",
354
- "\n",
355
- "ep_rewards = reward_logger.episode_rewards\n",
356
- "if not ep_rewards:\n",
357
- " raise RuntimeError(\"No episodes completed — check Cell 5 output for errors.\")\n",
358
- "\n",
359
- "n_ep = len(ep_rewards)\n",
360
- "episodes = list(range(n_ep))\n",
361
- "window = max(30, n_ep // 20) # adaptive: ~5% of total\n",
362
- "\n",
363
- "smoothed = [\n",
364
- " float(np.mean(ep_rewards[max(0, i - window):i + 1]))\n",
365
- " for i in range(n_ep)\n",
366
- "]\n",
367
- "\n",
368
- "early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))\n",
369
- "final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))\n",
370
- "improvement = final_mean - early_mean\n",
371
- "\n",
372
- "# ── Save JSON ──────────────────────────────────────────────────\n",
373
- "step = max(1, n_ep // 300)\n",
374
- "json_data = {\n",
375
- " \"episodes\": episodes[::step],\n",
376
- " \"mean_rewards\": smoothed[::step],\n",
377
- "}\n",
378
- "with open(\"/content/demo/assets/reward_curve.json\", \"w\") as f:\n",
379
- " json.dump(json_data, f)\n",
380
- "print(f\"Saved reward_curve.json ({len(json_data['episodes'])} points)\")\n",
381
- "\n",
382
- "# ── Plot ───────────────────────────────────────────────────────\n",
383
- "fig, ax = plt.subplots(figsize=(11, 5), dpi=180)\n",
384
- "fig.patch.set_facecolor(\"#0d1117\")\n",
385
- "ax.set_facecolor(\"#161b22\")\n",
386
- "\n",
387
- "plot_every = max(1, n_ep // 800)\n",
388
- "ax.scatter(\n",
389
- " episodes[::plot_every], ep_rewards[::plot_every],\n",
390
- " s=4, alpha=0.25, color=\"#58a6ff\", zorder=2, label=\"Episode reward\",\n",
391
- ")\n",
392
- "ax.plot(\n",
393
- " episodes[::plot_every], smoothed[::plot_every],\n",
394
- " linewidth=2.5, color=\"#ff6b35\", zorder=3,\n",
395
- " label=f\"Smoothed ({window}-ep mean)\",\n",
396
- ")\n",
397
- "ax.axhline(\n",
398
- " y=early_mean, color=\"#94a3b8\", linestyle=\"--\", linewidth=1.2, alpha=0.75,\n",
399
- " label=f\"Early baseline {early_mean:+.3f}\",\n",
400
- ")\n",
401
- "ax.axhline(\n",
402
- " y=final_mean, color=\"#34d399\", linestyle=\"--\", linewidth=1.2, alpha=0.85,\n",
403
- " label=f\"Final mean {final_mean:+.3f}\",\n",
404
- ")\n",
405
- "\n",
406
- "ax.set_xlabel(\"Episode\", color=\"#c9d1d9\", fontsize=12)\n",
407
- "ax.set_ylabel(\"Reward\", color=\"#c9d1d9\", fontsize=12)\n",
408
- "ax.set_title(\n",
409
- " \"SpindleFlow RL — Delegation Policy Learning Curve\\n\"\n",
410
- " f\"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes\",\n",
411
- " color=\"#f0f6fc\", fontsize=13, fontweight=\"bold\", pad=14,\n",
412
- ")\n",
413
- "ax.tick_params(colors=\"#8b949e\")\n",
414
- "for spine in ax.spines.values():\n",
415
- " spine.set_edgecolor(\"#30363d\")\n",
416
- "ax.grid(color=\"#21262d\", linewidth=0.8, alpha=0.9)\n",
417
- "ax.legend(\n",
418
- " fontsize=10, framealpha=0.85,\n",
419
- " facecolor=\"#161b22\", edgecolor=\"#30363d\", labelcolor=\"#c9d1d9\",\n",
420
- ")\n",
421
- "\n",
422
- "sign = \"▲\" if improvement >= 0 else \"▼\"\n",
423
- "ax.annotate(\n",
424
- " f\" {sign} {abs(improvement):.3f} reward improvement\",\n",
425
- " xy=(n_ep * 0.65, (early_mean + final_mean) / 2),\n",
426
- " color=\"#f0f6fc\", fontsize=10, fontstyle=\"italic\",\n",
427
- ")\n",
428
- "\n",
429
- "fig.tight_layout()\n",
430
- "fig.savefig(\"/content/reward_curve.png\", dpi=180, bbox_inches=\"tight\",\n",
431
- " facecolor=fig.get_facecolor())\n",
432
- "plt.show()\n",
433
- "\n",
434
- "print(f\"\\n{'='*50}\")\n",
435
- "print(f\"Episodes completed : {n_ep:,}\")\n",
436
- "print(f\"Early baseline : {early_mean:+.4f}\")\n",
437
- "print(f\"Final mean : {final_mean:+.4f}\")\n",
438
- "print(f\"Improvement : {improvement:+.4f}\")\n",
439
- "print(f\"{'='*50}\")\n",
440
- "print(\"✅ Reward curve saved to /content/reward_curve.png\")\n",
441
- "\n",
442
- "_tlog(f\"Reward curve: early={early_mean:+.4f}, final={final_mean:+.4f}, improvement={improvement:+.4f}\")"
443
- ],
444
- "outputs": [],
445
- "execution_count": null
446
- },
447
- {
448
- "cell_type": "markdown",
449
- "metadata": {},
450
- "source": [
451
- "## Cell 7 — Learning features audit\n",
452
- "Reports whether each self-learning feature (curriculum, specialist memory, spawn memory, resolution bandit) left evidence of firing during training."
453
- ]
454
- },
455
- {
456
- "cell_type": "code",
457
- "metadata": {},
458
- "source": [
459
- "import os, json\n",
460
- "from pathlib import Path\n",
461
- "\n",
462
- "print(\"=\"*55)\n",
463
- "print(\"LEARNING FEATURES AUDIT\")\n",
464
- "print(\"=\"*55)\n",
465
- "\n",
466
- "# Feature 5 — Curriculum\n",
467
- "print(f\"\\nFeature 5 — Curriculum (performance-gated)\")\n",
468
- "print(f\" Final phase : {curriculum.current_phase}/3\")\n",
469
- "print(f\" Rolling mean reward: {curriculum.rolling_mean():.3f}\")\n",
470
- "print(f\" {curriculum.progress_str()}\")\n",
471
- "\n",
472
- "# Feature 2 — Specialist memory\n",
473
- "mem_path = Path(_cfg.get(\"specialist_improvement\", {}).get(\n",
474
- " \"memory_path\", \"data/specialist_memory.json\"\n",
475
- "))\n",
476
- "print(f\"\\nFeature 2 — Specialist memory ({mem_path})\")\n",
477
- "if mem_path.exists():\n",
478
- " data = json.loads(mem_path.read_text())\n",
479
- " total_entries = sum(len(v) for v in data.values())\n",
480
- " print(f\" Specialists with memory : {len(data)}\")\n",
481
- " print(f\" Total entries recorded : {total_entries}\")\n",
482
- " for sid, entries in list(data.items())[:3]:\n",
483
- " avg = sum(e[\"reward\"] for e in entries) / len(entries)\n",
484
- " print(f\" {sid}: {len(entries)} entries, avg_reward={avg:.3f}\")\n",
485
- "else:\n",
486
- " print(\" No memory file yet (finetuner may not have fired — normal below 100 episodes)\")\n",
487
- "\n",
488
- "# Feature 3 — Spawn memory\n",
489
- "spawn_path = Path(_cfg.get(\"environment\", {}).get(\n",
490
- " \"spawn_memory_path\", \"data/spawn_memory.jsonl\"\n",
491
- "))\n",
492
- "print(f\"\\nFeature 3 — Spawn memory ({spawn_path})\")\n",
493
- "if spawn_path.exists():\n",
494
- " lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]\n",
495
- " print(f\" Spawn records written: {len(lines)}\")\n",
496
- " for line in lines[:3]:\n",
497
- " rec = json.loads(line)\n",
498
- " print(f\" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} \"\n",
499
- " f\"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}\")\n",
500
- "else:\n",
501
- " print(\" No spawn memory yet (requires policy choosing SPAWN_SPECIALIST action)\")\n",
502
- "\n",
503
- "# Feature 4 — Resolution bandit\n",
504
- "res_path = Path(_cfg.get(\"agents\", {}).get(\n",
505
- " \"resolution_memory_path\", \"data/resolution_memory.jsonl\"\n",
506
- "))\n",
507
- "print(f\"\\nFeature 4 — Resolution bandit ({res_path})\")\n",
508
- "if res_path.exists():\n",
509
- " lines = [l for l in res_path.read_text().splitlines() if l.strip()]\n",
510
- " print(f\" Outcome records written: {len(lines)}\")\n",
511
- " stats = {}\n",
512
- " for line in lines:\n",
513
- " rec = json.loads(line)\n",
514
- " key = f\"{rec['conflict_type']}/{rec['template_key']}\"\n",
515
- " stats.setdefault(key, []).append(rec[\"quality_delta\"])\n",
516
- " for k, deltas in stats.items():\n",
517
- " print(f\" {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}\")\n",
518
- "else:\n",
519
- " print(\" No resolution memory yet (requires detected conflicts)\")\n",
520
- "\n",
521
- "print(\"\\n\" + \"=\"*55)\n",
522
- "print(\"✅ Audit complete\")\n",
523
- "print(\"=\"*55)"
524
- ],
525
- "outputs": [],
526
- "execution_count": null
527
- },
528
- {
529
- "cell_type": "markdown",
530
- "metadata": {},
531
- "source": [
532
- "## Cell 8 — Push to HuggingFace Hub\n",
533
- "\n",
534
- "Uploads the model checkpoint, VecNormalize statistics, reward curve (PNG and JSON), training log, and README to `garvitsachdeva/spindleflow-rl`."
535
- ]
536
- },
537
- {
538
- "cell_type": "code",
539
- "metadata": {},
540
- "source": [
541
- "import os, json\n",
542
- "import numpy as np\n",
543
- "from huggingface_hub import HfApi, CommitOperationAdd\n",
544
- "\n",
545
- "HF_REPO = \"garvitsachdeva/spindleflow-rl\"\n",
546
- "api = HfApi(token=HF_TOKEN)\n",
547
- "\n",
548
- "_tlog(f\"Pushing to https://huggingface.co/{HF_REPO} ...\")\n",
549
- "api.create_repo(repo_id=HF_REPO.split(\"/\")[-1], repo_type=\"model\", exist_ok=True)\n",
550
- "\n",
551
- "ep = reward_logger.episode_rewards\n",
552
- "f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0\n",
553
- "l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0\n",
554
- "\n",
555
- "readme_text = f\"\"\"---\n",
556
- "license: mit\n",
557
- "tags:\n",
558
- " - reinforcement-learning\n",
559
- " - stable-baselines3\n",
560
- " - sb3-contrib\n",
561
- " - gymnasium\n",
562
- " - multi-agent\n",
563
- " - openenv\n",
564
- "library_name: stable-baselines3\n",
565
- "---\n",
566
- "\n",
567
- "# SpindleFlow RL — Delegation Policy\n",
568
- "\n",
569
- "LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv). \n",
570
- "Trained on Google Colab T4 GPU.\n",
571
- "\n",
572
- "## Training summary\n",
573
- "| Metric | Value |\n",
574
- "|---|---|\n",
575
- "| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |\n",
576
- "| Total timesteps | {TOTAL_TIMESTEPS:,} |\n",
577
- "| Episodes completed | {len(ep):,} |\n",
578
- "| Early baseline (first 50 ep) | {early_mean:.4f} |\n",
579
- "| Final mean (last 200 ep) | {final_mean:.4f} |\n",
580
- "| Improvement | {final_mean - early_mean:+.4f} |\n",
581
- "| Training time | {_elapsed/60:.1f} min |\n",
582
- "| Device | T4 GPU |\n",
583
- "\n",
584
- "![Reward Curve](reward_curve.png)\n",
585
- "\n",
586
- "## Load\n",
587
- "```python\n",
588
- "from sb3_contrib import RecurrentPPO\n",
589
- "from huggingface_hub import hf_hub_download\n",
590
- "model = RecurrentPPO.load(hf_hub_download(\"{HF_REPO}\", \"spindleflow_model.zip\"))\n",
591
- "```\n",
592
- "\"\"\"\n",
593
- "\n",
594
- "readme_path = \"/content/README_model.md\"\n",
595
- "with open(readme_path, \"w\") as f:\n",
596
- " f.write(readme_text)\n",
597
- "\n",
598
- "candidates = [\n",
599
- " (\"/content/spindleflow_colab_model.zip\", \"spindleflow_model.zip\"),\n",
600
- " (\"/content/vec_normalize_colab.pkl\", \"vec_normalize.pkl\"),\n",
601
- " (\"/content/reward_curve.png\", \"reward_curve.png\"),\n",
602
- " (\"/content/demo/assets/reward_curve.json\", \"reward_curve.json\"),\n",
603
- " (\"/content/logs/training_log.txt\", \"training_log.txt\"),\n",
604
- " (readme_path, \"README.md\"),\n",
605
- "]\n",
606
- "\n",
607
- "ops = [\n",
608
- " CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)\n",
609
- " for src, dst in candidates\n",
610
- " if os.path.exists(src)\n",
611
- "]\n",
612
- "\n",
613
- "api.create_commit(\n",
614
- " repo_id=HF_REPO,\n",
615
- " repo_type=\"model\",\n",
616
- " operations=ops,\n",
617
- " commit_message=\"Add trained SpindleFlow RL policy (Colab T4)\",\n",
618
- " token=HF_TOKEN,\n",
619
- ")\n",
620
- "\n",
621
- "_tlog(f\"Uploaded {len(ops)} files:\")\n",
622
- "for src, dst in candidates:\n",
623
- " if os.path.exists(src):\n",
624
- " _tlog(f\" ✓ {dst}\")\n",
625
- "\n",
626
- "_tlog(f\"Model : https://huggingface.co/{HF_REPO}\")\n",
627
- "_tlog(f\"Training log: https://huggingface.co/{HF_REPO}/blob/main/training_log.txt\")\n",
628
- "_tlog(f\"Reward curve: https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png\")\n",
629
- "_tlog(f\"Improvement : {final_mean - early_mean:+.4f}\")\n",
630
- "print(\"\\n✅ All done!\")"
631
- ],
632
- "outputs": [],
633
- "execution_count": null
634
- }
635
- ]
636
- }