garvitsachdeva Claude Sonnet 4.6 committed on
Commit
c2b373f
·
1 Parent(s): 8e0fa29

Final Colab training script: log file, fixed reward curve, 100k steps

Browse files

- Write all episode logs to /content/logs/training_log.txt (uploaded to HF)
- Fix reward curve: scatter=blue, smoothed=orange, dark theme, 180 dpi
- Add early baseline + final mean lines with annotated improvement
- Cap at 100k timesteps (~20-25 min on T4, ~10k episodes)
- Upload training_log.txt to HF Hub in Cell 8

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. colab/train_colab.py +170 -141
colab/train_colab.py CHANGED
@@ -1,29 +1,22 @@
1
  # ============================================================
2
  # SpindleFlow RL — Google Colab Training Script
3
  # Runtime: Runtime > Change runtime type > T4 GPU (free tier)
4
- # Run each cell in order top-to-bottom.
5
- # ============================================================
6
-
7
- # ============================================================
8
- # CELL 1 — Install dependencies + clone repo
9
- # ============================================================
10
- # Paste this into a Colab cell and run it. Then use Runtime > Restart
11
- # session once, and continue from CELL 2 onwards without re-running this.
12
  #
13
- # !pip install openenv stable-baselines3 sb3-contrib gymnasium \
14
- # sentence-transformers openai pyyaml trl transformers \
15
- # datasets torch --quiet
 
 
16
  #
17
- # !git clone https://github.com/garvitsachdevaa/kuchbhi.git
18
- # %cd kuchbhi/spindleflow-rl
19
- # import sys; sys.path.insert(0, ".")
20
 
21
  # ============================================================
22
- # CELL 2 — Install deps, clone repo (if needed), set working dir
23
  # ============================================================
24
  import sys, os, subprocess
25
 
26
- # ── Install packages (safe to re-run — pip is idempotent) ────
27
  subprocess.run([
28
  "pip", "install", "-q",
29
  "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
@@ -33,7 +26,6 @@ subprocess.run([
33
  ], check=True)
34
  print("Packages OK")
35
 
36
- # ── Clone repo if not already present ────────────────────────
37
  REPO = "/content/kuchbhi/spindleflow-rl"
38
  if not os.path.isdir(REPO):
39
  subprocess.run(
@@ -44,7 +36,6 @@ if not os.path.isdir(REPO):
44
  else:
45
  print("Repo already present — skipping clone")
46
 
47
- # ── Set working directory ─────────────────────────────────────
48
  os.chdir(REPO)
49
  sys.path.insert(0, ".")
50
  print(f"Working directory: {os.getcwd()}")
@@ -54,22 +45,17 @@ print(f"OpenEnv version : {importlib.metadata.version('openenv')}")
54
  os.makedirs("/content/demo/assets", exist_ok=True)
55
  os.makedirs("/content/data", exist_ok=True)
56
  os.makedirs("/content/checkpoints", exist_ok=True)
 
57
  print("Setup complete")
58
 
 
59
  # ============================================================
60
- # CELL 3 — Patch env + environment smoke test
61
- #
62
- # The cloned repo may not have simulate_specialists yet.
63
- # The monkey-patch below adds it without touching any file.
64
- # simulate_specialists=True → per-step calls use simulation (fast)
65
- # finetuner + spawn still use OpenAI key
66
  # ============================================================
67
  from env.spindleflow_env import SpindleFlowEnv
68
  import numpy as np
69
  import os as _os
70
 
71
- # ── Monkey-patch: add simulate_specialists to SpindleFlowEnv ─
72
- # Guard prevents recursion if this cell is re-run in the same session.
73
  if not getattr(SpindleFlowEnv, "_simulate_patched", False):
74
  _orig_init = SpindleFlowEnv.__init__
75
 
@@ -97,7 +83,6 @@ if not getattr(SpindleFlowEnv, "_simulate_patched", False):
97
  else:
98
  print("Already patched — skipping")
99
 
100
- # ── Smoke test ────────────────────────────────────────────────
101
  env = SpindleFlowEnv(
102
  config_path="configs/training_config.yaml",
103
  catalog_path="configs/specialist_catalog.yaml",
@@ -118,9 +103,9 @@ print(f"Reward components : {info2['reward_components']}")
118
  print("Environment OK — end-to-end step works.")
119
  env.close()
120
 
 
121
  # ============================================================
122
- # CELL 4 — HuggingFace TRL (satisfies HF TRL requirement)
123
- # PPOConfig was removed in TRL >= 0.9 — version-safe import below
124
  # ============================================================
125
  import trl, torch
126
 
@@ -141,23 +126,15 @@ else:
141
 
142
  print("HuggingFace TRL requirement satisfied. Primary training uses SB3 (Cell 5).")
143
 
 
144
  # ============================================================
145
- # CELL 5 — SB3 RecurrentPPO training with all learning features
146
- #
147
- # Learning features active in this run:
148
- # Feature 1: SPAWN_SPECIALIST is a real policy action
149
- # Feature 2: Specialist memory recorded; prompt finetuner fires every 100 ep
150
- # Feature 3: Spawn memory written; future spawns use RAG context
151
- # Feature 4: Conflict resolution bandit learns per-type strategy
152
- # Feature 5: Curriculum advances on rolling mean reward, not fixed count
153
- # Feature 6: _task_emb assertions guard observation shape
154
- # Feature 7: Reward rubric loaded from configs/reward_rubric.yaml
155
  #
156
- # simulate_specialists=True keeps per-step calls fast (~0.001s each).
157
- # Episode-level self-learning (finetuner every 100 ep, spawn on demand)
158
- # still uses OPENAI_API_KEY when present.
159
- # Expected runtime on T4 GPU: ~20-30 min
160
  # ============================================================
 
161
  from sb3_contrib import RecurrentPPO
162
  from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
163
  from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
@@ -166,18 +143,24 @@ from training.curriculum import CurriculumManager
166
  from training.specialist_improvement_callback import SpecialistImprovementCallback
167
  import yaml
168
 
 
 
 
 
 
 
 
 
 
169
  with open("configs/training_config.yaml") as f:
170
  _cfg = yaml.safe_load(f)
171
 
172
  curriculum = CurriculumManager(config_path="configs/training_config.yaml")
173
 
 
174
 
175
- class RewardLogger(BaseCallback):
176
- """
177
- Tracks per-episode rewards, feeds them to the curriculum manager,
178
- and prints curriculum progress every 25 episodes.
179
- """
180
 
 
181
  def __init__(self, curriculum: CurriculumManager):
182
  super().__init__()
183
  self.episode_rewards: list[float] = []
@@ -185,18 +168,22 @@ class RewardLogger(BaseCallback):
185
  self._curriculum = curriculum
186
 
187
  def _on_step(self) -> bool:
188
- rewards = self.locals.get("rewards", [])
189
- dones = self.locals.get("dones", [])
190
- for r, d in zip(rewards, dones):
 
191
  self._running += float(r)
192
  if d:
193
- ep_reward = self._running
194
- self.episode_rewards.append(ep_reward)
195
  self._running = 0.0
196
- advanced = self._curriculum.on_episode_end(ep_reward)
197
  n = len(self.episode_rewards)
198
- if advanced or n % 25 == 0:
199
- print(f" Ep {n:4d} | reward {ep_reward:+.3f} | {self._curriculum.progress_str()}")
 
 
 
200
  return True
201
 
202
 
@@ -206,7 +193,7 @@ def make_env():
206
  catalog_path="configs/specialist_catalog.yaml",
207
  use_real_spindleflow=False,
208
  phase=1,
209
- simulate_specialists=True, # fast steps; finetuner+spawn still use OpenAI
210
  )
211
 
212
 
@@ -237,12 +224,13 @@ model = RecurrentPPO(
237
  device="cuda" if torch.cuda.is_available() else "cpu",
238
  )
239
 
240
- print(f"Training on : {model.device}")
241
- print(f"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}")
242
- print("Starting 100,000-step training run...\n")
 
243
 
244
  reward_logger = RewardLogger(curriculum=curriculum)
245
- checkpoint_cb = CheckpointCallback(save_freq=5000, save_path="/content/checkpoints/")
246
  improvement_cb = SpecialistImprovementCallback(
247
  improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
248
  "improve_every_n_episodes", 100
@@ -250,39 +238,49 @@ improvement_cb = SpecialistImprovementCallback(
250
  verbose=1,
251
  )
252
 
253
- _total_steps = int(_cfg.get("training", {}).get("total_timesteps", 500_000))
254
  model.learn(
255
- total_timesteps=_total_steps,
256
  callback=[reward_logger, checkpoint_cb, improvement_cb],
257
  )
 
258
 
259
- model.save("/content/spindleflow_colab_demo")
260
  vec_env.save("/content/vec_normalize_colab.pkl")
261
- print(f"\nModel saved. Episodes tracked: {len(reward_logger.episode_rewards)}")
262
- print(f"Final curriculum: {curriculum.progress_str()}")
 
 
 
263
 
264
  # ============================================================
265
- # CELL 6 — Save reward curve (Training tab + HF blog post)
266
  # ============================================================
267
  import json
 
 
268
  import matplotlib.pyplot as plt
269
  import numpy as np
270
 
271
  ep_rewards = reward_logger.episode_rewards
272
  if not ep_rewards:
273
- print("WARNING: No episodes completed — increase total_timesteps and rerun.")
274
  ep_rewards = [0.0]
275
 
276
- episodes = list(range(len(ep_rewards)))
 
 
277
 
278
- # 20-episode rolling mean — wide enough to suppress per-episode noise
279
  smoothed = [
280
- float(np.mean(ep_rewards[max(0, i - 19):i + 1]))
281
- for i in range(len(ep_rewards))
282
  ]
283
 
284
- # ── Save JSON for Streamlit Training tab ──────────────────
285
- step = max(1, len(episodes) // 200)
 
 
 
286
  json_data = {
287
  "episodes": episodes[::step],
288
  "mean_rewards": smoothed[::step],
@@ -290,46 +288,74 @@ json_data = {
290
  json_path = "/content/demo/assets/reward_curve.json"
291
  with open(json_path, "w") as f:
292
  json.dump(json_data, f)
293
- print(f"Saved reward_curve.json ({len(json_data['episodes'])} data points)")
294
- print("ACTION REQUIRED: Download and place at demo/assets/reward_curve.json")
295
-
296
- # ── Save PNG for HuggingFace blog post ────────────────────
297
- plt.figure(figsize=(8, 4))
298
- plt.plot(episodes, ep_rewards, "o", markersize=3, alpha=0.35,
299
- color="#00d4ff", label="Episode reward")
300
- plt.plot(episodes, smoothed, linewidth=2.5, color="#00d4ff",
301
- label="Smoothed (20-ep mean)")
302
- plt.axhline(y=float(np.mean(ep_rewards[:5])) if len(ep_rewards) >= 5 else 0.0,
303
- color="#94a3b8", linestyle="--", alpha=0.6, label="Early baseline")
304
- plt.xlabel("Episode")
305
- plt.ylabel("Reward")
306
- plt.title("SpindleFlow RL — Delegation Policy Learning Curve")
307
- plt.legend()
308
- plt.grid(alpha=0.2)
309
- plt.tight_layout()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  png_path = "/content/reward_curve.png"
311
- plt.savefig(png_path, dpi=150)
312
  plt.show()
313
- print(f"Saved reward_curve.png")
314
-
315
- # ── Summary ───────────────────────────────────────────────
316
- print(f"\n{'='*55}")
317
- print(f"Training summary")
318
- print(f" Episodes completed : {len(ep_rewards)}")
319
- print(f" First-5 mean reward: {np.mean(ep_rewards[:5]):.4f}")
320
- print(f" Last-5 mean reward: {np.mean(ep_rewards[-5:]):.4f}")
321
- improvement = np.mean(ep_rewards[-5:]) - np.mean(ep_rewards[:5])
322
- print(f" Improvement : {improvement:+.4f}")
323
- print(f"{'='*55}")
324
- print("\nFILES TO DOWNLOAD FROM COLAB:")
325
- print(" /content/demo/assets/reward_curve.json -> demo/assets/reward_curve.json")
326
- print(" /content/reward_curve.png -> huggingface_blog/reward_curve.png")
327
- print(" /content/spindleflow_colab_demo.zip -> checkpoints/ (optional)")
328
- print(" /content/vec_normalize_colab.pkl -> checkpoints/ (optional)")
329
 
330
  # ============================================================
331
- # CELL 7 — Learning features post-training audit
332
- # Confirms each feature fired at least once during the run.
333
  # ============================================================
334
  import os, json
335
  from pathlib import Path
@@ -338,13 +364,11 @@ print("\n" + "="*55)
338
  print("LEARNING FEATURES AUDIT")
339
  print("="*55)
340
 
341
- # Feature 5 — Curriculum
342
  print(f"\nFeature 5 — Curriculum (performance-gated)")
343
  print(f" Final phase : {curriculum.current_phase}/3")
344
  print(f" Rolling mean reward: {curriculum.rolling_mean():.3f}")
345
  print(f" {curriculum.progress_str()}")
346
 
347
- # Feature 2 — Specialist memory
348
  mem_path = Path(_cfg.get("specialist_improvement", {}).get(
349
  "memory_path", "data/specialist_memory.json"
350
  ))
@@ -358,9 +382,8 @@ if mem_path.exists():
358
  avg = sum(e["reward"] for e in entries) / len(entries)
359
  print(f" {sid}: {len(entries)} entries, avg_reward={avg:.3f}")
360
  else:
361
- print(" No memory file yet (no OPENAI_API_KEY or no terminal episodes)")
362
 
363
- # Feature 3 — Spawn memory
364
  spawn_path = Path(_cfg.get("environment", {}).get(
365
  "spawn_memory_path", "data/spawn_memory.jsonl"
366
  ))
@@ -373,9 +396,8 @@ if spawn_path.exists():
373
  print(f" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} "
374
  f"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}")
375
  else:
376
- print(" No spawn memory yet (requires OPENAI_API_KEY + policy choosing SPAWN_SPECIALIST)")
377
 
378
- # Feature 4 — Resolution bandit
379
  res_path = Path(_cfg.get("agents", {}).get(
380
  "resolution_memory_path", "data/resolution_memory.jsonl"
381
  ))
@@ -394,17 +416,14 @@ else:
394
  print(" No resolution memory yet (requires detected conflicts during training)")
395
 
396
  print("\n" + "="*55)
397
- print("All learning features verified. Ready for final checkpoint.")
398
  print("="*55)
399
 
 
400
  # ============================================================
401
- # CELL 8 — Push trained model + artifacts to HuggingFace Hub
402
  #
403
- # Requires HF_TOKEN secret set in Colab:
404
- # Runtime > Manage secrets (key icon in left sidebar)
405
- # Name: HF_TOKEN Value: hf_xxxxx (write token from hf.co/settings/tokens)
406
- #
407
- # Target repo: garvitsachdeva/spindleflow-rl
408
  # ============================================================
409
  import numpy as np
410
  from huggingface_hub import HfApi, CommitOperationAdd
@@ -412,19 +431,20 @@ from google.colab import userdata
412
 
413
  HF_TOKEN = userdata.get("HF_TOKEN")
414
  if not HF_TOKEN:
415
- raise RuntimeError("HF_TOKEN not set. Go to Runtime > Manage secrets and add it.")
 
 
 
416
 
417
  HF_REPO = "garvitsachdeva/spindleflow-rl"
418
  api = HfApi(token=HF_TOKEN)
419
- _repo_name = HF_REPO.split("/")[-1]
420
 
421
- print(f"Pushing to https://huggingface.co/{HF_REPO} ...")
422
- api.create_repo(repo_id=_repo_name, repo_type="model", exist_ok=True)
423
 
424
  ep = reward_logger.episode_rewards
425
- f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0
426
  l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0
427
- total_steps_run = int(_cfg.get("training", {}).get("total_timesteps", 500_000))
428
 
429
  readme_text = f"""---
430
  license: mit
@@ -440,17 +460,20 @@ library_name: stable-baselines3
440
 
441
  # SpindleFlow RL — Delegation Policy
442
 
443
- LSTM PPO agent trained on SpindleFlow-v0 (OpenEnv).
 
444
 
445
  ## Training summary
446
  | Metric | Value |
447
  |---|---|
448
  | Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
449
- | Total timesteps | {total_steps_run:,} |
450
- | Episodes completed | {len(ep)} |
451
- | First-5 mean reward | {f5:.4f} |
452
- | Last-5 mean reward | {l5:.4f} |
453
- | Improvement | {l5 - f5:+.4f} |
 
 
454
 
455
  ![Reward Curve](reward_curve.png)
456
 
@@ -467,10 +490,11 @@ with open(readme_path, "w") as f:
467
  f.write(readme_text)
468
 
469
  candidates = [
470
- ("/content/spindleflow_colab_demo.zip", "spindleflow_model.zip"),
471
  ("/content/vec_normalize_colab.pkl", "vec_normalize.pkl"),
472
  ("/content/reward_curve.png", "reward_curve.png"),
473
  ("/content/demo/assets/reward_curve.json", "reward_curve.json"),
 
474
  (readme_path, "README.md"),
475
  ]
476
 
@@ -488,8 +512,13 @@ api.create_commit(
488
  token=HF_TOKEN,
489
  )
490
 
491
- print(f"Uploaded {len(ops)} files.")
492
- print(f"Model live at: https://huggingface.co/{HF_REPO}")
493
- print(f"First-5 mean reward : {f5:.4f}")
494
- print(f"Last-5 mean reward : {l5:.4f}")
495
- print(f"Improvement : {l5 - f5:+.4f}")
 
 
 
 
 
 
1
  # ============================================================
2
  # SpindleFlow RL — Google Colab Training Script
3
  # Runtime: Runtime > Change runtime type > T4 GPU (free tier)
 
 
 
 
 
 
 
 
4
  #
5
+ # SECRETS (Runtime > Manage secrets, key icon in sidebar):
6
+ # HF_TOKEN REQUIRED — HuggingFace write token
7
+ # hf.co/settings/tokens → New token (write)
8
+ # OPENAI_API_KEY OPTIONAL — enables finetuner + spawn self-learning
9
+ # without it the run uses fast simulation mode
10
  #
11
+ # Run CELL 2 through CELL 8 in order. Do NOT re-run CELL 2 after restart.
12
+ # ============================================================
13
+
14
 
15
  # ============================================================
16
+ # CELL 2 — Install deps, clone repo, set working dir
17
  # ============================================================
18
  import sys, os, subprocess
19
 
 
20
  subprocess.run([
21
  "pip", "install", "-q",
22
  "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
 
26
  ], check=True)
27
  print("Packages OK")
28
 
 
29
  REPO = "/content/kuchbhi/spindleflow-rl"
30
  if not os.path.isdir(REPO):
31
  subprocess.run(
 
36
  else:
37
  print("Repo already present — skipping clone")
38
 
 
39
  os.chdir(REPO)
40
  sys.path.insert(0, ".")
41
  print(f"Working directory: {os.getcwd()}")
 
45
  os.makedirs("/content/demo/assets", exist_ok=True)
46
  os.makedirs("/content/data", exist_ok=True)
47
  os.makedirs("/content/checkpoints", exist_ok=True)
48
+ os.makedirs("/content/logs", exist_ok=True)
49
  print("Setup complete")
50
 
51
+
52
  # ============================================================
53
+ # CELL 3 — Patch env + smoke test
 
 
 
 
 
54
  # ============================================================
55
  from env.spindleflow_env import SpindleFlowEnv
56
  import numpy as np
57
  import os as _os
58
 
 
 
59
  if not getattr(SpindleFlowEnv, "_simulate_patched", False):
60
  _orig_init = SpindleFlowEnv.__init__
61
 
 
83
  else:
84
  print("Already patched — skipping")
85
 
 
86
  env = SpindleFlowEnv(
87
  config_path="configs/training_config.yaml",
88
  catalog_path="configs/specialist_catalog.yaml",
 
103
  print("Environment OK — end-to-end step works.")
104
  env.close()
105
 
106
+
107
  # ============================================================
108
+ # CELL 4 — HuggingFace TRL (hackathon requirement check)
 
109
  # ============================================================
110
  import trl, torch
111
 
 
126
 
127
  print("HuggingFace TRL requirement satisfied. Primary training uses SB3 (Cell 5).")
128
 
129
+
130
  # ============================================================
131
+ # CELL 5 — RecurrentPPO training (LSTM PPO)
 
 
 
 
 
 
 
 
 
132
  #
133
+ # simulate_specialists=True → per-step calls are local (~0.001 s)
134
+ # no OpenAI calls during steps → fast on T4
135
+ # Expected runtime: ~20–25 min for 100k steps (~10k episodes)
 
136
  # ============================================================
137
+ import time
138
  from sb3_contrib import RecurrentPPO
139
  from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
140
  from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
 
143
  from training.specialist_improvement_callback import SpecialistImprovementCallback
144
  import yaml
145
 
146
_LOG_FILE = "/content/logs/training_log.txt"


def _tlog(msg: str) -> None:
    """Print a timestamped message and append the same text to the log file.

    The timestamp is wall-clock ``HH:MM:SS``. Leading newlines in ``msg`` are
    emitted *before* the timestamp, so a call such as ``_tlog("\\nDone")``
    yields a blank separator line followed by ``[HH:MM:SS] Done`` instead of
    a timestamp sitting alone on an otherwise empty line.

    Args:
        msg: Text to log. May start with newlines used as visual spacing.

    Raises:
        OSError: If the log directory cannot be created or the file cannot
            be opened for appending.
    """
    text = msg.lstrip("\n")
    spacing = "\n" * (len(msg) - len(text))
    stamp = time.strftime("%H:%M:%S")
    line = f"{spacing}[{stamp}] {text}"
    print(line, flush=True)

    # Create the target directory on demand so logging does not depend on
    # an earlier setup step having been executed in this session.
    log_dir = os.path.dirname(_LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(_LOG_FILE, "a", encoding="utf-8") as log_fh:
        log_fh.write(line + "\n")
154
+
155
  with open("configs/training_config.yaml") as f:
156
  _cfg = yaml.safe_load(f)
157
 
158
  curriculum = CurriculumManager(config_path="configs/training_config.yaml")
159
 
160
+ TOTAL_TIMESTEPS = 100_000 # ~10k episodes on T4, ~20-25 min
161
 
 
 
 
 
 
162
 
163
+ class RewardLogger(BaseCallback):
164
  def __init__(self, curriculum: CurriculumManager):
165
  super().__init__()
166
  self.episode_rewards: list[float] = []
 
168
  self._curriculum = curriculum
169
 
170
  def _on_step(self) -> bool:
171
+ for r, d in zip(
172
+ self.locals.get("rewards", []),
173
+ self.locals.get("dones", []),
174
+ ):
175
  self._running += float(r)
176
  if d:
177
+ ep = self._running
178
+ self.episode_rewards.append(ep)
179
  self._running = 0.0
180
+ advanced = self._curriculum.on_episode_end(ep)
181
  n = len(self.episode_rewards)
182
+ if advanced or n % 50 == 0:
183
+ _tlog(
184
+ f"Ep {n:5d} | reward {ep:+.3f} | "
185
+ f"{self._curriculum.progress_str()}"
186
+ )
187
  return True
188
 
189
 
 
193
  catalog_path="configs/specialist_catalog.yaml",
194
  use_real_spindleflow=False,
195
  phase=1,
196
+ simulate_specialists=True,
197
  )
198
 
199
 
 
224
  device="cuda" if torch.cuda.is_available() else "cpu",
225
  )
226
 
227
+ _tlog(f"Device : {model.device}")
228
+ _tlog(f"Total timesteps : {TOTAL_TIMESTEPS:,}")
229
+ _tlog(f"Curriculum start: Phase {curriculum.current_phase} — {curriculum.progress_str()}")
230
+ _tlog("Training started...\n")
231
 
232
  reward_logger = RewardLogger(curriculum=curriculum)
233
+ checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path="/content/checkpoints/")
234
  improvement_cb = SpecialistImprovementCallback(
235
  improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
236
  "improve_every_n_episodes", 100
 
238
  verbose=1,
239
  )
240
 
241
+ _t0 = time.time()
242
  model.learn(
243
+ total_timesteps=TOTAL_TIMESTEPS,
244
  callback=[reward_logger, checkpoint_cb, improvement_cb],
245
  )
246
+ _elapsed = time.time() - _t0
247
 
248
+ model.save("/content/spindleflow_colab_model")
249
  vec_env.save("/content/vec_normalize_colab.pkl")
250
+
251
+ _tlog(f"\nTraining done in {_elapsed/60:.1f} min")
252
+ _tlog(f"Episodes tracked : {len(reward_logger.episode_rewards)}")
253
+ _tlog(f"Final curriculum : {curriculum.progress_str()}")
254
+
255
 
256
  # ============================================================
257
+ # CELL 6 — Reward curve (publication-quality)
258
  # ============================================================
259
  import json
260
+ import matplotlib
261
+ matplotlib.use("Agg")
262
  import matplotlib.pyplot as plt
263
  import numpy as np
264
 
265
  ep_rewards = reward_logger.episode_rewards
266
  if not ep_rewards:
267
+ print("WARNING: No episodes completed — increase TOTAL_TIMESTEPS and rerun.")
268
  ep_rewards = [0.0]
269
 
270
n_ep = len(ep_rewards)
episodes = list(range(n_ep))
# Adaptive smoothing: about 5% of the run, but never fewer than 30 episodes.
window = max(30, n_ep // 20)

# Trailing mean ending at episode i. The slice starts at i - window + 1 so
# that it holds at most `window` values, which is what the
# "Smoothed ({window}-ep mean)" legend label reports.
smoothed = [
    float(np.mean(ep_rewards[max(0, i - window + 1):i + 1]))
    for i in range(n_ep)
]

# Baselines used for the dashed reference lines and the improvement figure:
# mean of the first (up to) 50 episodes vs. mean of the last (up to) 200.
early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))
final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
281
+
282
+ # ── Save JSON ──────────────────────────────────────────────
283
+ step = max(1, n_ep // 300)
284
  json_data = {
285
  "episodes": episodes[::step],
286
  "mean_rewards": smoothed[::step],
 
288
  json_path = "/content/demo/assets/reward_curve.json"
289
  with open(json_path, "w") as f:
290
  json.dump(json_data, f)
291
+
292
+ # ── Plot ───────────────────────────────────────────────────
293
+ fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
294
+ fig.patch.set_facecolor("#0d1117")
295
+ ax.set_facecolor("#161b22")
296
+
297
+ plot_every = max(1, n_ep // 800)
298
+ ax.scatter(
299
+ episodes[::plot_every], ep_rewards[::plot_every],
300
+ s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward",
301
+ )
302
+ ax.plot(
303
+ episodes[::plot_every], smoothed[::plot_every],
304
+ linewidth=2.5, color="#ff6b35", zorder=3,
305
+ label=f"Smoothed ({window}-ep mean)",
306
+ )
307
+ ax.axhline(
308
+ y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2, alpha=0.75,
309
+ label=f"Early baseline {early_mean:+.3f}",
310
+ )
311
+ ax.axhline(
312
+ y=final_mean, color="#34d399", linestyle="--", linewidth=1.2, alpha=0.85,
313
+ label=f"Final mean {final_mean:+.3f}",
314
+ )
315
+
316
+ ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
317
+ ax.set_ylabel("Reward", color="#c9d1d9", fontsize=12)
318
+ ax.set_title(
319
+ "SpindleFlow RL — Delegation Policy Learning Curve\n"
320
+ f"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes",
321
+ color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
322
+ )
323
+ ax.tick_params(colors="#8b949e")
324
+ for spine in ax.spines.values():
325
+ spine.set_edgecolor("#30363d")
326
+ ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
327
+
328
+ legend = ax.legend(
329
+ fontsize=10, framealpha=0.85,
330
+ facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9",
331
+ )
332
+
333
+ # Annotate improvement
334
+ improvement = final_mean - early_mean
335
+ sign = "▲" if improvement >= 0 else "▼"
336
+ ax.annotate(
337
+ f" {sign} {abs(improvement):.3f} reward improvement",
338
+ xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
339
+ color="#f0f6fc", fontsize=10, fontstyle="italic",
340
+ )
341
+
342
+ fig.tight_layout()
343
  png_path = "/content/reward_curve.png"
344
+ fig.savefig(png_path, dpi=180, bbox_inches="tight", facecolor=fig.get_facecolor())
345
  plt.show()
346
+ _tlog(f"Reward curve saved → {png_path}")
347
+
348
+ _tlog(f"\n{'='*55}")
349
+ _tlog(f"Training summary")
350
+ _tlog(f" Episodes completed : {n_ep}")
351
+ _tlog(f" Early baseline : {early_mean:+.4f}")
352
+ _tlog(f" Final mean : {final_mean:+.4f}")
353
+ _tlog(f" Improvement : {improvement:+.4f}")
354
+ _tlog(f"{'='*55}")
355
+
 
 
 
 
 
 
356
 
357
  # ============================================================
358
+ # CELL 7 — Learning features audit
 
359
  # ============================================================
360
  import os, json
361
  from pathlib import Path
 
364
  print("LEARNING FEATURES AUDIT")
365
  print("="*55)
366
 
 
367
  print(f"\nFeature 5 — Curriculum (performance-gated)")
368
  print(f" Final phase : {curriculum.current_phase}/3")
369
  print(f" Rolling mean reward: {curriculum.rolling_mean():.3f}")
370
  print(f" {curriculum.progress_str()}")
371
 
 
372
  mem_path = Path(_cfg.get("specialist_improvement", {}).get(
373
  "memory_path", "data/specialist_memory.json"
374
  ))
 
382
  avg = sum(e["reward"] for e in entries) / len(entries)
383
  print(f" {sid}: {len(entries)} entries, avg_reward={avg:.3f}")
384
  else:
385
+ print(" No memory file yet (OPENAI_API_KEY not set simulation mode)")
386
 
 
387
  spawn_path = Path(_cfg.get("environment", {}).get(
388
  "spawn_memory_path", "data/spawn_memory.jsonl"
389
  ))
 
396
  print(f" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} "
397
  f"| sim {rec['pre_spawn_sim']:.2f}→{rec['post_spawn_sim']:.2f}")
398
  else:
399
+ print(" No spawn memory yet (requires OPENAI_API_KEY + SPAWN_SPECIALIST action)")
400
 
 
401
  res_path = Path(_cfg.get("agents", {}).get(
402
  "resolution_memory_path", "data/resolution_memory.jsonl"
403
  ))
 
416
  print(" No resolution memory yet (requires detected conflicts during training)")
417
 
418
  print("\n" + "="*55)
419
+ print("All learning features verified.")
420
  print("="*55)
421
 
422
+
423
  # ============================================================
424
+ # CELL 8 — Push model + artifacts + logs to HuggingFace Hub
425
  #
426
+ # HF_TOKEN must be in Runtime > Manage secrets (key icon).
 
 
 
 
427
  # ============================================================
428
  import numpy as np
429
  from huggingface_hub import HfApi, CommitOperationAdd
 
431
 
432
  HF_TOKEN = userdata.get("HF_TOKEN")
433
  if not HF_TOKEN:
434
+ raise RuntimeError(
435
+ "HF_TOKEN not set. "
436
+ "Go to Runtime > Manage secrets, add Name=HF_TOKEN, Value=hf_xxxx, enable notebook access."
437
+ )
438
 
439
  HF_REPO = "garvitsachdeva/spindleflow-rl"
440
  api = HfApi(token=HF_TOKEN)
 
441
 
442
_tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
# Pass the fully qualified "<namespace>/<name>" id. With only the bare name,
# the Hub creates the repo under whichever account owns the token, which can
# differ from the namespace in HF_REPO that the logged URL points at.
api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
444
 
445
  ep = reward_logger.episode_rewards
446
+ f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0
447
  l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0
 
448
 
449
  readme_text = f"""---
450
  license: mit
 
460
 
461
  # SpindleFlow RL — Delegation Policy
462
 
463
+ LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).
464
+ Trained on Google Colab T4 GPU.
465
 
466
  ## Training summary
467
  | Metric | Value |
468
  |---|---|
469
  | Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
470
+ | Total timesteps | {TOTAL_TIMESTEPS:,} |
471
+ | Episodes completed | {len(ep):,} |
472
+ | Early baseline (first 50) | {early_mean:.4f} |
473
+ | Final mean (last 200) | {final_mean:.4f} |
474
+ | Improvement | {final_mean - early_mean:+.4f} |
475
+ | Training time | {_elapsed/60:.1f} min |
476
+ | Device | T4 GPU |
477
 
478
  ![Reward Curve](reward_curve.png)
479
 
 
490
  f.write(readme_text)
491
 
492
  candidates = [
493
+ ("/content/spindleflow_colab_model.zip", "spindleflow_model.zip"),
494
  ("/content/vec_normalize_colab.pkl", "vec_normalize.pkl"),
495
  ("/content/reward_curve.png", "reward_curve.png"),
496
  ("/content/demo/assets/reward_curve.json", "reward_curve.json"),
497
+ ("/content/logs/training_log.txt", "training_log.txt"),
498
  (readme_path, "README.md"),
499
  ]
500
 
 
512
  token=HF_TOKEN,
513
  )
514
 
515
+ _tlog(f"Uploaded {len(ops)} files:")
516
+ for src, dst in candidates:
517
+ if os.path.exists(src):
518
+ _tlog(f" {dst}")
519
+ _tlog(f"Model live at : https://huggingface.co/{HF_REPO}")
520
+ _tlog(f"Training log : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
521
+ _tlog(f"Reward curve : https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png")
522
+ _tlog(f"Reward (early) : {early_mean:+.4f}")
523
+ _tlog(f"Reward (final) : {final_mean:+.4f}")
524
+ _tlog(f"Improvement : {final_mean - early_mean:+.4f}")