garvitsachdeva Claude Sonnet 4.6 commited on
Commit
956acce
Β·
1 Parent(s): c8407b4

Rewrite train_colab.py: clean cells, fix audioop-lts, log file, reward curve

Browse files
Files changed (1) hide show
  1. colab/train_colab.py +201 -229
colab/train_colab.py CHANGED
@@ -1,61 +1,102 @@
1
  # ============================================================
2
- # SpindleFlow RL β€” Google Colab Training Script
3
- # Runtime: Runtime > Change runtime type > T4 GPU (free tier)
4
  #
5
- # SECRETS (Runtime > Manage secrets β€” key icon in sidebar):
6
- # HF_TOKEN REQUIRED β€” HuggingFace write token
7
- # hf.co/settings/tokens β†’ New token (write)
8
- # OPENAI_API_KEY OPTIONAL β€” enables finetuner + spawn self-learning
9
- # without it the run uses fast simulation mode
10
  #
11
- # Run CELL 2 through CELL 8 in order. Do NOT re-run CELL 2 after restart.
 
 
 
 
 
 
12
  # ============================================================
13
 
14
 
15
  # ============================================================
16
- # CELL 2 β€” Install deps, clone repo, set working dir
17
  # ============================================================
18
- import sys, os, subprocess
 
 
19
 
20
- subprocess.run([
21
- "pip", "install", "-q",
22
  "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
23
  "sentence-transformers", "openai", "pyyaml", "trl",
24
- "transformers", "datasets", "torch",
25
- "matplotlib", "audioop-lts", "huggingface_hub",
26
- ], check=True)
 
 
 
 
 
 
 
27
  print("Packages OK")
28
 
29
  REPO = "/content/kuchbhi/spindleflow-rl"
30
  if not os.path.isdir(REPO):
31
- subprocess.run(
32
- ["git", "clone", "https://github.com/garvitsachdevaa/kuchbhi.git"],
33
- cwd="/content", check=True,
34
- )
35
  print("Repo cloned")
36
  else:
37
- print("Repo already present β€” skipping clone")
 
38
 
39
  os.chdir(REPO)
40
  sys.path.insert(0, ".")
41
- print(f"Working directory: {os.getcwd()}")
42
-
43
- import openenv, importlib.metadata
44
- print(f"OpenEnv version : {importlib.metadata.version('openenv')}")
45
  os.makedirs("/content/demo/assets", exist_ok=True)
46
- os.makedirs("/content/data", exist_ok=True)
47
  os.makedirs("/content/checkpoints", exist_ok=True)
48
- os.makedirs("/content/logs", exist_ok=True)
49
- print("Setup complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  # ============================================================
53
  # CELL 3 β€” Patch env + smoke test
54
  # ============================================================
55
- from env.spindleflow_env import SpindleFlowEnv
56
- import numpy as np
57
  import os as _os
 
 
58
 
 
 
 
59
  if not getattr(SpindleFlowEnv, "_simulate_patched", False):
60
  _orig_init = SpindleFlowEnv.__init__
61
 
@@ -79,9 +120,7 @@ if not getattr(SpindleFlowEnv, "_simulate_patched", False):
79
 
80
  SpindleFlowEnv._call_specialist = _new_call
81
  SpindleFlowEnv._simulate_patched = True
82
- print("SpindleFlowEnv patched OK")
83
- else:
84
- print("Already patched β€” skipping")
85
 
86
  env = SpindleFlowEnv(
87
  config_path="configs/training_config.yaml",
@@ -91,87 +130,80 @@ env = SpindleFlowEnv(
91
  simulate_specialists=True,
92
  )
93
  obs, info = env.reset()
94
- print(f"Observation shape : {obs.shape}")
95
- print(f"Task : {info['task'][:80]}")
96
-
97
- action = env.action_space.sample()
98
- obs2, reward, terminated, truncated, info2 = env.step(action)
99
- print(f"Step reward : {reward:.4f}")
100
- print(f"Action name : {info2['action_name']}")
101
- print(f"Called specialists: {info2['called_specialists']}")
102
- print(f"Reward components : {info2['reward_components']}")
103
- print("Environment OK β€” end-to-end step works.")
104
  env.close()
 
105
 
106
 
107
  # ============================================================
108
- # CELL 4 β€” HuggingFace TRL (hackathon requirement check)
109
  # ============================================================
110
  import trl, torch
111
 
112
- print(f"TRL version : {trl.__version__}")
113
- print(f"CUDA available: {torch.cuda.is_available()}")
 
 
 
114
 
115
- _found = None
116
  for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
117
- _cls = getattr(trl, _name, None)
118
- if _cls is not None:
119
- _found = _name
120
  break
121
-
122
- if _found:
123
- print(f"TRL config class available: {_found}")
124
  else:
125
- print("TRL imported β€” config classes use TrainingArguments in this version")
126
 
127
- print("HuggingFace TRL requirement satisfied. Primary training uses SB3 (Cell 5).")
128
 
129
 
130
  # ============================================================
131
- # CELL 5 β€” RecurrentPPO training (LSTM PPO)
 
 
 
 
 
132
  #
133
- # simulate_specialists=True β€” per-step calls are local (~0.001 s)
134
- # no OpenAI calls during steps β†’ fast on T4
135
- # Expected runtime: ~20–25 min for 100k steps (~10k episodes)
136
  # ============================================================
137
- import time
138
  from sb3_contrib import RecurrentPPO
139
  from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
140
  from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
141
  from policy.lstm_policy import build_policy_kwargs
142
  from training.curriculum import CurriculumManager
143
  from training.specialist_improvement_callback import SpecialistImprovementCallback
144
- import yaml
145
 
146
  _LOG_FILE = "/content/logs/training_log.txt"
147
 
148
- def _tlog(msg: str):
149
- ts = time.strftime("%H:%M:%S")
150
- line = f"[{ts}] {msg}"
151
  print(line, flush=True)
152
- with open(_LOG_FILE, "a", encoding="utf-8") as _f:
153
- _f.write(line + "\n")
154
 
155
  with open("configs/training_config.yaml") as f:
156
  _cfg = yaml.safe_load(f)
157
 
 
158
  curriculum = CurriculumManager(config_path="configs/training_config.yaml")
159
 
160
- TOTAL_TIMESTEPS = 100_000 # ~10k episodes on T4, ~20-25 min
161
-
162
 
163
  class RewardLogger(BaseCallback):
164
- def __init__(self, curriculum: CurriculumManager):
165
  super().__init__()
166
- self.episode_rewards: list[float] = []
167
- self._running: float = 0.0
168
  self._curriculum = curriculum
169
 
170
- def _on_step(self) -> bool:
171
- for r, d in zip(
172
- self.locals.get("rewards", []),
173
- self.locals.get("dones", []),
174
- ):
175
  self._running += float(r)
176
  if d:
177
  ep = self._running
@@ -180,10 +212,8 @@ class RewardLogger(BaseCallback):
180
  advanced = self._curriculum.on_episode_end(ep)
181
  n = len(self.episode_rewards)
182
  if advanced or n % 50 == 0:
183
- _tlog(
184
- f"Ep {n:5d} | reward {ep:+.3f} | "
185
- f"{self._curriculum.progress_str()}"
186
- )
187
  return True
188
 
189
 
@@ -224,17 +254,17 @@ model = RecurrentPPO(
224
  device="cuda" if torch.cuda.is_available() else "cpu",
225
  )
226
 
227
- _tlog(f"Device : {model.device}")
228
- _tlog(f"Total timesteps : {TOTAL_TIMESTEPS:,}")
229
- _tlog(f"Curriculum start: Phase {curriculum.current_phase} β€” {curriculum.progress_str()}")
230
- _tlog("Training started...\n")
231
 
232
- reward_logger = RewardLogger(curriculum=curriculum)
233
- checkpoint_cb = CheckpointCallback(save_freq=10_000, save_path="/content/checkpoints/")
 
234
  improvement_cb = SpecialistImprovementCallback(
235
  improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
236
- "improve_every_n_episodes", 100
237
- ),
238
  verbose=1,
239
  )
240
 
@@ -245,196 +275,147 @@ model.learn(
245
  )
246
  _elapsed = time.time() - _t0
247
 
248
- model.save("/content/spindleflow_colab_model")
249
- vec_env.save("/content/vec_normalize_colab.pkl")
250
 
251
- _tlog(f"\nTraining done in {_elapsed/60:.1f} min")
252
- _tlog(f"Episodes tracked : {len(reward_logger.episode_rewards)}")
253
- _tlog(f"Final curriculum : {curriculum.progress_str()}")
 
254
 
255
 
256
  # ============================================================
257
- # CELL 6 β€” Reward curve (publication-quality)
258
  # ============================================================
259
- import json
260
- import matplotlib
261
  matplotlib.use("Agg")
262
  import matplotlib.pyplot as plt
263
- import numpy as np
264
 
265
  ep_rewards = reward_logger.episode_rewards
266
  if not ep_rewards:
267
- print("WARNING: No episodes completed β€” increase TOTAL_TIMESTEPS and rerun.")
268
- ep_rewards = [0.0]
269
 
270
- n_ep = len(ep_rewards)
271
- episodes = list(range(n_ep))
272
- window = max(30, n_ep // 20) # adaptive smoothing: ~5% of total episodes
273
 
274
  smoothed = [
275
  float(np.mean(ep_rewards[max(0, i - window):i + 1]))
276
  for i in range(n_ep)
277
  ]
278
 
279
- early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))
280
- final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
 
281
 
282
- # ── Save JSON ──────────────────────────────────────────────
283
- step = max(1, n_ep // 300)
284
- json_data = {
285
- "episodes": episodes[::step],
286
- "mean_rewards": smoothed[::step],
287
- }
288
- json_path = "/content/demo/assets/reward_curve.json"
289
- with open(json_path, "w") as f:
290
- json.dump(json_data, f)
291
 
292
- # ── Plot ───────────────────────────────────────────────────
293
  fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
294
  fig.patch.set_facecolor("#0d1117")
295
  ax.set_facecolor("#161b22")
296
 
297
- plot_every = max(1, n_ep // 800)
298
- ax.scatter(
299
- episodes[::plot_every], ep_rewards[::plot_every],
300
- s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward",
301
- )
302
- ax.plot(
303
- episodes[::plot_every], smoothed[::plot_every],
304
- linewidth=2.5, color="#ff6b35", zorder=3,
305
- label=f"Smoothed ({window}-ep mean)",
306
- )
307
- ax.axhline(
308
- y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2, alpha=0.75,
309
- label=f"Early baseline {early_mean:+.3f}",
310
- )
311
- ax.axhline(
312
- y=final_mean, color="#34d399", linestyle="--", linewidth=1.2, alpha=0.85,
313
- label=f"Final mean {final_mean:+.3f}",
314
- )
315
 
316
  ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
317
- ax.set_ylabel("Reward", color="#c9d1d9", fontsize=12)
318
  ax.set_title(
319
  "SpindleFlow RL β€” Delegation Policy Learning Curve\n"
320
  f"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes",
321
  color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
322
  )
323
  ax.tick_params(colors="#8b949e")
324
- for spine in ax.spines.values():
325
- spine.set_edgecolor("#30363d")
326
  ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
 
 
327
 
328
- legend = ax.legend(
329
- fontsize=10, framealpha=0.85,
330
- facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9",
331
- )
332
-
333
- # Annotate improvement
334
- improvement = final_mean - early_mean
335
  sign = "β–²" if improvement >= 0 else "β–Ό"
336
- ax.annotate(
337
- f" {sign} {abs(improvement):.3f} reward improvement",
338
- xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
339
- color="#f0f6fc", fontsize=10, fontstyle="italic",
340
- )
341
 
342
  fig.tight_layout()
343
- png_path = "/content/reward_curve.png"
344
- fig.savefig(png_path, dpi=180, bbox_inches="tight", facecolor=fig.get_facecolor())
345
  plt.show()
346
- _tlog(f"Reward curve saved β†’ {png_path}")
347
 
348
- _tlog(f"\n{'='*55}")
349
- _tlog(f"Training summary")
350
- _tlog(f" Episodes completed : {n_ep}")
351
- _tlog(f" Early baseline : {early_mean:+.4f}")
352
- _tlog(f" Final mean : {final_mean:+.4f}")
353
- _tlog(f" Improvement : {improvement:+.4f}")
354
- _tlog(f"{'='*55}")
355
 
356
 
357
  # ============================================================
358
  # CELL 7 β€” Learning features audit
359
  # ============================================================
360
- import os, json
361
  from pathlib import Path
362
 
363
- print("\n" + "="*55)
364
  print("LEARNING FEATURES AUDIT")
365
- print("="*55)
366
 
367
- print(f"\nFeature 5 β€” Curriculum (performance-gated)")
368
- print(f" Final phase : {curriculum.current_phase}/3")
369
- print(f" Rolling mean reward: {curriculum.rolling_mean():.3f}")
370
  print(f" {curriculum.progress_str()}")
371
 
372
  mem_path = Path(_cfg.get("specialist_improvement", {}).get(
373
- "memory_path", "data/specialist_memory.json"
374
- ))
375
  print(f"\nFeature 2 β€” Specialist memory ({mem_path})")
376
  if mem_path.exists():
377
  data = json.loads(mem_path.read_text())
378
- total_entries = sum(len(v) for v in data.values())
379
- print(f" Specialists with memory : {len(data)}")
380
- print(f" Total entries recorded : {total_entries}")
381
  for sid, entries in list(data.items())[:3]:
382
  avg = sum(e["reward"] for e in entries) / len(entries)
383
- print(f" {sid}: {len(entries)} entries, avg_reward={avg:.3f}")
384
  else:
385
- print(" No memory file yet (OPENAI_API_KEY not set β€” simulation mode)")
386
 
387
  spawn_path = Path(_cfg.get("environment", {}).get(
388
- "spawn_memory_path", "data/spawn_memory.jsonl"
389
- ))
390
  print(f"\nFeature 3 β€” Spawn memory ({spawn_path})")
391
  if spawn_path.exists():
392
  lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
393
- print(f" Spawn records written: {len(lines)}")
394
- for line in lines[:3]:
395
- rec = json.loads(line)
396
- print(f" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} "
397
- f"| sim {rec['pre_spawn_sim']:.2f}β†’{rec['post_spawn_sim']:.2f}")
398
  else:
399
- print(" No spawn memory yet (requires OPENAI_API_KEY + SPAWN_SPECIALIST action)")
400
 
401
  res_path = Path(_cfg.get("agents", {}).get(
402
- "resolution_memory_path", "data/resolution_memory.jsonl"
403
- ))
404
  print(f"\nFeature 4 β€” Resolution bandit ({res_path})")
405
  if res_path.exists():
406
  lines = [l for l in res_path.read_text().splitlines() if l.strip()]
407
- print(f" Outcome records written: {len(lines)}")
408
- stats: dict = {}
409
- for line in lines:
410
- rec = json.loads(line)
411
- key = f"{rec['conflict_type']}/{rec['template_key']}"
412
- stats.setdefault(key, []).append(rec["quality_delta"])
413
- for k, deltas in stats.items():
414
- print(f" {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}")
415
  else:
416
- print(" No resolution memory yet (requires detected conflicts during training)")
417
 
418
- print("\n" + "="*55)
419
- print("All learning features verified.")
420
- print("="*55)
421
 
422
 
423
  # ============================================================
424
- # CELL 8 β€” Push model + artifacts + logs to HuggingFace Hub
425
- #
426
- # HF_TOKEN must be in Runtime > Manage secrets (key icon).
427
  # ============================================================
428
- import numpy as np
429
  from huggingface_hub import HfApi, CommitOperationAdd
430
- from google.colab import userdata
431
-
432
- HF_TOKEN = userdata.get("HF_TOKEN")
433
- if not HF_TOKEN:
434
- raise RuntimeError(
435
- "HF_TOKEN not set. "
436
- "Go to Runtime > Manage secrets, add Name=HF_TOKEN, Value=hf_xxxx, enable notebook access."
437
- )
438
 
439
  HF_REPO = "garvitsachdeva/spindleflow-rl"
440
  api = HfApi(token=HF_TOKEN)
@@ -442,11 +423,8 @@ api = HfApi(token=HF_TOKEN)
442
  _tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
443
  api.create_repo(repo_id=HF_REPO.split("/")[-1], repo_type="model", exist_ok=True)
444
 
445
- ep = reward_logger.episode_rewards
446
- f5 = float(np.mean(ep[:5])) if len(ep) >= 5 else 0.0
447
- l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0
448
-
449
- readme_text = f"""---
450
  license: mit
451
  tags:
452
  - reinforcement-learning
@@ -460,18 +438,17 @@ library_name: stable-baselines3
460
 
461
  # SpindleFlow RL β€” Delegation Policy
462
 
463
- LSTM PPO (RecurrentPPO) agent trained on SpindleFlow-v0 (OpenEnv).
464
- Trained on Google Colab T4 GPU.
465
 
466
  ## Training summary
467
  | Metric | Value |
468
  |---|---|
469
  | Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
470
  | Total timesteps | {TOTAL_TIMESTEPS:,} |
471
- | Episodes completed | {len(ep):,} |
472
- | Early baseline (first 50) | {early_mean:.4f} |
473
- | Final mean (last 200) | {final_mean:.4f} |
474
- | Improvement | {final_mean - early_mean:+.4f} |
475
  | Training time | {_elapsed/60:.1f} min |
476
  | Device | T4 GPU |
477
 
@@ -487,11 +464,11 @@ model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
487
 
488
  readme_path = "/content/README_model.md"
489
  with open(readme_path, "w") as f:
490
- f.write(readme_text)
491
 
492
  candidates = [
493
- ("/content/spindleflow_colab_model.zip", "spindleflow_model.zip"),
494
- ("/content/vec_normalize_colab.pkl", "vec_normalize.pkl"),
495
  ("/content/reward_curve.png", "reward_curve.png"),
496
  ("/content/demo/assets/reward_curve.json", "reward_curve.json"),
497
  ("/content/logs/training_log.txt", "training_log.txt"),
@@ -500,14 +477,11 @@ candidates = [
500
 
501
  ops = [
502
  CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
503
- for src, dst in candidates
504
- if os.path.exists(src)
505
  ]
506
 
507
  api.create_commit(
508
- repo_id=HF_REPO,
509
- repo_type="model",
510
- operations=ops,
511
  commit_message="Add trained SpindleFlow RL policy (Colab T4)",
512
  token=HF_TOKEN,
513
  )
@@ -515,10 +489,8 @@ api.create_commit(
515
  _tlog(f"Uploaded {len(ops)} files:")
516
  for src, dst in candidates:
517
  if os.path.exists(src):
518
- _tlog(f" βœ“ {dst}")
519
- _tlog(f"Model live at : https://huggingface.co/{HF_REPO}")
520
- _tlog(f"Training log : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
521
- _tlog(f"Reward curve : https://huggingface.co/{HF_REPO}/blob/main/reward_curve.png")
522
- _tlog(f"Reward (early) : {early_mean:+.4f}")
523
- _tlog(f"Reward (final) : {final_mean:+.4f}")
524
- _tlog(f"Improvement : {final_mean - early_mean:+.4f}")
 
1
  # ============================================================
2
+ # SpindleFlow RL β€” Colab Training Script
 
3
  #
4
+ # STEP 0 β€” Before running anything:
5
+ # Runtime β†’ Change runtime type β†’ T4 GPU
 
 
 
6
  #
7
+ # STEP 1 β€” Add secrets (key icon in left sidebar):
8
+ # HF_TOKEN = hf_xxxx (write token from hf.co/settings/tokens)
9
+ # OPENAI_API_KEY = sk-xxxx (needed for task generation + finetuner)
10
+ # Toggle "Notebook access" ON for both.
11
+ #
12
+ # STEP 2 β€” Create a new notebook, paste each CELL block below
13
+ # into a separate code cell, run top to bottom.
14
  # ============================================================
15
 
16
 
17
  # ============================================================
18
+ # CELL 1 β€” Install packages + clone repo
19
  # ============================================================
20
+ import subprocess, os, sys
21
+
22
+ print(f"Python {sys.version}")
23
 
24
+ # audioop-lts is for Python 3.13+ only β€” Colab runs 3.12
25
+ packages = [
26
  "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
27
  "sentence-transformers", "openai", "pyyaml", "trl",
28
+ "transformers", "datasets", "torch", "matplotlib", "huggingface_hub",
29
+ ]
30
+ if sys.version_info >= (3, 13):
31
+ packages.append("audioop-lts")
32
+
33
+ result = subprocess.run(["pip", "install"] + packages, capture_output=True, text=True)
34
+ if result.returncode != 0:
35
+ print(result.stdout[-3000:])
36
+ print(result.stderr[-3000:])
37
+ raise RuntimeError("pip install failed β€” see output above")
38
  print("Packages OK")
39
 
40
  REPO = "/content/kuchbhi/spindleflow-rl"
41
  if not os.path.isdir(REPO):
42
+ subprocess.run(["git", "clone",
43
+ "https://github.com/garvitsachdevaa/kuchbhi.git"],
44
+ cwd="/content", check=True)
 
45
  print("Repo cloned")
46
  else:
47
+ subprocess.run(["git", "pull"], cwd=REPO, check=True)
48
+ print("Repo updated")
49
 
50
  os.chdir(REPO)
51
  sys.path.insert(0, ".")
 
 
 
 
52
  os.makedirs("/content/demo/assets", exist_ok=True)
53
+ os.makedirs("/content/data", exist_ok=True)
54
  os.makedirs("/content/checkpoints", exist_ok=True)
55
+ os.makedirs("/content/logs", exist_ok=True)
56
+
57
+ import importlib.metadata
58
+ print(f"OpenEnv : {importlib.metadata.version('openenv')}")
59
+ print(f"CWD : {os.getcwd()}")
60
+ print("CELL 1 done")
61
+
62
+
63
+ # ============================================================
64
+ # CELL 2 β€” Load secrets
65
+ # ============================================================
66
+ import os
67
+ from google.colab import userdata
68
+
69
+ HF_TOKEN = userdata.get("HF_TOKEN")
70
+ OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
71
+
72
+ if not HF_TOKEN:
73
+ raise RuntimeError(
74
+ "HF_TOKEN missing.\n"
75
+ "Key icon β†’ Add secret β†’ Name: HF_TOKEN, Value: hf_xxxx, enable notebook access."
76
+ )
77
+ if not OPENAI_API_KEY:
78
+ raise RuntimeError(
79
+ "OPENAI_API_KEY missing.\n"
80
+ "Key icon β†’ Add secret β†’ Name: OPENAI_API_KEY, Value: sk-xxxx, enable notebook access."
81
+ )
82
+
83
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
84
+
85
+ print(f"HF_TOKEN : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}")
86
+ print(f"OPENAI_API_KEY : {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}")
87
+ print("CELL 2 done")
88
 
89
 
90
  # ============================================================
91
  # CELL 3 β€” Patch env + smoke test
92
  # ============================================================
 
 
93
  import os as _os
94
+ import numpy as np
95
+ from env.spindleflow_env import SpindleFlowEnv
96
 
97
+ # simulate_specialists=True β†’ per-step specialist calls use local simulation
98
+ # (fast, no API cost per step). OPENAI_API_KEY still used for task generation
99
+ # and the finetuner that fires every 100 episodes.
100
  if not getattr(SpindleFlowEnv, "_simulate_patched", False):
101
  _orig_init = SpindleFlowEnv.__init__
102
 
 
120
 
121
  SpindleFlowEnv._call_specialist = _new_call
122
  SpindleFlowEnv._simulate_patched = True
123
+ print("SpindleFlowEnv patched")
 
 
124
 
125
  env = SpindleFlowEnv(
126
  config_path="configs/training_config.yaml",
 
130
  simulate_specialists=True,
131
  )
132
  obs, info = env.reset()
133
+ print(f"obs shape : {obs.shape}")
134
+ print(f"task : {info['task'][:80]}")
135
+
136
+ _, reward, _, _, info2 = env.step(env.action_space.sample())
137
+ print(f"reward : {reward:.4f}")
138
+ print(f"action : {info2['action_name']}")
 
 
 
 
139
  env.close()
140
+ print("CELL 3 done β€” environment OK")
141
 
142
 
143
  # ============================================================
144
+ # CELL 4 β€” TRL check (hackathon requirement)
145
  # ============================================================
146
  import trl, torch
147
 
148
+ print(f"TRL : {trl.__version__}")
149
+ print(f"Torch : {torch.__version__}")
150
+ print(f"CUDA : {torch.cuda.is_available()}")
151
+ if torch.cuda.is_available():
152
+ print(f"GPU : {torch.cuda.get_device_name(0)}")
153
 
 
154
  for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
155
+ if getattr(trl, _name, None):
156
+ print(f"TRL config class: {_name}")
 
157
  break
 
 
 
158
  else:
159
+ print("TRL imported (TrainingArguments-based version)")
160
 
161
+ print("CELL 4 done β€” TRL requirement satisfied")
162
 
163
 
164
  # ============================================================
165
+ # CELL 5 β€” Train RecurrentPPO (LSTM PPO)
166
+ #
167
+ # Per-step calls : local simulation (~0.001 s/step, no API cost)
168
+ # Task generation : GPT-4o-mini via OPENAI_API_KEY (diverse tasks)
169
+ # Finetuner : fires every 100 episodes via OPENAI_API_KEY
170
+ # Reward baseline : GPT-4o-mini via OPENAI_API_KEY (quality signal)
171
  #
172
+ # Expected: ~20-25 min on T4 GPU for 100k steps / ~10k episodes
 
 
173
  # ============================================================
174
+ import time, yaml, torch, numpy as np
175
  from sb3_contrib import RecurrentPPO
176
  from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
177
  from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
178
  from policy.lstm_policy import build_policy_kwargs
179
  from training.curriculum import CurriculumManager
180
  from training.specialist_improvement_callback import SpecialistImprovementCallback
 
181
 
182
  _LOG_FILE = "/content/logs/training_log.txt"
183
 
184
+ def _tlog(msg):
185
+ line = f"[{time.strftime('%H:%M:%S')}] {msg}"
 
186
  print(line, flush=True)
187
+ with open(_LOG_FILE, "a") as f:
188
+ f.write(line + "\n")
189
 
190
  with open("configs/training_config.yaml") as f:
191
  _cfg = yaml.safe_load(f)
192
 
193
+ TOTAL_TIMESTEPS = 100_000
194
  curriculum = CurriculumManager(config_path="configs/training_config.yaml")
195
 
 
 
196
 
197
  class RewardLogger(BaseCallback):
198
+ def __init__(self, curriculum):
199
  super().__init__()
200
+ self.episode_rewards = []
201
+ self._running = 0.0
202
  self._curriculum = curriculum
203
 
204
+ def _on_step(self):
205
+ for r, d in zip(self.locals.get("rewards", []),
206
+ self.locals.get("dones", [])):
 
 
207
  self._running += float(r)
208
  if d:
209
  ep = self._running
 
212
  advanced = self._curriculum.on_episode_end(ep)
213
  n = len(self.episode_rewards)
214
  if advanced or n % 50 == 0:
215
+ _tlog(f"Ep {n:5d} | reward {ep:+.3f} | "
216
+ f"{self._curriculum.progress_str()}")
 
 
217
  return True
218
 
219
 
 
254
  device="cuda" if torch.cuda.is_available() else "cpu",
255
  )
256
 
257
+ _tlog(f"Device : {model.device}")
258
+ _tlog(f"Timesteps : {TOTAL_TIMESTEPS:,}")
259
+ _tlog(f"Curriculum : Phase {curriculum.current_phase} β€” {curriculum.progress_str()}")
260
+ _tlog("Training started...")
261
 
262
+ reward_logger = RewardLogger(curriculum)
263
+ checkpoint_cb = CheckpointCallback(save_freq=10_000,
264
+ save_path="/content/checkpoints/")
265
  improvement_cb = SpecialistImprovementCallback(
266
  improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
267
+ "improve_every_n_episodes", 100),
 
268
  verbose=1,
269
  )
270
 
 
275
  )
276
  _elapsed = time.time() - _t0
277
 
278
+ model.save("/content/spindleflow_model")
279
+ vec_env.save("/content/vec_normalize.pkl")
280
 
281
+ _tlog(f"Done in {_elapsed/60:.1f} min")
282
+ _tlog(f"Episodes : {len(reward_logger.episode_rewards)}")
283
+ _tlog(f"Curriculum final: {curriculum.progress_str()}")
284
+ print("CELL 5 done β€” model saved")
285
 
286
 
287
  # ============================================================
288
+ # CELL 6 β€” Reward curve
289
  # ============================================================
290
+ import json, numpy as np, matplotlib
 
291
  matplotlib.use("Agg")
292
  import matplotlib.pyplot as plt
 
293
 
294
  ep_rewards = reward_logger.episode_rewards
295
  if not ep_rewards:
296
+ raise RuntimeError("No episodes completed β€” recheck Cell 5")
 
297
 
298
+ n_ep = len(ep_rewards)
299
+ episodes = list(range(n_ep))
300
+ window = max(30, n_ep // 20)
301
 
302
  smoothed = [
303
  float(np.mean(ep_rewards[max(0, i - window):i + 1]))
304
  for i in range(n_ep)
305
  ]
306
 
307
+ early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))
308
+ final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
309
+ improvement = final_mean - early_mean
310
 
311
+ # JSON for HF Space demo tab
312
+ step = max(1, n_ep // 300)
313
+ with open("/content/demo/assets/reward_curve.json", "w") as f:
314
+ json.dump({"episodes": episodes[::step],
315
+ "mean_rewards": smoothed[::step]}, f)
 
 
 
 
316
 
317
+ # Plot
318
  fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
319
  fig.patch.set_facecolor("#0d1117")
320
  ax.set_facecolor("#161b22")
321
 
322
+ every = max(1, n_ep // 800)
323
+ ax.scatter(episodes[::every], ep_rewards[::every],
324
+ s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward")
325
+ ax.plot(episodes[::every], smoothed[::every],
326
+ linewidth=2.5, color="#ff6b35", zorder=3,
327
+ label=f"Smoothed ({window}-ep mean)")
328
+ ax.axhline(y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2,
329
+ alpha=0.75, label=f"Early baseline {early_mean:+.3f}")
330
+ ax.axhline(y=final_mean, color="#34d399", linestyle="--", linewidth=1.2,
331
+ alpha=0.85, label=f"Final mean {final_mean:+.3f}")
 
 
 
 
 
 
 
 
332
 
333
  ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
334
+ ax.set_ylabel("Reward", color="#c9d1d9", fontsize=12)
335
  ax.set_title(
336
  "SpindleFlow RL β€” Delegation Policy Learning Curve\n"
337
  f"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes",
338
  color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
339
  )
340
  ax.tick_params(colors="#8b949e")
341
+ for s in ax.spines.values():
342
+ s.set_edgecolor("#30363d")
343
  ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
344
+ ax.legend(fontsize=10, framealpha=0.85,
345
+ facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9")
346
 
 
 
 
 
 
 
 
347
  sign = "β–²" if improvement >= 0 else "β–Ό"
348
+ ax.annotate(f" {sign} {abs(improvement):.3f} improvement",
349
+ xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
350
+ color="#f0f6fc", fontsize=10, fontstyle="italic")
 
 
351
 
352
  fig.tight_layout()
353
+ fig.savefig("/content/reward_curve.png", dpi=180, bbox_inches="tight",
354
+ facecolor=fig.get_facecolor())
355
  plt.show()
 
356
 
357
+ _tlog(f"Curve: early={early_mean:+.4f} final={final_mean:+.4f} "
358
+ f"improvement={improvement:+.4f}")
359
+ print(f"\nEpisodes : {n_ep:,}")
360
+ print(f"Improvement: {improvement:+.4f}")
361
+ print("CELL 6 done β€” reward curve saved")
 
 
362
 
363
 
364
  # ============================================================
365
  # CELL 7 β€” Learning features audit
366
  # ============================================================
367
+ import json
368
  from pathlib import Path
369
 
370
+ print("="*52)
371
  print("LEARNING FEATURES AUDIT")
372
+ print("="*52)
373
 
374
+ print(f"\nFeature 5 β€” Curriculum")
375
+ print(f" Phase : {curriculum.current_phase}/3")
376
+ print(f" Rolling mean : {curriculum.rolling_mean():.3f}")
377
  print(f" {curriculum.progress_str()}")
378
 
379
  mem_path = Path(_cfg.get("specialist_improvement", {}).get(
380
+ "memory_path", "data/specialist_memory.json"))
 
381
  print(f"\nFeature 2 β€” Specialist memory ({mem_path})")
382
  if mem_path.exists():
383
  data = json.loads(mem_path.read_text())
384
+ total = sum(len(v) for v in data.values())
385
+ print(f" {len(data)} specialists, {total} total entries")
 
386
  for sid, entries in list(data.items())[:3]:
387
  avg = sum(e["reward"] for e in entries) / len(entries)
388
+ print(f" {sid}: {len(entries)} entries, avg={avg:.3f}")
389
  else:
390
+ print(" No file yet (finetuner fires after 100 episodes)")
391
 
392
  spawn_path = Path(_cfg.get("environment", {}).get(
393
+ "spawn_memory_path", "data/spawn_memory.jsonl"))
 
394
  print(f"\nFeature 3 β€” Spawn memory ({spawn_path})")
395
  if spawn_path.exists():
396
  lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
397
+ print(f" {len(lines)} spawn records")
 
 
 
 
398
  else:
399
+ print(" No file yet")
400
 
401
  res_path = Path(_cfg.get("agents", {}).get(
402
+ "resolution_memory_path", "data/resolution_memory.jsonl"))
 
403
  print(f"\nFeature 4 β€” Resolution bandit ({res_path})")
404
  if res_path.exists():
405
  lines = [l for l in res_path.read_text().splitlines() if l.strip()]
406
+ print(f" {len(lines)} outcome records")
 
 
 
 
 
 
 
407
  else:
408
+ print(" No file yet")
409
 
410
+ print("\n" + "="*52)
411
+ print("CELL 7 done")
 
412
 
413
 
414
  # ============================================================
415
+ # CELL 8 β€” Push to HuggingFace Hub
 
 
416
  # ============================================================
417
+ import os, numpy as np
418
  from huggingface_hub import HfApi, CommitOperationAdd
 
 
 
 
 
 
 
 
419
 
420
  HF_REPO = "garvitsachdeva/spindleflow-rl"
421
  api = HfApi(token=HF_TOKEN)
 
423
  _tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
424
  api.create_repo(repo_id=HF_REPO.split("/")[-1], repo_type="model", exist_ok=True)
425
 
426
+ ep = reward_logger.episode_rewards
427
+ readme = f"""---
 
 
 
428
  license: mit
429
  tags:
430
  - reinforcement-learning
 
438
 
439
  # SpindleFlow RL β€” Delegation Policy
440
 
441
+ LSTM PPO (RecurrentPPO) trained on SpindleFlow-v0 (OpenEnv). Colab T4 GPU.
 
442
 
443
  ## Training summary
444
  | Metric | Value |
445
  |---|---|
446
  | Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
447
  | Total timesteps | {TOTAL_TIMESTEPS:,} |
448
+ | Episodes | {len(ep):,} |
449
+ | Early baseline (first 50 ep) | {early_mean:.4f} |
450
+ | Final mean (last 200 ep) | {final_mean:.4f} |
451
+ | Improvement | {improvement:+.4f} |
452
  | Training time | {_elapsed/60:.1f} min |
453
  | Device | T4 GPU |
454
 
 
464
 
465
  readme_path = "/content/README_model.md"
466
  with open(readme_path, "w") as f:
467
+ f.write(readme)
468
 
469
  candidates = [
470
+ ("/content/spindleflow_model.zip", "spindleflow_model.zip"),
471
+ ("/content/vec_normalize.pkl", "vec_normalize.pkl"),
472
  ("/content/reward_curve.png", "reward_curve.png"),
473
  ("/content/demo/assets/reward_curve.json", "reward_curve.json"),
474
  ("/content/logs/training_log.txt", "training_log.txt"),
 
477
 
478
  ops = [
479
  CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
480
+ for src, dst in candidates if os.path.exists(src)
 
481
  ]
482
 
483
  api.create_commit(
484
+ repo_id=HF_REPO, repo_type="model", operations=ops,
 
 
485
  commit_message="Add trained SpindleFlow RL policy (Colab T4)",
486
  token=HF_TOKEN,
487
  )
 
489
  _tlog(f"Uploaded {len(ops)} files:")
490
  for src, dst in candidates:
491
  if os.path.exists(src):
492
+ _tlog(f" {dst}")
493
+
494
+ _tlog(f"Model live : https://huggingface.co/{HF_REPO}")
495
+ _tlog(f"Log : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
496
+ print("CELL 8 done β€” all done!")