Spaces:
Runtime error
Runtime error
Commit Β·
956acce
1
Parent(s): c8407b4
Rewrite train_colab.py: clean cells, fix audioop-lts, log file, reward curve
Browse files- colab/train_colab.py +201 -229
colab/train_colab.py
CHANGED
|
@@ -1,61 +1,102 @@
|
|
| 1 |
# ============================================================
|
| 2 |
-
# SpindleFlow RL β
|
| 3 |
-
# Runtime: Runtime > Change runtime type > T4 GPU (free tier)
|
| 4 |
#
|
| 5 |
-
#
|
| 6 |
-
#
|
| 7 |
-
# hf.co/settings/tokens β New token (write)
|
| 8 |
-
# OPENAI_API_KEY OPTIONAL β enables finetuner + spawn self-learning
|
| 9 |
-
# without it the run uses fast simulation mode
|
| 10 |
#
|
| 11 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# ============================================================
|
| 13 |
|
| 14 |
|
| 15 |
# ============================================================
|
| 16 |
-
# CELL
|
| 17 |
# ============================================================
|
| 18 |
-
import
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
"openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
|
| 23 |
"sentence-transformers", "openai", "pyyaml", "trl",
|
| 24 |
-
"transformers", "datasets", "torch",
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
print("Packages OK")
|
| 28 |
|
| 29 |
REPO = "/content/kuchbhi/spindleflow-rl"
|
| 30 |
if not os.path.isdir(REPO):
|
| 31 |
-
subprocess.run(
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
)
|
| 35 |
print("Repo cloned")
|
| 36 |
else:
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
os.chdir(REPO)
|
| 40 |
sys.path.insert(0, ".")
|
| 41 |
-
print(f"Working directory: {os.getcwd()}")
|
| 42 |
-
|
| 43 |
-
import openenv, importlib.metadata
|
| 44 |
-
print(f"OpenEnv version : {importlib.metadata.version('openenv')}")
|
| 45 |
os.makedirs("/content/demo/assets", exist_ok=True)
|
| 46 |
-
os.makedirs("/content/data",
|
| 47 |
os.makedirs("/content/checkpoints", exist_ok=True)
|
| 48 |
-
os.makedirs("/content/logs",
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
# ============================================================
|
| 53 |
# CELL 3 β Patch env + smoke test
|
| 54 |
# ============================================================
|
| 55 |
-
from env.spindleflow_env import SpindleFlowEnv
|
| 56 |
-
import numpy as np
|
| 57 |
import os as _os
|
|
|
|
|
|
|
| 58 |
|
|
|
|
|
|
|
|
|
|
| 59 |
if not getattr(SpindleFlowEnv, "_simulate_patched", False):
|
| 60 |
_orig_init = SpindleFlowEnv.__init__
|
| 61 |
|
|
@@ -79,9 +120,7 @@ if not getattr(SpindleFlowEnv, "_simulate_patched", False):
|
|
| 79 |
|
| 80 |
SpindleFlowEnv._call_specialist = _new_call
|
| 81 |
SpindleFlowEnv._simulate_patched = True
|
| 82 |
-
print("SpindleFlowEnv patched
|
| 83 |
-
else:
|
| 84 |
-
print("Already patched β skipping")
|
| 85 |
|
| 86 |
env = SpindleFlowEnv(
|
| 87 |
config_path="configs/training_config.yaml",
|
|
@@ -91,87 +130,80 @@ env = SpindleFlowEnv(
|
|
| 91 |
simulate_specialists=True,
|
| 92 |
)
|
| 93 |
obs, info = env.reset()
|
| 94 |
-
print(f"
|
| 95 |
-
print(f"
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
print(f"
|
| 100 |
-
print(f"Action name : {info2['action_name']}")
|
| 101 |
-
print(f"Called specialists: {info2['called_specialists']}")
|
| 102 |
-
print(f"Reward components : {info2['reward_components']}")
|
| 103 |
-
print("Environment OK β end-to-end step works.")
|
| 104 |
env.close()
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
# ============================================================
|
| 108 |
-
# CELL 4 β
|
| 109 |
# ============================================================
|
| 110 |
import trl, torch
|
| 111 |
|
| 112 |
-
print(f"TRL
|
| 113 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
_found = None
|
| 116 |
for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
_found = _name
|
| 120 |
break
|
| 121 |
-
|
| 122 |
-
if _found:
|
| 123 |
-
print(f"TRL config class available: {_found}")
|
| 124 |
else:
|
| 125 |
-
print("TRL imported
|
| 126 |
|
| 127 |
-
print("
|
| 128 |
|
| 129 |
|
| 130 |
# ============================================================
|
| 131 |
-
# CELL 5 β RecurrentPPO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
#
|
| 133 |
-
#
|
| 134 |
-
# no OpenAI calls during steps β fast on T4
|
| 135 |
-
# Expected runtime: ~20β25 min for 100k steps (~10k episodes)
|
| 136 |
# ============================================================
|
| 137 |
-
import time
|
| 138 |
from sb3_contrib import RecurrentPPO
|
| 139 |
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
| 140 |
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
|
| 141 |
from policy.lstm_policy import build_policy_kwargs
|
| 142 |
from training.curriculum import CurriculumManager
|
| 143 |
from training.specialist_improvement_callback import SpecialistImprovementCallback
|
| 144 |
-
import yaml
|
| 145 |
|
| 146 |
_LOG_FILE = "/content/logs/training_log.txt"
|
| 147 |
|
| 148 |
-
def _tlog(msg
|
| 149 |
-
|
| 150 |
-
line = f"[{ts}] {msg}"
|
| 151 |
print(line, flush=True)
|
| 152 |
-
with open(_LOG_FILE, "a"
|
| 153 |
-
|
| 154 |
|
| 155 |
with open("configs/training_config.yaml") as f:
|
| 156 |
_cfg = yaml.safe_load(f)
|
| 157 |
|
|
|
|
| 158 |
curriculum = CurriculumManager(config_path="configs/training_config.yaml")
|
| 159 |
|
| 160 |
-
TOTAL_TIMESTEPS = 100_000 # ~10k episodes on T4, ~20-25 min
|
| 161 |
-
|
| 162 |
|
| 163 |
class RewardLogger(BaseCallback):
|
| 164 |
-
def __init__(self, curriculum
|
| 165 |
super().__init__()
|
| 166 |
-
self.episode_rewards
|
| 167 |
-
self._running
|
| 168 |
self._curriculum = curriculum
|
| 169 |
|
| 170 |
-
def _on_step(self)
|
| 171 |
-
for r, d in zip(
|
| 172 |
-
|
| 173 |
-
self.locals.get("dones", []),
|
| 174 |
-
):
|
| 175 |
self._running += float(r)
|
| 176 |
if d:
|
| 177 |
ep = self._running
|
|
@@ -180,10 +212,8 @@ class RewardLogger(BaseCallback):
|
|
| 180 |
advanced = self._curriculum.on_episode_end(ep)
|
| 181 |
n = len(self.episode_rewards)
|
| 182 |
if advanced or n % 50 == 0:
|
| 183 |
-
_tlog(
|
| 184 |
-
|
| 185 |
-
f"{self._curriculum.progress_str()}"
|
| 186 |
-
)
|
| 187 |
return True
|
| 188 |
|
| 189 |
|
|
@@ -224,17 +254,17 @@ model = RecurrentPPO(
|
|
| 224 |
device="cuda" if torch.cuda.is_available() else "cpu",
|
| 225 |
)
|
| 226 |
|
| 227 |
-
_tlog(f"Device
|
| 228 |
-
_tlog(f"
|
| 229 |
-
_tlog(f"Curriculum
|
| 230 |
-
_tlog("Training started...
|
| 231 |
|
| 232 |
-
reward_logger = RewardLogger(curriculum
|
| 233 |
-
checkpoint_cb = CheckpointCallback(save_freq=10_000,
|
|
|
|
| 234 |
improvement_cb = SpecialistImprovementCallback(
|
| 235 |
improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
|
| 236 |
-
"improve_every_n_episodes", 100
|
| 237 |
-
),
|
| 238 |
verbose=1,
|
| 239 |
)
|
| 240 |
|
|
@@ -245,196 +275,147 @@ model.learn(
|
|
| 245 |
)
|
| 246 |
_elapsed = time.time() - _t0
|
| 247 |
|
| 248 |
-
model.save("/content/
|
| 249 |
-
vec_env.save("/content/
|
| 250 |
|
| 251 |
-
_tlog(f"
|
| 252 |
-
_tlog(f"Episodes
|
| 253 |
-
_tlog(f"
|
|
|
|
| 254 |
|
| 255 |
|
| 256 |
# ============================================================
|
| 257 |
-
# CELL 6 β Reward curve
|
| 258 |
# ============================================================
|
| 259 |
-
import json
|
| 260 |
-
import matplotlib
|
| 261 |
matplotlib.use("Agg")
|
| 262 |
import matplotlib.pyplot as plt
|
| 263 |
-
import numpy as np
|
| 264 |
|
| 265 |
ep_rewards = reward_logger.episode_rewards
|
| 266 |
if not ep_rewards:
|
| 267 |
-
|
| 268 |
-
ep_rewards = [0.0]
|
| 269 |
|
| 270 |
-
n_ep
|
| 271 |
-
episodes
|
| 272 |
-
window
|
| 273 |
|
| 274 |
smoothed = [
|
| 275 |
float(np.mean(ep_rewards[max(0, i - window):i + 1]))
|
| 276 |
for i in range(n_ep)
|
| 277 |
]
|
| 278 |
|
| 279 |
-
early_mean
|
| 280 |
-
final_mean
|
|
|
|
| 281 |
|
| 282 |
-
#
|
| 283 |
-
step
|
| 284 |
-
|
| 285 |
-
"episodes":
|
| 286 |
-
|
| 287 |
-
}
|
| 288 |
-
json_path = "/content/demo/assets/reward_curve.json"
|
| 289 |
-
with open(json_path, "w") as f:
|
| 290 |
-
json.dump(json_data, f)
|
| 291 |
|
| 292 |
-
#
|
| 293 |
fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
|
| 294 |
fig.patch.set_facecolor("#0d1117")
|
| 295 |
ax.set_facecolor("#161b22")
|
| 296 |
|
| 297 |
-
|
| 298 |
-
ax.scatter(
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
)
|
| 307 |
-
ax.axhline(
|
| 308 |
-
y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2, alpha=0.75,
|
| 309 |
-
label=f"Early baseline {early_mean:+.3f}",
|
| 310 |
-
)
|
| 311 |
-
ax.axhline(
|
| 312 |
-
y=final_mean, color="#34d399", linestyle="--", linewidth=1.2, alpha=0.85,
|
| 313 |
-
label=f"Final mean {final_mean:+.3f}",
|
| 314 |
-
)
|
| 315 |
|
| 316 |
ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
|
| 317 |
-
ax.set_ylabel("Reward",
|
| 318 |
ax.set_title(
|
| 319 |
"SpindleFlow RL β Delegation Policy Learning Curve\n"
|
| 320 |
f"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes",
|
| 321 |
color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
|
| 322 |
)
|
| 323 |
ax.tick_params(colors="#8b949e")
|
| 324 |
-
for
|
| 325 |
-
|
| 326 |
ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
|
|
|
|
|
|
|
| 327 |
|
| 328 |
-
legend = ax.legend(
|
| 329 |
-
fontsize=10, framealpha=0.85,
|
| 330 |
-
facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9",
|
| 331 |
-
)
|
| 332 |
-
|
| 333 |
-
# Annotate improvement
|
| 334 |
-
improvement = final_mean - early_mean
|
| 335 |
sign = "β²" if improvement >= 0 else "βΌ"
|
| 336 |
-
ax.annotate(
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
color="#f0f6fc", fontsize=10, fontstyle="italic",
|
| 340 |
-
)
|
| 341 |
|
| 342 |
fig.tight_layout()
|
| 343 |
-
|
| 344 |
-
|
| 345 |
plt.show()
|
| 346 |
-
_tlog(f"Reward curve saved β {png_path}")
|
| 347 |
|
| 348 |
-
_tlog(f"
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
_tlog(f" Improvement : {improvement:+.4f}")
|
| 354 |
-
_tlog(f"{'='*55}")
|
| 355 |
|
| 356 |
|
| 357 |
# ============================================================
|
| 358 |
# CELL 7 β Learning features audit
|
| 359 |
# ============================================================
|
| 360 |
-
import
|
| 361 |
from pathlib import Path
|
| 362 |
|
| 363 |
-
print("
|
| 364 |
print("LEARNING FEATURES AUDIT")
|
| 365 |
-
print("="*
|
| 366 |
|
| 367 |
-
print(f"\nFeature 5 β Curriculum
|
| 368 |
-
print(f"
|
| 369 |
-
print(f" Rolling mean
|
| 370 |
print(f" {curriculum.progress_str()}")
|
| 371 |
|
| 372 |
mem_path = Path(_cfg.get("specialist_improvement", {}).get(
|
| 373 |
-
"memory_path", "data/specialist_memory.json"
|
| 374 |
-
))
|
| 375 |
print(f"\nFeature 2 β Specialist memory ({mem_path})")
|
| 376 |
if mem_path.exists():
|
| 377 |
data = json.loads(mem_path.read_text())
|
| 378 |
-
|
| 379 |
-
print(f"
|
| 380 |
-
print(f" Total entries recorded : {total_entries}")
|
| 381 |
for sid, entries in list(data.items())[:3]:
|
| 382 |
avg = sum(e["reward"] for e in entries) / len(entries)
|
| 383 |
-
print(f" {sid}: {len(entries)} entries,
|
| 384 |
else:
|
| 385 |
-
print(" No
|
| 386 |
|
| 387 |
spawn_path = Path(_cfg.get("environment", {}).get(
|
| 388 |
-
"spawn_memory_path", "data/spawn_memory.jsonl"
|
| 389 |
-
))
|
| 390 |
print(f"\nFeature 3 β Spawn memory ({spawn_path})")
|
| 391 |
if spawn_path.exists():
|
| 392 |
lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
|
| 393 |
-
print(f"
|
| 394 |
-
for line in lines[:3]:
|
| 395 |
-
rec = json.loads(line)
|
| 396 |
-
print(f" {rec['specialist_role']} | reward={rec['episode_reward']:.3f} "
|
| 397 |
-
f"| sim {rec['pre_spawn_sim']:.2f}β{rec['post_spawn_sim']:.2f}")
|
| 398 |
else:
|
| 399 |
-
print(" No
|
| 400 |
|
| 401 |
res_path = Path(_cfg.get("agents", {}).get(
|
| 402 |
-
"resolution_memory_path", "data/resolution_memory.jsonl"
|
| 403 |
-
))
|
| 404 |
print(f"\nFeature 4 β Resolution bandit ({res_path})")
|
| 405 |
if res_path.exists():
|
| 406 |
lines = [l for l in res_path.read_text().splitlines() if l.strip()]
|
| 407 |
-
print(f"
|
| 408 |
-
stats: dict = {}
|
| 409 |
-
for line in lines:
|
| 410 |
-
rec = json.loads(line)
|
| 411 |
-
key = f"{rec['conflict_type']}/{rec['template_key']}"
|
| 412 |
-
stats.setdefault(key, []).append(rec["quality_delta"])
|
| 413 |
-
for k, deltas in stats.items():
|
| 414 |
-
print(f" {k}: n={len(deltas)}, mean_delta={sum(deltas)/len(deltas):.3f}")
|
| 415 |
else:
|
| 416 |
-
print(" No
|
| 417 |
|
| 418 |
-
print("\n" + "="*
|
| 419 |
-
print("
|
| 420 |
-
print("="*55)
|
| 421 |
|
| 422 |
|
| 423 |
# ============================================================
|
| 424 |
-
# CELL 8 β Push
|
| 425 |
-
#
|
| 426 |
-
# HF_TOKEN must be in Runtime > Manage secrets (key icon).
|
| 427 |
# ============================================================
|
| 428 |
-
import numpy as np
|
| 429 |
from huggingface_hub import HfApi, CommitOperationAdd
|
| 430 |
-
from google.colab import userdata
|
| 431 |
-
|
| 432 |
-
HF_TOKEN = userdata.get("HF_TOKEN")
|
| 433 |
-
if not HF_TOKEN:
|
| 434 |
-
raise RuntimeError(
|
| 435 |
-
"HF_TOKEN not set. "
|
| 436 |
-
"Go to Runtime > Manage secrets, add Name=HF_TOKEN, Value=hf_xxxx, enable notebook access."
|
| 437 |
-
)
|
| 438 |
|
| 439 |
HF_REPO = "garvitsachdeva/spindleflow-rl"
|
| 440 |
api = HfApi(token=HF_TOKEN)
|
|
@@ -442,11 +423,8 @@ api = HfApi(token=HF_TOKEN)
|
|
| 442 |
_tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
|
| 443 |
api.create_repo(repo_id=HF_REPO.split("/")[-1], repo_type="model", exist_ok=True)
|
| 444 |
|
| 445 |
-
ep
|
| 446 |
-
|
| 447 |
-
l5 = float(np.mean(ep[-5:])) if len(ep) >= 5 else 0.0
|
| 448 |
-
|
| 449 |
-
readme_text = f"""---
|
| 450 |
license: mit
|
| 451 |
tags:
|
| 452 |
- reinforcement-learning
|
|
@@ -460,18 +438,17 @@ library_name: stable-baselines3
|
|
| 460 |
|
| 461 |
# SpindleFlow RL β Delegation Policy
|
| 462 |
|
| 463 |
-
LSTM PPO (RecurrentPPO)
|
| 464 |
-
Trained on Google Colab T4 GPU.
|
| 465 |
|
| 466 |
## Training summary
|
| 467 |
| Metric | Value |
|
| 468 |
|---|---|
|
| 469 |
| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
|
| 470 |
| Total timesteps | {TOTAL_TIMESTEPS:,} |
|
| 471 |
-
| Episodes
|
| 472 |
-
| Early baseline (first 50) | {early_mean:.4f} |
|
| 473 |
-
| Final mean (last 200) | {final_mean:.4f} |
|
| 474 |
-
| Improvement | {
|
| 475 |
| Training time | {_elapsed/60:.1f} min |
|
| 476 |
| Device | T4 GPU |
|
| 477 |
|
|
@@ -487,11 +464,11 @@ model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
|
|
| 487 |
|
| 488 |
readme_path = "/content/README_model.md"
|
| 489 |
with open(readme_path, "w") as f:
|
| 490 |
-
f.write(
|
| 491 |
|
| 492 |
candidates = [
|
| 493 |
-
("/content/
|
| 494 |
-
("/content/
|
| 495 |
("/content/reward_curve.png", "reward_curve.png"),
|
| 496 |
("/content/demo/assets/reward_curve.json", "reward_curve.json"),
|
| 497 |
("/content/logs/training_log.txt", "training_log.txt"),
|
|
@@ -500,14 +477,11 @@ candidates = [
|
|
| 500 |
|
| 501 |
ops = [
|
| 502 |
CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
|
| 503 |
-
for src, dst in candidates
|
| 504 |
-
if os.path.exists(src)
|
| 505 |
]
|
| 506 |
|
| 507 |
api.create_commit(
|
| 508 |
-
repo_id=HF_REPO,
|
| 509 |
-
repo_type="model",
|
| 510 |
-
operations=ops,
|
| 511 |
commit_message="Add trained SpindleFlow RL policy (Colab T4)",
|
| 512 |
token=HF_TOKEN,
|
| 513 |
)
|
|
@@ -515,10 +489,8 @@ api.create_commit(
|
|
| 515 |
_tlog(f"Uploaded {len(ops)} files:")
|
| 516 |
for src, dst in candidates:
|
| 517 |
if os.path.exists(src):
|
| 518 |
-
_tlog(f"
|
| 519 |
-
|
| 520 |
-
_tlog(f"
|
| 521 |
-
_tlog(f"
|
| 522 |
-
|
| 523 |
-
_tlog(f"Reward (final) : {final_mean:+.4f}")
|
| 524 |
-
_tlog(f"Improvement : {final_mean - early_mean:+.4f}")
|
|
|
|
| 1 |
# ============================================================
|
| 2 |
+
# SpindleFlow RL β Colab Training Script
|
|
|
|
| 3 |
#
|
| 4 |
+
# STEP 0 β Before running anything:
|
| 5 |
+
# Runtime β Change runtime type β T4 GPU
|
|
|
|
|
|
|
|
|
|
| 6 |
#
|
| 7 |
+
# STEP 1 β Add secrets (key icon in left sidebar):
|
| 8 |
+
# HF_TOKEN = hf_xxxx (write token from hf.co/settings/tokens)
|
| 9 |
+
# OPENAI_API_KEY = sk-xxxx (needed for task generation + finetuner)
|
| 10 |
+
# Toggle "Notebook access" ON for both.
|
| 11 |
+
#
|
| 12 |
+
# STEP 2 β Create a new notebook, paste each CELL block below
|
| 13 |
+
# into a separate code cell, run top to bottom.
|
| 14 |
# ============================================================
|
| 15 |
|
| 16 |
|
| 17 |
# ============================================================
|
| 18 |
+
# CELL 1 β Install packages + clone repo
|
| 19 |
# ============================================================
|
| 20 |
+
import subprocess, os, sys
|
| 21 |
+
|
| 22 |
+
print(f"Python {sys.version}")
|
| 23 |
|
| 24 |
+
# audioop-lts is for Python 3.13+ only β Colab runs 3.12
|
| 25 |
+
packages = [
|
| 26 |
"openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
|
| 27 |
"sentence-transformers", "openai", "pyyaml", "trl",
|
| 28 |
+
"transformers", "datasets", "torch", "matplotlib", "huggingface_hub",
|
| 29 |
+
]
|
| 30 |
+
if sys.version_info >= (3, 13):
|
| 31 |
+
packages.append("audioop-lts")
|
| 32 |
+
|
| 33 |
+
result = subprocess.run(["pip", "install"] + packages, capture_output=True, text=True)
|
| 34 |
+
if result.returncode != 0:
|
| 35 |
+
print(result.stdout[-3000:])
|
| 36 |
+
print(result.stderr[-3000:])
|
| 37 |
+
raise RuntimeError("pip install failed β see output above")
|
| 38 |
print("Packages OK")
|
| 39 |
|
| 40 |
REPO = "/content/kuchbhi/spindleflow-rl"
|
| 41 |
if not os.path.isdir(REPO):
|
| 42 |
+
subprocess.run(["git", "clone",
|
| 43 |
+
"https://github.com/garvitsachdevaa/kuchbhi.git"],
|
| 44 |
+
cwd="/content", check=True)
|
|
|
|
| 45 |
print("Repo cloned")
|
| 46 |
else:
|
| 47 |
+
subprocess.run(["git", "pull"], cwd=REPO, check=True)
|
| 48 |
+
print("Repo updated")
|
| 49 |
|
| 50 |
os.chdir(REPO)
|
| 51 |
sys.path.insert(0, ".")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
os.makedirs("/content/demo/assets", exist_ok=True)
|
| 53 |
+
os.makedirs("/content/data", exist_ok=True)
|
| 54 |
os.makedirs("/content/checkpoints", exist_ok=True)
|
| 55 |
+
os.makedirs("/content/logs", exist_ok=True)
|
| 56 |
+
|
| 57 |
+
import importlib.metadata
|
| 58 |
+
print(f"OpenEnv : {importlib.metadata.version('openenv')}")
|
| 59 |
+
print(f"CWD : {os.getcwd()}")
|
| 60 |
+
print("CELL 1 done")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ============================================================
|
| 64 |
+
# CELL 2 β Load secrets
|
| 65 |
+
# ============================================================
|
| 66 |
+
import os
|
| 67 |
+
from google.colab import userdata
|
| 68 |
+
|
| 69 |
+
HF_TOKEN = userdata.get("HF_TOKEN")
|
| 70 |
+
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
|
| 71 |
+
|
| 72 |
+
if not HF_TOKEN:
|
| 73 |
+
raise RuntimeError(
|
| 74 |
+
"HF_TOKEN missing.\n"
|
| 75 |
+
"Key icon β Add secret β Name: HF_TOKEN, Value: hf_xxxx, enable notebook access."
|
| 76 |
+
)
|
| 77 |
+
if not OPENAI_API_KEY:
|
| 78 |
+
raise RuntimeError(
|
| 79 |
+
"OPENAI_API_KEY missing.\n"
|
| 80 |
+
"Key icon β Add secret β Name: OPENAI_API_KEY, Value: sk-xxxx, enable notebook access."
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
|
| 84 |
+
|
| 85 |
+
print(f"HF_TOKEN : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}")
|
| 86 |
+
print(f"OPENAI_API_KEY : {OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:]}")
|
| 87 |
+
print("CELL 2 done")
|
| 88 |
|
| 89 |
|
| 90 |
# ============================================================
|
| 91 |
# CELL 3 β Patch env + smoke test
|
| 92 |
# ============================================================
|
|
|
|
|
|
|
| 93 |
import os as _os
|
| 94 |
+
import numpy as np
|
| 95 |
+
from env.spindleflow_env import SpindleFlowEnv
|
| 96 |
|
| 97 |
+
# simulate_specialists=True β per-step specialist calls use local simulation
|
| 98 |
+
# (fast, no API cost per step). OPENAI_API_KEY still used for task generation
|
| 99 |
+
# and the finetuner that fires every 100 episodes.
|
| 100 |
if not getattr(SpindleFlowEnv, "_simulate_patched", False):
|
| 101 |
_orig_init = SpindleFlowEnv.__init__
|
| 102 |
|
|
|
|
| 120 |
|
| 121 |
SpindleFlowEnv._call_specialist = _new_call
|
| 122 |
SpindleFlowEnv._simulate_patched = True
|
| 123 |
+
print("SpindleFlowEnv patched")
|
|
|
|
|
|
|
| 124 |
|
| 125 |
env = SpindleFlowEnv(
|
| 126 |
config_path="configs/training_config.yaml",
|
|
|
|
| 130 |
simulate_specialists=True,
|
| 131 |
)
|
| 132 |
obs, info = env.reset()
|
| 133 |
+
print(f"obs shape : {obs.shape}")
|
| 134 |
+
print(f"task : {info['task'][:80]}")
|
| 135 |
+
|
| 136 |
+
_, reward, _, _, info2 = env.step(env.action_space.sample())
|
| 137 |
+
print(f"reward : {reward:.4f}")
|
| 138 |
+
print(f"action : {info2['action_name']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
env.close()
|
| 140 |
+
print("CELL 3 done β environment OK")
|
| 141 |
|
| 142 |
|
| 143 |
# ============================================================
|
| 144 |
+
# CELL 4 β TRL check (hackathon requirement)
|
| 145 |
# ============================================================
|
| 146 |
import trl, torch
|
| 147 |
|
| 148 |
+
print(f"TRL : {trl.__version__}")
|
| 149 |
+
print(f"Torch : {torch.__version__}")
|
| 150 |
+
print(f"CUDA : {torch.cuda.is_available()}")
|
| 151 |
+
if torch.cuda.is_available():
|
| 152 |
+
print(f"GPU : {torch.cuda.get_device_name(0)}")
|
| 153 |
|
|
|
|
| 154 |
for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
|
| 155 |
+
if getattr(trl, _name, None):
|
| 156 |
+
print(f"TRL config class: {_name}")
|
|
|
|
| 157 |
break
|
|
|
|
|
|
|
|
|
|
| 158 |
else:
|
| 159 |
+
print("TRL imported (TrainingArguments-based version)")
|
| 160 |
|
| 161 |
+
print("CELL 4 done β TRL requirement satisfied")
|
| 162 |
|
| 163 |
|
| 164 |
# ============================================================
|
| 165 |
+
# CELL 5 β Train RecurrentPPO (LSTM PPO)
|
| 166 |
+
#
|
| 167 |
+
# Per-step calls : local simulation (~0.001 s/step, no API cost)
|
| 168 |
+
# Task generation : GPT-4o-mini via OPENAI_API_KEY (diverse tasks)
|
| 169 |
+
# Finetuner : fires every 100 episodes via OPENAI_API_KEY
|
| 170 |
+
# Reward baseline : GPT-4o-mini via OPENAI_API_KEY (quality signal)
|
| 171 |
#
|
| 172 |
+
# Expected: ~20-25 min on T4 GPU for 100k steps / ~10k episodes
|
|
|
|
|
|
|
| 173 |
# ============================================================
|
| 174 |
+
import time, yaml, torch, numpy as np
|
| 175 |
from sb3_contrib import RecurrentPPO
|
| 176 |
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
| 177 |
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
|
| 178 |
from policy.lstm_policy import build_policy_kwargs
|
| 179 |
from training.curriculum import CurriculumManager
|
| 180 |
from training.specialist_improvement_callback import SpecialistImprovementCallback
|
|
|
|
| 181 |
|
| 182 |
_LOG_FILE = "/content/logs/training_log.txt"
|
| 183 |
|
| 184 |
+
def _tlog(msg):
|
| 185 |
+
line = f"[{time.strftime('%H:%M:%S')}] {msg}"
|
|
|
|
| 186 |
print(line, flush=True)
|
| 187 |
+
with open(_LOG_FILE, "a") as f:
|
| 188 |
+
f.write(line + "\n")
|
| 189 |
|
| 190 |
with open("configs/training_config.yaml") as f:
|
| 191 |
_cfg = yaml.safe_load(f)
|
| 192 |
|
| 193 |
+
TOTAL_TIMESTEPS = 100_000
|
| 194 |
curriculum = CurriculumManager(config_path="configs/training_config.yaml")
|
| 195 |
|
|
|
|
|
|
|
| 196 |
|
| 197 |
class RewardLogger(BaseCallback):
|
| 198 |
+
def __init__(self, curriculum):
|
| 199 |
super().__init__()
|
| 200 |
+
self.episode_rewards = []
|
| 201 |
+
self._running = 0.0
|
| 202 |
self._curriculum = curriculum
|
| 203 |
|
| 204 |
+
def _on_step(self):
|
| 205 |
+
for r, d in zip(self.locals.get("rewards", []),
|
| 206 |
+
self.locals.get("dones", [])):
|
|
|
|
|
|
|
| 207 |
self._running += float(r)
|
| 208 |
if d:
|
| 209 |
ep = self._running
|
|
|
|
| 212 |
advanced = self._curriculum.on_episode_end(ep)
|
| 213 |
n = len(self.episode_rewards)
|
| 214 |
if advanced or n % 50 == 0:
|
| 215 |
+
_tlog(f"Ep {n:5d} | reward {ep:+.3f} | "
|
| 216 |
+
f"{self._curriculum.progress_str()}")
|
|
|
|
|
|
|
| 217 |
return True
|
| 218 |
|
| 219 |
|
|
|
|
| 254 |
device="cuda" if torch.cuda.is_available() else "cpu",
|
| 255 |
)
|
| 256 |
|
| 257 |
+
_tlog(f"Device : {model.device}")
|
| 258 |
+
_tlog(f"Timesteps : {TOTAL_TIMESTEPS:,}")
|
| 259 |
+
_tlog(f"Curriculum : Phase {curriculum.current_phase} β {curriculum.progress_str()}")
|
| 260 |
+
_tlog("Training started...")
|
| 261 |
|
| 262 |
+
reward_logger = RewardLogger(curriculum)
|
| 263 |
+
checkpoint_cb = CheckpointCallback(save_freq=10_000,
|
| 264 |
+
save_path="/content/checkpoints/")
|
| 265 |
improvement_cb = SpecialistImprovementCallback(
|
| 266 |
improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
|
| 267 |
+
"improve_every_n_episodes", 100),
|
|
|
|
| 268 |
verbose=1,
|
| 269 |
)
|
| 270 |
|
|
|
|
| 275 |
)
|
| 276 |
_elapsed = time.time() - _t0
|
| 277 |
|
| 278 |
+
model.save("/content/spindleflow_model")
|
| 279 |
+
vec_env.save("/content/vec_normalize.pkl")
|
| 280 |
|
| 281 |
+
_tlog(f"Done in {_elapsed/60:.1f} min")
|
| 282 |
+
_tlog(f"Episodes : {len(reward_logger.episode_rewards)}")
|
| 283 |
+
_tlog(f"Curriculum final: {curriculum.progress_str()}")
|
| 284 |
+
print("CELL 5 done β model saved")
|
| 285 |
|
| 286 |
|
| 287 |
# ============================================================
|
| 288 |
+
# CELL 6 β Reward curve
|
| 289 |
# ============================================================
|
| 290 |
+
import json, numpy as np, matplotlib
|
|
|
|
| 291 |
matplotlib.use("Agg")
|
| 292 |
import matplotlib.pyplot as plt
|
|
|
|
| 293 |
|
| 294 |
ep_rewards = reward_logger.episode_rewards
|
| 295 |
if not ep_rewards:
|
| 296 |
+
raise RuntimeError("No episodes completed β recheck Cell 5")
|
|
|
|
| 297 |
|
| 298 |
+
n_ep = len(ep_rewards)
|
| 299 |
+
episodes = list(range(n_ep))
|
| 300 |
+
window = max(30, n_ep // 20)
|
| 301 |
|
| 302 |
smoothed = [
|
| 303 |
float(np.mean(ep_rewards[max(0, i - window):i + 1]))
|
| 304 |
for i in range(n_ep)
|
| 305 |
]
|
| 306 |
|
| 307 |
+
early_mean = float(np.mean(ep_rewards[:min(50, n_ep)]))
|
| 308 |
+
final_mean = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
|
| 309 |
+
improvement = final_mean - early_mean
|
| 310 |
|
| 311 |
+
# JSON for HF Space demo tab
|
| 312 |
+
step = max(1, n_ep // 300)
|
| 313 |
+
with open("/content/demo/assets/reward_curve.json", "w") as f:
|
| 314 |
+
json.dump({"episodes": episodes[::step],
|
| 315 |
+
"mean_rewards": smoothed[::step]}, f)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
+
# Plot
|
| 318 |
fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
|
| 319 |
fig.patch.set_facecolor("#0d1117")
|
| 320 |
ax.set_facecolor("#161b22")
|
| 321 |
|
| 322 |
+
every = max(1, n_ep // 800)
|
| 323 |
+
ax.scatter(episodes[::every], ep_rewards[::every],
|
| 324 |
+
s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward")
|
| 325 |
+
ax.plot(episodes[::every], smoothed[::every],
|
| 326 |
+
linewidth=2.5, color="#ff6b35", zorder=3,
|
| 327 |
+
label=f"Smoothed ({window}-ep mean)")
|
| 328 |
+
ax.axhline(y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2,
|
| 329 |
+
alpha=0.75, label=f"Early baseline {early_mean:+.3f}")
|
| 330 |
+
ax.axhline(y=final_mean, color="#34d399", linestyle="--", linewidth=1.2,
|
| 331 |
+
alpha=0.85, label=f"Final mean {final_mean:+.3f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
|
| 334 |
+
ax.set_ylabel("Reward", color="#c9d1d9", fontsize=12)
|
| 335 |
ax.set_title(
|
| 336 |
"SpindleFlow RL β Delegation Policy Learning Curve\n"
|
| 337 |
f"RecurrentPPO Β· LSTM Β· {TOTAL_TIMESTEPS:,} steps Β· {n_ep:,} episodes",
|
| 338 |
color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
|
| 339 |
)
|
| 340 |
ax.tick_params(colors="#8b949e")
|
| 341 |
+
for s in ax.spines.values():
|
| 342 |
+
s.set_edgecolor("#30363d")
|
| 343 |
ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
|
| 344 |
+
ax.legend(fontsize=10, framealpha=0.85,
|
| 345 |
+
facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9")
|
| 346 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
sign = "β²" if improvement >= 0 else "βΌ"
|
| 348 |
+
ax.annotate(f" {sign} {abs(improvement):.3f} improvement",
|
| 349 |
+
xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
|
| 350 |
+
color="#f0f6fc", fontsize=10, fontstyle="italic")
|
|
|
|
|
|
|
| 351 |
|
| 352 |
fig.tight_layout()
|
| 353 |
+
fig.savefig("/content/reward_curve.png", dpi=180, bbox_inches="tight",
|
| 354 |
+
facecolor=fig.get_facecolor())
|
| 355 |
plt.show()
|
|
|
|
| 356 |
|
| 357 |
+
_tlog(f"Curve: early={early_mean:+.4f} final={final_mean:+.4f} "
|
| 358 |
+
f"improvement={improvement:+.4f}")
|
| 359 |
+
print(f"\nEpisodes : {n_ep:,}")
|
| 360 |
+
print(f"Improvement: {improvement:+.4f}")
|
| 361 |
+
print("CELL 6 done β reward curve saved")
|
|
|
|
|
|
|
| 362 |
|
| 363 |
|
| 364 |
# ============================================================
|
| 365 |
# CELL 7 β Learning features audit
|
| 366 |
# ============================================================
|
| 367 |
+
import json
|
| 368 |
from pathlib import Path
|
| 369 |
|
| 370 |
+
print("="*52)
|
| 371 |
print("LEARNING FEATURES AUDIT")
|
| 372 |
+
print("="*52)
|
| 373 |
|
| 374 |
+
print(f"\nFeature 5 β Curriculum")
|
| 375 |
+
print(f" Phase : {curriculum.current_phase}/3")
|
| 376 |
+
print(f" Rolling mean : {curriculum.rolling_mean():.3f}")
|
| 377 |
print(f" {curriculum.progress_str()}")
|
| 378 |
|
| 379 |
mem_path = Path(_cfg.get("specialist_improvement", {}).get(
|
| 380 |
+
"memory_path", "data/specialist_memory.json"))
|
|
|
|
| 381 |
print(f"\nFeature 2 β Specialist memory ({mem_path})")
|
| 382 |
if mem_path.exists():
|
| 383 |
data = json.loads(mem_path.read_text())
|
| 384 |
+
total = sum(len(v) for v in data.values())
|
| 385 |
+
print(f" {len(data)} specialists, {total} total entries")
|
|
|
|
| 386 |
for sid, entries in list(data.items())[:3]:
|
| 387 |
avg = sum(e["reward"] for e in entries) / len(entries)
|
| 388 |
+
print(f" {sid}: {len(entries)} entries, avg={avg:.3f}")
|
| 389 |
else:
|
| 390 |
+
print(" No file yet (finetuner fires after 100 episodes)")
|
| 391 |
|
| 392 |
spawn_path = Path(_cfg.get("environment", {}).get(
|
| 393 |
+
"spawn_memory_path", "data/spawn_memory.jsonl"))
|
|
|
|
| 394 |
print(f"\nFeature 3 β Spawn memory ({spawn_path})")
|
| 395 |
if spawn_path.exists():
|
| 396 |
lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
|
| 397 |
+
print(f" {len(lines)} spawn records")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
else:
|
| 399 |
+
print(" No file yet")
|
| 400 |
|
| 401 |
res_path = Path(_cfg.get("agents", {}).get(
|
| 402 |
+
"resolution_memory_path", "data/resolution_memory.jsonl"))
|
|
|
|
| 403 |
print(f"\nFeature 4 β Resolution bandit ({res_path})")
|
| 404 |
if res_path.exists():
|
| 405 |
lines = [l for l in res_path.read_text().splitlines() if l.strip()]
|
| 406 |
+
print(f" {len(lines)} outcome records")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
else:
|
| 408 |
+
print(" No file yet")
|
| 409 |
|
| 410 |
+
print("\n" + "="*52)
|
| 411 |
+
print("CELL 7 done")
|
|
|
|
| 412 |
|
| 413 |
|
| 414 |
# ============================================================
|
| 415 |
+
# CELL 8 β Push to HuggingFace Hub
|
|
|
|
|
|
|
| 416 |
# ============================================================
|
| 417 |
+
import os, numpy as np
|
| 418 |
from huggingface_hub import HfApi, CommitOperationAdd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
HF_REPO = "garvitsachdeva/spindleflow-rl"
|
| 421 |
api = HfApi(token=HF_TOKEN)
|
|
|
|
| 423 |
_tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
|
| 424 |
api.create_repo(repo_id=HF_REPO.split("/")[-1], repo_type="model", exist_ok=True)
|
| 425 |
|
| 426 |
+
ep = reward_logger.episode_rewards
|
| 427 |
+
readme = f"""---
|
|
|
|
|
|
|
|
|
|
| 428 |
license: mit
|
| 429 |
tags:
|
| 430 |
- reinforcement-learning
|
|
|
|
| 438 |
|
| 439 |
# SpindleFlow RL β Delegation Policy
|
| 440 |
|
| 441 |
+
LSTM PPO (RecurrentPPO) trained on SpindleFlow-v0 (OpenEnv). Colab T4 GPU.
|
|
|
|
| 442 |
|
| 443 |
## Training summary
|
| 444 |
| Metric | Value |
|
| 445 |
|---|---|
|
| 446 |
| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
|
| 447 |
| Total timesteps | {TOTAL_TIMESTEPS:,} |
|
| 448 |
+
| Episodes | {len(ep):,} |
|
| 449 |
+
| Early baseline (first 50 ep) | {early_mean:.4f} |
|
| 450 |
+
| Final mean (last 200 ep) | {final_mean:.4f} |
|
| 451 |
+
| Improvement | {improvement:+.4f} |
|
| 452 |
| Training time | {_elapsed/60:.1f} min |
|
| 453 |
| Device | T4 GPU |
|
| 454 |
|
|
|
|
| 464 |
|
| 465 |
readme_path = "/content/README_model.md"
|
| 466 |
with open(readme_path, "w") as f:
|
| 467 |
+
f.write(readme)
|
| 468 |
|
| 469 |
candidates = [
|
| 470 |
+
("/content/spindleflow_model.zip", "spindleflow_model.zip"),
|
| 471 |
+
("/content/vec_normalize.pkl", "vec_normalize.pkl"),
|
| 472 |
("/content/reward_curve.png", "reward_curve.png"),
|
| 473 |
("/content/demo/assets/reward_curve.json", "reward_curve.json"),
|
| 474 |
("/content/logs/training_log.txt", "training_log.txt"),
|
|
|
|
| 477 |
|
| 478 |
ops = [
|
| 479 |
CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
|
| 480 |
+
for src, dst in candidates if os.path.exists(src)
|
|
|
|
| 481 |
]
|
| 482 |
|
| 483 |
api.create_commit(
|
| 484 |
+
repo_id=HF_REPO, repo_type="model", operations=ops,
|
|
|
|
|
|
|
| 485 |
commit_message="Add trained SpindleFlow RL policy (Colab T4)",
|
| 486 |
token=HF_TOKEN,
|
| 487 |
)
|
|
|
|
| 489 |
_tlog(f"Uploaded {len(ops)} files:")
|
| 490 |
for src, dst in candidates:
|
| 491 |
if os.path.exists(src):
|
| 492 |
+
_tlog(f" {dst}")
|
| 493 |
+
|
| 494 |
+
_tlog(f"Model live : https://huggingface.co/{HF_REPO}")
|
| 495 |
+
_tlog(f"Log : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
|
| 496 |
+
print("CELL 8 done β all done!")
|
|
|
|
|
|