Spaces:

garvitsachdeva
/

SpindleFlow-RL

Runtime error

File size: 17,136 Bytes

02ff91f
956acce
02ff91f
3bce6e3
 
 
 
 
 
 
 
 
c2b373f
 
02ff91f
 
3bce6e3
02ff91f
956acce
 
175b600
02ff91f
956acce
02ff91f
 
175b600
 
956acce
 
 
 
175b600
 
 
 
956acce
 
175b600
 
02ff91f
21923ce
3bce6e3
 
 
175b600
02ff91f
 
956acce
 
02ff91f
 
175b600
3bce6e3
175b600
 
 
956acce
175b600
 
956acce
 
 
175b600
956acce
 
175b600
 
 
 
 
 
 
956acce
 
 
175b600
 
 
956acce
 
175b600
956acce
175b600
956acce
 
 
175b600
 
02ff91f
c2b373f
02ff91f
c2b373f
02ff91f
 
956acce
 
02ff91f
3bce6e3
 
02ff91f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bce6e3
 
02ff91f
 
 
 
 
 
 
956acce
02ff91f
 
 
 
 
 
 
 
 
956acce
 
 
 
 
 
02ff91f
3bce6e3
02ff91f
c2b373f
02ff91f
956acce
02ff91f
 
 
956acce
 
 
 
 
02ff91f
 
956acce
3bce6e3
02ff91f
 
956acce
02ff91f
3bce6e3
02ff91f
c2b373f
02ff91f
956acce
 
3bce6e3
 
 
 
02ff91f
3bce6e3
02ff91f
956acce
02ff91f
 
 
 
 
 
 
c2b373f
 
956acce
 
c2b373f
956acce
 
c2b373f
02ff91f
 
 
956acce
02ff91f
 
 
c2b373f
956acce
02ff91f
956acce
 
02ff91f
 
956acce
 
 
02ff91f
 
c2b373f
 
02ff91f
c2b373f
02ff91f
c2b373f
956acce
 
02ff91f
 
 
 
 
 
 
 
 
c2b373f
02ff91f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bce6e3
 
 
956acce
02ff91f
956acce
 
 
02ff91f
 
956acce
02ff91f
 
 
c2b373f
02ff91f
c2b373f
02ff91f
 
c2b373f
02ff91f
956acce
 
c2b373f
956acce
3bce6e3
956acce
3bce6e3
c2b373f
02ff91f
 
956acce
02ff91f
956acce
c2b373f
02ff91f
 
 
 
3bce6e3
02ff91f
956acce
 
3bce6e3
02ff91f
 
c2b373f
 
02ff91f
 
956acce
 
 
c2b373f
956acce
 
 
 
 
c2b373f
956acce
c2b373f
 
 
 
956acce
 
 
 
 
 
 
 
 
 
c2b373f
 
956acce
c2b373f
 
 
 
 
 
3bce6e3
 
c2b373f
956acce
 
c2b373f
 
956acce
 
 
c2b373f
 
956acce
 
02ff91f
c2b373f
3bce6e3
956acce
3bce6e3
956acce
3bce6e3
c2b373f
02ff91f
 
c2b373f
02ff91f
956acce
02ff91f
 
3bce6e3
02ff91f
3bce6e3
02ff91f
3bce6e3
956acce
 
02ff91f
 
 
956acce
02ff91f
 
 
956acce
3bce6e3
02ff91f
 
956acce
02ff91f
3bce6e3
02ff91f
 
956acce
02ff91f
 
 
956acce
3bce6e3
 
 
02ff91f
956acce
02ff91f
 
956acce
02ff91f
 
 
956acce
02ff91f
956acce
02ff91f
3bce6e3
 
8e0fa29
c2b373f
8e0fa29
956acce
8e0fa29
956acce
8e0fa29
 
aa154f2
 
8e0fa29
 
c2b373f
aa154f2
8e0fa29
956acce
 
8e0fa29
 
 
 
 
 
 
 
 
 
 
 
 
956acce
8e0fa29
 
 
 
 
c2b373f
956acce
 
 
 
c2b373f
 
8e0fa29
 
 
 
 
 
 
 
 
 
 
 
 
956acce
8e0fa29
 
3bce6e3
 
 
 
 
 
8e0fa29
 
 
 
956acce
8e0fa29
 
 
956acce
8e0fa29
 
 
 
c2b373f
 
 
956acce
 
 
3bce6e3

# ============================================================
# SpindleFlow RL — Colab Training Script
#
# BEFORE ANYTHING:
#   1. Runtime → Change runtime type → T4 GPU
#   2. Key icon (left sidebar) → Manage secrets → add:
#        HF_TOKEN       = hf_xxxx  (write token: hf.co/settings/tokens)
#        OPENAI_API_KEY = sk-xxxx
#      Toggle "Notebook access" ON for both.
#   3. Create a new Colab notebook.
#   4. Copy each CELL block below into its own code cell.
#   5. Run cells top to bottom, one at a time.
# ============================================================


# ============================================================
# CELL 1 — Install packages + clone/update repo
# ============================================================
import subprocess, os, sys

print(f"Python {sys.version}")

packages = [
    "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
    "sentence-transformers", "openai", "pyyaml", "trl",
    "transformers", "datasets", "torch", "matplotlib",
    "huggingface_hub", "python-dotenv",
]
if sys.version_info >= (3, 13):
    packages.append("audioop-lts")

result = subprocess.run(
    ["pip", "install", "-q"] + packages,
    capture_output=True, text=True
)
if result.returncode != 0:
    print(result.stderr[-3000:])
    raise RuntimeError("pip install failed")
print("Packages OK")

REPO = "/content/kuchbhi"
GIT_URL = "https://github.com/garvitsachdevaa/kuchbhi.git"

if not os.path.isdir(os.path.join(REPO, ".git")):
    subprocess.run(["git", "clone", "--depth=1", GIT_URL], cwd="/content", check=True)
    print("Repo cloned")
else:
    subprocess.run(["git", "pull"], cwd=REPO, check=True)
    print("Repo updated")

os.chdir(REPO)
sys.path.insert(0, REPO)

for d in ["/content/demo/assets", "/content/data",
          "/content/checkpoints", "/content/logs"]:
    os.makedirs(d, exist_ok=True)

print(f"CWD: {os.getcwd()}")
print("CELL 1 done ✓")


# ============================================================
# CELL 2 — Load secrets (with clear error messages)
# ============================================================
import os
try:
    from google.colab import userdata
    HF_TOKEN       = userdata.get("HF_TOKEN")
    OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
except Exception:
    HF_TOKEN       = ""
    OPENAI_API_KEY = ""

if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN not found.\n"
        "Click the 🔑 icon → Add secret → Name: HF_TOKEN → toggle Notebook access ON\n"
        "Then Runtime → Restart and run all."
    )
if not OPENAI_API_KEY:
    print("⚠️  No OPENAI_API_KEY — simulation mode (no LLM calls, faster training)")

os.environ["HF_TOKEN"]       = HF_TOKEN
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print(f"HF_TOKEN       : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}")
print(f"OPENAI_API_KEY : {'set' if OPENAI_API_KEY else 'NOT SET — simulation mode'}")
print("CELL 2 done ✓")


# ============================================================
# CELL 3 — Patch env + smoke test
# ============================================================
import os as _os
import numpy as np
from env.spindleflow_env import SpindleFlowEnv

# Adds simulate_specialists kwarg so per-step calls stay local/fast.
# OPENAI_API_KEY is still active for task generation + finetuner.
if not getattr(SpindleFlowEnv, "_simulate_patched", False):
    _orig_init = SpindleFlowEnv.__init__

    def _new_init(self, *args, simulate_specialists=False, **kwargs):
        _orig_init(self, *args, **kwargs)
        self.simulate_specialists = simulate_specialists

    SpindleFlowEnv.__init__ = _new_init

    _orig_call = SpindleFlowEnv._call_specialist

    def _new_call(self, specialist_id, task, elapsed_ms, context=None):
        if getattr(self, "simulate_specialists", False):
            _key = _os.environ.pop("OPENAI_API_KEY", None)
            try:
                return _orig_call(self, specialist_id, task,
                                  elapsed_ms, context=context)
            finally:
                if _key:
                    _os.environ["OPENAI_API_KEY"] = _key
        return _orig_call(self, specialist_id, task, elapsed_ms, context=context)

    SpindleFlowEnv._call_specialist = _new_call
    SpindleFlowEnv._simulate_patched = True
    print("SpindleFlowEnv patched")

env = SpindleFlowEnv(
    config_path="configs/training_config.yaml",
    catalog_path="configs/specialist_catalog.yaml",
    use_real_spindleflow=False,
    phase=1,
    simulate_specialists=True,
)
obs, info = env.reset()
print(f"obs shape : {obs.shape}")
print(f"task      : {info['task'][:80]}")

_, reward, _, _, info2 = env.step(env.action_space.sample())
print(f"reward    : {reward:.4f}")
print(f"action    : {info2['action_name']}")
env.close()
print("\nCELL 3 done ✓ — environment OK")


# ============================================================
# CELL 4 — TRL check (hackathon requirement)
# ============================================================
import trl, torch

print(f"TRL   : {trl.__version__}")
print(f"Torch : {torch.__version__}")
print(f"CUDA  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU   : {torch.cuda.get_device_name(0)}")

for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
    if getattr(trl, _name, None):
        print(f"TRL config: {_name}")
        break
else:
    print("TRL imported (TrainingArguments-based version)")

print("\nCELL 4 done ✓ — TRL requirement satisfied")


# ============================================================
# CELL 5 — Train RecurrentPPO (LSTM PPO)
#
# Per-step specialist calls : local simulation (no API cost/latency)
# Task generation           : GPT-4o-mini via OPENAI_API_KEY
# Finetuner                 : fires every 100 episodes
# Reward baseline           : GPT-4o-mini via OPENAI_API_KEY
#
# Expected runtime: ~20–25 min on T4 for 100k steps (~10k episodes)
# ============================================================
import time, yaml, torch, numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from policy.lstm_policy import build_policy_kwargs
from training.curriculum import CurriculumManager
from training.specialist_improvement_callback import SpecialistImprovementCallback

_LOG_FILE = "/content/logs/training_log.txt"

def _tlog(msg):
    line = f"[{time.strftime('%H:%M:%S')}] {msg}"
    print(line, flush=True)
    with open(_LOG_FILE, "a") as f:
        f.write(line + "\n")

with open("configs/training_config.yaml") as f:
    _cfg = yaml.safe_load(f)

TOTAL_TIMESTEPS = 100_000
curriculum = CurriculumManager(config_path="configs/training_config.yaml")


class RewardLogger(BaseCallback):
    def __init__(self, curriculum):
        super().__init__()
        self.episode_rewards = []
        self._running = 0.0
        self._curriculum = curriculum

    def _on_step(self):
        for r, d in zip(self.locals.get("rewards", []),
                        self.locals.get("dones",   [])):
            self._running += float(r)
            if d:
                ep = self._running
                self.episode_rewards.append(ep)
                self._running = 0.0
                advanced = self._curriculum.on_episode_end(ep)
                n = len(self.episode_rewards)
                if advanced or n % 50 == 0:
                    _tlog(f"Ep {n:5d} | reward {ep:+.3f} | "
                          f"{self._curriculum.progress_str()}")
        return True


def make_env():
    return SpindleFlowEnv(
        config_path="configs/training_config.yaml",
        catalog_path="configs/specialist_catalog.yaml",
        use_real_spindleflow=False,
        phase=1,
        simulate_specialists=True,
    )


vec_env = DummyVecEnv([make_env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

_ppo  = _cfg.get("ppo",  {})
_lstm = _cfg.get("lstm", {})

model = RecurrentPPO(
    policy="MlpLstmPolicy",
    env=vec_env,
    learning_rate=float(_ppo.get("learning_rate", 3e-4)),
    n_steps=int(_ppo.get("n_steps", 512)),
    batch_size=int(_ppo.get("batch_size", 64)),
    n_epochs=int(_ppo.get("n_epochs", 10)),
    gamma=float(_ppo.get("gamma", 0.99)),
    gae_lambda=float(_ppo.get("gae_lambda", 0.95)),
    clip_range=float(_ppo.get("clip_range", 0.2)),
    ent_coef=float(_ppo.get("ent_coef", 0.01)),
    vf_coef=float(_ppo.get("vf_coef", 0.5)),
    max_grad_norm=float(_ppo.get("max_grad_norm", 0.5)),
    policy_kwargs=build_policy_kwargs(
        hidden_size=int(_lstm.get("hidden_size", 256))
    ),
    verbose=0,
    seed=int(_cfg.get("training", {}).get("seed", 42)),
    device="cuda" if torch.cuda.is_available() else "cpu",
)

_tlog(f"Device     : {model.device}")
_tlog(f"Timesteps  : {TOTAL_TIMESTEPS:,}")
_tlog(f"Curriculum : Phase {curriculum.current_phase} — {curriculum.progress_str()}")
_tlog("Training started...")

reward_logger  = RewardLogger(curriculum)
checkpoint_cb  = CheckpointCallback(save_freq=10_000,
                                    save_path="/content/checkpoints/")
improvement_cb = SpecialistImprovementCallback(
    improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
        "improve_every_n_episodes", 100),
    verbose=1,
)

_t0 = time.time()
model.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    callback=[reward_logger, checkpoint_cb, improvement_cb],
)
_elapsed = time.time() - _t0

model.save("/content/spindleflow_model")
vec_env.save("/content/vec_normalize.pkl")

_tlog(f"Done in {_elapsed/60:.1f} min")
_tlog(f"Episodes        : {len(reward_logger.episode_rewards)}")
_tlog(f"Curriculum final: {curriculum.progress_str()}")
print("\nCELL 5 done ✓ — model saved")


# ============================================================
# CELL 6 — Reward curve
# ============================================================
import json, numpy as np, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

ep_rewards = reward_logger.episode_rewards
if not ep_rewards:
    raise RuntimeError("No episodes recorded — check Cell 5 output for errors")

n_ep     = len(ep_rewards)
episodes = list(range(n_ep))
window   = max(30, n_ep // 20)   # adaptive: ~5% of run

smoothed = [
    float(np.mean(ep_rewards[max(0, i - window):i + 1]))
    for i in range(n_ep)
]

early_mean  = float(np.mean(ep_rewards[:min(50, n_ep)]))
final_mean  = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
improvement = final_mean - early_mean

# JSON for HF Space demo tab
step = max(1, n_ep // 300)
with open("/content/demo/assets/reward_curve.json", "w") as f:
    json.dump({"episodes": episodes[::step],
               "mean_rewards": smoothed[::step]}, f)

# Plot
fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
fig.patch.set_facecolor("#0d1117")
ax.set_facecolor("#161b22")

every = max(1, n_ep // 800)
ax.scatter(episodes[::every], ep_rewards[::every],
           s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward")
ax.plot(episodes[::every], smoothed[::every],
        linewidth=2.5, color="#ff6b35", zorder=3,
        label=f"Smoothed ({window}-ep mean)")
ax.axhline(y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2,
           alpha=0.75, label=f"Early baseline  {early_mean:+.3f}")
ax.axhline(y=final_mean, color="#34d399", linestyle="--", linewidth=1.2,
           alpha=0.85, label=f"Final mean  {final_mean:+.3f}")

ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
ax.set_ylabel("Reward",  color="#c9d1d9", fontsize=12)
ax.set_title(
    "SpindleFlow RL — Delegation Policy Learning Curve\n"
    f"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes",
    color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
)
ax.tick_params(colors="#8b949e")
for sp in ax.spines.values():
    sp.set_edgecolor("#30363d")
ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
ax.legend(fontsize=10, framealpha=0.85,
          facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9")

sign = "▲" if improvement >= 0 else "▼"
ax.annotate(f"  {sign} {abs(improvement):.3f} improvement",
            xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
            color="#f0f6fc", fontsize=10, fontstyle="italic")

fig.tight_layout()
fig.savefig("/content/reward_curve.png", dpi=180, bbox_inches="tight",
            facecolor=fig.get_facecolor())
plt.show()

_tlog(f"Curve: early={early_mean:+.4f}  final={final_mean:+.4f}  "
      f"improvement={improvement:+.4f}")
print(f"Episodes   : {n_ep:,}")
print(f"Improvement: {improvement:+.4f}")
print("\nCELL 6 done ✓ — reward curve saved")


# ============================================================
# CELL 7 — Learning features audit
# ============================================================
import json
from pathlib import Path

print("=" * 52)
print("LEARNING FEATURES AUDIT")
print("=" * 52)

print(f"\nFeature 5 — Curriculum (performance-gated)")
print(f"  Phase        : {curriculum.current_phase}/3")
print(f"  Rolling mean : {curriculum.rolling_mean():.3f}")
print(f"  {curriculum.progress_str()}")

mem_path = Path(_cfg.get("specialist_improvement", {}).get(
    "memory_path", "data/specialist_memory.json"))
print(f"\nFeature 2 — Specialist memory ({mem_path})")
if mem_path.exists():
    data = json.loads(mem_path.read_text())
    total = sum(len(v) for v in data.values())
    print(f"  {len(data)} specialists · {total} total entries")
    for sid, entries in list(data.items())[:3]:
        avg = sum(e["reward"] for e in entries) / len(entries)
        print(f"    {sid}: {len(entries)} entries, avg={avg:.3f}")
else:
    print("  No file yet (finetuner fires after 100 completed episodes)")

spawn_path = Path(_cfg.get("environment", {}).get(
    "spawn_memory_path", "data/spawn_memory.jsonl"))
print(f"\nFeature 3 — Spawn memory ({spawn_path})")
if spawn_path.exists():
    lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
    print(f"  {len(lines)} spawn records")
    for line in lines[:2]:
        rec = json.loads(line)
        print(f"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f}")
else:
    print("  No file yet")

res_path = Path(_cfg.get("agents", {}).get(
    "resolution_memory_path", "data/resolution_memory.jsonl"))
print(f"\nFeature 4 — Resolution bandit ({res_path})")
if res_path.exists():
    lines = [l for l in res_path.read_text().splitlines() if l.strip()]
    print(f"  {len(lines)} outcome records")
else:
    print("  No file yet")

print("\n" + "=" * 52)
print("CELL 7 done ✓")


# ============================================================
# CELL 8 — Push to HuggingFace Hub
# ============================================================
import os, numpy as np
from huggingface_hub import HfApi, CommitOperationAdd

from huggingface_hub import whoami
HF_REPO = f"{whoami(token=HF_TOKEN)['name']}/spindleflow-rl"
api = HfApi(token=HF_TOKEN)

_tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)

ep = reward_logger.episode_rewards
readme = f"""---
license: mit
tags:
  - reinforcement-learning
  - stable-baselines3
  - sb3-contrib
  - gymnasium
  - multi-agent
  - openenv
library_name: stable-baselines3
---

# SpindleFlow RL — Delegation Policy

LSTM PPO (RecurrentPPO) trained on SpindleFlow-v0 (OpenEnv). Colab T4 GPU.

## Training summary
| Metric | Value |
|---|---|
| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
| Total timesteps | {TOTAL_TIMESTEPS:,} |
| Episodes | {len(ep):,} |
| Early baseline (first 50 ep) | {early_mean:.4f} |
| Final mean (last 200 ep) | {final_mean:.4f} |
| Improvement | {improvement:+.4f} |
| Training time | {_elapsed/60:.1f} min |
| Device | T4 GPU |

![Reward Curve](reward_curve.png)

## Load
```python
from sb3_contrib import RecurrentPPO
from huggingface_hub import hf_hub_download
model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
```
"""

readme_path = "/content/README_model.md"
with open(readme_path, "w") as f:
    f.write(readme)

candidates = [
    ("/content/spindleflow_model.zip",          "spindleflow_model.zip"),
    ("/content/vec_normalize.pkl",              "vec_normalize.pkl"),
    ("/content/reward_curve.png",               "reward_curve.png"),
    ("/content/demo/assets/reward_curve.json",  "reward_curve.json"),
    ("/content/logs/training_log.txt",          "training_log.txt"),
    (readme_path,                               "README.md"),
]

ops = [
    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
    for src, dst in candidates if os.path.exists(src)
]

api.create_commit(
    repo_id=HF_REPO, repo_type="model", operations=ops,
    commit_message="Add trained SpindleFlow RL policy (Colab T4)",
    token=HF_TOKEN,
)

_tlog(f"Uploaded {len(ops)} files:")
for src, dst in candidates:
    if os.path.exists(src):
        _tlog(f"  {dst}")
_tlog(f"Model live : https://huggingface.co/{HF_REPO}")
_tlog(f"Log        : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
print("\nCELL 8 done ✓ — all done!")