# ============================================================
# SpindleFlow RL — Colab Training Script
#
# BEFORE ANYTHING:
#   1. Runtime → Change runtime type → T4 GPU
#   2. Key icon (left sidebar) → Manage secrets → add:
#        HF_TOKEN       = hf_xxxx  (write token: hf.co/settings/tokens)
#        OPENAI_API_KEY = sk-xxxx
#      Toggle "Notebook access" ON for both.
#   3. Create a new Colab notebook.
#   4. Copy each CELL block below into its own code cell.
#   5. Run cells top to bottom, one at a time.
# ============================================================


# ============================================================
# CELL 1 — Install packages + clone/update repo
# ============================================================
import subprocess, os, sys

print(f"Python {sys.version}")

packages = [
    "openenv", "stable-baselines3", "sb3-contrib", "gymnasium",
    "sentence-transformers", "openai", "pyyaml", "trl",
    "transformers", "datasets", "torch", "matplotlib",
    "huggingface_hub", "python-dotenv",
]
if sys.version_info >= (3, 13):
    packages.append("audioop-lts")

result = subprocess.run(
    ["pip", "install", "-q"] + packages,
    capture_output=True, text=True
)
if result.returncode != 0:
    print(result.stderr[-3000:])
    raise RuntimeError("pip install failed")
print("Packages OK")

REPO = "/content/kuchbhi"
GIT_URL = "https://github.com/garvitsachdevaa/kuchbhi.git"

if not os.path.isdir(os.path.join(REPO, ".git")):
    subprocess.run(["git", "clone", "--depth=1", GIT_URL], cwd="/content", check=True)
    print("Repo cloned")
else:
    subprocess.run(["git", "pull"], cwd=REPO, check=True)
    print("Repo updated")

os.chdir(REPO)
sys.path.insert(0, REPO)

for d in ["/content/demo/assets", "/content/data",
          "/content/checkpoints", "/content/logs"]:
    os.makedirs(d, exist_ok=True)

print(f"CWD: {os.getcwd()}")
print("CELL 1 done ✓")


# ============================================================
# CELL 2 — Load secrets (with clear error messages)
# ============================================================
import os
try:
    from google.colab import userdata
    HF_TOKEN       = userdata.get("HF_TOKEN")
    OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
except Exception:
    HF_TOKEN       = ""
    OPENAI_API_KEY = ""

if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN not found.\n"
        "Click the 🔑 icon → Add secret → Name: HF_TOKEN → toggle Notebook access ON\n"
        "Then Runtime → Restart and run all."
    )
if not OPENAI_API_KEY:
    print("⚠️  No OPENAI_API_KEY — simulation mode (no LLM calls, faster training)")

os.environ["HF_TOKEN"]       = HF_TOKEN
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print(f"HF_TOKEN       : {HF_TOKEN[:8]}...{HF_TOKEN[-4:]}")
print(f"OPENAI_API_KEY : {'set' if OPENAI_API_KEY else 'NOT SET — simulation mode'}")
print("CELL 2 done ✓")


# ============================================================
# CELL 3 — Patch env + smoke test
# ============================================================
import os as _os
import numpy as np
from env.spindleflow_env import SpindleFlowEnv

# Adds simulate_specialists kwarg so per-step calls stay local/fast.
# OPENAI_API_KEY is still active for task generation + finetuner.
if not getattr(SpindleFlowEnv, "_simulate_patched", False):
    _orig_init = SpindleFlowEnv.__init__

    def _new_init(self, *args, simulate_specialists=False, **kwargs):
        _orig_init(self, *args, **kwargs)
        self.simulate_specialists = simulate_specialists

    SpindleFlowEnv.__init__ = _new_init

    _orig_call = SpindleFlowEnv._call_specialist

    def _new_call(self, specialist_id, task, elapsed_ms, context=None):
        if getattr(self, "simulate_specialists", False):
            _key = _os.environ.pop("OPENAI_API_KEY", None)
            try:
                return _orig_call(self, specialist_id, task,
                                  elapsed_ms, context=context)
            finally:
                if _key:
                    _os.environ["OPENAI_API_KEY"] = _key
        return _orig_call(self, specialist_id, task, elapsed_ms, context=context)

    SpindleFlowEnv._call_specialist = _new_call
    SpindleFlowEnv._simulate_patched = True
    print("SpindleFlowEnv patched")

env = SpindleFlowEnv(
    config_path="configs/training_config.yaml",
    catalog_path="configs/specialist_catalog.yaml",
    use_real_spindleflow=False,
    phase=1,
    simulate_specialists=True,
)
obs, info = env.reset()
print(f"obs shape : {obs.shape}")
print(f"task      : {info['task'][:80]}")

_, reward, _, _, info2 = env.step(env.action_space.sample())
print(f"reward    : {reward:.4f}")
print(f"action    : {info2['action_name']}")
env.close()
print("\nCELL 3 done ✓ — environment OK")


# ============================================================
# CELL 4 — TRL check (hackathon requirement)
# ============================================================
import trl, torch

print(f"TRL   : {trl.__version__}")
print(f"Torch : {torch.__version__}")
print(f"CUDA  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU   : {torch.cuda.get_device_name(0)}")

for _name in ("PPOConfig", "GRPOConfig", "SFTConfig"):
    if getattr(trl, _name, None):
        print(f"TRL config: {_name}")
        break
else:
    print("TRL imported (TrainingArguments-based version)")

print("\nCELL 4 done ✓ — TRL requirement satisfied")


# ============================================================
# CELL 5 — Train RecurrentPPO (LSTM PPO)
#
# Per-step specialist calls : local simulation (no API cost/latency)
# Task generation           : GPT-4o-mini via OPENAI_API_KEY
# Finetuner                 : fires every 100 episodes
# Reward baseline           : GPT-4o-mini via OPENAI_API_KEY
#
# Expected runtime: ~20–25 min on T4 for 100k steps (~10k episodes)
# ============================================================
import time, yaml, torch, numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import CheckpointCallback, BaseCallback
from policy.lstm_policy import build_policy_kwargs
from training.curriculum import CurriculumManager
from training.specialist_improvement_callback import SpecialistImprovementCallback

_LOG_FILE = "/content/logs/training_log.txt"

def _tlog(msg):
    line = f"[{time.strftime('%H:%M:%S')}] {msg}"
    print(line, flush=True)
    with open(_LOG_FILE, "a") as f:
        f.write(line + "\n")

with open("configs/training_config.yaml") as f:
    _cfg = yaml.safe_load(f)

TOTAL_TIMESTEPS = 100_000
curriculum = CurriculumManager(config_path="configs/training_config.yaml")


class RewardLogger(BaseCallback):
    def __init__(self, curriculum):
        super().__init__()
        self.episode_rewards = []
        self._running = 0.0
        self._curriculum = curriculum

    def _on_step(self):
        for r, d in zip(self.locals.get("rewards", []),
                        self.locals.get("dones",   [])):
            self._running += float(r)
            if d:
                ep = self._running
                self.episode_rewards.append(ep)
                self._running = 0.0
                advanced = self._curriculum.on_episode_end(ep)
                n = len(self.episode_rewards)
                if advanced or n % 50 == 0:
                    _tlog(f"Ep {n:5d} | reward {ep:+.3f} | "
                          f"{self._curriculum.progress_str()}")
        return True


def make_env():
    return SpindleFlowEnv(
        config_path="configs/training_config.yaml",
        catalog_path="configs/specialist_catalog.yaml",
        use_real_spindleflow=False,
        phase=1,
        simulate_specialists=True,
    )


vec_env = DummyVecEnv([make_env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

_ppo  = _cfg.get("ppo",  {})
_lstm = _cfg.get("lstm", {})

model = RecurrentPPO(
    policy="MlpLstmPolicy",
    env=vec_env,
    learning_rate=float(_ppo.get("learning_rate", 3e-4)),
    n_steps=int(_ppo.get("n_steps", 512)),
    batch_size=int(_ppo.get("batch_size", 64)),
    n_epochs=int(_ppo.get("n_epochs", 10)),
    gamma=float(_ppo.get("gamma", 0.99)),
    gae_lambda=float(_ppo.get("gae_lambda", 0.95)),
    clip_range=float(_ppo.get("clip_range", 0.2)),
    ent_coef=float(_ppo.get("ent_coef", 0.01)),
    vf_coef=float(_ppo.get("vf_coef", 0.5)),
    max_grad_norm=float(_ppo.get("max_grad_norm", 0.5)),
    policy_kwargs=build_policy_kwargs(
        hidden_size=int(_lstm.get("hidden_size", 256))
    ),
    verbose=0,
    seed=int(_cfg.get("training", {}).get("seed", 42)),
    device="cuda" if torch.cuda.is_available() else "cpu",
)

_tlog(f"Device     : {model.device}")
_tlog(f"Timesteps  : {TOTAL_TIMESTEPS:,}")
_tlog(f"Curriculum : Phase {curriculum.current_phase} — {curriculum.progress_str()}")
_tlog("Training started...")

reward_logger  = RewardLogger(curriculum)
checkpoint_cb  = CheckpointCallback(save_freq=10_000,
                                    save_path="/content/checkpoints/")
improvement_cb = SpecialistImprovementCallback(
    improve_every_n_episodes=_cfg.get("specialist_improvement", {}).get(
        "improve_every_n_episodes", 100),
    verbose=1,
)

_t0 = time.time()
model.learn(
    total_timesteps=TOTAL_TIMESTEPS,
    callback=[reward_logger, checkpoint_cb, improvement_cb],
)
_elapsed = time.time() - _t0

model.save("/content/spindleflow_model")
vec_env.save("/content/vec_normalize.pkl")

_tlog(f"Done in {_elapsed/60:.1f} min")
_tlog(f"Episodes        : {len(reward_logger.episode_rewards)}")
_tlog(f"Curriculum final: {curriculum.progress_str()}")
print("\nCELL 5 done ✓ — model saved")


# ============================================================
# CELL 6 — Reward curve
# ============================================================
import json, numpy as np, matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

ep_rewards = reward_logger.episode_rewards
if not ep_rewards:
    raise RuntimeError("No episodes recorded — check Cell 5 output for errors")

n_ep     = len(ep_rewards)
episodes = list(range(n_ep))
window   = max(30, n_ep // 20)   # adaptive: ~5% of run

smoothed = [
    float(np.mean(ep_rewards[max(0, i - window):i + 1]))
    for i in range(n_ep)
]

early_mean  = float(np.mean(ep_rewards[:min(50, n_ep)]))
final_mean  = float(np.mean(ep_rewards[max(0, n_ep - 200):]))
improvement = final_mean - early_mean

# JSON for HF Space demo tab
step = max(1, n_ep // 300)
with open("/content/demo/assets/reward_curve.json", "w") as f:
    json.dump({"episodes": episodes[::step],
               "mean_rewards": smoothed[::step]}, f)

# Plot
fig, ax = plt.subplots(figsize=(11, 5), dpi=180)
fig.patch.set_facecolor("#0d1117")
ax.set_facecolor("#161b22")

every = max(1, n_ep // 800)
ax.scatter(episodes[::every], ep_rewards[::every],
           s=4, alpha=0.25, color="#58a6ff", zorder=2, label="Episode reward")
ax.plot(episodes[::every], smoothed[::every],
        linewidth=2.5, color="#ff6b35", zorder=3,
        label=f"Smoothed ({window}-ep mean)")
ax.axhline(y=early_mean, color="#94a3b8", linestyle="--", linewidth=1.2,
           alpha=0.75, label=f"Early baseline  {early_mean:+.3f}")
ax.axhline(y=final_mean, color="#34d399", linestyle="--", linewidth=1.2,
           alpha=0.85, label=f"Final mean  {final_mean:+.3f}")

ax.set_xlabel("Episode", color="#c9d1d9", fontsize=12)
ax.set_ylabel("Reward",  color="#c9d1d9", fontsize=12)
ax.set_title(
    "SpindleFlow RL — Delegation Policy Learning Curve\n"
    f"RecurrentPPO · LSTM · {TOTAL_TIMESTEPS:,} steps · {n_ep:,} episodes",
    color="#f0f6fc", fontsize=13, fontweight="bold", pad=14,
)
ax.tick_params(colors="#8b949e")
for sp in ax.spines.values():
    sp.set_edgecolor("#30363d")
ax.grid(color="#21262d", linewidth=0.8, alpha=0.9)
ax.legend(fontsize=10, framealpha=0.85,
          facecolor="#161b22", edgecolor="#30363d", labelcolor="#c9d1d9")

sign = "▲" if improvement >= 0 else "▼"
ax.annotate(f"  {sign} {abs(improvement):.3f} improvement",
            xy=(n_ep * 0.65, (early_mean + final_mean) / 2),
            color="#f0f6fc", fontsize=10, fontstyle="italic")

fig.tight_layout()
fig.savefig("/content/reward_curve.png", dpi=180, bbox_inches="tight",
            facecolor=fig.get_facecolor())
plt.show()

_tlog(f"Curve: early={early_mean:+.4f}  final={final_mean:+.4f}  "
      f"improvement={improvement:+.4f}")
print(f"Episodes   : {n_ep:,}")
print(f"Improvement: {improvement:+.4f}")
print("\nCELL 6 done ✓ — reward curve saved")


# ============================================================
# CELL 7 — Learning features audit
# ============================================================
import json
from pathlib import Path

print("=" * 52)
print("LEARNING FEATURES AUDIT")
print("=" * 52)

print(f"\nFeature 5 — Curriculum (performance-gated)")
print(f"  Phase        : {curriculum.current_phase}/3")
print(f"  Rolling mean : {curriculum.rolling_mean():.3f}")
print(f"  {curriculum.progress_str()}")

mem_path = Path(_cfg.get("specialist_improvement", {}).get(
    "memory_path", "data/specialist_memory.json"))
print(f"\nFeature 2 — Specialist memory ({mem_path})")
if mem_path.exists():
    data = json.loads(mem_path.read_text())
    total = sum(len(v) for v in data.values())
    print(f"  {len(data)} specialists · {total} total entries")
    for sid, entries in list(data.items())[:3]:
        avg = sum(e["reward"] for e in entries) / len(entries)
        print(f"    {sid}: {len(entries)} entries, avg={avg:.3f}")
else:
    print("  No file yet (finetuner fires after 100 completed episodes)")

spawn_path = Path(_cfg.get("environment", {}).get(
    "spawn_memory_path", "data/spawn_memory.jsonl"))
print(f"\nFeature 3 — Spawn memory ({spawn_path})")
if spawn_path.exists():
    lines = [l for l in spawn_path.read_text().splitlines() if l.strip()]
    print(f"  {len(lines)} spawn records")
    for line in lines[:2]:
        rec = json.loads(line)
        print(f"    {rec['specialist_role']} | reward={rec['episode_reward']:.3f}")
else:
    print("  No file yet")

res_path = Path(_cfg.get("agents", {}).get(
    "resolution_memory_path", "data/resolution_memory.jsonl"))
print(f"\nFeature 4 — Resolution bandit ({res_path})")
if res_path.exists():
    lines = [l for l in res_path.read_text().splitlines() if l.strip()]
    print(f"  {len(lines)} outcome records")
else:
    print("  No file yet")

print("\n" + "=" * 52)
print("CELL 7 done ✓")


# ============================================================
# CELL 8 — Push to HuggingFace Hub
# ============================================================
import os, numpy as np
from huggingface_hub import HfApi, CommitOperationAdd

from huggingface_hub import whoami
HF_REPO = f"{whoami(token=HF_TOKEN)['name']}/spindleflow-rl"
api = HfApi(token=HF_TOKEN)

_tlog(f"Pushing to https://huggingface.co/{HF_REPO} ...")
api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)

ep = reward_logger.episode_rewards
readme = f"""---
license: mit
tags:
  - reinforcement-learning
  - stable-baselines3
  - sb3-contrib
  - gymnasium
  - multi-agent
  - openenv
library_name: stable-baselines3
---

# SpindleFlow RL — Delegation Policy

LSTM PPO (RecurrentPPO) trained on SpindleFlow-v0 (OpenEnv). Colab T4 GPU.

## Training summary
| Metric | Value |
|---|---|
| Algorithm | RecurrentPPO (SB3 + sb3-contrib) |
| Total timesteps | {TOTAL_TIMESTEPS:,} |
| Episodes | {len(ep):,} |
| Early baseline (first 50 ep) | {early_mean:.4f} |
| Final mean (last 200 ep) | {final_mean:.4f} |
| Improvement | {improvement:+.4f} |
| Training time | {_elapsed/60:.1f} min |
| Device | T4 GPU |

![Reward Curve](reward_curve.png)

## Load
```python
from sb3_contrib import RecurrentPPO
from huggingface_hub import hf_hub_download
model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
```
"""

readme_path = "/content/README_model.md"
with open(readme_path, "w") as f:
    f.write(readme)

candidates = [
    ("/content/spindleflow_model.zip",          "spindleflow_model.zip"),
    ("/content/vec_normalize.pkl",              "vec_normalize.pkl"),
    ("/content/reward_curve.png",               "reward_curve.png"),
    ("/content/demo/assets/reward_curve.json",  "reward_curve.json"),
    ("/content/logs/training_log.txt",          "training_log.txt"),
    (readme_path,                               "README.md"),
]

ops = [
    CommitOperationAdd(path_in_repo=dst, path_or_fileobj=src)
    for src, dst in candidates if os.path.exists(src)
]

api.create_commit(
    repo_id=HF_REPO, repo_type="model", operations=ops,
    commit_message="Add trained SpindleFlow RL policy (Colab T4)",
    token=HF_TOKEN,
)

_tlog(f"Uploaded {len(ops)} files:")
for src, dst in candidates:
    if os.path.exists(src):
        _tlog(f"  {dst}")
_tlog(f"Model live : https://huggingface.co/{HF_REPO}")
_tlog(f"Log        : https://huggingface.co/{HF_REPO}/blob/main/training_log.txt")
print("\nCELL 8 done ✓ — all done!")