Spaces:

100XZX001
/

code-review-training

Sleeping

App Files Files Community

100XZX001 committed on Apr 26

Commit

868dd5a

verified ·

1 Parent(s): a3960cd

Upload 17 files

Browse files

Files changed (2) hide show

environment.py +1 -122
training.py +850 -726

environment.py CHANGED Viewed

@@ -30,8 +30,6 @@ from rubrics import (
 # ======================================================================
 # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
 # ======================================================================
 @dataclass
 class EnhancedObservation:
     code_snippet: str
@@ -77,7 +75,6 @@ def execute_code(code: str, timeout_sec: int = 5) -> Tuple[bool, str, str]:
         f.write(code)
         tmp_path = f.name
     try:
         result = subprocess.run(
             [sys.executable, tmp_path],
@@ -205,124 +202,6 @@ class CodeReviewEnv:
                 ExplorationRubric(penalty=-0.05, bonus=self.diversity_bonus * 0.7),
                 AntiHackingRubric(),
                 core_rubrics[-1],
             ]
         raise ValueError(f"Unknown reward_profile: {self.reward_profile}")
@@ -746,4 +625,4 @@ class CodeReviewEnv:
             test_results=self._test_results,
             step=self._step_count,
             done=self._done
-        )

 # ======================================================================
 # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
 # ======================================================================
 @dataclass
 class EnhancedObservation:
     code_snippet: str
         f.write(code)
         tmp_path = f.name
     try:
         result = subprocess.run(
             [sys.executable, tmp_path],
                 ExplorationRubric(penalty=-0.05, bonus=self.diversity_bonus * 0.7),
                 AntiHackingRubric(),
                 core_rubrics[-1],
             ]
         raise ValueError(f"Unknown reward_profile: {self.reward_profile}")
             test_results=self._test_results,
             step=self._step_count,
             done=self._done
+        )

training.py CHANGED Viewed

@@ -1,811 +1,935 @@
-# training.py  –  PPO + QLoRA + Supervised Warm-up
-# Model : Qwen/Qwen2.5-1.5B-Instruct  (via Unsloth – 2× faster, fits Colab T4)
-# Fixed : label-masking, BPE-boundary alignment, log-ratio clamping, OOM guards
-# Evidence: reward curves, before/after traces, per-difficulty breakdown, KL, entropy
-# ============================================================
-import os, json, random, re
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
-from dataclasses import dataclass, field
-from typing import List, Optional, Dict
-from collections import Counter, defaultdict
 import numpy as np
-# ── Unsloth gives 2× throughput with identical outputs ────────────────────────
 from unsloth import FastLanguageModel
 from environment import CodeReviewEnv
 from redteam import BUG_DB
-# Graceful import: use project map_to_env if available, else inline fallback.
-try:
-    from models import map_to_env as model_map_to_env
-    _HAVE_MODEL_MAP = True
-except (ImportError, AttributeError):
-    _HAVE_MODEL_MAP = False
-if not _HAVE_MODEL_MAP:
-    try:
-        from models import (RunTests, RunLinter, Inspect, ProposeFix,
-                            WriteComment, AskQuestion, Done, Skip, QueryDocs)
-        def model_map_to_env(action_type: str, content=None):
-            return {
-                "run_tests":  RunTests(),
-                "run_linter": RunLinter(),
-                "inspect":    Inspect(),
-                "query_docs": QueryDocs(content or "python bug fix"),
-                "fix":        ProposeFix(content or ""),
-                "comment":    WriteComment(content or ""),
-                "question":   AskQuestion(content or ""),
-                "done":       Done(),
-            }.get(action_type, Skip())
-    except ImportError:
-        # Last resort: duck-typed object the env can introspect.
-        class _EnvAction:
-            def __init__(self, **kw): self.__dict__.update(kw)
-        def model_map_to_env(action_type: str, content=None):
-            return _EnvAction(action_type=action_type, content=content)
-# ══════════════════════════════════════════════════════════════════════════════
-# CONFIG
-# ══════════════════════════════════════════════════════════════════════════════
-CFG = dict(
-    model_name       = "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
-    max_seq_len      = 512,       # hard cap; prevents OOM on T4
-    lora_r           = 16,
-    lora_alpha       = 32,
-    # Warm-up
-    warmup_data      = "training_data.json",
-    warmup_epochs    = 2,
-    warmup_lr        = 2e-5,
-    warmup_grad_acc  = 4,         # effective batch = 4 examples
-    # PPO
-    ppo_iters        = 15,
-    trajs_per_iter   = 6,
-    max_steps        = 7,
-    ppo_lr           = 3e-5,
-    clip_eps         = 0.2,
-    entropy_coef     = 0.01,
-    gamma            = 0.99,
-    log_ratio_clamp  = 5.0,       # ← prevents exp-explosion / NaN loss
-    temp_start       = 0.8,
-    temp_end         = 0.1,
-    # Eval
-    eval_episodes    = 10,        # episodes per evaluation snapshot
 )
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-TASK_LEVELS = list(BUG_DB.keys())   # [easy, medium, hard, harder, hardest]
-# ══════════════════════════════════════════════════════════════════════════════
-# DATA STRUCTURES
-# ══════════════════════════════════════════════════════════════════════════════
 @dataclass
 class AgentAction:
     action_type: str
     content: Optional[str] = None
-@dataclass
-class Trajectory:
-    states:   List[str]
-    actions:  List[str]
-    rewards:  List[float]
-    logprobs: List[float]
-    dones:    List[bool]
-    task:     str = ""
-@dataclass
-class EvalSnapshot:
-    """Captures full agent behaviour for before/after comparison."""
-    avg_reward:    float
-    per_task:      Dict[str, float]  = field(default_factory=dict)
-    action_dist:   Dict[str, float]  = field(default_factory=dict)
-    success_rate:  float = 0.0
-    avg_steps:     float = 0.0
-    traces:        List[dict] = field(default_factory=list)
-# ══════════════════════════════════════════════════════════════════════════════
-# ACTION PARSER
-# ══════════════════════════════════════════════════════════════════════════════
-def parse_action(text: str) -> AgentAction:
-    """Robust parser: tries strict JSON, then regex, then keyword heuristic."""
-    text = text.strip()
     try:
-        d = json.loads(text)
-        return AgentAction(d.get("action_type","skip").lower(), d.get("content"))
-    except json.JSONDecodeError:
         pass
-    m = re.search(r'"action_type"\s*:\s*"(\w+)"', text)
-    if m:
-        cm = re.search(r'"content"\s*:\s*"(.*?)"', text, re.DOTALL)
-        return AgentAction(m.group(1).lower(), cm.group(1) if cm else None)
-    tl = text.lower()
-    for kw in ("run_tests","run_linter","inspect","query_docs","fix",
-               "comment","question","done"):
-        if kw in tl:
-            return AgentAction(kw)
-    return AgentAction("skip")
 def map_to_env(action: AgentAction):
     return model_map_to_env(action.action_type, action.content)
-# ══════════════════════════════════════════════════════════════════════════════
-# MODEL  (Qwen2.5-1.5B via Unsloth)
-# ══════════════════════════════════════════════════════════════════════════════
 def load_model():
-    print(f"Loading {CFG['model_name']} …")
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name     = CFG["model_name"],
-        max_seq_length = CFG["max_seq_len"],
-        load_in_4bit   = True,
     )
     model = FastLanguageModel.get_peft_model(
         model,
-        r              = CFG["lora_r"],
-        lora_alpha     = CFG["lora_alpha"],
-        target_modules = ["q_proj","k_proj","v_proj","o_proj",
-                          "gate_proj","up_proj","down_proj"],
-        lora_dropout   = 0.0,
     )
-    tokenizer.pad_token = tokenizer.eos_token
-    print(f"  trainable params: "
-          f"{sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M")
     return model, tokenizer
-# ══════════════════════════════════════════════════════════════════════════════
-# PROMPT BUILDER
-# ══════════════════════════════════════════════════════════════════════════════
-def build_prompt(obs, history_lines: List[str]) -> str:
-    author_msg   = getattr(obs, "author_response",  "") or ""
-    tool_output  = getattr(obs, "last_tool_output", "") or ""
-    personality  = getattr(obs, "author_personality","defensive")
-    # Trim tool output to avoid context explosion
-    if len(tool_output) > 600:
-        tool_output = tool_output[:600] + " …[truncated]"
-    p = (
-        f"You are an AI code review agent. Convince the developer (personality: "
-        f"**{personality}**) to accept your fix. Name your fix function `fix`.\n\n"
-        "Evidence required: tests pass, lint clean, docs cited, reasoning uses "
-        "'because'/'therefore' (>30 words).\n\n"
-        "Workflow: inspect → run_tests → run_linter → query_docs → fix → "
-        "comment/question → done.\n\n"
-        f"Code:\n{obs.code_snippet}\n\n"
-        f"Author: {author_msg or '(no response yet – start with inspect)'}\n\n"
-        f"Last tool: {tool_output or '(none)'}\n\n"
-        "Actions: run_tests, run_linter, inspect, query_docs, fix, comment, question, done\n\n"
-        'Respond ONLY in JSON: {"action_type": "...", "content": "..."}'
-    )
-    if history_lines:
-        p += "\n\nRecent steps:\n" + "\n".join(history_lines[-4:])
-    return p
-# ══════════════════════════════════════════════════════════════════════════════
-# BUG FIX 1 – label masking in supervised warmup
-# (original: labels=inputs["input_ids"] trains on ALL tokens, including prompt)
-# ══════════════════════════════════════════════════════════════════════════════
-def _masked_labels(input_ids: torch.Tensor, prompt_len: int) -> torch.Tensor:
-    """Return labels with prompt positions set to -100 (ignored by CE loss)."""
-    labels = input_ids.clone()
-    labels[0, :prompt_len] = -100
-    return labels
-# ══════════════════════════════════════════════════════════════════════════════
-# BUG FIX 2 – BPE-boundary-safe logprob computation
-# (original: tokenize(prompt) + tokenize(action) ≠ tokenize(prompt+action))
-# ══════════════════════════════════════════════════════════════════════════════
-def _compute_action_logprob(
-    logits:      torch.Tensor,   # [1, seq_len, vocab]
-    input_ids:   torch.Tensor,   # [1, seq_len]
-    prompt_len:  int,            # #tokens in the prompt part of the joint sequence
-) -> tuple:
     """
-    Compute sum of log-probs for *action* tokens only, using the jointly
-    tokenised sequence so BPE boundaries are respected.
-    Returns (total_logprob, avg_entropy, n_tokens).
     """
-    action_len = input_ids.shape[1] - prompt_len
-    if action_len <= 0:
-        return torch.tensor(0.0, device=DEVICE), torch.tensor(0.0, device=DEVICE), 0
-    total_lp  = torch.tensor(0.0, device=DEVICE)
-    total_ent = torch.tensor(0.0, device=DEVICE)
-    for k in range(action_len):
-        pos = prompt_len + k           # position of the k-th action token
-        pred_pos = pos - 1             # logit at pred_pos predicts token at pos
-        if pred_pos < 0 or pred_pos >= logits.shape[1]:
-            continue
-        token_id  = input_ids[0, pos]
-        lp_dist   = F.log_softmax(logits[0, pred_pos], dim=-1)
-        total_lp  = total_lp  + lp_dist[token_id]
-        probs     = torch.exp(lp_dist)
-        total_ent = total_ent + (-(probs * lp_dist).sum()).detach()
-    n = action_len
-    return total_lp, total_ent / max(n, 1), n
-# ══════════════════════════════════════════════════════════════════════════════
-# GENERATION  (returns text + joint-sequence logprob)
-# ══════════════════════════════════════════════════════════════════════════════
-@torch.no_grad()
-def generate_action(prompt: str, model, tokenizer,
-                    temperature: float) -> tuple:
-    messages  = [{"role": "user", "content": prompt}]
-    formatted = tokenizer.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    inputs = tokenizer(
-        formatted, return_tensors="pt",
-        max_length=CFG["max_seq_len"] - 128,   # leave room for response
-        truncation=True
-    ).to(DEVICE)
-    prompt_len = inputs["input_ids"].shape[1]
-    gen_kwargs = dict(
-        max_new_tokens      = 128,
-        do_sample           = temperature > 0,
-        return_dict_in_generate = True,
-        output_scores       = True,
-        pad_token_id        = tokenizer.eos_token_id,
-        eos_token_id        = tokenizer.eos_token_id,
-    )
-    if temperature > 0:
-        gen_kwargs["temperature"] = temperature
-    out     = model.generate(**inputs, **gen_kwargs)
-    gen_ids = out.sequences[0][prompt_len:]
-    text    = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
-    if not text:
-        fallback = random.choice([
-            '{"action_type":"inspect"}',
-            '{"action_type":"run_tests"}',
-            '{"action_type":"run_linter"}',
-        ])
-        print(f"  [WARN] empty generation → fallback {fallback}")
-        # BUG FIX 3: don't use -100 sentinel; use a mildly negative logprob
-        # so that PPO ratio = exp(new - old) stays finite when re-evaluated
-        return fallback, -10.0
-    # Recompute logprob from the full joint sequence (BPE-safe)
-    joint_ids = torch.cat(
-        [inputs["input_ids"], gen_ids.unsqueeze(0).to(DEVICE)], dim=1
-    )
-    joint_ids = joint_ids[:, :CFG["max_seq_len"]]
-    logits = model(input_ids=joint_ids).logits
-    lp, _, _ = _compute_action_logprob(logits, joint_ids, prompt_len)
-    return text, lp.item()
-# ══════════════════════════════════════════════════════════════════════════════
-# TRAJECTORY COLLECTION
-# ══════════════════════════════════════════════════════════════════════════════
-# Per-action shaped rewards.  These create reward variance so that
-# trajectories with meaningful tool use beat inspect-only episodes.
-_STEP_REWARD = {
-    "run_tests":  +0.08,
-    "run_linter": +0.05,
-    "fix":        +0.15,
-    "comment":    +0.08,
-    "query_docs": +0.05,
-    "question":   +0.04,
-    "inspect":     0.00,   # neutral – observe before acting
-    "done":        0.00,   # env handles the terminal reward
-    "skip":       -0.10,   # penalise doing nothing
-}
-def collect_trajectory(env, model, tokenizer,
-                       max_steps: int, temperature: float,
-                       task: str) -> tuple:
-    """
-    FIX 4 – Override env done/reward for non-terminal actions.
-    Root cause of the degenerate policy:
-    • env.step(Inspect()) returns done=True, reward=+0.002
-    • agent discovers inspect → tiny reward → done is the easiest path
-    • every trajectory is identical → zero advantage → PPO does nothing
-    Fix: only accept env's done+reward when the agent explicitly emits
-    {"action_type": "done"}.  For every other action, use a shaped step
-    reward and force the episode to continue.
-    """
-    env.set_task(task)
-    obs = env.reset()
-    history: List[str] = []
-    traj = Trajectory([], [], [], [], [], task=task)
-    action_seq = []
-    for step_num in range(max_steps):
-        prompt = build_prompt(obs, history)
-        traj.states.append(prompt)
-        text, lp = generate_action(prompt, model, tokenizer, temperature)
-        traj.actions.append(text)
-        traj.logprobs.append(lp)
-        action = parse_action(text)
-        action_seq.append(action.action_type)
-        obs, reward, env_done, _ = env.step(map_to_env(action))
-        raw_r = float(reward.value)
-        if action.action_type == "done":
-            # Agent explicitly chose to terminate → honour env reward
-            shaped_r     = raw_r
-            effective_done = True
-        else:
-            # Intermediate step: use shaped reward, ignore env's done signal.
-            # Also keep a fraction of any large env reward (e.g. test pass).
-            shaped_r = _STEP_REWARD.get(action.action_type, 0.0)
-            if raw_r > 0.1:            # env signalling meaningful progress
-                shaped_r += raw_r * 0.3
-            effective_done = False     # ← key: don't let env short-circuit
-        traj.rewards.append(float(np.clip(shaped_r, -1.0, 1.0)))
-        traj.dones.append(effective_done)
-        history.append(f"Agent: {text[:120]}")
-        history.append(f"Env: {(obs.last_tool_output or '')[:120]}")
-        if effective_done:
-            break
-    return traj, action_seq
-# ══════════════════════════════════════════════════════════════════════════════
-# SUPERVISED WARM-UP  (BUG FIX 1: action-only label masking)
-# ══════════════════════════════════════════════════════════════════════════════
-def supervised_warmup(model, tokenizer):
-    print("\n" + "="*60)
-    print("SUPERVISED WARM-UP")
-    print("="*60)
-    with open(CFG["warmup_data"], encoding="utf-8") as f:
-        data = json.load(f)
-    opt = AdamW(model.parameters(), lr=CFG["warmup_lr"])
-    model.train()
-    loss_history = []
-    for epoch in range(CFG["warmup_epochs"]):
-        random.shuffle(data)
-        epoch_loss, n_valid = 0.0, 0
-        opt.zero_grad()
-        for step, ex in enumerate(data):
-            # ── Tokenise prompt and full sequence jointly ────────────────
-            prompt_chat = tokenizer.apply_chat_template(
-                [{"role": "user", "content": ex["prompt"]}],
-                tokenize=False, add_generation_prompt=True
-            )
-            full_chat = tokenizer.apply_chat_template(
-                [{"role": "user",      "content": ex["prompt"]},
-                 {"role": "assistant", "content": ex["action"]}],
-                tokenize=False
-            )
-            prompt_ids = tokenizer(
-                prompt_chat, return_tensors="pt",
-                max_length=CFG["max_seq_len"], truncation=True
-            )["input_ids"]
-            full_inputs = tokenizer(
-                full_chat, return_tensors="pt",
-                max_length=CFG["max_seq_len"], truncation=True
-            ).to(DEVICE)
-            prompt_len = prompt_ids.shape[1]
-            if prompt_len >= full_inputs["input_ids"].shape[1]:
-                continue  # action got truncated away
-            # BUG FIX 1 ── mask prompt tokens so loss is action-only
-            labels = _masked_labels(full_inputs["input_ids"], prompt_len)
-            out  = model(**full_inputs, labels=labels)
-            loss = out.loss / CFG["warmup_grad_acc"]
-            loss.backward()
-            if (step + 1) % CFG["warmup_grad_acc"] == 0:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-                opt.step()
-                opt.zero_grad()
-            epoch_loss += loss.item() * CFG["warmup_grad_acc"]
-            n_valid    += 1
-            if (step + 1) % 50 == 0:
-                print(f"  epoch {epoch+1}  step {step+1}/{len(data)}"
-                      f"  loss={epoch_loss/n_valid:.4f}")
-        avg = epoch_loss / max(n_valid, 1)
-        loss_history.append(avg)
-        print(f"  Epoch {epoch+1} complete: avg_loss={avg:.4f}")
-    torch.cuda.empty_cache()
-    print(f"✓ Warm-up done. Loss: {' → '.join(f'{l:.4f}' for l in loss_history)}\n")
-    return loss_history
-# ══════════════════════════════════════════════════════════════════════════════
-# EVALUATION  (produces rich EvalSnapshot for comparison plots)
-# ══════════════════════════════════════════════════════════════════════════════
-@torch.no_grad()
-def evaluate(env, model, tokenizer, label: str = "") -> EvalSnapshot:
-    model.eval()
-    per_task: Dict[str, List[float]] = defaultdict(list)
-    action_counter: Counter = Counter()
-    all_steps, all_success = [], []
-    traces = []
-    for ep in range(CFG["eval_episodes"]):
-        task = TASK_LEVELS[ep % len(TASK_LEVELS)]
-        traj, actions = collect_trajectory(
-            env, model, tokenizer, CFG["max_steps"], 0.0, task
-        )
-        ep_r = sum(traj.rewards)
-        per_task[task].append(ep_r)
-        action_counter.update(actions)
-        all_steps.append(len(traj.actions))
-        # FIX 6 – meaningful success = agent explicitly called "done".
-        # ep_r > 0 is misleading: even a single inspect returns +0.002.
-        all_success.append(1 if "done" in actions else 0)
-        traces.append({"task": task, "reward": round(ep_r, 4),
-                       "steps": len(traj.actions), "actions": actions})
-    total_actions = max(sum(action_counter.values()), 1)
-    snap = EvalSnapshot(
-        avg_reward   = float(np.mean([r for rs in per_task.values() for r in rs])),
-        per_task     = {t: float(np.mean(rs)) for t, rs in per_task.items()},
-        action_dist  = {a: c/total_actions for a, c in action_counter.most_common()},
-        success_rate = float(np.mean(all_success)),
-        avg_steps    = float(np.mean(all_steps)),
-        traces       = traces,
-    )
-    if label:
-        print(f"\n── {label} ──")
-        print(f"  avg_reward={snap.avg_reward:+.4f}  "
-              f"success={snap.success_rate:.0%}  steps={snap.avg_steps:.1f}")
-        print(f"  per-task: " +
-              "  ".join(f"{t}={v:+.3f}" for t,v in snap.per_task.items()))
-        print(f"  top actions: " +
-              "  ".join(f"{a}={p:.0%}" for a,p in list(snap.action_dist.items())[:5]))
     model.train()
-    return snap
-# ══════════════════════════════════════════════════════════════════════════════
-# PPO UPDATE  (BUG FIX 2 + 3: BPE-safe logprob + log-ratio clamping)
-# ══════════════════════════════════════════════════════════════════════════════
-def ppo_update(trajectories: List[Trajectory],
-               model, tokenizer, optimizer) -> dict:
-    model.train()
-    losses, kls, entropies = [], [], []
-    # ── Compute discounted returns and a global mean baseline ────────────────
-    all_returns = []
-    traj_returns = []
-    for traj in trajectories:
-        ret, running = [], 0.0
-        for r, done in zip(reversed(traj.rewards), reversed(traj.dones)):
-            running = r + CFG["gamma"] * (0.0 if done else running)
-            ret.insert(0, running)
-        traj_returns.append(ret)
-        all_returns.extend(ret)
-    # FIX 5 – Normalise advantages to zero mean / unit std.
-    # When all returns are identical (e.g. every episode returns 0.002),
-    # baseline = mean = every return, so adv = 0 for all steps, the
-    # policy loss is 0, and PPO never updates.  Normalising creates real
-    # signal: better-than-average trajectories get positive advantage,
-    # worse-than-average get negative, even if the absolute spread is tiny.
-    ret_arr  = np.array(all_returns) if all_returns else np.array([0.0])
-    ret_mean = float(ret_arr.mean())
-    ret_std  = float(ret_arr.std())
-    if ret_std < 1e-6:
-        # Truly zero variance – nothing to learn this iteration.
-        print("  [PPO] Zero return variance – skipping gradient update.")
-        return dict(loss=0.0, kl=0.0, entropy=0.0)
-    # Build a lookup so we can retrieve the normalised advantage by
-    # (trajectory index, step index) during the update loop below.
-    norm_returns: List[List[float]] = [
-        [(r - ret_mean) / (ret_std + 1e-8) for r in ret_list]
-        for ret_list in traj_returns
-    ]
-    for traj_idx, (traj, returns) in enumerate(zip(trajectories, traj_returns)):
-        for i in range(len(traj.states)):
-            state  = traj.states[i]
-            action = traj.actions[i]
-            old_lp = traj.logprobs[i]
-            adv    = norm_returns[traj_idx][i]   # ← normalised advantage
-            # ── Tokenise jointly (BPE FIX 2) ────────────────────────────────
-            prompt_chat = tokenizer.apply_chat_template(
-                [{"role": "user", "content": state}],
-                tokenize=False, add_generation_prompt=True
-            )
-            full_text = prompt_chat + action
-            full_ids = tokenizer(
-                full_text, return_tensors="pt",
-                max_length=CFG["max_seq_len"], truncation=True
-            ).to(DEVICE)
-            # Estimate the prompt length by tokenising the prompt on its own,
-            # then cap it so at least one action token remains in the joint sequence
-            prompt_ids = tokenizer(
-                prompt_chat, return_tensors="pt",
-                max_length=CFG["max_seq_len"] - 10, truncation=True
-            )["input_ids"]
-            prompt_len = min(prompt_ids.shape[1], full_ids["input_ids"].shape[1] - 1)
-            logits = model(**full_ids).logits
-            new_lp, avg_ent, n_tokens = _compute_action_logprob(
-                logits, full_ids["input_ids"], prompt_len
             )
-            if n_tokens == 0:
-                continue
-            # BUG FIX 3 ── clamp log-ratio before exp to prevent NaN
-            old_lp_t  = torch.tensor(old_lp, dtype=torch.float32, device=DEVICE)
-            log_ratio = torch.clamp(new_lp - old_lp_t,
-                                    -CFG["log_ratio_clamp"],
-                                     CFG["log_ratio_clamp"])
-            ratio     = torch.exp(log_ratio)
-            adv_t = torch.tensor(adv, dtype=torch.float32, device=DEVICE)
-            s1    = ratio * adv_t
-            s2    = torch.clamp(ratio,
-                                1.0 - CFG["clip_eps"],
-                                1.0 + CFG["clip_eps"]) * adv_t
-            policy_loss = -torch.min(s1, s2)
-            loss        = policy_loss - CFG["entropy_coef"] * avg_ent
-            if torch.isnan(loss) or torch.isinf(loss):
                 continue
-            optimizer.zero_grad()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-            optimizer.step()
-            losses.append(loss.item())
-            kls.append((old_lp_t - new_lp).detach().cpu().item())
-            entropies.append(avg_ent.item())
-    torch.cuda.empty_cache()
-    return dict(
-        loss    = float(np.mean(losses))    if losses    else 0.0,
-        kl      = float(np.mean(kls))       if kls       else 0.0,
-        entropy = float(np.mean(entropies)) if entropies else 0.0,
-    )
-# ══════════════════════════════════════════════════════════════════════════════
-# PLOTTING  (rich evidence panel)
-# ══════════════════════════════════════════════════════════════════════════════
-def plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
-             baseline_snap: EvalSnapshot,
-             postwarmup_snap: EvalSnapshot,
-             final_snap: EvalSnapshot):
-    iters = list(range(1, len(reward_hist) + 1))
-    # ── Figure 1: training curves (2×3 grid) ─────────────────────────────────
-    fig = plt.figure(figsize=(18, 10))
-    gs  = gridspec.GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.35)
-    # (0,0) Warm-up loss
-    ax = fig.add_subplot(gs[0, 0])
-    ax.plot(range(1, len(warmup_losses)+1), warmup_losses,
-            marker="o", color="mediumpurple", linewidth=2)
-    ax.set_title("A. Warm-up CE Loss ↓", fontweight="bold")
-    ax.set_xlabel("Epoch"); ax.set_ylabel("Loss"); ax.grid(alpha=0.3)
-    # (0,1) PPO reward
-    ax = fig.add_subplot(gs[0, 1])
-    smooth = np.convolve(reward_hist, np.ones(3)/3, mode="same")
-    ax.plot(iters, reward_hist, alpha=0.35, color="steelblue", linewidth=1)
-    ax.plot(iters, smooth, color="steelblue", linewidth=2.5, label="reward (smoothed)")
-    ax.axhline(baseline_snap.avg_reward, color="gray", linestyle=":",
-               label=f"pre-warmup ({baseline_snap.avg_reward:+.3f})")
-    ax.axhline(postwarmup_snap.avg_reward, color="mediumpurple", linestyle="--",
-               label=f"post-warmup ({postwarmup_snap.avg_reward:+.3f})")
-    ax.axhline(final_snap.avg_reward, color="forestgreen", linestyle="-.",
-               label=f"final ({final_snap.avg_reward:+.3f})")
-    ax.set_title("B. PPO Reward ↑", fontweight="bold")
-    ax.set_xlabel("Iteration"); ax.set_ylabel("Avg Reward")
-    ax.legend(fontsize=7); ax.grid(alpha=0.3)
-    # (0,2) Success rate
-    ax = fig.add_subplot(gs[0, 2])
-    ax.plot(iters, success_hist, marker="s", color="seagreen", linewidth=2)
-    ax.set_ylim(0, 1)
-    ax.set_title("C. Episode Success Rate ↑", fontweight="bold")
-    ax.set_xlabel("Iteration"); ax.set_ylabel("Fraction")
-    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y,_: f"{y:.0%}"))
-    ax.grid(alpha=0.3)
-    # (1,0) KL divergence
-    ax = fig.add_subplot(gs[1, 0])
-    ax.plot(iters, kl_hist, marker="^", color="tomato", linewidth=2)
-    ax.axhline(0, color="gray", linewidth=0.8)
-    ax.set_title("D. KL Divergence", fontweight="bold")
-    ax.set_xlabel("Iteration"); ax.set_ylabel("KL"); ax.grid(alpha=0.3)
-    # (1,1) Entropy
-    ax = fig.add_subplot(gs[1, 1])
-    ax.plot(iters, entropy_hist, marker="D", color="darkorange", linewidth=2)
-    ax.set_title("E. Policy Entropy", fontweight="bold")
-    ax.set_xlabel("Iteration"); ax.set_ylabel("Entropy"); ax.grid(alpha=0.3)
-    # (1,2) Per-difficulty final reward
-    ax = fig.add_subplot(gs[1, 2])
-    tasks = TASK_LEVELS
-    vals_base  = [baseline_snap.per_task.get(t, 0)   for t in tasks]
-    vals_final = [final_snap.per_task.get(t, 0)       for t in tasks]
-    x = np.arange(len(tasks))
-    ax.bar(x - 0.2, vals_base,  0.35, label="baseline",color="lightcoral",  alpha=0.8)
-    ax.bar(x + 0.2, vals_final, 0.35, label="final",   color="steelblue",   alpha=0.8)
-    ax.set_xticks(x); ax.set_xticklabels(tasks, fontsize=8)
-    ax.set_title("F. Per-Difficulty Reward", fontweight="bold")
-    ax.set_ylabel("Avg Reward"); ax.legend(fontsize=8); ax.grid(alpha=0.3, axis="y")
-    ax.axhline(0, color="gray", linewidth=0.8)
-    fig.suptitle(f"Code-Review Agent – Full Training Evidence  "
-                 f"(Qwen2.5-1.5B, PPO + QLoRA)",
-                 fontsize=13, fontweight="bold")
-    fig.savefig("training_summary.png", dpi=150, bbox_inches="tight")
-    plt.close(fig)
-    print("  Saved: training_summary.png")
-    # ── Figure 2: before / after action distribution ─────────────────────────
-    fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=False)
-    for ax, snap, title in zip(
-        axes,
-        [baseline_snap, postwarmup_snap, final_snap],
-        ["Before (baseline)", "After warm-up", "After PPO (final)"]
-    ):
-        if snap.action_dist:
-            labels = list(snap.action_dist.keys())
-            vals   = [snap.action_dist[l]*100 for l in labels]
-            bars   = ax.barh(labels, vals,
-                             color=plt.cm.tab10(np.linspace(0, 0.8, len(labels))))
-            ax.bar_label(bars, fmt="%.0f%%", padding=3, fontsize=8)
-        ax.set_xlim(0, 105)
-        ax.set_title(title, fontweight="bold")
-        ax.set_xlabel("% of actions")
-        ax.grid(alpha=0.3, axis="x")
-    fig.suptitle("Action Distribution: Before vs After Training",
-                 fontsize=12, fontweight="bold")
-    plt.tight_layout()
-    fig.savefig("action_distribution.png", dpi=150, bbox_inches="tight")
-    plt.close(fig)
-    print("  Saved: action_distribution.png")
-# ══════════════════════════════════════════════════════════════════════════════
-# MAIN
-# ══════════════════════════════════════════════════════════════════════════════
-def train():
     model, tokenizer = load_model()
     env = CodeReviewEnv()
-    # ── PHASE 0: pre-warmup baseline ────────────────────────────────────────
     print("\n" + "="*60)
-    print("PHASE 0 – BASELINE (untrained)")
     print("="*60)
-    baseline_snap = evaluate(env, model, tokenizer, "Baseline")
-    # ── PHASE 1: supervised warm-up ─────────────────────────────────────────
-    warmup_losses = supervised_warmup(model, tokenizer)
-    postwarmup_snap = evaluate(env, model, tokenizer, "Post-Warmup")
-    # ── PHASE 2: PPO ────────────────────────────────────────────────────────
-    optimizer = AdamW(model.parameters(), lr=CFG["ppo_lr"])
-    reward_hist, success_hist, kl_hist, entropy_hist = [], [], [], []
     print("\n" + "="*60)
-    print(f"PHASE 2 – PPO ({CFG['ppo_iters']} iterations × "
-          f"{CFG['trajs_per_iter']} trajectories)")
     print("="*60)
-    for it in range(CFG["ppo_iters"]):
-        # Linearly anneal exploration temperature
-        # FIX 7 – exponential decay with a floor (never below 0.35).
-        # Linear annealing to 0.1 collapses exploration before we learn
-        # anything; keeping >= 0.35 ensures trajectory diversity.
-        t = max(CFG["temp_start"] * (0.93 ** it), 0.35)
-        print(f"\n── Iteration {it+1}/{CFG['ppo_iters']}  temp={t:.2f} ──")
-        trajectories, action_counts = [], Counter()
-        successes = 0
-        for j in range(CFG["trajs_per_iter"]):
-            task = TASK_LEVELS[j % len(TASK_LEVELS)]
-            traj, actions = collect_trajectory(
-                env, model, tokenizer, CFG["max_steps"], t, task
-            )
-            trajectories.append(traj)
-            action_counts.update(actions)
-            ep_r = sum(traj.rewards)
-            # FIX 6b – consistent with evaluate(): only explicit done counts
-            successes += int("done" in actions)
-            print(f"  traj {j+1}/{CFG['trajs_per_iter']}  task={task}"
-                  f"  steps={len(traj.actions)}  reward={ep_r:+.3f}")
-        avg_r       = float(np.mean([sum(t.rewards) for t in trajectories]))
-        success_r   = successes / CFG["trajs_per_iter"]
-        m = ppo_update(trajectories, model, tokenizer, optimizer)
-        reward_hist.append(avg_r)
-        success_hist.append(success_r)
-        kl_hist.append(m["kl"])
-        entropy_hist.append(m["entropy"])
-        delta = avg_r - baseline_snap.avg_reward
-        print(f"  → avg_reward={avg_r:+.4f}  Δbaseline={delta:+.4f}"
-              f"  success={success_r:.0%}"
-              f"  loss={m['loss']:.4f}  kl={m['kl']:.4f}  ent={m['entropy']:.4f}")
-        print(f"  actions: {dict(action_counts.most_common(5))}")
-    # ── PHASE 3: final evaluation ───────────────────────────────────────────
-    print("\n" + "="*60)
-    print("PHASE 3 – FINAL EVALUATION")
-    print("="*60)
-    final_snap = evaluate(env, model, tokenizer, "Final")
-    # ── Summary table ───────────────────────────────────────────────────────
-    print("\n" + "="*60)
-    print("TRAINING SUMMARY")
-    print("="*60)
-    print(f"  {'Stage':<20} {'Reward':>10} {'Success':>10} {'Δ baseline':>12}")
-    print(f"  {'-'*54}")
-    for label, snap in [("Baseline",    baseline_snap),
-                        ("Post-warmup", postwarmup_snap),
-                        ("Final (PPO)", final_snap)]:
-        delta = snap.avg_reward - baseline_snap.avg_reward
-        print(f"  {label:<20} {snap.avg_reward:>+10.4f}"
-              f" {snap.success_rate:>10.0%}  {delta:>+11.4f}")
-    improve = final_snap.avg_reward - baseline_snap.avg_reward
-    verdict = "✓ LEARNED" if improve > 0 else "✗ NO IMPROVEMENT"
-    print(f"\n  {verdict}  (total Δ = {improve:+.4f})")
-    print("\nBefore → After traces (one per difficulty):")
-    btask = {t["task"]: t for t in baseline_snap.traces}
-    ftask = {t["task"]: t for t in final_snap.traces}
-    for task in TASK_LEVELS:
-        b = btask.get(task, {})
-        f = ftask.get(task, {})
-        print(f"  {task:8s}  baseline actions={b.get('actions',[])}  "
-              f"reward={b.get('reward',0):+.3f}"
-              f"  │  final actions={f.get('actions',[])}  "
-              f"reward={f.get('reward',0):+.3f}")
-    # ── Plots ───────────────────────────────────────────────────────────────
-    plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
-             baseline_snap, postwarmup_snap, final_snap)
-    print("\nAll done. Saved: training_summary.png  action_distribution.png")
 if __name__ == "__main__":
-    train()

+# training.py – Memory‑safe: Phi‑3‑mini + Expert Demos + Fast PPO (8 iterations)
+import os
+os.environ["TRITON_DISABLE"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"        # Issue #12: prevent OOM from parallel tokenization
+import torch._dynamo
+torch._dynamo.config.disable = True
+import json
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
+from dataclasses import dataclass
+from typing import List, Dict, Tuple, Optional
 import numpy as np
+import re
+import random
+import matplotlib.pyplot as plt
 from unsloth import FastLanguageModel
+from transformers import TrainingArguments
+from trl import SFTTrainer
+from datasets import Dataset
 from environment import CodeReviewEnv
 from redteam import BUG_DB
+from models import (
+    RunTests, RunLinter, Inspect,
+    ProposeFix, WriteComment, AskQuestion,
+    Done, Skip, QueryDocs, map_to_env as model_map_to_env
 )
+# ======================================================================
 @dataclass
 class AgentAction:
+    # action_type: name of the action; parse_action stores it lower-cased.
+    # content: optional payload forwarded with the action (None when absent).
     action_type: str
     content: Optional[str] = None
+def _action_from_json(text: str) -> Optional[AgentAction]:
+    """Return the AgentAction encoded by a JSON object string, or None if unusable."""
+    try:
+        data = json.loads(text)
+    except (TypeError, ValueError):
+        # json.JSONDecodeError subclasses ValueError; other exceptions are real bugs
+        # and must not be swallowed.
+        return None
+    if not isinstance(data, dict):
+        return None
+    action_type = data.get("action_type", "")
+    if not isinstance(action_type, str):
+        return None
+    return AgentAction(action_type=action_type.lower(), content=data.get("content"))
+def parse_action(output: str) -> AgentAction:
+    """
+    Convert raw model output into an AgentAction.
+
+    Strategies are tried in order: the whole output as JSON, a JSON object
+    inside a ``` fence, a bare `"action_type": "..."` pair, then keyword
+    heuristics. If nothing matches, an "invalid" action carrying the raw
+    output is returned.
+    """
+    parsed = _action_from_json(output)
+    if parsed is not None:
+        return parsed
+    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', output, re.DOTALL)
+    if fenced:
+        parsed = _action_from_json(fenced.group(1))
+        if parsed is not None:
+            return parsed
+    match = re.search(r'"action_type"\s*:\s*"(\w+)"', output)
+    if match:
+        return AgentAction(action_type=match.group(1).lower())
+    # Last resort: guess the tool from keywords, checked in priority order.
+    output_lower = output.lower()
+    if "test" in output_lower:
+        return AgentAction("run_tests")
+    if "lint" in output_lower:
+        return AgentAction("run_linter")
+    if "inspect" in output_lower:
+        return AgentAction("inspect")
+    # "documentation" contains "doc", so one substring test covers both.
+    if "doc" in output_lower:
+        return AgentAction("query_docs", "bug fix guidance")
+    return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
+    # Delegate to models.map_to_env (imported as model_map_to_env), passing the
+    # parsed action name and payload.
     return model_map_to_env(action.action_type, action.content)
+# ======================================================================
 def load_model():
+    # Load the 4-bit Phi-3-mini checkpoint through unsloth, then attach LoRA
+    # adapters (rank 16, alpha 32, no dropout) to the listed projection modules.
+    # Returns the adapter-wrapped model together with its tokenizer.
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
+        max_seq_length=480,               # smaller window for memory
+        load_in_4bit=True,
     )
     model = FastLanguageModel.get_peft_model(
         model,
+        r=16,
+        target_modules=[
+            "q_proj", "k_proj", "v_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj"
+        ],
+        lora_alpha=32,
+        lora_dropout=0.0,
     )
     return model, tokenizer
+def test_model_sanity(model, tokenizer) -> bool:
+    """
+    Generate a short reply to a fixed greeting to confirm the model emits text.
+
+    Returns True when the decoded reply is non-empty, False otherwise.
+    """
+    print("\n" + "="*60)
+    print("SANITY CHECK: Testing base model generation")
+    print("="*60)
+    test_prompt = "Hello, how are you?"
+    messages = [{"role": "user", "content": test_prompt}]
+    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(formatted, return_tensors="pt", max_length=256, truncation=True).to("cuda")
+    # Compare against None explicitly: a pad id of 0 is valid but falsy, so
+    # `pad_token_id or eos_token_id` would silently replace it.
+    pad_id = tokenizer.pad_token_id
+    if pad_id is None:
+        pad_id = tokenizer.eos_token_id
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=30,
+            do_sample=True,
+            temperature=0.7,
+            min_new_tokens=1,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=pad_id,
+        )
+    # Drop the prompt tokens so only the newly generated reply is decoded.
+    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
+    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+    print(f"Prompt: {test_prompt}")
+    print(f"Response: {repr(response)}")
+    if not response:
+        print("❌ Model produces empty output – cannot train.")
+        return False
+    print("✓ Model sanity check PASSED\n")
+    return True
+# ======================================================================
+def _expert_fix_from_context(obs) -> str:
+    """
+    Choose a canned implementation of `fix` for the bug the observation describes.
+
+    Keyword rules are tried in a fixed priority order against the lower-cased
+    bug description (two rules also look at the code snippet). The first match
+    wins; a pass-through template is the fallback for unknown bugs.
+    """
+    hint = (getattr(obs, "bug_description", "") or "").lower()
+    snippet = getattr(obs, "code_snippet", "") or ""
+    def hinted(*keywords) -> bool:
+        # True when any keyword occurs in the bug description.
+        return any(word in hint for word in keywords)
+    def render(*lines) -> str:
+        # Templates are written one source line per entry and joined on newlines.
+        return "\n".join(lines)
+    if hinted("division") or "average" in snippet.lower():
+        return render(
+            "def fix(data):",
+            "    if not data:",
+            "        return 0",
+            "    return sum(data) / len(data)",
+        )
+    if hinted("operator", "sign"):
+        return render(
+            "def fix(a, b):",
+            "    return a + b",
+        )
+    if hinted("off_by_one", "loop"):
+        return render(
+            "def fix(items):",
+            "    return len(items)",
+        )
+    if hinted("null", "key") or "dict" in snippet.lower():
+        return render(
+            "def fix(payload):",
+            "    users = payload.get('users', {})",
+            "    user_id = payload.get('id')",
+            "    return users.get(user_id)",
+        )
+    # Concurrency rules: races first, then lock-ordering problems, then joins.
+    if hinted("race", "missing_lock", "thread_safe", "global_nonatomic"):
+        return render(
+            "import threading",
+            "_lock = threading.Lock()",
+            "",
+            "def fix(counter):",
+            "    with _lock:",
+            "        if counter is None:",
+            "            return 0",
+            "        return counter + 1",
+        )
+    if hinted("deadlock", "double_lock", "lock order", "nested_lock"):
+        return render(
+            "import threading",
+            "_lock_a = threading.Lock()",
+            "_lock_b = threading.Lock()",
+            "",
+            "def fix(work):",
+            "    first, second = (_lock_a, _lock_b)",
+            "    if id(first) > id(second):",
+            "        first, second = second, first",
+            "    with first:",
+            "        with second:",
+            "            return work() if callable(work) else work",
+        )
+    if hinted("fork_join", "join"):
+        return render(
+            "import threading",
+            "",
+            "def fix(worker):",
+            "    t = threading.Thread(target=worker)",
+            "    t.start()",
+            "    t.join()",
+            "    return True",
+        )
+    # No rule matched: return the input unchanged so the pipeline keeps running.
+    return render(
+        "def fix(data):",
+        "    if data is None:",
+        "        return None",
+        "    return data",
+    )
+def _expert_supervised_policy(obs) -> str:
+    """
+    Scripted expert: return the JSON action text for the next workflow step.
+
+    Checks run in this order: inspect / run_tests, run_linter, query_docs,
+    a repeated docs query while the score is weak, fix, comment, done, and
+    finally a follow-up question.
+    """
+    def read(name, default):
+        # Observation fields are optional; fall back to the given default.
+        return getattr(obs, name, default)
+    author_reply = (read("author_response", "") or "").lower()
+    last_output = (read("last_tool_output", "") or "").lower()
+    # Evidence gathering: one tool per call until all three flags are set.
+    if not read("tests_run", False):
+        if "inspect" in last_output:
+            return '{"action_type": "run_tests"}'
+        return '{"action_type": "inspect"}'
+    if not read("linter_run", False):
+        return '{"action_type": "run_linter"}'
+    if not read("docs_queried", False):
+        return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
+    # NOTE(review): while the score stays below 0.6 from step 3 on, this branch
+    # returns before the fix branch below can run - confirm that repeated doc
+    # queries are intended or that the environment raises the score.
+    if read("current_test_score", 0.0) < 0.6 and read("step", 0) >= 3:
+        topic = (read("bug_description", "") or "concurrency bug").replace('"', "'")
+        payload = {
+            "action_type": "query_docs",
+            "content": f"python {topic} lock ordering race condition mitigation patterns",
+        }
+        return json.dumps(payload)
+    # Tests are not yet (nearly) all passing: propose a concrete fix.
+    if read("current_test_score", 0.0) < 0.95:
+        return json.dumps({"action_type": "fix", "content": _expert_fix_from_context(obs)})
+    # The author pushed back: answer with a causal explanation.
+    pushback_markers = ("not convinced", "explain", "brief")
+    if author_reply and any(marker in author_reply for marker in pushback_markers):
+        return (
+            '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
+            'keeps behavior deterministic, and aligns with the observed test and lint feedback. '
+            'The change is intentionally small to reduce regression risk."}'
+        )
+    # Finish once the author's confidence reaches their threshold and tests look good.
+    confidence = float(read("author_confidence", 0.0))
+    needed = float(read("author_threshold", 0.5))
+    test_score = float(read("current_test_score", 0.0))
+    if confidence >= needed and test_score >= 0.8:
+        return '{"action_type": "done"}'
+    # Otherwise keep the conversation moving.
+    return (
+        '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
+    )
+# ======================================================================
+def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8):
+    # Roll the scripted expert policy through the real environment to collect
+    # (prompt, action) chat examples, then fine-tune the model on them with
+    # SFTTrainer. Returns None; the model is updated in place by the trainer.
+    print("\n" + "="*60)
+    print("SUPERVISED WARM-UP: Real environment demonstrations")
+    print("="*60)
+    examples = []
+    tasks = ["easy", "medium", "hard", "harder", "hardest"]
+    for ep in range(n_episodes):
+        # Each episode uses a uniformly sampled difficulty level.
+        task = random.choice(tasks)
+        env.set_task(task)
+        obs = env.reset()
+        history = []
+        done = False
+        steps = 0
+        while not done and steps < max_steps:
+            # The prompt is built from the observation *before* the action is applied.
+            prompt = build_prompt(obs, history)
+            action_text = _expert_supervised_policy(obs)
+            action = parse_action(action_text)
+            env_action = map_to_env(action)
+            next_obs, _, done, _ = env.step(env_action)
+            messages = [
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": action_text},
+            ]
+            full_text = tokenizer.apply_chat_template(messages, tokenize=False)
+            examples.append({"text": full_text})
+            history.append(f"Agent: {action_text}")
+            history.append(f"Env: {next_obs.last_tool_output}")
+            # Keep only the most recent 8 history lines.
+            history = history[-8:]
+            obs = next_obs
+            steps += 1
+        print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
+    if not examples:
+        print("No supervised examples generated; skipping warm-up.")
+        return
+    dataset = Dataset.from_list(examples)
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=480,
+        args=TrainingArguments(
+            output_dir="warmup_output",
+            num_train_epochs=epochs,
+            per_device_train_batch_size=2,
+            gradient_accumulation_steps=2,
+            learning_rate=2e-5,
+            logging_steps=50,
+            save_strategy="no",
+            bf16=True,
+        ),
+    )
+    print(f"Training on {len(examples)} real env examples for {epochs} epochs...")
+    trainer.train()
+    print("✓ Supervised warm-up (real env) complete\n")
+    torch.cuda.empty_cache()
+# ======================================================================
+def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_retries=2):
+    """
+    Generate one action for `prompt` and return `(action_text, total_logprob)`.
+
+    `temperature` of 0 means greedy decoding; a positive value enables sampling.
+    `total_logprob` is the sum of the log-probabilities of the generated tokens
+    under the per-step scores returned by `generate`. An empty generation is
+    replaced by a random fallback action (log-prob -50.0); output that is not
+    valid JSON is retried and finally replaced by a skip action (log-prob -100.0).
+    """
+    messages = [{"role": "user", "content": prompt}]
+    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(formatted, return_tensors="pt", max_length=480, truncation=True).to("cuda")
+    prompt_len = inputs['input_ids'].shape[1]
+    sampling = temperature > 0
+    # Greedy decoding is deterministic, so regenerating cannot change the answer:
+    # only spend retries when sampling.
+    attempts = max_retries if sampling else min(max_retries, 1)
+    for _ in range(attempts):
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=64,
+                do_sample=sampling,
+                temperature=max(temperature, 0.01) if sampling else 1.0,
+                min_new_tokens=1,
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+        generated_ids = outputs.sequences[0][prompt_len:]
+        action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+        # Score all generated tokens in one pass instead of one .item() per token.
+        n_scored = min(len(generated_ids), len(outputs.scores))
+        if n_scored:
+            step_scores = torch.stack([step[0] for step in outputs.scores[:n_scored]])
+            step_logprobs = F.log_softmax(step_scores, dim=-1)
+            chosen = step_logprobs.gather(1, generated_ids[:n_scored].unsqueeze(1))
+            total_logprob = chosen.float().sum().item()
+        else:
+            total_logprob = -100.0
+        if not action_text:
+            fallback_actions = [
+                '{"action_type": "run_tests"}',
+                '{"action_type": "run_linter"}',
+                '{"action_type": "inspect"}',
+                '{"action_type": "skip"}',
+            ]
+            action_text = random.choice(fallback_actions)
+            total_logprob = -50.0
+            print(f"[WARN] Empty generation → using fallback: {action_text}")
+            return action_text, total_logprob
+        try:
+            json.loads(action_text)
+        except ValueError:
+            # Not valid JSON (json.JSONDecodeError is a ValueError): try again.
+            continue
+        return action_text, total_logprob
+    # Every attempt produced unparsable output.
+    return '{"action_type":"skip"}', -100.0
+# ======================================================================
+def build_prompt(obs, history_lines: List[str]) -> str:
+    """
+    Render the instruction prompt for the current observation.
+
+    The prompt embeds the author's personality, the code snippet, the author's
+    last response, and the last tool output (with placeholders when the latter
+    two are empty). When `history_lines` is non-empty, its last six entries are
+    appended under "Previous steps:".
+    """
+    author_msg = getattr(obs, "author_response", "") or ""
+    tool_output = getattr(obs, "last_tool_output", "") or ""
+    author_personality = getattr(obs, "author_personality", "defensive")
+    # The doubled braces on the last template line emit literal JSON braces.
+    prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
+The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
+- Tests pass (high pass ratio)
+- Lint is clean (zero errors)
+- Documentation or references are provided
+- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
+Workflow:
+1. Use `inspect` to understand the code.
+2. Use `run_tests` and `run_linter` to gather evidence.
+3. Use `query_docs` when you need references or language-specific guidance.
+4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
+5. If the developer pushes back, read their response carefully and address their specific concern.
+6. Once convinced, use `done` to finish.
+Code:
+{obs.code_snippet}
+Author says:
+{author_msg if author_msg else "(no response yet – start with inspection)"}
+Last tool output:
+{tool_output if tool_output else "(none)"}
+Available actions:
+run_tests, run_linter, inspect, query_docs, fix, comment, question, done
+Respond ONLY in JSON:
+{{"action_type": "...", "content": "..."}}"""
+    if history_lines:
+        # Only the six most recent lines are shown to the model.
+        history = "\n".join(history_lines[-6:])
+        prompt += f"\n\nPrevious steps:\n{history}"
+    return prompt
+# ======================================================================
+@dataclass
+class Trajectory:
+    """One rollout stored as parallel per-step lists."""
+    states: List[str]
+    actions: List[str]
+    rewards: List[float]
+    logprobs: List[float]
+    dones: List[bool]
+    def __len__(self):
+        """Number of recorded steps, taken from the states list."""
+        step_count = len(self.states)
+        return step_count
+def collect_trajectory(env, model, tokenizer, max_steps=6, temperature=0.0):
+    """
+    Roll out a single episode and return it as a Trajectory.
+
+    The environment is reset, then for up to `max_steps` steps a prompt is
+    built, an action is generated and applied, and the prompt, action text,
+    log-prob, reward value and done flag are recorded. The rollout stops early
+    when the environment reports done.
+    """
+    rollout = Trajectory([], [], [], [], [])
+    transcript = []
+    observation = env.reset()
+    for _ in range(max_steps):
+        prompt = build_prompt(observation, transcript)
+        rollout.states.append(prompt)
+        action_text, logprob = generate_action_with_logprob(prompt, model, tokenizer, temperature)
+        rollout.actions.append(action_text)
+        rollout.logprobs.append(logprob)
+        env_action = map_to_env(parse_action(action_text))
+        observation, reward, finished, _info = env.step(env_action)
+        rollout.rewards.append(reward.value)
+        rollout.dones.append(finished)
+        # The transcript feeds the "Previous steps" section of the next prompt.
+        transcript.extend((f"Agent: {action_text}", f"Env: {observation.last_tool_output}"))
+        if finished:
+            break
+    return rollout
+def collect_trajectories(env, model, tokenizer, n_trajectories, max_steps=6,
+                         task_levels=None, task_weights=None, temperature=0.0):
+    """
+    Collect `n_trajectories` rollouts, sampling a task level for each one.
+
+    `task_levels` defaults to the keys of BUG_DB; `task_weights`, when given,
+    must have one entry per level and a positive total. `temperature` is
+    forwarded to collect_trajectory so callers can sample instead of decoding
+    greedily; the default of 0.0 keeps the previous greedy behaviour.
+
+    Raises ValueError for empty `task_levels` or inconsistent `task_weights`.
+    """
+    if task_levels is None:
+        task_levels = list(BUG_DB.keys())
+    if not task_levels:
+        # random.choices would otherwise fail with an opaque IndexError.
+        raise ValueError("task_levels must not be empty")
+    if task_weights is not None and len(task_weights) != len(task_levels):
+        raise ValueError("task_weights must match task_levels length")
+    if task_weights is not None and sum(task_weights) <= 0:
+        raise ValueError("task_weights must have a positive total")
+    trajectories = []
+    for i in range(n_trajectories):
+        sampled_task = random.choices(task_levels, weights=task_weights, k=1)[0]
+        env.set_task(sampled_task)
+        traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=temperature)
+        total_reward = sum(traj.rewards)
+        print(f"Trajectory {i+1}/{n_trajectories}: task={sampled_task}, steps={len(traj)}, reward={total_reward:.3f}")
+        trajectories.append(traj)
+    return trajectories
+def compute_returns_and_advantages(rewards, dones, gamma=0.99, standardize=True):
+    """
+    Return `(advantages, returns)` for one reward sequence.
+
+    Returns are discounted sums accumulated backwards, with the running sum
+    reset at every step flagged done. With `standardize`, advantages are the
+    returns minus their mean, divided by their standard deviation plus 1e-8;
+    otherwise they are a copy of the returns.
+    """
+    horizon = len(rewards)
+    returns = [0.0] * horizon
+    carry = 0.0
+    for t in range(horizon - 1, -1, -1):
+        # A done flag means nothing after step t belongs to this episode.
+        tail = 0.0 if dones[t] else carry
+        carry = rewards[t] + gamma * tail
+        returns[t] = carry
+    if not standardize:
+        return list(returns), returns
+    centered = np.array(returns) - np.mean(returns)
+    scale = np.std(centered) + 1e-8
+    return (centered / scale).tolist(), returns
+def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsilon=0.2,
+               entropy_coef=0.01, gamma=0.99):
+    """
+    Run clipped-surrogate PPO updates over the collected trajectories.
+
+    Every (state, action) pair is re-scored by the current model and one
+    optimizer step is taken per sample, for `n_epochs` shuffled passes.
+    Advantages are standardised per trajectory. Returns a dict with the mean
+    "loss", "policy_loss" and "entropy" over all updates (0.0 when no update
+    was made).
+    """
+    model.train()
+    all_states, all_actions, all_old_logprobs, all_advantages = [], [], [], []
+    for traj in trajectories:
+        advantages, _ = compute_returns_and_advantages(traj.rewards, traj.dones, gamma=gamma, standardize=True)
+        all_states.extend(traj.states)
+        all_actions.extend(traj.actions)
+        all_old_logprobs.extend(traj.logprobs)
+        all_advantages.extend(advantages)
+    n_samples = len(all_states)
+    total_loss, total_policy_loss, total_entropy, n_updates = 0.0, 0.0, 0.0, 0
+    for _ in range(n_epochs):
+        for i in np.random.permutation(n_samples):
+            advantage = all_advantages[i]
+            messages = [{"role": "user", "content": all_states[i]}]
+            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = tokenizer(formatted + all_actions[i], return_tensors="pt", max_length=480, truncation=True).to("cuda")
+            # Measure the prompt with the same tokenizer settings as the full text
+            # so any special tokens the call adds shift both equally, and read the
+            # action ids from the model input itself so truncation and in-context
+            # tokenization are respected.
+            prefix_len = len(tokenizer(formatted, max_length=480, truncation=True)["input_ids"])
+            input_ids = inputs["input_ids"][0]
+            action_ids = input_ids[prefix_len:]
+            if prefix_len == 0 or action_ids.numel() == 0:
+                continue  # the action was truncated away; nothing to score
+            logits = model(**inputs).logits
+            # logits[t] predicts token t+1, so the action span is scored one step earlier.
+            step_log_probs = F.log_softmax(logits[0, prefix_len - 1 : input_ids.shape[0] - 1], dim=-1)
+            new_logprob = step_log_probs.gather(1, action_ids.unsqueeze(1)).sum()
+            avg_entropy = -(step_log_probs.exp() * step_log_probs).sum(dim=-1).mean()
+            # Bound the log-ratio so placeholder rollout log-probs (e.g. -100.0 for
+            # fallback actions) cannot overflow exp() into inf/nan gradients.
+            log_ratio = torch.clamp(new_logprob - all_old_logprobs[i], -20.0, 20.0)
+            ratio = torch.exp(log_ratio)
+            surr1 = ratio * advantage
+            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
+            policy_loss = -torch.min(surr1, surr2)
+            loss = policy_loss - entropy_coef * avg_entropy
+            optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optimizer.step()
+            total_loss += loss.item()
+            total_policy_loss += policy_loss.item()
+            total_entropy += avg_entropy.item()
+            n_updates += 1
+    torch.cuda.empty_cache()
+    return {"loss": total_loss / n_updates if n_updates else 0.0,
+            "policy_loss": total_policy_loss / n_updates if n_updates else 0.0,
+            "entropy": total_entropy / n_updates if n_updates else 0.0}
+def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
+                    task_levels=None, verbose=False):
+    """
+    Run greedy rollouts, cycling through the task levels, and summarise rewards.
+
+    Returns a dict with the mean, standard deviation, minimum and maximum
+    episode reward plus "traces": one behaviour record per episode when
+    `verbose` is set, otherwise an empty list.
+    """
+    def action_name(raw):
+        # Action texts that are not JSON objects are reported as "?".
+        try:
+            return json.loads(raw).get("action_type", "?")
+        except Exception:
+            return "?"
+    model.eval()
+    levels = list(BUG_DB.keys()) if task_levels is None else task_levels
+    episode_rewards = []
+    traces = []
+    for episode in range(n_episodes):
+        task = levels[episode % len(levels)]
+        env.set_task(task)
+        rollout = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.0)
+        episode_total = sum(rollout.rewards)
+        episode_rewards.append(episode_total)
+        if not verbose:
+            continue
+        traces.append({
+            "task": task,
+            "reward": round(episode_total, 4),
+            "steps": len(rollout),
+            "actions": [action_name(raw) for raw in rollout.actions],
+        })
+    summary = {
+        "avg_reward": float(np.mean(episode_rewards)),
+        "std_reward": float(np.std(episode_rewards)),
+        "min_reward": float(np.min(episode_rewards)),
+        "max_reward": float(np.max(episode_rewards)),
+        "traces": traces,
+    }
+    return summary
+# ======================================================================
+# MANUAL WARM-UP (no SFTTrainer → no multiprocessing OOM)
+# ======================================================================
+def json_warmup(model, tokenizer, json_path="training_data.json",
+                n_episodes=20, epochs=2, lr=2e-5):
+    """
+    Supervised warm-up from pre-generated expert demonstrations.
+
+    Reads a JSON list of {"prompt", "action"} records from `json_path`, keeps
+    at most `n_episodes * 7` of them, and takes one manual AdamW step per
+    example using cross-entropy on the action tokens only. No SFTTrainer and
+    no multiprocessing are involved.
+
+    Returns the list of per-epoch average losses.
+    """
+    print("\n" + "="*60)
+    print("SUPERVISED WARM-UP: training_data.json (manual cross-entropy)")
+    print("="*60)
+    with open(json_path, encoding="utf-8") as f:
+        data = json.load(f)
+    # Each episode = 7 steps. Select n_episodes worth.
+    steps_per_episode = 7
+    max_examples = n_episodes * steps_per_episode
+    if max_examples < len(data):
+        data = data[:max_examples]
+    print(f"  {len(data)} examples ({len(data)//steps_per_episode} episodes), "
+          f"{epochs} epoch(s), lr={lr}")
+    model.train()
+    warmup_opt = AdamW(model.parameters(), lr=lr)
+    warmup_losses = []   # per-epoch avg loss
+    for epoch in range(epochs):
+        random.shuffle(data)
+        epoch_loss = 0.0
+        n_valid = 0
+        for i, example in enumerate(data):
+            prompt = example["prompt"]
+            action = example["action"]
+            # ---- tokenize full sequence (prompt + action) ----
+            messages = [
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": action},
+            ]
+            full_text = tokenizer.apply_chat_template(messages, tokenize=False)
+            inputs = tokenizer(full_text, return_tensors="pt",
+                               max_length=480, truncation=True).to("cuda")
+            # ---- find where the action tokens start ----
+            prompt_only = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                tokenize=False, add_generation_prompt=True
+            )
+            # Use the same tokenizer settings as for the full text: any special
+            # tokens the call adds then shift both sequences equally, so the
+            # prompt length indexes the right position in `inputs`.
+            prompt_len = len(tokenizer(prompt_only, max_length=480,
+                                       truncation=True)["input_ids"])
+            total_len = inputs.input_ids.shape[1]
+            if prompt_len == 0 or prompt_len >= total_len:
+                continue  # action was truncated away, skip
+            # ---- cross-entropy on action tokens only ----
+            logits = model(**inputs).logits
+            # next-token prediction: logits[t] predicts token[t+1]
+            shift_logits = logits[0, prompt_len - 1 : total_len - 1]
+            shift_labels = inputs.input_ids[0, prompt_len : total_len]
+            loss = F.cross_entropy(shift_logits, shift_labels)
+            warmup_opt.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            warmup_opt.step()
+            epoch_loss += loss.item()
+            n_valid += 1
+            if (i + 1) % 25 == 0:
+                # Guard the division: every example so far may have been skipped.
+                avg = epoch_loss / max(n_valid, 1)
+                print(f"    epoch {epoch+1}  step {i+1:3d}/{len(data)}  "
+                      f"running_loss={avg:.4f}")
+        avg_loss = epoch_loss / max(n_valid, 1)
+        warmup_losses.append(avg_loss)
+        print(f"  Epoch {epoch+1} done: avg_loss={avg_loss:.4f}  "
+              f"({n_valid} valid examples)")
+    torch.cuda.empty_cache()
+    print(f"✓ Warm-up complete.  Loss: "
+          f"{' → '.join(f'{l:.4f}' for l in warmup_losses)}\n")
+    return warmup_losses
+# ======================================================================
+# MAIN TRAINING PIPELINE
+# ======================================================================
def _print_episode_traces(header, traces):
    """Print *header* followed by one summary line per evaluation episode."""
    print(header)
    for t in traces:
        print(f"  task={t['task']:8s}  reward={t['reward']:+.4f}  "
              f"steps={t['steps']}  actions={t['actions']}")


def _save_and_close(fig, path):
    """Write *fig* to *path* at 150 dpi and close it even if saving fails."""
    try:
        fig.tight_layout()
        fig.savefig(path, dpi=150)
    finally:
        # Always release the figure so a failed save does not leak it.
        plt.close(fig)
    return path


def _draw_warmup_loss(ax, warmup_losses, title, ylabel, **title_kwargs):
    """Draw the per-epoch warm-up loss curve on *ax*."""
    ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
            marker="o", linewidth=2, color="tab:purple")
    ax.set_title(title, **title_kwargs)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.grid(alpha=0.3)


def _draw_reward_curves(ax, iters, reward_history, eval_history,
                        baseline_reward, warmup_reward, *, compact):
    """Draw collect/eval reward curves plus baseline and post-warmup lines.

    ``compact=True`` uses the short labels and small legend of the summary
    panel; ``compact=False`` uses the styling of the standalone figure.
    """
    if compact:
        collect_lbl, eval_lbl = "Collect", "Eval"
        base_lbl, warm_lbl = "Baseline", "Post-warmup"
    else:
        collect_lbl, eval_lbl = "Collect reward", "Eval reward"
        base_lbl = f"Baseline ({baseline_reward:+.3f})"
        warm_lbl = f"Post-warmup ({warmup_reward:+.3f})"

    ax.plot(iters, reward_history, marker="o", linewidth=2,
            label=collect_lbl, color="tab:blue")
    ax.plot(iters, eval_history, marker="s", linewidth=2, linestyle="--",
            label=eval_lbl, color="tab:green")
    ax.axhline(y=baseline_reward, color="tab:gray", linestyle=":",
               linewidth=1.5, label=base_lbl)
    ax.axhline(y=warmup_reward, color="tab:purple", linestyle=":",
               linewidth=1.5, label=warm_lbl)

    if compact:
        ax.set_title("B. PPO Reward ↑")
        ax.set_xlabel("Iteration")
        ax.set_ylabel("Avg Reward")
        ax.legend(fontsize=7)
    else:
        ax.set_title("PPO Reward per Iteration", fontsize=14, fontweight="bold")
        ax.set_xlabel("Iteration")
        ax.set_ylabel("Average Reward")
        ax.legend(loc="best", fontsize=8)
    ax.grid(alpha=0.3)


def _draw_loss_curves(ax, iters, loss_history, policy_loss_history, *, compact):
    """Draw total and policy loss curves on *ax* (see ``compact`` above)."""
    total_lbl, policy_lbl = (("Total", "Policy") if compact
                             else ("Total loss", "Policy loss"))
    ax.plot(iters, loss_history, marker="o", linewidth=2,
            label=total_lbl, color="tab:red")
    ax.plot(iters, policy_loss_history, marker="^", linewidth=2,
            linestyle="--", label=policy_lbl, color="tab:orange")

    if compact:
        ax.set_title("C. PPO Loss ↓")
    else:
        ax.set_title("PPO Loss per Iteration", fontsize=14, fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Loss")
    if compact:
        ax.legend(fontsize=7)
    else:
        ax.legend(loc="best")
    ax.grid(alpha=0.3)


def train_ppo(
    *,
    n_iterations: int = 8,
    trajectories_per_iter: int = 4,
    n_epochs: int = 1,
    max_steps: int = 6,
    learning_rate: float = 3e-5,
    clip_epsilon: float = 0.2,
    entropy_coef: float = 0.01,
    gamma: float = 0.99,
) -> None:
    """Run the full pipeline: baseline, warm-up, PPO, final eval, plots.

    The phases run in this order:

    0. Evaluate the freshly loaded model (5 episodes) to get a baseline.
    1. Supervised warm-up via ``json_warmup`` on ``training_data.json``,
       followed by a second 5-episode evaluation.
    2. ``n_iterations`` rounds of trajectory collection, ``ppo_update`` and
       a 3-episode evaluation.
    3. A final 5-episode evaluation, a printed summary, and PNG plots
       (``warmup_loss.png`` when warm-up losses exist, ``reward_curve.png``,
       ``loss_curve.png``, ``training_summary.png``).

    All parameters are keyword-only and default to the values that were
    previously hard-coded, so ``train_ppo()`` behaves as before.

    Args:
        n_iterations: Number of collect/update/evaluate rounds in phase 2.
        trajectories_per_iter: Trajectories requested from
            ``collect_trajectories`` in each round.
        n_epochs: Forwarded to ``ppo_update``.
        max_steps: Step limit forwarded to ``collect_trajectories`` and
            ``evaluate_policy``.
        learning_rate: Learning rate of the AdamW optimizer used in phase 2.
        clip_epsilon: Forwarded to ``ppo_update``.
        entropy_coef: Forwarded to ``ppo_update``.
        gamma: Forwarded to ``ppo_update``.

    Returns:
        None. Returns early, before any evaluation or training, when
        ``test_model_sanity`` reports failure.
    """
    # --- Pre-load embedder before LLM (Issue #13) ---
    from rltool import ToolBox
    print("Pre-loading sentence-transformer embedder...")
    ToolBox._get_embedder()
    print("✓ Embedder ready")

    # --- Load model ---
    print("Loading model...")
    model, tokenizer = load_model()
    if not test_model_sanity(model, tokenizer):
        return
    env = CodeReviewEnv()
    task_levels = list(BUG_DB.keys())

    # ==================================================================
    # PHASE 0: BASELINE (untrained policy)
    # ==================================================================
    print("\n" + "="*60)
    print("PHASE 0 – BASELINE EVALUATION (untrained)")
    print("="*60)
    baseline = evaluate_policy(env, model, tokenizer, n_episodes=5,
                               max_steps=max_steps, task_levels=task_levels,
                               verbose=True)
    baseline_reward = baseline["avg_reward"]
    print(f"Baseline avg reward: {baseline_reward:.4f}  "
          f"(min={baseline['min_reward']:.4f}, max={baseline['max_reward']:.4f})")
    _print_episode_traces("Baseline behavior:", baseline["traces"])

    # ==================================================================
    # PHASE 1: SUPERVISED WARM-UP (expert demos, manual CE)
    # ==================================================================
    warmup_losses = json_warmup(
        model, tokenizer,
        json_path="training_data.json",
        n_episodes=20,  # 140 examples (20 × 7 steps)
        epochs=2,
        lr=2e-5,
    )

    # Post-warmup evaluation
    print("="*60)
    print("POST WARM-UP EVALUATION")
    print("="*60)
    post_warmup = evaluate_policy(env, model, tokenizer, n_episodes=5,
                                  max_steps=max_steps, task_levels=task_levels,
                                  verbose=True)
    warmup_reward = post_warmup["avg_reward"]
    print(f"Post-warmup avg reward: {warmup_reward:.4f}  "
          f"(Δ vs baseline: {warmup_reward - baseline_reward:+.4f})")
    _print_episode_traces("Post-warmup behavior:", post_warmup["traces"])

    # ==================================================================
    # PHASE 2: PPO (collect trajectories, update, evaluate)
    # ==================================================================
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    print(f"\n{'='*60}")
    print(f"PHASE 2 – PPO TRAINING: {n_iterations} iterations × "
          f"{trajectories_per_iter} trajectories (true RL)")
    print(f"{'='*60}\n")

    reward_history = []
    eval_history = []
    loss_history = []
    policy_loss_history = []
    entropy_history = []

    for iteration in range(n_iterations):
        print(f"\n--- PPO Iteration {iteration + 1}/{n_iterations} ---")

        # Collect fresh trajectories from the environment with the current model.
        trajectories = collect_trajectories(
            env, model, tokenizer, trajectories_per_iter, max_steps,
            task_levels=task_levels, task_weights=None
        )
        avg_reward = float(np.mean([sum(t.rewards) for t in trajectories]))
        reward_history.append(avg_reward)
        print(f"  Collect  avg reward: {avg_reward:+.4f}")

        # PPO policy gradient update
        metrics = ppo_update(
            trajectories, model, tokenizer, optimizer,
            n_epochs=n_epochs, clip_epsilon=clip_epsilon,
            entropy_coef=entropy_coef, gamma=gamma
        )
        loss_history.append(float(metrics["loss"]))
        policy_loss_history.append(float(metrics["policy_loss"]))
        entropy_history.append(float(metrics["entropy"]))
        print(f"  Update   loss={metrics['loss']:.4f}  "
              f"policy={metrics['policy_loss']:.4f}  "
              f"entropy={metrics['entropy']:.4f}")

        # Evaluate the policy after the update.
        eval_m = evaluate_policy(env, model, tokenizer, n_episodes=3,
                                 max_steps=max_steps, task_levels=task_levels,
                                 verbose=False)
        eval_history.append(eval_m["avg_reward"])
        delta = eval_m["avg_reward"] - baseline_reward
        print(f"  Eval     avg reward: {eval_m['avg_reward']:+.4f}  "
              f"(Δ baseline: {delta:+.4f})")

    # ==================================================================
    # PHASE 3: FINAL EVALUATION (proof of learning)
    # ==================================================================
    print("\n" + "="*60)
    print("PHASE 3 – FINAL EVALUATION (after all training)")
    print("="*60)
    final = evaluate_policy(env, model, tokenizer, n_episodes=5,
                            max_steps=max_steps, task_levels=task_levels,
                            verbose=True)
    print(f"Final avg reward: {final['avg_reward']:.4f}  "
          f"(min={final['min_reward']:.4f}, max={final['max_reward']:.4f})")
    _print_episode_traces("Final behavior:", final["traces"])

    total_improvement = final["avg_reward"] - baseline_reward
    ppo_improvement = final["avg_reward"] - warmup_reward
    print(f"\n{'='*60}")
    print("TRAINING SUMMARY")
    print(f"  Baseline reward:    {baseline_reward:+.4f}")
    print(f"  Post-warmup reward: {warmup_reward:+.4f}  "
          f"(warmup Δ: {warmup_reward - baseline_reward:+.4f})")
    print(f"  Final reward:       {final['avg_reward']:+.4f}  "
          f"(PPO Δ: {ppo_improvement:+.4f})")
    print(f"  Total improvement:  {total_improvement:+.4f}")
    print(f"  Reward trend (PPO): {' → '.join(f'{r:+.3f}' for r in reward_history)}")
    print(f"  Loss trend (PPO):   {' → '.join(f'{l:.4f}' for l in loss_history)}")
    # Entropy was previously collected but never reported.
    print(f"  Entropy trend (PPO): {' → '.join(f'{e:.4f}' for e in entropy_history)}")
    if total_improvement > 0:
        print(f"  ✓ Agent IMPROVED by {total_improvement:+.4f}")
    else:
        print("  ✗ No overall improvement detected")
    print(f"{'='*60}")

    # ==================================================================
    # PLOTS
    # ==================================================================
    iters = list(range(1, n_iterations + 1))
    saved_plots = []  # only files that were actually written

    # --- 1. Warm-up loss curve (skipped when warm-up produced no losses) ---
    if warmup_losses:
        fig, ax = plt.subplots(figsize=(7, 4))
        _draw_warmup_loss(ax, warmup_losses,
                          "Warm-up Loss (supervised, per epoch)",
                          "Cross-Entropy Loss",
                          fontsize=13, fontweight="bold")
        saved_plots.append(_save_and_close(fig, "warmup_loss.png"))

    # --- 2. PPO reward curve ---
    fig, ax = plt.subplots(figsize=(9, 5))
    _draw_reward_curves(ax, iters, reward_history, eval_history,
                        baseline_reward, warmup_reward, compact=False)
    saved_plots.append(_save_and_close(fig, "reward_curve.png"))

    # --- 3. PPO loss curve ---
    fig, ax = plt.subplots(figsize=(9, 5))
    _draw_loss_curves(ax, iters, loss_history, policy_loss_history,
                      compact=False)
    saved_plots.append(_save_and_close(fig, "loss_curve.png"))

    # --- 4. Combined 3-panel summary ---
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    # Panel A stays an empty axis when there are no warm-up losses.
    if warmup_losses:
        _draw_warmup_loss(axes[0], warmup_losses,
                          "A. Warm-up Loss ↓", "CE Loss")
    _draw_reward_curves(axes[1], iters, reward_history, eval_history,
                        baseline_reward, warmup_reward, compact=True)
    _draw_loss_curves(axes[2], iters, loss_history, policy_loss_history,
                      compact=True)
    fig.suptitle("Code Review Agent – Full Training Evidence",
                 fontsize=14, fontweight="bold")
    saved_plots.append(_save_and_close(fig, "training_summary.png"))

    print("Plots saved: " + ", ".join(saved_plots))
    print("="*60)
 if __name__ == "__main__":
+    train_ppo()