Spaces:

100XZX001
/

code-review-training

Sleeping

App Files Files Community

100XZX001 committed on Apr 26

Commit

a3960cd

verified ·

1 Parent(s): a1d7a9c

Update training.py

Browse files

Files changed (1) hide show

training.py +708 -328

training.py CHANGED Viewed

@@ -1,35 +1,96 @@
-# training.py – Clean PPO + QLoRA + Supervised Warm‑up (evidence‑driven RL)
-import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-import json
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
-from dataclasses import dataclass
-from typing import List, Optional
 import numpy as np
-import random
-import matplotlib
-matplotlib.use("Agg")
-import matplotlib.pyplot as plt
-from collections import Counter
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from peft import LoraConfig, get_peft_model, TaskType
 from environment import CodeReviewEnv
 from redteam import BUG_DB
-from models import map_to_env as model_map_to_env
-# =========================================================
-# DEVICE
-# =========================================================
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# =========================================================
 # DATA STRUCTURES
-# =========================================================
 @dataclass
 class AgentAction:
     action_type: str
@@ -37,305 +98,486 @@ class AgentAction:
 @dataclass
 class Trajectory:
-    states: List[str]
-    actions: List[str]
-    rewards: List[float]
     logprobs: List[float]
-    dones: List[bool]
-# =========================================================
 # ACTION PARSER
-# =========================================================
-def parse_action(output: str) -> AgentAction:
     try:
-        data = json.loads(output)
-        return AgentAction(
-            action_type=data.get("action_type", "").lower(),
-            content=data.get("content")
-        )
-    except:
-        return AgentAction("skip", None)
 def map_to_env(action: AgentAction):
     return model_map_to_env(action.action_type, action.content)
-# =========================================================
-# MODEL
-# =========================================================
 def load_model():
-    model_name = "microsoft/Phi-3-mini-4k-instruct"
-    bnb = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_quant_type="nf4"
     )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb,
-        device_map="auto",
-        torch_dtype=torch.bfloat16
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
-    lora = LoraConfig(
-        r=16,
-        lora_alpha=32,
-        target_modules=["q_proj","k_proj","v_proj","o_proj",
-                       "gate_proj","up_proj","down_proj"],
-        lora_dropout=0.0,
-        bias="none",
-        task_type=TaskType.CAUSAL_LM
-    )
-    model = get_peft_model(model, lora)
-    model.gradient_checkpointing_enable()
     return model, tokenizer
-# =========================================================
-# PROMPT BUILDER (full environment context)
-# =========================================================
 def build_prompt(obs, history_lines: List[str]) -> str:
-    author_msg = getattr(obs, "author_response", "") or ""
-    tool_output = getattr(obs, "last_tool_output", "") or ""
-    author_personality = getattr(obs, "author_personality", "defensive")
-    prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
-The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
-- Tests pass (high pass ratio)
-- Lint is clean (zero errors)
-- Documentation or references are provided
-- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
-Workflow:
-1. Use `inspect` to understand the code.
-2. Use `run_tests` and `run_linter` to gather evidence.
-3. Use `query_docs` when you need references or language‑specific guidance.
-4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
-5. If the developer pushes back, read their response carefully and address their specific concern.
-6. Once convinced, use `done` to finish.
-Code:
-{obs.code_snippet}
-Author says:
-{author_msg if author_msg else "(no response yet – start with inspection)"}
-Last tool output:
-{tool_output if tool_output else "(none)"}
-Available actions:
-run_tests, run_linter, inspect, query_docs, fix, comment, question, done
-Respond ONLY in JSON:
-{{"action_type": "...", "content": "..."}}"""
     if history_lines:
-        history = "\n".join(history_lines[-6:])
-        prompt += f"\n\nPrevious steps:\n{history}"
-    return prompt
-# =========================================================
-# GENERATION
-# =========================================================
-def generate_action(prompt, model, tokenizer, temperature):
-    messages = [{"role": "user", "content": prompt}]
-    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(formatted, return_tensors="pt", truncation=True).to(DEVICE)
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=128,
-        do_sample=temperature > 0,
-        temperature=temperature if temperature > 0 else None,
-        return_dict_in_generate=True,
-        output_scores=True
     )
-    gen_ids = outputs.sequences[0][inputs["input_ids"].shape[1]:]
-    text = tokenizer.decode(gen_ids, skip_special_tokens=True)
-    logprobs = []
-    for i, token_id in enumerate(gen_ids):
-        if i < len(outputs.scores):
-            logits = outputs.scores[i][0]
-            lp = F.log_softmax(logits, dim=-1)[token_id]
-            logprobs.append(lp)
-    if not logprobs:
-        return '{"action_type":"skip"}', -100.0
-    return text, torch.stack(logprobs).sum().item()
-# =========================================================
 # TRAJECTORY COLLECTION
-# =========================================================
-def collect_trajectory(env, model, tokenizer, max_steps, temperature):
     obs = env.reset()
-    history_lines = []
-    states, actions, rewards, logprobs, dones = [], [], [], [], []
-    metrics = {"test_score": [], "actions": []}
-    for step in range(max_steps):
-        prompt = build_prompt(obs, history_lines)
-        states.append(prompt)
-        action_text, lp = generate_action(prompt, model, tokenizer, temperature)
-        actions.append(action_text)
-        logprobs.append(lp)
-        action = parse_action(action_text)
-        env_action = map_to_env(action)
-        next_obs, reward, done, _ = env.step(env_action)
-        rewards.append(float(np.clip(reward.value, -1, 1)))
-        dones.append(done)
-        history_lines.append(f"Agent: {action_text}")
-        history_lines.append(f"Env: {next_obs.last_tool_output}")
-        metrics["test_score"].append(getattr(next_obs, "current_test_score", 0.0))
-        metrics["actions"].append(action.action_type)
-        obs = next_obs
-        if done:
             break
-    return Trajectory(states, actions, rewards, logprobs, dones), metrics
-# =========================================================
-# SUPERVISED WARM‑UP
-# =========================================================
-def supervised_warmup(model, tokenizer, data_path="training_data.json", epochs=3):
-    print("\n=== SUPERVISED WARMUP ===")
-    with open(data_path) as f:
         data = json.load(f)
-    optimizer = AdamW(model.parameters(), lr=2e-5)
     model.train()
-    for epoch in range(epochs):
         random.shuffle(data)
-        total_loss = 0
-        for ex in data:
-            prompt = ex["prompt"]
-            action = ex["action"]
-            messages = [
-                {"role": "user", "content": prompt},
-                {"role": "assistant", "content": action},
-            ]
-            text = tokenizer.apply_chat_template(messages, tokenize=False)
-            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
-            outputs = model(**inputs, labels=inputs["input_ids"])
-            loss = outputs.loss
-            optimizer.zero_grad()
             loss.backward()
-            optimizer.step()
-            total_loss += loss.item()
-        print(f"Epoch {epoch+1} Loss: {total_loss/len(data):.4f}")
-    print("✓ Warmup done\n")
-# =========================================================
-# PPO UPDATE (FIXED advantage = return – baseline)
-# =========================================================
-def ppo_update(trajectories, model, tokenizer, optimizer, clip=0.2, gamma=0.99):
     model.train()
-    losses = []
-    kls = []
-    # =========================
-    # Compute returns + baseline
-    # =========================
     all_returns = []
     traj_returns = []
     for traj in trajectories:
-        returns = []
-        running = 0.0
-        for r in reversed(traj.rewards):
-            running = r + gamma * running
-            returns.insert(0, running)
-        returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE)
-        traj_returns.append(returns)
-        all_returns.extend(returns.tolist())
-    baseline = torch.tensor(np.mean(all_returns), device=DEVICE) if all_returns else torch.tensor(0.0, device=DEVICE)
-    # =========================
-    # PPO update
-    # =========================
-    for traj, returns in zip(trajectories, traj_returns):
         for i in range(len(traj.states)):
-            state = traj.states[i]
             action = traj.actions[i]
-            old_lp = torch.tensor(traj.logprobs[i], device=DEVICE)
-            # Advantage (detached)
-            adv = (returns[i] - baseline).detach()
-            messages = [{"role": "user", "content": state}]
-            formatted = tokenizer.apply_chat_template(
-                messages, tokenize=False, add_generation_prompt=True
             )
-            full = formatted + action
-            inputs = tokenizer(full, return_tensors="pt", truncation=True).to(DEVICE)
-            logits = model(**inputs).logits
-            action_ids = tokenizer.encode(action, add_special_tokens=False)
-            prefix_len = len(tokenizer.encode(formatted, add_special_tokens=False))
-            logps = []
-            entropy = 0.0
-            for idx in range(len(action_ids)):
-                pos = prefix_len + idx
-                if pos == 0 or pos >= logits.shape[1]:
-                    continue
-                token_logits = logits[0, pos - 1]
-                log_probs = F.log_softmax(token_logits, dim=-1)
-                lp = log_probs[action_ids[idx]]
-                logps.append(lp)
-                probs = torch.exp(log_probs)
-                entropy += (-(probs * log_probs).sum()).detach()
-            if not logps:
                 continue
-            new_lp = torch.stack(logps).sum()
-            # PPO ratio
-            ratio = torch.exp(new_lp - old_lp)
-            s1 = ratio * adv
-            s2 = torch.clamp(ratio, 1 - clip, 1 + clip) * adv
             policy_loss = -torch.min(s1, s2)
-            loss = policy_loss - 0.01 * (entropy / len(logps))
-            if torch.isnan(loss):
                 continue
             optimizer.zero_grad()
@@ -343,89 +585,227 @@ def ppo_update(trajectories, model, tokenizer, optimizer, clip=0.2, gamma=0.99):
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             optimizer.step()
-            kl = (old_lp - new_lp).detach().cpu().item()
-            kls.append(kl)
             losses.append(loss.item())
-    return (
-        float(np.mean(losses)) if losses else 0.0,
-        float(np.mean(kls)) if kls else 0.0,
     )
-# =========================================================
-# MAIN TRAINING LOOP
-# =========================================================
 def train():
     model, tokenizer = load_model()
     env = CodeReviewEnv()
-    # ---------- Supervised warm‑up ----------
-    supervised_warmup(model, tokenizer, data_path="training_data.json", epochs=3)
-    optimizer = AdamW(model.parameters(), lr=3e-5)
-    reward_hist, success_hist, kl_hist = [], [], []
-    task_levels = list(BUG_DB.keys())
-    # Baseline evaluation (after warm‑up, before PPO)
-    baseline_rewards = []
-    for _ in range(5):
-        env.set_task(random.choice(task_levels))
-        traj, _ = collect_trajectory(env, model, tokenizer, 6, 0.0)
-        baseline_rewards.append(sum(traj.rewards))
-    baseline_reward = np.mean(baseline_rewards)
-    print(f"Baseline reward: {baseline_reward:+.4f}")
-    # PPO iterations
-    for it in range(15):
-        print(f"\nIteration {it+1}")
-        temperature = max(0.7 * (1 - it/15), 0.1)
-        trajectories = []
-        successes = 0
-        action_counter = Counter()
-        for _ in range(6):
-            env.set_task(random.choice(task_levels))
-            traj, metrics = collect_trajectory(env, model, tokenizer, 6, temperature)
-            trajectories.append(traj)
-            for a in metrics["actions"]:
-                action_counter[a] += 1
-            if sum(traj.rewards) > 0:
-                successes += 1
-        avg_reward = np.mean([sum(t.rewards) for t in trajectories])
-        success_rate = successes / len(trajectories)
-        loss, kl = ppo_update(trajectories, model, tokenizer, optimizer)
-        reward_hist.append(avg_reward)
-        success_hist.append(success_rate)
-        kl_hist.append(kl)
-        print(f"Reward: {avg_reward:+.4f}  Success: {success_rate:.2%}  KL: {kl:.4f}")
-        print(f"Actions: {dict(action_counter)}")
-    # ===================== Plots =====================
-    iters = list(range(1, len(reward_hist)+1))
-    plt.figure()
-    plt.plot(iters, reward_hist)
-    plt.axhline(y=baseline_reward, linestyle="--", color="gray")
-    plt.title("PPO Reward Curve")
-    plt.savefig("reward_curve.png")
-    plt.figure()
-    plt.plot(iters, success_hist)
-    plt.title("Success Rate")
-    plt.savefig("success_rate.png")
-    plt.figure()
-    plt.plot(iters, kl_hist)
-    plt.title("KL Divergence")
-    plt.savefig("kl_divergence.png")
-    print(f"\nTraining complete. Plots saved.")
-    print(f"Final reward: {np.mean(reward_hist[-3:]):+.4f}")
 if __name__ == "__main__":
     train()

+# training.py  –  PPO + QLoRA + Supervised Warm-up
+# Model : Qwen/Qwen2.5-1.5B-Instruct  (via Unsloth – 2× faster, fits Colab T4)
+# Fixed : label-masking, BPE-boundary alignment, log-ratio clamping, OOM guards
+# Evidence: reward curves, before/after traces, per-difficulty breakdown, KL, entropy
+# ============================================================
+import os, json, random, re
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
 import torch
 import torch.nn.functional as F
 from torch.optim import AdamW
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict
+from collections import Counter, defaultdict
 import numpy as np
+# ── Unsloth gives 2× throughput with identical outputs ────────────────────────
+from unsloth import FastLanguageModel
 from environment import CodeReviewEnv
 from redteam import BUG_DB
+# Graceful import: use project map_to_env if available, else inline fallback.
+# Three tiers, tried in order:
+#   1. models.map_to_env (the project's own mapper)
+#   2. a local dispatcher built from the individual action classes in models
+#   3. a duck-typed stand-in carrying action_type/content as attributes
+try:
+    from models import map_to_env as model_map_to_env
+    _HAVE_MODEL_MAP = True
+except (ImportError, AttributeError):
+    _HAVE_MODEL_MAP = False
+if not _HAVE_MODEL_MAP:
+    try:
+        from models import (RunTests, RunLinter, Inspect, ProposeFix,
+                            WriteComment, AskQuestion, Done, Skip, QueryDocs)
+        # One factory per action type.  Factories are only *called* for the
+        # action that was actually requested, so e.g. ProposeFix is never
+        # constructed (with somebody else's content) for a "run_tests" step.
+        _ACTION_BUILDERS = {
+            "run_tests":  lambda content: RunTests(),
+            "run_linter": lambda content: RunLinter(),
+            "inspect":    lambda content: Inspect(),
+            "query_docs": lambda content: QueryDocs(content or "python bug fix"),
+            "fix":        lambda content: ProposeFix(content or ""),
+            "comment":    lambda content: WriteComment(content or ""),
+            "question":   lambda content: AskQuestion(content or ""),
+            "done":       lambda content: Done(),
+        }
+        def model_map_to_env(action_type: str, content=None):
+            """Build the env action for `action_type`; unknown types map to Skip()."""
+            build = _ACTION_BUILDERS.get(action_type)
+            return Skip() if build is None else build(content)
+    except ImportError:
+        # Last resort: duck-typed object the env can introspect.
+        class _EnvAction:
+            def __init__(self, **kw): self.__dict__.update(kw)
+        def model_map_to_env(action_type: str, content=None):
+            """Wrap the raw action fields in a plain attribute bag."""
+            return _EnvAction(action_type=action_type, content=content)
+# ══════════════════════════════════════════════════════════════════════════════
+# CONFIG
+# ══════════════════════════════════════════════════════════════════════════════
+# Every tunable used by this script lives in this one mapping.
+CFG = {
+    # Model / LoRA
+    "model_name":      "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
+    "max_seq_len":     512,       # hard cap; prevents OOM on T4
+    "lora_r":          16,
+    "lora_alpha":      32,
+    # Warm-up
+    "warmup_data":     "training_data.json",
+    "warmup_epochs":   2,
+    "warmup_lr":       2e-5,
+    "warmup_grad_acc": 4,         # effective batch = 4 examples
+    # PPO
+    "ppo_iters":       15,
+    "trajs_per_iter":  6,
+    "max_steps":       7,
+    "ppo_lr":          3e-5,
+    "clip_eps":        0.2,
+    "entropy_coef":    0.01,
+    "gamma":           0.99,
+    "log_ratio_clamp": 5.0,       # ← prevents exp-explosion / NaN loss
+    "temp_start":      0.8,
+    "temp_end":        0.1,
+    # Eval
+    "eval_episodes":   10,        # episodes per evaluation snapshot
+}
+# Device string passed to `.to(...)` below: GPU when torch can see one, else CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Task identifiers in BUG_DB's own key order; evaluation cycles through them.
+TASK_LEVELS = list(BUG_DB.keys())   # [easy, medium, hard, harder, hardest]
+# ══════════════════════════════════════════════════════════════════════════════
 # DATA STRUCTURES
+# ══════════════════════════════════════════════════════════════════════════════
 @dataclass
 class AgentAction:
     action_type: str
 @dataclass
 class Trajectory:
+    """One episode's rollout; the per-step lists are appended to in lockstep."""
+    # Prompt text shown to the model at each step.
+    states:   List[str]
+    # Raw generated text for each step (before parsing into an AgentAction).
+    actions:  List[str]
+    # Per-step reward after shaping and clipping to [-1, 1].
+    rewards:  List[float]
+    # Summed log-probability of each step's generated tokens.
     logprobs: List[float]
+    # Per-step termination flag as decided by the collector.
+    dones:    List[bool]
+    # Task identifier the episode was run on (empty when not recorded).
+    task:     str = ""
+@dataclass
+class EvalSnapshot:
+    """Captures full agent behaviour for before/after comparison."""
+    # Mean of the per-episode reward sums over all evaluated episodes.
+    avg_reward:    float
+    # Task name -> mean episode reward on that task.
+    per_task:      Dict[str, float]  = field(default_factory=dict)
+    # Action type -> share of all actions taken during the evaluation.
+    action_dist:   Dict[str, float]  = field(default_factory=dict)
+    # Fraction of episodes in which the agent emitted a "done" action.
+    success_rate:  float = 0.0
+    # Mean number of actions per episode.
+    avg_steps:     float = 0.0
+    # One dict per episode with keys "task", "reward", "steps", "actions".
+    traces:        List[dict] = field(default_factory=list)
+# ══════════════════════════════════════════════════════════════════════════════
 # ACTION PARSER
+# ══════════════════════════════════════════════════════════════════════════════
+def parse_action(text: str) -> AgentAction:
+    """Robust parser: tries strict JSON, then regex, then keyword heuristic.
+    Never raises on malformed model output; anything unrecognisable becomes
+    AgentAction("skip", None).
+    Args:
+        text: raw text generated by the model for one step.
+    Returns:
+        AgentAction with a lower-cased action_type and the content, if any.
+    """
+    text = text.strip()
+    try:
+        d = json.loads(text)
+    except json.JSONDecodeError:
+        d = None
+    # Valid JSON is not necessarily an object: '[]', '"done"' or '3' parse
+    # fine but have no .get().  Only trust a dict here and let everything
+    # else fall through to the looser heuristics below.
+    if isinstance(d, dict):
+        action_type = d.get("action_type", "skip")
+        if not isinstance(action_type, str):
+            action_type = "skip"      # e.g. {"action_type": null}
+        return AgentAction(action_type.lower(), d.get("content"))
+    m = re.search(r'"action_type"\s*:\s*"(\w+)"', text)
+    if m:
+        # Stop at the first *unescaped* quote so content containing \" (very
+        # common in proposed code fixes) is not cut short.
+        cm = re.search(r'"content"\s*:\s*"((?:[^"\\]|\\.)*)"', text, re.DOTALL)
+        return AgentAction(m.group(1).lower(), cm.group(1) if cm else None)
+    tl = text.lower()
+    for kw in ("run_tests","run_linter","inspect","query_docs","fix",
+               "comment","question","done"):
+        if kw in tl:
+            # content is passed explicitly so this works whether or not the
+            # dataclass declares a default for it.
+            return AgentAction(kw, None)
+    return AgentAction("skip", None)
 def map_to_env(action: AgentAction):
     return model_map_to_env(action.action_type, action.content)
+# ══════════════════════════════════════════════════════════════════════════════
+# MODEL  (Qwen2.5-1.5B via Unsloth)
+# ══════════════════════════════════════════════════════════════════════════════
 def load_model():
+    """Load the 4-bit base model named in CFG, attach LoRA adapters, return (model, tokenizer)."""
+    name = CFG["model_name"]
+    print(f"Loading {name} …")
+    base, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=name,
+        max_seq_length=CFG["max_seq_len"],
+        load_in_4bit=True,
+    )
+    # Attention projections plus the MLP projections receive adapters.
+    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"]
+    model = FastLanguageModel.get_peft_model(
+        base,
+        r=CFG["lora_r"],
+        lora_alpha=CFG["lora_alpha"],
+        target_modules=lora_targets,
+        lora_dropout=0.0,
+    )
     tokenizer.pad_token = tokenizer.eos_token
+    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"  trainable params: {n_trainable/1e6:.1f}M")
     return model, tokenizer
+# ══════════════════════════════════════════════════════════════════════════════
+# PROMPT BUILDER
+# ══════════════════════════════════════════════════════════════════════════════
 def build_prompt(obs, history_lines: List[str]) -> str:
+    """Render the observation, plus the last four history lines, as one prompt string."""
+    author_text = getattr(obs, "author_response", "") or ""
+    tool_text   = getattr(obs, "last_tool_output", "") or ""
+    persona     = getattr(obs, "author_personality", "defensive")
+    # Trim tool output to avoid context explosion
+    if len(tool_text) > 600:
+        tool_text = tool_text[:600] + " …[truncated]"
+    # Placeholders for the two optional fields.
+    if not author_text:
+        author_text = "(no response yet – start with inspect)"
+    if not tool_text:
+        tool_text = "(none)"
+    # Each entry is one paragraph; paragraphs are separated by a blank line.
+    sections = [
+        "You are an AI code review agent. Convince the developer (personality: "
+        f"**{persona}**) to accept your fix. Name your fix function `fix`.",
+        "Evidence required: tests pass, lint clean, docs cited, reasoning uses "
+        "'because'/'therefore' (>30 words).",
+        "Workflow: inspect → run_tests → run_linter → query_docs → fix → "
+        "comment/question → done.",
+        f"Code:\n{obs.code_snippet}",
+        f"Author: {author_text}",
+        f"Last tool: {tool_text}",
+        "Actions: run_tests, run_linter, inspect, query_docs, fix, comment, question, done",
+        'Respond ONLY in JSON: {"action_type": "...", "content": "..."}',
+    ]
+    if history_lines:
+        sections.append("Recent steps:\n" + "\n".join(history_lines[-4:]))
+    return "\n\n".join(sections)
+# ══════════════════════════════════════════════════════════════════════════════
+# BUG FIX 1 – label masking in supervised warmup
+# (original: labels=inputs["input_ids"] trains on ALL tokens, including prompt)
+# ══════════════════════════════════════════════════════════════════════════════
+def _masked_labels(input_ids: torch.Tensor, prompt_len: int) -> torch.Tensor:
+    """Return labels with prompt positions set to -100 (ignored by CE loss).
+    Args:
+        input_ids:  [batch, seq_len] token ids; left unmodified.
+        prompt_len: number of leading positions to mask in every row.
+    Returns:
+        A copy of input_ids with the first prompt_len columns set to -100.
+    """
+    labels = input_ids.clone()
+    # Mask every row, not just row 0, so the helper stays correct if a caller
+    # ever passes a batch.  A negative length would slice from the wrong end,
+    # so clamp it to "mask nothing".
+    labels[:, :max(prompt_len, 0)] = -100
+    return labels
+# ══════════════════════════════════════════════════════════════════════════════
+# BUG FIX 2 – BPE-boundary-safe logprob computation
+# (original: tokenize(prompt) + tokenize(action) ≠ tokenize(prompt+action))
+# ══════════════════════════════════════════════════════════════════════════════
+def _compute_action_logprob(
+    logits:      torch.Tensor,   # [1, seq_len, vocab]
+    input_ids:   torch.Tensor,   # [1, seq_len]
+    prompt_len:  int,            # #tokens in the prompt part of the joint sequence
+) -> tuple:
+    """
+    Compute sum of log-probs for *action* tokens only, using the jointly
+    tokenised sequence so BPE boundaries are respected.
+    Returns (total_logprob, avg_entropy, n_tokens), where total_logprob keeps
+    its autograd graph, avg_entropy is detached, and n_tokens is the number of
+    action tokens that were actually scored (0 when there are none).
+    """
+    # Work on whatever device the logits live on instead of a module global.
+    device = logits.device
+    # The token at position p is predicted by the logit row at p-1, so
+    # position 0 can never be scored and p may not run past the last logit row.
+    first = max(prompt_len, 1)
+    stop  = min(input_ids.shape[1], logits.shape[1] + 1)
+    n = stop - first
+    if n <= 0:
+        return torch.zeros((), device=device), torch.zeros((), device=device), 0
+    targets  = input_ids[0, first:stop].to(device)                      # [n]
+    lp_dist  = F.log_softmax(logits[0, first - 1:stop - 1], dim=-1)     # [n, vocab]
+    # Pick each realised token's log-prob and sum in float32.
+    total_lp = lp_dist.gather(-1, targets.unsqueeze(-1)).float().sum()
+    # Entropy is reported for monitoring only, so it never needs a graph.
+    with torch.no_grad():
+        avg_ent = (-(lp_dist.exp() * lp_dist).sum(dim=-1)).float().mean()
+    return total_lp, avg_ent, n
+# ══════════════════════════════════════════════════════════════════════════════
+# GENERATION  (returns text + joint-sequence logprob)
+# ══════════════════════════════════════════════════════════════════════════════
+@torch.no_grad()
+def generate_action(prompt: str, model, tokenizer,
+                    temperature: float) -> tuple:
+    """Generate one action for `prompt` and score it.
+    The prompt is wrapped as a single user turn, truncated to
+    CFG["max_seq_len"] - 128 tokens, and up to 128 new tokens are generated
+    (sampling when temperature > 0, otherwise without sampling).
+    Returns:
+        (text, logprob): the decoded, stripped generation and the summed
+        log-probability of the generated tokens as a Python float.  When the
+        generation decodes to an empty string, a randomly chosen canned action
+        and the fixed value -10.0 are returned instead.
+    """
+    messages  = [{"role": "user", "content": prompt}]
+    formatted = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
     )
+    inputs = tokenizer(
+        formatted, return_tensors="pt",
+        max_length=CFG["max_seq_len"] - 128,   # leave room for response
+        truncation=True
+    ).to(DEVICE)
+    prompt_len = inputs["input_ids"].shape[1]
+    # NOTE(review): output_scores is requested but out.scores is never read
+    # below; the log-prob is recomputed with a separate forward pass.
+    gen_kwargs = dict(
+        max_new_tokens      = 128,
+        do_sample           = temperature > 0,
+        return_dict_in_generate = True,
+        output_scores       = True,
+        pad_token_id        = tokenizer.eos_token_id,
+        eos_token_id        = tokenizer.eos_token_id,
+    )
+    # temperature is only forwarded when sampling is enabled.
+    if temperature > 0:
+        gen_kwargs["temperature"] = temperature
+    out     = model.generate(**inputs, **gen_kwargs)
+    # Everything after the prompt tokens is the model's own continuation.
+    gen_ids = out.sequences[0][prompt_len:]
+    text    = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+    if not text:
+        # NOTE(review): the fallback text was not produced by the policy, so
+        # the stored log-prob is a placeholder rather than a measured value.
+        fallback = random.choice([
+            '{"action_type":"inspect"}',
+            '{"action_type":"run_tests"}',
+            '{"action_type":"run_linter"}',
+        ])
+        print(f"  [WARN] empty generation → fallback {fallback}")
+        # BUG FIX 3: don't use -100 sentinel; use a mildly negative logprob
+        # so that PPO ratio = exp(new - old) stays finite when re-evaluated
+        return fallback, -10.0
+    # Recompute logprob from the full joint sequence (BPE-safe)
+    joint_ids = torch.cat(
+        [inputs["input_ids"], gen_ids.unsqueeze(0).to(DEVICE)], dim=1
+    )
+    joint_ids = joint_ids[:, :CFG["max_seq_len"]]
+    # The score uses the model's raw logits; `temperature` is not applied here.
+    logits = model(input_ids=joint_ids).logits
+    lp, _, _ = _compute_action_logprob(logits, joint_ids, prompt_len)
+    return text, lp.item()
+# ══════════════════════════════════════════════════════════════════════════════
 # TRAJECTORY COLLECTION
+# ══════════════════════════════════════════════════════════════════════════════
+# Per-action shaped rewards.  These create reward variance so that
+# trajectories with meaningful tool use beat inspect-only episodes.
+# Action type -> bonus credited for an intermediate (non-"done") step.
+_STEP_REWARD = dict(
+    run_tests=0.08,
+    run_linter=0.05,
+    fix=0.15,
+    comment=0.08,
+    query_docs=0.05,
+    question=0.04,
+    inspect=0.0,     # neutral – observe before acting
+    done=0.0,        # env handles the terminal reward
+    skip=-0.10,      # penalise doing nothing
+)
+def collect_trajectory(env, model, tokenizer,
+                       max_steps: int, temperature: float,
+                       task: str) -> tuple:
+    """
+    FIX 4 – Override env done/reward for non-terminal actions.
+    Root cause of the degenerate policy:
+    • env.step(Inspect()) returns done=True, reward=+0.002
+    • agent discovers inspect → tiny reward → done is the easiest path
+    • every trajectory is identical → zero advantage → PPO does nothing
+    Fix: only accept env's done+reward when the agent explicitly emits
+    {"action_type": "done"}.  For every other action, use a shaped step
+    reward and force the episode to continue.
+    Args:
+        env:         environment exposing set_task(), reset() and step().
+        model, tokenizer: passed through to generate_action().
+        max_steps:   upper bound on the number of actions in the episode.
+        temperature: sampling temperature passed to generate_action().
+        task:        task identifier selected via env.set_task() before reset.
+    Returns:
+        (traj, action_seq): the filled Trajectory and the list of parsed
+        action types, one per step taken.
+    """
+    env.set_task(task)
     obs = env.reset()
+    history: List[str] = []
+    traj = Trajectory([], [], [], [], [], task=task)
+    action_seq = []
+    for step_num in range(max_steps):
+        prompt = build_prompt(obs, history)
+        traj.states.append(prompt)
+        text, lp = generate_action(prompt, model, tokenizer, temperature)
+        traj.actions.append(text)
+        traj.logprobs.append(lp)
+        action = parse_action(text)
+        action_seq.append(action.action_type)
+        # NOTE(review): env_done is deliberately unused.  The loop keeps calling
+        # env.step() after the env may have reported done – confirm that the
+        # environment tolerates being stepped in that state.
+        obs, reward, env_done, _ = env.step(map_to_env(action))
+        raw_r = float(reward.value)
+        if action.action_type == "done":
+            # Agent explicitly chose to terminate → honour env reward
+            shaped_r     = raw_r
+            effective_done = True
+        else:
+            # Intermediate step: use shaped reward, ignore env's done signal.
+            # Also keep a fraction of any large env reward (e.g. test pass).
+            shaped_r = _STEP_REWARD.get(action.action_type, 0.0)
+            if raw_r > 0.1:            # env signalling meaningful progress
+                shaped_r += raw_r * 0.3
+            effective_done = False     # ← key: don't let env short-circuit
+        traj.rewards.append(float(np.clip(shaped_r, -1.0, 1.0)))
+        traj.dones.append(effective_done)
+        # Only the first 120 characters of each side go into the history.
+        history.append(f"Agent: {text[:120]}")
+        history.append(f"Env: {(obs.last_tool_output or '')[:120]}")
+        if effective_done:
             break
+    return traj, action_seq
+# ══════════════════════════════════════════════════════════════════════════════
+# SUPERVISED WARM-UP  (BUG FIX 1: action-only label masking)
+# ══════════════════════════════════════════════════════════════════════════════
+def supervised_warmup(model, tokenizer):
+    """Supervised fine-tuning on (prompt, action) pairs with action-only loss.
+    Reads a JSON list of records with "prompt" and "action" keys from
+    CFG["warmup_data"], trains for CFG["warmup_epochs"] epochs with gradient
+    accumulation over CFG["warmup_grad_acc"] usable examples, and clips the
+    gradient norm to 1.0 before each optimizer step.
+    Returns:
+        List with the mean (unscaled) loss of each epoch.
+    """
+    print("\n" + "="*60)
+    print("SUPERVISED WARM-UP")
+    print("="*60)
+    with open(CFG["warmup_data"], encoding="utf-8") as f:
         data = json.load(f)
+    opt = AdamW(model.parameters(), lr=CFG["warmup_lr"])
     model.train()
+    grad_acc = CFG["warmup_grad_acc"]
+    def _optimizer_step():
+        # Clip, apply and clear whatever has been accumulated so far.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        opt.step()
+        opt.zero_grad()
+    loss_history = []
+    for epoch in range(CFG["warmup_epochs"]):
         random.shuffle(data)
+        epoch_loss, n_valid = 0.0, 0
+        # Examples back-propagated since the last optimizer step.  Counting
+        # usable examples (rather than the raw index) keeps every window at
+        # exactly grad_acc examples even when some records are skipped.
+        pending = 0
+        opt.zero_grad()
+        for step, ex in enumerate(data):
+            # ── Tokenise prompt and full sequence jointly ────────────────
+            prompt_chat = tokenizer.apply_chat_template(
+                [{"role": "user", "content": ex["prompt"]}],
+                tokenize=False, add_generation_prompt=True
+            )
+            full_chat = tokenizer.apply_chat_template(
+                [{"role": "user",      "content": ex["prompt"]},
+                 {"role": "assistant", "content": ex["action"]}],
+                tokenize=False
+            )
+            prompt_ids = tokenizer(
+                prompt_chat, return_tensors="pt",
+                max_length=CFG["max_seq_len"], truncation=True
+            )["input_ids"]
+            full_inputs = tokenizer(
+                full_chat, return_tensors="pt",
+                max_length=CFG["max_seq_len"], truncation=True
+            ).to(DEVICE)
+            prompt_len = prompt_ids.shape[1]
+            if prompt_len >= full_inputs["input_ids"].shape[1]:
+                continue  # action got truncated away
+            # BUG FIX 1 ── mask prompt tokens so loss is action-only
+            labels = _masked_labels(full_inputs["input_ids"], prompt_len)
+            out  = model(**full_inputs, labels=labels)
+            loss = out.loss / grad_acc
             loss.backward()
+            pending += 1
+            if pending == grad_acc:
+                _optimizer_step()
+                pending = 0
+            epoch_loss += loss.item() * grad_acc
+            n_valid    += 1
+            if (step + 1) % 50 == 0:
+                print(f"  epoch {epoch+1}  step {step+1}/{len(data)}"
+                      f"  loss={epoch_loss/n_valid:.4f}")
+        # Flush a trailing partial window; otherwise its gradients would be
+        # thrown away by the next epoch's zero_grad() (or never applied at
+        # all after the final epoch).  Its losses were still divided by
+        # grad_acc, so this last update is proportionally smaller.
+        if pending:
+            _optimizer_step()
+        avg = epoch_loss / max(n_valid, 1)
+        loss_history.append(avg)
+        print(f"  Epoch {epoch+1} complete: avg_loss={avg:.4f}")
+    torch.cuda.empty_cache()
+    print(f"✓ Warm-up done. Loss: {' → '.join(f'{l:.4f}' for l in loss_history)}\n")
+    return loss_history
+# ══════════════════════════════════════════════════════════════════════════════
+# EVALUATION  (produces rich EvalSnapshot for comparison plots)
+# ══════════════════════════════════════════════════════════════════════════════
@torch.no_grad()
def evaluate(env, model, tokenizer, label: str = "") -> EvalSnapshot:
    """Roll out evaluation episodes and condense them into an EvalSnapshot.

    Runs ``CFG["eval_episodes"]`` episodes, cycling through ``TASK_LEVELS``
    in order, with the temperature argument of ``collect_trajectory`` fixed
    at 0.0 (presumably greedy decoding; confirm in ``collect_trajectory``).
    Gradients are disabled by the decorator and the model is switched to
    eval mode for the rollouts, then unconditionally back to train mode.

    Args:
        env: environment handed through to ``collect_trajectory``.
        model: policy model; must expose ``eval()`` and ``train()``.
        tokenizer: tokenizer handed through to ``collect_trajectory``.
        label: when non-empty, a short report is printed under this heading.

    Returns:
        EvalSnapshot holding the mean episode reward, per-task mean reward,
        the share of each action, the success rate, the mean episode length
        and one trace dict per episode.
    """
    model.eval()

    rewards_by_task: Dict[str, List[float]] = defaultdict(list)
    action_tally: Counter = Counter()
    step_counts, success_flags, traces = [], [], []

    n_levels = len(TASK_LEVELS)
    for episode in range(CFG["eval_episodes"]):
        task = TASK_LEVELS[episode % n_levels]
        traj, actions = collect_trajectory(
            env, model, tokenizer, CFG["max_steps"], 0.0, task
        )
        episode_reward = sum(traj.rewards)
        n_steps = len(traj.actions)

        rewards_by_task[task].append(episode_reward)
        action_tally.update(actions)
        step_counts.append(n_steps)
        # Success means the agent explicitly emitted "done"; a positive
        # reward alone is not enough, since even one inspect earns +0.002.
        success_flags.append(int("done" in actions))
        traces.append({
            "task": task,
            "reward": round(episode_reward, 4),
            "steps": n_steps,
            "actions": actions,
        })

    # Guard the action-share denominator against an empty tally.
    denominator = max(sum(action_tally.values()), 1)

    flat_rewards = [
        reward
        for task_rewards in rewards_by_task.values()
        for reward in task_rewards
    ]
    task_means = {}
    for task_name, task_rewards in rewards_by_task.items():
        task_means[task_name] = float(np.mean(task_rewards))
    action_share = {}
    for action_name, count in action_tally.most_common():
        action_share[action_name] = count / denominator

    snap = EvalSnapshot(
        avg_reward=float(np.mean(flat_rewards)),
        per_task=task_means,
        action_dist=action_share,
        success_rate=float(np.mean(success_flags)),
        avg_steps=float(np.mean(step_counts)),
        traces=traces,
    )

    if label:
        print(f"\n── {label} ──")
        print(f"  avg_reward={snap.avg_reward:+.4f}  "
              f"success={snap.success_rate:.0%}  steps={snap.avg_steps:.1f}")
        per_task_text = "  ".join(
            f"{t}={v:+.3f}" for t, v in snap.per_task.items()
        )
        print("  per-task: " + per_task_text)
        # action_dist is built in most_common() order, so the first five
        # entries are the five most frequent actions.
        leading_actions = list(snap.action_dist.items())[:5]
        top_text = "  ".join(f"{a}={p:.0%}" for a, p in leading_actions)
        print("  top actions: " + top_text)

    model.train()
    return snap
+# ══════════════════════════════════════════════════════════════════════════════
+# PPO UPDATE  (BUG FIX 2 + 3: BPE-safe logprob + log-ratio clamping)
+# ══════════════════════════════════════════════════════════════════════════════
def ppo_update(trajectories: List[Trajectory],
               model, tokenizer, optimizer) -> dict:
    """Run one clipped-surrogate PPO pass over a batch of trajectories.

    Discounted returns are computed per trajectory, normalised across the
    whole batch to zero mean / unit std, and used directly as advantages.
    For every recorded (state, action) pair the action log-probability is
    recomputed under the current policy, compared with the log-probability
    stored at rollout time, and one optimizer step is taken on the clipped
    PPO objective minus an entropy bonus.

    Args:
        trajectories: rollouts exposing ``states``, ``actions``, ``rewards``,
            ``logprobs`` and ``dones`` sequences of matching length.
        model: policy model; switched to train mode here.
        tokenizer: tokenizer providing ``apply_chat_template`` and the usual
            ``__call__`` encoding interface.
        optimizer: optimizer stepped once per usable trajectory step.

    Returns:
        Dict with the mean ``loss``, ``kl`` (mean of old minus new action
        log-prob) and ``entropy`` over the steps that were actually
        optimised. All three are 0.0 when nothing was optimised, including
        the zero-return-variance early exit.
    """
    model.train()
    losses, kls, entropies = [], [], []

    # ── Discounted returns per trajectory ────────────────────────────────────
    traj_returns: List[List[float]] = []
    for traj in trajectories:
        running, backwards = 0.0, []
        for r, done in zip(reversed(traj.rewards), reversed(traj.dones)):
            # A terminal step cuts the bootstrap from whatever follows it.
            running = r + CFG["gamma"] * (0.0 if done else running)
            backwards.append(running)
        backwards.reverse()          # restore chronological order
        traj_returns.append(backwards)
    all_returns = [g for ret_list in traj_returns for g in ret_list]

    # ── Normalise returns into advantages ────────────────────────────────────
    # Subtracting the batch mean and dividing by the batch std keeps the
    # advantage scale independent of the (often tiny) raw reward spread.
    ret_arr  = np.array(all_returns) if all_returns else np.array([0.0])
    ret_mean = float(ret_arr.mean())
    ret_std  = float(ret_arr.std())
    if ret_std < 1e-6:
        # Every return is identical, so every advantage would be zero.
        print("  [PPO] Zero return variance – skipping gradient update.")
        return dict(loss=0.0, kl=0.0, entropy=0.0)
    norm_returns: List[List[float]] = [
        [(r - ret_mean) / (ret_std + 1e-8) for r in ret_list]
        for ret_list in traj_returns
    ]

    for traj, advantages in zip(trajectories, norm_returns):
        for state, action, old_lp, adv in zip(
            traj.states, traj.actions, traj.logprobs, advantages
        ):
            # ── Tokenise prompt + action as ONE string ───────────────────────
            # Encoding the action jointly with the prompt keeps the BPE
            # merges at the prompt/action boundary consistent.
            prompt_chat = tokenizer.apply_chat_template(
                [{"role": "user", "content": state}],
                tokenize=False, add_generation_prompt=True
            )
            full_text = prompt_chat + action
            full_ids = tokenizer(
                full_text, return_tensors="pt",
                max_length=CFG["max_seq_len"], truncation=True
            ).to(DEVICE)
            # Prompt length is measured on the prompt alone and capped so at
            # least one position of the joint sequence is left for the action.
            prompt_ids = tokenizer(
                prompt_chat, return_tensors="pt",
                max_length=CFG["max_seq_len"] - 10, truncation=True
            )["input_ids"]
            prompt_len = min(prompt_ids.shape[1],
                             full_ids["input_ids"].shape[1] - 1)

            logits = model(**full_ids).logits
            new_lp, avg_ent, n_tokens = _compute_action_logprob(
                logits, full_ids["input_ids"], prompt_len
            )
            if n_tokens == 0:
                continue  # no action tokens survived truncation

            # Clamp the log-ratio before exp() so a large policy shift
            # cannot overflow into inf/NaN.
            old_lp_t  = torch.tensor(old_lp, dtype=torch.float32, device=DEVICE)
            log_ratio = torch.clamp(new_lp - old_lp_t,
                                    -CFG["log_ratio_clamp"],
                                     CFG["log_ratio_clamp"])
            ratio     = torch.exp(log_ratio)

            adv_t = torch.tensor(adv, dtype=torch.float32, device=DEVICE)
            s1    = ratio * adv_t
            s2    = torch.clamp(ratio,
                                1.0 - CFG["clip_eps"],
                                1.0 + CFG["clip_eps"]) * adv_t
            policy_loss = -torch.min(s1, s2)
            loss        = policy_loss - CFG["entropy_coef"] * avg_ent
            if torch.isnan(loss) or torch.isinf(loss):
                continue

            optimizer.zero_grad()
            # Populate .grad for the trainable parameters. Without this call
            # clip_grad_norm_ and optimizer.step() have no gradients to act
            # on and the policy is never updated.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            losses.append(loss.item())
            kls.append((old_lp_t - new_lp).detach().cpu().item())
            entropies.append(avg_ent.item())

    torch.cuda.empty_cache()
    return dict(
        loss    = float(np.mean(losses))    if losses    else 0.0,
        kl      = float(np.mean(kls))       if kls       else 0.0,
        entropy = float(np.mean(entropies)) if entropies else 0.0,
    )
+# ══════════════════════════════════════════════════════════════════════════════
+# PLOTTING  (rich evidence panel)
+# ══════════════════════════════════════════════════════════════════════════════
def _moving_average(values, window: int = 3) -> np.ndarray:
    """Return a centred moving average with the same length as *values*.

    Each output point is the mean of the samples its window actually
    covers, so the two ends are not pulled towards zero by implicit
    zero-padding, and inputs shorter than *window* (including empty ones)
    are handled without raising.
    """
    series = np.asarray(values, dtype=float)
    if series.size == 0:
        return series
    width  = max(1, min(window, series.size))
    kernel = np.ones(width)
    totals = np.convolve(series, kernel, mode="same")
    # Number of real samples under the window at each position.
    counts = np.convolve(np.ones(series.size), kernel, mode="same")
    return totals / counts


def plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
             baseline_snap: EvalSnapshot,
             postwarmup_snap: EvalSnapshot,
             final_snap: EvalSnapshot):
    """Render the training evidence figures and save them as PNG files.

    Writes ``training_summary.png`` (a 2×3 grid: warm-up loss, PPO reward,
    success rate, KL, entropy, per-difficulty reward) and
    ``action_distribution.png`` (action shares for the three snapshots)
    into the current working directory.

    Args:
        warmup_losses: one average loss per warm-up epoch.
        reward_hist: average reward per PPO iteration.
        success_hist: success fraction per PPO iteration.
        kl_hist: KL estimate per PPO iteration.
        entropy_hist: policy entropy per PPO iteration.
        baseline_snap: evaluation snapshot taken before any training.
        postwarmup_snap: evaluation snapshot taken after the warm-up.
        final_snap: evaluation snapshot taken after PPO.

    The four per-iteration histories are plotted against the same x-axis
    and are therefore expected to have equal length.
    """
    iters = list(range(1, len(reward_hist) + 1))

    # ── Figure 1: training curves (2×3 grid) ─────────────────────────────────
    fig = plt.figure(figsize=(18, 10))
    gs  = gridspec.GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.35)

    # (0,0) Warm-up loss
    ax = fig.add_subplot(gs[0, 0])
    ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
            marker="o", color="mediumpurple", linewidth=2)
    ax.set_title("A. Warm-up CE Loss ↓", fontweight="bold")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.grid(alpha=0.3)

    # (0,1) PPO reward: raw curve, smoothed curve and three reference lines
    ax = fig.add_subplot(gs[0, 1])
    # Length-preserving smoothing: a plain 3-tap "same" convolution breaks
    # for fewer than three iterations and biases both end points.
    smooth = _moving_average(reward_hist, 3)
    ax.plot(iters, reward_hist, alpha=0.35, color="steelblue", linewidth=1)
    ax.plot(iters, smooth, color="steelblue", linewidth=2.5,
            label="reward (smoothed)")
    ax.axhline(baseline_snap.avg_reward, color="gray", linestyle=":",
               label=f"pre-warmup ({baseline_snap.avg_reward:+.3f})")
    ax.axhline(postwarmup_snap.avg_reward, color="mediumpurple", linestyle="--",
               label=f"post-warmup ({postwarmup_snap.avg_reward:+.3f})")
    ax.axhline(final_snap.avg_reward, color="forestgreen", linestyle="-.",
               label=f"final ({final_snap.avg_reward:+.3f})")
    ax.set_title("B. PPO Reward ↑", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Avg Reward")
    ax.legend(fontsize=7)
    ax.grid(alpha=0.3)

    # (0,2) Success rate
    ax = fig.add_subplot(gs[0, 2])
    ax.plot(iters, success_hist, marker="s", color="seagreen", linewidth=2)
    ax.set_ylim(0, 1)
    ax.set_title("C. Episode Success Rate ↑", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Fraction")
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0%}"))
    ax.grid(alpha=0.3)

    # (1,0) KL divergence
    ax = fig.add_subplot(gs[1, 0])
    ax.plot(iters, kl_hist, marker="^", color="tomato", linewidth=2)
    ax.axhline(0, color="gray", linewidth=0.8)
    ax.set_title("D. KL Divergence", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("KL")
    ax.grid(alpha=0.3)

    # (1,1) Entropy
    ax = fig.add_subplot(gs[1, 1])
    ax.plot(iters, entropy_hist, marker="D", color="darkorange", linewidth=2)
    ax.set_title("E. Policy Entropy", fontweight="bold")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Entropy")
    ax.grid(alpha=0.3)

    # (1,2) Per-difficulty reward, baseline vs final (missing tasks plot as 0)
    ax = fig.add_subplot(gs[1, 2])
    tasks = TASK_LEVELS
    vals_base  = [baseline_snap.per_task.get(t, 0) for t in tasks]
    vals_final = [final_snap.per_task.get(t, 0) for t in tasks]
    x = np.arange(len(tasks))
    ax.bar(x - 0.2, vals_base,  0.35, label="baseline", color="lightcoral", alpha=0.8)
    ax.bar(x + 0.2, vals_final, 0.35, label="final",    color="steelblue",  alpha=0.8)
    ax.set_xticks(x)
    ax.set_xticklabels(tasks, fontsize=8)
    ax.set_title("F. Per-Difficulty Reward", fontweight="bold")
    ax.set_ylabel("Avg Reward")
    ax.legend(fontsize=8)
    ax.grid(alpha=0.3, axis="y")
    ax.axhline(0, color="gray", linewidth=0.8)

    fig.suptitle("Code-Review Agent – Full Training Evidence  "
                 "(Qwen2.5-1.5B, PPO + QLoRA)",
                 fontsize=13, fontweight="bold")
    fig.savefig("training_summary.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("  Saved: training_summary.png")

    # ── Figure 2: before / after action distribution ─────────────────────────
    fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=False)
    for ax, snap, title in zip(
        axes,
        [baseline_snap, postwarmup_snap, final_snap],
        ["Before (baseline)", "After warm-up", "After PPO (final)"]
    ):
        # A snapshot with no recorded actions keeps an empty, labelled panel.
        if snap.action_dist:
            labels = list(snap.action_dist.keys())
            vals   = [snap.action_dist[l] * 100 for l in labels]
            bars   = ax.barh(labels, vals,
                             color=plt.cm.tab10(np.linspace(0, 0.8, len(labels))))
            ax.bar_label(bars, fmt="%.0f%%", padding=3, fontsize=8)
        ax.set_xlim(0, 105)
        ax.set_title(title, fontweight="bold")
        ax.set_xlabel("% of actions")
        ax.grid(alpha=0.3, axis="x")
    fig.suptitle("Action Distribution: Before vs After Training",
                 fontsize=12, fontweight="bold")
    plt.tight_layout()
    fig.savefig("action_distribution.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    print("  Saved: action_distribution.png")
# ══════════════════════════════════════════════════════════════════════════════
# MAIN
# ══════════════════════════════════════════════════════════════════════════════
def train():
    """Run the full pipeline: baseline eval, warm-up, PPO, final eval, plots.

    Phase 0 evaluates the freshly loaded model, phase 1 runs the supervised
    warm-up and re-evaluates, phase 2 runs ``CFG["ppo_iters"]`` PPO
    iterations of ``CFG["trajs_per_iter"]`` rollouts each, and phase 3
    evaluates the final policy. A summary table and per-task traces are
    printed, and the figures are produced by ``plot_all``.
    """
    model, tokenizer = load_model()
    env = CodeReviewEnv()
    rule = "=" * 60

    # ── PHASE 0: pre-warmup baseline ────────────────────────────────────────
    print("\n" + rule)
    print("PHASE 0 – BASELINE (untrained)")
    print(rule)
    baseline_snap = evaluate(env, model, tokenizer, "Baseline")

    # ── PHASE 1: supervised warm-up ─────────────────────────────────────────
    warmup_losses = supervised_warmup(model, tokenizer)
    postwarmup_snap = evaluate(env, model, tokenizer, "Post-Warmup")

    # ── PHASE 2: PPO ────────────────────────────────────────────────────────
    optimizer = AdamW(model.parameters(), lr=CFG["ppo_lr"])
    reward_hist, success_hist = [], []
    kl_hist, entropy_hist = [], []
    print("\n" + rule)
    print(f"PHASE 2 – PPO ({CFG['ppo_iters']} iterations × "
          f"{CFG['trajs_per_iter']} trajectories)")
    print(rule)

    for it in range(CFG["ppo_iters"]):
        # Exploration temperature decays by a factor of 0.93 per iteration
        # from CFG["temp_start"] and is floored at 0.35 so that rollouts
        # stay diverse late in training.
        temperature = max(CFG["temp_start"] * (0.93 ** it), 0.35)
        print(f"\n── Iteration {it+1}/{CFG['ppo_iters']}  temp={temperature:.2f} ──")

        trajectories = []
        action_counts = Counter()
        episode_rewards = []
        successes = 0
        for j in range(CFG["trajs_per_iter"]):
            task = TASK_LEVELS[j % len(TASK_LEVELS)]
            traj, actions = collect_trajectory(
                env, model, tokenizer, CFG["max_steps"], temperature, task
            )
            trajectories.append(traj)
            action_counts.update(actions)
            ep_r = sum(traj.rewards)
            episode_rewards.append(ep_r)
            # Same success criterion as evaluate(): an explicit "done".
            if "done" in actions:
                successes += 1
            print(f"  traj {j+1}/{CFG['trajs_per_iter']}  task={task}"
                  f"  steps={len(traj.actions)}  reward={ep_r:+.3f}")

        avg_r     = float(np.mean(episode_rewards))
        success_r = successes / CFG["trajs_per_iter"]
        metrics   = ppo_update(trajectories, model, tokenizer, optimizer)

        reward_hist.append(avg_r)
        success_hist.append(success_r)
        kl_hist.append(metrics["kl"])
        entropy_hist.append(metrics["entropy"])

        delta = avg_r - baseline_snap.avg_reward
        print(f"  → avg_reward={avg_r:+.4f}  Δbaseline={delta:+.4f}"
              f"  success={success_r:.0%}"
              f"  loss={metrics['loss']:.4f}  kl={metrics['kl']:.4f}"
              f"  ent={metrics['entropy']:.4f}")
        print(f"  actions: {dict(action_counts.most_common(5))}")

    # ── PHASE 3: final evaluation ───────────────────────────────────────────
    print("\n" + rule)
    print("PHASE 3 – FINAL EVALUATION")
    print(rule)
    final_snap = evaluate(env, model, tokenizer, "Final")

    # ── Summary table ───────────────────────────────────────────────────────
    print("\n" + rule)
    print("TRAINING SUMMARY")
    print(rule)
    print(f"  {'Stage':<20} {'Reward':>10} {'Success':>10} {'Δ baseline':>12}")
    print(f"  {'-'*54}")
    stages = (
        ("Baseline",    baseline_snap),
        ("Post-warmup", postwarmup_snap),
        ("Final (PPO)", final_snap),
    )
    for stage_name, snap in stages:
        gain = snap.avg_reward - baseline_snap.avg_reward
        print(f"  {stage_name:<20} {snap.avg_reward:>+10.4f}"
              f" {snap.success_rate:>10.0%}  {gain:>+11.4f}")

    improve = final_snap.avg_reward - baseline_snap.avg_reward
    if improve > 0:
        verdict = "✓ LEARNED"
    else:
        verdict = "✗ NO IMPROVEMENT"
    print(f"\n  {verdict}  (total Δ = {improve:+.4f})")

    # One trace per difficulty; when a task was evaluated several times the
    # last recorded episode wins.
    print("\nBefore → After traces (one per difficulty):")
    baseline_by_task, final_by_task = {}, {}
    for trace in baseline_snap.traces:
        baseline_by_task[trace["task"]] = trace
    for trace in final_snap.traces:
        final_by_task[trace["task"]] = trace
    for task in TASK_LEVELS:
        before = baseline_by_task.get(task, {})
        after  = final_by_task.get(task, {})
        print(f"  {task:8s}  baseline actions={before.get('actions',[])}  "
              f"reward={before.get('reward',0):+.3f}"
              f"  │  final actions={after.get('actions',[])}  "
              f"reward={after.get('reward',0):+.3f}")

    # ── Plots ───────────────────────────────────────────────────────────────
    plot_all(warmup_losses, reward_hist, success_hist, kl_hist, entropy_hist,
             baseline_snap, postwarmup_snap, final_snap)
    print("\nAll done. Saved: training_summary.png  action_distribution.png")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    train()