Spaces:

100XZX001
/

code-review-training

Sleeping

App Files Files Community

100XZX001 committed on Apr 25

Commit

3d70980

verified ·

1 Parent(s): 374db2f

Update training.py

Browse files

Files changed (1) hide show

training.py +126 -297

training.py CHANGED Viewed

@@ -1,10 +1,7 @@
-# training.py – Memory‑safe: Phi‑3‑mini + Expert Demos + Fast PPO (2 iterations)
 import os
-os.environ["TRITON_DISABLE"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-import torch._dynamo
-torch._dynamo.config.disable = True
 import json
 import torch
 import torch.nn.functional as F
@@ -15,13 +12,16 @@ import numpy as np
 import re
 import random
 import matplotlib
-matplotlib.use('Agg')          # ← add this line
 import matplotlib.pyplot as plt
-from unsloth import FastLanguageModel
-from transformers import TrainingArguments
-from trl import SFTTrainer
-from datasets import Dataset
 from environment import CodeReviewEnv
 from redteam import BUG_DB
@@ -74,25 +74,46 @@ def parse_action(output: str) -> AgentAction:
 def map_to_env(action: AgentAction):
     return model_map_to_env(action.action_type, action.content)
 # ======================================================================
 def load_model():
-    model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
-    max_seq_length=480,
-    load_in_4bit=True,
     )
-    model = FastLanguageModel.get_peft_model(
-        model,
         r=16,
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj"
         ],
-        lora_alpha=32,
         lora_dropout=0.0,
     )
     return model, tokenizer
 def test_model_sanity(model, tokenizer) -> bool:
     print("\n" + "="*60)
     print("SANITY CHECK: Testing base model generation")
@@ -100,7 +121,7 @@ def test_model_sanity(model, tokenizer) -> bool:
     test_prompt = "Hello, how are you?"
     messages = [{"role": "user", "content": test_prompt}]
     formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(formatted, return_tensors="pt", max_length=2048, truncation=True).to("cuda")
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -123,10 +144,7 @@ def test_model_sanity(model, tokenizer) -> bool:
 # ======================================================================
 def _expert_fix_from_context(obs) -> str:
-    """
-    Build a conservative fix template named `fix` (required by tests).
-    Uses bug hints + code snippet patterns to create realistic fixes.
-    """
     bug = (getattr(obs, "bug_description", "") or "").lower()
     code = getattr(obs, "code_snippet", "") or ""
@@ -158,7 +176,6 @@ def _expert_fix_from_context(obs) -> str:
             "    return users.get(user_id)"
         )
-    # Concurrency-heavy tasks (harder/hardest).
     if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
         return (
             "import threading\n"
@@ -197,7 +214,6 @@ def _expert_fix_from_context(obs) -> str:
             "    return True"
         )
-    # Generic safe fallback keeps the RL pipeline alive for unknown bugs.
     return (
         "def fix(data):\n"
         "    if data is None:\n"
@@ -207,10 +223,7 @@ def _expert_fix_from_context(obs) -> str:
 def _expert_supervised_policy(obs) -> str:
-    """
-    Real workflow policy:
-    inspect -> tests/linter -> docs -> fix -> negotiate -> done.
-    """
     author_msg = (getattr(obs, "author_response", "") or "").lower()
     tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
@@ -225,7 +238,6 @@ def _expert_supervised_policy(obs) -> str:
     if not getattr(obs, "docs_queried", False):
         return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
-    # Use docs again on hard tasks when evidence is still weak.
     if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
         bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
         return json.dumps(
@@ -235,12 +247,10 @@ def _expert_supervised_policy(obs) -> str:
             }
         )
-    # If test quality is poor, propose a concrete fix.
     if getattr(obs, "current_test_score", 0.0) < 0.95:
         fix_code = _expert_fix_from_context(obs)
         return json.dumps({"action_type": "fix", "content": fix_code})
-    # If author is still unconvinced, provide causal explanation.
     if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
         return (
             '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
@@ -248,94 +258,79 @@ def _expert_supervised_policy(obs) -> str:
             'The change is intentionally small to reduce regression risk."}'
         )
-    # If negotiation is strong enough and quality is good, terminate.
     conf = float(getattr(obs, "author_confidence", 0.0))
     threshold = float(getattr(obs, "author_threshold", 0.5))
     score = float(getattr(obs, "current_test_score", 0.0))
     if conf >= threshold and score >= 0.8:
         return '{"action_type": "done"}'
-    # Nudge conversation forward when tests are okay but acceptance is pending.
     return (
         '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
     )
 # ======================================================================
-def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8):
-    print("\n" + "="*60)
-    print("SUPERVISED WARM-UP: Real environment demonstrations")
-    print("="*60)
-    examples = []
-    tasks = ["easy", "medium", "hard", "harder", "hardest"]
-    for ep in range(n_episodes):
-        task = random.choice(tasks)
-        env.set_task(task)
-        obs = env.reset()
-        history = []
-        done = False
-        steps = 0
-        while not done and steps < max_steps:
-            prompt = build_prompt(obs, history)
-            action_text = _expert_supervised_policy(obs)
-            action = parse_action(action_text)
-            env_action = map_to_env(action)
-            next_obs, _, done, _ = env.step(env_action)
-            messages = [
-                {"role": "user", "content": prompt},
-                {"role": "assistant", "content": action_text},
-            ]
-            full_text = tokenizer.apply_chat_template(messages, tokenize=False)
-            examples.append({"text": full_text})
-            history.append(f"Agent: {action_text}")
-            history.append(f"Env: {next_obs.last_tool_output}")
-            history = history[-8:]
-            obs = next_obs
-            steps += 1
-        print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
-    if not examples:
-        print("No supervised examples generated; skipping warm-up.")
-        return
-    dataset = Dataset.from_list(examples)
-    trainer = SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-        dataset_text_field="text",
-        max_seq_length=2048,
-        args=TrainingArguments(
-            output_dir="warmup_output",
-            num_train_epochs=epochs,
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=2,
-            learning_rate=2e-5,
-            logging_steps=50,
-            save_strategy="no",
-            bf16=True,
-        ),
-    )
-    print(f"Training on {len(examples)} real env examples for {epochs} epochs...")
-    trainer.train()
-    print("✓ Supervised warm-up (real env) complete\n")
-    torch.cuda.empty_cache()
 # ======================================================================
 def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_retries=2):
     messages = [{"role": "user", "content": prompt}]
     formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(formatted, return_tensors="pt", max_length=2048, truncation=True).to("cuda")
     for attempt in range(max_retries):
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=64,
                 do_sample=(temperature > 0),
                 temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
                 min_new_tokens=1,
@@ -344,7 +339,7 @@ def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_
             )
         generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
         action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
         logprobs = []
         for idx, token_id in enumerate(generated_ids):
             if idx < len(outputs.scores):
@@ -352,7 +347,7 @@ def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_
                 token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
                 logprobs.append(token_logprob)
         total_logprob = sum(logprobs) if logprobs else -100.0
         if not action_text:
             fallback_actions = [
                 '{"action_type": "run_tests"}',
@@ -364,7 +359,7 @@ def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_
             total_logprob = -50.0
             print(f"[WARN] Empty generation → using fallback: {action_text}")
             return action_text, total_logprob
         try:
             json.loads(action_text)
             return action_text, total_logprob
@@ -374,58 +369,6 @@ def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_
             continue
     return '{"action_type":"skip"}', -100.0
-# ======================================================================
-def build_prompt(obs, history_lines: List[str]) -> str:
-    author_msg = getattr(obs, "author_response", "") or ""
-    tool_output = getattr(obs, "last_tool_output", "") or ""
-    author_personality = getattr(obs, "author_personality", "defensive")
-    prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
-The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
-- Tests pass (high pass ratio)
-- Lint is clean (zero errors)
-- Documentation or references are provided
-- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
-Workflow:
-1. Use `inspect` to understand the code.
-2. Use `run_tests` and `run_linter` to gather evidence.
-3. Use `query_docs` when you need references or language-specific guidance.
-4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
-5. If the developer pushes back, read their response carefully and address their specific concern.
-6. Once convinced, use `done` to finish.
-Code:
-{obs.code_snippet}
-Author says:
-{author_msg if author_msg else "(no response yet – start with inspection)"}
-Last tool output:
-{tool_output if tool_output else "(none)"}
-Available actions:
-run_tests, run_linter, inspect, query_docs, fix, comment, question, done
-Respond ONLY in JSON:
-{{"action_type": "...", "content": "..."}}"""
-    if history_lines:
-        history = "\n".join(history_lines[-6:])
-        prompt += f"\n\nPrevious steps:\n{history}"
-    return prompt
-# ======================================================================
-@dataclass
-class Trajectory:
-    states: List[str]
-    actions: List[str]
-    rewards: List[float]
-    logprobs: List[float]
-    dones: List[bool]
-    def __len__(self): return len(self.states)
 def collect_trajectory(env, model, tokenizer, max_steps=6, temperature=0.0):
     obs = env.reset()
     history_lines = []
@@ -466,10 +409,6 @@ def collect_trajectories(env, model, tokenizer, n_trajectories, max_steps=6,
     return trajectories
 def compute_returns_and_advantages(rewards, dones, gamma=0.99, standardize=True):
-    """
-    Compute discounted returns and REINFORCE-style baseline advantages.
-    Advantages are centered and optionally standardised.
-    """
     n = len(rewards)
     returns = [0.0]*n
     running = 0.0
@@ -507,7 +446,7 @@ def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsil
             messages = [{"role": "user", "content": state}]
             formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             full_text = formatted + action
-            inputs = tokenizer(full_text, return_tensors="pt", max_length=2048, truncation=True).to("cuda")
             outputs = model(**inputs)
             logits = outputs.logits
             action_ids = tokenizer.encode(action, add_special_tokens=False)
@@ -527,8 +466,7 @@ def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsil
             if not logprobs: continue
             new_logprob = sum(logprobs)
             avg_entropy = entropy / len(logprobs) if logprobs else 0.0
-            log_ratio = torch.clamp(new_logprob - old_logprob, min=-10.0, max=10.0)  # ← guard
-            ratio     = torch.exp(log_ratio)
             surr1 = ratio * advantage
             surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
             policy_loss = -torch.min(surr1, surr2)
@@ -548,12 +486,11 @@ def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsil
 def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
                     task_levels=None, verbose=False):
-    """Evaluate the current policy across task levels. Returns metrics + optional traces."""
     model.eval()
     if task_levels is None:
         task_levels = list(BUG_DB.keys())
     total_rewards = []
-    traces = []  # human-readable behavior logs
     for ep in range(n_episodes):
         task = task_levels[ep % len(task_levels)]
         env.set_task(task)
@@ -563,10 +500,8 @@ def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
         if verbose:
             actions_taken = []
             for a in traj.actions:
-                try:
-                    actions_taken.append(json.loads(a).get("action_type", "?"))
-                except Exception:
-                    actions_taken.append("?")
             traces.append({
                 "task": task,
                 "reward": round(ep_reward, 4),
@@ -582,15 +517,9 @@ def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
     }
 # ======================================================================
-# MANUAL WARM-UP (no SFTTrainer → no multiprocessing OOM)
-# ======================================================================
 def json_warmup(model, tokenizer, json_path="training_data.json",
-                n_episodes=20, epochs=2, lr=2e-5):
-    """
-    Supervised warm-up from pre-generated expert demonstrations.
-    Uses raw cross-entropy on action tokens with manual gradient steps.
-    NO SFTTrainer, NO multiprocessing – runs safely on any GPU.
-    """
     print("\n" + "="*60)
     print("SUPERVISED WARM-UP: training_data.json (manual cross-entropy)")
     print("="*60)
@@ -598,7 +527,6 @@ def json_warmup(model, tokenizer, json_path="training_data.json",
     with open(json_path, encoding="utf-8") as f:
         data = json.load(f)
-    # Each episode = 7 steps. Select n_episodes worth.
     steps_per_episode = 7
     max_examples = n_episodes * steps_per_episode
     if max_examples < len(data):
@@ -609,7 +537,7 @@ def json_warmup(model, tokenizer, json_path="training_data.json",
     model.train()
     warmup_opt = AdamW(model.parameters(), lr=lr)
-    warmup_losses = []   # per-epoch avg loss
     for epoch in range(epochs):
         random.shuffle(data)
@@ -620,15 +548,13 @@ def json_warmup(model, tokenizer, json_path="training_data.json",
             prompt = example["prompt"]
             action = example["action"]
-            # ---- tokenize full sequence (prompt + action) ----
             messages = [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": action},
             ]
             full_text = tokenizer.apply_chat_template(messages, tokenize=False)
-            inputs = tokenizer(full_text, return_tensors="pt", max_length=2048, truncation=True).to("cuda")
-            # ---- find where the action tokens start ----
             prompt_only = tokenizer.apply_chat_template(
                 [{"role": "user", "content": prompt}],
                 tokenize=False, add_generation_prompt=True
@@ -638,13 +564,11 @@ def json_warmup(model, tokenizer, json_path="training_data.json",
             total_len = inputs.input_ids.shape[1]
             if prompt_len >= total_len:
-                continue  # prompt was truncated away, skip
-            # ---- cross-entropy on action tokens only ----
             outputs = model(**inputs)
             logits = outputs.logits
-            # next-token prediction: logits[t] predicts token[t+1]
             shift_logits = logits[0, prompt_len - 1 : total_len - 1]
             shift_labels = inputs.input_ids[0, prompt_len : total_len]
@@ -680,11 +604,9 @@ def json_warmup(model, tokenizer, json_path="training_data.json",
 # ======================================================================
 # MAIN TRAINING PIPELINE
-# ======================================================================
 def train_ppo():
-    # --- Hyperparameters ---
-    n_iterations = 15            # enough for a clear upward trend
-    trajectories_per_iter = 6   # on-policy data per iteration
     n_epochs = 2
     max_steps = 8
     learning_rate = 3e-5
@@ -692,23 +614,19 @@ def train_ppo():
     entropy_coef = 0.01
     gamma = 0.99
-    # --- Pre-load embedder before LLM (Issue #13) ---
     from rltool import ToolBox
     print("Pre-loading sentence-transformer embedder...")
     ToolBox._get_embedder()
     print("✓ Embedder ready")
-    # --- Load model ---
-    print("Loading model...")
     model, tokenizer = load_model()
     if not test_model_sanity(model, tokenizer):
         return
     env = CodeReviewEnv()
     task_levels = list(BUG_DB.keys())
-    # ==================================================================
-    # PHASE 0: BASELINE (untrained policy)
-    # ==================================================================
     print("\n" + "="*60)
     print("PHASE 0 – BASELINE EVALUATION (untrained)")
     print("="*60)
@@ -723,18 +641,10 @@ def train_ppo():
         print(f"  task={t['task']:8s}  reward={t['reward']:+.4f}  "
               f"steps={t['steps']}  actions={t['actions']}")
-    # ==================================================================
-    # PHASE 1: SUPERVISED WARM-UP (expert demos, manual CE)
-    # ==================================================================
-    warmup_losses = json_warmup(
-        model, tokenizer,
-        json_path="training_data.json",
-        n_episodes=30,  # 140 examples (20 × 7 steps)
-        epochs=3,
-        lr=2e-5,
-    )
-    # Post-warmup evaluation
     print("="*60)
     print("POST WARM-UP EVALUATION")
     print("="*60)
@@ -749,25 +659,15 @@ def train_ppo():
         print(f"  task={t['task']:8s}  reward={t['reward']:+.4f}  "
               f"steps={t['steps']}  actions={t['actions']}")
-    # ==================================================================
-    # PHASE 2: TRUE RL – PPO (on-policy, real environment interaction)
-    # ==================================================================
     optimizer = AdamW(model.parameters(), lr=learning_rate)
     print(f"\n{'='*60}")
     print(f"PHASE 2 – PPO TRAINING: {n_iterations} iterations × "
           f"{trajectories_per_iter} trajectories (true RL)")
     print(f"{'='*60}\n")
-    reward_history = []
-    eval_history = []
-    loss_history = []
-    policy_loss_history = []
-    entropy_history = []
     for iteration in range(n_iterations):
         print(f"\n--- PPO Iteration {iteration + 1}/{n_iterations} ---")
-        # Collect on-policy trajectories from REAL environment
         trajectories = collect_trajectories(
             env, model, tokenizer, trajectories_per_iter, max_steps,
             task_levels=task_levels, task_weights=None
@@ -776,20 +676,16 @@ def train_ppo():
         reward_history.append(avg_reward)
         print(f"  Collect  avg reward: {avg_reward:+.4f}")
-        # PPO policy gradient update
         metrics = ppo_update(
             trajectories, model, tokenizer, optimizer,
             n_epochs=n_epochs, clip_epsilon=clip_epsilon,
             entropy_coef=entropy_coef, gamma=gamma
         )
         loss_history.append(float(metrics["loss"]))
-        policy_loss_history.append(float(metrics["policy_loss"]))
-        entropy_history.append(float(metrics["entropy"]))
         print(f"  Update   loss={metrics['loss']:.4f}  "
               f"policy={metrics['policy_loss']:.4f}  "
               f"entropy={metrics['entropy']:.4f}")
-        # Evaluate greedy policy after update
         eval_m = evaluate_policy(env, model, tokenizer, n_episodes=3,
                                  max_steps=max_steps, task_levels=task_levels,
                                  verbose=False)
@@ -798,9 +694,6 @@ def train_ppo():
         print(f"  Eval     avg reward: {eval_m['avg_reward']:+.4f}  "
               f"(Δ baseline: {delta:+.4f})")
-    # ==================================================================
-    # PHASE 3: FINAL EVALUATION (proof of learning)
-    # ==================================================================
     print("\n" + "="*60)
     print("PHASE 3 – FINAL EVALUATION (after all training)")
     print("="*60)
@@ -832,27 +725,19 @@ def train_ppo():
         print(f"  ✗ No overall improvement detected")
     print(f"{'='*60}")
-    # ==================================================================
-    # PLOTS
-    # ==================================================================
     iters = list(range(1, n_iterations + 1))
-    # --- 1. Warm-up loss curve ---
     if warmup_losses:
         fig, ax = plt.subplots(figsize=(7, 4))
-        ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
                 marker="o", linewidth=2, color="tab:purple")
-        ax.set_title("Warm-up Loss (supervised, per epoch)",
-                     fontsize=13, fontweight="bold")
-        ax.set_xlabel("Epoch")
-        ax.set_ylabel("Cross-Entropy Loss")
-        ax.grid(alpha=0.3)
-        fig.tight_layout()
-        fig.savefig("warmup_loss.png", dpi=150)
-        plt.close(fig)
-    # --- 2. PPO reward curve ---
-    fig, ax = plt.subplots(figsize=(9, 5))
     ax.plot(iters, reward_history, marker="o", linewidth=2,
             label="Collect reward", color="tab:blue")
     ax.plot(iters, eval_history, marker="s", linewidth=2, linestyle="--",
@@ -862,75 +747,19 @@ def train_ppo():
     ax.axhline(y=warmup_reward, color="tab:purple", linestyle=":",
                linewidth=1.5, label=f"Post-warmup ({warmup_reward:+.3f})")
     ax.set_title("PPO Reward per Iteration", fontsize=14, fontweight="bold")
-    ax.set_xlabel("Iteration")
-    ax.set_ylabel("Average Reward")
-    ax.legend(loc="best", fontsize=8)
-    ax.grid(alpha=0.3)
-    fig.tight_layout()
-    fig.savefig("reward_curve.png", dpi=150)
-    plt.close(fig)
-    # --- 3. PPO loss curve ---
-    fig, ax = plt.subplots(figsize=(9, 5))
     ax.plot(iters, loss_history, marker="o", linewidth=2,
             label="Total loss", color="tab:red")
-    ax.plot(iters, policy_loss_history, marker="^", linewidth=2, linestyle="--",
-            label="Policy loss", color="tab:orange")
     ax.set_title("PPO Loss per Iteration", fontsize=14, fontweight="bold")
-    ax.set_xlabel("Iteration")
-    ax.set_ylabel("Loss")
-    ax.legend(loc="best")
-    ax.grid(alpha=0.3)
-    fig.tight_layout()
-    fig.savefig("loss_curve.png", dpi=150)
-    plt.close(fig)
-    # --- 4. Combined 3-panel summary ---
-    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
-    # Panel A: warm-up loss
-    if warmup_losses:
-        axes[0].plot(range(1, len(warmup_losses) + 1), warmup_losses,
-                     marker="o", linewidth=2, color="tab:purple")
-        axes[0].set_title("A. Warm-up Loss ↓")
-        axes[0].set_xlabel("Epoch")
-        axes[0].set_ylabel("CE Loss")
-        axes[0].grid(alpha=0.3)
-    # Panel B: PPO reward
-    axes[1].plot(iters, reward_history, marker="o", linewidth=2,
-                 color="tab:blue", label="Collect")
-    axes[1].plot(iters, eval_history, marker="s", linewidth=2,
-                 linestyle="--", color="tab:green", label="Eval")
-    axes[1].axhline(y=baseline_reward, color="tab:gray", linestyle=":",
-                    linewidth=1.5, label="Baseline")
-    axes[1].axhline(y=warmup_reward, color="tab:purple", linestyle=":",
-                    linewidth=1.5, label="Post-warmup")
-    axes[1].set_title("B. PPO Reward ↑")
-    axes[1].set_xlabel("Iteration")
-    axes[1].set_ylabel("Avg Reward")
-    axes[1].legend(fontsize=7)
-    axes[1].grid(alpha=0.3)
-    # Panel C: PPO loss
-    axes[2].plot(iters, loss_history, marker="o", linewidth=2,
-                 color="tab:red", label="Total")
-    axes[2].plot(iters, policy_loss_history, marker="^", linewidth=2,
-                 linestyle="--", color="tab:orange", label="Policy")
-    axes[2].set_title("C. PPO Loss ↓")
-    axes[2].set_xlabel("Iteration")
-    axes[2].set_ylabel("Loss")
-    axes[2].legend(fontsize=7)
-    axes[2].grid(alpha=0.3)
-    fig.suptitle("Code Review Agent – Full Training Evidence",
-                 fontsize=14, fontweight="bold")
-    fig.tight_layout()
-    fig.savefig("training_summary.png", dpi=150)
-    plt.close(fig)
-    print("Plots saved: warmup_loss.png, reward_curve.png, "
-          "loss_curve.png, training_summary.png")
     print("="*60)
 if __name__ == "__main__":

+# training.py – Vanilla bitsandbytes QLoRA + custom PPO (no unsloth, no Triton)
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import json
 import torch
 import torch.nn.functional as F
 import re
 import random
 import matplotlib
+matplotlib.use('Agg')
 import matplotlib.pyplot as plt
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TrainingArguments
+)
+from peft import LoraConfig, get_peft_model, TaskType
 from environment import CodeReviewEnv
 from redteam import BUG_DB
 def map_to_env(action: AgentAction):
     return model_map_to_env(action.action_type, action.content)
+# ======================================================================
+# Model loading – no unsloth, no Triton kernels
 # ======================================================================
 def load_model():
+    model_name = "microsoft/Phi-3-mini-4k-instruct"
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_quant_type="nf4",
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
     )
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+    lora_config = LoraConfig(
         r=16,
+        lora_alpha=32,
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj"
         ],
         lora_dropout=0.0,
+        bias="none",
+        task_type=TaskType.CAUSAL_LM,
     )
+    model = get_peft_model(model, lora_config)
     return model, tokenizer
+# ======================================================================
 def test_model_sanity(model, tokenizer) -> bool:
     print("\n" + "="*60)
     print("SANITY CHECK: Testing base model generation")
     test_prompt = "Hello, how are you?"
     messages = [{"role": "user", "content": test_prompt}]
     formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(formatted, return_tensors="pt", max_length=256, truncation=True).to("cuda")
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
 # ======================================================================
 def _expert_fix_from_context(obs) -> str:
+    """Build a conservative fix template based on bug hints."""
     bug = (getattr(obs, "bug_description", "") or "").lower()
     code = getattr(obs, "code_snippet", "") or ""
             "    return users.get(user_id)"
         )
     if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
         return (
             "import threading\n"
             "    return True"
         )
     return (
         "def fix(data):\n"
         "    if data is None:\n"
 def _expert_supervised_policy(obs) -> str:
+    """Real workflow policy: inspect -> tests/linter -> docs -> fix -> negotiate -> done."""
     author_msg = (getattr(obs, "author_response", "") or "").lower()
     tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
     if not getattr(obs, "docs_queried", False):
         return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
     if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
         bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
         return json.dumps(
             }
         )
     if getattr(obs, "current_test_score", 0.0) < 0.95:
         fix_code = _expert_fix_from_context(obs)
         return json.dumps({"action_type": "fix", "content": fix_code})
     if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
         return (
             '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
             'The change is intentionally small to reduce regression risk."}'
         )
     conf = float(getattr(obs, "author_confidence", 0.0))
     threshold = float(getattr(obs, "author_threshold", 0.5))
     score = float(getattr(obs, "current_test_score", 0.0))
     if conf >= threshold and score >= 0.8:
         return '{"action_type": "done"}'
     return (
         '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
     )
 # ======================================================================
def build_prompt(obs, history_lines: List[str]) -> str:
    """Render the instruction prompt for one environment step.

    Args:
        obs: Environment observation. All fields are read with ``getattr``,
            so a missing or ``None`` field falls back to a placeholder
            instead of raising or leaking the text "None" into the prompt.
        history_lines: Earlier step lines; only the last six are appended.

    Returns:
        The full prompt text, ending with the JSON response format and, when
        history is present, a "Previous steps" section.
    """
    author_msg = getattr(obs, "author_response", "") or ""
    tool_output = getattr(obs, "last_tool_output", "") or ""
    author_personality = getattr(obs, "author_personality", "defensive")
    # Same defensive lookup as the fields above; the original accessed
    # obs.code_snippet directly.
    code_snippet = getattr(obs, "code_snippet", "") or ""
    prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
- Tests pass (high pass ratio)
- Lint is clean (zero errors)
- Documentation or references are provided
- Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
Workflow:
1. Use `inspect` to understand the code.
2. Use `run_tests` and `run_linter` to gather evidence.
3. Use `query_docs` when you need references or language-specific guidance.
4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
5. If the developer pushes back, read their response carefully and address their specific concern.
6. Once convinced, use `done` to finish.
Code:
{code_snippet}
Author says:
{author_msg if author_msg else "(no response yet – start with inspection)"}
Last tool output:
{tool_output if tool_output else "(none)"}
Available actions:
run_tests, run_linter, inspect, query_docs, fix, comment, question, done
Respond ONLY in JSON:
{{"action_type": "...", "content": "..."}}"""
    if history_lines:
        # Cap the context at the six most recent lines to bound prompt length.
        history = "\n".join(history_lines[-6:])
        prompt += f"\n\nPrevious steps:\n{history}"
    return prompt
 # ======================================================================
@dataclass
class Trajectory:
    """Per-step records of one rollout, stored as parallel lists.

    The length of a trajectory is defined as the number of recorded states.
    """

    # Presumably the prompt text shown to the model at each step; confirm
    # against the code that builds trajectories.
    states: List[str]
    # Presumably the raw action text the model produced at each step.
    actions: List[str]
    # One reward value per step.
    rewards: List[float]
    # One log-probability value per step.
    logprobs: List[float]
    # One flag per step.
    dones: List[bool]

    def __len__(self) -> int:
        """Return the number of recorded states."""
        step_count = len(self.states)
        return step_count
 def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_retries=2):
     messages = [{"role": "user", "content": prompt}]
     formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # 1024 max length, no unsloth
+    inputs = tokenizer(formatted, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
     for attempt in range(max_retries):
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
+                max_new_tokens=128,
                 do_sample=(temperature > 0),
                 temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
                 min_new_tokens=1,
             )
         generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
         action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
         logprobs = []
         for idx, token_id in enumerate(generated_ids):
             if idx < len(outputs.scores):
                 token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
                 logprobs.append(token_logprob)
         total_logprob = sum(logprobs) if logprobs else -100.0
         if not action_text:
             fallback_actions = [
                 '{"action_type": "run_tests"}',
             total_logprob = -50.0
             print(f"[WARN] Empty generation → using fallback: {action_text}")
             return action_text, total_logprob
         try:
             json.loads(action_text)
             return action_text, total_logprob
             continue
     return '{"action_type":"skip"}', -100.0
 def collect_trajectory(env, model, tokenizer, max_steps=6, temperature=0.0):
     obs = env.reset()
     history_lines = []
     return trajectories
 def compute_returns_and_advantages(rewards, dones, gamma=0.99, standardize=True):
     n = len(rewards)
     returns = [0.0]*n
     running = 0.0
             messages = [{"role": "user", "content": state}]
             formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
             full_text = formatted + action
+            inputs = tokenizer(full_text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
             outputs = model(**inputs)
             logits = outputs.logits
             action_ids = tokenizer.encode(action, add_special_tokens=False)
             if not logprobs: continue
             new_logprob = sum(logprobs)
             avg_entropy = entropy / len(logprobs) if logprobs else 0.0
+            ratio = torch.exp(new_logprob - old_logprob)
             surr1 = ratio * advantage
             surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
             policy_loss = -torch.min(surr1, surr2)
 def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
                     task_levels=None, verbose=False):
     model.eval()
     if task_levels is None:
         task_levels = list(BUG_DB.keys())
     total_rewards = []
+    traces = []
     for ep in range(n_episodes):
         task = task_levels[ep % len(task_levels)]
         env.set_task(task)
         if verbose:
             actions_taken = []
             for a in traj.actions:
+                try: actions_taken.append(json.loads(a).get("action_type", "?"))
+                except: actions_taken.append("?")
             traces.append({
                 "task": task,
                 "reward": round(ep_reward, 4),
     }
 # ======================================================================
+# Manual warm-up from JSON (no SFTTrainer, no Unsloth)
 def json_warmup(model, tokenizer, json_path="training_data.json",
+                n_episodes=25, epochs=3, lr=2e-5):
     print("\n" + "="*60)
     print("SUPERVISED WARM-UP: training_data.json (manual cross-entropy)")
     print("="*60)
     with open(json_path, encoding="utf-8") as f:
         data = json.load(f)
     steps_per_episode = 7
     max_examples = n_episodes * steps_per_episode
     if max_examples < len(data):
     model.train()
     warmup_opt = AdamW(model.parameters(), lr=lr)
+    warmup_losses = []
     for epoch in range(epochs):
         random.shuffle(data)
             prompt = example["prompt"]
             action = example["action"]
             messages = [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": action},
             ]
             full_text = tokenizer.apply_chat_template(messages, tokenize=False)
+            inputs = tokenizer(full_text, return_tensors="pt", max_length=1024, truncation=True).to("cuda")
             prompt_only = tokenizer.apply_chat_template(
                 [{"role": "user", "content": prompt}],
                 tokenize=False, add_generation_prompt=True
             total_len = inputs.input_ids.shape[1]
             if prompt_len >= total_len:
+                continue
             outputs = model(**inputs)
             logits = outputs.logits
             shift_logits = logits[0, prompt_len - 1 : total_len - 1]
             shift_labels = inputs.input_ids[0, prompt_len : total_len]
 # ======================================================================
 # MAIN TRAINING PIPELINE
 def train_ppo():
+    n_iterations = 15
+    trajectories_per_iter = 6
     n_epochs = 2
     max_steps = 8
     learning_rate = 3e-5
     entropy_coef = 0.01
     gamma = 0.99
+    # Pre-load embedder (unchanged)
     from rltool import ToolBox
     print("Pre-loading sentence-transformer embedder...")
     ToolBox._get_embedder()
     print("✓ Embedder ready")
     model, tokenizer = load_model()
     if not test_model_sanity(model, tokenizer):
         return
     env = CodeReviewEnv()
     task_levels = list(BUG_DB.keys())
+    # Phase 0: baseline
     print("\n" + "="*60)
     print("PHASE 0 – BASELINE EVALUATION (untrained)")
     print("="*60)
         print(f"  task={t['task']:8s}  reward={t['reward']:+.4f}  "
               f"steps={t['steps']}  actions={t['actions']}")
+    # Phase 1: supervised warm-up
+    warmup_losses = json_warmup(model, tokenizer, json_path="training_data.json",
+                                n_episodes=25, epochs=3, lr=2e-5)
     print("="*60)
     print("POST WARM-UP EVALUATION")
     print("="*60)
         print(f"  task={t['task']:8s}  reward={t['reward']:+.4f}  "
               f"steps={t['steps']}  actions={t['actions']}")
     optimizer = AdamW(model.parameters(), lr=learning_rate)
     print(f"\n{'='*60}")
     print(f"PHASE 2 – PPO TRAINING: {n_iterations} iterations × "
           f"{trajectories_per_iter} trajectories (true RL)")
     print(f"{'='*60}\n")
+    reward_history, eval_history, loss_history = [], [], []
     for iteration in range(n_iterations):
         print(f"\n--- PPO Iteration {iteration + 1}/{n_iterations} ---")
         trajectories = collect_trajectories(
             env, model, tokenizer, trajectories_per_iter, max_steps,
             task_levels=task_levels, task_weights=None
         reward_history.append(avg_reward)
         print(f"  Collect  avg reward: {avg_reward:+.4f}")
         metrics = ppo_update(
             trajectories, model, tokenizer, optimizer,
             n_epochs=n_epochs, clip_epsilon=clip_epsilon,
             entropy_coef=entropy_coef, gamma=gamma
         )
         loss_history.append(float(metrics["loss"]))
         print(f"  Update   loss={metrics['loss']:.4f}  "
               f"policy={metrics['policy_loss']:.4f}  "
               f"entropy={metrics['entropy']:.4f}")
         eval_m = evaluate_policy(env, model, tokenizer, n_episodes=3,
                                  max_steps=max_steps, task_levels=task_levels,
                                  verbose=False)
         print(f"  Eval     avg reward: {eval_m['avg_reward']:+.4f}  "
               f"(Δ baseline: {delta:+.4f})")
     print("\n" + "="*60)
     print("PHASE 3 – FINAL EVALUATION (after all training)")
     print("="*60)
         print(f"  ✗ No overall improvement detected")
     print(f"{'='*60}")
+    # Plots
     iters = list(range(1, n_iterations + 1))
     if warmup_losses:
         fig, ax = plt.subplots(figsize=(7, 4))
+        ax.plot(range(1, len(warmup_losses)+1), warmup_losses,
                 marker="o", linewidth=2, color="tab:purple")
+        ax.set_title("Warm-up Loss (supervised, per epoch)", fontsize=13, fontweight="bold")
+        ax.set_xlabel("Epoch"); ax.set_ylabel("Cross-Entropy Loss")
+        ax.grid(alpha=0.3); fig.tight_layout()
+        fig.savefig("warmup_loss.png", dpi=150); plt.close(fig)
+    fig, ax = plt.subplots(figsize=(9,5))
     ax.plot(iters, reward_history, marker="o", linewidth=2,
             label="Collect reward", color="tab:blue")
     ax.plot(iters, eval_history, marker="s", linewidth=2, linestyle="--",
     ax.axhline(y=warmup_reward, color="tab:purple", linestyle=":",
                linewidth=1.5, label=f"Post-warmup ({warmup_reward:+.3f})")
     ax.set_title("PPO Reward per Iteration", fontsize=14, fontweight="bold")
+    ax.set_xlabel("Iteration"); ax.set_ylabel("Average Reward")
+    ax.legend(loc="best", fontsize=8); ax.grid(alpha=0.3)
+    fig.tight_layout(); fig.savefig("reward_curve.png", dpi=150); plt.close(fig)
+    fig, ax = plt.subplots(figsize=(9,5))
     ax.plot(iters, loss_history, marker="o", linewidth=2,
             label="Total loss", color="tab:red")
     ax.set_title("PPO Loss per Iteration", fontsize=14, fontweight="bold")
+    ax.set_xlabel("Iteration"); ax.set_ylabel("Loss")
+    ax.legend(loc="best"); ax.grid(alpha=0.3)
+    fig.tight_layout(); fig.savefig("loss_curve.png", dpi=150); plt.close(fig)
+    print("Plots saved: warmup_loss.png, reward_curve.png, loss_curve.png")
     print("="*60)
 if __name__ == "__main__":