Spaces:

100XZX001
/

CodeReview-Professional-Workflow

Sleeping

App Files Files Community

100XZX001 committed on Apr 23

Commit

6b4dcf0

verified ·

1 Parent(s): 778f292

Update training.py

Browse files

Files changed (1) hide show

training.py +210 -100

training.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# training.py – True PPO-based RL training with multi-step trajectories
 import json
 import torch
@@ -7,11 +7,15 @@ from torch.optim import AdamW
 from dataclasses import dataclass
 from typing import List, Dict, Tuple, Optional
 import numpy as np
 from unsloth import FastLanguageModel
 from transformers import TrainingArguments
-# Import your environment and actions
 from environment import CodeReviewEnv
 from models import (
     RunTests, RunLinter, Inspect,
@@ -20,7 +24,7 @@ from models import (
 )
 # ======================================================================
-# 1. ACTION PARSING (unchanged from original)
 # ======================================================================
 @dataclass
 class AgentAction:
@@ -28,6 +32,8 @@ class AgentAction:
     content: Optional[str] = None
 def parse_action(output: str) -> AgentAction:
     try:
         data = json.loads(output)
         return AgentAction(
@@ -35,7 +41,36 @@ def parse_action(output: str) -> AgentAction:
             content=data.get("content")
         )
     except:
-        return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
     if action.action_type == "run_tests":
@@ -56,7 +91,7 @@ def map_to_env(action: AgentAction):
         return Skip()
 # ======================================================================
-# 2. MODEL SETUP
 # ======================================================================
 def load_model():
     model, tokenizer = FastLanguageModel.from_pretrained(
@@ -64,32 +99,139 @@ def load_model():
         max_seq_length=2048,
         load_in_4bit=True,
     )
     model = FastLanguageModel.get_peft_model(
         model,
-        r=64,
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj"
         ],
-        lora_alpha=64,
-        lora_dropout=0,
     )
     return model, tokenizer
 # ======================================================================
-# 3. ACTION GENERATION WITH LOGPROB TRACKING
 # ======================================================================
 def generate_action_with_logprob(
     prompt: str,
     model,
     tokenizer,
-    temperature: float = 0.8,
     max_retries: int = 2
 ) -> Tuple[str, float]:
-    """
-    Generate action and return (action_text, logprob)
-    """
-    formatted = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
     inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
     for attempt in range(max_retries):
@@ -97,25 +239,37 @@ def generate_action_with_logprob(
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=128,
-                do_sample=True,
-                temperature=temperature,
                 return_dict_in_generate=True,
                 output_scores=True,
             )
-        # Extract generated tokens
         generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
         action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-        # Compute logprob for the generated sequence
         logprobs = []
         for idx, token_id in enumerate(generated_ids):
             if idx < len(outputs.scores):
-                token_logits = outputs.scores[idx][0]  # [vocab_size]
                 token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
                 logprobs.append(token_logprob)
-        total_logprob = sum(logprobs)
         # Validate JSON
         try:
@@ -123,14 +277,13 @@ def generate_action_with_logprob(
             return action_text, total_logprob
         except:
             if attempt == max_retries - 1:
-                # Return skip action with penalty logprob
                 return '{"action_type":"skip"}', -100.0
             continue
     return '{"action_type":"skip"}', -100.0
 # ======================================================================
-# 4. PROMPT BUILDER (unchanged from original)
 # ======================================================================
 def build_prompt(obs, history_lines: List[str]) -> str:
     prompt = f"""You are a code review agent.
@@ -153,7 +306,7 @@ Respond ONLY in JSON:
     return prompt
 # ======================================================================
-# 5. TRAJECTORY STORAGE
 # ======================================================================
 @dataclass
 class Trajectory:
@@ -176,18 +329,15 @@ class Trajectory:
         }
 # ======================================================================
-# 6. ROLLOUT COLLECTION
 # ======================================================================
 def collect_trajectory(
     env: CodeReviewEnv,
     model,
     tokenizer,
     max_steps: int = 10,
-    temperature: float = 0.8
 ) -> Trajectory:
-    """
-    Collect a single trajectory with full RL data.
-    """
     obs = env.reset()
     history_lines = []
@@ -198,18 +348,15 @@ def collect_trajectory(
     dones = []
     for step in range(max_steps):
-        # Build prompt
         prompt = build_prompt(obs, history_lines)
         states.append(prompt)
-        # Generate action with logprob
         action_text, logprob = generate_action_with_logprob(
             prompt, model, tokenizer, temperature
         )
         actions.append(action_text)
         logprobs.append(logprob)
-        # Parse and execute
         action = parse_action(action_text)
         env_action = map_to_env(action)
         next_obs, reward, done, _ = env.step(env_action)
@@ -217,7 +364,6 @@ def collect_trajectory(
         rewards.append(reward.value)
         dones.append(done)
-        # Update history
         history_lines.append(f"Agent: {action_text}")
         history_lines.append(f"Env: {next_obs.last_tool_output}")
@@ -234,7 +380,6 @@ def collect_trajectories(
     n_trajectories: int,
     max_steps: int = 10
 ) -> List[Trajectory]:
-    """Collect multiple trajectories."""
     trajectories = []
     for i in range(n_trajectories):
         traj = collect_trajectory(env, model, tokenizer, max_steps)
@@ -245,7 +390,7 @@ def collect_trajectories(
     return trajectories
 # ======================================================================
-# 7. ADVANTAGE ESTIMATION (GAE)
 # ======================================================================
 def compute_gae(
     rewards: List[float],
@@ -254,13 +399,7 @@ def compute_gae(
     gamma: float = 0.99,
     lambda_: float = 0.95
 ) -> Tuple[List[float], List[float]]:
-    """
-    Compute Generalized Advantage Estimation.
-    If no value function provided, use reward-to-go as returns.
-    """
     n = len(rewards)
-    # Compute returns (reward-to-go)
     returns = [0.0] * n
     running_return = 0.0
     for t in reversed(range(n)):
@@ -269,16 +408,13 @@ def compute_gae(
         running_return = rewards[t] + gamma * running_return
         returns[t] = running_return
-    # If no value function, use returns as advantages (centered)
     if values is None:
         advantages = returns
-        # Normalize advantages
         adv_mean = np.mean(advantages)
         adv_std = np.std(advantages) + 1e-8
         advantages = [(a - adv_mean) / adv_std for a in advantages]
         return advantages, returns
-    # GAE with value function
     advantages = [0.0] * n
     gae = 0.0
     for t in reversed(range(n)):
@@ -289,44 +425,39 @@ def compute_gae(
         gae = delta + gamma * lambda_ * gae
         advantages[t] = gae
-    # Normalize
     adv_mean = np.mean(advantages)
     adv_std = np.std(advantages) + 1e-8
     advantages = [(a - adv_mean) / adv_std for a in advantages]
     return advantages, returns
 # ======================================================================
-# 8. COMPUTE NEW LOGPROBS (for PPO ratio)
 # ======================================================================
 def compute_logprob(prompt: str, action: str, model, tokenizer) -> float:
-    """
-    Compute log probability of action given prompt.
-    """
-    formatted = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n{action}"
-    inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
     with torch.no_grad():
         outputs = model(**inputs)
-        logits = outputs.logits  # [1, seq_len, vocab_size]
-    # Get action tokens
     action_ids = tokenizer.encode(action, add_special_tokens=False)
-    action_start = inputs['input_ids'].shape[1] - len(action_ids)
-    # Compute logprob for action tokens
     logprobs = []
     for idx, token_id in enumerate(action_ids):
-        position = action_start + idx - 1  # -1 because logits are shifted
-        if position >= 0 and position < logits.shape[1]:
             token_logits = logits[0, position]
             token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
             logprobs.append(token_logprob)
     return sum(logprobs) if logprobs else -100.0
 # ======================================================================
-# 9. PPO UPDATE
 # ======================================================================
 def ppo_update(
     trajectories: List[Trajectory],
@@ -339,12 +470,8 @@ def ppo_update(
     gamma: float = 0.99,
     lambda_: float = 0.95,
 ) -> Dict[str, float]:
-    """
-    Perform PPO policy update.
-    """
     model.train()
-    # Flatten all trajectories into single dataset
     all_states = []
     all_actions = []
     all_old_logprobs = []
@@ -352,11 +479,9 @@ def ppo_update(
     all_returns = []
     for traj in trajectories:
-        # Compute advantages for this trajectory
         advantages, returns = compute_gae(
             traj.rewards, traj.dones, values=None, gamma=gamma, lambda_=lambda_
         )
         all_states.extend(traj.states)
         all_actions.extend(traj.actions)
         all_old_logprobs.extend(traj.logprobs)
@@ -364,46 +489,42 @@ def ppo_update(
         all_returns.extend(returns)
     n_samples = len(all_states)
     total_loss = 0.0
     total_policy_loss = 0.0
     total_entropy = 0.0
     n_updates = 0
-    # Multiple epochs over the data
     for epoch in range(n_epochs):
-        # Shuffle data
         indices = np.random.permutation(n_samples)
         for i in indices:
             state = all_states[i]
             action = all_actions[i]
             old_logprob = all_old_logprobs[i]
             advantage = all_advantages[i]
-            # Compute new logprob with gradient
-            formatted = f"<start_of_turn>user\n{state}<end_of_turn>\n<start_of_turn>model\n{action}"
-            inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
             outputs = model(**inputs)
             logits = outputs.logits
-            # Get action tokens
             action_ids = tokenizer.encode(action, add_special_tokens=False)
-            action_start = inputs['input_ids'].shape[1] - len(action_ids)
-            # Compute logprob for action
             logprobs = []
             entropy = 0.0
             for idx, token_id in enumerate(action_ids):
                 position = action_start + idx - 1
-                if position >= 0 and position < logits.shape[1]:
                     token_logits = logits[0, position]
                     log_probs = F.log_softmax(token_logits, dim=-1)
                     token_logprob = log_probs[token_id]
                     logprobs.append(token_logprob)
-                    # Entropy
                     probs = F.softmax(token_logits, dim=-1)
                     entropy += -(probs * log_probs).sum()
@@ -413,16 +534,12 @@ def ppo_update(
             new_logprob = sum(logprobs)
             avg_entropy = entropy / len(logprobs) if logprobs else 0.0
-            # PPO objective
             ratio = torch.exp(new_logprob - old_logprob)
             surr1 = ratio * advantage
             surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
             policy_loss = -torch.min(surr1, surr2)
-            # Total loss
             loss = policy_loss - entropy_coef * avg_entropy
-            # Backprop
             optimizer.zero_grad()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
@@ -440,7 +557,7 @@ def ppo_update(
     }
 # ======================================================================
-# 10. EVALUATION
 # ======================================================================
 def evaluate_policy(
     env: CodeReviewEnv,
@@ -449,22 +566,16 @@ def evaluate_policy(
     n_episodes: int = 10,
     max_steps: int = 10
 ) -> Dict[str, float]:
-    """
-    Evaluate policy over multiple episodes.
-    """
     model.eval()
     total_rewards = []
     episode_lengths = []
     success_count = 0
     for _ in range(n_episodes):
-        traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.5)
         total_reward = sum(traj.rewards)
         total_rewards.append(total_reward)
         episode_lengths.append(len(traj))
-        # Define success (e.g., reward > threshold)
         if total_reward > 0.5:
             success_count += 1
@@ -476,7 +587,7 @@ def evaluate_policy(
     }
 # ======================================================================
-# 11. MAIN TRAINING LOOP
 # ======================================================================
 def train_ppo(
     n_iterations: int = 50,
@@ -490,14 +601,18 @@ def train_ppo(
     lambda_: float = 0.95,
     eval_every: int = 5,
 ):
-    """
-    Main PPO training loop.
-    """
     print("Loading model...")
     model, tokenizer = load_model()
-    optimizer = AdamW(model.parameters(), lr=learning_rate)
     env = CodeReviewEnv()
     print(f"\n{'='*60}")
@@ -510,20 +625,17 @@ def train_ppo(
     for iteration in range(n_iterations):
         print(f"\n--- Iteration {iteration + 1}/{n_iterations} ---")
-        # Collect trajectories
         print("Collecting trajectories...")
         trajectories = collect_trajectories(
             env, model, tokenizer, trajectories_per_iter, max_steps
         )
-        # Compute statistics
         avg_reward = np.mean([sum(t.rewards) for t in trajectories])
         avg_length = np.mean([len(t) for t in trajectories])
         print(f"Avg reward: {avg_reward:.3f}")
         print(f"Avg length: {avg_length:.1f}")
-        # PPO update
         print("Updating policy...")
         metrics = ppo_update(
             trajectories,
@@ -541,7 +653,6 @@ def train_ppo(
         print(f"Policy loss: {metrics['policy_loss']:.4f}")
         print(f"Entropy: {metrics['entropy']:.4f}")
-        # Evaluation
         if (iteration + 1) % eval_every == 0:
             print("\nEvaluating policy...")
             eval_metrics = evaluate_policy(env, model, tokenizer, n_episodes=10)
@@ -549,7 +660,6 @@ def train_ppo(
             print(f"Eval success rate: {eval_metrics['success_rate']:.2%}")
             print(f"Eval avg length: {eval_metrics['avg_length']:.1f}")
-    # Final save
     print("\n" + "="*60)
     print("Training complete. Saving model...")
     model.save_pretrained("ppo_final_model")
@@ -558,7 +668,7 @@ def train_ppo(
     print("="*60)
 # ======================================================================
-# 12. ENTRY POINT
 # ======================================================================
 if __name__ == "__main__":
     train_ppo(

+# training.py – FIXED PPO training (no variable names changed)
 import json
 import torch
 from dataclasses import dataclass
 from typing import List, Dict, Tuple, Optional
 import numpy as np
+import re
+import random
 from unsloth import FastLanguageModel
 from transformers import TrainingArguments
+from trl import SFTTrainer
+from datasets import Dataset
+# Import your environment and actions (unchanged)
 from environment import CodeReviewEnv
 from models import (
     RunTests, RunLinter, Inspect,
 )
 # ======================================================================
+# 1. ACTION PARSING (improved with fallback)
 # ======================================================================
 @dataclass
 class AgentAction:
     content: Optional[str] = None
 def parse_action(output: str) -> AgentAction:
+    """Robust JSON parsing with regex fallback and keyword detection."""
+    # Try strict JSON first
     try:
         data = json.loads(output)
         return AgentAction(
             content=data.get("content")
         )
     except:
+        pass
+    # Try to extract JSON from markdown blocks
+    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', output, re.DOTALL)
+    if json_match:
+        try:
+            data = json.loads(json_match.group(1))
+            return AgentAction(
+                action_type=data.get("action_type", "").lower(),
+                content=data.get("content")
+            )
+        except:
+            pass
+    # Try to find "action_type" field with regex
+    action_pattern = r'"action_type"\s*:\s*"(\w+)"'
+    match = re.search(action_pattern, output)
+    if match:
+        return AgentAction(action_type=match.group(1).lower())
+    # Keyword detection as last resort
+    output_lower = output.lower()
+    if "test" in output_lower:
+        return AgentAction("run_tests")
+    if "lint" in output_lower:
+        return AgentAction("run_linter")
+    if "inspect" in output_lower:
+        return AgentAction("inspect")
+    return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
     if action.action_type == "run_tests":
         return Skip()
 # ======================================================================
+# 2. MODEL SETUP (stabilised LoRA)
 # ======================================================================
 def load_model():
     model, tokenizer = FastLanguageModel.from_pretrained(
         max_seq_length=2048,
         load_in_4bit=True,
     )
+    # FIXED: Lower rank (16), dropout=0 for stability
     model = FastLanguageModel.get_peft_model(
         model,
+        r=16,                     # was 64 → causes collapse
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj"
         ],
+        lora_alpha=32,            # adjusted for r=16
+        lora_dropout=0.0,         # dropout can cause empty outputs
     )
+    # Ensure tokenizer has correct chat template for Gemma-2
+    if tokenizer.chat_template is None:
+        tokenizer.chat_template = "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}<start_of_turn>user\n{{ message['content'] }}<end_of_turn>\n<start_of_turn>model\n{% elif message['role'] == 'assistant' %}{{ message['content'] }}<end_of_turn>\n{% endif %}{% endfor %}"
     return model, tokenizer
 # ======================================================================
+# 3. MODEL SANITY CHECK (new – ensures model can generate text)
+# ======================================================================
+def test_model_sanity(model, tokenizer) -> bool:
+    """Generate a short reply to a fixed prompt and report whether it is non-empty.
+
+    The prompt is rendered with the tokenizer's chat template, sampled for at
+    most 30 new tokens, and the decoded continuation is printed.
+
+    Args:
+        model: Model exposing ``generate``; inputs are moved to "cuda" before
+            the call.
+        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode`` and
+            the ``eos_token_id`` / ``pad_token_id`` attributes.
+
+    Returns:
+        True when the stripped, decoded continuation is non-empty, else False.
+    """
+    print("\n" + "="*60)
+    print("SANITY CHECK: Testing base model generation")
+    print("="*60)
+    test_prompt = "Hello, how are you?"
+    messages = [{"role": "user", "content": test_prompt}]
+    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
+    # Compare against None instead of using `or`: a pad id of 0 is a valid id
+    # but falsy, so `pad_token_id or eos_token_id` would silently discard it.
+    pad_token_id = tokenizer.pad_token_id
+    if pad_token_id is None:
+        pad_token_id = tokenizer.eos_token_id
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=30,
+            do_sample=True,
+            temperature=0.7,
+            min_new_tokens=1,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=pad_token_id,
+        )
+    # Keep only the continuation: drop the prompt tokens echoed at the front.
+    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
+    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+    print(f"Prompt: {test_prompt}")
+    print(f"Response: {repr(response)}")
+    if not response:
+        print("❌ Model produces empty output – cannot train.")
+        return False
+    print("✓ Model sanity check PASSED\n")
+    return True
+# ======================================================================
+# 4. SUPERVISED WARM-UP (teaches JSON output)
+# ======================================================================
+def supervised_warmup(model, tokenizer, n_examples=500, epochs=2):
+    """Fine-tune ``model`` on synthetic prompt/JSON-action pairs.
+
+    Builds ``n_examples`` chat-formatted examples, each pairing a generated
+    code-review prompt with a randomly chosen action serialised as JSON, and
+    trains on them with ``SFTTrainer``. The model is updated in place.
+
+    Args:
+        model: Model handed to ``SFTTrainer``.
+        tokenizer: Tokenizer exposing ``apply_chat_template``.
+        n_examples: Number of synthetic examples to generate.
+        epochs: Number of training epochs.
+    """
+    print("\n" + "="*60)
+    print("SUPERVISED WARM-UP: Teaching JSON format")
+    print("="*60)
+    # Serialise through json.dumps so every target is valid JSON. A hand-written
+    # literal with "\n" inside the content puts a raw newline into the JSON
+    # string, which json.loads rejects.
+    action_templates = [
+        json.dumps(action)
+        for action in (
+            {"action_type": "run_tests"},
+            {"action_type": "run_linter"},
+            {"action_type": "inspect"},
+            {"action_type": "fix", "content": "def corrected():\n    pass"},
+            {"action_type": "comment", "content": "This looks good."},
+            {"action_type": "question", "content": "Why is this variable used?"},
+            {"action_type": "done"},
+        )
+    ]
+    # Loop-invariant, so built once rather than on every iteration.
+    last_outputs = [
+        "Tests passed: 2/3",
+        "Linter found 1 error",
+        "Inspection complete",
+        "No previous action",
+    ]
+    examples = []
+    for i in range(n_examples):
+        code = f"def example_{i}():\n    return {i % 10}"
+        last_output = random.choice(last_outputs)
+        # Intended to mirror the prompt layout produced by build_prompt.
+        prompt = f"""You are a code review agent.
+Code:
+{code}
+Last Output:
+{last_output}
+Available actions:
+run_tests, run_linter, inspect, fix, comment, question, done
+Respond ONLY in JSON:
+{{"action_type": "...", "content": "..."}}"""
+        action_json = random.choice(action_templates)
+        messages = [
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": action_json}
+        ]
+        full_text = tokenizer.apply_chat_template(messages, tokenize=False)
+        examples.append({"text": full_text})
+    dataset = Dataset.from_list(examples)
+    # Use bf16 where the GPU supports it instead of forcing fp16.
+    # NOTE(review): a model loaded in bf16 may be rejected when fp16=True is
+    # requested - confirm against the Unsloth/TRL versions in use.
+    use_bf16 = torch.cuda.is_bf16_supported()
+    trainer = SFTTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        train_dataset=dataset,
+        dataset_text_field="text",
+        max_seq_length=512,
+        args=TrainingArguments(
+            output_dir="warmup_output",
+            num_train_epochs=epochs,
+            per_device_train_batch_size=4,
+            gradient_accumulation_steps=2,
+            learning_rate=2e-5,
+            logging_steps=50,
+            save_strategy="no",
+            fp16=not use_bf16,
+            bf16=use_bf16,
+        ),
+    )
+    print(f"Training on {n_examples} examples for {epochs} epochs...")
+    trainer.train()
+    print("✓ Warm-up complete\n")
+# ======================================================================
+# 5. ACTION GENERATION WITH LOGPROB TRACKING (fixed)
 # ======================================================================
 def generate_action_with_logprob(
     prompt: str,
     model,
     tokenizer,
+    temperature: float = 0.0,   # changed: greedy by default for stability
     max_retries: int = 2
 ) -> Tuple[str, float]:
+    """Generate action using correct chat template, with fallback."""
+    messages = [{"role": "user", "content": prompt}]
+    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer(formatted, return_tensors="pt").to("cuda")
     for attempt in range(max_retries):
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=128,
+                do_sample=(temperature > 0),
+                temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
+                min_new_tokens=1,
                 return_dict_in_generate=True,
                 output_scores=True,
             )
         generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
         action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+        # Compute logprob
         logprobs = []
         for idx, token_id in enumerate(generated_ids):
             if idx < len(outputs.scores):
+                token_logits = outputs.scores[idx][0]
                 token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
                 logprobs.append(token_logprob)
+        total_logprob = sum(logprobs) if logprobs else -100.0
+        # If empty, use fallback
+        if not action_text:
+            fallback_actions = [
+                '{"action_type": "run_tests"}',
+                '{"action_type": "run_linter"}',
+                '{"action_type": "inspect"}',
+                '{"action_type": "skip"}',
+            ]
+            action_text = random.choice(fallback_actions)
+            total_logprob = -50.0
+            print(f"[WARN] Empty generation → using fallback: {action_text}")
+            return action_text, total_logprob
         # Validate JSON
         try:
             return action_text, total_logprob
         except:
             if attempt == max_retries - 1:
                 return '{"action_type":"skip"}', -100.0
             continue
     return '{"action_type":"skip"}', -100.0
 # ======================================================================
+# 6. PROMPT BUILDER (unchanged – exactly as you wrote)
 # ======================================================================
 def build_prompt(obs, history_lines: List[str]) -> str:
     prompt = f"""You are a code review agent.
     return prompt
 # ======================================================================
+# 7. TRAJECTORY STORAGE (unchanged)
 # ======================================================================
 @dataclass
 class Trajectory:
         }
 # ======================================================================
+# 8. ROLLOUT COLLECTION (uses fixed generate)
 # ======================================================================
 def collect_trajectory(
     env: CodeReviewEnv,
     model,
     tokenizer,
     max_steps: int = 10,
+    temperature: float = 0.0   # changed to greedy
 ) -> Trajectory:
     obs = env.reset()
     history_lines = []
     dones = []
     for step in range(max_steps):
         prompt = build_prompt(obs, history_lines)
         states.append(prompt)
         action_text, logprob = generate_action_with_logprob(
             prompt, model, tokenizer, temperature
         )
         actions.append(action_text)
         logprobs.append(logprob)
         action = parse_action(action_text)
         env_action = map_to_env(action)
         next_obs, reward, done, _ = env.step(env_action)
         rewards.append(reward.value)
         dones.append(done)
         history_lines.append(f"Agent: {action_text}")
         history_lines.append(f"Env: {next_obs.last_tool_output}")
     n_trajectories: int,
     max_steps: int = 10
 ) -> List[Trajectory]:
     trajectories = []
     for i in range(n_trajectories):
         traj = collect_trajectory(env, model, tokenizer, max_steps)
     return trajectories
 # ======================================================================
+# 9. ADVANTAGE ESTIMATION (unchanged)
 # ======================================================================
 def compute_gae(
     rewards: List[float],
     gamma: float = 0.99,
     lambda_: float = 0.95
 ) -> Tuple[List[float], List[float]]:
     n = len(rewards)
     returns = [0.0] * n
     running_return = 0.0
     for t in reversed(range(n)):
         running_return = rewards[t] + gamma * running_return
         returns[t] = running_return
     if values is None:
         advantages = returns
         adv_mean = np.mean(advantages)
         adv_std = np.std(advantages) + 1e-8
         advantages = [(a - adv_mean) / adv_std for a in advantages]
         return advantages, returns
     advantages = [0.0] * n
     gae = 0.0
     for t in reversed(range(n)):
         gae = delta + gamma * lambda_ * gae
         advantages[t] = gae
     adv_mean = np.mean(advantages)
     adv_std = np.std(advantages) + 1e-8
     advantages = [(a - adv_mean) / adv_std for a in advantages]
     return advantages, returns
 # ======================================================================
+# 10. COMPUTE NEW LOGPROBS (now formats the prompt with the chat template)
 # ======================================================================
 def compute_logprob(prompt: str, action: str, model, tokenizer) -> float:
+    """Return the summed log-probability of ``action`` given ``prompt``.
+
+    The prompt is rendered with the chat template and tokenized with the same
+    call ``generate_action_with_logprob`` uses, then the action's token ids are
+    appended to it. Measuring the prefix on the tokens actually fed to the
+    model keeps the action offset correct even when the tokenizer prepends
+    special tokens such as BOS.
+
+    Args:
+        prompt: User-turn text placed into the chat template.
+        action: Action text whose likelihood is scored.
+        model: Model called with ``input_ids`` / ``attention_mask`` and
+            returning an object with ``.logits``.
+        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``encode``.
+
+    Returns:
+        Sum of the per-token log-probabilities of the action tokens, or
+        -100.0 when ``action`` encodes to no tokens.
+    """
+    messages = [{"role": "user", "content": prompt}]
+    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    prefix_ids = tokenizer(formatted, return_tensors="pt")["input_ids"].to("cuda")
+    action_ids = tokenizer.encode(action, add_special_tokens=False)
+    if not action_ids:
+        return -100.0
+    action_tensor = torch.tensor([action_ids], dtype=prefix_ids.dtype, device=prefix_ids.device)
+    # Concatenate ids rather than re-tokenizing "formatted + action", so the
+    # prefix/action boundary cannot shift through token merging.
+    input_ids = torch.cat([prefix_ids, action_tensor], dim=1)
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=torch.ones_like(input_ids))
+        logits = outputs.logits
+    action_start = prefix_ids.shape[1]
+    # Logits at position p score the token at p + 1, so the rows predicting
+    # the action tokens run from action_start - 1 up to the second-to-last.
+    step_logits = logits[0, action_start - 1:-1].float()
+    token_logprobs = F.log_softmax(step_logits, dim=-1)
+    chosen = token_logprobs.gather(-1, action_tensor[0].unsqueeze(-1)).squeeze(-1)
+    return chosen.sum().item()
 # ======================================================================
+# 11. PPO UPDATE (unchanged except that inputs are now formatted with the chat template)
 # ======================================================================
 def ppo_update(
     trajectories: List[Trajectory],
     gamma: float = 0.99,
     lambda_: float = 0.95,
 ) -> Dict[str, float]:
     model.train()
     all_states = []
     all_actions = []
     all_old_logprobs = []
     all_returns = []
     for traj in trajectories:
         advantages, returns = compute_gae(
             traj.rewards, traj.dones, values=None, gamma=gamma, lambda_=lambda_
         )
         all_states.extend(traj.states)
         all_actions.extend(traj.actions)
         all_old_logprobs.extend(traj.logprobs)
         all_returns.extend(returns)
     n_samples = len(all_states)
     total_loss = 0.0
     total_policy_loss = 0.0
     total_entropy = 0.0
     n_updates = 0
     for epoch in range(n_epochs):
         indices = np.random.permutation(n_samples)
         for i in indices:
             state = all_states[i]
             action = all_actions[i]
             old_logprob = all_old_logprobs[i]
             advantage = all_advantages[i]
+            # Use the same chat template for PPO update
+            messages = [{"role": "user", "content": state}]
+            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            full_text = formatted + action
+            inputs = tokenizer(full_text, return_tensors="pt").to("cuda")
             outputs = model(**inputs)
             logits = outputs.logits
             action_ids = tokenizer.encode(action, add_special_tokens=False)
+            prefix_ids = tokenizer.encode(formatted, add_special_tokens=False)
+            action_start = len(prefix_ids)
             logprobs = []
             entropy = 0.0
             for idx, token_id in enumerate(action_ids):
                 position = action_start + idx - 1
+                if 0 <= position < logits.shape[1]:
                     token_logits = logits[0, position]
                     log_probs = F.log_softmax(token_logits, dim=-1)
                     token_logprob = log_probs[token_id]
                     logprobs.append(token_logprob)
                     probs = F.softmax(token_logits, dim=-1)
                     entropy += -(probs * log_probs).sum()
             new_logprob = sum(logprobs)
             avg_entropy = entropy / len(logprobs) if logprobs else 0.0
             ratio = torch.exp(new_logprob - old_logprob)
             surr1 = ratio * advantage
             surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
             policy_loss = -torch.min(surr1, surr2)
             loss = policy_loss - entropy_coef * avg_entropy
             optimizer.zero_grad()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
     }
 # ======================================================================
+# 12. EVALUATION (unchanged)
 # ======================================================================
 def evaluate_policy(
     env: CodeReviewEnv,
     n_episodes: int = 10,
     max_steps: int = 10
 ) -> Dict[str, float]:
     model.eval()
     total_rewards = []
     episode_lengths = []
     success_count = 0
     for _ in range(n_episodes):
+        traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.0)
         total_reward = sum(traj.rewards)
         total_rewards.append(total_reward)
         episode_lengths.append(len(traj))
         if total_reward > 0.5:
             success_count += 1
     }
 # ======================================================================
+# 13. MAIN TRAINING LOOP (added sanity check and warm-up)
 # ======================================================================
 def train_ppo(
     n_iterations: int = 50,
     lambda_: float = 0.95,
     eval_every: int = 5,
 ):
     print("Loading model...")
     model, tokenizer = load_model()
+    # NEW: Sanity check before any training
+    if not test_model_sanity(model, tokenizer):
+        print("\n❌ Model sanity check failed – cannot proceed.")
+        return
+    # NEW: Supervised warm-up to teach JSON format
+    supervised_warmup(model, tokenizer, n_examples=500, epochs=2)
+    optimizer = AdamW(model.parameters(), lr=learning_rate)
     env = CodeReviewEnv()
     print(f"\n{'='*60}")
     for iteration in range(n_iterations):
         print(f"\n--- Iteration {iteration + 1}/{n_iterations} ---")
         print("Collecting trajectories...")
         trajectories = collect_trajectories(
             env, model, tokenizer, trajectories_per_iter, max_steps
         )
         avg_reward = np.mean([sum(t.rewards) for t in trajectories])
         avg_length = np.mean([len(t) for t in trajectories])
         print(f"Avg reward: {avg_reward:.3f}")
         print(f"Avg length: {avg_length:.1f}")
         print("Updating policy...")
         metrics = ppo_update(
             trajectories,
         print(f"Policy loss: {metrics['policy_loss']:.4f}")
         print(f"Entropy: {metrics['entropy']:.4f}")
         if (iteration + 1) % eval_every == 0:
             print("\nEvaluating policy...")
             eval_metrics = evaluate_policy(env, model, tokenizer, n_episodes=10)
             print(f"Eval success rate: {eval_metrics['success_rate']:.2%}")
             print(f"Eval avg length: {eval_metrics['avg_length']:.1f}")
     print("\n" + "="*60)
     print("Training complete. Saving model...")
     model.save_pretrained("ppo_final_model")
     print("="*60)
 # ======================================================================
+# 14. ENTRY POINT (unchanged)
 # ======================================================================
 if __name__ == "__main__":
     train_ppo(