Spaces:

100XZX001
/

code-review-training

Sleeping

App Files Files Community

100XZX001 committed on Apr 25

Commit

76f5801

verified ·

1 Parent(s): 0f1f590

Update training.py

Browse files

Files changed (1) hide show

training.py +92 -66

training.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# training.py – Memory‑safe: Phi‑3‑mini + Expert Demos + Fast PPO (2 iterations)
 import os
 os.environ["TRITON_DISABLE"] = "1"
@@ -14,6 +14,7 @@ import numpy as np
 import re
 import random
 import matplotlib.pyplot as plt
 from unsloth import FastLanguageModel
 from transformers import TrainingArguments
@@ -25,9 +26,15 @@ from redteam import BUG_DB
 from models import (
     RunTests, RunLinter, Inspect,
     ProposeFix, WriteComment, AskQuestion,
-    Done, Skip, QueryDocs, map_to_env as model_map_to_env
 )
 # ======================================================================
 @dataclass
 class AgentAction:
@@ -69,13 +76,30 @@ def parse_action(output: str) -> AgentAction:
     return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
-    return model_map_to_env(action.action_type, action.content)
 # ======================================================================
 def load_model():
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
-        max_seq_length=480,               # smaller window for memory
         load_in_4bit=True,
     )
     model = FastLanguageModel.get_peft_model(
@@ -120,10 +144,7 @@ def test_model_sanity(model, tokenizer) -> bool:
 # ======================================================================
 def _expert_fix_from_context(obs) -> str:
-    """
-    Build a conservative fix template named `fix` (required by tests).
-    Uses bug hints + code snippet patterns to create realistic fixes.
-    """
     bug = (getattr(obs, "bug_description", "") or "").lower()
     code = getattr(obs, "code_snippet", "") or ""
@@ -134,19 +155,16 @@ def _expert_fix_from_context(obs) -> str:
             "        return 0\n"
             "    return sum(data) / len(data)"
         )
     if "operator" in bug or "sign" in bug:
         return (
             "def fix(a, b):\n"
             "    return a + b"
         )
     if "off_by_one" in bug or "loop" in bug:
         return (
             "def fix(items):\n"
             "    return len(items)"
         )
     if "null" in bug or "key" in bug or "dict" in code.lower():
         return (
             "def fix(payload):\n"
@@ -154,8 +172,6 @@ def _expert_fix_from_context(obs) -> str:
             "    user_id = payload.get('id')\n"
             "    return users.get(user_id)"
         )
-    # Concurrency-heavy tasks (harder/hardest).
     if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
         return (
             "import threading\n"
@@ -167,7 +183,6 @@ def _expert_fix_from_context(obs) -> str:
             "            return 0\n"
             "        return counter + 1"
         )
     if "deadlock" in bug or "double_lock" in bug or "lock order" in bug or "nested_lock" in bug:
         return (
             "import threading\n"
@@ -182,7 +197,6 @@ def _expert_fix_from_context(obs) -> str:
             "        with second:\n"
             "            return work() if callable(work) else work"
         )
     if "fork_join" in bug or "join" in bug:
         return (
             "import threading\n"
@@ -193,8 +207,6 @@ def _expert_fix_from_context(obs) -> str:
             "    t.join()\n"
             "    return True"
         )
-    # Generic safe fallback keeps the RL pipeline alive for unknown bugs.
     return (
         "def fix(data):\n"
         "    if data is None:\n"
@@ -202,12 +214,8 @@ def _expert_fix_from_context(obs) -> str:
         "    return data"
     )
 def _expert_supervised_policy(obs) -> str:
-    """
-    Real workflow policy:
-    inspect -> tests/linter -> docs -> fix -> negotiate -> done.
-    """
     author_msg = (getattr(obs, "author_response", "") or "").lower()
     tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
@@ -222,22 +230,17 @@ def _expert_supervised_policy(obs) -> str:
     if not getattr(obs, "docs_queried", False):
         return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
-    # Use docs again on hard tasks when evidence is still weak.
     if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
         bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
-        return json.dumps(
-            {
-                "action_type": "query_docs",
-                "content": f"python {bug_hint} lock ordering race condition mitigation patterns",
-            }
-        )
-    # If test quality is poor, propose a concrete fix.
     if getattr(obs, "current_test_score", 0.0) < 0.95:
         fix_code = _expert_fix_from_context(obs)
         return json.dumps({"action_type": "fix", "content": fix_code})
-    # If author is still unconvinced, provide causal explanation.
     if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
         return (
             '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
@@ -245,55 +248,72 @@ def _expert_supervised_policy(obs) -> str:
             'The change is intentionally small to reduce regression risk."}'
         )
-    # If negotiation is strong enough and quality is good, terminate.
     conf = float(getattr(obs, "author_confidence", 0.0))
     threshold = float(getattr(obs, "author_threshold", 0.5))
     score = float(getattr(obs, "current_test_score", 0.0))
     if conf >= threshold and score >= 0.8:
         return '{"action_type": "done"}'
-    # Nudge conversation forward when tests are okay but acceptance is pending.
     return (
-        '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
     )
 # ======================================================================
-def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8):
     print("\n" + "="*60)
     print("SUPERVISED WARM-UP: Real environment demonstrations")
     print("="*60)
     examples = []
-    tasks = ["easy", "medium", "hard", "harder", "hardest"]
-    for ep in range(n_episodes):
-        task = random.choice(tasks)
-        env.set_task(task)
-        obs = env.reset()
-        history = []
-        done = False
-        steps = 0
-        while not done and steps < max_steps:
-            prompt = build_prompt(obs, history)
-            action_text = _expert_supervised_policy(obs)
-            action = parse_action(action_text)
-            env_action = map_to_env(action)
-            next_obs, _, done, _ = env.step(env_action)
             messages = [
                 {"role": "user", "content": prompt},
-                {"role": "assistant", "content": action_text},
             ]
             full_text = tokenizer.apply_chat_template(messages, tokenize=False)
             examples.append({"text": full_text})
-            history.append(f"Agent: {action_text}")
-            history.append(f"Env: {next_obs.last_tool_output}")
-            history = history[-8:]
-            obs = next_obs
-            steps += 1
-        print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
     if not examples:
         print("No supervised examples generated; skipping warm-up.")
@@ -317,9 +337,9 @@ def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=
             bf16=True,
         ),
     )
-    print(f"Training on {len(examples)} real env examples for {epochs} epochs...")
     trainer.train()
-    print("✓ Supervised warm-up (real env) complete\n")
     torch.cuda.empty_cache()
 # ======================================================================
@@ -551,7 +571,7 @@ def evaluate_policy(env, model, tokenizer, n_episodes=2, max_steps=6):
     return {"avg_reward": np.mean(total_rewards), "std_reward": np.std(total_rewards)}
 # ======================================================================
-def train_ppo():
     n_iterations = 2
     trajectories_per_iter = 2
     n_epochs = 1
@@ -568,8 +588,11 @@ def train_ppo():
         return
     env = CodeReviewEnv()
-    # Warm-up (real env demonstrations with expert policy)
-    supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8)
     optimizer = AdamW(model.parameters(), lr=learning_rate)
     task_levels = list(BUG_DB.keys())
@@ -607,6 +630,9 @@ def train_ppo():
         plt.grid(alpha=0.3); plt.tight_layout(); plt.savefig("loss_curve.png", dpi=150); plt.close()
     print("Plots saved as reward_curve.png and loss_curve.png.")
     print("="*60)
 if __name__ == "__main__":
-    train_ppo()

+# training.py – Pre‑loaded embedder, dual supervised warm‑up, 2 PPO iterations
 import os
 os.environ["TRITON_DISABLE"] = "1"
 import re
 import random
 import matplotlib.pyplot as plt
+from pathlib import Path
 from unsloth import FastLanguageModel
 from transformers import TrainingArguments
 from models import (
     RunTests, RunLinter, Inspect,
     ProposeFix, WriteComment, AskQuestion,
+    Done, Skip, QueryDocs
 )
+# Pre‑load the sentence‑transformer model to avoid OOM during warm‑up
+from rltool import ToolBox
+print("Pre‑loading documentation retriever …")
+ToolBox._get_embedder()
+print("Done.")
 # ======================================================================
 @dataclass
 class AgentAction:
     return AgentAction("invalid", output)
 def map_to_env(action: AgentAction):
+    if action.action_type == "run_tests":
+        return RunTests()
+    elif action.action_type == "run_linter":
+        return RunLinter()
+    elif action.action_type == "inspect":
+        return Inspect()
+    elif action.action_type == "fix":
+        return ProposeFix(fix_code=action.content or "")
+    elif action.action_type == "comment":
+        return WriteComment(comment_text=action.content or "")
+    elif action.action_type == "question":
+        return AskQuestion(question=action.content or "")
+    elif action.action_type == "query_docs":
+        return QueryDocs(query_topic=action.content or "")
+    elif action.action_type == "done":
+        return Done()
+    else:
+        return Skip()
 # ======================================================================
 def load_model():
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
+        max_seq_length=480,
         load_in_4bit=True,
     )
     model = FastLanguageModel.get_peft_model(
 # ======================================================================
 def _expert_fix_from_context(obs) -> str:
+    """Build a conservative fix template based on bug hints."""
     bug = (getattr(obs, "bug_description", "") or "").lower()
     code = getattr(obs, "code_snippet", "") or ""
             "        return 0\n"
             "    return sum(data) / len(data)"
         )
     if "operator" in bug or "sign" in bug:
         return (
             "def fix(a, b):\n"
             "    return a + b"
         )
     if "off_by_one" in bug or "loop" in bug:
         return (
             "def fix(items):\n"
             "    return len(items)"
         )
     if "null" in bug or "key" in bug or "dict" in code.lower():
         return (
             "def fix(payload):\n"
             "    user_id = payload.get('id')\n"
             "    return users.get(user_id)"
         )
     if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
         return (
             "import threading\n"
             "            return 0\n"
             "        return counter + 1"
         )
     if "deadlock" in bug or "double_lock" in bug or "lock order" in bug or "nested_lock" in bug:
         return (
             "import threading\n"
             "        with second:\n"
             "            return work() if callable(work) else work"
         )
     if "fork_join" in bug or "join" in bug:
         return (
             "import threading\n"
             "    t.join()\n"
             "    return True"
         )
     return (
         "def fix(data):\n"
         "    if data is None:\n"
         "    return data"
     )
 def _expert_supervised_policy(obs) -> str:
+    """Expert policy used during supervised warm‑up."""
     author_msg = (getattr(obs, "author_response", "") or "").lower()
     tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
     if not getattr(obs, "docs_queried", False):
         return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
     if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
         bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
+        return json.dumps({
+            "action_type": "query_docs",
+            "content": f"python {bug_hint} lock ordering race condition mitigation patterns"
+        })
     if getattr(obs, "current_test_score", 0.0) < 0.95:
         fix_code = _expert_fix_from_context(obs)
         return json.dumps({"action_type": "fix", "content": fix_code})
     if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
         return (
             '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
             'The change is intentionally small to reduce regression risk."}'
         )
     conf = float(getattr(obs, "author_confidence", 0.0))
     threshold = float(getattr(obs, "author_threshold", 0.5))
     score = float(getattr(obs, "current_test_score", 0.0))
     if conf >= threshold and score >= 0.8:
         return '{"action_type": "done"}'
     return (
+        '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, '
+        'the root cause, and how the fix prevents regressions?"}'
     )
 # ======================================================================
+def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8,
+                      json_path: Optional[str] = None):
+    """
+    Supervised warm‑up using either a JSON file of (prompt, action) pairs,
+    or a rule‑based expert playing in the real environment.
+    """
     print("\n" + "="*60)
     print("SUPERVISED WARM-UP: Real environment demonstrations")
     print("="*60)
     examples = []
+    if json_path and Path(json_path).exists():
+        print(f"Loading training examples from {json_path} ...")
+        with open(json_path, 'r', encoding='utf-8') as f:
+            raw_pairs = json.load(f)
+        for pair in raw_pairs:
+            prompt = pair["prompt"]
+            action = pair["action"]
             messages = [
                 {"role": "user", "content": prompt},
+                {"role": "assistant", "content": action}
             ]
             full_text = tokenizer.apply_chat_template(messages, tokenize=False)
             examples.append({"text": full_text})
+        print(f"Loaded {len(examples)} examples from JSON.")
+    else:
+        # Fallback to real environment rollouts with expert policy
+        tasks = ["easy", "medium", "hard", "harder", "hardest"]
+        for ep in range(n_episodes):
+            task = random.choice(tasks)
+            env.set_task(task)
+            obs = env.reset()
+            history = []
+            done = False
+            steps = 0
+            while not done and steps < max_steps:
+                prompt = build_prompt(obs, history)
+                action_text = _expert_supervised_policy(obs)
+                action = parse_action(action_text)
+                env_action = map_to_env(action)
+                next_obs, _, done, _ = env.step(env_action)
+                messages = [
+                    {"role": "user", "content": prompt},
+                    {"role": "assistant", "content": action_text},
+                ]
+                full_text = tokenizer.apply_chat_template(messages, tokenize=False)
+                examples.append({"text": full_text})
+                history.append(f"Agent: {action_text}")
+                history.append(f"Env: {next_obs.last_tool_output}")
+                history = history[-8:]
+                obs = next_obs
+                steps += 1
+            print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
     if not examples:
         print("No supervised examples generated; skipping warm-up.")
             bf16=True,
         ),
     )
+    print(f"Training on {len(examples)} examples for {epochs} epochs...")
     trainer.train()
+    print("✓ Supervised warm-up complete\n")
     torch.cuda.empty_cache()
 # ======================================================================
     return {"avg_reward": np.mean(total_rewards), "std_reward": np.std(total_rewards)}
 # ======================================================================
+def train_ppo(json_dataset_path: Optional[str] = None):
     n_iterations = 2
     trajectories_per_iter = 2
     n_epochs = 1
         return
     env = CodeReviewEnv()
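+    # Run supervised warm‑up twice (if a JSON path is given and the file exists, both passes load the same examples from it; otherwise each pass rolls out the expert policy in the environment)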
+    supervised_warmup(model, tokenizer, env, n_episodes=12, epochs=1, max_steps=8,
+                      json_path=json_dataset_path)
+    supervised_warmup(model, tokenizer, env, n_episodes=12, epochs=1, max_steps=8,
+                      json_path=json_dataset_path)
     optimizer = AdamW(model.parameters(), lr=learning_rate)
     task_levels = list(BUG_DB.keys())
         plt.grid(alpha=0.3); plt.tight_layout(); plt.savefig("loss_curve.png", dpi=150); plt.close()
     print("Plots saved as reward_curve.png and loss_curve.png.")
     print("="*60)
+# ======================================================================
 if __name__ == "__main__":
+    # Optionally provide a path to a JSON file of training pairs.
+    # The file must contain a JSON list of objects, e.g.: [{"prompt": "You are a code review agent...", "action": "{\"action_type\": \"inspect\"}"}]
+    train_ppo(json_dataset_path=None)   # set to your JSON file path if you have one