100XZX001 committed on
Commit 94b1baf · verified · 1 parent: 86c792b

Upload 23 files

Files changed (16)
  1. .gitattributes +35 -35
  2. .gitignore +3 -0
  3. Dockerfile +23 -23
  4. README.md +117 -14
  5. __init__.py +12 -10
  6. app.py +104 -104
  7. environment.py +624 -624
  8. grader.py +13 -12
  9. models.py +111 -111
  10. pyproject.toml +38 -30
  11. requirements-training.txt +9 -0
  12. requirements.txt +5 -11
  13. rltool.py +143 -127
  14. rubrics.py +136 -136
  15. training.py +934 -934
  16. training_data.json +0 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+ *.pyc
+ tmp/
Dockerfile CHANGED
@@ -1,24 +1,24 @@
- # Dockerfile – OpenEnv server with FastAPI and all dependencies
- FROM python:3.10-slim
-
- # Install system dependencies required for chromadb and sentence-transformers
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     build-essential \
-     && rm -rf /var/lib/apt/lists/*
-
- WORKDIR /app
-
- # Copy requirements and install Python dependencies
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
-
- # Copy the rest of the application
- COPY . .
-
- # Expose the port used by the FastAPI server
- EXPOSE 7860
-
- # Run the server using uvicorn
- # Note: 'server.app:app' assumes the FastAPI app is in server/app.py
- ENV ENABLE_WEB_INTERFACE=true
+ # Dockerfile – OpenEnv server with FastAPI and all dependencies
+ FROM python:3.10-slim
+
+ # Install system dependencies required for chromadb and sentence-transformers
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application
+ COPY . .
+
+ # Expose the port used by the FastAPI server
+ EXPOSE 7860
+
+ # Run the server using uvicorn
+ # Note: 'server.app:app' assumes the FastAPI app is in server/app.py
+ ENV ENABLE_WEB_INTERFACE=true
  CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,14 +1,117 @@
- ---
- title: CodeReview Training
- emoji: 🤖
- colorFrom: blue
- colorTo: green
- sdk: gradio
- sdk_version: 4.44.0
- app_file: app.py
- pinned: false
- ---
-
- # CodeReview PPO Training
-
- This Space trains an LLM agent to fix injected bugs using PPO and rubrics.
+ ---
+ title: CodeReview Training
+ emoji: "🤖"
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ pinned: false
+ ---
+
+ # CodeReview Professional Workflow
+
+ `CodeReview Professional Workflow` is an OpenEnv environment for training code-fixing agents on realistic review loops instead of one-shot coding tasks. The agent has to inspect buggy code, run tests, lint the patch, query docs, and persuade a simulated author before the episode is considered solved.
+
+ ## Quick links
+
+ | Artifact | Link |
+ | --- | --- |
+ | Hugging Face Space | [100XZX001/CodeReview-Professional-Workflow](https://huggingface.co/spaces/100XZX001/CodeReview-Professional-Workflow) |
+ | Colab-ready training notebook | [notebooks/code_review_unsloth_training.ipynb](notebooks/code_review_unsloth_training.ipynb) |
+ | Local training script | [training.py](training.py) |
+ | OpenEnv manifest | [openenv.yaml](openenv.yaml) |
+ | Submission slide deck | [submission_assets/code_review_openenv_submission.pptx](submission_assets/code_review_openenv_submission.pptx) |
+ | Training artifacts folder | [outputs/README.md](outputs/README.md) |
+
+ ## Why this environment
+
+ Most code agents are evaluated on static patch generation. Real review work is messier:
+
+ - you have to diagnose the failure mode before patching
+ - you often need tool feedback before you know whether the fix is safe
+ - you may need to explain the fix to another developer before it is accepted
+
+ This environment turns that workflow into a multi-step RL setting with dense rewards and stateful interaction.
+
+ ## How the environment works
+
+ Each episode samples one injected bug from five difficulty bands (an illustrative `hard` example follows the list):
+
+ 1. `easy`: null checks, missing defaults, simple indexing mistakes
+ 2. `medium`: off-by-one and wrong-operator bugs
+ 3. `hard`: numerical safety failures like divide-by-zero
+ 4. `harder`: concurrency issues like missing locks
+ 5. `hardest`: deadlock and coordination mistakes
+
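As a concrete illustration, the `hard` band starts from the small `average` helper defined in `environment.py` in this commit; a divide-by-zero injection could then simply drop the empty-input guard. The buggy variant below is illustrative only, since the exact mutation applied by `RedTeam` lives in `redteam.py` and is not expanded in this commit view.

```python
# Base "hard" snippet as defined in environment.py (this commit).
def average(data):
    if not data:
        return 0
    return sum(data) / len(data)

# Illustrative injected variant (assumption): the empty-list guard is gone,
# so average_buggy([]) raises ZeroDivisionError instead of returning 0.
def average_buggy(data):
    return sum(data) / len(data)
```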
+ The agent can take actions such as:
+
+ - `inspect`
+ - `run_tests`
+ - `run_linter`
+ - `query_docs`
+ - `fix`
+ - `comment`
+ - `question`
+ - `done`
+
+ Rewards combine test delta, lint delta, tool usage, exploration behavior, step penalties, and terminal success. The observation includes the current code, latest tool output, previous scores, author confidence, progress counters, and recent action history. A minimal interaction sketch follows.
+
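The sketch below shows one way a minimal episode could look when driving `CodeReviewEnv` directly (the classes come from `environment.py` and `models.py` in this commit). The argument-free constructors for `Inspect` and `RunTests` are an assumption; `ProposeFix(fix_code=...)`, `reward.value`, and the `info` keys match the environment code further down this page.

```python
# Minimal direct-use sketch; see environment.py below for the real step logic.
from environment import CodeReviewEnv
from models import Inspect, RunTests, ProposeFix  # field names per environment.py

env = CodeReviewEnv(task="easy", max_steps=10)
obs = env.reset()
print(obs.bug_description)

obs, reward, done, info = env.step(Inspect())    # read the buggy snippet
obs, reward, done, info = env.step(RunTests())   # establish a test-score baseline

# A real agent would submit corrected code here; we echo the snippet as a stub.
obs, reward, done, info = env.step(ProposeFix(fix_code=obs.code_snippet))
print(reward.value, done, info["test_score"])
```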
+ ## OpenEnv-first setup
+
+ This repo is structured as an OpenEnv environment rather than a custom one-off app:
+
+ - the environment metadata lives in [openenv.yaml](openenv.yaml)
+ - the Space is configured as a Docker-based OpenEnv deployment
+ - runtime dependencies are kept lightweight for the Space build
+ - training-only packages live separately so judges can run the environment without pulling the full training stack
+
+ The project now targets `openenv-core>=0.2.3`.
+
+ ## Training
+
+ The main training entrypoint is [training.py](training.py), which uses Unsloth plus a PPO-style loop over real environment interaction (a rough rollout-collection sketch follows below). For judges who want a rerunnable workflow, the repo also includes a Colab-ready notebook:
+
+ - [notebooks/code_review_unsloth_training.ipynb](notebooks/code_review_unsloth_training.ipynb)
+
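training.py is the authoritative implementation; purely as a sketch of the rollout-collection half of such a loop, under the assumption that the policy is exposed as a callable mapping observations to actions (the Unsloth model and the PPO update step are omitted, and `policy` below is a placeholder, not code from this repo):

```python
# Illustrative rollout collection only; the PPO/Unsloth update step is omitted.
from environment import CodeReviewEnv

def collect_episode(env: CodeReviewEnv, policy):
    """policy(obs) -> action stands in for the LLM policy head."""
    trajectory = []
    obs = env.reset()
    done = False
    while not done:
        action = policy(obs)                           # hypothetical policy call
        next_obs, reward, done, info = env.step(action)
        trajectory.append((obs, action, reward.value, done))
        obs = next_obs
    return trajectory
```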
+ ### Install locally
+
+ ```bash
+ pip install -e .
+ pip install -r requirements-training.txt
+ ```
+
+ ### Run training
+
+ ```bash
+ python training.py
+ ```
+
+ The training run writes the evidence plots in the working directory (a minimal plotting sketch follows the list):
+
+ - `warmup_loss.png`
+ - `reward_curve.png`
+ - `loss_curve.png`
+ - `training_summary.png`
+
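The plotting code itself lives in training.py; as a sketch of how a curve like `reward_curve.png` could be produced from per-episode returns (the helper name and the use of matplotlib are assumptions, only the output file name matches the list above):

```python
# Sketch: save a reward curve from a list of per-episode returns.
import matplotlib.pyplot as plt

def save_reward_curve(episode_rewards, path="reward_curve.png"):
    plt.figure()
    plt.plot(episode_rewards)
    plt.xlabel("episode")
    plt.ylabel("total reward")
    plt.title("PPO reward curve")
    plt.savefig(path)
    plt.close()
```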
+ For submission hygiene, copy a real run into `outputs/<run-name>/` and link that folder from this README before final judging.
+
+ ## Results and evidence
+
+ The expected evidence bundle for a real training run is:
+
+ - warm-up loss curve
+ - PPO reward curve
+ - PPO loss curve
+ - combined summary panel
+
+ Use [outputs/README.md](outputs/README.md) as the landing page for committed run artifacts.
+
+ ## Submission materials
+
+ This repo is set up so every judge-facing artifact can be reached from this README:
+
+ - environment Space: [100XZX001/CodeReview-Professional-Workflow](https://huggingface.co/spaces/100XZX001/CodeReview-Professional-Workflow)
+ - training notebook: [notebooks/code_review_unsloth_training.ipynb](notebooks/code_review_unsloth_training.ipynb)
+ - slide deck: [submission_assets/code_review_openenv_submission.pptx](submission_assets/code_review_openenv_submission.pptx)
+ - evidence folder: [outputs/README.md](outputs/README.md)
+
+ No large video files are stored in the repo; any future video or blog submission should be linked by URL from this README.
__init__.py CHANGED
@@ -4,13 +4,15 @@
  # This source code is licensed under the BSD-style license found in the
  # LICENSE file in the root directory of this source tree.
 
- """Criticrl Environment."""
-
- from .client import CriticrlEnv
- from .models import CriticrlAction, CriticrlObservation
-
- __all__ = [
-     "CriticrlAction",
-     "CriticrlObservation",
-     "CriticrlEnv",
- ]
+ """Code Review Professional Workflow OpenEnv package."""
+
+ from .client import CodeReviewEnv
+ from .models import AnyAction, Observation, Reward, State
+
+ __all__ = [
+     "AnyAction",
+     "Observation",
+     "Reward",
+     "State",
+     "CodeReviewEnv",
+ ]
app.py CHANGED
@@ -1,104 +1,104 @@
1
- # server/app.py – OpenEnv HTTP server
2
- import sys
3
- import os
4
- sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
5
-
6
- from fastapi import FastAPI, HTTPException
7
- from environment import CodeReviewEnv
8
- from models import AnyAction, Observation, Reward, State, action_adapter
9
-
10
- app = FastAPI(title="Code Review Environment", version="1.0.0")
11
- env = CodeReviewEnv()
12
-
13
- # ----------------------------------------------------------------------
14
- # Health & metadata endpoints
15
- # ----------------------------------------------------------------------
16
- @app.get("/")
17
- def root():
18
- print("[ROOT] Health check hit")
19
- return {"status": "crazy good"}
20
-
21
- @app.get("/health")
22
- def health():
23
- print("[HEALTH] Service is healthy")
24
- return {"status": "healthy"}
25
-
26
- @app.get("/metadata")
27
- def metadata():
28
- print("[METADATA] Requested")
29
- return {
30
- "name": "Code Review Professional Workflow",
31
- "description": (
32
- "Multi‑turn code review environment for professional‑level bug fixing. "
33
- "The agent must inspect, test, lint, query documentation, and negotiate with "
34
- "a simulated (persona‑driven) author to get a fix accepted. "
35
- "Includes 25 bugs across 5 difficulty levels, AST‑based injection, "
36
- "a reward‑shaping system (full/core profiles), and curriculum learning. "
37
- "Designed for RL training (PPO, DPO, or any policy‑gradient method)."
38
- )
39
- }
40
-
41
- @app.get("/schema")
42
- def schema():
43
- print("[SCHEMA] Requested")
44
- return {
45
- "action": AnyAction.model_json_schema(),
46
- "observation": Observation.model_json_schema(),
47
- "state": State.model_json_schema()
48
- }
49
-
50
- @app.post("/mcp")
51
- def mcp():
52
- print("[MCP] Ping received")
53
- return {"jsonrpc": "2.0", "result": None}
54
-
55
- # ----------------------------------------------------------------------
56
- # Environment endpoints
57
- # ----------------------------------------------------------------------
58
- @app.post("/reset")
59
- def reset(task: str = "easy"):
60
- try:
61
- print(f"[RESET] Starting new episode | task={task}")
62
-
63
- env.set_task(task)
64
- obs = env.reset()
65
-
66
- print(f"[RESET DONE] step={env._step_count}")
67
-
68
- return obs.__dict__
69
- except Exception as e:
70
- print(f"[RESET ERROR] {e}")
71
- raise HTTPException(status_code=400, detail=str(e))
72
-
73
- @app.post("/step")
74
- def step(action: dict):
75
- try:
76
- print(f"[STEP INPUT] {action}")
77
-
78
- parsed_action = action_adapter.validate_python(action)
79
- obs, reward, done, info = env.step(parsed_action)
80
-
81
- print(f"[STEP OUTPUT] reward={reward.value:.4f} | done={done}")
82
-
83
- return {
84
- "observation": obs.__dict__,
85
- "reward": reward.value,
86
- "done": done,
87
- "info": info
88
- }
89
- except Exception as e:
90
- print(f"[STEP ERROR] {e}")
91
- raise HTTPException(status_code=400, detail=str(e))
92
-
93
- @app.get("/state")
94
- def state():
95
- print("[STATE] Requested")
96
- return env._get_observation().__dict__
97
-
98
- # ----------------------------------------------------------------------
99
- # Main entry point (for local testing)
100
- # ----------------------------------------------------------------------
101
- if __name__ == "__main__":
102
- import uvicorn
103
- print("[SERVER START] Running on http://0.0.0.0:7860")
104
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # server/app.py – OpenEnv HTTP server
2
+ import sys
3
+ import os
4
+ sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from environment import CodeReviewEnv
8
+ from models import AnyAction, Observation, Reward, State, action_adapter
9
+
10
+ app = FastAPI(title="Code Review Environment", version="1.0.0")
11
+ env = CodeReviewEnv()
12
+
13
+ # ----------------------------------------------------------------------
14
+ # Health & metadata endpoints
15
+ # ----------------------------------------------------------------------
16
+ @app.get("/")
17
+ def root():
18
+ print("[ROOT] Health check hit")
19
+ return {"status": "crazy good"}
20
+
21
+ @app.get("/health")
22
+ def health():
23
+ print("[HEALTH] Service is healthy")
24
+ return {"status": "healthy"}
25
+
26
+ @app.get("/metadata")
27
+ def metadata():
28
+ print("[METADATA] Requested")
29
+ return {
30
+ "name": "Code Review Professional Workflow",
31
+ "description": (
32
+ "Multi‑turn code review environment for professional‑level bug fixing. "
33
+ "The agent must inspect, test, lint, query documentation, and negotiate with "
34
+ "a simulated (persona‑driven) author to get a fix accepted. "
35
+ "Includes 25 bugs across 5 difficulty levels, AST‑based injection, "
36
+ "a reward‑shaping system (full/core profiles), and curriculum learning. "
37
+ "Designed for RL training (PPO, DPO, or any policy‑gradient method)."
38
+ )
39
+ }
40
+
41
+ @app.get("/schema")
42
+ def schema():
43
+ print("[SCHEMA] Requested")
44
+ return {
45
+ "action": AnyAction.model_json_schema(),
46
+ "observation": Observation.model_json_schema(),
47
+ "state": State.model_json_schema()
48
+ }
49
+
50
+ @app.post("/mcp")
51
+ def mcp():
52
+ print("[MCP] Ping received")
53
+ return {"jsonrpc": "2.0", "result": None}
54
+
55
+ # ----------------------------------------------------------------------
56
+ # Environment endpoints
57
+ # ----------------------------------------------------------------------
58
+ @app.post("/reset")
59
+ def reset(task: str = "easy"):
60
+ try:
61
+ print(f"[RESET] Starting new episode | task={task}")
62
+
63
+ env.set_task(task)
64
+ obs = env.reset()
65
+
66
+ print(f"[RESET DONE] step={env._step_count}")
67
+
68
+ return obs.__dict__
69
+ except Exception as e:
70
+ print(f"[RESET ERROR] {e}")
71
+ raise HTTPException(status_code=400, detail=str(e))
72
+
73
+ @app.post("/step")
74
+ def step(action: dict):
75
+ try:
76
+ print(f"[STEP INPUT] {action}")
77
+
78
+ parsed_action = action_adapter.validate_python(action)
79
+ obs, reward, done, info = env.step(parsed_action)
80
+
81
+ print(f"[STEP OUTPUT] reward={reward.value:.4f} | done={done}")
82
+
83
+ return {
84
+ "observation": obs.__dict__,
85
+ "reward": reward.value,
86
+ "done": done,
87
+ "info": info
88
+ }
89
+ except Exception as e:
90
+ print(f"[STEP ERROR] {e}")
91
+ raise HTTPException(status_code=400, detail=str(e))
92
+
93
+ @app.get("/state")
94
+ def state():
95
+ print("[STATE] Requested")
96
+ return env._get_observation().__dict__
97
+
98
+ # ----------------------------------------------------------------------
99
+ # Main entry point (for local testing)
100
+ # ----------------------------------------------------------------------
101
+ if __name__ == "__main__":
102
+ import uvicorn
103
+ print("[SERVER START] Running on http://0.0.0.0:7860")
104
+ uvicorn.run(app, host="0.0.0.0", port=7860)
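For completeness, a minimal HTTP client for the `/reset` and `/step` routes above might look like the sketch below. The response keys match the handlers shown here, but the action payload is an assumption; the real schema comes from `models.py` (or from the `/schema` endpoint).

```python
# Minimal HTTP driver for the server above (assumes it runs on localhost:7860).
import requests

BASE = "http://localhost:7860"

obs = requests.post(f"{BASE}/reset", params={"task": "easy"}).json()
print(obs["bug_description"])

# The body below is a guess at one action encoding; check GET /schema first.
result = requests.post(f"{BASE}/step", json={"action_type": "run_tests"}).json()
print(result["reward"], result["done"], result["info"]["test_score"])
```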
environment.py CHANGED
@@ -1,628 +1,628 @@
1
- # environment.py – FULLY CORRECTED RL Environment (TRUE Markov + Fixed Bugs)
2
-
3
- import sys
4
- import subprocess
5
- import tempfile
6
- import os
7
- import re
8
- from dataclasses import dataclass, field
9
- from typing import Tuple, Dict, Any, Optional, List
10
-
11
- from models import (
12
- AnyAction, WriteComment, ProposeFix, Execute, Inspect,
13
- RunLinter, RunTests, QueryDocs, Skip, Done, AskQuestion,
14
- Observation, Reward, State
15
- )
16
- from redteam import RedTeam
17
- from test_runner import TestRunner
18
- from author import PersonaAuthor
19
- from rltool import ToolBox
20
- from rubrics import (
21
- ToolUsageRubric,
22
- TestDeltaRubric,
23
- LintDeltaRubric,
24
- TerminalSuccessRubric,
25
- ExplorationRubric,
26
- AntiHackingRubric,
27
- StepPenaltyRubric,
28
- )
29
-
30
- # ======================================================================
31
- # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
32
- # ======================================================================
33
- @dataclass
34
- class EnhancedObservation:
35
- code_snippet: str
36
- last_tool_output: str
37
-
38
- current_test_score: float
39
- current_lint_score: float
40
- negotiation_score: float
41
-
42
- previous_test_score: float
43
- previous_lint_score: float
44
-
45
- author_confidence: float
46
- author_threshold: float
47
-
48
- step: int
49
- max_steps: int
50
- progress_ratio: float
51
-
52
- tests_run: bool
53
- linter_run: bool
54
- docs_queried: bool
55
-
56
- last_action_type: str
57
- action_history: List[str]
58
-
59
- done: bool
60
-
61
- bug_description: str
62
- comments_count: int
63
-
64
- # default fields must be at the very end
65
- author_response: str = ""
66
-
67
- # ======================================================================
68
- # HELPER FUNCTIONS
69
- # ======================================================================
70
- def execute_code(code: str, timeout_sec: int = 5) -> Tuple[bool, str, str]:
71
- if not code.strip():
72
- return False, "", "Error: Empty code"
73
-
74
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
75
- f.write(code)
76
- tmp_path = f.name
77
-
78
- try:
79
- result = subprocess.run(
80
- [sys.executable, tmp_path],
81
- capture_output=True,
82
- text=True,
83
- timeout=timeout_sec
84
- )
85
- success = (result.returncode == 0)
86
- return success, result.stdout, result.stderr
87
- except subprocess.TimeoutExpired:
88
- return False, "", f"Timeout after {timeout_sec}s"
89
- except Exception as e:
90
- return False, "", f"Execution error: {str(e)}"
91
- finally:
92
- try:
93
- os.unlink(tmp_path)
94
- except:
95
- pass
96
-
97
-
98
- # ======================================================================
99
- # ENHANCED CODE REVIEW ENVIRONMENT
100
- # ======================================================================
101
- @dataclass
102
- class CodeReviewEnv:
103
- task: str = "easy"
104
- max_steps: int = 10
105
- step_penalty: float = 0.01
106
- reward_profile: str = "full" # "full" or "core"
107
-
108
- # Curriculum learning
109
- auto_difficulty: bool = False
110
- success_threshold: float = 0.7
111
-
112
- # Reward shaping parameters
113
- delta_weight: float = 0.3
114
- tool_usage_bonus: float = 0.05
115
- diversity_bonus: float = 0.03
116
-
117
- _red_team: Optional[RedTeam] = field(init=False, default=None)
118
- _author: Optional[PersonaAuthor] = field(init=False, default=None)
119
-
120
- _current_code: str = field(init=False, default="")
121
- _current_bug_id: str = field(init=False, default="")
122
- _bug_description: str = field(init=False, default="")
123
- _oracle_fix: str = field(init=False, default="")
124
-
125
- _comments: list = field(init=False, default_factory=list)
126
- _test_results: Optional[str] = field(init=False, default=None)
127
- _lint_results: Optional[str] = field(init=False, default=None)
128
- _doc_results: Optional[str] = field(init=False, default=None)
129
-
130
- _step_count: int = field(init=False, default=0)
131
- _done: bool = field(init=False, default=False)
132
-
133
- # State tracking for dense rewards
134
- _previous_test_score: float = field(init=False, default=0.0)
135
- _previous_lint_score: float = field(init=False, default=0.0)
136
- _current_test_score: float = field(init=False, default=0.0)
137
- _current_lint_score: float = field(init=False, default=0.0)
138
-
139
- # Tool usage tracking
140
- _tests_run: bool = field(init=False, default=False)
141
- _linter_run: bool = field(init=False, default=False)
142
- _docs_queried: bool = field(init=False, default=False)
143
-
144
- # Action history
145
- _action_history: List[str] = field(init=False, default_factory=list)
146
- _last_action_type: str = field(init=False, default="none")
147
- _last_author_response: str = field(init=False, default="")
148
-
149
- # FIXED: Track CUMULATIVE episode reward
150
- _episode_total_reward: float = field(init=False, default=0.0)
151
- _episode_rewards: List[float] = field(init=False, default_factory=list)
152
- _difficulty_level: int = field(init=False, default=0)
153
-
154
- # Bug-id bridge:
155
- # RedTeam has fine-grained IDs, while TestRunner currently expects a
156
- # smaller canonical set. Keep this mapping here so both modules can evolve
157
- # independently without breaking evaluation.
158
- _BUG_ID_CANONICAL_MAP = {
159
- # Easy-family
160
- "simple_typo": "null_check",
161
- "default_value": "null_check",
162
- "empty_return": "null_check",
163
- "string_index": "off_by_one",
164
-
165
- # Medium-family
166
- "loop_skip": "off_by_one",
167
- "sign_error": "wrong_operator",
168
- "swap_args": "wrong_operator",
169
- "uninitialised_var": "null_check",
170
-
171
- # Hard-family
172
- "division_by_zero_empty": "division_by_zero",
173
- "division_by_zero_zero": "division_by_zero",
174
- "float_precision": "division_by_zero",
175
- "abs_usage": "division_by_zero",
176
- "round_error": "division_by_zero",
177
- }
178
-
179
- # ===================================================================
180
- def __post_init__(self):
181
- self.set_task(self.task)
182
-
183
- # ===================================================================
184
- def _build_rubrics(self):
185
- """
186
- Build rubric stack from a named reward profile.
187
- - full: richer shaping for exploration/tool-use behavior
188
- - core: minimal stable signal for quick ablations/baselines
189
- """
190
- core_rubrics = [
191
- TestDeltaRubric(weight=self.delta_weight),
192
- LintDeltaRubric(weight=self.delta_weight),
193
- TerminalSuccessRubric(),
194
- StepPenaltyRubric(penalty=self.step_penalty),
195
- ]
196
- if self.reward_profile == "core":
197
- return core_rubrics
198
- if self.reward_profile == "full":
199
- return [
200
- *core_rubrics[:-1], # step penalty appended at end for consistent ordering
201
- ToolUsageRubric(bonus=self.tool_usage_bonus),
202
- ExplorationRubric(penalty=-0.05, bonus=self.diversity_bonus * 0.7),
203
- AntiHackingRubric(),
204
- core_rubrics[-1],
205
- ]
206
- raise ValueError(f"Unknown reward_profile: {self.reward_profile}")
207
-
208
- # ===================================================================
209
- def set_task(self, task: str):
210
- if task not in ["easy", "medium", "hard", "harder", "hardest"]:
211
- raise ValueError(f"Unknown task: {task}")
212
-
213
- self.task = task
214
- # Use stochastic bug sampling across episodes; fixed seed here would
215
- # repeatedly select the same bug and weaken training diversity.
216
- self._red_team = RedTeam(task, seed=None)
217
- self._author = PersonaAuthor()
218
- self.rubrics = self._build_rubrics()
219
-
220
- task_to_level = {
221
- "easy": 0, "medium": 1, "hard": 2,
222
- "harder": 3, "hardest": 4
223
- }
224
- self._difficulty_level = task_to_level[task]
225
-
226
- self._reset_internal()
227
-
228
- # ===================================================================
229
- def _reset_internal(self):
230
- self._step_count = 0 # ← FIXED
231
- self._comments = []
232
- self._test_results = None
233
- self._lint_results = None
234
- self._doc_results = None
235
- self._done = False
236
-
237
- # Reset state tracking
238
- self._previous_test_score = 0.0
239
- self._previous_lint_score = 0.0
240
- self._current_test_score = 0.0
241
- self._current_lint_score = 0.0
242
-
243
- self._tests_run = False
244
- self._linter_run = False
245
- self._docs_queried = False
246
-
247
- self._action_history = []
248
- self._last_action_type = "none"
249
- self._last_author_response = ""
250
-
251
- # FIXED: Reset episode cumulative reward
252
- self._episode_total_reward = 0.0
253
-
254
- self._author.reset()
255
-
256
- # Base tasks
257
- if self.task == "easy":
258
- original = "def get_user(id):\n if id in users:\n return users[id]"
259
- elif self.task == "medium":
260
- original = "def process_items(items):\n for item in items:\n print(item)"
261
- elif self.task == "hard":
262
- original = "def average(data):\n if not data:\n return 0\n return sum(data) / len(data)"
263
- elif self.task == "harder":
264
- original = "counter = 0\ndef increment():\n global counter\n with lock:\n counter += 1"
265
- else:
266
- original = "def safe_work():\n with lock1:\n with lock2:\n do_work()"
267
-
268
- buggy_code, bug_id, desc, oracle = self._red_team.inject_bug(original)
269
- self._current_code = buggy_code
270
- self._current_bug_id = bug_id
271
- self._bug_description = desc
272
- self._oracle_fix = oracle
273
- self._comments.append(f"[RedTeam] {desc}")
274
-
275
- # ===================================================================
276
- def reset(self) -> EnhancedObservation:
277
- """Reset with optional curriculum adjustment."""
278
- if self.auto_difficulty and len(self._episode_rewards) > 0:
279
- recent_performance = sum(self._episode_rewards[-5:]) / min(5, len(self._episode_rewards))
280
-
281
- if recent_performance > self.success_threshold and self._difficulty_level < 4:
282
- self._difficulty_level += 1
283
- print(f"[Curriculum] Increasing difficulty to level {self._difficulty_level}")
284
- elif recent_performance < 0.3 and self._difficulty_level > 0:
285
- self._difficulty_level -= 1
286
- print(f"[Curriculum] Decreasing difficulty to level {self._difficulty_level}")
287
-
288
- level_to_task = {0: "easy", 1: "medium", 2: "hard", 3: "harder", 4: "hardest"}
289
- self.task = level_to_task[self._difficulty_level]
290
- # Keep curriculum stochastic for better coverage within each level.
291
- self._red_team = RedTeam(self.task, seed=None)
292
-
293
- self._reset_internal()
294
- return self._get_observation()
295
-
296
- # ===================================================================
297
- def _get_observation(self) -> EnhancedObservation:
298
- """Return COMPLETE Markov state."""
299
- # Keep the author's message separate from tool output.
300
- # Using `_test_results` here can leak unrelated outputs (tests/linter/docs)
301
- # and gives the policy a noisy signal for dialogue actions.
302
- if self._last_action_type in ("comment", "question", "fix"):
303
- author_response = self._last_author_response
304
- else:
305
- author_response = ""
306
-
307
- return EnhancedObservation(
308
- code_snippet=self._current_code,
309
- last_tool_output=self._test_results or "",
310
- author_response=author_response, # ← now field exists
311
-
312
- current_test_score=self._current_test_score,
313
- current_lint_score=self._current_lint_score,
314
- negotiation_score=self._author.get_negotiation_score(),
315
-
316
- previous_test_score=self._previous_test_score,
317
- previous_lint_score=self._previous_lint_score,
318
-
319
- author_confidence=self._author._confidence,
320
- author_threshold=self._author.thresholds.get(self._author.personality, 0.5),
321
-
322
- step=self._step_count,
323
- max_steps=self.max_steps,
324
- # Guard against accidental `max_steps=0` configs.
325
- progress_ratio=(self._step_count / self.max_steps) if self.max_steps > 0 else 1.0,
326
-
327
- tests_run=self._tests_run,
328
- linter_run=self._linter_run,
329
- docs_queried=self._docs_queried,
330
-
331
- last_action_type=self._last_action_type,
332
- action_history=self._action_history[-5:],
333
-
334
- done=self._done,
335
-
336
- bug_description=self._bug_description,
337
- comments_count=len(self._comments),
338
- )
339
-
340
- # ===================================================================
341
- def _get_action_type(self, action: AnyAction) -> str:
342
- """Extract action type as string."""
343
- if isinstance(action, RunTests):
344
- return "run_tests"
345
- elif isinstance(action, RunLinter):
346
- return "run_linter"
347
- elif isinstance(action, QueryDocs):
348
- return "query_docs"
349
- elif isinstance(action, Execute):
350
- return "execute"
351
- elif isinstance(action, Inspect):
352
- return "inspect"
353
- elif isinstance(action, WriteComment):
354
- return "comment"
355
- elif isinstance(action, AskQuestion):
356
- return "question"
357
- elif isinstance(action, ProposeFix):
358
- return "fix"
359
- elif isinstance(action, Done):
360
- return "done"
361
- elif isinstance(action, Skip):
362
- return "skip"
363
- else:
364
- return "unknown"
365
-
366
- # ===================================================================
367
- def _get_test_runner_bug_id(self) -> str:
368
- """
369
- Normalize RedTeam bug ids to the canonical ids understood by TestRunner.
370
- Falls back to the original id for known direct matches.
371
- """
372
- return self._BUG_ID_CANONICAL_MAP.get(self._current_bug_id, self._current_bug_id)
373
-
374
- # ===================================================================
375
- def step(self, action: AnyAction) -> Tuple[EnhancedObservation, Reward, bool, Dict[str, Any]]:
376
- """
377
- TRUE RL STEP with:
378
- - Complete Markov observations (no hidden state)
379
- - Dense intermediate rewards
380
- - Delta-based credit assignment (no double-counting)
381
- - Proper episode reward tracking
382
- """
383
- if self._done:
384
- raise RuntimeError("Episode already finished")
385
-
386
- # Store previous metrics for delta computation
387
- self._previous_test_score = self._current_test_score
388
- self._previous_lint_score = self._current_lint_score
389
- # Snapshot tool-usage flags BEFORE action mutates them.
390
- # Rubrics use these to detect true "first-use" behavior.
391
- prev_tests_run = self._tests_run
392
- prev_linter_run = self._linter_run
393
- prev_docs_queried = self._docs_queried
394
-
395
- base_reward = 0.0
396
- action_type = self._get_action_type(action)
397
-
398
- # Update action history
399
- self._action_history.append(action_type)
400
- self._last_action_type = action_type
401
-
402
- # ==============================================================
403
- # TOOL ACTIONS
404
- # ==============================================================
405
- if isinstance(action, Execute):
406
- success, stdout, stderr = execute_code(self._current_code)
407
- output = (stdout + stderr).strip() or "No output"
408
- self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
409
- base_reward = 0.001 if success else -0.05
410
-
411
- elif isinstance(action, Inspect):
412
- self._test_results = f"[Inspect]\n{self._current_code[:500]}"
413
- base_reward = 0.001
414
-
415
- elif isinstance(action, RunLinter):
416
- lint_output = ToolBox.run_linter(self._current_code)
417
- self._lint_results = lint_output[:500]
418
- self._test_results = f"[Linter]\n{self._lint_results}"
419
-
420
- self._current_lint_score = self._run_linter_score(self._current_code)
421
- self._linter_run = True
422
- base_reward = 0.002
423
-
424
- elif isinstance(action, RunTests):
425
- runner = TestRunner(self._get_test_runner_bug_id())
426
- score, output = runner.run_tests(self._current_code)
427
-
428
- self._current_test_score = score
429
- self._tests_run = True
430
-
431
- self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
432
- base_reward = 0.002
433
-
434
- if score > 0.8:
435
- base_reward += 0.005
436
-
437
- elif isinstance(action, QueryDocs):
438
- # Normalize query to avoid rewarding empty/noisy requests.
439
- query_topic = (action.query_topic or "").strip()
440
- doc = ToolBox.query_docs(query_topic if query_topic else "general bug fixing")
441
- self._doc_results = doc
442
- self._test_results = f"[Docs]\n{doc[:400]}"
443
- self._docs_queried = True
444
- base_reward = 0.001
445
-
446
- # ==============================================================
447
- # COMMUNICATION ACTIONS
448
- # ==============================================================
449
- elif isinstance(action, WriteComment):
450
- self._comments.append(f"Agent: {action.comment_text}")
451
-
452
- response = self._author.respond(
453
- agent_comment=action.comment_text,
454
- test_results=self._test_results,
455
- lint_results=self._lint_results,
456
- doc_results=self._doc_results,
457
- proposed_fix=None,
458
- original_code=self._current_code
459
- )
460
-
461
- self._comments.append(f"Author: {response}")
462
- self._last_author_response = response
463
- self._test_results = f"[Comment] Author: {response[:200]}"
464
- base_reward = 0.001
465
-
466
- elif isinstance(action, AskQuestion):
467
- self._comments.append(f"Agent: {action.question}")
468
-
469
- response = self._author.respond(
470
- agent_question=action.question,
471
- test_results=self._test_results,
472
- lint_results=self._lint_results,
473
- doc_results=self._doc_results,
474
- proposed_fix=None,
475
- original_code=self._current_code # ← FIXED
476
- )
477
-
478
- self._comments.append(f"Author: {response}")
479
- self._last_author_response = response
480
- self._test_results = f"[Question] Author: {response[:200]}"
481
- base_reward = 0.002
482
-
483
- # ==============================================================
484
- # FINAL FIX ACTION
485
- # ==============================================================
486
- elif isinstance(action, ProposeFix):
487
- if not action.fix_code:
488
- base_reward = -0.05
489
- self._done = True
490
- else:
491
- # Save original code BEFORE overwriting (for author.respond)
492
- original_buggy = self._current_code
493
- self._current_code = action.fix_code
494
-
495
- runner = TestRunner(self._get_test_runner_bug_id())
496
- test_score, test_output = runner.run_tests(self._current_code)
497
- lint_score = self._run_linter_score(self._current_code)
498
- negotiation_score = self._author.get_negotiation_score()
499
-
500
- self._current_test_score = test_score
501
- self._current_lint_score = lint_score
502
-
503
- # Author gating – determines if the episode ends, reward is separate
504
- threshold = self._author.thresholds.get(self._author.personality, 0.5)
505
- if self._author._confidence < threshold:
506
- if self._step_count < self.max_steps:
507
- self._done = False
508
- else:
509
- self._done = True
510
- else:
511
- self._done = True
512
-
513
- # Get author's verbal feedback (pushback/acceptance)
514
- author_feedback = self._author.respond(
515
- agent_comment=f"Proposed fix:\n{action.fix_code}",
516
- test_results=f"Score: {test_score:.2f}",
517
- lint_results=f"Score: {lint_score:.2f}",
518
- doc_results=self._doc_results,
519
- proposed_fix=action.fix_code,
520
- original_code=original_buggy # now correctly the buggy code, not the fix
521
- )
522
- self._test_results = f"[Fix] Author: {author_feedback[:200]}"
523
- self._comments.append(f"Author: {author_feedback}")
524
- self._last_author_response = author_feedback
525
-
526
- base_reward = 0.001 # rubrics provide the real signal
527
-
528
- # ==============================================================
529
- # TERMINATION ACTIONS
530
- # ==============================================================
531
- elif isinstance(action, Skip):
532
- base_reward = -0.03
533
- self._done = True
534
-
535
- elif isinstance(action, Done):
536
- if self._tests_run:
537
- base_reward = self._current_test_score * 0.5 - 0.2
538
- else:
539
- base_reward = -0.04
540
- self._done = True
541
-
542
- else:
543
- base_reward = -0.02
544
- self._done = True
545
-
546
- # ==============================================================
547
- # STEP UPDATE (before rubric computation so info contains final step)
548
- # ==============================================================
549
- self._step_count += 1
550
- if self._step_count >= self.max_steps:
551
- self._done = True
552
-
553
- # Get fresh observation (needed for rubrics that may read obs)
554
- obs = self._get_observation()
555
-
556
- # Prepare info dict (rubrics may need action_type and deltas)
557
- info = {
558
- "action_type": action_type,
559
- "test_score": self._current_test_score,
560
- "lint_score": self._current_lint_score,
561
- "test_delta": self._current_test_score - self._previous_test_score,
562
- "lint_delta": self._current_lint_score - self._previous_lint_score,
563
- "prev_tests_run": prev_tests_run,
564
- "prev_linter_run": prev_linter_run,
565
- "prev_docs_queried": prev_docs_queried,
566
- "docs_query_len": len((action.query_topic or "").strip()) if isinstance(action, QueryDocs) else 0,
567
- "base_reward": base_reward,
568
- }
569
-
570
- # ==============================================================
571
- # COMPUTE FINAL REWARD USING RUBRICS
572
- # ==============================================================
573
- rubric_score = sum(r(self, action, obs, None, self._done, info) for r in self.rubrics)
574
- final_reward = 0.4 * base_reward + rubric_score
575
- final_reward = max(-1.0, min(1.0, final_reward)) # safety clip
576
-
577
- # Track cumulative episode reward
578
- self._episode_total_reward += final_reward
579
-
580
- # Store episode total if done
581
- if self._done:
582
- self._episode_rewards.append(self._episode_total_reward)
583
-
584
- # Complete info
585
- info["final_reward"] = final_reward
586
- info["episode_total"] = self._episode_total_reward
587
-
588
- return obs, Reward(value=final_reward), self._done, info
589
-
590
- # ===================================================================
591
- def _run_linter_score(self, code: str) -> float:
592
- """Run pylint and return normalized score [0, 1]."""
593
- try:
594
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
595
- f.write(code)
596
- tmp_path = f.name
597
-
598
  result = subprocess.run(
599
- ['pylint', tmp_path, '--score=y', '--exit-zero'],
600
  capture_output=True,
601
  text=True,
602
  timeout=5
603
- )
604
-
605
- match = re.search(r"rated at (\d+\.\d+)/10", result.stdout)
606
- if match:
607
- return float(match.group(1)) / 10.0
608
- return 0.0
609
- except:
610
- return 0.0
611
- finally:
612
- try:
613
- os.unlink(tmp_path)
614
- except:
615
- pass
616
-
617
- # ===================================================================
618
- def state(self) -> State:
619
- """Legacy compatibility."""
620
- return State(
621
- pr_title="Code Review",
622
- pr_description=self._bug_description,
623
- code_snippet=self._current_code,
624
- comments=self._comments.copy(),
625
- test_results=self._test_results,
626
- step=self._step_count,
627
- done=self._done
628
- )
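The rubric classes themselves live in rubrics.py (changed in this commit but not expanded on this page). From the call site in `step()`, `sum(r(self, action, obs, None, self._done, info) for r in self.rubrics)`, a custom rubric only needs to be a callable with that signature; a minimal illustrative one, assuming that inferred interface, could be:

```python
# Illustrative rubric matching the call site in CodeReviewEnv.step();
# the real TestDeltaRubric and friends are defined in rubrics.py.
class TestDeltaOnlyRubric:
    def __init__(self, weight: float = 0.3):
        self.weight = weight

    def __call__(self, env, action, obs, next_obs, done, info) -> float:
        # Reward only the per-step improvement in test score.
        return self.weight * info.get("test_delta", 0.0)
```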
 
1
+ # environment.py – FULLY CORRECTED RL Environment (TRUE Markov + Fixed Bugs)
2
+
3
+ import sys
4
+ import subprocess
5
+ import tempfile
6
+ import os
7
+ import re
8
+ from dataclasses import dataclass, field
9
+ from typing import Tuple, Dict, Any, Optional, List
10
+
11
+ from models import (
12
+ AnyAction, WriteComment, ProposeFix, Execute, Inspect,
13
+ RunLinter, RunTests, QueryDocs, Skip, Done, AskQuestion,
14
+ Observation, Reward, State
15
+ )
16
+ from redteam import RedTeam
17
+ from test_runner import TestRunner
18
+ from author import PersonaAuthor
19
+ from rltool import ToolBox
20
+ from rubrics import (
21
+ ToolUsageRubric,
22
+ TestDeltaRubric,
23
+ LintDeltaRubric,
24
+ TerminalSuccessRubric,
25
+ ExplorationRubric,
26
+ AntiHackingRubric,
27
+ StepPenaltyRubric,
28
+ )
29
+
30
+ # ======================================================================
31
+ # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
32
+ # ======================================================================
33
+ @dataclass
34
+ class EnhancedObservation:
35
+ code_snippet: str
36
+ last_tool_output: str
37
+
38
+ current_test_score: float
39
+ current_lint_score: float
40
+ negotiation_score: float
41
+
42
+ previous_test_score: float
43
+ previous_lint_score: float
44
+
45
+ author_confidence: float
46
+ author_threshold: float
47
+
48
+ step: int
49
+ max_steps: int
50
+ progress_ratio: float
51
+
52
+ tests_run: bool
53
+ linter_run: bool
54
+ docs_queried: bool
55
+
56
+ last_action_type: str
57
+ action_history: List[str]
58
+
59
+ done: bool
60
+
61
+ bug_description: str
62
+ comments_count: int
63
+
64
+ # default fields must be at the very end
65
+ author_response: str = ""
66
+
67
+ # ======================================================================
68
+ # HELPER FUNCTIONS
69
+ # ======================================================================
70
+ def execute_code(code: str, timeout_sec: int = 5) -> Tuple[bool, str, str]:
71
+ if not code.strip():
72
+ return False, "", "Error: Empty code"
73
+
74
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
75
+ f.write(code)
76
+ tmp_path = f.name
77
+
78
+ try:
79
+ result = subprocess.run(
80
+ [sys.executable, tmp_path],
81
+ capture_output=True,
82
+ text=True,
83
+ timeout=timeout_sec
84
+ )
85
+ success = (result.returncode == 0)
86
+ return success, result.stdout, result.stderr
87
+ except subprocess.TimeoutExpired:
88
+ return False, "", f"Timeout after {timeout_sec}s"
89
+ except Exception as e:
90
+ return False, "", f"Execution error: {str(e)}"
91
+ finally:
92
+ try:
93
+ os.unlink(tmp_path)
94
+ except:
95
+ pass
96
+
97
+
98
+ # ======================================================================
99
+ # ENHANCED CODE REVIEW ENVIRONMENT
100
+ # ======================================================================
101
+ @dataclass
102
+ class CodeReviewEnv:
103
+ task: str = "easy"
104
+ max_steps: int = 10
105
+ step_penalty: float = 0.01
106
+ reward_profile: str = "full" # "full" or "core"
107
+
108
+ # Curriculum learning
109
+ auto_difficulty: bool = False
110
+ success_threshold: float = 0.7
111
+
112
+ # Reward shaping parameters
113
+ delta_weight: float = 0.3
114
+ tool_usage_bonus: float = 0.05
115
+ diversity_bonus: float = 0.03
116
+
117
+ _red_team: Optional[RedTeam] = field(init=False, default=None)
118
+ _author: Optional[PersonaAuthor] = field(init=False, default=None)
119
+
120
+ _current_code: str = field(init=False, default="")
121
+ _current_bug_id: str = field(init=False, default="")
122
+ _bug_description: str = field(init=False, default="")
123
+ _oracle_fix: str = field(init=False, default="")
124
+
125
+ _comments: list = field(init=False, default_factory=list)
126
+ _test_results: Optional[str] = field(init=False, default=None)
127
+ _lint_results: Optional[str] = field(init=False, default=None)
128
+ _doc_results: Optional[str] = field(init=False, default=None)
129
+
130
+ _step_count: int = field(init=False, default=0)
131
+ _done: bool = field(init=False, default=False)
132
+
133
+ # State tracking for dense rewards
134
+ _previous_test_score: float = field(init=False, default=0.0)
135
+ _previous_lint_score: float = field(init=False, default=0.0)
136
+ _current_test_score: float = field(init=False, default=0.0)
137
+ _current_lint_score: float = field(init=False, default=0.0)
138
+
139
+ # Tool usage tracking
140
+ _tests_run: bool = field(init=False, default=False)
141
+ _linter_run: bool = field(init=False, default=False)
142
+ _docs_queried: bool = field(init=False, default=False)
143
+
144
+ # Action history
145
+ _action_history: List[str] = field(init=False, default_factory=list)
146
+ _last_action_type: str = field(init=False, default="none")
147
+ _last_author_response: str = field(init=False, default="")
148
+
149
+ # FIXED: Track CUMULATIVE episode reward
150
+ _episode_total_reward: float = field(init=False, default=0.0)
151
+ _episode_rewards: List[float] = field(init=False, default_factory=list)
152
+ _difficulty_level: int = field(init=False, default=0)
153
+
154
+ # Bug-id bridge:
155
+ # RedTeam has fine-grained IDs, while TestRunner currently expects a
156
+ # smaller canonical set. Keep this mapping here so both modules can evolve
157
+ # independently without breaking evaluation.
158
+ _BUG_ID_CANONICAL_MAP = {
159
+ # Easy-family
160
+ "simple_typo": "null_check",
161
+ "default_value": "null_check",
162
+ "empty_return": "null_check",
163
+ "string_index": "off_by_one",
164
+
165
+ # Medium-family
166
+ "loop_skip": "off_by_one",
167
+ "sign_error": "wrong_operator",
168
+ "swap_args": "wrong_operator",
169
+ "uninitialised_var": "null_check",
170
+
171
+ # Hard-family
172
+ "division_by_zero_empty": "division_by_zero",
173
+ "division_by_zero_zero": "division_by_zero",
174
+ "float_precision": "division_by_zero",
175
+ "abs_usage": "division_by_zero",
176
+ "round_error": "division_by_zero",
177
+ }
178
+
179
+ # ===================================================================
180
+ def __post_init__(self):
181
+ self.set_task(self.task)
182
+
183
+ # ===================================================================
184
+ def _build_rubrics(self):
185
+ """
186
+ Build rubric stack from a named reward profile.
187
+ - full: richer shaping for exploration/tool-use behavior
188
+ - core: minimal stable signal for quick ablations/baselines
189
+ """
190
+ core_rubrics = [
191
+ TestDeltaRubric(weight=self.delta_weight),
192
+ LintDeltaRubric(weight=self.delta_weight),
193
+ TerminalSuccessRubric(),
194
+ StepPenaltyRubric(penalty=self.step_penalty),
195
+ ]
196
+ if self.reward_profile == "core":
197
+ return core_rubrics
198
+ if self.reward_profile == "full":
199
+ return [
200
+ *core_rubrics[:-1], # step penalty appended at end for consistent ordering
201
+ ToolUsageRubric(bonus=self.tool_usage_bonus),
202
+ ExplorationRubric(penalty=-0.05, bonus=self.diversity_bonus * 0.7),
203
+ AntiHackingRubric(),
204
+ core_rubrics[-1],
205
+ ]
206
+ raise ValueError(f"Unknown reward_profile: {self.reward_profile}")
207
+
208
+ # ===================================================================
209
+ def set_task(self, task: str):
210
+ if task not in ["easy", "medium", "hard", "harder", "hardest"]:
211
+ raise ValueError(f"Unknown task: {task}")
212
+
213
+ self.task = task
214
+ # Use stochastic bug sampling across episodes; fixed seed here would
215
+ # repeatedly select the same bug and weaken training diversity.
216
+ self._red_team = RedTeam(task, seed=None)
217
+ self._author = PersonaAuthor()
218
+ self.rubrics = self._build_rubrics()
219
+
220
+ task_to_level = {
221
+ "easy": 0, "medium": 1, "hard": 2,
222
+ "harder": 3, "hardest": 4
223
+ }
224
+ self._difficulty_level = task_to_level[task]
225
+
226
+ self._reset_internal()
227
+
228
+ # ===================================================================
229
+ def _reset_internal(self):
230
+ self._step_count = 0 # ← FIXED
231
+ self._comments = []
232
+ self._test_results = None
233
+ self._lint_results = None
234
+ self._doc_results = None
235
+ self._done = False
236
+
237
+ # Reset state tracking
238
+ self._previous_test_score = 0.0
239
+ self._previous_lint_score = 0.0
240
+ self._current_test_score = 0.0
241
+ self._current_lint_score = 0.0
242
+
243
+ self._tests_run = False
244
+ self._linter_run = False
245
+ self._docs_queried = False
246
+
247
+ self._action_history = []
248
+ self._last_action_type = "none"
249
+ self._last_author_response = ""
250
+
251
+ # FIXED: Reset episode cumulative reward
252
+ self._episode_total_reward = 0.0
253
+
254
+ self._author.reset()
255
+
256
+ # Base tasks
257
+ if self.task == "easy":
258
+ original = "def get_user(id):\n if id in users:\n return users[id]"
259
+ elif self.task == "medium":
260
+ original = "def process_items(items):\n for item in items:\n print(item)"
261
+ elif self.task == "hard":
262
+ original = "def average(data):\n if not data:\n return 0\n return sum(data) / len(data)"
263
+ elif self.task == "harder":
264
+ original = "counter = 0\ndef increment():\n global counter\n with lock:\n counter += 1"
265
+ else:
266
+ original = "def safe_work():\n with lock1:\n with lock2:\n do_work()"
267
+
268
+ buggy_code, bug_id, desc, oracle = self._red_team.inject_bug(original)
269
+ self._current_code = buggy_code
270
+ self._current_bug_id = bug_id
271
+ self._bug_description = desc
272
+ self._oracle_fix = oracle
273
+ self._comments.append(f"[RedTeam] {desc}")
274
+
275
+ # ===================================================================
276
+ def reset(self) -> EnhancedObservation:
277
+ """Reset with optional curriculum adjustment."""
278
+ if self.auto_difficulty and len(self._episode_rewards) > 0:
279
+ recent_performance = sum(self._episode_rewards[-5:]) / min(5, len(self._episode_rewards))
280
+
281
+ if recent_performance > self.success_threshold and self._difficulty_level < 4:
282
+ self._difficulty_level += 1
283
+ print(f"[Curriculum] Increasing difficulty to level {self._difficulty_level}")
284
+ elif recent_performance < 0.3 and self._difficulty_level > 0:
285
+ self._difficulty_level -= 1
286
+ print(f"[Curriculum] Decreasing difficulty to level {self._difficulty_level}")
287
+
288
+ level_to_task = {0: "easy", 1: "medium", 2: "hard", 3: "harder", 4: "hardest"}
289
+ self.task = level_to_task[self._difficulty_level]
290
+ # Keep curriculum stochastic for better coverage within each level.
291
+ self._red_team = RedTeam(self.task, seed=None)
292
+
293
+ self._reset_internal()
294
+ return self._get_observation()
295
+
296
+ # ===================================================================
297
+ def _get_observation(self) -> EnhancedObservation:
298
+ """Return COMPLETE Markov state."""
299
+ # Keep the author's message separate from tool output.
300
+ # Using `_test_results` here can leak unrelated outputs (tests/linter/docs)
301
+ # and gives the policy a noisy signal for dialogue actions.
302
+ if self._last_action_type in ("comment", "question", "fix"):
303
+ author_response = self._last_author_response
304
+ else:
305
+ author_response = ""
306
+
307
+ return EnhancedObservation(
308
+ code_snippet=self._current_code,
309
+ last_tool_output=self._test_results or "",
310
+ author_response=author_response, # ← now field exists
311
+
312
+ current_test_score=self._current_test_score,
313
+ current_lint_score=self._current_lint_score,
314
+ negotiation_score=self._author.get_negotiation_score(),
315
+
316
+ previous_test_score=self._previous_test_score,
317
+ previous_lint_score=self._previous_lint_score,
318
+
319
+ author_confidence=self._author._confidence,
320
+ author_threshold=self._author.thresholds.get(self._author.personality, 0.5),
321
+
322
+ step=self._step_count,
323
+ max_steps=self.max_steps,
324
+ # Guard against accidental `max_steps=0` configs.
325
+ progress_ratio=(self._step_count / self.max_steps) if self.max_steps > 0 else 1.0,
326
+
327
+ tests_run=self._tests_run,
328
+ linter_run=self._linter_run,
329
+ docs_queried=self._docs_queried,
330
+
331
+ last_action_type=self._last_action_type,
332
+ action_history=self._action_history[-5:],
333
+
334
+ done=self._done,
335
+
336
+ bug_description=self._bug_description,
337
+ comments_count=len(self._comments),
338
+ )
339
+
340
+ # ===================================================================
341
+ def _get_action_type(self, action: AnyAction) -> str:
342
+ """Extract action type as string."""
343
+ if isinstance(action, RunTests):
344
+ return "run_tests"
345
+ elif isinstance(action, RunLinter):
346
+ return "run_linter"
347
+ elif isinstance(action, QueryDocs):
348
+ return "query_docs"
349
+ elif isinstance(action, Execute):
350
+ return "execute"
351
+ elif isinstance(action, Inspect):
352
+ return "inspect"
353
+ elif isinstance(action, WriteComment):
354
+ return "comment"
355
+ elif isinstance(action, AskQuestion):
356
+ return "question"
357
+ elif isinstance(action, ProposeFix):
358
+ return "fix"
359
+ elif isinstance(action, Done):
360
+ return "done"
361
+ elif isinstance(action, Skip):
362
+ return "skip"
363
+ else:
364
+ return "unknown"
365
+
366
+ # ===================================================================
367
+ def _get_test_runner_bug_id(self) -> str:
368
+ """
369
+ Normalize RedTeam bug ids to the canonical ids understood by TestRunner.
370
+ Falls back to the original id for known direct matches.
371
+ """
372
+ return self._BUG_ID_CANONICAL_MAP.get(self._current_bug_id, self._current_bug_id)
373
+
374
+ # ===================================================================
375
+ def step(self, action: AnyAction) -> Tuple[EnhancedObservation, Reward, bool, Dict[str, Any]]:
376
+ """
377
+ TRUE RL STEP with:
378
+ - Complete Markov observations (no hidden state)
379
+ - Dense intermediate rewards
380
+ - Delta-based credit assignment (no double-counting)
381
+ - Proper episode reward tracking
382
+ """
383
+ if self._done:
384
+ raise RuntimeError("Episode already finished")
385
+
386
+ # Store previous metrics for delta computation
387
+ self._previous_test_score = self._current_test_score
388
+ self._previous_lint_score = self._current_lint_score
389
+ # Snapshot tool-usage flags BEFORE action mutates them.
390
+ # Rubrics use these to detect true "first-use" behavior.
391
+ prev_tests_run = self._tests_run
392
+ prev_linter_run = self._linter_run
393
+ prev_docs_queried = self._docs_queried
394
+
395
+ base_reward = 0.0
396
+ action_type = self._get_action_type(action)
397
+
398
+ # Update action history
399
+ self._action_history.append(action_type)
400
+ self._last_action_type = action_type
401
+
402
+ # ==============================================================
403
+ # TOOL ACTIONS
404
+ # ==============================================================
405
+ if isinstance(action, Execute):
406
+ success, stdout, stderr = execute_code(self._current_code)
407
+ output = (stdout + stderr).strip() or "No output"
408
+ self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
409
+ base_reward = 0.001 if success else -0.05
410
+
411
+ elif isinstance(action, Inspect):
412
+ self._test_results = f"[Inspect]\n{self._current_code[:500]}"
413
+ base_reward = 0.001
414
+
415
+ elif isinstance(action, RunLinter):
416
+ lint_output = ToolBox.run_linter(self._current_code)
417
+ self._lint_results = lint_output[:500]
418
+ self._test_results = f"[Linter]\n{self._lint_results}"
419
+
420
+ self._current_lint_score = self._run_linter_score(self._current_code)
421
+ self._linter_run = True
422
+ base_reward = 0.002
423
+
424
+ elif isinstance(action, RunTests):
425
+ runner = TestRunner(self._get_test_runner_bug_id())
426
+ score, output = runner.run_tests(self._current_code)
427
+
428
+ self._current_test_score = score
429
+ self._tests_run = True
430
+
431
+ self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
432
+ base_reward = 0.002
433
+
434
+ if score > 0.8:
435
+ base_reward += 0.005
436
+
437
+ elif isinstance(action, QueryDocs):
438
+ # Normalize query to avoid rewarding empty/noisy requests.
439
+ query_topic = (action.query_topic or "").strip()
440
+ doc = ToolBox.query_docs(query_topic if query_topic else "general bug fixing")
441
+ self._doc_results = doc
442
+ self._test_results = f"[Docs]\n{doc[:400]}"
443
+ self._docs_queried = True
444
+ base_reward = 0.001
445
+
446
+ # ==============================================================
447
+ # COMMUNICATION ACTIONS
448
+ # ==============================================================
449
+ elif isinstance(action, WriteComment):
450
+ self._comments.append(f"Agent: {action.comment_text}")
451
+
452
+ response = self._author.respond(
453
+ agent_comment=action.comment_text,
454
+ test_results=self._test_results,
455
+ lint_results=self._lint_results,
456
+ doc_results=self._doc_results,
457
+ proposed_fix=None,
458
+ original_code=self._current_code
459
+ )
460
+
461
+ self._comments.append(f"Author: {response}")
462
+ self._last_author_response = response
463
+ self._test_results = f"[Comment] Author: {response[:200]}"
464
+ base_reward = 0.001
465
+
466
+ elif isinstance(action, AskQuestion):
467
+ self._comments.append(f"Agent: {action.question}")
468
+
469
+ response = self._author.respond(
470
+ agent_question=action.question,
471
+ test_results=self._test_results,
472
+ lint_results=self._lint_results,
473
+ doc_results=self._doc_results,
474
+ proposed_fix=None,
475
+ original_code=self._current_code  # the code currently under review
476
+ )
477
+
478
+ self._comments.append(f"Author: {response}")
479
+ self._last_author_response = response
480
+ self._test_results = f"[Question] Author: {response[:200]}"
481
+ base_reward = 0.002
482
+
483
+ # ==============================================================
484
+ # FINAL FIX ACTION
485
+ # ==============================================================
486
+ elif isinstance(action, ProposeFix):
487
+ if not action.fix_code:
488
+ base_reward = -0.05
489
+ self._done = True
490
+ else:
491
+ # Save original code BEFORE overwriting (for author.respond)
492
+ original_buggy = self._current_code
493
+ self._current_code = action.fix_code
494
+
495
+ runner = TestRunner(self._get_test_runner_bug_id())
496
+ test_score, test_output = runner.run_tests(self._current_code)
497
+ lint_score = self._run_linter_score(self._current_code)
498
+ negotiation_score = self._author.get_negotiation_score()
499
+
500
+ self._current_test_score = test_score
501
+ self._current_lint_score = lint_score
502
+
503
+ # Author gating – determines if the episode ends, reward is separate
504
+ threshold = self._author.thresholds.get(self._author.personality, 0.5)
505
+ if self._author._confidence < threshold:
506
+ if self._step_count < self.max_steps:
507
+ self._done = False
508
+ else:
509
+ self._done = True
510
+ else:
511
+ self._done = True
512
+
513
+ # Get author's verbal feedback (pushback/acceptance)
514
+ author_feedback = self._author.respond(
515
+ agent_comment=f"Proposed fix:\n{action.fix_code}",
516
+ test_results=f"Score: {test_score:.2f}",
517
+ lint_results=f"Score: {lint_score:.2f}",
518
+ doc_results=self._doc_results,
519
+ proposed_fix=action.fix_code,
520
+ original_code=original_buggy  # the buggy code prior to the fix, not the proposed fix
521
+ )
522
+ self._test_results = f"[Fix] Author: {author_feedback[:200]}"
523
+ self._comments.append(f"Author: {author_feedback}")
524
+ self._last_author_response = author_feedback
525
+
526
+ base_reward = 0.001 # rubrics provide the real signal
527
+
528
+ # ==============================================================
529
+ # TERMINATION ACTIONS
530
+ # ==============================================================
531
+ elif isinstance(action, Skip):
532
+ base_reward = -0.03
533
+ self._done = True
534
+
535
+ elif isinstance(action, Done):
536
+ if self._tests_run:
537
+ base_reward = self._current_test_score * 0.5 - 0.2
538
+ else:
539
+ base_reward = -0.04
540
+ self._done = True
541
+
542
+ else:
543
+ base_reward = -0.02
544
+ self._done = True
545
+
546
+ # ==============================================================
547
+ # STEP UPDATE (before rubric computation so info contains final step)
548
+ # ==============================================================
549
+ self._step_count += 1
550
+ if self._step_count >= self.max_steps:
551
+ self._done = True
552
+
553
+ # Get fresh observation (needed for rubrics that may read obs)
554
+ obs = self._get_observation()
555
+
556
+ # Prepare info dict (rubrics may need action_type and deltas)
557
+ info = {
558
+ "action_type": action_type,
559
+ "test_score": self._current_test_score,
560
+ "lint_score": self._current_lint_score,
561
+ "test_delta": self._current_test_score - self._previous_test_score,
562
+ "lint_delta": self._current_lint_score - self._previous_lint_score,
563
+ "prev_tests_run": prev_tests_run,
564
+ "prev_linter_run": prev_linter_run,
565
+ "prev_docs_queried": prev_docs_queried,
566
+ "docs_query_len": len((action.query_topic or "").strip()) if isinstance(action, QueryDocs) else 0,
567
+ "base_reward": base_reward,
568
+ }
569
+
570
+ # ==============================================================
571
+ # COMPUTE FINAL REWARD USING RUBRICS
572
+ # ==============================================================
573
+ rubric_score = sum(r(self, action, obs, None, self._done, info) for r in self.rubrics)
574
+ final_reward = 0.4 * base_reward + rubric_score
575
+ final_reward = max(-1.0, min(1.0, final_reward)) # safety clip
576
+
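+ # Illustrative worked example (assumes the rubric set from rubrics.py): a first
+ # run_tests that lifts the test score 0.0 -> 0.5 gives base_reward = 0.002 and
+ # rubric_score = 0.05 + 0.015 + 0.3*0.5 - 0.01 = 0.205, so
+ # final_reward = 0.4*0.002 + 0.205 ≈ 0.206 (well inside the clip range).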
577
+ # Track cumulative episode reward
578
+ self._episode_total_reward += final_reward
579
+
580
+ # Store episode total if done
581
+ if self._done:
582
+ self._episode_rewards.append(self._episode_total_reward)
583
+
584
+ # Complete info
585
+ info["final_reward"] = final_reward
586
+ info["episode_total"] = self._episode_total_reward
587
+
588
+ return obs, Reward(value=final_reward), self._done, info
589
+
590
+ # ===================================================================
591
+ def _run_linter_score(self, code: str) -> float:
592
+ """Run pylint and return normalized score [0, 1]."""
593
+ try:
594
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
595
+ f.write(code)
596
+ tmp_path = f.name
597
+
598
  result = subprocess.run(
599
+ [sys.executable, '-m', 'pylint', tmp_path, '--score=y', '--exit-zero'],
600
  capture_output=True,
601
  text=True,
602
  timeout=5
603
+ )
604
+
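+ # pylint's summary line reads "Your code has been rated at 8.50/10"; 8.50 maps to 0.85.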
605
+ match = re.search(r"rated at (\d+\.\d+)/10", result.stdout)
606
+ if match:
607
+ return float(match.group(1)) / 10.0
608
+ return 0.0
609
+ except Exception:
610
+ return 0.0
611
+ finally:
612
+ try:
613
+ os.unlink(tmp_path)
614
+ except Exception:
615
+ pass
616
+
617
+ # ===================================================================
618
+ def state(self) -> State:
619
+ """Legacy compatibility."""
620
+ return State(
621
+ pr_title="Code Review",
622
+ pr_description=self._bug_description,
623
+ code_snippet=self._current_code,
624
+ comments=self._comments.copy(),
625
+ test_results=self._test_results,
626
+ step=self._step_count,
627
+ done=self._done
628
+ )
grader.py CHANGED
@@ -1,11 +1,12 @@
1
  # grader.py – Production‑grade, continuous reward, exploit‑aware, example of monolithic scoring
2
  import ast
3
- import subprocess
4
- import tempfile
5
- import os
6
- import re
7
- from dataclasses import dataclass
8
- from typing import Optional
 
9
 
10
  @dataclass
11
  class RigorousGrader:
@@ -105,11 +106,11 @@ class RigorousGrader:
105
  f.write(code)
106
  f.flush()
107
  tmp_path = f.name
108
- result = subprocess.run(
109
- ['pylint', tmp_path, '--score=y', '--exit-zero'],
110
- capture_output=True,
111
- text=True,
112
- timeout=5
113
  )
114
  match = re.search(r"rated at (\d+\.\d+)/10", result.stdout)
115
  if match:
@@ -139,4 +140,4 @@ class RigorousGrader:
139
  total = max(len(nodes_prop), len(nodes_oracle))
140
  return common / total if total > 0 else 0.0
141
  except:
142
- return 0.0
 
1
  # grader.py – Production‑grade, continuous reward, exploit‑aware, example of monolithic scoring
2
  import ast
3
+ import subprocess
4
+ import tempfile
5
+ import os
6
+ import re
7
+ import sys
8
+ from dataclasses import dataclass
9
+ from typing import Optional
10
 
11
  @dataclass
12
  class RigorousGrader:
 
106
  f.write(code)
107
  f.flush()
108
  tmp_path = f.name
109
+ result = subprocess.run(
110
+ [sys.executable, '-m', 'pylint', tmp_path, '--score=y', '--exit-zero'],
111
+ capture_output=True,
112
+ text=True,
113
+ timeout=5
114
  )
115
  match = re.search(r"rated at (\d+\.\d+)/10", result.stdout)
116
  if match:
 
140
  total = max(len(nodes_prop), len(nodes_oracle))
141
  return common / total if total > 0 else 0.0
142
  except:
143
+ return 0.0
models.py CHANGED
@@ -1,112 +1,112 @@
1
- # models.py – Typed Models (Discriminated Unions, POMDP Separation)
2
- from typing import Literal, Union, Annotated, Optional
3
- from pydantic import BaseModel, Field, TypeAdapter, field_validator
4
-
5
- # ----------------------------------------------------------------------
6
- # Action classes (discriminated union)
7
- # ----------------------------------------------------------------------
8
- class Action(BaseModel):
9
- action_type: Literal["comment", "skip", "done", "question",
10
- "fix", "execute", "inspect", "run_linter",
11
- "run_tests", "query_docs"]
12
-
13
- class WriteComment(Action):
14
- action_type: Literal["comment"] = "comment"
15
- comment_text: str = Field(..., min_length=1)
16
-
17
- class Skip(Action):
18
- action_type: Literal["skip"] = "skip"
19
-
20
- class Done(Action):
21
- action_type: Literal["done"] = "done"
22
-
23
- class AskQuestion(Action):
24
- action_type: Literal["question"] = "question"
25
- question: str = Field(..., min_length=1)
26
-
27
- class ProposeFix(Action):
28
- action_type: Literal["fix"] = "fix"
29
- fix_code: str = Field(..., min_length=1)
30
- @field_validator('fix_code')
31
- @classmethod
32
- def not_empty(cls, v: str) -> str:
33
- if not v.strip():
34
- raise ValueError('fix_code cannot be empty')
35
- return v
36
-
37
- class Execute(Action):
38
- action_type: Literal["execute"] = "execute"
39
-
40
- class Inspect(Action):
41
- action_type: Literal["inspect"] = "inspect"
42
-
43
- class RunLinter(Action):
44
- action_type: Literal["run_linter"] = "run_linter"
45
-
46
- class RunTests(Action):
47
- action_type: Literal["run_tests"] = "run_tests"
48
-
49
- class QueryDocs(Action):
50
- action_type: Literal["query_docs"] = "query_docs"
51
- query_topic: str = Field(..., min_length=1)
52
-
53
- # Discriminated union for one‑line polymorphic deserialization
54
- AnyAction = Annotated[
55
- Union[WriteComment, Skip, Done, AskQuestion, ProposeFix,
56
- Execute, Inspect, RunLinter, RunTests, QueryDocs],
57
- Field(discriminator='action_type')
58
- ]
59
- action_adapter = TypeAdapter(AnyAction)
60
-
61
-
62
- def map_to_env(action_type: str, content: Optional[str] = None) -> AnyAction:
63
- """
64
- Convert lightweight agent outputs into typed environment actions.
65
- Kept at module level so training/inference code can reuse one mapping.
66
- """
67
- if action_type == "run_tests":
68
- return RunTests()
69
- if action_type == "run_linter":
70
- return RunLinter()
71
- if action_type == "inspect":
72
- return Inspect()
73
- if action_type == "fix":
74
- return ProposeFix(fix_code=content or "")
75
- if action_type == "comment":
76
- return WriteComment(comment_text=content or "")
77
- if action_type == "question":
78
- return AskQuestion(question=content or "")
79
- if action_type == "query_docs":
80
- return QueryDocs(query_topic=content or "")
81
- if action_type == "done":
82
- return Done()
83
- return Skip()
84
-
85
- # ----------------------------------------------------------------------
86
- # Observation (POMDP – what the agent sees)
87
- # ----------------------------------------------------------------------
88
- class Observation(BaseModel):
89
- # Base schema model used by API metadata endpoints.
90
- # Keep this lightweight for compatibility with legacy callers.
91
- code_snippet: str
92
- last_tool_output: str = ""
93
- step: int = 0
94
- done: bool = False
95
-
96
- # ----------------------------------------------------------------------
97
- # Reward (lightweight)
98
- # ----------------------------------------------------------------------
99
- class Reward(BaseModel):
100
- value: float
101
-
102
- # ----------------------------------------------------------------------
103
- # State (full environment state – not exposed to agent)
104
- # ----------------------------------------------------------------------
105
- class State(BaseModel):
106
- pr_title: str
107
- pr_description: str
108
- code_snippet: str
109
- comments: list[str]
110
- test_results: Optional[str]
111
- step: int
112
  done: bool
 
1
+ # models.py – Typed Models (Discriminated Unions, POMDP Separation)
2
+ from typing import Literal, Union, Annotated, Optional
3
+ from pydantic import BaseModel, Field, TypeAdapter, field_validator
4
+
5
+ # ----------------------------------------------------------------------
6
+ # Action classes (discriminated union)
7
+ # ----------------------------------------------------------------------
8
+ class Action(BaseModel):
9
+ action_type: Literal["comment", "skip", "done", "question",
10
+ "fix", "execute", "inspect", "run_linter",
11
+ "run_tests", "query_docs"]
12
+
13
+ class WriteComment(Action):
14
+ action_type: Literal["comment"] = "comment"
15
+ comment_text: str = Field(..., min_length=1)
16
+
17
+ class Skip(Action):
18
+ action_type: Literal["skip"] = "skip"
19
+
20
+ class Done(Action):
21
+ action_type: Literal["done"] = "done"
22
+
23
+ class AskQuestion(Action):
24
+ action_type: Literal["question"] = "question"
25
+ question: str = Field(..., min_length=1)
26
+
27
+ class ProposeFix(Action):
28
+ action_type: Literal["fix"] = "fix"
29
+ fix_code: str = Field(..., min_length=1)
30
+ @field_validator('fix_code')
31
+ @classmethod
32
+ def not_empty(cls, v: str) -> str:
33
+ if not v.strip():
34
+ raise ValueError('fix_code cannot be empty')
35
+ return v
36
+
37
+ class Execute(Action):
38
+ action_type: Literal["execute"] = "execute"
39
+
40
+ class Inspect(Action):
41
+ action_type: Literal["inspect"] = "inspect"
42
+
43
+ class RunLinter(Action):
44
+ action_type: Literal["run_linter"] = "run_linter"
45
+
46
+ class RunTests(Action):
47
+ action_type: Literal["run_tests"] = "run_tests"
48
+
49
+ class QueryDocs(Action):
50
+ action_type: Literal["query_docs"] = "query_docs"
51
+ query_topic: str = Field(..., min_length=1)
52
+
53
+ # Discriminated union for one‑line polymorphic deserialization
54
+ AnyAction = Annotated[
55
+ Union[WriteComment, Skip, Done, AskQuestion, ProposeFix,
56
+ Execute, Inspect, RunLinter, RunTests, QueryDocs],
57
+ Field(discriminator='action_type')
58
+ ]
59
+ action_adapter = TypeAdapter(AnyAction)
60
+
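+ # Usage sketch (pydantic v2): the discriminator picks the concrete Action class.
+ # action = action_adapter.validate_python({"action_type": "run_tests"})
+ # assert isinstance(action, RunTests)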
61
+
62
+ def map_to_env(action_type: str, content: Optional[str] = None) -> AnyAction:
63
+ """
64
+ Convert lightweight agent outputs into typed environment actions.
65
+ Kept at module level so training/inference code can reuse one mapping.
66
+ """
67
+ if action_type == "run_tests":
68
+ return RunTests()
69
+ if action_type == "run_linter":
70
+ return RunLinter()
71
+ if action_type == "inspect":
72
+ return Inspect()
73
+ if action_type == "fix":
74
+ return ProposeFix(fix_code=content or "")
75
+ if action_type == "comment":
76
+ return WriteComment(comment_text=content or "")
77
+ if action_type == "question":
78
+ return AskQuestion(question=content or "")
79
+ if action_type == "query_docs":
80
+ return QueryDocs(query_topic=content or "")
81
+ if action_type == "done":
82
+ return Done()
83
+ return Skip()
84
+
85
+ # ----------------------------------------------------------------------
86
+ # Observation (POMDP – what the agent sees)
87
+ # ----------------------------------------------------------------------
88
+ class Observation(BaseModel):
89
+ # Base schema model used by API metadata endpoints.
90
+ # Keep this lightweight for compatibility with legacy callers.
91
+ code_snippet: str
92
+ last_tool_output: str = ""
93
+ step: int = 0
94
+ done: bool = False
95
+
96
+ # ----------------------------------------------------------------------
97
+ # Reward (lightweight)
98
+ # ----------------------------------------------------------------------
99
+ class Reward(BaseModel):
100
+ value: float
101
+
102
+ # ----------------------------------------------------------------------
103
+ # State (full environment state – not exposed to agent)
104
+ # ----------------------------------------------------------------------
105
+ class State(BaseModel):
106
+ pr_title: str
107
+ pr_description: str
108
+ code_snippet: str
109
+ comments: list[str]
110
+ test_results: Optional[str]
111
+ step: int
112
  done: bool
pyproject.toml CHANGED
@@ -1,30 +1,38 @@
1
- [build-system]
2
- requires = ["setuptools>=61.0", "wheel"]
3
- build-backend = "setuptools.build_meta"
4
-
5
- [project]
6
- name = "code_review_professional"
7
- version = "1.0.0"
8
- description = "Multi‑turn code review environment with AST injection, DPO training, and author negotiation"
9
- authors = [{name = "yuvraj gupta", email = "yuvraj467229@gmail.com"}]
10
- license = {text = "MIT"}
11
- readme = "README.md"
12
- requires-python = ">=3.10"
13
- dependencies = [
14
- "openenv-core>=0.2.0",
15
- "fastapi>=0.115.0",
16
- "uvicorn>=0.24.0",
17
- "unsloth>=2025.3.1",
18
- "trl>=0.15.0",
19
- "accelerate>=1.2.0",
20
- "pylint>=3.3.0",
21
- "sentence-transformers>=3.3.0",
22
- "datasets>=3.3.0",
23
- "chromadb>=0.5.0",
24
- ]
25
-
26
- [project.optional-dependencies]
27
- dev = ["pytest>=7.0", "black>=23.0", "isort>=5.0"]
28
-
29
- [tool.openenv]
30
- server = "server.app:app"
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "code_review_professional"
7
+ version = "1.0.0"
8
+ description = "Multi-turn code review environment for OpenEnv with author negotiation and RL training hooks."
9
+ authors = [{name = "yuvraj gupta", email = "yuvraj467229@gmail.com"}]
10
+ license = {text = "MIT"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.10"
13
+ dependencies = [
14
+ "openenv-core>=0.2.3",
15
+ "fastapi>=0.115.0",
16
+ "uvicorn>=0.24.0",
17
+ "pylint>=3.3.0",
18
+ ]
19
+
20
+ [project.optional-dependencies]
21
+ dev = ["pytest>=7.0", "black>=23.0", "isort>=5.0"]
22
+ training = [
23
+ "accelerate>=1.2.0",
24
+ "chromadb>=0.5.0",
25
+ "datasets>=3.3.0",
26
+ "matplotlib>=3.9.0",
27
+ "sentence-transformers>=3.3.0",
28
+ "torch>=2.4.0",
29
+ "trl>=0.15.0",
30
+ "unsloth>=2025.3.1",
31
+ ]
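+ # Training extras are optional; install with: pip install ".[training]"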
32
+
33
+ [project.urls]
34
+ Homepage = "https://huggingface.co/spaces/100XZX001/CodeReview-Professional-Workflow"
35
+ Repository = "https://huggingface.co/spaces/100XZX001/CodeReview-Professional-Workflow"
36
+
37
+ [tool.openenv]
38
+ server = "server.app:app"
requirements-training.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ accelerate>=1.2.0
2
+ chromadb>=0.5.0
3
+ datasets>=3.3.0
4
+ matplotlib>=3.9.0
5
+ sentence-transformers>=3.3.0
6
+ torch>=2.4.0
7
+ transformers>=4.48.0
8
+ trl>=0.15.0
9
+ unsloth>=2025.3.1
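+ # Optional RL/training stack (unsloth/torch typically need a CUDA GPU);
+ # install with: pip install -r requirements-training.txt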
requirements.txt CHANGED
@@ -1,11 +1,5 @@
1
- unsloth
2
- transformers
3
- trl
4
- datasets
5
- torch
6
- sentence-transformers
7
- chromadb
8
- pylint
9
- pydantic
10
- matplotlib
11
- huggingface_hub
 
1
+ openenv-core>=0.2.3
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+ pylint>=3.3.0
5
+ pydantic>=2.8.0
 
 
 
 
 
 
rltool.py CHANGED
@@ -1,127 +1,143 @@
1
- # tools.py – Real vector retrieval for query_docs, linter, and test runner
2
- import subprocess
3
- import tempfile
4
- import os
5
- from dataclasses import dataclass
6
- from sentence_transformers import SentenceTransformer
7
- import chromadb
8
-
9
- @dataclass
10
- class ToolBox:
11
- _embedder = None
12
- _client = None
13
- _collection = None
14
-
15
- @classmethod
16
- def _get_embedder(cls):
17
- if cls._embedder is None:
18
- cls._embedder = SentenceTransformer('all-MiniLM-L6-v2')
19
- return cls._embedder
20
-
21
- @classmethod
22
- def _get_collection(cls):
23
- if cls._collection is None:
24
- cls._client = chromadb.Client()
25
- cls._collection = cls._client.create_collection("docs")
26
- # Pre‑load real documentation snippets (can be extended)
27
- docs = [
28
- "KeyError occurs when a dictionary key is missing. Use dict.get() or check 'if key in dict'.",
29
- "pylint error C0304: missing final newline. Add a newline at the end of file.",
30
- "Deadlock happens when two threads acquire locks in opposite order. Always acquire locks in the same order.",
31
- "Division by zero: check if list is empty before calculating average, or use try/except.",
32
- "Threading.Lock: use 'with lock:' to automatically acquire and release.",
33
- "Off‑by‑one errors: adjust loop ranges, e.g., range(1, len(arr)-1).",
34
- ]
35
- embedder = cls._get_embedder()
36
- embeddings = embedder.encode(docs).tolist()
37
- for i, doc in enumerate(docs):
38
- cls._collection.add(ids=[str(i)], documents=[doc], embeddings=[embeddings[i]])
39
- return cls._collection
40
-
41
- @staticmethod
42
- def run_linter(code: str) -> str:
43
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
44
- f.write(code)
45
- f.flush()
46
- tmp_path = f.name
47
- try:
48
- result = subprocess.run(
49
- ['pylint', tmp_path, '--exit-zero', '--output-format=text'],
50
- capture_output=True,
51
- text=True,
52
- timeout=10,
53
- encoding='utf-8'
54
- )
55
- output = result.stdout
56
- if "Your code has been rated" in output:
57
- output = output.split("Your code has been rated")[0]
58
- output = output.strip()
59
- if not output:
60
- return "No linting issues found."
61
- return output[:500]
62
- except FileNotFoundError:
63
- return "Linter (pylint) not installed."
64
- except subprocess.TimeoutExpired:
65
- return "Linter timed out."
66
- except Exception as e:
67
- return f"Linter error: {str(e)}"
68
- finally:
69
- try:
70
- os.unlink(tmp_path)
71
- except:
72
- pass
73
-
74
- @staticmethod
75
- def run_tests(test_script: str) -> str:
76
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
77
- f.write(test_script)
78
- f.flush()
79
- tmp_path = f.name
80
- try:
81
- result = subprocess.run(
82
- ['python', tmp_path],
83
- capture_output=True,
84
- text=True,
85
- timeout=10,
86
- encoding='utf-8'
87
- )
88
- output = result.stdout + result.stderr
89
- return output.strip() or "Test executed successfully (no output)."
90
- except subprocess.TimeoutExpired:
91
- return "Test execution timed out."
92
- except Exception as e:
93
- return f"Test runner error: {str(e)}"
94
- finally:
95
- try:
96
- os.unlink(tmp_path)
97
- except:
98
- pass
99
-
100
- @classmethod
101
- def query_docs(cls, topic: str) -> str:
102
- """Retrieve top 3 relevant docs. Forces agent to reason across multiple hints."""
103
- try:
104
- embedder = cls._get_embedder()
105
- collection = cls._get_collection()
106
- query_emb = embedder.encode([topic]).tolist()
107
- # Get top 3 results (not just 1)
108
- results = collection.query(query_embeddings=query_emb, n_results=3)
109
- if results['documents'] and results['documents'][0]:
110
- # Return concatenated snippets, labelled for clarity
111
- snippets = []
112
- for i, doc in enumerate(results['documents'][0]):
113
- snippets.append(f"[{i+1}] {doc}")
114
- return "Relevant documentation:\n" + "\n".join(snippets)
115
- return "No relevant documentation found."
116
- except Exception:
117
- # Fallback to keyword matching
118
- topic_lower = topic.lower()
119
- fallback = {
120
- "null check": "To avoid KeyError, use 'if key in dict:' before accessing.",
121
- "keyerror": "Catch KeyError with try/except or use dict.get().",
122
- "deadlock": "Always acquire locks in the same order to avoid deadlock.",
123
- }
124
- for key, value in fallback.items():
125
- if key in topic_lower:
126
- return value
127
- return "No relevant documentation found. Try being more specific."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tools.py - Real vector retrieval for query_docs, linter, and test runner
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+
8
+ try:
9
+ from sentence_transformers import SentenceTransformer
10
+ except ImportError:
11
+ SentenceTransformer = None
12
+
13
+ try:
14
+ import chromadb
15
+ except ImportError:
16
+ chromadb = None
17
+
18
+
19
+ @dataclass
20
+ class ToolBox:
21
+ _embedder = None
22
+ _client = None
23
+ _collection = None
24
+
25
+ @classmethod
26
+ def _get_embedder(cls):
27
+ if cls._embedder is None:
28
+ if SentenceTransformer is None:
29
+ return None
30
+ cls._embedder = SentenceTransformer("all-MiniLM-L6-v2")
31
+ return cls._embedder
32
+
33
+ @classmethod
34
+ def _get_collection(cls):
35
+ if cls._collection is None:
36
+ if chromadb is None:
37
+ return None
38
+ cls._client = chromadb.Client()
39
+ cls._collection = cls._client.create_collection("docs")
40
+ docs = [
41
+ "KeyError occurs when a dictionary key is missing. Use dict.get() or check 'if key in dict'.",
42
+ "pylint error C0304: missing final newline. Add a newline at the end of file.",
43
+ "Deadlock happens when two threads acquire locks in opposite order. Always acquire locks in the same order.",
44
+ "Division by zero: check if list is empty before calculating average, or use try/except.",
45
+ "Threading.Lock: use 'with lock:' to automatically acquire and release.",
46
+ "Off-by-one errors: adjust loop ranges, e.g., range(1, len(arr)-1).",
47
+ ]
48
+ embedder = cls._get_embedder()
49
+ if embedder is None:
50
+ return None
51
+ embeddings = embedder.encode(docs).tolist()
52
+ for i, doc in enumerate(docs):
53
+ cls._collection.add(ids=[str(i)], documents=[doc], embeddings=[embeddings[i]])
54
+ return cls._collection
55
+
56
+ @staticmethod
57
+ def run_linter(code: str) -> str:
58
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
59
+ f.write(code)
60
+ f.flush()
61
+ tmp_path = f.name
62
+ try:
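+ # Run pylint via the current interpreter (sys.executable -m pylint) so the
+ # linter from the active virtualenv is used even if `pylint` is not on PATH.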
63
+ result = subprocess.run(
64
+ [sys.executable, "-m", "pylint", tmp_path, "--exit-zero", "--output-format=text"],
65
+ capture_output=True,
66
+ text=True,
67
+ timeout=10,
68
+ encoding="utf-8",
69
+ )
70
+ output = result.stdout
71
+ if "Your code has been rated" in output:
72
+ output = output.split("Your code has been rated")[0]
73
+ output = output.strip()
74
+ if not output:
75
+ return "No linting issues found."
76
+ return output[:500]
77
+ except FileNotFoundError:
78
+ return "Linter (pylint) not installed."
79
+ except subprocess.TimeoutExpired:
80
+ return "Linter timed out."
81
+ except Exception as e:
82
+ return f"Linter error: {str(e)}"
83
+ finally:
84
+ try:
85
+ os.unlink(tmp_path)
86
+ except OSError:
87
+ pass
88
+
89
+ @staticmethod
90
+ def run_tests(test_script: str) -> str:
91
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, encoding="utf-8") as f:
92
+ f.write(test_script)
93
+ f.flush()
94
+ tmp_path = f.name
95
+ try:
96
+ result = subprocess.run(
97
+ [sys.executable, tmp_path],
98
+ capture_output=True,
99
+ text=True,
100
+ timeout=10,
101
+ encoding="utf-8",
102
+ )
103
+ output = result.stdout + result.stderr
104
+ return output.strip() or "Test executed successfully (no output)."
105
+ except subprocess.TimeoutExpired:
106
+ return "Test execution timed out."
107
+ except Exception as e:
108
+ return f"Test runner error: {str(e)}"
109
+ finally:
110
+ try:
111
+ os.unlink(tmp_path)
112
+ except OSError:
113
+ pass
114
+
115
+ @classmethod
116
+ def query_docs(cls, topic: str) -> str:
117
+ """Retrieve top 3 relevant docs; fall back cleanly when vector deps are missing."""
118
+ try:
119
+ embedder = cls._get_embedder()
120
+ collection = cls._get_collection()
121
+ if embedder is None or collection is None:
122
+ raise RuntimeError("Vector retrieval dependencies are unavailable")
123
+ query_emb = embedder.encode([topic]).tolist()
124
+ results = collection.query(query_embeddings=query_emb, n_results=3)
125
+ if results["documents"] and results["documents"][0]:
126
+ snippets = []
127
+ for i, doc in enumerate(results["documents"][0]):
128
+ snippets.append(f"[{i + 1}] {doc}")
129
+ return "Relevant documentation:\n" + "\n".join(snippets)
130
+ return "No relevant documentation found."
131
+ except Exception:
132
+ topic_lower = topic.lower()
133
+ fallback = {
134
+ "null check": "To avoid KeyError, use 'if key in dict:' before accessing.",
135
+ "keyerror": "Catch KeyError with try/except or use dict.get().",
136
+ "deadlock": "Always acquire locks in the same order to avoid deadlock.",
137
+ "race": "Protect shared state with a lock or make the update atomic.",
138
+ "division": "Guard empty inputs before dividing or return a safe default.",
139
+ }
140
+ for key, value in fallback.items():
141
+ if key in topic_lower:
142
+ return value
143
+ return "No relevant documentation found. Try being more specific."
rubrics.py CHANGED
@@ -1,136 +1,136 @@
1
- # rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)
2
-
3
- class Rubric:
4
- """Minimal Rubric base – compatible with OpenEnv but self‑contained."""
5
- def __call__(self, env, action, obs, reward, done, info):
6
- return 0.0
7
-
8
-
9
- # --------------------------------------------------------------------------------
10
- # 1. TOOL‑USAGE BONUS
11
- # --------------------------------------------------------------------------------
12
- class ToolUsageRubric(Rubric):
13
- def __init__(self, bonus: float = 0.05):
14
- self.bonus = bonus
15
-
16
- def __call__(self, env, action, obs, reward, done, info):
17
- score = 0.0
18
- action_type = info.get("action_type", "")
19
- # Use pre-action flags from `info` so first-use bonuses are
20
- # computed correctly even though env flags are mutated in-step.
21
- prev_tests_run = info.get("prev_tests_run", env._tests_run)
22
- prev_linter_run = info.get("prev_linter_run", env._linter_run)
23
- prev_docs_queried = info.get("prev_docs_queried", env._docs_queried)
24
-
25
- if action_type == "run_tests":
26
- if not prev_tests_run:
27
- score += self.bonus
28
- score += 0.015
29
- elif action_type == "run_linter":
30
- if not prev_linter_run:
31
- score += self.bonus
32
- score += 0.015
33
- elif action_type == "query_docs":
34
- if not prev_docs_queried:
35
- score += self.bonus * 0.5
36
- # Encourage docs usage when it is likely useful:
37
- # - early exploration phase
38
- # - non-trivial query text
39
- if env._step_count <= 4 and info.get("docs_query_len", 0) >= 8:
40
- score += 0.01
41
- # Discourage repeated docs calls after the first-use signal.
42
- if prev_docs_queried:
43
- score -= 0.01
44
- elif action_type == "question" and env._step_count <= 3:
45
- score += 0.02
46
- return score
47
-
48
-
49
- # --------------------------------------------------------------------------------
50
- # 2. DELTA‑BASED REWARDS
51
- # --------------------------------------------------------------------------------
52
- class TestDeltaRubric(Rubric):
53
- def __init__(self, weight: float = 0.3):
54
- self.weight = weight
55
-
56
- def __call__(self, env, action, obs, reward, done, info):
57
- delta = env._current_test_score - env._previous_test_score
58
- effective = self.weight
59
- if info.get("action_type") == "fix":
60
- effective *= 0.4
61
- return effective * delta
62
-
63
-
64
- class LintDeltaRubric(Rubric):
65
- def __init__(self, weight: float = 0.3):
66
- self.weight = weight
67
-
68
- def __call__(self, env, action, obs, reward, done, info):
69
- delta = env._current_lint_score - env._previous_lint_score
70
- effective = self.weight * 0.5
71
- if info.get("action_type") == "fix":
72
- effective *= 0.4
73
- return effective * delta
74
-
75
-
76
- # --------------------------------------------------------------------------------
77
- # 3. TERMINAL SUCCESS BONUS
78
- # --------------------------------------------------------------------------------
79
- class TerminalSuccessRubric(Rubric):
80
- def __call__(self, env, action, obs, reward, done, info):
81
- if info.get("action_type") != "fix":
82
- return 0.0
83
- score = 0.0
84
- if env._current_test_score > 0.95:
85
- score += 0.4
86
- elif env._current_test_score > 0.85:
87
- score += 0.2
88
- return score
89
-
90
-
91
- # --------------------------------------------------------------------------------
92
- # 4. EXPLORATION & DIVERSITY
93
- # --------------------------------------------------------------------------------
94
- class ExplorationRubric(Rubric):
95
- def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
96
- self.penalty = penalty
97
- self.bonus = bonus
98
-
99
- def __call__(self, env, action, obs, reward, done, info):
100
- if len(env._action_history) < 3:
101
- return 0.0
102
- recent = env._action_history[-3:]
103
- unique = len(set(recent))
104
- if unique == 1:
105
- return self.penalty
106
- elif unique == 3:
107
- return self.bonus
108
- return 0.0
109
-
110
-
111
- # --------------------------------------------------------------------------------
112
- # 5. ANTI‑HACKING & CONSISTENCY
113
- # --------------------------------------------------------------------------------
114
- class AntiHackingRubric(Rubric):
115
- def __call__(self, env, action, obs, reward, done, info):
116
- if info.get("action_type") != "fix":
117
- return 0.0
118
- score = 0.0
119
- if not env._tests_run:
120
- score -= 0.25
121
- if env._step_count < 2:
122
- score -= 0.1
123
- if env._tests_run and env._linter_run:
124
- score += 0.02
125
- return score
126
-
127
-
128
- # --------------------------------------------------------------------------------
129
- # 6. STEP PENALTY
130
- # --------------------------------------------------------------------------------
131
- class StepPenaltyRubric(Rubric):
132
- def __init__(self, penalty: float = -0.01):
133
- self.penalty = penalty
134
-
135
- def __call__(self, env, action, obs, reward, done, info):
136
- return self.penalty
 
1
+ # rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)
2
+
3
+ class Rubric:
4
+ """Minimal Rubric base – compatible with OpenEnv but self‑contained."""
5
+ def __call__(self, env, action, obs, reward, done, info):
6
+ return 0.0
7
+
8
+
9
+ # --------------------------------------------------------------------------------
10
+ # 1. TOOL‑USAGE BONUS
11
+ # --------------------------------------------------------------------------------
12
+ class ToolUsageRubric(Rubric):
13
+ def __init__(self, bonus: float = 0.05):
14
+ self.bonus = bonus
15
+
16
+ def __call__(self, env, action, obs, reward, done, info):
17
+ score = 0.0
18
+ action_type = info.get("action_type", "")
19
+ # Use pre-action flags from `info` so first-use bonuses are
20
+ # computed correctly even though env flags are mutated in-step.
21
+ prev_tests_run = info.get("prev_tests_run", env._tests_run)
22
+ prev_linter_run = info.get("prev_linter_run", env._linter_run)
23
+ prev_docs_queried = info.get("prev_docs_queried", env._docs_queried)
24
+
25
+ if action_type == "run_tests":
26
+ if not prev_tests_run:
27
+ score += self.bonus
28
+ score += 0.015
29
+ elif action_type == "run_linter":
30
+ if not prev_linter_run:
31
+ score += self.bonus
32
+ score += 0.015
33
+ elif action_type == "query_docs":
34
+ if not prev_docs_queried:
35
+ score += self.bonus * 0.5
36
+ # Encourage docs usage when it is likely useful:
37
+ # - early exploration phase
38
+ # - non-trivial query text
39
+ if env._step_count <= 4 and info.get("docs_query_len", 0) >= 8:
40
+ score += 0.01
41
+ # Discourage repeated docs calls after the first-use signal.
42
+ if prev_docs_queried:
43
+ score -= 0.01
44
+ elif action_type == "question" and env._step_count <= 3:
45
+ score += 0.02
46
+ return score
47
+
48
+
49
+ # --------------------------------------------------------------------------------
50
+ # 2. DELTA‑BASED REWARDS
51
+ # --------------------------------------------------------------------------------
52
+ class TestDeltaRubric(Rubric):
53
+ def __init__(self, weight: float = 0.3):
54
+ self.weight = weight
55
+
56
+ def __call__(self, env, action, obs, reward, done, info):
57
+ delta = env._current_test_score - env._previous_test_score
58
+ effective = self.weight
59
+ if info.get("action_type") == "fix":
60
+ effective *= 0.4
61
+ return effective * delta
62
+
63
+
64
+ class LintDeltaRubric(Rubric):
65
+ def __init__(self, weight: float = 0.3):
66
+ self.weight = weight
67
+
68
+ def __call__(self, env, action, obs, reward, done, info):
69
+ delta = env._current_lint_score - env._previous_lint_score
70
+ effective = self.weight * 0.5
71
+ if info.get("action_type") == "fix":
72
+ effective *= 0.4
73
+ return effective * delta
74
+
75
+
76
+ # --------------------------------------------------------------------------------
77
+ # 3. TERMINAL SUCCESS BONUS
78
+ # --------------------------------------------------------------------------------
79
+ class TerminalSuccessRubric(Rubric):
80
+ def __call__(self, env, action, obs, reward, done, info):
81
+ if info.get("action_type") != "fix":
82
+ return 0.0
83
+ score = 0.0
84
+ if env._current_test_score > 0.95:
85
+ score += 0.4
86
+ elif env._current_test_score > 0.85:
87
+ score += 0.2
88
+ return score
89
+
90
+
91
+ # --------------------------------------------------------------------------------
92
+ # 4. EXPLORATION & DIVERSITY
93
+ # --------------------------------------------------------------------------------
94
+ class ExplorationRubric(Rubric):
95
+ def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
96
+ self.penalty = penalty
97
+ self.bonus = bonus
98
+
99
+ def __call__(self, env, action, obs, reward, done, info):
100
+ if len(env._action_history) < 3:
101
+ return 0.0
102
+ recent = env._action_history[-3:]
103
+ unique = len(set(recent))
104
+ if unique == 1:
105
+ return self.penalty
106
+ elif unique == 3:
107
+ return self.bonus
108
+ return 0.0
109
+
110
+
111
+ # --------------------------------------------------------------------------------
112
+ # 5. ANTI‑HACKING & CONSISTENCY
113
+ # --------------------------------------------------------------------------------
114
+ class AntiHackingRubric(Rubric):
115
+ def __call__(self, env, action, obs, reward, done, info):
116
+ if info.get("action_type") != "fix":
117
+ return 0.0
118
+ score = 0.0
119
+ if not env._tests_run:
120
+ score -= 0.25
121
+ if env._step_count < 2:
122
+ score -= 0.1
123
+ if env._tests_run and env._linter_run:
124
+ score += 0.02
125
+ return score
126
+
127
+
128
+ # --------------------------------------------------------------------------------
129
+ # 6. STEP PENALTY
130
+ # --------------------------------------------------------------------------------
131
+ class StepPenaltyRubric(Rubric):
132
+ def __init__(self, penalty: float = -0.01):
133
+ self.penalty = penalty
134
+
135
+ def __call__(self, env, action, obs, reward, done, info):
136
+ return self.penalty
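+ # Example wiring (sketch; the environment owns the actual list): sum every rubric's output per step.
+ # rubrics = [ToolUsageRubric(), TestDeltaRubric(), LintDeltaRubric(), TerminalSuccessRubric(),
+ #            ExplorationRubric(), AntiHackingRubric(), StepPenaltyRubric()]
+ # rubric_score = sum(r(env, action, obs, None, done, info) for r in rubrics)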
training.py CHANGED
@@ -1,935 +1,935 @@
1
- # training.py – Memory‑safe: Phi‑3‑mini + Expert Demos + Fast PPO (2 iterations)
2
- import os
3
- os.environ["TRITON_DISABLE"] = "1"
4
- os.environ["TOKENIZERS_PARALLELISM"] = "false" # Issue #12: prevent OOM from parallel tokenization
5
-
6
- import torch._dynamo
7
- torch._dynamo.config.disable = True
8
- import json
9
- import torch
10
- import torch.nn.functional as F
11
- from torch.optim import AdamW
12
- from dataclasses import dataclass
13
- from typing import List, Dict, Tuple, Optional
14
- import numpy as np
15
- import re
16
- import random
17
- import matplotlib.pyplot as plt
18
-
19
- from unsloth import FastLanguageModel
20
- from transformers import TrainingArguments
21
- from trl import SFTTrainer
22
- from datasets import Dataset
23
-
24
- from environment import CodeReviewEnv
25
- from redteam import BUG_DB
26
- from models import (
27
- RunTests, RunLinter, Inspect,
28
- ProposeFix, WriteComment, AskQuestion,
29
- Done, Skip, QueryDocs, map_to_env as model_map_to_env
30
- )
31
-
32
- # ======================================================================
33
- @dataclass
34
- class AgentAction:
35
- action_type: str
36
- content: Optional[str] = None
37
-
38
- def parse_action(output: str) -> AgentAction:
39
- try:
40
- data = json.loads(output)
41
- return AgentAction(
42
- action_type=data.get("action_type", "").lower(),
43
- content=data.get("content")
44
- )
45
- except:
46
- pass
47
- json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', output, re.DOTALL)
48
- if json_match:
49
- try:
50
- data = json.loads(json_match.group(1))
51
- return AgentAction(
52
- action_type=data.get("action_type", "").lower(),
53
- content=data.get("content")
54
- )
55
- except:
56
- pass
57
- action_pattern = r'"action_type"\s*:\s*"(\w+)"'
58
- match = re.search(action_pattern, output)
59
- if match:
60
- return AgentAction(action_type=match.group(1).lower())
61
- output_lower = output.lower()
62
- if "test" in output_lower:
63
- return AgentAction("run_tests")
64
- if "lint" in output_lower:
65
- return AgentAction("run_linter")
66
- if "inspect" in output_lower:
67
- return AgentAction("inspect")
68
- if "doc" in output_lower or "documentation" in output_lower:
69
- return AgentAction("query_docs", "bug fix guidance")
70
- return AgentAction("invalid", output)
71
-
72
- def map_to_env(action: AgentAction):
73
- return model_map_to_env(action.action_type, action.content)
74
-
75
- # ======================================================================
76
- def load_model():
77
- model, tokenizer = FastLanguageModel.from_pretrained(
78
- model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
79
- max_seq_length=480, # smaller window for memory
80
- load_in_4bit=True,
81
- )
82
- model = FastLanguageModel.get_peft_model(
83
- model,
84
- r=16,
85
- target_modules=[
86
- "q_proj", "k_proj", "v_proj", "o_proj",
87
- "gate_proj", "up_proj", "down_proj"
88
- ],
89
- lora_alpha=32,
90
- lora_dropout=0.0,
91
- )
92
- return model, tokenizer
93
-
94
- def test_model_sanity(model, tokenizer) -> bool:
95
- print("\n" + "="*60)
96
- print("SANITY CHECK: Testing base model generation")
97
- print("="*60)
98
- test_prompt = "Hello, how are you?"
99
- messages = [{"role": "user", "content": test_prompt}]
100
- formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
101
- inputs = tokenizer(formatted, return_tensors="pt", max_length=256, truncation=True).to("cuda")
102
- with torch.no_grad():
103
- outputs = model.generate(
104
- **inputs,
105
- max_new_tokens=30,
106
- do_sample=True,
107
- temperature=0.7,
108
- min_new_tokens=1,
109
- eos_token_id=tokenizer.eos_token_id,
110
- pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
111
- )
112
- generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
113
- response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
114
- print(f"Prompt: {test_prompt}")
115
- print(f"Response: {repr(response)}")
116
- if len(response) == 0:
117
- print("❌ Model produces empty output – cannot train.")
118
- return False
119
- print("βœ“ Model sanity check PASSED\n")
120
- return True
121
-
122
- # ======================================================================
123
- def _expert_fix_from_context(obs) -> str:
124
- """
125
- Build a conservative fix template named `fix` (required by tests).
126
- Uses bug hints + code snippet patterns to create realistic fixes.
127
- """
128
- bug = (getattr(obs, "bug_description", "") or "").lower()
129
- code = getattr(obs, "code_snippet", "") or ""
130
-
131
- if "division" in bug or "average" in code.lower():
132
- return (
133
- "def fix(data):\n"
134
- " if not data:\n"
135
- " return 0\n"
136
- " return sum(data) / len(data)"
137
- )
138
-
139
- if "operator" in bug or "sign" in bug:
140
- return (
141
- "def fix(a, b):\n"
142
- " return a + b"
143
- )
144
-
145
- if "off_by_one" in bug or "loop" in bug:
146
- return (
147
- "def fix(items):\n"
148
- " return len(items)"
149
- )
150
-
151
- if "null" in bug or "key" in bug or "dict" in code.lower():
152
- return (
153
- "def fix(payload):\n"
154
- " users = payload.get('users', {})\n"
155
- " user_id = payload.get('id')\n"
156
- " return users.get(user_id)"
157
- )
158
-
159
- # Concurrency-heavy tasks (harder/hardest).
160
- if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
161
- return (
162
- "import threading\n"
163
- "_lock = threading.Lock()\n"
164
- "\n"
165
- "def fix(counter):\n"
166
- " with _lock:\n"
167
- " if counter is None:\n"
168
- " return 0\n"
169
- " return counter + 1"
170
- )
171
-
172
- if "deadlock" in bug or "double_lock" in bug or "lock order" in bug or "nested_lock" in bug:
173
- return (
174
- "import threading\n"
175
- "_lock_a = threading.Lock()\n"
176
- "_lock_b = threading.Lock()\n"
177
- "\n"
178
- "def fix(work):\n"
179
- " first, second = (_lock_a, _lock_b)\n"
180
- " if id(first) > id(second):\n"
181
- " first, second = second, first\n"
182
- " with first:\n"
183
- " with second:\n"
184
- " return work() if callable(work) else work"
185
- )
186
-
187
- if "fork_join" in bug or "join" in bug:
188
- return (
189
- "import threading\n"
190
- "\n"
191
- "def fix(worker):\n"
192
- " t = threading.Thread(target=worker)\n"
193
- " t.start()\n"
194
- " t.join()\n"
195
- " return True"
196
- )
197
-
198
- # Generic safe fallback keeps the RL pipeline alive for unknown bugs.
199
- return (
200
- "def fix(data):\n"
201
- " if data is None:\n"
202
- " return None\n"
203
- " return data"
204
- )
205
-
206
-
207
- def _expert_supervised_policy(obs) -> str:
208
- """
209
- Real workflow policy:
210
- inspect -> tests/linter -> docs -> fix -> negotiate -> done.
211
- """
212
- author_msg = (getattr(obs, "author_response", "") or "").lower()
213
- tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
214
-
215
- if not getattr(obs, "tests_run", False):
216
- if "inspect" not in tool_output:
217
- return '{"action_type": "inspect"}'
218
- return '{"action_type": "run_tests"}'
219
-
220
- if not getattr(obs, "linter_run", False):
221
- return '{"action_type": "run_linter"}'
222
-
223
- if not getattr(obs, "docs_queried", False):
224
- return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
225
-
226
- # Use docs again on hard tasks when evidence is still weak.
227
- if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
228
- bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
229
- return json.dumps(
230
- {
231
- "action_type": "query_docs",
232
- "content": f"python {bug_hint} lock ordering race condition mitigation patterns",
233
- }
234
- )
235
-
236
- # If test quality is poor, propose a concrete fix.
237
- if getattr(obs, "current_test_score", 0.0) < 0.95:
238
- fix_code = _expert_fix_from_context(obs)
239
- return json.dumps({"action_type": "fix", "content": fix_code})
240
-
241
- # If author is still unconvinced, provide causal explanation.
242
- if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
243
- return (
244
- '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
245
- 'keeps behavior deterministic, and aligns with the observed test and lint feedback. '
246
- 'The change is intentionally small to reduce regression risk."}'
247
- )
248
-
249
- # If negotiation is strong enough and quality is good, terminate.
250
- conf = float(getattr(obs, "author_confidence", 0.0))
251
- threshold = float(getattr(obs, "author_threshold", 0.5))
252
- score = float(getattr(obs, "current_test_score", 0.0))
253
- if conf >= threshold and score >= 0.8:
254
- return '{"action_type": "done"}'
255
-
256
- # Nudge conversation forward when tests are okay but acceptance is pending.
257
- return (
258
- '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
259
- )
260
-
261
- # ======================================================================
262
- def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8):
263
- print("\n" + "="*60)
264
- print("SUPERVISED WARM-UP: Real environment demonstrations")
265
- print("="*60)
266
-
267
- examples = []
268
- tasks = ["easy", "medium", "hard", "harder", "hardest"]
269
- for ep in range(n_episodes):
270
- task = random.choice(tasks)
271
- env.set_task(task)
272
- obs = env.reset()
273
- history = []
274
- done = False
275
-
276
- steps = 0
277
- while not done and steps < max_steps:
278
- prompt = build_prompt(obs, history)
279
- action_text = _expert_supervised_policy(obs)
280
- action = parse_action(action_text)
281
- env_action = map_to_env(action)
282
- next_obs, _, done, _ = env.step(env_action)
283
-
284
- messages = [
285
- {"role": "user", "content": prompt},
286
- {"role": "assistant", "content": action_text},
287
- ]
288
- full_text = tokenizer.apply_chat_template(messages, tokenize=False)
289
- examples.append({"text": full_text})
290
-
291
- history.append(f"Agent: {action_text}")
292
- history.append(f"Env: {next_obs.last_tool_output}")
293
- history = history[-8:]
294
- obs = next_obs
295
- steps += 1
296
-
297
- print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
298
-
299
- if not examples:
300
- print("No supervised examples generated; skipping warm-up.")
301
- return
302
-
303
- dataset = Dataset.from_list(examples)
304
- trainer = SFTTrainer(
305
- model=model,
306
- tokenizer=tokenizer,
307
- train_dataset=dataset,
308
- dataset_text_field="text",
309
- max_seq_length=480,
310
- args=TrainingArguments(
311
- output_dir="warmup_output",
312
- num_train_epochs=epochs,
313
- per_device_train_batch_size=2,
314
- gradient_accumulation_steps=2,
315
- learning_rate=2e-5,
316
- logging_steps=50,
317
- save_strategy="no",
318
- bf16=True,
319
- ),
320
- )
321
- print(f"Training on {len(examples)} real env examples for {epochs} epochs...")
322
- trainer.train()
323
- print("βœ“ Supervised warm-up (real env) complete\n")
324
- torch.cuda.empty_cache()
325
-
326
- # ======================================================================
327
- def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_retries=2):
328
- messages = [{"role": "user", "content": prompt}]
329
- formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
330
- inputs = tokenizer(formatted, return_tensors="pt", max_length=480, truncation=True).to("cuda")
331
-
332
- for attempt in range(max_retries):
333
- with torch.no_grad():
334
- outputs = model.generate(
335
- **inputs,
336
- max_new_tokens=64,
337
- do_sample=(temperature > 0),
338
- temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
339
- min_new_tokens=1,
340
- return_dict_in_generate=True,
341
- output_scores=True,
342
- )
343
- generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
344
- action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
345
-
346
- logprobs = []
347
- for idx, token_id in enumerate(generated_ids):
348
- if idx < len(outputs.scores):
349
- token_logits = outputs.scores[idx][0]
350
- token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
351
- logprobs.append(token_logprob)
352
- total_logprob = sum(logprobs) if logprobs else -100.0
353
-
354
- if not action_text:
355
- fallback_actions = [
356
- '{"action_type": "run_tests"}',
357
- '{"action_type": "run_linter"}',
358
- '{"action_type": "inspect"}',
359
- '{"action_type": "skip"}',
360
- ]
361
- action_text = random.choice(fallback_actions)
362
- total_logprob = -50.0
363
- print(f"[WARN] Empty generation β†’ using fallback: {action_text}")
364
- return action_text, total_logprob
365
-
366
- try:
367
- json.loads(action_text)
368
- return action_text, total_logprob
369
- except:
370
- if attempt == max_retries - 1:
371
- return '{"action_type":"skip"}', -100.0
372
- continue
373
- return '{"action_type":"skip"}', -100.0
374
-
375
- # ======================================================================
376
- def build_prompt(obs, history_lines: List[str]) -> str:
377
- author_msg = getattr(obs, "author_response", "") or ""
378
- tool_output = getattr(obs, "last_tool_output", "") or ""
379
- author_personality = getattr(obs, "author_personality", "defensive")
380
-
381
- prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
382
-
383
- The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
384
- - Tests pass (high pass ratio)
385
- - Lint is clean (zero errors)
386
- - Documentation or references are provided
387
- - Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
388
-
389
- Workflow:
390
- 1. Use `inspect` to understand the code.
391
- 2. Use `run_tests` and `run_linter` to gather evidence.
392
- 3. Use `query_docs` when you need references or language-specific guidance.
393
- 4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
394
- 5. If the developer pushes back, read their response carefully and address their specific concern.
395
- 6. Once convinced, use `done` to finish.
396
-
397
- Code:
398
- {obs.code_snippet}
399
-
400
- Author says:
401
- {author_msg if author_msg else "(no response yet – start with inspection)"}
402
-
403
- Last tool output:
404
- {tool_output if tool_output else "(none)"}
405
-
406
- Available actions:
407
- run_tests, run_linter, inspect, query_docs, fix, comment, question, done
408
-
409
- Respond ONLY in JSON:
410
- {{"action_type": "...", "content": "..."}}"""
411
-
412
- if history_lines:
413
- history = "\n".join(history_lines[-6:])
414
- prompt += f"\n\nPrevious steps:\n{history}"
415
- return prompt
416
-
417
- # ======================================================================
418
- @dataclass
419
- class Trajectory:
420
- states: List[str]
421
- actions: List[str]
422
- rewards: List[float]
423
- logprobs: List[float]
424
- dones: List[bool]
425
- def __len__(self): return len(self.states)
426
-
427
- def collect_trajectory(env, model, tokenizer, max_steps=6, temperature=0.0):
428
- obs = env.reset()
429
- history_lines = []
430
- states, actions, rewards, logprobs, dones = [], [], [], [], []
431
- for step in range(max_steps):
432
- prompt = build_prompt(obs, history_lines)
433
- states.append(prompt)
434
- action_text, logprob = generate_action_with_logprob(prompt, model, tokenizer, temperature)
435
- actions.append(action_text)
436
- logprobs.append(logprob)
437
- action = parse_action(action_text)
438
- env_action = map_to_env(action)
439
- next_obs, reward, done, _ = env.step(env_action)
440
- rewards.append(reward.value)
441
- dones.append(done)
442
- history_lines.append(f"Agent: {action_text}")
443
- history_lines.append(f"Env: {next_obs.last_tool_output}")
444
- obs = next_obs
445
- if done: break
446
- return Trajectory(states, actions, rewards, logprobs, dones)
447
-
448
- def collect_trajectories(env, model, tokenizer, n_trajectories, max_steps=6,
449
- task_levels=None, task_weights=None):
450
- if task_levels is None:
451
- task_levels = list(BUG_DB.keys())
452
- if task_weights is not None and len(task_weights) != len(task_levels):
453
- raise ValueError("task_weights must match task_levels length")
454
- if task_weights is not None and sum(task_weights) <= 0:
455
- raise ValueError("task_weights must have a positive total")
456
- trajectories = []
457
- for i in range(n_trajectories):
458
- sampled_task = random.choices(task_levels, weights=task_weights, k=1)[0]
459
- env.set_task(sampled_task)
460
- traj = collect_trajectory(env, model, tokenizer, max_steps)
461
- total_reward = sum(traj.rewards)
462
- print(f"Trajectory {i+1}/{n_trajectories}: task={sampled_task}, steps={len(traj)}, reward={total_reward:.3f}")
463
- trajectories.append(traj)
464
- return trajectories
465
-
466
- def compute_returns_and_advantages(rewards, dones, gamma=0.99, standardize=True):
467
- """
468
- Compute discounted returns and REINFORCE-style baseline advantages.
469
- Advantages are centered and optionally standardised.
470
- """
471
- n = len(rewards)
472
- returns = [0.0]*n
473
- running = 0.0
474
- for t in reversed(range(n)):
475
- if dones[t]: running = 0.0
476
- running = rewards[t] + gamma * running
477
- returns[t] = running
478
- if standardize:
479
- advantages = np.array(returns) - np.mean(returns)
480
- adv_std = np.std(advantages) + 1e-8
481
- advantages = (advantages / adv_std).tolist()
482
- else:
483
- advantages = returns.copy()
484
- return advantages, returns
485
-
486
- def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsilon=0.2,
487
- entropy_coef=0.01, gamma=0.99):
488
- model.train()
489
- all_states, all_actions, all_old_logprobs, all_advantages = [], [], [], []
490
- for traj in trajectories:
491
- advantages, _ = compute_returns_and_advantages(traj.rewards, traj.dones, gamma=gamma, standardize=True)
492
- all_states.extend(traj.states)
493
- all_actions.extend(traj.actions)
494
- all_old_logprobs.extend(traj.logprobs)
495
- all_advantages.extend(advantages)
496
- n_samples = len(all_states)
497
- total_loss, total_policy_loss, total_entropy, n_updates = 0.0, 0.0, 0.0, 0
498
- for epoch in range(n_epochs):
499
- indices = np.random.permutation(n_samples)
500
- for i in indices:
501
- state = all_states[i]
502
- action = all_actions[i]
503
- old_logprob = all_old_logprobs[i]
504
- advantage = all_advantages[i]
505
- messages = [{"role": "user", "content": state}]
506
- formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
507
- full_text = formatted + action
508
- inputs = tokenizer(full_text, return_tensors="pt", max_length=480, truncation=True).to("cuda")
509
- outputs = model(**inputs)
510
- logits = outputs.logits
511
- action_ids = tokenizer.encode(action, add_special_tokens=False)
512
- prefix_ids = tokenizer.encode(formatted, add_special_tokens=False)
513
- action_start = len(prefix_ids)
514
- logprobs = []
515
- entropy = 0.0
516
- for idx, token_id in enumerate(action_ids):
517
- position = action_start + idx - 1
518
- if 0 <= position < logits.shape[1]:
519
- token_logits = logits[0, position]
520
- log_probs = F.log_softmax(token_logits, dim=-1)
521
- token_logprob = log_probs[token_id]
522
- logprobs.append(token_logprob)
523
- probs = F.softmax(token_logits, dim=-1)
524
- entropy += -(probs * log_probs).sum()
525
- if not logprobs: continue
526
- new_logprob = sum(logprobs)
527
- avg_entropy = entropy / len(logprobs) if logprobs else 0.0
528
- ratio = torch.exp(new_logprob - old_logprob)
529
- surr1 = ratio * advantage
530
- surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
531
- policy_loss = -torch.min(surr1, surr2)
532
- loss = policy_loss - entropy_coef * avg_entropy
533
- optimizer.zero_grad()
534
- loss.backward()
535
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
536
- optimizer.step()
537
- total_loss += loss.item()
538
- total_policy_loss += policy_loss.item()
539
- total_entropy += avg_entropy.item()
540
- n_updates += 1
541
- torch.cuda.empty_cache()
542
- return {"loss": total_loss / n_updates if n_updates else 0.0,
543
- "policy_loss": total_policy_loss / n_updates if n_updates else 0.0,
544
- "entropy": total_entropy / n_updates if n_updates else 0.0}
545
-
546
- def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
547
- task_levels=None, verbose=False):
548
- """Evaluate the current policy across task levels. Returns metrics + optional traces."""
549
- model.eval()
550
- if task_levels is None:
551
- task_levels = list(BUG_DB.keys())
552
- total_rewards = []
553
- traces = [] # human-readable behavior logs
554
- for ep in range(n_episodes):
555
- task = task_levels[ep % len(task_levels)]
556
- env.set_task(task)
557
- traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.0)
558
- ep_reward = sum(traj.rewards)
559
- total_rewards.append(ep_reward)
560
- if verbose:
561
- actions_taken = []
562
- for a in traj.actions:
563
- try:
564
- actions_taken.append(json.loads(a).get("action_type", "?"))
565
- except Exception:
566
- actions_taken.append("?")
567
- traces.append({
568
- "task": task,
569
- "reward": round(ep_reward, 4),
570
- "steps": len(traj),
571
- "actions": actions_taken,
572
- })
573
- return {
574
- "avg_reward": float(np.mean(total_rewards)),
575
- "std_reward": float(np.std(total_rewards)),
576
- "min_reward": float(np.min(total_rewards)),
577
- "max_reward": float(np.max(total_rewards)),
578
- "traces": traces,
579
- }
580
-
581
- # ======================================================================
582
- # MANUAL WARM-UP (no SFTTrainer β†’ no multiprocessing OOM)
583
- # ======================================================================
584
- def json_warmup(model, tokenizer, json_path="training_data.json",
585
- n_episodes=20, epochs=2, lr=2e-5):
586
- """
587
- Supervised warm-up from pre-generated expert demonstrations.
588
- Uses raw cross-entropy on action tokens with manual gradient steps.
589
- NO SFTTrainer, NO multiprocessing – runs safely on any GPU.
590
- """
591
- print("\n" + "="*60)
592
- print("SUPERVISED WARM-UP: training_data.json (manual cross-entropy)")
593
- print("="*60)
594
-
595
- with open(json_path, encoding="utf-8") as f:
596
- data = json.load(f)
597
-
598
- # Each episode = 7 steps. Select n_episodes worth.
599
- steps_per_episode = 7
600
- max_examples = n_episodes * steps_per_episode
601
- if max_examples < len(data):
602
- data = data[:max_examples]
603
-
604
- print(f" {len(data)} examples ({len(data)//steps_per_episode} episodes), "
605
- f"{epochs} epoch(s), lr={lr}")
606
-
607
- model.train()
608
- warmup_opt = AdamW(model.parameters(), lr=lr)
609
- warmup_losses = [] # per-epoch avg loss
610
-
611
- for epoch in range(epochs):
612
- random.shuffle(data)
613
- epoch_loss = 0.0
614
- n_valid = 0
615
-
616
- for i, example in enumerate(data):
617
- prompt = example["prompt"]
618
- action = example["action"]
619
-
620
- # ---- tokenize full sequence (prompt + action) ----
621
- messages = [
622
- {"role": "user", "content": prompt},
623
- {"role": "assistant", "content": action},
624
- ]
625
- full_text = tokenizer.apply_chat_template(messages, tokenize=False)
626
- inputs = tokenizer(full_text, return_tensors="pt",
627
- max_length=480, truncation=True).to("cuda")
628
-
629
- # ---- find where the action tokens start ----
630
- prompt_only = tokenizer.apply_chat_template(
631
- [{"role": "user", "content": prompt}],
632
- tokenize=False, add_generation_prompt=True
633
- )
634
- prompt_ids = tokenizer.encode(prompt_only, add_special_tokens=False)
635
- prompt_len = len(prompt_ids)
636
-
637
- total_len = inputs.input_ids.shape[1]
638
- if prompt_len >= total_len:
639
- continue # prompt was truncated away, skip
640
-
641
- # ---- cross-entropy on action tokens only ----
642
- outputs = model(**inputs)
643
- logits = outputs.logits
644
-
645
- # next-token prediction: logits[t] predicts token[t+1]
646
- shift_logits = logits[0, prompt_len - 1 : total_len - 1]
647
- shift_labels = inputs.input_ids[0, prompt_len : total_len]
648
-
649
- min_len = min(shift_logits.shape[0], shift_labels.shape[0])
650
- if min_len == 0:
651
- continue
652
-
653
- loss = F.cross_entropy(shift_logits[:min_len], shift_labels[:min_len])
654
-
655
- warmup_opt.zero_grad()
656
- loss.backward()
657
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
658
- warmup_opt.step()
659
-
660
- epoch_loss += loss.item()
661
- n_valid += 1
662
-
663
- if (i + 1) % 25 == 0:
664
- avg = epoch_loss / n_valid
665
- print(f" epoch {epoch+1} step {i+1:3d}/{len(data)} "
666
- f"running_loss={avg:.4f}")
667
-
668
- avg_loss = epoch_loss / max(n_valid, 1)
669
- warmup_losses.append(avg_loss)
670
- print(f" Epoch {epoch+1} done: avg_loss={avg_loss:.4f} "
671
- f"({n_valid} valid examples)")
672
-
673
- torch.cuda.empty_cache()
674
- print(f"βœ“ Warm-up complete. Loss: "
675
- f"{' β†’ '.join(f'{l:.4f}' for l in warmup_losses)}\n")
676
- return warmup_losses
677
-
678
-
679
- # ======================================================================
680
- # MAIN TRAINING PIPELINE
681
- # ======================================================================
682
- def train_ppo():
683
- # --- Hyperparameters ---
684
- n_iterations = 8 # enough for a clear upward trend
685
- trajectories_per_iter = 4 # on-policy data per iteration
686
- n_epochs = 1
687
- max_steps = 6
688
- learning_rate = 3e-5
689
- clip_epsilon = 0.2
690
- entropy_coef = 0.01
691
- gamma = 0.99
692
-
693
- # --- Pre-load embedder before LLM (Issue #13) ---
694
- from rltool import ToolBox
695
- print("Pre-loading sentence-transformer embedder...")
696
- ToolBox._get_embedder()
697
- print("βœ“ Embedder ready")
698
-
699
- # --- Load model ---
700
- print("Loading model...")
701
- model, tokenizer = load_model()
702
- if not test_model_sanity(model, tokenizer):
703
- return
704
- env = CodeReviewEnv()
705
- task_levels = list(BUG_DB.keys())
706
-
707
- # ==================================================================
708
- # PHASE 0: BASELINE (untrained policy)
709
- # ==================================================================
710
- print("\n" + "="*60)
711
- print("PHASE 0 – BASELINE EVALUATION (untrained)")
712
- print("="*60)
713
- baseline = evaluate_policy(env, model, tokenizer, n_episodes=5,
714
- max_steps=max_steps, task_levels=task_levels,
715
- verbose=True)
716
- baseline_reward = baseline["avg_reward"]
717
- print(f"Baseline avg reward: {baseline_reward:.4f} "
718
- f"(min={baseline['min_reward']:.4f}, max={baseline['max_reward']:.4f})")
719
- print("Baseline behavior:")
720
- for t in baseline["traces"]:
721
- print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
722
- f"steps={t['steps']} actions={t['actions']}")
723
-
724
- # ==================================================================
725
- # PHASE 1: SUPERVISED WARM-UP (expert demos, manual CE)
726
- # ==================================================================
727
- warmup_losses = json_warmup(
728
- model, tokenizer,
729
- json_path="training_data.json",
730
- n_episodes=20, # 140 examples (20 Γ— 7 steps)
731
- epochs=2,
732
- lr=2e-5,
733
- )
734
-
735
- # Post-warmup evaluation
736
- print("="*60)
737
- print("POST WARM-UP EVALUATION")
738
- print("="*60)
739
- post_warmup = evaluate_policy(env, model, tokenizer, n_episodes=5,
740
- max_steps=max_steps, task_levels=task_levels,
741
- verbose=True)
742
- warmup_reward = post_warmup["avg_reward"]
743
- print(f"Post-warmup avg reward: {warmup_reward:.4f} "
744
- f"(Ξ” vs baseline: {warmup_reward - baseline_reward:+.4f})")
745
- print("Post-warmup behavior:")
746
- for t in post_warmup["traces"]:
747
- print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
748
- f"steps={t['steps']} actions={t['actions']}")
749
-
750
- # ==================================================================
751
- # PHASE 2: TRUE RL – PPO (on-policy, real environment interaction)
752
- # ==================================================================
753
- optimizer = AdamW(model.parameters(), lr=learning_rate)
754
- print(f"\n{'='*60}")
755
- print(f"PHASE 2 – PPO TRAINING: {n_iterations} iterations Γ— "
756
- f"{trajectories_per_iter} trajectories (true RL)")
757
- print(f"{'='*60}\n")
758
-
759
- reward_history = []
760
- eval_history = []
761
- loss_history = []
762
- policy_loss_history = []
763
- entropy_history = []
764
-
765
- for iteration in range(n_iterations):
766
- print(f"\n--- PPO Iteration {iteration + 1}/{n_iterations} ---")
767
-
768
- # Collect on-policy trajectories from REAL environment
769
- trajectories = collect_trajectories(
770
- env, model, tokenizer, trajectories_per_iter, max_steps,
771
- task_levels=task_levels, task_weights=None
772
- )
773
- avg_reward = float(np.mean([sum(t.rewards) for t in trajectories]))
774
- reward_history.append(avg_reward)
775
- print(f" Collect avg reward: {avg_reward:+.4f}")
776
-
777
- # PPO policy gradient update
778
- metrics = ppo_update(
779
- trajectories, model, tokenizer, optimizer,
780
- n_epochs=n_epochs, clip_epsilon=clip_epsilon,
781
- entropy_coef=entropy_coef, gamma=gamma
782
- )
783
- loss_history.append(float(metrics["loss"]))
784
- policy_loss_history.append(float(metrics["policy_loss"]))
785
- entropy_history.append(float(metrics["entropy"]))
786
- print(f" Update loss={metrics['loss']:.4f} "
787
- f"policy={metrics['policy_loss']:.4f} "
788
- f"entropy={metrics['entropy']:.4f}")
789
-
790
- # Evaluate greedy policy after update
791
- eval_m = evaluate_policy(env, model, tokenizer, n_episodes=3,
792
- max_steps=max_steps, task_levels=task_levels,
793
- verbose=False)
794
- eval_history.append(eval_m["avg_reward"])
795
- delta = eval_m["avg_reward"] - baseline_reward
796
- print(f" Eval avg reward: {eval_m['avg_reward']:+.4f} "
797
- f"(Ξ” baseline: {delta:+.4f})")
798
-
799
- # ==================================================================
800
- # PHASE 3: FINAL EVALUATION (proof of learning)
801
- # ==================================================================
802
- print("\n" + "="*60)
803
- print("PHASE 3 – FINAL EVALUATION (after all training)")
804
- print("="*60)
805
- final = evaluate_policy(env, model, tokenizer, n_episodes=5,
806
- max_steps=max_steps, task_levels=task_levels,
807
- verbose=True)
808
- print(f"Final avg reward: {final['avg_reward']:.4f} "
809
- f"(min={final['min_reward']:.4f}, max={final['max_reward']:.4f})")
810
- print("Final behavior:")
811
- for t in final["traces"]:
812
- print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
813
- f"steps={t['steps']} actions={t['actions']}")
814
-
815
- total_improvement = final["avg_reward"] - baseline_reward
816
- ppo_improvement = final["avg_reward"] - warmup_reward
817
- print(f"\n{'='*60}")
818
- print("TRAINING SUMMARY")
819
- print(f" Baseline reward: {baseline_reward:+.4f}")
820
- print(f" Post-warmup reward: {warmup_reward:+.4f} "
821
- f"(warmup Ξ”: {warmup_reward - baseline_reward:+.4f})")
822
- print(f" Final reward: {final['avg_reward']:+.4f} "
823
- f"(PPO Ξ”: {ppo_improvement:+.4f})")
824
- print(f" Total improvement: {total_improvement:+.4f}")
825
- print(f" Reward trend (PPO): {' β†’ '.join(f'{r:+.3f}' for r in reward_history)}")
826
- print(f" Loss trend (PPO): {' β†’ '.join(f'{l:.4f}' for l in loss_history)}")
827
- if total_improvement > 0:
828
- print(f" βœ“ Agent IMPROVED by {total_improvement:+.4f}")
829
- else:
830
- print(f" βœ— No overall improvement detected")
831
- print(f"{'='*60}")
832
-
833
- # ==================================================================
834
- # PLOTS
835
- # ==================================================================
836
- iters = list(range(1, n_iterations + 1))
837
-
838
- # --- 1. Warm-up loss curve ---
839
- if warmup_losses:
840
- fig, ax = plt.subplots(figsize=(7, 4))
841
- ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
842
- marker="o", linewidth=2, color="tab:purple")
843
- ax.set_title("Warm-up Loss (supervised, per epoch)",
844
- fontsize=13, fontweight="bold")
845
- ax.set_xlabel("Epoch")
846
- ax.set_ylabel("Cross-Entropy Loss")
847
- ax.grid(alpha=0.3)
848
- fig.tight_layout()
849
- fig.savefig("warmup_loss.png", dpi=150)
850
- plt.close(fig)
851
-
852
- # --- 2. PPO reward curve ---
853
- fig, ax = plt.subplots(figsize=(9, 5))
854
- ax.plot(iters, reward_history, marker="o", linewidth=2,
855
- label="Collect reward", color="tab:blue")
856
- ax.plot(iters, eval_history, marker="s", linewidth=2, linestyle="--",
857
- label="Eval reward", color="tab:green")
858
- ax.axhline(y=baseline_reward, color="tab:gray", linestyle=":",
859
- linewidth=1.5, label=f"Baseline ({baseline_reward:+.3f})")
860
- ax.axhline(y=warmup_reward, color="tab:purple", linestyle=":",
861
- linewidth=1.5, label=f"Post-warmup ({warmup_reward:+.3f})")
862
- ax.set_title("PPO Reward per Iteration", fontsize=14, fontweight="bold")
863
- ax.set_xlabel("Iteration")
864
- ax.set_ylabel("Average Reward")
865
- ax.legend(loc="best", fontsize=8)
866
- ax.grid(alpha=0.3)
867
- fig.tight_layout()
868
- fig.savefig("reward_curve.png", dpi=150)
869
- plt.close(fig)
870
-
871
- # --- 3. PPO loss curve ---
872
- fig, ax = plt.subplots(figsize=(9, 5))
873
- ax.plot(iters, loss_history, marker="o", linewidth=2,
874
- label="Total loss", color="tab:red")
875
- ax.plot(iters, policy_loss_history, marker="^", linewidth=2, linestyle="--",
876
- label="Policy loss", color="tab:orange")
877
- ax.set_title("PPO Loss per Iteration", fontsize=14, fontweight="bold")
878
- ax.set_xlabel("Iteration")
879
- ax.set_ylabel("Loss")
880
- ax.legend(loc="best")
881
- ax.grid(alpha=0.3)
882
- fig.tight_layout()
883
- fig.savefig("loss_curve.png", dpi=150)
884
- plt.close(fig)
885
-
886
- # --- 4. Combined 3-panel summary ---
887
- fig, axes = plt.subplots(1, 3, figsize=(18, 5))
888
-
889
- # Panel A: warm-up loss
890
- if warmup_losses:
891
- axes[0].plot(range(1, len(warmup_losses) + 1), warmup_losses,
892
- marker="o", linewidth=2, color="tab:purple")
893
- axes[0].set_title("A. Warm-up Loss ↓")
894
- axes[0].set_xlabel("Epoch")
895
- axes[0].set_ylabel("CE Loss")
896
- axes[0].grid(alpha=0.3)
897
-
898
- # Panel B: PPO reward
899
- axes[1].plot(iters, reward_history, marker="o", linewidth=2,
900
- color="tab:blue", label="Collect")
901
- axes[1].plot(iters, eval_history, marker="s", linewidth=2,
902
- linestyle="--", color="tab:green", label="Eval")
903
- axes[1].axhline(y=baseline_reward, color="tab:gray", linestyle=":",
904
- linewidth=1.5, label="Baseline")
905
- axes[1].axhline(y=warmup_reward, color="tab:purple", linestyle=":",
906
- linewidth=1.5, label="Post-warmup")
907
- axes[1].set_title("B. PPO Reward ↑")
908
- axes[1].set_xlabel("Iteration")
909
- axes[1].set_ylabel("Avg Reward")
910
- axes[1].legend(fontsize=7)
911
- axes[1].grid(alpha=0.3)
912
-
913
- # Panel C: PPO loss
914
- axes[2].plot(iters, loss_history, marker="o", linewidth=2,
915
- color="tab:red", label="Total")
916
- axes[2].plot(iters, policy_loss_history, marker="^", linewidth=2,
917
- linestyle="--", color="tab:orange", label="Policy")
918
- axes[2].set_title("C. PPO Loss ↓")
919
- axes[2].set_xlabel("Iteration")
920
- axes[2].set_ylabel("Loss")
921
- axes[2].legend(fontsize=7)
922
- axes[2].grid(alpha=0.3)
923
-
924
- fig.suptitle("Code Review Agent – Full Training Evidence",
925
- fontsize=14, fontweight="bold")
926
- fig.tight_layout()
927
- fig.savefig("training_summary.png", dpi=150)
928
- plt.close(fig)
929
-
930
- print("Plots saved: warmup_loss.png, reward_curve.png, "
931
- "loss_curve.png, training_summary.png")
932
- print("="*60)
933
-
934
- if __name__ == "__main__":
935
  train_ppo()
 
1
+ # training.py – Memory-safe: Phi-3-mini + Expert Demos + Fast PPO (8 iterations)
2
+ import os
3
+ os.environ["TRITON_DISABLE"] = "1"
4
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Issue #12: prevent OOM from parallel tokenization
5
+
6
+ import torch._dynamo
7
+ torch._dynamo.config.disable = True
8
+ import json
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch.optim import AdamW
12
+ from dataclasses import dataclass
13
+ from typing import List, Dict, Tuple, Optional
14
+ import numpy as np
15
+ import re
16
+ import random
17
+ import matplotlib.pyplot as plt
18
+
19
+ from unsloth import FastLanguageModel
20
+ from transformers import TrainingArguments
21
+ from trl import SFTTrainer
22
+ from datasets import Dataset
23
+
24
+ from environment import CodeReviewEnv
25
+ from redteam import BUG_DB
26
+ from models import (
27
+ RunTests, RunLinter, Inspect,
28
+ ProposeFix, WriteComment, AskQuestion,
29
+ Done, Skip, QueryDocs, map_to_env as model_map_to_env
30
+ )
31
+
32
+ # ======================================================================
33
+ @dataclass
34
+ class AgentAction:
35
+ action_type: str
36
+ content: Optional[str] = None
37
+
38
+ def parse_action(output: str) -> AgentAction:
39
+ try:
40
+ data = json.loads(output)
41
+ return AgentAction(
42
+ action_type=data.get("action_type", "").lower(),
43
+ content=data.get("content")
44
+ )
45
+ except (json.JSONDecodeError, TypeError):
46
+ pass
47
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', output, re.DOTALL)
48
+ if json_match:
49
+ try:
50
+ data = json.loads(json_match.group(1))
51
+ return AgentAction(
52
+ action_type=data.get("action_type", "").lower(),
53
+ content=data.get("content")
54
+ )
55
+ except (json.JSONDecodeError, TypeError):
56
+ pass
57
+ action_pattern = r'"action_type"\s*:\s*"(\w+)"'
58
+ match = re.search(action_pattern, output)
59
+ if match:
60
+ return AgentAction(action_type=match.group(1).lower())
61
+ output_lower = output.lower()
62
+ if "test" in output_lower:
63
+ return AgentAction("run_tests")
64
+ if "lint" in output_lower:
65
+ return AgentAction("run_linter")
66
+ if "inspect" in output_lower:
67
+ return AgentAction("inspect")
68
+ if "doc" in output_lower or "documentation" in output_lower:
69
+ return AgentAction("query_docs", "bug fix guidance")
70
+ return AgentAction("invalid", output)
71
+
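A few illustrative round-trips through the fallback chain above; the inputs are hypothetical and the snippet assumes it runs in the same module as parse_action:

# 1) Well-formed JSON is taken directly.
direct = parse_action('{"action_type": "run_tests"}')
assert direct.action_type == "run_tests"

# 2) A reply that only mentions the field in prose resolves via the "action_type" regex.
loose = parse_action('Sure, "action_type": "fix" is what I would do here.')
assert loose.action_type == "fix"

# 3) Pure free text falls through to the keyword heuristics.
keyword = parse_action("Let me run the linter before proposing anything.")
assert keyword.action_type == "run_linter"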
72
+ def map_to_env(action: AgentAction):
73
+ return model_map_to_env(action.action_type, action.content)
74
+
75
+ # ======================================================================
76
+ def load_model():
77
+ model, tokenizer = FastLanguageModel.from_pretrained(
78
+ model_name="unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
79
+ max_seq_length=480, # smaller window for memory
80
+ load_in_4bit=True,
81
+ )
82
+ model = FastLanguageModel.get_peft_model(
83
+ model,
84
+ r=16,
85
+ target_modules=[
86
+ "q_proj", "k_proj", "v_proj", "o_proj",
87
+ "gate_proj", "up_proj", "down_proj"
88
+ ],
89
+ lora_alpha=32,
90
+ lora_dropout=0.0,
91
+ )
92
+ return model, tokenizer
93
+
94
+ def test_model_sanity(model, tokenizer) -> bool:
95
+ print("\n" + "="*60)
96
+ print("SANITY CHECK: Testing base model generation")
97
+ print("="*60)
98
+ test_prompt = "Hello, how are you?"
99
+ messages = [{"role": "user", "content": test_prompt}]
100
+ formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
101
+ inputs = tokenizer(formatted, return_tensors="pt", max_length=256, truncation=True).to("cuda")
102
+ with torch.no_grad():
103
+ outputs = model.generate(
104
+ **inputs,
105
+ max_new_tokens=30,
106
+ do_sample=True,
107
+ temperature=0.7,
108
+ min_new_tokens=1,
109
+ eos_token_id=tokenizer.eos_token_id,
110
+ pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
111
+ )
112
+ generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
113
+ response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
114
+ print(f"Prompt: {test_prompt}")
115
+ print(f"Response: {repr(response)}")
116
+ if len(response) == 0:
117
+ print("❌ Model produces empty output – cannot train.")
118
+ return False
119
+ print("βœ“ Model sanity check PASSED\n")
120
+ return True
121
+
122
+ # ======================================================================
123
+ def _expert_fix_from_context(obs) -> str:
124
+ """
125
+ Build a conservative fix template named `fix` (required by tests).
126
+ Uses bug hints + code snippet patterns to create realistic fixes.
127
+ """
128
+ bug = (getattr(obs, "bug_description", "") or "").lower()
129
+ code = getattr(obs, "code_snippet", "") or ""
130
+
131
+ if "division" in bug or "average" in code.lower():
132
+ return (
133
+ "def fix(data):\n"
134
+ " if not data:\n"
135
+ " return 0\n"
136
+ " return sum(data) / len(data)"
137
+ )
138
+
139
+ if "operator" in bug or "sign" in bug:
140
+ return (
141
+ "def fix(a, b):\n"
142
+ " return a + b"
143
+ )
144
+
145
+ if "off_by_one" in bug or "loop" in bug:
146
+ return (
147
+ "def fix(items):\n"
148
+ " return len(items)"
149
+ )
150
+
151
+ if "null" in bug or "key" in bug or "dict" in code.lower():
152
+ return (
153
+ "def fix(payload):\n"
154
+ " users = payload.get('users', {})\n"
155
+ " user_id = payload.get('id')\n"
156
+ " return users.get(user_id)"
157
+ )
158
+
159
+ # Concurrency-heavy tasks (harder/hardest).
160
+ if "race" in bug or "missing_lock" in bug or "thread_safe" in bug or "global_nonatomic" in bug:
161
+ return (
162
+ "import threading\n"
163
+ "_lock = threading.Lock()\n"
164
+ "\n"
165
+ "def fix(counter):\n"
166
+ " with _lock:\n"
167
+ " if counter is None:\n"
168
+ " return 0\n"
169
+ " return counter + 1"
170
+ )
171
+
172
+ if "deadlock" in bug or "double_lock" in bug or "lock order" in bug or "nested_lock" in bug:
173
+ return (
174
+ "import threading\n"
175
+ "_lock_a = threading.Lock()\n"
176
+ "_lock_b = threading.Lock()\n"
177
+ "\n"
178
+ "def fix(work):\n"
179
+ " first, second = (_lock_a, _lock_b)\n"
180
+ " if id(first) > id(second):\n"
181
+ " first, second = second, first\n"
182
+ " with first:\n"
183
+ " with second:\n"
184
+ " return work() if callable(work) else work"
185
+ )
186
+
187
+ if "fork_join" in bug or "join" in bug:
188
+ return (
189
+ "import threading\n"
190
+ "\n"
191
+ "def fix(worker):\n"
192
+ " t = threading.Thread(target=worker)\n"
193
+ " t.start()\n"
194
+ " t.join()\n"
195
+ " return True"
196
+ )
197
+
198
+ # Generic safe fallback keeps the RL pipeline alive for unknown bugs.
199
+ return (
200
+ "def fix(data):\n"
201
+ " if data is None:\n"
202
+ " return None\n"
203
+ " return data"
204
+ )
205
+
206
+
207
+ def _expert_supervised_policy(obs) -> str:
208
+ """
209
+ Real workflow policy:
210
+ inspect -> tests/linter -> docs -> fix -> negotiate -> done.
211
+ """
212
+ author_msg = (getattr(obs, "author_response", "") or "").lower()
213
+ tool_output = (getattr(obs, "last_tool_output", "") or "").lower()
214
+
215
+ if not getattr(obs, "tests_run", False):
216
+ if "inspect" not in tool_output:
217
+ return '{"action_type": "inspect"}'
218
+ return '{"action_type": "run_tests"}'
219
+
220
+ if not getattr(obs, "linter_run", False):
221
+ return '{"action_type": "run_linter"}'
222
+
223
+ if not getattr(obs, "docs_queried", False):
224
+ return '{"action_type": "query_docs", "content": "python bug fixing best practices for edge cases and null safety"}'
225
+
226
+ # Use docs again on hard tasks when evidence is still weak.
227
+ if getattr(obs, "current_test_score", 0.0) < 0.6 and getattr(obs, "step", 0) >= 3:
228
+ bug_hint = (getattr(obs, "bug_description", "") or "concurrency bug").replace('"', "'")
229
+ return json.dumps(
230
+ {
231
+ "action_type": "query_docs",
232
+ "content": f"python {bug_hint} lock ordering race condition mitigation patterns",
233
+ }
234
+ )
235
+
236
+ # If test quality is poor, propose a concrete fix.
237
+ if getattr(obs, "current_test_score", 0.0) < 0.95:
238
+ fix_code = _expert_fix_from_context(obs)
239
+ return json.dumps({"action_type": "fix", "content": fix_code})
240
+
241
+ # If author is still unconvinced, provide causal explanation.
242
+ if author_msg and ("not convinced" in author_msg or "explain" in author_msg or "brief" in author_msg):
243
+ return (
244
+ '{"action_type": "comment", "content": "This fix works because it handles the failing edge case directly, '
245
+ 'keeps behavior deterministic, and aligns with the observed test and lint feedback. '
246
+ 'The change is intentionally small to reduce regression risk."}'
247
+ )
248
+
249
+ # If negotiation is strong enough and quality is good, terminate.
250
+ conf = float(getattr(obs, "author_confidence", 0.0))
251
+ threshold = float(getattr(obs, "author_threshold", 0.5))
252
+ score = float(getattr(obs, "current_test_score", 0.0))
253
+ if conf >= threshold and score >= 0.8:
254
+ return '{"action_type": "done"}'
255
+
256
+ # Nudge conversation forward when tests are okay but acceptance is pending.
257
+ return (
258
+ '{"action_type": "question", "content": "Would you like a quick walkthrough of a failing scenario, the root cause, and how the fix prevents regressions?"}'
259
+ )
260
+
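Taken together, the branches above implement one fixed curriculum per episode. On a typical run the emitted actions look roughly like the hypothetical sequence below, which also lines up with the seven steps per episode assumed by json_warmup later in this file (the exact order depends on which observation fields the environment actually sets):

# Hypothetical episode emitted by _expert_supervised_policy (contents elided):
expert_episode_sketch = [
    '{"action_type": "inspect"}',
    '{"action_type": "run_tests"}',
    '{"action_type": "run_linter"}',
    '{"action_type": "query_docs", "content": "..."}',
    '{"action_type": "fix", "content": "def fix(data): ..."}',
    '{"action_type": "comment", "content": "This fix works because ..."}',
    '{"action_type": "done"}',
]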
261
+ # ======================================================================
262
+ def supervised_warmup(model, tokenizer, env, n_episodes=16, epochs=1, max_steps=8):
263
+ print("\n" + "="*60)
264
+ print("SUPERVISED WARM-UP: Real environment demonstrations")
265
+ print("="*60)
266
+
267
+ examples = []
268
+ tasks = ["easy", "medium", "hard", "harder", "hardest"]
269
+ for ep in range(n_episodes):
270
+ task = random.choice(tasks)
271
+ env.set_task(task)
272
+ obs = env.reset()
273
+ history = []
274
+ done = False
275
+
276
+ steps = 0
277
+ while not done and steps < max_steps:
278
+ prompt = build_prompt(obs, history)
279
+ action_text = _expert_supervised_policy(obs)
280
+ action = parse_action(action_text)
281
+ env_action = map_to_env(action)
282
+ next_obs, _, done, _ = env.step(env_action)
283
+
284
+ messages = [
285
+ {"role": "user", "content": prompt},
286
+ {"role": "assistant", "content": action_text},
287
+ ]
288
+ full_text = tokenizer.apply_chat_template(messages, tokenize=False)
289
+ examples.append({"text": full_text})
290
+
291
+ history.append(f"Agent: {action_text}")
292
+ history.append(f"Env: {next_obs.last_tool_output}")
293
+ history = history[-8:]
294
+ obs = next_obs
295
+ steps += 1
296
+
297
+ print(f"Supervised episode {ep+1}: task={task}, steps={steps}, done={done}")
298
+
299
+ if not examples:
300
+ print("No supervised examples generated; skipping warm-up.")
301
+ return
302
+
303
+ dataset = Dataset.from_list(examples)
304
+ trainer = SFTTrainer(
305
+ model=model,
306
+ tokenizer=tokenizer,
307
+ train_dataset=dataset,
308
+ dataset_text_field="text",
309
+ max_seq_length=480,
310
+ args=TrainingArguments(
311
+ output_dir="warmup_output",
312
+ num_train_epochs=epochs,
313
+ per_device_train_batch_size=2,
314
+ gradient_accumulation_steps=2,
315
+ learning_rate=2e-5,
316
+ logging_steps=50,
317
+ save_strategy="no",
318
+ bf16=True,
319
+ ),
320
+ )
321
+ print(f"Training on {len(examples)} real env examples for {epochs} epochs...")
322
+ trainer.train()
323
+ print("βœ“ Supervised warm-up (real env) complete\n")
324
+ torch.cuda.empty_cache()
325
+
326
+ # ======================================================================
327
+ def generate_action_with_logprob(prompt, model, tokenizer, temperature=0.0, max_retries=2):
328
+ messages = [{"role": "user", "content": prompt}]
329
+ formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
330
+ inputs = tokenizer(formatted, return_tensors="pt", max_length=480, truncation=True).to("cuda")
331
+
332
+ for attempt in range(max_retries):
333
+ with torch.no_grad():
334
+ outputs = model.generate(
335
+ **inputs,
336
+ max_new_tokens=64,
337
+ do_sample=(temperature > 0),
338
+ temperature=max(temperature, 0.01) if temperature > 0 else 1.0,
339
+ min_new_tokens=1,
340
+ return_dict_in_generate=True,
341
+ output_scores=True,
342
+ )
343
+ generated_ids = outputs.sequences[0][inputs['input_ids'].shape[1]:]
344
+ action_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
345
+
346
+ logprobs = []
347
+ for idx, token_id in enumerate(generated_ids):
348
+ if idx < len(outputs.scores):
349
+ token_logits = outputs.scores[idx][0]
350
+ token_logprob = F.log_softmax(token_logits, dim=-1)[token_id].item()
351
+ logprobs.append(token_logprob)
352
+ total_logprob = sum(logprobs) if logprobs else -100.0
353
+
354
+ if not action_text:
355
+ fallback_actions = [
356
+ '{"action_type": "run_tests"}',
357
+ '{"action_type": "run_linter"}',
358
+ '{"action_type": "inspect"}',
359
+ '{"action_type": "skip"}',
360
+ ]
361
+ action_text = random.choice(fallback_actions)
362
+ total_logprob = -50.0
363
+ print(f"[WARN] Empty generation β†’ using fallback: {action_text}")
364
+ return action_text, total_logprob
365
+
366
+ try:
367
+ json.loads(action_text)
368
+ return action_text, total_logprob
369
+ except (json.JSONDecodeError, TypeError):
370
+ if attempt == max_retries - 1:
371
+ return '{"action_type":"skip"}', -100.0
372
+ continue
373
+ return '{"action_type":"skip"}', -100.0
374
+
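The value returned alongside the action text is the sequence log-probability, i.e. the sum of the per-token log-probabilities read off outputs.scores; ppo_update later re-scores the same tokens to form its importance ratio. A minimal numeric sketch with made-up values:

import math

token_logprobs = [-0.7, -0.1, -1.2, -0.3]    # hypothetical per-token log-probs
sequence_logprob = sum(token_logprobs)        # -2.3
sequence_prob = math.exp(sequence_logprob)    # ~0.10: probability of the whole action string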
375
+ # ======================================================================
376
+ def build_prompt(obs, history_lines: List[str]) -> str:
377
+ author_msg = getattr(obs, "author_response", "") or ""
378
+ tool_output = getattr(obs, "last_tool_output", "") or ""
379
+ author_personality = getattr(obs, "author_personality", "defensive")
380
+
381
+ prompt = f"""You are an AI code review agent. Your goal is to convince a simulated human developer to accept your proposed fix and name your proposed fix function fix.
382
+
383
+ The developer has a **{author_personality}** personality and will only accept if you provide solid evidence:
384
+ - Tests pass (high pass ratio)
385
+ - Lint is clean (zero errors)
386
+ - Documentation or references are provided
387
+ - Your reasoning is clear, uses words like "because" or "therefore", and is detailed (over 30 words if needed)
388
+
389
+ Workflow:
390
+ 1. Use `inspect` to understand the code.
391
+ 2. Use `run_tests` and `run_linter` to gather evidence.
392
+ 3. Use `query_docs` when you need references or language-specific guidance.
393
+ 4. Propose a fix (`fix`) and explain why it works (`comment` or `question`).
394
+ 5. If the developer pushes back, read their response carefully and address their specific concern.
395
+ 6. Once convinced, use `done` to finish.
396
+
397
+ Code:
398
+ {obs.code_snippet}
399
+
400
+ Author says:
401
+ {author_msg if author_msg else "(no response yet – start with inspection)"}
402
+
403
+ Last tool output:
404
+ {tool_output if tool_output else "(none)"}
405
+
406
+ Available actions:
407
+ run_tests, run_linter, inspect, query_docs, fix, comment, question, done
408
+
409
+ Respond ONLY in JSON:
410
+ {{"action_type": "...", "content": "..."}}"""
411
+
412
+ if history_lines:
413
+ history = "\n".join(history_lines[-6:])
414
+ prompt += f"\n\nPrevious steps:\n{history}"
415
+ return prompt
416
+
417
+ # ======================================================================
418
+ @dataclass
419
+ class Trajectory:
420
+ states: List[str]
421
+ actions: List[str]
422
+ rewards: List[float]
423
+ logprobs: List[float]
424
+ dones: List[bool]
425
+ def __len__(self): return len(self.states)
426
+
427
+ def collect_trajectory(env, model, tokenizer, max_steps=6, temperature=0.0):
428
+ obs = env.reset()
429
+ history_lines = []
430
+ states, actions, rewards, logprobs, dones = [], [], [], [], []
431
+ for step in range(max_steps):
432
+ prompt = build_prompt(obs, history_lines)
433
+ states.append(prompt)
434
+ action_text, logprob = generate_action_with_logprob(prompt, model, tokenizer, temperature)
435
+ actions.append(action_text)
436
+ logprobs.append(logprob)
437
+ action = parse_action(action_text)
438
+ env_action = map_to_env(action)
439
+ next_obs, reward, done, _ = env.step(env_action)
440
+ rewards.append(reward.value)
441
+ dones.append(done)
442
+ history_lines.append(f"Agent: {action_text}")
443
+ history_lines.append(f"Env: {next_obs.last_tool_output}")
444
+ obs = next_obs
445
+ if done: break
446
+ return Trajectory(states, actions, rewards, logprobs, dones)
447
+
448
+ def collect_trajectories(env, model, tokenizer, n_trajectories, max_steps=6,
449
+ task_levels=None, task_weights=None):
450
+ if task_levels is None:
451
+ task_levels = list(BUG_DB.keys())
452
+ if task_weights is not None and len(task_weights) != len(task_levels):
453
+ raise ValueError("task_weights must match task_levels length")
454
+ if task_weights is not None and sum(task_weights) <= 0:
455
+ raise ValueError("task_weights must have a positive total")
456
+ trajectories = []
457
+ for i in range(n_trajectories):
458
+ sampled_task = random.choices(task_levels, weights=task_weights, k=1)[0]
459
+ env.set_task(sampled_task)
460
+ traj = collect_trajectory(env, model, tokenizer, max_steps)
461
+ total_reward = sum(traj.rewards)
462
+ print(f"Trajectory {i+1}/{n_trajectories}: task={sampled_task}, steps={len(traj)}, reward={total_reward:.3f}")
463
+ trajectories.append(traj)
464
+ return trajectories
465
+
466
+ def compute_returns_and_advantages(rewards, dones, gamma=0.99, standardize=True):
467
+ """
468
+ Compute discounted returns and REINFORCE-style baseline advantages.
469
+ Advantages are centered and optionally standardized; a short worked example follows this function.
470
+ """
471
+ n = len(rewards)
472
+ returns = [0.0]*n
473
+ running = 0.0
474
+ for t in reversed(range(n)):
475
+ if dones[t]: running = 0.0
476
+ running = rewards[t] + gamma * running
477
+ returns[t] = running
478
+ if standardize:
479
+ advantages = np.array(returns) - np.mean(returns)
480
+ adv_std = np.std(advantages) + 1e-8
481
+ advantages = (advantages / adv_std).tolist()
482
+ else:
483
+ advantages = returns.copy()
484
+ return advantages, returns
485
+
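A minimal worked example of the function above, for a single three-step episode with gamma = 0.9 (values rounded):

# rewards = [0.0, 0.0, 1.0], dones = [False, False, True]
# Working backwards: G2 = 1.0, G1 = 0.9 * 1.0 = 0.9, G0 = 0.9 * 0.9 = 0.81
advantages, returns = compute_returns_and_advantages(
    [0.0, 0.0, 1.0], [False, False, True], gamma=0.9
)
# returns    ~= [0.81, 0.90, 1.00]
# advantages ~= [-1.20, -0.04, +1.25]   (mean-centred, divided by the std of the returns)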
486
+ def ppo_update(trajectories, model, tokenizer, optimizer, n_epochs=1, clip_epsilon=0.2,
487
+ entropy_coef=0.01, gamma=0.99):
488
+ model.train()
489
+ all_states, all_actions, all_old_logprobs, all_advantages = [], [], [], []
490
+ for traj in trajectories:
491
+ advantages, _ = compute_returns_and_advantages(traj.rewards, traj.dones, gamma=gamma, standardize=True)
492
+ all_states.extend(traj.states)
493
+ all_actions.extend(traj.actions)
494
+ all_old_logprobs.extend(traj.logprobs)
495
+ all_advantages.extend(advantages)
496
+ n_samples = len(all_states)
497
+ total_loss, total_policy_loss, total_entropy, n_updates = 0.0, 0.0, 0.0, 0
498
+ for epoch in range(n_epochs):
499
+ indices = np.random.permutation(n_samples)
500
+ for i in indices:
501
+ state = all_states[i]
502
+ action = all_actions[i]
503
+ old_logprob = all_old_logprobs[i]
504
+ advantage = all_advantages[i]
505
+ messages = [{"role": "user", "content": state}]
506
+ formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
507
+ full_text = formatted + action
508
+ inputs = tokenizer(full_text, return_tensors="pt", max_length=480, truncation=True).to("cuda")
509
+ outputs = model(**inputs)
510
+ logits = outputs.logits
511
+ action_ids = tokenizer.encode(action, add_special_tokens=False)
512
+ prefix_ids = tokenizer.encode(formatted, add_special_tokens=False)
513
+ action_start = len(prefix_ids)
514
+ logprobs = []
515
+ entropy = 0.0
516
+ for idx, token_id in enumerate(action_ids):
517
+ position = action_start + idx - 1
518
+ if 0 <= position < logits.shape[1]:
519
+ token_logits = logits[0, position]
520
+ log_probs = F.log_softmax(token_logits, dim=-1)
521
+ token_logprob = log_probs[token_id]
522
+ logprobs.append(token_logprob)
523
+ probs = F.softmax(token_logits, dim=-1)
524
+ entropy += -(probs * log_probs).sum()
525
+ if not logprobs: continue
526
+ new_logprob = sum(logprobs)
527
+ avg_entropy = entropy / len(logprobs) if logprobs else 0.0
528
+ ratio = torch.exp(new_logprob - old_logprob)
529
+ surr1 = ratio * advantage
530
+ surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
531
+ policy_loss = -torch.min(surr1, surr2)
532
+ loss = policy_loss - entropy_coef * avg_entropy
533
+ optimizer.zero_grad()
534
+ loss.backward()
535
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
536
+ optimizer.step()
537
+ total_loss += loss.item()
538
+ total_policy_loss += policy_loss.item()
539
+ total_entropy += avg_entropy.item()
540
+ n_updates += 1
541
+ torch.cuda.empty_cache()
542
+ return {"loss": total_loss / n_updates if n_updates else 0.0,
543
+ "policy_loss": total_policy_loss / n_updates if n_updates else 0.0,
544
+ "entropy": total_entropy / n_updates if n_updates else 0.0}
545
+
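For intuition, the clipped surrogate inside ppo_update bounds how much a single update can exploit a large importance ratio. A small self-contained sketch with made-up numbers (same formula, clip_epsilon = 0.2):

import torch

def clipped_surrogate(new_logprob, old_logprob, advantage, clip_epsilon=0.2):
    # Mirrors the per-sample objective in ppo_update above.
    ratio = torch.exp(new_logprob - old_logprob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantage
    return -torch.min(surr1, surr2)

# ratio = exp(0.5) ~= 1.65 is clipped to 1.2 for a positive advantage,
# so the loss is -(1.2 * 2.0) = -2.4 instead of about -3.3:
loss = clipped_surrogate(torch.tensor(-1.0), torch.tensor(-1.5), torch.tensor(2.0))
print(round(loss.item(), 2))  # -2.4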
546
+ def evaluate_policy(env, model, tokenizer, n_episodes=3, max_steps=6,
547
+ task_levels=None, verbose=False):
548
+ """Evaluate the current policy across task levels. Returns metrics + optional traces."""
549
+ model.eval()
550
+ if task_levels is None:
551
+ task_levels = list(BUG_DB.keys())
552
+ total_rewards = []
553
+ traces = [] # human-readable behavior logs
554
+ for ep in range(n_episodes):
555
+ task = task_levels[ep % len(task_levels)]
556
+ env.set_task(task)
557
+ traj = collect_trajectory(env, model, tokenizer, max_steps, temperature=0.0)
558
+ ep_reward = sum(traj.rewards)
559
+ total_rewards.append(ep_reward)
560
+ if verbose:
561
+ actions_taken = []
562
+ for a in traj.actions:
563
+ try:
564
+ actions_taken.append(json.loads(a).get("action_type", "?"))
565
+ except Exception:
566
+ actions_taken.append("?")
567
+ traces.append({
568
+ "task": task,
569
+ "reward": round(ep_reward, 4),
570
+ "steps": len(traj),
571
+ "actions": actions_taken,
572
+ })
573
+ return {
574
+ "avg_reward": float(np.mean(total_rewards)),
575
+ "std_reward": float(np.std(total_rewards)),
576
+ "min_reward": float(np.min(total_rewards)),
577
+ "max_reward": float(np.max(total_rewards)),
578
+ "traces": traces,
579
+ }
580
+
581
+ # ======================================================================
582
+ # MANUAL WARM-UP (no SFTTrainer β†’ no multiprocessing OOM)
583
+ # ======================================================================
584
+ def json_warmup(model, tokenizer, json_path="training_data.json",
585
+ n_episodes=20, epochs=2, lr=2e-5):
586
+ """
587
+ Supervised warm-up from pre-generated expert demonstrations.
588
+ Uses raw cross-entropy on the action tokens only, with manual gradient steps (see the token-alignment sketch after this function).
589
+ NO SFTTrainer, NO multiprocessing – runs safely on any GPU.
590
+ """
591
+ print("\n" + "="*60)
592
+ print("SUPERVISED WARM-UP: training_data.json (manual cross-entropy)")
593
+ print("="*60)
594
+
595
+ with open(json_path, encoding="utf-8") as f:
596
+ data = json.load(f)
597
+
598
+ # Each episode = 7 steps. Select n_episodes worth.
599
+ steps_per_episode = 7
600
+ max_examples = n_episodes * steps_per_episode
601
+ if max_examples < len(data):
602
+ data = data[:max_examples]
603
+
604
+ print(f" {len(data)} examples ({len(data)//steps_per_episode} episodes), "
605
+ f"{epochs} epoch(s), lr={lr}")
606
+
607
+ model.train()
608
+ warmup_opt = AdamW(model.parameters(), lr=lr)
609
+ warmup_losses = [] # per-epoch avg loss
610
+
611
+ for epoch in range(epochs):
612
+ random.shuffle(data)
613
+ epoch_loss = 0.0
614
+ n_valid = 0
615
+
616
+ for i, example in enumerate(data):
617
+ prompt = example["prompt"]
618
+ action = example["action"]
619
+
620
+ # ---- tokenize full sequence (prompt + action) ----
621
+ messages = [
622
+ {"role": "user", "content": prompt},
623
+ {"role": "assistant", "content": action},
624
+ ]
625
+ full_text = tokenizer.apply_chat_template(messages, tokenize=False)
626
+ inputs = tokenizer(full_text, return_tensors="pt",
627
+ max_length=480, truncation=True).to("cuda")
628
+
629
+ # ---- find where the action tokens start ----
630
+ prompt_only = tokenizer.apply_chat_template(
631
+ [{"role": "user", "content": prompt}],
632
+ tokenize=False, add_generation_prompt=True
633
+ )
634
+ prompt_ids = tokenizer.encode(prompt_only, add_special_tokens=False)
635
+ prompt_len = len(prompt_ids)
636
+
637
+ total_len = inputs.input_ids.shape[1]
638
+ if prompt_len >= total_len:
639
+ continue # prompt was truncated away, skip
640
+
641
+ # ---- cross-entropy on action tokens only ----
642
+ outputs = model(**inputs)
643
+ logits = outputs.logits
644
+
645
+ # next-token prediction: logits[t] predicts token[t+1]
646
+ shift_logits = logits[0, prompt_len - 1 : total_len - 1]
647
+ shift_labels = inputs.input_ids[0, prompt_len : total_len]
648
+
649
+ min_len = min(shift_logits.shape[0], shift_labels.shape[0])
650
+ if min_len == 0:
651
+ continue
652
+
653
+ loss = F.cross_entropy(shift_logits[:min_len], shift_labels[:min_len])
654
+
655
+ warmup_opt.zero_grad()
656
+ loss.backward()
657
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
658
+ warmup_opt.step()
659
+
660
+ epoch_loss += loss.item()
661
+ n_valid += 1
662
+
663
+ if (i + 1) % 25 == 0:
664
+ avg = epoch_loss / n_valid
665
+ print(f" epoch {epoch+1} step {i+1:3d}/{len(data)} "
666
+ f"running_loss={avg:.4f}")
667
+
668
+ avg_loss = epoch_loss / max(n_valid, 1)
669
+ warmup_losses.append(avg_loss)
670
+ print(f" Epoch {epoch+1} done: avg_loss={avg_loss:.4f} "
671
+ f"({n_valid} valid examples)")
672
+
673
+ torch.cuda.empty_cache()
674
+ print(f"βœ“ Warm-up complete. Loss: "
675
+ f"{' β†’ '.join(f'{l:.4f}' for l in warmup_losses)}\n")
676
+ return warmup_losses
677
+
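The slicing in the loss above is the standard next-token shift, restricted to the action span: logits at position t score the token at position t + 1, so positions prompt_len - 1 through total_len - 2 are paired with labels prompt_len through total_len - 1. A small sketch with hypothetical token positions:

# Hypothetical tokenization: 5 prompt tokens followed by 3 action tokens.
# input_ids = [p0, p1, p2, p3, p4, a0, a1, a2]   (prompt_len = 5, total_len = 8)
prompt_len, total_len = 5, 8
scored_positions = list(range(prompt_len - 1, total_len - 1))  # [4, 5, 6] -> predict a0, a1, a2
scored_labels = list(range(prompt_len, total_len))             # [5, 6, 7] -> the action tokens
assert len(scored_positions) == len(scored_labels) == total_len - prompt_len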
678
+
679
+ # ======================================================================
680
+ # MAIN TRAINING PIPELINE
681
+ # ======================================================================
682
+ def train_ppo():
683
+ # --- Hyperparameters ---
684
+ n_iterations = 8 # enough for a clear upward trend
685
+ trajectories_per_iter = 4 # on-policy data per iteration
686
+ n_epochs = 1
687
+ max_steps = 6
688
+ learning_rate = 3e-5
689
+ clip_epsilon = 0.2
690
+ entropy_coef = 0.01
691
+ gamma = 0.99
692
+
693
+ # --- Pre-load embedder before LLM (Issue #13) ---
694
+ from rltool import ToolBox
695
+ print("Pre-loading sentence-transformer embedder...")
696
+ ToolBox._get_embedder()
697
+ print("βœ“ Embedder ready")
698
+
699
+ # --- Load model ---
700
+ print("Loading model...")
701
+ model, tokenizer = load_model()
702
+ if not test_model_sanity(model, tokenizer):
703
+ return
704
+ env = CodeReviewEnv()
705
+ task_levels = list(BUG_DB.keys())
706
+
707
+ # ==================================================================
708
+ # PHASE 0: BASELINE (untrained policy)
709
+ # ==================================================================
710
+ print("\n" + "="*60)
711
+ print("PHASE 0 – BASELINE EVALUATION (untrained)")
712
+ print("="*60)
713
+ baseline = evaluate_policy(env, model, tokenizer, n_episodes=5,
714
+ max_steps=max_steps, task_levels=task_levels,
715
+ verbose=True)
716
+ baseline_reward = baseline["avg_reward"]
717
+ print(f"Baseline avg reward: {baseline_reward:.4f} "
718
+ f"(min={baseline['min_reward']:.4f}, max={baseline['max_reward']:.4f})")
719
+ print("Baseline behavior:")
720
+ for t in baseline["traces"]:
721
+ print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
722
+ f"steps={t['steps']} actions={t['actions']}")
723
+
724
+ # ==================================================================
725
+ # PHASE 1: SUPERVISED WARM-UP (expert demos, manual CE)
726
+ # ==================================================================
727
+ warmup_losses = json_warmup(
728
+ model, tokenizer,
729
+ json_path="training_data.json",
730
+ n_episodes=20, # 140 examples (20 Γ— 7 steps)
731
+ epochs=2,
732
+ lr=2e-5,
733
+ )
734
+
735
+ # Post-warmup evaluation
736
+ print("="*60)
737
+ print("POST WARM-UP EVALUATION")
738
+ print("="*60)
739
+ post_warmup = evaluate_policy(env, model, tokenizer, n_episodes=5,
740
+ max_steps=max_steps, task_levels=task_levels,
741
+ verbose=True)
742
+ warmup_reward = post_warmup["avg_reward"]
743
+ print(f"Post-warmup avg reward: {warmup_reward:.4f} "
744
+ f"(Ξ” vs baseline: {warmup_reward - baseline_reward:+.4f})")
745
+ print("Post-warmup behavior:")
746
+ for t in post_warmup["traces"]:
747
+ print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
748
+ f"steps={t['steps']} actions={t['actions']}")
749
+
750
+ # ==================================================================
751
+ # PHASE 2: TRUE RL – PPO (on-policy, real environment interaction)
752
+ # ==================================================================
753
+ optimizer = AdamW(model.parameters(), lr=learning_rate)
754
+ print(f"\n{'='*60}")
755
+ print(f"PHASE 2 – PPO TRAINING: {n_iterations} iterations Γ— "
756
+ f"{trajectories_per_iter} trajectories (true RL)")
757
+ print(f"{'='*60}\n")
758
+
759
+ reward_history = []
760
+ eval_history = []
761
+ loss_history = []
762
+ policy_loss_history = []
763
+ entropy_history = []
764
+
765
+ for iteration in range(n_iterations):
766
+ print(f"\n--- PPO Iteration {iteration + 1}/{n_iterations} ---")
767
+
768
+ # Collect on-policy trajectories from REAL environment
769
+ trajectories = collect_trajectories(
770
+ env, model, tokenizer, trajectories_per_iter, max_steps,
771
+ task_levels=task_levels, task_weights=None
772
+ )
773
+ avg_reward = float(np.mean([sum(t.rewards) for t in trajectories]))
774
+ reward_history.append(avg_reward)
775
+ print(f" Collect avg reward: {avg_reward:+.4f}")
776
+
777
+ # PPO policy gradient update
778
+ metrics = ppo_update(
779
+ trajectories, model, tokenizer, optimizer,
780
+ n_epochs=n_epochs, clip_epsilon=clip_epsilon,
781
+ entropy_coef=entropy_coef, gamma=gamma
782
+ )
783
+ loss_history.append(float(metrics["loss"]))
784
+ policy_loss_history.append(float(metrics["policy_loss"]))
785
+ entropy_history.append(float(metrics["entropy"]))
786
+ print(f" Update loss={metrics['loss']:.4f} "
787
+ f"policy={metrics['policy_loss']:.4f} "
788
+ f"entropy={metrics['entropy']:.4f}")
789
+
790
+ # Evaluate greedy policy after update
791
+ eval_m = evaluate_policy(env, model, tokenizer, n_episodes=3,
792
+ max_steps=max_steps, task_levels=task_levels,
793
+ verbose=False)
794
+ eval_history.append(eval_m["avg_reward"])
795
+ delta = eval_m["avg_reward"] - baseline_reward
796
+ print(f" Eval avg reward: {eval_m['avg_reward']:+.4f} "
797
+ f"(Ξ” baseline: {delta:+.4f})")
798
+
799
+ # ==================================================================
800
+ # PHASE 3: FINAL EVALUATION (proof of learning)
801
+ # ==================================================================
802
+ print("\n" + "="*60)
803
+ print("PHASE 3 – FINAL EVALUATION (after all training)")
804
+ print("="*60)
805
+ final = evaluate_policy(env, model, tokenizer, n_episodes=5,
806
+ max_steps=max_steps, task_levels=task_levels,
807
+ verbose=True)
808
+ print(f"Final avg reward: {final['avg_reward']:.4f} "
809
+ f"(min={final['min_reward']:.4f}, max={final['max_reward']:.4f})")
810
+ print("Final behavior:")
811
+ for t in final["traces"]:
812
+ print(f" task={t['task']:8s} reward={t['reward']:+.4f} "
813
+ f"steps={t['steps']} actions={t['actions']}")
814
+
815
+ total_improvement = final["avg_reward"] - baseline_reward
816
+ ppo_improvement = final["avg_reward"] - warmup_reward
817
+ print(f"\n{'='*60}")
818
+ print("TRAINING SUMMARY")
819
+ print(f" Baseline reward: {baseline_reward:+.4f}")
820
+ print(f" Post-warmup reward: {warmup_reward:+.4f} "
821
+ f"(warmup Ξ”: {warmup_reward - baseline_reward:+.4f})")
822
+ print(f" Final reward: {final['avg_reward']:+.4f} "
823
+ f"(PPO Ξ”: {ppo_improvement:+.4f})")
824
+ print(f" Total improvement: {total_improvement:+.4f}")
825
+ print(f" Reward trend (PPO): {' β†’ '.join(f'{r:+.3f}' for r in reward_history)}")
826
+ print(f" Loss trend (PPO): {' β†’ '.join(f'{l:.4f}' for l in loss_history)}")
827
+ if total_improvement > 0:
828
+ print(f" βœ“ Agent IMPROVED by {total_improvement:+.4f}")
829
+ else:
830
+ print(f" βœ— No overall improvement detected")
831
+ print(f"{'='*60}")
832
+
833
+ # ==================================================================
834
+ # PLOTS
835
+ # ==================================================================
836
+ iters = list(range(1, n_iterations + 1))
837
+
838
+ # --- 1. Warm-up loss curve ---
839
+ if warmup_losses:
840
+ fig, ax = plt.subplots(figsize=(7, 4))
841
+ ax.plot(range(1, len(warmup_losses) + 1), warmup_losses,
842
+ marker="o", linewidth=2, color="tab:purple")
843
+ ax.set_title("Warm-up Loss (supervised, per epoch)",
844
+ fontsize=13, fontweight="bold")
845
+ ax.set_xlabel("Epoch")
846
+ ax.set_ylabel("Cross-Entropy Loss")
847
+ ax.grid(alpha=0.3)
848
+ fig.tight_layout()
849
+ fig.savefig("warmup_loss.png", dpi=150)
850
+ plt.close(fig)
851
+
852
+ # --- 2. PPO reward curve ---
853
+ fig, ax = plt.subplots(figsize=(9, 5))
854
+ ax.plot(iters, reward_history, marker="o", linewidth=2,
855
+ label="Collect reward", color="tab:blue")
856
+ ax.plot(iters, eval_history, marker="s", linewidth=2, linestyle="--",
857
+ label="Eval reward", color="tab:green")
858
+ ax.axhline(y=baseline_reward, color="tab:gray", linestyle=":",
859
+ linewidth=1.5, label=f"Baseline ({baseline_reward:+.3f})")
860
+ ax.axhline(y=warmup_reward, color="tab:purple", linestyle=":",
861
+ linewidth=1.5, label=f"Post-warmup ({warmup_reward:+.3f})")
862
+ ax.set_title("PPO Reward per Iteration", fontsize=14, fontweight="bold")
863
+ ax.set_xlabel("Iteration")
864
+ ax.set_ylabel("Average Reward")
865
+ ax.legend(loc="best", fontsize=8)
866
+ ax.grid(alpha=0.3)
867
+ fig.tight_layout()
868
+ fig.savefig("reward_curve.png", dpi=150)
869
+ plt.close(fig)
870
+
871
+ # --- 3. PPO loss curve ---
872
+ fig, ax = plt.subplots(figsize=(9, 5))
873
+ ax.plot(iters, loss_history, marker="o", linewidth=2,
874
+ label="Total loss", color="tab:red")
875
+ ax.plot(iters, policy_loss_history, marker="^", linewidth=2, linestyle="--",
876
+ label="Policy loss", color="tab:orange")
877
+ ax.set_title("PPO Loss per Iteration", fontsize=14, fontweight="bold")
878
+ ax.set_xlabel("Iteration")
879
+ ax.set_ylabel("Loss")
880
+ ax.legend(loc="best")
881
+ ax.grid(alpha=0.3)
882
+ fig.tight_layout()
883
+ fig.savefig("loss_curve.png", dpi=150)
884
+ plt.close(fig)
885
+
886
+ # --- 4. Combined 3-panel summary ---
887
+ fig, axes = plt.subplots(1, 3, figsize=(18, 5))
888
+
889
+ # Panel A: warm-up loss
890
+ if warmup_losses:
891
+ axes[0].plot(range(1, len(warmup_losses) + 1), warmup_losses,
892
+ marker="o", linewidth=2, color="tab:purple")
893
+ axes[0].set_title("A. Warm-up Loss ↓")
894
+ axes[0].set_xlabel("Epoch")
895
+ axes[0].set_ylabel("CE Loss")
896
+ axes[0].grid(alpha=0.3)
897
+
898
+ # Panel B: PPO reward
899
+ axes[1].plot(iters, reward_history, marker="o", linewidth=2,
900
+ color="tab:blue", label="Collect")
901
+ axes[1].plot(iters, eval_history, marker="s", linewidth=2,
902
+ linestyle="--", color="tab:green", label="Eval")
903
+ axes[1].axhline(y=baseline_reward, color="tab:gray", linestyle=":",
904
+ linewidth=1.5, label="Baseline")
905
+ axes[1].axhline(y=warmup_reward, color="tab:purple", linestyle=":",
906
+ linewidth=1.5, label="Post-warmup")
907
+ axes[1].set_title("B. PPO Reward ↑")
908
+ axes[1].set_xlabel("Iteration")
909
+ axes[1].set_ylabel("Avg Reward")
910
+ axes[1].legend(fontsize=7)
911
+ axes[1].grid(alpha=0.3)
912
+
913
+ # Panel C: PPO loss
914
+ axes[2].plot(iters, loss_history, marker="o", linewidth=2,
915
+ color="tab:red", label="Total")
916
+ axes[2].plot(iters, policy_loss_history, marker="^", linewidth=2,
917
+ linestyle="--", color="tab:orange", label="Policy")
918
+ axes[2].set_title("C. PPO Loss ↓")
919
+ axes[2].set_xlabel("Iteration")
920
+ axes[2].set_ylabel("Loss")
921
+ axes[2].legend(fontsize=7)
922
+ axes[2].grid(alpha=0.3)
923
+
924
+ fig.suptitle("Code Review Agent – Full Training Evidence",
925
+ fontsize=14, fontweight="bold")
926
+ fig.tight_layout()
927
+ fig.savefig("training_summary.png", dpi=150)
928
+ plt.close(fig)
929
+
930
+ print("Plots saved: warmup_loss.png, reward_curve.png, "
931
+ "loss_curve.png, training_summary.png")
932
+ print("="*60)
933
+
934
+ if __name__ == "__main__":
935
  train_ppo()
training_data.json CHANGED
The diff for this file is too large to render. See raw diff