Spaces:

Spirit-26
/

code-review-environment

Running

Harshit2N committed 2 days ago

Commit

5e92b80

1 Parent(s): b61cfff

Enhance Code Review Environment with Action History, Valid Actions, and Improved Grading

- Added action history tracking in CodeReviewEnv to store recent actions.
- Implemented valid_actions method to return available actions based on the current state.
- Updated reset method to accept a seed for randomization.
- Improved step method to handle action processing and state completion more robustly.
- Enhanced TaskGrader with new grading metrics for false positives and efficiency.
- Updated diagnostics to include efficiency bonus and false positive penalties.
- Added render and summary methods in CodeReviewEnv for better visualization and reporting.
- Refactored inference.py to support batch processing of tasks and improved output handling.
- Added difficulty levels to tasks in TaskDefinitions for better task categorization.

Files changed (5) hide show

environment/env.py +142 -32
environment/graders.py +103 -57
environment/__init__.py +3 -1
environment/tasks.py +24 -18
inference.py +135 -153

environment/env.py CHANGED Viewed

@@ -1,4 +1,6 @@
-from typing import Dict, Any, Tuple, Optional
 from environment.models import (
     ReviewAction,
     ReviewState,
@@ -9,24 +11,30 @@ from environment.graders import TaskGrader, RewardCalculator
 class CodeReviewEnv:
     def __init__(self):
         self._state: Optional[ReviewState] = None
         self.grader: Optional[TaskGrader] = None
         self.reward_calculator = RewardCalculator()
         self.max_steps = 50
         self.current_task_id: Optional[str] = None
-    def reset(self, task_id: str = None) -> Dict[str, Any]:
         if task_id is None:
             task_id = "bug_detection_easy_1"
         self.current_task_id = task_id
         task_data = TaskDefinitions.get_task(task_id)
         code_context = TaskDefinitions.create_code_context(task_data)
         task_metadata = TaskDefinitions.create_task_metadata(task_data)
         self._state = ReviewState(
             code_context=code_context,
             task_metadata=task_metadata,
@@ -38,75 +46,93 @@ class CodeReviewEnv:
             last_action_valid=True,
             last_error=None
         )
         self.grader = TaskGrader(task_metadata.expected_issues)
         self.reward_calculator.reset()
         return self._get_observation()
     def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, Dict[str, Any]]:
         if self._state is None:
             return {}, -0.1, True, {"error": "Environment not initialized. Call reset() first."}
         if self._state.is_complete:
             return self._get_observation(), 0.0, True, {"error": "Episode already complete"}
         try:
             review_action = ReviewAction(**action)
         except Exception as e:
             self._state.last_action_valid = False
             self._state.last_error = str(e)
             return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}
         self._state.current_step += 1
         self._process_action(review_action)
         if review_action.action_type.value == "approve" and not review_action.final_decision:
             review_action.final_decision = "approved"
         elif review_action.action_type.value == "request_changes" and not review_action.final_decision:
             review_action.final_decision = "changes_requested"
-        if self._state.current_step >= self.max_steps:
-            self._state.is_complete = True
-            if not self._state.final_decision:
-                self._state.final_decision = "changes_requested"
         if review_action.final_decision and not self._state.is_complete:
             self._state.is_complete = True
             self._state.final_decision = review_action.final_decision
         reward = self.reward_calculator.calculate_reward(
             review_action,
             self._state.comments_made,
             self._state.suggestions_made,
             self._state.final_decision or "changes_requested",
-            self.grader,
             self._state.last_action_valid,
         )
-        diagnostics = self.grader.get_diagnostics(
             comments=self._state.comments_made,
             suggestions=self._state.suggestions_made,
             final_decision=self._state.final_decision or "changes_requested",
         )
         info = {
             "step": self._state.current_step,
             "last_action_valid": self._state.last_action_valid,
             "error": self._state.last_error,
             "task_score": self.get_task_score(),
             "diagnostics": diagnostics,
         }
         return self._get_observation(), reward, self._state.is_complete, info
     def _process_action(self, action: ReviewAction):
         if self._state is None:
             return
         self._state.last_action_valid = True
         self._state.last_error = None
         if action.action_type.value == "add_comment":
             for comment in action.comments:
                 if comment.line_number <= self._state.code_context.line_count:
@@ -114,7 +140,7 @@ class CodeReviewEnv:
                 else:
                     self._state.last_action_valid = False
                     self._state.last_error = f"Line {comment.line_number} out of range"
         elif action.action_type.value == "suggest_fix":
             for suggestion in action.suggestions:
                 if suggestion.original_line <= self._state.code_context.line_count:
@@ -122,18 +148,34 @@ class CodeReviewEnv:
                 else:
                     self._state.last_action_valid = False
                     self._state.last_error = f"Line {suggestion.original_line} out of range"
         elif action.action_type.value == "mark_as_resolved":
             for comment in action.comments:
                 for existing_comment in self._state.comments_made:
                     if existing_comment.line_number == comment.line_number:
                         existing_comment.resolved = True
     def _get_observation(self) -> Dict[str, Any]:
         if self._state is None:
             return {}
-        return Observation(
             code_diff=self._state.code_context.code_diff,
             file_context=self._state.code_context.surrounding_code,
             file_path=self._state.code_context.file_path,
@@ -147,7 +189,73 @@ class CodeReviewEnv:
             review_complete=self._state.is_complete,
             final_decision_made=self._state.final_decision
         ).model_dump()
     def get_task_score(self) -> float:
         if not self.grader or self._state is None:
             return 0.0
@@ -156,11 +264,13 @@ class CodeReviewEnv:
             comments=self._state.comments_made,
             suggestions=self._state.suggestions_made,
             final_decision=self._state.final_decision or "changes_requested",
         )
     def close(self):
         pass
     def state(self) -> Dict[str, Any]:
         if self._state:
             return self._state.model_dump()

+from typing import Dict, Any, Tuple, Optional, List, Deque
+from collections import deque
+import random
 from environment.models import (
     ReviewAction,
     ReviewState,
 class CodeReviewEnv:
     def __init__(self):
         """Create an environment that has no active episode yet.

         ``reset()`` has to be called before ``step()``: until then there is
         no review state and no grader.
         """
         # Fixed per-environment settings and bookkeeping.
         self.max_steps = 50
         self.current_task_id: Optional[str] = None
         self._seed: Optional[int] = None
         # Bounded history: only the five most recent actions are retained.
         self._action_history: Deque[Dict[str, Any]] = deque(maxlen=5)
         # Per-episode objects, populated by reset().
         self._state: Optional[ReviewState] = None
         self.grader: Optional[TaskGrader] = None
         # Turns score changes between steps into per-step rewards.
         self.reward_calculator = RewardCalculator()
+    def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> Dict[str, Any]:
+        if seed is not None:
+            self._seed = seed
+            random.seed(seed)
         if task_id is None:
             task_id = "bug_detection_easy_1"
         self.current_task_id = task_id
         task_data = TaskDefinitions.get_task(task_id)
         code_context = TaskDefinitions.create_code_context(task_data)
         task_metadata = TaskDefinitions.create_task_metadata(task_data)
         self._state = ReviewState(
             code_context=code_context,
             task_metadata=task_metadata,
             last_action_valid=True,
             last_error=None
         )
         self.grader = TaskGrader(task_metadata.expected_issues)
         self.reward_calculator.reset()
+        self._action_history.clear()
         return self._get_observation()
     def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, Dict[str, Any]]:
         """Apply one review action and advance the episode by one step.

         Args:
             action: Keyword arguments used to construct a ``ReviewAction``.

         Returns:
             A ``(observation, reward, done, info)`` tuple.

             * No episode (``reset()`` not called): empty observation, reward
               ``-0.1``, ``done=True`` and an ``"error"`` entry in ``info``.
             * Episode already complete: current observation, reward ``0.0``,
               ``done=True`` and an ``"error"`` entry in ``info``.
             * ``action`` cannot be turned into a ``ReviewAction``: reward
               ``-0.1``, ``done=False``; the step counter is not advanced.
             * Otherwise ``info`` carries ``step``, ``last_action_valid``,
               ``error``, ``task_score``, ``diagnostics`` and
               ``valid_actions``.
         """
         state = self._state
         if state is None:
             return {}, -0.1, True, {"error": "Environment not initialized. Call reset() first."}
         if state.is_complete:
             return self._get_observation(), 0.0, True, {"error": "Episode already complete"}
         grader = self.grader
         if grader is None:
             return self._get_observation(), -0.1, True, {"error": "Environment not initialized. Call reset() first."}

         try:
             review_action = ReviewAction(**action)
         except Exception as e:
             # NOTE(review): a malformed payload does not consume a step, so an
             # agent that only ever sends invalid actions never reaches
             # max_steps. Confirm this is intended before relying on the step
             # budget to terminate episodes.
             state.last_action_valid = False
             state.last_error = str(e)
             return self._get_observation(), -0.1, False, {"error": str(e), "last_action_valid": False}

         state.current_step += 1
         self._process_action(review_action)

         # Fill in the implicit decision of a terminal action BEFORE logging
         # it, so the history entry shows the decision that actually ends the
         # episode instead of None.
         action_type = review_action.action_type.value
         if not review_action.final_decision:
             if action_type == "approve":
                 review_action.final_decision = "approved"
             elif action_type == "request_changes":
                 review_action.final_decision = "changes_requested"

         self._action_history.append({
             "step": state.current_step,
             "action_type": action_type,
             "num_comments": len(review_action.comments),
             "num_suggestions": len(review_action.suggestions),
             "final_decision": review_action.final_decision,
         })

         # An explicit or implicit decision ends the episode.
         if review_action.final_decision and not state.is_complete:
             state.is_complete = True
             state.final_decision = review_action.final_decision

         # Running out of steps also ends it; an undecided review is treated
         # as a request for changes.
         if state.current_step >= self.max_steps and not state.is_complete:
             state.is_complete = True
             if not state.final_decision:
                 state.final_decision = "changes_requested"

         # While no decision exists yet, scoring assumes "changes_requested".
         decision_for_scoring = state.final_decision or "changes_requested"
         reward = self.reward_calculator.calculate_reward(
             review_action,
             state.comments_made,
             state.suggestions_made,
             decision_for_scoring,
             grader,
             state.last_action_valid,
             steps_taken=state.current_step,
             max_steps=self.max_steps,
         )
         diagnostics = grader.get_diagnostics(
             comments=state.comments_made,
             suggestions=state.suggestions_made,
             final_decision=decision_for_scoring,
             steps_taken=state.current_step,
             max_steps=self.max_steps,
         )
         info = {
             "step": state.current_step,
             "last_action_valid": state.last_action_valid,
             "error": state.last_error,
             "task_score": self.get_task_score(),
             "diagnostics": diagnostics,
             "valid_actions": self.valid_actions(),
         }
         return self._get_observation(), reward, state.is_complete, info
     def _process_action(self, action: ReviewAction):
         if self._state is None:
             return
         self._state.last_action_valid = True
         self._state.last_error = None
         if action.action_type.value == "add_comment":
             for comment in action.comments:
                 if comment.line_number <= self._state.code_context.line_count:
                 else:
                     self._state.last_action_valid = False
                     self._state.last_error = f"Line {comment.line_number} out of range"
         elif action.action_type.value == "suggest_fix":
             for suggestion in action.suggestions:
                 if suggestion.original_line <= self._state.code_context.line_count:
                 else:
                     self._state.last_action_valid = False
                     self._state.last_error = f"Line {suggestion.original_line} out of range"
         elif action.action_type.value == "mark_as_resolved":
+            if not self._state.comments_made:
+                self._state.last_action_valid = False
+                self._state.last_error = "No comments exist to mark as resolved"
+                return
             for comment in action.comments:
                 for existing_comment in self._state.comments_made:
                     if existing_comment.line_number == comment.line_number:
                         existing_comment.resolved = True
+    def valid_actions(self) -> List[str]:
+        if self._state is None:
+            return []
+        actions = ["add_comment", "approve", "request_changes"]
+        if self._state.comments_made:
+            actions.append("suggest_fix")
+            actions.append("mark_as_resolved")
+        return actions
     def _get_observation(self) -> Dict[str, Any]:
         if self._state is None:
             return {}
+        obs = Observation(
             code_diff=self._state.code_context.code_diff,
             file_context=self._state.code_context.surrounding_code,
             file_path=self._state.code_context.file_path,
             review_complete=self._state.is_complete,
             final_decision_made=self._state.final_decision
         ).model_dump()
+        obs["action_history"] = list(self._action_history)
+        obs["valid_actions"] = self.valid_actions()
+        obs["line_count"] = self._state.code_context.line_count
+        return obs
+    def render(self):
+        if self._state is None:
+            print("Environment not initialized.")
+            return
+        print("=" * 60)
+        print(f"FILE     : {self._state.code_context.file_path}")
+        print(f"LANGUAGE : {self._state.code_context.language}")
+        print(f"STEP     : {self._state.current_step}/{self.max_steps}")
+        print(f"DONE     : {self._state.is_complete}")
+        print(f"DECISION : {self._state.final_decision or 'pending'}")
+        print(f"SCORE    : {self.get_task_score():.3f}")
+        print("-" * 60)
+        print("CODE DIFF:")
+        for i, line in enumerate(self._state.code_context.code_diff.split("\n"), start=1):
+            print(f"  {i}: {line}")
+        print("-" * 60)
+        if self._state.comments_made:
+            print(f"COMMENTS ({len(self._state.comments_made)}):")
+            for c in self._state.comments_made:
+                print(f"  Line {c.line_number} [{c.severity}]: {c.content}")
+        if self._state.suggestions_made:
+            print(f"SUGGESTIONS ({len(self._state.suggestions_made)}):")
+            for s in self._state.suggestions_made:
+                print(f"  Line {s.original_line}: {s.suggested_code}")
+        print(f"VALID ACTIONS: {self.valid_actions()}")
+        print("=" * 60)
+    def summary(self) -> Dict[str, Any]:
+        if not self.grader or self._state is None:
+            return {}
+        diagnostics = self.grader.get_diagnostics(
+            comments=self._state.comments_made,
+            suggestions=self._state.suggestions_made,
+            final_decision=self._state.final_decision or "changes_requested",
+            steps_taken=self._state.current_step,
+            max_steps=self.max_steps,
+        )
+        print("\n--- Episode Summary ---")
+        print(f"  Task            : {self.current_task_id}")
+        print(f"  Steps taken     : {self._state.current_step}/{self.max_steps}")
+        print(f"  Final decision  : {self._state.final_decision or 'none'}")
+        print(f"  Score           : {diagnostics['score']}")
+        print(f"  Precision       : {diagnostics['precision']}")
+        print(f"  Recall          : {diagnostics['recall']}")
+        print(f"  F1              : {diagnostics['f1']}")
+        print(f"  True positives  : {diagnostics['true_positive_count']}")
+        print(f"  False positives : {diagnostics['false_positive_count']}")
+        print(f"  False negatives : {diagnostics['false_negative_count']}")
+        print(f"  FP penalty      : {diagnostics['false_positive_penalty']}")
+        print(f"  Efficiency bonus: {diagnostics['efficiency_bonus']}")
+        print("-----------------------")
+        return diagnostics
     def get_task_score(self) -> float:
         if not self.grader or self._state is None:
             return 0.0
             comments=self._state.comments_made,
             suggestions=self._state.suggestions_made,
             final_decision=self._state.final_decision or "changes_requested",
+            steps_taken=self._state.current_step,
+            max_steps=self.max_steps,
         )
     def close(self):
         """Close the environment; currently a no-op."""
         pass
     def state(self) -> Dict[str, Any]:
         if self._state:
             return self._state.model_dump()

environment/graders.py CHANGED Viewed

@@ -3,7 +3,7 @@ from environment.models import Comment, Suggestion, ReviewAction
 class TaskGrader:
     def __init__(self, expected_issues: List[Dict[str, Any]]):
         self.expected_issues = expected_issues
@@ -24,54 +24,105 @@ class TaskGrader:
             return True
         return expected_type in comment_text
     def grade_detection(self, comments: List[Comment]) -> float:
         if not self.expected_issues:
-            # No-issue tasks reward restraint and penalize false positives.
             issue_comments = [c for c in comments if c.is_issue]
             return 1.0 if not issue_comments else 0.0
         if not comments:
             return 0.0
-        matched_expected_indexes: Set[int] = set()
-        for idx, expected in enumerate(self.expected_issues):
             for comment in comments:
-                if not comment.is_issue:
-                    continue
-                if self._match_issue(expected, comment):
-                    matched_expected_indexes.add(idx)
-                    break
-        return len(matched_expected_indexes) / len(self.expected_issues)
     def grade_suggestions(self, suggestions: List[Suggestion]) -> float:
         if not self.expected_issues:
             return 1.0 if not suggestions else 0.0
         if not suggestions:
             return 0.0
         matched_expected_indexes: Set[int] = set()
         for idx, expected in enumerate(self.expected_issues):
             for suggestion in suggestions:
-                if suggestion.original_line == expected.get("line"):
                     matched_expected_indexes.add(idx)
                     break
         return min(1.0, len(matched_expected_indexes) / len(self.expected_issues))
     def grade_decision(self, final_decision: str) -> float:
         if not self.expected_issues:
             return 1.0 if final_decision == "approved" else 0.0
-        # If task includes expected issues, a safe review should request changes.
         return 1.0 if final_decision == "changes_requested" else 0.0
     def get_diagnostics(self,
                         comments: List[Comment],
                         suggestions: List[Suggestion],
-                        final_decision: str) -> Dict[str, Any]:
         issue_comments = [c for c in comments if c.is_issue]
         expected_count = len(self.expected_issues)
@@ -96,30 +147,16 @@ class TaskGrader:
             precision = true_positives / max(1, len(issue_comments))
             recall = true_positives / expected_count
-        severity_weights = {
-            "low": 0.25,
-            "medium": 0.5,
-            "high": 0.75,
-            "critical": 1.0,
-        }
-        weighted_found = 0.0
-        weighted_total = 0.0
-        for expected_idx, expected in enumerate(self.expected_issues):
-            weight = severity_weights.get(str(expected.get("severity", "medium")).lower(), 0.5)
-            weighted_total += weight
-            if expected_idx in matched_expected_indexes:
-                weighted_found += weight
-        severity_weighted_detection = 1.0 if weighted_total == 0 else (weighted_found / weighted_total)
         detection_score = self.grade_detection(comments)
         suggestion_score = self.grade_suggestions(suggestions)
         decision_score = self.grade_decision(final_decision)
-        false_positive_rate = false_positives / max(1, len(issue_comments))
-        false_positive_penalty = min(0.4, false_positive_rate * 0.25)
         raw_score = (detection_score * 0.4) + (suggestion_score * 0.3) + (decision_score * 0.3)
-        final_score = max(0.0, min(1.0, raw_score - false_positive_penalty))
         return {
             "expected_issue_count": expected_count,
@@ -128,61 +165,70 @@ class TaskGrader:
             "false_negative_count": false_negatives,
             "precision": round(precision, 4),
             "recall": round(recall, 4),
-            "severity_weighted_detection": round(severity_weighted_detection, 4),
             "detection_score": round(detection_score, 4),
             "suggestion_score": round(suggestion_score, 4),
             "decision_score": round(decision_score, 4),
             "false_positive_penalty": round(false_positive_penalty, 4),
             "score": round(final_score, 4),
         }
     def compute_score(self,
                       comments: List[Comment],
                       suggestions: List[Suggestion],
-                      final_decision: str) -> float:
-        diagnostics = self.get_diagnostics(comments, suggestions, final_decision)
         return float(diagnostics["score"])
     def compute_score_from_state(self,
                                  comments: List[Comment],
                                  suggestions: List[Suggestion],
-                                 final_decision: str) -> float:
-        return self.compute_score(comments, suggestions, final_decision)
 class RewardCalculator:
     def __init__(self):
         self.last_score = 0.0
-    def calculate_reward(self,
                          current_action: ReviewAction,
                          all_comments: List[Comment],
                          all_suggestions: List[Suggestion],
                          final_decision: str,
                          grader: TaskGrader,
-                         last_action_valid: bool) -> float:
         current_score = grader.compute_score(
             comments=all_comments,
             suggestions=all_suggestions,
             final_decision=final_decision,
         )
         reward = current_score - self.last_score
         if current_action.action_type.value in ["add_comment", "suggest_fix"]:
             reward += 0.03
         if not last_action_valid:
             reward -= 0.15
         if not current_action.comments and not current_action.suggestions:
             if current_action.action_type.value in ["approve", "request_changes"]:
                 pass
             else:
                 reward -= 0.1
         for comment in current_action.comments:
             if comment.severity == "critical":
                 reward += 0.2
@@ -190,19 +236,19 @@ class RewardCalculator:
                 reward += 0.1
             elif comment.severity == "medium":
                 reward += 0.05
         if len(current_action.suggestions) > 0:
             reward += 0.05 * len(current_action.suggestions)
         if current_action.final_decision:
             optimal_decision = "changes_requested" if grader.expected_issues else "approved"
             reward += 0.1 if current_action.final_decision == optimal_decision else -0.1
         reward = max(-0.5, min(1.0, reward))
         self.last_score = current_score
         return reward
     def reset(self):
         self.last_score = 0.0

 class TaskGrader:
     def __init__(self, expected_issues: List[Dict[str, Any]]):
         """Store the expected issues that reviews are graded against.

         Args:
             expected_issues: One dict per expected issue. The grading methods
                 read its ``"line"`` and ``"type"`` entries. An empty list
                 marks a task on which no issues are expected.
         """
         self.expected_issues = expected_issues
             return True
         return expected_type in comment_text
+    def _partial_credit(self, expected: Dict[str, Any], comment: Comment) -> float:
+        expected_line = int(expected.get("line", 0) or 0)
+        expected_type = self._normalize(expected.get("type", ""))
+        comment_text = self._normalize(comment.content)
+        if not comment.is_issue:
+            return 0.0
+        keyword_tokens = expected_type.replace("_", " ").split()
+        content_match = expected_type in comment_text or any(t in comment_text for t in keyword_tokens)
+        if comment.line_number == expected_line and content_match:
+            return 1.0
+        distance = abs(comment.line_number - expected_line)
+        if distance <= 2 and content_match:
+            return max(0.0, 1.0 - distance * 0.25)
+        if content_match:
+            return 0.2
+        return 0.0
     def grade_detection(self, comments: List[Comment]) -> float:
         """Return how much of the expected issue list the comments cover.

         On a task without expected issues the score is ``1.0`` when no
         comment is flagged as an issue and ``0.0`` otherwise. On a task with
         expected issues, every expected issue contributes the best partial
         credit any comment earns for it, and the result is the mean of those
         contributions, capped at ``1.0``.
         """
         expected_issues = self.expected_issues
         if not expected_issues:
             # Nothing to find: restraint is rewarded.
             flagged_any = any(comment.is_issue for comment in comments)
             return 0.0 if flagged_any else 1.0
         if not comments:
             return 0.0

         credit_sum = 0.0
         for issue in expected_issues:
             credits = [self._partial_credit(issue, comment) for comment in comments]
             # The 0.0 floor mirrors "no comment matched this issue".
             credit_sum += max([0.0] + credits)
         return min(1.0, credit_sum / len(expected_issues))
     def grade_suggestions(self, suggestions: List[Suggestion]) -> float:
         """Return the fraction of expected issues that received a nearby fix.

         An expected issue counts as covered when at least one suggestion
         targets its line or a directly adjacent line.

         Args:
             suggestions: All fix suggestions made during the review.

         Returns:
             On a task without expected issues, ``1.0`` when no suggestion was
             made and ``0.0`` otherwise. With expected issues, the covered
             fraction in ``[0.0, 1.0]`` (``0.0`` when there are no
             suggestions).
         """
         if not self.expected_issues:
             return 1.0 if not suggestions else 0.0
         if not suggestions:
             return 0.0

         covered = 0
         for expected in self.expected_issues:
             # Coerce the line the same way _partial_credit does: a missing,
             # None or string-valued "line" must not raise a TypeError in the
             # subtraction below.
             expected_line = int(expected.get("line", 0) or 0)
             if any(abs(s.original_line - expected_line) <= 1 for s in suggestions):
                 covered += 1
         return min(1.0, covered / len(self.expected_issues))
     def grade_decision(self, final_decision: str) -> float:
         """Return ``1.0`` when the final decision fits the task, else ``0.0``.

         A task with expected issues calls for ``"changes_requested"``; a
         task without any calls for ``"approved"``.
         """
         correct_decision = "changes_requested" if self.expected_issues else "approved"
         return 1.0 if final_decision == correct_decision else 0.0
+    def grade_false_positives(self, comments: List[Comment]) -> float:
+        if not self.expected_issues:
+            return 0.0
+        issue_comments = [c for c in comments if c.is_issue]
+        if not issue_comments:
+            return 0.0
+        matched_comment_indexes: Set[int] = set()
+        for expected in self.expected_issues:
+            for idx, comment in enumerate(issue_comments):
+                if self._partial_credit(expected, comment) > 0:
+                    matched_comment_indexes.add(idx)
+        false_positive_count = len(issue_comments) - len(matched_comment_indexes)
+        false_positive_rate = false_positive_count / max(1, len(issue_comments))
+        return min(0.4, false_positive_rate * 0.25)
+    def grade_efficiency(self, steps_taken: int, max_steps: int) -> float:
+        if max_steps <= 0:
+            return 0.0
+        ratio = steps_taken / max_steps
+        if ratio <= 0.1:
+            return 0.1
+        if ratio <= 0.2:
+            return 0.05
+        return 0.0
     def get_diagnostics(self,
                         comments: List[Comment],
                         suggestions: List[Suggestion],
+                        final_decision: str,
+                        steps_taken: int = 0,
+                        max_steps: int = 50) -> Dict[str, Any]:
         issue_comments = [c for c in comments if c.is_issue]
         expected_count = len(self.expected_issues)
             precision = true_positives / max(1, len(issue_comments))
             recall = true_positives / expected_count
+        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
         detection_score = self.grade_detection(comments)
         suggestion_score = self.grade_suggestions(suggestions)
         decision_score = self.grade_decision(final_decision)
+        false_positive_penalty = self.grade_false_positives(comments)
+        efficiency_bonus = self.grade_efficiency(steps_taken, max_steps)
         raw_score = (detection_score * 0.4) + (suggestion_score * 0.3) + (decision_score * 0.3)
+        final_score = max(0.0, min(1.0, raw_score - false_positive_penalty + efficiency_bonus))
         return {
             "expected_issue_count": expected_count,
             "false_negative_count": false_negatives,
             "precision": round(precision, 4),
             "recall": round(recall, 4),
+            "f1": round(f1, 4),
             "detection_score": round(detection_score, 4),
             "suggestion_score": round(suggestion_score, 4),
             "decision_score": round(decision_score, 4),
             "false_positive_penalty": round(false_positive_penalty, 4),
+            "efficiency_bonus": round(efficiency_bonus, 4),
             "score": round(final_score, 4),
         }
     def compute_score(self,
                       comments: List[Comment],
                       suggestions: List[Suggestion],
                       final_decision: str,
                       steps_taken: int = 0,
                       max_steps: int = 50) -> float:
         """Return the overall task score as a float.

         Runs ``get_diagnostics`` on the given review and returns its
         ``"score"`` entry.
         """
         return float(
             self.get_diagnostics(
                 comments, suggestions, final_decision, steps_taken, max_steps
             )["score"]
         )
     def compute_score_from_state(self,
                                  comments: List[Comment],
                                  suggestions: List[Suggestion],
                                  final_decision: str,
                                  steps_taken: int = 0,
                                  max_steps: int = 50) -> float:
         """Return the task score for the given review state.

         Thin wrapper that forwards all arguments, unchanged and in order,
         to ``compute_score``.
         """
         forwarded = (comments, suggestions, final_decision, steps_taken, max_steps)
         return self.compute_score(*forwarded)
 class RewardCalculator:
     def __init__(self):
         """Start with a previous score of zero."""
         # Score after the previous step; calculate_reward() subtracts it from
         # the current score to obtain the per-step change.
         self.last_score = 0.0
+    def calculate_reward(self,
                          current_action: ReviewAction,
                          all_comments: List[Comment],
                          all_suggestions: List[Suggestion],
                          final_decision: str,
                          grader: TaskGrader,
+                         last_action_valid: bool,
+                         steps_taken: int = 0,
+                         max_steps: int = 50) -> float:
         current_score = grader.compute_score(
             comments=all_comments,
             suggestions=all_suggestions,
             final_decision=final_decision,
+            steps_taken=steps_taken,
+            max_steps=max_steps,
         )
         reward = current_score - self.last_score
         if current_action.action_type.value in ["add_comment", "suggest_fix"]:
             reward += 0.03
         if not last_action_valid:
             reward -= 0.15
         if not current_action.comments and not current_action.suggestions:
             if current_action.action_type.value in ["approve", "request_changes"]:
                 pass
             else:
                 reward -= 0.1
         for comment in current_action.comments:
             if comment.severity == "critical":
                 reward += 0.2
                 reward += 0.1
             elif comment.severity == "medium":
                 reward += 0.05
         if len(current_action.suggestions) > 0:
             reward += 0.05 * len(current_action.suggestions)
         if current_action.final_decision:
             optimal_decision = "changes_requested" if grader.expected_issues else "approved"
             reward += 0.1 if current_action.final_decision == optimal_decision else -0.1
         reward = max(-0.5, min(1.0, reward))
         self.last_score = current_score
         return reward
     def reset(self):
         """Clear the remembered score so the next reward starts from zero."""
         self.last_score = 0.0

environment/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ from environment.models import (
     ReviewState,
     Observation
 )
 __all__ = [
     "CodeReviewEnv",
@@ -19,5 +20,6 @@ __all__ = [
     "CodeContext",
     "TaskMetadata",
     "ReviewState",
-    "Observation"
 ]

     ReviewState,
     Observation
 )
+from environment.tasks import TaskDefinitions
 __all__ = [
     "CodeReviewEnv",
     "CodeContext",
     "TaskMetadata",
     "ReviewState",
+    "Observation",
+    "TaskDefinitions",
 ]

environment/tasks.py CHANGED Viewed

@@ -9,11 +9,12 @@ class TaskDefinitions:
         "bug_detection_medium": "memory_leak_medium_1",
         "bug_detection_hard": "security_hard_1",
     }
     EASY_TASKS = [
         {
             "task_id": "bug_detection_easy_1",
             "task_name": "Division by Zero",
             "description": "Find the division by zero vulnerability in the calculate_average function",
             "code_diff": """def calculate_average(numbers):
     total = sum(numbers)
@@ -21,11 +22,11 @@ class TaskDefinitions:
             "surrounding_code": """class StatisticsCalculator:
     def __init__(self):
         self.results = []
     def calculate_average(self, numbers):
         total = sum(numbers)
         return total / len(numbers)
     def add_result(self, value):
         self.results.append(value)""",
             "file_path": "statistics.py",
@@ -43,6 +44,7 @@ class TaskDefinitions:
         {
             "task_id": "bug_detection_easy_2",
             "task_name": "Off-by-One Error",
             "description": "Find the off-by-one error in the array iteration",
             "code_diff": """def process_items(items):
     for i in range(len(items)):
@@ -70,6 +72,7 @@ class TaskDefinitions:
         {
             "task_id": "approve_easy_3",
             "task_name": "Approve Safe Refactor",
             "description": "No issues expected: approve this small readability refactor",
             "code_diff": """def normalize_name(name):
     cleaned = name.strip()
@@ -86,11 +89,12 @@ def format_username(user):
             "expected_issues": []
         }
     ]
     MEDIUM_TASKS = [
         {
             "task_id": "memory_leak_medium_1",
             "task_name": "File Handle Leak",
             "description": "Find the memory leak where file handles are not properly closed",
             "code_diff": """def read_files(file_list):
     contents = []
@@ -127,6 +131,7 @@ def write_output(data, filename):
         {
             "task_id": "performance_medium_2",
             "task_name": "Inefficient String Concatenation",
             "description": "Find the performance issue with string concatenation in a loop",
             "code_diff": """def build_string(items):
     result = ""
@@ -156,6 +161,7 @@ def format_output(data):
         {
             "task_id": "approve_medium_3",
             "task_name": "Approve Safe Query Helper",
             "description": "No issues expected: approve this query helper cleanup",
             "code_diff": """def build_user_query(limit):
     safe_limit = max(1, int(limit))
@@ -173,11 +179,12 @@ def run_user_query(db, limit):
             "expected_issues": []
         }
     ]
     HARD_TASKS = [
         {
             "task_id": "security_hard_1",
             "task_name": "SQL Injection Vulnerability",
             "description": "Find the SQL injection vulnerability in the database query",
             "code_diff": """def get_user_data(user_id):
     query = f"SELECT * FROM users WHERE id = {user_id}"
@@ -205,11 +212,12 @@ def get_all_users():
         {
             "task_id": "race_condition_hard_2",
             "task_name": "Race Condition",
             "description": "Find the race condition in the thread-safe counter",
             "code_diff": """class Counter:
     def __init__(self):
         self.count = 0
     def increment(self):
         current = self.count
         self.count = current + 1
@@ -219,12 +227,12 @@ def get_all_users():
 class Counter:
     def __init__(self):
         self.count = 0
     def increment(self):
         current = self.count
         self.count = current + 1
         return self.count
     def get_count(self):
         return self.count""",
             "file_path": "counter.py",
@@ -242,6 +250,7 @@ class Counter:
         {
             "task_id": "approve_hard_3",
             "task_name": "Approve Thread-Safe Counter",
             "description": "No issues expected: approve this lock-based concurrency fix",
             "code_diff": """class Counter:
     def __init__(self):
@@ -269,7 +278,7 @@ class Counter:
             "expected_issues": []
         }
     ]
     @classmethod
     def get_task(cls, task_id: str) -> Dict[str, Any]:
         canonical_task_id = cls.TASK_ALIASES.get(task_id, task_id)
@@ -277,12 +286,13 @@ class Counter:
         for task in all_tasks:
             if task["task_id"] == canonical_task_id:
                 return task
         return cls.EASY_TASKS[0]
     @classmethod
     def get_all_tasks(cls) -> List[Dict[str, Any]]:
         # Concatenate the three class-level task lists in easy, medium, hard
         # order; `+` builds a new list, so the class attributes are untouched.
         return cls.EASY_TASKS + cls.MEDIUM_TASKS + cls.HARD_TASKS
     @classmethod
     def get_tasks_by_difficulty(cls, difficulty: str) -> List[Dict[str, Any]]:
         if difficulty == "easy":
@@ -292,7 +302,7 @@ class Counter:
         elif difficulty == "hard":
             return cls.HARD_TASKS
         return []
     @classmethod
     def create_code_context(cls, task_data: Dict[str, Any]) -> CodeContext:
         return CodeContext(
@@ -303,15 +313,11 @@ class Counter:
             language=task_data["language"],
             line_count=task_data["line_count"]
         )
     @classmethod
     def create_task_metadata(cls, task_data: Dict[str, Any]) -> TaskMetadata:
-        difficulty = "easy"
-        if "medium" in task_data["task_id"]:
-            difficulty = "medium"
-        elif "hard" in task_data["task_id"]:
-            difficulty = "hard"
         return TaskMetadata(
             task_id=task_data["task_id"],
             task_name=task_data["task_name"],

         "bug_detection_medium": "memory_leak_medium_1",
         "bug_detection_hard": "security_hard_1",
     }
     EASY_TASKS = [
         {
             "task_id": "bug_detection_easy_1",
             "task_name": "Division by Zero",
+            "difficulty": "easy",
             "description": "Find the division by zero vulnerability in the calculate_average function",
             "code_diff": """def calculate_average(numbers):
     total = sum(numbers)
             "surrounding_code": """class StatisticsCalculator:
     def __init__(self):
         self.results = []
     def calculate_average(self, numbers):
         total = sum(numbers)
         return total / len(numbers)
     def add_result(self, value):
         self.results.append(value)""",
             "file_path": "statistics.py",
         {
             "task_id": "bug_detection_easy_2",
             "task_name": "Off-by-One Error",
+            "difficulty": "easy",
             "description": "Find the off-by-one error in the array iteration",
             "code_diff": """def process_items(items):
     for i in range(len(items)):
         {
             "task_id": "approve_easy_3",
             "task_name": "Approve Safe Refactor",
+            "difficulty": "easy",
             "description": "No issues expected: approve this small readability refactor",
             "code_diff": """def normalize_name(name):
     cleaned = name.strip()
             "expected_issues": []
         }
     ]
     MEDIUM_TASKS = [
         {
             "task_id": "memory_leak_medium_1",
             "task_name": "File Handle Leak",
+            "difficulty": "medium",
             "description": "Find the memory leak where file handles are not properly closed",
             "code_diff": """def read_files(file_list):
     contents = []
         {
             "task_id": "performance_medium_2",
             "task_name": "Inefficient String Concatenation",
+            "difficulty": "medium",
             "description": "Find the performance issue with string concatenation in a loop",
             "code_diff": """def build_string(items):
     result = ""
         {
             "task_id": "approve_medium_3",
             "task_name": "Approve Safe Query Helper",
+            "difficulty": "medium",
             "description": "No issues expected: approve this query helper cleanup",
             "code_diff": """def build_user_query(limit):
     safe_limit = max(1, int(limit))
             "expected_issues": []
         }
     ]
     HARD_TASKS = [
         {
             "task_id": "security_hard_1",
             "task_name": "SQL Injection Vulnerability",
+            "difficulty": "hard",
             "description": "Find the SQL injection vulnerability in the database query",
             "code_diff": """def get_user_data(user_id):
     query = f"SELECT * FROM users WHERE id = {user_id}"
         {
             "task_id": "race_condition_hard_2",
             "task_name": "Race Condition",
+            "difficulty": "hard",
             "description": "Find the race condition in the thread-safe counter",
             "code_diff": """class Counter:
     def __init__(self):
         self.count = 0
     def increment(self):
         current = self.count
         self.count = current + 1
 class Counter:
     def __init__(self):
         self.count = 0
     def increment(self):
         current = self.count
         self.count = current + 1
         return self.count
     def get_count(self):
         return self.count""",
             "file_path": "counter.py",
         {
             "task_id": "approve_hard_3",
             "task_name": "Approve Thread-Safe Counter",
+            "difficulty": "hard",
             "description": "No issues expected: approve this lock-based concurrency fix",
             "code_diff": """class Counter:
     def __init__(self):
             "expected_issues": []
         }
     ]
     @classmethod
     def get_task(cls, task_id: str) -> Dict[str, Any]:
         canonical_task_id = cls.TASK_ALIASES.get(task_id, task_id)
         for task in all_tasks:
             if task["task_id"] == canonical_task_id:
                 return task
+        print(f"WARNING: task_id '{task_id}' not found, falling back to bug_detection_easy_1")
         return cls.EASY_TASKS[0]
     @classmethod
     def get_all_tasks(cls) -> List[Dict[str, Any]]:
         # Concatenate the three class-level task lists in easy, medium, hard
         # order; `+` builds a new list, so the class attributes are untouched.
         return cls.EASY_TASKS + cls.MEDIUM_TASKS + cls.HARD_TASKS
     @classmethod
     def get_tasks_by_difficulty(cls, difficulty: str) -> List[Dict[str, Any]]:
         if difficulty == "easy":
         elif difficulty == "hard":
             return cls.HARD_TASKS
         return []
     @classmethod
     def create_code_context(cls, task_data: Dict[str, Any]) -> CodeContext:
         return CodeContext(
             language=task_data["language"],
             line_count=task_data["line_count"]
         )
     @classmethod
     def create_task_metadata(cls, task_data: Dict[str, Any]) -> TaskMetadata:
+        difficulty = task_data.get("difficulty", "easy")
         return TaskMetadata(
             task_id=task_data["task_id"],
             task_name=task_data["task_name"],

inference.py CHANGED Viewed

@@ -6,7 +6,7 @@ import os
 import json
 import argparse
 import sys
-from typing import Dict, Any
 from openai import OpenAI
 API_BASE_URL = os.environ.get("API_BASE_URL", "")
@@ -72,9 +72,8 @@ class LLMClient:
         print(f"Endpoint: {self.base_url}")
         print(f"Model: {self.model}\n")
-    def chat_completion(self, messages: list, temperature: float = 0.7, max_tokens: int = 2000) -> str:
         last_error = None
-        # Retry once for flaky local-model responses.
         for _ in range(2):
             try:
                 completion = self.client.chat.completions.create(
@@ -132,69 +131,34 @@ class CodeReviewAgent:
         if " / len(" in code_diff:
             line = self._line_number(code_diff, " / len(", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Possible division_by_zero when list is empty before dividing by len(...).",
-                "is_issue": True,
-                "severity": "high",
-            }
         if "open(" in code_diff and ".read(" in code_diff and "with open" not in code_diff:
             line = self._line_number(code_diff, "open(", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Potential resource_leak: file handle opened without context manager or explicit close().",
-                "is_issue": True,
-                "severity": "high",
-            }
         if "SELECT" in code_diff and "{" in code_diff and "}" in code_diff:
             line = self._line_number(code_diff, "SELECT", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Potential sql_injection due to string interpolation in SQL query.",
-                "is_issue": True,
-                "severity": "critical",
-            }
         if "i + 1" in code_diff and "range(len(" in code_diff:
             line = self._line_number(code_diff, "i + 1", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Potential index_error: i + 1 can go out of bounds on the last iteration.",
-                "is_issue": True,
-                "severity": "medium",
-            }
         if "result = result +" in code_diff:
             line = self._line_number(code_diff, "result = result +", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Potential performance issue from repeated string concatenation in a loop.",
-                "is_issue": True,
-                "severity": "medium",
-            }
         if "current = self.count" in code_diff and "self.count = current + 1" in code_diff:
             line = self._line_number(code_diff, "self.count = current + 1", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "line_number": line,
-                "content": "Potential race_condition: increment is not atomic without synchronization.",
-                "is_issue": True,
-                "severity": "high",
-            }
-        return {
-            "line_number": 1,
-            "content": "Potential correctness issue requires manual validation.",
-            "is_issue": True,
-            "severity": "low",
-        }
     def _heuristic_suggestion(self, observation: Dict[str, Any]) -> Dict[str, Any]:
         code_diff = observation.get("code_diff", "")
@@ -202,62 +166,34 @@ class CodeReviewAgent:
         if " / len(" in code_diff:
             line = self._line_number(code_diff, " / len(", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "return total / len(numbers) if numbers else 0",
-                "explanation": "Guard against empty input before division.",
-            }
         if "open(" in code_diff and ".read(" in code_diff and "with open" not in code_diff:
             line = self._line_number(code_diff, "open(", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "with open(filename, 'r') as f:\n        data = f.read()",
-                "explanation": "Use a context manager so file handles are always closed.",
-            }
         if "SELECT" in code_diff and "{" in code_diff and "}" in code_diff:
             line = self._line_number(code_diff, "SELECT", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "query = \"SELECT * FROM users WHERE id = ?\"\nreturn database.execute(query, [user_id])",
-                "explanation": "Use parameterized queries to prevent SQL injection.",
-            }
         if "i + 1" in code_diff and "range(len(" in code_diff:
             line = self._line_number(code_diff, "i + 1", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "for i in range(len(items) - 1):\n    item = items[i]\n    next_item = items[i + 1]\n    process_pair(item, next_item)",
-                "explanation": "Stop one element early to avoid indexing past the array end.",
-            }
         if "result = result +" in code_diff:
             line = self._line_number(code_diff, "result = result +", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "return \",\".join(items)",
-                "explanation": "join() avoids quadratic-time string concatenation.",
-            }
         if "current = self.count" in code_diff and "self.count = current + 1" in code_diff:
             line = self._line_number(code_diff, "self.count = current + 1", 1)
             line = self._task_expected_line(observation, line)
-            return {
-                "original_line": line,
-                "suggested_code": "with self._lock:\n    self.count += 1\n    return self.count",
-                "explanation": "Protect shared state with a lock for thread safety.",
-            }
-        return {
-            "original_line": 1,
-            "suggested_code": "# apply targeted fix here",
-            "explanation": "Provide a minimal fix for the identified issue.",
-        }
     def _coerce_action_for_phase(self, action_data: Dict[str, Any], observation: Dict[str, Any]) -> Dict[str, Any]:
         phase = self.phase
@@ -265,39 +201,19 @@ class CodeReviewAgent:
         if phase == 1:
             if no_issue_task:
-                return {
-                    "action_type": "add_comment",
-                    "comments": [],
-                    "suggestions": [],
-                    "final_decision": None,
-                }
             comments = action_data.get("comments") or []
             if action_data.get("action_type") != "add_comment" or not comments:
                 comments = [self._heuristic_comment(observation)]
-            return {
-                "action_type": "add_comment",
-                "comments": comments,
-                "suggestions": [],
-                "final_decision": None,
-            }
         if phase == 2:
             if no_issue_task:
-                return {
-                    "action_type": "suggest_fix",
-                    "comments": [],
-                    "suggestions": [],
-                    "final_decision": None,
-                }
             suggestions = action_data.get("suggestions") or []
             if action_data.get("action_type") != "suggest_fix" or not suggestions:
                 suggestions = [self._heuristic_suggestion(observation)]
-            return {
-                "action_type": "suggest_fix",
-                "comments": [],
-                "suggestions": suggestions,
-                "final_decision": None,
-            }
         prior_comments = observation.get("previous_comments", [])
         prior_suggestions = observation.get("previous_suggestions", [])
@@ -309,6 +225,11 @@ class CodeReviewAgent:
             "final_decision": final_decision,
         }
     def get_action(self, observation: Dict[str, Any]) -> str:
         system_prompt = """You are an expert code reviewer. You MUST follow this exact sequence:
@@ -360,6 +281,8 @@ Respond ONLY with a valid JSON object, no extra text:
             for s in prev_suggestions
         ]) or "None yet"
         if self.phase == 1:
             phase_instruction = """
 YOUR TASK NOW (Phase 1 - Add Comments):
@@ -397,6 +320,7 @@ File Context:
 {observation.get('file_context', '')}
 Current Step: {observation.get('current_step', 0)}/{observation.get('max_steps', 50)}
 Comments already made:
 {comments_text}
@@ -438,7 +362,6 @@ Respond with JSON only.
                 action_data["suggestions"] = []
             action_data = self._coerce_action_for_phase(action_data, observation)
             self.phase += 1
             return json.dumps(action_data)
@@ -478,41 +401,18 @@ Respond with JSON only.
             return {"action_type": "request_changes", "comments": [], "suggestions": []}
-def main():
-    sys.path.append('.')
-    try:
-        from environment.env import CodeReviewEnv
-    except ImportError as e:
-        print(f"Failed to import environment: {e}")
-        print("Make sure you're in the correct directory and environment is installed.")
-        sys.exit(1)
-    parser = argparse.ArgumentParser(description="Run code review agent")
-    parser.add_argument("--task-id", type=str, default="bug_detection_easy_1")
-    parser.add_argument("--max-steps", type=int, default=50)
-    parser.add_argument("--output", type=str, default="baseline_results.json")
-    args = parser.parse_args()
-    print("=" * 60)
-    print("Code Review Agent")
-    print("=" * 60)
-    env = CodeReviewEnv()
-    env.max_steps = args.max_steps
-    agent = CodeReviewAgent()
-    obs = env.reset(task_id=args.task_id)
     done = False
     step = 0
     total_reward = 0.0
-    print(f"\nTask    : {args.task_id}")
     print(f"Desc    : {obs.get('task_description', 'N/A')}")
-    print(f"Model   : {MODEL_NAME}")
     print("-" * 60)
-    while not done and step < args.max_steps:
         action_str = agent.get_action(obs)
         action = agent.parse_action(action_str)
         action = agent.validate_action(action, obs)
@@ -521,7 +421,7 @@ def main():
         total_reward += reward
         step += 1
-        print(f"\nStep {step}/{args.max_steps}:")
         print(f"  Phase       : {agent.phase - 1}")
         print(f"  Action      : {action.get('action_type')}")
         print(f"  Comments    : {len(action.get('comments', []))}")
@@ -529,38 +429,120 @@ def main():
         print(f"  Reward      : {reward:.3f}")
         print(f"  Total       : {total_reward:.3f}")
         print(f"  Score       : {info.get('task_score', 0):.3f}")
         if info.get('last_action_valid') is False:
             print(f"  Warning     : {info.get('error', 'Invalid action')}")
     final_score = env.get_task_score()
-    print("\n" + "=" * 60)
-    print("Final Results:")
-    print(f"  Task         : {args.task_id}")
-    print(f"  Total Reward : {total_reward:.3f}")
-    print(f"  Task Score   : {final_score:.3f}/1.0")
-    print(f"  Steps        : {step}")
-    print("=" * 60)
-    env.close()
-    results = {
-        "task_id": args.task_id,
         "total_reward": round(total_reward, 4),
         "task_score": round(final_score, 4),
         "steps": step,
-        "max_steps": args.max_steps,
-        "provider": "openai-client",
         "model": MODEL_NAME,
-        "api_base_url": API_BASE_URL
     }
-    with open(args.output, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"\nResults saved to {args.output}")
 if __name__ == "__main__":
-    main()

 import json
 import argparse
 import sys
+from typing import Dict, Any, List
 from openai import OpenAI
 API_BASE_URL = os.environ.get("API_BASE_URL", "")
         print(f"Endpoint: {self.base_url}")
         print(f"Model: {self.model}\n")
+    def chat_completion(self, messages: list, temperature: float = 0.0, max_tokens: int = 2000) -> str:
         last_error = None
         for _ in range(2):
             try:
                 completion = self.client.chat.completions.create(
         if " / len(" in code_diff:
             line = self._line_number(code_diff, " / len(", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Possible division_by_zero when list is empty before dividing by len(...).", "is_issue": True, "severity": "high"}
         if "open(" in code_diff and ".read(" in code_diff and "with open" not in code_diff:
             line = self._line_number(code_diff, "open(", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Potential resource_leak: file handle opened without context manager or explicit close().", "is_issue": True, "severity": "high"}
         if "SELECT" in code_diff and "{" in code_diff and "}" in code_diff:
             line = self._line_number(code_diff, "SELECT", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Potential sql_injection due to string interpolation in SQL query.", "is_issue": True, "severity": "critical"}
         if "i + 1" in code_diff and "range(len(" in code_diff:
             line = self._line_number(code_diff, "i + 1", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Potential index_error: i + 1 can go out of bounds on the last iteration.", "is_issue": True, "severity": "medium"}
         if "result = result +" in code_diff:
             line = self._line_number(code_diff, "result = result +", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Potential performance issue from repeated string concatenation in a loop.", "is_issue": True, "severity": "medium"}
         if "current = self.count" in code_diff and "self.count = current + 1" in code_diff:
             line = self._line_number(code_diff, "self.count = current + 1", 1)
             line = self._task_expected_line(observation, line)
+            return {"line_number": line, "content": "Potential race_condition: increment is not atomic without synchronization.", "is_issue": True, "severity": "high"}
+        return {"line_number": 1, "content": "Potential correctness issue requires manual validation.", "is_issue": True, "severity": "low"}
     def _heuristic_suggestion(self, observation: Dict[str, Any]) -> Dict[str, Any]:
         code_diff = observation.get("code_diff", "")
         if " / len(" in code_diff:
             line = self._line_number(code_diff, " / len(", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "return total / len(numbers) if numbers else 0", "explanation": "Guard against empty input before division."}
         if "open(" in code_diff and ".read(" in code_diff and "with open" not in code_diff:
             line = self._line_number(code_diff, "open(", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "with open(filename, 'r') as f:\n        data = f.read()", "explanation": "Use a context manager so file handles are always closed."}
         if "SELECT" in code_diff and "{" in code_diff and "}" in code_diff:
             line = self._line_number(code_diff, "SELECT", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "query = \"SELECT * FROM users WHERE id = ?\"\nreturn database.execute(query, [user_id])", "explanation": "Use parameterized queries to prevent SQL injection."}
         if "i + 1" in code_diff and "range(len(" in code_diff:
             line = self._line_number(code_diff, "i + 1", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "for i in range(len(items) - 1):\n    item = items[i]\n    next_item = items[i + 1]\n    process_pair(item, next_item)", "explanation": "Stop one element early to avoid indexing past the array end."}
         if "result = result +" in code_diff:
             line = self._line_number(code_diff, "result = result +", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "return \",\".join(items)", "explanation": "join() avoids quadratic-time string concatenation."}
         if "current = self.count" in code_diff and "self.count = current + 1" in code_diff:
             line = self._line_number(code_diff, "self.count = current + 1", 1)
             line = self._task_expected_line(observation, line)
+            return {"original_line": line, "suggested_code": "with self._lock:\n    self.count += 1\n    return self.count", "explanation": "Protect shared state with a lock for thread safety."}
+        return {"original_line": 1, "suggested_code": "# apply targeted fix here", "explanation": "Provide a minimal fix for the identified issue."}
     def _coerce_action_for_phase(self, action_data: Dict[str, Any], observation: Dict[str, Any]) -> Dict[str, Any]:
         phase = self.phase
         if phase == 1:
             if no_issue_task:
+                return {"action_type": "add_comment", "comments": [], "suggestions": [], "final_decision": None}
             comments = action_data.get("comments") or []
             if action_data.get("action_type") != "add_comment" or not comments:
                 comments = [self._heuristic_comment(observation)]
+            return {"action_type": "add_comment", "comments": comments, "suggestions": [], "final_decision": None}
         if phase == 2:
             if no_issue_task:
+                return {"action_type": "suggest_fix", "comments": [], "suggestions": [], "final_decision": None}
             suggestions = action_data.get("suggestions") or []
             if action_data.get("action_type") != "suggest_fix" or not suggestions:
                 suggestions = [self._heuristic_suggestion(observation)]
+            return {"action_type": "suggest_fix", "comments": [], "suggestions": suggestions, "final_decision": None}
         prior_comments = observation.get("previous_comments", [])
         prior_suggestions = observation.get("previous_suggestions", [])
             "final_decision": final_decision,
         }
+    def reset(self):
         # Put the agent's mutable state back to its starting values: phase 1,
         # model_unavailable cleared, and an empty history list.
+        self.phase = 1
+        self.model_unavailable = False
+        self.history = []
     def get_action(self, observation: Dict[str, Any]) -> str:
         system_prompt = """You are an expert code reviewer. You MUST follow this exact sequence:
             for s in prev_suggestions
         ]) or "None yet"
+        valid_actions = observation.get("valid_actions", [])
         if self.phase == 1:
             phase_instruction = """
 YOUR TASK NOW (Phase 1 - Add Comments):
 {observation.get('file_context', '')}
 Current Step: {observation.get('current_step', 0)}/{observation.get('max_steps', 50)}
+Valid Actions: {valid_actions}
 Comments already made:
 {comments_text}
                 action_data["suggestions"] = []
             action_data = self._coerce_action_for_phase(action_data, observation)
             self.phase += 1
             return json.dumps(action_data)
             return {"action_type": "request_changes", "comments": [], "suggestions": []}
+def run_episode(env, agent, task_id: str, max_steps: int) -> Dict[str, Any]:
+    agent.reset()
+    obs = env.reset(task_id=task_id)
     done = False
     step = 0
     total_reward = 0.0
+    print(f"\nTask    : {task_id}")
     print(f"Desc    : {obs.get('task_description', 'N/A')}")
     print("-" * 60)
+    while not done and step < max_steps:
         action_str = agent.get_action(obs)
         action = agent.parse_action(action_str)
         action = agent.validate_action(action, obs)
         total_reward += reward
         step += 1
+        print(f"\nStep {step}/{max_steps}:")
         print(f"  Phase       : {agent.phase - 1}")
         print(f"  Action      : {action.get('action_type')}")
         print(f"  Comments    : {len(action.get('comments', []))}")
         print(f"  Reward      : {reward:.3f}")
         print(f"  Total       : {total_reward:.3f}")
         print(f"  Score       : {info.get('task_score', 0):.3f}")
+        print(f"  Valid Actions: {info.get('valid_actions', [])}")
         if info.get('last_action_valid') is False:
             print(f"  Warning     : {info.get('error', 'Invalid action')}")
     final_score = env.get_task_score()
+    diagnostics = env.summary()
+    return {
+        "task_id": task_id,
         "total_reward": round(total_reward, 4),
         "task_score": round(final_score, 4),
         "steps": step,
+        "max_steps": max_steps,
+        "precision": diagnostics.get("precision", 0),
+        "recall": diagnostics.get("recall", 0),
+        "f1": diagnostics.get("f1", 0),
+        "false_positive_count": diagnostics.get("false_positive_count", 0),
+        "efficiency_bonus": diagnostics.get("efficiency_bonus", 0),
         "model": MODEL_NAME,
+        "api_base_url": API_BASE_URL,
+    }
def run_batch(env, agent, task_ids: List[str], max_steps: int, output: str) -> None:
    """Run one episode per task, print aggregate metrics, and save them as JSON.

    Args:
        env: Environment instance, passed through to ``run_episode``.
        agent: Agent instance, passed through to ``run_episode``.
        task_ids: Identifiers of the tasks to evaluate, in order. May be empty.
        max_steps: Per-episode step limit, passed through to ``run_episode``.
        output: Path of the JSON file that receives the summary and the
            per-task results.
    """
    print("=" * 60)
    print(f"Batch Evaluation: {len(task_ids)} tasks")
    print("=" * 60)

    all_results = [
        run_episode(env, agent, task_id, max_steps) for task_id in task_ids
    ]

    def _mean(key: str) -> float:
        # An empty batch (e.g. a difficulty filter that matched no task)
        # reports 0.0 instead of raising ZeroDivisionError.
        if not all_results:
            return 0.0
        return sum(r[key] for r in all_results) / len(all_results)

    avg_score = _mean("task_score")
    avg_reward = _mean("total_reward")
    avg_f1 = _mean("f1")

    print("\n" + "=" * 60)
    print("Batch Results:")
    print(f"  Tasks evaluated : {len(all_results)}")
    print(f"  Avg Task Score  : {avg_score:.3f}")
    print(f"  Avg Reward      : {avg_reward:.3f}")
    print(f"  Avg F1          : {avg_f1:.3f}")
    print("=" * 60)

    batch_output = {
        "summary": {
            "total_tasks": len(all_results),
            "avg_task_score": round(avg_score, 4),
            "avg_total_reward": round(avg_reward, 4),
            "avg_f1": round(avg_f1, 4),
            "model": MODEL_NAME,
        },
        "results": all_results,
    }
    # Explicit encoding keeps the file independent of the platform default.
    with open(output, "w", encoding="utf-8") as f:
        json.dump(batch_output, f, indent=2)
    print(f"\nBatch results saved to {output}")
def main():
    """Command-line entry point: run the agent on one task or on a batch.

    Flags: ``--task-id``, ``--max-steps``, ``--output``, ``--batch`` and
    ``--difficulty``. With ``--batch`` the task list comes from
    ``TaskDefinitions`` (optionally filtered by difficulty) and is handed to
    ``run_batch``; otherwise a single episode is run and its result is
    printed and written to the ``--output`` path as JSON.

    Exits with status 1 if the ``environment`` package cannot be imported.
    """
    # Add the current directory to the import path before importing the
    # local ``environment`` package.
    sys.path.append('.')
    try:
        from environment.env import CodeReviewEnv
    except ImportError as e:
        print(f"Failed to import environment: {e}")
        print("Make sure you're in the correct directory and environment is installed.")
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Run code review agent")
    parser.add_argument("--task-id", type=str, default="bug_detection_easy_1")
    parser.add_argument("--max-steps", type=int, default=50)
    parser.add_argument("--output", type=str, default="baseline_results.json")
    parser.add_argument("--batch", action="store_true", help="Run all tasks in batch mode")
    parser.add_argument("--difficulty", type=str, default=None, help="Filter batch by difficulty: easy, medium, hard")
    args = parser.parse_args()

    print("=" * 60)
    print("Code Review Agent")
    print("=" * 60)

    env = CodeReviewEnv()
    # From here on, always close the environment, even when the agent, an
    # episode, or writing the results raises.
    try:
        env.max_steps = args.max_steps
        agent = CodeReviewAgent()
        if args.batch:
            from environment.tasks import TaskDefinitions
            if args.difficulty:
                task_ids = [t["task_id"] for t in TaskDefinitions.get_tasks_by_difficulty(args.difficulty)]
            else:
                task_ids = [t["task_id"] for t in TaskDefinitions.get_all_tasks()]
            run_batch(env, agent, task_ids, args.max_steps, args.output)
        else:
            result = run_episode(env, agent, args.task_id, args.max_steps)
            print("\n" + "=" * 60)
            print("Final Results:")
            print(f"  Task         : {result['task_id']}")
            print(f"  Total Reward : {result['total_reward']:.3f}")
            print(f"  Task Score   : {result['task_score']:.3f}/1.0")
            print(f"  Steps        : {result['steps']}")
            print("=" * 60)
            # Explicit encoding keeps the file independent of the platform default.
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2)
            print(f"\nResults saved to {args.output}")
    finally:
        env.close()


if __name__ == "__main__":
    main()