codemaverick2 committed on
Commit
78f3eb2
·
1 Parent(s): e48a1e4

Add diversity/exploration bonuses, near-miss type check, context truncation

Browse files
README.md CHANGED
@@ -208,7 +208,9 @@ Near-miss (±3-5 lines): graduated partial credit via exponential decay
208
 | TP + early (first 40% of steps) | +0.02 bonus |
209
 | TP + high confidence (≥0.7) | +0.01 bonus |
210
 | PBRS potential shaping (Φ(s')−Φ(s)) | +0.03–0.08 |
211
- | Near-miss (±3-5 lines, exponential decay) | +0.020–0.055 |


212
 | False positive | −0.05 |
213
 | False positive flood (4th+ FP) | escalating −0.03 extra |
214
 | High-confidence FP | −0.03 extra |
@@ -220,7 +222,9 @@ Near-miss (±3-5 lines): graduated partial credit via exponential decay
220
 ### Reward shaping foundations
221
 
222
 - **Potential-Based Reward Shaping** (Ng et al. 1999): Φ(s) = (tp/total_gt) × 0.5. Policy-invariant shaping that improves sample efficiency without changing the optimal policy.
223
- - **Graduated near-miss** (exponential decay): reward = 0.10 × e^(−0.6 × (line_diff − 2)) for lines 3-5 off. Gives smooth gradient signal for line-number refinement.
 
 
224
  - **Variable-Length Return Normalization** (VL Norm 2025): normalized_return = cumulative_reward / steps_used. Makes return comparable across tasks of different lengths.
225
  - **Flood protection**: escalating FP penalty prevents reward hacking via flag-spamming.
226
 
 
208
  | TP + early (first 40% of steps) | +0.02 bonus |
209
 | TP + high confidence (≥0.7) | +0.01 bonus |
210
 | PBRS potential shaping (Φ(s')−Φ(s)) | +0.03–0.08 |
211
+ | Diversity bonus (first TP in new issue category) | +0.02 |
212
+ | Exploration bonus (first TP in new file, multi-file tasks) | +0.01 |
213
+ | Near-miss (±3-5 lines, compatible type, exp decay) | +0.020–0.055 |
214
 | False positive | −0.05 |
215
 | False positive flood (4th+ FP) | escalating −0.03 extra |
216
 | High-confidence FP | −0.03 extra |
 
222
  ### Reward shaping foundations
223
 
224
 - **Potential-Based Reward Shaping** (Ng et al. 1999): Φ(s) = (tp/total_gt) × 0.5. Policy-invariant shaping that improves sample efficiency without changing the optimal policy.
225
+ - **Graduated near-miss** (exponential decay): reward = 0.10 × e^(−0.6 × (line_diff − 2)) for lines 3-5 off with compatible issue type. Gives smooth gradient signal for line-number refinement.
226
+ - **Diversity bonus**: +0.02 for first TP in a new issue category (security/bug/performance). Encourages covering all issue types instead of spamming one.
227
+ - **Exploration bonus**: +0.01 for first TP in a new file (multi-file tasks only). Encourages cross-file coverage.
228
  - **Variable-Length Return Normalization** (VL Norm 2025): normalized_return = cumulative_reward / steps_used. Makes return comparable across tasks of different lengths.
229
  - **Flood protection**: escalating FP penalty prevents reward hacking via flag-spamming.
230
 
inference.py CHANGED
@@ -404,6 +404,12 @@ def run_task(task_id: str, http_client: httpx.Client) -> dict:
404
  if combined_feedback:
405
  messages.append({"role": "user", "content": combined_feedback})
406
 
 
 
 
 
 
 
407
  atype = action.get("action_type", "")
408
  print(f" Step {step_count:2d}: {atype:20s} | reward={str(last_reward):8s} | score={obs.get('current_score', 0.0):.3f}")
409
 
 
404
  if combined_feedback:
405
  messages.append({"role": "user", "content": combined_feedback})
406
 
407
+ # Context window management: keep system + initial prompt + last 12 exchanges
408
+ # This prevents token limit errors on long episodes (25+ steps)
409
+ max_history = 2 + 24 # system + initial user + 12 assistant/user pairs
410
+ if len(messages) > max_history:
411
+ messages = messages[:2] + messages[-(max_history - 2):]
412
+
413
  atype = action.get("action_type", "")
414
  print(f" Step {step_count:2d}: {atype:20s} | reward={str(last_reward):8s} | score={obs.get('current_score', 0.0):.3f}")
415
 
server/environment.py CHANGED
@@ -44,6 +44,10 @@ _VALIDATION_PENALTY = -0.02
44
  # Flood protection: escalating FP penalty
45
  _FP_FLOOD_THRESHOLD = 3 # FPs before escalation kicks in
46
  _FP_FLOOD_MULTIPLIER = 1.5 # each extra FP beyond threshold costs 1.5x more
 
 
 
 
47
 
48
  _SEV_RANK = {"low": 0, "medium": 1, "high": 2, "critical": 3}
49
 
@@ -80,6 +84,8 @@ class CodeReviewEnvironment(_BaseEnv):
80
  self._fp_count: int = 0 # total false positives this episode
81
  self._matched_gt_indices: Set[int] = set() # GT indices already matched
82
  self._episode_rewards: List[float] = [] # for VL return normalization
 
 
83
 
84
  def reset(
85
  self,
@@ -104,6 +110,8 @@ class CodeReviewEnvironment(_BaseEnv):
104
  self._fp_count = 0
105
  self._matched_gt_indices = set()
106
  self._episode_rewards = []
 
 
107
 
108
  self._state = ReviewState(
109
  task_id=task_id,
@@ -401,6 +409,11 @@ class CodeReviewEnvironment(_BaseEnv):
401
  fix_suggestion=action.fix_suggestion,
402
  )
403
 
 
 
 
 
 
404
  # Classify: TP, near-miss (with line distance), or FP
405
  is_tp = False
406
  is_near = False
@@ -460,16 +473,32 @@ class CodeReviewEnvironment(_BaseEnv):
460
  pbrs_bonus = round(phi_after - phi_before, 4)
461
  reward_breakdown["pbrs_shaping"] = pbrs_bonus
462
 
463
- reward = base_reward + severity_bonus + temporal_bonus + confidence_bonus + pbrs_bonus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  reward_breakdown["total"] = round(reward, 4)
465
 
466
  sev_note = f", severity +{severity_bonus:.2f}" if severity_bonus else ""
467
  temp_note = f", early +{temporal_bonus:.2f}" if temporal_bonus else ""
468
  conf_note = f", conf +{confidence_bonus:.2f}" if confidence_bonus else ""
469
  pbrs_note = f", progress +{pbrs_bonus:.2f}" if pbrs_bonus > 0 else ""
 
470
  feedback = (
471
  f"Correct! Issue at {action.filename}:{action.line_number} confirmed. "
472
- f"[+{reward:.2f}{sev_note}{temp_note}{conf_note}{pbrs_note}]"
473
  )
474
 
475
  elif is_near:
 
44
  # Flood protection: escalating FP penalty
45
  _FP_FLOOD_THRESHOLD = 3 # FPs before escalation kicks in
46
  _FP_FLOOD_MULTIPLIER = 1.5 # each extra FP beyond threshold costs 1.5x more
47
+ # Diversity bonus: reward for covering a new issue category
48
+ _DIVERSITY_BONUS = 0.02 # first TP in a new issue_type category
49
+ # Exploration bonus: first flag in a previously unflagged file
50
+ _FILE_EXPLORATION_BONUS = 0.01
51
 
52
  _SEV_RANK = {"low": 0, "medium": 1, "high": 2, "critical": 3}
53
 
 
84
  self._fp_count: int = 0 # total false positives this episode
85
  self._matched_gt_indices: Set[int] = set() # GT indices already matched
86
  self._episode_rewards: List[float] = [] # for VL return normalization
87
+ self._found_categories: Set[str] = set() # issue types already found (for diversity bonus)
88
+ self._flagged_files: Set[str] = set() # files already flagged (for exploration bonus)
89
 
90
  def reset(
91
  self,
 
110
  self._fp_count = 0
111
  self._matched_gt_indices = set()
112
  self._episode_rewards = []
113
+ self._found_categories = set()
114
+ self._flagged_files = set()
115
 
116
  self._state = ReviewState(
117
  task_id=task_id,
 
409
  fix_suggestion=action.fix_suggestion,
410
  )
411
 
412
+ # Track file exploration
413
+ is_new_file = action.filename not in self._flagged_files
414
+ if action.filename:
415
+ self._flagged_files.add(action.filename)
416
+
417
  # Classify: TP, near-miss (with line distance), or FP
418
  is_tp = False
419
  is_near = False
 
473
  pbrs_bonus = round(phi_after - phi_before, 4)
474
  reward_breakdown["pbrs_shaping"] = pbrs_bonus
475
 
476
+ # Diversity bonus: first TP in a new issue category
477
+ diversity_bonus = 0.0
478
+ gt_type = matched_gt_issue.issue_type
479
+ if gt_type not in self._found_categories:
480
+ self._found_categories.add(gt_type)
481
+ diversity_bonus = _DIVERSITY_BONUS
482
+ reward_breakdown["diversity_bonus"] = diversity_bonus
483
+
484
+ # Exploration bonus: first flag in a new file (multi-file tasks)
485
+ exploration_bonus = 0.0
486
+ if is_new_file and len(self._task.get("code_files", {})) > 1:
487
+ exploration_bonus = _FILE_EXPLORATION_BONUS
488
+ reward_breakdown["exploration_bonus"] = exploration_bonus
489
+
490
+ reward = (base_reward + severity_bonus + temporal_bonus +
491
+ confidence_bonus + pbrs_bonus + diversity_bonus + exploration_bonus)
492
  reward_breakdown["total"] = round(reward, 4)
493
 
494
  sev_note = f", severity +{severity_bonus:.2f}" if severity_bonus else ""
495
  temp_note = f", early +{temporal_bonus:.2f}" if temporal_bonus else ""
496
  conf_note = f", conf +{confidence_bonus:.2f}" if confidence_bonus else ""
497
  pbrs_note = f", progress +{pbrs_bonus:.2f}" if pbrs_bonus > 0 else ""
498
+ div_note = f", new-type +{diversity_bonus:.2f}" if diversity_bonus else ""
499
  feedback = (
500
  f"Correct! Issue at {action.filename}:{action.line_number} confirmed. "
501
+ f"[+{reward:.2f}{sev_note}{temp_note}{conf_note}{pbrs_note}{div_note}]"
502
  )
503
 
504
  elif is_near:
server/graders.py CHANGED
@@ -58,21 +58,23 @@ def match_quality(flagged: Issue, gt: Issue) -> str:
58
  """
59
  Return quality of match between flagged and gt:
60
 "exact" — within ±2 lines and right issue type
61
- "near" — within ±3-5 lines and same file (regardless of type)
62
 "none" — no meaningful match
63
  """
64
  if flagged.filename != gt.filename:
65
  return "none"
66
 
67
  line_diff = abs(flagged.line_number - gt.line_number)
 
68
 
69
  if line_diff <= EXACT_TOLERANCE:
70
- compat = _TYPE_COMPAT.get(gt.issue_type, {gt.issue_type})
71
  if flagged.issue_type in compat:
72
  return "exact"
73
 
74
  if line_diff <= NEAR_TOLERANCE:
75
- return "near"
 
 
76
 
77
  return "none"
78
 
 
58
  """
59
  Return quality of match between flagged and gt:
60
 "exact" — within ±2 lines and right issue type
61
+ "near" — within ±3-5 lines, same file, and compatible issue type
62
 "none" — no meaningful match
63
  """
64
  if flagged.filename != gt.filename:
65
  return "none"
66
 
67
  line_diff = abs(flagged.line_number - gt.line_number)
68
+ compat = _TYPE_COMPAT.get(gt.issue_type, {gt.issue_type})
69
 
70
  if line_diff <= EXACT_TOLERANCE:
 
71
  if flagged.issue_type in compat:
72
  return "exact"
73
 
74
  if line_diff <= NEAR_TOLERANCE:
75
+ # Near-miss requires compatible type to avoid rewarding wrong-type flags
76
+ if flagged.issue_type in compat:
77
+ return "near"
78
 
79
  return "none"
80
 
tests/test_environment.py CHANGED
@@ -838,3 +838,56 @@ class TestFunctionRanges:
838
  def test_function_ranges_nonempty_for_python(self, env):
839
  obs = env.reset(task_id="bug-detection")
840
  assert len(obs.code_metadata["function_ranges"]) > 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
  def test_function_ranges_nonempty_for_python(self, env):
839
  obs = env.reset(task_id="bug-detection")
840
  assert len(obs.code_metadata["function_ranges"]) > 0
841
+
842
+
843
+ # ---------------------------------------------------------------------------
844
+ # Diversity bonus
845
+ # ---------------------------------------------------------------------------
846
+
847
+ class TestDiversityBonus:
848
+ def test_first_tp_in_category_gets_diversity_bonus(self, env):
849
+ """First TP in a new issue category should include diversity_bonus."""
850
+ env.reset(task_id="security-audit")
851
+ obs = env.step(ReviewAction(
852
+ action_type="flag_issue", line_number=8, filename="app.py",
853
+ issue_type="security", severity="high", description="hardcoded secret"
854
+ ))
855
+ # First security TP → should have diversity bonus
856
+ assert obs.reward_breakdown.get("diversity_bonus", 0) > 0
857
+
858
+ def test_second_tp_same_category_no_diversity_bonus(self, env):
859
+ """Second TP in same category should NOT get diversity bonus."""
860
+ env.reset(task_id="security-audit")
861
+ env.step(ReviewAction(
862
+ action_type="flag_issue", line_number=8, filename="app.py",
863
+ issue_type="security", severity="high", description="hardcoded secret"
864
+ ))
865
+ obs2 = env.step(ReviewAction(
866
+ action_type="flag_issue", line_number=19, filename="app.py",
867
+ issue_type="security", severity="critical", description="sql injection"
868
+ ))
869
+ assert obs2.reward_breakdown.get("diversity_bonus", 0) == 0
870
+
871
+
872
+ # ---------------------------------------------------------------------------
873
+ # Exploration bonus (multi-file tasks)
874
+ # ---------------------------------------------------------------------------
875
+
876
+ class TestExplorationBonus:
877
+ def test_multifile_first_flag_gets_exploration_bonus(self, env):
878
+ """First flag in a new file of a multi-file task gets exploration bonus."""
879
+ env.reset(task_id="comprehensive-review")
880
+ obs = env.step(ReviewAction(
881
+ action_type="flag_issue", line_number=7, filename="models.py",
882
+ issue_type="security", severity="critical", description="plaintext password"
883
+ ))
884
+ assert obs.reward_breakdown.get("exploration_bonus", 0) > 0
885
+
886
+ def test_singlefile_no_exploration_bonus(self, env):
887
+ """Single-file tasks should not give exploration bonus."""
888
+ env.reset(task_id="bug-detection")
889
+ obs = env.step(ReviewAction(
890
+ action_type="flag_issue", line_number=6, filename="utils.py",
891
+ issue_type="bug", severity="high", description="off by one"
892
+ ))
893
+ assert obs.reward_breakdown.get("exploration_bonus", 0) == 0
tests/test_graders.py CHANGED
@@ -104,11 +104,18 @@ class TestMatchQuality:
104
  gt = _issue(6, "utils.py", "bug", "high")
105
  assert match_quality(f, gt) == "none"
106
 
107
- def test_near_ignores_type_difference(self):
108
- """Near match checks same file + line range, ignores type."""
109
  f = _issue(10, "utils.py", "performance", "high")
110
  gt = _issue(6, "utils.py", "bug", "high")
111
- # 4 lines away → near
 
 
 
 
 
 
 
112
  assert match_quality(f, gt) == "near"
113
 
114
  def test_near_tolerance_constant(self):
 
104
  gt = _issue(6, "utils.py", "bug", "high")
105
  assert match_quality(f, gt) == "none"
106
 
107
+ def test_near_requires_compatible_type(self):
108
+ """Near match requires compatible issue type (not just proximity)."""
109
  f = _issue(10, "utils.py", "performance", "high")
110
  gt = _issue(6, "utils.py", "bug", "high")
111
+ # 4 lines away but wrong type β†’ none
112
+ assert match_quality(f, gt) == "none"
113
+
114
+ def test_near_with_compatible_type(self):
115
+ """Near match works with compatible type (bug/logic)."""
116
+ f = _issue(10, "utils.py", "logic", "high")
117
+ gt = _issue(6, "utils.py", "bug", "high")
118
+ # 4 lines away, compatible type → near
119
  assert match_quality(f, gt) == "near"
120
 
121
  def test_near_tolerance_constant(self):