Spaces:

100XZX001
/

CodeReview-Professional-Workflow

Sleeping

App Files Community

100XZX001 committed on Apr 22

Commit

fd00203

verified ·

1 Parent(s): fcaf7ff

Update environment.py

Browse files

Files changed (1) hide show

environment.py +58 -58

environment.py CHANGED Viewed

@@ -287,100 +287,100 @@ class CodeReviewEnv:
     # ===================================================================
     def _compute_dense_reward(
-    self,
-    action: AnyAction,
-    base_reward: float,
-    action_type: str
-) -> float:
-    """
-    Stabilized dense reward:
-    - Decoupled terminal bonus
-    - Controlled base scaling
-    - Symmetric delta handling
-    - Reduced reward hacking surface
-    """
     # ============================================================
     # 0. BASE REWARD (controlled contribution)
     # ============================================================
-    reward = 0.4 * base_reward   # ↓ reduce dominance
     # ============================================================
     # 1. DELTA REWARDS (primary learning signal)
     # ============================================================
-    effective_delta_weight = self.delta_weight
-    if action_type == "propose_fix":
-        effective_delta_weight *= 0.4  # stronger cut to avoid overlap
-    test_delta = self._current_test_score - self._previous_test_score
-    lint_delta = self._current_lint_score - self._previous_lint_score
     # symmetric (no artificial dampening for negatives)
-    reward += effective_delta_weight * test_delta
-    reward += 0.5 * effective_delta_weight * lint_delta
     # ============================================================
     # 2. TERMINAL SUCCESS BONUS (clean & isolated)
     # ============================================================
-    if action_type == "propose_fix":
-        if self._current_test_score > 0.95:
-            reward += 0.4   # slightly reduced to prevent saturation
-        elif self._current_test_score > 0.85:
-            reward += 0.2   # smoother gradient instead of jump
     # ============================================================
     # 3. TOOL USAGE (early guidance only)
     # ============================================================
-    if action_type == "run_tests":
-        if not self._tests_run:
-            reward += self.tool_usage_bonus
-        reward += 0.015
-    elif action_type == "run_linter":
-        if not self._linter_run:
-            reward += self.tool_usage_bonus
-        reward += 0.015
-    elif action_type == "query_docs":
-        if not self._docs_queried:
-            reward += self.tool_usage_bonus * 0.5
-    elif action_type == "ask_question":
-        if self._step_count <= 3:
-            reward += 0.02   # tighter window
     # ============================================================
     # 4. EXPLORATION (less noisy)
     # ============================================================
-    if len(self._action_history) >= 3:
-        recent = self._action_history[-3:]
-        unique = len(set(recent))
-        if unique == 1:
-            reward -= 0.05
-        elif unique == 3:
-            reward += self.diversity_bonus * 0.7  # reduce randomness bias
     # ============================================================
     # 5. ANTI-HACKING
     # ============================================================
-    if action_type == "propose_fix":
-        if not self._tests_run:
-            reward -= 0.25   # stronger enforcement
-        if self._step_count < 2:
-            reward -= 0.1
-        if self._tests_run and self._linter_run:
-            reward += 0.02
     # ============================================================
     # 6. STEP PENALTY (progress pressure)
     # ============================================================
-    reward -= self.step_penalty
     # ============================================================
     # 7. CLIP (final safety)
     # ============================================================
-    return max(-1.0, min(1.0, reward))
     # ===================================================================

     # ===================================================================
     def _compute_dense_reward(
+        self,
+        action: AnyAction,
+        base_reward: float,
+        action_type: str
+    ) -> float:
+        """
+        Stabilized dense reward:
+        - Decoupled terminal bonus
+        - Controlled base scaling
+        - Symmetric delta handling
+        - Reduced reward hacking surface
+        """
     # ============================================================
     # 0. BASE REWARD (controlled contribution)
     # ============================================================
+        reward = 0.4 * base_reward   # ↓ reduce dominance
     # ============================================================
     # 1. DELTA REWARDS (primary learning signal)
     # ============================================================
+        effective_delta_weight = self.delta_weight
+        if action_type == "propose_fix":
+            effective_delta_weight *= 0.4  # stronger cut to avoid overlap
+        test_delta = self._current_test_score - self._previous_test_score
+        lint_delta = self._current_lint_score - self._previous_lint_score
     # symmetric (no artificial dampening for negatives)
+        reward += effective_delta_weight * test_delta
+        reward += 0.5 * effective_delta_weight * lint_delta
     # ============================================================
     # 2. TERMINAL SUCCESS BONUS (clean & isolated)
     # ============================================================
+        if action_type == "propose_fix":
+            if self._current_test_score > 0.95:
+                reward += 0.4   # slightly reduced to prevent saturation
+            elif self._current_test_score > 0.85:
+                reward += 0.2   # smoother gradient instead of jump
     # ============================================================
     # 3. TOOL USAGE (early guidance only)
     # ============================================================
+        if action_type == "run_tests":
+            if not self._tests_run:
+                reward += self.tool_usage_bonus
+            reward += 0.015
+        elif action_type == "run_linter":
+            if not self._linter_run:
+                reward += self.tool_usage_bonus
+            reward += 0.015
+        elif action_type == "query_docs":
+            if not self._docs_queried:
+                reward += self.tool_usage_bonus * 0.5
+        elif action_type == "ask_question":
+            if self._step_count <= 3:
+                reward += 0.02   # tighter window
     # ============================================================
     # 4. EXPLORATION (less noisy)
     # ============================================================
+        if len(self._action_history) >= 3:
+            recent = self._action_history[-3:]
+            unique = len(set(recent))
+            if unique == 1:
+                reward -= 0.05
+            elif unique == 3:
+                reward += self.diversity_bonus * 0.7  # reduce randomness bias
     # ============================================================
     # 5. ANTI-HACKING
     # ============================================================
+        if action_type == "propose_fix":
+            if not self._tests_run:
+                reward -= 0.25   # stronger enforcement
+            if self._step_count < 2:
+                reward -= 0.1
+            if self._tests_run and self._linter_run:
+                reward += 0.02
     # ============================================================
     # 6. STEP PENALTY (progress pressure)
     # ============================================================
+        reward -= self.step_penalty
     # ============================================================
     # 7. CLIP (final safety)
     # ============================================================
+        return max(-1.0, min(1.0, reward))
     # ===================================================================