Spaces:

100XZX001
/

CodeReview-Professional-Workflow

Sleeping

App Files Community

100XZX001 committed on Apr 22

Commit

4b1fdf1

verified ·

1 Parent(s): 73f8ffa

Update environment.py

Browse files

Files changed (1) hide show

environment.py +105 -100

environment.py CHANGED Viewed

@@ -287,95 +287,100 @@ class CodeReviewEnv:
     # ===================================================================
     def _compute_dense_reward(
-        self,
-        action: AnyAction,
-        base_reward: float,
-        action_type: str
-    ) -> float:
-        """
-        Compute dense reward with:
-        1. Delta-based improvement rewards
-        2. Tool usage bonuses
-        3. Exploration incentives
-        4. Anti-hacking penalties
-        FIXED: Reduced delta weight for ProposeFix to avoid double-counting
-        """
-        reward = base_reward
-        # FIXED: Reduce delta impact for ProposeFix (already includes test_score in base)
-        effective_delta_weight = self.delta_weight
-        if action_type == "propose_fix":
-            effective_delta_weight *= 0.5  # Prevent double-counting
-        # ============================================================
-        # 1. DELTA-BASED REWARDS (credit assignment)
-        # ============================================================
-        test_delta = self._current_test_score - self._previous_test_score
-        lint_delta = self._current_lint_score - self._previous_lint_score
-        if test_delta > 0:
-            reward += effective_delta_weight * test_delta
-        elif test_delta < 0:
-            reward += effective_delta_weight * test_delta * 0.5
-        if lint_delta > 0:
-            reward += effective_delta_weight * 0.5 * lint_delta
-        # ============================================================
-        # 2. TOOL USAGE BONUSES
-        # ============================================================
-        if action_type == "run_tests":
-            if not self._tests_run:
-                reward += self.tool_usage_bonus
-            reward += 0.02
-        elif action_type == "run_linter":
-            if not self._linter_run:
-                reward += self.tool_usage_bonus
             reward += 0.02
-        elif action_type == "query_docs":
-            if not self._docs_queried:
-                reward += self.tool_usage_bonus * 0.5
-        elif action_type == "ask_question":
-            if 1 <= self._step_count <= 5:
-                reward += 0.03
-        # ============================================================
-        # 3. EXPLORATION INCENTIVES
-        # ============================================================
-        if len(self._action_history) >= 3:
-            recent_actions = self._action_history[-3:]
-            action_counts = Counter(recent_actions)
-            most_common_count = action_counts.most_common(1)[0][1]
-            if most_common_count >= 3:
-                reward -= 0.05  # Repetition penalty
-            elif len(set(recent_actions)) == 3:
-                reward += self.diversity_bonus  # Diversity bonus
-        # ============================================================
-        # 4. ANTI-HACKING PENALTIES
-        # ============================================================
-        if action_type == "propose_fix":
-            if not self._tests_run:
-                reward -= 0.2
-            if self._step_count < 2:
-                reward -= 0.15
-            if self._tests_run and self._linter_run:
-                reward += 0.1
-        # ============================================================
-        # 5. STEP PENALTY
-        # ============================================================
-        reward -= self.step_penalty
-        # ============================================================
-        # 6. NORMALIZE TO [-1, 1]
-        # ============================================================
-        reward = max(-1.0, min(1.0, reward))
         return reward
@@ -436,11 +441,11 @@ class CodeReviewEnv:
             success, stdout, stderr = execute_code(self._current_code)
             output = (stdout + stderr).strip() or "No output"
             self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
-            base_reward = 0.01 if success else -0.05
         elif isinstance(action, Inspect):
             self._test_results = f"[Inspect]\n{self._current_code[:500]}"
-            base_reward = 0.01
         elif isinstance(action, RunLinter):
             lint_output = ToolBox.run_linter(self._current_code)
@@ -449,7 +454,7 @@ class CodeReviewEnv:
             self._current_lint_score = self._run_linter_score(self._current_code)
             self._linter_run = True
-            base_reward = 0.02
         elif isinstance(action, RunTests):
             runner = TestRunner(self._current_bug_id)
@@ -459,17 +464,17 @@ class CodeReviewEnv:
             self._tests_run = True
             self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
-            base_reward = 0.02
             if score > 0.8:
-                base_reward += 0.05
         elif isinstance(action, QueryDocs):
             doc = ToolBox.query_docs(action.query_topic)
             self._doc_results = doc
             self._test_results = f"[Docs]\n{doc[:400]}"
             self._docs_queried = True
-            base_reward = 0.01
         # ==============================================================
         # COMMUNICATION ACTIONS
@@ -488,7 +493,7 @@ class CodeReviewEnv:
             self._comments.append(f"Author: {response}")
             self._test_results = f"[Comment] Author: {response[:200]}"
-            base_reward = 0.01
         elif isinstance(action, AskQuestion):
             self._comments.append(f"Agent: {action.question}")
@@ -504,14 +509,14 @@ class CodeReviewEnv:
             self._comments.append(f"Author: {response}")
             self._test_results = f"[Question] Author: {response[:200]}"
-            base_reward = 0.02
         # ==============================================================
         # FINAL FIX ACTION
         # ==============================================================
         elif isinstance(action, ProposeFix):
             if not action.fix_code:
-                base_reward = -0.5
                 self._done = True
             else:
                 self._current_code = action.fix_code
@@ -561,18 +566,18 @@ class CodeReviewEnv:
         # TERMINATION ACTIONS
         # ==============================================================
         elif isinstance(action, Skip):
-            base_reward = -0.3
             self._done = True
         elif isinstance(action, Done):
             if self._tests_run:
                 base_reward = self._current_test_score * 0.5 - 0.2
             else:
-                base_reward = -0.4
             self._done = True
         else:
-            base_reward = -0.2
             self._done = True
         # ==============================================================

     # ===================================================================
     def _compute_dense_reward(
+    self,
+    action: AnyAction,
+    base_reward: float,
+    action_type: str
+) -> float:
+    """
+    Stabilized dense reward:
+    - Decoupled terminal bonus
+    - Controlled base scaling
+    - Symmetric delta handling
+    - Reduced reward hacking surface
+    """
+    # ============================================================
+    # 0. BASE REWARD (controlled contribution)
+    # ============================================================
+    reward = 0.4 * base_reward   # ↓ reduce dominance
+    # ============================================================
+    # 1. DELTA REWARDS (primary learning signal)
+    # ============================================================
+    effective_delta_weight = self.delta_weight
+    if action_type == "propose_fix":
+        effective_delta_weight *= 0.4  # stronger cut to avoid overlap
+    test_delta = self._current_test_score - self._previous_test_score
+    lint_delta = self._current_lint_score - self._previous_lint_score
+    # symmetric (no artificial dampening for negatives)
+    reward += effective_delta_weight * test_delta
+    reward += 0.5 * effective_delta_weight * lint_delta
+    # ============================================================
+    # 2. TERMINAL SUCCESS BONUS (clean & isolated)
+    # ============================================================
+    if action_type == "propose_fix":
+        if self._current_test_score > 0.95:
+            reward += 0.4   # slightly reduced to prevent saturation
+        elif self._current_test_score > 0.85:
+            reward += 0.2   # smoother gradient instead of jump
+    # ============================================================
+    # 3. TOOL USAGE (early guidance only)
+    # ============================================================
+    if action_type == "run_tests":
+        if not self._tests_run:
+            reward += self.tool_usage_bonus
+        reward += 0.015
+    elif action_type == "run_linter":
+        if not self._linter_run:
+            reward += self.tool_usage_bonus
+        reward += 0.015
+    elif action_type == "query_docs":
+        if not self._docs_queried:
+            reward += self.tool_usage_bonus * 0.5
+    elif action_type == "ask_question":
+        if self._step_count <= 3:
+            reward += 0.02   # tighter window
+    # ============================================================
+    # 4. EXPLORATION (less noisy)
+    # ============================================================
+    if len(self._action_history) >= 3:
+        recent = self._action_history[-3:]
+        unique = len(set(recent))
+        if unique == 1:
+            reward -= 0.05
+        elif unique == 3:
+            reward += self.diversity_bonus * 0.7  # reduce randomness bias
+    # ============================================================
+    # 5. ANTI-HACKING
+    # ============================================================
+    if action_type == "propose_fix":
+        if not self._tests_run:
+            reward -= 0.25   # stronger enforcement
+        if self._step_count < 2:
+            reward -= 0.1
+        if self._tests_run and self._linter_run:
             reward += 0.02
+    # ============================================================
+    # 6. STEP PENALTY (progress pressure)
+    # ============================================================
+    reward -= self.step_penalty
+    # ============================================================
+    # 7. CLIP (final safety)
+    # ============================================================
+    return max(-1.0, min(1.0, reward))
         return reward
             success, stdout, stderr = execute_code(self._current_code)
             output = (stdout + stderr).strip() or "No output"
             self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
+            base_reward = 0.001 if success else -0.05
         elif isinstance(action, Inspect):
             self._test_results = f"[Inspect]\n{self._current_code[:500]}"
+            base_reward = 0.001
         elif isinstance(action, RunLinter):
             lint_output = ToolBox.run_linter(self._current_code)
             self._current_lint_score = self._run_linter_score(self._current_code)
             self._linter_run = True
+            base_reward = 0.002
         elif isinstance(action, RunTests):
             runner = TestRunner(self._current_bug_id)
             self._tests_run = True
             self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
+            base_reward = 0.002
             if score > 0.8:
+                base_reward += 0.005
         elif isinstance(action, QueryDocs):
             doc = ToolBox.query_docs(action.query_topic)
             self._doc_results = doc
             self._test_results = f"[Docs]\n{doc[:400]}"
             self._docs_queried = True
+            base_reward = 0.001
         # ==============================================================
         # COMMUNICATION ACTIONS
             self._comments.append(f"Author: {response}")
             self._test_results = f"[Comment] Author: {response[:200]}"
+            base_reward = 0.001
         elif isinstance(action, AskQuestion):
             self._comments.append(f"Agent: {action.question}")
             self._comments.append(f"Author: {response}")
             self._test_results = f"[Question] Author: {response[:200]}"
+            base_reward = 0.002
         # ==============================================================
         # FINAL FIX ACTION
         # ==============================================================
         elif isinstance(action, ProposeFix):
             if not action.fix_code:
+                base_reward = -0.05
                 self._done = True
             else:
                 self._current_code = action.fix_code
         # TERMINATION ACTIONS
         # ==============================================================
         elif isinstance(action, Skip):
+            base_reward = -0.03
             self._done = True
         elif isinstance(action, Done):
             if self._tests_run:
                 base_reward = self._current_test_score * 0.5 - 0.2
             else:
+                base_reward = -0.04
             self._done = True
         else:
+            base_reward = -0.02
             self._done = True
         # ==============================================================