Update environment.py
Browse files- environment.py +58 -58
environment.py
CHANGED
|
@@ -287,100 +287,100 @@ class CodeReviewEnv:
|
|
| 287 |
|
| 288 |
# ===================================================================
|
| 289 |
def _compute_dense_reward(
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
) -> float:
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
|
| 303 |
# ============================================================
|
| 304 |
# 0. BASE REWARD (controlled contribution)
|
| 305 |
# ============================================================
|
| 306 |
-
|
| 307 |
|
| 308 |
# ============================================================
|
| 309 |
# 1. DELTA REWARDS (primary learning signal)
|
| 310 |
# ============================================================
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
|
| 318 |
# symmetric (no artificial dampening for negatives)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
|
| 322 |
# ============================================================
|
| 323 |
# 2. TERMINAL SUCCESS BONUS (clean & isolated)
|
| 324 |
# ============================================================
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
|
| 331 |
# ============================================================
|
| 332 |
# 3. TOOL USAGE (early guidance only)
|
| 333 |
# ============================================================
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
|
| 352 |
# ============================================================
|
| 353 |
# 4. EXPLORATION (less noisy)
|
| 354 |
# ============================================================
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
|
| 364 |
# ============================================================
|
| 365 |
# 5. ANTI-HACKING
|
| 366 |
# ============================================================
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
|
| 375 |
# ============================================================
|
| 376 |
# 6. STEP PENALTY (progress pressure)
|
| 377 |
# ============================================================
|
| 378 |
-
|
| 379 |
|
| 380 |
# ============================================================
|
| 381 |
# 7. CLIP (final safety)
|
| 382 |
# ============================================================
|
| 383 |
-
|
| 384 |
|
| 385 |
|
| 386 |
# ===================================================================
|
|
|
|
| 287 |
|
| 288 |
# ===================================================================
|
| 289 |
def _compute_dense_reward(
    self,
    action: AnyAction,
    base_reward: float,
    action_type: str
) -> float:
    """
    Compute the shaped (dense) reward for a single environment step.

    The reward is the sum of the following terms, clipped to [-1.0, 1.0]:

    0. 0.4 * base_reward.
    1. Test and lint score deltas (current minus previous), weighted by
       ``self.delta_weight``; the lint delta counts half. For
       "propose_fix" the weight is further multiplied by 0.4.
    2. A terminal bonus for "propose_fix" when the current test score
       exceeds 0.95 (+0.4) or 0.85 (+0.2).
    3. Tool-usage bonuses for "run_tests", "run_linter", "query_docs"
       and "ask_question".
    4. An exploration term based on the last three entries of
       ``self._action_history``.
    5. Anti-hacking adjustments for "propose_fix".
    6. A constant ``self.step_penalty``.

    Args:
        action: The action taken this step. Not read by this method.
        base_reward: Reward from the underlying grader, before shaping.
        action_type: Name of the action; compared against the string
            literals listed above.

    Returns:
        The shaped reward in [-1.0, 1.0], or 0.0 if the accumulated
        reward is NaN.

    NOTE(review): the nesting of a few statements below was reconstructed
    from a flattened diff view; the affected spots are marked.
    """
    import math

    is_fix = action_type == "propose_fix"

    # ============================================================
    # 0. BASE REWARD (controlled contribution)
    # ============================================================
    # Scaled down so the base reward does not dominate the shaping terms.
    reward = 0.4 * base_reward

    # ============================================================
    # 1. DELTA REWARDS (primary learning signal)
    # ============================================================
    effective_delta_weight = self.delta_weight
    if is_fix:
        # Cut the delta weight so it overlaps less with the terminal bonus.
        effective_delta_weight *= 0.4

    test_delta = self._current_test_score - self._previous_test_score
    lint_delta = self._current_lint_score - self._previous_lint_score

    # Symmetric: regressions are penalized with the same weight as gains.
    reward += effective_delta_weight * test_delta
    reward += 0.5 * effective_delta_weight * lint_delta

    # ============================================================
    # 2. TERMINAL SUCCESS BONUS (clean & isolated)
    # ============================================================
    if is_fix:
        if self._current_test_score > 0.95:
            reward += 0.4
        elif self._current_test_score > 0.85:
            # Intermediate tier so the bonus is not a single jump.
            reward += 0.2

    # ============================================================
    # 3. TOOL USAGE (early guidance only)
    # ============================================================
    # NOTE(review): the flat +0.015 is placed outside the first-use check
    # (paid on every run_tests / run_linter call) -- confirm against the
    # original file, since indentation was lost.
    if action_type == "run_tests":
        if not self._tests_run:
            reward += self.tool_usage_bonus
        reward += 0.015
    elif action_type == "run_linter":
        if not self._linter_run:
            reward += self.tool_usage_bonus
        reward += 0.015
    elif action_type == "query_docs":
        if not self._docs_queried:
            reward += self.tool_usage_bonus * 0.5
    elif action_type == "ask_question":
        # Questions are only rewarded within the first three steps.
        if self._step_count <= 3:
            reward += 0.02

    # ============================================================
    # 4. EXPLORATION (less noisy)
    # ============================================================
    if len(self._action_history) >= 3:
        recent = self._action_history[-3:]
        unique = len(set(recent))

        if unique == 1:
            # The same entry three times in a row.
            reward -= 0.05
        elif unique == 3:
            # Three distinct entries.
            reward += self.diversity_bonus * 0.7

    # ============================================================
    # 5. ANTI-HACKING
    # ============================================================
    # NOTE(review): the three checks are treated as siblings under
    # "propose_fix"; the last one would be unreachable if nested under
    # `not self._tests_run`. Confirm the early-step check's nesting.
    if is_fix:
        if not self._tests_run:
            # Proposing a fix without ever running the tests.
            reward -= 0.25
        if self._step_count < 2:
            reward -= 0.1
        if self._tests_run and self._linter_run:
            reward += 0.02

    # ============================================================
    # 6. STEP PENALTY (progress pressure)
    # ============================================================
    reward -= self.step_penalty

    # ============================================================
    # 7. CLIP (final safety)
    # ============================================================
    # Every comparison with NaN is False, so min(1.0, nan) returns 1.0 and
    # the clip would silently turn an undefined reward into the maximum
    # one. Treat NaN as "no signal" instead.
    if math.isnan(reward):
        return 0.0
    return max(-1.0, min(1.0, reward))
|
| 384 |
|
| 385 |
|
| 386 |
# ===================================================================
|