Update environment.py
Browse files- environment.py +105 -100
environment.py
CHANGED
|
@@ -287,95 +287,100 @@ class CodeReviewEnv:
|
|
| 287 |
|
| 288 |
# ===================================================================
|
| 289 |
def _compute_dense_reward(
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
reward += 0.02
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
# 3. EXPLORATION INCENTIVES
|
| 348 |
-
# ============================================================
|
| 349 |
-
if len(self._action_history) >= 3:
|
| 350 |
-
recent_actions = self._action_history[-3:]
|
| 351 |
-
action_counts = Counter(recent_actions)
|
| 352 |
-
most_common_count = action_counts.most_common(1)[0][1]
|
| 353 |
-
|
| 354 |
-
if most_common_count >= 3:
|
| 355 |
-
reward -= 0.05 # Repetition penalty
|
| 356 |
-
elif len(set(recent_actions)) == 3:
|
| 357 |
-
reward += self.diversity_bonus # Diversity bonus
|
| 358 |
-
|
| 359 |
-
# ============================================================
|
| 360 |
-
# 4. ANTI-HACKING PENALTIES
|
| 361 |
-
# ============================================================
|
| 362 |
-
if action_type == "propose_fix":
|
| 363 |
-
if not self._tests_run:
|
| 364 |
-
reward -= 0.2
|
| 365 |
-
if self._step_count < 2:
|
| 366 |
-
reward -= 0.15
|
| 367 |
-
if self._tests_run and self._linter_run:
|
| 368 |
-
reward += 0.1
|
| 369 |
-
|
| 370 |
-
# ============================================================
|
| 371 |
-
# 5. STEP PENALTY
|
| 372 |
-
# ============================================================
|
| 373 |
-
reward -= self.step_penalty
|
| 374 |
-
|
| 375 |
-
# ============================================================
|
| 376 |
-
# 6. NORMALIZE TO [-1, 1]
|
| 377 |
-
# ============================================================
|
| 378 |
-
reward = max(-1.0, min(1.0, reward))
|
| 379 |
|
| 380 |
return reward
|
| 381 |
|
|
@@ -436,11 +441,11 @@ class CodeReviewEnv:
|
|
| 436 |
success, stdout, stderr = execute_code(self._current_code)
|
| 437 |
output = (stdout + stderr).strip() or "No output"
|
| 438 |
self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
|
| 439 |
-
base_reward = 0.
|
| 440 |
|
| 441 |
elif isinstance(action, Inspect):
|
| 442 |
self._test_results = f"[Inspect]\n{self._current_code[:500]}"
|
| 443 |
-
base_reward = 0.
|
| 444 |
|
| 445 |
elif isinstance(action, RunLinter):
|
| 446 |
lint_output = ToolBox.run_linter(self._current_code)
|
|
@@ -449,7 +454,7 @@ class CodeReviewEnv:
|
|
| 449 |
|
| 450 |
self._current_lint_score = self._run_linter_score(self._current_code)
|
| 451 |
self._linter_run = True
|
| 452 |
-
base_reward = 0.
|
| 453 |
|
| 454 |
elif isinstance(action, RunTests):
|
| 455 |
runner = TestRunner(self._current_bug_id)
|
|
@@ -459,17 +464,17 @@ class CodeReviewEnv:
|
|
| 459 |
self._tests_run = True
|
| 460 |
|
| 461 |
self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
|
| 462 |
-
base_reward = 0.
|
| 463 |
|
| 464 |
if score > 0.8:
|
| 465 |
-
base_reward += 0.
|
| 466 |
|
| 467 |
elif isinstance(action, QueryDocs):
|
| 468 |
doc = ToolBox.query_docs(action.query_topic)
|
| 469 |
self._doc_results = doc
|
| 470 |
self._test_results = f"[Docs]\n{doc[:400]}"
|
| 471 |
self._docs_queried = True
|
| 472 |
-
base_reward = 0.
|
| 473 |
|
| 474 |
# ==============================================================
|
| 475 |
# COMMUNICATION ACTIONS
|
|
@@ -488,7 +493,7 @@ class CodeReviewEnv:
|
|
| 488 |
|
| 489 |
self._comments.append(f"Author: {response}")
|
| 490 |
self._test_results = f"[Comment] Author: {response[:200]}"
|
| 491 |
-
base_reward = 0.
|
| 492 |
|
| 493 |
elif isinstance(action, AskQuestion):
|
| 494 |
self._comments.append(f"Agent: {action.question}")
|
|
@@ -504,14 +509,14 @@ class CodeReviewEnv:
|
|
| 504 |
|
| 505 |
self._comments.append(f"Author: {response}")
|
| 506 |
self._test_results = f"[Question] Author: {response[:200]}"
|
| 507 |
-
base_reward = 0.
|
| 508 |
|
| 509 |
# ==============================================================
|
| 510 |
# FINAL FIX ACTION
|
| 511 |
# ==============================================================
|
| 512 |
elif isinstance(action, ProposeFix):
|
| 513 |
if not action.fix_code:
|
| 514 |
-
base_reward = -0.
|
| 515 |
self._done = True
|
| 516 |
else:
|
| 517 |
self._current_code = action.fix_code
|
|
@@ -561,18 +566,18 @@ class CodeReviewEnv:
|
|
| 561 |
# TERMINATION ACTIONS
|
| 562 |
# ==============================================================
|
| 563 |
elif isinstance(action, Skip):
|
| 564 |
-
base_reward = -0.
|
| 565 |
self._done = True
|
| 566 |
|
| 567 |
elif isinstance(action, Done):
|
| 568 |
if self._tests_run:
|
| 569 |
base_reward = self._current_test_score * 0.5 - 0.2
|
| 570 |
else:
|
| 571 |
-
base_reward = -0.
|
| 572 |
self._done = True
|
| 573 |
|
| 574 |
else:
|
| 575 |
-
base_reward = -0.
|
| 576 |
self._done = True
|
| 577 |
|
| 578 |
# ==============================================================
|
|
|
|
| 287 |
|
| 288 |
# ===================================================================
|
| 289 |
def _compute_dense_reward(
|
| 290 |
+
self,
|
| 291 |
+
action: AnyAction,
|
| 292 |
+
base_reward: float,
|
| 293 |
+
action_type: str
|
| 294 |
+
) -> float:
|
| 295 |
+
"""
|
| 296 |
+
Stabilized dense reward:
|
| 297 |
+
- Decoupled terminal bonus
|
| 298 |
+
- Controlled base scaling
|
| 299 |
+
- Symmetric delta handling
|
| 300 |
+
- Reduced reward hacking surface
|
| 301 |
+
"""
|
| 302 |
+
|
| 303 |
+
# ============================================================
|
| 304 |
+
# 0. BASE REWARD (controlled contribution)
|
| 305 |
+
# ============================================================
|
| 306 |
+
reward = 0.4 * base_reward # ↓ reduce dominance
|
| 307 |
+
|
| 308 |
+
# ============================================================
|
| 309 |
+
# 1. DELTA REWARDS (primary learning signal)
|
| 310 |
+
# ============================================================
|
| 311 |
+
effective_delta_weight = self.delta_weight
|
| 312 |
+
if action_type == "propose_fix":
|
| 313 |
+
effective_delta_weight *= 0.4 # stronger cut to avoid overlap
|
| 314 |
+
|
| 315 |
+
test_delta = self._current_test_score - self._previous_test_score
|
| 316 |
+
lint_delta = self._current_lint_score - self._previous_lint_score
|
| 317 |
+
|
| 318 |
+
# symmetric (no artificial dampening for negatives)
|
| 319 |
+
reward += effective_delta_weight * test_delta
|
| 320 |
+
reward += 0.5 * effective_delta_weight * lint_delta
|
| 321 |
+
|
| 322 |
+
# ============================================================
|
| 323 |
+
# 2. TERMINAL SUCCESS BONUS (clean & isolated)
|
| 324 |
+
# ============================================================
|
| 325 |
+
if action_type == "propose_fix":
|
| 326 |
+
if self._current_test_score > 0.95:
|
| 327 |
+
reward += 0.4 # slightly reduced to prevent saturation
|
| 328 |
+
elif self._current_test_score > 0.85:
|
| 329 |
+
reward += 0.2 # smoother gradient instead of jump
|
| 330 |
+
|
| 331 |
+
# ============================================================
|
| 332 |
+
# 3. TOOL USAGE (early guidance only)
|
| 333 |
+
# ============================================================
|
| 334 |
+
if action_type == "run_tests":
|
| 335 |
+
if not self._tests_run:
|
| 336 |
+
reward += self.tool_usage_bonus
|
| 337 |
+
reward += 0.015
|
| 338 |
+
|
| 339 |
+
elif action_type == "run_linter":
|
| 340 |
+
if not self._linter_run:
|
| 341 |
+
reward += self.tool_usage_bonus
|
| 342 |
+
reward += 0.015
|
| 343 |
+
|
| 344 |
+
elif action_type == "query_docs":
|
| 345 |
+
if not self._docs_queried:
|
| 346 |
+
reward += self.tool_usage_bonus * 0.5
|
| 347 |
+
|
| 348 |
+
elif action_type == "ask_question":
|
| 349 |
+
if self._step_count <= 3:
|
| 350 |
+
reward += 0.02 # tighter window
|
| 351 |
+
|
| 352 |
+
# ============================================================
|
| 353 |
+
# 4. EXPLORATION (less noisy)
|
| 354 |
+
# ============================================================
|
| 355 |
+
if len(self._action_history) >= 3:
|
| 356 |
+
recent = self._action_history[-3:]
|
| 357 |
+
unique = len(set(recent))
|
| 358 |
+
|
| 359 |
+
if unique == 1:
|
| 360 |
+
reward -= 0.05
|
| 361 |
+
elif unique == 3:
|
| 362 |
+
reward += self.diversity_bonus * 0.7 # reduce randomness bias
|
| 363 |
+
|
| 364 |
+
# ============================================================
|
| 365 |
+
# 5. ANTI-HACKING
|
| 366 |
+
# ============================================================
|
| 367 |
+
if action_type == "propose_fix":
|
| 368 |
+
if not self._tests_run:
|
| 369 |
+
reward -= 0.25 # stronger enforcement
|
| 370 |
+
if self._step_count < 2:
|
| 371 |
+
reward -= 0.1
|
| 372 |
+
if self._tests_run and self._linter_run:
|
| 373 |
reward += 0.02
|
| 374 |
+
|
| 375 |
+
# ============================================================
|
| 376 |
+
# 6. STEP PENALTY (progress pressure)
|
| 377 |
+
# ============================================================
|
| 378 |
+
reward -= self.step_penalty
|
| 379 |
+
|
| 380 |
+
# ============================================================
|
| 381 |
+
# 7. CLIP (final safety)
|
| 382 |
+
# ============================================================
|
| 383 |
+
return max(-1.0, min(1.0, reward))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
|
| 385 |
return reward
|
| 386 |
|
|
|
|
| 441 |
success, stdout, stderr = execute_code(self._current_code)
|
| 442 |
output = (stdout + stderr).strip() or "No output"
|
| 443 |
self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
|
| 444 |
+
base_reward = 0.001 if success else -0.05
|
| 445 |
|
| 446 |
elif isinstance(action, Inspect):
|
| 447 |
self._test_results = f"[Inspect]\n{self._current_code[:500]}"
|
| 448 |
+
base_reward = 0.001
|
| 449 |
|
| 450 |
elif isinstance(action, RunLinter):
|
| 451 |
lint_output = ToolBox.run_linter(self._current_code)
|
|
|
|
| 454 |
|
| 455 |
self._current_lint_score = self._run_linter_score(self._current_code)
|
| 456 |
self._linter_run = True
|
| 457 |
+
base_reward = 0.002
|
| 458 |
|
| 459 |
elif isinstance(action, RunTests):
|
| 460 |
runner = TestRunner(self._current_bug_id)
|
|
|
|
| 464 |
self._tests_run = True
|
| 465 |
|
| 466 |
self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
|
| 467 |
+
base_reward = 0.002
|
| 468 |
|
| 469 |
if score > 0.8:
|
| 470 |
+
base_reward += 0.005
|
| 471 |
|
| 472 |
elif isinstance(action, QueryDocs):
|
| 473 |
doc = ToolBox.query_docs(action.query_topic)
|
| 474 |
self._doc_results = doc
|
| 475 |
self._test_results = f"[Docs]\n{doc[:400]}"
|
| 476 |
self._docs_queried = True
|
| 477 |
+
base_reward = 0.001
|
| 478 |
|
| 479 |
# ==============================================================
|
| 480 |
# COMMUNICATION ACTIONS
|
|
|
|
| 493 |
|
| 494 |
self._comments.append(f"Author: {response}")
|
| 495 |
self._test_results = f"[Comment] Author: {response[:200]}"
|
| 496 |
+
base_reward = 0.001
|
| 497 |
|
| 498 |
elif isinstance(action, AskQuestion):
|
| 499 |
self._comments.append(f"Agent: {action.question}")
|
|
|
|
| 509 |
|
| 510 |
self._comments.append(f"Author: {response}")
|
| 511 |
self._test_results = f"[Question] Author: {response[:200]}"
|
| 512 |
+
base_reward = 0.002
|
| 513 |
|
| 514 |
# ==============================================================
|
| 515 |
# FINAL FIX ACTION
|
| 516 |
# ==============================================================
|
| 517 |
elif isinstance(action, ProposeFix):
|
| 518 |
if not action.fix_code:
|
| 519 |
+
base_reward = -0.05
|
| 520 |
self._done = True
|
| 521 |
else:
|
| 522 |
self._current_code = action.fix_code
|
|
|
|
| 566 |
# TERMINATION ACTIONS
|
| 567 |
# ==============================================================
|
| 568 |
elif isinstance(action, Skip):
|
| 569 |
+
base_reward = -0.03
|
| 570 |
self._done = True
|
| 571 |
|
| 572 |
elif isinstance(action, Done):
|
| 573 |
if self._tests_run:
|
| 574 |
base_reward = self._current_test_score * 0.5 - 0.2
|
| 575 |
else:
|
| 576 |
+
base_reward = -0.04
|
| 577 |
self._done = True
|
| 578 |
|
| 579 |
else:
|
| 580 |
+
base_reward = -0.02
|
| 581 |
self._done = True
|
| 582 |
|
| 583 |
# ==============================================================
|