100XZX001 committed on
Commit
4b1fdf1
·
verified ·
1 Parent(s): 73f8ffa

Update environment.py

Browse files
Files changed (1) hide show
  1. environment.py +105 -100
environment.py CHANGED
@@ -287,95 +287,100 @@ class CodeReviewEnv:
287
 
288
  # ===================================================================
289
  def _compute_dense_reward(
290
- self,
291
- action: AnyAction,
292
- base_reward: float,
293
- action_type: str
294
- ) -> float:
295
- """
296
- Compute dense reward with:
297
- 1. Delta-based improvement rewards
298
- 2. Tool usage bonuses
299
- 3. Exploration incentives
300
- 4. Anti-hacking penalties
301
-
302
- FIXED: Reduced delta weight for ProposeFix to avoid double-counting
303
- """
304
- reward = base_reward
305
-
306
- # FIXED: Reduce delta impact for ProposeFix (already includes test_score in base)
307
- effective_delta_weight = self.delta_weight
308
- if action_type == "propose_fix":
309
- effective_delta_weight *= 0.5 # Prevent double-counting
310
-
311
- # ============================================================
312
- # 1. DELTA-BASED REWARDS (credit assignment)
313
- # ============================================================
314
- test_delta = self._current_test_score - self._previous_test_score
315
- lint_delta = self._current_lint_score - self._previous_lint_score
316
-
317
- if test_delta > 0:
318
- reward += effective_delta_weight * test_delta
319
- elif test_delta < 0:
320
- reward += effective_delta_weight * test_delta * 0.5
321
-
322
- if lint_delta > 0:
323
- reward += effective_delta_weight * 0.5 * lint_delta
324
-
325
- # ============================================================
326
- # 2. TOOL USAGE BONUSES
327
- # ============================================================
328
- if action_type == "run_tests":
329
- if not self._tests_run:
330
- reward += self.tool_usage_bonus
331
- reward += 0.02
332
-
333
- elif action_type == "run_linter":
334
- if not self._linter_run:
335
- reward += self.tool_usage_bonus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  reward += 0.02
337
-
338
- elif action_type == "query_docs":
339
- if not self._docs_queried:
340
- reward += self.tool_usage_bonus * 0.5
341
-
342
- elif action_type == "ask_question":
343
- if 1 <= self._step_count <= 5:
344
- reward += 0.03
345
-
346
- # ============================================================
347
- # 3. EXPLORATION INCENTIVES
348
- # ============================================================
349
- if len(self._action_history) >= 3:
350
- recent_actions = self._action_history[-3:]
351
- action_counts = Counter(recent_actions)
352
- most_common_count = action_counts.most_common(1)[0][1]
353
-
354
- if most_common_count >= 3:
355
- reward -= 0.05 # Repetition penalty
356
- elif len(set(recent_actions)) == 3:
357
- reward += self.diversity_bonus # Diversity bonus
358
-
359
- # ============================================================
360
- # 4. ANTI-HACKING PENALTIES
361
- # ============================================================
362
- if action_type == "propose_fix":
363
- if not self._tests_run:
364
- reward -= 0.2
365
- if self._step_count < 2:
366
- reward -= 0.15
367
- if self._tests_run and self._linter_run:
368
- reward += 0.1
369
-
370
- # ============================================================
371
- # 5. STEP PENALTY
372
- # ============================================================
373
- reward -= self.step_penalty
374
-
375
- # ============================================================
376
- # 6. NORMALIZE TO [-1, 1]
377
- # ============================================================
378
- reward = max(-1.0, min(1.0, reward))
379
 
380
  return reward
381
 
@@ -436,11 +441,11 @@ class CodeReviewEnv:
436
  success, stdout, stderr = execute_code(self._current_code)
437
  output = (stdout + stderr).strip() or "No output"
438
  self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
439
- base_reward = 0.01 if success else -0.05
440
 
441
  elif isinstance(action, Inspect):
442
  self._test_results = f"[Inspect]\n{self._current_code[:500]}"
443
- base_reward = 0.01
444
 
445
  elif isinstance(action, RunLinter):
446
  lint_output = ToolBox.run_linter(self._current_code)
@@ -449,7 +454,7 @@ class CodeReviewEnv:
449
 
450
  self._current_lint_score = self._run_linter_score(self._current_code)
451
  self._linter_run = True
452
- base_reward = 0.02
453
 
454
  elif isinstance(action, RunTests):
455
  runner = TestRunner(self._current_bug_id)
@@ -459,17 +464,17 @@ class CodeReviewEnv:
459
  self._tests_run = True
460
 
461
  self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
462
- base_reward = 0.02
463
 
464
  if score > 0.8:
465
- base_reward += 0.05
466
 
467
  elif isinstance(action, QueryDocs):
468
  doc = ToolBox.query_docs(action.query_topic)
469
  self._doc_results = doc
470
  self._test_results = f"[Docs]\n{doc[:400]}"
471
  self._docs_queried = True
472
- base_reward = 0.01
473
 
474
  # ==============================================================
475
  # COMMUNICATION ACTIONS
@@ -488,7 +493,7 @@ class CodeReviewEnv:
488
 
489
  self._comments.append(f"Author: {response}")
490
  self._test_results = f"[Comment] Author: {response[:200]}"
491
- base_reward = 0.01
492
 
493
  elif isinstance(action, AskQuestion):
494
  self._comments.append(f"Agent: {action.question}")
@@ -504,14 +509,14 @@ class CodeReviewEnv:
504
 
505
  self._comments.append(f"Author: {response}")
506
  self._test_results = f"[Question] Author: {response[:200]}"
507
- base_reward = 0.02
508
 
509
  # ==============================================================
510
  # FINAL FIX ACTION
511
  # ==============================================================
512
  elif isinstance(action, ProposeFix):
513
  if not action.fix_code:
514
- base_reward = -0.5
515
  self._done = True
516
  else:
517
  self._current_code = action.fix_code
@@ -561,18 +566,18 @@ class CodeReviewEnv:
561
  # TERMINATION ACTIONS
562
  # ==============================================================
563
  elif isinstance(action, Skip):
564
- base_reward = -0.3
565
  self._done = True
566
 
567
  elif isinstance(action, Done):
568
  if self._tests_run:
569
  base_reward = self._current_test_score * 0.5 - 0.2
570
  else:
571
- base_reward = -0.4
572
  self._done = True
573
 
574
  else:
575
- base_reward = -0.2
576
  self._done = True
577
 
578
  # ==============================================================
 
287
 
288
  # ===================================================================
289
  def _compute_dense_reward(
290
+ self,
291
+ action: AnyAction,
292
+ base_reward: float,
293
+ action_type: str
294
+ ) -> float:
295
+ """
296
+ Stabilized dense reward:
297
+ - Decoupled terminal bonus
298
+ - Controlled base scaling
299
+ - Symmetric delta handling
300
+ - Reduced reward hacking surface
301
+ """
302
+
303
+ # ============================================================
304
+ # 0. BASE REWARD (controlled contribution)
305
+ # ============================================================
306
+ reward = 0.4 * base_reward # reduce dominance
307
+
308
+ # ============================================================
309
+ # 1. DELTA REWARDS (primary learning signal)
310
+ # ============================================================
311
+ effective_delta_weight = self.delta_weight
312
+ if action_type == "propose_fix":
313
+ effective_delta_weight *= 0.4 # stronger cut to avoid overlap
314
+
315
+ test_delta = self._current_test_score - self._previous_test_score
316
+ lint_delta = self._current_lint_score - self._previous_lint_score
317
+
318
+ # symmetric (no artificial dampening for negatives)
319
+ reward += effective_delta_weight * test_delta
320
+ reward += 0.5 * effective_delta_weight * lint_delta
321
+
322
+ # ============================================================
323
+ # 2. TERMINAL SUCCESS BONUS (clean & isolated)
324
+ # ============================================================
325
+ if action_type == "propose_fix":
326
+ if self._current_test_score > 0.95:
327
+ reward += 0.4 # slightly reduced to prevent saturation
328
+ elif self._current_test_score > 0.85:
329
+ reward += 0.2 # smoother gradient instead of jump
330
+
331
+ # ============================================================
332
+ # 3. TOOL USAGE (early guidance only)
333
+ # ============================================================
334
+ if action_type == "run_tests":
335
+ if not self._tests_run:
336
+ reward += self.tool_usage_bonus
337
+ reward += 0.015
338
+
339
+ elif action_type == "run_linter":
340
+ if not self._linter_run:
341
+ reward += self.tool_usage_bonus
342
+ reward += 0.015
343
+
344
+ elif action_type == "query_docs":
345
+ if not self._docs_queried:
346
+ reward += self.tool_usage_bonus * 0.5
347
+
348
+ elif action_type == "ask_question":
349
+ if self._step_count <= 3:
350
+ reward += 0.02 # tighter window
351
+
352
+ # ============================================================
353
+ # 4. EXPLORATION (less noisy)
354
+ # ============================================================
355
+ if len(self._action_history) >= 3:
356
+ recent = self._action_history[-3:]
357
+ unique = len(set(recent))
358
+
359
+ if unique == 1:
360
+ reward -= 0.05
361
+ elif unique == 3:
362
+ reward += self.diversity_bonus * 0.7 # reduce randomness bias
363
+
364
+ # ============================================================
365
+ # 5. ANTI-HACKING
366
+ # ============================================================
367
+ if action_type == "propose_fix":
368
+ if not self._tests_run:
369
+ reward -= 0.25 # stronger enforcement
370
+ if self._step_count < 2:
371
+ reward -= 0.1
372
+ if self._tests_run and self._linter_run:
373
  reward += 0.02
374
+
375
+ # ============================================================
376
+ # 6. STEP PENALTY (progress pressure)
377
+ # ============================================================
378
+ reward -= self.step_penalty
379
+
380
+ # ============================================================
381
+ # 7. CLIP (final safety)
382
+ # ============================================================
383
+ return max(-1.0, min(1.0, reward))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  return reward
386
 
 
441
  success, stdout, stderr = execute_code(self._current_code)
442
  output = (stdout + stderr).strip() or "No output"
443
  self._test_results = f"[Execute] {'Success' if success else 'Failed'}\n{output[:300]}"
444
+ base_reward = 0.001 if success else -0.05
445
 
446
  elif isinstance(action, Inspect):
447
  self._test_results = f"[Inspect]\n{self._current_code[:500]}"
448
+ base_reward = 0.001
449
 
450
  elif isinstance(action, RunLinter):
451
  lint_output = ToolBox.run_linter(self._current_code)
 
454
 
455
  self._current_lint_score = self._run_linter_score(self._current_code)
456
  self._linter_run = True
457
+ base_reward = 0.002
458
 
459
  elif isinstance(action, RunTests):
460
  runner = TestRunner(self._current_bug_id)
 
464
  self._tests_run = True
465
 
466
  self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
467
+ base_reward = 0.002
468
 
469
  if score > 0.8:
470
+ base_reward += 0.005
471
 
472
  elif isinstance(action, QueryDocs):
473
  doc = ToolBox.query_docs(action.query_topic)
474
  self._doc_results = doc
475
  self._test_results = f"[Docs]\n{doc[:400]}"
476
  self._docs_queried = True
477
+ base_reward = 0.001
478
 
479
  # ==============================================================
480
  # COMMUNICATION ACTIONS
 
493
 
494
  self._comments.append(f"Author: {response}")
495
  self._test_results = f"[Comment] Author: {response[:200]}"
496
+ base_reward = 0.001
497
 
498
  elif isinstance(action, AskQuestion):
499
  self._comments.append(f"Agent: {action.question}")
 
509
 
510
  self._comments.append(f"Author: {response}")
511
  self._test_results = f"[Question] Author: {response[:200]}"
512
+ base_reward = 0.002
513
 
514
  # ==============================================================
515
  # FINAL FIX ACTION
516
  # ==============================================================
517
  elif isinstance(action, ProposeFix):
518
  if not action.fix_code:
519
+ base_reward = -0.05
520
  self._done = True
521
  else:
522
  self._current_code = action.fix_code
 
566
  # TERMINATION ACTIONS
567
  # ==============================================================
568
  elif isinstance(action, Skip):
569
+ base_reward = -0.03
570
  self._done = True
571
 
572
  elif isinstance(action, Done):
573
  if self._tests_run:
574
  base_reward = self._current_test_score * 0.5 - 0.2
575
  else:
576
+ base_reward = -0.04
577
  self._done = True
578
 
579
  else:
580
+ base_reward = -0.02
581
  self._done = True
582
 
583
  # ==============================================================