100XZX001 committed on
Commit
fd00203
·
verified ·
1 Parent(s): fcaf7ff

Update environment.py

Browse files
Files changed (1) hide show
  1. environment.py +58 -58
environment.py CHANGED
@@ -287,100 +287,100 @@ class CodeReviewEnv:
287
 
288
  # ===================================================================
289
  def _compute_dense_reward(
290
- self,
291
- action: AnyAction,
292
- base_reward: float,
293
- action_type: str
294
- ) -> float:
295
- """
296
- Stabilized dense reward:
297
- - Decoupled terminal bonus
298
- - Controlled base scaling
299
- - Symmetric delta handling
300
- - Reduced reward hacking surface
301
- """
302
 
303
  # ============================================================
304
  # 0. BASE REWARD (controlled contribution)
305
  # ============================================================
306
- reward = 0.4 * base_reward # ↓ reduce dominance
307
 
308
  # ============================================================
309
  # 1. DELTA REWARDS (primary learning signal)
310
  # ============================================================
311
- effective_delta_weight = self.delta_weight
312
- if action_type == "propose_fix":
313
- effective_delta_weight *= 0.4 # stronger cut to avoid overlap
314
 
315
- test_delta = self._current_test_score - self._previous_test_score
316
- lint_delta = self._current_lint_score - self._previous_lint_score
317
 
318
  # symmetric (no artificial dampening for negatives)
319
- reward += effective_delta_weight * test_delta
320
- reward += 0.5 * effective_delta_weight * lint_delta
321
 
322
  # ============================================================
323
  # 2. TERMINAL SUCCESS BONUS (clean & isolated)
324
  # ============================================================
325
- if action_type == "propose_fix":
326
- if self._current_test_score > 0.95:
327
- reward += 0.4 # slightly reduced to prevent saturation
328
- elif self._current_test_score > 0.85:
329
- reward += 0.2 # smoother gradient instead of jump
330
 
331
  # ============================================================
332
  # 3. TOOL USAGE (early guidance only)
333
  # ============================================================
334
- if action_type == "run_tests":
335
- if not self._tests_run:
336
- reward += self.tool_usage_bonus
337
- reward += 0.015
338
-
339
- elif action_type == "run_linter":
340
- if not self._linter_run:
341
- reward += self.tool_usage_bonus
342
- reward += 0.015
343
-
344
- elif action_type == "query_docs":
345
- if not self._docs_queried:
346
- reward += self.tool_usage_bonus * 0.5
347
-
348
- elif action_type == "ask_question":
349
- if self._step_count <= 3:
350
- reward += 0.02 # tighter window
351
 
352
  # ============================================================
353
  # 4. EXPLORATION (less noisy)
354
  # ============================================================
355
- if len(self._action_history) >= 3:
356
- recent = self._action_history[-3:]
357
- unique = len(set(recent))
358
 
359
- if unique == 1:
360
- reward -= 0.05
361
- elif unique == 3:
362
- reward += self.diversity_bonus * 0.7 # reduce randomness bias
363
 
364
  # ============================================================
365
  # 5. ANTI-HACKING
366
  # ============================================================
367
- if action_type == "propose_fix":
368
- if not self._tests_run:
369
- reward -= 0.25 # stronger enforcement
370
- if self._step_count < 2:
371
- reward -= 0.1
372
- if self._tests_run and self._linter_run:
373
- reward += 0.02
374
 
375
  # ============================================================
376
  # 6. STEP PENALTY (progress pressure)
377
  # ============================================================
378
- reward -= self.step_penalty
379
 
380
  # ============================================================
381
  # 7. CLIP (final safety)
382
  # ============================================================
383
- return max(-1.0, min(1.0, reward))
384
 
385
 
386
  # ===================================================================
 
287
 
288
  # ===================================================================
289
  def _compute_dense_reward(
290
+ self,
291
+ action: AnyAction,
292
+ base_reward: float,
293
+ action_type: str
294
+ ) -> float:
295
+ """
296
+ Stabilized dense reward:
297
+ - Decoupled terminal bonus
298
+ - Controlled base scaling
299
+ - Symmetric delta handling
300
+ - Reduced reward hacking surface
301
+ """
302
 
303
  # ============================================================
304
  # 0. BASE REWARD (controlled contribution)
305
  # ============================================================
306
+ reward = 0.4 * base_reward # ↓ reduce dominance
307
 
308
  # ============================================================
309
  # 1. DELTA REWARDS (primary learning signal)
310
  # ============================================================
311
+ effective_delta_weight = self.delta_weight
312
+ if action_type == "propose_fix":
313
+ effective_delta_weight *= 0.4 # stronger cut to avoid overlap
314
 
315
+ test_delta = self._current_test_score - self._previous_test_score
316
+ lint_delta = self._current_lint_score - self._previous_lint_score
317
 
318
  # symmetric (no artificial dampening for negatives)
319
+ reward += effective_delta_weight * test_delta
320
+ reward += 0.5 * effective_delta_weight * lint_delta
321
 
322
  # ============================================================
323
  # 2. TERMINAL SUCCESS BONUS (clean & isolated)
324
  # ============================================================
325
+ if action_type == "propose_fix":
326
+ if self._current_test_score > 0.95:
327
+ reward += 0.4 # slightly reduced to prevent saturation
328
+ elif self._current_test_score > 0.85:
329
+ reward += 0.2 # smoother gradient instead of jump
330
 
331
  # ============================================================
332
  # 3. TOOL USAGE (early guidance only)
333
  # ============================================================
334
+ if action_type == "run_tests":
335
+ if not self._tests_run:
336
+ reward += self.tool_usage_bonus
337
+ reward += 0.015
338
+
339
+ elif action_type == "run_linter":
340
+ if not self._linter_run:
341
+ reward += self.tool_usage_bonus
342
+ reward += 0.015
343
+
344
+ elif action_type == "query_docs":
345
+ if not self._docs_queried:
346
+ reward += self.tool_usage_bonus * 0.5
347
+
348
+ elif action_type == "ask_question":
349
+ if self._step_count <= 3:
350
+ reward += 0.02 # tighter window
351
 
352
  # ============================================================
353
  # 4. EXPLORATION (less noisy)
354
  # ============================================================
355
+ if len(self._action_history) >= 3:
356
+ recent = self._action_history[-3:]
357
+ unique = len(set(recent))
358
 
359
+ if unique == 1:
360
+ reward -= 0.05
361
+ elif unique == 3:
362
+ reward += self.diversity_bonus * 0.7 # reduce randomness bias
363
 
364
  # ============================================================
365
  # 5. ANTI-HACKING
366
  # ============================================================
367
+ if action_type == "propose_fix":
368
+ if not self._tests_run:
369
+ reward -= 0.25 # stronger enforcement
370
+ if self._step_count < 2:
371
+ reward -= 0.1
372
+ if self._tests_run and self._linter_run:
373
+ reward += 0.02
374
 
375
  # ============================================================
376
  # 6. STEP PENALTY (progress pressure)
377
  # ============================================================
378
+ reward -= self.step_penalty
379
 
380
  # ============================================================
381
  # 7. CLIP (final safety)
382
  # ============================================================
383
+ return max(-1.0, min(1.0, reward))
384
 
385
 
386
  # ===================================================================