100XZX001 committed on
Commit
380d64d
·
verified ·
1 Parent(s): 54bfec3

Update environment.py

Browse files
Files changed (1) hide show
  1. environment.py +56 -53
environment.py CHANGED
@@ -242,49 +242,44 @@ class CodeReviewEnv:
242
 
243
  # ===================================================================
244
  def _get_observation(self) -> EnhancedObservation:
245
- """
246
- Return COMPLETE Markov state.
247
- NOTHING is hidden - reward depends ONLY on (state, action).
248
- """
249
- return EnhancedObservation(
250
- code_snippet=self._current_code,
251
- last_tool_output=self._test_results or "",
252
-
253
- # Current metrics
254
- current_test_score=self._current_test_score,
255
- current_lint_score=self._current_lint_score,
256
- negotiation_score=self._author.get_negotiation_score(),
257
-
258
- # EXPOSED: Previous metrics (for delta understanding)
259
- previous_test_score=self._previous_test_score,
260
- previous_lint_score=self._previous_lint_score,
261
-
262
- # EXPOSED: Author internal state (affects gating)
263
- author_confidence=self._author._confidence,
264
- author_threshold=self._author.thresholds.get(self._author.personality, 0.5),
265
-
266
- # Progress
267
- step=self._step_count,
268
- max_steps=self.max_steps,
269
- progress_ratio=self._step_count / self.max_steps,
270
-
271
- # Tool usage
272
- tests_run=self._tests_run,
273
- linter_run=self._linter_run,
274
- docs_queried=self._docs_queried,
275
-
276
- # Action history
277
- last_action_type=self._last_action_type,
278
- action_history=self._action_history[-5:],
279
-
280
- # Terminal
281
- done=self._done,
282
-
283
- # Context
284
- bug_description=self._bug_description,
285
- comments_count=len(self._comments),
286
- )
287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  # ===================================================================
289
  def _compute_dense_reward(
290
  self,
@@ -519,35 +514,30 @@ class CodeReviewEnv:
519
  self._done = True
520
  else:
521
  self._current_code = action.fix_code
522
-
523
  runner = TestRunner(self._current_bug_id)
524
  test_score, test_output = runner.run_tests(self._current_code)
525
  lint_score = self._run_linter_score(self._current_code)
526
  negotiation_score = self._author.get_negotiation_score()
527
-
528
- # Update current scores
529
  self._current_test_score = test_score
530
  self._current_lint_score = lint_score
531
-
532
- # Component reward (scaled down to allow delta distribution)
533
  component_reward = (
534
  0.4 * test_score +
535
  0.15 * lint_score +
536
  0.15 * negotiation_score
537
  )
538
-
539
  efficiency = 1.0 - (self._step_count / self.max_steps)
540
  component_reward += 0.1 * efficiency
541
-
542
- # Cross-signal consistency
543
  if test_score > 0.8 and lint_score < 0.3:
544
  component_reward *= 0.85
545
  if test_score < 0.3 and lint_score > 0.8:
546
  component_reward *= 0.75
547
  if test_score > 0.8 and negotiation_score < 0.3:
548
  component_reward *= 0.8
549
-
550
- # Author gating
551
  threshold = self._author.thresholds.get(self._author.personality, 0.5)
552
  if self._author._confidence < threshold:
553
  component_reward = max(0.0, component_reward - 0.2)
@@ -557,7 +547,20 @@ class CodeReviewEnv:
557
  self._done = True
558
  else:
559
  self._done = True
560
-
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  base_reward = component_reward
562
  self._test_results = f"[Fix] Test: {test_score:.2f}, Lint: {lint_score:.2f}\n{test_output[:200]}"
563
 
 
242
 
243
  # ===================================================================
244
  def _get_observation(self) -> EnhancedObservation:
245
+ """Return COMPLETE Markov state."""
246
+ # Compute author response: only after comment/question/fix does the author actually speak
247
+ if self._last_action_type in ("write_comment", "ask_question", "propose_fix"):
248
+ author_response = self._test_results or ""
249
+ else:
250
+ author_response = ""
251
+
252
+ return EnhancedObservation(
253
+ code_snippet=self._current_code,
254
+ last_tool_output=self._test_results or "",
255
+ author_response=author_response, # ← fixed
256
+
257
+ current_test_score=self._current_test_score,
258
+ current_lint_score=self._current_lint_score,
259
+ negotiation_score=self._author.get_negotiation_score(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
+ previous_test_score=self._previous_test_score,
262
+ previous_lint_score=self._previous_lint_score,
263
+
264
+ author_confidence=self._author._confidence,
265
+ author_threshold=self._author.thresholds.get(self._author.personality, 0.5),
266
+
267
+ step=self._step_count,
268
+ max_steps=self.max_steps,
269
+ progress_ratio=self._step_count / self.max_steps,
270
+
271
+ tests_run=self._tests_run,
272
+ linter_run=self._linter_run,
273
+ docs_queried=self._docs_queried,
274
+
275
+ last_action_type=self._last_action_type,
276
+ action_history=self._action_history[-5:],
277
+
278
+ done=self._done,
279
+
280
+ bug_description=self._bug_description,
281
+ comments_count=len(self._comments),
282
+ )
283
  # ===================================================================
284
  def _compute_dense_reward(
285
  self,
 
514
  self._done = True
515
  else:
516
  self._current_code = action.fix_code
517
+
518
  runner = TestRunner(self._current_bug_id)
519
  test_score, test_output = runner.run_tests(self._current_code)
520
  lint_score = self._run_linter_score(self._current_code)
521
  negotiation_score = self._author.get_negotiation_score()
522
+
 
523
  self._current_test_score = test_score
524
  self._current_lint_score = lint_score
525
+
 
526
  component_reward = (
527
  0.4 * test_score +
528
  0.15 * lint_score +
529
  0.15 * negotiation_score
530
  )
 
531
  efficiency = 1.0 - (self._step_count / self.max_steps)
532
  component_reward += 0.1 * efficiency
533
+
 
534
  if test_score > 0.8 and lint_score < 0.3:
535
  component_reward *= 0.85
536
  if test_score < 0.3 and lint_score > 0.8:
537
  component_reward *= 0.75
538
  if test_score > 0.8 and negotiation_score < 0.3:
539
  component_reward *= 0.8
540
+
 
541
  threshold = self._author.thresholds.get(self._author.personality, 0.5)
542
  if self._author._confidence < threshold:
543
  component_reward = max(0.0, component_reward - 0.2)
 
547
  self._done = True
548
  else:
549
  self._done = True
550
+
551
+ # Get author's verbal feedback (pushback or acceptance)
552
+ author_feedback = self._author.respond(
553
+ agent_comment=f"Proposed fix:\n{action.fix_code}",
554
+ test_results=f"Score: {test_score:.2f}",
555
+ lint_results=f"Score: {lint_score:.2f}",
556
+ doc_results=self._doc_results,
557
+ proposed_fix=action.fix_code,
558
+ original_code=self._current_code # note: original should be the buggy code
559
+ )
560
+ # Keep the author's reply as the main output (so agent sees it)
561
+ self._test_results = f"[Fix] Author: {author_feedback[:200]}"
562
+ self._comments.append(f"Author: {author_feedback}")
563
+
564
  base_reward = component_reward
565
  self._test_results = f"[Fix] Test: {test_score:.2f}, Lint: {lint_score:.2f}\n{test_output[:200]}"
566