Update environment.py
Browse files- environment.py +56 -53
environment.py
CHANGED
|
@@ -242,49 +242,44 @@ class CodeReviewEnv:
|
|
| 242 |
|
| 243 |
# ===================================================================
|
| 244 |
def _get_observation(self) -> EnhancedObservation:
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
""
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
previous_lint_score=self._previous_lint_score,
|
| 261 |
-
|
| 262 |
-
# EXPOSED: Author internal state (affects gating)
|
| 263 |
-
author_confidence=self._author._confidence,
|
| 264 |
-
author_threshold=self._author.thresholds.get(self._author.personality, 0.5),
|
| 265 |
-
|
| 266 |
-
# Progress
|
| 267 |
-
step=self._step_count,
|
| 268 |
-
max_steps=self.max_steps,
|
| 269 |
-
progress_ratio=self._step_count / self.max_steps,
|
| 270 |
-
|
| 271 |
-
# Tool usage
|
| 272 |
-
tests_run=self._tests_run,
|
| 273 |
-
linter_run=self._linter_run,
|
| 274 |
-
docs_queried=self._docs_queried,
|
| 275 |
-
|
| 276 |
-
# Action history
|
| 277 |
-
last_action_type=self._last_action_type,
|
| 278 |
-
action_history=self._action_history[-5:],
|
| 279 |
-
|
| 280 |
-
# Terminal
|
| 281 |
-
done=self._done,
|
| 282 |
-
|
| 283 |
-
# Context
|
| 284 |
-
bug_description=self._bug_description,
|
| 285 |
-
comments_count=len(self._comments),
|
| 286 |
-
)
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
# ===================================================================
|
| 289 |
def _compute_dense_reward(
|
| 290 |
self,
|
|
@@ -519,35 +514,30 @@ class CodeReviewEnv:
|
|
| 519 |
self._done = True
|
| 520 |
else:
|
| 521 |
self._current_code = action.fix_code
|
| 522 |
-
|
| 523 |
runner = TestRunner(self._current_bug_id)
|
| 524 |
test_score, test_output = runner.run_tests(self._current_code)
|
| 525 |
lint_score = self._run_linter_score(self._current_code)
|
| 526 |
negotiation_score = self._author.get_negotiation_score()
|
| 527 |
-
|
| 528 |
-
# Update current scores
|
| 529 |
self._current_test_score = test_score
|
| 530 |
self._current_lint_score = lint_score
|
| 531 |
-
|
| 532 |
-
# Component reward (scaled down to allow delta distribution)
|
| 533 |
component_reward = (
|
| 534 |
0.4 * test_score +
|
| 535 |
0.15 * lint_score +
|
| 536 |
0.15 * negotiation_score
|
| 537 |
)
|
| 538 |
-
|
| 539 |
efficiency = 1.0 - (self._step_count / self.max_steps)
|
| 540 |
component_reward += 0.1 * efficiency
|
| 541 |
-
|
| 542 |
-
# Cross-signal consistency
|
| 543 |
if test_score > 0.8 and lint_score < 0.3:
|
| 544 |
component_reward *= 0.85
|
| 545 |
if test_score < 0.3 and lint_score > 0.8:
|
| 546 |
component_reward *= 0.75
|
| 547 |
if test_score > 0.8 and negotiation_score < 0.3:
|
| 548 |
component_reward *= 0.8
|
| 549 |
-
|
| 550 |
-
# Author gating
|
| 551 |
threshold = self._author.thresholds.get(self._author.personality, 0.5)
|
| 552 |
if self._author._confidence < threshold:
|
| 553 |
component_reward = max(0.0, component_reward - 0.2)
|
|
@@ -557,7 +547,20 @@ class CodeReviewEnv:
|
|
| 557 |
self._done = True
|
| 558 |
else:
|
| 559 |
self._done = True
|
| 560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
base_reward = component_reward
|
| 562 |
self._test_results = f"[Fix] Test: {test_score:.2f}, Lint: {lint_score:.2f}\n{test_output[:200]}"
|
| 563 |
|
|
|
|
| 242 |
|
| 243 |
# ===================================================================
|
| 244 |
def _get_observation(self) -> EnhancedObservation:
    """Build the observation describing the environment's current state.

    ``author_response`` mirrors the latest tool output (``self._test_results``)
    only when the most recent action was ``write_comment``, ``ask_question``
    or ``propose_fix``; after any other action it is an empty string.

    Returns:
        An ``EnhancedObservation`` filled from the environment's current
        fields (code, scores, author state, progress, tool flags, history).
    """
    author = self._author
    # A missing/empty result is always reported as "".
    tool_output = self._test_results or ""

    # Only these action types cause the latest output to be reported as
    # the author's response.
    speaking_actions = ("write_comment", "ask_question", "propose_fix")
    author_response = (
        tool_output if self._last_action_type in speaking_actions else ""
    )

    return EnhancedObservation(
        # Code under review and the latest textual feedback.
        code_snippet=self._current_code,
        last_tool_output=tool_output,
        author_response=author_response,
        # Current quality signals.
        current_test_score=self._current_test_score,
        current_lint_score=self._current_lint_score,
        negotiation_score=author.get_negotiation_score(),
        # Scores recorded before the latest update.
        previous_test_score=self._previous_test_score,
        previous_lint_score=self._previous_lint_score,
        # Author state; the threshold falls back to 0.5 when the
        # personality has no entry in the thresholds mapping.
        author_confidence=author._confidence,
        author_threshold=author.thresholds.get(author.personality, 0.5),
        # Episode progress.
        step=self._step_count,
        max_steps=self.max_steps,
        progress_ratio=self._step_count / self.max_steps,
        # Which tools have been used so far.
        tests_run=self._tests_run,
        linter_run=self._linter_run,
        docs_queried=self._docs_queried,
        # Most recent action plus the last five history entries.
        last_action_type=self._last_action_type,
        action_history=self._action_history[-5:],
        # Terminal flag.
        done=self._done,
        # Task context.
        bug_description=self._bug_description,
        comments_count=len(self._comments),
    )
|
| 283 |
# ===================================================================
|
| 284 |
def _compute_dense_reward(
|
| 285 |
self,
|
|
|
|
| 514 |
self._done = True
|
| 515 |
else:
|
| 516 |
self._current_code = action.fix_code
|
| 517 |
+
|
| 518 |
runner = TestRunner(self._current_bug_id)
|
| 519 |
test_score, test_output = runner.run_tests(self._current_code)
|
| 520 |
lint_score = self._run_linter_score(self._current_code)
|
| 521 |
negotiation_score = self._author.get_negotiation_score()
|
| 522 |
+
|
|
|
|
| 523 |
self._current_test_score = test_score
|
| 524 |
self._current_lint_score = lint_score
|
| 525 |
+
|
|
|
|
| 526 |
component_reward = (
|
| 527 |
0.4 * test_score +
|
| 528 |
0.15 * lint_score +
|
| 529 |
0.15 * negotiation_score
|
| 530 |
)
|
|
|
|
| 531 |
efficiency = 1.0 - (self._step_count / self.max_steps)
|
| 532 |
component_reward += 0.1 * efficiency
|
| 533 |
+
|
|
|
|
| 534 |
if test_score > 0.8 and lint_score < 0.3:
|
| 535 |
component_reward *= 0.85
|
| 536 |
if test_score < 0.3 and lint_score > 0.8:
|
| 537 |
component_reward *= 0.75
|
| 538 |
if test_score > 0.8 and negotiation_score < 0.3:
|
| 539 |
component_reward *= 0.8
|
| 540 |
+
|
|
|
|
| 541 |
threshold = self._author.thresholds.get(self._author.personality, 0.5)
|
| 542 |
if self._author._confidence < threshold:
|
| 543 |
component_reward = max(0.0, component_reward - 0.2)
|
|
|
|
| 547 |
self._done = True
|
| 548 |
else:
|
| 549 |
self._done = True
|
| 550 |
+
|
| 551 |
+
# Get author's verbal feedback (pushback or acceptance)
|
| 552 |
+
author_feedback = self._author.respond(
|
| 553 |
+
agent_comment=f"Proposed fix:\n{action.fix_code}",
|
| 554 |
+
test_results=f"Score: {test_score:.2f}",
|
| 555 |
+
lint_results=f"Score: {lint_score:.2f}",
|
| 556 |
+
doc_results=self._doc_results,
|
| 557 |
+
proposed_fix=action.fix_code,
|
| 558 |
+
original_code=self._current_code # note: original should be the buggy code
|
| 559 |
+
)
|
| 560 |
+
# Keep the author's reply as the main output (so agent sees it)
|
| 561 |
+
self._test_results = f"[Fix] Author: {author_feedback[:200]}"
|
| 562 |
+
self._comments.append(f"Author: {author_feedback}")
|
| 563 |
+
|
| 564 |
base_reward = component_reward
|
| 565 |
self._test_results = f"[Fix] Test: {test_score:.2f}, Lint: {lint_score:.2f}\n{test_output[:200]}"
|
| 566 |
|