100XZX001 committed on
Commit
bc6e73e
·
verified ·
1 Parent(s): 5087fec

Update environment.py

Browse files
Files changed (1) hide show
  1. environment.py +88 -179
environment.py CHANGED
@@ -7,18 +7,25 @@ import os
7
  import re
8
  from dataclasses import dataclass, field
9
  from typing import Tuple, Dict, Any, Optional, List
10
- from collections import Counter
11
 
12
  from models import (
13
  AnyAction, WriteComment, ProposeFix, Execute, Inspect,
14
  RunLinter, RunTests, QueryDocs, Skip, Done, AskQuestion,
15
  Observation, Reward, State
16
  )
17
- from grader import RigorousGrader
18
  from redteam import RedTeam
19
  from test_runner import TestRunner
20
  from author import PersonaAuthor
21
  from rltool import ToolBox
 
 
 
 
 
 
 
 
 
22
 
23
  # ======================================================================
24
  # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
@@ -32,37 +39,38 @@ class EnhancedObservation:
32
  # Code state
33
  code_snippet: str
34
  last_tool_output: str
35
-
 
36
  # Current metrics
37
  current_test_score: float
38
  current_lint_score: float
39
  negotiation_score: float
40
-
41
  # CRITICAL: Previous metrics (for understanding deltas)
42
  previous_test_score: float
43
  previous_lint_score: float
44
-
45
  # CRITICAL: Author internal state (affects reward gating)
46
  author_confidence: float
47
  author_threshold: float # When author accepts
48
-
49
  # Progress tracking
50
  step: int
51
  max_steps: int
52
  progress_ratio: float
53
-
54
  # Tool usage flags
55
  tests_run: bool
56
  linter_run: bool
57
  docs_queried: bool
58
-
59
  # Action history (with outcomes)
60
  last_action_type: str
61
  action_history: List[str] # Last 5 actions
62
-
63
  # Terminal flag
64
  done: bool
65
-
66
  # Additional context
67
  bug_description: str
68
  comments_count: int
@@ -107,16 +115,16 @@ class CodeReviewEnv:
107
  task: str = "easy"
108
  max_steps: int = 10
109
  step_penalty: float = 0.01
110
-
111
  # Curriculum learning
112
  auto_difficulty: bool = False
113
  success_threshold: float = 0.7
114
-
115
  # Reward shaping parameters
116
  delta_weight: float = 0.3
117
  tool_usage_bonus: float = 0.05
118
  diversity_bonus: float = 0.03
119
-
120
  _red_team: Optional[RedTeam] = field(init=False, default=None)
121
  _author: Optional[PersonaAuthor] = field(init=False, default=None)
122
 
@@ -132,22 +140,22 @@ class CodeReviewEnv:
132
 
133
  _step_count: int = field(init=False, default=0)
134
  _done: bool = field(init=False, default=False)
135
-
136
  # State tracking for dense rewards
137
  _previous_test_score: float = field(init=False, default=0.0)
138
  _previous_lint_score: float = field(init=False, default=0.0)
139
  _current_test_score: float = field(init=False, default=0.0)
140
  _current_lint_score: float = field(init=False, default=0.0)
141
-
142
  # Tool usage tracking
143
  _tests_run: bool = field(init=False, default=False)
144
  _linter_run: bool = field(init=False, default=False)
145
  _docs_queried: bool = field(init=False, default=False)
146
-
147
  # Action history
148
  _action_history: List[str] = field(init=False, default_factory=list)
149
  _last_action_type: str = field(init=False, default="none")
150
-
151
  # FIXED: Track CUMULATIVE episode reward
152
  _episode_total_reward: float = field(init=False, default=0.0)
153
  _episode_rewards: List[float] = field(init=False, default_factory=list)
@@ -165,37 +173,46 @@ class CodeReviewEnv:
165
  self.task = task
166
  self._red_team = RedTeam(task)
167
  self._author = PersonaAuthor()
 
 
 
 
 
 
 
 
 
168
 
169
  task_to_level = {
170
- "easy": 0, "medium": 1, "hard": 2,
171
  "harder": 3, "hardest": 4
172
  }
173
  self._difficulty_level = task_to_level[task]
174
-
175
  self._reset_internal()
176
 
177
  # ===================================================================
178
  def _reset_internal(self):
179
- self._step_count = 0
180
  self._comments = []
181
  self._test_results = None
182
  self._lint_results = None
183
  self._doc_results = None
184
  self._done = False
185
-
186
  # Reset state tracking
187
  self._previous_test_score = 0.0
188
  self._previous_lint_score = 0.0
189
  self._current_test_score = 0.0
190
  self._current_lint_score = 0.0
191
-
192
  self._tests_run = False
193
  self._linter_run = False
194
  self._docs_queried = False
195
-
196
  self._action_history = []
197
  self._last_action_type = "none"
198
-
199
  # FIXED: Reset episode cumulative reward
200
  self._episode_total_reward = 0.0
201
 
@@ -225,18 +242,18 @@ class CodeReviewEnv:
225
  """Reset with optional curriculum adjustment."""
226
  if self.auto_difficulty and len(self._episode_rewards) > 0:
227
  recent_performance = sum(self._episode_rewards[-5:]) / min(5, len(self._episode_rewards))
228
-
229
  if recent_performance > self.success_threshold and self._difficulty_level < 4:
230
  self._difficulty_level += 1
231
  print(f"[Curriculum] Increasing difficulty to level {self._difficulty_level}")
232
  elif recent_performance < 0.3 and self._difficulty_level > 0:
233
  self._difficulty_level -= 1
234
  print(f"[Curriculum] Decreasing difficulty to level {self._difficulty_level}")
235
-
236
  level_to_task = {0: "easy", 1: "medium", 2: "hard", 3: "harder", 4: "hardest"}
237
  self.task = level_to_task[self._difficulty_level]
238
  self._red_team = RedTeam(self.task)
239
-
240
  self._reset_internal()
241
  return self._get_observation()
242
 
@@ -252,9 +269,9 @@ class CodeReviewEnv:
252
  return EnhancedObservation(
253
  code_snippet=self._current_code,
254
  last_tool_output=self._test_results or "",
255
- author_response=author_response, # ← fixed
256
 
257
- current_test_score=self._current_test_score,
258
  current_lint_score=self._current_lint_score,
259
  negotiation_score=self._author.get_negotiation_score(),
260
 
@@ -279,104 +296,7 @@ class CodeReviewEnv:
279
 
280
  bug_description=self._bug_description,
281
  comments_count=len(self._comments),
282
- )
283
- # ===================================================================
284
- def _compute_dense_reward(
285
- self,
286
- action: AnyAction,
287
- base_reward: float,
288
- action_type: str
289
- ) -> float:
290
- """
291
- Stabilized dense reward:
292
- - Decoupled terminal bonus
293
- - Controlled base scaling
294
- - Symmetric delta handling
295
- - Reduced reward hacking surface
296
- """
297
-
298
- # ============================================================
299
- # 0. BASE REWARD (controlled contribution)
300
- # ============================================================
301
- reward = 0.4 * base_reward # ↓ reduce dominance
302
-
303
- # ============================================================
304
- # 1. DELTA REWARDS (primary learning signal)
305
- # ============================================================
306
- effective_delta_weight = self.delta_weight
307
- if action_type == "propose_fix":
308
- effective_delta_weight *= 0.4 # stronger cut to avoid overlap
309
-
310
- test_delta = self._current_test_score - self._previous_test_score
311
- lint_delta = self._current_lint_score - self._previous_lint_score
312
-
313
- # symmetric (no artificial dampening for negatives)
314
- reward += effective_delta_weight * test_delta
315
- reward += 0.5 * effective_delta_weight * lint_delta
316
-
317
- # ============================================================
318
- # 2. TERMINAL SUCCESS BONUS (clean & isolated)
319
- # ============================================================
320
- if action_type == "propose_fix":
321
- if self._current_test_score > 0.95:
322
- reward += 0.4 # slightly reduced to prevent saturation
323
- elif self._current_test_score > 0.85:
324
- reward += 0.2 # smoother gradient instead of jump
325
-
326
- # ============================================================
327
- # 3. TOOL USAGE (early guidance only)
328
- # ============================================================
329
- if action_type == "run_tests":
330
- if not self._tests_run:
331
- reward += self.tool_usage_bonus
332
- reward += 0.015
333
-
334
- elif action_type == "run_linter":
335
- if not self._linter_run:
336
- reward += self.tool_usage_bonus
337
- reward += 0.015
338
-
339
- elif action_type == "query_docs":
340
- if not self._docs_queried:
341
- reward += self.tool_usage_bonus * 0.5
342
-
343
- elif action_type == "ask_question":
344
- if self._step_count <= 3:
345
- reward += 0.02 # tighter window
346
-
347
- # ============================================================
348
- # 4. EXPLORATION (less noisy)
349
- # ============================================================
350
- if len(self._action_history) >= 3:
351
- recent = self._action_history[-3:]
352
- unique = len(set(recent))
353
-
354
- if unique == 1:
355
- reward -= 0.05
356
- elif unique == 3:
357
- reward += self.diversity_bonus * 0.7 # reduce randomness bias
358
-
359
- # ============================================================
360
- # 5. ANTI-HACKING
361
- # ============================================================
362
- if action_type == "propose_fix":
363
- if not self._tests_run:
364
- reward -= 0.25 # stronger enforcement
365
- if self._step_count < 2:
366
- reward -= 0.1
367
- if self._tests_run and self._linter_run:
368
- reward += 0.02
369
-
370
- # ============================================================
371
- # 6. STEP PENALTY (progress pressure)
372
- # ============================================================
373
- reward -= self.step_penalty
374
-
375
- # ============================================================
376
- # 7. CLIP (final safety)
377
- # ============================================================
378
- return max(-1.0, min(1.0, reward))
379
-
380
 
381
  # ===================================================================
382
  def _get_action_type(self, action: AnyAction) -> str:
@@ -419,11 +339,10 @@ class CodeReviewEnv:
419
  # Store previous metrics for delta computation
420
  self._previous_test_score = self._current_test_score
421
  self._previous_lint_score = self._current_lint_score
422
-
423
  base_reward = 0.0
424
- info = {}
425
  action_type = self._get_action_type(action)
426
-
427
  # Update action history
428
  self._action_history.append(action_type)
429
  self._last_action_type = action_type
@@ -445,7 +364,7 @@ class CodeReviewEnv:
445
  lint_output = ToolBox.run_linter(self._current_code)
446
  self._lint_results = lint_output[:500]
447
  self._test_results = f"[Linter]\n{self._lint_results}"
448
-
449
  self._current_lint_score = self._run_linter_score(self._current_code)
450
  self._linter_run = True
451
  base_reward = 0.002
@@ -453,13 +372,13 @@ class CodeReviewEnv:
453
  elif isinstance(action, RunTests):
454
  runner = TestRunner(self._current_bug_id)
455
  score, output = runner.run_tests(self._current_code)
456
-
457
  self._current_test_score = score
458
  self._tests_run = True
459
-
460
  self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
461
  base_reward = 0.002
462
-
463
  if score > 0.8:
464
  base_reward += 0.005
465
 
@@ -475,7 +394,7 @@ class CodeReviewEnv:
475
  # ==============================================================
476
  elif isinstance(action, WriteComment):
477
  self._comments.append(f"Agent: {action.comment_text}")
478
-
479
  response = self._author.respond(
480
  agent_comment=action.comment_text,
481
  test_results=self._test_results,
@@ -484,23 +403,23 @@ class CodeReviewEnv:
484
  proposed_fix=None,
485
  original_code=self._current_code
486
  )
487
-
488
  self._comments.append(f"Author: {response}")
489
  self._test_results = f"[Comment] Author: {response[:200]}"
490
  base_reward = 0.001
491
 
492
  elif isinstance(action, AskQuestion):
493
  self._comments.append(f"Agent: {action.question}")
494
-
495
  response = self._author.respond(
496
  agent_question=action.question,
497
  test_results=self._test_results,
498
  lint_results=self._lint_results,
499
  doc_results=self._doc_results,
500
  proposed_fix=None,
501
- original_code=original_buggy
502
  )
503
-
504
  self._comments.append(f"Author: {response}")
505
  self._test_results = f"[Question] Author: {response[:200]}"
506
  base_reward = 0.002
@@ -513,6 +432,7 @@ class CodeReviewEnv:
513
  base_reward = -0.05
514
  self._done = True
515
  else:
 
516
  original_buggy = self._current_code
517
  self._current_code = action.fix_code
518
 
@@ -524,24 +444,9 @@ class CodeReviewEnv:
524
  self._current_test_score = test_score
525
  self._current_lint_score = lint_score
526
 
527
- component_reward = (
528
- 0.4 * test_score +
529
- 0.15 * lint_score +
530
- 0.15 * negotiation_score
531
- )
532
- efficiency = 1.0 - (self._step_count / self.max_steps)
533
- component_reward += 0.1 * efficiency
534
-
535
- if test_score > 0.8 and lint_score < 0.3:
536
- component_reward *= 0.85
537
- if test_score < 0.3 and lint_score > 0.8:
538
- component_reward *= 0.75
539
- if test_score > 0.8 and negotiation_score < 0.3:
540
- component_reward *= 0.8
541
-
542
  threshold = self._author.thresholds.get(self._author.personality, 0.5)
543
  if self._author._confidence < threshold:
544
- component_reward = max(0.0, component_reward - 0.2)
545
  if self._step_count < self.max_steps:
546
  self._done = False
547
  else:
@@ -549,21 +454,19 @@ class CodeReviewEnv:
549
  else:
550
  self._done = True
551
 
552
- # Get author's verbal feedback (pushback or acceptance)
553
  author_feedback = self._author.respond(
554
  agent_comment=f"Proposed fix:\n{action.fix_code}",
555
  test_results=f"Score: {test_score:.2f}",
556
  lint_results=f"Score: {lint_score:.2f}",
557
  doc_results=self._doc_results,
558
  proposed_fix=action.fix_code,
559
- original_code=self._current_code # note: original should be the buggy code
560
  )
561
- # Keep the author's reply as the main output (so agent sees it)
562
  self._test_results = f"[Fix] Author: {author_feedback[:200]}"
563
  self._comments.append(f"Author: {author_feedback}")
564
 
565
- base_reward = component_reward
566
- self._test_results = f"[Fix] Test: {test_score:.2f}, Lint: {lint_score:.2f}\n{test_output[:200]}"
567
 
568
  # ==============================================================
569
  # TERMINATION ACTIONS
@@ -584,37 +487,43 @@ class CodeReviewEnv:
584
  self._done = True
585
 
586
  # ==============================================================
587
- # COMPUTE FINAL DENSE REWARD (with action_type for fix detection)
588
- # ==============================================================
589
- final_reward = self._compute_dense_reward(action, base_reward, action_type)
590
-
591
- # FIXED: Track CUMULATIVE episode reward
592
- self._episode_total_reward += final_reward
593
-
594
- # ==============================================================
595
- # STEP UPDATE
596
  # ==============================================================
597
  self._step_count += 1
598
-
599
  if self._step_count >= self.max_steps:
600
  self._done = True
601
-
602
- # FIXED: Store TOTAL episode reward, not just last step
603
- if self._done:
604
- self._episode_rewards.append(self._episode_total_reward)
605
-
606
  obs = self._get_observation()
607
-
 
608
  info = {
 
609
  "test_score": self._current_test_score,
610
  "lint_score": self._current_lint_score,
611
  "test_delta": self._current_test_score - self._previous_test_score,
612
  "lint_delta": self._current_lint_score - self._previous_lint_score,
613
  "base_reward": base_reward,
614
- "final_reward": final_reward,
615
- "episode_total": self._episode_total_reward,
616
  }
617
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
  return obs, Reward(value=final_reward), self._done, info
619
 
620
  # ===================================================================
 
7
  import re
8
  from dataclasses import dataclass, field
9
  from typing import Tuple, Dict, Any, Optional, List
 
10
 
11
  from models import (
12
  AnyAction, WriteComment, ProposeFix, Execute, Inspect,
13
  RunLinter, RunTests, QueryDocs, Skip, Done, AskQuestion,
14
  Observation, Reward, State
15
  )
 
16
  from redteam import RedTeam
17
  from test_runner import TestRunner
18
  from author import PersonaAuthor
19
  from rltool import ToolBox
20
+ from rubrics import (
21
+ ToolUsageRubric,
22
+ TestDeltaRubric,
23
+ LintDeltaRubric,
24
+ TerminalSuccessRubric,
25
+ ExplorationRubric,
26
+ AntiHackingRubric,
27
+ StepPenaltyRubric,
28
+ )
29
 
30
  # ======================================================================
31
  # FULLY MARKOV OBSERVATION (NOTHING HIDDEN)
 
39
  # Code state
40
  code_snippet: str
41
  last_tool_output: str
42
+ author_response: str = "" # ← ADDED
43
+
44
  # Current metrics
45
  current_test_score: float
46
  current_lint_score: float
47
  negotiation_score: float
48
+
49
  # CRITICAL: Previous metrics (for understanding deltas)
50
  previous_test_score: float
51
  previous_lint_score: float
52
+
53
  # CRITICAL: Author internal state (affects reward gating)
54
  author_confidence: float
55
  author_threshold: float # When author accepts
56
+
57
  # Progress tracking
58
  step: int
59
  max_steps: int
60
  progress_ratio: float
61
+
62
  # Tool usage flags
63
  tests_run: bool
64
  linter_run: bool
65
  docs_queried: bool
66
+
67
  # Action history (with outcomes)
68
  last_action_type: str
69
  action_history: List[str] # Last 5 actions
70
+
71
  # Terminal flag
72
  done: bool
73
+
74
  # Additional context
75
  bug_description: str
76
  comments_count: int
 
115
  task: str = "easy"
116
  max_steps: int = 10
117
  step_penalty: float = 0.01
118
+
119
  # Curriculum learning
120
  auto_difficulty: bool = False
121
  success_threshold: float = 0.7
122
+
123
  # Reward shaping parameters
124
  delta_weight: float = 0.3
125
  tool_usage_bonus: float = 0.05
126
  diversity_bonus: float = 0.03
127
+
128
  _red_team: Optional[RedTeam] = field(init=False, default=None)
129
  _author: Optional[PersonaAuthor] = field(init=False, default=None)
130
 
 
140
 
141
  _step_count: int = field(init=False, default=0)
142
  _done: bool = field(init=False, default=False)
143
+
144
  # State tracking for dense rewards
145
  _previous_test_score: float = field(init=False, default=0.0)
146
  _previous_lint_score: float = field(init=False, default=0.0)
147
  _current_test_score: float = field(init=False, default=0.0)
148
  _current_lint_score: float = field(init=False, default=0.0)
149
+
150
  # Tool usage tracking
151
  _tests_run: bool = field(init=False, default=False)
152
  _linter_run: bool = field(init=False, default=False)
153
  _docs_queried: bool = field(init=False, default=False)
154
+
155
  # Action history
156
  _action_history: List[str] = field(init=False, default_factory=list)
157
  _last_action_type: str = field(init=False, default="none")
158
+
159
  # FIXED: Track CUMULATIVE episode reward
160
  _episode_total_reward: float = field(init=False, default=0.0)
161
  _episode_rewards: List[float] = field(init=False, default_factory=list)
 
173
  self.task = task
174
  self._red_team = RedTeam(task)
175
  self._author = PersonaAuthor()
176
+ self.rubrics = [
177
+ TestDeltaRubric(weight=self.delta_weight),
178
+ LintDeltaRubric(weight=self.delta_weight),
179
+ ToolUsageRubric(bonus=self.tool_usage_bonus),
180
+ TerminalSuccessRubric(),
181
+ ExplorationRubric(penalty=-0.05, bonus=self.diversity_bonus * 0.7),
182
+ AntiHackingRubric(),
183
+ StepPenaltyRubric(penalty=self.step_penalty),
184
+ ]
185
 
186
  task_to_level = {
187
+ "easy": 0, "medium": 1, "hard": 2,
188
  "harder": 3, "hardest": 4
189
  }
190
  self._difficulty_level = task_to_level[task]
191
+
192
  self._reset_internal()
193
 
194
  # ===================================================================
195
  def _reset_internal(self):
196
+ self._step_count = 0 # ← FIXED
197
  self._comments = []
198
  self._test_results = None
199
  self._lint_results = None
200
  self._doc_results = None
201
  self._done = False
202
+
203
  # Reset state tracking
204
  self._previous_test_score = 0.0
205
  self._previous_lint_score = 0.0
206
  self._current_test_score = 0.0
207
  self._current_lint_score = 0.0
208
+
209
  self._tests_run = False
210
  self._linter_run = False
211
  self._docs_queried = False
212
+
213
  self._action_history = []
214
  self._last_action_type = "none"
215
+
216
  # FIXED: Reset episode cumulative reward
217
  self._episode_total_reward = 0.0
218
 
 
242
  """Reset with optional curriculum adjustment."""
243
  if self.auto_difficulty and len(self._episode_rewards) > 0:
244
  recent_performance = sum(self._episode_rewards[-5:]) / min(5, len(self._episode_rewards))
245
+
246
  if recent_performance > self.success_threshold and self._difficulty_level < 4:
247
  self._difficulty_level += 1
248
  print(f"[Curriculum] Increasing difficulty to level {self._difficulty_level}")
249
  elif recent_performance < 0.3 and self._difficulty_level > 0:
250
  self._difficulty_level -= 1
251
  print(f"[Curriculum] Decreasing difficulty to level {self._difficulty_level}")
252
+
253
  level_to_task = {0: "easy", 1: "medium", 2: "hard", 3: "harder", 4: "hardest"}
254
  self.task = level_to_task[self._difficulty_level]
255
  self._red_team = RedTeam(self.task)
256
+
257
  self._reset_internal()
258
  return self._get_observation()
259
 
 
269
  return EnhancedObservation(
270
  code_snippet=self._current_code,
271
  last_tool_output=self._test_results or "",
272
+ author_response=author_response, # ← now field exists
273
 
274
+ current_test_score=self._current_test_score,
275
  current_lint_score=self._current_lint_score,
276
  negotiation_score=self._author.get_negotiation_score(),
277
 
 
296
 
297
  bug_description=self._bug_description,
298
  comments_count=len(self._comments),
299
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  # ===================================================================
302
  def _get_action_type(self, action: AnyAction) -> str:
 
339
  # Store previous metrics for delta computation
340
  self._previous_test_score = self._current_test_score
341
  self._previous_lint_score = self._current_lint_score
342
+
343
  base_reward = 0.0
 
344
  action_type = self._get_action_type(action)
345
+
346
  # Update action history
347
  self._action_history.append(action_type)
348
  self._last_action_type = action_type
 
364
  lint_output = ToolBox.run_linter(self._current_code)
365
  self._lint_results = lint_output[:500]
366
  self._test_results = f"[Linter]\n{self._lint_results}"
367
+
368
  self._current_lint_score = self._run_linter_score(self._current_code)
369
  self._linter_run = True
370
  base_reward = 0.002
 
372
  elif isinstance(action, RunTests):
373
  runner = TestRunner(self._current_bug_id)
374
  score, output = runner.run_tests(self._current_code)
375
+
376
  self._current_test_score = score
377
  self._tests_run = True
378
+
379
  self._test_results = f"[Tests] Score: {score:.2f}\n{output[:300]}"
380
  base_reward = 0.002
381
+
382
  if score > 0.8:
383
  base_reward += 0.005
384
 
 
394
  # ==============================================================
395
  elif isinstance(action, WriteComment):
396
  self._comments.append(f"Agent: {action.comment_text}")
397
+
398
  response = self._author.respond(
399
  agent_comment=action.comment_text,
400
  test_results=self._test_results,
 
403
  proposed_fix=None,
404
  original_code=self._current_code
405
  )
406
+
407
  self._comments.append(f"Author: {response}")
408
  self._test_results = f"[Comment] Author: {response[:200]}"
409
  base_reward = 0.001
410
 
411
  elif isinstance(action, AskQuestion):
412
  self._comments.append(f"Agent: {action.question}")
413
+
414
  response = self._author.respond(
415
  agent_question=action.question,
416
  test_results=self._test_results,
417
  lint_results=self._lint_results,
418
  doc_results=self._doc_results,
419
  proposed_fix=None,
420
+ original_code=self._current_code # ← FIXED
421
  )
422
+
423
  self._comments.append(f"Author: {response}")
424
  self._test_results = f"[Question] Author: {response[:200]}"
425
  base_reward = 0.002
 
432
  base_reward = -0.05
433
  self._done = True
434
  else:
435
+ # Save original code BEFORE overwriting (for author.respond)
436
  original_buggy = self._current_code
437
  self._current_code = action.fix_code
438
 
 
444
  self._current_test_score = test_score
445
  self._current_lint_score = lint_score
446
 
447
+ # Author gating – determines if the episode ends, reward is separate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  threshold = self._author.thresholds.get(self._author.personality, 0.5)
449
  if self._author._confidence < threshold:
 
450
  if self._step_count < self.max_steps:
451
  self._done = False
452
  else:
 
454
  else:
455
  self._done = True
456
 
457
+ # Get author's verbal feedback (pushback/acceptance)
458
  author_feedback = self._author.respond(
459
  agent_comment=f"Proposed fix:\n{action.fix_code}",
460
  test_results=f"Score: {test_score:.2f}",
461
  lint_results=f"Score: {lint_score:.2f}",
462
  doc_results=self._doc_results,
463
  proposed_fix=action.fix_code,
464
+ original_code=original_buggy # now correctly the buggy code, not the fix
465
  )
 
466
  self._test_results = f"[Fix] Author: {author_feedback[:200]}"
467
  self._comments.append(f"Author: {author_feedback}")
468
 
469
+ base_reward = 0.001 # rubrics provide the real signal
 
470
 
471
  # ==============================================================
472
  # TERMINATION ACTIONS
 
487
  self._done = True
488
 
489
  # ==============================================================
490
+ # STEP UPDATE (before rubric computation so info contains final step)
 
 
 
 
 
 
 
 
491
  # ==============================================================
492
  self._step_count += 1
 
493
  if self._step_count >= self.max_steps:
494
  self._done = True
495
+
496
+ # Get fresh observation (needed for rubrics that may read obs)
 
 
 
497
  obs = self._get_observation()
498
+
499
+ # Prepare info dict (rubrics may need action_type and deltas)
500
  info = {
501
+ "action_type": action_type,
502
  "test_score": self._current_test_score,
503
  "lint_score": self._current_lint_score,
504
  "test_delta": self._current_test_score - self._previous_test_score,
505
  "lint_delta": self._current_lint_score - self._previous_lint_score,
506
  "base_reward": base_reward,
 
 
507
  }
508
+
509
+ # ==============================================================
510
+ # COMPUTE FINAL REWARD USING RUBRICS
511
+ # ==============================================================
512
+ rubric_score = sum(r(self, action, obs, None, self._done, info) for r in self.rubrics)
513
+ final_reward = 0.4 * base_reward + rubric_score
514
+ final_reward = max(-1.0, min(1.0, final_reward)) # safety clip
515
+
516
+ # Track cumulative episode reward
517
+ self._episode_total_reward += final_reward
518
+
519
+ # Store episode total if done
520
+ if self._done:
521
+ self._episode_rewards.append(self._episode_total_reward)
522
+
523
+ # Complete info
524
+ info["final_reward"] = final_reward
525
+ info["episode_total"] = self._episode_total_reward
526
+
527
  return obs, Reward(value=final_reward), self._done, info
528
 
529
  # ===================================================================