databoysu committed on
Commit
fd95d06
·
1 Parent(s): a27cb68

improving cot

Browse files
Files changed (1) hide show
  1. inference.py +55 -32
inference.py CHANGED
@@ -47,37 +47,35 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
47
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
48
 
49
  SYSTEM_PROMPT = """\
50
- You are the policy controller for a Python debugging RL environment.
 
51
 
52
- Operating contract:
53
  1. Output exactly one CodeAction object per turn.
54
- 2. Use the prior conversation history, especially your earlier thoughts and failed outputs, to avoid repeating mistakes.
55
- 3. Treat PARSE_ERROR feedback as hard constraints that must be corrected on the next turn.
56
-
57
- Reasoning procedure (must be reflected in thought):
58
- 1. Read localized_context and last_execution_output first.
59
- 2. If tests failed, identify the most likely failing line and root cause.
60
- 3. Decide the next single best action that maximizes test progress.
61
- 4. If editing, reference exact line keys from code_dict and provide correctly indented replacement code.
62
- 5. Only SUBMIT when current evidence shows all tests pass.
63
-
64
- Action policy:
65
- - VIEW_CODE: only when you need to re-orient line mapping.
66
- - RUN_TESTS: use to obtain fresh traceback evidence and validate edits.
67
- - REPLACE_LINES: apply a focused fix for one concrete bug.
68
- - UNDO_EDIT: use when latest edit regressed behavior.
69
- - RESET_TO_ORIGINAL: last resort recovery.
70
- - SUBMIT: only after explicit all-pass confirmation.
71
-
72
- Formatting policy:
73
- - Return a valid CodeAction object only.
74
- - No markdown, no prose outside the action fields.
75
- - Use EXACT keys: thought, action_type, start_line, end_line, new_code_block.
76
- - NEVER use legacy keys such as type, lines, or source.
77
-
78
- Valid JSON examples:
79
- {"thought":"I need traceback details before editing.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
80
- {"thought":"Line 2 slicing step is wrong; replace only that line.","action_type":"REPLACE_LINES","start_line":2,"end_line":2,"new_code_block":" return s[::-1]"}
81
  """
82
 
83
 
@@ -156,7 +154,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
156
  def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
157
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
158
  print(
159
- f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
160
  flush=True,
161
  )
162
 
@@ -264,6 +262,9 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
264
  score = 0.0
265
  success = False
266
  started = False
 
 
 
267
 
268
  try:
269
  if LOCAL_IMAGE_NAME:
@@ -335,6 +336,27 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
335
  ),
336
  )
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  result = await env.step(action)
339
 
340
  reward = float(result.reward or 0.0)
@@ -360,8 +382,9 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
360
  if done:
361
  break
362
 
363
- score = _compute_score(result, rewards)
364
- success = score >= SUCCESS_SCORE_THRESHOLD
 
365
 
366
  except Exception as exc:
367
  if not started:
 
47
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
48
 
49
  SYSTEM_PROMPT = """\
50
+ You are an autonomous Software Engineering RL Agent.
51
+ You are strictly evaluated on your ability to reason deeply before taking action.
52
 
53
+ OPERATING CONTRACT:
54
  1. Output exactly one CodeAction object per turn.
55
+ 2. You MUST read your conversation history. If you just tried an edit and the tests still fail, DO NOT repeat the same edit.
56
+ 3. PARSE_ERROR means your last output was invalid. Fix your formatting immediately.
57
+
58
+ HOW TO THINK (The 'thought' field is mandatory):
59
+ Before choosing an action_type, your 'thought' MUST contain these exact 3 sentences:
60
+ 1. "Observation: [State what you see in the test output or traceback]"
61
+ 2. "Diagnosis: [Explain exactly which line is causing the bug and why]"
62
+ 3. "Plan: [State exactly what tool you will use next to fix it]"
63
+
64
+ ACTION POLICY:
65
+ - VIEW_CODE: Read the code mapping.
66
+ - RUN_TESTS: Execute tests to get the traceback.
67
+ - REPLACE_LINES: Apply a focused fix. Use EXACT line numbers from code_dict.
68
+ - UNDO_EDIT: Revert if the last edit caused a SyntaxError.
69
+ - SUBMIT: Use this ONLY when the last_execution_output explicitly confirms all tests pass.
70
+ - RESET_TO_ORIGINAL: Use this if wanting to reset the code file to try again or with a different strategy.
71
+
72
+ VALID JSON EXAMPLES (Follow this exact thought depth):
73
+
74
+ Example 1 (Planning an edit):
75
+ {"thought":"Observation: The last_execution_output shows an IndexError on line 12 because 'i+1' is out of bounds. Diagnosis: The loop condition 'for i in range(len(arr))' goes to the end of the array, so 'arr[i+1]' fails on the last iteration. Plan: I will use REPLACE_LINES on line 10 to change the loop to 'range(len(arr)-1)'.","action_type":"REPLACE_LINES","start_line":10,"end_line":10,"new_code_block":" for i in range(len(arr) - 1):"}
76
+
77
+ Example 2 (Testing):
78
+ {"thought":"Observation: I just replaced line 10 with the corrected loop condition. Diagnosis: I need to verify if this change fixed the IndexError and didn't break other boundary tests. Plan: I will use RUN_TESTS to get fresh evidence.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
 
 
 
79
  """
80
 
81
 
 
154
  def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
155
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
156
  print(
157
+ f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}",
158
  flush=True,
159
  )
160
 
 
262
  score = 0.0
263
  success = False
264
  started = False
265
+ kill_switch_triggered = False
266
+ last_action_type: Optional[str] = None
267
+ consecutive_same_action_count = 0
268
 
269
  try:
270
  if LOCAL_IMAGE_NAME:
 
336
  ),
337
  )
338
 
339
+ current_action_type = action.action_type
340
+ if current_action_type == last_action_type:
341
+ consecutive_same_action_count += 1
342
+ else:
343
+ consecutive_same_action_count = 1
344
+ last_action_type = current_action_type
345
+
346
+ if (
347
+ current_action_type == "RUN_TESTS"
348
+ and consecutive_same_action_count >= 3
349
+ ):
350
+ kill_switch_triggered = True
351
+ history.append(
352
+ "KILL_SWITCH: RUN_TESTS selected 3 times consecutively. "
353
+ "Terminating episode early to prevent looping."
354
+ )
355
+ steps_taken = step
356
+ success = False
357
+ score = 0.0
358
+ break
359
+
360
  result = await env.step(action)
361
 
362
  reward = float(result.reward or 0.0)
 
382
  if done:
383
  break
384
 
385
+ if not kill_switch_triggered:
386
+ score = _compute_score(result, rewards)
387
+ success = score >= SUCCESS_SCORE_THRESHOLD
388
 
389
  except Exception as exc:
390
  if not started: