Spaces:
Sleeping
Sleeping
databoysu committed on
Commit ·
fd95d06
1
Parent(s): a27cb68
improving cot
Browse files - inference.py +55 -32
inference.py
CHANGED
|
@@ -47,37 +47,35 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
|
|
| 47 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
|
| 48 |
|
| 49 |
SYSTEM_PROMPT = """\
|
| 50 |
-
You are
|
|
|
|
| 51 |
|
| 52 |
-
|
| 53 |
1. Output exactly one CodeAction object per turn.
|
| 54 |
-
2.
|
| 55 |
-
3.
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
-
|
| 66 |
-
-
|
| 67 |
-
-
|
| 68 |
-
-
|
| 69 |
-
- RESET_TO_ORIGINAL:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
Valid JSON examples:
|
| 79 |
-
{"thought":"I need traceback details before editing.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
|
| 80 |
-
{"thought":"Line 2 slicing step is wrong; replace only that line.","action_type":"REPLACE_LINES","start_line":2,"end_line":2,"new_code_block":" return s[::-1]"}
|
| 81 |
"""
|
| 82 |
|
| 83 |
|
|
@@ -156,7 +154,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
|
|
| 156 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 157 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 158 |
print(
|
| 159 |
-
f"[END] success={str(success).lower()} steps={steps} score={score:.
|
| 160 |
flush=True,
|
| 161 |
)
|
| 162 |
|
|
@@ -264,6 +262,9 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
|
|
| 264 |
score = 0.0
|
| 265 |
success = False
|
| 266 |
started = False
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
try:
|
| 269 |
if LOCAL_IMAGE_NAME:
|
|
@@ -335,6 +336,27 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
|
|
| 335 |
),
|
| 336 |
)
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
result = await env.step(action)
|
| 339 |
|
| 340 |
reward = float(result.reward or 0.0)
|
|
@@ -360,8 +382,9 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
|
|
| 360 |
if done:
|
| 361 |
break
|
| 362 |
|
| 363 |
-
|
| 364 |
-
|
|
|
|
| 365 |
|
| 366 |
except Exception as exc:
|
| 367 |
if not started:
|
|
|
|
| 47 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
|
| 48 |
|
| 49 |
SYSTEM_PROMPT = """\
|
| 50 |
+
You are an autonomous Software Engineering RL Agent.
|
| 51 |
+
You are strictly evaluated on your ability to reason deeply before taking action.
|
| 52 |
|
| 53 |
+
OPERATING CONTRACT:
|
| 54 |
1. Output exactly one CodeAction object per turn.
|
| 55 |
+
2. You MUST read your conversation history. If you just tried an edit and the tests still fail, DO NOT repeat the same edit.
|
| 56 |
+
3. PARSE_ERROR means your last output was invalid. Fix your formatting immediately.
|
| 57 |
+
|
| 58 |
+
HOW TO THINK (The 'thought' field is mandatory):
|
| 59 |
+
Before choosing an action_type, your 'thought' MUST contain these exact 3 sentences:
|
| 60 |
+
1. "Observation: [State what you see in the test output or traceback]"
|
| 61 |
+
2. "Diagnosis: [Explain exactly which line is causing the bug and why]"
|
| 62 |
+
3. "Plan: [State exactly what tool you will use next to fix it]"
|
| 63 |
+
|
| 64 |
+
ACTION POLICY:
|
| 65 |
+
- VIEW_CODE: Read the code mapping.
|
| 66 |
+
- RUN_TESTS: Execute tests to get the traceback.
|
| 67 |
+
- REPLACE_LINES: Apply a focused fix. Use EXACT line numbers from code_dict.
|
| 68 |
+
- UNDO_EDIT: Revert if the last edit caused a SyntaxError.
|
| 69 |
+
- SUBMIT: Use this ONLY when the last_execution_output explicitly confirms all tests pass.
|
| 70 |
+
- RESET_TO_ORIGINAL: Use this if wanting to reset the code file to try again or with a different strategy.
|
| 71 |
+
|
| 72 |
+
VALID JSON EXAMPLES (Follow this exact thought depth):
|
| 73 |
+
|
| 74 |
+
Example 1 (Planning an edit):
|
| 75 |
+
{"thought":"Observation: The last_execution_output shows an IndexError on line 12 because 'i+1' is out of bounds. Diagnosis: The loop condition 'for i in range(len(arr))' goes to the end of the array, so 'arr[i+1]' fails on the last iteration. Plan: I will use REPLACE_LINES on line 10 to change the loop to 'range(len(arr)-1)'.","action_type":"REPLACE_LINES","start_line":10,"end_line":10,"new_code_block":" for i in range(len(arr) - 1):"}
|
| 76 |
+
|
| 77 |
+
Example 2 (Testing):
|
| 78 |
+
{"thought":"Observation: I just replaced line 10 with the corrected loop condition. Diagnosis: I need to verify if this change fixed the IndexError and didn't break other boundary tests. Plan: I will use RUN_TESTS to get fresh evidence.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
|
| 81 |
|
|
|
|
| 154 |
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 155 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 156 |
print(
|
| 157 |
+
f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}",
|
| 158 |
flush=True,
|
| 159 |
)
|
| 160 |
|
|
|
|
| 262 |
score = 0.0
|
| 263 |
success = False
|
| 264 |
started = False
|
| 265 |
+
kill_switch_triggered = False
|
| 266 |
+
last_action_type: Optional[str] = None
|
| 267 |
+
consecutive_same_action_count = 0
|
| 268 |
|
| 269 |
try:
|
| 270 |
if LOCAL_IMAGE_NAME:
|
|
|
|
| 336 |
),
|
| 337 |
)
|
| 338 |
|
| 339 |
+
current_action_type = action.action_type
|
| 340 |
+
if current_action_type == last_action_type:
|
| 341 |
+
consecutive_same_action_count += 1
|
| 342 |
+
else:
|
| 343 |
+
consecutive_same_action_count = 1
|
| 344 |
+
last_action_type = current_action_type
|
| 345 |
+
|
| 346 |
+
if (
|
| 347 |
+
current_action_type == "RUN_TESTS"
|
| 348 |
+
and consecutive_same_action_count >= 3
|
| 349 |
+
):
|
| 350 |
+
kill_switch_triggered = True
|
| 351 |
+
history.append(
|
| 352 |
+
"KILL_SWITCH: RUN_TESTS selected 3 times consecutively. "
|
| 353 |
+
"Terminating episode early to prevent looping."
|
| 354 |
+
)
|
| 355 |
+
steps_taken = step
|
| 356 |
+
success = False
|
| 357 |
+
score = 0.0
|
| 358 |
+
break
|
| 359 |
+
|
| 360 |
result = await env.step(action)
|
| 361 |
|
| 362 |
reward = float(result.reward or 0.0)
|
|
|
|
| 382 |
if done:
|
| 383 |
break
|
| 384 |
|
| 385 |
+
if not kill_switch_triggered:
|
| 386 |
+
score = _compute_score(result, rewards)
|
| 387 |
+
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 388 |
|
| 389 |
except Exception as exc:
|
| 390 |
if not started:
|