databoysu committed on
Commit
26f55d2
·
1 Parent(s): b65a477

improving thought and reasoning

Browse files
Files changed (1) hide show
  1. inference.py +36 -2
inference.py CHANGED
@@ -19,6 +19,7 @@ import argparse
19
  import asyncio
20
  import json
21
  import os
 
22
  import sys
23
  from pathlib import Path
24
  from typing import Any, Optional
@@ -78,6 +79,13 @@ How to read last_execution_output correctly:
78
  - Treat syntax errors as highest priority and fix them before semantic issues.
79
  - Never claim success unless output clearly indicates complete pass status.
80
 
 
 
 
 
 
 
 
81
  Action policy:
82
  - VIEW_CODE when line mapping or surrounding context is insufficient.
83
  - RUN_TESTS to collect fresh evidence after edits or when uncertain.
@@ -85,9 +93,12 @@ Action policy:
85
  - UNDO_EDIT if latest change worsened results or introduced new failures.
86
  - RESET_TO_ORIGINAL only as last-resort recovery.
87
  - SUBMIT only when last_execution_output explicitly and unambiguously indicates all tests passed.
 
 
88
 
89
  Submit gate (hard rule):
90
  - If any failure, error, traceback, xfailed/unfinished signal, or uncertainty remains, do not SUBMIT.
 
91
 
92
  Self-check before finalizing response:
93
  - Is this valid JSON?
@@ -146,7 +157,21 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
146
  )
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
149
  def _build_observation_text(observation: Any) -> str:
 
 
 
150
  code_dict = getattr(observation, "code_dict", {}) or {}
151
  sorted_items = sorted(
152
  ((int(line_num), text) for line_num, text in code_dict.items()),
@@ -160,8 +185,10 @@ def _build_observation_text(observation: Any) -> str:
160
  f"step_count={observation.step_count}\n"
161
  f"steps_remaining={observation.steps_remaining}\n"
162
  f"syntax_error={observation.syntax_error}\n"
 
 
163
  f"localized_context=\n{observation.localized_context}\n\n"
164
- f"last_execution_output=\n{observation.last_execution_output}\n\n"
165
  f"code_preview=\n{code_preview}"
166
  )
167
 
@@ -295,12 +322,19 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
295
  print(action.thought, file=sys.stderr, flush=True)
296
  else:
297
  obs_text = _build_observation_text(result.observation)
 
 
 
298
  history_messages.append(
299
  {
300
  "role": "user",
301
  "content": (
302
  "Pick the single best next action and return only one valid CodeAction JSON object. "
303
- "Use localized_context/last_execution_output as evidence, and do not SUBMIT unless all tests explicitly pass.\n\n"
 
 
 
 
304
  f"action_trajectory={(' -> '.join(action_trajectory) if action_trajectory else 'none')}\n\n"
305
  f"{obs_text}"
306
  ),
 
19
  import asyncio
20
  import json
21
  import os
22
+ import re
23
  import sys
24
  from pathlib import Path
25
  from typing import Any, Optional
 
79
  - Treat syntax errors as highest priority and fix them before semantic issues.
80
  - Never claim success unless output clearly indicates complete pass status.
81
 
82
+ Terminal decision rule (no waiting):
83
+ - If last_execution_output contains both a full pass count pattern (for example, "Tests Passed: N/N")
84
+ and the success marker "SUCCESS: ALL TESTS PASSED", the next action must be SUBMIT.
85
+ - If all_tests_pass_signal=true in the observation, the next action must be SUBMIT.
86
+ - Once this pass signal is present, RUN_TESTS is no longer a valid next action.
87
+ - Do not wait for extra confirmation, additional logs, or another RUN_TESTS cycle after this signal.
88
+
89
  Action policy:
90
  - VIEW_CODE when line mapping or surrounding context is insufficient.
91
  - RUN_TESTS to collect fresh evidence after edits or when uncertain.
 
93
  - UNDO_EDIT if latest change worsened results or introduced new failures.
94
  - RESET_TO_ORIGINAL only as last-resort recovery.
95
  - SUBMIT only when last_execution_output explicitly and unambiguously indicates all tests passed.
96
+ - After RUN_TESTS, do not choose RUN_TESTS again immediately unless test evidence is genuinely missing.
97
+ - Treat "no output" as invalid reasoning when pass_count_summary or traceback text is present.
98
 
99
  Submit gate (hard rule):
100
  - If any failure, error, traceback, xfailed/unfinished signal, or uncertainty remains, do not SUBMIT.
101
+ - If all-tests-passed signal is present, do SUBMIT immediately on this turn.
102
 
103
  Self-check before finalizing response:
104
  - Is this valid JSON?
 
157
  )
158
 
159
 
160
+ def _extract_pass_signal_fields(last_execution_output: str) -> tuple[str, bool]:
161
+ pass_count_match = re.search(r"Tests Passed:\s*(\d+)\s*/\s*(\d+)", last_execution_output)
162
+ pass_count_text = pass_count_match.group(0) if pass_count_match else "unknown"
163
+ all_tests_pass_signal = (
164
+ ("SUCCESS: ALL TESTS PASSED" in last_execution_output)
165
+ and bool(pass_count_match)
166
+ and (pass_count_match.group(1) == pass_count_match.group(2))
167
+ )
168
+ return pass_count_text, all_tests_pass_signal
169
+
170
+
171
  def _build_observation_text(observation: Any) -> str:
172
+ last_execution_output = str(getattr(observation, "last_execution_output", "") or "")
173
+ pass_count_text, all_tests_pass_signal = _extract_pass_signal_fields(last_execution_output)
174
+
175
  code_dict = getattr(observation, "code_dict", {}) or {}
176
  sorted_items = sorted(
177
  ((int(line_num), text) for line_num, text in code_dict.items()),
 
185
  f"step_count={observation.step_count}\n"
186
  f"steps_remaining={observation.steps_remaining}\n"
187
  f"syntax_error={observation.syntax_error}\n"
188
+ f"pass_count_summary={pass_count_text}\n"
189
+ f"all_tests_pass_signal={str(all_tests_pass_signal).lower()}\n"
190
  f"localized_context=\n{observation.localized_context}\n\n"
191
+ f"last_execution_output=\n{last_execution_output}\n\n"
192
  f"code_preview=\n{code_preview}"
193
  )
194
 
 
322
  print(action.thought, file=sys.stderr, flush=True)
323
  else:
324
  obs_text = _build_observation_text(result.observation)
325
+ obs_last_output = str(getattr(result.observation, "last_execution_output", "") or "")
326
+ pass_count_text, all_tests_pass_signal = _extract_pass_signal_fields(obs_last_output)
327
+ last_action = action_trajectory[-1] if action_trajectory else "none"
328
  history_messages.append(
329
  {
330
  "role": "user",
331
  "content": (
332
  "Pick the single best next action and return only one valid CodeAction JSON object. "
333
+ "Use localized_context/last_execution_output as evidence, and do not SUBMIT unless all tests explicitly pass. "
334
+ "If all_tests_pass_signal=true, you must choose SUBMIT now and must not choose RUN_TESTS again. "
335
+ "Do not wait for additional test output when all_tests_pass_signal=true. "
336
+ "If last_action was RUN_TESTS and all_tests_pass_signal=false, choose REPLACE_LINES or VIEW_CODE next, not RUN_TESTS again.\n\n"
337
+ f"decision_guard: last_action={last_action}, pass_count_summary={pass_count_text}, all_tests_pass_signal={str(all_tests_pass_signal).lower()}\n\n"
338
  f"action_trajectory={(' -> '.join(action_trajectory) if action_trajectory else 'none')}\n\n"
339
  f"{obs_text}"
340
  ),