Spaces:
Sleeping
Sleeping
databoysu committed on
Commit ·
26f55d2
1
Parent(s): b65a477
improving thought and reasoning
Browse files
- inference.py +36 -2
inference.py
CHANGED
|
@@ -19,6 +19,7 @@ import argparse
|
|
| 19 |
import asyncio
|
| 20 |
import json
|
| 21 |
import os
|
|
|
|
| 22 |
import sys
|
| 23 |
from pathlib import Path
|
| 24 |
from typing import Any, Optional
|
|
@@ -78,6 +79,13 @@ How to read last_execution_output correctly:
|
|
| 78 |
- Treat syntax errors as highest priority and fix them before semantic issues.
|
| 79 |
- Never claim success unless output clearly indicates complete pass status.
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
Action policy:
|
| 82 |
- VIEW_CODE when line mapping or surrounding context is insufficient.
|
| 83 |
- RUN_TESTS to collect fresh evidence after edits or when uncertain.
|
|
@@ -85,9 +93,12 @@ Action policy:
|
|
| 85 |
- UNDO_EDIT if latest change worsened results or introduced new failures.
|
| 86 |
- RESET_TO_ORIGINAL only as last-resort recovery.
|
| 87 |
- SUBMIT only when last_execution_output explicitly and unambiguously indicates all tests passed.
|
|
|
|
|
|
|
| 88 |
|
| 89 |
Submit gate (hard rule):
|
| 90 |
- If any failure, error, traceback, xfailed/unfinished signal, or uncertainty remains, do not SUBMIT.
|
|
|
|
| 91 |
|
| 92 |
Self-check before finalizing response:
|
| 93 |
- Is this valid JSON?
|
|
@@ -146,7 +157,21 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
|
|
| 146 |
)
|
| 147 |
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
def _build_observation_text(observation: Any) -> str:
|
|
|
|
|
|
|
|
|
|
| 150 |
code_dict = getattr(observation, "code_dict", {}) or {}
|
| 151 |
sorted_items = sorted(
|
| 152 |
((int(line_num), text) for line_num, text in code_dict.items()),
|
|
@@ -160,8 +185,10 @@ def _build_observation_text(observation: Any) -> str:
|
|
| 160 |
f"step_count={observation.step_count}\n"
|
| 161 |
f"steps_remaining={observation.steps_remaining}\n"
|
| 162 |
f"syntax_error={observation.syntax_error}\n"
|
|
|
|
|
|
|
| 163 |
f"localized_context=\n{observation.localized_context}\n\n"
|
| 164 |
-
f"last_execution_output=\n{
|
| 165 |
f"code_preview=\n{code_preview}"
|
| 166 |
)
|
| 167 |
|
|
@@ -295,12 +322,19 @@ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> N
|
|
| 295 |
print(action.thought, file=sys.stderr, flush=True)
|
| 296 |
else:
|
| 297 |
obs_text = _build_observation_text(result.observation)
|
|
|
|
|
|
|
|
|
|
| 298 |
history_messages.append(
|
| 299 |
{
|
| 300 |
"role": "user",
|
| 301 |
"content": (
|
| 302 |
"Pick the single best next action and return only one valid CodeAction JSON object. "
|
| 303 |
-
"Use localized_context/last_execution_output as evidence, and do not SUBMIT unless all tests explicitly pass.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
f"action_trajectory={(' -> '.join(action_trajectory) if action_trajectory else 'none')}\n\n"
|
| 305 |
f"{obs_text}"
|
| 306 |
),
|
|
|
|
| 19 |
import asyncio
|
| 20 |
import json
|
| 21 |
import os
|
| 22 |
+
import re
|
| 23 |
import sys
|
| 24 |
from pathlib import Path
|
| 25 |
from typing import Any, Optional
|
|
|
|
| 79 |
- Treat syntax errors as highest priority and fix them before semantic issues.
|
| 80 |
- Never claim success unless output clearly indicates complete pass status.
|
| 81 |
|
| 82 |
+
Terminal decision rule (no waiting):
|
| 83 |
+
- If last_execution_output contains both a full pass count pattern (for example, "Tests Passed: N/N")
|
| 84 |
+
and the success marker "SUCCESS: ALL TESTS PASSED", the next action must be SUBMIT.
|
| 85 |
+
- If all_tests_pass_signal=true in the observation, the next action must be SUBMIT.
|
| 86 |
+
- Once this pass signal is present, RUN_TESTS is no longer a valid next action.
|
| 87 |
+
- Do not wait for extra confirmation, additional logs, or another RUN_TESTS cycle after this signal.
|
| 88 |
+
|
| 89 |
Action policy:
|
| 90 |
- VIEW_CODE when line mapping or surrounding context is insufficient.
|
| 91 |
- RUN_TESTS to collect fresh evidence after edits or when uncertain.
|
|
|
|
| 93 |
- UNDO_EDIT if latest change worsened results or introduced new failures.
|
| 94 |
- RESET_TO_ORIGINAL only as last-resort recovery.
|
| 95 |
- SUBMIT only when last_execution_output explicitly and unambiguously indicates all tests passed.
|
| 96 |
+
- After RUN_TESTS, do not choose RUN_TESTS again immediately unless test evidence is genuinely missing.
|
| 97 |
+
- Treat "no output" as invalid reasoning when pass_count_summary or traceback text is present.
|
| 98 |
|
| 99 |
Submit gate (hard rule):
|
| 100 |
- If any failure, error, traceback, xfailed/unfinished signal, or uncertainty remains, do not SUBMIT.
|
| 101 |
+
- If all-tests-passed signal is present, do SUBMIT immediately on this turn.
|
| 102 |
|
| 103 |
Self-check before finalizing response:
|
| 104 |
- Is this valid JSON?
|
|
|
|
| 157 |
)
|
| 158 |
|
| 159 |
|
| 160 |
+
def _extract_pass_signal_fields(last_execution_output: str) -> tuple[str, bool]:
|
| 161 |
+
pass_count_match = re.search(r"Tests Passed:\s*(\d+)\s*/\s*(\d+)", last_execution_output)
|
| 162 |
+
pass_count_text = pass_count_match.group(0) if pass_count_match else "unknown"
|
| 163 |
+
all_tests_pass_signal = (
|
| 164 |
+
("SUCCESS: ALL TESTS PASSED" in last_execution_output)
|
| 165 |
+
and bool(pass_count_match)
|
| 166 |
+
and (pass_count_match.group(1) == pass_count_match.group(2))
|
| 167 |
+
)
|
| 168 |
+
return pass_count_text, all_tests_pass_signal
|
| 169 |
+
|
| 170 |
+
|
| 171 |
def _build_observation_text(observation: Any) -> str:
|
| 172 |
+
last_execution_output = str(getattr(observation, "last_execution_output", "") or "")
|
| 173 |
+
pass_count_text, all_tests_pass_signal = _extract_pass_signal_fields(last_execution_output)
|
| 174 |
+
|
| 175 |
code_dict = getattr(observation, "code_dict", {}) or {}
|
| 176 |
sorted_items = sorted(
|
| 177 |
((int(line_num), text) for line_num, text in code_dict.items()),
|
|
|
|
| 185 |
f"step_count={observation.step_count}\n"
|
| 186 |
f"steps_remaining={observation.steps_remaining}\n"
|
| 187 |
f"syntax_error={observation.syntax_error}\n"
|
| 188 |
+
f"pass_count_summary={pass_count_text}\n"
|
| 189 |
+
f"all_tests_pass_signal={str(all_tests_pass_signal).lower()}\n"
|
| 190 |
f"localized_context=\n{observation.localized_context}\n\n"
|
| 191 |
+
f"last_execution_output=\n{last_execution_output}\n\n"
|
| 192 |
f"code_preview=\n{code_preview}"
|
| 193 |
)
|
| 194 |
|
|
|
|
| 322 |
print(action.thought, file=sys.stderr, flush=True)
|
| 323 |
else:
|
| 324 |
obs_text = _build_observation_text(result.observation)
|
| 325 |
+
obs_last_output = str(getattr(result.observation, "last_execution_output", "") or "")
|
| 326 |
+
pass_count_text, all_tests_pass_signal = _extract_pass_signal_fields(obs_last_output)
|
| 327 |
+
last_action = action_trajectory[-1] if action_trajectory else "none"
|
| 328 |
history_messages.append(
|
| 329 |
{
|
| 330 |
"role": "user",
|
| 331 |
"content": (
|
| 332 |
"Pick the single best next action and return only one valid CodeAction JSON object. "
|
| 333 |
+
"Use localized_context/last_execution_output as evidence, and do not SUBMIT unless all tests explicitly pass. "
|
| 334 |
+
"If all_tests_pass_signal=true, you must choose SUBMIT now and must not choose RUN_TESTS again. "
|
| 335 |
+
"Do not wait for additional test output when all_tests_pass_signal=true. "
|
| 336 |
+
"If last_action was RUN_TESTS and all_tests_pass_signal=false, choose REPLACE_LINES or VIEW_CODE next, not RUN_TESTS again.\n\n"
|
| 337 |
+
f"decision_guard: last_action={last_action}, pass_count_summary={pass_count_text}, all_tests_pass_signal={str(all_tests_pass_signal).lower()}\n\n"
|
| 338 |
f"action_trajectory={(' -> '.join(action_trajectory) if action_trajectory else 'none')}\n\n"
|
| 339 |
f"{obs_text}"
|
| 340 |
),
|