"""Phase 5 — Self-Debugging Engine.

Drives the autonomous fix loop:

    generate → execute → test → critique → fix → repeat

Key components:
  1. CodeGenerator   — LLM produces code for a task description
  2. CodeExecutor    — runs the code in the sandbox
  3. TestRunner      — runs inline unit tests or verifiers
  4. CritiqueEngine  — reviews code quality + error output
  5. FixEngine       — generates targeted patch based on failure + critique
  6. AutonomousLoop  — orchestrates the full generate→test→fix cycle
"""
|
|
from __future__ import annotations

import json
import logging
import os
import re
import shlex
import time
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from ..router.smart_router import get_router
from . import tools as agent_tools
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
| |
|
|
@dataclass
class CodeAttempt:
    """Record of a single generate/execute/test cycle.

    ``code`` defaults to an empty string so that a record can be created
    before any code exists (``SelfDebuggingEngine.debug`` constructs the
    attempt first and fills the fields in as the cycle progresses).
    """

    attempt_num: int            # 1-based index of the attempt within a run
    code: str = ""              # source produced for this attempt
    language: str = "python"
    filename: str = ""          # file the code was written to
    execution_output: str = ""  # captured stdout
    execution_error: str = ""   # captured stderr plus any appended test output
    test_passed: bool = False
    critique: str = ""
    fix_notes: str = ""
    duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a compact, size-bounded summary of the attempt.

        The code itself is not included, only its line count; long text
        fields are truncated (500 chars for output/error, 300 for
        critique/fix notes) and the duration is rounded to two decimals.
        """
        # An empty attempt has zero lines, matching how
        # ``DebugResult.to_dict`` reports ``final_code_lines``.
        code_lines = self.code.count("\n") + 1 if self.code else 0
        return {
            "attempt": self.attempt_num,
            "language": self.language,
            "filename": self.filename,
            "code_lines": code_lines,
            "execution_output": self.execution_output[:500],
            "execution_error": self.execution_error[:500],
            "test_passed": self.test_passed,
            "critique": self.critique[:300],
            "fix_notes": self.fix_notes[:300],
            "duration_secs": round(self.duration_secs, 2),
        }
|
|
|
|
@dataclass
class DebugResult:
    """Outcome of one self-debugging run: every attempt plus the final state."""

    task_id: int
    goal: str
    attempts: List[CodeAttempt] = field(default_factory=list)
    final_code: str = ""
    final_filename: str = ""
    final_output: str = ""
    success: bool = False
    total_duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Summarise the run as a plain dictionary.

        The final code is reported only as a line count (0 when empty),
        the final output is capped at 1000 characters, each attempt is
        serialised through its own ``to_dict`` and the total duration is
        rounded to two decimals.
        """
        line_total = 0
        if self.final_code:
            line_total = self.final_code.count("\n") + 1

        attempt_dicts = [entry.to_dict() for entry in self.attempts]

        summary: Dict[str, Any] = {}
        summary["task_id"] = self.task_id
        summary["goal"] = self.goal
        summary["total_attempts"] = len(self.attempts)
        summary["success"] = self.success
        summary["final_code_lines"] = line_total
        summary["final_filename"] = self.final_filename
        summary["final_output"] = self.final_output[:1000]
        summary["attempts"] = attempt_dicts
        summary["total_duration_secs"] = round(self.total_duration_secs, 2)
        return summary
|
|
|
|
| |
|
|
async def generate_code(
    goal: str,
    language: str = "python",
    previous_error: str = "",
    critique: str = "",
    previous_code: str = "",
    memory_context: str = "",
) -> str:
    """Ask the LLM for code that achieves ``goal``.

    When ``previous_error`` or ``critique`` is non-empty the model is asked
    to repair ``previous_code`` (fix mode); otherwise it writes fresh code
    and ``memory_context`` is included in the prompt. Prompt inputs are
    truncated (code 1500, error 800, critique 600, memory 400 characters).

    Returns:
        The model reply with a surrounding Markdown code fence removed and
        whitespace stripped; an empty string if the reply was empty.
    """
    if previous_error or critique:
        system = (
            "You are an expert software engineer. Fix the code based on the error and critique. "
            "Return ONLY the complete fixed code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Previous code:\n{previous_code[:1500] if previous_code else 'none'}\n\n"
            f"Error:\n{previous_error[:800] if previous_error else 'none'}\n\n"
            f"Critique:\n{critique[:600] if critique else 'none'}\n\n"
            f"Write the complete fixed {language} code:"
        )
    else:
        system = (
            f"You are an expert software engineer. Write clean, working {language} code. "
            "Return ONLY the code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Memory context: {memory_context[:400] if memory_context else 'none'}\n\n"
            f"Write complete {language} code that achieves this goal:"
        )

    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        max_tokens=1500,
    )

    code = (raw or "").strip()
    # Drop an opening fence. The first alternative accepts ANY info string
    # that is followed by a newline (```js, ```py, ```shell, ```typescript…),
    # so the tag never ends up as the first line of the code. The second
    # alternative keeps the legacy handling of a known tag that is not
    # followed by a newline.
    code = re.sub(
        r"^```(?:[\w+#.-]*[ \t\r]*\n|(?:python|javascript|bash|sh)?)",
        "",
        code,
    )
    # Drop a closing fence at the very end of the reply.
    code = re.sub(r"\n?```$", "", code.strip())
    return code.strip()
|
|
|
|
| |
|
|
async def generate_tests(goal: str, code: str, language: str = "python") -> str:
    """Ask the LLM for minimal unit tests covering ``code``.

    Only the first 1200 characters of ``code`` are shown to the model.
    ``language`` is accepted for symmetry with ``generate_code`` but is not
    currently used in the prompt, which asks for pytest-style tests.

    Returns:
        The model reply with a surrounding Markdown code fence removed and
        whitespace stripped; an empty string if the reply was empty.
    """
    system = (
        "You are a QA engineer. Write minimal unit tests that verify the code achieves the goal. "
        "Use pytest for Python. Return ONLY the test code."
    )
    user = (
        f"Goal: {goal}\n"
        f"Code to test:\n{code[:1200]}\n\n"
        "Write tests that verify the code works correctly. Keep tests simple and runnable:"
    )
    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.1,
        max_tokens=600,
    )

    code_block = (raw or "").strip()
    # Drop an opening fence. Any info string followed by a newline is
    # accepted (```py, ```python3, ```pytest…), so the tag is never left
    # behind as a bogus first line; the second alternative keeps the legacy
    # handling of "```python" without a following newline.
    code_block = re.sub(
        r"^```(?:[\w+#.-]*[ \t\r]*\n|(?:python)?)",
        "",
        code_block,
    )
    # Drop a closing fence at the very end of the reply.
    code_block = re.sub(r"\n?```$", "", code_block.strip())
    return code_block.strip()
|
|
|
|
| |
|
|
async def critique_code(
    goal: str,
    code: str,
    execution_output: str,
    execution_error: str,
    test_passed: bool,
) -> str:
    """Request a brief reviewer-style critique of one attempt from the LLM.

    The prompt carries the goal, the first 1000 characters of the code, up
    to 400 characters each of the execution output and error (``none`` when
    empty) and the test verdict.

    Returns:
        The reply stripped of surrounding whitespace, or an empty string
        when the router returns nothing.
    """
    shown_output = execution_output[:400] if execution_output else "none"
    shown_error = execution_error[:400] if execution_error else "none"

    reviewer_role = (
        "You are a senior code reviewer. Give a concise, actionable critique. "
        "Focus on: correctness, error handling, edge cases, missing logic."
    )
    request = (
        f"Goal: {goal}\n\n"
        f"Code:\n{code[:1000]}\n\n"
        f"Execution output: {shown_output}\n"
        f"Execution error: {shown_error}\n"
        f"Tests passed: {test_passed}\n\n"
        "Provide a short critique (2-4 sentences) of what needs to be fixed:"
    )
    conversation = [
        {"role": "system", "content": reviewer_role},
        {"role": "user", "content": request},
    ]

    reply = await get_router().chat(
        messages=conversation,
        temperature=0.3,
        max_tokens=300,
    )
    if not reply:
        return ""
    return reply.strip()
|
|
|
|
| |
|
|
def execute_code(
    workdir: Any,
    code: str,
    filename: str,
    language: str = "python",
    timeout: int = 30,
) -> Tuple[str, str, bool]:
    """Write ``code`` to ``filename`` in ``workdir`` and run it.

    The interpreter is chosen from ``language``: ``node`` for
    javascript/js/node, ``bash`` for bash/sh, and ``python3`` for python
    and for any unrecognised language.

    Returns:
        ``(stdout, stderr, success)`` where ``success`` is true only when
        the return code is 0 and stderr is empty after stripping.
    """
    agent_tools.write_file(workdir, filename, code)

    interpreters = {
        "python": "python3",
        "javascript": "node",
        "js": "node",
        "node": "node",
        "bash": "bash",
        "sh": "bash",
    }
    interpreter = interpreters.get(language, "python3")

    # The command is handed to a shell, so quote the filename; names made of
    # safe characters are returned unchanged by shlex.quote.
    command = f"{interpreter} {shlex.quote(filename)}"
    result = agent_tools.run_shell(workdir, command, timeout=timeout)

    # Tolerate keys that are present but None so .strip() cannot fail.
    stdout = result.get("stdout") or ""
    stderr = result.get("stderr") or ""
    returncode = result.get("returncode", -1)
    success = returncode == 0 and not stderr.strip()
    return stdout, stderr, success
|
|
|
|
def run_tests(workdir: Any, test_code: str, test_filename: str) -> Tuple[str, str, bool]:
    """Write ``test_code`` to ``test_filename`` and run it with pytest.

    stderr is redirected into stdout by the command, so the combined text
    is returned as the first element and the second is always ``""``.

    Returns:
        ``(output, "", all_passed)``. ``all_passed`` comes from the process
        return code when the shell result provides one (pytest exits with 0
        only when every collected test passed). If no return code is
        available, the textual heuristic on the output is used instead.
    """
    agent_tools.write_file(workdir, test_filename, test_code)
    result = agent_tools.run_shell(
        workdir,
        f"python3 -m pytest {shlex.quote(test_filename)} -v --tb=short 2>&1",
        timeout=60,
    )
    output = (result.get("stdout") or "") + (result.get("stderr") or "")

    returncode = result.get("returncode")
    if returncode is not None:
        # Exit status is authoritative; substring matching misfires on test
        # names such as ``test_error_handling`` printed by ``-v``.
        passed = returncode == 0
    else:
        passed = (
            "passed" in output
            and "failed" not in output
            and "error" not in output.lower()
        )
    return output, "", passed
|
|
|
|
| |
|
|
class SelfDebuggingEngine:
    """Phase 5 — autonomous generate → execute → test → critique → fix loop.

    Runs up to ``max_attempts`` code generation+fix cycles, stopping when
    all tests pass or max attempts is reached.
    """

    def __init__(
        self,
        task_id: int,
        max_attempts: int = 5,
        language: str = "python",
        run_tests_flag: bool = True,
    ) -> None:
        """Store the run configuration and resolve the task's work directory.

        Args:
            task_id: Identifier passed to ``agent_tools.get_workdir`` and
                recorded on the result.
            max_attempts: Upper bound on generate/fix cycles.
            language: Language passed to code generation and execution.
            run_tests_flag: When true, generated tests decide whether an
                attempt passed; otherwise the execution result does.
        """
        self.task_id = task_id
        self.max_attempts = max_attempts
        self.language = language
        self.run_tests_flag = run_tests_flag
        self.workdir = agent_tools.get_workdir(task_id)

    async def debug(
        self,
        goal: str,
        initial_code: str = "",
        memory_context: str = "",
    ) -> DebugResult:
        """Run the self-debugging loop.

        Args:
            goal: Task description handed to generation, tests and critique.
            initial_code: Optional starting code. When given it is executed
                as the first attempt; later attempts ask the model to fix it.
            memory_context: Extra context included in the fresh-generation
                prompt.

        Returns:
            A ``DebugResult`` holding every attempt. ``success`` is true when
            an attempt passed; otherwise the last attempt's code/output are
            reported and ``success`` reflects the lenient fallback below.
        """
        result = DebugResult(task_id=self.task_id, goal=goal)
        start_total = time.monotonic()

        code = initial_code
        previous_error = ""
        critique = ""

        for attempt_num in range(1, self.max_attempts + 1):
            # ``code`` is passed explicitly so the record can be created
            # before any code for this attempt exists.
            attempt = CodeAttempt(
                attempt_num=attempt_num, code="", language=self.language
            )
            attempt_start = time.monotonic()

            logger.info(
                "SelfDebug task=%d attempt=%d/%d", self.task_id, attempt_num, self.max_attempts
            )

            # --- Generate (or reuse caller-supplied code on attempt 1) -----
            if not (attempt_num == 1 and initial_code):
                try:
                    code = await generate_code(
                        goal=goal,
                        language=self.language,
                        previous_error=previous_error,
                        critique=critique,
                        previous_code=code,
                        memory_context=memory_context,
                    )
                except Exception as e:
                    attempt.execution_error = f"Code generation failed: {e}"
                    attempt.duration_secs = time.monotonic() - attempt_start
                    result.attempts.append(attempt)
                    continue

            attempt.code = code
            # NOTE(review): the file always gets a ``.py`` suffix, whatever
            # ``self.language`` is — confirm this is intended.
            fname = f"solution_v{attempt_num}_{uuid.uuid4().hex[:4]}.py"
            attempt.filename = fname

            # --- Execute ---------------------------------------------------
            stdout, stderr, exec_ok = execute_code(
                self.workdir, code, fname, self.language
            )
            attempt.execution_output = stdout
            attempt.execution_error = stderr

            # --- Test ------------------------------------------------------
            # Tests are generated from the actual code of each attempt. If
            # testing is disabled, yields no tests, or errors out, the
            # execution result decides.
            attempt.test_passed = exec_ok
            if self.run_tests_flag:
                try:
                    test_code = await generate_tests(goal, code, self.language)
                    if test_code:
                        test_fname = f"test_solution_v{attempt_num}.py"
                        test_out, _, test_ok = run_tests(
                            self.workdir, test_code, test_fname
                        )
                        attempt.test_passed = test_ok
                        if not test_ok:
                            attempt.execution_error += f"\nTest output: {test_out[:400]}"
                except Exception as e:
                    attempt.test_passed = exec_ok
                    logger.warning("Test run error: %s", e)

            # --- Critique --------------------------------------------------
            if not attempt.test_passed:
                try:
                    critique = await critique_code(
                        goal, code, stdout, stderr, attempt.test_passed
                    )
                    attempt.critique = critique
                except Exception as e:
                    # Do not carry a critique of older code into the next fix.
                    critique = ""
                    logger.warning("Critique failed: %s", e)

            # Feed the next fix prompt the full error text, including any
            # failing test output appended above, not just stderr.
            previous_error = attempt.execution_error.strip() or (
                "tests failed" if not attempt.test_passed else ""
            )

            attempt.duration_secs = time.monotonic() - attempt_start
            result.attempts.append(attempt)

            if attempt.test_passed:
                result.success = True
                result.final_code = code
                result.final_filename = fname
                result.final_output = stdout
                break

        result.total_duration_secs = time.monotonic() - start_total

        # No attempt passed: report the last attempt. It still counts as a
        # success when it produced output and recorded no error text.
        if not result.success and result.attempts:
            last = result.attempts[-1]
            result.final_code = last.code
            result.final_filename = last.filename
            result.final_output = last.execution_output
            result.success = not last.execution_error and bool(last.execution_output)

        return result
|
|