"""Phase 5 — Self-Debugging Engine. Drives the autonomous fix loop: generate → execute → test → critique → fix → repeat Key components: 1. CodeGenerator — LLM produces code for a task description 2. CodeExecutor — runs the code in the sandbox 3. TestRunner — runs inline unit tests or verifiers 4. CritiqueEngine — reviews code quality + error output 5. FixEngine — generates targeted patch based on failure + critique 6. AutonomousLoop — orchestrates the full generate→test→fix cycle """ from __future__ import annotations import json import logging import os import re import time import uuid from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple from ..router.smart_router import get_router from . import tools as agent_tools logger = logging.getLogger(__name__) # ─── Data Models ───────────────────────────────────────────────────────────── @dataclass class CodeAttempt: attempt_num: int code: str language: str = "python" filename: str = "" execution_output: str = "" execution_error: str = "" test_passed: bool = False critique: str = "" fix_notes: str = "" duration_secs: float = 0.0 def to_dict(self) -> Dict[str, Any]: return { "attempt": self.attempt_num, "language": self.language, "filename": self.filename, "code_lines": self.code.count("\n") + 1, "execution_output": self.execution_output[:500], "execution_error": self.execution_error[:500], "test_passed": self.test_passed, "critique": self.critique[:300], "fix_notes": self.fix_notes[:300], "duration_secs": round(self.duration_secs, 2), } @dataclass class DebugResult: task_id: int goal: str attempts: List[CodeAttempt] = field(default_factory=list) final_code: str = "" final_filename: str = "" final_output: str = "" success: bool = False total_duration_secs: float = 0.0 def to_dict(self) -> Dict[str, Any]: return { "task_id": self.task_id, "goal": self.goal, "total_attempts": len(self.attempts), "success": self.success, "final_code_lines": self.final_code.count("\n") + 1 if self.final_code else 0, "final_filename": self.final_filename, "final_output": self.final_output[:1000], "attempts": [a.to_dict() for a in self.attempts], "total_duration_secs": round(self.total_duration_secs, 2), } # ─── Code Generator ────────────────────────────────────────────────────────── async def generate_code( goal: str, language: str = "python", previous_error: str = "", critique: str = "", previous_code: str = "", memory_context: str = "", ) -> str: """Generate code for the goal, optionally fixing previous attempt.""" if previous_error or critique: system = ( "You are an expert software engineer. Fix the code based on the error and critique. " "Return ONLY the complete fixed code, no markdown fences, no explanations." ) user = ( f"Goal: {goal}\n" f"Previous code:\n{previous_code[:1500] if previous_code else 'none'}\n\n" f"Error:\n{previous_error[:800] if previous_error else 'none'}\n\n" f"Critique:\n{critique[:600] if critique else 'none'}\n\n" f"Write the complete fixed {language} code:" ) else: system = ( f"You are an expert software engineer. Write clean, working {language} code. " "Return ONLY the code, no markdown fences, no explanations." ) user = ( f"Goal: {goal}\n" f"Memory context: {memory_context[:400] if memory_context else 'none'}\n\n" f"Write complete {language} code that achieves this goal:" ) router = get_router() raw = await router.chat( messages=[ {"role": "system", "content": system}, {"role": "user", "content": user}, ], temperature=0.2, max_tokens=1500, ) # Strip markdown fences if present code = raw or "" code = re.sub(r"^```(?:python|javascript|bash|sh)?\n?", "", code.strip()) code = re.sub(r"\n?```$", "", code.strip()) return code.strip() # ─── Test Generator ────────────────────────────────────────────────────────── async def generate_tests(goal: str, code: str, language: str = "python") -> str: """Generate simple unit tests for the code.""" system = ( "You are a QA engineer. Write minimal unit tests that verify the code achieves the goal. " "Use pytest for Python. Return ONLY the test code." ) user = ( f"Goal: {goal}\n" f"Code to test:\n{code[:1200]}\n\n" "Write tests that verify the code works correctly. Keep tests simple and runnable:" ) router = get_router() raw = await router.chat( messages=[ {"role": "system", "content": system}, {"role": "user", "content": user}, ], temperature=0.1, max_tokens=600, ) code_block = raw or "" code_block = re.sub(r"^```(?:python)?\n?", "", code_block.strip()) code_block = re.sub(r"\n?```$", "", code_block.strip()) return code_block.strip() # ─── Critique Engine ───────────────────────────────────────────────────────── async def critique_code( goal: str, code: str, execution_output: str, execution_error: str, test_passed: bool, ) -> str: """Analyse code quality and execution outcome. Returns actionable critique.""" system = ( "You are a senior code reviewer. Give a concise, actionable critique. " "Focus on: correctness, error handling, edge cases, missing logic." ) user = ( f"Goal: {goal}\n\n" f"Code:\n{code[:1000]}\n\n" f"Execution output: {execution_output[:400] if execution_output else 'none'}\n" f"Execution error: {execution_error[:400] if execution_error else 'none'}\n" f"Tests passed: {test_passed}\n\n" "Provide a short critique (2-4 sentences) of what needs to be fixed:" ) router = get_router() critique = await router.chat( messages=[ {"role": "system", "content": system}, {"role": "user", "content": user}, ], temperature=0.3, max_tokens=300, ) return (critique or "").strip() # ─── Code Executor ──────────────────────────────────────────────────────────── def execute_code( workdir: Any, code: str, filename: str, language: str = "python", timeout: int = 30, ) -> Tuple[str, str, bool]: """Execute code in sandbox. Returns (stdout, stderr, success).""" agent_tools.write_file(workdir, filename, code) if language == "python": result = agent_tools.run_shell(workdir, f"python3 {filename}", timeout=timeout) elif language in ("javascript", "js", "node"): result = agent_tools.run_shell(workdir, f"node {filename}", timeout=timeout) elif language in ("bash", "sh"): result = agent_tools.run_shell(workdir, f"bash {filename}", timeout=timeout) else: result = agent_tools.run_shell(workdir, f"python3 {filename}", timeout=timeout) stdout = result.get("stdout", "") stderr = result.get("stderr", "") returncode = result.get("returncode", -1) success = returncode == 0 and not stderr.strip() return stdout, stderr, success def run_tests(workdir: Any, test_code: str, test_filename: str) -> Tuple[str, str, bool]: """Run pytest tests. Returns (stdout, stderr, all_passed).""" agent_tools.write_file(workdir, test_filename, test_code) result = agent_tools.run_shell(workdir, f"python3 -m pytest {test_filename} -v --tb=short 2>&1", timeout=60) output = result.get("stdout", "") + result.get("stderr", "") passed = "passed" in output and "failed" not in output and "error" not in output.lower() return output, "", passed # ─── Autonomous Coding Loop ─────────────────────────────────────────────────── class SelfDebuggingEngine: """Phase 5 — autonomous generate → execute → test → critique → fix loop. Runs up to ``max_attempts`` code generation+fix cycles, stopping when all tests pass or max attempts is reached. """ def __init__( self, task_id: int, max_attempts: int = 5, language: str = "python", run_tests_flag: bool = True, ) -> None: self.task_id = task_id self.max_attempts = max_attempts self.language = language self.run_tests_flag = run_tests_flag self.workdir = agent_tools.get_workdir(task_id) async def debug( self, goal: str, initial_code: str = "", memory_context: str = "", ) -> DebugResult: """Run the self-debugging loop.""" result = DebugResult(task_id=self.task_id, goal=goal) start_total = time.monotonic() code = initial_code previous_error = "" critique = "" test_code = "" # Generate tests once at start if self.run_tests_flag and not initial_code: try: test_code = await generate_tests(goal, "# placeholder", self.language) except Exception as e: logger.warning("Test generation failed: %s", e) for attempt_num in range(1, self.max_attempts + 1): attempt = CodeAttempt(attempt_num=attempt_num, language=self.language) attempt_start = time.monotonic() logger.info( "SelfDebug task=%d attempt=%d/%d", self.task_id, attempt_num, self.max_attempts ) # ── Generate / fix code ───────────────────────────────────────── # try: code = await generate_code( goal=goal, language=self.language, previous_error=previous_error, critique=critique, previous_code=code, memory_context=memory_context, ) except Exception as e: attempt.execution_error = f"Code generation failed: {e}" result.attempts.append(attempt) continue attempt.code = code fname = f"solution_v{attempt_num}_{uuid.uuid4().hex[:4]}.py" attempt.filename = fname # ── Execute ───────────────────────────────────────────────────── # stdout, stderr, exec_ok = execute_code( self.workdir, code, fname, self.language ) attempt.execution_output = stdout attempt.execution_error = stderr # ── Run tests (if available) ──────────────────────────────────── # if test_code and self.run_tests_flag: # Regenerate tests with actual code try: test_code = await generate_tests(goal, code, self.language) test_fname = f"test_solution_v{attempt_num}.py" test_out, _, test_ok = run_tests(self.workdir, test_code, test_fname) attempt.test_passed = test_ok if not test_ok: attempt.execution_error += f"\nTest output: {test_out[:400]}" except Exception as e: attempt.test_passed = exec_ok logger.warning("Test run error: %s", e) else: attempt.test_passed = exec_ok # ── Critique ──────────────────────────────────────────────────── # if not attempt.test_passed: try: critique = await critique_code( goal, code, stdout, stderr, attempt.test_passed ) attempt.critique = critique except Exception as e: logger.warning("Critique failed: %s", e) previous_error = stderr or ("tests failed" if not attempt.test_passed else "") attempt.duration_secs = time.monotonic() - attempt_start result.attempts.append(attempt) if attempt.test_passed: result.success = True result.final_code = code result.final_filename = fname result.final_output = stdout break result.total_duration_secs = time.monotonic() - start_total # If never succeeded, use last attempt if not result.success and result.attempts: last = result.attempts[-1] result.final_code = last.code result.final_filename = last.filename result.final_output = last.execution_output # Partial success if code ran without error result.success = not last.execution_error and bool(last.execution_output) return result