# Uploaded by PYAE1994
# feat(phase5): Upload apps/backend/agent/self_debug.py
# Commit 19c8c50 (verified)
"""Phase 5 — Self-Debugging Engine.

Drives the autonomous fix loop:
    generate → execute → test → critique → fix → repeat

Key components:
    1. CodeGenerator — LLM produces code for a task description
    2. CodeExecutor — runs the code in the sandbox
    3. TestRunner — runs inline unit tests or verifiers
    4. CritiqueEngine — reviews code quality + error output
    5. FixEngine — generates targeted patch based on failure + critique
    6. AutonomousLoop — orchestrates the full generate→test→fix cycle
"""
from __future__ import annotations
import json
import logging
import os
import re
import shlex
import time
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
from ..router.smart_router import get_router
from . import tools as agent_tools
logger = logging.getLogger(__name__)
# ─── Data Models ─────────────────────────────────────────────────────────────
@dataclass
class CodeAttempt:
    """Record of a single generate/execute/test iteration.

    ``code`` defaults to an empty string so that an attempt can be created
    before any code exists (for example when code generation itself fails)
    and filled in afterwards.
    """

    attempt_num: int  # attempt index supplied by the caller
    code: str = ""  # source that was executed; empty if none was produced
    language: str = "python"
    filename: str = ""
    execution_output: str = ""
    execution_error: str = ""
    test_passed: bool = False
    critique: str = ""
    fix_notes: str = ""
    duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a compact, JSON-friendly summary of this attempt.

        The code itself is not included, only its line count (0 when no
        code was produced). Long text fields are truncated to keep the
        summary small, and the duration is rounded to two decimals.
        """
        return {
            "attempt": self.attempt_num,
            "language": self.language,
            "filename": self.filename,
            # An empty string has no lines; without the guard it would
            # be reported as one line.
            "code_lines": self.code.count("\n") + 1 if self.code else 0,
            "execution_output": self.execution_output[:500],
            "execution_error": self.execution_error[:500],
            "test_passed": self.test_passed,
            "critique": self.critique[:300],
            "fix_notes": self.fix_notes[:300],
            "duration_secs": round(self.duration_secs, 2),
        }
@dataclass
class DebugResult:
    """Overall outcome of one self-debugging run for a task."""

    task_id: int
    goal: str
    attempts: List[CodeAttempt] = field(default_factory=list)
    final_code: str = ""
    final_filename: str = ""
    final_output: str = ""
    success: bool = False
    total_duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of the run.

        The final code is reported as a line count only (0 when empty),
        the final output is capped at 1000 characters, and every attempt
        is summarised through its own ``to_dict``.
        """
        line_total = 0
        if self.final_code:
            line_total = self.final_code.count("\n") + 1

        attempt_summaries = [entry.to_dict() for entry in self.attempts]

        return dict(
            task_id=self.task_id,
            goal=self.goal,
            total_attempts=len(self.attempts),
            success=self.success,
            final_code_lines=line_total,
            final_filename=self.final_filename,
            final_output=self.final_output[:1000],
            attempts=attempt_summaries,
            total_duration_secs=round(self.total_duration_secs, 2),
        )
# ─── Code Generator ──────────────────────────────────────────────────────────
def _strip_code_fences(text: str) -> str:
    """Remove a wrapping Markdown code fence from *text*.

    Any info string after the opening fence (``python``, ``js``, ``ts``,
    ``python3`` ...) is dropped along with the fence itself, so the tag can
    never leak into the returned code.
    """
    body = text.strip()
    # The info string is only consumed when a newline follows it, so a
    # one-line reply such as ```print(1)``` keeps its code intact.
    body = re.sub(r"^```(?:[\w+#.-]*[ \t]*\r?\n)?", "", body)
    body = re.sub(r"\r?\n?```$", "", body.strip())
    return body.strip()


async def generate_code(
    goal: str,
    language: str = "python",
    previous_error: str = "",
    critique: str = "",
    previous_code: str = "",
    memory_context: str = "",
) -> str:
    """Generate code for the goal, optionally fixing a previous attempt.

    When ``previous_error`` or ``critique`` is non-empty the model is asked
    to repair ``previous_code``; otherwise it is asked to write fresh code,
    with ``memory_context`` supplied as background. Prompt inputs are
    truncated to keep the request small.

    Returns:
        The generated source with any surrounding Markdown fence removed;
        an empty string if the router returned nothing.
    """
    if previous_error or critique:
        system = (
            "You are an expert software engineer. Fix the code based on the error and critique. "
            "Return ONLY the complete fixed code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Previous code:\n{previous_code[:1500] if previous_code else 'none'}\n\n"
            f"Error:\n{previous_error[:800] if previous_error else 'none'}\n\n"
            f"Critique:\n{critique[:600] if critique else 'none'}\n\n"
            f"Write the complete fixed {language} code:"
        )
    else:
        system = (
            f"You are an expert software engineer. Write clean, working {language} code. "
            "Return ONLY the code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Memory context: {memory_context[:400] if memory_context else 'none'}\n\n"
            f"Write complete {language} code that achieves this goal:"
        )

    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        max_tokens=1500,
    )

    # Models often add fences despite the instruction; strip them for any
    # language tag, not just a fixed few.
    return _strip_code_fences(raw or "")
# ─── Test Generator ──────────────────────────────────────────────────────────
async def generate_tests(goal: str, code: str, language: str = "python") -> str:
    """Generate simple unit tests for the code.

    Args:
        goal: What the code is supposed to achieve.
        code: Source under test; only the first 1200 characters are sent.
        language: Language of the code under test, passed to the model so
            the tests match it.

    Returns:
        The test source with any surrounding Markdown fence removed; an
        empty string if the router returned nothing.
    """
    system = (
        "You are a QA engineer. Write minimal unit tests that verify the code achieves the goal. "
        "Use pytest for Python. Return ONLY the test code."
    )
    user = (
        f"Goal: {goal}\n"
        # Previously the language argument was accepted but never used.
        f"Language: {language}\n"
        f"Code to test:\n{code[:1200]}\n\n"
        "Write tests that verify the code works correctly. Keep tests simple and runnable:"
    )

    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.1,
        max_tokens=600,
    )

    code_block = (raw or "").strip()
    # Drop the opening fence together with any info string (``python``,
    # ``py``, ``js`` ...); the tag is only consumed when a newline follows.
    code_block = re.sub(r"^```(?:[\w+#.-]*[ \t]*\r?\n)?", "", code_block)
    code_block = re.sub(r"\r?\n?```$", "", code_block.strip())
    return code_block.strip()
# ─── Critique Engine ─────────────────────────────────────────────────────────
async def critique_code(
    goal: str,
    code: str,
    execution_output: str,
    execution_error: str,
    test_passed: bool,
) -> str:
    """Ask the reviewer model for a short, actionable critique of an attempt.

    The code is capped at 1000 characters and each execution stream at 400
    characters before being placed in the prompt; empty streams are shown
    as ``none``.

    Returns:
        The critique text, stripped of surrounding whitespace, or an empty
        string if the router returned nothing.
    """
    shown_output = execution_output[:400] if execution_output else "none"
    shown_error = execution_error[:400] if execution_error else "none"

    reviewer_role = (
        "You are a senior code reviewer. Give a concise, actionable critique. "
        "Focus on: correctness, error handling, edge cases, missing logic."
    )
    request = "".join(
        [
            f"Goal: {goal}\n\n",
            f"Code:\n{code[:1000]}\n\n",
            f"Execution output: {shown_output}\n",
            f"Execution error: {shown_error}\n",
            f"Tests passed: {test_passed}\n\n",
            "Provide a short critique (2-4 sentences) of what needs to be fixed:",
        ]
    )
    conversation = [
        {"role": "system", "content": reviewer_role},
        {"role": "user", "content": request},
    ]

    reply = await get_router().chat(
        messages=conversation,
        temperature=0.3,
        max_tokens=300,
    )
    if not reply:
        return ""
    return reply.strip()
# ─── Code Executor ────────────────────────────────────────────────────────────
def execute_code(
    workdir: Any,
    code: str,
    filename: str,
    language: str = "python",
    timeout: int = 30,
) -> Tuple[str, str, bool]:
    """Execute code in sandbox. Returns (stdout, stderr, success).

    The code is written to ``filename`` inside ``workdir`` and run with the
    interpreter for ``language``: ``node`` for javascript/js/node, ``bash``
    for bash/sh, and ``python3`` for everything else (including unknown
    languages).

    Returns:
        ``(stdout, stderr, success)`` where ``success`` requires both a
        zero return code and empty stderr.
    """
    interpreters = {
        "javascript": "node",
        "js": "node",
        "node": "node",
        "bash": "bash",
        "sh": "bash",
    }
    interpreter = interpreters.get(language, "python3")

    agent_tools.write_file(workdir, filename, code)
    # Quote the filename so spaces or shell metacharacters cannot split or
    # alter the command; plain names pass through unchanged.
    result = agent_tools.run_shell(
        workdir, f"{interpreter} {shlex.quote(filename)}", timeout=timeout
    )

    # Treat a missing or None stream as empty so .strip() below is safe.
    stdout = result.get("stdout") or ""
    stderr = result.get("stderr") or ""
    returncode = result.get("returncode", -1)
    # NOTE(review): any stderr output (even a warning) counts as failure;
    # kept as-is because callers feed stderr back into the fix loop.
    success = returncode == 0 and not stderr.strip()
    return stdout, stderr, success
def run_tests(workdir: Any, test_code: str, test_filename: str) -> Tuple[str, str, bool]:
    """Run pytest tests. Returns (stdout, stderr, all_passed).

    The test source is written to ``test_filename`` in ``workdir`` and run
    with ``python3 -m pytest`` (stderr is merged into stdout by the
    command).

    Returns:
        ``(output, "", all_passed)``; the second element is always empty
        because both streams are combined into ``output``.
    """
    agent_tools.write_file(workdir, test_filename, test_code)
    result = agent_tools.run_shell(
        workdir,
        f"python3 -m pytest {shlex.quote(test_filename)} -v --tb=short 2>&1",
        timeout=60,
    )
    output = (result.get("stdout") or "") + (result.get("stderr") or "")

    returncode = result.get("returncode")
    if returncode is not None:
        # pytest exits with 0 only when every collected test passed. This
        # avoids misreading verbose output where a *passing* test's name
        # contains "error" or "failed".
        passed = returncode == 0
    else:
        # No exit status available: judge by the final summary line only,
        # never by the per-test lines.
        non_empty = [line for line in output.splitlines() if line.strip()]
        summary = non_empty[-1].lower() if non_empty else ""
        passed = (
            "passed" in summary
            and "failed" not in summary
            and "error" not in summary
        )
    return output, "", passed
# ─── Autonomous Coding Loop ───────────────────────────────────────────────────
class SelfDebuggingEngine:
    """Phase 5 — autonomous generate → execute → test → critique → fix loop.

    Runs up to ``max_attempts`` code generation+fix cycles, stopping when
    an attempt passes (its tests pass, or it executes cleanly when tests are
    not run) or max attempts is reached.
    """

    # File extension for each non-Python language; anything else is treated
    # as Python, mirroring execute_code's python3 fallback.
    _EXTENSIONS: Dict[str, str] = {
        "javascript": ".js",
        "js": ".js",
        "node": ".js",
        "bash": ".sh",
        "sh": ".sh",
    }

    def __init__(
        self,
        task_id: int,
        max_attempts: int = 5,
        language: str = "python",
        run_tests_flag: bool = True,
    ) -> None:
        """Store the loop settings and resolve the task's working directory."""
        self.task_id = task_id
        self.max_attempts = max_attempts
        self.language = language
        self.run_tests_flag = run_tests_flag
        self.workdir = agent_tools.get_workdir(task_id)

    async def debug(
        self,
        goal: str,
        initial_code: str = "",
        memory_context: str = "",
    ) -> DebugResult:
        """Run the self-debugging loop.

        Args:
            goal: Description of what the code must achieve.
            initial_code: Existing code to debug. When given, the first
                attempt executes it unchanged and later attempts repair it.
            memory_context: Background passed to the code generator.

        Returns:
            A ``DebugResult`` holding every attempt. ``success`` is True
            only if some attempt passed; otherwise the ``final_*`` fields
            describe the most recent attempt that produced code.
        """
        result = DebugResult(task_id=self.task_id, goal=goal)
        start_total = time.monotonic()

        extension = self._EXTENSIONS.get(self.language, ".py")
        # run_tests drives pytest, which is only meaningful for Python code.
        tests_enabled = self.run_tests_flag and extension == ".py"

        code = initial_code
        previous_error = ""
        critique = ""

        for attempt_num in range(1, self.max_attempts + 1):
            # code is filled in once generation succeeds; passing it
            # explicitly keeps this valid even if CodeAttempt requires it.
            attempt = CodeAttempt(
                attempt_num=attempt_num, code="", language=self.language
            )
            attempt_start = time.monotonic()
            logger.info(
                "SelfDebug task=%d attempt=%d/%d", self.task_id, attempt_num, self.max_attempts
            )

            # ── Generate / fix code ───────────────────────────────────── #
            # Caller-supplied code is tried as-is first; generation only
            # kicks in when there is nothing to run or a failure to fix.
            if not (attempt_num == 1 and initial_code):
                try:
                    code = await generate_code(
                        goal=goal,
                        language=self.language,
                        previous_error=previous_error,
                        critique=critique,
                        previous_code=code,
                        memory_context=memory_context,
                    )
                except Exception as e:
                    attempt.execution_error = f"Code generation failed: {e}"
                    attempt.duration_secs = time.monotonic() - attempt_start
                    result.attempts.append(attempt)
                    continue

            attempt.code = code
            fname = f"solution_v{attempt_num}_{uuid.uuid4().hex[:4]}{extension}"
            attempt.filename = fname

            # ── Execute ───────────────────────────────────────────────── #
            stdout, stderr, exec_ok = execute_code(
                self.workdir, code, fname, self.language
            )
            attempt.execution_output = stdout
            attempt.execution_error = stderr

            # ── Run tests (if enabled) ────────────────────────────────── #
            # Without a usable test verdict, fall back to the execution one.
            attempt.test_passed = exec_ok
            if tests_enabled:
                # The solution filename is randomised, so tell the test
                # generator which module to import.
                module_name = fname[: -len(extension)]
                test_goal = (
                    f"{goal}\n\n"
                    f"The code under test is saved as '{fname}'; "
                    f"import it as module '{module_name}'."
                )
                try:
                    test_code = await generate_tests(test_goal, code, self.language)
                    test_fname = f"test_solution_v{attempt_num}.py"
                    test_out, _, test_ok = run_tests(self.workdir, test_code, test_fname)
                except Exception as e:
                    logger.warning("Test run error: %s", e)
                else:
                    attempt.test_passed = test_ok
                    if not test_ok:
                        attempt.execution_error += f"\nTest output: {test_out[:400]}"

            # ── Critique ──────────────────────────────────────────────── #
            if not attempt.test_passed:
                # Reset so a failed critique call cannot leave the previous
                # attempt's critique attached to this attempt's code.
                critique = ""
                try:
                    # Pass the combined error (stderr + test output) so the
                    # reviewer sees why the tests failed.
                    critique = await critique_code(
                        goal, code, stdout, attempt.execution_error, attempt.test_passed
                    )
                    attempt.critique = critique
                except Exception as e:
                    logger.warning("Critique failed: %s", e)
                # Never empty here, so the next generate_code call takes
                # its fix path and receives the test output too.
                previous_error = attempt.execution_error.strip() or "tests failed"

            attempt.duration_secs = time.monotonic() - attempt_start
            result.attempts.append(attempt)

            if attempt.test_passed:
                result.success = True
                result.final_code = code
                result.final_filename = fname
                result.final_output = stdout
                break

        result.total_duration_secs = time.monotonic() - start_total

        # If never succeeded, report the most recent attempt that has code
        # (the very last one may be a generation failure with none).
        if not result.success and result.attempts:
            last = next(
                (a for a in reversed(result.attempts) if a.code),
                result.attempts[-1],
            )
            result.final_code = last.code
            result.final_filename = last.filename
            result.final_output = last.execution_output
            # success stays False. The former "partial success" override
            # could only fire after a non-zero exit or timeout with empty
            # stderr, i.e. it reported failed runs as successful.

        return result