Spaces:

PYAE1994
/

openhands-genspark-agent

Running

App Files Files Community

openhands-genspark-agent / apps /backend /agent /self_debug.py

PYAE1994

feat(phase5): Upload apps/backend/agent/self_debug.py

19c8c50 verified 15 days ago

raw

history blame contribute delete

14.2 kB

	"""Phase 5 — Self-Debugging Engine.

	Drives the autonomous fix loop:

	generate → execute → test → critique → fix → repeat

	Key components:
	1. CodeGenerator — LLM produces code for a task description
	2. CodeExecutor — runs the code in the sandbox
	3. TestRunner — runs inline unit tests or verifiers
	4. CritiqueEngine — reviews code quality + error output
	5. FixEngine — generates targeted patch based on failure + critique
	6. AutonomousLoop — orchestrates the full generate→test→fix cycle
	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import re
	import time
	import uuid
	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Optional, Tuple

	from ..router.smart_router import get_router
	from . import tools as agent_tools

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	# ─── Data Models ─────────────────────────────────────────────────────────────

	@dataclass
	class CodeAttempt:
	    """Record of a single generate/execute/test iteration.

	    ``code`` defaults to an empty string so an attempt can be created before
	    any code exists (``CodeAttempt(attempt_num=1, language="python")``) and
	    still be recorded when code generation itself fails.
	    """

	    attempt_num: int
	    code: str = ""
	    language: str = "python"
	    filename: str = ""
	    execution_output: str = ""
	    execution_error: str = ""
	    test_passed: bool = False
	    critique: str = ""
	    fix_notes: str = ""
	    duration_secs: float = 0.0

	    def to_dict(self) -> Dict[str, Any]:
	        """Return a compact, serialisable summary of this attempt.

	        The code itself is not included, only its line count. Output and
	        error text are cut to 500 characters, critique and fix notes to 300,
	        and the duration is rounded to two decimals.
	        """
	        # An attempt without code has zero lines (matches DebugResult.to_dict).
	        code_lines = self.code.count("\n") + 1 if self.code else 0
	        return {
	            "attempt": self.attempt_num,
	            "language": self.language,
	            "filename": self.filename,
	            "code_lines": code_lines,
	            "execution_output": self.execution_output[:500],
	            "execution_error": self.execution_error[:500],
	            "test_passed": self.test_passed,
	            "critique": self.critique[:300],
	            "fix_notes": self.fix_notes[:300],
	            "duration_secs": round(self.duration_secs, 2),
	        }


	@dataclass
	class DebugResult:
	    """Aggregate outcome of one self-debugging run for a task."""

	    task_id: int
	    goal: str
	    attempts: List[CodeAttempt] = field(default_factory=list)
	    final_code: str = ""
	    final_filename: str = ""
	    final_output: str = ""
	    success: bool = False
	    total_duration_secs: float = 0.0

	    def to_dict(self) -> Dict[str, Any]:
	        """Return a serialisable summary of the run.

	        The final code is reported as a line count only (0 when empty), the
	        final output is cut to 1000 characters, and every attempt is
	        summarised through its own ``to_dict``.
	        """
	        line_total = 0
	        if self.final_code:
	            line_total = self.final_code.count("\n") + 1

	        attempt_summaries = []
	        for entry in self.attempts:
	            attempt_summaries.append(entry.to_dict())

	        summary: Dict[str, Any] = {}
	        summary["task_id"] = self.task_id
	        summary["goal"] = self.goal
	        summary["total_attempts"] = len(self.attempts)
	        summary["success"] = self.success
	        summary["final_code_lines"] = line_total
	        summary["final_filename"] = self.final_filename
	        summary["final_output"] = self.final_output[:1000]
	        summary["attempts"] = attempt_summaries
	        summary["total_duration_secs"] = round(self.total_duration_secs, 2)
	        return summary


	# ─── Code Generator ──────────────────────────────────────────────────────────

	async def generate_code(
	    goal: str,
	    language: str = "python",
	    previous_error: str = "",
	    critique: str = "",
	    previous_code: str = "",
	    memory_context: str = "",
	) -> str:
	    """Ask the LLM router for code that achieves ``goal``.

	    When ``previous_error`` or ``critique`` is non-empty, the prompt asks for
	    a fix of ``previous_code``. Otherwise it asks for a fresh implementation
	    and includes ``memory_context`` instead. Prompt inputs are truncated:
	    previous code to 1500 characters, error to 800, critique to 600 and
	    memory context to 400.

	    Returns:
	        The reply with a surrounding markdown code fence (including any
	        language tag on the opening fence line) removed and outer whitespace
	        stripped. An empty string if the router returned nothing.
	    """
	    if previous_error or critique:
	        system = (
	            "You are an expert software engineer. Fix the code based on the error and critique. "
	            "Return ONLY the complete fixed code, no markdown fences, no explanations."
	        )
	        user = (
	            f"Goal: {goal}\n"
	            f"Previous code:\n{previous_code[:1500] if previous_code else 'none'}\n\n"
	            f"Error:\n{previous_error[:800] if previous_error else 'none'}\n\n"
	            f"Critique:\n{critique[:600] if critique else 'none'}\n\n"
	            f"Write the complete fixed {language} code:"
	        )
	    else:
	        system = (
	            f"You are an expert software engineer. Write clean, working {language} code. "
	            "Return ONLY the code, no markdown fences, no explanations."
	        )
	        user = (
	            f"Goal: {goal}\n"
	            f"Memory context: {memory_context[:400] if memory_context else 'none'}\n\n"
	            f"Write complete {language} code that achieves this goal:"
	        )

	    router = get_router()
	    raw = await router.chat(
	        messages=[
	            {"role": "system", "content": system},
	            {"role": "user", "content": user},
	        ],
	        temperature=0.2,
	        max_tokens=1500,
	    )

	    # Models often fence the answer despite the instructions. Remove the
	    # opening fence together with its language tag, but only when the tag is
	    # followed by a line break (or the end of the text), so a one-line reply
	    # like "```print(1)```" does not lose real code.
	    code = (raw or "").strip()
	    code = re.sub(r"^```(?:[\w+#.-]*[ \t]*(?:\r?\n|$))?", "", code)
	    code = re.sub(r"\n?```$", "", code.strip())
	    return code.strip()


	# ─── Test Generator ──────────────────────────────────────────────────────────

	async def generate_tests(goal: str, code: str, language: str = "python") -> str:
	    """Ask the LLM router for a small test module covering ``code``.

	    Only the first 1200 characters of ``code`` are sent. ``language`` is
	    accepted for interface symmetry but does not alter the prompt, which
	    always requests pytest. A surrounding markdown fence is stripped from the
	    reply, and an empty string is returned if the router gave no reply.
	    """
	    system_prompt = (
	        "You are a QA engineer. Write minimal unit tests that verify the code achieves the goal. "
	        "Use pytest for Python. Return ONLY the test code."
	    )
	    snippet = code[:1200]
	    user_prompt = (
	        f"Goal: {goal}\n"
	        f"Code to test:\n{snippet}\n\n"
	        "Write tests that verify the code works correctly. Keep tests simple and runnable:"
	    )
	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]

	    reply = await get_router().chat(
	        messages=conversation,
	        temperature=0.1,
	        max_tokens=600,
	    )

	    # Peel off an opening and a closing markdown fence, if present.
	    text = reply or ""
	    without_opening = re.sub(r"^```(?:python)?\n?", "", text.strip())
	    without_closing = re.sub(r"\n?```$", "", without_opening.strip())
	    return without_closing.strip()


	# ─── Critique Engine ─────────────────────────────────────────────────────────

	async def critique_code(
	    goal: str,
	    code: str,
	    execution_output: str,
	    execution_error: str,
	    test_passed: bool,
	) -> str:
	    """Ask the LLM router for a short review of a code attempt.

	    The prompt carries the goal, the first 1000 characters of the code, the
	    first 400 characters each of the execution output and error ("none" when
	    empty), and the test verdict. Returns the stripped reply, or an empty
	    string if the router gave none.
	    """
	    shown_output = execution_output[:400] if execution_output else "none"
	    shown_error = execution_error[:400] if execution_error else "none"
	    code_excerpt = code[:1000]

	    system_prompt = (
	        "You are a senior code reviewer. Give a concise, actionable critique. "
	        "Focus on: correctness, error handling, edge cases, missing logic."
	    )
	    user_prompt = (
	        f"Goal: {goal}\n\n"
	        f"Code:\n{code_excerpt}\n\n"
	        f"Execution output: {shown_output}\n"
	        f"Execution error: {shown_error}\n"
	        f"Tests passed: {test_passed}\n\n"
	        "Provide a short critique (2-4 sentences) of what needs to be fixed:"
	    )
	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]

	    reply = await get_router().chat(
	        messages=conversation,
	        temperature=0.3,
	        max_tokens=300,
	    )
	    if not reply:
	        return ""
	    return reply.strip()


	# ─── Code Executor ────────────────────────────────────────────────────────────

	def execute_code(
	    workdir: Any,
	    code: str,
	    filename: str,
	    language: str = "python",
	    timeout: int = 30,
	) -> Tuple[str, str, bool]:
	    """Write ``code`` to ``filename`` in ``workdir`` and run it.

	    The file is written with ``agent_tools.write_file`` and run through
	    ``agent_tools.run_shell`` using an interpreter chosen from ``language``:
	    ``python3`` for "python", ``node`` for "javascript"/"js"/"node", ``bash``
	    for "bash"/"sh". Any other language falls back to ``python3``.

	    Returns:
	        ``(stdout, stderr, success)``. ``success`` is true only when the
	        return code is 0 and stderr is blank, so a run that merely prints
	        warnings to stderr still counts as failed.
	    """
	    import shlex  # local import keeps this block self-contained

	    agent_tools.write_file(workdir, filename, code)

	    interpreters = {
	        "python": "python3",
	        "javascript": "node",
	        "js": "node",
	        "node": "node",
	        "bash": "bash",
	        "sh": "bash",
	    }
	    interpreter = interpreters.get(language, "python3")

	    # Quote the filename so spaces or shell metacharacters cannot change the
	    # command. Plain names such as "solution_v1_ab12.py" pass through as-is.
	    command = f"{interpreter} {shlex.quote(filename)}"
	    result = agent_tools.run_shell(workdir, command, timeout=timeout)

	    # Treat a present-but-None stream the same as a missing one.
	    stdout = result.get("stdout") or ""
	    stderr = result.get("stderr") or ""
	    returncode = result.get("returncode", -1)
	    success = returncode == 0 and not stderr.strip()
	    return stdout, stderr, success


	def run_tests(workdir: Any, test_code: str, test_filename: str) -> Tuple[str, str, bool]:
	    """Write ``test_code`` to ``test_filename`` and run it with pytest.

	    Returns:
	        ``(output, "", all_passed)``. ``output`` is stdout and stderr
	        combined (the command also redirects stderr into stdout), so the
	        second element is always an empty string.

	    The verdict comes from the process return code when ``run_shell``
	    reports one: pytest exits with 0 only if tests were collected and all
	    passed. The previous text scan reported failure whenever a verbose test
	    name contained "error" or "failed". It is kept only as a fallback for
	    results that carry no return code.
	    """
	    import shlex  # local import keeps this block self-contained

	    agent_tools.write_file(workdir, test_filename, test_code)
	    command = f"python3 -m pytest {shlex.quote(test_filename)} -v --tb=short 2>&1"
	    result = agent_tools.run_shell(workdir, command, timeout=60)

	    output = (result.get("stdout") or "") + (result.get("stderr") or "")
	    returncode = result.get("returncode")
	    if returncode is not None:
	        passed = returncode == 0
	    else:
	        passed = (
	            "passed" in output
	            and "failed" not in output
	            and "error" not in output.lower()
	        )
	    return output, "", passed


	# ─── Autonomous Coding Loop ───────────────────────────────────────────────────

	class SelfDebuggingEngine:
	    """Phase 5 — autonomous generate → execute → test → critique → fix loop.

	    Runs up to ``max_attempts`` cycles and stops at the first attempt whose
	    verdict is a pass. The verdict is the test result when tests are active,
	    otherwise the execution result.
	    """

	    def __init__(
	        self,
	        task_id: int,
	        max_attempts: int = 5,
	        language: str = "python",
	        run_tests_flag: bool = True,
	    ) -> None:
	        """Store the loop settings and resolve the task's working directory.

	        Args:
	            task_id: Identifier passed to ``agent_tools.get_workdir``.
	            max_attempts: Upper bound on generate/fix cycles.
	            language: Language requested from the generator and executor.
	            run_tests_flag: Whether generated tests take part in the verdict.
	        """
	        self.task_id = task_id
	        self.max_attempts = max_attempts
	        self.language = language
	        self.run_tests_flag = run_tests_flag
	        self.workdir = agent_tools.get_workdir(task_id)

	    async def debug(
	        self,
	        goal: str,
	        initial_code: str = "",
	        memory_context: str = "",
	    ) -> DebugResult:
	        """Run the self-debugging loop for ``goal``.

	        Args:
	            goal: Natural-language description of what the code must do.
	            initial_code: Optional starting code. When given, it is executed
	                as the first attempt instead of generating from scratch, and
	                later attempts repair it.
	            memory_context: Extra context for the first generation prompt.

	        Returns:
	            A ``DebugResult`` holding every attempt. If no attempt passed,
	            the last attempt supplies the final code, and ``success`` is set
	            only when it produced output and recorded no error.
	        """
	        result = DebugResult(task_id=self.task_id, goal=goal)
	        start_total = time.monotonic()

	        code = initial_code
	        previous_error = ""
	        critique = ""
	        test_code = ""

	        # A non-empty ``test_code`` is what switches testing on in the loop.
	        # NOTE(review): tests stay disabled when ``initial_code`` is given;
	        # this is kept as found, confirm it is intended.
	        if self.run_tests_flag and not initial_code:
	            try:
	                test_code = await generate_tests(goal, "# placeholder", self.language)
	            except Exception as e:
	                logger.warning("Test generation failed: %s", e)

	        for attempt_num in range(1, self.max_attempts + 1):
	            # Pass ``code`` explicitly so construction cannot fail on a
	            # missing argument before anything has been generated.
	            attempt = CodeAttempt(
	                attempt_num=attempt_num, code="", language=self.language
	            )
	            attempt_start = time.monotonic()

	            logger.info(
	                "SelfDebug task=%d attempt=%d/%d", self.task_id, attempt_num, self.max_attempts
	            )

	            # ── Generate / fix code ───────────────────────────────────── #
	            # Caller-supplied code is tried as-is first. Generating here would
	            # discard it, because with no error or critique yet the generator
	            # takes its write-from-scratch branch.
	            seeded = attempt_num == 1 and bool(initial_code)
	            if not seeded:
	                try:
	                    code = await generate_code(
	                        goal=goal,
	                        language=self.language,
	                        previous_error=previous_error,
	                        critique=critique,
	                        previous_code=code,
	                        memory_context=memory_context,
	                    )
	                except Exception as e:
	                    attempt.execution_error = f"Code generation failed: {e}"
	                    attempt.duration_secs = time.monotonic() - attempt_start
	                    result.attempts.append(attempt)
	                    continue

	            attempt.code = code
	            fname = f"solution_v{attempt_num}_{uuid.uuid4().hex[:4]}.py"
	            attempt.filename = fname

	            # ── Execute ───────────────────────────────────────────────── #
	            stdout, stderr, exec_ok = execute_code(
	                self.workdir, code, fname, self.language
	            )
	            attempt.execution_output = stdout
	            attempt.execution_error = stderr

	            # ── Run tests (if available) ──────────────────────────────── #
	            if test_code and self.run_tests_flag:
	                # Regenerate the tests against the code actually produced.
	                try:
	                    test_code = await generate_tests(goal, code, self.language)
	                    test_fname = f"test_solution_v{attempt_num}.py"
	                    test_out, _, test_ok = run_tests(self.workdir, test_code, test_fname)
	                    attempt.test_passed = test_ok
	                    if not test_ok:
	                        attempt.execution_error += f"\nTest output: {test_out[:400]}"
	                except Exception as e:
	                    # Fall back to the execution verdict if testing breaks.
	                    attempt.test_passed = exec_ok
	                    logger.warning("Test run error: %s", e)
	            else:
	                attempt.test_passed = exec_ok

	            # ── Critique ──────────────────────────────────────────────── #
	            if not attempt.test_passed:
	                try:
	                    # Hand over the combined error text so the reviewer also
	                    # sees failing test output, not just stderr.
	                    critique = await critique_code(
	                        goal, code, stdout, attempt.execution_error, attempt.test_passed
	                    )
	                    attempt.critique = critique
	                except Exception as e:
	                    logger.warning("Critique failed: %s", e)

	            # Feed the next fix prompt the real failure details, including
	            # any test output appended above.
	            if attempt.test_passed:
	                previous_error = stderr
	            else:
	                previous_error = attempt.execution_error or "tests failed"

	            attempt.duration_secs = time.monotonic() - attempt_start
	            result.attempts.append(attempt)

	            if attempt.test_passed:
	                result.success = True
	                result.final_code = code
	                result.final_filename = fname
	                result.final_output = stdout
	                break

	        result.total_duration_secs = time.monotonic() - start_total

	        # If never succeeded, fall back to the last attempt.
	        if not result.success and result.attempts:
	            last = result.attempts[-1]
	            result.final_code = last.code
	            result.final_filename = last.filename
	            result.final_output = last.execution_output
	            # Partial success: it produced output and recorded no error.
	            result.success = not last.execution_error and bool(last.execution_output)

	        return result