Spaces:

PYAE1994
/

openhands-genspark-agent

Running

File size: 14,232 Bytes

19c8c50

"""Phase 5 — Self-Debugging Engine.

Drives the autonomous fix loop:

  generate → execute → test → critique → fix → repeat

Key components:
  1. CodeGenerator    — LLM produces code for a task description
  2. CodeExecutor     — runs the code in the sandbox
  3. TestRunner       — runs inline unit tests or verifiers
  4. CritiqueEngine   — reviews code quality + error output
  5. FixEngine        — generates targeted patch based on failure + critique
  6. AutonomousLoop   — orchestrates the full generate→test→fix cycle
"""

from __future__ import annotations

import json
import logging
import os
import re
import time
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from ..router.smart_router import get_router
from . import tools as agent_tools

logger = logging.getLogger(__name__)


# ─── Data Models ─────────────────────────────────────────────────────────────

@dataclass
class CodeAttempt:
    """Record of a single generate → execute → test cycle.

    ``code`` defaults to an empty string so an attempt can be created before
    any code exists (for example when generation itself fails) and filled in
    afterwards.  Positional construction ``CodeAttempt(n, code)`` still works.
    """

    attempt_num: int           # 1-based index of the attempt within a run
    code: str = ""             # source text produced for this attempt
    language: str = "python"
    filename: str = ""         # file the code was written to before running
    execution_output: str = ""  # captured stdout
    execution_error: str = ""   # captured stderr (plus any appended test output)
    test_passed: bool = False
    critique: str = ""
    fix_notes: str = ""
    duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of the attempt.

        The code itself is not included, only its line count; long text
        fields are truncated so the summary stays small.
        """
        return {
            "attempt": self.attempt_num,
            "language": self.language,
            "filename": self.filename,
            # Empty code is reported as 0 lines, consistent with
            # DebugResult.to_dict's handling of an empty final_code.
            "code_lines": self.code.count("\n") + 1 if self.code else 0,
            "execution_output": self.execution_output[:500],
            "execution_error": self.execution_error[:500],
            "test_passed": self.test_passed,
            "critique": self.critique[:300],
            "fix_notes": self.fix_notes[:300],
            "duration_secs": round(self.duration_secs, 2),
        }


@dataclass
class DebugResult:
    """Aggregate outcome of one self-debugging run for a task."""

    task_id: int
    goal: str
    attempts: List[CodeAttempt] = field(default_factory=list)
    final_code: str = ""
    final_filename: str = ""
    final_output: str = ""
    success: bool = False
    total_duration_secs: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary including every attempt's summary.

        The final code is reported only as a line count (0 when empty) and
        the final output is cut to 1000 characters.
        """
        line_total = 0
        if self.final_code:
            line_total = self.final_code.count("\n") + 1

        summary: Dict[str, Any] = {}
        summary["task_id"] = self.task_id
        summary["goal"] = self.goal
        summary["total_attempts"] = len(self.attempts)
        summary["success"] = self.success
        summary["final_code_lines"] = line_total
        summary["final_filename"] = self.final_filename
        summary["final_output"] = self.final_output[:1000]
        summary["attempts"] = [entry.to_dict() for entry in self.attempts]
        summary["total_duration_secs"] = round(self.total_duration_secs, 2)
        return summary


# ─── Code Generator ──────────────────────────────────────────────────────────

async def generate_code(
    goal: str,
    language: str = "python",
    previous_error: str = "",
    critique: str = "",
    previous_code: str = "",
    memory_context: str = "",
) -> str:
    """Generate code for the goal, optionally fixing a previous attempt.

    When ``previous_error`` or ``critique`` is non-empty the model is asked to
    repair ``previous_code``; otherwise it is asked to write fresh code and
    ``previous_code`` is not sent.  Inputs are truncated before being placed
    in the prompt (code 1500, error 800, critique 600, memory 400 chars).

    Returns:
        The model's reply with a surrounding markdown code fence removed and
        outer whitespace stripped; an empty string if the reply was empty.
    """
    if previous_error or critique:
        system = (
            "You are an expert software engineer. Fix the code based on the error and critique. "
            "Return ONLY the complete fixed code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Previous code:\n{previous_code[:1500] if previous_code else 'none'}\n\n"
            f"Error:\n{previous_error[:800] if previous_error else 'none'}\n\n"
            f"Critique:\n{critique[:600] if critique else 'none'}\n\n"
            f"Write the complete fixed {language} code:"
        )
    else:
        system = (
            f"You are an expert software engineer. Write clean, working {language} code. "
            "Return ONLY the code, no markdown fences, no explanations."
        )
        user = (
            f"Goal: {goal}\n"
            f"Memory context: {memory_context[:400] if memory_context else 'none'}\n\n"
            f"Write complete {language} code that achieves this goal:"
        )

    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.2,
        max_tokens=1500,
    )

    # Strip a surrounding markdown fence if the model added one anyway.
    # The opening fence may carry any info string ("js", "python3",
    # "typescript", ...), not just a fixed handful; the tag is only consumed
    # when it is followed by a newline, so code that starts on the same line
    # as the fence is left intact.
    code = (raw or "").strip()
    code = re.sub(r"^```(?:[\w+#.-]*[ \t]*\r?\n)?", "", code)
    code = re.sub(r"\r?\n?[ \t]*```$", "", code.strip())
    return code.strip()


# ─── Test Generator ──────────────────────────────────────────────────────────

async def generate_tests(goal: str, code: str, language: str = "python") -> str:
    """Ask the model for simple unit tests covering ``code``.

    Args:
        goal: Task description the code is meant to satisfy.
        code: Code under test; only the first 1200 characters are sent.
        language: Accepted for interface symmetry with ``generate_code``;
            the prompt does not currently mention it.

    Returns:
        The generated test source with a surrounding markdown fence removed,
        or an empty string if the model returned nothing.
    """
    system = (
        "You are a QA engineer. Write minimal unit tests that verify the code achieves the goal. "
        "Use pytest for Python. Return ONLY the test code."
    )
    user = (
        f"Goal: {goal}\n"
        f"Code to test:\n{code[:1200]}\n\n"
        "Write tests that verify the code works correctly. Keep tests simple and runnable:"
    )
    router = get_router()
    raw = await router.chat(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        temperature=0.1,
        max_tokens=600,
    )

    # Remove a surrounding markdown fence.  Any info string on the opening
    # fence ("py", "python3", ...) is dropped, but only when it ends the
    # line, so a leftover tag can never become the first line of the test
    # file and code sharing a line with the fence is preserved.
    code_block = (raw or "").strip()
    code_block = re.sub(r"^```(?:[\w+#.-]*[ \t]*\r?\n)?", "", code_block)
    code_block = re.sub(r"\r?\n?[ \t]*```$", "", code_block.strip())
    return code_block.strip()


# ─── Critique Engine ─────────────────────────────────────────────────────────

async def critique_code(
    goal: str,
    code: str,
    execution_output: str,
    execution_error: str,
    test_passed: bool,
) -> str:
    """Ask the model for a short, actionable review of an attempt.

    The prompt carries the goal, the first 1000 characters of the code, up
    to 400 characters each of the execution output and error ("none" when
    empty), and the test outcome.  Returns the stripped reply, or an empty
    string when the model returned nothing.
    """
    output_excerpt = execution_output[:400] if execution_output else "none"
    error_excerpt = execution_error[:400] if execution_error else "none"

    reviewer_role = (
        "You are a senior code reviewer. Give a concise, actionable critique. "
        "Focus on: correctness, error handling, edge cases, missing logic."
    )
    request = "".join(
        [
            f"Goal: {goal}\n\n",
            f"Code:\n{code[:1000]}\n\n",
            f"Execution output: {output_excerpt}\n",
            f"Execution error: {error_excerpt}\n",
            f"Tests passed: {test_passed}\n\n",
            "Provide a short critique (2-4 sentences) of what needs to be fixed:",
        ]
    )
    conversation = [
        {"role": "system", "content": reviewer_role},
        {"role": "user", "content": request},
    ]

    reply = await get_router().chat(
        messages=conversation,
        temperature=0.3,
        max_tokens=300,
    )
    if not reply:
        return ""
    return reply.strip()


# ─── Code Executor ────────────────────────────────────────────────────────────

def execute_code(
    workdir: Any,
    code: str,
    filename: str,
    language: str = "python",
    timeout: int = 30,
) -> Tuple[str, str, bool]:
    """Write ``code`` to ``filename`` in the sandbox and run it.

    Args:
        workdir: Sandbox working directory handle passed through to
            ``agent_tools``.
        code: Source text to execute.
        filename: Name of the file the code is written to and run from.
        language: Selects the interpreter; anything unrecognised is run
            with ``python3``.
        timeout: Seconds passed to ``agent_tools.run_shell``.

    Returns:
        ``(stdout, stderr, success)`` where ``success`` requires a zero
        return code *and* empty stderr.
    """
    import shlex  # local import: only needed to quote the filename below

    agent_tools.write_file(workdir, filename, code)

    interpreters = {
        "python": "python3",
        "javascript": "node",
        "js": "node",
        "node": "node",
        "bash": "bash",
        "sh": "bash",
    }
    interpreter = interpreters.get(language, "python3")

    # The command goes through a shell, so quote the filename: names with
    # spaces or shell metacharacters must reach the interpreter as a single
    # argument rather than being re-parsed.
    result = agent_tools.run_shell(
        workdir, f"{interpreter} {shlex.quote(filename)}", timeout=timeout
    )

    # `or ""` guards against keys that are present but None, which would
    # otherwise break the .strip() call below.
    stdout = result.get("stdout") or ""
    stderr = result.get("stderr") or ""
    returncode = result.get("returncode", -1)
    success = returncode == 0 and not stderr.strip()
    return stdout, stderr, success


def run_tests(workdir: Any, test_code: str, test_filename: str) -> Tuple[str, str, bool]:
    """Write ``test_code`` to ``test_filename`` and run it with pytest.

    Returns:
        ``(output, "", all_passed)``.  ``output`` is stdout and stderr
        combined (stderr is also redirected into stdout by the command), so
        the second element is always an empty string.
    """
    import shlex  # local import: only needed to quote the filename below

    agent_tools.write_file(workdir, test_filename, test_code)
    result = agent_tools.run_shell(
        workdir,
        f"python3 -m pytest {shlex.quote(test_filename)} -v --tb=short 2>&1",
        timeout=60,
    )
    output = (result.get("stdout") or "") + (result.get("stderr") or "")

    returncode = result.get("returncode")
    if returncode is not None:
        # pytest exits with 0 only when tests were collected and none failed.
        # Scanning the -v output for "error"/"failed" misreports passing
        # tests whose *names* contain those words (e.g. test_handles_error).
        # "passed" is still required so an all-skipped run does not count.
        passed = returncode == 0 and "passed" in output
    else:
        # No exit status reported by the shell helper: fall back to the
        # textual heuristic.
        passed = (
            "passed" in output
            and "failed" not in output
            and "error" not in output.lower()
        )
    return output, "", passed


# ─── Autonomous Coding Loop ───────────────────────────────────────────────────

class SelfDebuggingEngine:
    """Phase 5 — autonomous generate → execute → test → critique → fix loop.

    Runs up to ``max_attempts`` code generation+fix cycles, stopping when
    all tests pass or max attempts is reached.
    """

    # Languages that ``execute_code`` runs with an interpreter other than
    # python3.  ``run_tests`` always invokes pytest, so generated tests are
    # only meaningful for everything else (i.e. Python).
    _NON_PYTHON_LANGUAGES = ("javascript", "js", "node", "bash", "sh")

    def __init__(
        self,
        task_id: int,
        max_attempts: int = 5,
        language: str = "python",
        run_tests_flag: bool = True,
    ) -> None:
        """Store loop settings and resolve the task's working directory.

        Args:
            task_id: Identifier used for logging and to look up the workdir.
            max_attempts: Upper bound on generate/fix cycles.
            language: Language requested from the generator and used to pick
                the interpreter.
            run_tests_flag: When true (and the language is Python), tests are
                generated and run for each attempt.
        """
        self.task_id = task_id
        self.max_attempts = max_attempts
        self.language = language
        self.run_tests_flag = run_tests_flag
        self.workdir = agent_tools.get_workdir(task_id)

    async def debug(
        self,
        goal: str,
        initial_code: str = "",
        memory_context: str = "",
    ) -> DebugResult:
        """Run the self-debugging loop.

        Args:
            goal: Task description given to the generator, test writer and
                critic.
            initial_code: Optional starting code.  When provided it is
                executed as the first attempt instead of being regenerated.
            memory_context: Extra context forwarded to the generator.

        Returns:
            A ``DebugResult`` holding every attempt.  If no attempt passed,
            the last attempt's code/output are reported and ``success`` is
            set only when that attempt produced output and recorded no error.
        """
        result = DebugResult(task_id=self.task_id, goal=goal)
        start_total = time.monotonic()

        code = initial_code
        previous_error = ""
        critique = ""
        use_tests = (
            self.run_tests_flag
            and self.language not in self._NON_PYTHON_LANGUAGES
        )

        for attempt_num in range(1, self.max_attempts + 1):
            # `code` is passed explicitly and filled in once this attempt
            # actually has code to run.
            attempt = CodeAttempt(
                attempt_num=attempt_num, code="", language=self.language
            )
            attempt_start = time.monotonic()

            logger.info(
                "SelfDebug task=%d attempt=%d/%d", self.task_id, attempt_num, self.max_attempts
            )

            # ── Generate / fix code ───────────────────────────────────────── #
            # Caller-supplied code is run as-is on the first attempt; it is
            # only sent to the model for fixing once it has failed.
            if not (attempt_num == 1 and initial_code.strip()):
                generation_error = ""
                generated = ""
                try:
                    generated = await generate_code(
                        goal=goal,
                        language=self.language,
                        previous_error=previous_error,
                        critique=critique,
                        previous_code=code,
                        memory_context=memory_context,
                    )
                except Exception as e:
                    generation_error = f"Code generation failed: {e}"
                else:
                    # An empty file "runs" cleanly, so an empty reply must
                    # not be allowed to count as a successful attempt.
                    if not generated.strip():
                        generation_error = "Code generation failed: empty response"

                if generation_error:
                    logger.warning("%s", generation_error)
                    attempt.execution_error = generation_error
                    attempt.duration_secs = time.monotonic() - attempt_start
                    result.attempts.append(attempt)
                    continue
                code = generated

            attempt.code = code
            fname = f"solution_v{attempt_num}_{uuid.uuid4().hex[:4]}.py"
            attempt.filename = fname

            # ── Execute ───────────────────────────────────────────────────── #
            stdout, stderr, exec_ok = execute_code(
                self.workdir, code, fname, self.language
            )
            attempt.execution_output = stdout
            attempt.execution_error = stderr

            # ── Run tests (if enabled) ────────────────────────────────────── #
            if use_tests:
                try:
                    # The solution file name carries a random suffix, so the
                    # test writer has to be told which module to import.
                    module_name = fname[: -len(".py")]
                    test_goal = (
                        f"{goal}\n\n"
                        f"The code under test is saved as '{fname}' in the current "
                        f"directory; import it in the tests as module '{module_name}'."
                    )
                    test_code = await generate_tests(test_goal, code, self.language)
                    if test_code.strip():
                        test_fname = f"test_solution_v{attempt_num}.py"
                        test_out, _, test_ok = run_tests(self.workdir, test_code, test_fname)
                        attempt.test_passed = test_ok
                        if not test_ok:
                            # Keep the tail: pytest prints the failure
                            # summary last, after the session header.
                            attempt.execution_error += f"\nTest output: {test_out[-400:]}"
                    else:
                        # Nothing to run; fall back to the execution result.
                        attempt.test_passed = exec_ok
                except Exception as e:
                    attempt.test_passed = exec_ok
                    logger.warning("Test run error: %s", e)
            else:
                attempt.test_passed = exec_ok

            # ── Critique ──────────────────────────────────────────────────── #
            if not attempt.test_passed:
                try:
                    # Pass the combined error text so the critic sees test
                    # failures as well as stderr.
                    critique = await critique_code(
                        goal, code, stdout, attempt.execution_error, attempt.test_passed
                    )
                    attempt.critique = critique
                except Exception as e:
                    logger.warning("Critique failed: %s", e)

            # Feed stderr *and* any test output to the next fix prompt.
            previous_error = attempt.execution_error.strip() or (
                "tests failed" if not attempt.test_passed else ""
            )

            attempt.duration_secs = time.monotonic() - attempt_start
            result.attempts.append(attempt)

            if attempt.test_passed:
                result.success = True
                result.final_code = code
                result.final_filename = fname
                result.final_output = stdout
                break

        result.total_duration_secs = time.monotonic() - start_total

        # If never succeeded, use last attempt
        if not result.success and result.attempts:
            last = result.attempts[-1]
            result.final_code = last.code
            result.final_filename = last.filename
            result.final_output = last.execution_output
            # Partial success if code ran without error
            result.success = not last.execution_error and bool(last.execution_output)

        return result