Spaces:

DEVessi
/

devops_sandbox

Runtime error

File size: 15,536 Bytes

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Self-Healing DevOps Sandbox — Environment Implementation.

Runs entirely natively on the host filesystem (Hugging Face Spaces compatible).
The RL agent executes bash commands to diagnose and fix 3 bugs via direct subprocesses.
"""

import logging
import os
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import BashAction, TerminalObservation
except ImportError:
    from models import BashAction, TerminalObservation

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# The grader looks for "Server running on port <EXPECTED_PORT>" in the app's
# startup log to decide whether the app came up correctly.
EXPECTED_PORT = 3000          # The port the fixed app should listen on
# Once an episode's step counter reaches this value, step() reports done=True.
MAX_STEPS = 50                # Episode budget
# "simulated_app" directory located two levels above this file; used as the
# pristine backup source when running on Windows.
SIMULATED_APP_DIR = Path(__file__).resolve().parent.parent / "simulated_app"

class DevOpsSandbox(Environment):
    """
    RL environment: fix a broken Node.js backend.

    No longer uses Docker (Docker-in-Docker is unsupported in HF Spaces).
    Instead, agent commands run through native ``subprocess.run()`` calls in a
    working directory that is re-created from a pristine backup on every reset.

    After every non-empty command a grader script restarts the app, probes its
    endpoints and yields a score in ``[0.0, 1.0]`` which is also the reward.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    # Task identifier reported in every observation.
    _TASK_ID = "devops_sandbox"

    # Keys emitted by the grader script, one ``KEY:value`` record each.
    _GRADER_KEYS = (
        "GRADER_STARTUP_LOG",
        "GRADER_HEALTH_CODE",
        "GRADER_USERS_CODE",
        "GRADER_DATA_CODE",
        "GRADER_USERS_BODY",
        "GRADER_DATA_BODY",
    )

    def __init__(self):
        """Set up episode state and the platform-specific sandbox paths."""
        super().__init__()
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_score: float = 0.0

        # When running on Windows locally, `/app` and `/app_backup` don't exist
        # naturally, so paths inside the repository are used instead.
        # On a HF Space (Linux), /app and /app_backup live at the root.
        if sys.platform == "win32":
            workspace = Path(__file__).resolve().parent.parent
            self._app_dir = str(workspace / ".app_sandbox")
            self._app_backup_dir = str(SIMULATED_APP_DIR)
            self._tmp_dir = str(workspace / ".tmp")
            os.makedirs(self._tmp_dir, exist_ok=True)
        else:
            self._app_dir = "/app"
            self._app_backup_dir = "/app_backup"
            self._tmp_dir = "/tmp"

        self._current_dir: str = self._app_dir
        # Defined up front so grading works even if step() precedes reset().
        self.grader_path: str = os.path.join(self._tmp_dir, "grader.sh")

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Reset the environment state by copying the backup to the working dir.

        Args:
            seed: Accepted for interface compatibility; not used.
            episode_id: Identifier for the new episode; a fresh UUID is
                generated when omitted.
            **kwargs: Ignored.

        Returns:
            The initial observation: the task prompt followed by a listing of
            the app directory and the contents of its ``config.json``.
        """
        eid = episode_id or str(uuid4())
        self._state = State(episode_id=eid, step_count=0)
        self._last_score = 0.0
        self._current_dir = self._app_dir

        self._reset_filesystem()
        self._inject_grader_script()

        # Gather initial observation
        init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")

        task_prompt = (
            "=== SELF-HEALING DEVOPS SANDBOX ===\n"
            f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
            "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
            f"  1. The app starts without errors on port {EXPECTED_PORT}\n"
            "  2. GET /health returns HTTP 200\n"
            "  3. GET /api/users returns HTTP 200 with valid JSON\n"
            "  4. GET /api/data returns HTTP 200 with valid JSON\n\n"
            "HINTS:\n"
            "  - Check config files for wrong settings\n"
            "  - Look for syntax errors that prevent startup\n"
            "  - Watch out for async/await issues\n\n"
            "Use bash commands to explore, edit files, and test.\n"
            "When you think you've fixed everything, run: npm start\n\n"
            "--- INITIAL DIRECTORY LISTING ---\n"
            f"{init_stdout}\n"
        )

        return TerminalObservation(
            stdout=task_prompt,
            stderr="",
            current_dir=self._current_dir,
            task_id=self._TASK_ID,
            grader_score=0.0,
            grader_feedback="Episode started. Fix the bugs!",
            done=False,
            reward=0.0,
        )

    def step(
        self,
        action: BashAction,  # type: ignore[override]
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Execute the agent's command natively, run grader, return observation.

        Args:
            action: Action whose ``command`` attribute is the bash command.
            timeout_s: Per-command timeout in seconds; 30 seconds when falsy.
            **kwargs: Ignored.

        Returns:
            Observation carrying the command output, the tracked working
            directory, and the grader's score/feedback. An empty command is not
            graded and reports the previous score.
        """
        self._state.step_count += 1

        command = action.command.strip()
        if not command:
            return TerminalObservation(
                stdout="",
                stderr="Empty command. Please provide a bash command.",
                current_dir=self._current_dir,
                task_id=self._TASK_ID,
                grader_score=self._last_score,
                grader_feedback="No command executed.",
                # Empty commands still consume the step budget.
                done=(self._last_score >= 1.0)
                or (self._state.step_count >= MAX_STEPS),
                reward=self._last_score,
            )

        # A plain `cd` must be tracked here because every command runs in its
        # own short-lived shell. Compound commands (`cd x && ls`) go to bash.
        cd_target = self._parse_simple_cd(command)
        if cd_target is not None:
            stdout, stderr = "", self._change_dir(cd_target)
        else:
            timeout = timeout_s or 30.0
            # _exec_cmd_split reports failures via stderr instead of raising.
            stdout, stderr = self._exec_cmd_split(command, timeout=timeout)

        return self._graded_observation(stdout, stderr)

    @property
    def state(self) -> State:
        """Current episode state (episode id and step counter)."""
        return self._state

    def close(self) -> None:
        """Best-effort cleanup of node servers spawned during the session."""
        self._exec_cmd("pkill -f 'node server.js'")

    # ==================================================================
    #  STEP HELPERS
    # ==================================================================
    @staticmethod
    def _parse_simple_cd(command: str) -> Optional[str]:
        """Return the target of a standalone ``cd`` command.

        Returns the (unquoted) target, ``""`` for a bare ``cd``, or ``None``
        when the command is not a simple ``cd [dir]`` — e.g. it contains shell
        operators such as ``&&``, ``;`` or ``|``, or has unbalanced quotes.
        """
        lexer = shlex.shlex(command, posix=True, punctuation_chars=True)
        lexer.whitespace_split = True
        # Do not treat '#' as a comment start; keep words like `foo#bar` whole.
        lexer.commenters = ""
        if sys.platform == "win32":
            # Backslashes are path separators on Windows, not escapes.
            lexer.escape = ""
        try:
            tokens = list(lexer)
        except ValueError:
            # Unbalanced quotes: let the shell report the syntax error.
            return None

        if not tokens or tokens[0] != "cd" or len(tokens) > 2:
            return None
        return tokens[1] if len(tokens) == 2 else ""

    def _change_dir(self, target: str) -> str:
        """Update the tracked working directory; return the stderr text.

        ``""`` and ``~`` map to the app directory (treated as home), ``~/x`` is
        resolved below it, and anything else is resolved against the current
        directory (absolute targets replace it). Returns an empty string on
        success or a bash-style error message when the directory is missing.
        """
        if target in ("", "~"):
            new_dir = self._app_dir
        elif target.startswith("~/"):
            new_dir = os.path.normpath(os.path.join(self._app_dir, target[2:]))
        else:
            new_dir = os.path.normpath(os.path.join(self._current_dir, target))

        if os.path.isdir(new_dir):
            self._current_dir = new_dir
            return ""
        return f"bash: cd: {target}: No such file or directory"

    def _graded_observation(self, stdout: str, stderr: str) -> TerminalObservation:
        """Run the grader and wrap the command output in an observation."""
        score, feedback = self._grade()
        self._last_score = score
        episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)

        return TerminalObservation(
            stdout=stdout,
            stderr=stderr,
            current_dir=self._current_dir,
            task_id=self._TASK_ID,
            grader_score=score,
            grader_feedback=feedback,
            done=episode_done,
            reward=score,
        )

    # ==================================================================
    #  FILESYSTEM & EXECUTION HELPERS
    # ==================================================================
    def _reset_filesystem(self) -> None:
        """Replace the current working /app with the pristine /app_backup.

        Raises:
            RuntimeError: If the app directory resolves to the filesystem root
                or to the backup directory itself, since wiping either would
                destroy data that must survive the reset.
        """
        app_real = os.path.realpath(self._app_dir)
        backup_real = os.path.realpath(self._app_backup_dir)
        if app_real == Path(app_real).anchor or app_real == backup_real:
            raise RuntimeError(
                f"Refusing to wipe unsafe sandbox directory: {self._app_dir}"
            )

        if os.path.exists(self._app_dir):
            shutil.rmtree(self._app_dir, ignore_errors=True)
        os.makedirs(self._app_dir, exist_ok=True)

        if not os.path.isdir(self._app_backup_dir):
            logger.warning(
                "Backup directory %s not found. Ensure Dockerfile copied simulated_app here.",
                self._app_backup_dir,
            )
            return

        # Copy entry by entry so the app directory keeps its own metadata.
        for item in os.listdir(self._app_backup_dir):
            src = os.path.join(self._app_backup_dir, item)
            dst = os.path.join(self._app_dir, item)
            if os.path.isdir(src):
                shutil.copytree(src, dst, dirs_exist_ok=True)
            else:
                shutil.copy2(src, dst)

    def _exec_cmd(self, cmd: str, timeout: float = 30.0) -> str:
        """Execute command natively; return combined stdout + stderr, stripped."""
        stdout, stderr = self._exec_cmd_split(cmd, timeout)
        return (stdout + "\n" + stderr).strip()

    @staticmethod
    def _decode(data: Any) -> str:
        """Decode captured process output (bytes, str or None) to text."""
        if data is None:
            return ""
        if isinstance(data, bytes):
            return data.decode(errors="replace")
        return str(data)

    def _exec_cmd_split(self, cmd: str, timeout: float = 30.0) -> Tuple[str, str]:
        """Execute command natively; return (stdout, stderr).

        The command runs through a shell by design: it is the agent's bash
        input. Failures never raise; they are reported through the stderr
        element as ``[command timed out]`` or ``[exec error: ...]``. On
        timeout, whatever output was captured so far is still returned.
        """
        kwargs: Dict[str, Any] = {
            "cwd": self._current_dir,
            "shell": True,
            "capture_output": True,
            "timeout": timeout,
        }

        # Hugging Face space requires POSIX bash, windows uses powershell/cmd
        if sys.platform != "win32":
            kwargs["executable"] = "/bin/bash"

        try:
            result = subprocess.run(cmd, **kwargs)
        except subprocess.TimeoutExpired as exc:
            partial_out = self._decode(exc.stdout)
            partial_err = self._decode(exc.stderr).rstrip("\n")
            marker = "[command timed out]"
            return (partial_out, f"{partial_err}\n{marker}" if partial_err else marker)
        except Exception as e:
            return ("", f"[exec error: {e}]")

        return (self._decode(result.stdout), self._decode(result.stderr))

    # ==================================================================
    #  GRADER
    # ==================================================================
    def _inject_grader_script(self) -> None:
        """Write the bash grader script to ``<tmp_dir>/grader.sh``.

        The script kills any running ``node server.js``, starts the app from
        the app directory in the background, polls ``/health`` for up to four
        seconds, records the HTTP status of the three endpoints plus the users
        and data response bodies, stops the app, and prints one
        ``GRADER_<KEY>:<value>`` record per result.
        """
        self.grader_path = os.path.join(self._tmp_dir, "grader.sh")

        # Quote paths so directories containing spaces survive the shell.
        app_dir = shlex.quote(self._app_dir)
        node_log = shlex.quote(f"{self._tmp_dir}/node.log")
        health_json = shlex.quote(f"{self._tmp_dir}/health.json")
        users_json = shlex.quote(f"{self._tmp_dir}/users.json")
        data_json = shlex.quote(f"{self._tmp_dir}/data.json")
        base_url = f"http://localhost:{EXPECTED_PORT}"
        code_fmt = "'%{http_code}'"

        lines = [
            '#!/bin/bash',
            'set -m',
            '',
            'pkill -f "node server.js" 2>/dev/null',
            'sleep 0.5',
            '',
            f'cd {app_dir}',
            f'node server.js > {node_log} 2>&1 &',
            'NODE_PID=$!',
            '',
            'for i in 1 2 3 4; do',
            '  sleep 1',
            f'  if curl -s {base_url}/health > /dev/null 2>&1; then',
            '    break',
            '  fi',
            'done',
            '',
            f'STARTUP_LOG=$(cat {node_log} 2>/dev/null)',
            '',
            f"HEALTH_CODE=$(curl -s -o {health_json} -w {code_fmt} {base_url}/health 2>/dev/null)",
            f"USERS_CODE=$(curl -s -o {users_json} -w {code_fmt} {base_url}/api/users 2>/dev/null)",
            f"DATA_CODE=$(curl -s -o {data_json} -w {code_fmt} {base_url}/api/data 2>/dev/null)",
            f'USERS_BODY=$(cat {users_json} 2>/dev/null)',
            f'DATA_BODY=$(cat {data_json} 2>/dev/null)',
            '',
            'kill $NODE_PID 2>/dev/null',
            'wait $NODE_PID 2>/dev/null',
            '',
            'echo "GRADER_STARTUP_LOG:${STARTUP_LOG}"',
            'echo "GRADER_HEALTH_CODE:${HEALTH_CODE}"',
            'echo "GRADER_USERS_CODE:${USERS_CODE}"',
            'echo "GRADER_DATA_CODE:${DATA_CODE}"',
            'echo "GRADER_USERS_BODY:${USERS_BODY}"',
            'echo "GRADER_DATA_BODY:${DATA_BODY}"',
        ]

        script_content = '\n'.join(lines) + '\n'
        # newline='\n' keeps LF endings even on Windows so bash can run it.
        with open(self.grader_path, "w", newline='\n', encoding="utf-8") as f:
            f.write(script_content)

        if sys.platform != "win32":
            os.chmod(self.grader_path, 0o755)

    @classmethod
    def _parse_grader_output(cls, raw: str) -> Dict[str, str]:
        """Parse the grader script's stdout into ``{key: value}``.

        A record starts at a line of the form ``<known key>:<value>``; every
        following line up to the next record belongs to the same value, so
        multi-line startup logs and response bodies are kept intact.
        """
        collected: Dict[str, Any] = {}
        current = None
        for line in raw.splitlines():
            key, sep, value = line.partition(":")
            if sep and key in cls._GRADER_KEYS:
                current = key
                collected[key] = [value]
            elif current is not None:
                collected[current].append(line)
        return {key: "\n".join(chunks).strip() for key, chunks in collected.items()}

    def _grade(self) -> Tuple[float, str]:
        """Run the grader script and turn its output into (score, feedback).

        Scoring: +0.35 when the startup log shows the app listening on the
        expected port, +0.10 for ``/health`` returning 200, +0.15 / +0.25 for
        ``/api/users`` / ``/api/data`` returning 200 with the expected key in
        the body (+0.05 each for a 200 with another body), and a +0.15 bonus
        when all of these pass. If grading itself fails, the previous score is
        returned unchanged.
        """
        score = 0.0
        feedback_parts = []

        try:
            # Rewrite the script every time: the agent can write to the temp
            # directory and may have deleted or modified it.
            self._inject_grader_script()

            # On Windows this relies on a `bash` on PATH (WSL or Git Bash).
            shell = "bash" if sys.platform == "win32" else "/bin/bash"
            # Parse stdout only so stderr noise cannot leak into the records.
            raw, _ = self._exec_cmd_split(f"{shell} {self.grader_path}", timeout=20.0)
            results = self._parse_grader_output(raw)

            startup_log = results.get("GRADER_STARTUP_LOG", "")
            health_code = results.get("GRADER_HEALTH_CODE", "000")
            users_code = results.get("GRADER_USERS_CODE", "000")
            data_code = results.get("GRADER_DATA_CODE", "000")
            users_body = results.get("GRADER_USERS_BODY", "")
            data_body = results.get("GRADER_DATA_BODY", "")

            has_syntax_error = "SyntaxError" in startup_log
            has_crash = (has_syntax_error
                         or "Cannot find module" in startup_log
                         or "ReferenceError" in startup_log)
            app_listening = f"Server running on port {EXPECTED_PORT}" in startup_log

            if has_crash and not app_listening:
                feedback_parts.append("✗ App crashes on startup")
                if has_syntax_error:
                    feedback_parts.append("(SyntaxError detected)")
                return (score, " | ".join(feedback_parts))

            if not app_listening:
                feedback_parts.append(f"✗ App not listening on port {EXPECTED_PORT}")
                return (score, " | ".join(feedback_parts))

            score += 0.35
            feedback_parts.append(f"✓ App starts on port {EXPECTED_PORT} (+0.35)")

            health_ok = health_code == "200"
            if health_ok:
                score += 0.10
                feedback_parts.append("✓ /health returns 200 (+0.10)")
            else:
                feedback_parts.append(f"✗ /health returned {health_code}")

            users_ok = users_code == "200" and '"users"' in users_body
            if users_ok:
                score += 0.15
                feedback_parts.append("✓ /api/users returns valid JSON (+0.15)")
            elif users_code == "200":
                score += 0.05
                feedback_parts.append("~ /api/users 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/users returned {users_code}")

            data_ok = data_code == "200" and '"records"' in data_body
            if data_ok:
                score += 0.25
                feedback_parts.append("✓ /api/data returns valid JSON (+0.25)")
            elif data_code == "200":
                score += 0.05
                feedback_parts.append("~ /api/data 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/data returned {data_code}")

            # Explicit flags instead of comparing the float sum against 0.85.
            if health_ok and users_ok and data_ok:
                score = min(score + 0.15, 1.0)
                feedback_parts.append("✓ All endpoints healthy — FULL SCORE (+0.15)")

        except Exception as exc:
            logger.exception("Grader error")
            # Keep the last known score, as the feedback message promises.
            return (self._last_score, f"Grader error (score preserved): {exc}")

        score = round(min(max(score, 0.0), 1.0), 2)
        return (score, " | ".join(feedback_parts))