Spaces:
Runtime error
Runtime error
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """ | |
| Self-Healing DevOps Sandbox — Environment Implementation. | |
| Runs entirely natively on the host filesystem (Hugging Face Spaces compatible). | |
| The RL agent executes bash commands to diagnose and fix 3 bugs via direct subprocesses. | |
| """ | |
| import logging | |
| import os | |
| import shutil | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Optional | |
| from uuid import uuid4 | |
| from openenv.core.env_server.interfaces import Environment | |
| from openenv.core.env_server.types import State | |
| try: | |
| from ..models import BashAction, TerminalObservation | |
| except ImportError: | |
| from models import BashAction, TerminalObservation | |
| logger = logging.getLogger(__name__) | |
| # --------------------------------------------------------------------------- | |
| # Constants | |
| # --------------------------------------------------------------------------- | |
| # Port number the grader looks for in the app's "Server running on port ..." | |
| # startup log line. | |
| EXPECTED_PORT = 3000 # The port the fixed app should listen on | |
| # Number of step() calls after which an episode is reported as done. | |
| MAX_STEPS = 50 # Episode budget | |
| # "simulated_app" directory located in the parent of this file's directory; | |
| # used as the pristine backup source when running on Windows. | |
| SIMULATED_APP_DIR = Path(__file__).resolve().parent.parent / "simulated_app" | |
class DevOpsSandbox(Environment):
    """RL environment in which an agent repairs a broken Node.js backend.

    Docker is no longer used (Docker-in-Docker is unsupported in HF Spaces).
    Agent commands are executed natively through subprocesses inside an app
    directory that is re-created from a pristine backup on every ``reset()``.
    After each step a bash grader script starts the app, probes its HTTP
    endpoints and the result is converted into a score in ``[0.0, 1.0]``.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    # Identifier reported in every observation.
    _TASK_ID = "devops_sandbox"

    # Markers the grader script prints on stdout, one per probed value.
    _GRADER_KEYS = (
        "GRADER_STARTUP_LOG",
        "GRADER_HEALTH_CODE",
        "GRADER_USERS_CODE",
        "GRADER_DATA_CODE",
        "GRADER_USERS_BODY",
        "GRADER_DATA_BODY",
    )

    # Characters that mean a command is more than a plain ``cd <dir>`` and
    # therefore has to be interpreted by the real shell.
    _SHELL_META = frozenset(";&|<>`$(){}\n")

    def __init__(self):
        """Set up per-session state and choose platform-specific directories."""
        super().__init__()
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_score: float = 0.0

        # When running on Windows locally, `/app` and `/app_backup` don't
        # exist naturally, so paths inside the repository are used instead.
        # On a Hugging Face Space (Linux) the directories live at the root.
        if sys.platform == "win32":
            workspace = Path(__file__).resolve().parent.parent
            self._app_dir = str(workspace / ".app_sandbox")
            self._app_backup_dir = str(SIMULATED_APP_DIR)
            self._tmp_dir = str(workspace / ".tmp")
            os.makedirs(self._tmp_dir, exist_ok=True)
        else:
            self._app_dir = "/app"
            self._app_backup_dir = "/app_backup"
            self._tmp_dir = "/tmp"

        self._current_dir: str = self._app_dir
        # Defined up front so that grading cannot fail with AttributeError
        # when step() is called before the first reset().
        self.grader_path: str = os.path.join(self._tmp_dir, "grader.sh")

    # ==================================================================
    # PUBLIC ENVIRONMENT API
    # ==================================================================
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Start a new episode from a pristine copy of the broken app.

        Args:
            seed: Accepted for interface compatibility; not used.
            episode_id: Identifier for the new episode. A random UUID is
                generated when omitted.
            **kwargs: Ignored.

        Returns:
            The initial observation: the task prompt followed by a listing
            of the app directory and the contents of ``config.json``.
        """
        eid = episode_id or str(uuid4())
        self._state = State(episode_id=eid, step_count=0)
        self._last_score = 0.0
        self._current_dir = self._app_dir
        self._reset_filesystem()
        self._inject_grader_script()

        # Gather initial observation
        init_stdout = self._exec_cmd(
            f"ls -la {self._app_dir} && echo '---' && "
            f"cat {os.path.join(self._app_dir, 'config.json')}"
        )
        task_prompt = (
            "=== SELF-HEALING DEVOPS SANDBOX ===\n"
            f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
            "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
            f" 1. The app starts without errors on port {EXPECTED_PORT}\n"
            " 2. GET /health returns HTTP 200\n"
            " 3. GET /api/users returns HTTP 200 with valid JSON\n"
            " 4. GET /api/data returns HTTP 200 with valid JSON\n\n"
            "HINTS:\n"
            " - Check config files for wrong settings\n"
            " - Look for syntax errors that prevent startup\n"
            " - Watch out for async/await issues\n\n"
            "Use bash commands to explore, edit files, and test.\n"
            "When you think you've fixed everything, run: npm start\n\n"
            "--- INITIAL DIRECTORY LISTING ---\n"
            f"{init_stdout}\n"
        )
        return TerminalObservation(
            stdout=task_prompt,
            stderr="",
            current_dir=self._current_dir,
            task_id=self._TASK_ID,
            grader_score=0.0,
            grader_feedback="Episode started. Fix the bugs!",
            done=False,
            reward=0.0,
        )

    def step(
        self,
        action: BashAction,  # type: ignore[override]
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Execute the agent's command, run the grader, return an observation.

        A plain ``cd [dir]`` is emulated in-process because each command runs
        in its own short-lived shell; anything else (including compound
        commands such as ``cd src && ls``) is handed to the shell.

        Args:
            action: Action whose ``command`` attribute is the bash command.
            timeout_s: Per-command timeout in seconds (default 30).
            **kwargs: Ignored.

        Returns:
            Observation carrying the command output, the working directory
            and the grader's score and feedback. ``done`` is set when the
            score reaches 1.0 or the step budget is exhausted.
        """
        self._state.step_count += 1
        command = action.command.strip()

        if not command:
            # Nothing ran, so the grader is skipped; the step still counts
            # against the budget so empty commands cannot extend an episode.
            return TerminalObservation(
                stdout="",
                stderr="Empty command. Please provide a bash command.",
                current_dir=self._current_dir,
                task_id=self._TASK_ID,
                grader_score=self._last_score,
                grader_feedback="No command executed.",
                done=self._state.step_count >= MAX_STEPS,
                reward=self._last_score,
            )

        cd_target = self._parse_cd(command)
        if cd_target is not None:
            stdout, stderr = self._change_dir(cd_target)
        else:
            try:
                stdout, stderr = self._exec_cmd_split(
                    command, timeout=timeout_s or 30.0
                )
            except Exception as e:
                stdout, stderr = "", f"Command execution error: {e}"

        # The grader runs after every command, even after a bare cd.
        return self._graded_observation(stdout, stderr)

    def state(self) -> State:
        """Return the current episode ``State`` object.

        NOTE(review): the OpenEnv ``Environment`` base class may declare
        ``state`` as a property -- confirm whether ``@property`` is needed.
        """
        return self._state

    def close(self) -> None:
        """Kill node servers that might have been spawned during the session."""
        self._exec_cmd("pkill -f 'node server.js'")

    # ==================================================================
    # STEP HELPERS
    # ==================================================================
    def _parse_cd(self, command: str) -> Optional[str]:
        """Return the target of a plain ``cd`` command, else ``None``.

        ``""`` is returned for a bare ``cd``. ``None`` means the command is
        not a simple directory change and must be run by the shell.
        """
        if command == "cd":
            return ""
        if not command.startswith("cd "):
            return None
        if any(ch in self._SHELL_META for ch in command):
            return None
        target = command[3:].strip()
        # Drop one pair of matching surrounding quotes: cd "my dir"
        if len(target) >= 2 and target[0] == target[-1] and target[0] in "'\"":
            target = target[1:-1]
        return target

    def _change_dir(self, target: str) -> tuple:
        """Emulate ``cd``; return ``(stdout, stderr)`` like a shell would."""
        if target in ("", "~"):
            # The app directory is treated as the home directory here.
            new_dir = self._app_dir
        elif target.startswith("~/"):
            new_dir = os.path.normpath(os.path.join(self._app_dir, target[2:]))
        elif target.startswith("/"):
            new_dir = os.path.normpath(target)
        else:
            new_dir = os.path.normpath(os.path.join(self._current_dir, target))

        if os.path.isdir(new_dir):
            self._current_dir = new_dir
            return "", ""
        return "", f"bash: cd: {target}: No such file or directory"

    def _graded_observation(self, stdout: str, stderr: str) -> TerminalObservation:
        """Run the grader and wrap command output plus score in an observation."""
        score, feedback = self._grade()
        self._last_score = score
        episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
        return TerminalObservation(
            stdout=stdout,
            stderr=stderr,
            current_dir=self._current_dir,
            task_id=self._TASK_ID,
            grader_score=score,
            grader_feedback=feedback,
            done=episode_done,
            reward=score,
        )

    # ==================================================================
    # FILESYSTEM & EXECUTION HELPERS
    # ==================================================================
    def _reset_filesystem(self) -> None:
        """Replace the working app directory with the pristine backup.

        Raises:
            RuntimeError: If the app directory resolves to a filesystem
                root, which must never be deleted.
        """
        resolved = os.path.abspath(self._app_dir)
        if os.path.dirname(resolved) == resolved:
            raise RuntimeError(
                f"Refusing to reset: app directory {resolved!r} is a filesystem root"
            )

        if os.path.exists(self._app_dir):
            shutil.rmtree(self._app_dir, ignore_errors=True)
        os.makedirs(self._app_dir, exist_ok=True)

        if os.path.isdir(self._app_backup_dir):
            # dirs_exist_ok: the (empty) destination was just created above.
            shutil.copytree(self._app_backup_dir, self._app_dir, dirs_exist_ok=True)
        else:
            logger.warning(
                "Backup directory %s not found. Ensure Dockerfile copied simulated_app here.",
                self._app_backup_dir,
            )

    def _exec_cmd(self, cmd: str, timeout: float = 30.0) -> str:
        """Execute ``cmd`` natively and return stdout and stderr combined."""
        stdout, stderr = self._exec_cmd_split(cmd, timeout)
        return (stdout + "\n" + stderr).strip()

    def _exec_cmd_split(self, cmd: str, timeout: float = 30.0) -> tuple:
        """Execute ``cmd`` in the current directory; return ``(stdout, stderr)``.

        The command runs through the shell (``/bin/bash`` on POSIX). On POSIX
        it is placed in its own session so that, on timeout, the whole
        process group can be killed. Killing only the shell would leave
        children (e.g. a node server started by ``npm start``) holding the
        output pipes open and block the caller indefinitely.

        Never raises: failures are reported through the stderr string.
        """
        popen_kwargs = {
            "cwd": self._current_dir,
            "shell": True,
            "stdout": subprocess.PIPE,
            "stderr": subprocess.PIPE,
        }
        # Hugging Face Spaces need POSIX bash; Windows uses its default shell.
        if sys.platform != "win32":
            popen_kwargs["executable"] = "/bin/bash"
            popen_kwargs["start_new_session"] = True

        try:
            proc = subprocess.Popen(cmd, **popen_kwargs)
        except Exception as e:
            return ("", f"[exec error: {e}]")

        try:
            out, err = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            self._kill_process_tree(proc)
            try:
                # Collect whatever was written before the kill; bounded in
                # case a process escaped the group and still holds a pipe.
                out, err = proc.communicate(timeout=5.0)
            except Exception:
                out, err = b"", b""
            partial_err = (err or b"").decode(errors="replace")
            return (
                (out or b"").decode(errors="replace"),
                (partial_err + "\n[command timed out]").strip(),
            )
        except Exception as e:
            self._kill_process_tree(proc)
            return ("", f"[exec error: {e}]")

        return (out.decode(errors="replace"), err.decode(errors="replace"))

    @staticmethod
    def _kill_process_tree(proc) -> None:
        """Best-effort kill of ``proc`` and, on POSIX, its process group."""
        if sys.platform != "win32":
            import signal  # local import: only needed on this rare path

            try:
                # start_new_session made the child a group leader, so its
                # pid is also the process-group id.
                os.killpg(proc.pid, signal.SIGKILL)
                return
            except (ProcessLookupError, PermissionError):
                pass
        try:
            proc.kill()
        except OSError:
            pass

    # ==================================================================
    # GRADER
    # ==================================================================
    def _inject_grader_script(self) -> None:
        """Write the bash grader script into the temp directory.

        The script restarts ``node server.js`` in the app directory, waits
        up to four seconds for ``/health`` to answer, probes the three
        endpoints and prints one ``GRADER_<NAME>:<value>`` line per result.
        """
        import shlex  # local import: keeps the file's import block untouched

        self.grader_path = os.path.join(self._tmp_dir, "grader.sh")

        quote = shlex.quote
        app_dir = quote(self._app_dir)
        node_log = quote(f"{self._tmp_dir}/node.log")
        health_json = quote(f"{self._tmp_dir}/health.json")
        users_json = quote(f"{self._tmp_dir}/users.json")
        data_json = quote(f"{self._tmp_dir}/data.json")
        base_url = f"http://localhost:{EXPECTED_PORT}"

        lines = [
            "#!/bin/bash",
            "set -m",
            "",
            'pkill -f "node server.js" 2>/dev/null',
            "sleep 0.5",
            "",
            f"cd {app_dir}",
            f"node server.js > {node_log} 2>&1 &",
            "NODE_PID=$!",
            "",
            "for i in 1 2 3 4; do",
            "  sleep 1",
            f"  if curl -s --max-time 1 {base_url}/health > /dev/null 2>&1; then",
            "    break",
            "  fi",
            "done",
            "",
            f"STARTUP_LOG=$(cat {node_log} 2>/dev/null)",
            "",
            # --max-time keeps a hanging endpoint from eating the whole
            # grader timeout and hiding the results of the other probes.
            f"HEALTH_CODE=$(curl -s --max-time 3 -o {health_json} -w '%{{http_code}}' {base_url}/health 2>/dev/null)",
            f"USERS_CODE=$(curl -s --max-time 3 -o {users_json} -w '%{{http_code}}' {base_url}/api/users 2>/dev/null)",
            f"DATA_CODE=$(curl -s --max-time 3 -o {data_json} -w '%{{http_code}}' {base_url}/api/data 2>/dev/null)",
            f"USERS_BODY=$(cat {users_json} 2>/dev/null)",
            f"DATA_BODY=$(cat {data_json} 2>/dev/null)",
            "",
            "kill $NODE_PID 2>/dev/null",
            "wait $NODE_PID 2>/dev/null",
            "",
            'echo "GRADER_STARTUP_LOG:${STARTUP_LOG}"',
            'echo "GRADER_HEALTH_CODE:${HEALTH_CODE}"',
            'echo "GRADER_USERS_CODE:${USERS_CODE}"',
            'echo "GRADER_DATA_CODE:${DATA_CODE}"',
            'echo "GRADER_USERS_BODY:${USERS_BODY}"',
            'echo "GRADER_DATA_BODY:${DATA_BODY}"',
        ]
        script_content = "\n".join(lines) + "\n"
        # newline="\n": bash rejects CRLF line endings.
        with open(self.grader_path, "w", newline="\n", encoding="utf-8") as f:
            f.write(script_content)
        if sys.platform != "win32":
            os.chmod(self.grader_path, 0o755)

    @classmethod
    def _parse_grader_output(cls, raw: str) -> dict:
        """Parse grader stdout into ``{marker: value}``.

        A value may span several lines (the startup log usually does), so
        every line that does not start a new marker is appended to the
        value of the most recent one.
        """
        results = {}
        current = None
        for line in raw.splitlines():
            key, sep, value = line.partition(":")
            if sep and key in cls._GRADER_KEYS:
                current = key
                results[key] = value
            elif current is not None:
                results[current] += "\n" + line
        return {key: value.strip() for key, value in results.items()}

    def _grade(self) -> tuple:
        """Run the grader script and convert its output into a score.

        Scoring: app listening on the expected port +0.35, ``/health`` 200
        +0.10, ``/api/users`` 200 with a ``"users"`` key +0.15 (0.05 for 200
        with another body), ``/api/data`` 200 with a ``"records"`` key +0.25
        (0.05 otherwise), and +0.15 when every check passes.

        Returns:
            ``(score, feedback)`` with ``score`` in ``[0.0, 1.0]``. If grading
            itself fails, the previous score is returned unchanged.
        """
        # Points are tracked as integer hundredths so the totals are exact.
        points = 0
        feedback_parts = []
        try:
            if not os.path.isfile(self.grader_path):
                self._inject_grader_script()

            # On Windows this relies on a bash.exe (WSL / Git Bash) on PATH.
            interpreter = "bash" if sys.platform == "win32" else "/bin/bash"
            # Only stdout carries GRADER_ lines; stderr noise is ignored.
            stdout, _ = self._exec_cmd_split(
                f'{interpreter} "{self.grader_path}"', timeout=20.0
            )
            results = self._parse_grader_output(stdout)

            startup_log = results.get("GRADER_STARTUP_LOG", "")
            health_code = results.get("GRADER_HEALTH_CODE", "000")
            users_code = results.get("GRADER_USERS_CODE", "000")
            data_code = results.get("GRADER_DATA_CODE", "000")
            users_body = results.get("GRADER_USERS_BODY", "")
            data_body = results.get("GRADER_DATA_BODY", "")

            has_syntax_error = "SyntaxError" in startup_log
            has_crash = (
                has_syntax_error
                or "Cannot find module" in startup_log
                or "ReferenceError" in startup_log
            )
            app_listening = f"Server running on port {EXPECTED_PORT}" in startup_log

            if has_crash and not app_listening:
                feedback_parts.append("✗ App crashes on startup")
                if has_syntax_error:
                    feedback_parts.append("(SyntaxError detected)")
                return (0.0, " | ".join(feedback_parts))

            if not app_listening:
                feedback_parts.append(f"✗ App not listening on port {EXPECTED_PORT}")
                return (0.0, " | ".join(feedback_parts))

            points += 35
            feedback_parts.append(f"✓ App starts on port {EXPECTED_PORT} (+0.35)")

            health_ok = health_code == "200"
            if health_ok:
                points += 10
                feedback_parts.append("✓ /health returns 200 (+0.10)")
            else:
                feedback_parts.append(f"✗ /health returned {health_code}")

            users_ok = users_code == "200" and '"users"' in users_body
            if users_ok:
                points += 15
                feedback_parts.append("✓ /api/users returns valid JSON (+0.15)")
            elif users_code == "200":
                points += 5
                feedback_parts.append("~ /api/users 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/users returned {users_code}")

            data_ok = data_code == "200" and '"records"' in data_body
            if data_ok:
                points += 25
                feedback_parts.append("✓ /api/data returns valid JSON (+0.25)")
            elif data_code == "200":
                points += 5
                feedback_parts.append("~ /api/data 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/data returned {data_code}")

            if health_ok and users_ok and data_ok:
                points += 15
                feedback_parts.append("✓ All endpoints healthy — FULL SCORE (+0.15)")
        except Exception as exc:
            logger.exception("Grader error")
            feedback_parts.append(f"Grader error (score preserved): {exc}")
            return (self._last_score, " | ".join(feedback_parts))

        score = round(min(max(points / 100.0, 0.0), 1.0), 2)
        return (score, " | ".join(feedback_parts))