# devops_sandbox/server/devops_sandbox_environment.py
# Uploaded by DEVessi using huggingface_hub (commit e9c3076, verified).
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Self-Healing DevOps Sandbox — Environment Implementation.

Runs entirely natively on the host filesystem (Hugging Face Spaces compatible).
The RL agent executes bash commands to diagnose and fix 3 bugs via direct subprocesses.
"""
import logging
import os
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any, Optional
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import BashAction, TerminalObservation
except ImportError:
    from models import BashAction, TerminalObservation
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Port the fixed app should listen on.
EXPECTED_PORT = 3000
# Episode budget: maximum number of steps before the episode is ended.
MAX_STEPS = 50
# "simulated_app" directory located one level above this file's directory.
SIMULATED_APP_DIR = Path(__file__).resolve().parents[1] / "simulated_app"
class DevOpsSandbox(Environment):
    """
    RL environment: fix a broken Node.js backend.

    No longer uses Docker (Docker-in-Docker is unsupported in HF Spaces).
    Instead, commands run natively through ``subprocess`` inside a working
    directory that ``reset()`` re-creates from a pristine backup copy.

    After every non-empty step a bash grader script starts the app, probes
    ``/health``, ``/api/users`` and ``/api/data`` and converts the result
    into a score in ``[0.0, 1.0]`` that is also used as the reward.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    # Task identifier reported in every observation.
    _TASK_ID = "devops_sandbox"

    # Keys echoed by the grader script, in the order it prints them.
    _GRADER_KEYS = (
        "GRADER_STARTUP_LOG",
        "GRADER_HEALTH_CODE",
        "GRADER_USERS_CODE",
        "GRADER_DATA_CODE",
        "GRADER_USERS_BODY",
        "GRADER_DATA_BODY",
    )

    def __init__(self):
        """Set up episode state and the platform-specific sandbox paths."""
        super().__init__()
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_score: float = 0.0

        if sys.platform == "win32":
            # `/app` and `/app_backup` do not exist on a local Windows
            # checkout, so keep everything inside the workspace instead.
            workspace = Path(__file__).resolve().parent.parent
            self._app_dir = str(workspace / ".app_sandbox")
            self._app_backup_dir = str(SIMULATED_APP_DIR)
            self._tmp_dir = str(workspace / ".tmp")
            os.makedirs(self._tmp_dir, exist_ok=True)
        else:
            # Hugging Face Spaces (Linux): fixed root-level directories.
            self._app_dir = "/app"
            self._app_backup_dir = "/app_backup"
            self._tmp_dir = "/tmp"

        self._current_dir: str = self._app_dir
        # Defined up front so that step() before reset() does not hit an
        # AttributeError inside the grader.
        self.grader_path: str = os.path.join(self._tmp_dir, "grader.sh")

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Reset the environment state by copying the backup to the working dir.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Identifier for the new episode; a random UUID is
                generated when omitted.
            **kwargs: Ignored.

        Returns:
            The initial observation: the task prompt followed by a listing of
            the app directory and the contents of its ``config.json``.
        """
        eid = episode_id or str(uuid4())
        self._state = State(episode_id=eid, step_count=0)
        self._last_score = 0.0
        self._current_dir = self._app_dir
        self._reset_filesystem()
        self._inject_grader_script()

        # Gather initial observation
        init_stdout = self._exec_cmd(
            f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}"
        )
        task_prompt = (
            "=== SELF-HEALING DEVOPS SANDBOX ===\n"
            f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
            "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
            f"  1. The app starts without errors on port {EXPECTED_PORT}\n"
            "  2. GET /health returns HTTP 200\n"
            "  3. GET /api/users returns HTTP 200 with valid JSON\n"
            "  4. GET /api/data returns HTTP 200 with valid JSON\n\n"
            "HINTS:\n"
            "  - Check config files for wrong settings\n"
            "  - Look for syntax errors that prevent startup\n"
            "  - Watch out for async/await issues\n\n"
            "Use bash commands to explore, edit files, and test.\n"
            "When you think you've fixed everything, run: npm start\n\n"
            "--- INITIAL DIRECTORY LISTING ---\n"
            f"{init_stdout}\n"
        )
        return self._observe(
            stdout=task_prompt,
            stderr="",
            score=0.0,
            feedback="Episode started. Fix the bugs!",
            done=False,
        )

    def step(
        self,
        action: BashAction,  # type: ignore[override]
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> TerminalObservation:
        """Execute the agent's command natively, run grader, return observation.

        Args:
            action: Action whose ``command`` attribute is the bash command.
            timeout_s: Per-command timeout in seconds; 30 seconds when falsy.
            **kwargs: Ignored.

        Returns:
            Observation with the command output, the current directory and
            the grader's score/feedback. ``done`` is set once the score
            reaches 1.0 or the step budget (``MAX_STEPS``) is used up.
        """
        self._state.step_count += 1
        budget_exhausted = self._state.step_count >= MAX_STEPS
        command = action.command.strip()

        if not command:
            # Empty commands still consume a step, so they must also be able
            # to end the episode; otherwise the budget could be bypassed.
            return self._observe(
                stdout="",
                stderr="Empty command. Please provide a bash command.",
                score=self._last_score,
                feedback="No command executed.",
                done=budget_exhausted,
            )

        # Handle plain 'cd' manually since each subprocess run is transient.
        cd_target = self._parse_cd(command)
        if cd_target is not None:
            stdout, stderr = "", self._change_dir(cd_target)
        else:
            try:
                timeout = timeout_s or 30.0
                stdout, stderr = self._exec_cmd_split(command, timeout=timeout)
            except Exception as e:
                stdout, stderr = "", f"Command execution error: {e}"

        # The grader runs after every command, even a bare cd.
        score, feedback = self._grade()
        self._last_score = score
        return self._observe(
            stdout=stdout,
            stderr=stderr,
            score=score,
            feedback=feedback,
            done=(score >= 1.0) or budget_exhausted,
        )

    @property
    def state(self) -> State:
        """Current episode state (episode id and step counter)."""
        return self._state

    def close(self) -> None:
        """Best-effort cleanup of node servers spawned during the session."""
        self._exec_cmd("pkill -f 'node server.js'")

    # ==================================================================
    # OBSERVATION & cd HELPERS
    # ==================================================================
    def _observe(
        self,
        stdout: str,
        stderr: str,
        score: float,
        feedback: str,
        done: bool,
    ) -> TerminalObservation:
        """Build an observation; the reward always mirrors the grader score."""
        return TerminalObservation(
            stdout=stdout,
            stderr=stderr,
            current_dir=self._current_dir,
            task_id=self._TASK_ID,
            grader_score=score,
            grader_feedback=feedback,
            done=done,
            reward=score,
        )

    @staticmethod
    def _parse_cd(command: str) -> Optional[str]:
        """Return the target of a plain ``cd`` command, else ``None``.

        A bare ``cd`` yields ``""``. Compound commands such as
        ``cd dir && ls`` return ``None`` so that the shell runs them as a
        whole instead of the remainder being mistaken for a directory name.
        One pair of matching surrounding quotes is removed from the target.
        """
        if command == "cd":
            return ""
        if not command.startswith("cd "):
            return None
        target = command[3:].strip()
        if any(ch in target for ch in ";&|\n"):
            return None
        if len(target) >= 2 and target[0] == target[-1] and target[0] in "'\"":
            target = target[1:-1]
        return target

    def _change_dir(self, target: str) -> str:
        """Update the tracked working directory; return the stderr text.

        ``""`` and ``~`` map to the app directory, which acts as home here.
        Returns an empty string on success and a bash-style error message
        when the resolved path is not an existing directory.
        """
        if target in ("", "~"):
            new_dir = self._app_dir
        elif target.startswith("/"):
            new_dir = os.path.normpath(target)
        else:
            new_dir = os.path.normpath(os.path.join(self._current_dir, target))

        if os.path.isdir(new_dir):
            self._current_dir = new_dir
            return ""
        return f"bash: cd: {target}: No such file or directory"

    # ==================================================================
    # FILESYSTEM & EXECUTION HELPERS
    # ==================================================================
    def _reset_filesystem(self) -> None:
        """Replace the current working /app with the pristine /app_backup."""
        if os.path.exists(self._app_dir):
            shutil.rmtree(self._app_dir, ignore_errors=True)
        os.makedirs(self._app_dir, exist_ok=True)

        if not os.path.exists(self._app_backup_dir):
            logger.warning(
                "Backup directory %s not found. Ensure Dockerfile copied simulated_app here.",
                self._app_backup_dir,
            )
            return

        # Copy entry by entry so the app directory itself is never replaced
        # or re-stat'ed, only its contents.
        for item in os.listdir(self._app_backup_dir):
            src = os.path.join(self._app_backup_dir, item)
            dst = os.path.join(self._app_dir, item)
            if os.path.isdir(src):
                shutil.copytree(src, dst, dirs_exist_ok=True)
            else:
                shutil.copy2(src, dst)

    def _exec_cmd(self, cmd: str, timeout: float = 30.0) -> str:
        """Execute command natively; return combined output."""
        stdout, stderr = self._exec_cmd_split(cmd, timeout)
        return (stdout + "\n" + stderr).strip()

    @staticmethod
    def _decode(data: Any) -> str:
        """Decode captured process output, tolerating ``None`` and ``str``."""
        if data is None:
            return ""
        if isinstance(data, bytes):
            return data.decode(errors="replace")
        return str(data)

    def _exec_cmd_split(self, cmd: str, timeout: float = 30.0) -> tuple:
        """Execute command natively; return (stdout, stderr).

        Never raises: a timeout or any other failure is reported through the
        returned stderr text. On timeout, whatever output was captured before
        the deadline is kept, so long-running commands such as ``npm start``
        still show their logs, followed by ``[command timed out]``.
        """
        kwargs = {
            "cwd": self._current_dir,
            "shell": True,
            "capture_output": True,
            "timeout": timeout,
        }
        # Hugging Face space requires POSIX bash, windows uses powershell/cmd
        if sys.platform != "win32":
            kwargs["executable"] = "/bin/bash"

        try:
            result = subprocess.run(cmd, **kwargs)
        except subprocess.TimeoutExpired as exc:
            partial_out = self._decode(exc.stdout)
            partial_err = self._decode(exc.stderr)
            return (partial_out, (partial_err + "\n[command timed out]").strip())
        except Exception as e:
            return ("", f"[exec error: {e}]")
        return (self._decode(result.stdout), self._decode(result.stderr))

    # ==================================================================
    # GRADER
    # ==================================================================
    def _inject_grader_script(self) -> None:
        """Write the bash grader script to ``<tmp_dir>/grader.sh``.

        The script kills any running ``node server.js``, starts the app in
        the background, polls ``/health`` for up to four seconds, queries
        the three endpoints, stops the app and prints one ``GRADER_<KEY>:``
        line per collected value. Paths are shell-quoted because the script
        is always interpreted by bash, where unquoted backslashes or spaces
        would be misread.
        """
        self.grader_path = os.path.join(self._tmp_dir, "grader.sh")

        app_dir = shlex.quote(self._app_dir)
        node_log = shlex.quote(f"{self._tmp_dir}/node.log")
        health_json = shlex.quote(f"{self._tmp_dir}/health.json")
        users_json = shlex.quote(f"{self._tmp_dir}/users.json")
        data_json = shlex.quote(f"{self._tmp_dir}/data.json")
        base_url = f"http://localhost:{EXPECTED_PORT}"

        lines = [
            '#!/bin/bash',
            'set -m',
            '',
            'pkill -f "node server.js" 2>/dev/null',
            'sleep 0.5',
            '',
            f'cd {app_dir}',
            f'node server.js > {node_log} 2>&1 &',
            'NODE_PID=$!',
            '',
            'for i in 1 2 3 4; do',
            '    sleep 1',
            f'    if curl -s {base_url}/health > /dev/null 2>&1; then',
            '        break',
            '    fi',
            'done',
            '',
            f'STARTUP_LOG=$(cat {node_log} 2>/dev/null)',
            '',
            f"HEALTH_CODE=$(curl -s -o {health_json} -w '%{{http_code}}' {base_url}/health 2>/dev/null)",
            f"USERS_CODE=$(curl -s -o {users_json} -w '%{{http_code}}' {base_url}/api/users 2>/dev/null)",
            f"DATA_CODE=$(curl -s -o {data_json} -w '%{{http_code}}' {base_url}/api/data 2>/dev/null)",
            f'USERS_BODY=$(cat {users_json} 2>/dev/null)',
            f'DATA_BODY=$(cat {data_json} 2>/dev/null)',
            '',
            'kill $NODE_PID 2>/dev/null',
            'wait $NODE_PID 2>/dev/null',
            '',
            'echo "GRADER_STARTUP_LOG:${STARTUP_LOG}"',
            'echo "GRADER_HEALTH_CODE:${HEALTH_CODE}"',
            'echo "GRADER_USERS_CODE:${USERS_CODE}"',
            'echo "GRADER_DATA_CODE:${DATA_CODE}"',
            'echo "GRADER_USERS_BODY:${USERS_BODY}"',
            'echo "GRADER_DATA_BODY:${DATA_BODY}"',
        ]
        script_content = '\n'.join(lines) + '\n'
        # newline='\n' keeps LF endings on Windows; bash rejects CRLF scripts.
        with open(self.grader_path, "w", newline='\n', encoding="utf-8") as f:
            f.write(script_content)

        if sys.platform != "win32":
            # Best effort: the grader is invoked via `bash <path>` anyway.
            subprocess.run(["chmod", "+x", self.grader_path], check=False)

    @classmethod
    def _parse_grader_output(cls, raw: str) -> dict:
        """Split grader stdout into a ``{GRADER_KEY: value}`` mapping.

        Values may span several lines (the startup log usually does): lines
        that do not start with a known ``GRADER_<KEY>:`` prefix are appended
        to the value of the most recent key instead of being dropped.
        """
        prefixes = tuple(f"{key}:" for key in cls._GRADER_KEYS)
        collected: dict = {}
        current_key = None
        for line in raw.splitlines():
            if line.startswith(prefixes):
                current_key, _, value = line.partition(":")
                collected[current_key] = [value]
            elif current_key is not None:
                collected[current_key].append(line)
        return {key: "\n".join(parts).strip() for key, parts in collected.items()}

    def _grade(self) -> tuple:
        """Run the grader script and turn its report into (score, feedback).

        Scoring: +0.35 when the startup log shows the app listening on the
        expected port, +0.10 for ``/health`` 200, +0.15 for ``/api/users``
        200 with a ``"users"`` key, +0.25 for ``/api/data`` 200 with a
        ``"records"`` key, +0.15 bonus when all of these pass. A 200 with an
        unexpected body earns +0.05. Grading stops with the score so far
        when the app crashes or is not listening.

        Returns:
            ``(score, feedback)`` where ``score`` is clamped to ``[0, 1]`` and
            rounded to two decimals, and ``feedback`` joins the individual
            findings with ``" | "``. Exceptions are logged and reported in
            the feedback rather than raised.
        """
        score = 0.0
        feedback_parts = []
        try:
            # On Windows this relies on a bash (WSL / Git Bash) being on PATH.
            shell = "bash" if sys.platform == "win32" else "/bin/bash"
            # Only stdout is parsed so shell job-control noise on stderr
            # cannot leak into the reported values.
            grader_stdout, _ = self._exec_cmd_split(
                f"{shell} {self.grader_path}", timeout=20.0
            )
            results = self._parse_grader_output(grader_stdout)

            startup_log = results.get("GRADER_STARTUP_LOG", "")
            health_code = results.get("GRADER_HEALTH_CODE", "000")
            users_code = results.get("GRADER_USERS_CODE", "000")
            data_code = results.get("GRADER_DATA_CODE", "000")
            users_body = results.get("GRADER_USERS_BODY", "")
            data_body = results.get("GRADER_DATA_BODY", "")

            has_syntax_error = "SyntaxError" in startup_log
            has_crash = (
                has_syntax_error
                or "Cannot find module" in startup_log
                or "ReferenceError" in startup_log
            )
            app_listening = f"Server running on port {EXPECTED_PORT}" in startup_log

            if has_crash and not app_listening:
                feedback_parts.append("✗ App crashes on startup")
                if has_syntax_error:
                    feedback_parts.append("(SyntaxError detected)")
                return (score, " | ".join(feedback_parts))

            if not app_listening:
                feedback_parts.append(f"✗ App not listening on port {EXPECTED_PORT}")
                return (score, " | ".join(feedback_parts))

            score += 0.35
            feedback_parts.append(f"✓ App starts on port {EXPECTED_PORT} (+0.35)")

            health_ok = health_code == "200"
            if health_ok:
                score += 0.10
                feedback_parts.append("✓ /health returns 200 (+0.10)")
            else:
                feedback_parts.append(f"✗ /health returned {health_code}")

            users_ok = users_code == "200" and '"users"' in users_body
            if users_ok:
                score += 0.15
                feedback_parts.append("✓ /api/users returns valid JSON (+0.15)")
            elif users_code == "200":
                score += 0.05
                feedback_parts.append("~ /api/users 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/users returned {users_code}")

            data_ok = data_code == "200" and '"records"' in data_body
            if data_ok:
                score += 0.25
                feedback_parts.append("✓ /api/data returns valid JSON (+0.25)")
            elif data_code == "200":
                score += 0.05
                feedback_parts.append("~ /api/data 200 but bad body (+0.05)")
            else:
                feedback_parts.append(f"✗ /api/data returned {data_code}")

            # Explicit flags instead of comparing the float sum to 0.85:
            # that total is only reachable when every check passed, and the
            # flags do not depend on floating-point rounding.
            if health_ok and users_ok and data_ok:
                score = min(score + 0.15, 1.0)
                feedback_parts.append("✓ All endpoints healthy — FULL SCORE (+0.15)")
        except Exception as exc:
            logger.exception("Grader error")
            feedback_parts.append(f"Grader error (score preserved): {exc}")

        score = round(min(max(score, 0.0), 1.0), 2)
        return (score, " | ".join(feedback_parts))