DEVessi committed on
Commit
e9c3076
·
verified ·
1 Parent(s): cd601a6

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +39 -51
  2. inference.py +203 -0
  3. server/Dockerfile +80 -0
  4. server/devops_sandbox_environment.py +142 -206
Dockerfile CHANGED
@@ -1,81 +1,69 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
 
7
- # Multi-stage build using openenv-base
8
- # This Dockerfile is flexible and works for both:
9
- # - In-repo environments (with local OpenEnv sources)
10
- # - Standalone environments (with openenv from PyPI/Git)
11
- # The build script (openenv build) handles context detection and sets appropriate build args.
12
 
13
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
- FROM ${BASE_IMAGE} AS builder
15
-
16
- WORKDIR /app
17
-
18
- # Ensure git is available (required for installing dependencies from VCS)
19
  RUN apt-get update && \
20
- apt-get install -y --no-install-recommends git && \
 
 
21
  rm -rf /var/lib/apt/lists/*
22
 
23
- # Build argument to control whether we're building standalone or in-repo
24
- ARG BUILD_MODE=in-repo
25
- ARG ENV_NAME=devops_sandbox
26
 
27
- # Copy environment code (always at root of build context)
28
- COPY . /app/env
 
29
 
30
- # For in-repo builds, openenv is already vendored in the build context
31
- # For standalone builds, openenv will be installed via pyproject.toml
32
- WORKDIR /app/env
33
-
34
- # Ensure uv is available (for local builds where base image lacks it)
35
  RUN if ! command -v uv >/dev/null 2>&1; then \
36
  curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
  mv /root/.local/bin/uv /usr/local/bin/uv && \
38
  mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
  fi
40
-
41
  # Install dependencies using uv sync
42
- # If uv.lock exists, use it; otherwise resolve on the fly
43
  RUN --mount=type=cache,target=/root/.cache/uv \
44
  if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
- else \
47
  uv sync --no-install-project --no-editable; \
48
- fi
49
-
50
- RUN --mount=type=cache,target=/root/.cache/uv \
51
- if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
  else \
54
  uv sync --no-editable; \
55
  fi
56
 
57
  # Final runtime stage
58
- FROM ${BASE_IMAGE}
 
 
59
 
60
- WORKDIR /app
 
 
 
 
 
 
 
61
 
62
  # Copy the virtual environment from builder
63
- COPY --from=builder /app/env/.venv /app/.venv
64
 
65
  # Copy the environment code
66
- COPY --from=builder /app/env /app/env
67
-
68
- # Set PATH to use the virtual environment
69
- ENV PATH="/app/.venv/bin:$PATH"
70
 
71
- # Set PYTHONPATH so imports work correctly
72
- ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
 
 
 
73
 
74
- # Health check
75
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
- CMD curl -f http://localhost:8000/health || exit 1
77
 
78
- # Run the FastAPI server
79
- # The module path is constructed to work with the /app/env structure
80
  ENV ENABLE_WEB_INTERFACE=true
81
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
 
 
1
+ # Hugging Face Spaces Dockerfile for Self-Healing DevOps Sandbox
2
+ FROM ghcr.io/meta-pytorch/openenv-base:latest AS builder
 
 
 
3
 
4
+ USER root
 
 
 
 
5
 
6
+ # Install Node.js 20 & utilities needed by the sandbox
 
 
 
 
 
7
  RUN apt-get update && \
8
+ apt-get install -y --no-install-recommends git curl bash && \
9
+ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
10
+ apt-get install -y nodejs && \
11
  rm -rf /var/lib/apt/lists/*
12
 
13
+ WORKDIR /server_app
 
 
14
 
15
+ # Copy environment code
16
+ COPY . /server_app/env
17
+ WORKDIR /server_app/env
18
 
19
+ # Ensure uv is available
 
 
 
 
20
  RUN if ! command -v uv >/dev/null 2>&1; then \
21
  curl -LsSf https://astral.sh/uv/install.sh | sh && \
22
  mv /root/.local/bin/uv /usr/local/bin/uv && \
23
  mv /root/.local/bin/uvx /usr/local/bin/uvx; \
24
  fi
25
+
26
  # Install dependencies using uv sync
 
27
  RUN --mount=type=cache,target=/root/.cache/uv \
28
  if [ -f uv.lock ]; then \
 
 
29
  uv sync --no-install-project --no-editable; \
 
 
 
 
 
30
  else \
31
  uv sync --no-editable; \
32
  fi
33
 
34
  # Final runtime stage
35
+ FROM ghcr.io/meta-pytorch/openenv-base:latest
36
+
37
+ USER root
38
 
39
+ # Re-install Node.js 20 in the runtime image
40
+ RUN apt-get update && \
41
+ apt-get install -y --no-install-recommends git curl bash psmisc && \
42
+ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
43
+ apt-get install -y nodejs && \
44
+ rm -rf /var/lib/apt/lists/*
45
+
46
+ WORKDIR /server_app
47
 
48
  # Copy the virtual environment from builder
49
+ COPY --from=builder /server_app/env/.venv /server_app/.venv
50
 
51
  # Copy the environment code
52
+ COPY --from=builder /server_app/env /server_app/env
 
 
 
53
 
54
+ # Ensure the backup folder for reset() is available and perfectly clean
55
+ # The environment script will do `cp -r /app_backup/* /app/` during every reset
56
+ RUN mkdir -p /app_backup && \
57
+ cp -r /server_app/env/simulated_app/* /app_backup/ && \
58
+ mkdir -p /app && \
59
+ chmod -R 777 /app /app_backup
60
 
61
+ # Export Paths
62
+ ENV PATH="/server_app/.venv/bin:$PATH"
63
+ ENV PYTHONPATH="/server_app/env:$PYTHONPATH"
64
 
65
+ # Run the FastAPI server on port 7860 for HuggingFace Spaces
 
66
  ENV ENABLE_WEB_INTERFACE=true
67
+ EXPOSE 7860
68
+
69
+ CMD ["sh", "-c", "cd /server_app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
inference.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ """
9
+ Baseline inference script for the Self-Healing DevOps Sandbox.
10
+
11
+ Uses an LLM (via the OpenAI-compatible API) to diagnose and fix a broken
12
+ Node.js backend running inside a Docker container.
13
+
14
+ Usage:
15
+ export OPENAI_API_KEY="sk-..."
16
+ python inference.py
17
+
18
+ # Or with a custom endpoint (e.g., local vLLM):
19
+ export OPENAI_BASE_URL="http://localhost:8080/v1"
20
+ python inference.py
21
+ """
22
+
23
+ import json
24
+ import os
25
+ import sys
26
+
27
+ try:
28
+ from openai import OpenAI
29
+ except ImportError:
30
+ print("ERROR: 'openai' package is required. Install with: pip install openai")
31
+ sys.exit(1)
32
+
33
+ from devops_sandbox import BashAction, DevopsSandboxEnv
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Configuration
37
+ # ---------------------------------------------------------------------------
38
+ ENV_URL = os.getenv("DEVOPS_SANDBOX_URL", "http://localhost:8000")
39
+ MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
40
+ MAX_TURNS = int(os.getenv("MAX_TURNS", "30"))
41
+
42
+ SYSTEM_PROMPT = """\
43
+ You are an expert DevOps engineer and Node.js developer.
44
+
45
+ You have been dropped into a Linux container with a broken Express.js backend in /app.
46
+ Your goal is to diagnose and fix ALL bugs so the app runs correctly.
47
+
48
+ RULES:
49
+ 1. Respond ONLY with a JSON object: {"command": "<bash command>"}
50
+ 2. Use standard bash/Linux commands (ls, cat, grep, sed, node, npm, etc.)
51
+ 3. Do NOT use interactive editors (vi, nano). Use sed or echo/cat with redirection.
52
+ 4. After fixing bugs, restart the app with: cd /app && npm start &
53
+ 5. Be methodical: read files first, understand the bug, then fix it.
54
+
55
+ EXPECTED FINAL STATE:
56
+ - App starts without errors on port 3000
57
+ - GET /health → 200
58
+ - GET /api/users → 200 with JSON containing "users" array
59
+ - GET /api/data → 200 with JSON containing "records" array
60
+ """
61
+
62
+
63
+ def extract_command(llm_response: str) -> str:
64
+ """Extract a bash command from the LLM's response (JSON or raw text)."""
65
+ # Try JSON parsing first
66
+ try:
67
+ data = json.loads(llm_response.strip())
68
+ if isinstance(data, dict) and "command" in data:
69
+ return data["command"]
70
+ except (json.JSONDecodeError, TypeError):
71
+ pass
72
+
73
+ # Try extracting from markdown code block
74
+ if "```" in llm_response:
75
+ lines = llm_response.split("```")
76
+ for block in lines[1::2]: # odd indices are code blocks
77
+ code = block.strip()
78
+ if code.startswith("json"):
79
+ code = code[4:].strip()
80
+ try:
81
+ data = json.loads(code)
82
+ if isinstance(data, dict) and "command" in data:
83
+ return data["command"]
84
+ except (json.JSONDecodeError, TypeError):
85
+ pass
86
+ elif code.startswith("bash") or code.startswith("sh"):
87
+ code = code.split("\n", 1)[-1].strip()
88
+ return code
89
+ else:
90
+ first_line = code.split("\n")[0].strip()
91
+ if first_line:
92
+ return first_line
93
+
94
+ # Fallback: treat entire response as a command
95
+ cmd = llm_response.strip().strip("`").strip()
96
+ if cmd.startswith("{"):
97
+ # One more try
98
+ try:
99
+ return json.loads(cmd)["command"]
100
+ except Exception:
101
+ pass
102
+ return cmd
103
+
104
+
105
+ def main():
106
+ print("=" * 60)
107
+ print(" Self-Healing DevOps Sandbox — Baseline Agent")
108
+ print("=" * 60)
109
+
110
+ client = OpenAI()
111
+
112
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
113
+
114
+ with DevopsSandboxEnv(base_url=ENV_URL).sync() as env:
115
+ # Reset the environment
116
+ print("\n[*] Resetting environment...")
117
+ result = env.reset()
118
+ obs = result.observation
119
+
120
+ print(f"\n[INIT] Task prompt:\n{obs.stdout[:500]}...")
121
+ print(f"[INIT] Score: {obs.grader_score} | Feedback: {obs.grader_feedback}")
122
+
123
+ # Add initial observation to messages
124
+ messages.append({
125
+ "role": "user",
126
+ "content": (
127
+ f"Here is the initial state of the broken app:\n\n"
128
+ f"```\n{obs.stdout}\n```\n\n"
129
+ f"Current directory: {obs.current_dir}\n"
130
+ f"Score: {obs.grader_score}/1.0\n\n"
131
+ f"What bash command should I run first?"
132
+ ),
133
+ })
134
+
135
+ for turn in range(1, MAX_TURNS + 1):
136
+ print(f"\n{'─' * 40}")
137
+ print(f"Turn {turn}/{MAX_TURNS}")
138
+ print(f"{'─' * 40}")
139
+
140
+ # Get LLM response
141
+ try:
142
+ response = client.chat.completions.create(
143
+ model=MODEL,
144
+ messages=messages,
145
+ temperature=0.2,
146
+ max_tokens=256,
147
+ )
148
+ llm_text = response.choices[0].message.content or ""
149
+ except Exception as e:
150
+ print(f"[ERROR] LLM call failed: {e}")
151
+ break
152
+
153
+ # Extract command
154
+ command = extract_command(llm_text)
155
+ if not command:
156
+ print("[WARN] Could not extract command from LLM response")
157
+ command = "ls -la /app"
158
+
159
+ print(f"[CMD] {command}")
160
+
161
+ # Execute in environment
162
+ result = env.step(BashAction(command=command))
163
+ obs = result.observation
164
+
165
+ stdout_preview = obs.stdout[:300] if obs.stdout else "(empty)"
166
+ stderr_preview = obs.stderr[:200] if obs.stderr else "(none)"
167
+ print(f"[OUT] {stdout_preview}")
168
+ if obs.stderr:
169
+ print(f"[ERR] {stderr_preview}")
170
+ print(f"[SCORE] {obs.grader_score:.2f} | {obs.grader_feedback}")
171
+
172
+ # Add to conversation
173
+ messages.append({"role": "assistant", "content": llm_text})
174
+ messages.append({
175
+ "role": "user",
176
+ "content": (
177
+ f"Command output:\n"
178
+ f"stdout:\n```\n{obs.stdout}\n```\n"
179
+ f"stderr:\n```\n{obs.stderr}\n```\n"
180
+ f"Current score: {obs.grader_score}/1.0\n"
181
+ f"Grader feedback: {obs.grader_feedback}\n\n"
182
+ f"What command should I run next?"
183
+ ),
184
+ })
185
+
186
+ # Check if done
187
+ if result.done:
188
+ print(f"\n{'=' * 60}")
189
+ if obs.grader_score >= 1.0:
190
+ print(" ✅ ALL BUGS FIXED — PERFECT SCORE!")
191
+ else:
192
+ print(f" Episode ended. Final score: {obs.grader_score:.2f}/1.0")
193
+ print(f"{'=' * 60}")
194
+ break
195
+ else:
196
+ print(f"\n[!] Max turns ({MAX_TURNS}) reached.")
197
+ print(f" Final score: {obs.grader_score:.2f}/1.0")
198
+
199
+ print("\n[*] Done.")
200
+
201
+
202
+ if __name__ == "__main__":
203
+ main()
server/Dockerfile ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=devops_sandbox
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
+
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/devops_sandbox_environment.py CHANGED
@@ -7,15 +7,15 @@
7
  """
8
  Self-Healing DevOps Sandbox — Environment Implementation.
9
 
10
- Spins up an isolated Docker container with a broken Node.js backend.
11
- The RL agent executes bash commands to diagnose and fix 3 bugs.
12
- A programmatic grader awards partial credit (0.0 → 1.0) after every step.
13
  """
14
 
15
  import logging
16
  import os
 
17
  import subprocess
18
- import time
19
  from pathlib import Path
20
  from typing import Any, Optional
21
  from uuid import uuid4
@@ -33,71 +33,64 @@ logger = logging.getLogger(__name__)
33
  # ---------------------------------------------------------------------------
34
  # Constants
35
  # ---------------------------------------------------------------------------
36
- CONTAINER_NAME_PREFIX = "devops_sandbox_"
37
- IMAGE_NAME = "devops-sandbox-node:latest"
38
  EXPECTED_PORT = 3000 # The port the fixed app should listen on
39
  MAX_STEPS = 50 # Episode budget
40
  SIMULATED_APP_DIR = Path(__file__).resolve().parent.parent / "simulated_app"
41
 
42
-
43
  class DevOpsSandbox(Environment):
44
  """
45
- RL environment: fix a broken Node.js backend inside a Docker container.
46
-
47
- reset() build image (if needed) + start container + return initial obs
48
- step() → docker exec the agent's command + run grader → obs + reward
49
- close() → tear down container
50
  """
51
 
52
  SUPPORTS_CONCURRENT_SESSIONS: bool = False
53
 
54
- # ------------------------------------------------------------------
55
- # Lifecycle
56
- # ------------------------------------------------------------------
57
  def __init__(self):
58
  super().__init__()
59
  self._state = State(episode_id=str(uuid4()), step_count=0)
60
- self._container_name: Optional[str] = None
61
- self._container_running: bool = False
62
  self._current_dir: str = "/app"
63
  self._last_score: float = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # ------------------------------------------------------------------
66
- # reset
67
- # ------------------------------------------------------------------
68
  def reset(
69
  self,
70
  seed: Optional[int] = None,
71
  episode_id: Optional[str] = None,
72
  **kwargs: Any,
73
  ) -> TerminalObservation:
74
- """Build the Docker image, start the container, return the task prompt."""
75
- # Cleanup previous episode
76
- self._cleanup_container()
77
-
78
- # New episode
79
  eid = episode_id or str(uuid4())
80
  self._state = State(episode_id=eid, step_count=0)
81
  self._last_score = 0.0
82
- self._current_dir = "/app"
83
-
84
- # Build image (idempotent — Docker caches layers)
85
- self._build_image()
86
 
87
- # Start container
88
- self._container_name = f"{CONTAINER_NAME_PREFIX}{eid[:8]}"
89
- self._start_container()
90
-
91
- # Inject the grader script into the container
92
  self._inject_grader_script()
93
 
94
  # Gather initial observation
95
- init_stdout = self._docker_exec("ls -la /app && echo '---' && cat /app/config.json")
96
 
97
  task_prompt = (
98
  "=== SELF-HEALING DEVOPS SANDBOX ===\n"
99
- "You have been dropped into a Docker container with a broken Node.js "
100
- "Express backend in /app.\n\n"
101
  "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
102
  " 1. The app starts without errors on port 3000\n"
103
  " 2. GET /health returns HTTP 200\n"
@@ -124,31 +117,15 @@ class DevOpsSandbox(Environment):
124
  reward=0.0,
125
  )
126
 
127
- # ------------------------------------------------------------------
128
- # step
129
- # ------------------------------------------------------------------
130
  def step(
131
  self,
132
  action: BashAction, # type: ignore[override]
133
  timeout_s: Optional[float] = None,
134
  **kwargs: Any,
135
  ) -> TerminalObservation:
136
- """Execute the agent's bash command, run grader, return observation."""
137
  self._state.step_count += 1
138
 
139
- if not self._container_running:
140
- return TerminalObservation(
141
- stdout="",
142
- stderr="ERROR: Container is not running. Call reset() first.",
143
- current_dir=self._current_dir,
144
- task_id="devops_sandbox",
145
- grader_score=0.0,
146
- grader_feedback="Container not running.",
147
- done=True,
148
- reward=0.0,
149
- )
150
-
151
- # Execute the command
152
  command = action.command.strip()
153
  if not command:
154
  return TerminalObservation(
@@ -162,16 +139,49 @@ class DevOpsSandbox(Environment):
162
  reward=self._last_score,
163
  )
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  try:
166
  timeout = timeout_s or 30.0
167
- stdout, stderr = self._docker_exec_split(command, timeout=timeout)
168
  except Exception as e:
169
  stdout, stderr = "", f"Command execution error: {e}"
170
 
171
- # Run the grader
172
  score, feedback = self._grade()
173
  self._last_score = score
174
-
175
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
176
 
177
  return TerminalObservation(
@@ -185,29 +195,71 @@ class DevOpsSandbox(Environment):
185
  reward=score,
186
  )
187
 
188
- # ------------------------------------------------------------------
189
- # state
190
- # ------------------------------------------------------------------
191
  @property
192
  def state(self) -> State:
193
  return self._state
194
 
195
- # ------------------------------------------------------------------
196
- # close
197
- # ------------------------------------------------------------------
198
  def close(self) -> None:
199
- self._cleanup_container()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  # ==================================================================
202
- # GRADER — partial reward (0.0 → 1.0)
203
- # The grader script is injected as a file into the container at
204
- # reset() time, then executed via `bash /tmp/grader.sh` to avoid
205
- # Windows subprocess escaping issues with complex bash scripts.
206
  # ==================================================================
207
  def _inject_grader_script(self) -> None:
208
- """Write the grader bash script into the container as /tmp/grader.sh."""
209
- # Use a heredoc via docker exec to write the file
210
- # We write it line-by-line to avoid any escaping issues
211
  lines = [
212
  '#!/bin/bash',
213
  'set -m',
@@ -215,8 +267,8 @@ class DevOpsSandbox(Environment):
215
  'pkill -f "node server.js" 2>/dev/null',
216
  'sleep 0.5',
217
  '',
218
- 'cd /app',
219
- 'node server.js > /tmp/node.log 2>&1 &',
220
  'NODE_PID=$!',
221
  '',
222
  'for i in 1 2 3 4; do',
@@ -226,13 +278,13 @@ class DevOpsSandbox(Environment):
226
  ' fi',
227
  'done',
228
  '',
229
- 'STARTUP_LOG=$(cat /tmp/node.log 2>/dev/null)',
230
  '',
231
- "HEALTH_CODE=$(curl -s -o /tmp/health.json -w '%{http_code}' http://localhost:3000/health 2>/dev/null)",
232
- "USERS_CODE=$(curl -s -o /tmp/users.json -w '%{http_code}' http://localhost:3000/api/users 2>/dev/null)",
233
- "DATA_CODE=$(curl -s -o /tmp/data.json -w '%{http_code}' http://localhost:3000/api/data 2>/dev/null)",
234
- 'USERS_BODY=$(cat /tmp/users.json 2>/dev/null)',
235
- 'DATA_BODY=$(cat /tmp/data.json 2>/dev/null)',
236
  '',
237
  'kill $NODE_PID 2>/dev/null',
238
  'wait $NODE_PID 2>/dev/null',
@@ -244,39 +296,26 @@ class DevOpsSandbox(Environment):
244
  'echo "GRADER_USERS_BODY:${USERS_BODY}"',
245
  'echo "GRADER_DATA_BODY:${DATA_BODY}"',
246
  ]
 
247
  script_content = '\n'.join(lines) + '\n'
248
-
249
- # Write via docker cp using a temp file on the host
250
- import tempfile
251
- with tempfile.NamedTemporaryFile(
252
- mode='w', suffix='.sh', delete=False, newline='\n'
253
- ) as f:
254
  f.write(script_content)
255
- tmp_path = f.name
256
-
257
- try:
258
- subprocess.run(
259
- ["docker", "cp", tmp_path, f"{self._container_name}:/tmp/grader.sh"],
260
- check=True,
261
- capture_output=True,
262
- timeout=10,
263
- )
264
- self._docker_exec("chmod +x /tmp/grader.sh")
265
- finally:
266
- os.unlink(tmp_path)
267
 
268
  def _grade(self) -> tuple:
269
- """
270
- Run the grader script inside the container.
271
- Returns (score: float, feedback: str).
272
- """
273
  score = 0.0
274
  feedback_parts = []
275
 
276
  try:
277
- raw = self._docker_exec("bash /tmp/grader.sh", timeout=20.0)
 
 
 
 
 
278
 
279
- # Parse structured output
280
  results = {}
281
  for line in raw.splitlines():
282
  if line.startswith("GRADER_"):
@@ -290,7 +329,6 @@ class DevOpsSandbox(Environment):
290
  users_body = results.get("GRADER_USERS_BODY", "")
291
  data_body = results.get("GRADER_DATA_BODY", "")
292
 
293
- # --- Check 1: App starts on correct port ---
294
  has_syntax_error = "SyntaxError" in startup_log
295
  has_crash = (has_syntax_error
296
  or "Cannot find module" in startup_log
@@ -310,14 +348,12 @@ class DevOpsSandbox(Environment):
310
  feedback_parts.append("✗ App not listening on port 3000")
311
  return (score, " | ".join(feedback_parts))
312
 
313
- # --- Check 2: /health ---
314
  if health_code == "200":
315
  score += 0.10
316
  feedback_parts.append("✓ /health returns 200 (+0.10)")
317
  else:
318
  feedback_parts.append(f"✗ /health returned {health_code}")
319
 
320
- # --- Check 3: /api/users ---
321
  if users_code == "200":
322
  if '"users"' in users_body:
323
  score += 0.15
@@ -328,7 +364,6 @@ class DevOpsSandbox(Environment):
328
  else:
329
  feedback_parts.append(f"✗ /api/users returned {users_code}")
330
 
331
- # --- Check 4: /api/data ---
332
  if data_code == "200":
333
  if '"records"' in data_body:
334
  score += 0.25
@@ -339,7 +374,6 @@ class DevOpsSandbox(Environment):
339
  else:
340
  feedback_parts.append(f"✗ /api/data returned {data_code}")
341
 
342
- # --- Check 5: all endpoints correct ---
343
  if score >= 0.85:
344
  score = min(score + 0.15, 1.0)
345
  feedback_parts.append("✓ All endpoints healthy — FULL SCORE (+0.15)")
@@ -350,101 +384,3 @@ class DevOpsSandbox(Environment):
350
 
351
  score = round(min(max(score, 0.0), 1.0), 2)
352
  return (score, " | ".join(feedback_parts))
353
-
354
- # ==================================================================
355
- # DOCKER HELPERS
356
- # ==================================================================
357
- def _build_image(self) -> None:
358
- """Build the sandbox Docker image from simulated_app/."""
359
- try:
360
- logger.info("Building Docker image %s …", IMAGE_NAME)
361
- subprocess.run(
362
- ["docker", "build", "-t", IMAGE_NAME, "."],
363
- cwd=str(SIMULATED_APP_DIR),
364
- check=True,
365
- capture_output=True,
366
- timeout=120,
367
- )
368
- logger.info("Docker image built successfully.")
369
- except subprocess.CalledProcessError as e:
370
- logger.error("Docker build failed: %s", e.stderr.decode(errors="replace"))
371
- raise RuntimeError(f"Docker build failed: {e.stderr.decode(errors='replace')}") from e
372
- except FileNotFoundError:
373
- raise RuntimeError(
374
- "Docker CLI not found. Ensure Docker is installed and on PATH."
375
- )
376
-
377
- def _start_container(self) -> None:
378
- """Run the sandbox container in detached mode."""
379
- try:
380
- # Remove stale container with same name
381
- subprocess.run(
382
- ["docker", "rm", "-f", self._container_name],
383
- capture_output=True,
384
- timeout=10,
385
- )
386
- subprocess.run(
387
- [
388
- "docker", "run", "-d",
389
- "--init",
390
- "--name", self._container_name,
391
- IMAGE_NAME,
392
- ],
393
- check=True,
394
- capture_output=True,
395
- timeout=30,
396
- )
397
- self._container_running = True
398
- logger.info("Container %s started.", self._container_name)
399
- except subprocess.CalledProcessError as e:
400
- raise RuntimeError(
401
- f"Failed to start container: {e.stderr.decode(errors='replace')}"
402
- ) from e
403
-
404
- def _docker_exec(self, cmd: str, timeout: float = 30.0) -> str:
405
- """Execute a command inside the running container and return combined output."""
406
- try:
407
- result = subprocess.run(
408
- ["docker", "exec", self._container_name, "bash", "-c", cmd],
409
- capture_output=True,
410
- timeout=timeout,
411
- )
412
- out = result.stdout.decode(errors="replace")
413
- err = result.stderr.decode(errors="replace")
414
- return (out + err).strip()
415
- except subprocess.TimeoutExpired:
416
- return "[command timed out]"
417
- except Exception as e:
418
- return f"[docker exec error: {e}]"
419
-
420
- def _docker_exec_split(self, cmd: str, timeout: float = 30.0) -> tuple:
421
- """Execute command; return (stdout, stderr) separately."""
422
- try:
423
- result = subprocess.run(
424
- ["docker", "exec", self._container_name, "bash", "-c", cmd],
425
- capture_output=True,
426
- timeout=timeout,
427
- )
428
- return (
429
- result.stdout.decode(errors="replace"),
430
- result.stderr.decode(errors="replace"),
431
- )
432
- except subprocess.TimeoutExpired:
433
- return ("", "[command timed out]")
434
- except Exception as e:
435
- return ("", f"[docker exec error: {e}]")
436
-
437
- def _cleanup_container(self) -> None:
438
- """Stop and remove the container if it exists."""
439
- if self._container_name:
440
- try:
441
- subprocess.run(
442
- ["docker", "rm", "-f", self._container_name],
443
- capture_output=True,
444
- timeout=15,
445
- )
446
- logger.info("Container %s removed.", self._container_name)
447
- except Exception:
448
- pass
449
- self._container_running = False
450
- self._container_name = None
 
7
  """
8
  Self-Healing DevOps Sandbox — Environment Implementation.
9
 
10
+ Runs entirely natively on the host filesystem (Hugging Face Spaces compatible).
11
+ The RL agent executes bash commands to diagnose and fix 3 bugs via direct subprocesses.
 
12
  """
13
 
14
  import logging
15
  import os
16
+ import shutil
17
  import subprocess
18
+ import sys
19
  from pathlib import Path
20
  from typing import Any, Optional
21
  from uuid import uuid4
 
33
  # ---------------------------------------------------------------------------
34
  # Constants
35
  # ---------------------------------------------------------------------------
 
 
36
  EXPECTED_PORT = 3000 # The port the fixed app should listen on
37
  MAX_STEPS = 50 # Episode budget
38
  SIMULATED_APP_DIR = Path(__file__).resolve().parent.parent / "simulated_app"
39
 
 
40
  class DevOpsSandbox(Environment):
41
  """
42
+ RL environment: fix a broken Node.js backend.
43
+ No longer uses Docker (Docker-in-Docker is unsupported in HF Spaces).
44
+ Instead, uses native subprocess.run() in a reset /app/ directory.
 
 
45
  """
46
 
47
  SUPPORTS_CONCURRENT_SESSIONS: bool = False
48
 
 
 
 
49
  def __init__(self):
50
  super().__init__()
51
  self._state = State(episode_id=str(uuid4()), step_count=0)
 
 
52
  self._current_dir: str = "/app"
53
  self._last_score: float = 0.0
54
+
55
+ # When running on Windows locally, `/app` and `/app_backup` don't exist naturally,
56
+ # so we will use absolute paths mapped to our repo if they aren't at root.
57
+ # But for HF Space (Linux), /app will be at root.
58
+ if sys.platform == "win32":
59
+ # For Windows local dev, use safe paths inside the workspace
60
+ workspace = Path(__file__).resolve().parent.parent
61
+ self._app_dir = str(workspace / ".app_sandbox")
62
+ self._app_backup_dir = str(SIMULATED_APP_DIR)
63
+ self._tmp_dir = str(workspace / ".tmp")
64
+ os.makedirs(self._tmp_dir, exist_ok=True)
65
+ self._current_dir = self._app_dir
66
+ else:
67
+ # For Hugging Face Spaces (Linux)
68
+ self._app_dir = "/app"
69
+ self._app_backup_dir = "/app_backup"
70
+ self._tmp_dir = "/tmp"
71
+ self._current_dir = "/app"
72
 
 
 
 
73
  def reset(
74
  self,
75
  seed: Optional[int] = None,
76
  episode_id: Optional[str] = None,
77
  **kwargs: Any,
78
  ) -> TerminalObservation:
79
+ """Reset the environment state by copying the backup to the working dir."""
 
 
 
 
80
  eid = episode_id or str(uuid4())
81
  self._state = State(episode_id=eid, step_count=0)
82
  self._last_score = 0.0
83
+ self._current_dir = self._app_dir
 
 
 
84
 
85
+ self._reset_filesystem()
 
 
 
 
86
  self._inject_grader_script()
87
 
88
  # Gather initial observation
89
+ init_stdout = self._exec_cmd(f"ls -la {self._app_dir} && echo '---' && cat {os.path.join(self._app_dir, 'config.json')}")
90
 
91
  task_prompt = (
92
  "=== SELF-HEALING DEVOPS SANDBOX ===\n"
93
+ f"You have been dropped into a container with a broken Node.js Express backend in {self._app_dir}.\n\n"
 
94
  "YOUR MISSION: Diagnose and fix ALL bugs so that:\n"
95
  " 1. The app starts without errors on port 3000\n"
96
  " 2. GET /health returns HTTP 200\n"
 
117
  reward=0.0,
118
  )
119
 
 
 
 
120
  def step(
121
  self,
122
  action: BashAction, # type: ignore[override]
123
  timeout_s: Optional[float] = None,
124
  **kwargs: Any,
125
  ) -> TerminalObservation:
126
+ """Execute the agent's command natively, run grader, return observation."""
127
  self._state.step_count += 1
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  command = action.command.strip()
130
  if not command:
131
  return TerminalObservation(
 
139
  reward=self._last_score,
140
  )
141
 
142
+ # Handle 'cd' commands manually since subprocess run is transient
143
+ if command.startswith("cd "):
144
+ target = command[3:].strip()
145
+ # Handle standard cd edge cases
146
+ if target == "" or target == "~":
147
+ # Assuming /app is home for this exercise
148
+ new_dir = self._app_dir
149
+ elif target.startswith("/"):
150
+ new_dir = os.path.normpath(target)
151
+ else:
152
+ new_dir = os.path.normpath(os.path.join(self._current_dir, target))
153
+
154
+ if os.path.isdir(new_dir):
155
+ self._current_dir = new_dir
156
+ stdout, stderr = "", ""
157
+ else:
158
+ stdout, stderr = "", f"bash: cd: {target}: No such file or directory"
159
+
160
+ # Run the grader anyway, even if just a cd
161
+ score, feedback = self._grade()
162
+ self._last_score = score
163
+ episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
164
+
165
+ return TerminalObservation(
166
+ stdout=stdout,
167
+ stderr=stderr,
168
+ current_dir=self._current_dir,
169
+ task_id="devops_sandbox",
170
+ grader_score=score,
171
+ grader_feedback=feedback,
172
+ done=episode_done,
173
+ reward=score,
174
+ )
175
+
176
+ # Execute normal command
177
  try:
178
  timeout = timeout_s or 30.0
179
+ stdout, stderr = self._exec_cmd_split(command, timeout=timeout)
180
  except Exception as e:
181
  stdout, stderr = "", f"Command execution error: {e}"
182
 
 
183
  score, feedback = self._grade()
184
  self._last_score = score
 
185
  episode_done = (score >= 1.0) or (self._state.step_count >= MAX_STEPS)
186
 
187
  return TerminalObservation(
 
195
  reward=score,
196
  )
197
 
 
 
 
198
    @property
    def state(self) -> State:
        """Current episode state (episode id and step counter)."""
        return self._state
201
 
 
 
 
202
  def close(self) -> None:
203
+ # pkill node servers that we might have spawned during the session
204
+ self._exec_cmd("pkill -f 'node server.js'")
205
+
206
+ # ==================================================================
207
+ # FILESYSTEM & EXECUTION HELPERS
208
+ # ==================================================================
209
+ def _reset_filesystem(self) -> None:
210
+ """Replace the current working /app with the pristine /app_backup."""
211
+ # Ensure we don't accidentally wipe out the whole host on windows if paths are wrong
212
+ if os.path.exists(self._app_dir):
213
+ shutil.rmtree(self._app_dir, ignore_errors=True)
214
+
215
+ os.makedirs(self._app_dir, exist_ok=True)
216
+
217
+ # Copy from backup to app dir
218
+ if os.path.exists(self._app_backup_dir):
219
+ for item in os.listdir(self._app_backup_dir):
220
+ s = os.path.join(self._app_backup_dir, item)
221
+ d = os.path.join(self._app_dir, item)
222
+ if os.path.isdir(s):
223
+ shutil.copytree(s, d, dirs_exist_ok=True)
224
+ else:
225
+ shutil.copy2(s, d)
226
+ else:
227
+ logger.warning(f"Backup directory {self._app_backup_dir} not found. Ensure Dockerfile copied simulated_app here.")
228
+
229
+ def _exec_cmd(self, cmd: str, timeout: float = 30.0) -> str:
230
+ """Execute command natively; return combined output."""
231
+ stdout, stderr = self._exec_cmd_split(cmd, timeout)
232
+ return (stdout + "\n" + stderr).strip()
233
+
234
+ def _exec_cmd_split(self, cmd: str, timeout: float = 30.0) -> tuple:
235
+ """Execute command natively; return (stdout, stderr)."""
236
+ kwargs = {
237
+ "cwd": self._current_dir,
238
+ "shell": True,
239
+ "capture_output": True,
240
+ "timeout": timeout,
241
+ }
242
+
243
+ # Hugging Face space requires POSIX bash, windows uses powershell/cmd
244
+ if sys.platform != "win32":
245
+ kwargs["executable"] = "/bin/bash"
246
+
247
+ try:
248
+ result = subprocess.run(cmd, **kwargs)
249
+ return (
250
+ result.stdout.decode(errors="replace"),
251
+ result.stderr.decode(errors="replace"),
252
+ )
253
+ except subprocess.TimeoutExpired:
254
+ return ("", "[command timed out]")
255
+ except Exception as e:
256
+ return ("", f"[exec error: {e}]")
257
 
258
  # ==================================================================
259
+ # GRADER
 
 
 
260
  # ==================================================================
261
  def _inject_grader_script(self) -> None:
262
+ self.grader_path = os.path.join(self._tmp_dir, "grader.sh")
 
 
263
  lines = [
264
  '#!/bin/bash',
265
  'set -m',
 
267
  'pkill -f "node server.js" 2>/dev/null',
268
  'sleep 0.5',
269
  '',
270
+ f'cd {self._app_dir}',
271
+ f'node server.js > {self._tmp_dir}/node.log 2>&1 &',
272
  'NODE_PID=$!',
273
  '',
274
  'for i in 1 2 3 4; do',
 
278
  ' fi',
279
  'done',
280
  '',
281
+ f'STARTUP_LOG=$(cat {self._tmp_dir}/node.log 2>/dev/null)',
282
  '',
283
+ f"HEALTH_CODE=$(curl -s -o {self._tmp_dir}/health.json -w '%{{http_code}}' http://localhost:3000/health 2>/dev/null)",
284
+ f"USERS_CODE=$(curl -s -o {self._tmp_dir}/users.json -w '%{{http_code}}' http://localhost:3000/api/users 2>/dev/null)",
285
+ f"DATA_CODE=$(curl -s -o {self._tmp_dir}/data.json -w '%{{http_code}}' http://localhost:3000/api/data 2>/dev/null)",
286
+ f'USERS_BODY=$(cat {self._tmp_dir}/users.json 2>/dev/null)',
287
+ f'DATA_BODY=$(cat {self._tmp_dir}/data.json 2>/dev/null)',
288
  '',
289
  'kill $NODE_PID 2>/dev/null',
290
  'wait $NODE_PID 2>/dev/null',
 
296
  'echo "GRADER_USERS_BODY:${USERS_BODY}"',
297
  'echo "GRADER_DATA_BODY:${DATA_BODY}"',
298
  ]
299
+
300
  script_content = '\n'.join(lines) + '\n'
301
+ with open(self.grader_path, "w", newline='\n') as f:
 
 
 
 
 
302
  f.write(script_content)
303
+
304
+ if sys.platform != "win32":
305
+ subprocess.run(["chmod", "+x", self.grader_path])
 
 
 
 
 
 
 
 
 
306
 
307
  def _grade(self) -> tuple:
 
 
 
 
308
  score = 0.0
309
  feedback_parts = []
310
 
311
  try:
312
+ if sys.platform == "win32":
313
+ # We use bash via wsl or bash.exe on Windows if we can,
314
+ # but if not we might fail grading natively on Windows unless Git Bash is installed.
315
+ raw = self._exec_cmd(f"bash {self.grader_path}", timeout=20.0)
316
+ else:
317
+ raw = self._exec_cmd(f"/bin/bash {self.grader_path}", timeout=20.0)
318
 
 
319
  results = {}
320
  for line in raw.splitlines():
321
  if line.startswith("GRADER_"):
 
329
  users_body = results.get("GRADER_USERS_BODY", "")
330
  data_body = results.get("GRADER_DATA_BODY", "")
331
 
 
332
  has_syntax_error = "SyntaxError" in startup_log
333
  has_crash = (has_syntax_error
334
  or "Cannot find module" in startup_log
 
348
  feedback_parts.append("✗ App not listening on port 3000")
349
  return (score, " | ".join(feedback_parts))
350
 
 
351
  if health_code == "200":
352
  score += 0.10
353
  feedback_parts.append("✓ /health returns 200 (+0.10)")
354
  else:
355
  feedback_parts.append(f"✗ /health returned {health_code}")
356
 
 
357
  if users_code == "200":
358
  if '"users"' in users_body:
359
  score += 0.15
 
364
  else:
365
  feedback_parts.append(f"✗ /api/users returned {users_code}")
366
 
 
367
  if data_code == "200":
368
  if '"records"' in data_body:
369
  score += 0.25
 
374
  else:
375
  feedback_parts.append(f"✗ /api/data returned {data_code}")
376
 
 
377
  if score >= 0.85:
378
  score = min(score + 0.15, 1.0)
379
  feedback_parts.append("✓ All endpoints healthy — FULL SCORE (+0.15)")
 
384
 
385
  score = round(min(max(score, 0.0), 1.0), 2)
386
  return (score, " | ".join(feedback_parts))