sergiopaniego HF Staff committed on
Commit
5d897b1
·
verified ·
1 Parent(s): 224946d

Upload folder using huggingface_hub

Browse files
Files changed (10) hide show
  1. Dockerfile +81 -0
  2. README.md +202 -5
  3. __init__.py +18 -0
  4. client.py +75 -0
  5. models.py +58 -0
  6. openenv.yaml +7 -0
  7. pyproject.toml +44 -0
  8. server/__init__.py +12 -0
  9. server/app.py +104 -0
  10. server/tbench2_env_environment.py +694 -0
Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=tbench2_env
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
+
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ ENV ENABLE_WEB_INTERFACE=true
81
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,207 @@
1
  ---
2
- title: Tbench2
3
- emoji: 🐠
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: TB2 Environment Server
3
+ emoji: "🧪"
4
+ colorFrom: blue
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ - terminal-bench-2
13
+ - spaces
14
  ---
15
 
16
+ # TB2 Environment (Terminal-Bench 2)
17
+
18
+ OpenEnv wrapper for [Terminal-Bench 2](https://github.com/laude-institute/terminal-bench-2) tasks. Supports two execution modes:
19
+
20
+ | Mode | Description | Use Case |
21
+ |------|-------------|----------|
22
+ | **Local** | Runs commands in the server process (no Docker) | Hugging Face Spaces, environments without Docker access |
23
+ | **Docker** | Runs each task in its own container | Full TB2.0 fidelity with custom task images |
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from tbench2_env import Tbench2Env, Tbench2Action
29
+
30
+ env = Tbench2Env(base_url="http://localhost:8000")
31
+ result = env.reset(task_id="headless-terminal")
32
+ print(result.observation.instruction)
33
+
34
+ result = env.step(Tbench2Action(action_type="exec", command="ls -la"))
35
+ print(result.observation.output)
36
+
37
+ result = env.step(Tbench2Action(action_type="evaluate"))
38
+ print(result.reward, result.done)
39
+
40
+ env.close()
41
+ ```
42
+
43
+ ## Building the Docker Image
44
+
45
+ Before using the environment, build the Docker image:
46
+
47
+ ```bash
48
+ # From the OpenEnv repository root (the path below follows the in-repo layout)
49
+ docker build -t tbench2-env:latest -f envs/tbench2_env/server/Dockerfile .
50
+ ```
51
+
52
+ ## Environment Details
53
+
54
+ ### Action
55
+ **Tbench2Action**: Controls interaction with the TB2 task session
56
+
57
+ | Field | Type | Default | Description |
58
+ |-------|------|---------|-------------|
59
+ | `action_type` | str | `"exec"` | Action to perform (`exec`, `write`, `view`, `wait`, `kill`, `write_file`, `evaluate`, `close`) |
60
+ | `command` | str | `""` | Shell command or input to send |
61
+ | `session_id` | str \| None | `None` | Session ID for streaming processes |
62
+ | `block` | bool | `True` | Whether to block until command completes |
63
+ | `wait_seconds` | float \| None | `None` | Time to wait (for `wait` action) |
64
+ | `file_path` | str | `""` | File path (for `write_file` action) |
65
+ | `content` | str | `""` | Content to write (for `write_file` action) |
66
+
67
+ ### Observation
68
+ **Tbench2Observation**: Contains the environment response
69
+
70
+ | Field | Type | Description |
71
+ |-------|------|-------------|
72
+ | `instruction` | str | Task instruction/prompt from the TB2 task |
73
+ | `output` | str | Command output (stdout/stderr) |
74
+ | `success` | bool | Whether the action succeeded |
75
+ | `error` | str | Error message if action failed |
76
+ | `task_id` | str | Current task identifier |
77
+ | `task_path` | str | Path to the task directory |
78
+ | `session_id` | str \| None | Session ID for streaming processes |
79
+ | `action_type` | str | The action type that produced this observation |
80
+ | `info` | dict | Additional metadata |
81
+
82
+ ### State
83
+ **Tbench2State**: Server-side state for the task session
84
+
85
+ | Field | Type | Description |
86
+ |-------|------|-------------|
87
+ | `task_id` | str | Current task identifier |
88
+ | `task_path` | str | Path to the task directory |
89
+ | `session_id` | str | Active session ID |
90
+ | `terminal_ready` | bool | Whether the terminal is ready for commands |
91
+ | `last_action_type` | str | Last action type executed |
92
+ | `last_command` | str | Last command executed |
93
+ | `last_output` | str | Output from last command |
94
+
95
+ ## Execution Modes
96
+
97
+ ### Local Mode (Default)
98
+
99
+ Commands execute directly in the server process. Ideal for HF Spaces where Docker-in-Docker is unavailable.
100
+
101
+ ```bash
102
+ # Default - local mode
103
+ python -m tbench2_env.server.app
104
+
105
+ # Or explicitly set mode
106
+ TB2_MODE=local python -m tbench2_env.server.app
107
+ ```
108
+
109
+ **Note:** Local mode ignores Docker images specified in task.toml. Tasks requiring specific runtime environments may fail.
110
+
111
+ ### Docker Mode
112
+
113
+ Each task runs in its own Docker container, using the image specified in the task's `task.toml`:
114
+
115
+ ```bash
116
+ # Enable Docker mode
117
+ TB2_MODE=docker python -m tbench2_env.server.app
118
+ ```
119
+
120
+ **Requirements:**
121
+ - Docker socket mounted at `/var/run/docker.sock`
122
+ - Sufficient disk space for container images
123
+ - Network access to pull images if not cached
124
+
125
+ **Environment Variables for Docker Mode:**
126
+ - `TB2_MODE=docker` - Enable Docker-backed execution
127
+ - Docker socket must be accessible (mounted volume)
128
+
129
+ ## Action Types
130
+
131
+ | Action | Description | Required Fields |
132
+ |--------|-------------|-----------------|
133
+ | `exec` | Run a shell command | `command`, optionally `block`, `session_id` |
134
+ | `write` | Send input to a running session | `session_id`, `command` |
135
+ | `view` | Read pending output | `session_id` |
136
+ | `wait` | Wait for output | `session_id`, optionally `wait_seconds` |
137
+ | `kill` | Terminate a running session | `session_id` |
138
+ | `write_file` | Write content to a file | `file_path`, `content` |
139
+ | `evaluate` | Run pytest tests, return reward | (none) |
140
+ | `close` | Stop and cleanup | (none) |
141
+
142
+ ## Session IDs (Streaming Processes)
143
+
144
+ `session_id` is **only** required when you start a non-blocking process and want to interact with it (`write`, `view`, `wait`, `kill`). For plain `exec` commands, you can omit it.
145
+
146
+ Example (Python):
147
+ ```python
148
+ # Start a long-running process
149
+ env.step(Tbench2Action(action_type="exec", command="python -i", block=False, session_id="sess1"))
150
+
151
+ # Send input to it
152
+ env.step(Tbench2Action(action_type="write", session_id="sess1", command="print(2+2)\n"))
153
+
154
+ # Read its output
155
+ env.step(Tbench2Action(action_type="view", session_id="sess1"))
156
+ ```
157
+
158
+ ## Environment Variables
159
+
160
+ | Variable | Default | Description |
161
+ |----------|---------|-------------|
162
+ | `TB2_MODE` | `local` | Execution mode: `local` or `docker` |
163
+ | `TB2_TASKS_DIR` | (auto-download) | Path to local Terminal-Bench-2 repo checkout |
164
+ | `TB2_OUTPUT_DIR` | `/tmp/tbench2_env_runs` | Directory for session logs and cache |
165
+ | `TB2_CACHE_DIR` | `$TB2_OUTPUT_DIR/repo_cache` | Where to extract TB2 repo |
166
+ | `TB2_REPO_URL` | (GitHub main.zip) | Repo zip URL for auto-download |
167
+
168
+ ## Reward
169
+
170
+ Binary reward on `evaluate` action:
171
+ - `1.0` - All pytest tests pass (exit code 0)
172
+ - `0.0` - Tests fail (non-zero exit code)
173
+
174
+ Intermediate steps return `reward=None`.
175
+
176
+ ## Running the Server
177
+
178
+ ```bash
179
+ # Install dependencies
180
+ uv sync --all-extras
181
+
182
+ # Local mode (default, for Spaces)
183
+ python -m tbench2_env.server.app --port 8000
184
+
185
+ # Docker mode (full TB2.0 compatibility)
186
+ TB2_MODE=docker python -m tbench2_env.server.app --port 8000
187
+
188
+ # With local TB2 repo
189
+ TB2_TASKS_DIR=/path/to/terminal-bench-2 python -m tbench2_env.server.app
190
+ ```
191
+
192
+ ## Project Structure
193
+
194
+ ```
195
+ tbench2_env/
196
+ ├── __init__.py # Module exports (Tbench2Env, Tbench2Action, etc.)
197
+ ├── README.md # This file
198
+ ├── client.py # Tbench2Env client implementation
199
+ ├── models.py # Tbench2Action, Tbench2Observation, Tbench2State
200
+ ├── openenv.yaml # OpenEnv configuration
201
+ ├── pyproject.toml # Package dependencies
202
+ └── server/
203
+ ├── __init__.py # Server exports
204
+ ├── app.py # FastAPI application
205
+ ├── tbench2_env_environment.py # Core environment logic
206
+ └── Dockerfile # Container image definition
207
+ ```
__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env Environment."""
8
+
9
+ from .client import Tbench2Env
10
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
11
+
12
+
13
+ __all__ = [
14
+ "Tbench2Action",
15
+ "Tbench2Observation",
16
+ "Tbench2Env",
17
+ "Tbench2State",
18
+ ]
client.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 Environment Client."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+
14
+ # Support both in-repo and standalone imports
15
+ try:
16
+ # In-repo imports (when running from OpenEnv repository)
17
+ from openenv.core.client_types import StepResult
18
+ from openenv.core.env_client import EnvClient
19
+
20
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
21
+ except ImportError:
22
+ # Standalone imports (when environment is standalone with openenv from pip)
23
+ from openenv.core.client_types import StepResult
24
+ from openenv.core.env_client import EnvClient
25
+
26
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
27
+
28
+
29
class Tbench2Env(EnvClient[Tbench2Action, Tbench2Observation, Tbench2State]):
    """HTTP client for the TB2 environment.

    Converts typed actions into JSON request bodies and converts the server's
    JSON responses back into ``Tbench2Observation`` / ``Tbench2State`` models.
    """

    def _step_payload(self, action: Tbench2Action) -> dict[str, Any]:
        """Serialize ``action`` into the dict sent with a step request."""
        return {
            "action_type": action.action_type,
            "command": action.command,
            "session_id": action.session_id,
            "block": action.block,
            "wait_seconds": action.wait_seconds,
            "file_path": action.file_path,
            "content": action.content,
        }

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[Tbench2Observation]:
        """Build a ``StepResult`` from a reset/step response payload.

        Missing keys fall back to the same defaults the observation model
        declares; ``reward`` and ``done`` are read from the top level of the
        payload and mirrored onto the observation.
        """
        obs_data = payload.get("observation", {})
        observation = Tbench2Observation(
            instruction=obs_data.get("instruction", ""),
            output=obs_data.get("output", ""),
            success=obs_data.get("success", True),
            error=obs_data.get("error", ""),
            task_id=obs_data.get("task_id", ""),
            task_path=obs_data.get("task_path", ""),
            session_id=obs_data.get("session_id"),
            action_type=obs_data.get("action_type", ""),
            info=obs_data.get("info", {}),
            reward=payload.get("reward"),
            done=payload.get("done", False),
            metadata=obs_data.get("metadata", {}),
        )
        return StepResult(
            observation=observation,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: dict[str, Any]) -> Tbench2State:
        """Build a ``Tbench2State`` from a state response payload."""
        return Tbench2State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
            task_id=payload.get("task_id", ""),
            task_path=payload.get("task_path", ""),
            # Previously dropped: the state model declares session_id, so
            # forward it. `or ""` keeps the str-typed field valid if the
            # payload carries null.
            session_id=payload.get("session_id") or "",
            terminal_ready=payload.get("terminal_ready", False),
            last_action_type=payload.get("last_action_type", ""),
            last_command=payload.get("last_command", ""),
            last_output=payload.get("last_output", ""),
        )
models.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the TB2 environment.
9
+ """
10
+
11
+ from pydantic import Field
12
+
13
+
14
+ # Support both in-repo and standalone imports
15
+ try:
16
+ # In-repo imports (when running from OpenEnv repository)
17
+ from openenv.core.env_server.types import Action, Observation, State
18
+ except ImportError:
19
+ # Standalone imports (when environment is standalone with openenv from pip)
20
+ from openenv.core.env_server.types import Action, Observation, State
21
+
22
+
23
class Tbench2Action(Action):
    """One request sent to a TB2 task session."""

    # Operation to perform: exec, write, view, wait, kill, write_file,
    # evaluate or close.
    action_type: str = "exec"
    # Shell command to run, or input to send to a running session.
    command: str = ""
    # Session identifier for streaming (non-blocking) processes.
    session_id: str | None = None
    # Whether to block until the command completes.
    block: bool = True
    # Time to wait, used by the `wait` action.
    wait_seconds: float | None = None
    # Target path, used by the `write_file` action.
    file_path: str = ""
    # Text to write, used by the `write_file` action.
    content: str = ""
33
+
34
+
35
class Tbench2Observation(Observation):
    """What the TB2 environment reports back after a reset or step."""

    # Task instruction/prompt from the TB2 task.
    instruction: str = ""
    # Command output produced by the action.
    output: str = ""
    # Whether the action succeeded.
    success: bool = True
    # Error message when the action failed; empty otherwise.
    error: str = ""
    # Current task identifier.
    task_id: str = ""
    # Path to the task directory.
    task_path: str = ""
    # Session identifier for streaming processes.
    session_id: str | None = None
    # Action type that produced this observation.
    action_type: str = ""
    # Additional metadata; a fresh dict is created per instance.
    info: dict = Field(default_factory=dict)
47
+
48
+
49
class Tbench2State(State):
    """State the server keeps for the current TB2 task."""

    # Current task identifier.
    task_id: str = ""
    # Path to the task directory.
    task_path: str = ""
    # Active session identifier.
    session_id: str = ""
    # Whether the terminal is ready for commands.
    terminal_ready: bool = False
    # Type of the most recent action.
    last_action_type: str = ""
    # Most recent command.
    last_command: str = ""
    # Output of the most recent command.
    last_output: str = ""
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: tbench2
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
pyproject.toml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-tbench2_env"
13
+ version = "0.1.0"
14
+ description = "Tbench2 Env environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "pytest>=8.4.0",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ "camel-ai",
24
+ # Docker-backed mode (optional, for full TB2.0 compatibility)
25
+ "docker>=7.0.0",
26
+ # TOML parsing (tomllib for Python 3.11+, tomli for older versions)
27
+ "tomli>=2.0.0; python_version < '3.11'",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = [
32
+ "pytest>=8.0.0",
33
+ "pytest-cov>=4.0.0",
34
+ ]
35
+
36
+ [project.scripts]
37
+ # Server entry point - enables running via: uv run --project . server
38
+ # or: python -m tbench2_env.server.app
39
+ server = "tbench2_env.server.app:main"
40
+
41
+ [tool.setuptools]
42
+ include-package-data = true
43
+ packages = ["tbench2_env", "tbench2_env.server"]
44
+ package-dir = { "tbench2_env" = ".", "tbench2_env.server" = "server" }
server/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env environment server components."""
8
+
9
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
10
+
11
+
12
+ __all__ = ["Tbench2Environment", "Tbench2DockerEnvironment"]
server/app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Tbench2 Env Environment.
9
+
10
+ This module creates an HTTP server that exposes the Tbench2Environment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ import os
32
+
33
+
34
+ try:
35
+ from openenv.core.env_server.http_server import create_app
36
+
37
+ # In-repo imports
38
+ from tbench2_env.models import Tbench2Action, Tbench2Observation
39
+
40
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
41
+ except Exception as e: # pragma: no cover
42
+ # Standalone imports (when environment is standalone with openenv from pip)
43
+ from openenv.core.env_server.http_server import create_app
44
+ from server.tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
45
+
46
+ from models import Tbench2Action, Tbench2Observation
47
+
48
+ _IMPORT_ERROR = e
49
+
50
+
51
# Determine which environment class to use based on TB2_MODE.
# The variable is read once at import time, lower-cased, and defaults to "local".
_TB2_MODE = os.getenv("TB2_MODE", "local").lower()

if _TB2_MODE == "docker":
    _DEFAULT_ENVIRONMENT = Tbench2DockerEnvironment
    _ENV_SUFFIX = " (Docker mode)"
elif _TB2_MODE == "auto":
    # NOTE(review): no Docker detection or fallback is performed here. "auto"
    # selects the same local environment class as the default branch; only
    # the display suffix differs.
    _DEFAULT_ENVIRONMENT = Tbench2Environment
    _ENV_SUFFIX = " (auto-detect mode)"
else:
    # Any other value, including the default "local", selects local execution.
    _DEFAULT_ENVIRONMENT = Tbench2Environment
    _ENV_SUFFIX = " (local mode)"


# Create the app with web interface and README integration
# The mode suffix is appended to the environment name passed to create_app.
app = create_app(
    _DEFAULT_ENVIRONMENT,
    Tbench2Action,
    Tbench2Observation,
    env_name="tbench2_env" + _ENV_SUFFIX,
    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
)
74
+
75
+
76
def main(host: str = "0.0.0.0", port: int = 8000):
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m tbench2_env.server.app

    The ``server`` console script calls this function with no arguments, so
    ``--host`` / ``--port`` are parsed from the command line here; the
    function arguments act as the defaults when a flag is not given.

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production deployments, consider using uvicorn directly with
    multiple workers:
        uvicorn tbench2_env.server.app:app --workers 4
    """
    import argparse

    import uvicorn

    parser = argparse.ArgumentParser(description="Run the TB2 environment server.")
    parser.add_argument("--host", default=host)
    parser.add_argument("--port", type=int, default=port)
    # parse_known_args: tolerate unrelated argv entries so that calling main()
    # programmatically from another CLI process does not abort on its flags.
    args, _unknown = parser.parse_known_args()

    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
server/tbench2_env_environment.py ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 environment server implementation (Spaces-compatible local mode)."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import sys
14
+ import urllib.request
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any
18
+ from uuid import uuid4
19
+
20
+
21
+ if sys.version_info >= (3, 11):
22
+ import tomllib
23
+ else:
24
+ import tomli as tomllib
25
+
26
+ from openenv.core.env_server.interfaces import Environment
27
+
28
+
29
+ # Support both in-repo and standalone imports
30
+ try:
31
+ # In-repo imports (when running from OpenEnv repository)
32
+ from tbench2_env.models import Tbench2Action, Tbench2Observation, Tbench2State
33
+ except ImportError:
34
+ # Standalone imports (when environment is standalone with openenv from pip)
35
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
36
+
37
+ _CAMEL_IMPORT_ERROR: Exception | None = None
38
+
39
+
40
def _require_terminal_toolkit() -> Any:
    """Import and return camel's ``TerminalToolkit`` class.

    The first import failure is remembered in the module-level
    ``_CAMEL_IMPORT_ERROR``; later calls re-raise from that stored exception
    without retrying the import.

    Raises:
        RuntimeError: if ``camel.toolkits.TerminalToolkit`` cannot be imported.
    """
    global _CAMEL_IMPORT_ERROR

    missing_dependency_msg = (
        "camel-ai (TerminalToolkit) is required for TB2. Install from PyPI or from the CAMEL repo."
    )

    # A previous attempt already failed: report it again, chained to the
    # original cause.
    if _CAMEL_IMPORT_ERROR is not None:
        raise RuntimeError(missing_dependency_msg) from _CAMEL_IMPORT_ERROR

    try:
        from camel.toolkits import TerminalToolkit as toolkit_cls
    except Exception as import_failure:  # pragma: no cover
        _CAMEL_IMPORT_ERROR = import_failure
        raise RuntimeError(missing_dependency_msg) from import_failure

    return toolkit_cls
56
+
57
+
58
+ def _download_tb2_repo(cache_dir: Path) -> Path:
59
+ repo_url = os.getenv(
60
+ "TB2_REPO_URL",
61
+ "https://github.com/laude-institute/terminal-bench-2/archive/refs/heads/main.zip",
62
+ )
63
+ cache_dir.mkdir(parents=True, exist_ok=True)
64
+ archive_path = cache_dir / "terminal-bench-2.zip"
65
+
66
+ if not archive_path.exists():
67
+ urllib.request.urlretrieve(repo_url, archive_path)
68
+
69
+ with zipfile.ZipFile(archive_path) as zf:
70
+ root = zf.namelist()[0].split("/")[0]
71
+ extract_dir = cache_dir / root
72
+ if not extract_dir.exists():
73
+ zf.extractall(cache_dir)
74
+
75
+ return extract_dir
76
+
77
+
78
+ def _read_instruction(task_dir: Path) -> str:
79
+ instruction_path = task_dir / "instruction.md"
80
+ if instruction_path.exists():
81
+ return instruction_path.read_text(encoding="utf-8")
82
+ return ""
83
+
84
+
85
+ def _read_timeout(task_dir: Path, fallback: float) -> float:
86
+ task_toml = task_dir / "task.toml"
87
+ if not task_toml.exists():
88
+ return fallback
89
+ try:
90
+ data = tomllib.loads(task_toml.read_text(encoding="utf-8"))
91
+ except Exception:
92
+ return fallback
93
+ verifier = data.get("verifier", {})
94
+ return float(verifier.get("timeout_sec", fallback))
95
+
96
+
97
+ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
98
+ """OpenEnv wrapper around Terminal-Bench 2 tasks (local execution)."""
99
+
100
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
101
+
102
def __init__(
    self,
    tasks_dir: str | None = None,
    output_dir: str | None = None,
    command_timeout_s: float = 20.0,
    safe_mode: bool = False,
) -> None:
    """Configure the environment; no task is loaded until ``reset()``.

    Args:
        tasks_dir: Tasks directory; when falsy, ``TB2_TASKS_DIR`` is used
            (empty string if unset).
        output_dir: Output directory; when falsy, ``TB2_OUTPUT_DIR`` is used
            (default ``/tmp/tbench2_env_runs``).
        command_timeout_s: Timeout later passed to the terminal toolkit.
        safe_mode: Safe-mode flag later passed to the terminal toolkit.
    """
    super().__init__()

    # Explicit arguments win; environment variables fill in the blanks.
    if not tasks_dir:
        tasks_dir = os.environ.get("TB2_TASKS_DIR", "")
    if not output_dir:
        output_dir = os.environ.get("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs")

    self.tasks_dir = tasks_dir
    self.output_dir = Path(output_dir)
    self.command_timeout_s = command_timeout_s
    self.safe_mode = safe_mode

    # Per-episode fields, populated by reset().
    self._state = Tbench2State()
    self._task_dir: Path | None = None
    self._terminal_toolkit = None
    self._instruction = ""
119
+
120
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    **kwargs: Any,
) -> Tbench2Observation:
    """Start a new episode on a TB2 task and return the initial observation.

    Args:
        seed: Accepted for interface compatibility; unused.
        episode_id: Episode identifier; a fresh UUID is generated when omitted.
        **kwargs: Task selection. ``task_id`` (or ``task_name``) names the
            task; ``task_path`` (or ``path``) points at a task directory.

    Returns:
        Observation carrying the task instruction, with ``action_type="reset"``,
        ``reward=0.0`` and ``done=False``.

    Raises:
        RuntimeError: if the camel-ai ``TerminalToolkit`` cannot be imported.
        Any exception raised by ``_resolve_task_path`` while locating the task.
    """
    del seed

    TerminalToolkit = _require_terminal_toolkit()

    task_id = kwargs.get("task_id") or kwargs.get("task_name")
    task_path = kwargs.get("task_path") or kwargs.get("path")

    task_dir = self._resolve_task_path(task_id, task_path)
    resolved_task_id = task_id or task_dir.name
    # Generate the episode id once so the log directory name and the state
    # agree (previously two independent UUIDs were created when it was omitted).
    resolved_episode_id = episode_id or str(uuid4())

    self._instruction = _read_instruction(task_dir)
    self._task_dir = task_dir

    trial_name = f"{resolved_task_id}.{resolved_episode_id}"
    session_logs_dir = self.output_dir / trial_name / "terminal_toolkit_session_logs"
    session_logs_dir.mkdir(parents=True, exist_ok=True)

    self._terminal_toolkit = TerminalToolkit(
        timeout=self.command_timeout_s,
        working_directory=str(task_dir),
        use_docker_backend=False,
        session_logs_dir=session_logs_dir,
        safe_mode=self.safe_mode,
    )

    self._state = Tbench2State(
        episode_id=resolved_episode_id,
        step_count=0,
        task_id=resolved_task_id,
        task_path=str(task_dir),
        terminal_ready=True,
    )

    return Tbench2Observation(
        instruction=self._instruction,
        output="",
        success=True,
        error="",
        task_id=resolved_task_id,
        task_path=str(task_dir),
        session_id=None,
        action_type="reset",
        info={},
        reward=0.0,
        done=False,
    )
172
+
173
def step(
    self,
    action: Tbench2Action,
    timeout_s: float | None = None,
    **kwargs: Any,
) -> Tbench2Observation:
    """Execute one action against the current task and report the outcome.

    Dispatches on ``action.action_type`` (exec, write, view, wait, kill,
    write_file, evaluate, close). Any exception raised while handling the
    action is captured and reported through ``success=False`` / ``error``
    on the returned observation rather than propagated.

    Args:
        action: The action to perform.
        timeout_s: Accepted but unused.
        **kwargs: Accepted but unused.

    Returns:
        Observation with the action's output; ``reward`` is set only by
        ``evaluate``, and ``done`` is True after ``evaluate`` or ``close``.

    Raises:
        TypeError: if ``action`` is not a ``Tbench2Action``.
        RuntimeError: if ``reset()`` has not been called.
    """
    del timeout_s, kwargs

    if not isinstance(action, Tbench2Action):
        raise TypeError(f"Expected Tbench2Action, got {type(action)}")

    if self._terminal_toolkit is None or self._task_dir is None:
        raise RuntimeError("TB2 environment not initialized. Call reset() first.")

    # Bookkeeping happens before dispatch, so failed actions are counted too.
    self._state.step_count += 1
    self._state.last_action_type = action.action_type
    self._state.last_command = action.command

    output = ""
    error = ""
    success = True
    reward = None
    done = False
    info: dict[str, Any] = {}
    # Fall back to a fixed default id when the caller does not name a session.
    session_id = action.session_id or "tb2-session"

    try:
        if action.action_type == "exec":
            output = self._terminal_toolkit.shell_exec(
                command=action.command,
                block=action.block,
                id=session_id,
            )
        elif action.action_type == "write":
            # _ensure_session_id is defined outside this view; presumably it
            # rejects a missing session id. TODO confirm.
            self._ensure_session_id(action.session_id, action.action_type)
            output = self._terminal_toolkit.shell_write_to_process(
                id=action.session_id,
                command=action.command,
            )
        elif action.action_type == "view":
            self._ensure_session_id(action.session_id, action.action_type)
            output = self._terminal_toolkit.shell_view(id=action.session_id)
        elif action.action_type == "wait":
            self._ensure_session_id(action.session_id, action.action_type)
            # None (or 0) wait_seconds becomes 0.0.
            wait_seconds = action.wait_seconds or 0.0
            output = self._terminal_toolkit.shell_wait(
                id=action.session_id,
                wait_seconds=wait_seconds,
            )
        elif action.action_type == "kill":
            self._ensure_session_id(action.session_id, action.action_type)
            self._terminal_toolkit.shell_kill_process(id=action.session_id)
            output = f"Killed session {action.session_id}"
        elif action.action_type == "write_file":
            self._terminal_toolkit.shell_write_content_to_file(
                content=action.content,
                file_path=action.file_path,
            )
            output = f"Wrote content to {action.file_path}"
        elif action.action_type == "evaluate":
            # Evaluation ends the episode and is the only branch that sets a reward.
            output, reward, info = self._evaluate_task()
            done = True
        elif action.action_type == "close":
            self.close()
            output = "Closed TB2 environment."
            done = True
        else:
            raise ValueError(f"Unsupported action_type: {action.action_type}")
    except Exception as exc:  # pragma: no cover
        # Report the failure in the observation instead of raising.
        success = False
        error = str(exc)

    self._state.last_output = output
    self._state.session_id = session_id or ""

    return Tbench2Observation(
        instruction=self._instruction,
        output=output,
        success=success,
        error=error,
        task_id=self._state.task_id,
        task_path=self._state.task_path,
        session_id=session_id or "",
        action_type=action.action_type,
        info=info,
        reward=reward,
        done=done,
    )
261
+
262
@property
def state(self) -> Tbench2State:
    """Return the state object tracked for the current episode."""
    return self._state
265
+
266
def close(self) -> None:
    """Forget the terminal toolkit, task directory and instruction.

    After this call ``step()`` refuses to run until ``reset()`` is
    invoked again.
    """
    self._instruction = ""
    self._task_dir = None
    self._terminal_toolkit = None
270
+
271
def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
    """Locate the directory of the task to run.

    An explicit ``task_path`` wins. Otherwise ``task_id`` is looked up
    under ``self.tasks_dir`` when that is set, or under the repository
    returned by ``_download_tb2_repo`` (cached in ``TB2_CACHE_DIR``,
    defaulting to ``<output_dir>/repo_cache``).

    Raises:
        ValueError: If neither ``task_id`` nor ``task_path`` is given.
        FileNotFoundError: If the resolved directory does not exist.
    """
    if task_path:
        candidate = Path(task_path).expanduser().resolve()
    elif not task_id:
        raise ValueError("Provide task_id or task_path to reset TB2 environment.")
    elif self.tasks_dir:
        candidate = Path(self.tasks_dir).expanduser().resolve() / task_id
    else:
        default_cache = str(self.output_dir / "repo_cache")
        cache_dir = Path(os.getenv("TB2_CACHE_DIR", default_cache))
        candidate = _download_tb2_repo(cache_dir) / task_id

    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Task path not found: {candidate}")
291
+
292
def _ensure_session_id(self, session_id: str | None, action_type: str) -> None:
    """Reject actions that need a session but were given none.

    Raises:
        ValueError: If ``session_id`` is ``None`` or empty.
    """
    if session_id:
        return
    raise ValueError(f"session_id is required for action_type='{action_type}'")
295
+
296
def _evaluate_task(self) -> tuple[str, float, dict[str, Any]]:
    """Run the task's pytest suite through the terminal toolkit.

    The command changes into the task directory, runs pytest on its
    ``tests`` directory and then echoes ``__TB2_EXIT_CODE__:<status>`` so
    the exit status can be recovered from the captured terminal output.

    Returns:
        ``(output, reward, info)`` where ``reward`` is ``1.0`` when pytest
        exited with status 0 and ``0.0`` otherwise, and ``info`` holds
        ``tests_passed`` and ``exit_code``.

    Raises:
        RuntimeError: If the task directory or terminal toolkit is unset.
    """
    import shlex

    if self._task_dir is None:
        raise RuntimeError("TB2 environment not initialized. Call reset() first.")
    if self._terminal_toolkit is None:
        raise RuntimeError("Terminal toolkit not initialized.")

    _read_timeout(self._task_dir, fallback=900.0)  # Validate timeout config

    marker = "__TB2_EXIT_CODE__"
    # Quote both paths so directories containing spaces or shell
    # metacharacters cannot break (or inject into) the command line.
    task_dir = shlex.quote(str(self._task_dir))
    tests_dir = shlex.quote(str(self._task_dir / "tests"))
    cmd = f"cd {task_dir} && python -m pytest -q {tests_dir} -rA; echo {marker}:$?"
    output = self._terminal_toolkit.shell_exec(
        id="tb2-tests",
        command=cmd,
        block=True,
    )

    # Scan from the end: the last marker line is the one printed by `echo`.
    # Partition on "<marker>:" rather than the first ":" so that any text
    # preceding the marker on the same line (a prompt, a timestamp, ...)
    # cannot shift which field is parsed. Anything unparsable is a failure.
    exit_code = 1
    prefix = f"{marker}:"
    for line in reversed(output.splitlines()):
        _, found, tail = line.rpartition(prefix)
        if not found:
            continue
        try:
            exit_code = int(tail.strip())
        except ValueError:
            exit_code = 1
        break

    reward = 1.0 if exit_code == 0 else 0.0
    info = {"tests_passed": exit_code == 0, "exit_code": exit_code}
    return output, reward, info
324
+
325
+
326
class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
    """OpenEnv wrapper around Terminal-Bench 2 tasks with Docker isolation.

    This environment runs each task in its own Docker container, reading
    the image specification from task.toml's [environment] section. When a
    task declares no ``docker_image`` the environment falls back to running
    commands locally inside the task directory.

    Requires:
        - Docker socket mounted (/var/run/docker.sock)
        - Sufficient disk space for container images
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Directory inside the task container that receives the task files.
    _CONTAINER_TASK_DIR = "/task"
    # Marker echoed after pytest so its exit status can be read from output.
    _EXIT_CODE_MARKER = "__TB2_EXIT_CODE__"

    def __init__(
        self,
        tasks_dir: str | None = None,
        output_dir: str | None = None,
        command_timeout_s: float = 300.0,
        safe_mode: bool = True,
    ) -> None:
        """Configure the environment; no Docker work happens until ``reset()``.

        Args:
            tasks_dir: Directory containing task folders; defaults to the
                ``TB2_TASKS_DIR`` environment variable.
            output_dir: Directory for per-trial output; defaults to
                ``TB2_OUTPUT_DIR`` or ``/tmp/tbench2_env_runs``.
            command_timeout_s: Timeout applied to locally executed commands.
            safe_mode: Stored on the instance; not consulted by this class.
        """
        super().__init__()
        self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
        self.output_dir = Path(output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs"))
        self.command_timeout_s = command_timeout_s
        self.safe_mode = safe_mode

        self._state = Tbench2State()
        self._task_dir: Path | None = None
        self._docker_client: Any = None
        self._container: Any = None
        self._instruction = ""
        self._task_image = ""
        self._task_config: dict[str, Any] = {}

    def _get_docker_client(self) -> Any:
        """Lazy initialization of Docker client.

        Raises:
            RuntimeError: If the ``docker`` package or daemon is unavailable.
        """
        if self._docker_client is None:
            try:
                import docker

                self._docker_client = docker.from_env()
            except Exception as exc:
                raise RuntimeError(
                    f"Docker client not available. Ensure Docker socket is mounted. Error: {exc}"
                ) from exc
        return self._docker_client

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Start a new episode for the requested task.

        Args:
            seed: Accepted for interface compatibility; ignored.
            episode_id: Identifier for the episode; generated when omitted.
            **kwargs: ``task_id`` (alias ``task_name``) and/or ``task_path``
                (alias ``path``) selecting the task.

        Returns:
            The initial observation carrying the task instruction.

        Raises:
            ValueError: If no task is specified.
            FileNotFoundError: If the task directory does not exist.
            RuntimeError: If the task container cannot be started.
        """
        del seed

        task_id = kwargs.get("task_id") or kwargs.get("task_name")
        task_path = kwargs.get("task_path") or kwargs.get("path")

        task_dir = self._resolve_task_path(task_id, task_path)
        resolved_task_id = task_id or task_dir.name

        # Read everything that can fail before touching the running episode.
        task_config: dict[str, Any] = {}
        task_toml_path = task_dir / "task.toml"
        if task_toml_path.exists():
            task_config = tomllib.loads(task_toml_path.read_text(encoding="utf-8"))
        task_image = task_config.get("environment", {}).get("docker_image", "")
        instruction = _read_instruction(task_dir)

        # Release the previous episode's container so it is neither leaked
        # nor reused by a task that declares no image. The task directory is
        # cleared until the new episode is fully set up, so a failed container
        # start cannot leave step() silently executing on the host.
        self._remove_container()
        self._task_dir = None

        self._task_config = task_config
        self._task_image = task_image
        self._instruction = instruction

        # One generated UUID backs both the trial directory name and the
        # episode id so the two can be correlated.
        generated = uuid4()
        trial_name = f"{resolved_task_id}.{episode_id or generated.hex}"
        trial_dir = self.output_dir / trial_name
        trial_dir.mkdir(parents=True, exist_ok=True)

        if self._task_image:
            self._start_container(task_dir, trial_dir)

        self._task_dir = task_dir
        self._state = Tbench2State(
            episode_id=episode_id or str(generated),
            step_count=0,
            task_id=resolved_task_id,
            task_path=str(task_dir),
            terminal_ready=True,
        )

        return Tbench2Observation(
            instruction=self._instruction,
            output="",
            success=True,
            error="",
            task_id=resolved_task_id,
            task_path=str(task_dir),
            session_id=None,
            action_type="reset",
            info={"docker_image": self._task_image} if self._task_image else {},
            reward=0.0,
            done=False,
        )

    def _start_container(self, task_dir: Path, trial_dir: Path) -> None:
        """Start a Docker container for the task.

        Uses file copying instead of bind mounts to support Docker-in-Docker
        scenarios where the server runs inside a container. Bind mounts reference
        host paths, which don't exist when the server is containerized.

        Episode state is set by ``reset()``, not here. ``trial_dir`` is
        accepted but currently unused.

        Raises:
            RuntimeError: If the image cannot be obtained, or the container
                cannot be started or populated. A partially started
                container is removed before raising.
        """
        docker = self._get_docker_client()

        try:
            # Pull image if needed
            try:
                docker.images.get(self._task_image)
            except Exception:
                logging.info("Pulling image %s...", self._task_image)
                docker.images.pull(self._task_image)

            # Start container WITHOUT bind mounts (for DinD compatibility)
            self._container = docker.containers.run(
                image=self._task_image,
                command="sleep infinity",
                detach=True,
                network_mode="host",
                working_dir=self._CONTAINER_TASK_DIR,
                remove=False,
            )

            # Copy task files into container using tar archive
            # This works in Docker-in-Docker because we read files from our
            # filesystem and stream them to the container via the Docker API
            self._copy_dir_to_container(task_dir, self._CONTAINER_TASK_DIR)

        except Exception as exc:
            # Do not leave a half-initialized container running.
            self._remove_container()
            raise RuntimeError(f"Failed to start container: {exc}") from exc

    def _copy_dir_to_container(self, src_dir: Path, dest_path: str) -> None:
        """Copy a directory into the container using tar archive.

        This method streams files via the Docker API, avoiding bind mount
        issues in Docker-in-Docker scenarios.

        Raises:
            RuntimeError: If no container is running or the upload fails.
        """
        import io
        import tarfile

        if self._container is None:
            raise RuntimeError("Container not started")

        # Create tar archive in memory. rglob() already yields every nested
        # entry, so each one is added non-recursively; the default recursive
        # add would archive the contents of every directory again.
        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            for item in sorted(src_dir.rglob("*")):
                arcname = item.relative_to(src_dir).as_posix()
                tar.add(str(item), arcname=arcname, recursive=False)

        # Copy to container
        if not self._container.put_archive(dest_path, tar_stream.getvalue()):
            raise RuntimeError(f"Failed to copy {src_dir} to container path {dest_path}")

    def _write_file_to_container(self, file_path: str, content: str) -> None:
        """Write ``content`` to ``file_path`` inside the container.

        Relative paths are resolved against the container task directory.
        The bytes are uploaded as a one-member tar archive, so the content
        is stored verbatim regardless of quotes or heredoc-like lines.

        Raises:
            RuntimeError: If no container is running or the write fails.
            ValueError: If ``file_path`` does not name a file.
        """
        import io
        import posixpath
        import tarfile
        import time

        if self._container is None:
            raise RuntimeError("Container not started. Call reset() first.")

        dest = posixpath.normpath(posixpath.join(self._CONTAINER_TASK_DIR, file_path))
        parent, name = posixpath.split(dest)
        if not name:
            raise ValueError(f"Invalid file_path: {file_path!r}")

        # put_archive requires the destination directory to exist.
        exit_code, mkdir_output = self._container.exec_run(
            cmd=["mkdir", "-p", parent],
            stdout=True,
            stderr=True,
        )
        if exit_code != 0:
            detail = (mkdir_output or b"").decode("utf-8", errors="replace").strip()
            raise RuntimeError(f"Failed to create directory {parent}: {detail}")

        payload = content.encode("utf-8")
        member = tarfile.TarInfo(name=name)
        member.size = len(payload)
        member.mode = 0o644
        member.mtime = int(time.time())

        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            tar.addfile(member, io.BytesIO(payload))

        if not self._container.put_archive(parent, tar_stream.getvalue()):
            raise RuntimeError(f"Failed to write {dest} in container")

    def _exec_in_container(self, command: str, workdir: str = "/task") -> tuple[int, str]:
        """Execute a command inside the container.

        Args:
            command: Shell command line, interpreted by ``bash -c``.
            workdir: Directory to ``cd`` into before running the command.

        Returns:
            ``(exit_code, combined stdout/stderr decoded as UTF-8)``.

        Raises:
            RuntimeError: If no container is running.
        """
        import shlex

        if self._container is None:
            raise RuntimeError("Container not started. Call reset() first.")

        # Pass argv as a list: a single command string is re-split by the
        # Docker SDK, which breaks as soon as the command contains a quote.
        exit_code, output = self._container.exec_run(
            cmd=["bash", "-c", f"cd {shlex.quote(workdir)} && {command}"],
            workdir=self._CONTAINER_TASK_DIR,
            stdout=True,
            stderr=True,
        )
        return exit_code, (output or b"").decode("utf-8", errors="replace")

    def step(
        self,
        action: Tbench2Action,
        timeout_s: float | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Run one action in the task container (or locally as a fallback).

        Supported action types are ``exec``, ``write_file``, ``evaluate``
        and ``close``. Errors raised while handling the action are reported
        via ``success=False`` and ``error`` rather than propagated.

        Args:
            action: The action to perform; must be a ``Tbench2Action``.
            timeout_s: Accepted for interface compatibility; ignored.
            **kwargs: Accepted for interface compatibility; ignored.

        Raises:
            TypeError: If ``action`` is not a ``Tbench2Action``.
            RuntimeError: If ``reset()`` has not completed successfully.
        """
        del timeout_s, kwargs

        if not isinstance(action, Tbench2Action):
            raise TypeError(f"Expected Tbench2Action, got {type(action)}")

        if self._task_dir is None:
            raise RuntimeError("TB2 environment not initialized. Call reset() first.")

        self._state.step_count += 1
        self._state.last_action_type = action.action_type
        self._state.last_command = action.command

        output = ""
        error = ""
        success = True
        reward = None
        done = False
        info: dict[str, Any] = {}
        session_id = action.session_id or "tb2-session"

        try:
            if action.action_type == "exec":
                if self._container:
                    exit_code, output = self._exec_in_container(action.command)
                    success = exit_code == 0
                else:
                    # Fallback to local execution.
                    # NOTE: this runs the agent-supplied command on the host
                    # shell without isolation; only tasks that declare no
                    # docker_image reach this branch.
                    import subprocess

                    result = subprocess.run(
                        action.command,
                        shell=True,
                        capture_output=True,
                        text=True,
                        errors="replace",
                        timeout=self.command_timeout_s,
                        # Mirror container mode, which runs in the task dir.
                        cwd=str(self._task_dir),
                    )
                    output = result.stdout + result.stderr
                    success = result.returncode == 0

            elif action.action_type == "write_file":
                if not action.file_path:
                    raise ValueError("file_path is required for action_type='write_file'")
                content = action.content if action.content is not None else ""
                if self._container:
                    self._write_file_to_container(action.file_path, content)
                else:
                    # Local write; relative paths are anchored at the task
                    # directory (absolute paths are used as given).
                    target = self._task_dir / action.file_path
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_text(content, encoding="utf-8")
                output = f"Wrote to {action.file_path}"

            elif action.action_type == "evaluate":
                if self._container:
                    output, reward, info = self._evaluate_docker()
                else:
                    output, reward, info = self._evaluate_local()
                done = True

            elif action.action_type == "close":
                self.close()
                output = "Closed TB2 environment."
                done = True

            else:
                raise ValueError(f"Unsupported action_type in Docker mode: {action.action_type}")

        except Exception as exc:
            success = False
            error = str(exc)

        self._state.last_output = output
        self._state.session_id = session_id or ""

        return Tbench2Observation(
            instruction=self._instruction,
            output=output,
            success=success,
            error=error,
            task_id=self._state.task_id,
            task_path=self._state.task_path,
            session_id=session_id or "",
            action_type=action.action_type,
            info=info,
            reward=reward,
            done=done,
        )

    @classmethod
    def _parse_exit_code(cls, output: str) -> int:
        """Extract the status echoed after the exit-code marker.

        The last line containing ``<marker>:`` is used. A missing marker or
        a non-integer value is treated as failure (``1``).
        """
        prefix = f"{cls._EXIT_CODE_MARKER}:"
        for line in reversed(output.splitlines()):
            _, found, tail = line.rpartition(prefix)
            if not found:
                continue
            try:
                return int(tail.strip())
            except ValueError:
                return 1
        return 1

    @staticmethod
    def _score(output: str, exit_code: int) -> tuple[str, float, dict[str, Any]]:
        """Build the ``(output, reward, info)`` triple for an exit status."""
        passed = exit_code == 0
        return output, (1.0 if passed else 0.0), {"tests_passed": passed, "exit_code": exit_code}

    def _evaluate_docker(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task inside Docker container.

        Raises:
            RuntimeError: If the container or task directory is not set.
        """
        if self._container is None:
            raise RuntimeError("Container not started.")
        if self._task_dir is None:
            raise RuntimeError("Task directory not set")

        # Run pytest in the container's /task directory
        # Use exit code marker for consistency with local mode
        cmd = f"python -m pytest -q tests/ -rA; echo {self._EXIT_CODE_MARKER}:$?"
        _, output_str = self._exec_in_container(cmd)

        return self._score(output_str, self._parse_exit_code(output_str))

    def _evaluate_local(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task locally (fallback).

        Raises:
            RuntimeError: If no task has been initialized.
        """
        import shlex
        import subprocess

        if self._task_dir is None:
            raise RuntimeError("Task not initialized.")

        task_dir = shlex.quote(str(self._task_dir))
        tests_dir = shlex.quote(str(self._task_dir / "tests"))
        cmd = (
            f"cd {task_dir} && python -m pytest -q {tests_dir} -rA; "
            f"echo {self._EXIT_CODE_MARKER}:$?"
        )

        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=900.0,
        )
        output = result.stdout + result.stderr

        # The shell's own return code belongs to the trailing `echo` and is
        # always 0, so the pytest status must be read from the marker.
        return self._score(output, self._parse_exit_code(output))

    @property
    def state(self) -> Tbench2State:
        """Return the state object tracked for the current episode."""
        return self._state

    def _remove_container(self) -> None:
        """Stop and remove the current container, if any (best effort).

        Failures are logged rather than raised, and removal is attempted
        even when stopping fails.
        """
        container, self._container = self._container, None
        if container is None:
            return
        try:
            container.stop(timeout=10)
        except Exception as exc:
            logging.warning("Failed to stop task container: %s", exc)
        try:
            container.remove(force=True)
        except Exception as exc:
            logging.warning("Failed to remove task container: %s", exc)

    def close(self) -> None:
        """Tear down the container and Docker client and clear task state."""
        self._remove_container()
        if self._docker_client is not None:
            try:
                self._docker_client.close()
            except Exception as exc:
                logging.warning("Failed to close Docker client: %s", exc)
            # A later reset() recreates the client lazily.
            self._docker_client = None
        self._task_dir = None
        self._instruction = ""

    def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
        """Locate the directory of the task to run.

        An explicit ``task_path`` wins. Otherwise ``task_id`` is looked up
        under ``self.tasks_dir`` when set, or under the repository returned
        by ``_download_tb2_repo`` (cached in ``TB2_CACHE_DIR``, defaulting
        to ``<output_dir>/repo_cache``).

        Raises:
            ValueError: If neither ``task_id`` nor ``task_path`` is given.
            FileNotFoundError: If the resolved directory does not exist.
        """
        if task_path:
            resolved = Path(task_path).expanduser().resolve()
            if not resolved.exists():
                raise FileNotFoundError(f"Task path not found: {resolved}")
            return resolved

        if not task_id:
            raise ValueError("Provide task_id or task_path to reset TB2 environment.")

        if not self.tasks_dir:
            cache_dir = Path(os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache")))
            repo_dir = _download_tb2_repo(cache_dir)
            resolved = repo_dir / task_id
        else:
            resolved = Path(self.tasks_dir).expanduser().resolve() / task_id

        if not resolved.exists():
            raise FileNotFoundError(f"Task path not found: {resolved}")
        return resolved