sergiopaniego HF Staff committed on
Commit
6b2d0fd
·
verified ·
1 Parent(s): f056a5e

Upload folder using huggingface_hub

Browse files
Files changed (10) hide show
  1. Dockerfile +95 -0
  2. README.md +201 -4
  3. __init__.py +18 -0
  4. client.py +75 -0
  5. models.py +58 -0
  6. openenv.yaml +7 -0
  7. pyproject.toml +46 -0
  8. server/__init__.py +12 -0
  9. server/app.py +104 -0
  10. server/tbench2_env_environment.py +724 -0
Dockerfile ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=tbench2_env
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Install git and git-lfs for cloning HuggingFace datasets
63
+ RUN apt-get update && \
64
+ apt-get install -y --no-install-recommends git git-lfs && \
65
+ rm -rf /var/lib/apt/lists/* && \
66
+ git lfs install
67
+
68
+ # Clone SETA dataset from HuggingFace
69
+ # Source: https://huggingface.co/datasets/camel-ai/seta-env
70
+ RUN git clone --depth 1 https://huggingface.co/datasets/camel-ai/seta-env /app/seta-env
71
+
72
+ # Set TB2_TASKS_DIR to point to SETA tasks
73
+ # Tasks are in /app/seta-env/Dataset/ with numeric IDs (1, 2, 3, etc.)
74
+ ENV TB2_TASKS_DIR="/app/seta-env/Dataset"
75
+
76
+ # Copy the virtual environment from builder
77
+ COPY --from=builder /app/env/.venv /app/.venv
78
+
79
+ # Copy the environment code
80
+ COPY --from=builder /app/env /app/env
81
+
82
+ # Set PATH to use the virtual environment
83
+ ENV PATH="/app/.venv/bin:$PATH"
84
+
85
+ # Set PYTHONPATH so imports work correctly
86
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
87
+
88
+ # Health check
89
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
90
+ CMD curl -f http://localhost:8000/health || exit 1
91
+
92
+ # Run the FastAPI server
93
+ # The module path is constructed to work with the /app/env structure
94
+ ENV ENABLE_WEB_INTERFACE=true
95
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,207 @@
1
  ---
2
- title: Tbench2
3
- emoji: 🏃
4
  colorFrom: blue
5
- colorTo: pink
6
  sdk: docker
7
  pinned: false
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: TB2 Environment Server
3
+ emoji: "🧪"
4
  colorFrom: blue
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ - terminal-bench-2
13
+ - spaces
14
  ---
15
 
16
+ # TB2 Environment (Terminal-Bench 2)
17
+
18
+ OpenEnv wrapper for [Terminal-Bench 2](https://github.com/laude-institute/terminal-bench-2) tasks. Supports two execution modes:
19
+
20
+ | Mode | Description | Use Case |
21
+ |------|-------------|----------|
22
+ | **Local** | Runs commands in the server process (no Docker) | Hugging Face Spaces, environments without Docker access |
23
+ | **Docker** | Runs each task in its own container | Full TB2.0 fidelity with custom task images |
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from tbench2_env import Tbench2Env, Tbench2Action
29
+
30
+ env = Tbench2Env(base_url="http://localhost:8000")
31
+ result = env.reset(task_id="headless-terminal")
32
+ print(result.observation.instruction)
33
+
34
+ result = env.step(Tbench2Action(action_type="exec", command="ls -la"))
35
+ print(result.observation.output)
36
+
37
+ result = env.step(Tbench2Action(action_type="evaluate"))
38
+ print(result.reward, result.done)
39
+
40
+ env.close()
41
+ ```
42
+
43
+ ## Building the Docker Image
44
+
45
+ Before using the environment, build the Docker image:
46
+
47
+ ```bash
48
+ # From project root
49
+ docker build -t tbench2-env:latest -f envs/tbench2_env/server/Dockerfile .
50
+ ```
51
+
52
+ ## Environment Details
53
+
54
+ ### Action
55
+ **Tbench2Action**: Controls interaction with the TB2 task session
56
+
57
+ | Field | Type | Default | Description |
58
+ |-------|------|---------|-------------|
59
+ | `action_type` | str | `"exec"` | Action to perform (`exec`, `write`, `view`, `wait`, `kill`, `write_file`, `evaluate`, `close`) |
60
+ | `command` | str | `""` | Shell command or input to send |
61
+ | `session_id` | str \| None | `None` | Session ID for streaming processes |
62
+ | `block` | bool | `True` | Whether to block until command completes |
63
+ | `wait_seconds` | float \| None | `None` | Time to wait (for `wait` action) |
64
+ | `file_path` | str | `""` | File path (for `write_file` action) |
65
+ | `content` | str | `""` | Content to write (for `write_file` action) |
66
+
67
+ ### Observation
68
+ **Tbench2Observation**: Contains the environment response
69
+
70
+ | Field | Type | Description |
71
+ |-------|------|-------------|
72
+ | `instruction` | str | Task instruction/prompt from the TB2 task |
73
+ | `output` | str | Command output (stdout/stderr) |
74
+ | `success` | bool | Whether the action succeeded |
75
+ | `error` | str | Error message if action failed |
76
+ | `task_id` | str | Current task identifier |
77
+ | `task_path` | str | Path to the task directory |
78
+ | `session_id` | str \| None | Session ID for streaming processes |
79
+ | `action_type` | str | The action type that produced this observation |
80
+ | `info` | dict | Additional metadata |
81
+
82
+ ### State
83
+ **Tbench2State**: Server-side state for the task session
84
+
85
+ | Field | Type | Description |
86
+ |-------|------|-------------|
87
+ | `task_id` | str | Current task identifier |
88
+ | `task_path` | str | Path to the task directory |
89
+ | `session_id` | str | Active session ID |
90
+ | `terminal_ready` | bool | Whether the terminal is ready for commands |
91
+ | `last_action_type` | str | Last action type executed |
92
+ | `last_command` | str | Last command executed |
93
+ | `last_output` | str | Output from last command |
94
+
95
+ ## Execution Modes
96
+
97
+ ### Local Mode (Default)
98
+
99
+ Commands execute directly in the server process. Ideal for HF Spaces where Docker-in-Docker is unavailable.
100
+
101
+ ```bash
102
+ # Default - local mode
103
+ python -m tbench2_env.server.app
104
+
105
+ # Or explicitly set mode
106
+ TB2_MODE=local python -m tbench2_env.server.app
107
+ ```
108
+
109
+ **Note:** Local mode ignores Docker images specified in task.toml. Tasks requiring specific runtime environments may fail.
110
+
111
+ ### Docker Mode
112
+
113
+ Each task runs in its own Docker container, using the image specified in the task's `task.toml`:
114
+
115
+ ```bash
116
+ # Enable Docker mode
117
+ TB2_MODE=docker python -m tbench2_env.server.app
118
+ ```
119
+
120
+ **Requirements:**
121
+ - Docker socket mounted at `/var/run/docker.sock`
122
+ - Sufficient disk space for container images
123
+ - Network access to pull images if not cached
124
+
125
+ **Environment Variables for Docker Mode:**
126
+ - `TB2_MODE=docker` - Enable Docker-backed execution
127
+ - Docker socket must be accessible (mounted volume)
128
+
129
+ ## Action Types
130
+
131
+ | Action | Description | Required Fields |
132
+ |--------|-------------|-----------------|
133
+ | `exec` | Run a shell command | `command`, optionally `block`, `session_id` |
134
+ | `write` | Send input to a running session | `session_id`, `command` |
135
+ | `view` | Read pending output | `session_id` |
136
+ | `wait` | Wait for output | `session_id`, optionally `wait_seconds` |
137
+ | `kill` | Terminate a running session | `session_id` |
138
+ | `write_file` | Write content to a file | `file_path`, `content` |
139
+ | `evaluate` | Run pytest tests, return reward | (none) |
140
+ | `close` | Stop and cleanup | (none) |
141
+
142
+ ## Session IDs (Streaming Processes)
143
+
144
+ `session_id` is **only** required when you start a non-blocking process and want to interact with it (`write`, `view`, `wait`, `kill`). For plain `exec` commands, you can omit it.
145
+
146
+ Example (Python):
147
+ ```python
148
+ # Start a long-running process
149
+ env.step(Tbench2Action(action_type="exec", command="python -i", block=False, session_id="sess1"))
150
+
151
+ # Send input to it
152
+ env.step(Tbench2Action(action_type="write", session_id="sess1", command="print(2+2)\n"))
153
+
154
+ # Read its output
155
+ env.step(Tbench2Action(action_type="view", session_id="sess1"))
156
+ ```
157
+
158
+ ## Environment Variables
159
+
160
+ | Variable | Default | Description |
161
+ |----------|---------|-------------|
162
+ | `TB2_MODE` | `local` | Execution mode: `local` or `docker` (`auto` is also accepted, but currently runs the local environment) |
163
+ | `TB2_TASKS_DIR` | (auto-download) | Path to local Terminal-Bench-2 repo checkout |
164
+ | `TB2_OUTPUT_DIR` | `/tmp/tbench2_env_runs` | Directory for session logs and cache |
165
+ | `TB2_CACHE_DIR` | `$TB2_OUTPUT_DIR/repo_cache` | Where to extract TB2 repo |
166
+ | `TB2_REPO_URL` | (GitHub main.zip) | Repo zip URL for auto-download |
167
+
168
+ ## Reward
169
+
170
+ Binary reward on `evaluate` action:
171
+ - `1.0` - All pytest tests pass (exit code 0)
172
+ - `0.0` - Tests fail (non-zero exit code)
173
+
174
+ Intermediate steps return `reward=None`.
175
+
176
+ ## Running the Server
177
+
178
+ ```bash
179
+ # Install dependencies
180
+ uv sync --all-extras
181
+
182
+ # Local mode (default, for Spaces)
183
+ python -m tbench2_env.server.app --port 8000
184
+
185
+ # Docker mode (full TB2.0 compatibility)
186
+ TB2_MODE=docker python -m tbench2_env.server.app --port 8000
187
+
188
+ # With local TB2 repo
189
+ TB2_TASKS_DIR=/path/to/terminal-bench-2 python -m tbench2_env.server.app
190
+ ```
191
+
192
+ ## Project Structure
193
+
194
+ ```
195
+ tbench2_env/
196
+ ├── __init__.py # Module exports (Tbench2Env, Tbench2Action, etc.)
197
+ ├── README.md # This file
198
+ ├── client.py # Tbench2Env client implementation
199
+ ├── models.py # Tbench2Action, Tbench2Observation, Tbench2State
200
+ ├── openenv.yaml # OpenEnv configuration
201
+ ├── pyproject.toml # Package dependencies
202
+ └── server/
203
+ ├── __init__.py # Server exports
204
+ ├── app.py # FastAPI application
205
+ ├── tbench2_env_environment.py # Core environment logic
206
+ └── Dockerfile # Container image definition
207
+ ```
__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env Environment."""
8
+
9
+ from .client import Tbench2Env
10
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
11
+
12
+
13
+ __all__ = [
14
+ "Tbench2Action",
15
+ "Tbench2Observation",
16
+ "Tbench2Env",
17
+ "Tbench2State",
18
+ ]
client.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 Environment Client."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+
14
+ # Support both in-repo and standalone imports
15
+ try:
16
+ # In-repo imports (when running from OpenEnv repository)
17
+ from openenv.core.client_types import StepResult
18
+ from openenv.core.env_client import EnvClient
19
+
20
+ from .models import Tbench2Action, Tbench2Observation, Tbench2State
21
+ except ImportError:
22
+ # Standalone imports (when environment is standalone with openenv from pip)
23
+ from openenv.core.client_types import StepResult
24
+ from openenv.core.env_client import EnvClient
25
+
26
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
27
+
28
+
29
class Tbench2Env(EnvClient[Tbench2Action, Tbench2Observation, Tbench2State]):
    """HTTP client for the TB2 environment.

    Converts between the typed action/observation/state models and the JSON
    payloads exchanged with the environment server.
    """

    def _step_payload(self, action: Tbench2Action) -> dict[str, Any]:
        """Serialize ``action`` into the JSON body of a step request.

        Args:
            action: The action to send to the server.

        Returns:
            A dict holding every ``Tbench2Action`` field under its own name.
        """
        return {
            "action_type": action.action_type,
            "command": action.command,
            "session_id": action.session_id,
            "block": action.block,
            "wait_seconds": action.wait_seconds,
            "file_path": action.file_path,
            "content": action.content,
        }

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[Tbench2Observation]:
        """Build a ``StepResult`` from a server response.

        Args:
            payload: Decoded response body. Observation fields are read from
                ``payload["observation"]``; ``reward`` and ``done`` are read
                from the top level of ``payload``.

        Returns:
            A ``StepResult`` whose observation carries the same ``reward`` and
            ``done`` values as the result itself. Missing keys fall back to
            the defaults spelled out below.
        """
        obs_data = payload.get("observation", {})
        # Read once so the observation and the StepResult cannot disagree.
        reward = payload.get("reward")
        done = payload.get("done", False)

        observation = Tbench2Observation(
            instruction=obs_data.get("instruction", ""),
            output=obs_data.get("output", ""),
            success=obs_data.get("success", True),
            error=obs_data.get("error", ""),
            task_id=obs_data.get("task_id", ""),
            task_path=obs_data.get("task_path", ""),
            session_id=obs_data.get("session_id"),
            action_type=obs_data.get("action_type", ""),
            info=obs_data.get("info", {}),
            reward=reward,
            done=done,
            metadata=obs_data.get("metadata", {}),
        )
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: dict[str, Any]) -> Tbench2State:
        """Build a ``Tbench2State`` from a server state response.

        Args:
            payload: Decoded response body with the state fields at top level.

        Returns:
            The parsed state. Missing keys fall back to the defaults below.
        """
        return Tbench2State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
            task_id=payload.get("task_id", ""),
            task_path=payload.get("task_path", ""),
            # Tbench2State declares session_id as a plain str, so a missing
            # or null value is normalised to the model's default of "".
            session_id=payload.get("session_id") or "",
            terminal_ready=payload.get("terminal_ready", False),
            last_action_type=payload.get("last_action_type", ""),
            last_command=payload.get("last_command", ""),
            last_output=payload.get("last_output", ""),
        )
models.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the TB2 environment.
9
+ """
10
+
11
+ from pydantic import Field
12
+
13
+
14
+ # Support both in-repo and standalone imports
15
+ try:
16
+ # In-repo imports (when running from OpenEnv repository)
17
+ from openenv.core.env_server.types import Action, Observation, State
18
+ except ImportError:
19
+ # Standalone imports (when environment is standalone with openenv from pip)
20
+ from openenv.core.env_server.types import Action, Observation, State
21
+
22
+
23
class Tbench2Action(Action):
    """Action for interacting with a TB2 task session.

    Every field has a default, so callers only set the fields that matter
    for the chosen ``action_type``.
    """

    # Which operation to perform; a plain command execution by default.
    action_type: str = "exec"
    # Shell command or input text to send.
    command: str = ""
    # Identifier of the session the action targets, if any.
    session_id: str | None = None
    # Whether to block until the command completes.
    block: bool = True
    # How long to wait, used by the "wait" action.
    wait_seconds: float | None = None
    # Target path and payload, used by the "write_file" action.
    file_path: str = ""
    content: str = ""
33
+
34
+
35
class Tbench2Observation(Observation):
    """Observation returned from the TB2 environment after a reset or step."""

    # Task instruction/prompt.
    instruction: str = ""
    # Output produced by the action.
    output: str = ""
    # Outcome flag plus the error text that accompanies a failure.
    success: bool = True
    error: str = ""
    # Which task this observation belongs to and where it lives on disk.
    task_id: str = ""
    task_path: str = ""
    # Session the action ran against, if any.
    session_id: str | None = None
    # The action type that produced this observation.
    action_type: str = ""
    # Free-form extra metadata; the factory gives each instance its own dict.
    info: dict = Field(default_factory=dict)
47
+
48
+
49
class Tbench2State(State):
    """Server-side bookkeeping for a TB2 task session."""

    # Current task and its directory.
    task_id: str = ""
    task_path: str = ""
    # Active session identifier.
    session_id: str = ""
    # Whether the terminal is ready to accept commands.
    terminal_ready: bool = False
    # Most recent action type, command, and output.
    last_action_type: str = ""
    last_command: str = ""
    last_output: str = ""
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: tbench2
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
pyproject.toml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-tbench2_env"
13
+ version = "0.1.0"
14
+ description = "Tbench2 Env environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ "openenv-core @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "pytest>=8.4.0",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ "camel-ai",
24
+ # Docker-backed mode (optional, for full TB2.0 compatibility)
25
+ "docker>=7.0.0",
26
+ # TOML parsing (tomllib for Python 3.11+, tomli for older versions)
27
+ "tomli>=2.0.0; python_version < '3.11'",
28
+ # YAML parsing (for SETA dataset task.yaml format)
29
+ "pyyaml>=6.0.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=8.0.0",
35
+ "pytest-cov>=4.0.0",
36
+ ]
37
+
38
+ [project.scripts]
39
+ # Server entry point - enables running via: uv run --project . server
40
+ # or: python -m tbench2_env.server.app
41
+ server = "tbench2_env.server.app:main"
42
+
43
+ [tool.setuptools]
44
+ include-package-data = true
45
+ packages = ["tbench2_env", "tbench2_env.server"]
46
+ package-dir = { "tbench2_env" = ".", "tbench2_env.server" = "server" }
server/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Tbench2 Env environment server components."""
8
+
9
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
10
+
11
+
12
+ __all__ = ["Tbench2Environment", "Tbench2DockerEnvironment"]
server/app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Tbench2 Env Environment.
9
+
10
+ This module creates an HTTP server that exposes the Tbench2Environment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ import os
32
+
33
+
34
+ try:
35
+ from openenv.core.env_server.http_server import create_app
36
+
37
+ # In-repo imports
38
+ from tbench2_env.models import Tbench2Action, Tbench2Observation
39
+
40
+ from .tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
41
+ except Exception as e: # pragma: no cover
42
+ # Standalone imports (when environment is standalone with openenv from pip)
43
+ from openenv.core.env_server.http_server import create_app
44
+ from server.tbench2_env_environment import Tbench2DockerEnvironment, Tbench2Environment
45
+
46
+ from models import Tbench2Action, Tbench2Observation
47
+
48
+ _IMPORT_ERROR = e
49
+
50
+
51
# TB2_MODE (case-insensitive, default "local") selects the environment class.
_TB2_MODE = os.getenv("TB2_MODE", "local").lower()

# Only "docker" switches to the Docker-backed environment. "auto" gets its own
# label but currently resolves to the local environment, as does any other value.
_DEFAULT_ENVIRONMENT = Tbench2DockerEnvironment if _TB2_MODE == "docker" else Tbench2Environment
_ENV_SUFFIX = {
    "docker": " (Docker mode)",
    "auto": " (auto-detect mode)",
}.get(_TB2_MODE, " (local mode)")


# Build the FastAPI application around the selected environment class.
app = create_app(
    _DEFAULT_ENVIRONMENT,
    Tbench2Action,
    Tbench2Observation,
    env_name="tbench2_env" + _ENV_SUFFIX,
    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
)
74
+
75
+
76
def main(host: str = "0.0.0.0", port: int = 8000):
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m tbench2_env.server.app

    Command-line flags are parsed here rather than in the ``__main__`` guard,
    because the ``server`` console script calls ``main()`` directly and would
    otherwise ignore ``--port``. ``--host`` and ``--port`` given on the
    command line override the ``host`` and ``port`` arguments.

    Args:
        host: Host address to bind to when ``--host`` is not given
            (default: "0.0.0.0")
        port: Port number to listen on when ``--port`` is not given
            (default: 8000)

    For production deployments, consider using uvicorn directly with
    multiple workers:
        uvicorn tbench2_env.server.app:app --workers 4
    """
    import argparse

    import uvicorn

    parser = argparse.ArgumentParser(description="Run the Tbench2 Env server.")
    parser.add_argument("--host", default=host)
    parser.add_argument("--port", type=int, default=port)
    # parse_known_args keeps programmatic callers working when sys.argv
    # contains arguments that belong to the embedding program.
    args, _unknown = parser.parse_known_args()

    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
server/tbench2_env_environment.py ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """TB2 environment server implementation (Spaces-compatible local mode)."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import os
13
+ import sys
14
+ import urllib.request
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any
18
+ from uuid import uuid4
19
+
20
+
21
+ if sys.version_info >= (3, 11):
22
+ import tomllib
23
+ else:
24
+ import tomli as tomllib
25
+
26
+ from openenv.core.env_server.interfaces import Environment
27
+
28
+
29
+ # Support both in-repo and standalone imports
30
+ try:
31
+ # In-repo imports (when running from OpenEnv repository)
32
+ from tbench2_env.models import Tbench2Action, Tbench2Observation, Tbench2State
33
+ except ImportError:
34
+ # Standalone imports (when environment is standalone with openenv from pip)
35
+ from models import Tbench2Action, Tbench2Observation, Tbench2State
36
+
37
+ _CAMEL_IMPORT_ERROR: Exception | None = None
38
+
39
+
40
+ def _require_terminal_toolkit() -> Any:
41
+ global _CAMEL_IMPORT_ERROR
42
+ if _CAMEL_IMPORT_ERROR is not None:
43
+ raise RuntimeError(
44
+ "camel-ai (TerminalToolkit) is required for TB2. Install from PyPI or from the CAMEL repo."
45
+ ) from _CAMEL_IMPORT_ERROR
46
+
47
+ try:
48
+ from camel.toolkits import TerminalToolkit
49
+ except Exception as exc: # pragma: no cover
50
+ _CAMEL_IMPORT_ERROR = exc
51
+ raise RuntimeError(
52
+ "camel-ai (TerminalToolkit) is required for TB2. Install from PyPI or from the CAMEL repo."
53
+ ) from exc
54
+
55
+ return TerminalToolkit
56
+
57
+
58
+ def _download_tb2_repo(cache_dir: Path) -> Path:
59
+ repo_url = os.getenv(
60
+ "TB2_REPO_URL",
61
+ "https://github.com/laude-institute/terminal-bench-2/archive/refs/heads/main.zip",
62
+ )
63
+ cache_dir.mkdir(parents=True, exist_ok=True)
64
+ archive_path = cache_dir / "terminal-bench-2.zip"
65
+
66
+ if not archive_path.exists():
67
+ urllib.request.urlretrieve(repo_url, archive_path)
68
+
69
+ with zipfile.ZipFile(archive_path) as zf:
70
+ root = zf.namelist()[0].split("/")[0]
71
+ extract_dir = cache_dir / root
72
+ if not extract_dir.exists():
73
+ zf.extractall(cache_dir)
74
+
75
+ return extract_dir
76
+
77
+
78
+ def _read_instruction(task_dir: Path) -> str:
79
+ """Read task instruction from instruction.md or task.yaml (SETA format)."""
80
+ # Try instruction.md first (Terminal-Bench-2 format)
81
+ instruction_path = task_dir / "instruction.md"
82
+ if instruction_path.exists():
83
+ return instruction_path.read_text(encoding="utf-8")
84
+
85
+ # Try task.yaml (SETA dataset format)
86
+ # Source: https://huggingface.co/datasets/camel-ai/seta-env
87
+ task_yaml_path = task_dir / "task.yaml"
88
+ if task_yaml_path.exists():
89
+ try:
90
+ import yaml
91
+
92
+ data = yaml.safe_load(task_yaml_path.read_text(encoding="utf-8"))
93
+ if isinstance(data, dict) and "instruction" in data:
94
+ return data["instruction"]
95
+ except Exception:
96
+ pass
97
+
98
+ return ""
99
+
100
+
101
+ def _read_timeout(task_dir: Path, fallback: float) -> float:
102
+ task_toml = task_dir / "task.toml"
103
+ if not task_toml.exists():
104
+ return fallback
105
+ try:
106
+ data = tomllib.loads(task_toml.read_text(encoding="utf-8"))
107
+ except Exception:
108
+ return fallback
109
+ verifier = data.get("verifier", {})
110
+ return float(verifier.get("timeout_sec", fallback))
111
+
112
+
113
+ class Tbench2Environment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
114
+ """OpenEnv wrapper around Terminal-Bench 2 tasks (local execution)."""
115
+
116
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
117
+
118
+ def __init__(
119
+ self,
120
+ tasks_dir: str | None = None,
121
+ output_dir: str | None = None,
122
+ command_timeout_s: float = 20.0,
123
+ safe_mode: bool = False,
124
+ ) -> None:
125
+ super().__init__()
126
+ self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
127
+ self.output_dir = Path(output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs"))
128
+ self.command_timeout_s = command_timeout_s
129
+ self.safe_mode = safe_mode
130
+
131
+ self._state = Tbench2State()
132
+ self._task_dir: Path | None = None
133
+ self._terminal_toolkit = None
134
+ self._instruction = ""
135
+
136
+ def reset(
137
+ self,
138
+ seed: int | None = None,
139
+ episode_id: str | None = None,
140
+ **kwargs: Any,
141
+ ) -> Tbench2Observation:
142
+ del seed
143
+
144
+ TerminalToolkit = _require_terminal_toolkit()
145
+
146
+ task_id = kwargs.get("task_id") or kwargs.get("task_name")
147
+ task_path = kwargs.get("task_path") or kwargs.get("path")
148
+
149
+ task_dir = self._resolve_task_path(task_id, task_path)
150
+ resolved_task_id = task_id or task_dir.name
151
+
152
+ self._instruction = _read_instruction(task_dir)
153
+ self._task_dir = task_dir
154
+
155
+ trial_name = f"{resolved_task_id}.{episode_id or uuid4().hex}"
156
+ session_logs_dir = self.output_dir / trial_name / "terminal_toolkit_session_logs"
157
+ session_logs_dir.mkdir(parents=True, exist_ok=True)
158
+
159
+ self._terminal_toolkit = TerminalToolkit(
160
+ timeout=self.command_timeout_s,
161
+ working_directory=str(task_dir),
162
+ use_docker_backend=False,
163
+ session_logs_dir=session_logs_dir,
164
+ safe_mode=self.safe_mode,
165
+ )
166
+
167
+ self._state = Tbench2State(
168
+ episode_id=episode_id or str(uuid4()),
169
+ step_count=0,
170
+ task_id=resolved_task_id,
171
+ task_path=str(task_dir),
172
+ terminal_ready=True,
173
+ )
174
+
175
+ return Tbench2Observation(
176
+ instruction=self._instruction,
177
+ output="",
178
+ success=True,
179
+ error="",
180
+ task_id=resolved_task_id,
181
+ task_path=str(task_dir),
182
+ session_id=None,
183
+ action_type="reset",
184
+ info={},
185
+ reward=0.0,
186
+ done=False,
187
+ )
188
+
189
+ def step(
190
+ self,
191
+ action: Tbench2Action,
192
+ timeout_s: float | None = None,
193
+ **kwargs: Any,
194
+ ) -> Tbench2Observation:
195
+ del timeout_s, kwargs
196
+
197
+ if not isinstance(action, Tbench2Action):
198
+ raise TypeError(f"Expected Tbench2Action, got {type(action)}")
199
+
200
+ if self._terminal_toolkit is None or self._task_dir is None:
201
+ raise RuntimeError("TB2 environment not initialized. Call reset() first.")
202
+
203
+ self._state.step_count += 1
204
+ self._state.last_action_type = action.action_type
205
+ self._state.last_command = action.command
206
+
207
+ output = ""
208
+ error = ""
209
+ success = True
210
+ reward = None
211
+ done = False
212
+ info: dict[str, Any] = {}
213
+ session_id = action.session_id or "tb2-session"
214
+
215
+ try:
216
+ if action.action_type == "exec":
217
+ output = self._terminal_toolkit.shell_exec(
218
+ command=action.command,
219
+ block=action.block,
220
+ id=session_id,
221
+ )
222
+ elif action.action_type == "write":
223
+ self._ensure_session_id(action.session_id, action.action_type)
224
+ output = self._terminal_toolkit.shell_write_to_process(
225
+ id=action.session_id,
226
+ command=action.command,
227
+ )
228
+ elif action.action_type == "view":
229
+ self._ensure_session_id(action.session_id, action.action_type)
230
+ output = self._terminal_toolkit.shell_view(id=action.session_id)
231
+ elif action.action_type == "wait":
232
+ self._ensure_session_id(action.session_id, action.action_type)
233
+ wait_seconds = action.wait_seconds or 0.0
234
+ output = self._terminal_toolkit.shell_wait(
235
+ id=action.session_id,
236
+ wait_seconds=wait_seconds,
237
+ )
238
+ elif action.action_type == "kill":
239
+ self._ensure_session_id(action.session_id, action.action_type)
240
+ self._terminal_toolkit.shell_kill_process(id=action.session_id)
241
+ output = f"Killed session {action.session_id}"
242
+ elif action.action_type == "write_file":
243
+ self._terminal_toolkit.shell_write_content_to_file(
244
+ content=action.content,
245
+ file_path=action.file_path,
246
+ )
247
+ output = f"Wrote content to {action.file_path}"
248
+ elif action.action_type == "evaluate":
249
+ output, reward, info = self._evaluate_task()
250
+ done = True
251
+ elif action.action_type == "close":
252
+ self.close()
253
+ output = "Closed TB2 environment."
254
+ done = True
255
+ else:
256
+ raise ValueError(f"Unsupported action_type: {action.action_type}")
257
+ except Exception as exc: # pragma: no cover
258
+ success = False
259
+ error = str(exc)
260
+
261
+ self._state.last_output = output
262
+ self._state.session_id = session_id or ""
263
+
264
+ return Tbench2Observation(
265
+ instruction=self._instruction,
266
+ output=output,
267
+ success=success,
268
+ error=error,
269
+ task_id=self._state.task_id,
270
+ task_path=self._state.task_path,
271
+ session_id=session_id or "",
272
+ action_type=action.action_type,
273
+ info=info,
274
+ reward=reward,
275
+ done=done,
276
+ )
277
+
278
+ @property
279
+ def state(self) -> Tbench2State:
280
+ return self._state
281
+
282
+ def close(self) -> None:
283
+ self._terminal_toolkit = None
284
+ self._task_dir = None
285
+ self._instruction = ""
286
+
287
+ def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
288
+ if task_path:
289
+ resolved = Path(task_path).expanduser().resolve()
290
+ if not resolved.exists():
291
+ raise FileNotFoundError(f"Task path not found: {resolved}")
292
+ return resolved
293
+
294
+ if not task_id:
295
+ raise ValueError("Provide task_id or task_path to reset TB2 environment.")
296
+
297
+ if not self.tasks_dir:
298
+ cache_dir = Path(os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache")))
299
+ repo_dir = _download_tb2_repo(cache_dir)
300
+ resolved = repo_dir / task_id
301
+ else:
302
+ resolved = Path(self.tasks_dir).expanduser().resolve() / task_id
303
+
304
+ if not resolved.exists():
305
+ raise FileNotFoundError(f"Task path not found: {resolved}")
306
+ return resolved
307
+
308
+ def _ensure_session_id(self, session_id: str | None, action_type: str) -> None:
309
+ if not session_id:
310
+ raise ValueError(f"session_id is required for action_type='{action_type}'")
311
+
312
+ def _evaluate_task(self) -> tuple[str, float, dict[str, Any]]:
313
+ if self._task_dir is None:
314
+ raise RuntimeError("TB2 environment not initialized. Call reset() first.")
315
+ if self._terminal_toolkit is None:
316
+ raise RuntimeError("Terminal toolkit not initialized.")
317
+
318
+ _read_timeout(self._task_dir, fallback=900.0) # Validate timeout config
319
+
320
+ # Determine evaluation method based on task format
321
+ run_tests_sh = self._task_dir / "run-tests.sh"
322
+ tests_dir = self._task_dir / "tests"
323
+
324
+ if run_tests_sh.exists():
325
+ # SETA format: use run-tests.sh
326
+ # Source: https://huggingface.co/datasets/camel-ai/seta-env
327
+ cmd = f"cd {self._task_dir} && bash run-tests.sh; echo __TB2_EXIT_CODE__:$?"
328
+ elif tests_dir.exists():
329
+ # Terminal-Bench-2 format: use pytest
330
+ cmd = f"cd {self._task_dir} && python -m pytest -q {tests_dir} -rA; echo __TB2_EXIT_CODE__:$?"
331
+ else:
332
+ # No tests found
333
+ return "No tests found (neither run-tests.sh nor tests/ directory)", 0.0, {"tests_passed": False, "exit_code": -1}
334
+
335
+ output = self._terminal_toolkit.shell_exec(
336
+ id="tb2-tests",
337
+ command=cmd,
338
+ block=True,
339
+ )
340
+
341
+ exit_code = 1
342
+ marker = "__TB2_EXIT_CODE__"
343
+ for line in output.splitlines()[::-1]:
344
+ if marker in line:
345
+ try:
346
+ exit_code = int(line.split(":", 1)[1].strip())
347
+ except Exception:
348
+ exit_code = 1
349
+ break
350
+
351
+ reward = 1.0 if exit_code == 0 else 0.0
352
+ info = {"tests_passed": exit_code == 0, "exit_code": exit_code}
353
+ return output, reward, info
354
+
355
+
356
class Tbench2DockerEnvironment(Environment[Tbench2Action, Tbench2Observation, Tbench2State]):
    """OpenEnv wrapper around Terminal-Bench 2 tasks with Docker isolation.

    This environment runs each task in its own Docker container, reading
    the image specification from task.toml's [environment] section. Tasks
    without a ``docker_image`` fall back to running commands locally inside
    the task directory.

    Requires:
        - Docker socket mounted (/var/run/docker.sock)
        - Sufficient disk space for container images
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    # Echoed after the test command so its exit status can be recovered from
    # the captured output.
    _EXIT_CODE_MARKER = "__TB2_EXIT_CODE__"
    # Directory inside the container that receives a copy of the task files.
    _CONTAINER_TASK_DIR = "/task"

    def __init__(
        self,
        tasks_dir: str | None = None,
        output_dir: str | None = None,
        command_timeout_s: float = 300.0,
        safe_mode: bool = True,
    ) -> None:
        """Configure the environment; no Docker resources are created here.

        Args:
            tasks_dir: Directory containing task folders. Falls back to the
                ``TB2_TASKS_DIR`` environment variable.
            output_dir: Directory for per-trial output. Falls back to
                ``TB2_OUTPUT_DIR`` and then ``/tmp/tbench2_env_runs``.
            command_timeout_s: Timeout applied to locally executed commands.
            safe_mode: Stored on the instance; not consulted by this class.
        """
        super().__init__()
        self.tasks_dir = tasks_dir or os.getenv("TB2_TASKS_DIR", "")
        self.output_dir = Path(output_dir or os.getenv("TB2_OUTPUT_DIR", "/tmp/tbench2_env_runs"))
        self.command_timeout_s = command_timeout_s
        self.safe_mode = safe_mode

        self._state = Tbench2State()
        self._task_dir: Path | None = None
        self._docker_client = None
        self._container = None
        self._instruction = ""
        self._task_image = ""
        self._task_config: dict[str, Any] = {}

    def _get_docker_client(self) -> Any:
        """Return the Docker client, creating it on first use.

        Raises:
            RuntimeError: If the ``docker`` package or daemon is unavailable.
        """
        if self._docker_client is None:
            try:
                import docker

                self._docker_client = docker.from_env()
            except Exception as exc:
                raise RuntimeError(
                    f"Docker client not available. Ensure Docker socket is mounted. Error: {exc}"
                ) from exc
        return self._docker_client

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Start a new episode, launching a task container when one is configured.

        Args:
            seed: Accepted for interface compatibility; not used.
            episode_id: Identifier for the episode. A random UUID is generated
                when omitted.
            **kwargs: ``task_id`` (alias ``task_name``) and/or ``task_path``
                (alias ``path``) selecting the task to load.

        Returns:
            The initial observation; ``info`` carries ``docker_image`` when
            the task specifies one.

        Raises:
            ValueError: If neither a task id nor a task path is provided.
            FileNotFoundError: If the resolved task directory does not exist.
            RuntimeError: If the task container cannot be started.
        """
        del seed

        task_id = kwargs.get("task_id") or kwargs.get("task_name")
        task_path = kwargs.get("task_path") or kwargs.get("path")

        task_dir = self._resolve_task_path(task_id, task_path)
        resolved_task_id = task_id or task_dir.name
        # One id for both the trial directory and the state so they correlate.
        resolved_episode_id = episode_id or str(uuid4())

        # Read task configuration including Docker image
        task_toml_path = task_dir / "task.toml"
        if task_toml_path.exists():
            task_config = tomllib.loads(task_toml_path.read_text(encoding="utf-8"))
            task_image = task_config.get("environment", {}).get("docker_image", "")
        else:
            task_config = {}
            task_image = ""
        instruction = _read_instruction(task_dir)

        # Everything that can fail cheaply has succeeded; now release the
        # previous episode's container so repeated resets do not leak it.
        self.close()

        self._task_config = task_config
        self._task_image = task_image

        # Create trial directory for logs
        trial_dir = self.output_dir / f"{resolved_task_id}.{resolved_episode_id}"
        trial_dir.mkdir(parents=True, exist_ok=True)

        if task_image:
            self._start_container(task_dir, trial_dir)

        # Mark the environment as initialized only after the container (if
        # any) is up. Otherwise a failed start would leave step() silently
        # running commands on the host for a task that requires isolation.
        self._instruction = instruction
        self._task_dir = task_dir
        self._state = Tbench2State(
            episode_id=resolved_episode_id,
            step_count=0,
            task_id=resolved_task_id,
            task_path=str(task_dir),
            terminal_ready=True,
        )

        return Tbench2Observation(
            instruction=self._instruction,
            output="",
            success=True,
            error="",
            task_id=resolved_task_id,
            task_path=str(task_dir),
            session_id=None,
            action_type="reset",
            info={"docker_image": self._task_image} if self._task_image else {},
            reward=0.0,
            done=False,
        )

    def _start_container(self, task_dir: Path, trial_dir: Path) -> None:
        """Start a Docker container for the task.

        Uses file copying instead of bind mounts to support Docker-in-Docker
        scenarios where the server runs inside a container. Bind mounts reference
        host paths, which don't exist when the server is containerized.

        Args:
            task_dir: Local task directory copied into the container.
            trial_dir: Per-trial output directory (currently unused here).

        Raises:
            RuntimeError: If the image cannot be pulled, the container cannot
                be started, or the task files cannot be copied.
        """
        del trial_dir
        docker = self._get_docker_client()

        try:
            # Pull image if needed
            try:
                docker.images.get(self._task_image)
            except Exception:
                logging.info("Pulling image %s...", self._task_image)
                docker.images.pull(self._task_image)

            # Start container WITHOUT bind mounts (for DinD compatibility)
            self._container = docker.containers.run(
                image=self._task_image,
                command="sleep infinity",
                detach=True,
                network_mode="host",
                working_dir=self._CONTAINER_TASK_DIR,
                remove=False,
            )

            # Copy task files into container using tar archive
            # This works in Docker-in-Docker because we read files from our
            # filesystem and stream them to the container via the Docker API
            self._copy_dir_to_container(task_dir, self._CONTAINER_TASK_DIR)

        except Exception as exc:
            # Do not leave a half-initialized container running.
            self._remove_container()
            raise RuntimeError(f"Failed to start container: {exc}") from exc

    def _copy_dir_to_container(self, src_dir: Path, dest_path: str) -> None:
        """Copy the contents of a directory into the container using a tar archive.

        This method streams files via the Docker API, avoiding bind mount
        issues in Docker-in-Docker scenarios.

        Raises:
            RuntimeError: If no container is running or the upload is rejected.
        """
        import io
        import tarfile

        if self._container is None:
            raise RuntimeError("Container not started")

        # Create tar archive in memory. Only top-level entries are added
        # explicitly; tar.add() recurses into directories on its own, so
        # walking every nested path as well would add files more than once.
        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            for child in sorted(src_dir.iterdir()):
                tar.add(str(child), arcname=child.name)

        # Copy to container
        if not self._container.put_archive(dest_path, tar_stream.getvalue()):
            raise RuntimeError(f"Failed to copy {src_dir} into container at {dest_path}")

    def _write_file_to_container(self, file_path: str, content: str) -> None:
        """Write ``content`` verbatim to ``file_path`` inside the container.

        Relative paths are resolved against the container task directory. The
        file is uploaded as a one-entry tar archive so the content never
        passes through a shell (no quoting or expansion issues).
        """
        import io
        import posixpath
        import tarfile
        import time

        if self._container is None:
            raise RuntimeError("Container not started. Call reset() first.")

        target = posixpath.normpath(posixpath.join(self._CONTAINER_TASK_DIR, file_path))
        parent, name = posixpath.split(target)
        if not name:
            raise ValueError(f"Invalid file_path: {file_path!r}")

        payload = content.encode("utf-8")
        entry = tarfile.TarInfo(name=name)
        entry.size = len(payload)
        entry.mtime = int(time.time())
        entry.mode = 0o644

        tar_stream = io.BytesIO()
        with tarfile.open(fileobj=tar_stream, mode="w") as tar:
            tar.addfile(entry, io.BytesIO(payload))

        if not self._container.put_archive(parent, tar_stream.getvalue()):
            raise RuntimeError(f"Failed to write {target} in container")

    def _exec_in_container(self, command: str, workdir: str = "/task") -> tuple[int, str]:
        """Execute a shell command inside the container.

        Args:
            command: Command line interpreted by ``bash -c``.
            workdir: Working directory inside the container.

        Returns:
            ``(exit_code, combined stdout/stderr decoded as UTF-8)``.

        Raises:
            RuntimeError: If no container is running.
        """
        if self._container is None:
            raise RuntimeError("Container not started. Call reset() first.")

        # Pass argv as a list so the command reaches bash untouched; building
        # "bash -c '<command>'" as one string breaks on any single quote.
        exit_code, output = self._container.exec_run(
            cmd=["bash", "-c", command],
            workdir=workdir,
            stdout=True,
            stderr=True,
        )
        return exit_code, (output or b"").decode("utf-8", errors="replace")

    def step(
        self,
        action: Tbench2Action,
        timeout_s: float | None = None,
        **kwargs: Any,
    ) -> Tbench2Observation:
        """Apply one action, inside the container when one is running.

        Supported action types are ``exec``, ``write_file``, ``evaluate`` and
        ``close``. Errors raised while handling the action are reported via
        ``success=False`` and ``error`` rather than propagated.

        Args:
            action: The action to perform.
            timeout_s: Accepted for interface compatibility; not used.
            **kwargs: Ignored.

        Raises:
            TypeError: If ``action`` is not a ``Tbench2Action``.
            RuntimeError: If ``reset()`` has not been called.
        """
        del timeout_s, kwargs

        if not isinstance(action, Tbench2Action):
            raise TypeError(f"Expected Tbench2Action, got {type(action)}")

        if self._task_dir is None:
            raise RuntimeError("TB2 environment not initialized. Call reset() first.")

        self._state.step_count += 1
        self._state.last_action_type = action.action_type
        self._state.last_command = action.command

        output = ""
        error = ""
        success = True
        reward = None
        done = False
        info: dict[str, Any] = {}
        session_id = action.session_id or "tb2-session"

        try:
            if action.action_type == "exec":
                if self._container is not None:
                    exit_code, output = self._exec_in_container(action.command)
                    success = exit_code == 0
                else:
                    # Fallback to local execution, run from the task directory
                    # so it matches container mode and local evaluation.
                    import subprocess

                    result = subprocess.run(
                        action.command,
                        shell=True,
                        capture_output=True,
                        text=True,
                        timeout=self.command_timeout_s,
                        cwd=str(self._task_dir),
                    )
                    output = result.stdout + result.stderr
                    success = result.returncode == 0

            elif action.action_type == "write_file":
                if self._container is not None:
                    self._write_file_to_container(action.file_path, action.content)
                else:
                    # Local write; relative paths are anchored at the task
                    # directory (absolute paths are used as given).
                    target = self._task_dir / action.file_path
                    target.write_text(action.content, encoding="utf-8")
                output = f"Wrote to {action.file_path}"

            elif action.action_type == "evaluate":
                if self._container is not None:
                    output, reward, info = self._evaluate_docker()
                else:
                    output, reward, info = self._evaluate_local()
                done = True

            elif action.action_type == "close":
                self.close()
                output = "Closed TB2 environment."
                done = True

            else:
                raise ValueError(f"Unsupported action_type in Docker mode: {action.action_type}")

        except Exception as exc:
            success = False
            error = str(exc)

        self._state.last_output = output
        self._state.session_id = session_id or ""

        return Tbench2Observation(
            instruction=self._instruction,
            output=output,
            success=success,
            error=error,
            task_id=self._state.task_id,
            task_path=self._state.task_path,
            session_id=session_id or "",
            action_type=action.action_type,
            info=info,
            reward=reward,
            done=done,
        )

    def _build_test_command(self, task_root: str) -> str | None:
        """Return the shell command that runs the task's tests, or ``None``.

        Args:
            task_root: Directory holding the task files in the environment
                where the command will run (container path or local path).

        ``run-tests.sh`` is preferred; otherwise ``tests/`` is run with
        pytest. The command always ends by echoing the exit-code marker.
        """
        import shlex

        if self._task_dir is None:
            return None
        if (self._task_dir / "run-tests.sh").exists():
            runner = "bash run-tests.sh"
        elif (self._task_dir / "tests").exists():
            runner = "python -m pytest -q tests -rA"
        else:
            return None
        return f"cd {shlex.quote(task_root)} && {runner}; echo {self._EXIT_CODE_MARKER}:$?"

    @classmethod
    def _parse_exit_code(cls, output: str) -> int:
        """Extract the exit status echoed after the marker; ``1`` if absent or unparsable."""
        for line in reversed(output.splitlines()):
            if cls._EXIT_CODE_MARKER in line:
                tail = line.rpartition(cls._EXIT_CODE_MARKER)[2]
                try:
                    return int(tail.lstrip(":").strip())
                except ValueError:
                    return 1
        return 1

    @staticmethod
    def _score(output: str, exit_code: int) -> tuple[str, float, dict[str, Any]]:
        """Turn a test exit status into the ``(output, reward, info)`` triple."""
        passed = exit_code == 0
        return output, (1.0 if passed else 0.0), {"tests_passed": passed, "exit_code": exit_code}

    def _evaluate_docker(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task inside Docker container.

        Raises:
            RuntimeError: If no container is running or no task is loaded.
        """
        if self._container is None:
            raise RuntimeError("Container not started.")
        if self._task_dir is None:
            raise RuntimeError("Task directory not set")

        cmd = self._build_test_command(self._CONTAINER_TASK_DIR)
        if cmd is None:
            return self._score("No tests found (neither run-tests.sh nor tests/ directory)", -1)

        _, output = self._exec_in_container(cmd)
        return self._score(output, self._parse_exit_code(output))

    def _evaluate_local(self) -> tuple[str, float, dict[str, Any]]:
        """Evaluate task locally (fallback).

        Raises:
            RuntimeError: If no task is loaded.
        """
        if self._task_dir is None:
            raise RuntimeError("Task not initialized.")

        cmd = self._build_test_command(str(self._task_dir))
        if cmd is None:
            return self._score("No tests found (neither run-tests.sh nor tests/ directory)", -1)

        import subprocess

        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=900.0,
        )
        output = result.stdout + result.stderr
        # The command ends with ``echo``, so result.returncode is always the
        # echo's status (0). The real test status must be read from the marker.
        return self._score(output, self._parse_exit_code(output))

    @property
    def state(self) -> Tbench2State:
        """Return the current episode state object."""
        return self._state

    def _remove_container(self) -> None:
        """Best-effort stop and removal of the current container, if any."""
        container, self._container = self._container, None
        if container is None:
            return
        try:
            container.stop(timeout=10)
        except Exception as exc:
            logging.warning("Failed to stop task container: %s", exc)
        # Attempt removal even when stop failed; force=True kills if needed.
        try:
            container.remove(force=True)
        except Exception as exc:
            logging.warning("Failed to remove task container: %s", exc)

    def close(self) -> None:
        """Tear down the task container and clear the loaded task."""
        self._remove_container()
        self._task_dir = None
        self._instruction = ""

    def _resolve_task_path(self, task_id: str | None, task_path: str | None) -> Path:
        """Locate the directory of the requested task.

        An explicit ``task_path`` wins. Otherwise ``task_id`` is looked up
        under ``self.tasks_dir`` when set, or under the directory returned by
        ``_download_tb2_repo`` (cache location from ``TB2_CACHE_DIR``,
        defaulting to ``<output_dir>/repo_cache``).

        Raises:
            ValueError: If neither ``task_id`` nor ``task_path`` is given.
            FileNotFoundError: If the resulting path does not exist.
        """
        if task_path:
            resolved = Path(task_path).expanduser().resolve()
            if not resolved.exists():
                raise FileNotFoundError(f"Task path not found: {resolved}")
            return resolved

        if not task_id:
            raise ValueError("Provide task_id or task_path to reset TB2 environment.")

        if not self.tasks_dir:
            cache_dir = Path(os.getenv("TB2_CACHE_DIR", str(self.output_dir / "repo_cache")))
            repo_dir = _download_tb2_repo(cache_dir)
            resolved = repo_dir / task_id
        else:
            resolved = Path(self.tasks_dir).expanduser().resolve() / task_id

        if not resolved.exists():
            raise FileNotFoundError(f"Task path not found: {resolved}")
        return resolved