AdithyaSK HF Staff committed on
Commit
fd986d0
·
verified ·
1 Parent(s): 2ce722e

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# git is required for installing dependencies from VCS. curl and
# ca-certificates are required by the uv bootstrap step below, which downloads
# the installer over HTTPS when the base image does not already provide uv.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates curl git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=terminus_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass installs third-party dependencies only, so this layer can be
# reused when only the project sources change.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project itself into the virtual environment.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"
ENV ENABLE_WEB_INTERFACE=true

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=2).read()" || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,97 @@
1
  ---
2
- title: Terminus Env
3
- emoji: 🌖
4
  colorFrom: green
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
 
 
 
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Terminus Environment Server
3
+ emoji: 🛠️
4
  colorFrom: green
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ - terminus
13
+ - harbor
14
+ - coding
15
+ short_description: Single-tool Terminus-2 coding environment via Harbor
16
  ---
17
 
18
+ # Terminus Environment
19
+
20
+ `terminus_env` exposes Harbor's Terminus-2 software-engineering agent through
21
+ one MCP tool. Each call materializes a temporary Harbor task from the supplied
22
+ instruction, setup commands, and verification commands, then runs:
23
+
24
+ ```bash
25
+ harbor run --agent terminus-2 --model <model> --path <tasks> --task-name <task>
26
+ ```
27
+
28
+ The tool returns a JSON rollout result with the generated command, process exit
29
+ code, output tails, captured task files, reward if one is printed, and any
30
+ error.
31
+
32
+ ## Tool
33
+
34
+ - `run_rollout(...)`: create and run one Harbor Terminus-2 task.
35
+
36
+ Key arguments:
37
+
38
+ - `instruction`: coding task prompt.
39
+ - `model`: LiteLLM-style model name passed to Harbor.
40
+ - `setup`: shell commands baked into the generated task Dockerfile.
41
+ - `verify`: shell commands written to `tests/test.sh`.
42
+ - `api_base`: optional model API base URL.
43
+ - `api_key`: optional provider API key exported as common provider env vars.
44
+ - `env`: optional Harbor execution provider, for example `daytona`.
45
+ - `max_turns`, `temperature`, `collect_rollout_details`: Terminus-2 settings.
46
+
47
+ ## Quick Start
48
+
49
+ ```python
50
+ from terminus_env import TerminusEnv
51
+
52
+ with TerminusEnv(base_url="http://localhost:8000").sync() as env:
53
+ env.reset()
54
+ result = env.call_tool(
55
+ "run_rollout",
56
+ instruction="Fix the failing Flask endpoint and make tests pass.",
57
+ model="openai/gpt-4.1",
58
+ setup=["pip install -q flask pytest"],
59
+ verify=["pytest -q"],
60
+ max_turns=100,
61
+ )
62
+ print(result)
63
+ ```
64
+
65
+ ## Local Server
66
+
67
+ ```bash
68
+ cd envs/terminus_env
69
+ uv run --project . server
70
+ ```
71
+
72
+ The API and custom web UI are served on port 8000. The UI is mounted at `/web`.
73
+
74
+ ## Docker
75
+
76
+ ```bash
77
+ cd envs/terminus_env
78
+ openenv build -t terminus-env
79
+ docker run -p 8000:8000 terminus-env
80
+ ```
81
+
82
+ ## Configuration
83
+
84
+ - `MAX_CONCURRENT_ENVS`: maximum concurrent WebSocket sessions. Defaults to `4`.
85
+ - Provider-specific model keys can be passed per call with `api_key` or mounted
86
+ into the server environment.
87
+ - Harbor task execution may require an execution provider and provider secrets
88
+ depending on where the server is hosted. Pass the provider with `env`, for
89
+ example `env="daytona"`, and configure the corresponding Harbor credentials
90
+ in the host environment.
91
+
92
+ ## Notes
93
+
94
+ This environment intentionally follows a single-tool pattern: OpenEnv owns the
95
+ environment transport and Harbor owns the Terminus-2 rollout loop. It does not
96
+ introduce a shared OpenEnv sandbox abstraction; sandbox provider unification can
97
+ come later behind an RFC-backed API.
__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Terminus Environment for OpenEnv."""
8
+
9
+ from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
10
+
11
+ from .client import TerminusEnv
12
+ from .models import CommandResult, TerminusRolloutResult, TerminusState
13
+
14
+ __all__ = [
15
+ "TerminusEnv",
16
+ "TerminusState",
17
+ "TerminusRolloutResult",
18
+ "CommandResult",
19
+ "CallToolAction",
20
+ "ListToolsAction",
21
+ ]
client.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Client for the Terminus environment."""
8
+
9
+ from openenv.core.mcp_client import MCPToolClient
10
+
11
+
12
class TerminusEnv(MCPToolClient):
    """Client handle for the Terminus single-rollout MCP tool.

    The class adds nothing of its own: every attribute and method comes from
    ``MCPToolClient``. It exists to give the Terminus environment its own
    client type name.
    """
models.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Models for the Terminus environment."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from openenv.core.env_server.types import State
14
+ from pydantic import BaseModel, Field
15
+
16
+
17
class CommandResult(BaseModel):
    """Result record for a single setup or verification shell command."""

    # The command text; the only field without a default.
    command: str
    # Process exit status; stays ``None`` when no status was recorded.
    exit_code: int | None = Field(default=None)
    # Captured output streams; both default to the empty string.
    stdout: str = Field(default="")
    stderr: str = Field(default="")
24
+
25
+
26
class TerminusRolloutResult(BaseModel):
    """Payload serialized to JSON and returned by the ``run_rollout`` MCP tool."""

    # Identifier of the rollout (caller-supplied or generated by the server).
    task_id: str = Field(default="")
    # Reward parsed from the Harbor output; ``None`` when none was found.
    reward: float | None = Field(default=None)
    done: bool = Field(default=True)
    # Return code of the Harbor process; ``None`` if it never finished.
    exit_code: int | None = Field(default=None)
    # Argument vector of the Harbor invocation.
    command: list[str] = Field(default_factory=list)
    # Temporary directory in which the Harbor task was generated.
    task_dir: str = Field(default="")
    # Echo of the setup / verification commands given to the tool.
    setup: list[str] = Field(default_factory=list)
    verify: list[str] = Field(default_factory=list)
    # Trailing portion of the captured process output streams.
    stdout_tail: str = Field(default="")
    stderr_tail: str = Field(default="")
    # Relative path -> text content of small files found in the task directory.
    files: dict[str, str] = Field(default_factory=dict)
    # Human-readable failure description; ``None`` on success.
    error: str | None = Field(default=None)
    metadata: dict[str, Any] = Field(default_factory=dict)
42
+
43
+
44
class TerminusState(State):
    """Session-scoped bookkeeping for Terminus rollout calls.

    The ``last_*`` fields mirror the most recently recorded rollout result.
    """

    # Number of rollout results recorded in this session.
    rollouts_completed: int = Field(default=0)
    last_task_id: str | None = Field(default=None)
    last_reward: float | None = Field(default=None)
    last_exit_code: int | None = Field(default=None)
    last_error: str | None = Field(default=None)
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: terminus_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
pyproject.toml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-terminus-env"
13
+ version = "0.1.0"
14
+ description = "Single-tool Terminus-2 coding environment for OpenEnv via Harbor"
15
+ requires-python = ">=3.12"
16
+ dependencies = [
17
+ "openenv-core[core]>=0.2.2",
18
+ "fastapi>=0.115.0",
19
+ "fastmcp>=3.0.0",
20
+ "gradio>=4.0.0",
21
+ "harbor>=0.3.0",
22
+ "pydantic>=2.0.0",
23
+ "requests>=2.31.0",
24
+ "uvicorn>=0.24.0",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "pytest>=8.0.0",
30
+ "pytest-cov>=4.0.0",
31
+ ]
32
+
33
+ [project.scripts]
34
+ server = "terminus_env.server.app:main"
35
+
36
+ [tool.setuptools]
37
+ include-package-data = true
38
+ packages = ["terminus_env", "terminus_env.server"]
39
+ package-dir = { "terminus_env" = ".", "terminus_env.server" = "server" }
40
+
41
+ [tool.setuptools.package-data]
42
+ terminus_env = ["**/*.txt", "**/*.yaml"]
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Terminus Env environment server components."""
8
+
9
+ from .terminus_env_environment import TerminusEnvironment
10
+
11
+ __all__ = ["TerminusEnvironment"]
server/app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """FastAPI app for the Terminus environment."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import os
13
+ from pathlib import Path
14
+ from typing import Any, Dict
15
+
16
+ from openenv.core.env_server.http_server import create_app
17
+ from openenv.core.env_server.mcp_types import CallToolAction, CallToolObservation
18
+ from pydantic import field_validator
19
+
20
+ try:
21
+ from .terminus_env_environment import TerminusEnvironment
22
+ except ImportError: # pragma: no cover
23
+ from server.terminus_env_environment import TerminusEnvironment # type: ignore
24
+
25
+
26
+ def _load_env_file() -> None:
27
+ candidate = Path(__file__).resolve().parents[1] / ".env"
28
+ if not candidate.exists():
29
+ return
30
+ for raw in candidate.read_text(encoding="utf-8").splitlines():
31
+ line = raw.strip()
32
+ if not line or line.startswith("#") or "=" not in line:
33
+ continue
34
+ key, _, value = line.partition("=")
35
+ key = key.strip()
36
+ value = value.strip().strip('"').strip("'")
37
+ if key and key not in os.environ:
38
+ os.environ[key] = value
39
+
40
+
41
class TerminusCallToolAction(CallToolAction):
    """``CallToolAction`` variant whose ``arguments`` may arrive JSON-encoded.

    The web UI can submit tool arguments as a JSON string; the validator below
    decodes such strings before the regular field validation runs.
    """

    @field_validator("arguments", mode="before")
    @classmethod
    def parse_arguments(cls, value: Any) -> Dict[str, Any]:
        """Decode ``value`` with ``json.loads`` if it is a string, else return it as-is."""
        return json.loads(value) if isinstance(value, str) else value
50
+
51
+
52
# Seed os.environ from the optional .env file before the app is constructed.
_load_env_file()
# Enable the web interface unless the environment already sets the variable.
os.environ.setdefault("ENABLE_WEB_INTERFACE", "true")

# ASGI application; the session cap is read from MAX_CONCURRENT_ENVS ("4" when unset).
app = create_app(
    TerminusEnvironment,
    TerminusCallToolAction,
    CallToolObservation,
    env_name="terminus_env",
    max_concurrent_envs=int(os.environ.get("MAX_CONCURRENT_ENVS", "4")),
)
62
+
63
+
64
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Start uvicorn serving the module-level ``app``.

    Args:
        host: Interface address to bind.
        port: TCP port to listen on.
    """
    # uvicorn is imported here, not at module level, so importing this module
    # to obtain ``app`` does not import uvicorn.
    from uvicorn import run as serve

    serve(app, host=host, port=port)


if __name__ == "__main__":
    main()
server/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ fastmcp>=3.0.0
4
+ gradio>=4.0.0
5
+ harbor>=0.3.0
6
+ pydantic>=2.0.0
7
+ requests>=2.31.0
8
+ uvicorn>=0.24.0
9
+
10
+
server/terminus_env_environment.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Terminus environment implementation.
8
+
9
+ The environment exposes one MCP tool, ``run_rollout``. The tool materializes a
10
+ temporary Harbor task, invokes ``harbor run --agent terminus-2`` on that task,
11
+ and returns a compact JSON rollout result.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import re
18
+ import shutil
19
+ import subprocess
20
+ import tempfile
21
+ from pathlib import Path
22
+ from typing import Any, Optional
23
+ from uuid import uuid4
24
+
25
+ from fastmcp import FastMCP
26
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
27
+ from openenv.core.env_server.types import Action, Observation
28
+
29
+ try:
30
+ from ..models import TerminusRolloutResult, TerminusState
31
+ except ImportError: # pragma: no cover
32
+ from models import TerminusRolloutResult, TerminusState
33
+
34
+
35
+ _TAIL_CHARS = 8000
36
+ _DEFAULT_TIMEOUT_S = 900
37
+
38
+
39
class TerminusEnvironment(MCPEnvironment):
    """Single-tool MCP wrapper around Harbor's Terminus-2 agent.

    The environment registers one MCP tool, ``run_rollout``, which writes a
    temporary Harbor task to disk, runs the ``harbor`` CLI on it, and returns
    a JSON-serialized ``TerminusRolloutResult``. Per-session bookkeeping is
    kept in a ``TerminusState`` instance.
    """

    SUPPORTS_CONCURRENT_SESSIONS = True

    def __init__(self):
        self._state = TerminusState(episode_id=str(uuid4()), step_count=0)

        mcp = FastMCP("terminus_env")

        @mcp.tool
        def run_rollout(
            instruction: str,
            model: str = "",
            api_base: str = "",
            api_key: str = "",
            setup: Optional[list[str]] = None,
            verify: Optional[list[str]] = None,
            task_id: str = "",
            env: str = "",
            max_turns: int = 100,
            temperature: float = 0.7,
            collect_rollout_details: bool = False,
            timeout_s: int = _DEFAULT_TIMEOUT_S,
            keep_task_dir: bool = False,
        ) -> str:
            """Run one Terminus-2 rollout on a generated Harbor task.

            Args:
                instruction: Task instruction for Terminus-2.
                model: LiteLLM-style model name passed to Harbor.
                api_base: Optional custom model API base URL.
                api_key: Optional API key exported for common providers.
                setup: Shell commands baked into the task environment.
                verify: Shell commands written into ``tests/test.sh``.
                task_id: Optional identifier for bookkeeping.
                env: Optional Harbor environment provider, e.g. ``daytona``.
                max_turns: Terminus-2 turn limit.
                temperature: Sampling temperature.
                collect_rollout_details: Ask Terminus-2 to collect rollout details.
                timeout_s: Wall-clock timeout for the Harbor process.
                keep_task_dir: Keep generated task files for debugging.

            Returns:
                JSON-serialized TerminusRolloutResult.
            """
            return self._run_rollout(
                instruction=instruction,
                model=model,
                api_base=api_base,
                api_key=api_key,
                setup=list(setup or []),
                verify=list(verify or []),
                task_id=task_id,
                env=env,
                max_turns=max_turns,
                temperature=temperature,
                collect_rollout_details=collect_rollout_details,
                timeout_s=timeout_s,
                keep_task_dir=keep_task_dir,
            ).model_dump_json()

        super().__init__(mcp)

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **_: Any,
    ) -> Observation:
        """Start a fresh episode and return a "ready" observation.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Episode identifier to use; a random UUID when omitted.

        Returns:
            An ``Observation`` with ``done=False`` and a status message.
        """
        self._state = TerminusState(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        return Observation(
            done=False,
            reward=None,
            metadata={
                "status": "ready",
                "message": "terminus_env ready. Call run_rollout(...).",
            },
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **_: Any,
    ) -> Observation:
        """Return an error observation naming the unsupported action type."""
        return Observation(
            done=False,
            reward=None,
            metadata={
                "error": (
                    f"Unknown action type: {type(action).__name__}. "
                    "Use ListToolsAction or CallToolAction for MCP interactions."
                )
            },
        )

    def step(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Count the step, then delegate to the base-class ``step``."""
        self._state.step_count += 1
        return super().step(action, timeout_s=timeout_s, **kwargs)

    async def step_async(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Count the step, then delegate to the base-class ``step_async``."""
        self._state.step_count += 1
        return await super().step_async(action, timeout_s=timeout_s, **kwargs)

    @property
    def state(self) -> TerminusState:
        """Current per-session state."""
        return self._state

    @staticmethod
    def _tail(output: str | bytes | None) -> str:
        """Return the last ``_TAIL_CHARS`` characters of ``output`` as text.

        ``subprocess.TimeoutExpired`` may carry ``bytes`` (or ``None``) even
        when the process was started in text mode, so bytes are decoded here
        before slicing.
        """
        if isinstance(output, bytes):
            output = output.decode("utf-8", errors="replace")
        return (output or "")[-_TAIL_CHARS:]

    def _run_rollout(
        self,
        *,
        instruction: str,
        model: str,
        api_base: str,
        api_key: str,
        setup: list[str],
        verify: list[str],
        task_id: str,
        env: str,
        max_turns: int,
        temperature: float,
        collect_rollout_details: bool,
        timeout_s: int,
        keep_task_dir: bool,
    ) -> TerminusRolloutResult:
        """Generate a Harbor task, run Harbor on it, and collect the outcome.

        Validation failures, a missing ``harbor`` executable, timeouts and
        unexpected exceptions are all reported through ``result.error`` rather
        than raised. Every call, successful or not, is recorded in the session
        state.

        Returns:
            The populated ``TerminusRolloutResult``.
        """
        resolved_task_id = task_id or f"terminus-{uuid4()}"
        result = TerminusRolloutResult(
            task_id=resolved_task_id,
            setup=setup,
            verify=verify,
        )

        if not instruction.strip():
            result.error = "instruction is required"
            self._record_result(result)
            return result
        if not model.strip():
            result.error = "model is required"
            self._record_result(result)
            return result
        if shutil.which("harbor") is None:
            result.error = (
                "harbor CLI is not installed. Install the terminus_env "
                "dependencies or run `uv tool install harbor`."
            )
            self._record_result(result)
            return result

        # task_id comes from the tool caller. Reduce it to a short run of
        # filename-safe characters so that path separators or an over-long id
        # cannot make mkdtemp fail or add path components to the prefix.
        safe_prefix = re.sub(r"[^A-Za-z0-9._-]+", "_", resolved_task_id)
        safe_prefix = safe_prefix.strip("._")[:64] or "terminus"
        task_root = Path(tempfile.mkdtemp(prefix=f"{safe_prefix}-"))
        result.task_dir = str(task_root)
        try:
            _write_harbor_task(task_root, instruction, setup, verify)
            command = _build_harbor_command(
                task_root=task_root,
                task_id=task_root.name,
                model=model,
                api_base=api_base,
                env=env,
                max_turns=max_turns,
                temperature=temperature,
                collect_rollout_details=collect_rollout_details,
            )
            result.command = command

            completed = _run_command(
                command,
                timeout_s=timeout_s,
                api_key=api_key,
            )
            result.exit_code = completed.returncode
            result.stdout_tail = self._tail(completed.stdout)
            result.stderr_tail = self._tail(completed.stderr)
            result.reward = _extract_reward(completed.stdout, completed.stderr)
            result.error = None if completed.returncode == 0 else "harbor run failed"
            result.files = _collect_small_files(task_root)
        except subprocess.TimeoutExpired as exc:
            result.exit_code = None
            result.stdout_tail = self._tail(exc.stdout)
            result.stderr_tail = self._tail(exc.stderr)
            result.error = f"harbor run timed out after {timeout_s}s"
        except Exception as exc:  # noqa: BLE001
            # Tool boundary: report the failure in the result instead of raising.
            result.error = f"{type(exc).__name__}: {exc}"
        finally:
            if not keep_task_dir:
                shutil.rmtree(task_root, ignore_errors=True)

        self._record_result(result)
        return result

    def _record_result(self, result: TerminusRolloutResult) -> None:
        """Copy the headline fields of ``result`` into the session state."""
        self._state.rollouts_completed += 1
        self._state.last_task_id = result.task_id
        self._state.last_reward = result.reward
        self._state.last_exit_code = result.exit_code
        self._state.last_error = result.error
248
+
249
+
250
def _write_harbor_task(
    task_root: Path,
    instruction: str,
    setup: list[str],
    verify: list[str],
) -> None:
    """Write the generated Harbor task files under ``task_root``.

    Creates the ``environment``, ``tests`` and ``solution`` directories, then
    writes ``instruction.md``, ``task.toml``, ``environment/Dockerfile`` and an
    executable ``tests/test.sh``.

    Args:
        task_root: Directory that receives the task files.
        instruction: Text written verbatim to ``instruction.md``.
        setup: Shell commands rendered into the task Dockerfile.
        verify: Shell commands rendered into ``tests/test.sh``.
    """
    for subdir in ("environment", "tests", "solution"):
        (task_root / subdir).mkdir(parents=True, exist_ok=True)

    def put(target: Path, text: str) -> Path:
        # Every task file is written as UTF-8 text.
        target.write_text(text, encoding="utf-8")
        return target

    put(task_root / "instruction.md", instruction)
    put(task_root / "task.toml", 'name = "openenv-terminus-task"\n')
    put(task_root / "environment" / "Dockerfile", _build_dockerfile(setup))

    script = _build_test_script(verify)
    # The test script is made executable (mode 0o755).
    put(task_root / "tests" / "test.sh", script).chmod(0o755)
273
+
274
+
275
def _build_dockerfile(setup: list[str]) -> str:
    """Render the Dockerfile for a generated Harbor task.

    The image starts from ``python:3.12-slim``, installs a few basic tools,
    and, when ``setup`` contains any non-blank text, runs the setup commands
    in a single ``RUN`` here-document under ``set -eux``.

    Args:
        setup: Shell commands, one entry per command (entries may themselves
            span several lines).

    Returns:
        The Dockerfile text.
    """
    setup_block = "\n".join(setup)
    run_setup = ""
    if setup_block.strip():
        # A setup command may contain its own here-document (for example
        # `cat > f <<EOF ... EOF`). If one of its lines equalled our delimiter
        # it would end the RUN block early, so pick a delimiter that does not
        # occur as a line of the setup text. "EOF" is kept whenever it is safe
        # so the common output is unchanged.
        taken = {line.strip() for line in setup_block.splitlines()}
        delimiter = "EOF"
        attempt = 0
        while delimiter in taken:
            attempt += 1
            delimiter = f"OPENENV_SETUP_EOF_{attempt}"
        run_setup = (
            f"\nRUN <<'{delimiter}'\nset -eux\n" + setup_block + f"\n{delimiter}\n"
        )
    return (
        "FROM python:3.12-slim\n"
        "WORKDIR /workspace\n"
        "RUN apt-get update && apt-get install -y --no-install-recommends "
        "bash ca-certificates curl git tmux && rm -rf /var/lib/apt/lists/*\n"
        f"{run_setup}"
    )
287
+
288
+
289
def _build_test_script(verify: list[str]) -> str:
    """Render ``tests/test.sh`` from the verification commands.

    The script always begins with a bash shebang and ``set -euo pipefail``.
    With no commands it simply exits 0; otherwise each entry of ``verify``
    follows on its own line.
    """
    preamble = "#!/usr/bin/env bash\nset -euo pipefail\n"
    if verify:
        body = "\n".join(verify) + "\n"
    else:
        body = "exit 0\n"
    return preamble + body
293
+
294
+
295
def _build_harbor_command(
    *,
    task_root: Path,
    task_id: str,
    model: str,
    api_base: str,
    env: str,
    max_turns: int,
    temperature: float,
    collect_rollout_details: bool,
) -> list[str]:
    """Assemble the ``harbor run`` argument vector for one Terminus-2 rollout.

    ``--path`` receives the parent of ``task_root`` and ``--task-name``
    receives ``task_id``. ``--env`` is added only for a non-empty ``env``.
    Agent settings are passed as repeated ``--agent-kwarg key=value`` pairs
    in the order api_base (if set), max_turns, temperature,
    collect_rollout_details (if enabled).

    Returns:
        The command as a list of arguments.
    """
    argv = ["harbor", "run"]
    argv += ["--agent", "terminus-2"]
    argv += ["--model", model]
    argv += ["--path", str(task_root.parent)]
    argv += ["--task-name", task_id]
    if env:
        argv += ["--env", env]

    agent_kwargs: list[str] = []
    if api_base:
        agent_kwargs.append(f"api_base={api_base}")
    agent_kwargs.append(f"max_turns={max_turns}")
    agent_kwargs.append(f"temperature={temperature}")
    if collect_rollout_details:
        agent_kwargs.append("collect_rollout_details=true")

    for pair in agent_kwargs:
        argv += ["--agent-kwarg", pair]
    return argv
333
+
334
+
335
def _run_command(
    command: list[str],
    *,
    timeout_s: int,
    api_key: str,
) -> subprocess.CompletedProcess[str]:
    """Run ``command`` and capture its output as text.

    The child inherits a copy of the current environment. A non-empty
    ``api_key`` is supplied as ``OPENAI_API_KEY`` and ``ANTHROPIC_API_KEY``,
    but only for whichever of those variables is not already set; existing
    values win.

    Args:
        command: Argument vector to execute (no shell involved).
        timeout_s: Seconds after which ``subprocess.TimeoutExpired`` is raised.
        api_key: Fallback provider key; ignored when empty.

    Returns:
        The ``CompletedProcess``; a non-zero exit status does not raise.
    """
    child_env = dict(os.environ)
    if api_key:
        for variable in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
            child_env.setdefault(variable, api_key)
    return subprocess.run(
        command,
        env=child_env,
        timeout=timeout_s,
        text=True,
        capture_output=True,
        check=False,
    )
353
+
354
+
355
+ def _extract_reward(*texts: str) -> float | None:
356
+ joined = "\n".join(texts)
357
+ for pattern in [
358
+ r'"reward"\s*:\s*(-?\d+(?:\.\d+)?)',
359
+ r"reward['\"]?\s*[:=]\s*(-?\d+(?:\.\d+)?)",
360
+ ]:
361
+ match = re.search(pattern, joined, flags=re.IGNORECASE)
362
+ if match:
363
+ return float(match.group(1))
364
+ return None
365
+
366
+
367
def _collect_small_files(root: Path) -> dict[str, str]:
    """Collect the text of small, non-hidden files below ``root``.

    Files larger than 64,000 bytes, files that are not valid UTF-8, and files
    that cannot be read are skipped, so a single unreadable entry no longer
    aborts the whole collection. Any path with a dot-prefixed component is
    skipped, which covers nested dotfiles such as ``sub/.env`` as well as
    top-level ones. Paths are visited in sorted order so the result is
    deterministic.

    Args:
        root: Directory to scan recursively.

    Returns:
        Mapping of path relative to ``root`` to file content, each truncated
        to the first ``_TAIL_CHARS`` characters.
    """
    files: dict[str, str] = {}
    for path in sorted(root.rglob("*")):
        rel_path = path.relative_to(root)
        if any(part.startswith(".") for part in rel_path.parts):
            continue
        try:
            if not path.is_file() or path.stat().st_size > 64_000:
                continue
            files[str(rel_path)] = path.read_text(encoding="utf-8")[:_TAIL_CHARS]
        except (OSError, UnicodeDecodeError):
            # Best effort: a file that vanished, is unreadable, or is binary
            # is left out instead of failing the rollout.
            continue
    return files
uv.lock ADDED
The diff for this file is too large to render. See raw diff