Auto-start CPU sandboxes for sessions (#200)
Browse files
* Auto-start CPU sandboxes for sessions
Co-authored-by: Codex <codex@openai.com>
* Retry sandbox runtime visibility checks
Co-authored-by: Codex <codex@openai.com>
* Stabilize auto CPU sandbox creation
Co-authored-by: OpenAI Codex <codex@openai.com>
* Address sandbox PR review comments
Co-authored-by: OpenAI Codex <codex@openai.com>
---------
Co-authored-by: Codex <codex@openai.com>
- agent/core/agent_loop.py +3 -1
- agent/core/session.py +4 -0
- agent/prompts/system_prompt_v3.yaml +4 -2
- agent/tools/sandbox_client.py +36 -1
- agent/tools/sandbox_tool.py +295 -50
- backend/routes/agent.py +13 -0
- backend/session_manager.py +30 -20
- frontend/src/components/Layout/AppLayout.tsx +33 -0
- tests/unit/test_sandbox_auto_start.py +31 -0
- tests/unit/test_sandbox_private_spaces.py +305 -0
- tests/unit/test_session_manager_persistence.py +50 -0
agent/core/agent_loop.py
CHANGED
|
@@ -32,6 +32,7 @@ from agent.core.prompt_caching import with_prompt_caching
|
|
| 32 |
from agent.core.session import Event, OpType, Session
|
| 33 |
from agent.core.tools import ToolRouter
|
| 34 |
from agent.tools.jobs_tool import CPU_FLAVORS
|
|
|
|
| 35 |
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
|
@@ -155,7 +156,8 @@ def _base_needs_approval(
|
|
| 155 |
return False
|
| 156 |
|
| 157 |
if tool_name == "sandbox_create":
|
| 158 |
-
|
|
|
|
| 159 |
|
| 160 |
if tool_name == "hf_jobs":
|
| 161 |
operation = _operation(tool_args)
|
|
|
|
| 32 |
from agent.core.session import Event, OpType, Session
|
| 33 |
from agent.core.tools import ToolRouter
|
| 34 |
from agent.tools.jobs_tool import CPU_FLAVORS
|
| 35 |
+
from agent.tools.sandbox_tool import DEFAULT_CPU_SANDBOX_HARDWARE
|
| 36 |
|
| 37 |
logger = logging.getLogger(__name__)
|
| 38 |
|
|
|
|
| 156 |
return False
|
| 157 |
|
| 158 |
if tool_name == "sandbox_create":
|
| 159 |
+
hardware = tool_args.get("hardware") or DEFAULT_CPU_SANDBOX_HARDWARE
|
| 160 |
+
return hardware != DEFAULT_CPU_SANDBOX_HARDWARE
|
| 161 |
|
| 162 |
if tool_name == "hf_jobs":
|
| 163 |
operation = _operation(tool_args)
|
agent/core/session.py
CHANGED
|
@@ -116,6 +116,10 @@ class Session:
|
|
| 116 |
self._cancelled = asyncio.Event()
|
| 117 |
self.pending_approval: Optional[dict[str, Any]] = None
|
| 118 |
self.sandbox = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
self._running_job_ids: set[str] = set() # HF job IDs currently executing
|
| 120 |
self.notification_gateway = notification_gateway
|
| 121 |
self.notification_destinations = list(notification_destinations or [])
|
|
|
|
| 116 |
self._cancelled = asyncio.Event()
|
| 117 |
self.pending_approval: Optional[dict[str, Any]] = None
|
| 118 |
self.sandbox = None
|
| 119 |
+
self.sandbox_hardware: Optional[str] = None
|
| 120 |
+
self.sandbox_preload_task: Optional[asyncio.Task] = None
|
| 121 |
+
self.sandbox_preload_error: Optional[str] = None
|
| 122 |
+
self.sandbox_preload_cancel_event: Any | None = None
|
| 123 |
self._running_job_ids: set[str] = set() # HF job IDs currently executing
|
| 124 |
self.notification_gateway = notification_gateway
|
| 125 |
self.notification_destinations = list(notification_destinations or [])
|
agent/prompts/system_prompt_v3.yaml
CHANGED
|
@@ -122,8 +122,10 @@ system_prompt: |
|
|
| 122 |
|
| 123 |
# Sandbox-first development
|
| 124 |
|
| 125 |
-
For non-trivial scripts, develop and test
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
|
| 129 |
|
|
|
|
| 122 |
|
| 123 |
# Sandbox-first development
|
| 124 |
|
| 125 |
+
A private cpu-basic sandbox is already available for normal code execution in each session. For non-trivial scripts, develop and test there before launching via hf_jobs:
|
| 126 |
+
write script → pip install → test with small run using bash/read/write/edit → fix errors → launch via hf_jobs at scale
|
| 127 |
+
|
| 128 |
+
Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
|
| 129 |
|
| 130 |
Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
|
| 131 |
|
agent/tools/sandbox_client.py
CHANGED
|
@@ -66,6 +66,15 @@ WAIT_TIMEOUT = 600
|
|
| 66 |
WAIT_INTERVAL = 5
|
| 67 |
API_WAIT_TIMEOUT = 180
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
_DOCKERFILE = """\
|
| 70 |
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
| 71 |
|
|
@@ -615,6 +624,16 @@ class Sandbox:
|
|
| 615 |
|
| 616 |
_check_cancel()
|
| 617 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
# Inject secrets BEFORE uploading server files (which triggers rebuild).
|
| 619 |
# Secrets added after a Space is running aren't available until restart,
|
| 620 |
# so they must be set before the build/start cycle.
|
|
@@ -633,8 +652,24 @@ class Sandbox:
|
|
| 633 |
deadline = time.time() + wait_timeout
|
| 634 |
while time.time() < deadline:
|
| 635 |
_check_cancel()
|
| 636 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
if runtime.stage == "RUNNING":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
_log(f"Space is running (hardware: {runtime.hardware})")
|
| 639 |
break
|
| 640 |
if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
|
|
|
|
| 66 |
WAIT_INTERVAL = 5
|
| 67 |
API_WAIT_TIMEOUT = 180
|
| 68 |
|
| 69 |
+
|
| 70 |
+
def _is_transient_space_visibility_error(error: Exception) -> bool:
|
| 71 |
+
"""Return True when a newly duplicated Space is not queryable yet."""
|
| 72 |
+
response = getattr(error, "response", None)
|
| 73 |
+
if getattr(response, "status_code", None) == 404:
|
| 74 |
+
return True
|
| 75 |
+
message = str(error)
|
| 76 |
+
return "Repository Not Found" in message or "404 Client Error" in message
|
| 77 |
+
|
| 78 |
_DOCKERFILE = """\
|
| 79 |
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
| 80 |
|
|
|
|
| 624 |
|
| 625 |
_check_cancel()
|
| 626 |
|
| 627 |
+
# Some template duplicates can initially inherit the template hardware.
|
| 628 |
+
# Explicitly request the target tier so automatic CPU sandboxes never
|
| 629 |
+
# silently come up on GPU hardware.
|
| 630 |
+
api.request_space_hardware(
|
| 631 |
+
space_id,
|
| 632 |
+
hardware=hardware,
|
| 633 |
+
sleep_time=sleep_time,
|
| 634 |
+
)
|
| 635 |
+
_log(f"Requested hardware: {hardware}")
|
| 636 |
+
|
| 637 |
# Inject secrets BEFORE uploading server files (which triggers rebuild).
|
| 638 |
# Secrets added after a Space is running aren't available until restart,
|
| 639 |
# so they must be set before the build/start cycle.
|
|
|
|
| 652 |
deadline = time.time() + wait_timeout
|
| 653 |
while time.time() < deadline:
|
| 654 |
_check_cancel()
|
| 655 |
+
try:
|
| 656 |
+
runtime = api.get_space_runtime(space_id)
|
| 657 |
+
except Exception as e:
|
| 658 |
+
if _is_transient_space_visibility_error(e):
|
| 659 |
+
_log(" Space runtime not visible yet...")
|
| 660 |
+
time.sleep(WAIT_INTERVAL)
|
| 661 |
+
continue
|
| 662 |
+
raise
|
| 663 |
if runtime.stage == "RUNNING":
|
| 664 |
+
current_hardware = runtime.hardware or getattr(
|
| 665 |
+
runtime, "requested_hardware", None
|
| 666 |
+
)
|
| 667 |
+
if current_hardware != hardware:
|
| 668 |
+
_log(
|
| 669 |
+
f" RUNNING on {current_hardware}; waiting for {hardware}..."
|
| 670 |
+
)
|
| 671 |
+
time.sleep(WAIT_INTERVAL)
|
| 672 |
+
continue
|
| 673 |
_log(f"Space is running (hardware: {runtime.hardware})")
|
| 674 |
break
|
| 675 |
if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
|
agent/tools/sandbox_tool.py
CHANGED
|
@@ -2,11 +2,11 @@
|
|
| 2 |
Sandbox tools — expose the Sandbox client as agent tools.
|
| 3 |
|
| 4 |
5 tools total:
|
| 5 |
-
sandbox_create —
|
| 6 |
-
bash, read, write, edit — operations on the sandbox
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
|
@@ -15,6 +15,7 @@ import asyncio
|
|
| 15 |
import logging
|
| 16 |
import re
|
| 17 |
import threading
|
|
|
|
| 18 |
from datetime import datetime, timedelta, timezone
|
| 19 |
from typing import Any
|
| 20 |
|
|
@@ -26,6 +27,8 @@ from agent.tools.trackio_seed import ensure_trackio_dashboard
|
|
| 26 |
|
| 27 |
logger = logging.getLogger(__name__)
|
| 28 |
|
|
|
|
|
|
|
| 29 |
# Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
|
| 30 |
# Used to identify orphan sandboxes from prior sessions safely (won't match
|
| 31 |
# user-renamed lookalikes).
|
|
@@ -36,6 +39,23 @@ _SANDBOX_NAME_RE = re.compile(r"^sandbox-[a-f0-9]{8}$")
|
|
| 36 |
# so we leave it alone.
|
| 37 |
_ORPHAN_STALE_AFTER = timedelta(hours=1)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def _looks_like_path(script: str) -> bool:
|
| 41 |
"""Return True if the script string looks like a file path (not inline code)."""
|
|
@@ -124,7 +144,7 @@ def _cleanup_user_orphan_sandboxes(
|
|
| 124 |
cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
|
| 125 |
deleted = 0
|
| 126 |
try:
|
| 127 |
-
spaces = list(api.list_spaces(author=owner, limit=200))
|
| 128 |
except Exception as e:
|
| 129 |
log(f"orphan sweep: list_spaces failed: {e}")
|
| 130 |
return 0
|
|
@@ -140,6 +160,9 @@ def _cleanup_user_orphan_sandboxes(
|
|
| 140 |
last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
|
| 141 |
except ValueError:
|
| 142 |
last_mod = None
|
|
|
|
|
|
|
|
|
|
| 143 |
if last_mod and last_mod > cutoff:
|
| 144 |
# Recent — could be a concurrent live session. Skip.
|
| 145 |
continue
|
|
@@ -158,8 +181,9 @@ def _cleanup_user_orphan_sandboxes(
|
|
| 158 |
|
| 159 |
async def _ensure_sandbox(
|
| 160 |
session: Any,
|
| 161 |
-
hardware: str =
|
| 162 |
extra_secrets: dict[str, str] | None = None,
|
|
|
|
| 163 |
**create_kwargs,
|
| 164 |
) -> tuple[Sandbox | None, str | None]:
|
| 165 |
"""
|
|
@@ -184,6 +208,45 @@ async def _ensure_sandbox(
|
|
| 184 |
if not owner:
|
| 185 |
return None, "Could not determine HF username from token."
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
await session.send_event(
|
| 188 |
Event(
|
| 189 |
event_type="tool_log",
|
|
@@ -203,27 +266,10 @@ async def _ensure_sandbox(
|
|
| 203 |
Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
|
| 204 |
)
|
| 205 |
|
| 206 |
-
# Before we create a new sandbox, sweep this user's stale sandboxes from
|
| 207 |
-
# prior sessions. ``_cleanup_sandbox`` in session_manager fires only on
|
| 208 |
-
# clean session exit; pod kills, WebSocket drops, etc. leave orphans
|
| 209 |
-
# behind, and they accumulate on every new session forever (observed
|
| 210 |
-
# 2310 leaked across the Hub on 2026-04-27). Doing the cleanup here at
|
| 211 |
-
# session start = self-healing, no separate cron needed.
|
| 212 |
-
#
|
| 213 |
-
# The 1h staleness filter is the safety: a sandbox modified in the last
|
| 214 |
-
# hour might still be tied to a live session in another tab, so we skip.
|
| 215 |
-
# Anything older has no realistic chance of being active given typical
|
| 216 |
-
# session lengths.
|
| 217 |
-
try:
|
| 218 |
-
await asyncio.to_thread(_cleanup_user_orphan_sandboxes, api, owner, _log)
|
| 219 |
-
except Exception as e:
|
| 220 |
-
# Cleanup is best-effort — never block sandbox_create on it.
|
| 221 |
-
_log(f"orphan sandbox sweep failed (non-fatal): {e}")
|
| 222 |
-
|
| 223 |
# Bridge asyncio cancel event to a threading.Event for the blocking create call.
|
| 224 |
# We poll session._cancelled from the main loop in a background task and set
|
| 225 |
# a threading.Event that Sandbox.create checks during its polling loops.
|
| 226 |
-
cancel_flag = threading.Event()
|
| 227 |
|
| 228 |
async def _watch_cancel():
|
| 229 |
await session._cancelled.wait()
|
|
@@ -245,7 +291,7 @@ async def _ensure_sandbox(
|
|
| 245 |
"cancel_event": cancel_flag,
|
| 246 |
**create_kwargs,
|
| 247 |
}
|
| 248 |
-
if hardware !=
|
| 249 |
kwargs["sleep_time"] = 2700
|
| 250 |
import time as _t
|
| 251 |
_t_start = _t.monotonic()
|
|
@@ -255,7 +301,18 @@ async def _ensure_sandbox(
|
|
| 255 |
return None, "Sandbox creation cancelled by user."
|
| 256 |
finally:
|
| 257 |
watcher_task.cancel()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
session.sandbox = sb
|
|
|
|
|
|
|
| 259 |
|
| 260 |
# Telemetry: sandbox creation (infra consumption signal)
|
| 261 |
from agent.core import telemetry
|
|
@@ -286,19 +343,146 @@ async def _ensure_sandbox(
|
|
| 286 |
return sb, None
|
| 287 |
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
# ── sandbox_create tool ──────────────────────────────────────────────
|
| 290 |
|
| 291 |
SANDBOX_CREATE_TOOL_SPEC = {
|
| 292 |
"name": "sandbox_create",
|
| 293 |
"description": (
|
| 294 |
-
"Create
|
| 295 |
-
"
|
| 296 |
-
"
|
|
|
|
|
|
|
| 297 |
"Sandboxes are always created as private HF Spaces.\n\n"
|
| 298 |
-
"Use this when: you need to develop, test, and iterate on scripts before launching via hf_jobs. "
|
| 299 |
-
"Especially for training scripts where you need to verify imports, test on a small subset, and fix errors interactively.\n\n"
|
| 300 |
-
"Skip this when: the task is a simple one-shot operation (status check, resource search, quick data query), "
|
| 301 |
-
"or the script is copied from a verified working example with minimal changes.\n\n"
|
| 302 |
"For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
|
| 303 |
"CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
|
| 304 |
"Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
|
|
@@ -318,7 +502,10 @@ SANDBOX_CREATE_TOOL_SPEC = {
|
|
| 318 |
"hardware": {
|
| 319 |
"type": "string",
|
| 320 |
"enum": [e.value for e in SpaceHardware],
|
| 321 |
-
"description":
|
|
|
|
|
|
|
|
|
|
| 322 |
},
|
| 323 |
"trackio_space_id": {
|
| 324 |
"type": "string",
|
|
@@ -346,7 +533,7 @@ async def sandbox_create_handler(
|
|
| 346 |
args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
|
| 347 |
) -> tuple[str, bool]:
|
| 348 |
"""Handle sandbox_create tool calls."""
|
| 349 |
-
hardware = args.get("hardware",
|
| 350 |
trackio_space_id = args.get("trackio_space_id") or None
|
| 351 |
trackio_project = args.get("trackio_project") or None
|
| 352 |
|
|
@@ -364,24 +551,76 @@ async def sandbox_create_handler(
|
|
| 364 |
data["trackioProject"] = trackio_project
|
| 365 |
await session.send_event(Event(event_type="tool_state_change", data=data))
|
| 366 |
|
| 367 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
if session and getattr(session, "sandbox", None):
|
| 369 |
sb = session.sandbox
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
requested_hardware = args.get("hardware")
|
| 371 |
lockout_note = ""
|
| 372 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
lockout_note = (
|
| 374 |
f"\nRequested hardware: {requested_hardware}\n"
|
| 375 |
"Hardware cannot be changed by calling sandbox_create again. "
|
| 376 |
"Delete the existing sandbox first if you need a different tier."
|
| 377 |
)
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
|
| 386 |
create_kwargs: dict[str, Any] = {}
|
| 387 |
|
|
@@ -420,11 +659,11 @@ def _make_tool_handler(sandbox_tool_name: str):
|
|
| 420 |
"""Factory: create a handler for a sandbox operation tool."""
|
| 421 |
|
| 422 |
async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
|
| 423 |
-
|
| 424 |
-
if
|
| 425 |
-
return
|
| 426 |
-
|
| 427 |
-
|
| 428 |
|
| 429 |
try:
|
| 430 |
result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
|
|
@@ -449,7 +688,7 @@ def get_sandbox_tools():
|
|
| 449 |
|
| 450 |
tools = []
|
| 451 |
|
| 452 |
-
# sandbox_create (
|
| 453 |
tools.append(
|
| 454 |
ToolSpec(
|
| 455 |
name=SANDBOX_CREATE_TOOL_SPEC["name"],
|
|
@@ -462,10 +701,16 @@ def get_sandbox_tools():
|
|
| 462 |
# Operation tools (auto-execute, no approval needed)
|
| 463 |
for name in Sandbox.TOOLS.keys():
|
| 464 |
spec = Sandbox.TOOLS[name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
tools.append(
|
| 466 |
ToolSpec(
|
| 467 |
name=name,
|
| 468 |
-
description=
|
| 469 |
parameters=spec["parameters"],
|
| 470 |
handler=_make_tool_handler(name),
|
| 471 |
)
|
|
|
|
| 2 |
Sandbox tools — expose the Sandbox client as agent tools.
|
| 3 |
|
| 4 |
5 tools total:
|
| 5 |
+
sandbox_create — create/replace sandbox for non-default hardware
|
| 6 |
+
bash, read, write, edit — operations on the active sandbox
|
| 7 |
|
| 8 |
+
A cpu-basic sandbox is preloaded for each session. Operation tools wait for it
|
| 9 |
+
if startup is still in progress.
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
|
|
|
| 15 |
import logging
|
| 16 |
import re
|
| 17 |
import threading
|
| 18 |
+
import weakref
|
| 19 |
from datetime import datetime, timedelta, timezone
|
| 20 |
from typing import Any
|
| 21 |
|
|
|
|
| 27 |
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
+
DEFAULT_CPU_SANDBOX_HARDWARE = "cpu-basic"
|
| 31 |
+
|
| 32 |
# Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
|
| 33 |
# Used to identify orphan sandboxes from prior sessions safely (won't match
|
| 34 |
# user-renamed lookalikes).
|
|
|
|
| 39 |
# so we leave it alone.
|
| 40 |
_ORPHAN_STALE_AFTER = timedelta(hours=1)
|
| 41 |
|
| 42 |
+
# HF Space duplication/build APIs can behave poorly when multiple private
|
| 43 |
+
# sandboxes are created concurrently for the same namespace. Keep session
|
| 44 |
+
# creation non-blocking, but serialize the actual Hub create path per owner.
|
| 45 |
+
_SANDBOX_CREATE_LOCKS: weakref.WeakKeyDictionary[
|
| 46 |
+
asyncio.AbstractEventLoop, dict[str, asyncio.Lock]
|
| 47 |
+
] = weakref.WeakKeyDictionary()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _get_sandbox_create_lock(owner: str) -> asyncio.Lock:
    """Return the creation lock for ``owner`` on the running event loop.

    Locks are kept in ``_SANDBOX_CREATE_LOCKS``, keyed first by the running
    event loop and then by owner; a lock is created on first request and the
    same object is handed back on later calls for that (loop, owner) pair.

    Must be called while an event loop is running, because it relies on
    ``asyncio.get_running_loop()``.
    """
    per_loop = _SANDBOX_CREATE_LOCKS.setdefault(asyncio.get_running_loop(), {})

    existing = per_loop.get(owner)
    if existing is not None:
        return existing

    created = asyncio.Lock()
    per_loop[owner] = created
    return created
|
| 58 |
+
|
| 59 |
|
| 60 |
def _looks_like_path(script: str) -> bool:
|
| 61 |
"""Return True if the script string looks like a file path (not inline code)."""
|
|
|
|
| 144 |
cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
|
| 145 |
deleted = 0
|
| 146 |
try:
|
| 147 |
+
spaces = list(api.list_spaces(author=owner, limit=200, full=True))
|
| 148 |
except Exception as e:
|
| 149 |
log(f"orphan sweep: list_spaces failed: {e}")
|
| 150 |
return 0
|
|
|
|
| 160 |
last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
|
| 161 |
except ValueError:
|
| 162 |
last_mod = None
|
| 163 |
+
if last_mod is None:
|
| 164 |
+
log(f"orphan sweep: skipping {space.id}; missing lastModified")
|
| 165 |
+
continue
|
| 166 |
if last_mod and last_mod > cutoff:
|
| 167 |
# Recent — could be a concurrent live session. Skip.
|
| 168 |
continue
|
|
|
|
| 181 |
|
| 182 |
async def _ensure_sandbox(
|
| 183 |
session: Any,
|
| 184 |
+
hardware: str = DEFAULT_CPU_SANDBOX_HARDWARE,
|
| 185 |
extra_secrets: dict[str, str] | None = None,
|
| 186 |
+
cancel_event: threading.Event | None = None,
|
| 187 |
**create_kwargs,
|
| 188 |
) -> tuple[Sandbox | None, str | None]:
|
| 189 |
"""
|
|
|
|
| 208 |
if not owner:
|
| 209 |
return None, "Could not determine HF username from token."
|
| 210 |
|
| 211 |
+
create_lock = _get_sandbox_create_lock(owner)
|
| 212 |
+
if create_lock.locked():
|
| 213 |
+
await session.send_event(
|
| 214 |
+
Event(
|
| 215 |
+
event_type="tool_log",
|
| 216 |
+
data={
|
| 217 |
+
"tool": "sandbox",
|
| 218 |
+
"log": "Waiting for sandbox creation slot...",
|
| 219 |
+
},
|
| 220 |
+
)
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
async with create_lock:
|
| 224 |
+
if getattr(session, "sandbox", None):
|
| 225 |
+
return session.sandbox, None
|
| 226 |
+
|
| 227 |
+
return await _create_sandbox_locked(
|
| 228 |
+
session,
|
| 229 |
+
api=api,
|
| 230 |
+
owner=owner,
|
| 231 |
+
hardware=hardware,
|
| 232 |
+
extra_secrets=extra_secrets,
|
| 233 |
+
cancel_event=cancel_event,
|
| 234 |
+
**create_kwargs,
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
async def _create_sandbox_locked(
|
| 239 |
+
session: Any,
|
| 240 |
+
*,
|
| 241 |
+
api: HfApi,
|
| 242 |
+
owner: str,
|
| 243 |
+
hardware: str,
|
| 244 |
+
extra_secrets: dict[str, str] | None = None,
|
| 245 |
+
cancel_event: threading.Event | None = None,
|
| 246 |
+
**create_kwargs,
|
| 247 |
+
) -> tuple[Sandbox | None, str | None]:
|
| 248 |
+
"""Create the Space while the per-owner sandbox creation lock is held."""
|
| 249 |
+
token = session.hf_token
|
| 250 |
await session.send_event(
|
| 251 |
Event(
|
| 252 |
event_type="tool_log",
|
|
|
|
| 266 |
Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
|
| 267 |
)
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
# Bridge asyncio cancel event to a threading.Event for the blocking create call.
|
| 270 |
# We poll session._cancelled from the main loop in a background task and set
|
| 271 |
# a threading.Event that Sandbox.create checks during its polling loops.
|
| 272 |
+
cancel_flag = cancel_event or threading.Event()
|
| 273 |
|
| 274 |
async def _watch_cancel():
|
| 275 |
await session._cancelled.wait()
|
|
|
|
| 291 |
"cancel_event": cancel_flag,
|
| 292 |
**create_kwargs,
|
| 293 |
}
|
| 294 |
+
if hardware != DEFAULT_CPU_SANDBOX_HARDWARE:
|
| 295 |
kwargs["sleep_time"] = 2700
|
| 296 |
import time as _t
|
| 297 |
_t_start = _t.monotonic()
|
|
|
|
| 301 |
return None, "Sandbox creation cancelled by user."
|
| 302 |
finally:
|
| 303 |
watcher_task.cancel()
|
| 304 |
+
|
| 305 |
+
if cancel_flag.is_set():
|
| 306 |
+
if getattr(sb, "_owns_space", False):
|
| 307 |
+
try:
|
| 308 |
+
await asyncio.to_thread(sb.delete)
|
| 309 |
+
except Exception as e:
|
| 310 |
+
logger.warning("Failed to delete cancelled sandbox %s: %s", sb.space_id, e)
|
| 311 |
+
return None, "Sandbox creation cancelled by user."
|
| 312 |
+
|
| 313 |
session.sandbox = sb
|
| 314 |
+
session.sandbox_hardware = hardware
|
| 315 |
+
session.sandbox_preload_error = None
|
| 316 |
|
| 317 |
# Telemetry: sandbox creation (infra consumption signal)
|
| 318 |
from agent.core import telemetry
|
|
|
|
| 343 |
return sb, None
|
| 344 |
|
| 345 |
|
| 346 |
+
def start_cpu_sandbox_preload(session: Any) -> asyncio.Task | None:
    """Kick off background creation of the session's ``cpu-basic`` sandbox.

    Returns ``None`` without doing anything when there is no session or the
    session already has a sandbox. If a preload task is already stored on the
    session and has not finished, that task is returned unchanged. Otherwise a
    fresh ``threading.Event`` is stored as
    ``session.sandbox_preload_cancel_event``, any previous preload error is
    cleared, and a new task is scheduled and stored as
    ``session.sandbox_preload_task``.

    Failures never escape the task as ordinary exceptions: they are recorded
    in ``session.sandbox_preload_error`` and the task resolves to ``None``.
    Only ``asyncio.CancelledError`` is re-raised.
    """
    if not session:
        return None
    if getattr(session, "sandbox", None):
        return None

    in_flight = getattr(session, "sandbox_preload_task", None)
    if in_flight and not in_flight.done():
        return in_flight

    stop_flag = threading.Event()
    session.sandbox_preload_cancel_event = stop_flag
    session.sandbox_preload_error = None

    async def _preload() -> Sandbox | None:
        try:
            sandbox, failure = await _ensure_sandbox(
                session,
                hardware=DEFAULT_CPU_SANDBOX_HARDWARE,
                cancel_event=stop_flag,
            )
            if not failure:
                return sandbox
            session.sandbox_preload_error = failure
            return None
        except asyncio.CancelledError:
            # Signal the shared cancel flag before propagating the cancellation.
            stop_flag.set()
            session.sandbox_preload_error = "Sandbox creation cancelled by user."
            raise
        except Exception as exc:
            session.sandbox_preload_error = f"Failed to create sandbox: {exc}"
            logger.warning("CPU sandbox preload failed: %s", exc)
            return None

    preload_task = asyncio.create_task(_preload())
    session.sandbox_preload_task = preload_task
    return preload_task
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
async def cancel_sandbox_preload(session: Any) -> None:
    """Best-effort cancellation for an in-flight CPU sandbox preload.

    Sets ``session.sandbox_preload_cancel_event`` (when present), then waits
    up to 30 seconds for ``session.sandbox_preload_task`` to finish. The wait
    is shielded so that a timeout here does not cancel the task implicitly;
    on timeout the task is cancelled explicitly.

    Returns immediately when there is no task, the task is already done, or
    the task is the one currently running (awaiting ourselves would hang).

    Raises:
        asyncio.CancelledError: only when the *caller* is being cancelled.
            A cancellation that originated in the preload task itself is
            treated as "preload finished" and is not propagated.
    """
    cancel_event = getattr(session, "sandbox_preload_cancel_event", None)
    if cancel_event is not None:
        cancel_event.set()

    task = getattr(session, "sandbox_preload_task", None)
    if not task or task.done():
        return

    current_task = asyncio.current_task()
    if task is current_task:
        return

    try:
        await asyncio.wait_for(asyncio.shield(task), timeout=30)
    except asyncio.TimeoutError:
        logger.warning(
            "Timed out waiting for CPU sandbox preload cancellation; "
            "task is still live, cancelling asyncio wrapper"
        )
        task.cancel()
    except asyncio.CancelledError:
        # A cancelled preload task surfaces here as CancelledError too. Only
        # propagate when the caller itself is being cancelled; otherwise the
        # preload is simply over and callers (e.g. teardown) must continue.
        # ``Task.cancelling`` only exists on Python 3.11+, hence the getattr.
        cancelling = getattr(current_task, "cancelling", None)
        caller_cancelled = cancelling is not None and cancelling() > 0
        if caller_cancelled or not task.cancelled():
            raise
    except Exception as e:
        # Best-effort: the preload outcome no longer matters, but leave a trace.
        logger.debug("CPU sandbox preload ended with an error while cancelling: %s", e)
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
async def get_active_or_preloaded_sandbox(
    session: Any,
) -> tuple[Sandbox | None, str | None]:
    """Return the active sandbox, waiting for the startup preload if needed.

    Args:
        session: Object exposing ``sandbox``, ``sandbox_preload_task`` and
            ``sandbox_preload_error`` attributes (all read via ``getattr``
            with a ``None`` default).

    Returns:
        ``(sandbox, None)`` when a sandbox is available, otherwise
        ``(None, message)`` where ``message`` is the recorded preload error
        or a generic "still starting" notice.

    Raises:
        asyncio.CancelledError: only when the *caller* is being cancelled.
            If the preload task itself was cancelled, the failure is reported
            through the returned error message instead.
    """
    if not session:
        return None, "No session available."
    if getattr(session, "sandbox", None):
        return session.sandbox, None

    task = getattr(session, "sandbox_preload_task", None)
    if task:
        try:
            # Shield so that cancelling this caller does not cancel the preload.
            await asyncio.shield(task)
        except asyncio.CancelledError:
            # Awaiting a cancelled preload task raises CancelledError here as
            # well. Re-raise only for a genuine cancellation of the caller.
            # ``Task.cancelling`` only exists on Python 3.11+, hence the getattr.
            cancelling = getattr(asyncio.current_task(), "cancelling", None)
            caller_cancelled = cancelling is not None and cancelling() > 0
            if caller_cancelled or not task.cancelled():
                raise
            if not getattr(session, "sandbox_preload_error", None):
                session.sandbox_preload_error = "Sandbox creation cancelled by user."
        except Exception as e:
            session.sandbox_preload_error = f"Failed to create sandbox: {e}"

    if getattr(session, "sandbox", None):
        return session.sandbox, None

    preload_error = getattr(session, "sandbox_preload_error", None)
    if preload_error:
        return None, preload_error

    return None, "Sandbox is still starting. Please retry shortly."
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
async def teardown_session_sandbox(session: Any) -> None:
    """Cancel sandbox preload and delete the active owned sandbox, if present.

    The session's ``sandbox`` and ``sandbox_hardware`` are cleared before any
    deletion is attempted. Only sandboxes whose ``_owns_space`` attribute is
    truthy are deleted. Deletion is tried up to three times with 1s and 2s
    pauses between attempts; a final failure is logged, never raised.

    Destroy telemetry is recorded once after a successful delete. It is
    deliberately kept outside the retry ``try`` so that a telemetry failure
    cannot cause ``delete`` to be re-run on an already deleted sandbox.
    """
    if not session:
        return

    await cancel_sandbox_preload(session)

    sandbox = getattr(session, "sandbox", None)
    session.sandbox = None
    session.sandbox_hardware = None

    if not (sandbox and getattr(sandbox, "_owns_space", False)):
        return

    space_id = getattr(sandbox, "space_id", None)
    last_err: Exception | None = None
    for attempt in range(3):
        try:
            logger.info("Deleting sandbox %s (attempt %s/3)...", space_id, attempt + 1)
            await asyncio.to_thread(sandbox.delete)
        except Exception as e:
            last_err = e
            if attempt < 2:
                # Exponential backoff: 1s, then 2s.
                await asyncio.sleep(2 ** attempt)
            continue

        # Delete succeeded. Telemetry is best-effort from here on.
        try:
            from agent.core import telemetry
            await telemetry.record_sandbox_destroy(session, sandbox)
        except Exception as e:
            logger.warning(
                "Sandbox %s deleted but destroy telemetry failed: %s", space_id, e
            )
        return

    logger.error(
        "Failed to delete sandbox %s after 3 attempts: %s. "
        "Orphan — sweep script will pick it up.",
        space_id,
        last_err,
    )
|
| 473 |
+
|
| 474 |
+
|
| 475 |
# ── sandbox_create tool ──────────────────────────────────────────────
|
| 476 |
|
| 477 |
SANDBOX_CREATE_TOOL_SPEC = {
|
| 478 |
"name": "sandbox_create",
|
| 479 |
"description": (
|
| 480 |
+
"Create or replace the session sandbox when non-default hardware is needed.\n\n"
|
| 481 |
+
"A private cpu-basic sandbox is already started automatically for each session. "
|
| 482 |
+
"For normal CPU code execution, call bash/read/write/edit directly; do NOT call sandbox_create first.\n\n"
|
| 483 |
+
"Use sandbox_create when: you need GPU hardware, cpu-upgrade, or Trackio secrets before running code. "
|
| 484 |
+
"The active sandbox persists across tool calls within the session. pip install works out of the box. "
|
| 485 |
"Sandboxes are always created as private HF Spaces.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
"For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
|
| 487 |
"CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
|
| 488 |
"Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
|
|
|
|
| 502 |
"hardware": {
|
| 503 |
"type": "string",
|
| 504 |
"enum": [e.value for e in SpaceHardware],
|
| 505 |
+
"description": (
|
| 506 |
+
"Hardware tier for the sandbox. Omit for the existing auto-started "
|
| 507 |
+
"cpu-basic sandbox; choose GPU/cpu-upgrade only when needed."
|
| 508 |
+
),
|
| 509 |
},
|
| 510 |
"trackio_space_id": {
|
| 511 |
"type": "string",
|
|
|
|
| 533 |
args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
|
| 534 |
) -> tuple[str, bool]:
|
| 535 |
"""Handle sandbox_create tool calls."""
|
| 536 |
+
hardware = args.get("hardware", DEFAULT_CPU_SANDBOX_HARDWARE)
|
| 537 |
trackio_space_id = args.get("trackio_space_id") or None
|
| 538 |
trackio_project = args.get("trackio_project") or None
|
| 539 |
|
|
|
|
| 551 |
data["trackioProject"] = trackio_project
|
| 552 |
await session.send_event(Event(event_type="tool_state_change", data=data))
|
| 553 |
|
| 554 |
+
preload_task = getattr(session, "sandbox_preload_task", None)
|
| 555 |
+
if (
|
| 556 |
+
session
|
| 557 |
+
and not getattr(session, "sandbox", None)
|
| 558 |
+
and preload_task
|
| 559 |
+
and not preload_task.done()
|
| 560 |
+
and hardware == DEFAULT_CPU_SANDBOX_HARDWARE
|
| 561 |
+
):
|
| 562 |
+
sb, error = await get_active_or_preloaded_sandbox(session)
|
| 563 |
+
if error:
|
| 564 |
+
return error, False
|
| 565 |
+
if sb:
|
| 566 |
+
await _emit_trackio_state(sb)
|
| 567 |
+
return (
|
| 568 |
+
f"Sandbox already active: {sb.space_id}\n"
|
| 569 |
+
f"URL: {sb.url}\n"
|
| 570 |
+
f"Hardware: {DEFAULT_CPU_SANDBOX_HARDWARE}\n"
|
| 571 |
+
f"Use bash/read/write/edit to interact with it."
|
| 572 |
+
), True
|
| 573 |
+
|
| 574 |
+
if (
|
| 575 |
+
session
|
| 576 |
+
and not getattr(session, "sandbox", None)
|
| 577 |
+
and preload_task
|
| 578 |
+
and not preload_task.done()
|
| 579 |
+
and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
|
| 580 |
+
):
|
| 581 |
+
await cancel_sandbox_preload(session)
|
| 582 |
+
|
| 583 |
+
# If sandbox already exists, return its info or replace the auto CPU sandbox
|
| 584 |
if session and getattr(session, "sandbox", None):
|
| 585 |
sb = session.sandbox
|
| 586 |
+
active_hardware = getattr(session, "sandbox_hardware", None)
|
| 587 |
+
if active_hardware == hardware:
|
| 588 |
+
await _emit_trackio_state(sb)
|
| 589 |
+
return (
|
| 590 |
+
f"Sandbox already active: {sb.space_id}\n"
|
| 591 |
+
f"URL: {sb.url}\n"
|
| 592 |
+
f"Hardware: {active_hardware}\n"
|
| 593 |
+
f"Use bash/read/write/edit to interact with it."
|
| 594 |
+
), True
|
| 595 |
+
|
| 596 |
requested_hardware = args.get("hardware")
|
| 597 |
lockout_note = ""
|
| 598 |
+
if (
|
| 599 |
+
active_hardware == DEFAULT_CPU_SANDBOX_HARDWARE
|
| 600 |
+
and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
|
| 601 |
+
):
|
| 602 |
+
await teardown_session_sandbox(session)
|
| 603 |
+
elif requested_hardware:
|
| 604 |
lockout_note = (
|
| 605 |
f"\nRequested hardware: {requested_hardware}\n"
|
| 606 |
"Hardware cannot be changed by calling sandbox_create again. "
|
| 607 |
"Delete the existing sandbox first if you need a different tier."
|
| 608 |
)
|
| 609 |
+
await _emit_trackio_state(sb)
|
| 610 |
+
return (
|
| 611 |
+
f"Sandbox already active: {sb.space_id}\n"
|
| 612 |
+
f"URL: {sb.url}\n"
|
| 613 |
+
f"{lockout_note}\n"
|
| 614 |
+
f"Use bash/read/write/edit to interact with it."
|
| 615 |
+
), True
|
| 616 |
+
else:
|
| 617 |
+
await _emit_trackio_state(sb)
|
| 618 |
+
return (
|
| 619 |
+
f"Sandbox already active: {sb.space_id}\n"
|
| 620 |
+
f"URL: {sb.url}\n"
|
| 621 |
+
f"Hardware: {active_hardware or 'unknown'}\n"
|
| 622 |
+
f"Use bash/read/write/edit to interact with it."
|
| 623 |
+
), True
|
| 624 |
|
| 625 |
create_kwargs: dict[str, Any] = {}
|
| 626 |
|
|
|
|
| 659 |
"""Factory: create a handler for a sandbox operation tool."""
|
| 660 |
|
| 661 |
async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
|
| 662 |
+
sb, error = await get_active_or_preloaded_sandbox(session)
|
| 663 |
+
if error:
|
| 664 |
+
return error, False
|
| 665 |
+
if not sb:
|
| 666 |
+
return "Sandbox is still starting. Please retry shortly.", False
|
| 667 |
|
| 668 |
try:
|
| 669 |
result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
|
|
|
|
| 688 |
|
| 689 |
tools = []
|
| 690 |
|
| 691 |
+
# sandbox_create (for GPU or other non-default hardware)
|
| 692 |
tools.append(
|
| 693 |
ToolSpec(
|
| 694 |
name=SANDBOX_CREATE_TOOL_SPEC["name"],
|
|
|
|
| 701 |
# Operation tools (auto-execute, no approval needed)
|
| 702 |
for name in Sandbox.TOOLS.keys():
|
| 703 |
spec = Sandbox.TOOLS[name]
|
| 704 |
+
description = (
|
| 705 |
+
"Uses the session's active sandbox. A private cpu-basic sandbox is "
|
| 706 |
+
"started automatically for normal CPU work; call sandbox_create only "
|
| 707 |
+
"for GPU or other non-default hardware.\n\n"
|
| 708 |
+
+ spec["description"]
|
| 709 |
+
)
|
| 710 |
tools.append(
|
| 711 |
ToolSpec(
|
| 712 |
name=name,
|
| 713 |
+
description=description,
|
| 714 |
parameters=spec["parameters"],
|
| 715 |
handler=_make_tool_handler(name),
|
| 716 |
)
|
backend/routes/agent.py
CHANGED
|
@@ -41,6 +41,7 @@ from agent.core.llm_params import _resolve_llm_params
|
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
router = APIRouter(prefix="/api", tags=["agent"])
|
|
|
|
| 44 |
|
| 45 |
DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
|
| 46 |
GATED_MODEL_IDS = {
|
|
@@ -559,6 +560,18 @@ async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionI
|
|
| 559 |
return [SessionInfo(**s) for s in sessions]
|
| 560 |
|
| 561 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
@router.delete("/session/{session_id}")
|
| 563 |
async def delete_session(
|
| 564 |
session_id: str, user: dict = Depends(get_current_user)
|
|
|
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
router = APIRouter(prefix="/api", tags=["agent"])
|
| 44 |
+
_background_teardown_tasks: set[asyncio.Task] = set()
|
| 45 |
|
| 46 |
DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
|
| 47 |
GATED_MODEL_IDS = {
|
|
|
|
| 560 |
return [SessionInfo(**s) for s in sessions]
|
| 561 |
|
| 562 |
|
| 563 |
+
@router.post("/session/{session_id}/sandbox/teardown")
async def teardown_session_sandbox(
    session_id: str,
    user: dict = Depends(get_current_user),
) -> dict:
    """Request a background teardown of the session's sandbox.

    Access to the session is checked first. The teardown itself is scheduled
    as a background task and is not awaited, so the response only confirms
    that the request was accepted, not that the sandbox is gone.
    """
    await _check_session_access(session_id, user)

    # Keep a strong reference in the module-level set until the task
    # completes; the done-callback removes it again.
    pending = asyncio.create_task(session_manager.teardown_sandbox(session_id))
    _background_teardown_tasks.add(pending)
    pending.add_done_callback(_background_teardown_tasks.discard)

    response = {"status": "teardown_requested", "session_id": session_id}
    return response
|
| 573 |
+
|
| 574 |
+
|
| 575 |
@router.delete("/session/{session_id}")
|
| 576 |
async def delete_session(
|
| 577 |
session_id: str, user: dict = Depends(get_current_user)
|
backend/session_manager.py
CHANGED
|
@@ -336,6 +336,20 @@ class SessionManager:
|
|
| 336 |
agent_session.task = task
|
| 337 |
return agent_session
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
@staticmethod
|
| 340 |
def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
|
| 341 |
return (
|
|
@@ -519,6 +533,7 @@ class SessionManager:
|
|
| 519 |
hf_username=hf_username,
|
| 520 |
)
|
| 521 |
return started
|
|
|
|
| 522 |
logger.info("Restored session %s for user %s", session_id, owner or user_id)
|
| 523 |
return agent_session
|
| 524 |
|
|
@@ -599,6 +614,7 @@ class SessionManager:
|
|
| 599 |
event_queue=event_queue,
|
| 600 |
tool_router=tool_router,
|
| 601 |
)
|
|
|
|
| 602 |
await self.persist_session_snapshot(agent_session, runtime_state="idle")
|
| 603 |
|
| 604 |
if is_pro is not None and user_id and user_id != "dev":
|
|
@@ -705,27 +721,9 @@ class SessionManager:
|
|
| 705 |
with exponential backoff. A single missed delete = a permanently
|
| 706 |
orphaned Space, so the cost of an extra retry beats the alternative.
|
| 707 |
"""
|
| 708 |
-
|
| 709 |
-
if not (sandbox and getattr(sandbox, "_owns_space", False)):
|
| 710 |
-
return
|
| 711 |
|
| 712 |
-
|
| 713 |
-
last_err: Exception | None = None
|
| 714 |
-
for attempt in range(3):
|
| 715 |
-
try:
|
| 716 |
-
logger.info(f"Deleting sandbox {space_id} (attempt {attempt + 1}/3)...")
|
| 717 |
-
await asyncio.to_thread(sandbox.delete)
|
| 718 |
-
from agent.core import telemetry
|
| 719 |
-
await telemetry.record_sandbox_destroy(session, sandbox)
|
| 720 |
-
return
|
| 721 |
-
except Exception as e:
|
| 722 |
-
last_err = e
|
| 723 |
-
if attempt < 2:
|
| 724 |
-
await asyncio.sleep(2 ** attempt)
|
| 725 |
-
logger.error(
|
| 726 |
-
f"Failed to delete sandbox {space_id} after 3 attempts: {last_err}. "
|
| 727 |
-
f"Orphan — sweep script will pick it up."
|
| 728 |
-
)
|
| 729 |
|
| 730 |
async def _run_session(
|
| 731 |
self,
|
|
@@ -905,6 +903,18 @@ class SessionManager:
|
|
| 905 |
|
| 906 |
return True
|
| 907 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
async def update_session_title(self, session_id: str, title: str | None) -> None:
|
| 909 |
"""Persist a user-visible title for sidebar rehydration."""
|
| 910 |
agent_session = self.sessions.get(session_id)
|
|
|
|
| 336 |
agent_session.task = task
|
| 337 |
return agent_session
|
| 338 |
|
| 339 |
+
@staticmethod
def _start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
    """Start the cpu-basic sandbox preload for a session, best-effort.

    Any exception raised while importing or starting the preload is logged
    as a warning and swallowed, so the caller never fails because of it.
    """
    try:
        # Imported at call time rather than module import time
        # (presumably to avoid an import cycle — TODO confirm).
        from agent.tools.sandbox_tool import start_cpu_sandbox_preload

        start_cpu_sandbox_preload(agent_session.session)
    except Exception as exc:
        session_id = agent_session.session_id
        logger.warning(
            "Failed to start CPU sandbox preload for %s: %s", session_id, exc
        )
|
| 352 |
+
|
| 353 |
@staticmethod
|
| 354 |
def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
|
| 355 |
return (
|
|
|
|
| 533 |
hf_username=hf_username,
|
| 534 |
)
|
| 535 |
return started
|
| 536 |
+
self._start_cpu_sandbox_preload(agent_session)
|
| 537 |
logger.info("Restored session %s for user %s", session_id, owner or user_id)
|
| 538 |
return agent_session
|
| 539 |
|
|
|
|
| 614 |
event_queue=event_queue,
|
| 615 |
tool_router=tool_router,
|
| 616 |
)
|
| 617 |
+
self._start_cpu_sandbox_preload(agent_session)
|
| 618 |
await self.persist_session_snapshot(agent_session, runtime_state="idle")
|
| 619 |
|
| 620 |
if is_pro is not None and user_id and user_id != "dev":
|
|
|
|
| 721 |
with exponential backoff. A single missed delete = a permanently
|
| 722 |
orphaned Space, so the cost of an extra retry beats the alternative.
|
| 723 |
"""
|
| 724 |
+
from agent.tools.sandbox_tool import teardown_session_sandbox
|
|
|
|
|
|
|
| 725 |
|
| 726 |
+
await teardown_session_sandbox(session)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
async def _run_session(
|
| 729 |
self,
|
|
|
|
| 903 |
|
| 904 |
return True
|
| 905 |
|
| 906 |
+
async def teardown_sandbox(self, session_id: str) -> bool:
    """Tear down only the sandbox of ``session_id``, keeping the session.

    Returns ``False`` when the session is not loaded or is not active.
    Otherwise cleans up the sandbox, persists a snapshot with
    ``runtime_state="idle"`` and returns ``True``.
    """
    # The lock guards only the registry lookup; cleanup runs outside it.
    async with self._lock:
        agent_session = self.sessions.get(session_id)

    if not (agent_session and agent_session.is_active):
        return False

    await self._cleanup_sandbox(agent_session.session)
    await self.persist_session_snapshot(agent_session, runtime_state="idle")
    return True
|
| 917 |
+
|
| 918 |
async def update_session_title(self, session_id: str, title: str | None) -> None:
|
| 919 |
"""Persist a user-visible title for sidebar rehydration."""
|
| 920 |
agent_session = self.sessions.get(session_id)
|
frontend/src/components/Layout/AppLayout.tsx
CHANGED
|
@@ -122,6 +122,39 @@ export default function AppLayout() {
|
|
| 122 |
};
|
| 123 |
}, [isConnected, activeSessionId]);
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
const handleSessionDead = useCallback(
|
| 126 |
(deadSessionId: string) => {
|
| 127 |
// Backend lost this session — mark it expired so the chat shows a
|
|
|
|
| 122 |
};
|
| 123 |
}, [isConnected, activeSessionId]);
|
| 124 |
|
| 125 |
+
// Best-effort sandbox cleanup when the browser tab/window closes. This
|
| 126 |
+
// preserves durable chat history; explicit delete still removes the session.
|
| 127 |
+
useEffect(() => {
|
| 128 |
+
const teardownSandboxes = () => {
|
| 129 |
+
const liveSessionIds = useSessionStore
|
| 130 |
+
.getState()
|
| 131 |
+
.sessions
|
| 132 |
+
.filter((session) => session.isActive && !session.expired)
|
| 133 |
+
.map((session) => session.id);
|
| 134 |
+
|
| 135 |
+
for (const sessionId of liveSessionIds) {
|
| 136 |
+
const url = `/api/session/${sessionId}/sandbox/teardown`;
|
| 137 |
+
const body = '{}';
|
| 138 |
+
const blob = new Blob([body], { type: 'application/json' });
|
| 139 |
+
|
| 140 |
+
if (navigator.sendBeacon?.(url, blob)) {
|
| 141 |
+
continue;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
fetch(url, {
|
| 145 |
+
method: 'POST',
|
| 146 |
+
body,
|
| 147 |
+
keepalive: true,
|
| 148 |
+
credentials: 'include',
|
| 149 |
+
headers: { 'Content-Type': 'application/json' },
|
| 150 |
+
}).catch(() => {});
|
| 151 |
+
}
|
| 152 |
+
};
|
| 153 |
+
|
| 154 |
+
window.addEventListener('pagehide', teardownSandboxes);
|
| 155 |
+
return () => window.removeEventListener('pagehide', teardownSandboxes);
|
| 156 |
+
}, []);
|
| 157 |
+
|
| 158 |
const handleSessionDead = useCallback(
|
| 159 |
(deadSessionId: string) => {
|
| 160 |
// Backend lost this session — mark it expired so the chat shows a
|
tests/unit/test_sandbox_auto_start.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from types import SimpleNamespace
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from agent.core.agent_loop import _needs_approval
|
| 5 |
+
from agent.tools.sandbox_tool import get_sandbox_tools
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_default_cpu_sandbox_create_does_not_require_approval():
    """sandbox_create with omitted or cpu-basic hardware needs no approval."""
    interactive_config = SimpleNamespace(yolo_mode=False)

    for tool_args in ({}, {"hardware": "cpu-basic"}):
        assert _needs_approval("sandbox_create", tool_args, interactive_config) is False
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_non_default_sandbox_create_still_requires_approval():
    """sandbox_create for any hardware other than cpu-basic needs approval."""
    interactive_config = SimpleNamespace(yolo_mode=False)

    for hardware in ("cpu-upgrade", "t4-small"):
        tool_args = {"hardware": hardware}
        assert _needs_approval("sandbox_create", tool_args, interactive_config) is True
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_prompt_and_tool_specs_do_not_require_cpu_sandbox_create():
    """The system prompt and tool descriptions advertise the auto CPU sandbox.

    Checks that the prompt no longer tells the agent to call sandbox_create
    before CPU work, and that the sandbox_create and bash tool descriptions
    mention the automatically started cpu-basic sandbox.
    """
    # Resolve the prompt relative to this file (tests/unit/ -> repo root)
    # so the test does not depend on pytest's working directory.
    repo_root = Path(__file__).resolve().parents[2]
    prompt_path = repo_root / "agent" / "prompts" / "system_prompt_v3.yaml"
    # The prompt contains non-ASCII text (e.g. "→"); decode it explicitly
    # instead of relying on the platform's locale encoding.
    prompt = prompt_path.read_text(encoding="utf-8")
    tool_specs = {tool.name: tool.description for tool in get_sandbox_tools()}

    assert "sandbox_create → install deps" not in prompt
    assert "Do NOT call sandbox_create before normal CPU work" in prompt
    assert "cpu-basic sandbox is already available" in prompt

    assert "cpu-basic sandbox is already started automatically" in tool_specs["sandbox_create"]
    assert "started automatically for normal CPU work" in tool_specs["bash"]
|
tests/unit/test_sandbox_private_spaces.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
import asyncio
|
|
|
|
|
|
|
| 2 |
from types import SimpleNamespace
|
| 3 |
|
| 4 |
from agent.core import telemetry
|
|
@@ -9,6 +11,7 @@ from agent.tools.sandbox_tool import sandbox_create_handler
|
|
| 9 |
|
| 10 |
def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
| 11 |
duplicate_kwargs = {}
|
|
|
|
| 12 |
|
| 13 |
class FakeApi:
|
| 14 |
def __init__(self, token=None):
|
|
@@ -17,6 +20,10 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
|
| 17 |
def duplicate_space(self, **kwargs):
|
| 18 |
duplicate_kwargs.update(kwargs)
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def add_space_secret(self, *args, **kwargs):
|
| 21 |
pass
|
| 22 |
|
|
@@ -34,6 +41,54 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
|
| 34 |
Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
|
| 35 |
|
| 36 |
assert duplicate_kwargs["private"] is True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def test_sandbox_tool_forces_private_spaces(monkeypatch):
|
|
@@ -68,6 +123,29 @@ def test_sandbox_tool_forces_private_spaces(monkeypatch):
|
|
| 68 |
assert "Visibility: private" in out
|
| 69 |
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def test_ensure_sandbox_overrides_private_argument(monkeypatch):
|
| 72 |
captured_kwargs = {}
|
| 73 |
|
|
@@ -114,3 +192,230 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
|
|
| 114 |
assert error is None
|
| 115 |
assert sb is not None
|
| 116 |
assert captured_kwargs["private"] is True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
import threading
|
| 3 |
+
import time
|
| 4 |
from types import SimpleNamespace
|
| 5 |
|
| 6 |
from agent.core import telemetry
|
|
|
|
| 11 |
|
| 12 |
def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
| 13 |
duplicate_kwargs = {}
|
| 14 |
+
requested_hardware = []
|
| 15 |
|
| 16 |
class FakeApi:
|
| 17 |
def __init__(self, token=None):
|
|
|
|
| 20 |
def duplicate_space(self, **kwargs):
|
| 21 |
duplicate_kwargs.update(kwargs)
|
| 22 |
|
| 23 |
+
def request_space_hardware(self, space_id, hardware, sleep_time=None):
|
| 24 |
+
requested_hardware.append((space_id, hardware, sleep_time))
|
| 25 |
+
return SimpleNamespace(stage="BUILDING", hardware=None)
|
| 26 |
+
|
| 27 |
def add_space_secret(self, *args, **kwargs):
|
| 28 |
pass
|
| 29 |
|
|
|
|
| 41 |
Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
|
| 42 |
|
| 43 |
assert duplicate_kwargs["private"] is True
|
| 44 |
+
assert requested_hardware == [(duplicate_kwargs["to_id"], "cpu-basic", None)]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
    """Sandbox.create survives one 404 from get_space_runtime and retries."""
    runtime_attempts: list[str] = []

    class FakeResponse:
        status_code = 404

    class FakeRuntime404(Exception):
        # Exposes a ``response`` with status 404, like an HTTP client error.
        response = FakeResponse()

        def __str__(self):
            return "404 Client Error: Repository Not Found"

    class FakeApi:
        def __init__(self, token=None):
            self.token = token

        def duplicate_space(self, **kwargs):
            return None

        def request_space_hardware(self, space_id, hardware, sleep_time=None):
            return SimpleNamespace(stage="BUILDING", hardware=None)

        def add_space_secret(self, *args, **kwargs):
            return None

        def get_space_runtime(self, space_id):
            # Fail on the first lookup only; succeed on every later one.
            runtime_attempts.append(space_id)
            if len(runtime_attempts) == 1:
                raise FakeRuntime404()
            return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")

    def no_op(*args, **kwargs):
        return None

    monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
    monkeypatch.setattr(sandbox_client.time, "sleep", lambda seconds: None)
    monkeypatch.setattr(Sandbox, "_setup_server", staticmethod(no_op))
    monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)

    created = Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)

    assert created.space_id.startswith("alice/sandbox-")
    assert len(runtime_attempts) == 2
|
| 92 |
|
| 93 |
|
| 94 |
def test_sandbox_tool_forces_private_spaces(monkeypatch):
|
|
|
|
| 123 |
assert "Visibility: private" in out
|
| 124 |
|
| 125 |
|
| 126 |
+
def test_orphan_sweep_preserves_spaces_without_last_modified():
    """A sandbox Space lacking lastModified is skipped, logged, not deleted."""
    space_id = "alice/sandbox-12345678"
    removed_repos: list[str] = []
    log_lines: list[str] = []

    class FakeApi:
        def list_spaces(self, **kwargs):
            # The sweep is expected to request full Space metadata.
            assert kwargs["full"] is True
            return [SimpleNamespace(id=space_id)]

        def delete_repo(self, repo_id, repo_type):
            removed_repos.append(repo_id)

    swept = sandbox_tool._cleanup_user_orphan_sandboxes(FakeApi(), "alice", log_lines.append)

    assert swept == 0
    assert removed_repos == []
    assert log_lines == ["orphan sweep: skipping alice/sandbox-12345678; missing lastModified"]
|
| 147 |
+
|
| 148 |
+
|
| 149 |
def test_ensure_sandbox_overrides_private_argument(monkeypatch):
|
| 150 |
captured_kwargs = {}
|
| 151 |
|
|
|
|
| 192 |
assert error is None
|
| 193 |
assert sb is not None
|
| 194 |
assert captured_kwargs["private"] is True
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
    """Two concurrent _ensure_sandbox calls for one owner never overlap.

    The fake ``Sandbox.create`` tracks how many creations are in flight and
    the test asserts that the observed peak is exactly one.
    """
    in_flight = {"now": 0, "peak": 0}
    counter_lock = threading.Lock()

    class FakeApi:
        def __init__(self, token=None):
            self.token = token

        def whoami(self):
            return {"name": "alice"}

    class FakeSession:
        def __init__(self):
            self.hf_token = "hf-token"
            self.sandbox = None
            self.event_queue = SimpleNamespace(put_nowait=lambda event: None)
            self._cancelled = asyncio.Event()

        async def send_event(self, event):
            return None

    def fake_create(**kwargs):
        with counter_lock:
            in_flight["now"] += 1
            in_flight["peak"] = max(in_flight["peak"], in_flight["now"])
        # Sleep outside the lock so overlapping creations would be observed.
        time.sleep(0.02)
        with counter_lock:
            in_flight["now"] -= 1
        return SimpleNamespace(
            space_id=f"alice/sandbox-{kwargs['hardware']}",
            url="https://huggingface.co/spaces/alice/sandbox",
        )

    async def fake_record_sandbox_create(*args, **kwargs):
        return None

    monkeypatch.setattr(sandbox_tool, "HfApi", FakeApi)
    monkeypatch.setattr(sandbox_tool, "_cleanup_user_orphan_sandboxes", lambda *args: 0)
    monkeypatch.setattr(Sandbox, "create", staticmethod(fake_create))
    monkeypatch.setattr(telemetry, "record_sandbox_create", fake_record_sandbox_create)
    monkeypatch.setattr("huggingface_hub.metadata_update", lambda *args, **kwargs: None)

    async def run():
        first = sandbox_tool._ensure_sandbox(FakeSession())
        second = sandbox_tool._ensure_sandbox(FakeSession())
        await asyncio.gather(first, second)

    asyncio.run(run())

    assert in_flight["peak"] == 1
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def test_sandbox_operation_waits_for_cpu_preload():
    """An operation tool waits for the pending preload, then uses its sandbox."""
    recorded_calls: list[tuple[str, dict]] = []
    command_args = {"command": "echo ok"}

    class FakeSandbox:
        def call_tool(self, name, args):
            recorded_calls.append((name, args))
            return SimpleNamespace(success=True, output="preloaded-ok", error="")

    async def run():
        session = SimpleNamespace(sandbox=None, sandbox_preload_error=None)

        async def preload():
            # Yield once so the sandbox only appears after the handler starts.
            await asyncio.sleep(0)
            session.sandbox = FakeSandbox()

        session.sandbox_preload_task = asyncio.create_task(preload())
        bash_handler = sandbox_tool._make_tool_handler("bash")
        return await bash_handler({"command": "echo ok"}, session=session)

    output, succeeded = asyncio.run(run())

    assert succeeded is True
    assert output == "preloaded-ok"
    assert recorded_calls == [("bash", command_args)]
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def test_default_sandbox_create_waits_for_cpu_preload():
    """Default sandbox_create reports the preloaded cpu-basic sandbox."""

    class FakeSandbox:
        space_id = "alice/sandbox-cpu"
        url = "https://huggingface.co/spaces/alice/sandbox-cpu"

    async def run():
        session = SimpleNamespace(sandbox=None, sandbox_preload_error=None)

        async def preload():
            # Yield once so the sandbox only appears after the handler starts.
            await asyncio.sleep(0)
            session.sandbox = FakeSandbox()
            session.sandbox_hardware = "cpu-basic"

        session.sandbox_preload_task = asyncio.create_task(preload())
        result = await sandbox_tool.sandbox_create_handler({}, session=session)
        return result

    output, succeeded = asyncio.run(run())

    assert succeeded is True
    assert "Sandbox already active: alice/sandbox-cpu" in output
    assert "Hardware: cpu-basic" in output
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
    """Requesting GPU hardware deletes the auto cpu-basic sandbox first."""
    deleted_spaces: list[str] = []

    def delete_cpu_sandbox():
        deleted_spaces.append("alice/sandbox-cpu")

    class FakeSession:
        def __init__(self):
            self.sandbox = SimpleNamespace(
                space_id="alice/sandbox-cpu",
                url="https://huggingface.co/spaces/alice/sandbox-cpu",
                _owns_space=True,
                delete=delete_cpu_sandbox,
            )
            self.sandbox_hardware = "cpu-basic"
            self.sandbox_preload_task = None
            self.sandbox_preload_cancel_event = None

        async def send_event(self, event):
            return None

    gpu_sandbox = SimpleNamespace(
        space_id="alice/sandbox-gpu",
        url="https://huggingface.co/spaces/alice/sandbox-gpu",
        _owns_space=True,
    )

    async def fake_ensure_sandbox(session, hardware="cpu-basic", **kwargs):
        # Stand-in for real creation: attach the GPU sandbox to the session.
        session.sandbox = gpu_sandbox
        session.sandbox_hardware = hardware
        return gpu_sandbox, None

    async def fake_record_sandbox_destroy(*args, **kwargs):
        return None

    monkeypatch.setattr(sandbox_tool, "_ensure_sandbox", fake_ensure_sandbox)
    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)

    session = FakeSession()
    request = sandbox_tool.sandbox_create_handler({"hardware": "a100-large"}, session=session)
    output, succeeded = asyncio.run(request)

    assert succeeded is True
    assert deleted_spaces == ["alice/sandbox-cpu"]
    assert session.sandbox is gpu_sandbox
    assert session.sandbox_hardware == "a100-large"
    assert "Hardware: a100-large" in output
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
    """Teardown sets the cancel event, deletes the Space, clears the session."""
    deleted_spaces: list[str] = []

    async def fake_record_sandbox_destroy(*args, **kwargs):
        return None

    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)

    async def run():
        cancel_event = threading.Event()

        async def preload():
            await asyncio.sleep(0)

        owned_sandbox = SimpleNamespace(
            space_id="alice/sandbox-12345678",
            _owns_space=True,
            delete=lambda: deleted_spaces.append("alice/sandbox-12345678"),
        )
        session = SimpleNamespace(
            sandbox=owned_sandbox,
            sandbox_hardware="cpu-basic",
            sandbox_preload_task=asyncio.create_task(preload()),
            sandbox_preload_cancel_event=cancel_event,
        )

        await sandbox_tool.teardown_session_sandbox(session)
        return session, cancel_event

    session, cancel_event = asyncio.run(run())

    assert cancel_event.is_set()
    assert deleted_spaces == ["alice/sandbox-12345678"]
    assert session.sandbox is None
    assert session.sandbox_hardware is None
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
def test_cancel_sandbox_preload_cancels_task_after_timeout(monkeypatch):
    """A preload that outlives the wait is cancelled and the event is set."""

    async def run():
        async def fake_wait_for(awaitable, timeout):
            # Simulate the wait timing out without actually waiting.
            await asyncio.sleep(0)
            raise asyncio.TimeoutError

        monkeypatch.setattr(sandbox_tool.asyncio, "wait_for", fake_wait_for)

        cancel_event = threading.Event()
        never_set = asyncio.Event()

        async def preload():
            # Blocks forever unless the task is cancelled.
            await never_set.wait()

        preload_task = asyncio.create_task(preload())
        session = SimpleNamespace(
            sandbox_preload_task=preload_task,
            sandbox_preload_cancel_event=cancel_event,
        )

        await sandbox_tool.cancel_sandbox_preload(session)
        # Give the loop one more turn before inspecting the task state.
        await asyncio.sleep(0)

        return preload_task.cancelled(), cancel_event.is_set()

    task_cancelled, cancel_event_set = asyncio.run(run())

    assert task_cancelled is True
    assert cancel_event_set is True
|
tests/unit/test_session_manager_persistence.py
CHANGED
|
@@ -186,6 +186,12 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
|
|
| 186 |
store = RestoreStore(delay=0.01)
|
| 187 |
manager = _manager_with_store(store)
|
| 188 |
stop = _install_fake_runtime(manager)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
try:
|
| 191 |
first, second = await asyncio.gather(
|
|
@@ -197,12 +203,56 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
|
|
| 197 |
assert first is second
|
| 198 |
assert list(manager.sessions) == ["persisted-session"]
|
| 199 |
assert manager.run_calls == 1 # type: ignore[attr-defined]
|
|
|
|
| 200 |
assert not stop.is_set()
|
| 201 |
finally:
|
| 202 |
stop.set()
|
| 203 |
await _cancel_runtime_tasks(manager)
|
| 204 |
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
@pytest.mark.asyncio
|
| 207 |
async def test_lazy_restore_preserves_pending_approval_tool_calls():
|
| 208 |
store = RestoreStore(
|
|
|
|
| 186 |
store = RestoreStore(delay=0.01)
|
| 187 |
manager = _manager_with_store(store)
|
| 188 |
stop = _install_fake_runtime(manager)
|
| 189 |
+
scheduled: list[str] = []
|
| 190 |
+
|
| 191 |
+
def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
|
| 192 |
+
scheduled.append(agent_session.session_id)
|
| 193 |
+
|
| 194 |
+
manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload # type: ignore[method-assign]
|
| 195 |
|
| 196 |
try:
|
| 197 |
first, second = await asyncio.gather(
|
|
|
|
| 203 |
assert first is second
|
| 204 |
assert list(manager.sessions) == ["persisted-session"]
|
| 205 |
assert manager.run_calls == 1 # type: ignore[attr-defined]
|
| 206 |
+
assert scheduled == ["persisted-session"]
|
| 207 |
assert not stop.is_set()
|
| 208 |
finally:
|
| 209 |
stop.set()
|
| 210 |
await _cancel_runtime_tasks(manager)
|
| 211 |
|
| 212 |
|
| 213 |
+
@pytest.mark.asyncio
async def test_create_session_schedules_cpu_sandbox_preload():
    """create_session triggers the CPU sandbox preload once for the new id."""
    manager = _manager_with_store(NoopSessionStore())
    stop = _install_fake_runtime(manager)
    preloaded_ids: list[str] = []

    def record_preload(agent_session: AgentSession) -> None:
        preloaded_ids.append(agent_session.session_id)

    # Replace the real preload hook with a recorder.
    manager._start_cpu_sandbox_preload = record_preload  # type: ignore[method-assign]

    try:
        new_id = await manager.create_session(user_id="owner", hf_token="token")

        assert preloaded_ids == [new_id]
        assert new_id in manager.sessions
    finally:
        stop.set()
        await _cancel_runtime_tasks(manager)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
@pytest.mark.asyncio
async def test_lazy_restore_schedules_cpu_sandbox_preload():
    """Lazily restoring a persisted session triggers the CPU preload once."""
    persisted_id = "persisted-session"
    manager = _manager_with_store(RestoreStore())
    stop = _install_fake_runtime(manager)
    preloaded_ids: list[str] = []

    def record_preload(agent_session: AgentSession) -> None:
        preloaded_ids.append(agent_session.session_id)

    # Replace the real preload hook with a recorder.
    manager._start_cpu_sandbox_preload = record_preload  # type: ignore[method-assign]

    try:
        restored = await manager.ensure_session_loaded(persisted_id, user_id="owner")

        assert restored is not None
        assert preloaded_ids == ["persisted-session"]
        assert persisted_id in manager.sessions
    finally:
        stop.set()
        await _cancel_runtime_tasks(manager)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
@pytest.mark.asyncio
|
| 257 |
async def test_lazy_restore_preserves_pending_approval_tool_calls():
|
| 258 |
store = RestoreStore(
|