ml-intern

Sleeping

App Files Files Community

lewtun HF Staff Codex committed on May 1

Commit

8615c28

unverified ·

1 Parent(s): 77324b8

Auto-start CPU sandboxes for sessions (#200)

Browse files

* Auto-start CPU sandboxes for sessions

Co-authored-by: Codex <codex@openai.com>

* Retry sandbox runtime visibility checks

Co-authored-by: Codex <codex@openai.com>

* Stabilize auto CPU sandbox creation

Co-authored-by: OpenAI Codex <codex@openai.com>

* Address sandbox PR review comments

Co-authored-by: OpenAI Codex <codex@openai.com>

---------

Co-authored-by: Codex <codex@openai.com>

Files changed (11) hide show

agent/core/agent_loop.py +3 -1
agent/core/session.py +4 -0
agent/prompts/system_prompt_v3.yaml +4 -2
agent/tools/sandbox_client.py +36 -1
agent/tools/sandbox_tool.py +295 -50
backend/routes/agent.py +13 -0
backend/session_manager.py +30 -20
frontend/src/components/Layout/AppLayout.tsx +33 -0
tests/unit/test_sandbox_auto_start.py +31 -0
tests/unit/test_sandbox_private_spaces.py +305 -0
tests/unit/test_session_manager_persistence.py +50 -0

agent/core/agent_loop.py CHANGED Viewed

@@ -32,6 +32,7 @@ from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
 logger = logging.getLogger(__name__)
@@ -155,7 +156,8 @@ def _base_needs_approval(
         return False
     if tool_name == "sandbox_create":
-        return True
     if tool_name == "hf_jobs":
         operation = _operation(tool_args)

 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
+from agent.tools.sandbox_tool import DEFAULT_CPU_SANDBOX_HARDWARE
 logger = logging.getLogger(__name__)
         return False
     if tool_name == "sandbox_create":
+        hardware = tool_args.get("hardware") or DEFAULT_CPU_SANDBOX_HARDWARE
+        return hardware != DEFAULT_CPU_SANDBOX_HARDWARE
     if tool_name == "hf_jobs":
         operation = _operation(tool_args)

agent/core/session.py CHANGED Viewed

@@ -116,6 +116,10 @@ class Session:
         self._cancelled = asyncio.Event()
         self.pending_approval: Optional[dict[str, Any]] = None
         self.sandbox = None
         self._running_job_ids: set[str] = set()  # HF job IDs currently executing
         self.notification_gateway = notification_gateway
         self.notification_destinations = list(notification_destinations or [])

         self._cancelled = asyncio.Event()
         self.pending_approval: Optional[dict[str, Any]] = None
         self.sandbox = None
+        self.sandbox_hardware: Optional[str] = None
+        self.sandbox_preload_task: Optional[asyncio.Task] = None
+        self.sandbox_preload_error: Optional[str] = None
+        self.sandbox_preload_cancel_event: Any | None = None
         self._running_job_ids: set[str] = set()  # HF job IDs currently executing
         self.notification_gateway = notification_gateway
         self.notification_destinations = list(notification_destinations or [])

agent/prompts/system_prompt_v3.yaml CHANGED Viewed

@@ -122,8 +122,10 @@ system_prompt: |
   # Sandbox-first development
-  For non-trivial scripts, develop and test in a sandbox before launching via hf_jobs:
-    sandbox_create → install deps → write script → test with small run → fix errors → launch via hf_jobs at scale
   Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.

   # Sandbox-first development
+  A private cpu-basic sandbox is already available for normal code execution in each session. For non-trivial scripts, develop and test there before launching via hf_jobs:
+    write script → pip install → test with small run using bash/read/write/edit → fix errors → launch via hf_jobs at scale
+  Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
   Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.

agent/tools/sandbox_client.py CHANGED Viewed

@@ -66,6 +66,15 @@ WAIT_TIMEOUT = 600
 WAIT_INTERVAL = 5
 API_WAIT_TIMEOUT = 180
 _DOCKERFILE = """\
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
@@ -615,6 +624,16 @@ class Sandbox:
         _check_cancel()
         # Inject secrets BEFORE uploading server files (which triggers rebuild).
         # Secrets added after a Space is running aren't available until restart,
         # so they must be set before the build/start cycle.
@@ -633,8 +652,24 @@ class Sandbox:
         deadline = time.time() + wait_timeout
         while time.time() < deadline:
             _check_cancel()
-            runtime = api.get_space_runtime(space_id)
             if runtime.stage == "RUNNING":
                 _log(f"Space is running (hardware: {runtime.hardware})")
                 break
             if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):

 WAIT_INTERVAL = 5
 API_WAIT_TIMEOUT = 180
+def _is_transient_space_visibility_error(error: Exception) -> bool:
+    """Report whether ``error`` looks like a "Space not found (yet)" failure.
+
+    The error qualifies when its ``response`` attribute (if any) carries
+    status code 404, or when its string form contains one of the not-found
+    markers checked below.
+    """
+    status = getattr(getattr(error, "response", None), "status_code", None)
+    if status == 404:
+        return True
+    text = str(error)
+    return any(
+        marker in text for marker in ("Repository Not Found", "404 Client Error")
+    )
 _DOCKERFILE = """\
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
         _check_cancel()
+        # Some template duplicates can initially inherit the template hardware.
+        # Explicitly request the target tier so automatic CPU sandboxes never
+        # silently come up on GPU hardware.
+        api.request_space_hardware(
+            space_id,
+            hardware=hardware,
+            sleep_time=sleep_time,
+        )
+        _log(f"Requested hardware: {hardware}")
         # Inject secrets BEFORE uploading server files (which triggers rebuild).
         # Secrets added after a Space is running aren't available until restart,
         # so they must be set before the build/start cycle.
         deadline = time.time() + wait_timeout
         while time.time() < deadline:
             _check_cancel()
+            try:
+                runtime = api.get_space_runtime(space_id)
+            except Exception as e:
+                if _is_transient_space_visibility_error(e):
+                    _log("  Space runtime not visible yet...")
+                    time.sleep(WAIT_INTERVAL)
+                    continue
+                raise
             if runtime.stage == "RUNNING":
+                current_hardware = runtime.hardware or getattr(
+                    runtime, "requested_hardware", None
+                )
+                if current_hardware != hardware:
+                    _log(
+                        f"  RUNNING on {current_hardware}; waiting for {hardware}..."
+                    )
+                    time.sleep(WAIT_INTERVAL)
+                    continue
                 _log(f"Space is running (hardware: {runtime.hardware})")
                 break
             if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):

agent/tools/sandbox_tool.py CHANGED Viewed

@@ -2,11 +2,11 @@
 Sandbox tools — expose the Sandbox client as agent tools.
 5 tools total:
-  sandbox_create — explicit sandbox creation (requires approval)
-  bash, read, write, edit — operations on the sandbox
-If any operation tool is called without an active sandbox,
-a cpu-basic sandbox is auto-created (no approval needed).
 """
 from __future__ import annotations
@@ -15,6 +15,7 @@ import asyncio
 import logging
 import re
 import threading
 from datetime import datetime, timedelta, timezone
 from typing import Any
@@ -26,6 +27,8 @@ from agent.tools.trackio_seed import ensure_trackio_dashboard
 logger = logging.getLogger(__name__)
 # Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
 # Used to identify orphan sandboxes from prior sessions safely (won't match
 # user-renamed lookalikes).
@@ -36,6 +39,23 @@ _SANDBOX_NAME_RE = re.compile(r"^sandbox-[a-f0-9]{8}$")
 # so we leave it alone.
 _ORPHAN_STALE_AFTER = timedelta(hours=1)
 def _looks_like_path(script: str) -> bool:
     """Return True if the script string looks like a file path (not inline code)."""
@@ -124,7 +144,7 @@ def _cleanup_user_orphan_sandboxes(
     cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
     deleted = 0
     try:
-        spaces = list(api.list_spaces(author=owner, limit=200))
     except Exception as e:
         log(f"orphan sweep: list_spaces failed: {e}")
         return 0
@@ -140,6 +160,9 @@ def _cleanup_user_orphan_sandboxes(
                 last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
             except ValueError:
                 last_mod = None
         if last_mod and last_mod > cutoff:
             # Recent — could be a concurrent live session. Skip.
             continue
@@ -158,8 +181,9 @@ def _cleanup_user_orphan_sandboxes(
 async def _ensure_sandbox(
     session: Any,
-    hardware: str = "cpu-basic",
     extra_secrets: dict[str, str] | None = None,
     **create_kwargs,
 ) -> tuple[Sandbox | None, str | None]:
     """
@@ -184,6 +208,45 @@ async def _ensure_sandbox(
     if not owner:
         return None, "Could not determine HF username from token."
     await session.send_event(
         Event(
             event_type="tool_log",
@@ -203,27 +266,10 @@ async def _ensure_sandbox(
             Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
         )
-    # Before we create a new sandbox, sweep this user's stale sandboxes from
-    # prior sessions. ``_cleanup_sandbox`` in session_manager fires only on
-    # clean session exit; pod kills, WebSocket drops, etc. leave orphans
-    # behind, and they accumulate on every new session forever (observed
-    # 2310 leaked across the Hub on 2026-04-27). Doing the cleanup here at
-    # session start = self-healing, no separate cron needed.
-    #
-    # The 1h staleness filter is the safety: a sandbox modified in the last
-    # hour might still be tied to a live session in another tab, so we skip.
-    # Anything older has no realistic chance of being active given typical
-    # session lengths.
-    try:
-        await asyncio.to_thread(_cleanup_user_orphan_sandboxes, api, owner, _log)
-    except Exception as e:
-        # Cleanup is best-effort — never block sandbox_create on it.
-        _log(f"orphan sandbox sweep failed (non-fatal): {e}")
     # Bridge asyncio cancel event to a threading.Event for the blocking create call.
     # We poll session._cancelled from the main loop in a background task and set
     # a threading.Event that Sandbox.create checks during its polling loops.
-    cancel_flag = threading.Event()
     async def _watch_cancel():
         await session._cancelled.wait()
@@ -245,7 +291,7 @@ async def _ensure_sandbox(
         "cancel_event": cancel_flag,
         **create_kwargs,
     }
-    if hardware != "cpu-basic":
         kwargs["sleep_time"] = 2700
     import time as _t
     _t_start = _t.monotonic()
@@ -255,7 +301,18 @@ async def _ensure_sandbox(
         return None, "Sandbox creation cancelled by user."
     finally:
         watcher_task.cancel()
     session.sandbox = sb
     # Telemetry: sandbox creation (infra consumption signal)
     from agent.core import telemetry
@@ -286,19 +343,146 @@ async def _ensure_sandbox(
     return sb, None
 # ── sandbox_create tool ──────────────────────────────────────────────
 SANDBOX_CREATE_TOOL_SPEC = {
     "name": "sandbox_create",
     "description": (
-        "Create a persistent remote Linux environment for developing and testing scripts.\n\n"
-        "Workflow: sandbox_create → write script → pip install → test with small run → fix errors → hf_jobs at scale.\n"
-        "The sandbox persists across tool calls within the session. pip install works out of the box. "
         "Sandboxes are always created as private HF Spaces.\n\n"
-        "Use this when: you need to develop, test, and iterate on scripts before launching via hf_jobs. "
-        "Especially for training scripts where you need to verify imports, test on a small subset, and fix errors interactively.\n\n"
-        "Skip this when: the task is a simple one-shot operation (status check, resource search, quick data query), "
-        "or the script is copied from a verified working example with minimal changes.\n\n"
         "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
         "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
         "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
@@ -318,7 +502,10 @@ SANDBOX_CREATE_TOOL_SPEC = {
             "hardware": {
                 "type": "string",
                 "enum": [e.value for e in SpaceHardware],
-                "description": "Hardware tier for the sandbox (default: cpu-basic)",
             },
             "trackio_space_id": {
                 "type": "string",
@@ -346,7 +533,7 @@ async def sandbox_create_handler(
     args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
 ) -> tuple[str, bool]:
     """Handle sandbox_create tool calls."""
-    hardware = args.get("hardware", "cpu-basic")
     trackio_space_id = args.get("trackio_space_id") or None
     trackio_project = args.get("trackio_project") or None
@@ -364,24 +551,76 @@ async def sandbox_create_handler(
             data["trackioProject"] = trackio_project
         await session.send_event(Event(event_type="tool_state_change", data=data))
-    # If sandbox already exists, return its info
     if session and getattr(session, "sandbox", None):
         sb = session.sandbox
         requested_hardware = args.get("hardware")
         lockout_note = ""
-        if requested_hardware:
             lockout_note = (
                 f"\nRequested hardware: {requested_hardware}\n"
                 "Hardware cannot be changed by calling sandbox_create again. "
                 "Delete the existing sandbox first if you need a different tier."
             )
-        await _emit_trackio_state(sb)
-        return (
-            f"Sandbox already active: {sb.space_id}\n"
-            f"URL: {sb.url}\n"
-            f"{lockout_note}\n"
-            f"Use bash/read/write/edit to interact with it."
-        ), True
     create_kwargs: dict[str, Any] = {}
@@ -420,11 +659,11 @@ def _make_tool_handler(sandbox_tool_name: str):
     """Factory: create a handler for a sandbox operation tool."""
     async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
-        # Require sandbox to exist — user must approve sandbox_create first
-        if not session or not getattr(session, "sandbox", None):
-            return "No sandbox running. Call sandbox_create first to start one.", False
-        sb = session.sandbox
         try:
             result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
@@ -449,7 +688,7 @@ def get_sandbox_tools():
     tools = []
-    # sandbox_create (explicit creation, requires approval)
     tools.append(
         ToolSpec(
             name=SANDBOX_CREATE_TOOL_SPEC["name"],
@@ -462,10 +701,16 @@ def get_sandbox_tools():
     # Operation tools (auto-execute, no approval needed)
     for name in Sandbox.TOOLS.keys():
         spec = Sandbox.TOOLS[name]
         tools.append(
             ToolSpec(
                 name=name,
-                description=spec["description"],
                 parameters=spec["parameters"],
                 handler=_make_tool_handler(name),
             )

 Sandbox tools — expose the Sandbox client as agent tools.
 5 tools total:
+  sandbox_create — create/replace sandbox for non-default hardware
+  bash, read, write, edit — operations on the active sandbox
+A cpu-basic sandbox is preloaded for each session. Operation tools wait for it
+if startup is still in progress.
 """
 from __future__ import annotations
 import logging
 import re
 import threading
+import weakref
 from datetime import datetime, timedelta, timezone
 from typing import Any
 logger = logging.getLogger(__name__)
+DEFAULT_CPU_SANDBOX_HARDWARE = "cpu-basic"
 # Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
 # Used to identify orphan sandboxes from prior sessions safely (won't match
 # user-renamed lookalikes).
 # so we leave it alone.
 _ORPHAN_STALE_AFTER = timedelta(hours=1)
+# HF Space duplication/build APIs can behave poorly when multiple private
+# sandboxes are created concurrently for the same namespace. Keep session
+# creation non-blocking, but serialize the actual Hub create path per owner.
+_SANDBOX_CREATE_LOCKS: weakref.WeakKeyDictionary[
+    asyncio.AbstractEventLoop, dict[str, asyncio.Lock]
+] = weakref.WeakKeyDictionary()
+def _get_sandbox_create_lock(owner: str) -> asyncio.Lock:
+    loop = asyncio.get_running_loop()
+    locks = _SANDBOX_CREATE_LOCKS.setdefault(loop, {})
+    lock = locks.get(owner)
+    if lock is None:
+        lock = asyncio.Lock()
+        locks[owner] = lock
+    return lock
 def _looks_like_path(script: str) -> bool:
     """Return True if the script string looks like a file path (not inline code)."""
     cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
     deleted = 0
     try:
+        spaces = list(api.list_spaces(author=owner, limit=200, full=True))
     except Exception as e:
         log(f"orphan sweep: list_spaces failed: {e}")
         return 0
                 last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
             except ValueError:
                 last_mod = None
+        if last_mod is None:
+            log(f"orphan sweep: skipping {space.id}; missing lastModified")
+            continue
         if last_mod and last_mod > cutoff:
             # Recent — could be a concurrent live session. Skip.
             continue
 async def _ensure_sandbox(
     session: Any,
+    hardware: str = DEFAULT_CPU_SANDBOX_HARDWARE,
     extra_secrets: dict[str, str] | None = None,
+    cancel_event: threading.Event | None = None,
     **create_kwargs,
 ) -> tuple[Sandbox | None, str | None]:
     """
     if not owner:
         return None, "Could not determine HF username from token."
+    create_lock = _get_sandbox_create_lock(owner)
+    if create_lock.locked():
+        await session.send_event(
+            Event(
+                event_type="tool_log",
+                data={
+                    "tool": "sandbox",
+                    "log": "Waiting for sandbox creation slot...",
+                },
+            )
+        )
+    async with create_lock:
+        if getattr(session, "sandbox", None):
+            return session.sandbox, None
+        return await _create_sandbox_locked(
+            session,
+            api=api,
+            owner=owner,
+            hardware=hardware,
+            extra_secrets=extra_secrets,
+            cancel_event=cancel_event,
+            **create_kwargs,
+        )
+async def _create_sandbox_locked(
+    session: Any,
+    *,
+    api: HfApi,
+    owner: str,
+    hardware: str,
+    extra_secrets: dict[str, str] | None = None,
+    cancel_event: threading.Event | None = None,
+    **create_kwargs,
+) -> tuple[Sandbox | None, str | None]:
+    """Create the Space while the per-owner sandbox creation lock is held."""
+    token = session.hf_token
     await session.send_event(
         Event(
             event_type="tool_log",
             Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
         )
     # Bridge asyncio cancel event to a threading.Event for the blocking create call.
     # We poll session._cancelled from the main loop in a background task and set
     # a threading.Event that Sandbox.create checks during its polling loops.
+    cancel_flag = cancel_event or threading.Event()
     async def _watch_cancel():
         await session._cancelled.wait()
         "cancel_event": cancel_flag,
         **create_kwargs,
     }
+    if hardware != DEFAULT_CPU_SANDBOX_HARDWARE:
         kwargs["sleep_time"] = 2700
     import time as _t
     _t_start = _t.monotonic()
         return None, "Sandbox creation cancelled by user."
     finally:
         watcher_task.cancel()
+    if cancel_flag.is_set():
+        if getattr(sb, "_owns_space", False):
+            try:
+                await asyncio.to_thread(sb.delete)
+            except Exception as e:
+                logger.warning("Failed to delete cancelled sandbox %s: %s", sb.space_id, e)
+        return None, "Sandbox creation cancelled by user."
     session.sandbox = sb
+    session.sandbox_hardware = hardware
+    session.sandbox_preload_error = None
     # Telemetry: sandbox creation (infra consumption signal)
     from agent.core import telemetry
     return sb, None
+def start_cpu_sandbox_preload(session: Any) -> asyncio.Task | None:
+    """Start a background ``cpu-basic`` sandbox for this session."""
+    if not session or getattr(session, "sandbox", None):
+        return None
+    existing_task = getattr(session, "sandbox_preload_task", None)
+    if existing_task and not existing_task.done():
+        return existing_task
+    cancel_event = threading.Event()
+    session.sandbox_preload_cancel_event = cancel_event
+    session.sandbox_preload_error = None
+    async def _preload() -> Sandbox | None:
+        try:
+            sb, error = await _ensure_sandbox(
+                session,
+                hardware=DEFAULT_CPU_SANDBOX_HARDWARE,
+                cancel_event=cancel_event,
+            )
+            if error:
+                session.sandbox_preload_error = error
+                return None
+            return sb
+        except asyncio.CancelledError:
+            cancel_event.set()
+            session.sandbox_preload_error = "Sandbox creation cancelled by user."
+            raise
+        except Exception as e:
+            session.sandbox_preload_error = f"Failed to create sandbox: {e}"
+            logger.warning("CPU sandbox preload failed: %s", e)
+            return None
+    task = asyncio.create_task(_preload())
+    session.sandbox_preload_task = task
+    return task
+async def cancel_sandbox_preload(session: Any) -> None:
+    """Best-effort cancellation for an in-flight CPU sandbox preload."""
+    cancel_event = getattr(session, "sandbox_preload_cancel_event", None)
+    if cancel_event is not None:
+        cancel_event.set()
+    task = getattr(session, "sandbox_preload_task", None)
+    if not task or task.done():
+        return
+    current_task = asyncio.current_task()
+    if task is current_task:
+        return
+    try:
+        await asyncio.wait_for(asyncio.shield(task), timeout=30)
+    except asyncio.TimeoutError:
+        logger.warning(
+            "Timed out waiting for CPU sandbox preload cancellation; "
+            "task is still live, cancelling asyncio wrapper"
+        )
+        task.cancel()
+    except asyncio.CancelledError:
+        raise
+    except Exception:
+        pass
+async def get_active_or_preloaded_sandbox(
+    session: Any,
+) -> tuple[Sandbox | None, str | None]:
+    """Return the active sandbox, waiting for the startup preload if needed."""
+    if not session:
+        return None, "No session available."
+    if getattr(session, "sandbox", None):
+        return session.sandbox, None
+    task = getattr(session, "sandbox_preload_task", None)
+    if task:
+        try:
+            await asyncio.shield(task)
+        except asyncio.CancelledError:
+            raise
+        except Exception as e:
+            session.sandbox_preload_error = f"Failed to create sandbox: {e}"
+    if getattr(session, "sandbox", None):
+        return session.sandbox, None
+    preload_error = getattr(session, "sandbox_preload_error", None)
+    if preload_error:
+        return None, preload_error
+    return None, "Sandbox is still starting. Please retry shortly."
+async def teardown_session_sandbox(session: Any) -> None:
+    """Cancel sandbox preload and delete the active owned sandbox, if present."""
+    if not session:
+        return
+    await cancel_sandbox_preload(session)
+    sandbox = getattr(session, "sandbox", None)
+    session.sandbox = None
+    session.sandbox_hardware = None
+    if not (sandbox and getattr(sandbox, "_owns_space", False)):
+        return
+    space_id = getattr(sandbox, "space_id", None)
+    last_err: Exception | None = None
+    for attempt in range(3):
+        try:
+            logger.info("Deleting sandbox %s (attempt %s/3)...", space_id, attempt + 1)
+            await asyncio.to_thread(sandbox.delete)
+            from agent.core import telemetry
+            await telemetry.record_sandbox_destroy(session, sandbox)
+            return
+        except Exception as e:
+            last_err = e
+            if attempt < 2:
+                await asyncio.sleep(2 ** attempt)
+    logger.error(
+        "Failed to delete sandbox %s after 3 attempts: %s. "
+        "Orphan — sweep script will pick it up.",
+        space_id,
+        last_err,
+    )
 # ── sandbox_create tool ──────────────────────────────────────────────
 SANDBOX_CREATE_TOOL_SPEC = {
     "name": "sandbox_create",
     "description": (
+        "Create or replace the session sandbox when non-default hardware is needed.\n\n"
+        "A private cpu-basic sandbox is already started automatically for each session. "
+        "For normal CPU code execution, call bash/read/write/edit directly; do NOT call sandbox_create first.\n\n"
+        "Use sandbox_create when: you need GPU hardware, cpu-upgrade, or Trackio secrets before running code. "
+        "The active sandbox persists across tool calls within the session. pip install works out of the box. "
         "Sandboxes are always created as private HF Spaces.\n\n"
         "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
         "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
         "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
             "hardware": {
                 "type": "string",
                 "enum": [e.value for e in SpaceHardware],
+                "description": (
+                    "Hardware tier for the sandbox. Omit for the existing auto-started "
+                    "cpu-basic sandbox; choose GPU/cpu-upgrade only when needed."
+                ),
             },
             "trackio_space_id": {
                 "type": "string",
     args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
 ) -> tuple[str, bool]:
     """Handle sandbox_create tool calls."""
+    hardware = args.get("hardware", DEFAULT_CPU_SANDBOX_HARDWARE)
     trackio_space_id = args.get("trackio_space_id") or None
     trackio_project = args.get("trackio_project") or None
             data["trackioProject"] = trackio_project
         await session.send_event(Event(event_type="tool_state_change", data=data))
+    preload_task = getattr(session, "sandbox_preload_task", None)
+    if (
+        session
+        and not getattr(session, "sandbox", None)
+        and preload_task
+        and not preload_task.done()
+        and hardware == DEFAULT_CPU_SANDBOX_HARDWARE
+    ):
+        sb, error = await get_active_or_preloaded_sandbox(session)
+        if error:
+            return error, False
+        if sb:
+            await _emit_trackio_state(sb)
+            return (
+                f"Sandbox already active: {sb.space_id}\n"
+                f"URL: {sb.url}\n"
+                f"Hardware: {DEFAULT_CPU_SANDBOX_HARDWARE}\n"
+                f"Use bash/read/write/edit to interact with it."
+            ), True
+    if (
+        session
+        and not getattr(session, "sandbox", None)
+        and preload_task
+        and not preload_task.done()
+        and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
+    ):
+        await cancel_sandbox_preload(session)
+    # If sandbox already exists, return its info or replace the auto CPU sandbox
     if session and getattr(session, "sandbox", None):
         sb = session.sandbox
+        active_hardware = getattr(session, "sandbox_hardware", None)
+        if active_hardware == hardware:
+            await _emit_trackio_state(sb)
+            return (
+                f"Sandbox already active: {sb.space_id}\n"
+                f"URL: {sb.url}\n"
+                f"Hardware: {active_hardware}\n"
+                f"Use bash/read/write/edit to interact with it."
+            ), True
         requested_hardware = args.get("hardware")
         lockout_note = ""
+        if (
+            active_hardware == DEFAULT_CPU_SANDBOX_HARDWARE
+            and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
+        ):
+            await teardown_session_sandbox(session)
+        elif requested_hardware:
             lockout_note = (
                 f"\nRequested hardware: {requested_hardware}\n"
                 "Hardware cannot be changed by calling sandbox_create again. "
                 "Delete the existing sandbox first if you need a different tier."
             )
+            await _emit_trackio_state(sb)
+            return (
+                f"Sandbox already active: {sb.space_id}\n"
+                f"URL: {sb.url}\n"
+                f"{lockout_note}\n"
+                f"Use bash/read/write/edit to interact with it."
+            ), True
+        else:
+            await _emit_trackio_state(sb)
+            return (
+                f"Sandbox already active: {sb.space_id}\n"
+                f"URL: {sb.url}\n"
+                f"Hardware: {active_hardware or 'unknown'}\n"
+                f"Use bash/read/write/edit to interact with it."
+            ), True
     create_kwargs: dict[str, Any] = {}
     """Factory: create a handler for a sandbox operation tool."""
     async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
+        sb, error = await get_active_or_preloaded_sandbox(session)
+        if error:
+            return error, False
+        if not sb:
+            return "Sandbox is still starting. Please retry shortly.", False
         try:
             result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
     tools = []
+    # sandbox_create (for GPU or other non-default hardware)
     tools.append(
         ToolSpec(
             name=SANDBOX_CREATE_TOOL_SPEC["name"],
     # Operation tools (auto-execute, no approval needed)
     for name in Sandbox.TOOLS.keys():
         spec = Sandbox.TOOLS[name]
+        description = (
+            "Uses the session's active sandbox. A private cpu-basic sandbox is "
+            "started automatically for normal CPU work; call sandbox_create only "
+            "for GPU or other non-default hardware.\n\n"
+            + spec["description"]
+        )
         tools.append(
             ToolSpec(
                 name=name,
+                description=description,
                 parameters=spec["parameters"],
                 handler=_make_tool_handler(name),
             )

backend/routes/agent.py CHANGED Viewed

@@ -41,6 +41,7 @@ from agent.core.llm_params import _resolve_llm_params
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api", tags=["agent"])
 DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
 GATED_MODEL_IDS = {
@@ -559,6 +560,18 @@ async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionI
     return [SessionInfo(**s) for s in sessions]
 @router.delete("/session/{session_id}")
 async def delete_session(
     session_id: str, user: dict = Depends(get_current_user)

 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api", tags=["agent"])
+_background_teardown_tasks: set[asyncio.Task] = set()
 DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
 GATED_MODEL_IDS = {
     return [SessionInfo(**s) for s in sessions]
+@router.post("/session/{session_id}/sandbox/teardown")
+async def teardown_session_sandbox(
+    session_id: str, user: dict = Depends(get_current_user)
+) -> dict:
+    """Best-effort sandbox teardown that preserves durable chat history."""
+    await _check_session_access(session_id, user)
+    task = asyncio.create_task(session_manager.teardown_sandbox(session_id))
+    _background_teardown_tasks.add(task)
+    task.add_done_callback(_background_teardown_tasks.discard)
+    return {"status": "teardown_requested", "session_id": session_id}
 @router.delete("/session/{session_id}")
 async def delete_session(
     session_id: str, user: dict = Depends(get_current_user)

backend/session_manager.py CHANGED Viewed

@@ -336,6 +336,20 @@ class SessionManager:
         agent_session.task = task
         return agent_session
     @staticmethod
     def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
         return (
@@ -519,6 +533,7 @@ class SessionManager:
                 hf_username=hf_username,
             )
             return started
         logger.info("Restored session %s for user %s", session_id, owner or user_id)
         return agent_session
@@ -599,6 +614,7 @@ class SessionManager:
             event_queue=event_queue,
             tool_router=tool_router,
         )
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
         if is_pro is not None and user_id and user_id != "dev":
@@ -705,27 +721,9 @@ class SessionManager:
         with exponential backoff. A single missed delete = a permanently
         orphaned Space, so the cost of an extra retry beats the alternative.
         """
-        sandbox = getattr(session, "sandbox", None)
-        if not (sandbox and getattr(sandbox, "_owns_space", False)):
-            return
-        space_id = getattr(sandbox, "space_id", None)
-        last_err: Exception | None = None
-        for attempt in range(3):
-            try:
-                logger.info(f"Deleting sandbox {space_id} (attempt {attempt + 1}/3)...")
-                await asyncio.to_thread(sandbox.delete)
-                from agent.core import telemetry
-                await telemetry.record_sandbox_destroy(session, sandbox)
-                return
-            except Exception as e:
-                last_err = e
-                if attempt < 2:
-                    await asyncio.sleep(2 ** attempt)
-        logger.error(
-            f"Failed to delete sandbox {space_id} after 3 attempts: {last_err}. "
-            f"Orphan — sweep script will pick it up."
-        )
     async def _run_session(
         self,
@@ -905,6 +903,18 @@ class SessionManager:
         return True
     async def update_session_title(self, session_id: str, title: str | None) -> None:
         """Persist a user-visible title for sidebar rehydration."""
         agent_session = self.sessions.get(session_id)

         agent_session.task = task
         return agent_session
+    @staticmethod
+    def _start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        """Kick off a best-effort cpu-basic sandbox for the session."""
+        try:
+            from agent.tools.sandbox_tool import start_cpu_sandbox_preload
+            start_cpu_sandbox_preload(agent_session.session)
+        except Exception as e:
+            logger.warning(
+                "Failed to start CPU sandbox preload for %s: %s",
+                agent_session.session_id,
+                e,
+            )
     @staticmethod
     def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
         return (
                 hf_username=hf_username,
             )
             return started
+        self._start_cpu_sandbox_preload(agent_session)
         logger.info("Restored session %s for user %s", session_id, owner or user_id)
         return agent_session
             event_queue=event_queue,
             tool_router=tool_router,
         )
+        self._start_cpu_sandbox_preload(agent_session)
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
         if is_pro is not None and user_id and user_id != "dev":
         with exponential backoff. A single missed delete = a permanently
         orphaned Space, so the cost of an extra retry beats the alternative.
         """
+        from agent.tools.sandbox_tool import teardown_session_sandbox
+        await teardown_session_sandbox(session)
     async def _run_session(
         self,
         return True
+    async def teardown_sandbox(self, session_id: str) -> bool:
+        """Delete only this session's sandbox runtime, preserving chat state."""
+        async with self._lock:
+            agent_session = self.sessions.get(session_id)
+        if not agent_session or not agent_session.is_active:
+            return False
+        await self._cleanup_sandbox(agent_session.session)
+        await self.persist_session_snapshot(agent_session, runtime_state="idle")
+        return True
     async def update_session_title(self, session_id: str, title: str | None) -> None:
         """Persist a user-visible title for sidebar rehydration."""
         agent_session = self.sessions.get(session_id)

frontend/src/components/Layout/AppLayout.tsx CHANGED Viewed

@@ -122,6 +122,39 @@ export default function AppLayout() {
     };
   }, [isConnected, activeSessionId]);
   const handleSessionDead = useCallback(
     (deadSessionId: string) => {
       // Backend lost this session — mark it expired so the chat shows a

     };
   }, [isConnected, activeSessionId]);
+  // Best-effort sandbox cleanup when the browser tab/window closes. This
+  // preserves durable chat history; explicit delete still removes the session.
+  // The effect has no dependencies, so the listener is registered once on
+  // mount and removed again on unmount.
+  useEffect(() => {
+    const teardownSandboxes = () => {
+      // Read the store at event time (not via a subscription) and keep only
+      // sessions that are active and not marked expired.
+      const liveSessionIds = useSessionStore
+        .getState()
+        .sessions
+        .filter((session) => session.isActive && !session.expired)
+        .map((session) => session.id);
+      for (const sessionId of liveSessionIds) {
+        const url = `/api/session/${sessionId}/sandbox/teardown`;
+        const body = '{}';
+        const blob = new Blob([body], { type: 'application/json' });
+        // Prefer sendBeacon; a truthy result means the request was queued.
+        // If sendBeacon is missing or returns false, fall through to fetch.
+        if (navigator.sendBeacon?.(url, blob)) {
+          continue;
+        }
+        // keepalive lets the request outlive the page; errors are ignored
+        // on purpose because nothing can react to them during unload.
+        fetch(url, {
+          method: 'POST',
+          body,
+          keepalive: true,
+          credentials: 'include',
+          headers: { 'Content-Type': 'application/json' },
+        }).catch(() => {});
+      }
+    };
+    window.addEventListener('pagehide', teardownSandboxes);
+    return () => window.removeEventListener('pagehide', teardownSandboxes);
+  }, []);
   const handleSessionDead = useCallback(
     (deadSessionId: string) => {
       // Backend lost this session — mark it expired so the chat shows a

tests/unit/test_sandbox_auto_start.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from types import SimpleNamespace
+from pathlib import Path
+from agent.core.agent_loop import _needs_approval
+from agent.tools.sandbox_tool import get_sandbox_tools
+def test_default_cpu_sandbox_create_does_not_require_approval():
+    config = SimpleNamespace(yolo_mode=False)
+    assert _needs_approval("sandbox_create", {}, config) is False
+    assert _needs_approval("sandbox_create", {"hardware": "cpu-basic"}, config) is False
+def test_non_default_sandbox_create_still_requires_approval():
+    config = SimpleNamespace(yolo_mode=False)
+    assert _needs_approval("sandbox_create", {"hardware": "cpu-upgrade"}, config) is True
+    assert _needs_approval("sandbox_create", {"hardware": "t4-small"}, config) is True
+def test_prompt_and_tool_specs_do_not_require_cpu_sandbox_create():
+    prompt = Path("agent/prompts/system_prompt_v3.yaml").read_text()
+    tool_specs = {tool.name: tool.description for tool in get_sandbox_tools()}
+    assert "sandbox_create → install deps" not in prompt
+    assert "Do NOT call sandbox_create before normal CPU work" in prompt
+    assert "cpu-basic sandbox is already available" in prompt
+    assert "cpu-basic sandbox is already started automatically" in tool_specs["sandbox_create"]
+    assert "started automatically for normal CPU work" in tool_specs["bash"]

tests/unit/test_sandbox_private_spaces.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import asyncio
 from types import SimpleNamespace
 from agent.core import telemetry
@@ -9,6 +11,7 @@ from agent.tools.sandbox_tool import sandbox_create_handler
 def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     duplicate_kwargs = {}
     class FakeApi:
         def __init__(self, token=None):
@@ -17,6 +20,10 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
         def duplicate_space(self, **kwargs):
             duplicate_kwargs.update(kwargs)
         def add_space_secret(self, *args, **kwargs):
             pass
@@ -34,6 +41,54 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
     assert duplicate_kwargs["private"] is True
 def test_sandbox_tool_forces_private_spaces(monkeypatch):
@@ -68,6 +123,29 @@ def test_sandbox_tool_forces_private_spaces(monkeypatch):
     assert "Visibility: private" in out
 def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     captured_kwargs = {}
@@ -114,3 +192,230 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     assert error is None
     assert sb is not None
     assert captured_kwargs["private"] is True

 import asyncio
+import threading
+import time
 from types import SimpleNamespace
 from agent.core import telemetry
 def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     duplicate_kwargs = {}
+    requested_hardware = []
     class FakeApi:
         def __init__(self, token=None):
         def duplicate_space(self, **kwargs):
             duplicate_kwargs.update(kwargs)
+        def request_space_hardware(self, space_id, hardware, sleep_time=None):
+            requested_hardware.append((space_id, hardware, sleep_time))
+            return SimpleNamespace(stage="BUILDING", hardware=None)
         def add_space_secret(self, *args, **kwargs):
             pass
     Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
     assert duplicate_kwargs["private"] is True
+    assert requested_hardware == [(duplicate_kwargs["to_id"], "cpu-basic", None)]
+def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
+    runtime_calls = 0
+    class FakeResponse:
+        status_code = 404
+    class FakeRuntime404(Exception):
+        response = FakeResponse()
+        def __str__(self):
+            return "404 Client Error: Repository Not Found"
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def duplicate_space(self, **kwargs):
+            pass
+        def request_space_hardware(self, space_id, hardware, sleep_time=None):
+            return SimpleNamespace(stage="BUILDING", hardware=None)
+        def add_space_secret(self, *args, **kwargs):
+            pass
+        def get_space_runtime(self, space_id):
+            nonlocal runtime_calls
+            runtime_calls += 1
+            if runtime_calls == 1:
+                raise FakeRuntime404()
+            return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")
+    monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
+    monkeypatch.setattr(sandbox_client.time, "sleep", lambda seconds: None)
+    monkeypatch.setattr(
+        Sandbox,
+        "_setup_server",
+        staticmethod(lambda *args, **kwargs: None),
+    )
+    monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
+    sandbox = Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
+    assert sandbox.space_id.startswith("alice/sandbox-")
+    assert runtime_calls == 2
 def test_sandbox_tool_forces_private_spaces(monkeypatch):
     assert "Visibility: private" in out
+def test_orphan_sweep_preserves_spaces_without_last_modified():
+    deleted: list[str] = []
+    logs: list[str] = []
+    class FakeApi:
+        def list_spaces(self, **kwargs):
+            assert kwargs["full"] is True
+            return [SimpleNamespace(id="alice/sandbox-12345678")]
+        def delete_repo(self, repo_id, repo_type):
+            deleted.append(repo_id)
+    count = sandbox_tool._cleanup_user_orphan_sandboxes(
+        FakeApi(),
+        "alice",
+        logs.append,
+    )
+    assert count == 0
+    assert deleted == []
+    assert logs == ["orphan sweep: skipping alice/sandbox-12345678; missing lastModified"]
 def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     captured_kwargs = {}
     assert error is None
     assert sb is not None
     assert captured_kwargs["private"] is True
+def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
+    active_creates = 0
+    max_active_creates = 0
+    active_lock = threading.Lock()
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def whoami(self):
+            return {"name": "alice"}
+    class FakeSession:
+        def __init__(self):
+            self.hf_token = "hf-token"
+            self.sandbox = None
+            self.event_queue = SimpleNamespace(put_nowait=lambda event: None)
+            self._cancelled = asyncio.Event()
+        async def send_event(self, event):
+            pass
+    def fake_create(**kwargs):
+        nonlocal active_creates, max_active_creates
+        with active_lock:
+            active_creates += 1
+            max_active_creates = max(max_active_creates, active_creates)
+        time.sleep(0.02)
+        with active_lock:
+            active_creates -= 1
+        return SimpleNamespace(
+            space_id=f"alice/sandbox-{kwargs['hardware']}",
+            url="https://huggingface.co/spaces/alice/sandbox",
+        )
+    async def fake_record_sandbox_create(*args, **kwargs):
+        pass
+    monkeypatch.setattr(sandbox_tool, "HfApi", FakeApi)
+    monkeypatch.setattr(sandbox_tool, "_cleanup_user_orphan_sandboxes", lambda *args: 0)
+    monkeypatch.setattr(Sandbox, "create", staticmethod(fake_create))
+    monkeypatch.setattr(telemetry, "record_sandbox_create", fake_record_sandbox_create)
+    monkeypatch.setattr("huggingface_hub.metadata_update", lambda *args, **kwargs: None)
+    async def run():
+        await asyncio.gather(
+            sandbox_tool._ensure_sandbox(FakeSession()),
+            sandbox_tool._ensure_sandbox(FakeSession()),
+        )
+    asyncio.run(run())
+    assert max_active_creates == 1
+def test_sandbox_operation_waits_for_cpu_preload():
+    calls: list[tuple[str, dict]] = []
+    class FakeSandbox:
+        def call_tool(self, name, args):
+            calls.append((name, args))
+            return SimpleNamespace(success=True, output="preloaded-ok", error="")
+    async def run():
+        session = SimpleNamespace(
+            sandbox=None,
+            sandbox_preload_error=None,
+        )
+        async def preload():
+            await asyncio.sleep(0)
+            session.sandbox = FakeSandbox()
+        session.sandbox_preload_task = asyncio.create_task(preload())
+        handler = sandbox_tool._make_tool_handler("bash")
+        return await handler({"command": "echo ok"}, session=session)
+    out, ok = asyncio.run(run())
+    assert ok is True
+    assert out == "preloaded-ok"
+    assert calls == [("bash", {"command": "echo ok"})]
+def test_default_sandbox_create_waits_for_cpu_preload():
+    class FakeSandbox:
+        space_id = "alice/sandbox-cpu"
+        url = "https://huggingface.co/spaces/alice/sandbox-cpu"
+    async def run():
+        session = SimpleNamespace(
+            sandbox=None,
+            sandbox_preload_error=None,
+        )
+        async def preload():
+            await asyncio.sleep(0)
+            session.sandbox = FakeSandbox()
+            session.sandbox_hardware = "cpu-basic"
+        session.sandbox_preload_task = asyncio.create_task(preload())
+        return await sandbox_tool.sandbox_create_handler({}, session=session)
+    out, ok = asyncio.run(run())
+    assert ok is True
+    assert "Sandbox already active: alice/sandbox-cpu" in out
+    assert "Hardware: cpu-basic" in out
+def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
+    deleted: list[str] = []
+    class FakeSession:
+        def __init__(self):
+            self.sandbox = SimpleNamespace(
+                space_id="alice/sandbox-cpu",
+                url="https://huggingface.co/spaces/alice/sandbox-cpu",
+                _owns_space=True,
+                delete=lambda: deleted.append("alice/sandbox-cpu"),
+            )
+            self.sandbox_hardware = "cpu-basic"
+            self.sandbox_preload_task = None
+            self.sandbox_preload_cancel_event = None
+        async def send_event(self, event):
+            pass
+    gpu_sandbox = SimpleNamespace(
+        space_id="alice/sandbox-gpu",
+        url="https://huggingface.co/spaces/alice/sandbox-gpu",
+        _owns_space=True,
+    )
+    async def fake_ensure_sandbox(session, hardware="cpu-basic", **kwargs):
+        session.sandbox = gpu_sandbox
+        session.sandbox_hardware = hardware
+        return gpu_sandbox, None
+    async def fake_record_sandbox_destroy(*args, **kwargs):
+        pass
+    monkeypatch.setattr(sandbox_tool, "_ensure_sandbox", fake_ensure_sandbox)
+    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)
+    session = FakeSession()
+    out, ok = asyncio.run(
+        sandbox_tool.sandbox_create_handler(
+            {"hardware": "a100-large"},
+            session=session,
+        )
+    )
+    assert ok is True
+    assert deleted == ["alice/sandbox-cpu"]
+    assert session.sandbox is gpu_sandbox
+    assert session.sandbox_hardware == "a100-large"
+    assert "Hardware: a100-large" in out
+def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
+    deleted: list[str] = []
+    async def fake_record_sandbox_destroy(*args, **kwargs):
+        pass
+    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)
+    async def run():
+        cancel_event = threading.Event()
+        async def preload():
+            await asyncio.sleep(0)
+        session = SimpleNamespace(
+            sandbox=SimpleNamespace(
+                space_id="alice/sandbox-12345678",
+                _owns_space=True,
+                delete=lambda: deleted.append("alice/sandbox-12345678"),
+            ),
+            sandbox_hardware="cpu-basic",
+            sandbox_preload_task=asyncio.create_task(preload()),
+            sandbox_preload_cancel_event=cancel_event,
+        )
+        await sandbox_tool.teardown_session_sandbox(session)
+        return session, cancel_event
+    session, cancel_event = asyncio.run(run())
+    assert cancel_event.is_set()
+    assert deleted == ["alice/sandbox-12345678"]
+    assert session.sandbox is None
+    assert session.sandbox_hardware is None
+def test_cancel_sandbox_preload_cancels_task_after_timeout(monkeypatch):
+    async def run():
+        async def fake_wait_for(awaitable, timeout):
+            await asyncio.sleep(0)
+            raise asyncio.TimeoutError
+        monkeypatch.setattr(sandbox_tool.asyncio, "wait_for", fake_wait_for)
+        cancel_event = threading.Event()
+        blocker = asyncio.Event()
+        async def preload():
+            await blocker.wait()
+        task = asyncio.create_task(preload())
+        session = SimpleNamespace(
+            sandbox_preload_task=task,
+            sandbox_preload_cancel_event=cancel_event,
+        )
+        await sandbox_tool.cancel_sandbox_preload(session)
+        await asyncio.sleep(0)
+        return task.cancelled(), cancel_event.is_set()
+    task_cancelled, cancel_event_set = asyncio.run(run())
+    assert task_cancelled is True
+    assert cancel_event_set is True

tests/unit/test_session_manager_persistence.py CHANGED Viewed

@@ -186,6 +186,12 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
     store = RestoreStore(delay=0.01)
     manager = _manager_with_store(store)
     stop = _install_fake_runtime(manager)
     try:
         first, second = await asyncio.gather(
@@ -197,12 +203,56 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
         assert first is second
         assert list(manager.sessions) == ["persisted-session"]
         assert manager.run_calls == 1  # type: ignore[attr-defined]
         assert not stop.is_set()
     finally:
         stop.set()
         await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_lazy_restore_preserves_pending_approval_tool_calls():
     store = RestoreStore(

     store = RestoreStore(delay=0.01)
     manager = _manager_with_store(store)
     stop = _install_fake_runtime(manager)
+    scheduled: list[str] = []
+    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        scheduled.append(agent_session.session_id)
+    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]
     try:
         first, second = await asyncio.gather(
         assert first is second
         assert list(manager.sessions) == ["persisted-session"]
         assert manager.run_calls == 1  # type: ignore[attr-defined]
+        assert scheduled == ["persisted-session"]
         assert not stop.is_set()
     finally:
         stop.set()
         await _cancel_runtime_tasks(manager)
+@pytest.mark.asyncio
+async def test_create_session_schedules_cpu_sandbox_preload():
+    manager = _manager_with_store(NoopSessionStore())
+    stop = _install_fake_runtime(manager)
+    scheduled: list[str] = []
+    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        scheduled.append(agent_session.session_id)
+    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]
+    try:
+        session_id = await manager.create_session(user_id="owner", hf_token="token")
+        assert scheduled == [session_id]
+        assert session_id in manager.sessions
+    finally:
+        stop.set()
+        await _cancel_runtime_tasks(manager)
+@pytest.mark.asyncio
+async def test_lazy_restore_schedules_cpu_sandbox_preload():
+    manager = _manager_with_store(RestoreStore())
+    stop = _install_fake_runtime(manager)
+    scheduled: list[str] = []
+    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        scheduled.append(agent_session.session_id)
+    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]
+    try:
+        restored = await manager.ensure_session_loaded("persisted-session", user_id="owner")
+        assert restored is not None
+        assert scheduled == ["persisted-session"]
+        assert "persisted-session" in manager.sessions
+    finally:
+        stop.set()
+        await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_lazy_restore_preserves_pending_approval_tool_calls():
     store = RestoreStore(