lewtun HF Staff Codex committed on
Commit
8615c28
·
unverified ·
1 Parent(s): 77324b8

Auto-start CPU sandboxes for sessions (#200)

Browse files

* Auto-start CPU sandboxes for sessions

Co-authored-by: Codex <codex@openai.com>

* Retry sandbox runtime visibility checks

Co-authored-by: Codex <codex@openai.com>

* Stabilize auto CPU sandbox creation

Co-authored-by: OpenAI Codex <codex@openai.com>

* Address sandbox PR review comments

Co-authored-by: OpenAI Codex <codex@openai.com>

---------

Co-authored-by: Codex <codex@openai.com>

agent/core/agent_loop.py CHANGED
@@ -32,6 +32,7 @@ from agent.core.prompt_caching import with_prompt_caching
32
  from agent.core.session import Event, OpType, Session
33
  from agent.core.tools import ToolRouter
34
  from agent.tools.jobs_tool import CPU_FLAVORS
 
35
 
36
  logger = logging.getLogger(__name__)
37
 
@@ -155,7 +156,8 @@ def _base_needs_approval(
155
  return False
156
 
157
  if tool_name == "sandbox_create":
158
- return True
 
159
 
160
  if tool_name == "hf_jobs":
161
  operation = _operation(tool_args)
 
32
  from agent.core.session import Event, OpType, Session
33
  from agent.core.tools import ToolRouter
34
  from agent.tools.jobs_tool import CPU_FLAVORS
35
+ from agent.tools.sandbox_tool import DEFAULT_CPU_SANDBOX_HARDWARE
36
 
37
  logger = logging.getLogger(__name__)
38
 
 
156
  return False
157
 
158
  if tool_name == "sandbox_create":
159
+ hardware = tool_args.get("hardware") or DEFAULT_CPU_SANDBOX_HARDWARE
160
+ return hardware != DEFAULT_CPU_SANDBOX_HARDWARE
161
 
162
  if tool_name == "hf_jobs":
163
  operation = _operation(tool_args)
agent/core/session.py CHANGED
@@ -116,6 +116,10 @@ class Session:
116
  self._cancelled = asyncio.Event()
117
  self.pending_approval: Optional[dict[str, Any]] = None
118
  self.sandbox = None
 
 
 
 
119
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
120
  self.notification_gateway = notification_gateway
121
  self.notification_destinations = list(notification_destinations or [])
 
116
  self._cancelled = asyncio.Event()
117
  self.pending_approval: Optional[dict[str, Any]] = None
118
  self.sandbox = None
119
+ self.sandbox_hardware: Optional[str] = None
120
+ self.sandbox_preload_task: Optional[asyncio.Task] = None
121
+ self.sandbox_preload_error: Optional[str] = None
122
+ self.sandbox_preload_cancel_event: Any | None = None
123
  self._running_job_ids: set[str] = set() # HF job IDs currently executing
124
  self.notification_gateway = notification_gateway
125
  self.notification_destinations = list(notification_destinations or [])
agent/prompts/system_prompt_v3.yaml CHANGED
@@ -122,8 +122,10 @@ system_prompt: |
122
 
123
  # Sandbox-first development
124
 
125
- For non-trivial scripts, develop and test in a sandbox before launching via hf_jobs:
126
- sandbox_create → install deps → write script → test with small run → fix errors → launch via hf_jobs at scale
 
 
127
 
128
  Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
129
 
 
122
 
123
  # Sandbox-first development
124
 
125
+ A private cpu-basic sandbox is already available for normal code execution in each session. For non-trivial scripts, develop and test there before launching via hf_jobs:
126
+ write script → pip install → test with small run using bash/read/write/edit → fix errors → launch via hf_jobs at scale
127
+
128
+ Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
129
 
130
  Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
131
 
agent/tools/sandbox_client.py CHANGED
@@ -66,6 +66,15 @@ WAIT_TIMEOUT = 600
66
  WAIT_INTERVAL = 5
67
  API_WAIT_TIMEOUT = 180
68
 
 
 
 
 
 
 
 
 
 
69
  _DOCKERFILE = """\
70
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
71
 
@@ -615,6 +624,16 @@ class Sandbox:
615
 
616
  _check_cancel()
617
 
 
 
 
 
 
 
 
 
 
 
618
  # Inject secrets BEFORE uploading server files (which triggers rebuild).
619
  # Secrets added after a Space is running aren't available until restart,
620
  # so they must be set before the build/start cycle.
@@ -633,8 +652,24 @@ class Sandbox:
633
  deadline = time.time() + wait_timeout
634
  while time.time() < deadline:
635
  _check_cancel()
636
- runtime = api.get_space_runtime(space_id)
 
 
 
 
 
 
 
637
  if runtime.stage == "RUNNING":
 
 
 
 
 
 
 
 
 
638
  _log(f"Space is running (hardware: {runtime.hardware})")
639
  break
640
  if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
 
66
  WAIT_INTERVAL = 5
67
  API_WAIT_TIMEOUT = 180
68
 
69
+
70
def _is_transient_space_visibility_error(error: Exception) -> bool:
    """Tell whether *error* looks like a "Space not found (yet)" failure.

    A newly duplicated Space may not be queryable right away. The error is
    treated as transient when its attached ``response`` object (if any)
    reports HTTP status 404, or when the error text contains one of the
    not-found markers checked below.

    Args:
        error: The exception raised while querying the Space.

    Returns:
        ``True`` if the error matches one of the not-found signatures,
        ``False`` otherwise.
    """
    # Errors without a ``response`` attribute simply yield ``None`` here.
    status = getattr(getattr(error, "response", None), "status_code", None)
    if status == 404:
        return True

    text = str(error)
    not_found_markers = ("Repository Not Found", "404 Client Error")
    return any(marker in text for marker in not_found_markers)
77
+
78
  _DOCKERFILE = """\
79
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
80
 
 
624
 
625
  _check_cancel()
626
 
627
+ # Some template duplicates can initially inherit the template hardware.
628
+ # Explicitly request the target tier so automatic CPU sandboxes never
629
+ # silently come up on GPU hardware.
630
+ api.request_space_hardware(
631
+ space_id,
632
+ hardware=hardware,
633
+ sleep_time=sleep_time,
634
+ )
635
+ _log(f"Requested hardware: {hardware}")
636
+
637
  # Inject secrets BEFORE uploading server files (which triggers rebuild).
638
  # Secrets added after a Space is running aren't available until restart,
639
  # so they must be set before the build/start cycle.
 
652
  deadline = time.time() + wait_timeout
653
  while time.time() < deadline:
654
  _check_cancel()
655
+ try:
656
+ runtime = api.get_space_runtime(space_id)
657
+ except Exception as e:
658
+ if _is_transient_space_visibility_error(e):
659
+ _log(" Space runtime not visible yet...")
660
+ time.sleep(WAIT_INTERVAL)
661
+ continue
662
+ raise
663
  if runtime.stage == "RUNNING":
664
+ current_hardware = runtime.hardware or getattr(
665
+ runtime, "requested_hardware", None
666
+ )
667
+ if current_hardware != hardware:
668
+ _log(
669
+ f" RUNNING on {current_hardware}; waiting for {hardware}..."
670
+ )
671
+ time.sleep(WAIT_INTERVAL)
672
+ continue
673
  _log(f"Space is running (hardware: {runtime.hardware})")
674
  break
675
  if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
agent/tools/sandbox_tool.py CHANGED
@@ -2,11 +2,11 @@
2
  Sandbox tools — expose the Sandbox client as agent tools.
3
 
4
  5 tools total:
5
- sandbox_create — explicit sandbox creation (requires approval)
6
- bash, read, write, edit — operations on the sandbox
7
 
8
- If any operation tool is called without an active sandbox,
9
- a cpu-basic sandbox is auto-created (no approval needed).
10
  """
11
 
12
  from __future__ import annotations
@@ -15,6 +15,7 @@ import asyncio
15
  import logging
16
  import re
17
  import threading
 
18
  from datetime import datetime, timedelta, timezone
19
  from typing import Any
20
 
@@ -26,6 +27,8 @@ from agent.tools.trackio_seed import ensure_trackio_dashboard
26
 
27
  logger = logging.getLogger(__name__)
28
 
 
 
29
  # Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
30
  # Used to identify orphan sandboxes from prior sessions safely (won't match
31
  # user-renamed lookalikes).
@@ -36,6 +39,23 @@ _SANDBOX_NAME_RE = re.compile(r"^sandbox-[a-f0-9]{8}$")
36
  # so we leave it alone.
37
  _ORPHAN_STALE_AFTER = timedelta(hours=1)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def _looks_like_path(script: str) -> bool:
41
  """Return True if the script string looks like a file path (not inline code)."""
@@ -124,7 +144,7 @@ def _cleanup_user_orphan_sandboxes(
124
  cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
125
  deleted = 0
126
  try:
127
- spaces = list(api.list_spaces(author=owner, limit=200))
128
  except Exception as e:
129
  log(f"orphan sweep: list_spaces failed: {e}")
130
  return 0
@@ -140,6 +160,9 @@ def _cleanup_user_orphan_sandboxes(
140
  last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
141
  except ValueError:
142
  last_mod = None
 
 
 
143
  if last_mod and last_mod > cutoff:
144
  # Recent — could be a concurrent live session. Skip.
145
  continue
@@ -158,8 +181,9 @@ def _cleanup_user_orphan_sandboxes(
158
 
159
  async def _ensure_sandbox(
160
  session: Any,
161
- hardware: str = "cpu-basic",
162
  extra_secrets: dict[str, str] | None = None,
 
163
  **create_kwargs,
164
  ) -> tuple[Sandbox | None, str | None]:
165
  """
@@ -184,6 +208,45 @@ async def _ensure_sandbox(
184
  if not owner:
185
  return None, "Could not determine HF username from token."
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  await session.send_event(
188
  Event(
189
  event_type="tool_log",
@@ -203,27 +266,10 @@ async def _ensure_sandbox(
203
  Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
204
  )
205
 
206
- # Before we create a new sandbox, sweep this user's stale sandboxes from
207
- # prior sessions. ``_cleanup_sandbox`` in session_manager fires only on
208
- # clean session exit; pod kills, WebSocket drops, etc. leave orphans
209
- # behind, and they accumulate on every new session forever (observed
210
- # 2310 leaked across the Hub on 2026-04-27). Doing the cleanup here at
211
- # session start = self-healing, no separate cron needed.
212
- #
213
- # The 1h staleness filter is the safety: a sandbox modified in the last
214
- # hour might still be tied to a live session in another tab, so we skip.
215
- # Anything older has no realistic chance of being active given typical
216
- # session lengths.
217
- try:
218
- await asyncio.to_thread(_cleanup_user_orphan_sandboxes, api, owner, _log)
219
- except Exception as e:
220
- # Cleanup is best-effort — never block sandbox_create on it.
221
- _log(f"orphan sandbox sweep failed (non-fatal): {e}")
222
-
223
  # Bridge asyncio cancel event to a threading.Event for the blocking create call.
224
  # We poll session._cancelled from the main loop in a background task and set
225
  # a threading.Event that Sandbox.create checks during its polling loops.
226
- cancel_flag = threading.Event()
227
 
228
  async def _watch_cancel():
229
  await session._cancelled.wait()
@@ -245,7 +291,7 @@ async def _ensure_sandbox(
245
  "cancel_event": cancel_flag,
246
  **create_kwargs,
247
  }
248
- if hardware != "cpu-basic":
249
  kwargs["sleep_time"] = 2700
250
  import time as _t
251
  _t_start = _t.monotonic()
@@ -255,7 +301,18 @@ async def _ensure_sandbox(
255
  return None, "Sandbox creation cancelled by user."
256
  finally:
257
  watcher_task.cancel()
 
 
 
 
 
 
 
 
 
258
  session.sandbox = sb
 
 
259
 
260
  # Telemetry: sandbox creation (infra consumption signal)
261
  from agent.core import telemetry
@@ -286,19 +343,146 @@ async def _ensure_sandbox(
286
  return sb, None
287
 
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  # ── sandbox_create tool ──────────────────────────────────────────────
290
 
291
  SANDBOX_CREATE_TOOL_SPEC = {
292
  "name": "sandbox_create",
293
  "description": (
294
- "Create a persistent remote Linux environment for developing and testing scripts.\n\n"
295
- "Workflow: sandbox_create write script pip install test with small run → fix errors → hf_jobs at scale.\n"
296
- "The sandbox persists across tool calls within the session. pip install works out of the box. "
 
 
297
  "Sandboxes are always created as private HF Spaces.\n\n"
298
- "Use this when: you need to develop, test, and iterate on scripts before launching via hf_jobs. "
299
- "Especially for training scripts where you need to verify imports, test on a small subset, and fix errors interactively.\n\n"
300
- "Skip this when: the task is a simple one-shot operation (status check, resource search, quick data query), "
301
- "or the script is copied from a verified working example with minimal changes.\n\n"
302
  "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
303
  "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
304
  "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
@@ -318,7 +502,10 @@ SANDBOX_CREATE_TOOL_SPEC = {
318
  "hardware": {
319
  "type": "string",
320
  "enum": [e.value for e in SpaceHardware],
321
- "description": "Hardware tier for the sandbox (default: cpu-basic)",
 
 
 
322
  },
323
  "trackio_space_id": {
324
  "type": "string",
@@ -346,7 +533,7 @@ async def sandbox_create_handler(
346
  args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
347
  ) -> tuple[str, bool]:
348
  """Handle sandbox_create tool calls."""
349
- hardware = args.get("hardware", "cpu-basic")
350
  trackio_space_id = args.get("trackio_space_id") or None
351
  trackio_project = args.get("trackio_project") or None
352
 
@@ -364,24 +551,76 @@ async def sandbox_create_handler(
364
  data["trackioProject"] = trackio_project
365
  await session.send_event(Event(event_type="tool_state_change", data=data))
366
 
367
- # If sandbox already exists, return its info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  if session and getattr(session, "sandbox", None):
369
  sb = session.sandbox
 
 
 
 
 
 
 
 
 
 
370
  requested_hardware = args.get("hardware")
371
  lockout_note = ""
372
- if requested_hardware:
 
 
 
 
 
373
  lockout_note = (
374
  f"\nRequested hardware: {requested_hardware}\n"
375
  "Hardware cannot be changed by calling sandbox_create again. "
376
  "Delete the existing sandbox first if you need a different tier."
377
  )
378
- await _emit_trackio_state(sb)
379
- return (
380
- f"Sandbox already active: {sb.space_id}\n"
381
- f"URL: {sb.url}\n"
382
- f"{lockout_note}\n"
383
- f"Use bash/read/write/edit to interact with it."
384
- ), True
 
 
 
 
 
 
 
 
385
 
386
  create_kwargs: dict[str, Any] = {}
387
 
@@ -420,11 +659,11 @@ def _make_tool_handler(sandbox_tool_name: str):
420
  """Factory: create a handler for a sandbox operation tool."""
421
 
422
  async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
423
- # Require sandbox to exist — user must approve sandbox_create first
424
- if not session or not getattr(session, "sandbox", None):
425
- return "No sandbox running. Call sandbox_create first to start one.", False
426
-
427
- sb = session.sandbox
428
 
429
  try:
430
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
@@ -449,7 +688,7 @@ def get_sandbox_tools():
449
 
450
  tools = []
451
 
452
- # sandbox_create (explicit creation, requires approval)
453
  tools.append(
454
  ToolSpec(
455
  name=SANDBOX_CREATE_TOOL_SPEC["name"],
@@ -462,10 +701,16 @@ def get_sandbox_tools():
462
  # Operation tools (auto-execute, no approval needed)
463
  for name in Sandbox.TOOLS.keys():
464
  spec = Sandbox.TOOLS[name]
 
 
 
 
 
 
465
  tools.append(
466
  ToolSpec(
467
  name=name,
468
- description=spec["description"],
469
  parameters=spec["parameters"],
470
  handler=_make_tool_handler(name),
471
  )
 
2
  Sandbox tools — expose the Sandbox client as agent tools.
3
 
4
  5 tools total:
5
+ sandbox_create — create/replace sandbox for non-default hardware
6
+ bash, read, write, edit — operations on the active sandbox
7
 
8
+ A cpu-basic sandbox is preloaded for each session. Operation tools wait for it
9
+ if startup is still in progress.
10
  """
11
 
12
  from __future__ import annotations
 
15
  import logging
16
  import re
17
  import threading
18
+ import weakref
19
  from datetime import datetime, timedelta, timezone
20
  from typing import Any
21
 
 
27
 
28
  logger = logging.getLogger(__name__)
29
 
30
+ DEFAULT_CPU_SANDBOX_HARDWARE = "cpu-basic"
31
+
32
  # Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
33
  # Used to identify orphan sandboxes from prior sessions safely (won't match
34
  # user-renamed lookalikes).
 
39
  # so we leave it alone.
40
  _ORPHAN_STALE_AFTER = timedelta(hours=1)
41
 
42
# HF Space duplication/build APIs can behave poorly when several private
# sandboxes are created concurrently for the same namespace. Session creation
# stays non-blocking; only the actual Hub create path is serialized, per owner.
# Locks are grouped by event loop, and the loop keys are held weakly.
_SANDBOX_CREATE_LOCKS: weakref.WeakKeyDictionary[
    asyncio.AbstractEventLoop, dict[str, asyncio.Lock]
] = weakref.WeakKeyDictionary()


def _get_sandbox_create_lock(owner: str) -> asyncio.Lock:
    """Return the sandbox-creation lock for *owner* on the running loop.

    The lock is created on first request and reused afterwards, so repeated
    calls with the same owner on the same event loop get the same object.

    Args:
        owner: Namespace (HF username) the sandbox is created under.

    Returns:
        The ``asyncio.Lock`` shared by all callers for this owner and loop.

    Raises:
        RuntimeError: If called without a running event loop
            (raised by ``asyncio.get_running_loop``).
    """
    per_loop = _SANDBOX_CREATE_LOCKS.setdefault(asyncio.get_running_loop(), {})
    try:
        return per_loop[owner]
    except KeyError:
        fresh = asyncio.Lock()
        per_loop[owner] = fresh
        return fresh
58
+
59
 
60
  def _looks_like_path(script: str) -> bool:
61
  """Return True if the script string looks like a file path (not inline code)."""
 
144
  cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
145
  deleted = 0
146
  try:
147
+ spaces = list(api.list_spaces(author=owner, limit=200, full=True))
148
  except Exception as e:
149
  log(f"orphan sweep: list_spaces failed: {e}")
150
  return 0
 
160
  last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
161
  except ValueError:
162
  last_mod = None
163
+ if last_mod is None:
164
+ log(f"orphan sweep: skipping {space.id}; missing lastModified")
165
+ continue
166
  if last_mod and last_mod > cutoff:
167
  # Recent — could be a concurrent live session. Skip.
168
  continue
 
181
 
182
  async def _ensure_sandbox(
183
  session: Any,
184
+ hardware: str = DEFAULT_CPU_SANDBOX_HARDWARE,
185
  extra_secrets: dict[str, str] | None = None,
186
+ cancel_event: threading.Event | None = None,
187
  **create_kwargs,
188
  ) -> tuple[Sandbox | None, str | None]:
189
  """
 
208
  if not owner:
209
  return None, "Could not determine HF username from token."
210
 
211
+ create_lock = _get_sandbox_create_lock(owner)
212
+ if create_lock.locked():
213
+ await session.send_event(
214
+ Event(
215
+ event_type="tool_log",
216
+ data={
217
+ "tool": "sandbox",
218
+ "log": "Waiting for sandbox creation slot...",
219
+ },
220
+ )
221
+ )
222
+
223
+ async with create_lock:
224
+ if getattr(session, "sandbox", None):
225
+ return session.sandbox, None
226
+
227
+ return await _create_sandbox_locked(
228
+ session,
229
+ api=api,
230
+ owner=owner,
231
+ hardware=hardware,
232
+ extra_secrets=extra_secrets,
233
+ cancel_event=cancel_event,
234
+ **create_kwargs,
235
+ )
236
+
237
+
238
+ async def _create_sandbox_locked(
239
+ session: Any,
240
+ *,
241
+ api: HfApi,
242
+ owner: str,
243
+ hardware: str,
244
+ extra_secrets: dict[str, str] | None = None,
245
+ cancel_event: threading.Event | None = None,
246
+ **create_kwargs,
247
+ ) -> tuple[Sandbox | None, str | None]:
248
+ """Create the Space while the per-owner sandbox creation lock is held."""
249
+ token = session.hf_token
250
  await session.send_event(
251
  Event(
252
  event_type="tool_log",
 
266
  Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
267
  )
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  # Bridge asyncio cancel event to a threading.Event for the blocking create call.
270
  # We poll session._cancelled from the main loop in a background task and set
271
  # a threading.Event that Sandbox.create checks during its polling loops.
272
+ cancel_flag = cancel_event or threading.Event()
273
 
274
  async def _watch_cancel():
275
  await session._cancelled.wait()
 
291
  "cancel_event": cancel_flag,
292
  **create_kwargs,
293
  }
294
+ if hardware != DEFAULT_CPU_SANDBOX_HARDWARE:
295
  kwargs["sleep_time"] = 2700
296
  import time as _t
297
  _t_start = _t.monotonic()
 
301
  return None, "Sandbox creation cancelled by user."
302
  finally:
303
  watcher_task.cancel()
304
+
305
+ if cancel_flag.is_set():
306
+ if getattr(sb, "_owns_space", False):
307
+ try:
308
+ await asyncio.to_thread(sb.delete)
309
+ except Exception as e:
310
+ logger.warning("Failed to delete cancelled sandbox %s: %s", sb.space_id, e)
311
+ return None, "Sandbox creation cancelled by user."
312
+
313
  session.sandbox = sb
314
+ session.sandbox_hardware = hardware
315
+ session.sandbox_preload_error = None
316
 
317
  # Telemetry: sandbox creation (infra consumption signal)
318
  from agent.core import telemetry
 
343
  return sb, None
344
 
345
 
346
def start_cpu_sandbox_preload(session: Any) -> asyncio.Task | None:
    """Kick off background creation of the session's ``cpu-basic`` sandbox.

    Args:
        session: The agent session to preload a sandbox for.

    Returns:
        ``None`` when there is no session or it already has a sandbox; the
        existing preload task when one is still running; otherwise the newly
        scheduled task, which is also stored on
        ``session.sandbox_preload_task``.
    """
    if not session:
        return None
    if getattr(session, "sandbox", None):
        return None

    # Reuse an in-flight preload rather than starting a second one.
    in_flight = getattr(session, "sandbox_preload_task", None)
    if in_flight and not in_flight.done():
        return in_flight

    # Fresh cancel flag for this attempt; clear any error from an earlier one.
    stop_flag = threading.Event()
    session.sandbox_preload_cancel_event = stop_flag
    session.sandbox_preload_error = None

    async def _preload() -> Sandbox | None:
        """Create the sandbox and record any failure on the session."""
        try:
            created, failure = await _ensure_sandbox(
                session,
                hardware=DEFAULT_CPU_SANDBOX_HARDWARE,
                cancel_event=stop_flag,
            )
            if not failure:
                return created
            session.sandbox_preload_error = failure
            return None
        except asyncio.CancelledError:
            # Also raise the thread-level flag so the create call sees it.
            stop_flag.set()
            session.sandbox_preload_error = "Sandbox creation cancelled by user."
            raise
        except Exception as exc:
            session.sandbox_preload_error = f"Failed to create sandbox: {exc}"
            logger.warning("CPU sandbox preload failed: %s", exc)
            return None

    preload_task = asyncio.create_task(_preload())
    session.sandbox_preload_task = preload_task
    return preload_task
382
+
383
+
384
async def cancel_sandbox_preload(session: Any) -> None:
    """Best-effort cancellation for an in-flight CPU sandbox preload.

    Sets the session's preload cancel event (if any), then waits up to 30
    seconds for the preload task to finish. If the wait times out, the
    asyncio task is cancelled.

    Args:
        session: Session whose ``sandbox_preload_cancel_event`` and
            ``sandbox_preload_task`` attributes are consulted.

    Raises:
        asyncio.CancelledError: Only when the *caller* is being cancelled.
            A cancellation of the preload task itself is absorbed, because
            that is the outcome this function is waiting for.
    """
    cancel_event = getattr(session, "sandbox_preload_cancel_event", None)
    if cancel_event is not None:
        cancel_event.set()

    task = getattr(session, "sandbox_preload_task", None)
    if not task or task.done():
        return

    # Never wait on ourselves: that would deadlock until the timeout.
    current_task = asyncio.current_task()
    if task is current_task:
        return

    try:
        # shield() keeps a cancellation of this wait from reaching the task.
        await asyncio.wait_for(asyncio.shield(task), timeout=30)
    except asyncio.TimeoutError:
        logger.warning(
            "Timed out waiting for CPU sandbox preload cancellation; "
            "task is still live, cancelling asyncio wrapper"
        )
        task.cancel()
    except asyncio.CancelledError:
        # Two different things surface as CancelledError here: the caller
        # being cancelled (must propagate) and the preload task having been
        # cancelled by someone else (the desired end state, so absorb it).
        # Task.cancelling() exists on Python 3.11+ only.
        cancelling = getattr(current_task, "cancelling", None)
        if not task.cancelled() or (cancelling is not None and cancelling()):
            raise
    except Exception as e:
        # The preload records its own failures; nothing more to do here.
        logger.debug("CPU sandbox preload ended with error during cancel: %s", e)
410
+
411
+
412
async def get_active_or_preloaded_sandbox(
    session: Any,
) -> tuple[Sandbox | None, str | None]:
    """Return the active sandbox, waiting for the startup preload if needed.

    Args:
        session: Session whose ``sandbox``, ``sandbox_preload_task`` and
            ``sandbox_preload_error`` attributes are consulted.

    Returns:
        ``(sandbox, None)`` when a sandbox is available, otherwise
        ``(None, message)`` describing why it is not.

    Raises:
        asyncio.CancelledError: Only when the *caller* is being cancelled.
            A cancelled preload task is reported through the returned error
            message instead.
    """
    if not session:
        return None, "No session available."
    if getattr(session, "sandbox", None):
        return session.sandbox, None

    task = getattr(session, "sandbox_preload_task", None)
    if task:
        try:
            # shield() keeps a cancellation of this wait from killing the
            # shared preload task that other tool calls may also await.
            await asyncio.shield(task)
        except asyncio.CancelledError:
            # Awaiting a cancelled preload raises CancelledError too. Only
            # propagate when this caller is the one being cancelled
            # (Task.cancelling() exists on Python 3.11+ only).
            cancelling = getattr(asyncio.current_task(), "cancelling", None)
            if not task.cancelled() or (cancelling is not None and cancelling()):
                raise
            # The preload may have been cancelled before it could record why.
            if not getattr(session, "sandbox_preload_error", None):
                session.sandbox_preload_error = "Sandbox creation cancelled by user."
        except Exception as e:
            session.sandbox_preload_error = f"Failed to create sandbox: {e}"

    if getattr(session, "sandbox", None):
        return session.sandbox, None

    preload_error = getattr(session, "sandbox_preload_error", None)
    if preload_error:
        return None, preload_error

    return None, "Sandbox is still starting. Please retry shortly."
438
+
439
+
440
async def teardown_session_sandbox(session: Any) -> None:
    """Cancel sandbox preload and delete the active owned sandbox, if present.

    The session's ``sandbox`` and ``sandbox_hardware`` are cleared first.
    The sandbox is then deleted only when it is marked ``_owns_space``, with
    up to three attempts and exponential backoff between them.

    Args:
        session: Session to tear down; a falsy value is a no-op.
    """
    if not session:
        return

    await cancel_sandbox_preload(session)

    # Detach from the session before deleting so nothing reuses the sandbox.
    sandbox = getattr(session, "sandbox", None)
    session.sandbox = None
    session.sandbox_hardware = None

    if not (sandbox and getattr(sandbox, "_owns_space", False)):
        return

    space_id = getattr(sandbox, "space_id", None)
    last_err: Exception | None = None
    for attempt in range(3):
        try:
            logger.info("Deleting sandbox %s (attempt %s/3)...", space_id, attempt + 1)
            await asyncio.to_thread(sandbox.delete)
        except Exception as e:
            last_err = e
            if attempt < 2:
                await asyncio.sleep(2 ** attempt)
            continue

        # The delete succeeded. Telemetry is recorded outside the retry
        # guard, so a telemetry failure cannot re-run the delete or trigger
        # the misleading "orphan" log below.
        try:
            from agent.core import telemetry
            await telemetry.record_sandbox_destroy(session, sandbox)
        except Exception as e:
            logger.warning(
                "Sandbox %s deleted but destroy telemetry failed: %s", space_id, e
            )
        return

    logger.error(
        "Failed to delete sandbox %s after 3 attempts: %s. "
        "Orphan — sweep script will pick it up.",
        space_id,
        last_err,
    )
473
+
474
+
475
  # ── sandbox_create tool ──────────────────────────────────────────────
476
 
477
  SANDBOX_CREATE_TOOL_SPEC = {
478
  "name": "sandbox_create",
479
  "description": (
480
+ "Create or replace the session sandbox when non-default hardware is needed.\n\n"
481
+ "A private cpu-basic sandbox is already started automatically for each session. "
482
+ "For normal CPU code execution, call bash/read/write/edit directly; do NOT call sandbox_create first.\n\n"
483
+ "Use sandbox_create when: you need GPU hardware, cpu-upgrade, or Trackio secrets before running code. "
484
+ "The active sandbox persists across tool calls within the session. pip install works out of the box. "
485
  "Sandboxes are always created as private HF Spaces.\n\n"
 
 
 
 
486
  "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
487
  "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
488
  "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
 
502
  "hardware": {
503
  "type": "string",
504
  "enum": [e.value for e in SpaceHardware],
505
+ "description": (
506
+ "Hardware tier for the sandbox. Omit for the existing auto-started "
507
+ "cpu-basic sandbox; choose GPU/cpu-upgrade only when needed."
508
+ ),
509
  },
510
  "trackio_space_id": {
511
  "type": "string",
 
533
  args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
534
  ) -> tuple[str, bool]:
535
  """Handle sandbox_create tool calls."""
536
+ hardware = args.get("hardware", DEFAULT_CPU_SANDBOX_HARDWARE)
537
  trackio_space_id = args.get("trackio_space_id") or None
538
  trackio_project = args.get("trackio_project") or None
539
 
 
551
  data["trackioProject"] = trackio_project
552
  await session.send_event(Event(event_type="tool_state_change", data=data))
553
 
554
+ preload_task = getattr(session, "sandbox_preload_task", None)
555
+ if (
556
+ session
557
+ and not getattr(session, "sandbox", None)
558
+ and preload_task
559
+ and not preload_task.done()
560
+ and hardware == DEFAULT_CPU_SANDBOX_HARDWARE
561
+ ):
562
+ sb, error = await get_active_or_preloaded_sandbox(session)
563
+ if error:
564
+ return error, False
565
+ if sb:
566
+ await _emit_trackio_state(sb)
567
+ return (
568
+ f"Sandbox already active: {sb.space_id}\n"
569
+ f"URL: {sb.url}\n"
570
+ f"Hardware: {DEFAULT_CPU_SANDBOX_HARDWARE}\n"
571
+ f"Use bash/read/write/edit to interact with it."
572
+ ), True
573
+
574
+ if (
575
+ session
576
+ and not getattr(session, "sandbox", None)
577
+ and preload_task
578
+ and not preload_task.done()
579
+ and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
580
+ ):
581
+ await cancel_sandbox_preload(session)
582
+
583
+ # If sandbox already exists, return its info or replace the auto CPU sandbox
584
  if session and getattr(session, "sandbox", None):
585
  sb = session.sandbox
586
+ active_hardware = getattr(session, "sandbox_hardware", None)
587
+ if active_hardware == hardware:
588
+ await _emit_trackio_state(sb)
589
+ return (
590
+ f"Sandbox already active: {sb.space_id}\n"
591
+ f"URL: {sb.url}\n"
592
+ f"Hardware: {active_hardware}\n"
593
+ f"Use bash/read/write/edit to interact with it."
594
+ ), True
595
+
596
  requested_hardware = args.get("hardware")
597
  lockout_note = ""
598
+ if (
599
+ active_hardware == DEFAULT_CPU_SANDBOX_HARDWARE
600
+ and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
601
+ ):
602
+ await teardown_session_sandbox(session)
603
+ elif requested_hardware:
604
  lockout_note = (
605
  f"\nRequested hardware: {requested_hardware}\n"
606
  "Hardware cannot be changed by calling sandbox_create again. "
607
  "Delete the existing sandbox first if you need a different tier."
608
  )
609
+ await _emit_trackio_state(sb)
610
+ return (
611
+ f"Sandbox already active: {sb.space_id}\n"
612
+ f"URL: {sb.url}\n"
613
+ f"{lockout_note}\n"
614
+ f"Use bash/read/write/edit to interact with it."
615
+ ), True
616
+ else:
617
+ await _emit_trackio_state(sb)
618
+ return (
619
+ f"Sandbox already active: {sb.space_id}\n"
620
+ f"URL: {sb.url}\n"
621
+ f"Hardware: {active_hardware or 'unknown'}\n"
622
+ f"Use bash/read/write/edit to interact with it."
623
+ ), True
624
 
625
  create_kwargs: dict[str, Any] = {}
626
 
 
659
  """Factory: create a handler for a sandbox operation tool."""
660
 
661
  async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
662
+ sb, error = await get_active_or_preloaded_sandbox(session)
663
+ if error:
664
+ return error, False
665
+ if not sb:
666
+ return "Sandbox is still starting. Please retry shortly.", False
667
 
668
  try:
669
  result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
 
688
 
689
  tools = []
690
 
691
+ # sandbox_create (for GPU or other non-default hardware)
692
  tools.append(
693
  ToolSpec(
694
  name=SANDBOX_CREATE_TOOL_SPEC["name"],
 
701
  # Operation tools (auto-execute, no approval needed)
702
  for name in Sandbox.TOOLS.keys():
703
  spec = Sandbox.TOOLS[name]
704
+ description = (
705
+ "Uses the session's active sandbox. A private cpu-basic sandbox is "
706
+ "started automatically for normal CPU work; call sandbox_create only "
707
+ "for GPU or other non-default hardware.\n\n"
708
+ + spec["description"]
709
+ )
710
  tools.append(
711
  ToolSpec(
712
  name=name,
713
+ description=description,
714
  parameters=spec["parameters"],
715
  handler=_make_tool_handler(name),
716
  )
backend/routes/agent.py CHANGED
@@ -41,6 +41,7 @@ from agent.core.llm_params import _resolve_llm_params
41
  logger = logging.getLogger(__name__)
42
 
43
  router = APIRouter(prefix="/api", tags=["agent"])
 
44
 
45
  DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
46
  GATED_MODEL_IDS = {
@@ -559,6 +560,18 @@ async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionI
559
  return [SessionInfo(**s) for s in sessions]
560
 
561
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  @router.delete("/session/{session_id}")
563
  async def delete_session(
564
  session_id: str, user: dict = Depends(get_current_user)
 
41
  logger = logging.getLogger(__name__)
42
 
43
  router = APIRouter(prefix="/api", tags=["agent"])
44
+ _background_teardown_tasks: set[asyncio.Task] = set()
45
 
46
  DEFAULT_CLAUDE_MODEL_ID = "bedrock/us.anthropic.claude-opus-4-6-v1"
47
  GATED_MODEL_IDS = {
 
560
  return [SessionInfo(**s) for s in sessions]
561
 
562
 
563
@router.post("/session/{session_id}/sandbox/teardown")
async def teardown_session_sandbox(
    session_id: str, user: dict = Depends(get_current_user)
) -> dict:
    """Best-effort sandbox teardown that preserves durable chat history.

    Checks that the user may access the session, then schedules the sandbox
    teardown as a background task and returns immediately.

    Args:
        session_id: Identifier of the session whose sandbox is torn down.
        user: The authenticated user, injected by ``get_current_user``.

    Returns:
        A dict with ``status`` set to ``"teardown_requested"`` and the
        ``session_id``.
    """
    await _check_session_access(session_id, user)
    task = asyncio.create_task(session_manager.teardown_sandbox(session_id))
    # Hold a strong reference until the task completes.
    _background_teardown_tasks.add(task)

    def _on_teardown_done(finished: asyncio.Task) -> None:
        """Release the task reference and log any background failure."""
        _background_teardown_tasks.discard(finished)
        if finished.cancelled():
            return
        # Retrieving the exception also stops asyncio's
        # "Task exception was never retrieved" warning.
        exc = finished.exception()
        if exc is not None:
            logger.error(
                "Background sandbox teardown failed for session %s: %s",
                session_id,
                exc,
            )

    task.add_done_callback(_on_teardown_done)
    return {"status": "teardown_requested", "session_id": session_id}
573
+
574
+
575
  @router.delete("/session/{session_id}")
576
  async def delete_session(
577
  session_id: str, user: dict = Depends(get_current_user)
backend/session_manager.py CHANGED
@@ -336,6 +336,20 @@ class SessionManager:
336
  agent_session.task = task
337
  return agent_session
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  @staticmethod
340
  def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
341
  return (
@@ -519,6 +533,7 @@ class SessionManager:
519
  hf_username=hf_username,
520
  )
521
  return started
 
522
  logger.info("Restored session %s for user %s", session_id, owner or user_id)
523
  return agent_session
524
 
@@ -599,6 +614,7 @@ class SessionManager:
599
  event_queue=event_queue,
600
  tool_router=tool_router,
601
  )
 
602
  await self.persist_session_snapshot(agent_session, runtime_state="idle")
603
 
604
  if is_pro is not None and user_id and user_id != "dev":
@@ -705,27 +721,9 @@ class SessionManager:
705
  with exponential backoff. A single missed delete = a permanently
706
  orphaned Space, so the cost of an extra retry beats the alternative.
707
  """
708
- sandbox = getattr(session, "sandbox", None)
709
- if not (sandbox and getattr(sandbox, "_owns_space", False)):
710
- return
711
 
712
- space_id = getattr(sandbox, "space_id", None)
713
- last_err: Exception | None = None
714
- for attempt in range(3):
715
- try:
716
- logger.info(f"Deleting sandbox {space_id} (attempt {attempt + 1}/3)...")
717
- await asyncio.to_thread(sandbox.delete)
718
- from agent.core import telemetry
719
- await telemetry.record_sandbox_destroy(session, sandbox)
720
- return
721
- except Exception as e:
722
- last_err = e
723
- if attempt < 2:
724
- await asyncio.sleep(2 ** attempt)
725
- logger.error(
726
- f"Failed to delete sandbox {space_id} after 3 attempts: {last_err}. "
727
- f"Orphan — sweep script will pick it up."
728
- )
729
 
730
  async def _run_session(
731
  self,
@@ -905,6 +903,18 @@ class SessionManager:
905
 
906
  return True
907
 
 
 
 
 
 
 
 
 
 
 
 
 
908
  async def update_session_title(self, session_id: str, title: str | None) -> None:
909
  """Persist a user-visible title for sidebar rehydration."""
910
  agent_session = self.sessions.get(session_id)
 
336
  agent_session.task = task
337
  return agent_session
338
 
339
@staticmethod
def _start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
    """Start the session's cpu-basic sandbox preload on a best-effort basis.

    Any exception raised while importing or starting the preload is logged
    as a warning and swallowed, so callers are never interrupted by it.
    """
    try:
        # Imported at call time rather than at module load.
        from agent.tools.sandbox_tool import start_cpu_sandbox_preload

        inner_session = agent_session.session
        start_cpu_sandbox_preload(inner_session)
    except Exception as exc:
        message = "Failed to start CPU sandbox preload for %s: %s"
        logger.warning(message, agent_session.session_id, exc)
352
+
353
  @staticmethod
354
  def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
355
  return (
 
533
  hf_username=hf_username,
534
  )
535
  return started
536
+ self._start_cpu_sandbox_preload(agent_session)
537
  logger.info("Restored session %s for user %s", session_id, owner or user_id)
538
  return agent_session
539
 
 
614
  event_queue=event_queue,
615
  tool_router=tool_router,
616
  )
617
+ self._start_cpu_sandbox_preload(agent_session)
618
  await self.persist_session_snapshot(agent_session, runtime_state="idle")
619
 
620
  if is_pro is not None and user_id and user_id != "dev":
 
721
  with exponential backoff. A single missed delete = a permanently
722
  orphaned Space, so the cost of an extra retry beats the alternative.
723
  """
724
+ from agent.tools.sandbox_tool import teardown_session_sandbox
 
 
725
 
726
+ await teardown_session_sandbox(session)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
 
728
  async def _run_session(
729
  self,
 
903
 
904
  return True
905
 
906
async def teardown_sandbox(self, session_id: str) -> bool:
    """Remove only the sandbox runtime of a session, keeping its chat state.

    Args:
        session_id: Key of the session in ``self.sessions``.

    Returns:
        ``False`` when the session is unknown or not active; ``True`` once
        the sandbox cleanup ran and an ``"idle"`` snapshot was persisted.
    """
    # The lock guards the registry lookup only; the cleanup itself runs
    # outside of it.
    async with self._lock:
        target = self.sessions.get(session_id)

    if target and target.is_active:
        await self._cleanup_sandbox(target.session)
        await self.persist_session_snapshot(target, runtime_state="idle")
        return True
    return False
917
+
918
  async def update_session_title(self, session_id: str, title: str | None) -> None:
919
  """Persist a user-visible title for sidebar rehydration."""
920
  agent_session = self.sessions.get(session_id)
frontend/src/components/Layout/AppLayout.tsx CHANGED
@@ -122,6 +122,39 @@ export default function AppLayout() {
122
  };
123
  }, [isConnected, activeSessionId]);
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  const handleSessionDead = useCallback(
126
  (deadSessionId: string) => {
127
  // Backend lost this session — mark it expired so the chat shows a
 
122
  };
123
  }, [isConnected, activeSessionId]);
124
 
125
+ // Best-effort sandbox cleanup when the browser tab/window closes. This
126
+ // preserves durable chat history; explicit delete still removes the session.
127
+ useEffect(() => {
128
+ const teardownSandboxes = () => {
129
+ const liveSessionIds = useSessionStore
130
+ .getState()
131
+ .sessions
132
+ .filter((session) => session.isActive && !session.expired)
133
+ .map((session) => session.id);
134
+
135
+ for (const sessionId of liveSessionIds) {
136
+ const url = `/api/session/${sessionId}/sandbox/teardown`;
137
+ const body = '{}';
138
+ const blob = new Blob([body], { type: 'application/json' });
139
+
140
+ if (navigator.sendBeacon?.(url, blob)) {
141
+ continue;
142
+ }
143
+
144
+ fetch(url, {
145
+ method: 'POST',
146
+ body,
147
+ keepalive: true,
148
+ credentials: 'include',
149
+ headers: { 'Content-Type': 'application/json' },
150
+ }).catch(() => {});
151
+ }
152
+ };
153
+
154
+ window.addEventListener('pagehide', teardownSandboxes);
155
+ return () => window.removeEventListener('pagehide', teardownSandboxes);
156
+ }, []);
157
+
158
  const handleSessionDead = useCallback(
159
  (deadSessionId: string) => {
160
  // Backend lost this session — mark it expired so the chat shows a
tests/unit/test_sandbox_auto_start.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import SimpleNamespace
2
+ from pathlib import Path
3
+
4
+ from agent.core.agent_loop import _needs_approval
5
+ from agent.tools.sandbox_tool import get_sandbox_tools
6
+
7
+
8
def test_default_cpu_sandbox_create_does_not_require_approval():
    """sandbox_create with no hardware or with cpu-basic skips approval."""
    config = SimpleNamespace(yolo_mode=False)

    for tool_args in ({}, {"hardware": "cpu-basic"}):
        assert _needs_approval("sandbox_create", tool_args, config) is False
13
+
14
+
15
def test_non_default_sandbox_create_still_requires_approval():
    """Any hardware other than the default cpu-basic still needs approval."""
    config = SimpleNamespace(yolo_mode=False)

    for hardware in ("cpu-upgrade", "t4-small"):
        tool_args = {"hardware": hardware}
        assert _needs_approval("sandbox_create", tool_args, config) is True
20
+
21
+
22
def test_prompt_and_tool_specs_do_not_require_cpu_sandbox_create():
    """The system prompt and tool descriptions advertise the auto-started sandbox."""
    # Anchor the path to this file (tests/unit/...) instead of the working
    # directory, so the test passes from wherever pytest is invoked.
    repo_root = Path(__file__).resolve().parents[2]
    prompt_path = repo_root / "agent" / "prompts" / "system_prompt_v3.yaml"
    # The asserted text contains a non-ASCII arrow, so decode explicitly
    # rather than relying on the platform's default encoding.
    prompt = prompt_path.read_text(encoding="utf-8")
    tool_specs = {tool.name: tool.description for tool in get_sandbox_tools()}

    assert "sandbox_create → install deps" not in prompt
    assert "Do NOT call sandbox_create before normal CPU work" in prompt
    assert "cpu-basic sandbox is already available" in prompt

    assert "cpu-basic sandbox is already started automatically" in tool_specs["sandbox_create"]
    assert "started automatically for normal CPU work" in tool_specs["bash"]
tests/unit/test_sandbox_private_spaces.py CHANGED
@@ -1,4 +1,6 @@
1
  import asyncio
 
 
2
  from types import SimpleNamespace
3
 
4
  from agent.core import telemetry
@@ -9,6 +11,7 @@ from agent.tools.sandbox_tool import sandbox_create_handler
9
 
10
  def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
11
  duplicate_kwargs = {}
 
12
 
13
  class FakeApi:
14
  def __init__(self, token=None):
@@ -17,6 +20,10 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
17
  def duplicate_space(self, **kwargs):
18
  duplicate_kwargs.update(kwargs)
19
 
 
 
 
 
20
  def add_space_secret(self, *args, **kwargs):
21
  pass
22
 
@@ -34,6 +41,54 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
34
  Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
35
 
36
  assert duplicate_kwargs["private"] is True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
 
39
  def test_sandbox_tool_forces_private_spaces(monkeypatch):
@@ -68,6 +123,29 @@ def test_sandbox_tool_forces_private_spaces(monkeypatch):
68
  assert "Visibility: private" in out
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def test_ensure_sandbox_overrides_private_argument(monkeypatch):
72
  captured_kwargs = {}
73
 
@@ -114,3 +192,230 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
114
  assert error is None
115
  assert sb is not None
116
  assert captured_kwargs["private"] is True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import asyncio
2
+ import threading
3
+ import time
4
  from types import SimpleNamespace
5
 
6
  from agent.core import telemetry
 
11
 
12
  def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
13
  duplicate_kwargs = {}
14
+ requested_hardware = []
15
 
16
  class FakeApi:
17
  def __init__(self, token=None):
 
20
  def duplicate_space(self, **kwargs):
21
  duplicate_kwargs.update(kwargs)
22
 
23
+ def request_space_hardware(self, space_id, hardware, sleep_time=None):
24
+ requested_hardware.append((space_id, hardware, sleep_time))
25
+ return SimpleNamespace(stage="BUILDING", hardware=None)
26
+
27
  def add_space_secret(self, *args, **kwargs):
28
  pass
29
 
 
41
  Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
42
 
43
  assert duplicate_kwargs["private"] is True
44
+ assert requested_hardware == [(duplicate_kwargs["to_id"], "cpu-basic", None)]
45
+
46
+
47
def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
    """Sandbox.create survives one 404 from the runtime lookup and polls again."""
    runtime_calls = 0

    class FakeResponse:
        status_code = 404

    class FakeRuntime404(Exception):
        response = FakeResponse()

        def __str__(self):
            return "404 Client Error: Repository Not Found"

    class FakeApi:
        def __init__(self, token=None):
            self.token = token

        def duplicate_space(self, **kwargs):
            pass

        def request_space_hardware(self, space_id, hardware, sleep_time=None):
            return SimpleNamespace(stage="BUILDING", hardware=None)

        def add_space_secret(self, *args, **kwargs):
            pass

        def get_space_runtime(self, space_id):
            nonlocal runtime_calls
            runtime_calls += 1
            # Only the very first lookup fails; later ones report RUNNING.
            if runtime_calls > 1:
                return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")
            raise FakeRuntime404()

    def no_setup(*args, **kwargs):
        return None

    monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
    monkeypatch.setattr(sandbox_client.time, "sleep", lambda seconds: None)
    monkeypatch.setattr(Sandbox, "_setup_server", staticmethod(no_setup))
    monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)

    created = Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)

    assert created.space_id.startswith("alice/sandbox-")
    assert runtime_calls == 2
92
 
93
 
94
  def test_sandbox_tool_forces_private_spaces(monkeypatch):
 
123
  assert "Visibility: private" in out
124
 
125
 
126
def test_orphan_sweep_preserves_spaces_without_last_modified():
    """A space lacking lastModified is skipped (and logged), never deleted."""
    removed_repos: list[str] = []
    messages: list[str] = []

    class FakeApi:
        def list_spaces(self, **kwargs):
            assert kwargs["full"] is True
            return [SimpleNamespace(id="alice/sandbox-12345678")]

        def delete_repo(self, repo_id, repo_type):
            removed_repos.append(repo_id)

    api = FakeApi()
    swept = sandbox_tool._cleanup_user_orphan_sandboxes(api, "alice", messages.append)

    assert swept == 0
    assert removed_repos == []
    assert messages == ["orphan sweep: skipping alice/sandbox-12345678; missing lastModified"]
147
+
148
+
149
  def test_ensure_sandbox_overrides_private_argument(monkeypatch):
150
  captured_kwargs = {}
151
 
 
192
  assert error is None
193
  assert sb is not None
194
  assert captured_kwargs["private"] is True
195
+
196
+
197
def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
    """Two concurrent _ensure_sandbox calls never overlap inside Sandbox.create."""
    in_flight = 0
    peak_in_flight = 0
    counter_guard = threading.Lock()

    class FakeApi:
        def __init__(self, token=None):
            self.token = token

        def whoami(self):
            return {"name": "alice"}

    class FakeSession:
        def __init__(self):
            self.hf_token = "hf-token"
            self.sandbox = None
            self.event_queue = SimpleNamespace(put_nowait=lambda event: None)
            self._cancelled = asyncio.Event()

        async def send_event(self, event):
            pass

    def fake_create(**kwargs):
        nonlocal in_flight, peak_in_flight
        with counter_guard:
            in_flight += 1
            if in_flight > peak_in_flight:
                peak_in_flight = in_flight
        # Sleep outside the guard so overlapping creates would be observable.
        time.sleep(0.02)
        with counter_guard:
            in_flight -= 1
        return SimpleNamespace(
            space_id=f"alice/sandbox-{kwargs['hardware']}",
            url="https://huggingface.co/spaces/alice/sandbox",
        )

    async def fake_record_sandbox_create(*args, **kwargs):
        pass

    monkeypatch.setattr(sandbox_tool, "HfApi", FakeApi)
    monkeypatch.setattr(sandbox_tool, "_cleanup_user_orphan_sandboxes", lambda *args: 0)
    monkeypatch.setattr(Sandbox, "create", staticmethod(fake_create))
    monkeypatch.setattr(telemetry, "record_sandbox_create", fake_record_sandbox_create)
    monkeypatch.setattr("huggingface_hub.metadata_update", lambda *args, **kwargs: None)

    async def scenario():
        first = sandbox_tool._ensure_sandbox(FakeSession())
        second = sandbox_tool._ensure_sandbox(FakeSession())
        await asyncio.gather(first, second)

    asyncio.run(scenario())

    assert peak_in_flight == 1
250
+
251
+
252
def test_sandbox_operation_waits_for_cpu_preload():
    """A bash call issued during preload runs on the sandbox the preload installs."""
    recorded: list[tuple[str, dict]] = []

    class FakeSandbox:
        def call_tool(self, name, args):
            recorded.append((name, args))
            return SimpleNamespace(success=True, output="preloaded-ok", error="")

    async def scenario():
        session = SimpleNamespace(sandbox=None, sandbox_preload_error=None)

        async def finish_preload():
            # Yield once so the sandbox appears only after the handler starts.
            await asyncio.sleep(0)
            session.sandbox = FakeSandbox()

        session.sandbox_preload_task = asyncio.create_task(finish_preload())
        bash_handler = sandbox_tool._make_tool_handler("bash")
        return await bash_handler({"command": "echo ok"}, session=session)

    output, succeeded = asyncio.run(scenario())

    assert succeeded is True
    assert output == "preloaded-ok"
    assert recorded == [("bash", {"command": "echo ok"})]
279
+
280
+
281
def test_default_sandbox_create_waits_for_cpu_preload():
    """Default sandbox_create reports the preloaded cpu-basic sandbox as active."""

    class FakeSandbox:
        space_id = "alice/sandbox-cpu"
        url = "https://huggingface.co/spaces/alice/sandbox-cpu"

    async def scenario():
        session = SimpleNamespace(sandbox=None, sandbox_preload_error=None)

        async def finish_preload():
            await asyncio.sleep(0)
            session.sandbox = FakeSandbox()
            session.sandbox_hardware = "cpu-basic"

        session.sandbox_preload_task = asyncio.create_task(finish_preload())
        return await sandbox_tool.sandbox_create_handler({}, session=session)

    output, succeeded = asyncio.run(scenario())

    assert succeeded is True
    assert "Sandbox already active: alice/sandbox-cpu" in output
    assert "Hardware: cpu-basic" in output
305
+
306
+
307
def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
    """Requesting GPU hardware deletes the auto CPU sandbox and installs the new one."""
    cpu_space_id = "alice/sandbox-cpu"
    deleted: list[str] = []

    class FakeSession:
        def __init__(self):
            self.sandbox = SimpleNamespace(
                space_id=cpu_space_id,
                url="https://huggingface.co/spaces/alice/sandbox-cpu",
                _owns_space=True,
                delete=lambda: deleted.append(cpu_space_id),
            )
            self.sandbox_hardware = "cpu-basic"
            self.sandbox_preload_task = None
            self.sandbox_preload_cancel_event = None

        async def send_event(self, event):
            pass

    replacement = SimpleNamespace(
        space_id="alice/sandbox-gpu",
        url="https://huggingface.co/spaces/alice/sandbox-gpu",
        _owns_space=True,
    )

    async def fake_ensure_sandbox(session, hardware="cpu-basic", **kwargs):
        session.sandbox = replacement
        session.sandbox_hardware = hardware
        return replacement, None

    async def fake_record_sandbox_destroy(*args, **kwargs):
        pass

    monkeypatch.setattr(sandbox_tool, "_ensure_sandbox", fake_ensure_sandbox)
    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)

    session = FakeSession()
    request = sandbox_tool.sandbox_create_handler(
        {"hardware": "a100-large"},
        session=session,
    )
    output, succeeded = asyncio.run(request)

    assert succeeded is True
    assert deleted == ["alice/sandbox-cpu"]
    assert session.sandbox is replacement
    assert session.sandbox_hardware == "a100-large"
    assert "Hardware: a100-large" in output
355
+
356
+
357
def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
    """Teardown sets the preload cancel event, deletes the space, clears state."""
    space_id = "alice/sandbox-12345678"
    deleted: list[str] = []

    async def fake_record_sandbox_destroy(*args, **kwargs):
        pass

    monkeypatch.setattr(telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy)

    async def scenario():
        cancel_event = threading.Event()

        async def preload():
            await asyncio.sleep(0)

        owned_sandbox = SimpleNamespace(
            space_id=space_id,
            _owns_space=True,
            delete=lambda: deleted.append(space_id),
        )
        session = SimpleNamespace(
            sandbox=owned_sandbox,
            sandbox_hardware="cpu-basic",
            sandbox_preload_task=asyncio.create_task(preload()),
            sandbox_preload_cancel_event=cancel_event,
        )

        await sandbox_tool.teardown_session_sandbox(session)
        return session, cancel_event

    session, cancel_event = asyncio.run(scenario())

    assert cancel_event.is_set()
    assert deleted == ["alice/sandbox-12345678"]
    assert session.sandbox is None
    assert session.sandbox_hardware is None
391
+
392
+
393
def test_cancel_sandbox_preload_cancels_task_after_timeout(monkeypatch):
    """If waiting for the preload times out, the preload task is cancelled."""

    async def scenario():
        async def fake_wait_for(awaitable, timeout):
            # Simulate a preload that never finishes within the timeout.
            await asyncio.sleep(0)
            raise asyncio.TimeoutError

        monkeypatch.setattr(sandbox_tool.asyncio, "wait_for", fake_wait_for)

        cancel_event = threading.Event()
        never_set = asyncio.Event()

        async def preload():
            await never_set.wait()

        preload_task = asyncio.create_task(preload())
        session = SimpleNamespace(
            sandbox_preload_task=preload_task,
            sandbox_preload_cancel_event=cancel_event,
        )

        await sandbox_tool.cancel_sandbox_preload(session)
        # One more loop turn lets the cancellation reach the task.
        await asyncio.sleep(0)

        return preload_task.cancelled(), cancel_event.is_set()

    task_cancelled, cancel_event_set = asyncio.run(scenario())

    assert task_cancelled is True
    assert cancel_event_set is True
tests/unit/test_session_manager_persistence.py CHANGED
@@ -186,6 +186,12 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
186
  store = RestoreStore(delay=0.01)
187
  manager = _manager_with_store(store)
188
  stop = _install_fake_runtime(manager)
 
 
 
 
 
 
189
 
190
  try:
191
  first, second = await asyncio.gather(
@@ -197,12 +203,56 @@ async def test_concurrent_lazy_restore_starts_only_one_agent_task():
197
  assert first is second
198
  assert list(manager.sessions) == ["persisted-session"]
199
  assert manager.run_calls == 1 # type: ignore[attr-defined]
 
200
  assert not stop.is_set()
201
  finally:
202
  stop.set()
203
  await _cancel_runtime_tasks(manager)
204
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  @pytest.mark.asyncio
207
  async def test_lazy_restore_preserves_pending_approval_tool_calls():
208
  store = RestoreStore(
 
186
  store = RestoreStore(delay=0.01)
187
  manager = _manager_with_store(store)
188
  stop = _install_fake_runtime(manager)
189
+ scheduled: list[str] = []
190
+
191
+ def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
192
+ scheduled.append(agent_session.session_id)
193
+
194
+ manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload # type: ignore[method-assign]
195
 
196
  try:
197
  first, second = await asyncio.gather(
 
203
  assert first is second
204
  assert list(manager.sessions) == ["persisted-session"]
205
  assert manager.run_calls == 1 # type: ignore[attr-defined]
206
+ assert scheduled == ["persisted-session"]
207
  assert not stop.is_set()
208
  finally:
209
  stop.set()
210
  await _cancel_runtime_tasks(manager)
211
 
212
 
213
@pytest.mark.asyncio
async def test_create_session_schedules_cpu_sandbox_preload():
    """create_session triggers exactly one CPU sandbox preload for the new session."""
    manager = _manager_with_store(NoopSessionStore())
    stop = _install_fake_runtime(manager)
    preloaded_ids: list[str] = []

    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
        preloaded_ids.append(agent_session.session_id)

    # Swap the real preload for a recorder on this manager instance.
    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]

    try:
        new_id = await manager.create_session(user_id="owner", hf_token="token")

        assert preloaded_ids == [new_id]
        assert new_id in manager.sessions
    finally:
        stop.set()
        await _cancel_runtime_tasks(manager)
232
+
233
+
234
@pytest.mark.asyncio
async def test_lazy_restore_schedules_cpu_sandbox_preload():
    """Restoring a persisted session triggers exactly one CPU sandbox preload."""
    manager = _manager_with_store(RestoreStore())
    stop = _install_fake_runtime(manager)
    preloaded_ids: list[str] = []

    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
        preloaded_ids.append(agent_session.session_id)

    # Swap the real preload for a recorder on this manager instance.
    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]

    try:
        loaded = await manager.ensure_session_loaded("persisted-session", user_id="owner")

        assert loaded is not None
        assert preloaded_ids == ["persisted-session"]
        assert "persisted-session" in manager.sessions
    finally:
        stop.set()
        await _cancel_runtime_tasks(manager)
254
+
255
+
256
  @pytest.mark.asyncio
257
  async def test_lazy_restore_preserves_pending_approval_tool_calls():
258
  store = RestoreStore(