File size: 28,009 Bytes
a644598
 
 
 
8615c28
 
a644598
8615c28
 
a644598
 
 
 
 
1d50c78
 
e7524a8
8615c28
b05b6f5
1d50c78
a644598
 
 
 
4fc6e96
a644598
 
6131fc8
a644598
1d50c78
 
8615c28
 
1d50c78
 
 
1e1fe1f
1d50c78
 
 
 
 
 
8615c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae24efc
b05b6f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae24efc
 
b05b6f5
ae24efc
 
 
b05b6f5
 
 
 
 
 
 
 
 
 
 
ae24efc
 
 
3c91fc8
 
 
ae24efc
 
 
 
 
 
 
 
 
 
23be8d5
 
ae24efc
23be8d5
 
 
 
 
 
ae24efc
 
 
 
3c91fc8
6131fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7636865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a644598
 
 
1d50c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8615c28
1d50c78
 
 
 
 
 
1e1fe1f
1d50c78
 
754345f
 
 
1d50c78
 
 
 
 
8615c28
 
 
1d50c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a644598
6131fc8
8615c28
6131fc8
8615c28
6131fc8
a644598
 
 
 
 
 
 
 
 
 
 
 
 
33f29a8
a644598
08153a8
a644598
 
 
 
 
 
 
8615c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a644598
 
 
 
 
 
 
 
 
 
b05b6f5
 
9d1a532
e7524a8
 
 
8615c28
e7524a8
 
 
 
 
 
 
6131fc8
 
 
 
15d6506
3c91fc8
 
 
 
6131fc8
3c91fc8
e7524a8
3c91fc8
 
8615c28
9cd534c
2a2e170
754345f
2a2e170
e7524a8
 
 
 
 
 
8615c28
 
 
 
b05b6f5
8615c28
754345f
 
 
8615c28
 
a644598
8615c28
 
7636865
a644598
2a2e170
 
754345f
2a2e170
754345f
 
 
2a2e170
 
 
a644598
 
 
 
 
 
 
 
 
 
8615c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7636865
8615c28
 
7636865
 
8615c28
7636865
 
b05b6f5
7636865
 
 
 
 
 
 
 
b05b6f5
7636865
754345f
7636865
 
 
 
 
754345f
7636865
 
 
 
 
 
 
 
8615c28
 
a644598
 
 
 
 
8615c28
 
 
 
 
15d6506
a644598
 
9d1a532
 
 
 
6131fc8
11112c6
6131fc8
a644598
 
 
 
 
 
 
 
 
 
8615c28
 
 
 
a644598
6131fc8
 
 
 
11112c6
6131fc8
 
 
 
 
 
 
 
 
 
 
 
a644598
 
 
 
 
 
6131fc8
a644598
 
8615c28
6131fc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3eec386
8615c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a644598
 
8615c28
 
 
 
 
 
 
 
 
 
3eec386
 
8615c28
 
 
 
 
 
3eec386
 
 
 
 
8615c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a644598
6131fc8
a644598
6131fc8
 
 
 
 
 
 
a644598
6131fc8
 
 
 
 
 
a644598
 
 
 
 
 
6131fc8
 
a644598
 
 
 
15d6506
a644598
 
 
 
 
 
 
 
8615c28
 
 
 
 
a644598
 
4fc6e96
 
 
 
 
 
 
 
a644598
 
6d4f619
 
a644598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8615c28
a644598
 
 
 
 
 
 
 
 
 
 
 
8615c28
 
 
754345f
8615c28
a644598
 
 
8615c28
a644598
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
"""
Sandbox tools — expose the Sandbox client as agent tools.

5 tools total:
  sandbox_create — create/replace sandbox for non-default hardware
  bash, read, write, edit — operations on the active sandbox

A cpu-basic sandbox is preloaded for each session. Operation tools wait for it
if startup is still in progress.
"""

from __future__ import annotations

import asyncio
import logging
import re
import threading
import weakref
from collections.abc import Callable
from datetime import datetime, timedelta, timezone
from typing import Any

from huggingface_hub import HfApi, SpaceHardware

from agent.core.hub_artifacts import wrap_shell_command_with_hub_artifact_bootstrap
from agent.core.session import Event
from agent.tools.sandbox_client import Sandbox
from agent.tools.trackio_seed import ensure_trackio_dashboard

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Hardware tier used whenever a caller does not ask for a specific one.
DEFAULT_CPU_SANDBOX_HARDWARE = "cpu-basic"

# Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
# Used to identify orphan sandboxes from prior sessions safely (won't match
# user-renamed lookalikes).
SANDBOX_SPACE_NAME_RE = re.compile(r"^sandbox-[a-f0-9]{8}$")

# How stale a sandbox must be before we treat it as definitely orphan.
# Anything more recent could be tied to a still-live session in another tab,
# so we leave it alone.
_ORPHAN_STALE_AFTER = timedelta(hours=1)

# HF Space duplication/build APIs can behave poorly when multiple private
# sandboxes are created concurrently for the same namespace. Keep session
# creation non-blocking, but serialize the actual Hub create path per owner.
# Layout: event loop (held weakly) -> {owner name -> asyncio.Lock}, so each
# loop gets its own set of locks and an entry goes away with its loop.
_SANDBOX_CREATE_LOCKS: weakref.WeakKeyDictionary[
    asyncio.AbstractEventLoop, dict[str, asyncio.Lock]
] = weakref.WeakKeyDictionary()


def _get_sandbox_create_lock(owner: str) -> asyncio.Lock:
    """Return the sandbox-creation lock for *owner* on the running event loop.

    Locks are stored per event loop and per owner: the first request for an
    owner creates its lock, later requests on the same loop get the same
    object back. Must be called while an event loop is running.
    """
    per_owner = _SANDBOX_CREATE_LOCKS.setdefault(asyncio.get_running_loop(), {})
    try:
        return per_owner[owner]
    except KeyError:
        created = per_owner[owner] = asyncio.Lock()
        return created


def _session_tool_logger(
    session: Any, *, tool: str = "sandbox"
) -> Callable[[str], object] | None:
    event_queue = getattr(session, "event_queue", None)
    if event_queue is None:
        return None

    loop = asyncio.get_running_loop()

    def _log(msg: str) -> None:
        loop.call_soon_threadsafe(
            event_queue.put_nowait,
            Event(event_type="tool_log", data={"tool": tool, "log": msg}),
        )

    return _log


def _looks_like_path(script: str) -> bool:
    """Return True if the script string looks like a file path (not inline code)."""
    if not (
        isinstance(script, str)
        and script.strip() == script
        and not any(c in script for c in "\r\n\0")
    ):
        return False

    if script.startswith("http://") or script.startswith("https://"):
        return False

    return (
        script.startswith("/")
        or script.startswith("./")
        or script.startswith("../")
        or (script.endswith(".py") and not any(c.isspace() for c in script))
    )


async def resolve_sandbox_script(
    sandbox: Any, script: str
) -> tuple[str | None, str | None]:
    """Fetch *script* from the sandbox when it looks like a file path.

    Returns:
        A ``(content, error)`` pair:

        * ``(text, None)`` - the file was read; everything up to and
          including the first tab of each line is dropped.
        * ``(None, message)`` - the read failed, returned no output, or raised.
        * ``(None, None)`` - there is no sandbox or *script* is not a path;
          the caller should use *script* as-is.
    """
    if not sandbox or not _looks_like_path(script):
        return None, None

    try:
        # Go through the read endpoint; bash("cat ...") truncates at 25KB.
        outcome = await asyncio.to_thread(sandbox.read, script, limit=100_000)
        if not (outcome.success and outcome.output):
            return None, f"Failed to read {script} from sandbox: {outcome.error}"

        # read() returns "N\tcontent" lines; keep only what follows the
        # first tab and leave tab-less lines untouched.
        cleaned = []
        for raw_line in outcome.output.split("\n"):
            _prefix, tab, remainder = raw_line.partition("\t")
            cleaned.append(remainder if tab else raw_line)
        return "\n".join(cleaned), None
    except Exception as exc:
        return None, f"Failed to read {script} from sandbox: {exc}"


async def _seed_trackio_dashboard_safe(session: Any, space_id: str) -> None:
    """Idempotently seed *space_id* with trackio dashboard files using the
    session's HF token. Logs progress, swallows errors β€” a failed seed should
    not block sandbox creation."""
    if not session or not getattr(session, "hf_token", None):
        return
    loop = asyncio.get_running_loop()

    def _log(msg: str) -> None:
        loop.call_soon_threadsafe(
            session.event_queue.put_nowait,
            Event(event_type="tool_log", data={"tool": "sandbox_create", "log": msg}),
        )

    try:
        await asyncio.to_thread(
            ensure_trackio_dashboard, space_id, session.hf_token, _log
        )
    except Exception as e:
        _log(f"trackio dashboard seed failed: {e}")


async def _update_persisted_sandbox_fields(session: Any, **fields: Any) -> None:
    """Best-effort update of sandbox metadata on the durable session record."""
    store = getattr(session, "persistence_store", None)
    session_id = getattr(session, "session_id", None)
    if not (store and session_id and hasattr(store, "update_session_fields")):
        return
    try:
        await store.update_session_fields(session_id, **fields)
    except Exception as e:
        logger.warning("Failed to persist sandbox metadata for %s: %s", session_id, e)


async def _persist_active_sandbox(
    session: Any,
    sandbox: Sandbox,
    *,
    hardware: str,
) -> None:
    space_id = getattr(sandbox, "space_id", None)
    if not space_id:
        return
    owner = space_id.split("/", 1)[0] if "/" in space_id else None
    await _update_persisted_sandbox_fields(
        session,
        sandbox_space_id=space_id,
        sandbox_hardware=hardware,
        sandbox_owner=owner,
        sandbox_created_at=datetime.now(timezone.utc),
        sandbox_status="active",
    )


async def _clear_persisted_sandbox(session: Any) -> None:
    """Reset the persisted sandbox metadata and mark the sandbox destroyed.

    Space id, hardware, owner and creation time are set to ``None`` and the
    status becomes ``"destroyed"``.
    """
    cleared: dict[str, Any] = dict.fromkeys(
        (
            "sandbox_space_id",
            "sandbox_hardware",
            "sandbox_owner",
            "sandbox_created_at",
        )
    )
    cleared["sandbox_status"] = "destroyed"
    await _update_persisted_sandbox_fields(session, **cleared)


# ── Sandbox lifecycle: orphan sweep, creation, preload, teardown ──────


def _cleanup_user_orphan_sandboxes(
    api: HfApi,
    owner: str,
    log: Any,
) -> int:
    """Delete stale ``sandbox-<8hex>`` Spaces in ``owner``'s account.

    "Stale" = not modified within ``_ORPHAN_STALE_AFTER``. The naming pattern
    and the staleness filter together make this safe:

    * Naming: only ``sandbox-<exactly 8 lowercase hex>`` is matched, the
      pattern Sandbox.create produces. User-renamed Spaces are not touched.
    * Staleness: anything modified more recently might still be tied to a
      live session in another tab/replica, so it is left alone.

    Spaces whose modification time is missing or unparseable are skipped.
    A timestamp without timezone information is interpreted as UTC so it can
    be compared against the (timezone-aware) cutoff.

    Runs blocking; call via ``asyncio.to_thread``. Best-effort: listing and
    delete failures are logged through *log* and never raised, so a flaky HF
    API never blocks creation.

    Args:
        api: Hub client used to list and delete Spaces.
        owner: Account whose Spaces are scanned.
        log: Callable taking one message string.

    Returns:
        Number of Spaces deleted.
    """
    cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
    try:
        spaces = list(api.list_spaces(author=owner, limit=200, full=True))
    except Exception as e:
        log(f"orphan sweep: list_spaces failed: {e}")
        return 0

    deleted = 0
    for space in spaces:
        space_name = space.id.rsplit("/", 1)[-1]
        if not SANDBOX_SPACE_NAME_RE.match(space_name):
            continue

        last_mod = getattr(space, "lastModified", None) or getattr(
            space, "last_modified", None
        )
        if isinstance(last_mod, str):
            try:
                last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
            except ValueError:
                last_mod = None
        if not isinstance(last_mod, datetime):
            # Without a usable timestamp we cannot prove the Space is stale.
            log(f"orphan sweep: skipping {space.id}; missing lastModified")
            continue
        if last_mod.tzinfo is None:
            # Comparing naive with aware datetimes raises TypeError, which
            # would break the "never raises" contract; assume UTC.
            last_mod = last_mod.replace(tzinfo=timezone.utc)
        if last_mod > cutoff:
            # Recent: could be a concurrent live session. Skip.
            continue

        try:
            api.delete_repo(repo_id=space.id, repo_type="space")
        except Exception as e:
            log(f"orphan sweep: failed to delete {space.id}: {e}")
        else:
            deleted += 1
            log(f"orphan sweep: deleted {space.id}")

    if deleted:
        log(f"orphan sweep: cleaned up {deleted} stale sandbox(es) before create")
    return deleted


async def _ensure_sandbox(
    session: Any,
    hardware: str = DEFAULT_CPU_SANDBOX_HARDWARE,
    extra_secrets: dict[str, str] | None = None,
    cancel_event: threading.Event | None = None,
    **create_kwargs,
) -> tuple[Sandbox | None, str | None]:
    """Return the session's sandbox, creating one on *hardware* if needed.

    Creation is serialized per HF account through the per-owner creation
    lock; the session is re-checked once the lock is held in case another
    task attached a sandbox while this one was waiting.

    Args:
        session: Active session; must expose ``hf_token``.
        hardware: Hardware tier for a newly created sandbox.
        extra_secrets: Extra secrets forwarded to the creation step.
        cancel_event: Optional thread event forwarded to the creation step.
        **create_kwargs: Extra keyword arguments forwarded to the creation
            step.

    Returns:
        ``(sandbox, None)`` on success, ``(None, error_message)`` otherwise.

    Raises:
        Exception: Whatever ``HfApi.whoami`` or the creation step raises;
            callers are expected to handle it.
    """
    if session and getattr(session, "sandbox", None):
        return session.sandbox, None

    if not session:
        return None, "No session available."

    token = session.hf_token
    if not token:
        return None, "No HF token available. Cannot create sandbox."

    api = HfApi(token=token)
    # whoami() performs a blocking HTTP request; run it in a worker thread so
    # the event loop stays responsive while the Hub answers.
    user_info = await asyncio.to_thread(api.whoami)
    owner = user_info.get("name") or user_info.get("user") or ""
    if not owner:
        return None, "Could not determine HF username from token."

    create_lock = _get_sandbox_create_lock(owner)
    if create_lock.locked():
        # Another creation for this owner is in flight; tell the user why
        # this call is about to wait.
        await session.send_event(
            Event(
                event_type="tool_log",
                data={
                    "tool": "sandbox",
                    "log": "Waiting for sandbox creation slot...",
                },
            )
        )

    async with create_lock:
        # A concurrent caller may have attached a sandbox while we waited.
        if getattr(session, "sandbox", None):
            return session.sandbox, None

        return await _create_sandbox_locked(
            session,
            api=api,
            owner=owner,
            hardware=hardware,
            extra_secrets=extra_secrets,
            cancel_event=cancel_event,
            **create_kwargs,
        )


async def _create_sandbox_locked(
    session: Any,
    *,
    api: HfApi,
    owner: str,
    hardware: str,
    extra_secrets: dict[str, str] | None = None,
    cancel_event: threading.Event | None = None,
    **create_kwargs,
) -> tuple[Sandbox | None, str | None]:
    """Create the Space while the per-owner sandbox creation lock is held.

    Runs ``Sandbox.create`` in a worker thread, then attaches the result to
    the session, persists its metadata, records telemetry and emits a
    "Sandbox ready" log event.

    Args:
        session: Session providing ``hf_token``, ``send_event`` and
            ``_cancelled``.
        api: Hub client. Accepted for the caller's convenience; not used in
            this function's body.
        owner: Account under which the Space is created.
        hardware: Hardware tier passed to ``Sandbox.create``.
        extra_secrets: Extra secrets merged over ``HF_TOKEN``; entries with
            falsy values are dropped.
        cancel_event: Thread event that aborts creation when set. A fresh
            one is created when omitted.
        **create_kwargs: Forwarded to ``Sandbox.create``; ``private`` is
            always forced to ``True``.

    Returns:
        ``(sandbox, None)`` on success, or
        ``(None, "Sandbox creation cancelled by user.")`` when creation was
        cancelled. Exceptions other than ``Sandbox.Cancelled`` raised by
        ``Sandbox.create`` propagate to the caller.
    """
    token = session.hf_token
    await session.send_event(
        Event(
            event_type="tool_log",
            data={
                "tool": "sandbox",
                "log": f"Auto-creating sandbox for {owner} ({hardware})...",
            },
        )
    )

    # Thread-safe log callback: posts tool_log events from worker threads.
    # Falls back to a no-op when the session has no event queue.
    _log = _session_tool_logger(session) or (lambda msg: None)

    # Bridge asyncio cancel event to a threading.Event for the blocking create call.
    # A background task waits on session._cancelled on the main loop and sets
    # a threading.Event that is handed to Sandbox.create as ``cancel_event``.
    cancel_flag = cancel_event or threading.Event()

    async def _watch_cancel():
        await session._cancelled.wait()
        cancel_flag.set()

    watcher_task = asyncio.create_task(_watch_cancel())

    # HF_TOKEN is always injected; caller-supplied secrets are layered on top.
    secrets: dict[str, str] = {"HF_TOKEN": token}
    if extra_secrets:
        secrets.update({k: v for k, v in extra_secrets.items() if v})

    create_kwargs["private"] = True  # enforce: overrides any caller-supplied value
    kwargs = {
        "owner": owner,
        "hardware": hardware,
        "token": token,
        "secrets": secrets,
        "log": _log,
        "cancel_event": cancel_flag,
        **create_kwargs,
    }
    if hardware != DEFAULT_CPU_SANDBOX_HARDWARE:
        # NOTE(review): presumably an idle timeout in seconds (45 min) for
        # non-default hardware; confirm against Sandbox.create.
        kwargs["sleep_time"] = 2700
    import time as _t

    # Monotonic start time, used for the creation-latency telemetry below.
    _t_start = _t.monotonic()
    try:
        sb = await asyncio.to_thread(Sandbox.create, **kwargs)
    except Sandbox.Cancelled:
        return None, "Sandbox creation cancelled by user."
    finally:
        # The watcher is only needed while the blocking create is running.
        watcher_task.cancel()

    # The flag may have been set after create already returned a sandbox:
    # drop that sandbox (deleting it if we own the Space) instead of
    # attaching it to the session.
    if cancel_flag.is_set():
        if getattr(sb, "_owns_space", False):
            try:
                await asyncio.to_thread(sb.delete, log=_log)
            except Exception as e:
                logger.warning(
                    "Failed to delete cancelled sandbox %s: %s", sb.space_id, e
                )
        return None, "Sandbox creation cancelled by user."

    session.sandbox = sb
    session.sandbox_hardware = hardware
    session.sandbox_preload_error = None
    await _persist_active_sandbox(session, sb, hardware=hardware)

    # Telemetry: sandbox creation (infra consumption signal)
    from agent.core import telemetry

    await telemetry.record_sandbox_create(
        session,
        sb,
        hardware=hardware,
        create_latency_s=int(_t.monotonic() - _t_start),
    )

    await session.send_event(
        Event(
            event_type="tool_log",
            data={"tool": "sandbox", "log": f"Sandbox ready: {sb.space_id} ({sb.url})"},
        )
    )

    return sb, None


def start_cpu_sandbox_preload(session: Any) -> asyncio.Task | None:
    """Kick off background creation of the session's ``cpu-basic`` sandbox.

    Returns:
        ``None`` when there is no session or it already has a sandbox; the
        existing preload task when one is still running; otherwise a newly
        scheduled task. The new task and its cancel event are stored on the
        session, and any failure is recorded in
        ``session.sandbox_preload_error``.
    """
    if not session or getattr(session, "sandbox", None):
        return None

    in_flight = getattr(session, "sandbox_preload_task", None)
    if in_flight and not in_flight.done():
        return in_flight

    stop_flag = threading.Event()
    session.sandbox_preload_cancel_event = stop_flag
    session.sandbox_preload_error = None

    async def _preload() -> Sandbox | None:
        try:
            sandbox, failure = await _ensure_sandbox(
                session,
                hardware=DEFAULT_CPU_SANDBOX_HARDWARE,
                cancel_event=stop_flag,
            )
        except asyncio.CancelledError:
            # Let the blocking create call see the cancellation as well.
            stop_flag.set()
            session.sandbox_preload_error = "Sandbox creation cancelled by user."
            raise
        except Exception as exc:
            session.sandbox_preload_error = f"Failed to create sandbox: {exc}"
            logger.warning("CPU sandbox preload failed: %s", exc)
            return None

        if failure:
            session.sandbox_preload_error = failure
            return None
        return sandbox

    preload_task = asyncio.create_task(_preload())
    session.sandbox_preload_task = preload_task
    return preload_task


async def cancel_sandbox_preload(session: Any) -> None:
    """Ask an in-flight CPU sandbox preload to stop and wait for it briefly.

    Sets the preload's cancel event (if any), then waits up to 30 seconds
    for the preload task to finish on its own. If it does not, the task is
    cancelled. Returns immediately when there is no unfinished task or when
    called from inside the preload task itself. Errors raised by the preload
    task are ignored; cancellation of the caller propagates.
    """
    stop_flag = getattr(session, "sandbox_preload_cancel_event", None)
    if stop_flag is not None:
        stop_flag.set()

    preload = getattr(session, "sandbox_preload_task", None)
    # Nothing to wait for, or we *are* the preload and would wait on ourselves.
    if not preload or preload.done() or preload is asyncio.current_task():
        return

    try:
        # shield(): a timeout here must not cancel the preload implicitly;
        # that is done explicitly (and logged) below.
        await asyncio.wait_for(asyncio.shield(preload), timeout=30)
    except asyncio.TimeoutError:
        logger.warning(
            "Timed out waiting for CPU sandbox preload cancellation; "
            "task is still live, cancelling asyncio wrapper"
        )
        preload.cancel()
    except asyncio.CancelledError:
        raise
    except Exception:
        pass


async def get_active_or_preloaded_sandbox(
    session: Any,
) -> tuple[Sandbox | None, str | None]:
    """Return the active sandbox, waiting for the startup preload if needed.

    Returns:
        ``(sandbox, None)`` when the session has a sandbox (possibly after
        waiting for the preload task), otherwise ``(None, message)`` where the
        message is the recorded preload error or a "still starting" notice.

    Raises:
        asyncio.CancelledError: Only when the *caller* is being cancelled.
            A preload task that was itself cancelled is reported through the
            returned error message instead of cancelling the caller.
    """
    if not session:
        return None, "No session available."
    if getattr(session, "sandbox", None):
        return session.sandbox, None

    task = getattr(session, "sandbox_preload_task", None)
    if task:
        try:
            # shield(): cancelling this waiter must not cancel the preload.
            await asyncio.shield(task)
        except asyncio.CancelledError:
            # CancelledError arrives both when we are cancelled and when the
            # preload task was cancelled. Only the former may propagate.
            # Task.cancelling() exists on Python 3.11+; older versions fall
            # back to checking the preload task's own state.
            cancelling = getattr(asyncio.current_task(), "cancelling", None)
            caller_cancelled = bool(cancelling and cancelling())
            if caller_cancelled or not task.cancelled():
                raise
            if not getattr(session, "sandbox_preload_error", None):
                session.sandbox_preload_error = "Sandbox creation cancelled by user."
        except Exception as e:
            session.sandbox_preload_error = f"Failed to create sandbox: {e}"

    if getattr(session, "sandbox", None):
        return session.sandbox, None

    preload_error = getattr(session, "sandbox_preload_error", None)
    if preload_error:
        return None, preload_error

    return None, "Sandbox is still starting. Please retry shortly."


async def teardown_session_sandbox(session: Any) -> None:
    """Cancel sandbox preload and delete the active owned sandbox, if present.

    The session's ``sandbox`` and ``sandbox_hardware`` are cleared first.
    A sandbox whose Space is owned (``_owns_space``) is deleted with up to
    three attempts and exponential back-off (1s, 2s). Whenever the session
    had a sandbox, its persisted metadata is cleared afterwards, whether or
    not the delete succeeded.

    Destroy telemetry is recorded only after a successful delete and is
    best-effort: a telemetry failure is logged and never causes another
    delete attempt.
    """
    if not session:
        return

    await cancel_sandbox_preload(session)

    sandbox = getattr(session, "sandbox", None)
    session.sandbox = None
    session.sandbox_hardware = None

    if not sandbox:
        return

    try:
        if not getattr(sandbox, "_owns_space", False):
            return

        space_id = getattr(sandbox, "space_id", None)
        delete_log = _session_tool_logger(session)
        last_err: Exception | None = None
        for attempt in range(3):
            try:
                logger.info(
                    "Deleting sandbox %s (attempt %s/3)...",
                    space_id,
                    attempt + 1,
                )
                await asyncio.to_thread(sandbox.delete, log=delete_log)
            except Exception as e:
                last_err = e
                if attempt < 2:
                    await asyncio.sleep(2**attempt)
                continue

            # The Space is gone. Keep telemetry outside the retry scope so a
            # telemetry error is not mistaken for a failed delete.
            try:
                from agent.core import telemetry

                await telemetry.record_sandbox_destroy(session, sandbox)
            except Exception as e:
                logger.warning(
                    "Failed to record sandbox destroy telemetry for %s: %s",
                    space_id,
                    e,
                )
            return

        logger.error(
            "Failed to delete sandbox %s after 3 attempts: %s. "
            "Orphan — sweep script will pick it up.",
            space_id,
            last_err,
        )
    finally:
        await _clear_persisted_sandbox(session)


# ── sandbox_create tool ──────────────────────────────────────────────

# Tool schema for ``sandbox_create``: the name, the model-facing description
# and the JSON-schema parameters. The hardware list and enum are generated
# from ``SpaceHardware`` so they track the installed huggingface_hub version.
SANDBOX_CREATE_TOOL_SPEC = {
    "name": "sandbox_create",
    "description": (
        "Create or replace the session sandbox when non-default hardware is needed.\n\n"
        "A private cpu-basic sandbox is already started automatically for each session. "
        "For normal CPU code execution, call bash/read/write/edit directly; do NOT call sandbox_create first.\n\n"
        "Use sandbox_create when: you need GPU hardware, cpu-upgrade, or Trackio secrets before running code. "
        "The active sandbox persists across tool calls within the session. pip install works out of the box. "
        "Sandboxes are always created as private HF Spaces.\n\n"
        "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
        "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
        "Before choosing hardware, estimate your VRAM needs (models you run, training data size). Rule of thumb: bf16/fp16 ≈ 2 bytes/param, "
        "fp32 ≈ 4 bytes/param, plus ~20% overhead for optimizer states during training.\n"
        "Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
        "If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
        "If you intend to run a training script in this sandbox that uses report_to='trackio', "
        "pass `trackio_space_id` (e.g. '<username>/ml-intern-<8char>') and `trackio_project` so they "
        "are set as TRACKIO_SPACE_ID/TRACKIO_PROJECT secrets in the sandbox and the UI can embed the live dashboard.\n\n"
        "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
    ),
    "parameters": {
        "type": "object",
        "required": [],
        "additionalProperties": False,
        "properties": {
            "hardware": {
                "type": "string",
                "enum": [e.value for e in SpaceHardware],
                "description": (
                    "Hardware tier for the sandbox. Omit for the existing auto-started "
                    "cpu-basic sandbox; choose GPU/cpu-upgrade only when needed."
                ),
            },
            "trackio_space_id": {
                "type": "string",
                "description": (
                    "Optional. The HF Space hosting the trackio dashboard for runs in this sandbox "
                    "(e.g. '<username>/ml-intern-<8char>', under YOUR HF namespace). Injected as "
                    "TRACKIO_SPACE_ID secret and surfaced to the UI. The Space is auto-created and "
                    "seeded with the trackio dashboard — DO NOT pre-create it via hf_repo_git, "
                    "that produces an empty Space that breaks the embed."
                ),
            },
            "trackio_project": {
                "type": "string",
                "description": (
                    "Optional. The trackio project name. Injected as TRACKIO_PROJECT secret and "
                    "used by the UI to filter the embedded dashboard to this project."
                ),
            },
        },
    },
}


async def sandbox_create_handler(
    args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
) -> tuple[str, bool]:
    """Handle ``sandbox_create`` tool calls.

    Behaviour, in order:

    1. While the cpu-basic preload is still running: wait for it when
       cpu-basic is wanted, otherwise cancel it.
    2. If the session already has a sandbox: report it when the hardware
       matches; replace it when it is the auto-started cpu-basic sandbox and
       different hardware is requested; otherwise refuse to change hardware
       and report the existing sandbox.
    3. Otherwise seed the trackio dashboard (when requested) and create a
       new sandbox.

    Args:
        args: Tool arguments: ``hardware``, ``trackio_space_id`` and
            ``trackio_project``, all optional. A missing, null or empty
            ``hardware`` means "use the default cpu-basic tier".
        session: Active session, or ``None``.
        tool_call_id: Id of this tool call; needed to emit the trackio
            ``tool_state_change`` event.

    Returns:
        ``(message, success)``.
    """
    # Normalize first: an explicit null/empty "hardware" must behave like an
    # omitted one instead of being treated as a non-default tier.
    requested_hardware = args.get("hardware") or None
    hardware = requested_hardware or DEFAULT_CPU_SANDBOX_HARDWARE
    trackio_space_id = args.get("trackio_space_id") or None
    trackio_project = args.get("trackio_project") or None

    async def _emit_trackio_state() -> None:
        """Tell the frontend which trackio dashboard to embed for this sandbox."""
        if not (session and tool_call_id and trackio_space_id):
            return
        data: dict[str, Any] = {
            "tool_call_id": tool_call_id,
            "tool": "sandbox_create",
            "state": "running",
            "trackioSpaceId": trackio_space_id,
        }
        if trackio_project:
            data["trackioProject"] = trackio_project
        await session.send_event(Event(event_type="tool_state_change", data=data))

    def _already_active(sb: Sandbox, detail: str) -> tuple[str, bool]:
        """Format the reply used whenever an existing sandbox is reported."""
        return (
            f"Sandbox already active: {sb.space_id}\n"
            f"URL: {sb.url}\n"
            f"{detail}\n"
            "Use bash/read/write/edit to interact with it."
        ), True

    preload_task = getattr(session, "sandbox_preload_task", None)
    preload_in_flight = bool(
        session
        and not getattr(session, "sandbox", None)
        and preload_task
        and not preload_task.done()
    )
    if preload_in_flight:
        if hardware == DEFAULT_CPU_SANDBOX_HARDWARE:
            # The preload is building exactly what was asked for; wait for it.
            sb, error = await get_active_or_preloaded_sandbox(session)
            if error:
                return error, False
            if sb:
                await _emit_trackio_state()
                return _already_active(
                    sb, f"Hardware: {DEFAULT_CPU_SANDBOX_HARDWARE}"
                )
        else:
            # Different hardware requested; the cpu-basic preload is not needed.
            await cancel_sandbox_preload(session)

    # If sandbox already exists, return its info or replace the auto CPU sandbox
    if session and getattr(session, "sandbox", None):
        sb = session.sandbox
        active_hardware = getattr(session, "sandbox_hardware", None)
        if active_hardware == hardware:
            await _emit_trackio_state()
            return _already_active(sb, f"Hardware: {active_hardware}")

        if (
            active_hardware == DEFAULT_CPU_SANDBOX_HARDWARE
            and hardware != DEFAULT_CPU_SANDBOX_HARDWARE
        ):
            # Replace the auto-started CPU sandbox; creation continues below.
            await teardown_session_sandbox(session)
        elif requested_hardware:
            lockout_note = (
                f"\nRequested hardware: {requested_hardware}\n"
                "Hardware cannot be changed by calling sandbox_create again. "
                "Delete the existing sandbox first if you need a different tier."
            )
            await _emit_trackio_state()
            return _already_active(sb, lockout_note)
        else:
            await _emit_trackio_state()
            return _already_active(sb, f"Hardware: {active_hardware or 'unknown'}")

    extra_secrets: dict[str, str] = {}
    if trackio_space_id:
        extra_secrets["TRACKIO_SPACE_ID"] = trackio_space_id
        await _seed_trackio_dashboard_safe(session, trackio_space_id)
    if trackio_project:
        extra_secrets["TRACKIO_PROJECT"] = trackio_project

    try:
        sb, error = await _ensure_sandbox(
            session,
            hardware=hardware,
            extra_secrets=extra_secrets or None,
        )
    except Exception as e:
        return f"Failed to create sandbox: {e}", False

    if error:
        return error, False

    await _emit_trackio_state()

    return (
        f"Sandbox created: {sb.space_id}\n"
        f"URL: {sb.url}\n"
        f"Hardware: {hardware}\n"
        "Visibility: private\n"
        "Use bash/read/write/edit to interact with it."
    ), True


def _make_tool_handler(sandbox_tool_name: str):
    """Factory: create a handler for a sandbox operation tool."""

    async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
        sb, error = await get_active_or_preloaded_sandbox(session)
        if error:
            return error, False
        if not sb:
            return "Sandbox is still starting. Please retry shortly.", False

        try:
            if sandbox_tool_name == "bash" and args.get("command"):
                args = {
                    **args,
                    "command": wrap_shell_command_with_hub_artifact_bootstrap(
                        args["command"],
                        session,
                    ),
                }
            result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
            if result.success:
                output = result.output or "(no output)"
                return output, True
            else:
                error_msg = result.error or "Unknown error"
                output = result.output
                if output:
                    return f"{output}\n\nERROR: {error_msg}", False
                return f"ERROR: {error_msg}", False
        except Exception as e:
            return f"Sandbox operation failed: {e}", False

    return handler


def get_sandbox_tools():
    """Return the sandbox ToolSpecs: ``sandbox_create`` first, then one
    ToolSpec per entry in ``Sandbox.TOOLS``, in that mapping's order."""
    from agent.core.tools import ToolSpec

    # Shared preamble put in front of every operation tool's own description.
    preamble = (
        "Uses the session's active sandbox. A private cpu-basic sandbox is "
        "started automatically for normal CPU work; call sandbox_create only "
        "for GPU or other non-default hardware.\n\n"
    )

    # sandbox_create (for GPU or other non-default hardware)
    create_tool = ToolSpec(
        name=SANDBOX_CREATE_TOOL_SPEC["name"],
        description=SANDBOX_CREATE_TOOL_SPEC["description"],
        parameters=SANDBOX_CREATE_TOOL_SPEC["parameters"],
        handler=sandbox_create_handler,
    )

    # Operation tools (auto-execute, no approval needed)
    operation_tools = [
        ToolSpec(
            name=tool_name,
            description=preamble + tool_spec["description"],
            parameters=tool_spec["parameters"],
            handler=_make_tool_handler(tool_name),
        )
        for tool_name, tool_spec in Sandbox.TOOLS.items()
    ]

    return [create_tool, *operation_tools]