ml-intern

Sleeping

App Files Files Community

lewtun HF Staff OpenAI Codex committed 30 days ago

Commit

a436c02

2 Parent(s): ab0ff24 7636865

Deploy 2026-05-04

Browse files

Co-authored-by: OpenAI Codex <codex@openai.com>

Files changed (12) hide show

agent/context_manager/manager.py +148 -17
agent/core/agent_loop.py +59 -9
agent/tools/sandbox_client.py +58 -1
agent/tools/sandbox_tool.py +73 -19
backend/routes/agent.py +4 -2
backend/session_manager.py +133 -2
scripts/sweep_orphan_sandboxes.py +3 -8
tests/unit/test_agent_model_gating.py +56 -0
tests/unit/test_compaction_loop_break.py +360 -0
tests/unit/test_dangling_tool_calls.py +0 -9
tests/unit/test_sandbox_private_spaces.py +125 -0
tests/unit/test_session_manager_persistence.py +201 -0

agent/context_manager/manager.py CHANGED Viewed

@@ -79,6 +79,23 @@ _COMPACT_PROMPT = (
     "will be have to be filled in."
 )
 # Used when seeding a brand-new session from prior browser-cached messages.
 # Here we're writing a note to *ourselves* — so preserve the tool-call trail,
 # files produced, and planned next steps in first person. Optimized for
@@ -240,8 +257,6 @@ class ContextManager:
         """Add a message to the history"""
         if token_count:
             self.running_context_usage = token_count
-        if not getattr(message, "timestamp", None):
-            message.timestamp = datetime.now().isoformat()
         self.items.append(message)
         if self.on_message_added:
             self.on_message_added(message)
@@ -314,7 +329,6 @@ class ContextManager:
                             content="Tool was not executed (interrupted or error).",
                             tool_call_id=tc.id,
                             name=tc.function.name,
-                            timestamp=datetime.now().isoformat(),
                         )
                     )
@@ -374,6 +388,81 @@ class ContextManager:
     # NOTE(review): callers read this without calling it
     # (``if not self.needs_compaction``), so a ``@property`` decorator
     # presumably sits just above this hunk — confirm.
     def needs_compaction(self) -> bool:
         """Return True when tracked usage exceeds the threshold and history is non-empty."""
         return self.running_context_usage > self.compaction_threshold and bool(self.items)
     async def compact(
         self,
         model_name: str,
@@ -386,6 +475,13 @@ class ContextManager:
         ``session`` is optional — if passed, the underlying summarization
         LLM call is recorded via ``telemetry.record_llm_call(kind=
         "compaction")`` so its cost shows up in ``total_cost_usd``.
         """
         if not self.needs_compaction:
             return
@@ -409,12 +505,45 @@ class ContextManager:
         idx = len(self.items) - self.untouched_messages
         while idx > 1 and self.items[idx].role != "user":
             idx -= 1
         recent_messages = self.items[idx:]
         messages_to_summarize = self.items[first_user_idx + 1:idx]
-        # improbable, messages would have to very long
         if not messages_to_summarize:
             return
         summary, completion_tokens = await summarize_messages(
@@ -430,7 +559,6 @@ class ContextManager:
         summarized_message = Message(
             role="assistant",
             content=summary,
-            timestamp=datetime.now().isoformat(),
         )
         # Reconstruct: system + first user msg + summary + recent messages
@@ -439,16 +567,19 @@ class ContextManager:
             head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
-        # Count the actual post-compact context — system prompt + first user
-        # turn + summary + the preserved tail all contribute, not just the
-        # summary. litellm.token_counter uses the model's real tokenizer.
-        from litellm import token_counter
-        try:
-            self.running_context_usage = token_counter(
-                model=model_name,
-                messages=[m.model_dump() for m in self.items],
             )
-        except Exception as e:
-            logger.warning("token_counter failed post-compact (%s); falling back to rough estimate", e)
-            self.running_context_usage = len(self.system_prompt) // 4 + completion_tokens

     "will be have to be filled in."
 )
+# Per-message ceiling. If a single message in the "untouched" tail is larger
+# than this, compaction can't recover even after summarizing the middle —
+# producing the infinite compaction loop seen 2026-05-03 in pod logs (200k
+# context shrinks to 200k+ because one tool output is 80k tokens). We replace
+# such messages with a placeholder before compaction runs.
+# Unit: tokens, as counted per message by litellm's ``token_counter`` in
+# ``ContextManager._truncate_oversized``.
+_MAX_TOKENS_PER_MESSAGE = 50_000
+class CompactionFailedError(Exception):
+    """Raised when compaction can't reduce context below the threshold.
+    Typically means an individual preserved message (system, first user, or
+    untouched tail) exceeds what truncation can fix in one pass. The caller
+    must terminate the session — retrying produces an infinite loop that
+    burns Bedrock budget for free (~$3 per re-attempt on Opus).
+    Raised by ``ContextManager.compact`` when the recomputed usage is still
+    above ``compaction_threshold`` after truncation (and summarization, if
+    there was anything to summarize).
+    """
 # Used when seeding a brand-new session from prior browser-cached messages.
 # Here we're writing a note to *ourselves* — so preserve the tool-call trail,
 # files produced, and planned next steps in first person. Optimized for
         """Add a message to the history"""
         if token_count:
             self.running_context_usage = token_count
         self.items.append(message)
         if self.on_message_added:
             self.on_message_added(message)
                             content="Tool was not executed (interrupted or error).",
                             tool_call_id=tc.id,
                             name=tc.function.name,
                         )
                     )
     # NOTE(review): callers read this without calling it
     # (``if not self.needs_compaction``), so a ``@property`` decorator
     # presumably sits just above this hunk — confirm.
     def needs_compaction(self) -> bool:
         """Return True when tracked usage exceeds the threshold and history is non-empty."""
         return self.running_context_usage > self.compaction_threshold and bool(self.items)
+    def _truncate_oversized(
+        self, messages: list[Message], model_name: str
+    ) -> list[Message]:
+        """Replace any message > _MAX_TOKENS_PER_MESSAGE with a placeholder.
+        These are typically tool outputs (CSV dumps, file contents) sitting in
+        the untouched tail or first-user position that compaction can't shrink
+        — they pass through verbatim, keeping context above threshold and
+        triggering an infinite compaction retry loop.
+        Args:
+            messages: Messages to inspect; the input list itself is not modified.
+            model_name: Model identifier passed to litellm's ``token_counter``.
+        Returns:
+            A new list of the same length and order. System messages, messages
+            at or under the ceiling, and messages whose token count could not
+            be computed are returned as-is; oversized ones are replaced by a
+            new ``Message`` with placeholder content.
+        """
+        from litellm import token_counter
+        out: list[Message] = []
+        for msg in messages:
+            # System messages are sacred — they're the agent's instructions.
+            # In edge cases (items < untouched_messages), the slice math in
+            # compact() can let items[0] (the system message) leak into the
+            # recent_messages list. Defense-in-depth: never truncate it.
+            if msg.role == "system":
+                out.append(msg)
+                continue
+            try:
+                # Count this message on its own, not the whole conversation.
+                n = token_counter(model=model_name, messages=[msg.model_dump()])
+            except Exception:
+                # token_counter occasionally fails on edge-case content;
+                # don't drop the message, just keep it as-is.
+                out.append(msg)
+                continue
+            if n <= _MAX_TOKENS_PER_MESSAGE:
+                out.append(msg)
+                continue
+            placeholder = (
+                f"[truncated for compaction — original was {n} tokens, "
+                f"removed to keep context under {self.compaction_threshold} tokens]"
+            )
+            # The "after" figure in the log is a rough chars/4 estimate of the
+            # placeholder, not a tokenizer count.
+            logger.warning(
+                "Truncating %s message: %d -> %d tokens for compaction",
+                msg.role, n, len(placeholder) // 4,
+            )
+            # Preserve all known assistant-side fields (tool_calls, thinking_blocks,
+            # reasoning_content, provider_specific_fields) even when content is
+            # replaced. Anthropic extended-thinking models reject the next request
+            # with "Invalid signature in thinking block" if thinking_blocks is
+            # dropped from a prior assistant message.
+            # Only fields that are present and not None are forwarded.
+            kept = {
+                k: getattr(msg, k, None)
+                for k in (
+                    "tool_call_id",
+                    "tool_calls",
+                    "name",
+                    "thinking_blocks",
+                    "reasoning_content",
+                    "provider_specific_fields",
+                )
+                if getattr(msg, k, None) is not None
+            }
+            out.append(Message(role=msg.role, content=placeholder, **kept))
+        return out
+    def _recompute_usage(self, model_name: str) -> None:
+        """Refresh ``running_context_usage`` from current items via real tokenizer.
+        Counts every message in ``self.items`` with litellm's ``token_counter``
+        for ``model_name``. If that call raises, logs a warning and falls back
+        to total content length divided by four.
+        """
+        from litellm import token_counter
+        try:
+            self.running_context_usage = token_counter(
+                model=model_name,
+                messages=[m.model_dump() for m in self.items],
+            )
+        except Exception as e:
+            logger.warning("token_counter failed (%s); rough estimate", e)
+            # Rough fallback: 4 chars per token.
+            # Missing or None content counts as zero.
+            # NOTE(review): if ``content`` can be a list of parts rather than a
+            # string, ``len`` counts parts, not characters — confirm.
+            self.running_context_usage = sum(
+                len(getattr(m, "content", "") or "") for m in self.items
+            ) // 4
     async def compact(
         self,
         model_name: str,
         ``session`` is optional — if passed, the underlying summarization
         LLM call is recorded via ``telemetry.record_llm_call(kind=
         "compaction")`` so its cost shows up in ``total_cost_usd``.
+        Raises ``CompactionFailedError`` if the post-compact context is still
+        over the threshold. This happens when a preserved message (typically
+        a giant tool output stuck in the untouched tail) is too large for
+        truncation to fix. The caller must terminate the session — retrying
+        is what caused the 2026-05-03 infinite-compaction-loop pattern that
+        burned Bedrock budget invisibly.
         """
         if not self.needs_compaction:
             return
         idx = len(self.items) - self.untouched_messages
         while idx > 1 and self.items[idx].role != "user":
             idx -= 1
+        # The real invariant is "idx must be strictly after first_user_idx,
+        # otherwise recent_messages overlaps with the messages we put in
+        # head". The walk-back's `idx > 1` guard is necessary (no system in
+        # recent) but insufficient (first_user is also in head and would be
+        # duplicated). Anthropic API rejects two consecutive user messages
+        # with a 400 — bot review on PR #213 caught this on the second clamp
+        # iteration.
+        if idx <= first_user_idx:
+            idx = first_user_idx + 1
         recent_messages = self.items[idx:]
         messages_to_summarize = self.items[first_user_idx + 1:idx]
+        # Truncate any message that's larger than _MAX_TOKENS_PER_MESSAGE in
+        # the parts we PRESERVE through compaction (first_user + recent_tail).
+        # These are the only places where individual messages can defeat
+        # compaction by being intrinsically too large. Messages in
+        # ``messages_to_summarize`` are folded into the summary, so their size
+        # doesn't matter on its own.
+        if first_user_msg is not None:
+            truncated = self._truncate_oversized([first_user_msg], model_name)
+            first_user_msg = truncated[0]
+        recent_messages = self._truncate_oversized(recent_messages, model_name)
+        # If there's nothing to summarize but the preserved messages are now
+        # truncated and small, just rebuild and recompute. This is rare but
+        # avoids returning silently with the old (over-threshold) state.
         if not messages_to_summarize:
+            head = [system_msg] if system_msg else []
+            if first_user_msg:
+                head.append(first_user_msg)
+            self.items = head + recent_messages
+            self._recompute_usage(model_name)
+            if self.running_context_usage > self.compaction_threshold:
+                raise CompactionFailedError(
+                    f"Nothing to summarize but context ({self.running_context_usage}) "
+                    f"still over threshold ({self.compaction_threshold}) after truncation. "
+                    f"System prompt or first user message likely exceeds the budget."
+                )
             return
         summary, completion_tokens = await summarize_messages(
         summarized_message = Message(
             role="assistant",
             content=summary,
         )
         # Reconstruct: system + first user msg + summary + recent messages
             head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
+        self._recompute_usage(model_name)
+        # Hard verify: if compaction didn't bring us below the threshold even
+        # after truncating oversized preserved messages, retrying just burns
+        # Bedrock budget on the same useless compaction call. Raise so the
+        # caller can terminate the session cleanly. Pre-2026-05-04, the
+        # caller looped indefinitely (~$3/Opus retry) until the pod was
+        # killed — invisible to the dataset because the session never
+        # finished cleanly.
+        if self.running_context_usage > self.compaction_threshold:
+            raise CompactionFailedError(
+                f"Compaction ineffective: {self.running_context_usage} tokens "
+                f"still over threshold {self.compaction_threshold} after summarize "
+                f"and truncation. Likely the system prompt + first user + summary "
+                f"+ truncated tail still exceeds budget."
             )

agent/core/agent_loop.py CHANGED Viewed

@@ -516,19 +516,56 @@ def _friendly_error_message(error: Exception) -> str | None:
 async def _compact_and_notify(session: Session) -> None:
-    """Run compaction and send event if context was reduced."""
     cm = session.context_manager
     old_usage = cm.running_context_usage
     logger.debug(
         "Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
         old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
     )
-    await cm.compact(
-        model_name=session.config.model_name,
-        tool_specs=session.tool_router.get_tool_specs_for_llm(),
-        hf_token=session.hf_token,
-        session=session,
-    )
     new_usage = cm.running_context_usage
     if new_usage != old_usage:
         logger.warning(
@@ -1035,8 +1072,15 @@ class Handlers:
             if session.is_cancelled:
                 break
-            # Compact before calling the LLM if context is near the limit
             await _compact_and_notify(session)
             # Doom-loop detection: break out of repeated tool call patterns
             doom_prompt = check_for_doom_loop(session.context_manager.items)
@@ -1421,7 +1465,7 @@ class Handlers:
                 iteration += 1
             except ContextWindowExceededError:
-                # Force compact and retry this iteration
                 cm = session.context_manager
                 logger.warning(
                     "ContextWindowExceededError at iteration %d — forcing compaction "
@@ -1430,6 +1474,12 @@ class Handlers:
                 )
                 cm.running_context_usage = cm.model_max_tokens + 1
                 await _compact_and_notify(session)
                 continue
             except Exception as e:

 async def _compact_and_notify(session: Session) -> None:
+    """Run compaction and send event if context was reduced.
+    Catches ``CompactionFailedError`` and ends the session cleanly instead
+    of letting the caller retry. Pre-2026-05-04 the caller looped on
+    ContextWindowExceededError → compact → re-trigger, burning Bedrock
+    budget at ~$3/Opus retry while the session never reached the upload
+    path (so the cost was invisible in the dataset).
+    """
+    from agent.context_manager.manager import CompactionFailedError
     cm = session.context_manager
     old_usage = cm.running_context_usage
     logger.debug(
         "Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
         old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
     )
+    try:
+        await cm.compact(
+            model_name=session.config.model_name,
+            tool_specs=session.tool_router.get_tool_specs_for_llm(),
+            hf_token=session.hf_token,
+            session=session,
+        )
+    except CompactionFailedError as e:
+        logger.error(
+            "Compaction failed for session %s: %s — terminating session",
+            session.session_id, e,
+        )
+        # Persist the failure event so the dataset has a record of WHY this
+        # session ended (and the cost it incurred up to that point) even if
+        # save_and_upload_detached has issues downstream.
+        await session.send_event(Event(
+            event_type="session_terminated",
+            data={
+                "reason": "compaction_failed",
+                "context_usage": cm.running_context_usage,
+                "context_threshold": cm.compaction_threshold,
+                "error": str(e)[:300],
+                "user_message": (
+                    "Your conversation has grown too large to continue. "
+                    "The work you've done is saved — start a new session to keep going."
+                ),
+            },
+        ))
+        # Stop the agent loop; the finally in _run_session will fire
+        # cleanup_sandbox + save_trajectory so the dataset captures
+        # everything that did happen.
+        session.is_running = False
+        return
     new_usage = cm.running_context_usage
     if new_usage != old_usage:
         logger.warning(
             if session.is_cancelled:
                 break
+            # Compact before calling the LLM if context is near the limit.
+            # When _compact_and_notify catches CompactionFailedError it sets
+            # session.is_running = False; we MUST exit the loop here, otherwise
+            # the LLM call below fires with an over-threshold context, hits
+            # ContextWindowExceededError, and we end up looping again on the
+            # except path — exactly the bug this PR is supposed to fix.
             await _compact_and_notify(session)
+            if not session.is_running:
+                break
             # Doom-loop detection: break out of repeated tool call patterns
             doom_prompt = check_for_doom_loop(session.context_manager.items)
                 iteration += 1
             except ContextWindowExceededError:
+                # Force compact and retry this iteration.
                 cm = session.context_manager
                 logger.warning(
                     "ContextWindowExceededError at iteration %d — forcing compaction "
                 )
                 cm.running_context_usage = cm.model_max_tokens + 1
                 await _compact_and_notify(session)
+                # Same guard as the top of the loop: if compaction couldn't
+                # bring us under threshold, _compact_and_notify has already
+                # emitted session_terminated and set is_running=False. Continue
+                # would just re-call the LLM with the same too-big context.
+                if not session.is_running:
+                    break
                 continue
             except Exception as e:

agent/tools/sandbox_client.py CHANGED Viewed

@@ -65,6 +65,7 @@ MAX_TIMEOUT = 1200
 WAIT_TIMEOUT = 600
 WAIT_INTERVAL = 5
 API_WAIT_TIMEOUT = 180
 def _is_transient_space_visibility_error(error: Exception) -> bool:
@@ -75,6 +76,59 @@ def _is_transient_space_visibility_error(error: Exception) -> bool:
     message = str(error)
     return "Repository Not Found" in message or "404 Client Error" in message
 _DOCKERFILE = """\
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
@@ -627,10 +681,13 @@ class Sandbox:
         # Some template duplicates can initially inherit the template hardware.
         # Explicitly request the target tier so automatic CPU sandboxes never
         # silently come up on GPU hardware.
-        api.request_space_hardware(
             space_id,
             hardware=hardware,
             sleep_time=sleep_time,
         )
         _log(f"Requested hardware: {hardware}")

 WAIT_TIMEOUT = 600
 WAIT_INTERVAL = 5
 API_WAIT_TIMEOUT = 180
+HARDWARE_REQUEST_TIMEOUT = 60
 def _is_transient_space_visibility_error(error: Exception) -> bool:
     message = str(error)
     return "Repository Not Found" in message or "404 Client Error" in message
+def _is_transient_space_management_error(error: Exception) -> bool:
+    """Return True when a just-created private Space is not manageable yet.
+    An error counts as transient when ``error.response.status_code`` is 401
+    or 404, or — when no such status is available or it differs — when
+    ``str(error)`` contains one of the known "not found"/"401"/"404" phrases.
+    """
+    # ``response`` may be absent (non-HTTP errors); getattr keeps this safe.
+    response = getattr(error, "response", None)
+    if getattr(response, "status_code", None) in {401, 404}:
+        return True
+    # Fall back to message matching for errors that carry no response object.
+    message = str(error)
+    return (
+        "Repository Not Found" in message
+        or "401 Client Error" in message
+        or "404 Client Error" in message
+    )
+def _request_space_hardware_with_retry(
+    api: HfApi,
+    space_id: str,
+    *,
+    hardware: str,
+    sleep_time: int | None,
+    log: Callable[[str], object],
+    check_cancel: Callable[[], object],
+) -> None:
+    """Request hardware, retrying while Hub permissions propagate for a new Space."""
+    deadline = time.time() + HARDWARE_REQUEST_TIMEOUT
+    attempt = 0
+    while True:
+        check_cancel()
+        try:
+            api.request_space_hardware(
+                space_id,
+                hardware=hardware,
+                sleep_time=sleep_time,
+            )
+            return
+        except Exception as e:
+            if not _is_transient_space_management_error(e):
+                raise
+            remaining = deadline - time.time()
+            if remaining <= 0:
+                raise
+            attempt += 1
+            status_code = getattr(getattr(e, "response", None), "status_code", None)
+            status = f"HTTP {status_code}" if status_code else type(e).__name__
+            log(
+                f"  Hardware request not accepted yet ({status}); "
+                f"retrying ({attempt})..."
+            )
+            time.sleep(min(WAIT_INTERVAL, remaining))
 _DOCKERFILE = """\
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
         # Some template duplicates can initially inherit the template hardware.
         # Explicitly request the target tier so automatic CPU sandboxes never
         # silently come up on GPU hardware.
+        _request_space_hardware_with_retry(
+            api,
             space_id,
             hardware=hardware,
             sleep_time=sleep_time,
+            log=_log,
+            check_cancel=_check_cancel,
         )
         _log(f"Requested hardware: {hardware}")

agent/tools/sandbox_tool.py CHANGED Viewed

@@ -120,6 +120,49 @@ async def _seed_trackio_dashboard_safe(session: Any, space_id: str) -> None:
         _log(f"trackio dashboard seed failed: {e}")
 # ── Tool name mapping (short agent names → Sandbox client names) ──────
@@ -313,6 +356,7 @@ async def _create_sandbox_locked(
     session.sandbox = sb
     session.sandbox_hardware = hardware
     session.sandbox_preload_error = None
     # Telemetry: sandbox creation (infra consumption signal)
     from agent.core import telemetry
@@ -448,28 +492,38 @@ async def teardown_session_sandbox(session: Any) -> None:
     session.sandbox = None
     session.sandbox_hardware = None
-    if not (sandbox and getattr(sandbox, "_owns_space", False)):
         return
-    space_id = getattr(sandbox, "space_id", None)
-    last_err: Exception | None = None
-    for attempt in range(3):
-        try:
-            logger.info("Deleting sandbox %s (attempt %s/3)...", space_id, attempt + 1)
-            await asyncio.to_thread(sandbox.delete)
-            from agent.core import telemetry
-            await telemetry.record_sandbox_destroy(session, sandbox)
             return
-        except Exception as e:
-            last_err = e
-            if attempt < 2:
-                await asyncio.sleep(2 ** attempt)
-    logger.error(
-        "Failed to delete sandbox %s after 3 attempts: %s. "
-        "Orphan — sweep script will pick it up.",
-        space_id,
-        last_err,
-    )
 # ── sandbox_create tool ──────────────────────────────────────────────

         _log(f"trackio dashboard seed failed: {e}")
+async def _update_persisted_sandbox_fields(session: Any, **fields: Any) -> None:
+    """Best-effort update of sandbox metadata on the durable session record.
+    Forwards ``fields`` to ``session.persistence_store.update_session_fields``
+    keyed by ``session.session_id``. Silently does nothing when the session
+    has no store, no session id, or the store lacks that method. Exceptions
+    from the store are logged as warnings and not re-raised.
+    """
+    store = getattr(session, "persistence_store", None)
+    session_id = getattr(session, "session_id", None)
+    if not (store and session_id and hasattr(store, "update_session_fields")):
+        return
+    try:
+        await store.update_session_fields(session_id, **fields)
+    except Exception as e:
+        # Persistence is advisory here; a failed write must not break the caller.
+        logger.warning("Failed to persist sandbox metadata for %s: %s", session_id, e)
+async def _persist_active_sandbox(
+    session: Any,
+    sandbox: Sandbox,
+    *,
+    hardware: str,
+) -> None:
+    """Record the session's sandbox as active on the durable session record.
+    Does nothing when ``sandbox`` has no ``space_id``. Otherwise writes the
+    space id, hardware, owner, a UTC creation timestamp taken at call time,
+    and ``sandbox_status="active"`` via ``_update_persisted_sandbox_fields``.
+    """
+    space_id = getattr(sandbox, "space_id", None)
+    if not space_id:
+        return
+    # Owner is the part before the first "/" of the space id; None if no "/".
+    owner = space_id.split("/", 1)[0] if "/" in space_id else None
+    await _update_persisted_sandbox_fields(
+        session,
+        sandbox_space_id=space_id,
+        sandbox_hardware=hardware,
+        sandbox_owner=owner,
+        sandbox_created_at=datetime.now(timezone.utc),
+        sandbox_status="active",
+    )
+async def _clear_persisted_sandbox(session: Any) -> None:
+    """Null the persisted sandbox fields and mark the sandbox as destroyed.
+    Sets space id, hardware, owner and creation time to None and
+    ``sandbox_status`` to ``"destroyed"`` via
+    ``_update_persisted_sandbox_fields``.
+    """
+    await _update_persisted_sandbox_fields(
+        session,
+        sandbox_space_id=None,
+        sandbox_hardware=None,
+        sandbox_owner=None,
+        sandbox_created_at=None,
+        sandbox_status="destroyed",
+    )
 # ── Tool name mapping (short agent names → Sandbox client names) ──────
     session.sandbox = sb
     session.sandbox_hardware = hardware
     session.sandbox_preload_error = None
+    await _persist_active_sandbox(session, sb, hardware=hardware)
     # Telemetry: sandbox creation (infra consumption signal)
     from agent.core import telemetry
     session.sandbox = None
     session.sandbox_hardware = None
+    if not sandbox:
         return
+    try:
+        if not getattr(sandbox, "_owns_space", False):
             return
+        space_id = getattr(sandbox, "space_id", None)
+        last_err: Exception | None = None
+        for attempt in range(3):
+            try:
+                logger.info(
+                    "Deleting sandbox %s (attempt %s/3)...",
+                    space_id,
+                    attempt + 1,
+                )
+                await asyncio.to_thread(sandbox.delete)
+                from agent.core import telemetry
+                await telemetry.record_sandbox_destroy(session, sandbox)
+                return
+            except Exception as e:
+                last_err = e
+                if attempt < 2:
+                    await asyncio.sleep(2 ** attempt)
+        logger.error(
+            "Failed to delete sandbox %s after 3 attempts: %s. "
+            "Orphan — sweep script will pick it up.",
+            space_id,
+            last_err,
+        )
+    finally:
+        await _clear_persisted_sandbox(session)
 # ── sandbox_create tool ──────────────────────────────────────────────

backend/routes/agent.py CHANGED Viewed

@@ -213,6 +213,7 @@ async def _check_session_access(
     session_id: str,
     user: dict[str, Any],
     request: Request | None = None,
 ) -> AgentSession:
     """Verify and lazily load the user's session. Raises 403 or 404."""
     hf_token = resolve_hf_request_token(request) if request is not None else user.get("hf_token")
@@ -221,6 +222,7 @@ async def _check_session_access(
         user["user_id"],
         hf_token=hf_token,
         hf_username=user.get("username"),
     )
     if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
@@ -605,7 +607,7 @@ async def teardown_session_sandbox(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Best-effort sandbox teardown that preserves durable chat history."""
-    await _check_session_access(session_id, user)
     task = asyncio.create_task(session_manager.teardown_sandbox(session_id))
     _background_teardown_tasks.add(task)
     task.add_done_callback(_background_teardown_tasks.discard)
@@ -617,7 +619,7 @@ async def delete_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Delete a session. Only accessible by the session owner."""
-    await _check_session_access(session_id, user)
     success = await session_manager.delete_session(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found")

     session_id: str,
     user: dict[str, Any],
     request: Request | None = None,
+    preload_sandbox: bool = True,
 ) -> AgentSession:
     """Verify and lazily load the user's session. Raises 403 or 404."""
     hf_token = resolve_hf_request_token(request) if request is not None else user.get("hf_token")
         user["user_id"],
         hf_token=hf_token,
         hf_username=user.get("username"),
+        preload_sandbox=preload_sandbox,
     )
     if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Best-effort sandbox teardown that preserves durable chat history."""
+    await _check_session_access(session_id, user, preload_sandbox=False)
     task = asyncio.create_task(session_manager.teardown_sandbox(session_id))
     _background_teardown_tasks.add(task)
     task.add_done_callback(_background_teardown_tasks.discard)
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Delete a session. Only accessible by the session owner."""
+    await _check_session_access(session_id, user, preload_sandbox=False)
     success = await session_manager.delete_session(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found")

backend/session_manager.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import asyncio
 import json
 import logging
 import uuid
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -117,6 +118,8 @@ class SessionCapacityError(Exception):
 MAX_SESSIONS: int = 200
 MAX_SESSIONS_PER_USER: int = 10
 DEFAULT_YOLO_COST_CAP_USD: float = 5.0
 class SessionManager:
@@ -137,6 +140,7 @@ class SessionManager:
     async def close(self) -> None:
         """Flush and close shared background resources."""
         await self.messaging_gateway.close()
         if self.persistence_store is not None:
             await self.persistence_store.close()
@@ -372,6 +376,89 @@ class SessionManager:
             agent_session.hf_username = hf_username
             agent_session.session.hf_username = hf_username
     async def persist_session_snapshot(
         self,
         agent_session: AgentSession,
@@ -427,6 +514,7 @@ class SessionManager:
         user_id: str,
         hf_token: str | None = None,
         hf_username: str | None = None,
     ) -> AgentSession | None:
         """Return a live runtime session, lazily restoring it from Mongo."""
         async with self._lock:
@@ -463,6 +551,12 @@ class SessionManager:
         if user_id != "dev" and owner != "dev" and owner != user_id:
             return None
         from litellm import Message
         model = meta.get("model") or self.config.model_name
@@ -533,7 +627,8 @@ class SessionManager:
                 hf_username=hf_username,
             )
             return started
-        self._start_cpu_sandbox_preload(agent_session)
         logger.info("Restored session %s for user %s", session_id, owner or user_id)
         return agent_session
@@ -614,8 +709,8 @@ class SessionManager:
             event_queue=event_queue,
             tool_router=tool_router,
         )
-        self._start_cpu_sandbox_preload(agent_session)
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
         if is_pro is not None and user_id and user_id != "dev":
             await self._track_pro_status(agent_session, is_pro=is_pro)
@@ -725,6 +820,42 @@ class SessionManager:
         await teardown_session_sandbox(session)
     async def _run_session(
         self,
         session_id: str,

 import asyncio
 import json
 import logging
+import os
 import uuid
 from dataclasses import dataclass, field
 from datetime import datetime
 MAX_SESSIONS: int = 200
 MAX_SESSIONS_PER_USER: int = 10
 DEFAULT_YOLO_COST_CAP_USD: float = 5.0
+SANDBOX_SHUTDOWN_CLEANUP_CONCURRENCY: int = 10
+SANDBOX_SHUTDOWN_CLEANUP_TIMEOUT_S: float = 60.0
 class SessionManager:
     async def close(self) -> None:
         """Flush and close shared background resources."""
+        await self._cleanup_all_sandboxes_on_close()
         await self.messaging_gateway.close()
         if self.persistence_store is not None:
             await self.persistence_store.close()
             agent_session.hf_username = hf_username
             agent_session.session.hf_username = hf_username
+    async def _clear_persisted_sandbox_metadata(self, session_id: str) -> None:
+        try:
+            await self._store().update_session_fields(
+                session_id,
+                sandbox_space_id=None,
+                sandbox_hardware=None,
+                sandbox_owner=None,
+                sandbox_created_at=None,
+                sandbox_status="destroyed",
+            )
+        except Exception as e:
+            logger.warning("Failed to clear sandbox metadata for %s: %s", session_id, e)
+    async def _cleanup_persisted_sandbox(
+        self,
+        session_id: str,
+        metadata: dict[str, Any],
+        *,
+        hf_token: str | None,
+    ) -> None:
+        """Delete a sandbox recorded by a previous backend process, if any."""
+        space_id = metadata.get("sandbox_space_id")
+        if not isinstance(space_id, str) or not space_id:
+            return
+        if metadata.get("sandbox_status") == "destroyed":
+            return
+        tokens: list[tuple[str, str]] = []
+        seen: set[str] = set()
+        for label, token in (
+            ("user", hf_token),
+            ("admin", os.environ.get("HF_ADMIN_TOKEN")),
+        ):
+            if token and token not in seen:
+                tokens.append((label, token))
+                seen.add(token)
+        if not tokens:
+            logger.warning(
+                "Cannot clean persisted sandbox %s for session %s: no HF token available",
+                space_id,
+                session_id,
+            )
+            return
+        last_err: Exception | None = None
+        for label, token in tokens:
+            try:
+                from huggingface_hub import HfApi
+                api = HfApi(token=token)
+                await asyncio.to_thread(
+                    api.delete_repo,
+                    repo_id=space_id,
+                    repo_type="space",
+                )
+                logger.info(
+                    "Deleted persisted sandbox %s for session %s with %s token",
+                    space_id,
+                    session_id,
+                    label,
+                )
+                await self._clear_persisted_sandbox_metadata(session_id)
+                return
+            except Exception as e:
+                status_code = getattr(getattr(e, "response", None), "status_code", None)
+                if status_code == 404:
+                    logger.info(
+                        "Persisted sandbox %s for session %s is already gone",
+                        space_id,
+                        session_id,
+                    )
+                    await self._clear_persisted_sandbox_metadata(session_id)
+                    return
+                last_err = e
+        logger.warning(
+            "Failed to delete persisted sandbox %s for session %s: %s",
+            space_id,
+            session_id,
+            last_err,
+        )
     async def persist_session_snapshot(
         self,
         agent_session: AgentSession,
         user_id: str,
         hf_token: str | None = None,
         hf_username: str | None = None,
+        preload_sandbox: bool = True,
     ) -> AgentSession | None:
         """Return a live runtime session, lazily restoring it from Mongo."""
         async with self._lock:
         if user_id != "dev" and owner != "dev" and owner != user_id:
             return None
+        await self._cleanup_persisted_sandbox(
+            session_id,
+            meta,
+            hf_token=hf_token,
+        )
         from litellm import Message
         model = meta.get("model") or self.config.model_name
                 hf_username=hf_username,
             )
             return started
+        if preload_sandbox:
+            self._start_cpu_sandbox_preload(agent_session)
         logger.info("Restored session %s for user %s", session_id, owner or user_id)
         return agent_session
             event_queue=event_queue,
             tool_router=tool_router,
         )
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
+        self._start_cpu_sandbox_preload(agent_session)
         if is_pro is not None and user_id and user_id != "dev":
             await self._track_pro_status(agent_session, is_pro=is_pro)
         await teardown_session_sandbox(session)
+    async def _cleanup_all_sandboxes_on_close(self) -> None:
+        """Best-effort sandbox cleanup for graceful backend shutdown."""
+        async with self._lock:
+            agent_sessions = list(self.sessions.values())
+        if not agent_sessions:
+            return
+        semaphore = asyncio.Semaphore(SANDBOX_SHUTDOWN_CLEANUP_CONCURRENCY)
+        async def _cleanup_one(agent_session: AgentSession) -> None:
+            async with semaphore:
+                try:
+                    await self._cleanup_sandbox(agent_session.session)
+                except Exception as e:
+                    logger.warning(
+                        "Shutdown sandbox cleanup failed for %s: %s",
+                        agent_session.session_id,
+                        e,
+                    )
+        tasks = [
+            asyncio.create_task(_cleanup_one(agent_session))
+            for agent_session in agent_sessions
+        ]
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks, return_exceptions=True),
+                timeout=SANDBOX_SHUTDOWN_CLEANUP_TIMEOUT_S,
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                "Timed out after %.0fs cleaning up sandboxes on shutdown; "
+                "orphan sweeper will handle any stragglers",
+                SANDBOX_SHUTDOWN_CLEANUP_TIMEOUT_S,
+            )
     async def _run_session(
         self,
         session_id: str,

scripts/sweep_orphan_sandboxes.py CHANGED Viewed

@@ -49,17 +49,12 @@ something up to kill it.
 - Logs every action to stdout in JSON Lines for downstream auditing.
 ================================================================================
- Cron suggestion
 ================================================================================
-GitHub Actions, daily at 04:00 UTC:
-    schedule:
-      - cron: "0 4 * * *"
-    env:
-      HF_ADMIN_TOKEN: ${{ secrets.HF_ADMIN_TOKEN }}
-    steps:
-      - run: python scripts/sweep_orphan_sandboxes.py --apply --max-age-days 7
 """
 import argparse

 - Logs every action to stdout in JSON Lines for downstream auditing.
 ================================================================================
+ Manual usage
 ================================================================================
+Run manually with an admin token when a backstop cleanup is needed:
+    HF_ADMIN_TOKEN=... python scripts/sweep_orphan_sandboxes.py --apply --max-age-days 7
 """
 import argparse

tests/unit/test_agent_model_gating.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Tests for gated model handling in backend/routes/agent.py."""
 import sys
 from pathlib import Path
 from types import SimpleNamespace
@@ -253,3 +254,58 @@ async def test_set_session_yolo_calls_manager_with_cap_presence(monkeypatch):
             },
         )
     ]

 """Tests for gated model handling in backend/routes/agent.py."""
+import asyncio
 import sys
 from pathlib import Path
 from types import SimpleNamespace
             },
         )
     ]
+@pytest.mark.asyncio
+async def test_delete_session_access_check_skips_sandbox_preload(monkeypatch):
+    ensure_calls = []
+    delete_calls = []
+    async def fake_ensure_session_loaded(session_id, user_id, **kwargs):
+        ensure_calls.append((session_id, user_id, kwargs))
+        return SimpleNamespace(user_id=user_id)
+    async def fake_delete_session(session_id):
+        delete_calls.append(session_id)
+        return True
+    monkeypatch.setattr(
+        agent.session_manager,
+        "ensure_session_loaded",
+        fake_ensure_session_loaded,
+    )
+    monkeypatch.setattr(agent.session_manager, "delete_session", fake_delete_session)
+    response = await agent.delete_session("s1", {"user_id": "u1"})
+    assert response == {"status": "deleted", "session_id": "s1"}
+    assert delete_calls == ["s1"]
+    assert ensure_calls[0][2]["preload_sandbox"] is False
+@pytest.mark.asyncio
+async def test_teardown_session_access_check_skips_sandbox_preload(monkeypatch):
+    ensure_calls = []
+    teardown_calls = []
+    async def fake_ensure_session_loaded(session_id, user_id, **kwargs):
+        ensure_calls.append((session_id, user_id, kwargs))
+        return SimpleNamespace(user_id=user_id)
+    async def fake_teardown_sandbox(session_id):
+        teardown_calls.append(session_id)
+        return True
+    monkeypatch.setattr(
+        agent.session_manager,
+        "ensure_session_loaded",
+        fake_ensure_session_loaded,
+    )
+    monkeypatch.setattr(agent.session_manager, "teardown_sandbox", fake_teardown_sandbox)
+    response = await agent.teardown_session_sandbox("s1", {"user_id": "u1"})
+    await asyncio.sleep(0)
+    assert response == {"status": "teardown_requested", "session_id": "s1"}
+    assert teardown_calls == ["s1"]
+    assert ensure_calls[0][2]["preload_sandbox"] is False

tests/unit/test_compaction_loop_break.py ADDED Viewed

	@@ -0,0 +1,360 @@

+"""Regression tests for the 2026-05-03 infinite-compaction-loop bug.
+Pod logs from prod-114 showed sessions stuck retrying compaction every
+few seconds because a single oversized tool output in the untouched tail
+kept the post-compact context above the 90% threshold:
+    Context compacted: 200001 -> 215566 tokens
+    Context compacted: 215566 -> 215572 tokens
+    ContextWindowExceededError — forcing compaction
+    ... (continues for 5+ minutes)
+These tests cover three fixes:
+1. ``_truncate_oversized`` replaces oversized message content with a
+   placeholder and preserves all extended-thinking metadata fields.
+2. ``compact()`` raises ``CompactionFailedError`` when the post-compact
+   context is still over threshold.
+3. ``_compact_and_notify`` catches the error, sets ``session.is_running
+   = False``, and emits a ``session_terminated`` event so callers can
+   exit the agent loop.
+The P0 caught in review of PR #213 (the loop didn't actually exit on
+``is_running = False``) would have been caught by an end-to-end
+behavioral test of fix 3 above — that gap is closed by the
+``test_compact_and_notify_terminates_session_on_failure`` case below.
+"""
+from __future__ import annotations
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+from litellm import Message
+from agent.context_manager.manager import (
+    CompactionFailedError,
+    ContextManager,
+    _MAX_TOKENS_PER_MESSAGE,
+)
+# ── helpers ────────────────────────────────────────────────────────────
+def _make_cm(
+    *,
+    model_max_tokens: int = 100_000,
+    compact_size: int = 1_000,
+    untouched_messages: int = 5,
+) -> ContextManager:
+    cm = ContextManager.__new__(ContextManager)
+    cm.system_prompt = "system"
+    cm.model_max_tokens = model_max_tokens
+    cm.compact_size = compact_size
+    cm.running_context_usage = 0
+    cm.untouched_messages = untouched_messages
+    cm.items = [Message(role="system", content="system")]
+    cm.on_message_added = None
+    return cm
+def _msg(role: str, content: str | None = "x", **extra) -> Message:
+    return Message(role=role, content=content, **extra)
+# ── _truncate_oversized ────────────────────────────────────────────────
+def test_truncate_oversized_skips_messages_below_threshold():
+    cm = _make_cm()
+    msgs = [_msg("user", "small content")]
+    with patch("litellm.token_counter", return_value=100):
+        out = cm._truncate_oversized(msgs, "anthropic/claude-opus-4-6")
+    assert out == msgs  # unchanged
+def test_truncate_oversized_replaces_content_above_threshold():
+    cm = _make_cm()
+    big = "x" * (_MAX_TOKENS_PER_MESSAGE * 5)
+    msgs = [_msg("user", big)]
+    # token_counter returns the simulated big size for any message in this test
+    with patch("litellm.token_counter", return_value=_MAX_TOKENS_PER_MESSAGE * 2):
+        out = cm._truncate_oversized(msgs, "anthropic/claude-opus-4-6")
+    assert len(out) == 1
+    assert out[0].content != big
+    assert "[truncated for compaction" in out[0].content
+    assert str(_MAX_TOKENS_PER_MESSAGE * 2) in out[0].content
+def test_truncate_oversized_preserves_thinking_blocks():
+    """Anthropic extended-thinking models reject the next request with
+    ``Invalid signature in thinking block`` if a prior assistant message
+    drops thinking_blocks. Truncation must keep this metadata.
+    """
+    cm = _make_cm()
+    big = "x" * (_MAX_TOKENS_PER_MESSAGE * 5)
+    thinking = [{"type": "thinking", "thinking": "...", "signature": "abc123"}]
+    msg = Message(role="assistant", content=big)
+    msg.thinking_blocks = thinking
+    msg.reasoning_content = "deep thought"
+    with patch("litellm.token_counter", return_value=_MAX_TOKENS_PER_MESSAGE * 2):
+        out = cm._truncate_oversized([msg], "anthropic/claude-opus-4-6")
+    assert getattr(out[0], "thinking_blocks", None) == thinking
+    assert getattr(out[0], "reasoning_content", None) == "deep thought"
+def test_truncate_oversized_never_touches_system_message():
+    """The system prompt is the agent's instructions — must never be truncated.
+    Caught by the integration smoke test on PR #213: when items has fewer than
+    ``untouched_messages`` entries, the slice math in ``compact()`` can let
+    ``items[0]`` (the system message) leak into the ``recent_messages`` list
+    that gets passed to ``_truncate_oversized``. The function must guard
+    explicitly against this.
+    """
+    cm = _make_cm()
+    huge_system = "x" * (_MAX_TOKENS_PER_MESSAGE * 5)
+    msgs = [_msg("system", huge_system)]
+    with patch("litellm.token_counter", return_value=_MAX_TOKENS_PER_MESSAGE * 2):
+        out = cm._truncate_oversized(msgs, "anthropic/claude-opus-4-6")
+    assert out[0].content == huge_system, "system message must never be truncated"
+def test_truncate_oversized_resilient_to_token_counter_failure():
+    """token_counter occasionally raises on edge-case content. A blip there
+    must NOT drop the message — better to leave it and let compaction
+    handle it (or fail with CompactionFailedError) than to lose data.
+    """
+    cm = _make_cm()
+    msgs = [_msg("user", "anything")]
+    with patch("litellm.token_counter", side_effect=Exception("counter blew up")):
+        out = cm._truncate_oversized(msgs, "anthropic/claude-opus-4-6")
+    assert out == msgs
+# ── compact() raises CompactionFailedError ─────────────────────────────
+@pytest.mark.asyncio
+async def test_compact_raises_when_post_compact_still_over_threshold():
+    """The whole point of the new behavior: don't loop on a useless
+    compaction call. Raise so the caller can terminate the session.
+    """
+    cm = _make_cm(model_max_tokens=100_000)
+    # Build a context that's "over threshold" from the start
+    cm.items = [
+        Message(role="system", content="system"),
+        Message(role="user", content="task"),
+        Message(role="assistant", content="x" * 1000),
+        Message(role="user", content="follow-up 1"),
+        Message(role="assistant", content="reply 1"),
+        Message(role="user", content="follow-up 2"),
+        Message(role="assistant", content="reply 2"),
+    ]
+    cm.running_context_usage = 95_000  # over threshold (90% of 100k = 90k)
+    # Mock summarize_messages to return a tiny summary; mock _recompute_usage
+    # to keep the running_context_usage above threshold so compact() raises.
+    async def fake_summarize(*args, **kwargs):
+        return ("summary", 10)
+    def fake_recompute(self, model_name):
+        # Simulate post-compact still over threshold
+        self.running_context_usage = 95_000
+    with (
+        patch("agent.context_manager.manager.summarize_messages", side_effect=fake_summarize),
+        patch.object(ContextManager, "_recompute_usage", fake_recompute),
+        # Avoid token_counter calls in _truncate_oversized
+        patch("litellm.token_counter", return_value=100),
+    ):
+        with pytest.raises(CompactionFailedError):
+            await cm.compact(
+                model_name="anthropic/claude-opus-4-6",
+                tool_specs=None,
+                hf_token=None,
+                session=None,
+            )
+@pytest.mark.asyncio
+async def test_compact_does_not_duplicate_system_when_idx_is_zero():
+    """Regression for the second P0 caught by bot review on PR #213.
+    When ``len(items) == untouched_messages`` (the canonical 5-message
+    early-compaction case: system + user-task + giant-tool-output +
+    user-followup + assistant-reply), ``idx`` initialises to 0 and the
+    walk-back ``while idx > 1`` loop is a no-op. Without an explicit
+    clamp ``if idx < 1: idx = 1``, ``recent_messages = items[0:]``
+    starts at the system message, and the rebuild duplicates system +
+    first-user. Anthropic API rejects two system messages.
+    """
+    cm = _make_cm(model_max_tokens=100_000, untouched_messages=5)
+    cm.items = [
+        Message(role="system", content="system"),
+        Message(role="user", content="task"),
+        Message(role="assistant", content="ok"),  # would be the only
+                                                   # message_to_summarize but the
+                                                   # idx bug pulls it into recent
+        Message(role="user", content="followup"),
+        Message(role="assistant", content="reply"),
+    ]  # exactly 5 = untouched_messages, so idx initialises to 0
+    cm.running_context_usage = 95_000
+    async def fake_summarize(*args, **kwargs):
+        return ("summary", 10)
+    def fake_recompute(self, model_name):
+        self.running_context_usage = 5_000
+    with (
+        patch("agent.context_manager.manager.summarize_messages", side_effect=fake_summarize),
+        patch.object(ContextManager, "_recompute_usage", fake_recompute),
+        patch("litellm.token_counter", return_value=100),
+    ):
+        await cm.compact(
+            model_name="anthropic/claude-opus-4-6",
+            tool_specs=None,
+            hf_token=None,
+            session=None,
+        )
+    # Critical assertion: only ONE system message in items
+    system_count = sum(1 for m in cm.items if m.role == "system")
+    assert system_count == 1, (
+        f"Expected exactly 1 system message, found {system_count}. "
+        f"Roles: {[m.role for m in cm.items]}"
+    )
+    # And the first-user "task" message must also appear exactly once.
+    # Bot review on PR #213 caught a follow-up bug: clamping idx=1
+    # excludes the system but still overlaps with first_user_idx (also 1),
+    # so first_user_msg ends up in BOTH head and recent_messages →
+    # duplicate user message → Anthropic 400 (two consecutive user roles).
+    task_count = sum(
+        1 for m in cm.items
+        if m.role == "user" and (m.content or "") == "task"
+    )
+    assert task_count == 1, (
+        f"Expected exactly 1 'task' user message, found {task_count}. "
+        f"Roles+content: {[(m.role, (m.content or '')[:20]) for m in cm.items]}"
+    )
+    # Defense in depth: no two consecutive same-role messages (Anthropic
+    # API contract). System counts separately.
+    non_system = [m for m in cm.items if m.role != "system"]
+    for i in range(1, len(non_system)):
+        assert non_system[i].role != non_system[i-1].role, (
+            f"Two consecutive {non_system[i].role} messages at non-system "
+            f"position {i-1},{i} — Anthropic API rejects this. "
+            f"Roles: {[m.role for m in cm.items]}"
+        )
+@pytest.mark.asyncio
+async def test_compact_succeeds_when_post_compact_under_threshold():
+    """Happy path: when compaction does its job, no exception raised."""
+    cm = _make_cm(model_max_tokens=100_000)
+    cm.items = [
+        Message(role="system", content="system"),
+        Message(role="user", content="task"),
+        Message(role="assistant", content="x" * 1000),
+        Message(role="user", content="follow-up"),
+        Message(role="assistant", content="reply"),
+        Message(role="user", content="follow-up 2"),
+        Message(role="assistant", content="reply 2"),
+    ]
+    cm.running_context_usage = 95_000
+    async def fake_summarize(*args, **kwargs):
+        return ("summary", 10)
+    def fake_recompute(self, model_name):
+        self.running_context_usage = 5_000  # well under threshold
+    with (
+        patch("agent.context_manager.manager.summarize_messages", side_effect=fake_summarize),
+        patch.object(ContextManager, "_recompute_usage", fake_recompute),
+        patch("litellm.token_counter", return_value=100),
+    ):
+        await cm.compact(
+            model_name="anthropic/claude-opus-4-6",
+            tool_specs=None,
+            hf_token=None,
+            session=None,
+        )
+    assert cm.running_context_usage == 5_000
+# ── _compact_and_notify behavior on CompactionFailedError ──────────────
+@pytest.mark.asyncio
+async def test_compact_and_notify_terminates_session_on_failure():
+    """PR #213's P0 bug class: setting ``is_running = False`` is
+    only effective if the agent loop checks it. This test asserts the
+    flag IS set AND a ``session_terminated`` event is emitted, so a
+    follow-up assertion in the agent loop test catches the loop-exit.
+    """
+    from agent.core.agent_loop import _compact_and_notify
+    session = MagicMock()
+    session.session_id = "sess-123"
+    session.is_running = True
+    session.config.model_name = "anthropic/claude-opus-4-6"
+    session.hf_token = None
+    session.tool_router.get_tool_specs_for_llm.return_value = []
+    session.send_event = AsyncMock()
+    cm = MagicMock()
+    cm.running_context_usage = 95_000
+    cm.compaction_threshold = 90_000
+    cm.model_max_tokens = 100_000
+    cm.items = []
+    cm.needs_compaction = True
+    cm.compact = AsyncMock(side_effect=CompactionFailedError("ineffective"))
+    session.context_manager = cm
+    await _compact_and_notify(session)
+    assert session.is_running is False, (
+        "_compact_and_notify must set is_running=False so the agent loop "
+        "can exit. P0 caught by bot review on PR #213 was that the loop "
+        "didn't actually check this flag."
+    )
+    assert session.send_event.await_count == 1
+    event = session.send_event.await_args.args[0]
+    assert event.event_type == "session_terminated"
+    assert event.data["reason"] == "compaction_failed"
+    assert event.data["context_usage"] == 95_000
+@pytest.mark.asyncio
+async def test_compact_and_notify_passes_through_on_success():
+    """When compaction succeeds, no termination event, is_running stays True."""
+    from agent.core.agent_loop import _compact_and_notify
+    session = MagicMock()
+    session.session_id = "sess-456"
+    session.is_running = True
+    session.config.model_name = "anthropic/claude-opus-4-6"
+    session.hf_token = None
+    session.tool_router.get_tool_specs_for_llm.return_value = []
+    session.send_event = AsyncMock()
+    cm = MagicMock()
+    cm.running_context_usage = 5_000
+    cm.compaction_threshold = 90_000
+    cm.model_max_tokens = 100_000
+    cm.items = []
+    cm.needs_compaction = False
+    cm.compact = AsyncMock(return_value=None)  # success
+    session.context_manager = cm
+    # The mock's running_context_usage never changes across compact(), so
+    # old_usage == new_usage and the "compacted" event is also skipped
+    await _compact_and_notify(session)
+    assert session.is_running is True
+    # No session_terminated event emitted
+    for call in session.send_event.await_args_list:
+        ev = call.args[0]
+        assert ev.event_type != "session_terminated"

tests/unit/test_dangling_tool_calls.py CHANGED Viewed

@@ -67,15 +67,6 @@ def test_no_orphan_means_no_stub():
     assert tool_msgs[0].content == "ok"
-def test_add_message_records_message_timestamp():
-    cm = _make_cm()
-    msg = Message(role="user", content="hello")
-    cm.add_message(msg)
-    assert getattr(cm.items[-1], "timestamp", None)
 def test_multiple_dangling_tool_calls_in_one_assistant_message_are_all_patched():
     cm = _make_cm()
     cm.items.extend([

     assert tool_msgs[0].content == "ok"
 def test_multiple_dangling_tool_calls_in_one_assistant_message_are_all_patched():
     cm = _make_cm()
     cm.items.extend([

tests/unit/test_sandbox_private_spaces.py CHANGED Viewed

@@ -3,6 +3,8 @@ import threading
 import time
 from types import SimpleNamespace
 from agent.core import telemetry
 from agent.tools import sandbox_client, sandbox_tool
 from agent.tools.sandbox_client import Sandbox
@@ -91,6 +93,101 @@ def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
     assert runtime_calls == 2
 def test_sandbox_tool_forces_private_spaces(monkeypatch):
     captured_kwargs = {}
@@ -148,6 +245,7 @@ def test_orphan_sweep_preserves_spaces_without_last_modified():
 def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     captured_kwargs = {}
     class FakeApi:
         def __init__(self, token=None):
@@ -158,14 +256,23 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     class FakeSession:
         def __init__(self):
             self.hf_token = "hf-token"
             self.sandbox = None
             self.event_queue = SimpleNamespace(put_nowait=lambda event: None)
             self._cancelled = asyncio.Event()
         async def send_event(self, event):
             pass
     def fake_create(**kwargs):
         captured_kwargs.update(kwargs)
         return SimpleNamespace(
@@ -192,6 +299,11 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     assert error is None
     assert sb is not None
     assert captured_kwargs["private"] is True
 def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
@@ -356,6 +468,7 @@ def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
 def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     deleted: list[str] = []
     async def fake_record_sandbox_destroy(*args, **kwargs):
         pass
@@ -369,6 +482,7 @@ def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
             await asyncio.sleep(0)
         session = SimpleNamespace(
             sandbox=SimpleNamespace(
                 space_id="alice/sandbox-12345678",
                 _owns_space=True,
@@ -377,17 +491,28 @@ def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
             sandbox_hardware="cpu-basic",
             sandbox_preload_task=asyncio.create_task(preload()),
             sandbox_preload_cancel_event=cancel_event,
         )
         await sandbox_tool.teardown_session_sandbox(session)
         return session, cancel_event
     session, cancel_event = asyncio.run(run())
     assert cancel_event.is_set()
     assert deleted == ["alice/sandbox-12345678"]
     assert session.sandbox is None
     assert session.sandbox_hardware is None
 def test_cancel_sandbox_preload_cancels_task_after_timeout(monkeypatch):

 import time
 from types import SimpleNamespace
+import pytest
 from agent.core import telemetry
 from agent.tools import sandbox_client, sandbox_tool
 from agent.tools.sandbox_client import Sandbox
     assert runtime_calls == 2
+def test_sandbox_client_retries_transient_hardware_401(monkeypatch):
+    hardware_calls = 0
+    logs: list[str] = []
+    class FakeResponse:
+        status_code = 401
+    class FakeHardware401(Exception):
+        response = FakeResponse()
+        def __str__(self):
+            return "401 Client Error: Repository Not Found"
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def duplicate_space(self, **kwargs):
+            pass
+        def request_space_hardware(self, space_id, hardware, sleep_time=None):
+            nonlocal hardware_calls
+            hardware_calls += 1
+            if hardware_calls == 1:
+                raise FakeHardware401()
+            return SimpleNamespace(stage="BUILDING", hardware=None)
+        def add_space_secret(self, *args, **kwargs):
+            pass
+        def get_space_runtime(self, space_id):
+            return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")
+    monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
+    monkeypatch.setattr(sandbox_client.time, "sleep", lambda seconds: None)
+    monkeypatch.setattr(
+        Sandbox,
+        "_setup_server",
+        staticmethod(lambda *args, **kwargs: None),
+    )
+    monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
+    sandbox = Sandbox.create(owner="alice", token="hf-token", log=logs.append)
+    assert sandbox.space_id.startswith("alice/sandbox-")
+    assert hardware_calls == 2
+    assert any("Hardware request not accepted yet (HTTP 401)" in log for log in logs)
+def test_sandbox_hardware_retry_reraises_after_timeout(monkeypatch):
+    calls = 0
+    logs: list[str] = []
+    sleeps: list[float] = []
+    class FakeResponse:
+        status_code = 401
+    class FakeHardware401(Exception):
+        response = FakeResponse()
+        def __str__(self):
+            return "401 Client Error: Repository Not Found"
+    first_error = FakeHardware401("first")
+    timeout_error = FakeHardware401("timeout")
+    class FakeApi:
+        def request_space_hardware(self, space_id, hardware, sleep_time=None):
+            nonlocal calls
+            calls += 1
+            if calls == 1:
+                raise first_error
+            raise timeout_error
+    timestamps = iter([100.0, 100.0, 161.0])
+    monkeypatch.setattr(sandbox_client.time, "time", lambda: next(timestamps))
+    monkeypatch.setattr(sandbox_client.time, "sleep", sleeps.append)
+    with pytest.raises(FakeHardware401) as excinfo:
+        sandbox_client._request_space_hardware_with_retry(
+            FakeApi(),
+            "alice/sandbox-12345678",
+            hardware="cpu-basic",
+            sleep_time=None,
+            log=logs.append,
+            check_cancel=lambda: None,
+        )
+    assert excinfo.value is timeout_error
+    assert calls == 2
+    assert sleeps == [sandbox_client.WAIT_INTERVAL]
+    assert len(logs) == 1
 def test_sandbox_tool_forces_private_spaces(monkeypatch):
     captured_kwargs = {}
 def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     captured_kwargs = {}
+    persisted: list[dict] = []
     class FakeApi:
         def __init__(self, token=None):
     class FakeSession:
         def __init__(self):
+            self.session_id = "s1"
             self.hf_token = "hf-token"
             self.sandbox = None
             self.event_queue = SimpleNamespace(put_nowait=lambda event: None)
             self._cancelled = asyncio.Event()
+            self.persistence_store = SimpleNamespace(
+                update_session_fields=lambda session_id, **fields: _record_metadata(
+                    session_id, fields
+                )
+            )
         async def send_event(self, event):
             pass
+    async def _record_metadata(session_id, fields):
+        persisted.append({"session_id": session_id, **fields})
     def fake_create(**kwargs):
         captured_kwargs.update(kwargs)
         return SimpleNamespace(
     assert error is None
     assert sb is not None
     assert captured_kwargs["private"] is True
+    assert persisted[-1]["session_id"] == "s1"
+    assert persisted[-1]["sandbox_space_id"] == "alice/sandbox-12345678"
+    assert persisted[-1]["sandbox_hardware"] == "cpu-basic"
+    assert persisted[-1]["sandbox_owner"] == "alice"
+    assert persisted[-1]["sandbox_status"] == "active"
 def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
 def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     deleted: list[str] = []
+    persisted: list[dict] = []
     async def fake_record_sandbox_destroy(*args, **kwargs):
         pass
             await asyncio.sleep(0)
         session = SimpleNamespace(
+            session_id="s1",
             sandbox=SimpleNamespace(
                 space_id="alice/sandbox-12345678",
                 _owns_space=True,
             sandbox_hardware="cpu-basic",
             sandbox_preload_task=asyncio.create_task(preload()),
             sandbox_preload_cancel_event=cancel_event,
+            persistence_store=SimpleNamespace(
+                update_session_fields=lambda session_id, **fields: _record_metadata(
+                    session_id, fields
+                )
+            ),
         )
         await sandbox_tool.teardown_session_sandbox(session)
         return session, cancel_event
+    async def _record_metadata(session_id, fields):
+        persisted.append({"session_id": session_id, **fields})
     session, cancel_event = asyncio.run(run())
     assert cancel_event.is_set()
     assert deleted == ["alice/sandbox-12345678"]
     assert session.sandbox is None
     assert session.sandbox_hardware is None
+    assert persisted[-1]["session_id"] == "s1"
+    assert persisted[-1]["sandbox_space_id"] is None
+    assert persisted[-1]["sandbox_status"] == "destroyed"
 def test_cancel_sandbox_preload_cancels_task_after_timeout(monkeypatch):

tests/unit/test_session_manager_persistence.py CHANGED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import asyncio
 import sys
 from datetime import datetime, UTC
 from pathlib import Path
 from types import SimpleNamespace
@@ -30,6 +31,10 @@ class FakeRuntimeSession:
         self.auto_approval_enabled = False
         self.auto_approval_cost_cap_usd = None
         self.auto_approval_estimated_spend_usd = 0.0
     def auto_approval_policy_summary(self):
         cap = self.auto_approval_cost_cap_usd
@@ -65,6 +70,7 @@ class RestoreStore(NoopSessionStore):
         self.messages = messages or []
         self.delay = delay
         self.load_calls = 0
     async def load_session(self, session_id: str, **_: Any) -> dict[str, Any] | None:
         self.load_calls += 1
@@ -75,6 +81,18 @@ class RestoreStore(NoopSessionStore):
         metadata.setdefault("_id", session_id)
         return {"metadata": metadata, "messages": self.messages}
 def _manager_with_store(store: NoopSessionStore) -> SessionManager:
     manager = object.__new__(SessionManager)
@@ -82,6 +100,7 @@ def _manager_with_store(store: NoopSessionStore) -> SessionManager:
     manager.sessions = {}
     manager._lock = asyncio.Lock()
     manager.persistence_store = store
     return manager
@@ -151,6 +170,87 @@ async def _cancel_runtime_tasks(manager: SessionManager) -> None:
         await asyncio.gather(*tasks, return_exceptions=True)
 @pytest.mark.asyncio
 async def test_existing_session_rejects_cross_user_token_overwrite():
     manager = _manager_with_store(NoopSessionStore())
@@ -253,6 +353,107 @@ async def test_lazy_restore_schedules_cpu_sandbox_preload():
         await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_lazy_restore_preserves_pending_approval_tool_calls():
     store = RestoreStore(

 import asyncio
 import sys
+import threading
 from datetime import datetime, UTC
 from pathlib import Path
 from types import SimpleNamespace
         self.auto_approval_enabled = False
         self.auto_approval_cost_cap_usd = None
         self.auto_approval_estimated_spend_usd = 0.0
+        self.sandbox = None
+        self.sandbox_hardware = None
+        self.sandbox_preload_task = None
+        self.sandbox_preload_cancel_event = None
     def auto_approval_policy_summary(self):
         cap = self.auto_approval_cost_cap_usd
         self.messages = messages or []
         self.delay = delay
         self.load_calls = 0
+        self.updated_fields: list[tuple[str, dict[str, Any]]] = []
     async def load_session(self, session_id: str, **_: Any) -> dict[str, Any] | None:
         self.load_calls += 1
         metadata.setdefault("_id", session_id)
         return {"metadata": metadata, "messages": self.messages}
+    async def update_session_fields(self, session_id: str, **fields: Any) -> None:
+        self.updated_fields.append((session_id, fields))
+        self.metadata.update(fields)
+class CloseableResource:
+    def __init__(self) -> None:
+        self.closed = False
+    async def close(self) -> None:
+        self.closed = True
 def _manager_with_store(store: NoopSessionStore) -> SessionManager:
     manager = object.__new__(SessionManager)
     manager.sessions = {}
     manager._lock = asyncio.Lock()
     manager.persistence_store = store
+    manager.messaging_gateway = CloseableResource()
     return manager
         await asyncio.gather(*tasks, return_exceptions=True)
+@pytest.mark.asyncio
+async def test_close_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
+    deleted: list[str] = []
+    async def fake_record_sandbox_destroy(*args, **kwargs):
+        pass
+    monkeypatch.setattr(
+        "agent.core.telemetry.record_sandbox_destroy",
+        fake_record_sandbox_destroy,
+    )
+    store = NoopSessionStore()
+    manager = _manager_with_store(store)
+    gateway = CloseableResource()
+    persistence = CloseableResource()
+    manager.messaging_gateway = gateway  # type: ignore[assignment]
+    manager.persistence_store = persistence  # type: ignore[assignment]
+    cancel_event = asyncio.Event()
+    preload_cancel_event = threading.Event()
+    async def preload():
+        while not preload_cancel_event.is_set():
+            await asyncio.sleep(0)
+        cancel_event.set()
+    session = FakeRuntimeSession(hf_token="token")
+    session.session_id = "s1"
+    session.persistence_store = NoopSessionStore()
+    session.sandbox = SimpleNamespace(
+        space_id="owner/sandbox-12345678",
+        _owns_space=True,
+        delete=lambda: deleted.append("owner/sandbox-12345678"),
+    )
+    session.sandbox_hardware = "cpu-basic"
+    session.sandbox_preload_cancel_event = preload_cancel_event
+    session.sandbox_preload_task = asyncio.create_task(preload())
+    manager.sessions["s1"] = AgentSession(
+        session_id="s1",
+        session=session,  # type: ignore[arg-type]
+        tool_router=object(),  # type: ignore[arg-type]
+        submission_queue=asyncio.Queue(),
+        user_id="owner",
+        hf_token="token",
+    )
+    await manager.close()
+    assert preload_cancel_event.is_set()
+    assert cancel_event.is_set()
+    assert deleted == ["owner/sandbox-12345678"]
+    assert gateway.closed is True
+    assert persistence.closed is True
+@pytest.mark.asyncio
+async def test_close_closes_resources_when_sandbox_cleanup_fails():
+    manager = _manager_with_store(NoopSessionStore())
+    gateway = CloseableResource()
+    persistence = CloseableResource()
+    manager.messaging_gateway = gateway  # type: ignore[assignment]
+    manager.persistence_store = persistence  # type: ignore[assignment]
+    manager.sessions["s1"] = _runtime_agent_session("s1")
+    manager.sessions["s2"] = _runtime_agent_session("s2")
+    cleaned: list[str] = []
+    async def fake_cleanup(session):
+        cleaned.append(session.hf_token)
+        if session.hf_token == "owner-token":
+            raise RuntimeError("boom")
+    manager._cleanup_sandbox = fake_cleanup  # type: ignore[method-assign]
+    await manager.close()
+    assert cleaned == ["owner-token", "owner-token"]
+    assert gateway.closed is True
+    assert persistence.closed is True
 @pytest.mark.asyncio
 async def test_existing_session_rejects_cross_user_token_overwrite():
     manager = _manager_with_store(NoopSessionStore())
         await _cancel_runtime_tasks(manager)
+@pytest.mark.asyncio
+async def test_lazy_restore_deletes_persisted_sandbox_before_preload(monkeypatch):
+    deleted: list[tuple[str, str, str]] = []
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def delete_repo(self, repo_id, repo_type):
+            deleted.append((self.token, repo_id, repo_type))
+    monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
+    store = RestoreStore(
+        metadata={
+            "session_id": "persisted-session",
+            "user_id": "owner",
+            "model": "test-model",
+            "created_at": datetime.now(UTC),
+            "sandbox_space_id": "owner/sandbox-12345678",
+            "sandbox_hardware": "cpu-basic",
+            "sandbox_owner": "owner",
+            "sandbox_created_at": datetime.now(UTC),
+            "sandbox_status": "active",
+        }
+    )
+    manager = _manager_with_store(store)
+    stop = _install_fake_runtime(manager)
+    scheduled: list[str] = []
+    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        scheduled.append(agent_session.session_id)
+    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]
+    try:
+        restored = await manager.ensure_session_loaded(
+            "persisted-session",
+            user_id="owner",
+            hf_token="user-token",
+        )
+        assert restored is not None
+        assert deleted == [("user-token", "owner/sandbox-12345678", "space")]
+        assert scheduled == ["persisted-session"]
+        assert store.metadata["sandbox_space_id"] is None
+        assert store.metadata["sandbox_status"] == "destroyed"
+    finally:
+        stop.set()
+        await _cancel_runtime_tasks(manager)
+@pytest.mark.asyncio
+async def test_lazy_restore_can_skip_cpu_sandbox_preload_after_cleanup(monkeypatch):
+    deleted: list[str] = []
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def delete_repo(self, repo_id, repo_type):
+            deleted.append(repo_id)
+    monkeypatch.setattr("huggingface_hub.HfApi", FakeApi)
+    store = RestoreStore(
+        metadata={
+            "session_id": "persisted-session",
+            "user_id": "owner",
+            "model": "test-model",
+            "created_at": datetime.now(UTC),
+            "sandbox_space_id": "owner/sandbox-87654321",
+            "sandbox_status": "active",
+        }
+    )
+    manager = _manager_with_store(store)
+    stop = _install_fake_runtime(manager)
+    scheduled: list[str] = []
+    def fake_start_cpu_sandbox_preload(agent_session: AgentSession) -> None:
+        scheduled.append(agent_session.session_id)
+    manager._start_cpu_sandbox_preload = fake_start_cpu_sandbox_preload  # type: ignore[method-assign]
+    try:
+        restored = await manager.ensure_session_loaded(
+            "persisted-session",
+            user_id="owner",
+            hf_token="user-token",
+            preload_sandbox=False,
+        )
+        assert restored is not None
+        assert deleted == ["owner/sandbox-87654321"]
+        assert scheduled == []
+        assert store.metadata["sandbox_space_id"] is None
+    finally:
+        stop.set()
+        await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_lazy_restore_preserves_pending_approval_tool_calls():
     store = RestoreStore(