diff --git a/README.md b/README.md
index f661514cddfcc1e0460213bff7d93da74e6e165f..19587d32b58395337b711f95f2b3c2e87c156981 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,56 @@ ml-intern --max-iterations 100 "your prompt"
 ml-intern --no-stream "your prompt"
 ```
 
+## Supported Gateways
+
+ML Intern currently supports one-way notification gateways from CLI sessions.
+These gateways send out-of-band status updates; they do not accept inbound chat
+messages.
+
+### Slack
+
+Slack notifications use the Slack Web API to post messages when the agent needs
+approval, hits an error, or completes a turn. Create a Slack app with a bot token
+that has `chat:write`, invite the bot to the target channel, then set:
+
+```bash
+SLACK_BOT_TOKEN=xoxb-...
+SLACK_CHANNEL_ID=C...
+```
+
+The CLI automatically creates a `slack.default` destination when both variables
+are present. Optional overrides for that env-only default (example values shown):
+
+```bash
+ML_INTERN_SLACK_NOTIFICATIONS=false
+ML_INTERN_SLACK_DESTINATION=slack.ops
+ML_INTERN_SLACK_AUTO_EVENTS=approval_required,error,turn_complete
+ML_INTERN_SLACK_ALLOW_AGENT_TOOL=true
+ML_INTERN_SLACK_ALLOW_AUTO_EVENTS=true
+```
+
+For a persistent user-level config, put overrides in
+`~/.config/ml-intern/cli_agent_config.json` or point `ML_INTERN_CLI_CONFIG` at a
+JSON file:
+
+```json
+{
+  "messaging": {
+    "enabled": true,
+    "auto_event_types": ["approval_required", "error", "turn_complete"],
+    "destinations": {
+      "slack.ops": {
+        "provider": "slack",
+        "token": "${SLACK_BOT_TOKEN}",
+        "channel": "${SLACK_CHANNEL_ID}",
+        "allow_agent_tool": true,
+        "allow_auto_events": true
+      }
+    }
+  }
+}
+```
+
 ## Architecture
 
 ### Component Overview
diff --git a/agent/config.py b/agent/config.py
index 7e696dd78fdbf04a7f5ff527127583930acae3d0..5a6a8a45f796a6557404b8f401ad2fad3f264288 100644
--- a/agent/config.py
+++ b/agent/config.py
@@ -6,6 +6,8 @@ from typing import Any, Union
 
 from dotenv import load_dotenv
 
+from agent.messaging.models import MessagingConfig
+
 # Project root: two levels up from this file (agent/config.py -> project root)
 _PROJECT_ROOT = Path(__file__).resolve().parent.parent
 from fastmcp.mcp_config import (
@@ -47,6 +49,104 @@ class Config(BaseModel):
     # ``xhigh`` or ``max`` for Anthropic 4.6 / 4.7). ``None`` = thinking off.
     # Valid values: None | "minimal" | "low" | "medium" | "high" | "xhigh" | "max"
     reasoning_effort: str | None = "max"
+    messaging: MessagingConfig = MessagingConfig()
+
+
+USER_CONFIG_ENV_VAR = "ML_INTERN_CLI_CONFIG"  # env var naming an explicit user config file
+DEFAULT_USER_CONFIG_PATH = Path.home() / ".config" / "ml-intern" / "cli_agent_config.json"
+SLACK_DEFAULT_DESTINATION = "slack.default"  # destination key used when the env names none
+SLACK_DEFAULT_AUTO_EVENT_TYPES = ["approval_required", "error", "turn_complete"]
+
+
+def _deep_merge_config(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
+    """Layer ``override`` onto a copy of ``base``, merging nested dicts recursively."""
+    result = dict(base)
+    for name, incoming in override.items():
+        existing = result.get(name)
+        # Recurse only when both sides are dicts; otherwise the override wins outright.
+        both_dicts = isinstance(existing, dict) and isinstance(incoming, dict)
+        result[name] = _deep_merge_config(existing, incoming) if both_dicts else incoming
+    return result
+
+
+def _load_json_config(path: Path) -> dict[str, Any]:
+    """Read ``path`` as UTF-8 JSON; raise ValueError unless the top level is an object."""
+    parsed = json.loads(Path(path).read_text(encoding="utf-8"))
+    if isinstance(parsed, dict):
+        return parsed
+    raise ValueError(f"Config file {path} must contain a JSON object")
+
+
+def _load_user_config() -> dict[str, Any]:
+    """Load user overrides: the env-named file (must exist), else the default path, else {}."""
+    explicit = os.environ.get(USER_CONFIG_ENV_VAR)
+    if not explicit:
+        if DEFAULT_USER_CONFIG_PATH.exists():
+            return _load_json_config(DEFAULT_USER_CONFIG_PATH)
+        return {}
+    candidate = Path(explicit).expanduser()
+    if candidate.exists():
+        return _load_json_config(candidate)
+    raise FileNotFoundError(
+        f"{USER_CONFIG_ENV_VAR} points to missing config file: {candidate}"
+    )
+
+
+def _env_bool(name: str, default: bool) -> bool:
+    """Parse env var ``name`` as a boolean; unset or unrecognised text yields ``default``."""
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    recognised = {
+        **dict.fromkeys(("1", "true", "yes", "on"), True),
+        **dict.fromkeys(("0", "false", "no", "off"), False),
+    }
+    return recognised.get(raw.strip().lower(), default)
+
+
+def _env_list(name: str) -> list[str] | None:
+    """Split env var ``name`` on commas into trimmed, non-empty items; None when unset."""
+    if (raw := os.environ.get(name)) is None:
+        return None
+    return list(filter(None, map(str.strip, raw.split(","))))
+
+
+def apply_slack_user_defaults(raw_config: dict[str, Any]) -> dict[str, Any]:
+    """Return config with an env-derived Slack destination added; input is not mutated."""
+    if not _env_bool("ML_INTERN_SLACK_NOTIFICATIONS", True):
+        return raw_config
+
+    token = os.environ.get("SLACK_BOT_TOKEN")
+    channel = os.environ.get("SLACK_CHANNEL_ID") or os.environ.get("SLACK_CHANNEL")
+    if not token or not channel:
+        return raw_config
+    # Copy every level that gets modified so the caller's dicts stay untouched.
+    config = dict(raw_config)
+    messaging = dict(config.get("messaging") or {})
+    destinations = dict(messaging.get("destinations") or {})
+    destination_name = (
+        os.environ.get("ML_INTERN_SLACK_DESTINATION") or ""
+    ).strip() or SLACK_DEFAULT_DESTINATION  # blank/whitespace-only falls back too
+    # A destination already configured under this name takes precedence over the env.
+    if destination_name not in destinations:
+        destinations[destination_name] = {
+            "provider": "slack",
+            "token": token,
+            "channel": channel,
+            "allow_agent_tool": _env_bool("ML_INTERN_SLACK_ALLOW_AGENT_TOOL", True),
+            "allow_auto_events": _env_bool("ML_INTERN_SLACK_ALLOW_AUTO_EVENTS", True),
+        }
+    # Env list wins, then any configured list, then a copy of the module defaults.
+    auto_events = _env_list("ML_INTERN_SLACK_AUTO_EVENTS")
+    if auto_events is not None:
+        messaging["auto_event_types"] = auto_events
+    elif "auto_event_types" not in messaging:
+        messaging["auto_event_types"] = list(SLACK_DEFAULT_AUTO_EVENT_TYPES)
+
+    messaging["enabled"] = True
+    messaging["destinations"] = destinations
+    config["messaging"] = messaging
+    return config
 
 
 def substitute_env_vars(obj: Any) -> Any:
@@ -86,7 +186,10 @@ def substitute_env_vars(obj: Any) -> Any:
     return obj
 
 
-def load_config(config_path: str = "config.json") -> Config:
+def load_config(
+    config_path: str = "config.json",
+    include_user_defaults: bool = False,
+) -> Config:
     """
     Load configuration with environment variable substitution.
 
@@ -98,8 +201,10 @@ def load_config(config_path: str = "config.json") -> Config:
     load_dotenv(_PROJECT_ROOT / ".env")
     load_dotenv(override=False)
 
-    with open(config_path, "r") as f:
-        raw_config = json.load(f)
+    raw_config = _load_json_config(Path(config_path))
+    if include_user_defaults:
+        raw_config = _deep_merge_config(raw_config, _load_user_config())
+        raw_config = apply_slack_user_defaults(raw_config)
 
     config_with_env = substitute_env_vars(raw_config)
     return Config.model_validate(config_with_env)
diff --git a/agent/context_manager/manager.py b/agent/context_manager/manager.py
index 64584b6d56bd14073bbe5fae53e62481338c638f..c842c8842c215450e651a33abf93c2a115d68e54 100644
--- a/agent/context_manager/manager.py
+++ b/agent/context_manager/manager.py
@@ -160,6 +160,7 @@ class ContextManager:
         self.running_context_usage = 0
         self.untouched_messages = untouched_messages
         self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
+        self.on_message_added = None
 
     def _load_system_prompt(
         self,
@@ -219,6 +220,8 @@ class ContextManager:
         if token_count:
             self.running_context_usage = token_count
         self.items.append(message)
+        if self.on_message_added:
+            self.on_message_added(message)
 
     def get_messages(self) -> list[Message]:
         """Get all messages for sending to LLM.
diff --git a/agent/core/agent_loop.py b/agent/core/agent_loop.py
index 26361d413a7c19f29c2e02bfcbf50e62271295c3..8b7a4572d0843ec473d0c6c21c1cd059b022ad28 100644
--- a/agent/core/agent_loop.py
+++ b/agent/core/agent_loop.py
@@ -8,11 +8,18 @@ import logging
 import os
 import time
 from dataclasses import dataclass, field
-
-from litellm import ChatCompletionMessageToolCall, Message, acompletion
+from typing import Any
+
+from litellm import (
+    ChatCompletionMessageToolCall,
+    Message,
+    acompletion,
+    stream_chunk_builder,
+)
 from litellm.exceptions import ContextWindowExceededError
 
 from agent.config import Config
+from agent.messaging.gateway import NotificationGateway
 from agent.core import telemetry
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
@@ -396,12 +403,159 @@ class LLMResult:
     token_count: int
     finish_reason: str | None
     usage: dict = field(default_factory=dict)
+    thinking_blocks: list[dict[str, Any]] | None = None
+    reasoning_content: str | None = None
+
+
+def _extract_thinking_state(
+    message: Any,
+) -> tuple[list[dict[str, Any]] | None, str | None]:
+    """Pull ``thinking_blocks`` and ``reasoning_content`` off an LLM message.
+
+    Each field is read from the message attribute first and from the
+    ``provider_specific_fields`` dict second; falsy values become None.
+
+    Returns:
+        A ``(thinking_blocks, reasoning_content)`` tuple.
+    """
+    extras = getattr(message, "provider_specific_fields", None)
+    extras = extras if isinstance(extras, dict) else {}
+
+    def _pick(field_name: str) -> Any:
+        # The direct attribute wins; the provider dict is only a fallback.
+        return getattr(message, field_name, None) or extras.get(field_name) or None
+
+    return _pick("thinking_blocks"), _pick("reasoning_content")
+
+
+def _should_replay_thinking_state(model_name: str | None) -> bool:
+    """Allow replay only for ``anthropic/`` models; None or empty names give False."""
+    return model_name is not None and model_name.startswith("anthropic/")
+
+
+def _is_invalid_thinking_signature_error(exc: Exception) -> bool:
+    """Tell from ``str(exc)`` whether a replayed thinking signature was rejected."""
+    needles = (
+        "Invalid `signature` in `thinking` block",
+        "Invalid signature in thinking block",
+    )
+    return any(map(str(exc).__contains__, needles))
+
+
+def _strip_thinking_state_from_messages(messages: list[Any]) -> int:
+    """Strip thinking metadata from assistant messages in place; return the removal count."""
+    stripped = 0
+
+    for message in messages:
+        role = (
+            message.get("role")
+            if isinstance(message, dict)
+            else getattr(message, "role", None)
+        )
+        if role != "assistant":
+            continue
+        # Top-level fields: popped from dict messages, set to None on object messages.
+        if isinstance(message, dict):
+            if message.pop("thinking_blocks", None) is not None:
+                stripped += 1
+            if message.pop("reasoning_content", None) is not None:
+                stripped += 1
+            provider_fields = message.get("provider_specific_fields")
+            content = message.get("content")
+        else:
+            if getattr(message, "thinking_blocks", None) is not None:
+                message.thinking_blocks = None
+                stripped += 1
+            if getattr(message, "reasoning_content", None) is not None:
+                message.reasoning_content = None
+                stripped += 1
+            provider_fields = getattr(message, "provider_specific_fields", None)
+            content = getattr(message, "content", None)
+        # Edit a copy of the provider dict and reassign it only when something changed.
+        if isinstance(provider_fields, dict):
+            cleaned_fields = dict(provider_fields)
+            if cleaned_fields.pop("thinking_blocks", None) is not None:
+                stripped += 1
+            if cleaned_fields.pop("reasoning_content", None) is not None:
+                stripped += 1
+            if cleaned_fields != provider_fields:
+                if isinstance(message, dict):
+                    message["provider_specific_fields"] = cleaned_fields
+                else:
+                    message.provider_specific_fields = cleaned_fields
+        # List content: filter out "thinking" / "redacted_thinking" dict blocks.
+        if isinstance(content, list):
+            cleaned_content = [
+                block
+                for block in content
+                if not (
+                    isinstance(block, dict)
+                    and block.get("type") in {"thinking", "redacted_thinking"}
+                )
+            ]
+            if len(cleaned_content) != len(content):
+                stripped += len(content) - len(cleaned_content)
+                if isinstance(message, dict):
+                    message["content"] = cleaned_content
+                else:
+                    message.content = cleaned_content
+
+    return stripped
+
+
+async def _maybe_heal_invalid_thinking_signature(
+    session: Session,
+    messages: list[Any],
+    exc: Exception,
+    *,
+    already_healed: bool,
+) -> bool:
+    """Strip replayed thinking state after a signature rejection so the call can retry.
+
+    Returns True (after emitting a ``tool_log`` event) only when this is the
+    first heal attempt, ``exc`` is a thinking-signature error, and at least
+    one item was actually stripped from ``messages``; otherwise False.
+    """
+    can_heal = not already_healed and _is_invalid_thinking_signature_error(exc)
+    if not (can_heal and _strip_thinking_state_from_messages(messages)):
+        return False
+
+    notice = (
+        "Anthropic rejected stale thinking signatures; retrying "
+        "without replayed thinking metadata."
+    )
+    await session.send_event(
+        Event(event_type="tool_log", data={"tool": "system", "log": notice})
+    )
+    return True
+
+
+def _assistant_message_from_result(
+    llm_result: LLMResult,
+    *,
+    model_name: str | None,
+    tool_calls: list[ToolCall] | None = None,
+) -> Message:
+    """Create the assistant history ``Message`` from an ``LLMResult``.
+
+    ``tool_calls`` is attached only when not None; truthy thinking fields are
+    attached only when ``_should_replay_thinking_state(model_name)`` allows it.
+    """
+    fields: dict[str, Any] = {"role": "assistant", "content": llm_result.content}
+    if tool_calls is not None:
+        fields["tool_calls"] = tool_calls
+    if _should_replay_thinking_state(model_name):
+        for attr in ("thinking_blocks", "reasoning_content"):
+            if value := getattr(llm_result, attr):
+                fields[attr] = value
+    return Message(**fields)
 
 
 async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     """Call the LLM with streaming, emitting assistant_chunk events."""
     response = None
     _healed_effort = False  # one-shot safety net per call
+    _healed_thinking_signature = False
     messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     t_start = time.monotonic()
     for _llm_attempt in range(_MAX_LLM_RETRIES):
@@ -429,6 +583,14 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
                     data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
                 ))
                 continue
+            if await _maybe_heal_invalid_thinking_signature(
+                session,
+                messages,
+                e,
+                already_healed=_healed_thinking_signature,
+            ):
+                _healed_thinking_signature = True
+                continue
             _delay = _retry_delay_for(e, _llm_attempt)
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _delay is not None:
                 logger.warning(
@@ -448,8 +610,11 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
     token_count = 0
     finish_reason = None
     final_usage_chunk = None
+    chunks = []
+    should_replay_thinking = _should_replay_thinking_state(llm_params.get("model"))
 
     async for chunk in response:
+        chunks.append(chunk)
         if session.is_cancelled:
             tool_calls_acc.clear()
             break
@@ -498,6 +663,16 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         latency_ms=int((time.monotonic() - t_start) * 1000),
         finish_reason=finish_reason,
     )
+    thinking_blocks = None
+    reasoning_content = None
+    if chunks and should_replay_thinking:
+        try:
+            rebuilt = stream_chunk_builder(chunks, messages=messages)
+            if rebuilt and getattr(rebuilt, "choices", None):
+                rebuilt_msg = rebuilt.choices[0].message
+                thinking_blocks, reasoning_content = _extract_thinking_state(rebuilt_msg)
+        except Exception:
+            logger.debug("Failed to rebuild streaming thinking state", exc_info=True)
 
     return LLMResult(
         content=full_content or None,
@@ -505,6 +680,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
+        thinking_blocks=thinking_blocks,
+        reasoning_content=reasoning_content,
     )
 
 
@@ -512,6 +689,7 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
     """Call the LLM without streaming, emit assistant_message at the end."""
     response = None
     _healed_effort = False
+    _healed_thinking_signature = False
     messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     t_start = time.monotonic()
     for _llm_attempt in range(_MAX_LLM_RETRIES):
@@ -538,6 +716,14 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
                     data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
                 ))
                 continue
+            if await _maybe_heal_invalid_thinking_signature(
+                session,
+                messages,
+                e,
+                already_healed=_healed_thinking_signature,
+            ):
+                _healed_thinking_signature = True
+                continue
             _delay = _retry_delay_for(e, _llm_attempt)
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _delay is not None:
                 logger.warning(
@@ -557,6 +743,7 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
     content = message.content or None
     finish_reason = choice.finish_reason
     token_count = response.usage.total_tokens if response.usage else 0
+    thinking_blocks, reasoning_content = _extract_thinking_state(message)
 
     # Build tool_calls_acc in the same format as streaming
     tool_calls_acc: dict[int, dict] = {}
@@ -591,6 +778,8 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
+        thinking_blocks=thinking_blocks,
+        reasoning_content=reasoning_content,
     )
 
 
@@ -681,15 +870,6 @@ class Handlers:
                 session.context_manager.add_message(
                     Message(role="user", content=doom_prompt)
                 )
-                await session.send_event(
-                    Event(
-                        event_type="tool_log",
-                        data={
-                            "tool": "system",
-                            "log": "Doom loop detected — injecting corrective prompt",
-                        },
-                    )
-                )
 
             malformed_tool = _detect_repeated_malformed(session.context_manager.items)
             if malformed_tool:
@@ -763,7 +943,10 @@ class Handlers:
                         "  • For other tools: reduce the size of your arguments or use bash."
                     )
                     if content:
-                        assistant_msg = Message(role="assistant", content=content)
+                        assistant_msg = _assistant_message_from_result(
+                            llm_result,
+                            model_name=llm_params.get("model"),
+                        )
                         session.context_manager.add_message(assistant_msg, token_count)
                     session.context_manager.add_message(
                         Message(role="user", content=f"[SYSTEM: {truncation_hint}]")
@@ -819,7 +1002,10 @@ class Handlers:
                         (content or "")[:500],
                     )
                     if content:
-                        assistant_msg = Message(role="assistant", content=content)
+                        assistant_msg = _assistant_message_from_result(
+                            llm_result,
+                            model_name=llm_params.get("model"),
+                        )
                         session.context_manager.add_message(assistant_msg, token_count)
                         final_response = content
                     break
@@ -841,9 +1027,9 @@ class Handlers:
                         bad_tools.append(tc)
 
                 # Add assistant message with all tool calls to context
-                assistant_msg = Message(
-                    role="assistant",
-                    content=content,
+                assistant_msg = _assistant_message_from_result(
+                    llm_result,
+                    model_name=llm_params.get("model"),
                     tool_calls=tool_calls,
                 )
                 session.context_manager.add_message(assistant_msg, token_count)
@@ -1049,7 +1235,12 @@ class Handlers:
             await session.send_event(
                 Event(
                     event_type="turn_complete",
-                    data={"history_size": len(session.context_manager.items)},
+                    data={
+                        "history_size": len(session.context_manager.items),
+                        "final_response": final_response
+                        if isinstance(final_response, str)
+                        else None,
+                    },
                 )
             )
 
@@ -1358,12 +1549,16 @@ async def process_submission(session: Session, submission) -> bool:
 async def submission_loop(
     submission_queue: asyncio.Queue,
     event_queue: asyncio.Queue,
-    config: Config | None = None,
+    config: Config,
     tool_router: ToolRouter | None = None,
     session_holder: list | None = None,
     hf_token: str | None = None,
+    user_id: str | None = None,
     local_mode: bool = False,
     stream: bool = True,
+    notification_gateway: NotificationGateway | None = None,
+    notification_destinations: list[str] | None = None,
+    defer_turn_complete_notification: bool = False,
 ) -> None:
     """
     Main agent loop - processes submissions and dispatches to handlers.
@@ -1373,7 +1568,10 @@ async def submission_loop(
     # Create session with tool router
     session = Session(
         event_queue, config=config, tool_router=tool_router, hf_token=hf_token,
-        local_mode=local_mode, stream=stream,
+        user_id=user_id, local_mode=local_mode, stream=stream,
+        notification_gateway=notification_gateway,
+        notification_destinations=notification_destinations,
+        defer_turn_complete_notification=defer_turn_complete_notification,
     )
     if session_holder is not None:
         session_holder[0] = session
diff --git a/agent/core/doom_loop.py b/agent/core/doom_loop.py
index fbc3510a1222ae0888bc72a5288b4dc70a9de00f..878c7c00adfb4f8ea3fa7f068493ed8358d76b8d 100644
--- a/agent/core/doom_loop.py
+++ b/agent/core/doom_loop.py
@@ -24,9 +24,36 @@ class ToolCallSignature:
     result_hash: str | None = None
 
 
+def _normalize_args(args_str: str) -> str:
+    """Canonicalise a tool-call arguments string before hashing.
+
+    LLMs can emit semantically-identical JSON for the same call with different
+    key orderings (``{"a": 1, "b": 2}`` vs ``{"b": 2, "a": 1}``) or whitespace
+    (``{"a":1}`` vs ``{"a": 1}``). Hashing the raw bytes makes the doom-loop
+    detector miss those repeats. We parse-and-redump with ``sort_keys=True``
+    plus the most compact separators so trivially-different spellings collapse
+    to the same canonical form.
+
+    Falls back to the original string if the input isn't valid JSON or is
+    nested too deeply to parse (``RecursionError``), so malformed or
+    pathological input does not raise here. Falsy input canonicalises to "".
+    """
+    if not args_str:
+        return ""
+    try:
+        return json.dumps(json.loads(args_str), sort_keys=True, separators=(",", ":"))
+    except (TypeError, ValueError, RecursionError):  # JSONDecodeError is a ValueError
+        return args_str
+
+
 def _hash_args(args_str: str) -> str:
-    """Return a short hash of the JSON arguments string."""
-    return hashlib.md5(args_str.encode()).hexdigest()[:12]
+    """Return the first 12 hex chars of the MD5 of the canonicalised arguments.
+
+    Canonicalisation is delegated to :func:`_normalize_args`, so equivalent
+    JSON that differs only in key order or whitespace hashes identically.
+    """
+    canonical = _normalize_args(args_str)
+    return hashlib.md5(canonical.encode()).hexdigest()[:12]
 
 
 def extract_recent_tool_signatures(
@@ -129,9 +156,13 @@ def check_for_doom_loop(messages: list[Message]) -> str | None:
     # Check for identical consecutive calls
     tool_name = detect_identical_consecutive(signatures, threshold=3)
     if tool_name:
-        logger.warning("Doom loop detected: %d+ identical consecutive calls to '%s'", 3, tool_name)
+        logger.warning(
+            "Repetition guard activated: %d+ identical consecutive calls to '%s'",
+            3,
+            tool_name,
+        )
         return (
-            f"[SYSTEM: DOOM LOOP DETECTED] You have called '{tool_name}' with the same "
+            f"[SYSTEM: REPETITION GUARD] You have called '{tool_name}' with the same "
             f"arguments multiple times in a row, getting the same result each time. "
             f"STOP repeating this approach — it is not working. "
             f"Step back and try a fundamentally different strategy. "
@@ -143,9 +174,9 @@ def check_for_doom_loop(messages: list[Message]) -> str | None:
     pattern = detect_repeating_sequence(signatures)
     if pattern:
         pattern_desc = " → ".join(s.name for s in pattern)
-        logger.warning("Doom loop detected: repeating sequence [%s]", pattern_desc)
+        logger.warning("Repetition guard activated: repeating sequence [%s]", pattern_desc)
         return (
-            f"[SYSTEM: DOOM LOOP DETECTED] You are stuck in a repeating cycle of tool calls: "
+            f"[SYSTEM: REPETITION GUARD] You are stuck in a repeating cycle of tool calls: "
             f"[{pattern_desc}]. This pattern has repeated multiple times without progress. "
             f"STOP this cycle and try a fundamentally different approach. "
             f"Consider: breaking down the problem differently, using alternative tools, "
diff --git a/agent/core/hf_access.py b/agent/core/hf_access.py
index 400db5a5a70efeb3cc513f4322469d504821e973..11446349fba5a41e7e92db86e9060e39dab00ba1 100644
--- a/agent/core/hf_access.py
+++ b/agent/core/hf_access.py
@@ -55,6 +55,13 @@ def _extract_username(whoami: dict[str, Any]) -> str | None:
 
 
 def _normalize_personal_plan(whoami: dict[str, Any]) -> str:
+    # OAuth whoami responses set `type: "user"` and surface Pro status only via
+    # the `isPro` boolean. Check the boolean first so a generic `type` value
+    # doesn't shadow it — otherwise Pro OAuth users get classified as free and
+    # blocked from running Jobs (smolagents/ml-intern Space discussion #21).
+    if whoami.get("isPro") is True or whoami.get("is_pro") is True:
+        return "pro"
+
     plan_str = ""
     for key in ("plan", "type", "accountType"):
         value = whoami.get(key)
@@ -62,9 +69,6 @@ def _normalize_personal_plan(whoami: dict[str, Any]) -> str:
             plan_str = value.lower()
             break
 
-    if not plan_str and (whoami.get("isPro") is True or whoami.get("is_pro") is True):
-        return "pro"
-
     if any(tag in plan_str for tag in ("pro", "enterprise", "team")):
         return "pro"
     return "free"
diff --git a/agent/core/hf_tokens.py b/agent/core/hf_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e72ccc128a9d9aaecb661c4c2ba3850a10b5dc0
--- /dev/null
+++ b/agent/core/hf_tokens.py
@@ -0,0 +1,85 @@
+"""Hugging Face token resolution helpers."""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+
+def clean_hf_token(token: str | None) -> str | None:
+    """Remove CR/LF and surrounding whitespace; None or an empty result gives None."""
+    if token is None:
+        return None
+    return token.translate({ord("\r"): None, ord("\n"): None}).strip() or None
+
+
+def get_cached_hf_token() -> str | None:
+    """Return ``huggingface_hub.get_token()``, or None if the import or the call fails."""
+    try:
+        from huggingface_hub import get_token  # imported lazily, inside the try
+
+        return get_token()
+    except Exception:  # best-effort lookup: any failure is reported as "no token"
+        return None
+
+
+def resolve_hf_token(
+    *candidates: str | None,
+    include_cached: bool = True,
+) -> str | None:
+    """Return the first candidate that cleans to non-empty, else optionally the cached token."""
+    # ``map`` is lazy, so cleaning stops at the first usable candidate.
+    cleaned = map(clean_hf_token, candidates)
+    explicit = next((tok for tok in cleaned if tok), None)
+    if explicit:
+        return explicit
+    # No explicit token survived: defer to the cached lookup only when allowed.
+    return get_cached_hf_token() if include_cached else None
+
+
+def resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
+    """Pick the token for Hugging Face Router LLM calls.
+
+    Precedence: the ``INFERENCE_TOKEN`` env var (shared hosted-Space
+    token), then ``session_hf_token``, then whatever
+    ``resolve_hf_token`` falls back to (the ``huggingface_hub`` lookup).
+    """
+    shared_token = os.environ.get("INFERENCE_TOKEN")
+    # Argument order encodes the precedence: shared token before session token.
+    return resolve_hf_token(shared_token, session_hf_token)
+
+
+def get_hf_bill_to() -> str | None:
+    """Return ``HF_BILL_TO`` (default "smolagents") only if INFERENCE_TOKEN is non-blank."""
+    if not clean_hf_token(os.environ.get("INFERENCE_TOKEN")):
+        return None
+    return os.environ.get("HF_BILL_TO", "smolagents")
+
+
+def bearer_token_from_header(auth_header: str | None) -> str | None:
+    """Return the cleaned token from a ``Bearer`` Authorization header, else None."""
+    # HTTP auth scheme names are case-insensitive, so "bearer x" must match too.
+    scheme, _, credentials = (auth_header or "").partition(" ")
+    return clean_hf_token(credentials) if scheme.lower() == "bearer" else None
+
+
+def resolve_hf_request_token(
+    request: Any,
+    *,
+    include_env_fallback: bool = True,
+) -> str | None:
+    """Find the caller's HF token on a request exposing ``headers`` and ``cookies``.
+
+    Lookup order: ``Authorization: Bearer`` header, then the
+    ``hf_access_token`` cookie, then (only when ``include_env_fallback``)
+    the server's ``HF_TOKEN`` environment variable. The local
+    ``hf auth login`` cache is deliberately never consulted here.
+
+    Returns the first non-empty cleaned token, or None.
+    """
+    found = bearer_token_from_header(
+        request.headers.get("Authorization", "")
+    ) or clean_hf_token(request.cookies.get("hf_access_token"))
+    if found or not include_env_fallback:
+        return found
+    return clean_hf_token(os.environ.get("HF_TOKEN"))
diff --git a/agent/core/llm_params.py b/agent/core/llm_params.py
index bac507354348fc2b8ca423c427005e9e6efc8bb2..880886b3e1e2919f31d35934c6f9a4c3fb5e9525 100644
--- a/agent/core/llm_params.py
+++ b/agent/core/llm_params.py
@@ -5,7 +5,12 @@ can import it without pulling in the whole agent loop / tool router and
 creating circular imports.
 """
 
-import os
+from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token
+
+
+def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
+    """Private alias of ``resolve_hf_router_token``, kept for tests and older imports."""
+    return resolve_hf_router_token(session_hf_token)
 
 
 def _patch_litellm_effort_validation() -> None:
@@ -129,7 +134,8 @@ def _resolve_llm_params(
       1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
          free for users, billed to the Space owner via ``X-HF-Bill-To``).
       2. session.hf_token — the user's own token (CLI / OAuth / cache file).
-      3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
+      3. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
+         local ``hf auth login`` cache.
     """
     if model_name.startswith("anthropic/"):
         params: dict = {"model": model_name}
@@ -175,18 +181,13 @@ def _resolve_llm_params(
         return params
 
     hf_model = model_name.removeprefix("huggingface/")
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or session_hf_token
-        or os.environ.get("HF_TOKEN")
-    )
+    api_key = _resolve_hf_router_token(session_hf_token)
     params = {
         "model": f"openai/{hf_model}",
         "api_base": "https://router.huggingface.co/v1",
         "api_key": api_key,
     }
-    if os.environ.get("INFERENCE_TOKEN"):
-        bill_to = os.environ.get("HF_BILL_TO", "smolagents")
+    if bill_to := get_hf_bill_to():
         params["extra_headers"] = {"X-HF-Bill-To": bill_to}
     if reasoning_effort:
         hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
diff --git a/agent/core/session.py b/agent/core/session.py
index 0cf9524a12241f24aaa1ab1186fadc73620f177b..c53294cd251931cca8eff171f1ee694c2991e629 100644
--- a/agent/core/session.py
+++ b/agent/core/session.py
@@ -12,10 +12,13 @@ from typing import Any, Optional
 
 from agent.config import Config
 from agent.context_manager.manager import ContextManager
+from agent.messaging.gateway import NotificationGateway
+from agent.messaging.models import NotificationRequest
 
 logger = logging.getLogger(__name__)
 
 _DEFAULT_MAX_TOKENS = 200_000
+_TURN_COMPLETE_NOTIFICATION_CHARS = 39000
 
 
 def _get_max_tokens_safe(model_name: str) -> int:
@@ -62,6 +65,7 @@ class OpType(Enum):
 class Event:
     event_type: str
     data: Optional[dict[str, Any]] = None
+    seq: Optional[int] = None
 
 
 class Session:
@@ -73,16 +77,26 @@ class Session:
     def __init__(
         self,
         event_queue: asyncio.Queue,
-        config: Config | None = None,
+        config: Config,
         tool_router=None,
         context_manager: ContextManager | None = None,
         hf_token: str | None = None,
         local_mode: bool = False,
         stream: bool = True,
+        notification_gateway: NotificationGateway | None = None,
+        notification_destinations: list[str] | None = None,
+        defer_turn_complete_notification: bool = False,
+        session_id: str | None = None,
+        user_id: str | None = None,
+        persistence_store: Any | None = None,
     ):
         self.hf_token: Optional[str] = hf_token
+        self.user_id: Optional[str] = user_id
+        self.persistence_store = persistence_store
         self.tool_router = tool_router
         self.stream = stream
+        if config is None:
+            raise ValueError("Session requires a Config")
         tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
         self.context_manager = context_manager or ContextManager(
             model_max_tokens=_get_max_tokens_safe(config.model_name),
@@ -93,15 +107,16 @@ class Session:
             local_mode=local_mode,
         )
         self.event_queue = event_queue
-        self.session_id = str(uuid.uuid4())
-        self.config = config or Config(
-            model_name="bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0",
-        )
+        self.session_id = session_id or str(uuid.uuid4())
+        self.config = config
         self.is_running = True
         self._cancelled = asyncio.Event()
         self.pending_approval: Optional[dict[str, Any]] = None
         self.sandbox = None
         self._running_job_ids: set[str] = set()  # HF job IDs currently executing
+        self.notification_gateway = notification_gateway
+        self.notification_destinations = list(notification_destinations or [])
+        self.defer_turn_complete_notification = defer_turn_complete_notification
 
         # Session trajectory logging
         self.logged_events: list[dict] = []
@@ -123,11 +138,10 @@ class Session:
         #          thinking params at all
         # Key absent → not probed yet; fall back to the raw preference.
         self.model_effective_effort: dict[str, str | None] = {}
+        self.context_manager.on_message_added = self._schedule_trace_message
 
     async def send_event(self, event: Event) -> None:
         """Send event back to client and log to trajectory"""
-        await self.event_queue.put(event)
-
         # Log event to trajectory
         self.logged_events.append(
             {
@@ -136,11 +150,149 @@ class Session:
                 "data": event.data,
             }
         )
+        if self.persistence_store is not None:
+            try:
+                event.seq = await self.persistence_store.append_event(
+                    self.session_id, event.event_type, event.data
+                )
+            except Exception as e:
+                logger.debug("Event persistence failed for %s: %s", self.session_id, e)
+
+        await self.event_queue.put(event)
+        await self._enqueue_auto_notification_requests(event)
 
         # Mid-turn heartbeat flush (owned by telemetry module).
         from agent.core.telemetry import HeartbeatSaver
+
         HeartbeatSaver.maybe_fire(self)
 
+    def _schedule_trace_message(self, message: Any) -> None:
+        """Best-effort append-only trace save for SFT/KPI export."""
+        if self.persistence_store is None:
+            return
+        try:
+            payload = message.model_dump(mode="json")
+            loop = asyncio.get_running_loop()
+        except Exception:
+            # model_dump failed, or RuntimeError because no event loop is running.
+            return
+        source = str(payload.get("role") or "message")
+        save = self.persistence_store.append_trace_message
+        task = loop.create_task(save(self.session_id, payload, source=source))
+        # The loop keeps only weak references to tasks, so an unreferenced save
+        # could be garbage-collected mid-flight; hold it until it completes.
+        pending = vars(self).setdefault("_trace_tasks", set())
+        pending.add(task)
+        task.add_done_callback(pending.discard)
+
+    def set_notification_destinations(self, destinations: list[str]) -> None:
+        """Replace the session's opted-in auto-notification destinations.
+
+        Duplicates are dropped; the first occurrence of each name keeps its
+        position, so the caller's ordering is preserved.
+        """
+        # dict keys are unique and insertion-ordered, which gives an
+        # order-preserving de-duplication in a single pass.
+        self.notification_destinations = list(dict.fromkeys(destinations))
+
+    async def send_deferred_turn_complete_notification(self, event: Event) -> None:
+        """Send the ``turn_complete`` notification, bypassing the deferral check."""
+        # Anything other than a turn_complete event is ignored.
+        if event.event_type == "turn_complete":
+            await self._enqueue_auto_notification_requests(
+                event, include_deferred_turn_complete=True
+            )
+
+    async def _enqueue_auto_notification_requests(
+        self,
+        event: Event,
+        include_deferred_turn_complete: bool = False,
+    ) -> None:
+        """Queue auto-notifications for ``event`` on the gateway, if enabled.
+
+        Nothing is queued without a gateway or destinations, for event types
+        missing from ``config.messaging.auto_event_types``, or for a deferred
+        ``turn_complete`` unless ``include_deferred_turn_complete`` is set.
+        """
+        if self.notification_gateway is None or not self.notification_destinations:
+            return
+        kind = event.event_type
+        if kind not in set(self.config.messaging.auto_event_types):
+            return
+        deferred = self.defer_turn_complete_notification and kind == "turn_complete"
+        if deferred and not include_deferred_turn_complete:
+            return
+        # Requests are enqueued one at a time, in destination order.
+        for request in self._build_auto_notification_requests(event):
+            await self.notification_gateway.enqueue(request)
+
+    def _build_auto_notification_requests(
+        self, event: Event
+    ) -> list[NotificationRequest]:
+        """Render ``event`` into one notification per permitted destination.
+
+        Only ``approval_required``, ``error`` and ``turn_complete`` events have
+        a message template; any other event type yields an empty list.
+        Destinations rejected by ``config.messaging.can_auto_send`` are skipped.
+        """
+        kind = event.event_type
+        payload = event.data or {}
+
+        if kind == "approval_required":
+            raw_tools = payload.get("tools", [])
+            tool_calls = raw_tools if isinstance(raw_tools, list) else []
+            names: list[str] = []
+            for entry in tool_calls:
+                if not isinstance(entry, dict):
+                    continue
+                name = str(entry.get("tool") or "").strip()
+                # Keep first-seen order while dropping blanks and repeats.
+                if name and name not in names:
+                    names.append(name)
+            title, severity = "Agent approval required", "warning"
+            message = (
+                f"Session {self.session_id} is waiting for approval "
+                f"for {len(tool_calls)} tool call(s)."
+            )
+            if names:
+                message += " Tools: " + ", ".join(names)
+        elif kind == "error":
+            title, severity = "Agent error", "error"
+            detail = str(payload.get("error") or "Unknown error")
+            # Only the first 500 characters of the error text are forwarded.
+            message = f"Session {self.session_id} hit an error.\n{detail[:500]}"
+        elif kind == "turn_complete":
+            title, severity = "Agent task complete", "success"
+            message = f"Session {self.session_id} completed successfully."
+            summary = str(payload.get("final_response") or "").strip()
+            if summary:
+                # Append the final response, capped at the module-level limit.
+                message += "\n" + summary[:_TURN_COMPLETE_NOTIFICATION_CHARS]
+        else:
+            # No template for this event type, so nothing is sent.
+            return []
+
+        # One metadata dict is shared by every request built below.
+        metadata = {
+            "session_id": self.session_id,
+            "model": self.config.model_name,
+            "event_type": kind,
+        }
+
+        # Build one request per destination that may receive auto events.
+        return [
+            NotificationRequest(
+                destination=destination,
+                title=title,
+                message=message,
+                severity=severity,
+                metadata=metadata,
+                event_type=kind,
+            )
+            for destination in self.notification_destinations
+            if self.config.messaging.can_auto_send(destination)
+        ]
+
     def cancel(self) -> None:
         """Signal cancellation to the running agent loop."""
         self._cancelled.set()
@@ -199,11 +351,21 @@ class Session:
                 tools = self.tool_router.get_tool_specs_for_llm() or []
             except Exception:
                 tools = []
+        # Sum per-call cost from llm_call events so analyzers don't have to
+        # walk the events array themselves. Each `llm_call` event already
+        # carries cost_usd from `agent.core.telemetry.record_llm_call`.
+        total_cost_usd = sum(
+            float((e.get("data") or {}).get("cost_usd") or 0.0)
+            for e in self.logged_events
+            if e.get("event_type") == "llm_call"
+        )
         return {
             "session_id": self.session_id,
+            "user_id": self.user_id,
             "session_start_time": self.session_start_time,
             "session_end_time": datetime.now().isoformat(),
             "model_name": self.config.model_name,
+            "total_cost_usd": total_cost_usd,
             "messages": [msg.model_dump() for msg in self.context_manager.items],
             "events": self.logged_events,
             "tools": tools,
diff --git a/agent/core/session_persistence.py b/agent/core/session_persistence.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c125b3883accfaf825596eb6345a5d6a6a1350f
--- /dev/null
+++ b/agent/core/session_persistence.py
@@ -0,0 +1,428 @@
+"""Optional durable session persistence for the hosted backend.
+
+The public CLI must keep working without MongoDB.  This module therefore
+exposes one small async store interface and returns a no-op implementation
+unless ``MONGODB_URI`` is configured and reachable.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from datetime import UTC, datetime
+from typing import Any
+
+from bson import BSON
+from pymongo import AsyncMongoClient, DeleteMany, ReturnDocument, UpdateOne
+from pymongo.errors import DuplicateKeyError, InvalidDocument, PyMongoError
+
+logger = logging.getLogger(__name__)
+
+SCHEMA_VERSION = 1
+MAX_BSON_BYTES = 15 * 1024 * 1024
+
+
+def _now() -> datetime:
+    return datetime.now(UTC)  # timezone-aware "now" in UTC
+
+
+def _doc_id(session_id: str, idx: int) -> str:
+    return f"{session_id}:{idx}"  # composite _id, e.g. "abc:3"
+
+
+def _safe_message_doc(message: dict[str, Any]) -> dict[str, Any]:
+    """Return ``message``, or a small marker if it cannot be stored in Mongo.
+
+    A message is swapped out when BSON encoding fails or exceeds MAX_BSON_BYTES.
+    """
+    try:
+        fits = len(BSON.encode({"message": message})) <= MAX_BSON_BYTES
+    except (InvalidDocument, OverflowError):
+        fits = False
+    if fits:
+        return message
+    return {
+        "role": "tool",
+        "content": (
+            "[SYSTEM: A single persisted message exceeded MongoDB's document "
+            "size/encoding limit and was replaced by this marker.]"
+        ),
+        "ml_intern_persistence_error": "message_too_large_or_invalid",
+    }
+
+
+class NoopSessionStore:
+    """Async no-op store used when Mongo is not configured; every method is inert."""
+
+    enabled = False  # MongoSessionStore sets its own flag to True after init()
+
+    async def init(self) -> None:
+        return None
+
+    async def close(self) -> None:
+        return None
+
+    async def upsert_session(self, **_: Any) -> None:  # arguments are ignored
+        return None
+
+    async def save_snapshot(self, **_: Any) -> None:  # arguments are ignored
+        return None
+
+    async def load_session(self, *_: Any, **__: Any) -> dict[str, Any] | None:
+        return None  # nothing is ever stored, so nothing can be loaded
+
+    async def list_sessions(self, *_: Any, **__: Any) -> list[dict[str, Any]]:
+        return []
+
+    async def soft_delete_session(self, *_: Any, **__: Any) -> None:
+        return None
+
+    async def update_session_fields(self, *_: Any, **__: Any) -> None:
+        return None
+
+    async def append_event(self, *_: Any, **__: Any) -> int | None:
+        return None  # no sequence number is assigned
+
+    async def load_events_after(self, *_: Any, **__: Any) -> list[dict[str, Any]]:
+        return []
+
+    async def append_trace_message(self, *_: Any, **__: Any) -> int | None:
+        return None  # no sequence number is assigned
+
+    async def get_quota(self, *_: Any, **__: Any) -> int | None:
+        return None  # None rather than 0: no quota information is available
+
+    async def try_increment_quota(self, *_: Any, **__: Any) -> int | None:
+        return None
+
+    async def refund_quota(self, *_: Any, **__: Any) -> None:
+        return None
+
+
+class MongoSessionStore(NoopSessionStore):
+    """MongoDB-backed session store."""
+
+    enabled = True
+
+    def __init__(self, uri: str, db_name: str) -> None:
+        self.uri = uri
+        self.db_name = db_name
+        self.enabled = False  # shadows the class-level True until init() succeeds
+        self.client: AsyncMongoClient | None = None  # created by init()
+        self.db = None  # database handle; set together with the client in init()
+
+    async def init(self) -> None:  # connect and verify; failure leaves the store disabled
+        try:
+            self.client = AsyncMongoClient(self.uri, serverSelectionTimeoutMS=3000)
+            self.db = self.client[self.db_name]
+            await self.client.admin.command("ping")  # raises if no server answers
+            await self._create_indexes()
+            self.enabled = True  # only reached when ping and indexing succeeded
+            logger.info("Mongo session persistence enabled (db=%s)", self.db_name)
+        except Exception as e:  # any failure disables persistence
+            logger.warning("Mongo session persistence disabled: %s", e)
+            self.enabled = False
+            if self.client is not None:
+                await self.client.close()  # release the half-initialised client
+            self.client = None
+            self.db = None
+
+    async def close(self) -> None:  # safe to call repeatedly or before init()
+        if self.client is not None:
+            await self.client.close()
+        self.client = None
+        self.db = None  # _ready() is False from here on
+
+    async def _create_indexes(self) -> None:
+        if self.db is None:
+            return
+        await self.db.sessions.create_index(  # matches list_sessions' filter and sort
+            [("user_id", 1), ("visibility", 1), ("updated_at", -1)]
+        )
+        await self.db.sessions.create_index(
+            [("visibility", 1), ("status", 1), ("last_active_at", -1)]
+        )
+        await self.db.session_messages.create_index(
+            [("session_id", 1), ("idx", 1)], unique=True  # one row per (session, idx)
+        )
+        await self.db.session_events.create_index(
+            [("session_id", 1), ("seq", 1)], unique=True  # one row per (session, seq)
+        )
+        await self.db.session_trace_messages.create_index(
+            [("session_id", 1), ("seq", 1)], unique=True  # one row per (session, seq)
+        )
+        await self.db.session_trace_messages.create_index([("created_at", -1)])
+
+    def _ready(self) -> bool:  # True between a successful init() and close()
+        return bool(self.enabled and self.db is not None)
+
+    async def upsert_session(
+        self,
+        *,
+        session_id: str,
+        user_id: str,
+        model: str,
+        title: str | None = None,
+        surface: str = "frontend",
+        created_at: datetime | None = None,
+        runtime_state: str = "idle",
+        status: str = "active",
+        message_count: int = 0,
+        turn_count: int = 0,
+        pending_approval: list[dict[str, Any]] | None = None,
+        claude_counted: bool = False,
+        notification_destinations: list[str] | None = None,
+    ) -> None:
+        if not self._ready():
+            return  # silently skipped while the store is not connected
+        now = _now()
+        await self.db.sessions.update_one(
+            {"_id": session_id},
+            {
+                "$setOnInsert": {  # written only when this call creates the document
+                    "_id": session_id,
+                    "session_id": session_id,
+                    "user_id": user_id,
+                    "surface": surface,
+                    "created_at": created_at or now,  # defaults to the insert time
+                    "schema_version": SCHEMA_VERSION,
+                    "visibility": "live",
+                },
+                "$set": {  # overwritten on every call
+                    "title": title,
+                    "model": model,
+                    "status": status,
+                    "runtime_state": runtime_state,
+                    "updated_at": now,
+                    "last_active_at": now,
+                    "message_count": message_count,
+                    "turn_count": turn_count,
+                    "pending_approval": pending_approval or [],  # None is stored as []
+                    "claude_counted": claude_counted,
+                    "notification_destinations": notification_destinations or [],
+                },
+            },
+            upsert=True,
+        )
+
+    async def save_snapshot(
+        self,
+        *,
+        session_id: str,
+        user_id: str,
+        model: str,
+        messages: list[dict[str, Any]],
+        title: str | None = None,
+        runtime_state: str = "idle",
+        status: str = "active",
+        turn_count: int = 0,
+        pending_approval: list[dict[str, Any]] | None = None,
+        claude_counted: bool = False,
+        created_at: datetime | None = None,
+        notification_destinations: list[str] | None = None,
+    ) -> None:
+        """Upsert session metadata, then mirror ``messages`` into session_messages."""
+        if not self._ready():
+            return
+        now = _now()
+        await self.upsert_session(
+            session_id=session_id,
+            user_id=user_id,
+            model=model,
+            title=title,
+            created_at=created_at,
+            runtime_state=runtime_state,
+            status=status,
+            message_count=len(messages),
+            turn_count=turn_count,
+            pending_approval=pending_approval,
+            claude_counted=claude_counted,
+            notification_destinations=notification_destinations,
+        )
+        writes: list[Any] = [
+            UpdateOne(
+                {"_id": _doc_id(session_id, position)},
+                {
+                    "$set": {
+                        "session_id": session_id,
+                        "idx": position,
+                        "message": _safe_message_doc(raw_message),
+                        "updated_at": now,
+                    },
+                    "$setOnInsert": {"created_at": now},
+                },
+                upsert=True,
+            )
+            for position, raw_message in enumerate(messages)
+        ]
+        # Trim rows left behind by an earlier, longer snapshot.
+        writes.append(DeleteMany({"session_id": session_id, "idx": {"$gte": len(messages)}}))
+        try:
+            await self.db.session_messages.bulk_write(writes, ordered=False)
+        except PyMongoError as e:
+            logger.warning("Failed to persist session %s snapshot: %s", session_id, e)
+
+    async def load_session(
+        self, session_id: str, *, include_deleted: bool = False
+    ) -> dict[str, Any] | None:
+        if not self._ready():
+            return None
+        meta = await self.db.sessions.find_one({"_id": session_id})
+        if not meta:
+            return None  # unknown session id
+        if meta.get("visibility") == "deleted" and not include_deleted:
+            return None  # soft-deleted sessions are hidden unless requested
+        cursor = self.db.session_messages.find({"session_id": session_id}).sort("idx", 1)
+        messages = [row.get("message") async for row in cursor]  # ordered by idx
+        return {"metadata": meta, "messages": messages}
+
+    async def list_sessions(
+        self, user_id: str, *, include_deleted: bool = False
+    ) -> list[dict[str, Any]]:
+        if not self._ready():
+            return []
+        query: dict[str, Any] = {"user_id": user_id}
+        if user_id == "dev":  # the "dev" user is not filtered by owner
+            query = {}
+        if not include_deleted:
+            query["visibility"] = {"$ne": "deleted"}
+        cursor = self.db.sessions.find(query).sort("updated_at", -1)  # newest first
+        return [row async for row in cursor]
+
+    async def soft_delete_session(self, session_id: str) -> None:
+        if not self._ready():
+            return
+        await self.db.sessions.update_one(  # only the sessions document is touched
+            {"_id": session_id},
+            {
+                "$set": {
+                    "visibility": "deleted",  # value load_session/list_sessions filter on
+                    "runtime_state": "idle",
+                    "updated_at": _now(),
+                }
+            },
+        )
+
+    async def update_session_fields(self, session_id: str, **fields: Any) -> None:
+        if not self._ready() or not fields:
+            return  # nothing to do without a store or without any field
+        fields["updated_at"] = _now()  # every partial update also bumps updated_at
+        await self.db.sessions.update_one({"_id": session_id}, {"$set": fields})
+
+    async def _next_seq(self, counter_id: str) -> int:
+        doc = await self.db.counters.find_one_and_update(
+            {"_id": counter_id},
+            {"$inc": {"seq": 1}},
+            upsert=True,  # a counter seen for the first time is created, so it starts at 1
+            return_document=ReturnDocument.AFTER,  # read the value after the increment
+        )
+        return int(doc["seq"])
+
+    async def append_event(
+        self, session_id: str, event_type: str, data: dict[str, Any] | None
+    ) -> int | None:
+        if not self._ready():
+            return None
+        try:
+            seq = await self._next_seq(f"event:{session_id}")  # per-session counter
+            await self.db.session_events.insert_one(
+                {
+                    "_id": _doc_id(session_id, seq),
+                    "session_id": session_id,
+                    "seq": seq,
+                    "event_type": event_type,
+                    "data": data or {},  # None is stored as an empty dict
+                    "created_at": _now(),
+                }
+            )
+            return seq
+        except PyMongoError as e:  # best-effort: log at debug level and return None
+            logger.debug("Failed to append event for %s: %s", session_id, e)
+            return None
+
+    async def load_events_after(self, session_id: str, after_seq: int = 0) -> list[dict[str, Any]]:
+        if not self._ready():
+            return []
+        cursor = self.db.session_events.find(  # strictly after after_seq (None counts as 0)
+            {"session_id": session_id, "seq": {"$gt": int(after_seq or 0)}}
+        ).sort("seq", 1)  # ascending seq
+        return [row async for row in cursor]
+
+    async def append_trace_message(
+        self, session_id: str, message: dict[str, Any], source: str = "message"
+    ) -> int | None:
+        if not self._ready():
+            return None
+        try:
+            seq = await self._next_seq(f"trace:{session_id}")  # separate from event seq
+            await self.db.session_trace_messages.insert_one(
+                {
+                    "_id": _doc_id(session_id, seq),
+                    "session_id": session_id,
+                    "seq": seq,
+                    "role": message.get("role"),  # taken from the original message
+                    "message": _safe_message_doc(message),  # may be the size marker
+                    "source": source,
+                    "created_at": _now(),
+                }
+            )
+            return seq
+        except PyMongoError as e:  # best-effort: log at debug level and return None
+            logger.debug("Failed to append trace message for %s: %s", session_id, e)
+            return None
+
+    async def get_quota(self, user_id: str, day: str) -> int | None:
+        if not self._ready():
+            return None  # store unavailable; distinct from a real count of 0
+        doc = await self.db.claude_quotas.find_one({"_id": f"{user_id}:{day}"})
+        return int(doc.get("count", 0)) if doc else 0  # no document means nothing used
+
+    async def try_increment_quota(self, user_id: str, day: str, cap: int) -> int | None:
+        """Claim one unit of ``user_id``'s quota for ``day`` if the cap allows it.
+
+        Returns the new count, or ``None`` when the store is unavailable or the
+        cap is already reached (which includes any ``cap`` below 1).
+        """
+        # Without the cap check the first-use insert below would hand out one
+        # unit even when the cap is zero.
+        if not self._ready() or cap < 1:
+            return None
+        key = f"{user_id}:{day}"
+        now = _now()
+        first_use = dict(_id=key, user_id=user_id, day=day, count=1, updated_at=now)
+        try:
+            await self.db.claude_quotas.insert_one(first_use)
+            return 1
+        except DuplicateKeyError:
+            pass  # a counter for this user/day already exists; bump it below
+        doc = await self.db.claude_quotas.find_one_and_update(
+            {"_id": key, "count": {"$lt": cap}},
+            {"$inc": {"count": 1}, "$set": {"updated_at": now}},
+            return_document=ReturnDocument.AFTER,
+        )
+        return int(doc["count"]) if doc else None
+
+    async def refund_quota(self, user_id: str, day: str) -> None:
+        if not self._ready():
+            return
+        await self.db.claude_quotas.update_one(
+            {"_id": f"{user_id}:{day}", "count": {"$gt": 0}},  # never goes below zero
+            {"$inc": {"count": -1}, "$set": {"updated_at": _now()}},
+        )
+
+
+_store: NoopSessionStore | MongoSessionStore | None = None
+
+
+def get_session_store() -> NoopSessionStore | MongoSessionStore:
+    global _store
+    if _store is None:  # first call: pick the implementation from the environment
+        uri = os.environ.get("MONGODB_URI")
+        db_name = os.environ.get("MONGODB_DB", "ml-intern")
+        _store = MongoSessionStore(uri, db_name) if uri else NoopSessionStore()
+    return _store  # same instance on every later call; init() is not called here
+
+
+def _reset_store_for_tests(store: NoopSessionStore | MongoSessionStore | None = None) -> None:
+    global _store
+    _store = store  # None makes the next get_session_store() rebuild from the env
diff --git a/agent/core/session_uploader.py b/agent/core/session_uploader.py
index f22b520103c6b9c1b42878b3636c4bc10d674c70..d18ec6b8a49253716b00ef752544991dd07dba89 100644
--- a/agent/core/session_uploader.py
+++ b/agent/core/session_uploader.py
@@ -90,9 +90,11 @@ def upload_session_as_file(
         # across sessions with different tool rosters.
         session_row = {
             "session_id": data["session_id"],
+            "user_id": data.get("user_id"),
             "session_start_time": data["session_start_time"],
             "session_end_time": data["session_end_time"],
             "model_name": data["model_name"],
+            "total_cost_usd": data.get("total_cost_usd"),
             "messages": json.dumps(scrubbed_messages),
             "events": json.dumps(scrubbed_events),
             "tools": json.dumps(scrubbed_tools),
diff --git a/agent/core/tools.py b/agent/core/tools.py
index 9bbf91d798514fddbbae1b7c68d9f1826e82d824..ef2c57bc19478043996597083cba54a243cdf4cc 100644
--- a/agent/core/tools.py
+++ b/agent/core/tools.py
@@ -46,10 +46,12 @@ from agent.tools.hf_repo_git_tool import (
     hf_repo_git_handler,
 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
+from agent.tools.notify_tool import NOTIFY_TOOL_SPEC, notify_handler
 from agent.tools.papers_tool import HF_PAPERS_TOOL_SPEC, hf_papers_handler
 from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
 from agent.tools.research_tool import RESEARCH_TOOL_SPEC, research_handler
 from agent.tools.sandbox_tool import get_sandbox_tools
+from agent.tools.web_search_tool import WEB_SEARCH_TOOL_SPEC, web_search_handler
 
 # NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
 # from agent.tools.private_hf_repo_tools import (
@@ -310,6 +312,12 @@ def create_builtin_tools(local_mode: bool = False) -> list[ToolSpec]:
             parameters=HF_PAPERS_TOOL_SPEC["parameters"],
             handler=hf_papers_handler,
         ),
+        ToolSpec(
+            name=WEB_SEARCH_TOOL_SPEC["name"],
+            description=WEB_SEARCH_TOOL_SPEC["description"],
+            parameters=WEB_SEARCH_TOOL_SPEC["parameters"],
+            handler=web_search_handler,
+        ),
         # Dataset inspection tool (unified)
         ToolSpec(
             name=HF_INSPECT_DATASET_TOOL_SPEC["name"],
@@ -324,6 +332,12 @@ def create_builtin_tools(local_mode: bool = False) -> list[ToolSpec]:
             parameters=PLAN_TOOL_SPEC["parameters"],
             handler=plan_tool_handler,
         ),
+        ToolSpec(
+            name=NOTIFY_TOOL_SPEC["name"],
+            description=NOTIFY_TOOL_SPEC["description"],
+            parameters=NOTIFY_TOOL_SPEC["parameters"],
+            handler=notify_handler,
+        ),
         ToolSpec(
             name=HF_JOBS_TOOL_SPEC["name"],
             description=HF_JOBS_TOOL_SPEC["description"],
diff --git a/agent/main.py b/agent/main.py
index f601ab545687259ddbca3895235c5eb7fb31a027..f500cc5fe756e04e5fac90184b91c5d5198a51aa 100644
--- a/agent/main.py
+++ b/agent/main.py
@@ -23,8 +23,10 @@ from prompt_toolkit import PromptSession
 from agent.config import load_config
 from agent.core.agent_loop import submission_loop
 from agent.core import model_switcher
+from agent.core.hf_tokens import resolve_hf_token
 from agent.core.session import OpType
 from agent.core.tools import ToolRouter
+from agent.messaging.gateway import NotificationGateway
 from agent.utils.reliability_checks import check_training_script_save_pattern
 from agent.utils.terminal_display import (
     get_console,
@@ -69,26 +71,15 @@ def _safe_get_args(arguments: dict) -> dict:
     return args if isinstance(args, dict) else {}
 
 
-def _get_hf_token() -> str | None:
-    """Get HF token from environment, huggingface_hub API, or cached token file."""
-    token = os.environ.get("HF_TOKEN")
-    if token:
-        return token
+def _get_hf_user(token: str | None) -> str | None:
+    """Resolve the HF username for a token, if available."""
+    if not token:
+        return None
     try:
         from huggingface_hub import HfApi
-        api = HfApi()
-        token = api.token
-        if token:
-            return token
+        return HfApi(token=token).whoami().get("name")
     except Exception:
-        pass
-    # Fallback: read the cached token file directly
-    token_path = Path.home() / ".cache" / "huggingface" / "token"
-    if token_path.exists():
-        token = token_path.read_text().strip()
-        if token:
-            return token
-    return None
+        return None
 
 
 async def _prompt_and_save_hf_token(prompt_session: PromptSession) -> str:
@@ -342,6 +333,9 @@ async def event_listener(
                 stream_buf.discard()
                 print_turn_complete()
                 print_plan()
+                session = session_holder[0] if session_holder else None
+                if session is not None:
+                    await session.send_deferred_turn_complete_notification(event)
                 turn_complete_event.set()
             elif event.event_type == "interrupted":
                 shimmer.stop()
@@ -758,7 +752,7 @@ async def _handle_slash_command(
         normalized = arg.removeprefix("huggingface/")
         session = session_holder[0] if session_holder else None
         await model_switcher.probe_and_switch_model(
-            normalized, config, session, console, _get_hf_token(),
+            normalized, config, session, console, resolve_hf_token(),
         )
         return None
 
@@ -817,7 +811,7 @@ async def _handle_slash_command(
     return None
 
 
-async def main():
+async def main(model: str | None = None):
     """Interactive chat with the agent"""
 
     # Clear screen
@@ -827,19 +821,16 @@ async def main():
     prompt_session = PromptSession()
 
     # HF token — required, prompt if missing
-    hf_token = _get_hf_token()
+    hf_token = resolve_hf_token()
     if not hf_token:
         hf_token = await _prompt_and_save_hf_token(prompt_session)
 
-    config = load_config(CLI_CONFIG_PATH)
+    config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
+    if model:
+        config.model_name = model
 
     # Resolve username for banner
-    hf_user = None
-    try:
-        from huggingface_hub import HfApi
-        hf_user = HfApi(token=hf_token).whoami().get("name")
-    except Exception:
-        pass
+    hf_user = _get_hf_user(hf_token)
 
     print_banner(model=config.model_name, hf_user=hf_user)
 
@@ -857,6 +848,8 @@ async def main():
     turn_complete_event.set()
     ready_event = asyncio.Event()
 
+    notification_gateway = NotificationGateway(config.messaging)
+    await notification_gateway.start()
     # Create tool router with local mode
     tool_router = ToolRouter(config.mcpServers, hf_token=hf_token, local_mode=True)
 
@@ -871,8 +864,12 @@ async def main():
             tool_router=tool_router,
             session_holder=session_holder,
             hf_token=hf_token,
+            user_id=hf_user,
             local_mode=True,
             stream=True,
+            notification_gateway=notification_gateway,
+            notification_destinations=config.messaging.default_auto_destinations(),
+            defer_turn_complete_notification=True,
         )
     )
 
@@ -1028,6 +1025,8 @@ async def main():
         agent_task.cancel()
         # Agent didn't shut down cleanly — close MCP explicitly
         await tool_router.__aexit__(None, None, None)
+    finally:
+        await notification_gateway.close()
 
     # Now safe to cancel the listener (agent is done emitting events)
     listener_task.cancel()
@@ -1047,15 +1046,18 @@ async def headless_main(
     logging.basicConfig(level=logging.WARNING)
     _configure_runtime_logging()
 
-    hf_token = _get_hf_token()
+    hf_token = resolve_hf_token()
     if not hf_token:
         print("ERROR: No HF token found. Set HF_TOKEN or run `huggingface-cli login`.", file=sys.stderr)
         sys.exit(1)
 
     print(f"HF token loaded", file=sys.stderr)
 
-    config = load_config(CLI_CONFIG_PATH)
+    config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
     config.yolo_mode = True  # Auto-approve everything in headless mode
+    notification_gateway = NotificationGateway(config.messaging)
+    await notification_gateway.start()
+    hf_user = _get_hf_user(hf_token)
 
     if model:
         config.model_name = model
@@ -1082,8 +1084,12 @@ async def headless_main(
             tool_router=tool_router,
             session_holder=session_holder,
             hf_token=hf_token,
+            user_id=hf_user,
             local_mode=True,
             stream=stream,
+            notification_gateway=notification_gateway,
+            notification_destinations=config.messaging.default_auto_destinations(),
+            defer_turn_complete_notification=True,
         )
     )
 
@@ -1209,6 +1215,10 @@ async def headless_main(
             stream_buf.discard()
             history_size = event.data.get("history_size", "?") if event.data else "?"
             print(f"\n--- Agent {event.event_type} (history_size={history_size}) ---", file=sys.stderr)
+            if event.event_type == "turn_complete":
+                session = session_holder[0] if session_holder else None
+                if session is not None:
+                    await session.send_deferred_turn_complete_notification(event)
             break
 
     # Shutdown
@@ -1222,6 +1232,8 @@ async def headless_main(
     except asyncio.TimeoutError:
         agent_task.cancel()
         await tool_router.__aexit__(None, None, None)
+    finally:
+        await notification_gateway.close()
 
 
 def cli():
@@ -1252,7 +1264,7 @@ def cli():
                 max_iter = 10_000  # effectively unlimited
             asyncio.run(headless_main(args.prompt, model=args.model, max_iterations=max_iter, stream=not args.no_stream))
         else:
-            asyncio.run(main())
+            asyncio.run(main(model=args.model))
     except KeyboardInterrupt:
         print("\n\nGoodbye!")
 
diff --git a/agent/messaging/__init__.py b/agent/messaging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c399d254e30fcbce555d6f51b810440b1171ec1a
--- /dev/null
+++ b/agent/messaging/__init__.py
@@ -0,0 +1,15 @@
+from agent.messaging.gateway import NotificationGateway
+from agent.messaging.models import (
+    MessagingConfig,
+    NotificationRequest,
+    NotificationResult,
+    SUPPORTED_AUTO_EVENT_TYPES,
+)
+
+__all__ = [
+    "MessagingConfig",
+    "NotificationGateway",
+    "NotificationRequest",
+    "NotificationResult",
+    "SUPPORTED_AUTO_EVENT_TYPES",
+]
diff --git a/agent/messaging/base.py b/agent/messaging/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf1d73894fa85ce066fa289902c4d6b783ceaa11
--- /dev/null
+++ b/agent/messaging/base.py
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+
+import httpx
+
+from agent.messaging.models import DestinationConfig, NotificationRequest, NotificationResult
+
+
+class NotificationError(Exception):
+    """Delivery failed; not retried unless it is a RetryableNotificationError."""
+
+
+class RetryableNotificationError(NotificationError):
+    """Delivery failed transiently; the gateway retries it after a backoff delay."""
+
+
+class NotificationProvider(ABC):
+    provider_name: str  # identifier the gateway reports in NotificationResult.provider
+
+    @abstractmethod
+    async def send(
+        self,
+        client: httpx.AsyncClient,
+        destination_name: str,
+        destination: DestinationConfig,
+        request: NotificationRequest,
+    ) -> NotificationResult:
+        """Deliver *request* to one destination, raising NotificationError on failure."""
diff --git a/agent/messaging/gateway.py b/agent/messaging/gateway.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c4704baafe9eadea23a336f691dc96db934e79
--- /dev/null
+++ b/agent/messaging/gateway.py
@@ -0,0 +1,166 @@
+import asyncio
+import logging
+from collections.abc import Iterable
+
+import httpx
+
+from agent.messaging.base import (
+    NotificationError,
+    NotificationProvider,
+    RetryableNotificationError,
+)
+from agent.messaging.models import (
+    MessagingConfig,
+    NotificationRequest,
+    NotificationResult,
+)
+from agent.messaging.slack import SlackProvider
+
+logger = logging.getLogger(__name__)
+
+_RETRY_DELAYS = (1, 2, 4)
+
+
+class NotificationGateway:  # sends NotificationRequests via providers, inline or through a queue
+    def __init__(self, config: MessagingConfig):
+        self.config = config
+        self._providers: dict[str, NotificationProvider] = {  # keyed by destination.provider
+            "slack": SlackProvider(),
+        }
+        self._queue: asyncio.Queue[NotificationRequest] = asyncio.Queue()  # drained by _worker
+        self._worker_task: asyncio.Task | None = None  # created by start()
+        self._client: httpx.AsyncClient | None = None  # shared HTTP client, created by start()
+
+    @property
+    def enabled(self) -> bool:
+        return self.config.enabled
+
+    async def start(self) -> None:
+        if not self.enabled or self._worker_task is not None:  # disabled or already started
+            return
+        self._client = httpx.AsyncClient(timeout=10.0)
+        self._worker_task = asyncio.create_task(self._worker(), name="notification-gateway")
+
+    async def flush(self) -> None:
+        if not self.enabled:
+            return
+        await self._queue.join()  # returns once every queued request has been task_done()
+
+    async def close(self) -> None:
+        if not self.enabled:
+            return
+        await self.flush()  # deliver anything still queued before stopping the worker
+        if self._worker_task is not None:
+            self._worker_task.cancel()
+            try:
+                await self._worker_task
+            except asyncio.CancelledError:
+                pass  # expected: the task was cancelled just above
+            self._worker_task = None
+        if self._client is not None:
+            await self._client.aclose()
+            self._client = None
+
+    async def send(self, request: NotificationRequest) -> NotificationResult:
+        if not self.enabled:  # lookup failures below are returned as ok=False, not raised
+            return NotificationResult(
+                destination=request.destination,
+                ok=False,
+                provider="disabled",
+                error="Messaging is disabled",
+            )
+
+        destination = self.config.get_destination(request.destination)
+        if destination is None:
+            return NotificationResult(
+                destination=request.destination,
+                ok=False,
+                provider="unknown",
+                error=f"Unknown destination '{request.destination}'",
+            )
+
+        provider = self._providers.get(destination.provider)
+        if provider is None:
+            return NotificationResult(
+                destination=request.destination,
+                ok=False,
+                provider=destination.provider,
+                error=f"No provider implementation for '{destination.provider}'",
+            )
+        return await self._send_with_retries(provider, request.destination, destination, request)
+
+    async def send_many(
+        self, requests: Iterable[NotificationRequest]
+    ) -> list[NotificationResult]:
+        results: list[NotificationResult] = []
+        for request in requests:  # sent one at a time; results keep the request order
+            results.append(await self.send(request))
+        return results
+
+    async def enqueue(self, request: NotificationRequest) -> bool:
+        if not self.enabled or self._worker_task is None:  # no worker would drain the queue
+            return False
+        await self._queue.put(request)
+        return True
+
+    async def _worker(self) -> None:
+        while True:
+            request = await self._queue.get()
+            try:
+                result = await self.send(request)
+                if not result.ok:
+                    logger.warning(
+                        "Notification delivery failed for %s: %s",
+                        request.destination,
+                        result.error,
+                    )
+            except Exception:
+                logger.exception("Unexpected notification worker failure")
+            finally:
+                self._queue.task_done()  # always mark done so flush() cannot wait forever
+
+    async def _send_with_retries(
+        self,
+        provider: NotificationProvider,
+        destination_name: str,
+        destination,
+        request: NotificationRequest,
+    ) -> NotificationResult:
+        client = self._client or httpx.AsyncClient(timeout=10.0)  # one-off client if none is shared
+        owns_client = self._client is None  # only a one-off client is closed in the finally below
+        try:
+            for attempt in range(len(_RETRY_DELAYS) + 1):  # first try plus one retry per delay
+                try:
+                    return await provider.send(client, destination_name, destination, request)
+                except RetryableNotificationError as exc:
+                    if attempt >= len(_RETRY_DELAYS):  # retries exhausted
+                        return NotificationResult(
+                            destination=destination_name,
+                            ok=False,
+                            provider=provider.provider_name,
+                            error=str(exc),
+                        )
+                    delay = _RETRY_DELAYS[attempt]
+                    logger.warning(
+                        "Retrying notification to %s in %ss after transient error: %s",
+                        destination_name,
+                        delay,
+                        exc,
+                    )
+                    await asyncio.sleep(delay)
+                except NotificationError as exc:  # permanent failure: report without retrying
+                    return NotificationResult(
+                        destination=destination_name,
+                        ok=False,
+                        provider=provider.provider_name,
+                        error=str(exc),
+                    )
+            return NotificationResult(  # defensive fallback; the loop returns on its last attempt
+                destination=destination_name,
+                ok=False,
+                provider=provider.provider_name,
+                error="Notification delivery exhausted retries",
+            )
+        finally:
+            if owns_client:
+                await client.aclose()
diff --git a/agent/messaging/models.py b/agent/messaging/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..25f645fe92fa70901843e68be82d82f3a78e0d16
--- /dev/null
+++ b/agent/messaging/models.py
@@ -0,0 +1,123 @@
+from typing import Annotated, Literal
+
+from pydantic import BaseModel, Field, field_validator, model_validator
+
+_DESTINATION_NAME_CHARS = set("abcdefghijklmnopqrstuvwxyz0123456789._-")
+SUPPORTED_AUTO_EVENT_TYPES = {"approval_required", "error", "turn_complete"}
+
+
+class SlackDestinationConfig(BaseModel):  # one Slack channel plus its send permissions
+    provider: Literal["slack"] = "slack"  # discriminator tag for DestinationConfig
+    token: str
+    channel: str
+    allow_agent_tool: bool = False
+    allow_auto_events: bool = False
+    username: str | None = None
+    icon_emoji: str | None = None
+
+    @field_validator("token", "channel")
+    @classmethod
+    def _require_non_empty(cls, value: str) -> str:
+        trimmed = value.strip()
+        if trimmed:
+            return trimmed
+        raise ValueError("must not be empty")
+
+
+DestinationConfig = Annotated[SlackDestinationConfig, Field(discriminator="provider")]
+
+
+class MessagingConfig(BaseModel):
+    """Top-level notification settings.
+
+    ``enabled`` is the master switch, ``auto_event_types`` holds the event types
+    selected for automatic notifications, and ``destinations`` maps names to targets.
+    """
+
+    enabled: bool = False
+    auto_event_types: list[str] = Field(
+        default_factory=lambda: ["approval_required", "error", "turn_complete"]
+    )
+    destinations: dict[str, DestinationConfig] = Field(default_factory=dict)
+
+    @field_validator("destinations")
+    @classmethod
+    def _validate_destination_names(
+        cls, destinations: dict[str, DestinationConfig]
+    ) -> dict[str, DestinationConfig]:
+        """Reject empty names and names using characters outside the allowed set."""
+        for key in destinations:
+            if not key or not set(key) <= _DESTINATION_NAME_CHARS:
+                raise ValueError(
+                    "destination names must use lowercase letters, digits, '.', '_' or '-'"
+                )
+        return destinations
+
+    @field_validator("auto_event_types")
+    @classmethod
+    def _validate_auto_event_types(cls, event_types: list[str]) -> list[str]:
+        """Reject unsupported event types and drop duplicates, keeping first-seen order."""
+        for candidate in event_types:
+            if candidate not in SUPPORTED_AUTO_EVENT_TYPES:
+                raise ValueError(f"unsupported auto event type '{candidate}'")
+        return list(dict.fromkeys(event_types))
+
+    @model_validator(mode="after")
+    def _require_destinations_when_enabled(self) -> "MessagingConfig":
+        """Refuse an enabled configuration that has no destinations."""
+        if self.enabled and not self.destinations:
+            raise ValueError("messaging.enabled requires at least one destination")
+        return self
+
+    def get_destination(self, name: str) -> DestinationConfig | None:
+        """Return the destination registered under *name*, or None."""
+        return self.destinations.get(name)
+
+    def can_agent_tool_send(self, name: str) -> bool:
+        """Return True when *name* exists and has ``allow_agent_tool`` set."""
+        target = self.get_destination(name)
+        return bool(target and target.allow_agent_tool)
+
+    def can_auto_send(self, name: str) -> bool:
+        """Return True when *name* exists and has ``allow_auto_events`` set."""
+        target = self.get_destination(name)
+        return bool(target and target.allow_auto_events)
+
+    def default_auto_destinations(self) -> list[str]:
+        """List destinations that allow auto events; empty when messaging is disabled."""
+        if not self.enabled:
+            return []
+        return [name for name in self.destinations if self.can_auto_send(name)]
+
+
+class NotificationRequest(BaseModel):  # one message addressed to a named destination
+    destination: str  # destination name, resolved by the gateway via get_destination()
+    title: str | None = None
+    message: str
+    severity: Literal["info", "success", "warning", "error"] = "info"
+    metadata: dict[str, str] = Field(default_factory=dict)
+    event_type: str | None = None
+
+    @field_validator("destination", "message")
+    @classmethod
+    def _require_text(cls, value: str) -> str:
+        text = value.strip()
+        if text:
+            return text
+        raise ValueError("must not be empty")
+
+    @field_validator("title")
+    @classmethod
+    def _normalize_title(cls, value: str | None) -> str | None:
+        # Whitespace-only titles are treated the same as a missing title.
+        if value is None:
+            return None
+        return value.strip() or None
+
+
+class NotificationResult(BaseModel):  # outcome of sending one NotificationRequest
+    destination: str
+    ok: bool
+    provider: str  # provider name, or "disabled"/"unknown" when the gateway short-circuits
+    error: str | None = None  # failure description when ok is False
+    external_id: str | None = None  # the Slack provider stores the message "ts" here
diff --git a/agent/messaging/slack.py b/agent/messaging/slack.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1fb7c18eef91396e566fb04b4f6411f9184a2be
--- /dev/null
+++ b/agent/messaging/slack.py
@@ -0,0 +1,186 @@
+import json
+import re
+
+import httpx
+
+from agent.messaging.base import (
+    NotificationError,
+    NotificationProvider,
+    RetryableNotificationError,
+)
+from agent.messaging.models import (
+    NotificationRequest,
+    NotificationResult,
+    SlackDestinationConfig,
+)
+
+_SEVERITY_PREFIX = {
+    "info": "[INFO]",
+    "success": "[SUCCESS]",
+    "warning": "[WARNING]",
+    "error": "[ERROR]",
+}
+
+
+def _format_slack_mrkdwn(content: str) -> str:
+    """Convert common Markdown to Slack mrkdwn and escape &, <, > in plain text."""
+    if not content:
+        return content
+
+    placeholders: dict[str, str] = {}
+    placeholder_index = 0
+
+    def placeholder(value: str) -> str:
+        nonlocal placeholder_index
+        key = f"\x00SLACK{placeholder_index}\x00"
+        placeholder_index += 1
+        placeholders[key] = value
+        return key
+
+    text = content
+
+    # Protect code before any formatting conversion. Slack's mrkdwn ignores
+    # formatting inside backticks, so these regions should stay byte-for-byte.
+    text = re.sub(
+        r"(```(?:[^\n]*\n)?[\s\S]*?```)",
+        lambda match: placeholder(match.group(0)),
+        text,
+    )
+    text = re.sub(r"(`[^`\n]+`)", lambda match: placeholder(match.group(0)), text)
+
+    def convert_markdown_link(match: re.Match[str]) -> str:
+        label = match.group(1)
+        url = match.group(2).strip()
+        if url.startswith("<") and url.endswith(">"):
+            url = url[1:-1].strip()
+        return placeholder(f"<{url}|{label}>")
+
+    text = re.sub(
+        r"\[([^\]]+)\]\(([^()]*(?:\([^()]*\)[^()]*)*)\)",
+        convert_markdown_link,
+        text,
+    )
+
+    # Preserve existing Slack entities and manual mrkdwn links before escaping.
+    text = re.sub(
+        r"(<(?:[@#!]|(?:https?|mailto|tel):)[^>\n]+>)",
+        lambda match: placeholder(match.group(1)),
+        text,
+    )
+    text = re.sub(
+        r"^(>+\s)",
+        lambda match: placeholder(match.group(0)),
+        text,
+        flags=re.MULTILINE,
+    )
+    # Normalize entities; "&amp;" is decoded last so "&amp;lt;" is not decoded twice.
+    text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
+    text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+
+    def convert_header(match: re.Match[str]) -> str:
+        header = match.group(1).strip()
+        header = re.sub(r"\*\*(.+?)\*\*", r"\1", header)
+        return placeholder(f"*{header}*")
+
+    text = re.sub(r"^#{1,6}\s+(.+)$", convert_header, text, flags=re.MULTILINE)
+    text = re.sub(
+        r"\*\*\*(.+?)\*\*\*",
+        lambda match: placeholder(f"*_{match.group(1)}_*"),
+        text,
+    )
+    text = re.sub(
+        r"\*\*(.+?)\*\*",
+        lambda match: placeholder(f"*{match.group(1)}*"),
+        text,
+    )
+    text = re.sub(
+        r"(?<!\*)\*([^*\n]+)\*(?!\*)",
+        lambda match: placeholder(f"_{match.group(1)}_"),
+        text,
+    )
+    text = re.sub(
+        r"~~(.+?)~~",
+        lambda match: placeholder(f"~{match.group(1)}~"),
+        text,
+    )
+    # Restore newest-first: a later placeholder's value may contain earlier keys.
+    for key in reversed(placeholders):
+        text = text.replace(key, placeholders[key])
+
+    return text
+
+
+def _format_text(request: NotificationRequest) -> str:
+    """Render header, message, then ``key: value`` metadata lines as Slack mrkdwn."""
+    tag = _SEVERITY_PREFIX[request.severity]
+    # A title, when present, shares the first line with the severity tag.
+    header = f"{tag} {request.title}" if request.title else tag
+    body = [
+        header,
+        request.message,
+        *(f"{name}: {text}" for name, text in request.metadata.items()),
+    ]
+    return _format_slack_mrkdwn("\n".join(body))
+
+
+class SlackProvider(NotificationProvider):
+    """Post notifications to a Slack channel through ``chat.postMessage``."""
+    provider_name = "slack"
+
+    async def send(
+        self,
+        client: httpx.AsyncClient,
+        destination_name: str,
+        destination: SlackDestinationConfig,
+        request: NotificationRequest,
+    ) -> NotificationResult:
+        """Post *request* to Slack, raising a NotificationError when delivery fails."""
+        payload = {
+            "channel": destination.channel,
+            "text": _format_text(request),
+            "mrkdwn": True,
+            "unfurl_links": False,
+            "unfurl_media": False,
+        }
+        if destination.username:
+            payload["username"] = destination.username
+        if destination.icon_emoji:
+            payload["icon_emoji"] = destination.icon_emoji
+
+        try:
+            response = await client.post(
+                "https://slack.com/api/chat.postMessage",
+                headers={
+                    "Authorization": f"Bearer {destination.token}",
+                    "Content-Type": "application/json; charset=utf-8",
+                },
+                content=json.dumps(payload),
+            )
+        except httpx.TimeoutException as exc:
+            raise RetryableNotificationError("Slack request timed out") from exc
+        except httpx.TransportError as exc:
+            raise RetryableNotificationError("Slack transport error") from exc
+        # Rate limits (429) and 5xx are transient; any other 4xx is permanent.
+        if response.status_code == 429 or response.status_code >= 500:
+            raise RetryableNotificationError(f"Slack HTTP {response.status_code}")
+        if response.status_code >= 400:
+            raise NotificationError(f"Slack HTTP {response.status_code}")
+
+        try:
+            data = response.json()
+        except ValueError as exc:
+            raise RetryableNotificationError("Slack returned invalid JSON") from exc
+        if not isinstance(data, dict):
+            raise RetryableNotificationError("Slack returned an unexpected JSON payload")
+        if not data.get("ok"):
+            error = str(data.get("error") or "unknown_error")
+            if error == "ratelimited":
+                raise RetryableNotificationError(error)
+            raise NotificationError(error)
+
+        return NotificationResult(
+            destination=destination_name,
+            ok=True,
+            provider=self.provider_name,
+            external_id=str(data["ts"]) if data.get("ts") else None,
+        )
diff --git a/agent/prompts/system_prompt_v3.yaml b/agent/prompts/system_prompt_v3.yaml
index befa56bf7a68d6a9a776fa64d8e27e28d3131cbe..cb63c901e699f68687353121f4d166a8a059efb5 100644
--- a/agent/prompts/system_prompt_v3.yaml
+++ b/agent/prompts/system_prompt_v3.yaml
@@ -1,5 +1,5 @@
 system_prompt: |
-  You are Hugging Face Agent, an ML engineering assistant with {{ num_tools }} tools for training, fine-tuning, data processing, inference, and evaluation on the Hugging Face ecosystem.
+  You are ML Intern, an ML engineering assistant with {{ num_tools }} tools for training, fine-tuning, data processing, inference, and evaluation on the Hugging Face (HF) ecosystem.
 
   Your goal is to complete what the user requested with zero errors. You are fully autonomous — research, validate, implement, and deliver results without asking for unnecessary confirmation.
 
@@ -28,7 +28,7 @@ system_prompt: |
 
   # Mistakes you WILL make without research
 
-  HALLUCINATED IMPORTS: You will import from modules that were renamed or removed. Example: old TRL trainer class names, deprecated Transformers APIs, wrong trackio parameter names (e.g. `run_name` instead of `name`). Fix: read a current example script first.
+  HALLUCINATED IMPORTS: You will import from modules that were renamed or removed. Example: old TRL trainer class names, deprecated Transformers APIs, wrong trackio config field names. Fix: read a current example script first.
 
   WRONG TRAINER ARGUMENTS: You will pass configuration arguments that don't exist in current trainer versions. Fix: fetch the actual trainer/config docs via explore_hf_docs + fetch_hf_docs.
 
@@ -60,6 +60,38 @@ system_prompt: |
     DPO: "prompt", "chosen", "rejected"
     GRPO: "prompt"
 
+  # Trackio
+
+  Trackio is natively integrated with Transformers Trainer and all TRL trainers — the built-in TrackioCallback handles init/log/finish. In TrainingArguments/SFTConfig/DPOConfig/GRPOConfig set:
+    report_to="trackio"
+    run_name="<descriptive-run-name>"          # e.g. "sft_qwen3-4b_lr2e-5_bs128"
+    project="<descriptive-project-name>"       # keeps related runs grouped so you can compare them
+    trackio_space_id="<username>/mlintern-<8-char-id>"   # creates a public dashboard Space
+  `project` and `trackio_space_id` can also be set via TRACKIO_PROJECT / TRACKIO_SPACE_ID env vars.
+
+  Alerts are how iterations decide what to change. Use trackio.alert(title, text, level) at every decision point in training. Levels:
+    ERROR — stop and change approach (divergence, NaN, OOM)
+    WARN  — tweak hyperparameters (overfitting, early stopping, KL spike, reward collapse, slow convergence)
+    INFO  — milestones (training complete, target reached, checkpoint saved)
+  Always include numeric values and an actionable suggestion in `text`, e.g. "loss=12.4 at step 200 — lr likely too high, try ×0.1". A future call must be able to parse it and act on it.
+
+  To add alerts under Trainer/SFTTrainer/GRPOTrainer, pass a custom TrainerCallback via `callbacks=[...]` that calls trackio.alert() inside `on_log` (training metrics like loss, reward, kl) and `on_evaluate` (eval metrics — only available here, not in `on_log`). Keep each `if` simple: one metric, one threshold. Conditions stay easy to adjust between runs.
+
+  Read alerts back between runs instead of parsing thousands of metric values. CLI — always use --json:
+    trackio get alerts --project <p> --run <r> --json
+    trackio get alerts --project <p> --since <iso8601> --json   # incremental polling
+    trackio get run    --project <p> --run <r> --json
+    trackio get metric --project <p> --run <r> --metric <m> --json
+    trackio list runs  --project <p> --json
+  Python: api = trackio.Api(); api.alerts(<p>, run=<r>, since=<ts>); api.runs(<p>) (each run has .name, .config, .alerts()).
+
+  Drive the next config from prior alerts:
+    diverged       → lr × 0.1
+    overfitting    → weight_decay × 10 or reduce capacity
+    early stopping → lr × 0.5 or adjust schedule
+    high accuracy  → refine around current config
+  Read prior config via api.runs(...).config and only mutate keys the alerts justify changing.
+
   # Data audit
 
   Before working with any dataset, audit it first. Do not assume you know what the data looks like — inspect it.
@@ -75,7 +107,7 @@ system_prompt: |
     - Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
     - push_to_hub=True and hub_model_id set
     - timeout: [value] (based on: [model size] on [hardware])
-    - Trackio monitoring included and working
+    - Trackio monitoring included and deploying metrics to a public Space
 
   If you cannot fill in all items, stop and complete the missing steps first.
 
@@ -156,6 +188,7 @@ system_prompt: |
   - Always include direct Hub URLs when referencing models, datasets, Spaces, or jobs.
   - For errors: state what went wrong, why, and what you're doing to fix it.
   - Do not over-explain or present elaborate option menus for simple tasks. When the user's intent is clear, act on it. Present options only when there's genuine ambiguity.
+  - Use the `notify` tool only when the user explicitly asked for out-of-band notifications or when the task clearly requires reporting to a configured messaging destination. Do not use it for routine chat updates.
 
   # Tool usage
 
diff --git a/agent/tools/__init__.py b/agent/tools/__init__.py
index 14ef45669bc443c1c005ddde69b4205eb02f46cb..65c793cbaad3b2f74eacaf1da6038ff0bef893d9 100644
--- a/agent/tools/__init__.py
+++ b/agent/tools/__init__.py
@@ -20,6 +20,7 @@ from agent.tools.github_read_file import (
 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 from agent.tools.types import ToolResult
+from agent.tools.web_search_tool import WEB_SEARCH_TOOL_SPEC, web_search_handler
 
 __all__ = [
     "ToolResult",
@@ -36,4 +37,6 @@ __all__ = [
     "github_search_code_handler",
     "HF_INSPECT_DATASET_TOOL_SPEC",
     "hf_inspect_dataset_handler",
+    "WEB_SEARCH_TOOL_SPEC",
+    "web_search_handler",
 ]
diff --git a/agent/tools/jobs_tool.py b/agent/tools/jobs_tool.py
index c18d47e298e445dbc232181ca9bd9520b942fa80..6518fa3cbd3d286716e4f0aaae49512937564a7b 100644
--- a/agent/tools/jobs_tool.py
+++ b/agent/tools/jobs_tool.py
@@ -19,6 +19,7 @@ from huggingface_hub.utils import HfHubHTTPError
 
 from agent.core.hf_access import JobsAccessError, resolve_jobs_namespace
 from agent.core.session import Event
+from agent.tools.trackio_seed import ensure_trackio_dashboard
 from agent.tools.types import ToolResult
 
 logger = logging.getLogger(__name__)
@@ -382,6 +383,31 @@ class HfJobsTool:
                 "isError": True,
             }
 
+    async def _seed_trackio_dashboard(self, space_id: str) -> None:
+        """Idempotently seed trackio dashboard files into *space_id* before the job.
+
+        Progress is relayed as ``tool_log`` events. Failures are logged, never
+        raised: a Space seeded by a previous run often still works, so a seed
+        error must not block job submission.
+        """
+        event_loop = asyncio.get_running_loop()
+
+        def emit(text: str) -> None:
+            if self.session is not None:
+                tool_log = Event(
+                    event_type="tool_log", data={"tool": "hf_jobs", "log": text}
+                )
+                queue_put = self.session.event_queue.put_nowait
+                event_loop.call_soon_threadsafe(queue_put, tool_log)
+
+        try:
+            await asyncio.to_thread(
+                ensure_trackio_dashboard, space_id, self.hf_token, emit
+            )
+        except Exception as exc:
+            logger.warning(f"trackio dashboard seed failed for {space_id}: {exc}")
+            emit(f"trackio dashboard seed failed: {exc}")
+
     async def _wait_for_job_completion(
         self, job_id: str, namespace: Optional[str] = None
     ) -> tuple[str, list[str]]:
@@ -533,11 +559,24 @@ class HfJobsTool:
             # Run the job
             flavor = args.get("hardware_flavor", "cpu-basic")
             timeout_str = args.get("timeout", "30m")
+
+            # Trackio: agent-declared space + project become env vars on the job
+            # so trackio.init() picks them up automatically. We also surface them
+            # in tool_state_change so the frontend can embed the dashboard.
+            env_dict = _add_default_env(args.get("env"))
+            trackio_space_id = args.get("trackio_space_id")
+            trackio_project = args.get("trackio_project")
+            if trackio_space_id:
+                env_dict["TRACKIO_SPACE_ID"] = trackio_space_id
+                await self._seed_trackio_dashboard(trackio_space_id)
+            if trackio_project:
+                env_dict["TRACKIO_PROJECT"] = trackio_project
+
             job = await _async_call(
                 self.api.run_job,
                 image=image,
                 command=command,
-                env=_add_default_env(args.get("env")),
+                env=env_dict,
                 secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
                 flavor=flavor,
                 timeout=timeout_str,
@@ -550,16 +589,18 @@ class HfJobsTool:
 
             # Send job URL immediately after job creation (before waiting for completion)
             if self.session and self.tool_call_id:
+                state_data: Dict[str, Any] = {
+                    "tool_call_id": self.tool_call_id,
+                    "tool": "hf_jobs",
+                    "state": "running",
+                    "jobUrl": job.url,
+                }
+                if trackio_space_id:
+                    state_data["trackioSpaceId"] = trackio_space_id
+                if trackio_project:
+                    state_data["trackioProject"] = trackio_project
                 await self.session.send_event(
-                    Event(
-                        event_type="tool_state_change",
-                        data={
-                            "tool_call_id": self.tool_call_id,
-                            "tool": "hf_jobs",
-                            "state": "running",
-                            "jobUrl": job.url,
-                        },
-                    )
+                    Event(event_type="tool_state_change", data=state_data)
                 )
 
             # Telemetry: job submission + completion (infra consumption signal).
@@ -594,16 +635,18 @@ class HfJobsTool:
 
             # Notify frontend of final status
             if self.session and self.tool_call_id:
+                final_data: Dict[str, Any] = {
+                    "tool_call_id": self.tool_call_id,
+                    "tool": "hf_jobs",
+                    "state": final_status.lower(),
+                    "jobUrl": job.url,
+                }
+                if trackio_space_id:
+                    final_data["trackioSpaceId"] = trackio_space_id
+                if trackio_project:
+                    final_data["trackioProject"] = trackio_project
                 await self.session.send_event(
-                    Event(
-                        event_type="tool_state_change",
-                        data={
-                            "tool_call_id": self.tool_call_id,
-                            "tool": "hf_jobs",
-                            "state": final_status.lower(),
-                            "jobUrl": job.url,
-                        },
-                    )
+                    Event(event_type="tool_state_change", data=final_data)
                 )
 
             # Filter out UV package installation output
@@ -977,7 +1020,10 @@ HF_JOBS_TOOL_SPEC = {
         "- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
         "- Training config MUST include push_to_hub=True and hub_model_id. "
         "Job storage is EPHEMERAL — all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
-        "- Include trackio monitoring and provide the dashboard URL to the user.\n\n"
+        "- Include trackio monitoring and provide the dashboard URL to the user. "
+        "When the script uses report_to='trackio', also pass `trackio_space_id` "
+        "(e.g. '<username>/mlintern-<8char>') and `trackio_project` as tool args — "
+        "they are injected as TRACKIO_SPACE_ID/TRACKIO_PROJECT env vars and let the UI embed the live dashboard.\n\n"
         "BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
         "Only then submit the remaining jobs. Never submit all at once — if there's a bug, all jobs fail.\n\n"
         "Operations: run, ps, logs, inspect, cancel, scheduled run/ps/inspect/delete/suspend/resume.\n\n"
@@ -1060,6 +1106,26 @@ HF_JOBS_TOOL_SPEC = {
                 "type": "object",
                 "description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
             },
+            "trackio_space_id": {
+                "type": "string",
+                "description": (
+                    "Optional. The HF Space hosting the trackio dashboard for this run "
+                    "(e.g. '<username>/mlintern-<8char>', under YOUR HF namespace). "
+                    "Injected as TRACKIO_SPACE_ID env var and used by the UI to embed "
+                    "the live dashboard. Set this whenever the script uses "
+                    "report_to='trackio'. The Space is auto-created and seeded with the "
+                    "trackio dashboard before the job starts — DO NOT pre-create it via "
+                    "hf_repo_git, that produces an empty Space that breaks the embed."
+                ),
+            },
+            "trackio_project": {
+                "type": "string",
+                "description": (
+                    "Optional. The trackio project name to log this run under. "
+                    "Injected as TRACKIO_PROJECT env var and used by the UI to filter "
+                    "the embedded dashboard to this project."
+                ),
+            },
             "namespace": {
                 "type": "string",
                 "description": (
diff --git a/agent/tools/notify_tool.py b/agent/tools/notify_tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..f926d5a58d5f3c4b877cb8792f812f6e4fa322a7
--- /dev/null
+++ b/agent/tools/notify_tool.py
@@ -0,0 +1,108 @@
+from typing import Any
+
+from agent.messaging.models import NotificationRequest
+
+NOTIFY_TOOL_SPEC = {
+    "name": "notify",
+    "description": (
+        "Send an out-of-band notification to configured messaging destinations. "
+        "Use this only when the user explicitly asked for proactive notifications "
+        "or when the task requires reporting progress outside the chat. "
+        "Destinations must be named server-side configs such as 'slack.ops'."
+    ),
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "destinations": {
+                "type": "array",
+                "description": "Named messaging destinations to notify.",
+                "items": {"type": "string"},
+                "minItems": 1,
+            },
+            "message": {
+                "type": "string",
+                "description": "Main notification body.",
+            },
+            "title": {
+                "type": "string",
+                "description": "Optional short title line.",
+            },
+            "severity": {
+                "type": "string",
+                "enum": ["info", "success", "warning", "error"],
+                "description": "Notification severity label.",
+            },
+        },
+        "required": ["destinations", "message"],
+    },
+}
+
+
+async def notify_handler(
+    arguments: dict[str, Any], session=None, **_kwargs
+) -> tuple[str, bool]:
+    """Validate a ``notify`` tool call and send it to each named destination.
+    Returns ``(text, ok)``: a validation error with ``False``, or one status
+    line per destination with ``ok`` true only if every send succeeded."""
+    if session is None or session.notification_gateway is None:
+        return "Messaging is not configured for this session.", False
+
+    raw_destinations = arguments.get("destinations", [])
+    if not isinstance(raw_destinations, list) or not raw_destinations:
+        return "destinations must be a non-empty array of destination names.", False
+
+    destinations: list[str] = []
+    seen: set[str] = set()
+    for raw_name in raw_destinations:
+        if not isinstance(raw_name, str):
+            return "Each destination must be a string.", False
+        name = raw_name.strip()
+        if not name:
+            return "Destination names must not be empty.", False
+        if name not in seen:
+            destinations.append(name)
+            seen.add(name)
+
+    messaging = session.config.messaging
+    disallowed = [n for n in destinations if not messaging.can_agent_tool_send(n)]
+    if disallowed:
+        return (
+            "These destinations are unavailable for the notify tool: "
+            + ", ".join(disallowed)
+        ), False
+
+    message = arguments.get("message", "")
+    if not isinstance(message, str) or not message.strip():
+        return "message must be a non-empty string.", False
+
+    title = arguments.get("title")
+    severity = arguments.get("severity", "info")
+    if title is not None and not isinstance(title, str):
+        return "title must be a string when provided.", False
+    # Tuple, not set: membership on a set raises TypeError for unhashable input.
+    if severity not in ("info", "success", "warning", "error"):
+        return "severity must be one of: info, success, warning, error.", False
+
+    requests = [
+        NotificationRequest(
+            destination=name,
+            title=title,
+            message=message,
+            severity=severity,
+            metadata={
+                "session_id": session.session_id,
+                "model": session.config.model_name,
+            },
+        )
+        for name in destinations
+    ]
+    results = await session.notification_gateway.send_many(requests)
+
+    lines, all_ok = [], True
+    for result in results:
+        if result.ok:
+            lines.append(f"{result.destination}: sent")
+        else:
+            all_ok = False
+            lines.append(f"{result.destination}: failed ({result.error})")
+    return "\n".join(lines), all_ok
diff --git a/agent/tools/research_tool.py b/agent/tools/research_tool.py
index 18ae2ad6513d0ad98dff23e369f9222c59ef699c..11131766ee0262ba71805ff6a743c5da736e2386 100644
--- a/agent/tools/research_tool.py
+++ b/agent/tools/research_tool.py
@@ -37,6 +37,7 @@ RESEARCH_TOOL_NAMES = {
     "github_find_examples",
     "github_list_repos",
     "github_read_file",
+    "web_search",
     "hf_inspect_dataset",
     "hf_repo_files",
 }
@@ -102,6 +103,8 @@ tell you what actually works.
 - `explore_hf_docs(endpoint)`: Search docs for a library. Endpoints: trl, transformers, datasets, peft, accelerate, trackio, vllm, inference-endpoints, etc.
 - `fetch_hf_docs(url)`: Fetch full page content from explore results
 - `find_hf_api(query=..., tag=...)`: Find REST API endpoints
+- `web_search(query=..., allowed_domains=[...], blocked_domains=[...])`:
+  Search the current web when papers/docs/GitHub are not enough.
 
 ## Hub repo inspection
 - `hf_repo_files`: List/read files in any HF repo (model, dataset, space)
@@ -306,8 +309,10 @@ async def research_handler(
         # ── Doom-loop detection ──
         doom_prompt = check_for_doom_loop(messages)
         if doom_prompt:
-            logger.warning("Research sub-agent doom loop detected at iteration %d", _iteration)
-            await _log("Doom loop detected — injecting corrective prompt")
+            logger.warning(
+                "Research sub-agent repetition guard activated at iteration %d",
+                _iteration,
+            )
             messages.append(Message(role="user", content=doom_prompt))
 
         # ── Context budget: warn at 75%, hard-stop at 95% ──
@@ -424,7 +429,7 @@ async def research_handler(
                 await _log(f"▸ {tool_name}  {args_str}")
 
                 output, _success = await session.tool_router.call_tool(
-                    tool_name, tool_args, session=session
+                    tool_name, tool_args, session=session, tool_call_id=tc.id
                 )
                 _tool_uses += 1
                 await _log(f"tools:{_tool_uses}")
diff --git a/agent/tools/sandbox_client.py b/agent/tools/sandbox_client.py
index 16982c76fe62dc66ac9dbd88acafdb80fc2b2a0f..967d946c12d37c25d76084e02a72548a5fa22bc7 100644
--- a/agent/tools/sandbox_client.py
+++ b/agent/tools/sandbox_client.py
@@ -37,6 +37,7 @@ Tools: bash, read, write, edit, upload
 from __future__ import annotations
 
 import io
+import secrets as secrets_lib
 import sys
 import time
 import uuid
@@ -99,8 +100,8 @@ CMD ["python", "sandbox_server.py"]
 
 _SANDBOX_SERVER = '''\
 """Minimal FastAPI server for sandbox operations."""
-import os, subprocess, pathlib, signal, threading, re, tempfile
-from fastapi import FastAPI
+import hmac, os, subprocess, pathlib, signal, threading, re, tempfile
+from fastapi import Depends, FastAPI, HTTPException, Request
 from pydantic import BaseModel
 from typing import Optional
 import uvicorn
@@ -156,6 +157,22 @@ def _atomic_write(path: pathlib.Path, content: str):
 
 app = FastAPI()
 
+def _expected_api_token() -> str:
+    return os.environ.get("SANDBOX_API_TOKEN") or os.environ.get("HF_TOKEN") or ""
+
+def _require_auth(request: Request) -> None:
+    """Enforce bearer auth; tokens compare as bytes so non-ASCII input yields 401."""
+    expected = _expected_api_token()
+    if not expected:
+        raise HTTPException(status_code=503, detail="Sandbox API token not configured")
+    scheme, _, supplied = request.headers.get("authorization", "").partition(" ")
+    if scheme.lower() != "bearer" or not supplied:
+        raise HTTPException(status_code=401, detail="Missing bearer token")
+    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
+        raise HTTPException(status_code=401, detail="Invalid bearer token")
+
+_AUTH = [Depends(_require_auth)]
+
 # Track active bash processes so they can be killed on cancel
 _active_procs = {}  # pid -> subprocess.Popen
 _proc_lock = threading.Lock()
@@ -344,7 +361,7 @@ def _validate_python(content, path=""):
 def health():
     return {"status": "ok"}
 
-@app.post("/api/bash")
+@app.post("/api/bash", dependencies=_AUTH)
 def bash(req: BashReq):
     try:
         proc = subprocess.Popen(
@@ -371,7 +388,7 @@ def bash(req: BashReq):
     except Exception as e:
         return {"success": False, "output": "", "error": str(e)}
 
-@app.post("/api/kill")
+@app.post("/api/kill", dependencies=_AUTH)
 def kill_all():
     """Kill all active bash processes. Called when user cancels."""
     with _proc_lock:
@@ -389,7 +406,7 @@ def kill_all():
                 pass
     return {"success": True, "output": f"Killed {len(killed)} process(es): {killed}", "error": ""}
 
-@app.post("/api/read")
+@app.post("/api/read", dependencies=_AUTH)
 def read(req: ReadReq):
     try:
         p = pathlib.Path(req.path)
@@ -406,7 +423,7 @@ def read(req: ReadReq):
     except Exception as e:
         return {"success": False, "output": "", "error": str(e)}
 
-@app.post("/api/write")
+@app.post("/api/write", dependencies=_AUTH)
 def write(req: WriteReq):
     try:
         p = pathlib.Path(req.path)
@@ -420,7 +437,7 @@ def write(req: WriteReq):
     except Exception as e:
         return {"success": False, "output": "", "error": str(e)}
 
-@app.post("/api/edit")
+@app.post("/api/edit", dependencies=_AUTH)
 def edit(req: EditReq):
     try:
         p = pathlib.Path(req.path)
@@ -447,7 +464,7 @@ def edit(req: EditReq):
     except Exception as e:
         return {"success": False, "output": "", "error": str(e)}
 
-@app.post("/api/exists")
+@app.post("/api/exists", dependencies=_AUTH)
 def exists(req: ExistsReq):
     return {"success": True, "output": str(pathlib.Path(req.path).exists()).lower(), "error": ""}
 
@@ -482,6 +499,7 @@ class Sandbox:
 
     space_id: str
     token: str | None = None
+    api_token: str | None = field(default=None, repr=False)
     work_dir: str = "/app"
     timeout: int = DEFAULT_TIMEOUT
     _owns_space: bool = field(default=False, repr=False)
@@ -495,9 +513,10 @@ class Sandbox:
         # Trailing slash is critical: httpx resolves relative paths against base_url.
         # Without it, client.get("health") resolves to /health instead of /api/health.
         self._base_url = f"https://{slug}.hf.space/api/"
+        api_token = self.api_token or self.token
         self._client = httpx.Client(
             base_url=self._base_url,
-            headers={"Authorization": f"Bearer {self.token}"} if self.token else {},
+            headers={"Authorization": f"Bearer {api_token}"} if api_token else {},
             timeout=httpx.Timeout(MAX_TIMEOUT, connect=30),
             follow_redirects=True,
         )
@@ -563,6 +582,7 @@ class Sandbox:
         base = name or "sandbox"
         suffix = uuid.uuid4().hex[:8]
         space_id = f"{owner}/{base}-{suffix}"
+        sandbox_api_token = secrets_lib.token_urlsafe(32)
 
         _log(f"Creating sandbox: {space_id} (from {template})...")
 
@@ -583,8 +603,9 @@ class Sandbox:
         # Inject secrets BEFORE uploading server files (which triggers rebuild).
         # Secrets added after a Space is running aren't available until restart,
         # so they must be set before the build/start cycle.
-        if secrets:
-            for key, val in secrets.items():
+        sandbox_secrets = {**(secrets or {}), "SANDBOX_API_TOKEN": sandbox_api_token}
+        if sandbox_secrets:
+            for key, val in sandbox_secrets.items():
                 api.add_space_secret(space_id, key, val)
 
         # Upload sandbox server and Dockerfile (triggers rebuild)
@@ -617,7 +638,12 @@ class Sandbox:
         _check_cancel()
 
         # Wait for the API server to be responsive (non-fatal)
-        sb = cls(space_id=space_id, token=token, _owns_space=True)
+        sb = cls(
+            space_id=space_id,
+            token=token,
+            api_token=sandbox_api_token,
+            _owns_space=True,
+        )
         try:
             sb._wait_for_api(timeout=API_WAIT_TIMEOUT, log=_log)
         except TimeoutError as e:
@@ -648,13 +674,24 @@ class Sandbox:
         log("Server files uploaded, rebuild triggered.")
 
     @classmethod
-    def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
+    def connect(
+        cls,
+        space_id: str,
+        *,
+        token: str | None = None,
+        api_token: str | None = None,
+    ) -> Sandbox:
         """
         Connect to an existing running Space.
 
         Does a health check to verify the Space is reachable.
         """
-        sb = cls(space_id=space_id, token=token, _owns_space=False)
+        sb = cls(
+            space_id=space_id,
+            token=token,
+            api_token=api_token,
+            _owns_space=False,
+        )
         sb._wait_for_api(timeout=60)
         return sb
 
@@ -687,6 +724,10 @@ class Sandbox:
             )
         print(f"Deleting sandbox: {self.space_id}...")
         self._hf_api.delete_repo(self.space_id, repo_type="space")
+        # Clear ownership so a second cleanup call (e.g. delete_session +
+        # _run_session.finally both fire) early-returns instead of retrying
+        # a 404 delete and emitting a spurious ERROR log.
+        self._owns_space = False
         self._client.close()
         print("Deleted.")
 
diff --git a/agent/tools/sandbox_tool.py b/agent/tools/sandbox_tool.py
index 6dfd3db19876eed20e5703d3f959cc8a7cb317e8..a5c26acabee66f8baedb4a3b81062a39839a327b 100644
--- a/agent/tools/sandbox_tool.py
+++ b/agent/tools/sandbox_tool.py
@@ -12,13 +12,29 @@ a cpu-basic sandbox is auto-created (no approval needed).
 from __future__ import annotations
 
 import asyncio
+import logging
+import re
 import threading
+from datetime import datetime, timedelta, timezone
 from typing import Any
 
 from huggingface_hub import HfApi, SpaceHardware
 
 from agent.core.session import Event
 from agent.tools.sandbox_client import Sandbox
+from agent.tools.trackio_seed import ensure_trackio_dashboard
+
+logger = logging.getLogger(__name__)
+
+# Match the exact suffix pattern Sandbox.create produces: "sandbox-<8 hex>".
+# Used to identify orphan sandboxes from prior sessions safely (won't match
+# user-renamed lookalikes).
+_SANDBOX_NAME_RE = re.compile(r"^sandbox-[a-f0-9]{8}$")
+
+# How stale a sandbox must be before we treat it as definitely orphan.
+# Anything more recent could be tied to a still-live session in another tab,
+# so we leave it alone.
+_ORPHAN_STALE_AFTER = timedelta(hours=1)
 
 
 def _looks_like_path(script: str) -> bool:
@@ -62,11 +78,89 @@ async def resolve_sandbox_script(
         return None, f"Failed to read {script} from sandbox: {e}"
 
 
+async def _seed_trackio_dashboard_safe(session: Any, space_id: str) -> None:
+    """Best-effort seed of *space_id* with the trackio dashboard files using
+    ``session.hf_token`` (no-op without one). Failures are logged and emitted
+    as ``tool_log`` events, never raised, so sandbox creation is not blocked."""
+    if not session or not getattr(session, "hf_token", None):
+        return
+    loop = asyncio.get_running_loop()
+
+    def _log(msg: str) -> None:
+        data = {"tool": "sandbox_create", "log": msg}
+        event = Event(event_type="tool_log", data=data)
+        loop.call_soon_threadsafe(session.event_queue.put_nowait, event)
+
+    try:
+        await asyncio.to_thread(
+            ensure_trackio_dashboard, space_id, session.hf_token, _log
+        )
+    except Exception as e:
+        logger.warning("trackio dashboard seed failed for %s: %s", space_id, e)
+        _log(f"trackio dashboard seed failed: {e}")
+
+
 # ── Tool name mapping (short agent names → Sandbox client names) ──────
 
 
+def _cleanup_user_orphan_sandboxes(api: HfApi, owner: str, log: Any) -> int:
+    """Delete stale ``sandbox-<8hex>`` Spaces in ``owner``'s account.
+
+    A Space is deleted only when both hold:
+
+    * Naming: it matches ``sandbox-<exactly 8 lowercase hex>``, the pattern
+      Sandbox.create produces. Won't touch user-renamed Spaces.
+    * Staleness: its last-modified time is known and older than
+      ``_ORPHAN_STALE_AFTER``. Recent or undatable Spaces might belong to a
+      live session in another tab/replica, so they are left alone.
+
+    Runs blocking — call via ``asyncio.to_thread``. Per-call HF API failures
+    are passed to ``log`` and not raised. Returns the number deleted.
+    """
+    cutoff = datetime.now(timezone.utc) - _ORPHAN_STALE_AFTER
+    deleted = 0
+    try:
+        # full=True: huggingface_hub documents last_modified as part of the "full"
+        # Space data; without it the staleness check may have nothing to compare.
+        spaces = list(api.list_spaces(author=owner, limit=200, full=True))
+    except Exception as e:
+        log(f"orphan sweep: list_spaces failed: {e}")
+        return 0
+
+    for space in spaces:
+        space_name = space.id.rsplit("/", 1)[-1]
+        if not _SANDBOX_NAME_RE.match(space_name):
+            continue
+
+        last_mod = getattr(space, "lastModified", None) or getattr(space, "last_modified", None)
+        if isinstance(last_mod, str):
+            try:
+                last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
+            except ValueError:
+                last_mod = None
+        if isinstance(last_mod, datetime) and last_mod.tzinfo is None:
+            last_mod = last_mod.replace(tzinfo=timezone.utc)  # compare aware-to-aware
+        if not isinstance(last_mod, datetime) or last_mod > cutoff:
+            # Recent or unknown age — could be a concurrent live session. Skip.
+            continue
+
+        try:
+            api.delete_repo(repo_id=space.id, repo_type="space")
+            deleted += 1
+            log(f"orphan sweep: deleted {space.id}")
+        except Exception as e:
+            log(f"orphan sweep: failed to delete {space.id}: {e}")
+
+    if deleted:
+        log(f"orphan sweep: cleaned up {deleted} stale sandbox(es) before create")
+    return deleted
+
+
 async def _ensure_sandbox(
-    session: Any, hardware: str = "cpu-basic", **create_kwargs
+    session: Any,
+    hardware: str = "cpu-basic",
+    extra_secrets: dict[str, str] | None = None,
+    **create_kwargs,
 ) -> tuple[Sandbox | None, str | None]:
     """
     Ensure a sandbox exists on the session. Auto-creates with given hardware if needed.
@@ -109,6 +203,23 @@ async def _ensure_sandbox(
             Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
         )
 
+    # Before we create a new sandbox, sweep this user's stale sandboxes from
+    # prior sessions. ``_cleanup_sandbox`` in session_manager fires only on
+    # clean session exit; pod kills, WebSocket drops, etc. leave orphans
+    # behind, and they accumulate on every new session forever (observed
+    # 2310 leaked across the Hub on 2026-04-27). Doing the cleanup here at
+    # session start = self-healing, no separate cron needed.
+    #
+    # The 1h staleness filter is the safety: a sandbox modified in the last
+    # hour might still be tied to a live session in another tab, so we skip.
+    # Anything older has no realistic chance of being active given typical
+    # session lengths.
+    try:
+        await asyncio.to_thread(_cleanup_user_orphan_sandboxes, api, owner, _log)
+    except Exception as e:
+        # Cleanup is best-effort — never block sandbox_create on it.
+        _log(f"orphan sandbox sweep failed (non-fatal): {e}")
+
     # Bridge asyncio cancel event to a threading.Event for the blocking create call.
     # We poll session._cancelled from the main loop in a background task and set
     # a threading.Event that Sandbox.create checks during its polling loops.
@@ -120,11 +231,15 @@ async def _ensure_sandbox(
 
     watcher_task = asyncio.create_task(_watch_cancel())
 
+    secrets: dict[str, str] = {"HF_TOKEN": token}
+    if extra_secrets:
+        secrets.update({k: v for k, v in extra_secrets.items() if v})
+
     kwargs = {
         "owner": owner,
         "hardware": hardware,
         "token": token,
-        "secrets": {"HF_TOKEN": token},
+        "secrets": secrets,
         "log": _log,
         "cancel_event": cancel_flag,
         **create_kwargs,
@@ -188,6 +303,9 @@ SANDBOX_CREATE_TOOL_SPEC = {
         "fp32 ≈ 4 bytes/param, plus ~20% overhead for optimizer states during training.\n"
         "Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
         "If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
+        "If you intend to run a training script in this sandbox that uses report_to='trackio', "
+        "pass `trackio_space_id` (e.g. '<username>/mlintern-<8char>') and `trackio_project` so they "
+        "are set as TRACKIO_SPACE_ID/TRACKIO_PROJECT secrets in the sandbox and the UI can embed the live dashboard.\n\n"
         "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
     ),
     "parameters": {
@@ -204,16 +322,49 @@ SANDBOX_CREATE_TOOL_SPEC = {
                 "type": "boolean",
                 "description": "If true, create a private Space",
             },
+            "trackio_space_id": {
+                "type": "string",
+                "description": (
+                    "Optional. The HF Space hosting the trackio dashboard for runs in this sandbox "
+                    "(e.g. '<username>/mlintern-<8char>', under YOUR HF namespace). Injected as "
+                    "TRACKIO_SPACE_ID secret and surfaced to the UI. The Space is auto-created and "
+                    "seeded with the trackio dashboard — DO NOT pre-create it via hf_repo_git, "
+                    "that produces an empty Space that breaks the embed."
+                ),
+            },
+            "trackio_project": {
+                "type": "string",
+                "description": (
+                    "Optional. The trackio project name. Injected as TRACKIO_PROJECT secret and "
+                    "used by the UI to filter the embedded dashboard to this project."
+                ),
+            },
         },
     },
 }
 
 
 async def sandbox_create_handler(
-    args: dict[str, Any], session: Any = None
+    args: dict[str, Any], session: Any = None, tool_call_id: str | None = None
 ) -> tuple[str, bool]:
     """Handle sandbox_create tool calls."""
     hardware = args.get("hardware", "cpu-basic")
+    trackio_space_id = args.get("trackio_space_id") or None
+    trackio_project = args.get("trackio_project") or None
+
+    async def _emit_trackio_state(sb: Sandbox) -> None:
+        """Tell the frontend which trackio dashboard to embed for this sandbox."""
+        if not (session and tool_call_id and trackio_space_id):
+            return
+        data: dict[str, Any] = {
+            "tool_call_id": tool_call_id,
+            "tool": "sandbox_create",
+            "state": "running",
+            "trackioSpaceId": trackio_space_id,
+        }
+        if trackio_project:
+            data["trackioProject"] = trackio_project
+        await session.send_event(Event(event_type="tool_state_change", data=data))
 
     # If sandbox already exists, return its info
     if session and getattr(session, "sandbox", None):
@@ -226,6 +377,7 @@ async def sandbox_create_handler(
                 "Hardware cannot be changed by calling sandbox_create again. "
                 "Delete the existing sandbox first if you need a different tier."
             )
+        await _emit_trackio_state(sb)
         return (
             f"Sandbox already active: {sb.space_id}\n"
             f"URL: {sb.url}\n"
@@ -233,18 +385,32 @@ async def sandbox_create_handler(
             f"Use bash/read/write/edit to interact with it."
         ), True
 
-    create_kwargs = {}
+    create_kwargs: dict[str, Any] = {}
     if "private" in args:
         create_kwargs["private"] = args["private"]
 
+    extra_secrets: dict[str, str] = {}
+    if trackio_space_id:
+        extra_secrets["TRACKIO_SPACE_ID"] = trackio_space_id
+        await _seed_trackio_dashboard_safe(session, trackio_space_id)
+    if trackio_project:
+        extra_secrets["TRACKIO_PROJECT"] = trackio_project
+
     try:
-        sb, error = await _ensure_sandbox(session, hardware=hardware, **create_kwargs)
+        sb, error = await _ensure_sandbox(
+            session,
+            hardware=hardware,
+            extra_secrets=extra_secrets or None,
+            **create_kwargs,
+        )
     except Exception as e:
         return f"Failed to create sandbox: {e}", False
 
     if error:
         return error, False
 
+    await _emit_trackio_state(sb)
+
     return (
         f"Sandbox created: {sb.space_id}\n"
         f"URL: {sb.url}\n"
diff --git a/agent/tools/trackio_seed.py b/agent/tools/trackio_seed.py
new file mode 100644
index 0000000000000000000000000000000000000000..1062e1b5eda2701833aad7c1c895727d7fbd191e
--- /dev/null
+++ b/agent/tools/trackio_seed.py
@@ -0,0 +1,205 @@
+"""Seed an HF Space with the trackio dashboard.
+
+Background: when the agent creates a Space via `hf_repo_git create_repo` (or
+the user pre-creates one), it ships with no app.py — so the iframe shows the
+default Gradio "Get started" template instead of charts. Trackio's `init()`
+detects the existing Space but does NOT auto-bootstrap dashboard files into it,
+so the dashboard never materializes.
+
+This helper writes the three files trackio's runtime expects (README.md,
+requirements.txt, app.py) into the Space, idempotently, BEFORE the job that
+will call `trackio.init()` runs. We deliberately omit `hf_oauth: true` from
+the README so the embedded iframe in ml-intern renders without a login click —
+per-user privacy is enforced by namespace ownership instead.
+
+Beyond the dashboard files, the helper also creates the metrics bucket and
+mounts it on the Space at `/data` (with `TRACKIO_DIR` / `TRACKIO_BUCKET_ID`
+Space variables). Without this, the running job writes metrics into a bucket
+that the dashboard Space can't read, and the iframe shows "No projects".
+"""
+
+from __future__ import annotations
+
+import io
+from typing import Callable, Optional
+
+from huggingface_hub import (
+    HfApi,
+    Volume,
+    add_space_variable,
+    create_bucket,
+    create_repo,
+)
+from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
+
+
+_README = """---
+title: Trackio Dashboard
+emoji: 📊
+colorFrom: pink
+colorTo: gray
+sdk: gradio
+app_file: app.py
+pinned: false
+tags:
+  - trackio
+---
+
+Embedded trackio dashboard for ml-intern runs.
+"""  # Space card front matter; `hf_oauth` is deliberately absent (see module docstring)
+
+_REQUIREMENTS = "trackio\n"  # no version pin
+_APP_PY = "import trackio\ntrackio.show()\n"  # `trackio.show` is the marker _already_seeded() looks for
+
+# ml-intern brand mark surfaced inside the trackio dashboard. Trackio reads
+# `TRACKIO_LOGO_LIGHT_URL` / `TRACKIO_LOGO_DARK_URL` from Space variables and
+# renders them in place of its own logo. We point at the publicly-resolvable
+# copy on the smolagents/ml-intern Space repo so any seeded dashboard inherits
+# the ml-intern branding without each user having to host the asset.
+_LOGO_URL = (
+    "https://huggingface.co/spaces/smolagents/ml-intern/"
+    "resolve/main/frontend/public/smolagents.webp"
+)
+
+_FILES = {  # path inside the Space repo -> text content uploaded by ensure_trackio_dashboard()
+    "README.md": _README,
+    "requirements.txt": _REQUIREMENTS,
+    "app.py": _APP_PY,
+}
+
+
+def _already_seeded(api: HfApi, space_id: str) -> bool:
+    """Cheap check: does the Space already have a trackio dashboard app.py?
+
+    Avoids re-uploading the same three files on every job submission. We look
+    for the literal `trackio.show` call which is the load-bearing line — any
+    other app.py shape (default gradio shell, stale custom file, or bytes
+    that are not valid UTF-8) returns False so the caller re-seeds.
+    """
+    try:
+        path = api.hf_hub_download(
+            repo_id=space_id, repo_type="space", filename="app.py"
+        )
+    except (EntryNotFoundError, RepositoryNotFoundError, OSError):
+        return False
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return "trackio.show" in f.read()
+    except (OSError, UnicodeDecodeError):  # an undecodable app.py is not ours
+        return False
+
+
+def _get_space_volumes(api: HfApi, space_id: str) -> list:
+    """List the volumes currently mounted on *space_id*.
+
+    Tries `get_space_runtime()` first; when it reports no `volumes`, falls back
+    to `space_info().runtime.volumes` as trackio does. `[]` if neither has any.
+    """
+    from_runtime = getattr(api.get_space_runtime(space_id), "volumes", None)
+    if from_runtime:
+        return list(from_runtime)
+    info_runtime = api.space_info(space_id).runtime
+    if info_runtime and getattr(info_runtime, "volumes", None):
+        return list(info_runtime.volumes)
+    return []
+
+
+def _ensure_bucket_mounted(
+    api: HfApi,
+    space_id: str,
+    bucket_id: str,
+    hf_token: str,
+    log: Optional[Callable[[str], None]] = None,
+) -> None:
+    """Create the bucket if missing, mount it at `/data` on the Space, and set
+    the `TRACKIO_DIR` / `TRACKIO_BUCKET_ID` / `TRACKIO_LOGO_*_URL` Space
+    variables. Work already in place is skipped; *log* is told about a new mount.
+    """
+    create_bucket(bucket_id, private=True, exist_ok=True, token=hf_token)
+    # The mount counts as present only if type, source and mount path all match.
+    existing = _get_space_volumes(api, space_id)
+    already_mounted = any(
+        getattr(v, "type", None) == "bucket"
+        and getattr(v, "source", None) == bucket_id
+        and getattr(v, "mount_path", None) == "/data"
+        for v in existing
+    )
+    if not already_mounted:
+        preserved = [  # every volume except bucket volumes using our source or the /data path
+            v
+            for v in existing
+            if not (
+                getattr(v, "type", None) == "bucket"
+                and (
+                    getattr(v, "source", None) == bucket_id
+                    or getattr(v, "mount_path", None) == "/data"
+                )
+            )
+        ]
+        api.set_space_volumes(  # NOTE(review): presumably replaces the whole list, hence `preserved` — confirm
+            space_id,
+            preserved + [Volume(type="bucket", source=bucket_id, mount_path="/data")],
+        )
+        if log:
+            log(f"mounted bucket {bucket_id} at /data on {space_id}")
+    # Write only the variables whose current value differs from the desired one.
+    variables = api.get_space_variables(space_id)
+    desired = {
+        "TRACKIO_DIR": "/data/trackio",
+        "TRACKIO_BUCKET_ID": bucket_id,
+        "TRACKIO_LOGO_LIGHT_URL": _LOGO_URL,
+        "TRACKIO_LOGO_DARK_URL": _LOGO_URL,
+    }
+    for key, value in desired.items():
+        if getattr(variables.get(key), "value", None) != value:  # a missing variable reads as None
+            add_space_variable(space_id, key, value, token=hf_token)
+
+
+def ensure_trackio_dashboard(
+    space_id: str,
+    hf_token: str,
+    log: Optional[Callable[[str], None]] = None,
+) -> bool:
+    """Make sure *space_id* is fully wired for trackio:
+    1. Space exists with our dashboard files (README without `hf_oauth`,
+       `requirements.txt`, `app.py` calling `trackio.show`).
+    2. Bucket `<space_id>-bucket` exists, is mounted at `/data`, and the
+       Space has the `TRACKIO_DIR` / `TRACKIO_BUCKET_ID` / logo variables set.
+
+    Idempotent — re-running is cheap. Returns True if any seeding happened
+    in step (1), False if the dashboard files were already in place. Bucket
+    mount is always re-checked. *log*, when given, receives progress lines.
+    """
+    api = HfApi(token=hf_token)
+    # exist_ok=True: a Space that already exists is reused instead of failing.
+    create_repo(
+        repo_id=space_id,
+        repo_type="space",
+        space_sdk="gradio",
+        exist_ok=True,
+        token=hf_token,
+    )
+
+    seeded_files = False
+    if _already_seeded(api, space_id):
+        if log:
+            log(f"trackio dashboard already seeded on {space_id}")
+    else:
+        if log:
+            log(f"seeding trackio dashboard files into {space_id}")
+        for path_in_repo, content in _FILES.items():  # one upload_file call per file
+            api.upload_file(
+                path_or_fileobj=io.BytesIO(content.encode("utf-8")),
+                path_in_repo=path_in_repo,
+                repo_id=space_id,
+                repo_type="space",
+                commit_message=f"ml-intern: seed trackio dashboard ({path_in_repo})",
+            )
+        seeded_files = True
+    # The bucket id is derived from the Space id; this step runs on every call.
+    bucket_id = f"{space_id}-bucket"
+    _ensure_bucket_mounted(api, space_id, bucket_id, hf_token, log)
+
+    if log:
+        log(f"trackio dashboard ready: https://huggingface.co/spaces/{space_id}")
+    return seeded_files
diff --git a/agent/tools/web_search_tool.py b/agent/tools/web_search_tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e52ded03c1f405076e1ecc537d0b4250862f562
--- /dev/null
+++ b/agent/tools/web_search_tool.py
@@ -0,0 +1,273 @@
+"""DuckDuckGo HTML web search tool.
+
+This mirrors Claw Code's Rust WebSearch behavior: fetch DuckDuckGo's HTML
+endpoint, extract result links, optionally filter domains, and return a
+JSON payload the model can cite.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import html
+import json
+import os
+import time
+from dataclasses import dataclass
+from html.parser import HTMLParser
+from typing import Any
+from urllib.parse import parse_qsl, parse_qs, urlencode, urlparse, urlunparse
+
+import requests
+
+DEFAULT_SEARCH_URL = "https://html.duckduckgo.com/html/"  # used when the env override is unset
+WEB_SEARCH_BASE_URL_ENV = "CLAWD_WEB_SEARCH_BASE_URL"  # env var naming an alternate endpoint
+USER_AGENT = "clawd-rust-tools/0.1"  # sent as the User-Agent request header
+REQUEST_TIMEOUT_SECONDS = 20  # passed as `timeout` to requests.get
+MAX_RESULTS = 8  # cap applied after filtering and de-duplication
+
+
+@dataclass(frozen=True)
+class SearchHit:  # one result link; frozen, so instances are immutable
+    title: str  # anchor text with whitespace collapsed
+    url: str  # decoded http(s) target of the link
+
+    def as_json(self) -> dict[str, str]:  # plain-dict form used in the JSON payload
+        return {"title": self.title, "url": self.url}
+
+
+class _AnchorParser(HTMLParser):
+    """Collect `(href, text)` pairs for the `<a>` elements fed to the parser.
+
+    With `require_result_class=True` only anchors whose `class` attribute
+    contains `result__a` are recorded. Anchors without an `href` are ignored.
+    """
+
+    def __init__(self, *, require_result_class: bool) -> None:
+        # convert_charrefs=True: text arrives in handle_data already entity-decoded.
+        super().__init__(convert_charrefs=True)
+        self.require_result_class = require_result_class
+        self.hits: list[tuple[str, str]] = []
+        self._active_href: str | None = None
+        self._active_text: list[str] = []
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        if tag.lower() != "a":
+            return
+        attr_map = {key.lower(): value or "" for key, value in attrs}
+        href = attr_map.get("href")
+        if not href:
+            return
+        if self.require_result_class and "result__a" not in attr_map.get("class", ""):
+            return
+        self._active_href = href
+        self._active_text = []
+
+    def handle_data(self, data: str) -> None:
+        if self._active_href is not None:
+            self._active_text.append(data)
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag.lower() != "a" or self._active_href is None:
+            return
+        # Text is already decoded: a second html.unescape would turn `&amp;lt;` into `<`.
+        title = collapse_whitespace("".join(self._active_text))
+        self.hits.append((self._active_href, title))
+        self._active_href = None
+        self._active_text = []
+
+
+def build_search_url(query: str) -> str:
+    """Return the search endpoint URL with `q=<query>` appended to its query string."""
+    base = os.environ.get(WEB_SEARCH_BASE_URL_ENV, DEFAULT_SEARCH_URL)
+    parts = urlparse(base)
+    if not parts.netloc or parts.scheme not in {"http", "https"}:
+        raise ValueError(f"invalid search base URL: {base}")
+    # Keep whatever parameters the configured endpoint already carries.
+    params = [*parse_qsl(parts.query, keep_blank_values=True), ("q", query)]
+    return urlunparse(parts._replace(query=urlencode(params)))
+
+
+def collapse_whitespace(value: str) -> str:  # whitespace runs -> single spaces, ends trimmed
+    return " ".join(value.split())
+
+
+def decode_duckduckgo_redirect(url: str) -> str | None:
+    """Resolve an entity-decoded href to its target; None unless absolute, `//` or `/` rooted."""
+    if url.startswith(("http://", "https://")):
+        return url  # HTMLParser decoded the attribute; unescaping again corrupts `&copy=`-style params
+    if url.startswith("//"):
+        joined = f"https:{url}"
+    elif url.startswith("/"):
+        joined = f"https://duckduckgo.com{url}"
+    else:
+        return None
+    parsed = urlparse(joined)
+    if parsed.path in {"/l", "/l/"}:  # DuckDuckGo redirect link: the real target is in `uddg`
+        uddg = parse_qs(parsed.query).get("uddg", [])
+        if uddg:
+            return uddg[0]  # parse_qs already percent-decoded the value
+    return joined
+
+
+def _extract_links(search_html: str, *, require_result_class: bool) -> list[SearchHit]:
+    """Parse anchors out of *search_html*; keep those with a title and an http(s) URL."""
+    parser = _AnchorParser(require_result_class=require_result_class)
+    parser.feed(search_html)
+    collected: list[SearchHit] = []
+    for href, text in parser.hits:
+        if not text:
+            continue
+        target = decode_duckduckgo_redirect(href)
+        if not target:
+            continue
+        if target.startswith(("http://", "https://")):
+            collected.append(SearchHit(title=text, url=target))
+    return collected
+
+
+def extract_search_hits(search_html: str) -> list[SearchHit]:  # only anchors carrying the `result__a` class
+    return _extract_links(search_html, require_result_class=True)
+
+
+def extract_search_hits_from_generic_links(search_html: str) -> list[SearchHit]:  # any anchor with an href
+    return _extract_links(search_html, require_result_class=False)
+
+
+def normalize_domain_filter(domain: str) -> str:
+    """Normalize a filter entry: URL -> hostname, then strip leading dots/trailing slashes, lowercase."""
+    trimmed = domain.strip()
+    parts = urlparse(trimmed)
+    return ((parts.scheme and parts.hostname) or trimmed).strip().lstrip(".").rstrip("/").lower()
+
+
+def host_matches_list(url: str, domains: list[str]) -> bool:
+    """Return True when *url*'s host equals, or is a subdomain of, any entry in *domains*."""
+    hostname = urlparse(url).hostname
+    if not hostname:
+        return False
+    target = hostname.lower()
+    # Entries that normalize to an empty string never match.
+    candidates = (normalize_domain_filter(entry) for entry in domains)
+    return any(
+        bool(name) and (target == name or target.endswith(f".{name}"))
+        for name in candidates
+    )
+
+
+def dedupe_hits(hits: list[SearchHit]) -> list[SearchHit]:
+    """Drop hits whose URL was already seen.
+
+    The first hit for each URL wins and the original order is kept.
+    """
+    first_by_url: dict[str, SearchHit] = {}
+    for hit in hits:
+        first_by_url.setdefault(hit.url, hit)
+    return list(first_by_url.values())
+
+
+def execute_web_search(
+    query: str,
+    allowed_domains: list[str] | None = None,
+    blocked_domains: list[str] | None = None,
+    tool_use_id: str = "web_search_1",
+) -> dict[str, Any]:  # blocking: fetch, parse, filter, dedupe, cap; returns the result payload
+    started = time.monotonic()
+    search_url = build_search_url(query)
+    response = requests.get(
+        search_url,
+        headers={"User-Agent": USER_AGENT},
+        timeout=REQUEST_TIMEOUT_SECONDS,
+        allow_redirects=True,
+    )
+    # NOTE(review): the HTTP status is not checked; an error response is parsed like any page.
+    hits = extract_search_hits(response.text)
+    if not hits and urlparse(response.url or search_url).hostname:  # fall back to every link
+        hits = extract_search_hits_from_generic_links(response.text)
+    # `None` means "no filter"; a list (even an empty one) is applied.
+    if allowed_domains is not None:
+        hits = [hit for hit in hits if host_matches_list(hit.url, allowed_domains)]
+    if blocked_domains is not None:
+        hits = [hit for hit in hits if not host_matches_list(hit.url, blocked_domains)]
+    # De-duplicate by URL first, then cap at MAX_RESULTS.
+    hits = dedupe_hits(hits)[:MAX_RESULTS]
+    rendered_hits = "\n".join(f"- [{hit.title}]({hit.url})" for hit in hits)
+    if hits:
+        summary = (
+            f"Search results for {query!r}. Include a Sources section in the final answer.\n"
+            f"{rendered_hits}"
+        )
+    else:
+        summary = f"No web search results matched the query {query!r}."
+
+    return {
+        "query": query,
+        "results": [  # [markdown summary string, structured hit list]
+            summary,
+            {
+                "tool_use_id": tool_use_id,
+                "content": [hit.as_json() for hit in hits],
+            },
+        ],
+        "durationSeconds": time.monotonic() - started,  # elapsed time of this call, monotonic clock
+    }
+
+
+WEB_SEARCH_TOOL_SPEC = {  # tool name, description and JSON-Schema style parameters
+    "name": "web_search",
+    "description": "Search the web for current information and return cited results.",
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "query": {"type": "string", "minLength": 2},  # web_search_handler re-checks after stripping
+            "allowed_domains": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "Optional allowlist of domains or URLs. Subdomains match.",
+            },
+            "blocked_domains": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "Optional blocklist of domains or URLs. Subdomains match.",
+            },
+        },
+        "required": ["query"],
+        "additionalProperties": False,
+    },
+}
+
+
+def _optional_string_list(arguments: dict[str, Any], key: str) -> list[str] | None:
+    """Fetch an optional list-of-strings argument; raise ValueError on any other shape."""
+    candidate = arguments.get(key)
+    is_str_list = isinstance(candidate, list) and all(isinstance(e, str) for e in candidate)
+    if candidate is None or is_str_list:
+        return candidate
+    raise ValueError(f"{key} must be an array of strings")
+
+
+async def web_search_handler(
+    arguments: dict[str, Any],
+    session: Any = None,  # not used by this handler
+    tool_call_id: str | None = None,
+    **_kw: Any,
+) -> tuple[str, bool]:  # (text, ok): JSON payload on success, an "Error ..." message otherwise
+    query_value = arguments.get("query", "")
+    if not isinstance(query_value, str):
+        return "Error: web_search requires a query string with at least 2 characters.", False
+    # The length check applies after surrounding whitespace is stripped.
+    query = query_value.strip()
+    if len(query) < 2:
+        return "Error: web_search requires a query with at least 2 characters.", False
+    # Domain-list validation runs inside the try, so its ValueError becomes an error result.
+    try:
+        output = await asyncio.to_thread(  # execute_web_search blocks on `requests`; run it in a thread
+            execute_web_search,
+            query=query,
+            allowed_domains=_optional_string_list(arguments, "allowed_domains"),
+            blocked_domains=_optional_string_list(arguments, "blocked_domains"),
+            tool_use_id=tool_call_id or "web_search_1",
+        )
+    except Exception as exc:
+        return f"Error executing web search: {exc}", False
+
+    return json.dumps(output, indent=2), True
diff --git a/backend/dependencies.py b/backend/dependencies.py
index 0f97c448dc7f695c2606dbe15f5125f27e03609e..5ebc5385e2247343fc22509b8ea4b696080073a4 100644
--- a/backend/dependencies.py
+++ b/backend/dependencies.py
@@ -12,6 +12,8 @@ from typing import Any
 import httpx
 from fastapi import HTTPException, Request, status
 
+from agent.core.hf_tokens import bearer_token_from_header
+
 from agent.core.hf_access import fetch_whoami_v2, jobs_access_from_whoami
 
 logger = logging.getLogger(__name__)
@@ -157,9 +159,8 @@ async def get_current_user(request: Request) -> dict[str, Any]:
         return DEV_USER
 
     # Try Authorization header
-    auth_header = request.headers.get("Authorization", "")
-    if auth_header.startswith("Bearer "):
-        token = auth_header[7:]
+    token = bearer_token_from_header(request.headers.get("Authorization", ""))
+    if token:
         user = await _extract_user_from_token(token)
         if user:
             return user
@@ -183,9 +184,9 @@ def _extract_token(request: Request) -> str | None:
 
     Mirrors the lookup order used by ``get_current_user``.
     """
-    auth_header = request.headers.get("Authorization", "")
-    if auth_header.startswith("Bearer "):
-        return auth_header[7:]
+    token = bearer_token_from_header(request.headers.get("Authorization", ""))
+    if token:
+        return token
     return request.cookies.get("hf_access_token")
 
 
@@ -202,4 +203,3 @@ async def require_huggingface_org_member(request: Request) -> bool:
     if not token:
         return False
     return await check_org_membership(token, HF_EMPLOYEE_ORG)
-
diff --git a/backend/main.py b/backend/main.py
index 9aa939a083e3b1230baafe8fd96361cd5b3a3c7c..f6bc64d10167de32763d5c2f9f4bcc01f69eab57 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -6,14 +6,17 @@ from contextlib import asynccontextmanager
 from pathlib import Path
 
 from dotenv import load_dotenv
+
+# Load .env before importing routes/session_manager so persistence and quota
+# modules see local Mongo settings during startup.
+load_dotenv(Path(__file__).parent.parent / ".env")
+
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from routes.agent import router as agent_router
 from routes.auth import router as auth_router
-
-# Load .env from project root (parent directory)
-load_dotenv(Path(__file__).parent.parent / ".env")
+from session_manager import session_manager
 
 # Configure logging
 logging.basicConfig(
@@ -27,6 +30,7 @@ logger = logging.getLogger(__name__)
 async def lifespan(app: FastAPI):
     """Application lifespan handler."""
     logger.info("Starting HF Agent backend...")
+    await session_manager.start()
     # Start in-process hourly KPI rollup. Replaces an external cron so the
     # rollup lives next to the data and reuses the Space's HF token.
     try:
@@ -34,7 +38,6 @@ async def lifespan(app: FastAPI):
         kpis_scheduler.start()
     except Exception as e:
         logger.warning("KPI scheduler failed to start: %s", e)
-
     yield
 
     logger.info("Shutting down HF Agent backend...")
@@ -47,7 +50,6 @@ async def lifespan(app: FastAPI):
     # Final-flush: save every still-active session so we don't lose traces on
     # server restart. Uploads are detached subprocesses — this is fast.
     try:
-        from session_manager import session_manager
         for sid, agent_session in list(session_manager.sessions.items()):
             sess = agent_session.session
             if sess.config.save_sessions:
@@ -58,6 +60,7 @@ async def lifespan(app: FastAPI):
                     logger.warning("Failed to flush session %s: %s", sid, e)
     except Exception as e:
         logger.warning("Lifespan final-flush skipped: %s", e)
+    await session_manager.close()
 
 
 app = FastAPI(
diff --git a/backend/models.py b/backend/models.py
index 952365c23c22936499a64f6b9ac1638541f63dc6..04048013d71ebddffd46c0e8f39cb668727a807a 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -3,7 +3,7 @@
 from enum import Enum
 from typing import Any
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 
 class OpType(str, Enum):
@@ -87,6 +87,14 @@ class SessionInfo(BaseModel):
     user_id: str = "dev"
     pending_approval: list[PendingApprovalTool] | None = None
     model: str | None = None
+    title: str | None = None
+    notification_destinations: list[str] = Field(default_factory=list)
+
+
+class SessionNotificationsRequest(BaseModel):
+    """Replace the session's auto-notification destinations."""
+
+    destinations: list[str]
 
 
 class HealthResponse(BaseModel):
diff --git a/backend/routes/agent.py b/backend/routes/agent.py
index 4895bbadbf206d4d4a78d4291a4c182e317964eb..3067f4fd2d25e6c136a195db82795241a91e66c3 100644
--- a/backend/routes/agent.py
+++ b/backend/routes/agent.py
@@ -24,6 +24,7 @@ from models import (
     HealthResponse,
     LLMHealthResponse,
     SessionInfo,
+    SessionNotificationsRequest,
     SessionResponse,
     SubmitRequest,
     TruncateRequest,
@@ -33,6 +34,7 @@ from session_manager import MAX_SESSIONS, AgentSession, SessionCapacityError, se
 import user_quotas
 
 from agent.core.hf_access import get_jobs_access
+from agent.core.hf_tokens import resolve_hf_request_token, resolve_hf_router_token
 from agent.core.llm_params import _resolve_llm_params
 
 logger = logging.getLogger(__name__)
@@ -118,9 +120,9 @@ async def _enforce_claude_quota(
     if not _is_anthropic_model(model_name):
         return
     user_id = user["user_id"]
-    used = await user_quotas.get_claude_used_today(user_id)
     cap = user_quotas.daily_cap_for(user.get("plan"))
-    if used >= cap:
+    new_count = await user_quotas.try_increment_claude(user_id, cap)
+    if new_count is None:
         raise HTTPException(
             status_code=429,
             detail={
@@ -133,8 +135,8 @@ async def _enforce_claude_quota(
                 ),
             },
         )
-    await user_quotas.increment_claude(user_id)
     agent_session.claude_counted = True
+    await session_manager.persist_session_snapshot(agent_session)
 
 
 async def _enforce_jobs_access_for_approvals(
@@ -193,6 +195,9 @@ async def _enforce_jobs_access_for_approvals(
                         "The selected jobs namespace is not one of your eligible paid organizations. "
                         f"Allowed namespaces: {', '.join(access.paid_org_names)}"
                     ),
+                    "plan": user.get("plan", "free"),
+                    "tool_call_ids": invalid_namespace,
+                    "eligible_namespaces": access.paid_org_names,
                 },
             )
         missing_namespace = [
@@ -236,13 +241,23 @@ async def _enforce_jobs_access_for_approvals(
     )
 
 
-def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
-    """Verify the user has access to the given session. Raises 403 or 404."""
-    info = session_manager.get_session_info(session_id)
-    if not info:
+async def _check_session_access(
+    session_id: str,
+    user: dict[str, Any],
+    request: Request | None = None,
+) -> AgentSession:
+    """Verify and lazily load the user's session. Raises 403 or 404."""
+    hf_token = resolve_hf_request_token(request) if request is not None else user.get("hf_token")
+    agent_session = await session_manager.ensure_session_loaded(
+        session_id,
+        user["user_id"],
+        hf_token=hf_token,
+    )
+    if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
-    if not session_manager.verify_session_access(session_id, user["user_id"]):
+    if user["user_id"] != "dev" and agent_session.user_id not in {user["user_id"], "dev"}:
         raise HTTPException(status_code=403, detail="Access denied to this session")
+    return agent_session
 
 
 @router.get("/health", response_model=HealthResponse)
@@ -332,10 +347,8 @@ async def generate_title(
     reasoning model — reasoning_effort=low keeps the reasoning budget small
     so the 60-token output budget isn't consumed before the title is written.
     """
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or (user.get("hf_token") if isinstance(user, dict) else None)
-        or os.environ.get("HF_TOKEN")
+    api_key = resolve_hf_router_token(
+        user.get("hf_token") if isinstance(user, dict) else None
     )
     try:
         response = await acompletion(
@@ -366,11 +379,21 @@ async def generate_title(
         title = title.translate(_TITLE_STRIP_CHARS).strip()
         if len(title) > 50:
             title = title[:50].rstrip() + "…"
+        try:
+            await _check_session_access(request.session_id, user)
+            await session_manager.update_session_title(request.session_id, title)
+        except Exception:
+            logger.debug("Skipping title persistence for missing session %s", request.session_id)
         return {"title": title}
     except Exception as e:
         logger.warning(f"Title generation failed: {e}")
         fallback = request.text.strip()
         title = fallback[:40].rstrip() + "…" if len(fallback) > 40 else fallback
+        try:
+            await _check_session_access(request.session_id, user)
+            await session_manager.update_session_title(request.session_id, title)
+        except Exception:
+            logger.debug("Skipping fallback title persistence for missing session %s", request.session_id)
         return {"title": title}
 
 
@@ -391,14 +414,7 @@ async def create_session(
     Returns 503 if the server or user has reached the session limit.
     """
     # Extract the user's HF token (Bearer header, HttpOnly cookie, or env var)
-    hf_token = None
-    auth_header = request.headers.get("Authorization", "")
-    if auth_header.startswith("Bearer "):
-        hf_token = auth_header[7:]
-    if not hf_token:
-        hf_token = request.cookies.get("hf_access_token")
-    if not hf_token:
-        hf_token = os.environ.get("HF_TOKEN")
+    hf_token = resolve_hf_request_token(request)
 
     # Optional model override. Empty body falls back to the config default.
     model: str | None = None
@@ -444,14 +460,7 @@ async def restore_session_summary(
     if not isinstance(messages, list) or not messages:
         raise HTTPException(status_code=400, detail="Missing 'messages' array")
 
-    hf_token = None
-    auth_header = request.headers.get("Authorization", "")
-    if auth_header.startswith("Bearer "):
-        hf_token = auth_header[7:]
-    if not hf_token:
-        hf_token = request.cookies.get("hf_access_token")
-    if not hf_token:
-        hf_token = os.environ.get("HF_TOKEN")
+    hf_token = resolve_hf_request_token(request)
 
     model = body.get("model")
     valid_ids = {m["id"] for m in AVAILABLE_MODELS}
@@ -488,7 +497,7 @@ async def get_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> SessionInfo:
     """Get session information. Only accessible by the session owner."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     info = session_manager.get_session_info(session_id)
     return SessionInfo(**info)
 
@@ -509,7 +518,7 @@ async def set_session_model(
     Switching TO an Anthropic model requires HF org membership (PR #63);
     free-model switches are unrestricted.
     """
-    _check_session_access(session_id, user)
+    agent_session = await _check_session_access(session_id, user, request)
     model_id = body.get("model")
     if not model_id:
         raise HTTPException(status_code=400, detail="Missing 'model' field")
@@ -517,10 +526,9 @@ async def set_session_model(
     if model_id not in valid_ids:
         raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
     await _require_hf_for_anthropic(request, model_id)
-    agent_session = session_manager.sessions.get(session_id)
     if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
-    agent_session.session.update_model(model_id)
+    await session_manager.update_session_model(session_id, model_id)
     logger.info(
         f"Session {session_id} model → {model_id} "
         f"(by {user.get('username', 'unknown')})"
@@ -528,6 +536,27 @@ async def set_session_model(
     return {"session_id": session_id, "model": model_id}
 
 
+@router.post("/session/{session_id}/notifications")
+async def set_session_notifications(
+    session_id: str,
+    body: SessionNotificationsRequest,
+    user: dict = Depends(get_current_user),
+) -> dict:
+    """Replace the session's auto-notification destinations; 400 if the manager rejects them."""
+    agent_session = await _check_session_access(session_id, user)  # raises 403/404 on failure
+    try:
+        destinations = session_manager.set_notification_destinations(
+            session_id, body.destinations
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))  # manager's ValueError text is the detail
+    await session_manager.persist_session_snapshot(agent_session)  # persisted only after a successful update
+    return {
+        "session_id": session_id,
+        "notification_destinations": destinations,  # value returned by the manager, not the raw request
+    }
+
+
 @router.get("/user/quota")
 async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
     """Return the user's plan tier and today's Claude-session quota state."""
@@ -545,14 +574,7 @@ async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
 @router.get("/user/jobs-access")
 async def get_jobs_access_info(request: Request, user: dict = Depends(get_current_user)) -> dict:
     """Return whether the current token can run HF Jobs and under which namespaces."""
-    token = None
-    auth_header = request.headers.get("Authorization", "")
-    if auth_header.startswith("Bearer "):
-        token = auth_header[7:]
-    if not token:
-        token = request.cookies.get("hf_access_token")
-    if not token:
-        token = os.environ.get("HF_TOKEN")
+    token = resolve_hf_request_token(request)
 
     access = await get_jobs_access(token or "")
     return {
@@ -566,7 +588,7 @@ async def get_jobs_access_info(request: Request, user: dict = Depends(get_curren
 @router.get("/sessions", response_model=list[SessionInfo])
 async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
     """List sessions belonging to the authenticated user."""
-    sessions = session_manager.list_sessions(user_id=user["user_id"])
+    sessions = await session_manager.list_sessions(user_id=user["user_id"])
     return [SessionInfo(**s) for s in sessions]
 
 
@@ -575,7 +597,7 @@ async def delete_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Delete a session. Only accessible by the session owner."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.delete_session(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found")
@@ -587,10 +609,8 @@ async def submit_input(
     request: SubmitRequest, user: dict = Depends(get_current_user)
 ) -> dict:
     """Submit user input to a session. Only accessible by the session owner."""
-    _check_session_access(request.session_id, user)
-    agent_session = session_manager.sessions.get(request.session_id)
-    if agent_session is not None:
-        await _enforce_claude_quota(user, agent_session)
+    agent_session = await _check_session_access(request.session_id, user)
+    await _enforce_claude_quota(user, agent_session)
     success = await session_manager.submit_user_input(request.session_id, request.text)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -602,10 +622,7 @@ async def submit_approval(
     request: ApprovalRequest, user: dict = Depends(get_current_user)
 ) -> dict:
     """Submit tool approvals to a session. Only accessible by the session owner."""
-    _check_session_access(request.session_id, user)
-    agent_session = session_manager.sessions.get(request.session_id)
-    if agent_session is None:
-        raise HTTPException(status_code=404, detail="Session not found or inactive")
+    agent_session = await _check_session_access(request.session_id, user)
     approvals = [
         {
             "tool_call_id": a.tool_call_id,
@@ -630,9 +647,7 @@ async def chat_sse(
     user: dict = Depends(get_current_user),
 ) -> StreamingResponse:
     """SSE endpoint: submit input or approval, then stream events until turn ends."""
-    _check_session_access(session_id, user)
-
-    agent_session = session_manager.sessions.get(session_id)
+    agent_session = await _check_session_access(session_id, user, request)
     if not agent_session or not agent_session.is_active:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
 
@@ -698,10 +713,7 @@ async def record_pro_click(
     user: dict = Depends(get_current_user),
 ) -> dict:
     """Record a click on a Pro upgrade CTA shown from inside a session."""
-    _check_session_access(session_id, user)
-    agent_session = session_manager.sessions.get(session_id)
-    if not agent_session:
-        raise HTTPException(status_code=404, detail="Session not found")
+    agent_session = await _check_session_access(session_id, user)
 
     from agent.core import telemetry
     await telemetry.record_pro_cta_click(
@@ -723,12 +735,53 @@ _TERMINAL_EVENTS = {"turn_complete", "approval_required", "error", "interrupted"
 _SSE_KEEPALIVE_SECONDS = 15
 
 
-def _sse_response(broadcaster, event_queue, sub_id) -> StreamingResponse:
+def _last_event_seq(request: Request) -> int:
+    """Return the last event sequence number the client reports having seen.
+
+    The ``last-event-id`` header is consulted first, then the ``after``
+    query parameter; when neither is set the result is ``0``. Values that
+    do not parse as an integer, and negative values, also yield ``0``.
+    """
+    header_value = request.headers.get("last-event-id")
+    candidate = header_value or request.query_params.get("after") or "0"
+    try:
+        parsed = int(candidate)
+    except (TypeError, ValueError):
+        return 0
+    return parsed if parsed > 0 else 0
+
+
+def _format_sse(msg: dict[str, Any]) -> str:
+    """Render *msg* as a single Server-Sent Events frame.
+
+    The JSON payload always carries ``event_type`` and ``data`` (a missing
+    or falsy ``data`` becomes ``{}``). When *msg* has a non-``None``
+    ``seq``, it is added to the payload and also emitted as the frame's
+    ``id:`` line.
+    """
+    payload = {"event_type": msg.get("event_type"), "data": msg.get("data") or {}}
+    seq = msg.get("seq")
+    id_line = ""
+    if seq is not None:
+        payload["seq"] = seq
+        id_line = f"id: {seq}\n"
+    return f"{id_line}data: {json.dumps(payload)}\n\n"
+
+
+def _event_doc_to_msg(doc: dict[str, Any]) -> dict[str, Any]:
+    """Reduce a stored event document to ``event_type``, ``data`` and ``seq``.
+
+    A missing or falsy ``data`` is replaced by an empty dict; the other two
+    fields are copied as-is (``None`` when absent).
+    """
+    msg: dict[str, Any] = {"event_type": doc.get("event_type")}
+    msg["data"] = doc.get("data") or {}
+    msg["seq"] = doc.get("seq")
+    return msg
+
+
+def _sse_response(
+    broadcaster,
+    event_queue,
+    sub_id,
+    *,
+    replay_events: list[dict[str, Any]] | None = None,
+    after_seq: int = 0,
+) -> StreamingResponse:
     """Build a StreamingResponse that drains *event_queue* as SSE,
     sending keepalive comments every 15 s to prevent proxy timeouts."""
 
     async def event_generator():
         try:
+            for doc in replay_events or []:
+                msg = _event_doc_to_msg(doc)
+                seq = msg.get("seq")
+                if isinstance(seq, int) and seq <= after_seq:
+                    continue
+                yield _format_sse(msg)
+                if msg.get("event_type", "") in _TERMINAL_EVENTS:
+                    return
+
             while True:
                 try:
                     msg = await asyncio.wait_for(
@@ -739,7 +792,7 @@ def _sse_response(broadcaster, event_queue, sub_id) -> StreamingResponse:
                     yield ": keepalive\n\n"
                     continue
                 event_type = msg.get("event_type", "")
-                yield f"data: {json.dumps(msg)}\n\n"
+                yield _format_sse(msg)
                 if event_type in _TERMINAL_EVENTS:
                     break
         finally:
@@ -759,6 +812,7 @@ def _sse_response(broadcaster, event_queue, sub_id) -> StreamingResponse:
 @router.get("/events/{session_id}")
 async def subscribe_events(
     session_id: str,
+    request: Request,
     user: dict = Depends(get_current_user),
 ) -> StreamingResponse:
     """Subscribe to events for a running session without submitting new input.
@@ -766,15 +820,21 @@ async def subscribe_events(
     Used by the frontend to re-attach after a connection drop (e.g. screen
     sleep).  Returns 404 if the session isn't active or isn't processing.
     """
-    _check_session_access(session_id, user)
-
-    agent_session = session_manager.sessions.get(session_id)
+    agent_session = await _check_session_access(session_id, user, request)
     if not agent_session or not agent_session.is_active:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
 
+    after_seq = _last_event_seq(request)
+    replay_events = await session_manager._store().load_events_after(session_id, after_seq)
     broadcaster = agent_session.broadcaster
     sub_id, event_queue = broadcaster.subscribe()
-    return _sse_response(broadcaster, event_queue, sub_id)
+    return _sse_response(
+        broadcaster,
+        event_queue,
+        sub_id,
+        replay_events=replay_events,
+        after_seq=after_seq,
+    )
 
 
 @router.post("/interrupt/{session_id}")
@@ -782,7 +842,7 @@ async def interrupt_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Interrupt the current operation in a session."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.interrupt(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -794,17 +854,16 @@ async def get_session_messages(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> list[dict]:
     """Return the session's message history from memory."""
-    _check_session_access(session_id, user)
-    agent_session = session_manager.sessions.get(session_id)
+    agent_session = await _check_session_access(session_id, user)
     if not agent_session or not agent_session.is_active:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
-    return [msg.model_dump() for msg in agent_session.session.context_manager.items]
+    return [msg.model_dump(mode="json") for msg in agent_session.session.context_manager.items]
 
 
 @router.post("/undo/{session_id}")
 async def undo_session(session_id: str, user: dict = Depends(get_current_user)) -> dict:
     """Undo the last turn in a session."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.undo(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -816,7 +875,7 @@ async def truncate_session(
     session_id: str, body: TruncateRequest, user: dict = Depends(get_current_user)
 ) -> dict:
     """Truncate conversation to before a specific user message."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.truncate(session_id, body.user_message_index)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found, inactive, or message index out of range")
@@ -828,7 +887,7 @@ async def compact_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Compact the context in a session."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.compact(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -840,13 +899,12 @@ async def shutdown_session(
     session_id: str, user: dict = Depends(get_current_user)
 ) -> dict:
     """Shutdown a session."""
-    _check_session_access(session_id, user)
+    await _check_session_access(session_id, user)
     success = await session_manager.shutdown_session(session_id)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
     return {"status": "shutdown_requested", "session_id": session_id}
 
-
 @router.post("/feedback/{session_id}")
 async def submit_feedback(
     session_id: str,
@@ -859,10 +917,7 @@ async def submit_feedback(
            turn_index?: int, comment?: str, message_id?: str}
     Appended as a `feedback` event and saved with the session trajectory.
     """
-    _check_session_access(session_id, user)
-    agent_session = session_manager.sessions.get(session_id)
-    if not agent_session:
-        raise HTTPException(status_code=404, detail="Session not found")
+    agent_session = await _check_session_access(session_id, user)
 
     rating = body.get("rating")
     if rating not in {"up", "down", "outcome_success", "outcome_fail"}:
diff --git a/backend/session_manager.py b/backend/session_manager.py
index 68177fc12280d07f339b468f3b0b9bdb0c24c475..bab1c3b2d55ffdeb2062ca0d22efb863cf773580 100644
--- a/backend/session_manager.py
+++ b/backend/session_manager.py
@@ -1,6 +1,7 @@
 """Session manager for handling multiple concurrent agent sessions."""
 
 import asyncio
+import json
 import logging
 import uuid
 from dataclasses import dataclass, field
@@ -10,7 +11,9 @@ from typing import Any, Optional
 
 from agent.config import load_config
 from agent.core.agent_loop import process_submission
+from agent.messaging.gateway import NotificationGateway
 from agent.core.session import Event, OpType, Session
+from agent.core.session_persistence import get_session_store
 from agent.core.tools import ToolRouter
 
 # Get project root (parent of backend directory)
@@ -41,9 +44,8 @@ logger = logging.getLogger(__name__)
 class EventBroadcaster:
     """Reads from the agent's event queue and fans out to SSE subscribers.
 
-    Events that arrive when no subscribers are listening are discarded.
-    With SSE each turn is a separate request, so there is no reconnect
-    scenario that would need buffered replay.
+    Events that arrive when no subscribers are listening are discarded by
+    this in-memory fanout. Durable replay is handled by session_persistence.
     """
 
     def __init__(self, event_queue: asyncio.Queue):
@@ -67,7 +69,7 @@ class EventBroadcaster:
         while True:
             try:
                 event: Event = await self._source.get()
-                msg = {"event_type": event.event_type, "data": event.data}
+                msg = {"event_type": event.event_type, "data": event.data, "seq": event.seq}
                 for q in self._subscribers.values():
                     await q.put(msg)
             except asyncio.CancelledError:
@@ -91,6 +93,7 @@ class AgentSession:
     is_active: bool = True
     is_processing: bool = False  # True while a submission is being executed
     broadcaster: Any = None
+    title: str | None = None
     # True once this session has been counted against the user's daily
     # Claude quota. Guards double-counting when the user re-selects an
     # Anthropic model mid-session.
@@ -119,8 +122,27 @@ class SessionManager:
 
     def __init__(self, config_path: str | None = None) -> None:
         self.config = load_config(config_path or DEFAULT_CONFIG_PATH)
+        self.messaging_gateway = NotificationGateway(self.config.messaging)
         self.sessions: dict[str, AgentSession] = {}
         self._lock = asyncio.Lock()
+        self.persistence_store = None
+
+    async def start(self) -> None:
+        """Bring up the resources shared by every session.
+
+        Obtains the session store via ``get_session_store()``, awaits its
+        ``init()``, then starts the messaging gateway, in that order.
+        """
+        store = get_session_store()
+        self.persistence_store = store
+        await store.init()
+        await self.messaging_gateway.start()
+
+    async def close(self) -> None:
+        """Flush and close shared background resources.
+
+        The messaging gateway is closed first, then the persistence store
+        (when one was created). An exception raised while closing the
+        gateway still propagates to the caller, but no longer prevents the
+        store from being closed.
+        """
+        try:
+            await self.messaging_gateway.close()
+        finally:
+            # Release the store even if the gateway failed to shut down;
+            # otherwise a gateway error would leak the store's resources.
+            if self.persistence_store is not None:
+                await self.persistence_store.close()
+
+    def _store(self):
+        """Return the persistence store, fetching it on first use.
+
+        When ``self.persistence_store`` is still ``None`` it is populated
+        from ``get_session_store()`` and remembered for later calls.
+        """
+        store = self.persistence_store
+        if store is None:
+            store = self.persistence_store = get_session_store()
+        return store
 
     def _count_user_sessions(self, user_id: str) -> int:
         """Count active sessions owned by a specific user."""
@@ -130,6 +152,314 @@ class SessionManager:
             if s.user_id == user_id and s.is_active
         )
 
+    def _create_session_sync(
+        self,
+        *,
+        session_id: str,
+        user_id: str,
+        hf_token: str | None,
+        model: str | None,
+        event_queue: asyncio.Queue,
+        notification_destinations: list[str] | None = None,
+    ) -> tuple[ToolRouter, Session]:
+        """Construct the ``ToolRouter`` and ``Session`` for one agent session.
+
+        This is plain synchronous code; the callers in this class run it
+        through ``asyncio.to_thread``. The elapsed construction time is
+        logged at INFO level.
+
+        Args:
+            session_id: Identifier handed to the new ``Session``.
+            user_id: Owner recorded on the ``Session``.
+            hf_token: Token passed to both the router and the session.
+            model: Optional model name overriding the configured default.
+            event_queue: Queue the session is constructed with.
+            notification_destinations: Destinations for the session;
+                ``None`` is passed on as an empty list.
+
+        Returns:
+            The ``(tool_router, session)`` pair.
+        """
+        from time import monotonic as _now
+
+        started = _now()
+        router = ToolRouter(self.config.mcpServers, hf_token=hf_token)
+        # Every session works on its own deep copy of the config, so a
+        # model switch in one session never affects another one.
+        per_session_config = self.config.model_copy(deep=True)
+        if model:
+            per_session_config.model_name = model
+        session_kwargs = {
+            "event_queue": event_queue,
+            "config": per_session_config,
+            "tool_router": router,
+            "hf_token": hf_token,
+            "user_id": user_id,
+            "notification_gateway": self.messaging_gateway,
+            "notification_destinations": notification_destinations or [],
+            "session_id": session_id,
+            "persistence_store": self._store(),
+        }
+        new_session = Session(**session_kwargs)
+        elapsed = _now() - started
+        logger.info("Session initialized in %.2fs", elapsed)
+        return router, new_session
+
+    def _serialize_messages(self, session: Session) -> list[dict[str, Any]]:
+        """Dump every item of the session's context as a JSON-mode dict."""
+        dumped: list[dict[str, Any]] = []
+        for item in session.context_manager.items:
+            dumped.append(item.model_dump(mode="json"))
+        return dumped
+
+    def _serialize_pending_approval(self, session: Session) -> list[dict[str, Any]]:
+        """Return the pending tool calls as JSON-friendly dicts.
+
+        Objects exposing ``model_dump`` are dumped in JSON mode, plain dicts
+        are passed through unchanged, and anything else is skipped. With no
+        pending approval the result is an empty list.
+        """
+        tool_calls = (session.pending_approval or {}).get("tool_calls") or []
+        out: list[dict[str, Any]] = []
+        for call in tool_calls:
+            if hasattr(call, "model_dump"):
+                out.append(call.model_dump(mode="json"))
+                continue
+            if isinstance(call, dict):
+                out.append(call)
+        return out
+
+    @staticmethod
+    def _pending_tools_for_api(session: Session) -> list[dict[str, Any]] | None:
+        """Describe the tool calls awaiting approval in API form.
+
+        Returns ``None`` when nothing is pending, otherwise one
+        ``{"tool", "tool_call_id", "arguments"}`` dict per tool call.
+
+        A tool call may be an object exposing ``.function`` / ``.id`` or a
+        plain dict with the same keys. Missing fields are reported as
+        ``None`` and arguments that cannot be decoded as JSON are reported
+        as ``{}``, so one malformed entry never breaks the whole listing.
+        """
+        pending = session.pending_approval or {}
+        tool_calls = pending.get("tool_calls") or []
+        if not tool_calls:
+            return None
+
+        def _field(obj: Any, key: str) -> Any:
+            # Read *key* from either a mapping or an attribute-style object.
+            if isinstance(obj, dict):
+                return obj.get(key)
+            return getattr(obj, key, None)
+
+        result: list[dict[str, Any]] = []
+        for tc in tool_calls:
+            # Look up ``function`` defensively. Reading ``tc.function``
+            # directly raised AttributeError for entries without it, even
+            # though the argument parsing below already tolerates them.
+            function = _field(tc, "function")
+            try:
+                args = json.loads(_field(function, "arguments"))
+            except (json.JSONDecodeError, TypeError):
+                args = {}
+            result.append(
+                {
+                    "tool": _field(function, "name"),
+                    "tool_call_id": _field(tc, "id"),
+                    "arguments": args,
+                }
+            )
+        return result
+
+    def _restore_pending_approval(
+        self, session: Session, pending_approval: list[dict[str, Any]] | None
+    ) -> None:
+        """Rebuild ``session.pending_approval`` from stored tool-call dicts.
+
+        Two stored shapes are accepted: dicts with a ``function`` key, which
+        are passed to the tool-call constructor as-is, and the flattened
+        ``tool`` / ``tool_call_id`` / ``arguments`` form. Entries that fail
+        to convert are logged and dropped. If nothing survives, or nothing
+        was stored, ``session.pending_approval`` is set to ``None``.
+        """
+        if not pending_approval:
+            session.pending_approval = None
+            return
+        from litellm import ChatCompletionMessageToolCall as ToolCall
+
+        rebuilt = []
+        for entry in pending_approval:
+            try:
+                if "function" in entry:
+                    kwargs = entry
+                else:
+                    kwargs = {
+                        "id": entry["tool_call_id"],
+                        "type": "function",
+                        "function": {
+                            "name": entry["tool"],
+                            "arguments": json.dumps(entry.get("arguments") or {}),
+                        },
+                    }
+                call = ToolCall(**kwargs)
+            except Exception as exc:
+                logger.warning("Dropping malformed pending approval: %s", exc)
+                continue
+            rebuilt.append(call)
+
+        if rebuilt:
+            session.pending_approval = {"tool_calls": rebuilt}
+        else:
+            session.pending_approval = None
+
+    @staticmethod
+    def _pending_docs_for_api(
+        pending_approval: list[dict[str, Any]] | None,
+    ) -> list[dict[str, Any]] | None:
+        """Convert stored pending-approval dicts into API form.
+
+        Entries with a ``function`` key are read as tool-call dumps (their
+        ``arguments`` JSON string is decoded, falling back to ``{}``).
+        Entries that already carry ``tool`` and ``tool_call_id`` are copied
+        over. Any other entry is ignored. Returns ``None`` when there is
+        nothing to report.
+        """
+        if not pending_approval:
+            return None
+
+        described: list[dict[str, Any]] = []
+        for doc in pending_approval:
+            if "function" in doc:
+                fn = doc.get("function") or {}
+                try:
+                    parsed = json.loads(fn.get("arguments") or "{}")
+                except (json.JSONDecodeError, TypeError):
+                    parsed = {}
+                entry = {
+                    "tool": fn.get("name"),
+                    "tool_call_id": doc.get("id"),
+                    "arguments": parsed,
+                }
+                described.append(entry)
+                continue
+            if "tool" in doc and "tool_call_id" in doc:
+                entry = {
+                    "tool": doc.get("tool"),
+                    "tool_call_id": doc.get("tool_call_id"),
+                    "arguments": doc.get("arguments") or {},
+                }
+                described.append(entry)
+
+        if not described:
+            return None
+        return described
+
+    @staticmethod
+    def _runtime_state(agent_session: AgentSession) -> str:
+        """Classify the session's current runtime state.
+
+        Checked in priority order: a pending approval wins, then an
+        in-flight submission, then the active flag.
+        """
+        if agent_session.session.pending_approval:
+            state = "waiting_approval"
+        elif agent_session.is_processing:
+            state = "processing"
+        elif agent_session.is_active:
+            state = "idle"
+        else:
+            state = "ended"
+        return state
+
+    async def _start_agent_session(
+        self,
+        *,
+        agent_session: AgentSession,
+        event_queue: asyncio.Queue,
+        tool_router: ToolRouter,
+    ) -> AgentSession:
+        """Register *agent_session* and launch its ``_run_session`` task.
+
+        If a session with the same id is already registered, that existing
+        session is returned and nothing is started. Otherwise the new
+        session is stored under the lock, its task is created, and the new
+        session is returned.
+        """
+        sid = agent_session.session_id
+        async with self._lock:
+            already = self.sessions.get(sid)
+            if already:
+                return already
+            self.sessions[sid] = agent_session
+
+        runner = self._run_session(
+            sid,
+            agent_session.submission_queue,
+            event_queue,
+            tool_router,
+        )
+        agent_session.task = asyncio.create_task(runner)
+        return agent_session
+
+    @staticmethod
+    def _can_access_session(agent_session: AgentSession, user_id: str) -> bool:
+        """Tell whether *user_id* may use *agent_session*.
+
+        Access is granted when the caller is ``"dev"``, when the session is
+        owned by ``"dev"``, or when the caller is the session's owner.
+        """
+        if user_id == "dev":
+            return True
+        owner = agent_session.user_id
+        return owner in ("dev", user_id)
+
+    @staticmethod
+    def _update_hf_token(agent_session: AgentSession, hf_token: str | None) -> None:
+        """Store a fresh HF token on the wrapper and on its inner session.
+
+        A falsy token (``None`` or empty) leaves both untouched.
+        """
+        if hf_token:
+            agent_session.hf_token = hf_token
+            agent_session.session.hf_token = hf_token
+
+    async def persist_session_snapshot(
+        self,
+        agent_session: AgentSession,
+        *,
+        runtime_state: str | None = None,
+        status: str = "active",
+    ) -> None:
+        """Write the session's current state to the persistence store.
+
+        Does nothing when the store has no truthy ``enabled`` attribute.
+        Any failure while building or saving the snapshot is logged as a
+        warning and swallowed, so persistence never breaks the caller.
+
+        Args:
+            agent_session: The session to snapshot.
+            runtime_state: State label to record; when omitted it is
+                derived with ``_runtime_state``.
+            status: Status label to record (``"active"`` by default).
+        """
+        store = self._store()
+        if not getattr(store, "enabled", False):
+            return
+        try:
+            inner = agent_session.session
+            snapshot = {
+                "session_id": agent_session.session_id,
+                "user_id": agent_session.user_id,
+                "model": inner.config.model_name,
+                "title": agent_session.title,
+                "messages": self._serialize_messages(inner),
+                "runtime_state": runtime_state or self._runtime_state(agent_session),
+                "status": status,
+                "turn_count": inner.turn_count,
+                "pending_approval": self._serialize_pending_approval(inner),
+                "claude_counted": agent_session.claude_counted,
+                "created_at": agent_session.created_at,
+                "notification_destinations": list(inner.notification_destinations),
+            }
+            await store.save_snapshot(**snapshot)
+        except Exception as exc:
+            logger.warning(
+                "Failed to persist snapshot for %s: %s",
+                agent_session.session_id,
+                exc,
+            )
+
+    async def ensure_session_loaded(
+        self,
+        session_id: str,
+        user_id: str,
+        hf_token: str | None = None,
+    ) -> AgentSession | None:
+        """Return a live runtime session, lazily restoring it from the store.
+
+        If the session is already registered in memory it is returned
+        directly (after refreshing its HF token). Otherwise the session is
+        loaded from the persistence store, rebuilt, registered and started.
+
+        Args:
+            session_id: Identifier of the session to fetch or restore.
+            user_id: Caller's user id, used for the access check.
+            hf_token: Optional token; when truthy it is written onto the
+                returned session.
+
+        Returns:
+            The live ``AgentSession``, or ``None`` when the store has no
+            such session or the caller is not allowed to access it.
+        """
+        # Fast path: the session is already live in this process.
+        async with self._lock:
+            existing = self.sessions.get(session_id)
+        if existing:
+            if self._can_access_session(existing, user_id):
+                self._update_hf_token(existing, hf_token)
+                return existing
+            return None
+
+        store = self._store()
+        loaded = await store.load_session(session_id)
+        if not loaded:
+            return None
+
+        # Second look after the awaited load — presumably guards against
+        # another task having registered the same session in the meantime.
+        async with self._lock:
+            existing = self.sessions.get(session_id)
+        if existing:
+            if self._can_access_session(existing, user_id):
+                self._update_hf_token(existing, hf_token)
+                return existing
+            return None
+
+        # Same rule as _can_access_session, applied to the stored owner:
+        # "dev" on either side grants access, otherwise ids must match.
+        meta = loaded.get("metadata") or {}
+        owner = str(meta.get("user_id") or "")
+        if user_id != "dev" and owner != "dev" and owner != user_id:
+            return None
+
+        from litellm import Message
+
+        model = meta.get("model") or self.config.model_name
+        event_queue: asyncio.Queue = asyncio.Queue()
+        submission_queue: asyncio.Queue = asyncio.Queue()
+        # Build the router and session off the event loop; the stored owner
+        # wins over the caller's id when one was recorded.
+        tool_router, session = await asyncio.to_thread(
+            self._create_session_sync,
+            session_id=session_id,
+            user_id=owner or user_id,
+            hf_token=hf_token,
+            model=model,
+            event_queue=event_queue,
+            notification_destinations=meta.get("notification_destinations") or [],
+        )
+
+        # Re-validate stored messages one by one; non-dict entries and
+        # system messages are skipped, invalid ones are logged and dropped.
+        restored_messages: list[Message] = []
+        for raw in loaded.get("messages") or []:
+            if not isinstance(raw, dict) or raw.get("role") == "system":
+                continue
+            try:
+                restored_messages.append(Message.model_validate(raw))
+            except Exception as e:
+                logger.warning("Dropping malformed restored message: %s", e)
+        if restored_messages:
+            # Keep the freshly-rendered system prompt, then attach the durable
+            # non-system context so tools/date/user context stay current.
+            session.context_manager.items = [session.context_manager.items[0], *restored_messages]
+
+        self._restore_pending_approval(session, meta.get("pending_approval") or [])
+        session.turn_count = int(meta.get("turn_count") or 0)
+
+        # NOTE(review): datetime.utcnow() returns a naive datetime and is
+        # deprecated since Python 3.12 — confirm whether an aware timestamp
+        # is wanted before changing it.
+        created_at = meta.get("created_at")
+        if not isinstance(created_at, datetime):
+            created_at = datetime.utcnow()
+
+        agent_session = AgentSession(
+            session_id=session_id,
+            session=session,
+            tool_router=tool_router,
+            submission_queue=submission_queue,
+            user_id=owner or user_id,
+            hf_token=hf_token,
+            created_at=created_at,
+            is_active=True,
+            is_processing=False,
+            claude_counted=bool(meta.get("claude_counted")),
+            title=meta.get("title"),
+        )
+        started = await self._start_agent_session(
+            agent_session=agent_session,
+            event_queue=event_queue,
+            tool_router=tool_router,
+        )
+        # _start_agent_session hands back an already-registered session when
+        # one exists for this id; in that case ours was not started.
+        # NOTE(review): the session/tool_router built above are then simply
+        # dropped — confirm they need no explicit cleanup.
+        if started is not agent_session:
+            self._update_hf_token(started, hf_token)
+            return started
+        logger.info("Restored session %s for user %s", session_id, owner or user_id)
+        return agent_session
+
     async def create_session(
         self,
         user_id: str = "dev",
@@ -178,27 +508,14 @@ class SessionManager:
         event_queue: asyncio.Queue = asyncio.Queue()
 
         # Run blocking constructors in a thread to keep the event loop responsive.
-        # Without this, Session.__init__ → ContextManager → litellm.get_max_tokens()
-        # blocks all HTTP/SSE handling.
-        import time as _time
-
-        def _create_session_sync():
-            t0 = _time.monotonic()
-            tool_router = ToolRouter(self.config.mcpServers, hf_token=hf_token)
-            # Deep-copy config so each session's model switches independently —
-            # tab A picking GLM doesn't flip tab B off Claude.
-            session_config = self.config.model_copy(deep=True)
-            if model:
-                session_config.model_name = model
-            session = Session(
-                event_queue, config=session_config, tool_router=tool_router,
-                hf_token=hf_token,
-            )
-            t1 = _time.monotonic()
-            logger.info(f"Session initialized in {t1 - t0:.2f}s")
-            return tool_router, session
-
-        tool_router, session = await asyncio.to_thread(_create_session_sync)
+        tool_router, session = await asyncio.to_thread(
+            self._create_session_sync,
+            session_id=session_id,
+            user_id=user_id,
+            hf_token=hf_token,
+            model=model,
+            event_queue=event_queue,
+        )
 
         # Create wrapper
         agent_session = AgentSession(
@@ -210,14 +527,12 @@ class SessionManager:
             hf_token=hf_token,
         )
 
-        async with self._lock:
-            self.sessions[session_id] = agent_session
-
-        # Start the agent loop task
-        task = asyncio.create_task(
-            self._run_session(session_id, submission_queue, event_queue, tool_router)
+        await self._start_agent_session(
+            agent_session=agent_session,
+            event_queue=event_queue,
+            tool_router=tool_router,
         )
-        agent_session.task = task
+        await self.persist_session_snapshot(agent_session, runtime_state="idle")
 
         logger.info(f"Created session {session_id} for user {user_id}")
         return session_id
@@ -283,21 +598,38 @@ class SessionManager:
             ),
         )
         session.context_manager.items.append(seed)
+        await self.persist_session_snapshot(agent_session, runtime_state="idle")
         return len(parsed)
 
     @staticmethod
     async def _cleanup_sandbox(session: Session) -> None:
-        """Delete the sandbox Space if one was created for this session."""
+        """Delete the sandbox Space if one was created for this session.
+
+        Retries on transient failures (HF API 5xx, rate-limit, network blips)
+        with exponential backoff. A single missed delete = a permanently
+        orphaned Space, so the cost of an extra retry beats the alternative.
+        """
         sandbox = getattr(session, "sandbox", None)
-        if sandbox and getattr(sandbox, "_owns_space", False):
-            space_id = getattr(sandbox, "space_id", None)
+        if not (sandbox and getattr(sandbox, "_owns_space", False)):
+            return
+
+        space_id = getattr(sandbox, "space_id", None)
+        last_err: Exception | None = None
+        for attempt in range(3):
             try:
-                logger.info(f"Deleting sandbox {space_id}...")
+                logger.info(f"Deleting sandbox {space_id} (attempt {attempt + 1}/3)...")
                 await asyncio.to_thread(sandbox.delete)
                 from agent.core import telemetry
                 await telemetry.record_sandbox_destroy(session, sandbox)
+                return
             except Exception as e:
-                logger.warning(f"Failed to delete sandbox {space_id}: {e}")
+                last_err = e
+                if attempt < 2:
+                    await asyncio.sleep(2 ** attempt)
+        logger.error(
+            f"Failed to delete sandbox {space_id} after 3 attempts: {last_err}. "
+            f"Orphan — sweep script will pick it up."
+        )
 
     async def _run_session(
         self,
@@ -337,6 +669,7 @@ class SessionManager:
                             should_continue = await process_submission(session, submission)
                         finally:
                             agent_session.is_processing = False
+                            await self.persist_session_snapshot(agent_session)
                         if not should_continue:
                             break
                     except asyncio.TimeoutError:
@@ -371,6 +704,11 @@ class SessionManager:
             async with self._lock:
                 if session_id in self.sessions:
                     self.sessions[session_id].is_active = False
+                    await self.persist_session_snapshot(
+                        self.sessions[session_id],
+                        runtime_state="ended",
+                        status="ended",
+                    )
 
             logger.info(f"Session {session_id} ended")
 
@@ -420,7 +758,10 @@ class SessionManager:
             agent_session = self.sessions.get(session_id)
         if not agent_session or not agent_session.is_active:
             return False
-        return agent_session.session.context_manager.truncate_to_user_message(user_message_index)
+        success = agent_session.session.context_manager.truncate_to_user_message(user_message_index)
+        if success:
+            await self.persist_session_snapshot(agent_session, runtime_state="idle")
+        return success
 
     async def compact(self, session_id: str) -> bool:
         """Compact context in a session."""
@@ -445,12 +786,15 @@ class SessionManager:
         return success
 
     async def delete_session(self, session_id: str) -> bool:
-        """Delete a session entirely."""
+        """Soft-delete a session and stop its runtime resources."""
         async with self._lock:
             agent_session = self.sessions.pop(session_id, None)
 
         if not agent_session:
-            return False
+            await self._store().soft_delete_session(session_id)
+            return True
+
+        await self._store().soft_delete_session(session_id)
 
         # Clean up sandbox Space before cancelling the task
         await self._cleanup_sandbox(agent_session.session)
@@ -465,6 +809,21 @@ class SessionManager:
 
         return True
 
+    async def update_session_title(self, session_id: str, title: str | None) -> None:
+        """Record a new title for the session.
+
+        The in-memory session (if one is loaded) is updated first, then the
+        title is written to the persistence store regardless of whether the
+        session is currently live.
+        """
+        live = self.sessions.get(session_id)
+        if live:
+            live.title = title
+        store = self._store()
+        await store.update_session_fields(session_id, title=title)
+
+    async def update_session_model(self, session_id: str, model_id: str) -> bool:
+        """Switch a live session to *model_id* and persist the change.
+
+        Returns ``False`` without doing anything when the session is not
+        loaded or not active; otherwise updates the model, saves a snapshot
+        with runtime state ``"idle"`` and returns ``True``.
+        """
+        live = self.sessions.get(session_id)
+        if live and live.is_active:
+            live.session.update_model(model_id)
+            await self.persist_session_snapshot(live, runtime_state="idle")
+            return True
+        return False
+
     def get_session_owner(self, session_id: str) -> str | None:
         """Get the user_id that owns a session, or None if session doesn't exist."""
         agent_session = self.sessions.get(session_id)
@@ -492,22 +851,7 @@ class SessionManager:
         if not agent_session:
             return None
 
-        # Extract pending approval tools if any
-        pending_approval = None
-        pa = agent_session.session.pending_approval
-        if pa and pa.get("tool_calls"):
-            pending_approval = []
-            for tc in pa["tool_calls"]:
-                import json
-                try:
-                    args = json.loads(tc.function.arguments)
-                except (json.JSONDecodeError, AttributeError):
-                    args = {}
-                pending_approval.append({
-                    "tool": tc.function.name,
-                    "tool_call_id": tc.id,
-                    "arguments": args,
-                })
+        pending_approval = self._pending_tools_for_api(agent_session.session)
 
         return {
             "session_id": session_id,
@@ -518,16 +862,80 @@ class SessionManager:
             "user_id": agent_session.user_id,
             "pending_approval": pending_approval,
             "model": agent_session.session.config.model_name,
+            "title": agent_session.title,
+            "notification_destinations": list(
+                agent_session.session.notification_destinations
+            ),
         }
 
-    def list_sessions(self, user_id: str | None = None) -> list[dict[str, Any]]:
+    def set_notification_destinations(
+        self, session_id: str, destinations: list[str]
+    ) -> list[str]:
+        """Validate and store the session's auto-notification destinations.
+
+        Names are stripped and de-duplicated in first-seen order. ValueError is
+        raised for a missing/inactive session or an empty, unknown, or non-auto name.
+        """
+        agent_session = self.sessions.get(session_id)
+        if not (agent_session and agent_session.is_active):
+            raise ValueError("Session not found or inactive")
+
+        accepted: dict[str, None] = {}  # dict keys double as an ordered set
+        for raw_name in destinations:
+            name = raw_name.strip()
+            if not name:
+                raise ValueError("Destination names must not be empty")
+            destination = self.config.messaging.get_destination(name)
+            if destination is None:
+                raise ValueError(f"Unknown destination '{name}'")
+            if not destination.allow_auto_events:
+                raise ValueError(f"Destination '{name}' is not enabled for auto events")
+            accepted.setdefault(name, None)
+
+        normalized = list(accepted)
+        agent_session.session.set_notification_destinations(normalized)
+        return normalized
+
+    async def list_sessions(self, user_id: str | None = None) -> list[dict[str, Any]]:
         """List sessions, optionally filtered by user.
 
         Args:
             user_id: If provided, only return sessions owned by this user.
                      If "dev", return all sessions (dev mode).
         """
-        results = []
+        results: list[dict[str, Any]] = []
+        store = self._store()
+        if getattr(store, "enabled", False):
+            for row in await store.list_sessions(user_id or "dev"):
+                sid = row.get("session_id") or row.get("_id")
+                if not sid:
+                    continue
+                runtime_info = self.get_session_info(str(sid))
+                if runtime_info:
+                    results.append(runtime_info)
+                    continue
+                created_at = row.get("created_at")
+                if isinstance(created_at, datetime):
+                    created_at_str = created_at.isoformat()
+                else:
+                    created_at_str = str(created_at or datetime.utcnow().isoformat())
+                pending = self._pending_docs_for_api(row.get("pending_approval") or [])
+                results.append(
+                    {
+                        "session_id": str(sid),
+                        "created_at": created_at_str,
+                        "is_active": row.get("status") != "ended",
+                        "is_processing": row.get("runtime_state") == "processing",
+                        "message_count": int(row.get("message_count") or 0),
+                        "user_id": row.get("user_id") or "dev",
+                        "pending_approval": pending or None,
+                        "model": row.get("model"),
+                        "title": row.get("title"),
+                        "notification_destinations": row.get("notification_destinations") or [],
+                    }
+                )
+            return results
+
         for sid in self.sessions:
             info = self.get_session_info(sid)
             if not info:
diff --git a/backend/user_quotas.py b/backend/user_quotas.py
index 2b38b1111fe9bc57d36eebcc87b3ac9c88f8326b..94b1b0274a7f9c8b046e8210b50da1215d7743e8 100644
--- a/backend/user_quotas.py
+++ b/backend/user_quotas.py
@@ -1,9 +1,8 @@
-"""In-memory daily quota for Claude session creations.
+"""Daily quota for Claude session creations.
 
 Tracks per-user Claude session starts against a daily cap derived from the
-user's HF plan. Caps reset at UTC midnight; the store itself is in-process
-and wipes on restart (deliberate — the cost of occasional over-subsidy at
-restart is much lower than running a DB).
+user's HF plan. MongoDB is the source of truth when configured; the
+in-process dict remains the fallback for local/dev/test runs.
 
 Unit: session *creations*, not messages. A user who selects Claude in a new
 session consumes one quota point; switching an existing Claude session to
@@ -18,6 +17,8 @@ import asyncio
 import os
 from datetime import UTC, datetime
 
+from agent.core.session_persistence import NoopSessionStore, get_session_store, _reset_store_for_tests
+
 CLAUDE_FREE_DAILY: int = int(os.environ.get("CLAUDE_FREE_DAILY", "1"))
 CLAUDE_PRO_DAILY: int = int(os.environ.get("CLAUDE_PRO_DAILY", "20"))
 
@@ -37,6 +38,11 @@ def daily_cap_for(plan: str | None) -> int:
 
 async def get_claude_used_today(user_id: str) -> int:
     """Return today's Claude session count for the user (0 if none / stale day)."""
+    store = get_session_store()
+    if getattr(store, "enabled", False):
+        db_count = await store.get_quota(user_id, _today())
+        return db_count or 0
+
     async with _lock:
         entry = _claude_counts.get(user_id)
         if entry is None:
@@ -51,11 +57,37 @@ async def get_claude_used_today(user_id: str) -> int:
 
 async def increment_claude(user_id: str) -> int:
     """Bump today's Claude session count for the user. Returns the new value."""
+    store = get_session_store()
+    if getattr(store, "enabled", False):
+        db_count = await store.try_increment_quota(user_id, _today(), cap=10**9)
+        return db_count or 0
+
+    async with _lock:
+        today = _today()
+        day, count = _claude_counts.get(user_id, (today, 0))
+        if day != today:
+            count = 0
+        count += 1
+        _claude_counts[user_id] = (today, count)
+        return count
+
+
+async def try_increment_claude(user_id: str, cap: int) -> int | None:
+    """Atomically bump today's count if below *cap*.
+
+    Returns the new count, or None when the user is already at the cap.
+    """
+    store = get_session_store()
+    if getattr(store, "enabled", False):
+        return await store.try_increment_quota(user_id, _today(), cap)
+
     async with _lock:
         today = _today()
         day, count = _claude_counts.get(user_id, (today, 0))
         if day != today:
             count = 0
+        if count >= cap:
+            return None
         count += 1
         _claude_counts[user_id] = (today, count)
         return count
@@ -63,6 +95,11 @@ async def increment_claude(user_id: str) -> int:
 
 async def refund_claude(user_id: str) -> None:
     """Decrement today's count — used when session creation fails after a successful gate."""
+    store = get_session_store()
+    if getattr(store, "enabled", False):
+        await store.refund_quota(user_id, _today())
+        return
+
     async with _lock:
         entry = _claude_counts.get(user_id)
         if entry is None:
@@ -81,3 +118,4 @@ async def refund_claude(user_id: str) -> None:
 def _reset_for_tests() -> None:
     """Test-only: clear the in-memory store."""
     _claude_counts.clear()
+    _reset_store_for_tests(NoopSessionStore())
diff --git a/configs/__init__.py b/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/configs/cli_agent_config.json b/configs/cli_agent_config.json
index 99335ca719d469737f4da1c3b48c1894a73b1845..5c6a22a354108453aea6de90bf81c76ef838fcc9 100644
--- a/configs/cli_agent_config.json
+++ b/configs/cli_agent_config.json
@@ -5,6 +5,11 @@
   "yolo_mode": false,
   "confirm_cpu_jobs": true,
   "auto_file_upload": true,
+  "messaging": {
+    "enabled": false,
+    "auto_event_types": ["approval_required", "error", "turn_complete"],
+    "destinations": {}
+  },
   "mcpServers": {
     "hf-mcp-server": {
       "transport": "http",
diff --git a/frontend/src/components/Chat/MarkdownContent.tsx b/frontend/src/components/Chat/MarkdownContent.tsx
index aaab83eb118ecd8627950dee80f794c9cefa0b5d..0d1e69171d3955e998d78807006862bd95422c34 100644
--- a/frontend/src/components/Chat/MarkdownContent.tsx
+++ b/frontend/src/components/Chat/MarkdownContent.tsx
@@ -1,4 +1,4 @@
-import { useMemo, useRef, useState, useEffect } from 'react';
+import { useMemo, useRef, useState, useEffect, type ComponentPropsWithoutRef } from 'react';
 import { Box } from '@mui/material';
 import ReactMarkdown from 'react-markdown';
 import remarkGfm from 'remark-gfm';
@@ -166,9 +166,17 @@ export default function MarkdownContent({ content, sx, isStreaming = false }: Ma
 
   const remarkPlugins = useMemo(() => [remarkGfm], []);
 
+  const components = useMemo(() => ({
+    a: ({ href, children, ...props }: ComponentPropsWithoutRef<'a'>) => (
+      <a href={href} target="_blank" rel="noopener noreferrer" {...props}>
+        {children}
+      </a>
+    ),
+  }), []);
+
   return (
     <Box sx={[markdownSx, ...(Array.isArray(sx) ? sx : sx ? [sx] : [])]}>
-      <ReactMarkdown remarkPlugins={remarkPlugins}>{displayContent}</ReactMarkdown>
+      <ReactMarkdown remarkPlugins={remarkPlugins} components={components}>{displayContent}</ReactMarkdown>
     </Box>
   );
 }
diff --git a/frontend/src/components/Chat/ToolCallGroup.tsx b/frontend/src/components/Chat/ToolCallGroup.tsx
index fc9fe35c19a7120486467f0ad1a84c1ca5681955..657e9e3688250cf48fb22501f8fcd797264cda30 100644
--- a/frontend/src/components/Chat/ToolCallGroup.tsx
+++ b/frontend/src/components/Chat/ToolCallGroup.tsx
@@ -220,6 +220,194 @@ function ResearchSteps({ steps }: { steps: string[] }) {
   );
 }
 
+// ---------------------------------------------------------------------------
+// Trackio dashboard embed
+// ---------------------------------------------------------------------------
+
+// HF repo IDs are `<owner>/<name>` where each segment is alphanumerics plus
+// `_`, `.`, `-`. Anything else (slashes, spaces, query params, missing owner)
+// would let an attacker-controlled string redirect the embed to a different
+// Space, so we refuse to render rather than build a malformed URL.
+const SPACE_ID_PATTERN = /^[a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+$/;
+
+function isValidSpaceId(spaceId: string): boolean {
+  return SPACE_ID_PATTERN.test(spaceId);
+}
+
+/**
+ * HF Space embed subdomain for a repo id:
+ * 'user/space_name' → 'user-space-name'.
+ */
+function spaceIdToSubdomain(spaceId: string): string {
+  const dashed = spaceId.toLowerCase().replace(/[/_.-]+/g, '-'); // separator runs → one dash
+  return dashed.replace(/^-|-$/g, ''); // no leading/trailing dash
+}
+
+/**
+ * Embed URL for a Trackio Space on its hf.space subdomain.
+ *
+ * `__theme=dark` is gradio's standard query param to force dark mode, so the
+ * dashboard blends with the chat instead of flashing a bright white panel.
+ */
+function buildTrackioEmbedUrl(spaceId: string, project?: string): string {
+  const query: Record<string, string> = { sidebar: 'hidden', footer: 'false', __theme: 'dark' };
+  if (project) query.project = project;
+  const host = `${spaceIdToSubdomain(spaceId)}.hf.space`;
+  return `https://${host}/?${new URLSearchParams(query).toString()}`;
+}
+
+function buildTrackioPageUrl(spaceId: string, project?: string): string {
+  const pageUrl = `https://huggingface.co/spaces/${spaceId}`; // query only when a project is given
+  return project ? `${pageUrl}?${new URLSearchParams({ project }).toString()}` : pageUrl;
+}
+
+/** Collapsible Trackio dashboard embed; renders nothing unless spaceId is a valid `<owner>/<name>` id. */
+function TrackioEmbed({ spaceId, project }: { spaceId: string; project?: string }) {
+  const [expanded, setExpanded] = useState(true);
+  const [iframeLoaded, setIframeLoaded] = useState(false);
+  const embedUrl = useMemo(() => buildTrackioEmbedUrl(spaceId, project), [spaceId, project]);
+  const pageUrl = useMemo(() => buildTrackioPageUrl(spaceId, project), [spaceId, project]);
+  const label = project ? `${spaceId} · ${project}` : spaceId;
+
+  if (!isValidSpaceId(spaceId)) return null;
+
+  return (
+    <Box sx={{ pl: 4.5, pr: 1.5, pb: 1, pt: 0.25 }}>
+      <Box
+        sx={{
+          border: '1px solid var(--tool-border)',
+          borderRadius: '8px',
+          overflow: 'hidden',
+          bgcolor: 'var(--code-panel-bg)',
+        }}
+      >
+        <Stack
+          direction="row"
+          alignItems="center"
+          spacing={1}
+          onClick={(e) => e.stopPropagation()}
+          sx={{
+            px: 1.25, py: 0.5,
+            borderBottom: expanded ? '1px solid var(--tool-border)' : 'none',
+          }}
+        >
+          <Typography
+            sx={{
+              fontFamily: '"JetBrains Mono", ui-monospace, SFMono-Regular, monospace',
+              fontSize: '0.65rem',
+              fontWeight: 600,
+              color: 'var(--accent-yellow)',
+              letterSpacing: '0.04em',
+            }}
+          >
+            trackio
+          </Typography>
+          <Typography
+            sx={{
+              fontFamily: '"JetBrains Mono", ui-monospace, SFMono-Regular, monospace',
+              fontSize: '0.65rem',
+              color: 'var(--muted-text)',
+              flex: 1,
+              minWidth: 0,
+              overflow: 'hidden',
+              textOverflow: 'ellipsis',
+              whiteSpace: 'nowrap',
+            }}
+          >
+            {label}
+          </Typography>
+          <Link
+            href={pageUrl}
+            target="_blank"
+            rel="noopener noreferrer"
+            onClick={(e) => e.stopPropagation()}
+            sx={{
+              display: 'inline-flex',
+              alignItems: 'center',
+              gap: 0.4,
+              color: 'var(--accent-yellow)',
+              fontSize: '0.65rem',
+              textDecoration: 'none',
+              '&:hover': { textDecoration: 'underline' },
+            }}
+          >
+            <LaunchIcon sx={{ fontSize: 11 }} />
+            Open
+          </Link>
+          <Button
+            size="small"
+            onClick={(e) => {
+              e.stopPropagation();
+              setIframeLoaded(false); // Hide unmounts the iframe; re-arm the overlay for the reload on Show
+              setExpanded((v) => !v);
+            }}
+            sx={{
+              textTransform: 'none',
+              minWidth: 'auto',
+              px: 0.75, py: 0,
+              fontSize: '0.65rem',
+              color: 'var(--muted-text)',
+              '&:hover': { color: 'var(--text)', bgcolor: 'transparent' },
+            }}
+          >
+            {expanded ? 'Hide' : 'Show'}
+          </Button>
+        </Stack>
+        {expanded && (
+          <Box sx={{ position: 'relative', width: '100%', height: 480, bgcolor: 'var(--code-panel-bg)' }}>
+            <iframe
+              src={embedUrl}
+              title={`Trackio dashboard ${label}`}
+              loading="lazy"
+              onLoad={() => setIframeLoaded(true)}
+              sandbox="allow-scripts allow-same-origin allow-forms allow-popups allow-downloads allow-modals"
+              style={{ border: 0, width: '100%', height: '100%', display: 'block' }}
+            />
+            {!iframeLoaded && (
+              <Stack
+                direction="column"
+                alignItems="center"
+                justifyContent="center"
+                spacing={1.5}
+                sx={{
+                  position: 'absolute',
+                  inset: 0,
+                  bgcolor: 'var(--code-panel-bg)',
+                  color: 'var(--muted-text)',
+                  pointerEvents: 'none',
+                }}
+              >
+                <CircularProgress size={20} sx={{ color: 'var(--accent-yellow)' }} />
+                <Typography
+                  sx={{
+                    fontFamily: '"JetBrains Mono", ui-monospace, SFMono-Regular, monospace',
+                    fontSize: '0.75rem',
+                    color: 'var(--text)',
+                  }}
+                >
+                  Spinning up the trackio dashboard…
+                </Typography>
+                <Typography
+                  sx={{
+                    fontFamily: '"JetBrains Mono", ui-monospace, SFMono-Regular, monospace',
+                    fontSize: '0.65rem',
+                    color: 'var(--muted-text)',
+                    textAlign: 'center',
+                    maxWidth: 360,
+                    px: 2,
+                  }}
+                >
+                  First load takes 30–60 seconds. Charts appear automatically once the run starts logging.
+                </Typography>
+              </Stack>
+            )}
+          </Box>
+        )}
+      </Box>
+    </Box>
+  );
+}
+
 // ---------------------------------------------------------------------------
 // Hardware pricing ($/hr) — from HF Spaces & Jobs pricing
 // ---------------------------------------------------------------------------
@@ -517,7 +705,7 @@ function InlineApproval({
 const EMPTY_AGENTS: Record<string, ResearchAgentState> = {};
 
 export default function ToolCallGroup({ tools, approveTools }: ToolCallGroupProps) {
-  const { setPanel, lockPanel, getJobUrl, getEditedScript, setJobStatus, getJobStatus, setToolError, getToolError, setToolRejected, getToolRejected } = useAgentStore();
+  const { setPanel, lockPanel, getJobUrl, getEditedScript, setJobStatus, getJobStatus, getTrackioDashboard, setToolError, getToolError, setToolRejected, getToolRejected } = useAgentStore();
   const researchAgents = useAgentStore(s => {
     const activeId = s.activeSessionId;
     return (activeId && s.sessionStates[activeId]?.researchAgents) || EMPTY_AGENTS;
@@ -1063,6 +1251,18 @@ export default function ToolCallGroup({ tools, approveTools }: ToolCallGroupProp
                 <ResearchSteps steps={researchAgents[tool.toolCallId].steps} />
               )}
 
+              {/* Trackio dashboard embed — shown for hf_jobs / sandbox_create runs that declared a trackio space */}
+              {(tool.toolName === 'hf_jobs' || tool.toolName === 'sandbox_create')
+                && !isPending
+                && !isRejected
+                && !cancelled
+                && (() => {
+                  const trackio = getTrackioDashboard(tool.toolCallId);
+                  return trackio
+                    ? <TrackioEmbed spaceId={trackio.spaceId} project={trackio.project} />
+                    : null;
+                })()}
+
               {/* Per-tool approval: undecided */}
               {isPending && !localDecision && !isSubmitting && (
                 <InlineApproval
diff --git a/frontend/src/components/JobsUpgradeDialog.tsx b/frontend/src/components/JobsUpgradeDialog.tsx
index 9a150204287d5e32a565d5bd4c87a3a0b17b2c3a..768f2d4c5c24d5f826154173d942f89a285ebb97 100644
--- a/frontend/src/components/JobsUpgradeDialog.tsx
+++ b/frontend/src/components/JobsUpgradeDialog.tsx
@@ -8,7 +8,6 @@ import {
   DialogContentText,
   DialogTitle,
   FormControl,
-  InputLabel,
   MenuItem,
   Select,
   Typography,
@@ -37,13 +36,20 @@ export default function JobsUpgradeDialog({
   onClose,
   onContinueWithNamespace,
 }: JobsUpgradeDialogProps) {
-  const [selectedNamespace, setSelectedNamespace] = useState('');
+  const [selectedNamespace, setSelectedNamespace] = useState(() => eligibleNamespaces[0] || '');
 
   useEffect(() => {
     if (!open) return;
     setSelectedNamespace(eligibleNamespaces[0] || '');
   }, [open, eligibleNamespaces]);
 
+  const isNamespace = mode === 'namespace';
+  const title = isNamespace ? 'Run jobs as' : 'Jobs need Pro or a paid org';
+
+  const body = isNamespace
+    ? "Pick which paid organization should pay for and own this job. We'll use the same one for the rest of this browser."
+    : message;
+
   return (
     <Dialog
       open={open}
@@ -57,7 +63,7 @@ export default function JobsUpgradeDialog({
           border: '1px solid var(--border)',
           borderRadius: 'var(--radius-md)',
           boxShadow: 'var(--shadow-1)',
-          maxWidth: 500,
+          maxWidth: 460,
           mx: 2,
         },
       }}
@@ -65,72 +71,75 @@ export default function JobsUpgradeDialog({
       <DialogTitle
         sx={{ color: 'var(--text)', fontWeight: 700, fontSize: '1rem', pt: 2.5, pb: 0, px: 3 }}
       >
-        {mode === 'namespace' ? 'Choose the org for this job' : 'Jobs need Pro or a paid org'}
+        {title}
       </DialogTitle>
       <DialogContent sx={{ px: 3, pt: 1.25, pb: 0 }}>
         <DialogContentText
           sx={{ color: 'var(--muted-text)', fontSize: '0.85rem', lineHeight: 1.6 }}
         >
-          {message}
+          {body}
         </DialogContentText>
-        {eligibleNamespaces.length > 0 && (
-          <Box
-            sx={{
-              mt: 2,
-              p: 1.5,
-              borderRadius: '8px',
-              bgcolor: 'var(--accent-yellow-weak)',
-              border: '1px solid var(--border)',
-            }}
-          >
-            <Typography
-              variant="caption"
+
+        {isNamespace ? (
+          <FormControl fullWidth size="small" sx={{ mt: 2 }}>
+            <Select
+              value={selectedNamespace}
+              displayEmpty
+              onChange={(e) => setSelectedNamespace(String(e.target.value))}
               sx={{
-                display: 'block',
-                fontWeight: 700,
+                bgcolor: 'var(--composer-bg)',
                 color: 'var(--text)',
-                fontSize: '0.78rem',
-                mb: 1,
-                letterSpacing: '0.02em',
+                fontSize: '0.88rem',
+                fontWeight: 600,
+                '& .MuiOutlinedInput-notchedOutline': { borderColor: 'var(--border)' },
+                '&:hover .MuiOutlinedInput-notchedOutline': { borderColor: 'var(--border)' },
+                '&.Mui-focused .MuiOutlinedInput-notchedOutline': {
+                  borderColor: 'var(--accent-yellow)',
+                  borderWidth: 1,
+                },
+                '& .MuiSelect-icon': { color: 'var(--muted-text)' },
+              }}
+              MenuProps={{
+                PaperProps: {
+                  sx: {
+                    bgcolor: 'var(--panel)',
+                    border: '1px solid var(--border)',
+                    borderRadius: '8px',
+                    mt: 0.5,
+                  },
+                },
               }}
             >
-              Eligible namespaces
-            </Typography>
-            {mode === 'namespace' ? (
-              <FormControl fullWidth size="small">
-                <InputLabel id="jobs-namespace-label">Organization</InputLabel>
-                <Select
-                  labelId="jobs-namespace-label"
-                  value={selectedNamespace}
-                  label="Organization"
-                  onChange={(e) => setSelectedNamespace(String(e.target.value))}
+              {eligibleNamespaces.map((namespace) => (
+                <MenuItem
+                  key={namespace}
+                  value={namespace}
+                  sx={{
+                    fontSize: '0.88rem',
+                    color: 'var(--text)',
+                    '&.Mui-selected': { bgcolor: 'rgba(255,255,255,0.05)' },
+                  }}
                 >
-                  {eligibleNamespaces.map((namespace) => (
-                    <MenuItem key={namespace} value={namespace}>
-                      {namespace}
-                    </MenuItem>
-                  ))}
-                </Select>
-              </FormControl>
-            ) : (
+                  {namespace}
+                </MenuItem>
+              ))}
+            </Select>
+          </FormControl>
+        ) : (
+          eligibleNamespaces.length > 0 && (
+            <Box sx={{ mt: 1.5 }}>
               <Typography
                 variant="caption"
-                sx={{ display: 'block', color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
+                sx={{ color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
               >
-                {eligibleNamespaces.join(', ')}
+                Eligible namespaces: {eligibleNamespaces.join(', ')}
               </Typography>
-            )}
-          </Box>
+            </Box>
+          )
         )}
-        <Typography
-          variant="caption"
-          sx={{ display: 'block', mt: 2, color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
-        >
-          If you decline, the agent will have to find another way forward without `hf_jobs`.
-        </Typography>
       </DialogContent>
-      <DialogActions sx={{ px: 3, pb: 2.5, pt: 2, gap: 1 }}>
-        {mode === 'namespace' ? (
+      <DialogActions sx={{ px: 3, pb: 2.5, pt: 2.5, gap: 1 }}>
+        {isNamespace ? (
           <Button
             onClick={() => onContinueWithNamespace(selectedNamespace)}
             disabled={!selectedNamespace}
@@ -147,7 +156,7 @@ export default function JobsUpgradeDialog({
               '&:hover': { bgcolor: '#FFB340', boxShadow: 'none' },
             }}
           >
-            Run under selected org
+            Continue
           </Button>
         ) : (
           <Button
@@ -183,7 +192,7 @@ export default function JobsUpgradeDialog({
             '&:hover': { bgcolor: 'var(--hover-bg)' },
           }}
         >
-          Decline tool call
+          {isNamespace ? 'Skip this tool call' : 'Decline tool call'}
         </Button>
       </DialogActions>
     </Dialog>
diff --git a/frontend/src/components/SessionSidebar/SessionSidebar.tsx b/frontend/src/components/SessionSidebar/SessionSidebar.tsx
index 3a369772f8cd057f8322cde87a49264ee75ad7c5..243e48c7b89887cf6af0fb55d46c5cb0a7a64007 100644
--- a/frontend/src/components/SessionSidebar/SessionSidebar.tsx
+++ b/frontend/src/components/SessionSidebar/SessionSidebar.tsx
@@ -1,4 +1,4 @@
-import { useCallback, useState } from 'react';
+import { useCallback, useEffect, useState } from 'react';
 import {
   Alert,
   Box,
@@ -25,13 +25,30 @@ interface SessionSidebarProps {
 }
 
 export default function SessionSidebar({ onClose }: SessionSidebarProps) {
-  const { sessions, activeSessionId, createSession, deleteSession, switchSession } =
+  const { sessions, activeSessionId, createSession, deleteSession, switchSession, mergeServerSessions } =
     useSessionStore();
   const { setPlan, clearPanel } =
     useAgentStore();
   const [isCreatingSession, setIsCreatingSession] = useState(false);
   const [capacityError, setCapacityError] = useState<string | null>(null);
 
+  useEffect(() => {
+    let cancelled = false;
+    (async () => {
+      try {
+        const response = await apiFetch('/api/sessions');
+        if (!response.ok) return;
+        const data = await response.json();
+        if (!cancelled && Array.isArray(data)) {
+          mergeServerSessions(data);
+        }
+      } catch {
+        /* local sidebar metadata is still usable */
+      }
+    })();
+    return () => { cancelled = true; };
+  }, [mergeServerSessions]);
+
   // -- Handlers -----------------------------------------------------------
 
   const handleNewSession = useCallback(async () => {
diff --git a/frontend/src/components/WelcomeScreen/WelcomeScreen.tsx b/frontend/src/components/WelcomeScreen/WelcomeScreen.tsx
index d37fc7873391626755c54d10ccefbc7d9ab23cbf..ef4a42432b1fcb4d0dc2f9011ec4f346b8abd1b6 100644
--- a/frontend/src/components/WelcomeScreen/WelcomeScreen.tsx
+++ b/frontend/src/components/WelcomeScreen/WelcomeScreen.tsx
@@ -280,6 +280,12 @@ export default function WelcomeScreen() {
       : '';
 
   return (
+    // Outer container scrolls; inner uses `margin: auto` so the checklist
+    // centers vertically when the viewport has room and falls back to top-
+    // aligned + scrollable when it doesn't. The previous setup hardcoded
+    // `justify-content: center` with no overflow, so on short viewports
+    // (1366×768 Chrome was the reported case) the bottom of the card —
+    // including the "Start session" CTA — got clipped with no way to scroll.
     <Box
       sx={{
         width: '100%',
@@ -287,172 +293,182 @@ export default function WelcomeScreen() {
         display: 'flex',
         flexDirection: 'column',
         alignItems: 'center',
-        justifyContent: 'center',
+        overflowY: 'auto',
         background: 'var(--body-gradient)',
-        py: 8,
       }}
     >
-      {/* Logo */}
       <Box
-        component="img"
-        src="/smolagents.webp"
-        alt="smolagents"
-        sx={{ width: 80, height: 80, mb: 2.5, display: 'block' }}
-      />
-
-      {/* Title */}
-      <Typography
-        variant="h2"
         sx={{
-          fontWeight: 800,
-          color: 'var(--text)',
-          mb: 1,
-          letterSpacing: '-0.02em',
-          fontSize: { xs: '1.8rem', md: '2.4rem' },
+          display: 'flex',
+          flexDirection: 'column',
+          alignItems: 'center',
+          width: '100%',
+          margin: 'auto',
+          py: 8,
         }}
       >
-        ML Intern
-      </Typography>
+        {/* Logo */}
+        <Box
+          component="img"
+          src="/smolagents.webp"
+          alt="smolagents"
+          sx={{ width: 80, height: 80, mb: 2.5, display: 'block' }}
+        />
 
-      {/* Description */}
-      <Typography
-        variant="body1"
-        sx={{
-          color: 'var(--muted-text)',
-          maxWidth: 480,
-          mb: 4,
-          lineHeight: 1.7,
-          fontSize: '0.9rem',
-          textAlign: 'center',
-          px: 2,
-          '& strong': { color: 'var(--text)', fontWeight: 600 },
-        }}
-      >
-        Your personal <strong>ML agent</strong>. It reads <strong>papers</strong>, finds <strong>datasets</strong>, trains <strong>models</strong>, and iterates until the numbers go up. Instructions in. Trained model out.
-      </Typography>
+        {/* Title */}
+        <Typography
+          variant="h2"
+          sx={{
+            fontWeight: 800,
+            color: 'var(--text)',
+            mb: 1,
+            letterSpacing: '-0.02em',
+            fontSize: { xs: '1.8rem', md: '2.4rem' },
+          }}
+        >
+          ML Intern
+        </Typography>
 
-      {/* ── Checklist ──────────────────────────────────────────── */}
-      <Box
-        sx={{
-          width: '100%',
-          maxWidth: 520,
-          bgcolor: 'var(--surface)',
-          border: '1px solid var(--border)',
-          borderRadius: '12px',
-          overflow: 'hidden',
-          mx: 2,
-        }}
-      >
-        {isDevUser ? (
-          /* Dev mode: single step */
-          <ChecklistStep
-            stepNumber={1}
-            title="Start Session"
-            description="Launch an AI agent session for ML engineering."
-            status="active"
-            actionLabel="Start Session"
-            actionIcon={<RocketLaunchIcon sx={{ fontSize: 16 }} />}
-            onAction={handleStartSession}
-            loading={isCreating}
-            isLast
-          />
-        ) : inIframe ? (
-          /* Iframe: 2 steps */
-          <>
-            <ChecklistStep
-              stepNumber={1}
-              title="Join ML Agent Explorers"
-              description="Get free access to GPUs, inference APIs, and Hub resources."
-              status={isOrgMember ? 'completed' : 'active'}
-              actionLabel="Join Organization"
-              actionIcon={<GroupAddIcon sx={{ fontSize: 16 }} />}
-              onAction={handleJoinOrg}
-            />
-            <ChecklistStep
-              stepNumber={2}
-              title="Open ML Intern"
-              description="Open the agent in a full browser tab to get started."
-              status={isOrgMember ? 'active' : 'locked'}
-              lockedReason="Join the organization first."
-              actionLabel="Open ML Intern"
-              actionIcon={<OpenInNewIcon sx={{ fontSize: 16 }} />}
-              actionHref={spaceHost}
-              isLast
-            />
-          </>
-        ) : (
-          /* Direct access: 3 steps */
-          <>
+        {/* Description */}
+        <Typography
+          variant="body1"
+          sx={{
+            color: 'var(--muted-text)',
+            maxWidth: 480,
+            mb: 4,
+            lineHeight: 1.7,
+            fontSize: '0.9rem',
+            textAlign: 'center',
+            px: 2,
+            '& strong': { color: 'var(--text)', fontWeight: 600 },
+          }}
+        >
+          Your personal <strong>ML agent</strong>. It reads <strong>papers</strong>, finds <strong>datasets</strong>, trains <strong>models</strong>, and iterates until the numbers go up. Instructions in. Trained model out.
+        </Typography>
+
+        {/* ── Checklist ──────────────────────────────────────────── */}
+        <Box
+          sx={{
+            width: '100%',
+            maxWidth: 520,
+            bgcolor: 'var(--surface)',
+            border: '1px solid var(--border)',
+            borderRadius: '12px',
+            overflow: 'hidden',
+            mx: 2,
+          }}
+        >
+          {isDevUser ? (
+            /* Dev mode: single step */
             <ChecklistStep
               stepNumber={1}
-              title="Sign in with Hugging Face"
-              description="Authenticate to access GPU resources and model APIs."
-              status={signInStatus}
-              actionLabel="Sign in"
-              actionIcon={<LoginIcon sx={{ fontSize: 16 }} />}
-              onAction={() => triggerLogin()}
-            />
-            <ChecklistStep
-              stepNumber={2}
-              title="Join ML Agent Explorers"
-              description="Get free access to GPUs, inference APIs, and Hub resources."
-              status={joinOrgStatus}
-              lockedReason="Sign in first to continue."
-              actionLabel="Join Organization"
-              actionIcon={<GroupAddIcon sx={{ fontSize: 16 }} />}
-              onAction={handleJoinOrg}
-            />
-            <ChecklistStep
-              stepNumber={3}
               title="Start Session"
               description="Launch an AI agent session for ML engineering."
-              status={startStatus}
-              lockedReason="Complete the steps above to continue."
+              status="active"
               actionLabel="Start Session"
               actionIcon={<RocketLaunchIcon sx={{ fontSize: 16 }} />}
               onAction={handleStartSession}
               loading={isCreating}
               isLast
             />
-          </>
+          ) : inIframe ? (
+            /* Iframe: 2 steps */
+            <>
+              <ChecklistStep
+                stepNumber={1}
+                title="Join ML Agent Explorers"
+                description="Get free access to GPUs, inference APIs, and Hub resources."
+                status={isOrgMember ? 'completed' : 'active'}
+                actionLabel="Join Organization"
+                actionIcon={<GroupAddIcon sx={{ fontSize: 16 }} />}
+                onAction={handleJoinOrg}
+              />
+              <ChecklistStep
+                stepNumber={2}
+                title="Open ML Intern"
+                description="Open the agent in a full browser tab to get started."
+                status={isOrgMember ? 'active' : 'locked'}
+                lockedReason="Join the organization first."
+                actionLabel="Open ML Intern"
+                actionIcon={<OpenInNewIcon sx={{ fontSize: 16 }} />}
+                actionHref={spaceHost}
+                isLast
+              />
+            </>
+          ) : (
+            /* Direct access: 3 steps */
+            <>
+              <ChecklistStep
+                stepNumber={1}
+                title="Sign in with Hugging Face"
+                description="Authenticate to access GPU resources and model APIs."
+                status={signInStatus}
+                actionLabel="Sign in"
+                actionIcon={<LoginIcon sx={{ fontSize: 16 }} />}
+                onAction={() => triggerLogin()}
+              />
+              <ChecklistStep
+                stepNumber={2}
+                title="Join ML Agent Explorers"
+                description="Get free access to GPUs, inference APIs, and Hub resources."
+                status={joinOrgStatus}
+                lockedReason="Sign in first to continue."
+                actionLabel="Join Organization"
+                actionIcon={<GroupAddIcon sx={{ fontSize: 16 }} />}
+                onAction={handleJoinOrg}
+              />
+              <ChecklistStep
+                stepNumber={3}
+                title="Start Session"
+                description="Launch an AI agent session for ML engineering."
+                status={startStatus}
+                lockedReason="Complete the steps above to continue."
+                actionLabel="Start Session"
+                actionIcon={<RocketLaunchIcon sx={{ fontSize: 16 }} />}
+                onAction={handleStartSession}
+                loading={isCreating}
+                isLast
+              />
+            </>
+          )}
+        </Box>
+
+        {/* Polling hint when waiting for org join */}
+        {isAuthenticated && !isOrgMember && !isDevUser && !inIframe && (
+          <Typography
+            variant="caption"
+            sx={{ mt: 2, color: 'var(--muted-text)', fontSize: '0.75rem', textAlign: 'center' }}
+          >
+            This page updates automatically when you join the organization.
+          </Typography>
         )}
-      </Box>
 
-      {/* Polling hint when waiting for org join */}
-      {isAuthenticated && !isOrgMember && !isDevUser && !inIframe && (
+        {/* Error */}
+        {error && (
+          <Alert
+            severity="warning"
+            variant="outlined"
+            onClose={() => setError(null)}
+            sx={{
+              mt: 3,
+              maxWidth: 400,
+              fontSize: '0.8rem',
+              borderColor: HF_ORANGE,
+              color: 'var(--text)',
+            }}
+          >
+            {error}
+          </Alert>
+        )}
+
+        {/* Footnote */}
         <Typography
           variant="caption"
-          sx={{ mt: 2, color: 'var(--muted-text)', fontSize: '0.75rem', textAlign: 'center' }}
+          sx={{ mt: 4, color: 'var(--muted-text)', opacity: 0.5, fontSize: '0.7rem' }}
         >
-          This page updates automatically when you join the organization.
+          Conversations are stored locally in your browser.
         </Typography>
-      )}
-
-      {/* Error */}
-      {error && (
-        <Alert
-          severity="warning"
-          variant="outlined"
-          onClose={() => setError(null)}
-          sx={{
-            mt: 3,
-            maxWidth: 400,
-            fontSize: '0.8rem',
-            borderColor: HF_ORANGE,
-            color: 'var(--text)',
-          }}
-        >
-          {error}
-        </Alert>
-      )}
-
-      {/* Footnote */}
-      <Typography
-        variant="caption"
-        sx={{ mt: 4, color: 'var(--muted-text)', opacity: 0.5, fontSize: '0.7rem' }}
-      >
-        Conversations are stored locally in your browser.
-      </Typography>
+      </Box>
     </Box>
   );
 }
diff --git a/frontend/src/hooks/useAgentChat.ts b/frontend/src/hooks/useAgentChat.ts
index a83a0ac285c3a1a9c139bc8683ba1383f91c91e9..743b2ae1ddd49701b1000fd527f5f6637b2d6119 100644
--- a/frontend/src/hooks/useAgentChat.ts
+++ b/frontend/src/hooks/useAgentChat.ts
@@ -371,7 +371,7 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
     } catch {
       return null;
     }
-  }, [sessionId, setNeedsAttention]);
+  }, [sessionId, setNeedsAttention, updateSession]);
 
   // -- useChat from Vercel AI SDK -----------------------------------------
   const chat = useChat({
@@ -447,6 +447,33 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
         }
         return;
       }
+      if (error.message === 'HF_JOBS_INVALID_NAMESPACE') {
+        // Saved preference is no longer one of the user's eligible namespaces
+        // (e.g. they left the org). Clear it and reopen the picker.
+        const typed = error as Error & {
+          detail?: Record<string, unknown>;
+          approvals?: Array<{
+            tool_call_id: string;
+            approved: boolean;
+            feedback?: string | null;
+            edited_script?: string | null;
+            namespace?: string | null;
+          }>;
+        };
+        useAgentStore.getState().setPreferredJobsNamespace(null);
+        void hydrateFromBackend();
+        if (isActiveRef.current) {
+          useAgentStore.getState().setJobsUpgradeRequired({
+            approvals: typed.approvals || [],
+            toolCallIds: (typed.detail?.tool_call_ids as string[]) || [],
+            message: String(typed.detail?.message || 'Pick a different organization for this job run.'),
+            eligibleNamespaces: (typed.detail?.eligible_namespaces as string[]) || [],
+            plan: ((typed.detail?.plan as 'free' | 'pro' | 'org') || 'free'),
+            mode: 'namespace',
+          });
+        }
+        return;
+      }
       logger.error('useChat error:', error);
       if (isActiveRef.current) {
         useAgentStore.getState().setError(error.message);
@@ -594,7 +621,10 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
     /** Read the event stream from GET /api/events and forward to side-channel. */
     const consumeEventStream = async (signal: AbortSignal) => {
       try {
-        const res = await apiFetch(`/api/events/${sessionId}`, {
+        const lastEventKey = `hf-agent-last-event:${sessionId}`;
+        const lastSeq = localStorage.getItem(lastEventKey);
+        const qs = lastSeq ? `?after=${encodeURIComponent(lastSeq)}` : '';
+        const res = await apiFetch(`/api/events/${sessionId}${qs}`, {
           headers: { 'Accept': 'text/event-stream' },
           signal,
         });
@@ -602,6 +632,71 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
 
         const reader = res.body.pipeThrough(new TextDecoderStream()).getReader();
         let buf = '';
+        let eventId: string | null = null;
+        let eventData = '';
+        const dispatch = async () => {
+          // Dispatch one buffered SSE event; resolves true once the stream should stop.
+          // Buffers are reset BEFORE JSON.parse so one malformed event cannot poison the next.
+          const raw = eventData.trim();
+          const rawId = eventId;
+          eventId = null;
+          eventData = '';
+          if (!raw) return false;
+          const event = JSON.parse(raw);
+          const seq = event.seq ?? (rawId ? Number(rawId) : undefined);
+          if (Number.isFinite(seq)) {
+            try { localStorage.setItem(lastEventKey, String(seq)); } catch { /* cursor is best-effort */ }
+          }
+          // Forward to side-channel for real-time UI updates
+          const et = event.event_type as string;
+          if (et === 'processing') sideChannel.onProcessing();
+          else if (et === 'assistant_chunk') sideChannel.onStreaming();
+          else if (et === 'tool_call') {
+            const t = event.data?.tool as string;
+            const d = event.data?.arguments?.description as string | undefined;
+            sideChannel.onToolRunning(t, d);
+            sideChannel.onToolCallPanel(t, (event.data?.arguments || {}) as Record<string, unknown>);
+          } else if (et === 'tool_output') {
+            sideChannel.onToolOutputPanel(
+              event.data?.tool as string,
+              event.data?.tool_call_id as string,
+              event.data?.output as string,
+              event.data?.success as boolean,
+            );
+          } else if (et === 'tool_state_change') {
+            const state = event.data?.state as string;
+            const toolName = event.data?.tool as string;
+            if (state === 'running' && toolName) sideChannel.onToolRunning(toolName);
+          } else if (et === 'turn_complete' || et === 'error' || et === 'interrupted') {
+            sideChannel.onProcessingDone();
+            stopReconnect();
+            // Final hydration to get the complete message state
+            const result = await hydrateMessages();
+            if (result) {
+              const uiMsgs = llmMessagesToUIMessages(result.data, result.pendingIds, chatActionsRef.current.messages);
+              if (uiMsgs.length > 0) {
+                chat.setMessages(uiMsgs);
+                saveMessages(sessionId, uiMsgs);
+              }
+            }
+            return true;
+          } else if (et === 'approval_required') {
+            sideChannel.onApprovalRequired(
+              (event.data?.tools || []) as Array<{ tool: string; arguments: Record<string, unknown>; tool_call_id: string }>,
+            );
+            stopReconnect();
+            const result = await hydrateMessages();
+            if (result) {
+              const uiMsgs = llmMessagesToUIMessages(result.data, result.pendingIds, chatActionsRef.current.messages);
+              if (uiMsgs.length > 0) {
+                chat.setMessages(uiMsgs);
+                saveMessages(sessionId, uiMsgs);
+              }
+            }
+            return true;
+          }
+          return false;
+        };
         while (true) {
           const { value, done } = await reader.read();
           if (done || signal.aborted) break;
@@ -609,59 +704,21 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
           const lines = buf.split('\n');
           buf = lines.pop() || '';
           for (const line of lines) {
-            const trimmed = line.trim();
-            if (!trimmed.startsWith('data: ')) continue;
-            try {
-              const event = JSON.parse(trimmed.slice(6));
-              // Forward to side-channel for real-time UI updates
-              const et = event.event_type as string;
-              if (et === 'processing') sideChannel.onProcessing();
-              else if (et === 'assistant_chunk') sideChannel.onStreaming();
-              else if (et === 'tool_call') {
-                const t = event.data?.tool as string;
-                const d = event.data?.arguments?.description as string | undefined;
-                sideChannel.onToolRunning(t, d);
-                sideChannel.onToolCallPanel(t, (event.data?.arguments || {}) as Record<string, unknown>);
-              } else if (et === 'tool_output') {
-                sideChannel.onToolOutputPanel(
-                  event.data?.tool as string,
-                  event.data?.tool_call_id as string,
-                  event.data?.output as string,
-                  event.data?.success as boolean,
-                );
-              } else if (et === 'tool_state_change') {
-                const state = event.data?.state as string;
-                const toolName = event.data?.tool as string;
-                if (state === 'running' && toolName) sideChannel.onToolRunning(toolName);
-              } else if (et === 'turn_complete' || et === 'error' || et === 'interrupted') {
-                sideChannel.onProcessingDone();
-                stopReconnect();
-                // Final hydration to get the complete message state
-                const result = await hydrateMessages();
-                if (result) {
-                  const uiMsgs = llmMessagesToUIMessages(result.data, result.pendingIds, chatActionsRef.current.messages);
-                  if (uiMsgs.length > 0) {
-                    chat.setMessages(uiMsgs);
-                    saveMessages(sessionId, uiMsgs);
-                  }
-                }
-                return;
-              } else if (et === 'approval_required') {
-                sideChannel.onApprovalRequired(
-                  (event.data?.tools || []) as Array<{ tool: string; arguments: Record<string, unknown>; tool_call_id: string }>,
-                );
-                stopReconnect();
-                const result = await hydrateMessages();
-                if (result) {
-                  const uiMsgs = llmMessagesToUIMessages(result.data, result.pendingIds, chatActionsRef.current.messages);
-                  if (uiMsgs.length > 0) {
-                    chat.setMessages(uiMsgs);
-                    saveMessages(sessionId, uiMsgs);
-                  }
-                }
-                return;
-              }
-            } catch { /* ignore parse errors */ }
+            const trimmed = line.replace(/\r$/, '');
+            if (trimmed === '') {
+              try {
+                if (await dispatch()) return;
+              } catch { /* ignore parse errors */ }
+              continue;
+            }
+            if (trimmed.startsWith(':')) continue;
+            if (trimmed.startsWith('id:')) {
+              eventId = trimmed.slice(3).trim();
+              continue;
+            }
+            if (trimmed.startsWith('data:')) {
+              eventData += trimmed.slice(5).trimStart() + '\n';
+            }
           }
         }
       } catch {
@@ -830,6 +887,9 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
         : approval.namespace,
     }));
 
+    // Remember this choice so the picker doesn't reappear for every
+    // subsequent hf_jobs call.
+    useAgentStore.getState().setPreferredJobsNamespace(namespace);
     useAgentStore.getState().setJobsUpgradeRequired(null);
     return approveTools(approvals);
   }, [approveTools]);
diff --git a/frontend/src/lib/sse-chat-transport.ts b/frontend/src/lib/sse-chat-transport.ts
index fa59a98679c6090cc607fca5fae204d57d59921e..3ca39f1678ae04dba30b41ec5d30c6f0559f3827 100644
--- a/frontend/src/lib/sse-chat-transport.ts
+++ b/frontend/src/lib/sse-chat-transport.ts
@@ -42,35 +42,66 @@ function nextPartId(prefix: string): string {
   return `${prefix}-${Date.now()}-${++partIdCounter}`;
 }
 
+function lastEventKey(sessionId: string): string {
+  return 'hf-agent-last-event:' + sessionId; // per-session localStorage key for the last seen event seq
+}
+
 /** Parse an SSE text stream into AgentEvent objects. */
-function createSSEParserStream(): TransformStream<string, AgentEvent> {
+function createSSEParserStream(sessionId: string): TransformStream<string, AgentEvent> {
   let buffer = '';
+  let eventId: string | null = null;
+  let data = '';
+
+  const dispatch = (controller: TransformStreamDefaultController<AgentEvent>) => {
+    // Emit the buffered SSE event (if any) downstream. `finally` clears the
+    // per-event buffers on every path: blank event, parse failure, or success.
+    try {
+      if (!data.trim()) return;
+      const json = JSON.parse(data.trim()) as AgentEvent;
+      const seq = json.seq ?? (eventId ? Number(eventId) : undefined);
+      if (Number.isFinite(seq)) {
+        json.seq = seq;
+        // Persisting the resume cursor is best-effort: a storage failure must
+        // not be logged as a parse error and silently drop the event.
+        try { localStorage.setItem(lastEventKey(sessionId), String(seq)); } catch { /* ignore */ }
+      }
+      controller.enqueue(json);
+    } catch {
+      logger.warn('SSE parse error:', data.trim());
+    } finally {
+      eventId = null;
+      data = '';
+    }
+  };
+
   return new TransformStream<string, AgentEvent>({
     transform(chunk, controller) {
       buffer += chunk;
       const lines = buffer.split('\n');
       // Keep the last (possibly incomplete) line in the buffer
       buffer = lines.pop() || '';
-      for (const line of lines) {
-        const trimmed = line.trim();
-        if (trimmed.startsWith('data: ')) {
-          try {
-            const json = JSON.parse(trimmed.slice(6));
-            controller.enqueue(json as AgentEvent);
-          } catch {
-            logger.warn('SSE parse error:', trimmed);
-          }
+      for (const rawLine of lines) {
+        const line = rawLine.replace(/\r$/, '');
+        if (line === '') {
+          dispatch(controller);
+          continue;
+        }
+        if (line.startsWith(':')) continue;
+        if (line.startsWith('id:')) {
+          eventId = line.slice(3).trim();
+        } else if (line.startsWith('data:')) {
+          data += line.slice(5).trimStart() + '\n';
         }
       }
     },
     flush(controller) {
-      // Process any remaining data in buffer
-      if (buffer.trim().startsWith('data: ')) {
-        try {
-          const json = JSON.parse(buffer.trim().slice(6));
-          controller.enqueue(json as AgentEvent);
-        } catch { /* ignore incomplete */ }
+      const line = buffer.replace(/\r$/, '');
+      if (line.startsWith('id:')) {
+        eventId = line.slice(3).trim();
+      } else if (line.startsWith('data:')) {
+        data += line.slice(5).trimStart() + '\n';
       }
+      dispatch(controller);
     },
   });
 }
@@ -226,12 +257,17 @@ function createEventToChunkStream(sideChannel: SideChannelCallbacks): TransformS
           const state = (event.data?.state as string) || '';
           const toolName = (event.data?.tool as string) || '';
           const jobUrl = (event.data?.jobUrl as string) || undefined;
+          const trackioSpaceId = (event.data?.trackioSpaceId as string) || undefined;
+          const trackioProject = (event.data?.trackioProject as string) || undefined;
 
           if (tcId.startsWith('plan_tool')) break;
 
           if (jobUrl && tcId) {
             useAgentStore.getState().setJobUrl(tcId, jobUrl);
           }
+          if (trackioSpaceId && tcId) {
+            useAgentStore.getState().setTrackioDashboard(tcId, trackioSpaceId, trackioProject);
+          }
           if (state === 'running' && toolName) {
             sideChannel.onToolRunning(toolName);
           }
@@ -320,7 +356,14 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
         const approved = p.approval?.approved ?? true;
         // Get edited script from agentStore if available
         const editedScript = useAgentStore.getState().getEditedScript(p.toolCallId);
-        const namespace = useAgentStore.getState().getApprovalNamespace(p.toolCallId);
+        const explicitNamespace = useAgentStore.getState().getApprovalNamespace(p.toolCallId);
+        // Fall back to the user's persisted choice so we don't re-prompt
+        // every hf_jobs call.  Backend will 400 if the saved namespace is
+        // no longer valid; the error handler clears the preference and
+        // reopens the picker.
+        const preferred = useAgentStore.getState().preferredJobsNamespace;
+        const namespace = explicitNamespace
+          ?? (approved && p.toolName === 'hf_jobs' ? preferred ?? null : null);
         return {
           tool_call_id: p.toolCallId,
           approved,
@@ -388,6 +431,20 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
         throw err;
       }
     }
+    if (response.status === 400) {
+      const payload = await response.json().catch(() => null);
+      if (payload?.detail?.error === 'hf_jobs_invalid_namespace') {
+        // Stored namespace is no longer eligible — surface so the UI can
+        // clear the saved preference and reopen the picker.
+        const err = new Error('HF_JOBS_INVALID_NAMESPACE') as Error & {
+          detail?: Record<string, unknown>;
+          approvals?: Array<Record<string, unknown>>;
+        };
+        err.detail = payload.detail as Record<string, unknown>;
+        err.approvals = (body.approvals as Array<Record<string, unknown>> | undefined) || [];
+        throw err;
+      }
+    }
     if (!response.ok) {
       const errorText = await response.text().catch(() => 'Request failed');
       throw new Error(`Chat request failed: ${response.status} ${errorText}`);
@@ -400,7 +457,7 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
     // Pipe: response bytes → text → SSE events → UIMessageChunks
     return response.body
       .pipeThrough(new TextDecoderStream())
-      .pipeThrough(createSSEParserStream())
+      .pipeThrough(createSSEParserStream(sessionId))
       .pipeThrough(createEventToChunkStream(this.sideChannel));
   }
 
@@ -415,7 +472,9 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
       if (!info.is_processing) return null;
 
       // Session is mid-turn — subscribe to its event broadcast.
-      const response = await apiFetch(`/api/events/${this.sessionId}`, {
+      const lastSeq = localStorage.getItem(lastEventKey(this.sessionId));
+      const qs = lastSeq ? `?after=${encodeURIComponent(lastSeq)}` : '';
+      const response = await apiFetch(`/api/events/${this.sessionId}${qs}`, {
         headers: { 'Accept': 'text/event-stream' },
       });
       if (!response.ok || !response.body) return null;
@@ -424,7 +483,7 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
 
       return response.body
         .pipeThrough(new TextDecoderStream())
-        .pipeThrough(createSSEParserStream())
+        .pipeThrough(createSSEParserStream(this.sessionId))
         .pipeThrough(createEventToChunkStream(this.sideChannel));
     } catch {
       return null;
diff --git a/frontend/src/store/agentStore.ts b/frontend/src/store/agentStore.ts
index ca32566f7da27ee5a164b74601c79db43ed63fc0..e2743c74e4a0488519a55d9022ae2303e4baae3a 100644
--- a/frontend/src/store/agentStore.ts
+++ b/frontend/src/store/agentStore.ts
@@ -141,12 +141,21 @@ interface AgentStore {
   // Namespace overrides chosen for hf_jobs approvals (tool_call_id -> namespace)
   approvalNamespaces: Record<string, string>;
 
+  // Persisted preferred namespace for hf_jobs (auto-applied to future approvals
+  // so the user only picks once)
+  preferredJobsNamespace: string | null;
+
   // Job URLs (tool_call_id -> job URL) for HF jobs
   jobUrls: Record<string, string>;
 
   // Job statuses (tool_call_id -> job status) for HF jobs
   jobStatuses: Record<string, string>;
 
+  // Trackio dashboard config per tool call (tool_call_id -> {spaceId, project?})
+  // Set by hf_jobs / sandbox_create tools when the agent declares trackio_space_id;
+  // the UI uses it to embed the live dashboard via an iframe.
+  trackioDashboards: Record<string, { spaceId: string; project?: string }>;
+
   // Tool error states (tool_call_id -> true if errored) - persisted across renders
   toolErrors: Record<string, boolean>;
 
@@ -194,12 +203,17 @@ interface AgentStore {
   getApprovalNamespace: (toolCallId: string) => string | undefined;
   clearApprovalNamespaces: () => void;
 
+  setPreferredJobsNamespace: (namespace: string | null) => void;
+
   setJobUrl: (toolCallId: string, jobUrl: string) => void;
   getJobUrl: (toolCallId: string) => string | undefined;
 
   setJobStatus: (toolCallId: string, status: string) => void;
   getJobStatus: (toolCallId: string) => string | undefined;
 
+  setTrackioDashboard: (toolCallId: string, spaceId: string, project?: string) => void;
+  getTrackioDashboard: (toolCallId: string) => { spaceId: string; project?: string } | undefined;
+
   setToolError: (toolCallId: string, hasError: boolean) => void;
   getToolError: (toolCallId: string) => boolean | undefined;
 
@@ -264,6 +278,48 @@ function saveRejectedTools(rejected: Record<string, boolean>): void {
   }
 }
 
+// Trackio dashboards are persisted so the embedded iframe survives a page reload
+// mid-job, the exact moment the user wants to keep watching it. Anything stored
+// that is not a plain object (corrupt JSON, `null`, an array) loads as empty.
+function loadTrackioDashboards(): Record<string, { spaceId: string; project?: string }> {
+  try {
+    const parsed: unknown = JSON.parse(localStorage.getItem('hf-agent-trackio-dashboards') || 'null');
+    const usable = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
+    return usable ? (parsed as Record<string, { spaceId: string; project?: string }>) : {};
+  } catch {
+    return {};
+  }
+}
+
+function saveTrackioDashboards(dashboards: Record<string, { spaceId: string; project?: string }>): void {
+  // Best-effort write of the whole map under a fixed localStorage key;
+  // a failure (serialization or storage) is logged and not rethrown.
+  try {
+    localStorage.setItem('hf-agent-trackio-dashboards', JSON.stringify(dashboards));
+  } catch (err) { console.warn('Failed to persist trackio dashboards:', err); }
+}
+
+const PREFERRED_JOBS_NAMESPACE_KEY = 'hf-agent-preferred-jobs-namespace';
+
+function loadPreferredJobsNamespace(): string | null {
+  // Returns the saved namespace, or null when none is stored or storage is unreadable.
+  try {
+    const saved = localStorage.getItem(PREFERRED_JOBS_NAMESPACE_KEY);
+    return saved;
+  } catch { return null; }
+}
+
+function savePreferredJobsNamespace(namespace: string | null): void {
+  // Store the namespace under the preference key, or remove the stored entry
+  // when the namespace is null/empty. Best-effort: a storage failure is
+  // logged with console.warn and not rethrown.
+  try {
+    if (!namespace) localStorage.removeItem(PREFERRED_JOBS_NAMESPACE_KEY);
+    else localStorage.setItem(PREFERRED_JOBS_NAMESPACE_KEY, namespace);
+  } catch (err) {
+    console.warn('Failed to persist preferred jobs namespace:', err);
+  }
+}
+
 export const useAgentStore = create<AgentStore>()((set, get) => ({
   sessionStates: {},
   activeSessionId: null,
@@ -285,8 +341,10 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
 
   editedScripts: {},
   approvalNamespaces: {},
+  preferredJobsNamespace: loadPreferredJobsNamespace(),
   jobUrls: {},
   jobStatuses: {},
+  trackioDashboards: loadTrackioDashboards(),
   toolErrors: loadToolErrors(),
   rejectedTools: loadRejectedTools(),
 
@@ -465,6 +523,11 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
 
   clearApprovalNamespaces: () => set({ approvalNamespaces: {} }),
 
+  setPreferredJobsNamespace: (namespace) => {
+    savePreferredJobsNamespace(namespace);
+    set({ preferredJobsNamespace: namespace });
+  },
+
   // ── Job URLs ────────────────────────────────────────────────────────
 
   setJobUrl: (toolCallId, jobUrl) => {
@@ -485,6 +548,26 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
 
   getJobStatus: (toolCallId) => get().jobStatuses[toolCallId],
 
+  // ── Trackio Dashboards ──────────────────────────────────────────────
+
+  setTrackioDashboard: (toolCallId, spaceId, project) => {
+    set((state) => {
+      const existing = state.trackioDashboards[toolCallId];
+      // No-op when unchanged (avoids extra renders); a falsy project is stored as absent.
+      if (existing && existing.spaceId === spaceId && existing.project === (project || undefined)) {
+        return {};
+      }
+      const updated = {
+        ...state.trackioDashboards,
+        [toolCallId]: { spaceId, ...(project ? { project } : {}) },
+      };
+      saveTrackioDashboards(updated);
+      return { trackioDashboards: updated };
+    });
+  },
+
+  getTrackioDashboard: (toolCallId) => get().trackioDashboards[toolCallId],
+
   // ── Tool Errors ─────────────────────────────────────────────────────
 
   setToolError: (toolCallId, hasError) => {
diff --git a/frontend/src/store/sessionStore.ts b/frontend/src/store/sessionStore.ts
index 62961b2dc97e18b3f4d247a0c60e48b0c2e20cf9..967c65000c6e48c81126c9579b4d97a6b27e5268 100644
--- a/frontend/src/store/sessionStore.ts
+++ b/frontend/src/store/sessionStore.ts
@@ -20,6 +20,14 @@ interface SessionStore {
   markExpired: (id: string) => void;
   /** Clear the expired flag (used after restore-with-summary succeeds). */
   clearExpired: (id: string) => void;
+  /** Merge durable server-side sessions into local sidebar metadata. */
+  mergeServerSessions: (sessions: Array<{
+    session_id: string;
+    title?: string | null;
+    created_at: string;
+    is_active?: boolean;
+    pending_approval?: unknown[] | null;
+  }>) => void;
   /** Atomically swap a session's id in the list + both localStorage caches.
    *  Used when we rehydrate an expired session into a freshly-created backend
    *  session — preserves title, timestamps, and messages. */
@@ -76,6 +84,45 @@ export const useSessionStore = create<SessionStore>()(
         }));
       },
 
+      mergeServerSessions: (serverSessions) => {
+        set((state) => {
+          const byId = new Map(state.sessions.map((s) => [s.id, s]));
+          const merged = [...state.sessions];
+          for (const server of serverSessions) {
+            const id = server.session_id;
+            if (!id) continue;
+            const existing = byId.get(id);
+            if (existing) {
+              const updated = {
+                ...existing,
+                title: server.title || existing.title,
+                isActive: server.is_active ?? existing.isActive,
+                needsAttention: Boolean(server.pending_approval?.length) || existing.needsAttention,
+                expired: false,
+              };
+              const idx = merged.findIndex((s) => s.id === id);
+              if (idx >= 0) merged[idx] = updated;
+              byId.set(id, updated);
+              continue;
+            }
+            const newSession: SessionMeta = {
+              id,
+              title: server.title || `Chat ${merged.length + 1}`,
+              createdAt: server.created_at || new Date().toISOString(),
+              isActive: server.is_active ?? true,
+              needsAttention: Boolean(server.pending_approval?.length),
+              expired: false,
+            };
+            merged.push(newSession);
+            byId.set(id, newSession);
+          }
+          return {
+            sessions: merged,
+            activeSessionId: state.activeSessionId || merged[merged.length - 1]?.id || null,
+          };
+        });
+      },
+
       renameSession: (oldId: string, newId: string) => {
         if (oldId === newId) return;
         moveMessages(oldId, newId);
diff --git a/frontend/src/types/events.ts b/frontend/src/types/events.ts
index 9cc197077e972a6ee811bbeecb66a06cd8a442e2..7319f2533376185bd414d502f0b6f990e9c9d153 100644
--- a/frontend/src/types/events.ts
+++ b/frontend/src/types/events.ts
@@ -24,6 +24,7 @@ export type EventType =
 export interface AgentEvent {
   event_type: EventType;
   data?: Record<string, unknown>;
+  seq?: number;
 }
 
 export interface ReadyEventData {
diff --git a/pyproject.toml b/pyproject.toml
index 89cadf94b8d066c7f4c70e9fa097554dee0139b8..c97737534612e4477e5d0f10a14783f854b15255 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
     "requests>=2.33.0",
     "litellm>=1.83.0",
     "boto3>=1.35.0",
-    "huggingface-hub>=1.0.1",
+    "huggingface-hub>=1.12.0",
     "fastmcp>=3.2.0",
     "prompt-toolkit>=3.0.0",
     "thefuzz>=0.22.1",
@@ -27,6 +27,7 @@ dependencies = [
     "httpx>=0.27.0",
     "websockets>=13.0",
     "apscheduler>=3.10,<4",
+    "pymongo>=4.17.0",
 ]
 
 [project.optional-dependencies]
@@ -42,7 +43,7 @@ eval = [
 # Development and testing dependencies
 dev = [
     "pytest>=9.0.2",
-    "pytest-asyncio>=0.26.0",
+    "pytest-asyncio>=1.2.0",
 ]
 
 # All dependencies (eval + dev)
@@ -58,7 +59,20 @@ requires = ["setuptools>=64"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-include = ["agent*"]
+# `configs` ships the JSON files loaded by agent.main.CLI_CONFIG_PATH at
+# runtime (resolves to <site-packages>/configs/cli_agent_config.json).
+# Without it, `uv tool install` / `pip install` produce a broken install
+# that imports fine but crashes at startup with FileNotFoundError.
+include = ["agent*", "configs"]
+
+[tool.setuptools.package-data]
+configs = ["*.json"]
+# Agent data files: system prompts loaded by ContextManager._load_system_prompt
+# at runtime (`<site-packages>/agent/prompts/system_prompt_v3.yaml`), plus the
+# package README. Without these, headless_main hangs forever — submission_loop
+# crashes with FileNotFoundError but headless_main doesn't check agent_task.done()
+# and just keeps awaiting the "ready" event_queue item that will never come.
+agent = ["README.md", "prompts/*.yaml"]
 
 [tool.uv]
 package = true
diff --git a/scripts/build_kpis.py b/scripts/build_kpis.py
index 10477288e6370f617a556e0a7c9e63acdd65e91a..dd62f85c0901a60acfec86bbd911d45277d4c684 100644
--- a/scripts/build_kpis.py
+++ b/scripts/build_kpis.py
@@ -38,15 +38,27 @@ re-running the same hour overwrites.
     llm_calls           — count of llm_call events
     tokens_prompt / _completion / _cache_read / _cache_creation
     cost_usd            — sum of llm_call.cost_usd
+    cost_per_session_mean / _p50 / _p95  — per-session cost distribution
     cache_hit_ratio     — cache_read / (cache_read + prompt)
-    tool_success_rate   — tool_output success=True / total tool_output
-    failure_rate        — sessions that ended with an `error` event / sessions
-    regenerate_rate     — sessions with any `undo_complete` event / sessions
+    tool_calls_total / _succeeded / _failed  — per-tool_output reliability counts
+    tool_success_rate   — succeeded / total (kept for back-compat)
+    successful_sessions / errored_sessions / regenerated_sessions  — outcome counts
+    failure_rate / regenerate_rate  — kept for back-compat
     time_to_first_action_s_p50 / _p95  — from session_start to first tool_call
     thumbs_up / thumbs_down
     hf_jobs_submitted / _succeeded / _blocked
+    sandboxes_created / _cpu / _gpu  — sandbox_create events bucketed by hardware
     pro_cta_clicks
     gpu_hours_by_flavor_json   — JSON-serialised {flavor: gpu-hours}
+    research_calls             — total `research` tool_call events
+    sessions_with_research     — sessions that called `research` ≥1
+    research_calls_per_session_p50 / _p95 — among sessions that did any (zero-only sessions excluded)
+    distinct_tools_per_session_p50 / _p95 — among sessions with ≥1 named tool_call
+    tool_calls_per_session_p50 / _p95     — among sessions with ≥1 named tool_call
+    tool_calls_per_turn_p50 / _p95        — calls / turns, among sessions with turns>0
+    tool_calls_by_name_json    — JSON {tool: total_calls} (all tools seen)
+    sessions_using_tool_json   — JSON {tool: distinct_sessions_using}
+    sessions_by_model_json     — JSON {model_name: count} (CLI vs Bedrock split)
 
 ================================================================================
  Usage
@@ -213,6 +225,7 @@ def _session_metrics(session: dict) -> dict:
         "thumbs_up": 0, "thumbs_down": 0,
         "hf_jobs_submitted": 0, "hf_jobs_succeeded": 0, "hf_jobs_blocked": 0,
         "pro_cta_clicks": 0,
+        "sandboxes_created": 0, "sandboxes_cpu": 0, "sandboxes_gpu": 0,
         "first_tool_s": -1,
     }
     events = session.get("events") or []
@@ -231,11 +244,19 @@ def _session_metrics(session: dict) -> dict:
     gpu_hours_by_flavor: dict[str, float] = defaultdict(float)
     jobs_submitted = 0
     jobs_succeeded = 0
-    jobs_blocked = 0
     thumbs_up = 0
     thumbs_down = 0
+    sandboxes_created = 0
+    sandboxes_cpu = 0
+    sandboxes_gpu = 0
+    jobs_blocked = 0
     pro_cta_clicks = 0
     pro_cta_by_source: dict[str, int] = defaultdict(int)
+    # Per-tool counters from tool_call events. Counted off tool_call (which
+    # carries data["tool"]) rather than tool_output (which only carries
+    # success/output) so we can attribute calls to specific tools.
+    tool_calls_by_name: dict[str, int] = defaultdict(int)
+    total_named_tool_calls = 0
 
     start_dt = _parse_ts(session_start)
 
@@ -260,6 +281,10 @@ def _session_metrics(session: dict) -> dict:
                 first_tool_ts = (ts - start_dt).total_seconds()
 
         elif et == "tool_call":
+            name = data.get("tool")
+            if name:
+                tool_calls_by_name[name] += 1
+                total_named_tool_calls += 1
             if first_tool_ts is None and ts is not None and start_dt is not None:
                 first_tool_ts = (ts - start_dt).total_seconds()
 
@@ -296,6 +321,19 @@ def _session_metrics(session: dict) -> dict:
             source = str(data.get("source") or "unknown")
             pro_cta_by_source[source] += 1
 
+        elif et == "sandbox_create":
+            sandboxes_created += 1
+            hardware = (data.get("hardware") or "").lower()
+            # CPU flavors are explicitly named "cpu-*". Everything else
+            # (including unknown/missing hardware strings) lands in the GPU
+            # bucket, since the auto-create default is "cpu-basic" which is
+            # matched here — anything that isn't is almost always an explicit
+            # GPU choice.
+            if hardware.startswith("cpu-"):
+                sandboxes_cpu += 1
+            else:
+                sandboxes_gpu += 1
+
     out["tool_calls_total"] = tool_total
     out["tool_calls_success"] = tool_success
     out["failures"] = 1 if had_error else 0
@@ -304,12 +342,22 @@ def _session_metrics(session: dict) -> dict:
     out["thumbs_down"] = thumbs_down
     out["hf_jobs_submitted"] = jobs_submitted
     out["hf_jobs_succeeded"] = jobs_succeeded
+    out["sandboxes_created"] = sandboxes_created
+    out["sandboxes_cpu"] = sandboxes_cpu
+    out["sandboxes_gpu"] = sandboxes_gpu
     out["hf_jobs_blocked"] = jobs_blocked
     out["pro_cta_clicks"] = pro_cta_clicks
     out["first_tool_s"] = first_tool_ts if first_tool_ts is not None else -1
     out["_gpu_hours_by_flavor"] = dict(gpu_hours_by_flavor)
     out["_pro_cta_by_source"] = dict(pro_cta_by_source)
     out["_user"] = session.get("user_id") or session.get("session_id")
+    # Intra-session tool fields. Underscore-prefixed = consumed by _aggregate
+    # only, never written to CSV directly.
+    out["_tool_calls_by_name"] = dict(tool_calls_by_name)
+    out["_research_calls"] = tool_calls_by_name.get("research", 0)
+    out["_distinct_tools_used"] = len(tool_calls_by_name)
+    out["_total_named_tool_calls"] = total_named_tool_calls
+    out["_model_name"] = session.get("model_name") or "unknown"
     return dict(out)
 
 
@@ -317,12 +365,36 @@ def _aggregate(per_session: list[dict]) -> dict:
     """Collapse a bucket's worth of session rollups into the final KPI row."""
     ttfa_values = [s["first_tool_s"] for s in per_session if s.get("first_tool_s", -1) >= 0]
     gpu_hours: dict[str, float] = defaultdict(float)
-    pro_cta_by_source: dict[str, int] = defaultdict(int)
     for s in per_session:
         for f, h in (s.get("_gpu_hours_by_flavor") or {}).items():
             gpu_hours[f] += h
-        for source, count in (s.get("_pro_cta_by_source") or {}).items():
-            pro_cta_by_source[source] += int(count)
+
+    # Per-tool aggregates. ``sessions_using_tool`` counts each session at most
+    # once per tool, so the dashboard can show "how many sessions reached for
+    # research" alongside "how many research calls overall".
+    tool_calls_by_name: dict[str, int] = defaultdict(int)
+    sessions_using_tool: dict[str, int] = defaultdict(int)
+    sessions_by_model: dict[str, int] = defaultdict(int)
+    for s in per_session:
+        for name, count in (s.get("_tool_calls_by_name") or {}).items():
+            tool_calls_by_name[name] += int(count)
+            sessions_using_tool[name] += 1
+        sessions_by_model[s.get("_model_name") or "unknown"] += 1
+
+    # Percentile inputs. All "per session" percentiles exclude sessions that
+    # never reached for the relevant signal — otherwise quiet hours
+    # (status-check sessions, abandoned new conversations) drag every median
+    # to 0 and the chart tells you nothing.
+    research_calls_nz = [s.get("_research_calls", 0) for s in per_session if s.get("_research_calls", 0) > 0]
+    distinct_tools_values = [s.get("_distinct_tools_used", 0) for s in per_session if s.get("_distinct_tools_used", 0) > 0]
+    total_calls_values = [s.get("_total_named_tool_calls", 0) for s in per_session if s.get("_total_named_tool_calls", 0) > 0]
+    # Per-turn intensity: turns>0 is the natural filter here (a session with
+    # 5 turns and 0 tools is a meaningful 0). Don't strip those.
+    calls_per_turn_values = [
+        s.get("_total_named_tool_calls", 0) / s["turns"]
+        for s in per_session
+        if s.get("turns", 0) > 0
+    ]
 
     total_sessions = sum(s["sessions"] for s in per_session)
     total_turns = sum(s["turns"] for s in per_session)
@@ -330,6 +402,16 @@ def _aggregate(per_session: list[dict]) -> dict:
     tokens_cache_read = sum(s["tokens_cache_read"] for s in per_session)
     tool_total = sum(s["tool_calls_total"] for s in per_session)
     tool_success = sum(s["tool_calls_success"] for s in per_session)
+    failures = int(sum(s["failures"] for s in per_session))
+    regenerates = int(sum(s["regenerate_sessions"] for s in per_session))
+    research_calls_total = int(sum(s.get("_research_calls", 0) for s in per_session))
+    sessions_with_research = sum(1 for s in per_session if s.get("_research_calls", 0) > 0)
+
+    # Per-session cost percentiles — chart "median session cost" alongside the
+    # mean so a few $700 outliers don't make you think every session is pricey.
+    session_costs = [float(s.get("cost_usd") or 0.0) for s in per_session]
+    cost_p50 = _percentile(session_costs, 0.5)
+    cost_p95 = _percentile(session_costs, 0.95)
 
     unique_users = {s.get("_user") for s in per_session if s.get("_user")}
 
@@ -343,26 +425,61 @@ def _aggregate(per_session: list[dict]) -> dict:
         "tokens_cache_read": int(tokens_cache_read),
         "tokens_cache_creation": int(sum(s["tokens_cache_creation"] for s in per_session)),
         "cost_usd": round(sum(s["cost_usd"] for s in per_session), 4),
+        # Per-session cost summaries.
+        "cost_per_session_mean": round(
+            sum(s["cost_usd"] for s in per_session) / total_sessions, 6
+        ) if total_sessions > 0 else 0.0,
+        "cost_per_session_p50": round(cost_p50, 6),
+        "cost_per_session_p95": round(cost_p95, 6),
         "cache_hit_ratio": round(
             tokens_cache_read / (tokens_cache_read + tokens_prompt), 4
         ) if (tokens_cache_read + tokens_prompt) > 0 else 0.0,
+        # Raw reliability COUNTS (these are what the dashboard shows directly).
+        "tool_calls_total": int(tool_total),
+        "tool_calls_succeeded": int(tool_success),
+        "tool_calls_failed": int(tool_total - tool_success),
+        "errored_sessions": failures,
+        # Successful = "did not raise an error event". Mutually exclusive
+        # with errored_sessions; sums with errored_sessions to total sessions.
+        "successful_sessions": int(total_sessions - failures),
+        # Regenerated is an orthogonal dimension (the user retried) — a
+        # session can be both successful and regenerated, or both errored
+        # and regenerated.
+        "regenerated_sessions": regenerates,
+        # Rates kept for backwards compatibility with anything reading the
+        # KPI dataset directly.
         "tool_success_rate": round(tool_success / tool_total, 4) if tool_total > 0 else 0.0,
-        "failure_rate": round(
-            sum(s["failures"] for s in per_session) / total_sessions, 4
-        ) if total_sessions > 0 else 0.0,
-        "regenerate_rate": round(
-            sum(s["regenerate_sessions"] for s in per_session) / total_sessions, 4
-        ) if total_sessions > 0 else 0.0,
+        "failure_rate": round(failures / total_sessions, 4) if total_sessions > 0 else 0.0,
+        "regenerate_rate": round(regenerates / total_sessions, 4) if total_sessions > 0 else 0.0,
         "time_to_first_action_s_p50": round(_percentile(ttfa_values, 0.5), 2),
         "time_to_first_action_s_p95": round(_percentile(ttfa_values, 0.95), 2),
         "thumbs_up": int(sum(s["thumbs_up"] for s in per_session)),
         "thumbs_down": int(sum(s["thumbs_down"] for s in per_session)),
         "hf_jobs_submitted": int(sum(s["hf_jobs_submitted"] for s in per_session)),
         "hf_jobs_succeeded": int(sum(s["hf_jobs_succeeded"] for s in per_session)),
-        "hf_jobs_blocked": int(sum(s["hf_jobs_blocked"] for s in per_session)),
-        "pro_cta_clicks": int(sum(s["pro_cta_clicks"] for s in per_session)),
+        "sandboxes_created": int(sum(s.get("sandboxes_created", 0) for s in per_session)),
+        "sandboxes_cpu": int(sum(s.get("sandboxes_cpu", 0) for s in per_session)),
+        "sandboxes_gpu": int(sum(s.get("sandboxes_gpu", 0) for s in per_session)),
+        "hf_jobs_blocked": int(sum(s.get("hf_jobs_blocked", 0) for s in per_session)),
+        "pro_cta_clicks": int(sum(s.get("pro_cta_clicks", 0) for s in per_session)),
         "gpu_hours_by_flavor_json": json.dumps(dict(gpu_hours), sort_keys=True),
-        "pro_cta_by_source_json": json.dumps(dict(pro_cta_by_source), sort_keys=True),
+        # Research KPIs — answer "is the agent reaching for research?".
+        "research_calls": research_calls_total,
+        "sessions_with_research": int(sessions_with_research),
+        "research_calls_per_session_p50": round(_percentile(research_calls_nz, 0.5), 2),
+        "research_calls_per_session_p95": round(_percentile(research_calls_nz, 0.95), 2),
+        # Intra-session breadth + intensity. p50 + p95 over per-session values.
+        "distinct_tools_per_session_p50": round(_percentile(distinct_tools_values, 0.5), 2),
+        "distinct_tools_per_session_p95": round(_percentile(distinct_tools_values, 0.95), 2),
+        "tool_calls_per_session_p50": round(_percentile(total_calls_values, 0.5), 2),
+        "tool_calls_per_session_p95": round(_percentile(total_calls_values, 0.95), 2),
+        "tool_calls_per_turn_p50": round(_percentile(calls_per_turn_values, 0.5), 2),
+        "tool_calls_per_turn_p95": round(_percentile(calls_per_turn_values, 0.95), 2),
+        # JSON columns let the dashboard add/remove tools without schema churn.
+        "tool_calls_by_name_json": json.dumps(dict(tool_calls_by_name), sort_keys=True),
+        "sessions_using_tool_json": json.dumps(dict(sessions_using_tool), sort_keys=True),
+        # Surface split — answers "is research dropping on Bedrock specifically?".
+        "sessions_by_model_json": json.dumps(dict(sessions_by_model), sort_keys=True),
     }
 
 
diff --git a/scripts/sweep_orphan_sandboxes.py b/scripts/sweep_orphan_sandboxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf941736ede7c667d7439c933a34d76173b5c5f
--- /dev/null
+++ b/scripts/sweep_orphan_sandboxes.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""Backstop sweeper for orphan ml-intern sandbox Spaces.
+
+================================================================================
+ Why this script exists
+================================================================================
+
+The agent creates a sandbox Space per session (template duplicated from
+``burtenshaw/sandbox`` into the user's account, named ``<owner>/sandbox-<8hex>``).
+``backend.session_manager.SessionManager._cleanup_sandbox`` deletes it at end of
+session. In practice the cleanup misses some sandboxes:
+
+- pod killed / OOM / pre-emption / deploy rollouts → ``finally`` block skipped
+- WebSocket dropped without ``/shutdown`` from the client
+- HF API transient failure on ``delete_repo`` (we retry now, but not infinitely)
+
+The result observed 2026-04-27 was 2,310 orphan ``sandbox-*`` Spaces — every
+sandbox ever created was still around. This script is the backstop: list every
+``sandbox-*`` fork of ``burtenshaw/sandbox`` that hasn't been touched in N days
+and delete it.
+
+================================================================================
+ Identification rules
+================================================================================
+
+A Space is considered an orphan ml-intern sandbox iff ALL hold:
+
+1. Repo type = ``space``
+2. Name matches ``<owner>/sandbox-[a-f0-9]{8}$`` (the agent's naming convention)
+3. (Not enforced) origin is ``burtenshaw/sandbox`` — the HF API does not expose
+   the origin repo, so rule 2's naming pattern is the only fork check
+4. ``lastModified`` older than ``--max-age-days`` (default 7)
+
+We DO NOT use the ``runtime.stage`` (sleeping/running) as a filter — a sandbox
+that has been sleeping for 7 days is just as orphan as a deleted one but uses
+no compute. The cleanup is about repo/storage hygiene, not about waking
+something up to kill it.
+
+================================================================================
+ Safety
+================================================================================
+
+- Dry-run is the default (there is no ``--dry-run`` flag): prints what would be deleted, deletes nothing.
+- ``--apply`` actually calls ``HfApi.delete_repo``.
+- Hard cap ``--max-deletes`` (default 200) so a misconfigured run can't nuke
+  thousands at once.
+- Requires a token with admin rights via ``HF_ADMIN_TOKEN`` env var (the only
+  way to delete a Space owned by another user).
+- Logs every action to stdout in JSON Lines for downstream auditing.
+
+================================================================================
+ Cron suggestion
+================================================================================
+
+GitHub Actions, daily at 04:00 UTC:
+
+    schedule:
+      - cron: "0 4 * * *"
+    env:
+      HF_ADMIN_TOKEN: ${{ secrets.HF_ADMIN_TOKEN }}
+    steps:
+      - run: python scripts/sweep_orphan_sandboxes.py --apply --max-age-days 7
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime, timedelta, timezone
+
+from huggingface_hub import HfApi
+from huggingface_hub.utils import HfHubHTTPError
+
+SANDBOX_NAME_RE = re.compile(r"^[^/]+/sandbox-[a-f0-9]{8}$")
+TEMPLATE_REPO = "burtenshaw/sandbox"
+
+
+def log(record: dict) -> None:
+    """JSON Lines log so downstream tooling can grep / parse."""
+    record["ts"] = datetime.now(timezone.utc).isoformat()
+    print(json.dumps(record), flush=True)
+
+
+def is_sandbox_fork(space) -> bool:
+    """Filter: matches the ml-intern sandbox naming pattern.
+
+    NOTE: We initially tried filtering on ``duplicated_from == burtenshaw/sandbox``
+    too, for extra safety. That doesn't work — the HF REST API does not expose
+    ``duplicated_from`` on ``SpaceInfo`` (verified against ``huggingface-hub``
+    1.11+ and direct ``GET /api/spaces/{id}``: the field is None). The origin
+    repo lives in MongoDB but isn't surfaced. So we rely on the naming pattern
+    alone, which is specific enough: ``Sandbox.create()`` is the sole producer
+    of ``<owner>/sandbox-<8 lowercase hex>``, and that pattern is unlikely to
+    collide with user-created Spaces in practice. The ``--dry-run`` default
+    is the user-facing safety net for the rare false-positive.
+    """
+    return bool(SANDBOX_NAME_RE.match(space.id))
+
+
+def main() -> int:
+    """Run one sweep; return 0 if clean, 1 without a token, 2 if any delete failed."""
+    parser = argparse.ArgumentParser(description=__doc__.split("\n\n")[0])
+    parser.add_argument(
+        "--max-age-days",
+        type=int,
+        default=7,
+        help="Delete sandboxes whose lastModified is older than this many days (default: 7)",
+    )
+    parser.add_argument(
+        "--max-deletes",
+        type=int,
+        default=200,
+        help="Hard cap on deletions per run, safety guard (default: 200)",
+    )
+    parser.add_argument(
+        "--apply",
+        action="store_true",
+        help="Actually delete. Without this flag, dry-run only.",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=10000,
+        help="Max number of candidate Spaces to scan via list_spaces (default: 10000)",
+    )
+    args = parser.parse_args()
+
+    token = os.environ.get("HF_ADMIN_TOKEN")
+    if not token:
+        log({"level": "error", "msg": "HF_ADMIN_TOKEN env var not set"})
+        return 1
+
+    api = HfApi(token=token)
+    cutoff = datetime.now(timezone.utc) - timedelta(days=args.max_age_days)
+    log({"level": "info", "msg": "sweep_start", "cutoff": cutoff.isoformat(),
+         "max_deletes": args.max_deletes, "apply": args.apply})
+
+    # ``list_spaces`` doesn't filter by name pattern — we scan and filter
+    # client-side. ``search="sandbox"`` narrows the network payload.
+    candidates = api.list_spaces(search="sandbox", full=True, limit=args.limit)
+
+    scanned = matched = deleted = failed = 0
+    skipped_too_recent = skipped_unknown_age = skipped_capped = 0
+
+    for space in candidates:
+        scanned += 1
+        if not is_sandbox_fork(space):
+            continue
+        matched += 1
+
+        last_mod = getattr(space, "lastModified", None) or getattr(space, "last_modified", None)
+        if isinstance(last_mod, str):
+            last_mod = datetime.fromisoformat(last_mod.replace("Z", "+00:00"))
+        if last_mod is None:
+            # Fail safe: never delete a Space whose age cannot be established.
+            skipped_unknown_age += 1
+            continue
+        if last_mod.tzinfo is None:  # naive stamp: assume UTC so the comparison can't raise
+            last_mod = last_mod.replace(tzinfo=timezone.utc)
+        if last_mod > cutoff:
+            skipped_too_recent += 1
+            continue
+
+        log({"level": "info", "msg": "candidate", "space_id": space.id,
+             "last_modified": last_mod.isoformat()})
+
+        if not args.apply:
+            continue
+
+        # When we hit the deletion cap, keep scanning so the final ``matched``
+        # count reflects the *true* orphan size; operators planning multi-pass
+        # cleanups need an accurate denominator to know when they're done.
+        if deleted >= args.max_deletes:
+            skipped_capped += 1
+            continue
+
+        try:
+            api.delete_repo(repo_id=space.id, repo_type="space", token=token)
+            deleted += 1
+            log({"level": "info", "msg": "deleted", "space_id": space.id})
+            # Light throttle to avoid hitting HF API rate limits.
+            time.sleep(0.2)
+        except HfHubHTTPError as e:
+            failed += 1
+            log({"level": "error", "msg": "delete_failed", "space_id": space.id,
+                 "status": getattr(e.response, "status_code", None), "error": str(e)[:200]})
+        except Exception as e:
+            failed += 1
+            log({"level": "error", "msg": "delete_failed", "space_id": space.id,
+                 "error": str(e)[:200]})
+
+    log({"level": "info", "msg": "sweep_end",
+         "scanned": scanned, "matched": matched,
+         "skipped_too_recent": skipped_too_recent,
+         "skipped_unknown_age": skipped_unknown_age,
+         "skipped_capped": skipped_capped,
+         "deleted": deleted, "failed": failed,
+         "capped": skipped_capped > 0, "apply": args.apply})
+
+    return 0 if failed == 0 else 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/integration/test_live_sandbox_auth.py b/tests/integration/test_live_sandbox_auth.py
new file mode 100644
index 0000000000000000000000000000000000000000..b68f999044c80741a7ff3e0ef6d3dd3b6473d175
--- /dev/null
+++ b/tests/integration/test_live_sandbox_auth.py
@@ -0,0 +1,90 @@
+"""Opt-in live sandbox communication test.
+
+This test creates a real Hugging Face Space sandbox, verifies that unauthenticated
+requests are rejected, then exercises the authenticated agent client end-to-end.
+It is skipped unless ``ML_INTERN_LIVE_SANDBOX_TESTS=1`` and ``HF_TOKEN`` are set.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import httpx
+import pytest
+from dotenv import load_dotenv
+from huggingface_hub import HfApi
+
+from agent.tools.sandbox_client import Sandbox
+
+
+if env_file := os.environ.get("ML_INTERN_LIVE_ENV_FILE"):
+    load_dotenv(Path(env_file))
+
+
+def _skip_without_live_sandbox() -> None:
+    if os.environ.get("ML_INTERN_LIVE_SANDBOX_TESTS") != "1":
+        pytest.skip("set ML_INTERN_LIVE_SANDBOX_TESTS=1 to create a real sandbox")
+    if not os.environ.get("HF_TOKEN"):
+        pytest.skip("set HF_TOKEN to create a real sandbox")
+
+
+def test_live_sandbox_authenticated_agent_communication():
+    _skip_without_live_sandbox()
+
+    token = os.environ["HF_TOKEN"]
+    owner = HfApi(token=token).whoami()["name"]
+    sandbox = None
+
+    try:
+        sandbox = Sandbox.create(
+            owner=owner,
+            name="ml-intern-live-auth",
+            hardware="cpu-basic",
+            private=False,
+            token=token,
+            secrets={"HF_TOKEN": token},
+            wait_timeout=900,
+        )
+
+        unauthenticated = httpx.Client(
+            base_url=sandbox._base_url,
+            timeout=30,
+            follow_redirects=True,
+        )
+        try:
+            denied = unauthenticated.post("exists", json={"path": "/tmp"})
+            assert denied.status_code == 401
+        finally:
+            unauthenticated.close()
+
+        bash = sandbox.bash("printf sandbox-live-ok", timeout=30)
+        assert bash.success, bash.error
+        assert "sandbox-live-ok" in bash.output
+
+        write = sandbox.write("/tmp/ml_intern_live_auth.txt", "alpha\nbeta\n")
+        assert write.success, write.error
+
+        exists = sandbox._call("exists", {"path": "/tmp/ml_intern_live_auth.txt"})
+        assert exists.success, exists.error
+        assert exists.output == "true"
+
+        read = sandbox.read("/tmp/ml_intern_live_auth.txt")
+        assert read.success, read.error
+        assert "alpha" in read.output
+        assert "beta" in read.output
+
+        reattached = Sandbox.connect(
+            sandbox.space_id,
+            token=token,
+            api_token=sandbox.api_token,
+        )
+        try:
+            reread = reattached.read("/tmp/ml_intern_live_auth.txt")
+            assert reread.success, reread.error
+            assert "alpha" in reread.output
+        finally:
+            reattached._client.close()
+    finally:
+        if sandbox is not None:
+            sandbox.delete()
diff --git a/tests/integration/test_live_thinking_models.py b/tests/integration/test_live_thinking_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..391b260bfe566171433250299966972b0152c68d
--- /dev/null
+++ b/tests/integration/test_live_thinking_models.py
@@ -0,0 +1,151 @@
+"""Opt-in live provider checks for thinking metadata replay.
+
+These tests intentionally call paid model APIs and are skipped unless
+``ML_INTERN_LIVE_LLM_TESTS=1`` plus the relevant provider key are set.
+They cover the concrete model families involved in #87 without making
+default CI depend on external credentials or provider availability.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+from dotenv import load_dotenv
+from litellm import Message
+
+from agent.core.agent_loop import (
+    _assistant_message_from_result,
+    _call_llm_streaming,
+)
+from agent.core.llm_params import _resolve_llm_params
+
+
+if env_file := os.environ.get("ML_INTERN_LIVE_ENV_FILE"):
+    load_dotenv(Path(env_file))
+
+LIVE_TESTS_ENABLED = os.environ.get("ML_INTERN_LIVE_LLM_TESTS") == "1"
+OPUS_47_MODEL = "anthropic/claude-opus-4-7"
+LATEST_GPT_MODEL = "openai/gpt-5.2"
+REPORT_RESULT_TOOL = [
+    {
+        "type": "function",
+        "function": {
+            "name": "report_result",
+            "description": "Report the final test result.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "answer": {
+                        "type": "string",
+                        "description": "The exact marker requested by the test.",
+                    }
+                },
+                "required": ["answer"],
+            },
+        },
+    }
+]
+
+
+def _skip_without_live_flag() -> None:
+    if not LIVE_TESTS_ENABLED:
+        pytest.skip("set ML_INTERN_LIVE_LLM_TESTS=1 to run paid live LLM tests")
+
+
+def _skip_without_env(name: str) -> None:
+    if not os.environ.get(name):
+        pytest.skip(f"set {name} to run this live provider test")
+
+
+def _session(model_name: str):
+    events = []
+
+    async def send_event(event):
+        events.append(event)
+
+    return SimpleNamespace(
+        config=SimpleNamespace(model_name=model_name),
+        is_cancelled=False,
+        send_event=send_event,
+        events=events,
+    )
+
+
+@pytest.mark.asyncio
+async def test_live_opus_47_preserves_thinking_metadata_for_replay():
+    _skip_without_live_flag()
+    _skip_without_env("ANTHROPIC_API_KEY")
+
+    session = _session(OPUS_47_MODEL)
+    llm_params = _resolve_llm_params(
+        OPUS_47_MODEL,
+        reasoning_effort="high",
+    )
+
+    result = await _call_llm_streaming(
+        session,
+        messages=[
+            Message(
+                role="user",
+                content=(
+                    "Use careful reasoning for this small check. "
+                    "If 17 * 19 = 323, call report_result with answer OPUS_OK."
+                ),
+            )
+        ],
+        tools=REPORT_RESULT_TOOL,
+        llm_params=llm_params,
+    )
+
+    replay = _assistant_message_from_result(
+        result,
+        model_name=OPUS_47_MODEL,
+    )
+
+    assert result.content or result.tool_calls_acc
+    assert result.thinking_blocks, (
+        "Opus returned no thinking_blocks with reasoning_effort='high' - "
+        "check that adaptive thinking params are being forwarded correctly"
+    )
+    assert getattr(replay, "thinking_blocks", None) == result.thinking_blocks
+    assert getattr(replay, "reasoning_content", None) == result.reasoning_content
+
+
+@pytest.mark.asyncio
+async def test_live_latest_gpt_does_not_replay_reasoning_metadata():
+    _skip_without_live_flag()
+    _skip_without_env("OPENAI_API_KEY")
+
+    session = _session(LATEST_GPT_MODEL)
+    llm_params = _resolve_llm_params(
+        LATEST_GPT_MODEL,
+        reasoning_effort="low",
+    )
+
+    result = await _call_llm_streaming(
+        session,
+        messages=[
+            Message(
+                role="user",
+                content="Call report_result with answer GPT_OK.",
+            )
+        ],
+        tools=REPORT_RESULT_TOOL,
+        llm_params=llm_params,
+    )
+
+    # Even if a GPT-family response carries provider reasoning internally,
+    # OpenAI-compatible history must not echo it back on the next tool turn.
+    # Force the non-None strip path when the live model omits reasoning details.
+    result.reasoning_content = result.reasoning_content or "synthetic-reasoning"
+    replay = _assistant_message_from_result(
+        result,
+        model_name=LATEST_GPT_MODEL,
+    )
+
+    assert result.content or result.tool_calls_acc
+    assert getattr(replay, "thinking_blocks", None) is None
+    assert getattr(replay, "reasoning_content", None) is None
diff --git a/tests/unit/test_build_kpis.py b/tests/unit/test_build_kpis.py
index 5edefc572b461f77c9ad9bde58b8eddf0bfce478..6efba2366bc66efd5a179ce71119bdcc6026d44b 100644
--- a/tests/unit/test_build_kpis.py
+++ b/tests/unit/test_build_kpis.py
@@ -136,20 +136,141 @@ def test_aggregate_day_cache_hit_and_users():
     assert abs(row["cost_usd"] - 1.5) < 1e-9
 
 
-def test_aggregate_day_sums_pro_click_sources():
+def test_per_tool_counts_in_session_metrics():
+    mod = _load()
+    events = [
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "research"}),
+        _ev("tool_call", {"tool": "read"}),
+        _ev("tool_call", {}),  # nameless tool_call must be ignored
+    ]
+    m = mod._session_metrics(_session(events, user_id="u1"))
+    assert m["_tool_calls_by_name"] == {"bash": 2, "research": 1, "read": 1}
+    assert m["_research_calls"] == 1
+    assert m["_distinct_tools_used"] == 3
+    assert m["_total_named_tool_calls"] == 4
+    assert m["_model_name"] == "claude-opus-4-6"
+
+
+def test_aggregate_research_kpis_only_count_doer_sessions():
     mod = _load()
     s1 = mod._session_metrics(_session([
-        _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
-        _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
+        _ev("tool_call", {"tool": "research"}),
+        _ev("tool_call", {"tool": "research"}),
+        _ev("tool_call", {"tool": "research"}),
     ], user_id="u1"))
     s2 = mod._session_metrics(_session([
+        _ev("tool_call", {"tool": "research"}),
+    ], user_id="u2"))
+    s3 = mod._session_metrics(_session([
+        _ev("tool_call", {"tool": "bash"}),
+    ], user_id="u3"))
+    row = mod._aggregate([s1, s2, s3])
+    assert row["sessions"] == 3
+    assert row["sessions_with_research"] == 2
+    assert row["research_calls"] == 4
+    # Median among sessions that did any research = (1, 3) -> 2.0
+    assert row["research_calls_per_session_p50"] == 2.0
+
+
+def test_aggregate_tool_breadth_and_intensity():
+    import json as _json
+    mod = _load()
+    s1 = mod._session_metrics(_session([
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "research"}),
+    ], user_id="u1"))
+    # u2 is given two user messages below, so its calls/turn = 4/2 = 2
+    s2 = _session([
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "edit"}),
+        _ev("tool_call", {"tool": "edit"}),
+    ], user_id="u2")
+    s2["messages"] = [{"role": "user"}, {"role": "user"}]
+    s2_metrics = mod._session_metrics(s2)
+    row = mod._aggregate([s1, s2_metrics])
+    assert _json.loads(row["tool_calls_by_name_json"]) == {
+        "bash": 3, "research": 1, "edit": 2,
+    }
+    assert _json.loads(row["sessions_using_tool_json"]) == {
+        "bash": 2, "research": 1, "edit": 1,
+    }
+    # u1: 2 distinct, u2: 2 distinct -> p50 = 2
+    assert row["distinct_tools_per_session_p50"] == 2.0
+    # tool_calls_per_session: u1=2, u2=4 -> p50=3
+    assert row["tool_calls_per_session_p50"] == 3.0
+    # tool_calls_per_turn: u1=2/1 (_session() default is one user message), u2=4/2 -> p50=2
+    assert row["tool_calls_per_turn_p50"] == 2.0
+
+
+def test_breadth_intensity_percentiles_exclude_zero_tool_sessions():
+    """Sessions that never called a tool would otherwise crush the median."""
+    mod = _load()
+    # Two productive sessions and three idle ones (no tool calls). Without
+    # the doer-only filter, median of [0,0,0,2,4] = 0, which is useless.
+    productive_a = mod._session_metrics(_session([
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "research"}),
+    ], user_id="prod_a"))
+    productive_b = _session([
+        _ev("tool_call", {"tool": "bash"}),
+        _ev("tool_call", {"tool": "edit"}),
+        _ev("tool_call", {"tool": "edit"}),
+        _ev("tool_call", {"tool": "edit"}),
+    ], user_id="prod_b")
+    productive_b["messages"] = [{"role": "user"}, {"role": "user"}]
+    productive_b_metrics = mod._session_metrics(productive_b)
+    idle = [
+        mod._session_metrics(_session([], user_id="idle_a")),
+        mod._session_metrics(_session([], user_id="idle_b")),
+        mod._session_metrics(_session([], user_id="idle_c")),
+    ]
+    row = mod._aggregate([productive_a, productive_b_metrics, *idle])
+    # Median of [2 distinct, 2 distinct] = 2 (idle sessions filtered).
+    assert row["distinct_tools_per_session_p50"] == 2.0
+    # Median of [2 calls, 4 calls] = 3 (idle sessions filtered).
+    assert row["tool_calls_per_session_p50"] == 3.0
+
+
+def test_pro_clicks_and_blocked_jobs_in_aggregate():
+    """The aggregate row keeps pro_cta_clicks + hf_jobs_blocked columns
+    even if the dashboard doesn't currently chart them — they're cheap to
+    keep and downstream consumers may still depend on the schema."""
+    mod = _load()
+    s1 = mod._session_metrics(_session([
+        _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
         _ev("pro_cta_click", {"source": "claude_cap_dialog"}),
+        _ev("jobs_access_blocked", {}),
+    ], user_id="u1"))
+    s2 = mod._session_metrics(_session([
+        _ev("jobs_access_blocked", {}),
+        _ev("jobs_access_blocked", {}),
     ], user_id="u2"))
-    row = mod._aggregate_day([s1, s2])
-    assert row["pro_cta_clicks"] == 3
-    assert row["pro_cta_by_source_json"] == (
-        '{"claude_cap_dialog": 1, "hf_jobs_upgrade_dialog": 2}'
-    )
+    row = mod._aggregate([s1, s2])
+    assert row["pro_cta_clicks"] == 2
+    assert row["hf_jobs_blocked"] == 3
+
+
+def test_aggregate_sessions_by_model_split():
+    import json as _json
+    mod = _load()
+    s_anthropic = _session([], user_id="a")
+    s_anthropic["model_name"] = "anthropic/claude-opus-4-6"
+    s_bedrock = _session([], user_id="b")
+    s_bedrock["model_name"] = "bedrock/us.anthropic.claude-opus-4-6-v1"
+    s_bedrock2 = _session([], user_id="c")
+    s_bedrock2["model_name"] = "bedrock/us.anthropic.claude-opus-4-6-v1"
+    row = mod._aggregate([
+        mod._session_metrics(s_anthropic),
+        mod._session_metrics(s_bedrock),
+        mod._session_metrics(s_bedrock2),
+    ])
+    assert _json.loads(row["sessions_by_model_json"]) == {
+        "anthropic/claude-opus-4-6": 1,
+        "bedrock/us.anthropic.claude-opus-4-6-v1": 2,
+    }
 
 
 def test_failure_and_regenerate_rates():
diff --git a/tests/unit/test_cli_rendering.py b/tests/unit/test_cli_rendering.py
index 7704afd58e59a8aaab3ffbc582c0def677672d69..ff633c0673c4bbdafda9a1de46e69bbb007bdfad 100644
--- a/tests/unit/test_cli_rendering.py
+++ b/tests/unit/test_cli_rendering.py
@@ -1,8 +1,12 @@
 """Regression tests for interactive CLI rendering and research model routing."""
 
+import sys
 from io import StringIO
 from types import SimpleNamespace
 
+import pytest
+
+import agent.main as main_mod
 from agent.tools.research_tool import _get_research_model
 from agent.utils import terminal_display
 
@@ -42,3 +46,45 @@ def test_subagent_display_does_not_spawn_background_redraw(monkeypatch):
     mgr.clear("agent-1")
 
     assert calls == []
+
+
+def test_cli_forwards_model_flag_to_interactive_main(monkeypatch):
+    seen: dict[str, str | None] = {}
+
+    async def fake_main(*, model=None):
+        seen["model"] = model
+
+    monkeypatch.setattr(sys, "argv", ["ml-intern", "--model", "openai/gpt-5.5"])
+    monkeypatch.setattr(main_mod, "main", fake_main)
+
+    main_mod.cli()
+
+    assert seen["model"] == "openai/gpt-5.5"
+
+
+@pytest.mark.asyncio
+async def test_interactive_main_applies_model_override_before_banner(monkeypatch):
+    class StopAfterBanner(Exception):
+        pass
+
+    def fake_banner(*, model=None, hf_user=None):
+        assert model == "openai/gpt-5.5"
+        assert hf_user == "tester"
+        raise StopAfterBanner
+
+    monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
+    monkeypatch.setattr(main_mod, "PromptSession", lambda: object())
+    monkeypatch.setattr(main_mod, "resolve_hf_token", lambda: "hf-token")
+    monkeypatch.setattr(main_mod, "_get_hf_user", lambda _token: "tester")
+    monkeypatch.setattr(
+        main_mod,
+        "load_config",
+        lambda _path, **_kwargs: SimpleNamespace(
+            model_name="moonshotai/Kimi-K2.6",
+            mcpServers={},
+        ),
+    )
+    monkeypatch.setattr(main_mod, "print_banner", fake_banner)
+
+    with pytest.raises(StopAfterBanner):
+        await main_mod.main(model="openai/gpt-5.5")
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..71f92b2a44e9b4fcb2dbe73179baefd05e25b683
--- /dev/null
+++ b/tests/unit/test_config.py
@@ -0,0 +1,121 @@
+import json
+
+from agent import config as config_module
+
+
+def _write_json(path, data):
+    path.write_text(json.dumps(data), encoding="utf-8")
+
+
+def test_load_config_does_not_apply_slack_user_defaults_by_default(tmp_path, monkeypatch):
+    config_path = tmp_path / "config.json"
+    _write_json(
+        config_path,
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": False,
+                "destinations": {},
+            },
+        },
+    )
+    monkeypatch.setenv("SLACK_BOT_TOKEN", "xoxb-test")
+    monkeypatch.setenv("SLACK_CHANNEL_ID", "C123")
+
+    config = config_module.load_config(str(config_path))
+
+    assert not config.messaging.enabled
+    assert config.messaging.destinations == {}
+
+
+def test_load_config_applies_slack_user_defaults_from_env(tmp_path, monkeypatch):
+    config_path = tmp_path / "config.json"
+    _write_json(config_path, {"model_name": "moonshotai/Kimi-K2.6"})
+    monkeypatch.delenv("ML_INTERN_CLI_CONFIG", raising=False)
+    monkeypatch.setattr(
+        config_module,
+        "DEFAULT_USER_CONFIG_PATH",
+        tmp_path / "missing-user-config.json",
+    )
+    monkeypatch.setenv("SLACK_BOT_TOKEN", "xoxb-test")
+    monkeypatch.setenv("SLACK_CHANNEL_ID", "C123")
+
+    config = config_module.load_config(str(config_path), include_user_defaults=True)
+
+    assert config.messaging.enabled
+    assert config.messaging.auto_event_types == [
+        "approval_required",
+        "error",
+        "turn_complete",
+    ]
+    destination = config.messaging.destinations["slack.default"]
+    assert destination.token == "xoxb-test"
+    assert destination.channel == "C123"
+    assert destination.allow_agent_tool
+    assert destination.allow_auto_events
+
+
+def test_load_config_merges_user_config_before_env_substitution(tmp_path, monkeypatch):
+    config_path = tmp_path / "config.json"
+    user_config_path = tmp_path / "user-config.json"
+    _write_json(config_path, {"model_name": "moonshotai/Kimi-K2.6"})
+    _write_json(
+        user_config_path,
+        {
+            "messaging": {
+                "enabled": True,
+                "auto_event_types": ["approval_required"],
+                "destinations": {
+                    "slack.team": {
+                        "provider": "slack",
+                        "token": "${USER_SLACK_TOKEN}",
+                        "channel": "C999",
+                        "allow_agent_tool": False,
+                        "allow_auto_events": True,
+                    },
+                },
+            },
+        },
+    )
+    monkeypatch.setenv("ML_INTERN_CLI_CONFIG", str(user_config_path))
+    monkeypatch.setenv("ML_INTERN_SLACK_NOTIFICATIONS", "0")
+    monkeypatch.setenv("USER_SLACK_TOKEN", "xoxb-user")
+
+    config = config_module.load_config(str(config_path), include_user_defaults=True)
+
+    assert config.messaging.enabled
+    assert config.messaging.auto_event_types == ["approval_required"]
+    assert set(config.messaging.destinations) == {"slack.team"}
+    destination = config.messaging.destinations["slack.team"]
+    assert destination.token == "xoxb-user"
+    assert destination.channel == "C999"
+    assert not destination.allow_agent_tool
+    assert destination.allow_auto_events
+
+
+def test_slack_user_defaults_can_be_disabled(tmp_path, monkeypatch):
+    config_path = tmp_path / "config.json"
+    _write_json(
+        config_path,
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": False,
+                "destinations": {},
+            },
+        },
+    )
+    monkeypatch.delenv("ML_INTERN_CLI_CONFIG", raising=False)
+    monkeypatch.setattr(
+        config_module,
+        "DEFAULT_USER_CONFIG_PATH",
+        tmp_path / "missing-user-config.json",
+    )
+    monkeypatch.setenv("ML_INTERN_SLACK_NOTIFICATIONS", "false")
+    monkeypatch.setenv("SLACK_BOT_TOKEN", "xoxb-test")
+    monkeypatch.setenv("SLACK_CHANNEL_ID", "C123")
+
+    config = config_module.load_config(str(config_path), include_user_defaults=True)
+
+    assert not config.messaging.enabled
+    assert config.messaging.destinations == {}
diff --git a/tests/unit/test_doom_loop.py b/tests/unit/test_doom_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbdac454d9c9b0b15e27747fe4c09e75fdf8b670
--- /dev/null
+++ b/tests/unit/test_doom_loop.py
@@ -0,0 +1,232 @@
+"""Tests for the doom-loop detector — repeated/cycling tool call patterns."""
+
+from dataclasses import dataclass
+
+from agent.core.doom_loop import (
+    ToolCallSignature,
+    _hash_args,
+    _normalize_args,
+    check_for_doom_loop,
+    detect_identical_consecutive,
+    detect_repeating_sequence,
+    extract_recent_tool_signatures,
+)
+
+
+# ── Lightweight stand-ins so we don't need the litellm message classes ──
+
+
+@dataclass
+class _Fn:
+    name: str
+    arguments: str
+
+
+@dataclass
+class _ToolCall:
+    function: _Fn
+
+
+@dataclass
+class _Msg:
+    role: str
+    tool_calls: list | None = None
+
+
+def _assistant_call(name: str, args: str) -> _Msg:
+    return _Msg(role="assistant", tool_calls=[_ToolCall(_Fn(name, args))])
+
+
+# ── _normalize_args / _hash_args ────────────────────────────────────────
+
+
+def test_normalize_args_collapses_key_order():
+    a = '{"path": "/foo", "query": "bar"}'
+    b = '{"query": "bar", "path": "/foo"}'
+    assert _normalize_args(a) == _normalize_args(b)
+
+
+def test_normalize_args_collapses_whitespace():
+    a = '{"path": "/foo", "query": "bar"}'
+    b = '{"path":"/foo","query":"bar"}'
+    assert _normalize_args(a) == _normalize_args(b)
+
+
+def test_normalize_args_preserves_value_difference():
+    a = '{"path": "/foo"}'
+    b = '{"path": "/bar"}'
+    assert _normalize_args(a) != _normalize_args(b)
+
+
+def test_normalize_args_preserves_nested_structure():
+    a = '{"a": {"x": 1, "y": 2}, "b": [3, 4]}'
+    b = '{"b": [3, 4], "a": {"y": 2, "x": 1}}'  # same data; keys reordered at both levels
+    assert _normalize_args(a) == _normalize_args(b)  # nested key order must collapse too
+
+
+def test_normalize_args_array_order_is_significant():
+    # Lists are positional — different orderings should NOT collapse.
+    a = '{"items": [1, 2, 3]}'
+    b = '{"items": [3, 2, 1]}'
+    assert _normalize_args(a) != _normalize_args(b)
+
+
+def test_normalize_args_falls_back_for_invalid_json():
+    # Some providers occasionally pass a bare string; we shouldn't raise.
+    assert _normalize_args("not json") == "not json"
+    assert _normalize_args("{broken") == "{broken"
+
+
+def test_normalize_args_handles_empty_string():
+    assert _normalize_args("") == ""
+
+
+def test_hash_args_collapses_semantically_identical_calls():
+    # The headline regression: pre-fix these hashed differently and the
+    # doom-loop detector silently missed identical-consecutive calls.
+    a = '{"path": "/foo", "query": "bar"}'
+    b = '{"query": "bar", "path": "/foo"}'
+    assert _hash_args(a) == _hash_args(b)
+
+
+def test_hash_args_still_differs_on_real_argument_change():
+    assert _hash_args('{"path": "/a"}') != _hash_args('{"path": "/b"}')
+
+
+# ── extract_recent_tool_signatures ──────────────────────────────────────
+
+
+def test_extract_recent_signatures_collapses_reordered_keys():
+    """Three calls with reordered keys should produce identical signatures."""
+    msgs = [
+        _assistant_call("read", '{"path": "/foo", "limit": 100}'),
+        _assistant_call("read", '{"limit": 100, "path": "/foo"}'),
+        _assistant_call("read", '{"path":"/foo","limit":100}'),
+    ]
+    sigs = extract_recent_tool_signatures(msgs)
+    assert len(sigs) == 3
+    assert sigs[0] == sigs[1] == sigs[2]
+
+
+def test_extract_skips_non_assistant_messages():
+    msgs = [
+        _Msg(role="user", tool_calls=None),
+        _assistant_call("read", '{"path": "/x"}'),
+        _Msg(role="tool", tool_calls=None),
+    ]
+    sigs = extract_recent_tool_signatures(msgs)
+    assert len(sigs) == 1
+    assert sigs[0].name == "read"
+
+
+def test_extract_skips_assistant_without_tool_calls():
+    msgs = [_Msg(role="assistant", tool_calls=None)]
+    assert extract_recent_tool_signatures(msgs) == []
+
+
+# ── detect_identical_consecutive ────────────────────────────────────────
+
+
+def _sig(name: str, args: str = "{}") -> ToolCallSignature:
+    return ToolCallSignature(name=name, args_hash=_hash_args(args))
+
+
+def test_identical_consecutive_fires_at_threshold():
+    sigs = [_sig("read", '{"p": 1}')] * 3
+    assert detect_identical_consecutive(sigs, threshold=3) == "read"
+
+
+def test_identical_consecutive_stays_silent_below_threshold():
+    sigs = [_sig("read", '{"p": 1}')] * 2
+    assert detect_identical_consecutive(sigs, threshold=3) is None
+
+
+def test_identical_consecutive_resets_on_break():
+    # A, A, B, A, A — never 3 in a row.
+    sigs = [
+        _sig("read", '{"p": 1}'),
+        _sig("read", '{"p": 1}'),
+        _sig("read", '{"p": 2}'),
+        _sig("read", '{"p": 1}'),
+        _sig("read", '{"p": 1}'),
+    ]
+    assert detect_identical_consecutive(sigs, threshold=3) is None
+
+
+def test_identical_consecutive_catches_reordered_args_after_normalization():
+    """Regression for the bug: same call with shuffled keys must collapse."""
+    msgs = [
+        _assistant_call("research", '{"task": "find paper", "depth": 3}'),
+        _assistant_call("research", '{"depth": 3, "task": "find paper"}'),
+        _assistant_call("research", '{"task":"find paper","depth":3}'),
+    ]
+    sigs = extract_recent_tool_signatures(msgs)
+    assert detect_identical_consecutive(sigs, threshold=3) == "research"
+
+
+# ── detect_repeating_sequence ───────────────────────────────────────────
+
+
+def test_repeating_sequence_catches_alternating_pair():
+    sigs = [_sig("a"), _sig("b")] * 3
+    pattern = detect_repeating_sequence(sigs)
+    assert pattern is not None
+    assert [s.name for s in pattern] == ["a", "b"]
+
+
+def test_repeating_sequence_misses_when_pattern_breaks():
+    sigs = [_sig("a"), _sig("b"), _sig("a"), _sig("c")]
+    assert detect_repeating_sequence(sigs) is None
+
+
+def test_repeating_sequence_normalizes_args_inside_pattern():
+    """Cycle [research, read, research, read, ...] survives key reordering."""
+    msgs = [
+        _assistant_call("research", '{"q": "x", "n": 1}'),
+        _assistant_call("read", '{"path": "/a"}'),
+        _assistant_call("research", '{"n": 1, "q": "x"}'),
+        _assistant_call("read", '{"path":"/a"}'),
+        _assistant_call("research", '{"q":"x","n":1}'),
+        _assistant_call("read", '{"path": "/a"}'),
+    ]
+    sigs = extract_recent_tool_signatures(msgs)
+    pattern = detect_repeating_sequence(sigs)
+    assert pattern is not None
+    assert [s.name for s in pattern] == ["research", "read"]
+
+
+# ── check_for_doom_loop ─────────────────────────────────────────────────
+
+
+def test_check_for_doom_loop_quiet_below_minimum_signatures():
+    msgs = [_assistant_call("read", '{"p": 1}'), _assistant_call("read", '{"p": 1}')]
+    assert check_for_doom_loop(msgs) is None
+
+
+def test_check_for_doom_loop_returns_corrective_prompt_for_identical_run():
+    """Three identical consecutive calls must yield the corrective prompt."""
+    out = check_for_doom_loop([_assistant_call("read", '{"p": 1}')] * 3)
+    assert out is not None
+    assert "REPETITION GUARD" in out  # same marker test_doom_loop_polling.py asserts
+    assert "'read'" in out
+
+
+def test_check_for_doom_loop_returns_corrective_prompt_for_cycle():
+    """An a -> b cycle repeated three times must yield the corrective prompt."""
+    # Alternate the two calls three times: a, b, a, b, a, b.
+    msgs = [_assistant_call(name, "{}") for _ in range(3) for name in ("a", "b")]
+    out = check_for_doom_loop(msgs)
+    assert out is not None
+    # Same marker that test_doom_loop_polling.py asserts in this patch.
+    assert "REPETITION GUARD" in out
+    assert "a → b" in out
+
+
+def test_check_for_doom_loop_quiet_when_args_meaningfully_differ():
+    """Same tool, three different arg values — not a loop."""
+    msgs = [
+        _assistant_call("read", '{"path": "/a.py"}'),
+        _assistant_call("read", '{"path": "/b.py"}'),
+        _assistant_call("read", '{"path": "/c.py"}'),
+    ]
+    assert check_for_doom_loop(msgs) is None
diff --git a/tests/unit/test_doom_loop_polling.py b/tests/unit/test_doom_loop_polling.py
index 0142f4591482a1df777476f39f4f1dc4517245d6..0c7636e3ef7626e54bd2ce31ba0635db414dba9c 100644
--- a/tests/unit/test_doom_loop_polling.py
+++ b/tests/unit/test_doom_loop_polling.py
@@ -5,7 +5,7 @@ Reproduces the failure mode in observatory sessions 40fcb414 ($32.59),
 long-running job with `bash sleep 300 && wc -l output` four times in a
 row. The arguments were byte-identical, but the results moved (27210 →
 36454 → 45770 → 55138 — actual progress). The detector hashed args only
-and false-fired DOOM LOOP, which made the agent abandon perfectly valid
+and false-fired the repetition guard, which made the agent abandon perfectly valid
 polling.
 
 After the fix the signature includes the tool result hash, so identical
@@ -66,7 +66,7 @@ def test_truly_stuck_polling_with_identical_results_still_fires():
     ]
     prompt = check_for_doom_loop(msgs)
     assert prompt is not None
-    assert "DOOM LOOP" in prompt
+    assert "REPETITION GUARD" in prompt
     assert "bash" in prompt
 
 
@@ -80,7 +80,7 @@ def test_identical_calls_with_no_results_yet_still_fires():
     ]
     prompt = check_for_doom_loop(msgs)
     assert prompt is not None
-    assert "DOOM LOOP" in prompt
+    assert "REPETITION GUARD" in prompt
     assert "write" in prompt
 
 
diff --git a/tests/unit/test_hf_access.py b/tests/unit/test_hf_access.py
index 7ccb96ce705e380bea099b8df69ce6f00fa24d1a..b8ab089742feaed1c156f8fc4d7afde115a110f8 100644
--- a/tests/unit/test_hf_access.py
+++ b/tests/unit/test_hf_access.py
@@ -37,3 +37,19 @@ def test_free_user_without_paid_org_cannot_run_jobs():
     assert access.can_run_jobs is False
     assert access.eligible_namespaces == []
     assert access.default_namespace is None
+
+
+def test_oauth_pro_user_recognized_via_is_pro_flag():
+    # OAuth login surfaces Pro status only as `isPro: true`; the `type` key is
+    # a generic "user" string. Regression test for Space discussion #21 — Pro
+    # OAuth users were being classified as free and blocked from Jobs.
+    access = jobs_access_from_whoami({
+        "name": "alice",
+        "type": "user",
+        "isPro": True,
+        "orgs": [],
+    })
+    assert access.plan == "pro"
+    assert access.personal_can_run_jobs is True
+    assert access.eligible_namespaces == ["alice"]
+    assert access.default_namespace == "alice"
diff --git a/tests/unit/test_llm_params.py b/tests/unit/test_llm_params.py
index ee6cf62c68c82a9b89e25c683ac5e63d9d8dfa94..5234461ad0f10477c48de9e5cb98d1b1ec927f4f 100644
--- a/tests/unit/test_llm_params.py
+++ b/tests/unit/test_llm_params.py
@@ -1,4 +1,9 @@
-from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params
+from agent.core.hf_tokens import resolve_hf_request_token
+from agent.core.llm_params import (
+    UnsupportedEffortError,
+    _resolve_hf_router_token,
+    _resolve_llm_params,
+)
 
 
 def test_openai_xhigh_effort_is_forwarded():
@@ -23,3 +28,80 @@ def test_openai_max_effort_is_still_rejected():
         assert "OpenAI doesn't accept effort='max'" in str(exc)
     else:
         raise AssertionError("Expected UnsupportedEffortError for max effort")
+
+
+def test_hf_router_token_prefers_inference_token(monkeypatch):
+    monkeypatch.setenv("INFERENCE_TOKEN", " inference-token ")
+    monkeypatch.setenv("HF_TOKEN", "hf-token")
+
+    assert _resolve_hf_router_token("session-token") == "inference-token"
+
+
+def test_hf_router_token_prefers_session_over_hf_cache(monkeypatch):
+    monkeypatch.delenv("INFERENCE_TOKEN", raising=False)
+    monkeypatch.setenv("HF_TOKEN", "hf-token")
+
+    assert _resolve_hf_router_token(" session-token ") == "session-token"
+
+
+def test_hf_router_token_uses_hf_token_env_via_huggingface_hub(monkeypatch):
+    monkeypatch.delenv("INFERENCE_TOKEN", raising=False)
+    monkeypatch.setenv("HF_TOKEN", " hf-token ")
+
+    assert _resolve_hf_router_token(None) == "hf-token"
+
+
+def test_hf_router_token_uses_huggingface_hub_cache(monkeypatch):
+    import huggingface_hub
+
+    monkeypatch.delenv("INFERENCE_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
+    monkeypatch.setattr(huggingface_hub, "get_token", lambda: "cached-token")
+
+    assert _resolve_hf_router_token(None) == "cached-token"
+
+
+def test_hf_router_token_swallows_huggingface_hub_errors(monkeypatch):
+    import huggingface_hub
+
+    def fail():
+        raise RuntimeError("cache unavailable")
+
+    monkeypatch.delenv("INFERENCE_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
+    monkeypatch.setattr(huggingface_hub, "get_token", fail)
+
+    assert _resolve_hf_router_token(None) is None
+
+
+def test_hf_router_params_set_bill_to_only_for_inference_token(monkeypatch):
+    monkeypatch.setenv("INFERENCE_TOKEN", "inference-token")
+    monkeypatch.setenv("HF_BILL_TO", "test-org")
+
+    params = _resolve_llm_params("moonshotai/Kimi-K2.6")
+
+    assert params["api_key"] == "inference-token"
+    assert params["extra_headers"] == {"X-HF-Bill-To": "test-org"}
+
+
+def test_hf_request_token_keeps_browser_user_precedence(monkeypatch):
+    class Request:
+        headers = {"Authorization": "Bearer browser-token"}
+        cookies = {"hf_access_token": "cookie-token"}
+
+    monkeypatch.setenv("HF_TOKEN", "server-token")
+
+    assert resolve_hf_request_token(Request()) == "browser-token"
+
+
+def test_hf_request_token_does_not_use_cached_login(monkeypatch):
+    import huggingface_hub
+
+    class Request:
+        headers = {}
+        cookies = {}
+
+    monkeypatch.delenv("HF_TOKEN", raising=False)
+    monkeypatch.setattr(huggingface_hub, "get_token", lambda: "cached-token")
+
+    assert resolve_hf_request_token(Request()) is None
diff --git a/tests/unit/test_messaging.py b/tests/unit/test_messaging.py
new file mode 100644
index 0000000000000000000000000000000000000000..968622c1aa038f5615c31f610c03f912a186ed3e
--- /dev/null
+++ b/tests/unit/test_messaging.py
@@ -0,0 +1,511 @@
+import asyncio
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
+import httpx
+import pytest
+from pydantic import ValidationError
+
+from agent.config import Config
+from agent.core.session import Event, Session
+from agent.messaging.gateway import NotificationGateway
+from agent.messaging.models import NotificationRequest, NotificationResult
+from agent.messaging.slack import SlackProvider, _format_slack_mrkdwn
+from agent.tools.notify_tool import notify_handler
+from backend.session_manager import AgentSession, SessionManager
+
+
+class DummyToolRouter:  # tool-router stand-in that exposes no tool specs
+    def get_tool_specs_for_llm(self) -> list[dict]:
+        return []
+
+
+class RecordingGateway:
+    """Gateway double that records requests instead of delivering them."""
+
+    def __init__(self):
+        self.sent: list[NotificationRequest] = []
+        self.enqueued: list[NotificationRequest] = []
+
+    async def enqueue(self, request: NotificationRequest) -> bool:
+        self.enqueued.append(request)
+        return True
+
+    async def send_many(
+        self, requests: list[NotificationRequest]
+    ) -> list[NotificationResult]:
+        self.sent.extend(requests)
+        destinations = [request.destination for request in requests]
+        # Report every request as delivered by a fake "test" provider.
+        return [
+            NotificationResult(destination=name, ok=True, provider="test")
+            for name in destinations
+        ]
+
+
+def _config_with_messaging(**destination_overrides) -> Config:
+    """Build a Config with messaging enabled and one ``slack.ops`` destination.
+
+    Keyword arguments override or extend the default destination fields.
+    """
+    # Defaults go in first so caller-supplied keys win.
+    slack_ops = {"provider": "slack", "token": "xoxb-test", "channel": "C123"}
+    slack_ops.update(destination_overrides)
+
+    messaging = {
+        "enabled": True,
+        "destinations": {"slack.ops": slack_ops},
+    }
+    raw_config = {
+        "model_name": "moonshotai/Kimi-K2.6",
+        "messaging": messaging,
+    }
+    return Config.model_validate(raw_config)
+
+
+def _test_session(
+    config: Config, gateway, session_id: str = "session-test"
+) -> Session:
+    """Create a Session wired to ``gateway`` with a stub router and context."""
+    session_kwargs = {
+        "tool_router": DummyToolRouter(),
+        "context_manager": SimpleNamespace(items=[]),
+        "notification_gateway": gateway,
+        "session_id": session_id,
+    }
+    return Session(asyncio.Queue(), config=config, **session_kwargs)
+
+
+def test_messaging_config_validates_destination_names():
+    """``Slack Ops`` is rejected as a destination name; ``slack.ops`` works."""
+    destination_fields = {"provider": "slack", "token": "x", "channel": "C123"}
+    invalid_payload = {
+        "model_name": "moonshotai/Kimi-K2.6",
+        "messaging": {
+            "enabled": True,
+            "destinations": {"Slack Ops": destination_fields},
+        },
+    }
+    with pytest.raises(ValidationError):
+        Config.model_validate(invalid_payload)
+
+    # A well-formed name with both opt-ins is allowed on both send paths.
+    valid = _config_with_messaging(
+        allow_agent_tool=True,
+        allow_auto_events=True,
+    )
+    messaging = valid.messaging
+    assert messaging.can_agent_tool_send("slack.ops")
+    assert messaging.can_auto_send("slack.ops")
+
+
+def test_messaging_config_default_auto_destinations_only_returns_auto_enabled():
+    """Only the destination flagged ``allow_auto_events`` is an auto default."""
+    auto_destination = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    tool_destination = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C999",
+        "allow_agent_tool": True,
+    }
+
+    destinations = {"slack.ops": auto_destination, "slack.tool": tool_destination}
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {"enabled": True, "destinations": destinations},
+        }
+    )
+
+    auto_defaults = config.messaging.default_auto_destinations()
+    assert auto_defaults == ["slack.ops"]
+
+
+def test_messaging_config_default_auto_destinations_empty_when_disabled():
+    """With messaging disabled, no destination is returned as an auto default."""
+    slack_ops = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    payload = {
+        "model_name": "moonshotai/Kimi-K2.6",
+        "messaging": {
+            "enabled": False,
+            "destinations": {"slack.ops": slack_ops},
+        },
+    }
+    config = Config.model_validate(payload)
+
+    auto_defaults = config.messaging.default_auto_destinations()
+    assert auto_defaults == []
+
+
+def test_slack_mrkdwn_formatter_converts_common_markdown():
+    """Markdown converts to Slack mrkdwn; code stays literal, raw text escapes."""
+    slack_text = _format_slack_mrkdwn(
+        "# Result\n"
+        "**Done** with *details* and ~~old text~~.\n"
+        "See [PR](https://github.com/huggingface/ml-intern/pull/116).\n"
+        "Keep `**literal**` and ```python\nx < 3\n``` untouched.\n"
+        "Escape <raw> & text."
+    )
+
+    pr_link = "<https://github.com/huggingface/ml-intern/pull/116|PR>"
+    code_block = "```python\nx < 3\n```"
+    escaped = "Escape &lt;raw&gt; &amp; text."
+    converted = ("*Result*", "*Done*", "_details_", "~old text~")
+    remaining = ("`**literal**`", code_block, escaped)
+    for fragment in (*converted, pr_link, *remaining):
+        assert fragment in slack_text
+
+
+@pytest.mark.asyncio
+async def test_slack_provider_formats_and_sends_payload():
+    """SlackProvider posts a bearer-authenticated JSON body with mrkdwn text."""
+    captured: dict[str, object] = {}
+
+    def handler(request: httpx.Request) -> httpx.Response:
+        captured["auth"] = request.headers["Authorization"]
+        captured["content_type"] = request.headers["Content-Type"]
+        captured["json"] = request.read().decode("utf-8")
+        return httpx.Response(200, json={"ok": True, "ts": "123.456"})
+
+    destination = _config_with_messaging().messaging.destinations["slack.ops"]
+    notification = NotificationRequest(
+        destination="slack.ops",
+        title="Approval required",
+        message="A **run** is waiting. See [details](https://example.com).",
+        severity="warning",
+        metadata={"session_id": "sess-1"},
+    )
+    transport = httpx.MockTransport(handler)
+    async with httpx.AsyncClient(transport=transport) as client:
+        result = await SlackProvider().send(
+            client, "slack.ops", destination, notification
+        )
+
+    assert result.ok
+    assert result.external_id == "123.456"
+    assert captured["auth"] == "Bearer xoxb-test"
+    assert captured["content_type"].startswith("application/json")
+    body = json.loads(str(captured["json"]))
+    assert body["channel"] == "C123"
+    assert body["mrkdwn"] is True
+    assert body["text"] == (
+        "[WARNING] Approval required\n"
+        "A *run* is waiting. See <https://example.com|details>.\n"
+        "session_id: sess-1"
+    )
+
+
+@pytest.mark.asyncio
+async def test_notification_gateway_retries_transient_failures(monkeypatch):
+    """A 503 on the first attempt is retried and the second attempt succeeds."""
+    call_count = 0
+
+    def handler(_request: httpx.Request) -> httpx.Response:
+        nonlocal call_count
+        call_count += 1
+        if call_count > 1:
+            return httpx.Response(200, json={"ok": True, "ts": "999.1"})
+        return httpx.Response(503, json={"ok": False})
+
+    async def fake_sleep(_delay: float) -> None:
+        return None
+
+    # Swap the sleep the gateway module reaches through asyncio for a no-op.
+    monkeypatch.setattr("agent.messaging.gateway.asyncio.sleep", fake_sleep)
+
+    messaging = _config_with_messaging(allow_agent_tool=True).messaging
+    gateway = NotificationGateway(messaging)
+    request = NotificationRequest(destination="slack.ops", message="hello")
+    async with httpx.AsyncClient(transport=httpx.MockTransport(handler)) as client:
+        # Inject the mock-backed client so no real HTTP request is made.
+        gateway._client = client
+        result = await gateway.send(request)
+        gateway._client = None
+
+    assert call_count == 2
+    assert result.ok
+
+
+@pytest.mark.asyncio
+async def test_notify_tool_rejects_non_allowlisted_destinations():
+    """The notify tool refuses a destination lacking ``allow_agent_tool``."""
+    recorder = RecordingGateway()
+    restricted = _config_with_messaging(allow_agent_tool=False)
+    session = _test_session(restricted, recorder)
+    arguments = {"destinations": ["slack.ops"], "message": "done"}
+
+    output, ok = await notify_handler(arguments, session=session)
+
+    assert not ok
+    assert "unavailable for the notify tool" in output
+    # The rejected request must never reach the gateway.
+    assert recorder.sent == []
+
+
+@pytest.mark.asyncio
+async def test_notify_tool_sends_to_allowlisted_destinations():
+    """An allow-listed destination is sent with session and model metadata."""
+    recorder = RecordingGateway()
+    allowed = _config_with_messaging(allow_agent_tool=True)
+    session = _test_session(allowed, recorder, session_id="sess-42")
+    arguments = {
+        "destinations": ["slack.ops"],
+        "title": "Training complete",
+        "message": "The run finished successfully.",
+        "severity": "success",
+    }
+
+    output, ok = await notify_handler(arguments, session=session)
+
+    assert ok
+    assert output == "slack.ops: sent"
+    assert len(recorder.sent) == 1
+    # The recorded request carries metadata that was not in the tool arguments.
+    metadata = recorder.sent[0].metadata
+    assert metadata["session_id"] == "sess-42"
+    assert metadata["model"] == "moonshotai/Kimi-K2.6"
+
+
+@pytest.mark.asyncio
+async def test_session_auto_notifications_only_send_opted_in_auto_destinations():
+    """Auto events go only to destinations with ``allow_auto_events`` set."""
+    auto_destination = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    tool_destination = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C999",
+        "allow_agent_tool": True,
+    }
+    destinations = {"slack.ops": auto_destination, "slack.tool": tool_destination}
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {"enabled": True, "destinations": destinations},
+        }
+    )
+    recorder = RecordingGateway()
+    session = _test_session(config, recorder, session_id="sess-auto")
+    session.set_notification_destinations(["slack.ops", "slack.tool"])
+
+    approval_event = Event(
+        event_type="approval_required",
+        data={"tools": [{"tool": "hf_jobs", "tool_call_id": "tc-1"}]},
+    )
+    await session.send_event(approval_event)
+
+    # A plain assistant message must not produce an auto notification.
+    chat_event = Event(
+        event_type="assistant_message", data={"content": "normal message"}
+    )
+    await session.send_event(chat_event)
+
+    assert len(recorder.enqueued) == 1
+    queued = recorder.enqueued[0]
+    assert queued.destination == "slack.ops"
+    assert queued.severity == "warning"
+    assert queued.event_type == "approval_required"
+    assert "hf_jobs" in queued.message
+
+
+@pytest.mark.asyncio
+async def test_turn_complete_auto_notification_includes_final_response_summary():
+    """The ``turn_complete`` notification includes text from the final response."""
+    slack_ops = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": True,
+                "destinations": {"slack.ops": slack_ops},
+            },
+        }
+    )
+    recorder = RecordingGateway()
+    session = _test_session(config, recorder, session_id="sess-done")
+    session.set_notification_destinations(["slack.ops"])
+
+    final_response = "Evaluation finished. Accuracy: 84.2% on the validation split."
+    turn_complete = Event(
+        event_type="turn_complete",
+        data={
+            "history_size": 12,
+            "final_response": final_response,
+        },
+    )
+    await session.send_event(turn_complete)
+
+    assert len(recorder.enqueued) == 1
+    queued = recorder.enqueued[0]
+    assert queued.destination == "slack.ops"
+    assert queued.severity == "success"
+    assert queued.event_type == "turn_complete"
+    assert "completed successfully" in queued.message
+    assert "Accuracy: 84.2%" in queued.message
+
+
+@pytest.mark.asyncio
+async def test_turn_complete_auto_notification_supports_longer_summary():
+    """A 1200+ character final response reaches the message through its end."""
+    slack_ops = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": True,
+                "destinations": {"slack.ops": slack_ops},
+            },
+        }
+    )
+    recorder = RecordingGateway()
+    session = _test_session(config, recorder, session_id="sess-long")
+    session.set_notification_destinations(["slack.ops"])
+
+    filler = "A" * 1200
+    long_summary = filler + " END"
+    turn_complete = Event(
+        event_type="turn_complete",
+        data={
+            "history_size": 12,
+            "final_response": long_summary,
+        },
+    )
+    await session.send_event(turn_complete)
+
+    assert len(recorder.enqueued) == 1
+    queued = recorder.enqueued[0]
+    assert queued.event_type == "turn_complete"
+    assert filler in queued.message
+    assert queued.message.endswith("END")
+
+
+@pytest.mark.asyncio
+async def test_turn_complete_auto_notification_can_be_deferred():
+    """With deferral on, ``turn_complete`` notifies only when sent explicitly."""
+    slack_ops = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": True,
+                "destinations": {"slack.ops": slack_ops},
+            },
+        }
+    )
+    recorder = RecordingGateway()
+    session = Session(
+        asyncio.Queue(),
+        config=config,
+        tool_router=DummyToolRouter(),
+        context_manager=SimpleNamespace(items=[]),
+        notification_gateway=recorder,
+        notification_destinations=["slack.ops"],
+        defer_turn_complete_notification=True,
+        session_id="sess-deferred",
+    )
+    final_response = "Finished after the CLI drained the stream."
+    turn_complete = Event(
+        event_type="turn_complete",
+        data={"final_response": final_response},
+    )
+
+    await session.send_event(turn_complete)
+    assert recorder.enqueued == []
+
+    await session.send_deferred_turn_complete_notification(turn_complete)
+    assert len(recorder.enqueued) == 1
+    queued = recorder.enqueued[0]
+    assert queued.destination == "slack.ops"
+    assert queued.event_type == "turn_complete"
+    assert final_response in queued.message
+
+
+@pytest.mark.asyncio
+async def test_turn_complete_can_be_disabled_by_custom_auto_event_config():
+    """Limiting ``auto_event_types`` to ``error`` silences ``turn_complete``."""
+    slack_ops = {
+        "provider": "slack",
+        "token": "xoxb-test",
+        "channel": "C123",
+        "allow_auto_events": True,
+    }
+    config = Config.model_validate(
+        {
+            "model_name": "moonshotai/Kimi-K2.6",
+            "messaging": {
+                "enabled": True,
+                "auto_event_types": ["error"],
+                "destinations": {"slack.ops": slack_ops},
+            },
+        }
+    )
+    recorder = RecordingGateway()
+    session = _test_session(config, recorder, session_id="sess-optout")
+    session.set_notification_destinations(["slack.ops"])
+
+    turn_complete = Event(
+        event_type="turn_complete",
+        data={"final_response": "This should not notify."},
+    )
+    await session.send_event(turn_complete)
+
+    # ``turn_complete`` is absent from the configured auto event types.
+    assert recorder.enqueued == []
+
+
+def test_session_manager_updates_notification_destinations_in_session_info():
+    """Destinations are de-duplicated, exposed in session info, and validated."""
+    config = _config_with_messaging(allow_auto_events=True)
+    repo_root = Path(__file__).resolve().parents[2]
+    config_path = repo_root / "configs" / "cli_agent_config.json"
+    manager = SessionManager(str(config_path))
+    manager.config = config
+    manager.sessions = {}
+
+    session = _test_session(config, RecordingGateway(), session_id="sess-manager")
+    manager.sessions["sess-manager"] = AgentSession(
+        session_id="sess-manager",
+        session=session,
+        tool_router=DummyToolRouter(),
+        submission_queue=asyncio.Queue(),
+    )
+
+    duplicated = ["slack.ops", "slack.ops"]
+    updated = manager.set_notification_destinations("sess-manager", duplicated)
+    assert updated == ["slack.ops"]
+    info = manager.get_session_info("sess-manager")
+    assert info is not None
+    assert info["notification_destinations"] == ["slack.ops"]
+
+    with pytest.raises(ValueError):
+        manager.set_notification_destinations("sess-manager", ["slack.unknown"])
diff --git a/tests/unit/test_sandbox_api_auth.py b/tests/unit/test_sandbox_api_auth.py
new file mode 100644
index 0000000000000000000000000000000000000000..e60dfa5b34064756dda1e9776cd679b49b57878d
--- /dev/null
+++ b/tests/unit/test_sandbox_api_auth.py
@@ -0,0 +1,87 @@
+from fastapi.testclient import TestClient
+
+from agent.tools.sandbox_client import _SANDBOX_SERVER, Sandbox
+
+
+def _sandbox_app(
+    monkeypatch,
+    token: str | None = "sandbox-secret",
+    *,
+    hf_token: str | None = None,
+):
+    """Exec the sandbox server source under the given token env; return its app."""
+    env = {"SANDBOX_API_TOKEN": token, "HF_TOKEN": hf_token}
+    for name, value in env.items():
+        monkeypatch.delenv(name, raising=False)
+        if value is not None:
+            monkeypatch.setenv(name, value)
+    server_globals = {}
+    exec(_SANDBOX_SERVER, server_globals)
+    return server_globals["app"]
+
+
+def test_health_is_public(monkeypatch):
+    """The health endpoint answers without any Authorization header."""
+    app = _sandbox_app(monkeypatch)
+    reply = TestClient(app).get("/api/health")
+
+    assert reply.status_code == 200
+    assert reply.json() == {"status": "ok"}
+
+
+def test_file_and_command_routes_require_bearer_token(monkeypatch):
+    """A request without an Authorization header is rejected with 401."""
+    app = _sandbox_app(monkeypatch, "sandbox-secret")
+    reply = TestClient(app).post("/api/exists", json={"path": "/tmp"})
+
+    assert reply.status_code == 401
+
+
+def test_file_and_command_routes_accept_valid_bearer_token(monkeypatch):
+    """The configured sandbox token, sent as a bearer, is accepted."""
+    app = _sandbox_app(monkeypatch, "sandbox-secret")
+    client = TestClient(app)
+    auth = {"Authorization": "Bearer sandbox-secret"}
+    body = {"path": "/tmp"}
+
+    reply = client.post("/api/exists", json=body, headers=auth)
+
+    assert reply.status_code == 200
+    assert reply.json()["success"] is True
+
+
+def test_legacy_hf_token_fallback_is_accepted(monkeypatch):
+    """With no SANDBOX_API_TOKEN, a bearer matching HF_TOKEN is accepted."""
+    app = _sandbox_app(monkeypatch, token=None, hf_token="hf-secret")
+    client = TestClient(app)
+    auth = {"Authorization": "Bearer hf-secret"}
+    body = {"path": "/tmp"}
+
+    reply = client.post("/api/exists", json=body, headers=auth)
+
+    assert reply.status_code == 200
+    assert reply.json()["success"] is True
+
+
+def test_protected_routes_fail_closed_without_configured_token(monkeypatch):
+    """With neither token configured, a bearer request is refused with 503."""
+    app = _sandbox_app(monkeypatch, None)
+    client = TestClient(app)
+    auth = {"Authorization": "Bearer anything"}
+    body = {"path": "/tmp"}
+
+    reply = client.post("/api/exists", json=body, headers=auth)
+
+    assert reply.status_code == 503
+
+
+def test_sandbox_prefers_control_plane_token_for_api_headers():
+    """``api_token`` rather than ``token`` ends up in the Authorization header."""
+    box = Sandbox("owner/name", token="hf-token", api_token="sandbox-secret")
+    assert box._client.headers["authorization"] == "Bearer sandbox-secret"
+
+
+def test_sandbox_api_token_is_hidden_from_repr():
+    """The ``api_token`` value must not appear in ``repr`` of the sandbox."""
+    box = Sandbox("owner/name", token="hf-token", api_token="sandbox-secret")
+    assert "sandbox-secret" not in repr(box)
diff --git a/tests/unit/test_session_manager_persistence.py b/tests/unit/test_session_manager_persistence.py
new file mode 100644
index 0000000000000000000000000000000000000000..355f9387aef6fe674264dbb8e3cfbb634cc78ea6
--- /dev/null
+++ b/tests/unit/test_session_manager_persistence.py
@@ -0,0 +1,240 @@
+"""Regression tests for server-side session persistence restore/access."""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+from datetime import datetime, UTC
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+
+_BACKEND_DIR = Path(__file__).resolve().parent.parent.parent / "backend"
+if str(_BACKEND_DIR) not in sys.path:
+    sys.path.insert(0, str(_BACKEND_DIR))
+
+from agent.core.session_persistence import NoopSessionStore  # noqa: E402
+from session_manager import AgentSession, SessionManager  # noqa: E402
+
+
+class FakeRuntimeSession:  # attribute-only stand-in for a runtime session
+    def __init__(self, *, hf_token: str | None = None, model: str = "test-model"):
+        self.config = SimpleNamespace(model_name=model)
+        self.context_manager = SimpleNamespace(items=[])
+        self.hf_token = hf_token
+        self.notification_destinations = []
+        self.pending_approval = None
+        self.turn_count = 0
+
+
+class RestoreStore(NoopSessionStore):  # store double serving canned session data
+    enabled = True
+
+    def __init__(
+        self,
+        *,
+        metadata: dict[str, Any] | None = None,
+        messages: list[dict[str, Any]] | None = None,
+        delay: float = 0,
+    ) -> None:
+        self.load_calls = 0
+        self.delay = delay
+        self.messages = messages or []
+        self.metadata = metadata or {
+            "session_id": "persisted-session",
+            "user_id": "owner",
+            "model": "test-model",
+            "created_at": datetime.now(UTC),
+        }
+
+    async def load_session(self, session_id: str, **_: Any) -> dict[str, Any] | None:
+        self.load_calls += 1
+        if self.delay:
+            await asyncio.sleep(self.delay)
+        restored = dict(self.metadata)
+        for id_key in ("session_id", "_id"):
+            restored.setdefault(id_key, session_id)
+        return {"metadata": restored, "messages": self.messages}
+
+
+def _manager_with_store(store: NoopSessionStore) -> SessionManager:
+    manager = object.__new__(SessionManager)  # skips SessionManager.__init__
+    manager.persistence_store = store
+    manager._lock = asyncio.Lock()
+    manager.sessions = {}
+    manager.config = SimpleNamespace(model_name="test-model")
+    return manager
+
+
+def _runtime_agent_session(
+    session_id: str,
+    *,
+    user_id: str = "owner",
+    hf_token: str | None = "owner-token",
+) -> AgentSession:
+    """Build an AgentSession around a fake runtime session sharing ``hf_token``."""
+    return AgentSession(
+        session_id=session_id,
+        user_id=user_id,
+        hf_token=hf_token,
+        session=FakeRuntimeSession(hf_token=hf_token),  # type: ignore[arg-type]
+        tool_router=object(),  # type: ignore[arg-type]
+        submission_queue=asyncio.Queue(),
+    )
+
+
+def _install_fake_runtime(manager: SessionManager) -> asyncio.Event:
+    """Stub session creation and running; the returned event unblocks fake runs."""
+    release = asyncio.Event()
+    manager.run_calls = 0  # type: ignore[attr-defined]
+
+    async def fake_run_session(*_: Any) -> None:
+        manager.run_calls += 1  # type: ignore[attr-defined]
+        await release.wait()
+
+    def fake_create_session_sync(**kwargs: Any):
+        model = kwargs.get("model") or "test-model"
+        runtime = FakeRuntimeSession(hf_token=kwargs.get("hf_token"), model=model)
+        return object(), runtime
+
+    manager._run_session = fake_run_session  # type: ignore[method-assign]
+    manager._create_session_sync = fake_create_session_sync  # type: ignore[method-assign]
+    return release
+
+
+async def _cancel_runtime_tasks(manager: SessionManager) -> None:
+    """Cancel every unfinished session task on ``manager`` and await them."""
+    pending = []
+    for agent_session in manager.sessions.values():
+        task = agent_session.task
+        if task and not task.done():
+            task.cancel()
+            pending.append(task)
+    if pending:
+        await asyncio.gather(*pending, return_exceptions=True)
+
+
+@pytest.mark.asyncio
+async def test_existing_session_rejects_cross_user_token_overwrite():
+    """Another user's load attempt returns None and leaves the stored token."""
+    victim = _runtime_agent_session("s1", user_id="victim", hf_token="victim-token")
+    manager = _manager_with_store(NoopSessionStore())
+    manager.sessions["s1"] = victim
+
+    attacker = {"user_id": "attacker", "hf_token": "attacker-token"}
+    outcome = await manager.ensure_session_loaded("s1", **attacker)
+
+    assert outcome is None
+    assert victim.hf_token == "victim-token"
+    assert victim.session.hf_token == "victim-token"
+
+
+@pytest.mark.asyncio
+async def test_existing_session_updates_token_after_access_check():
+    """The same user's new token replaces the old one on both session objects."""
+    owned = _runtime_agent_session("s1", user_id="owner", hf_token="old-token")
+    manager = _manager_with_store(NoopSessionStore())
+    manager.sessions["s1"] = owned
+
+    owner = {"user_id": "owner", "hf_token": "new-token"}
+    outcome = await manager.ensure_session_loaded("s1", **owner)
+
+    assert outcome is owned
+    assert owned.hf_token == "new-token"
+    assert owned.session.hf_token == "new-token"
+
+
+@pytest.mark.asyncio
+async def test_concurrent_lazy_restore_starts_only_one_agent_task():
+    """Two simultaneous restores share one session and start one run task."""
+    manager = _manager_with_store(RestoreStore(delay=0.01))
+    release = _install_fake_runtime(manager)
+
+    try:
+        restores = [
+            manager.ensure_session_loaded("persisted-session", user_id="owner")
+            for _ in range(2)
+        ]
+        first, second = await asyncio.gather(*restores)
+        await asyncio.sleep(0)
+        assert first is second
+        assert list(manager.sessions) == ["persisted-session"]
+        assert manager.run_calls == 1  # type: ignore[attr-defined]
+        assert not release.is_set()
+    finally:
+        release.set()
+        await _cancel_runtime_tasks(manager)
+
+
+@pytest.mark.asyncio
+async def test_lazy_restore_preserves_pending_approval_tool_calls():
+    """Persisted ``pending_approval`` entries are restored as tool-call objects."""
+    arguments = '{"path":"app.py"}'
+    pending_call = {
+        "id": "call_123",
+        "type": "function",
+        "function": {"name": "create_file", "arguments": arguments},
+    }
+    metadata = {
+        "session_id": "approval-session",
+        "user_id": "owner",
+        "model": "test-model",
+        "pending_approval": [pending_call],
+    }
+    store = RestoreStore(metadata=metadata)
+    manager = _manager_with_store(store)
+    release = _install_fake_runtime(manager)
+
+    try:
+        restored = await manager.ensure_session_loaded(
+            "approval-session", user_id="owner"
+        )
+
+        assert restored is not None
+        tool_calls = restored.session.pending_approval["tool_calls"]
+        assert len(tool_calls) == 1
+        restored_call = tool_calls[0]
+        assert restored_call.id == "call_123"
+        assert restored_call.function.name == "create_file"
+        assert restored_call.function.arguments == arguments
+    finally:
+        release.set()
+        await _cancel_runtime_tasks(manager)
+
+
+@pytest.mark.asyncio
+async def test_list_sessions_dev_uses_store_dev_visibility():
+    """Listing as ``dev`` passes that id to the store and returns all its rows."""
+
+    class ListStore(NoopSessionStore):
+        enabled = True
+
+        def __init__(self) -> None:
+            self.seen_user_id: str | None = None
+
+        async def list_sessions(self, user_id: str, **_: Any) -> list[dict[str, Any]]:
+            self.seen_user_id = user_id
+            if user_id != "dev":
+                return []
+            owners = {"s1": "alice", "s2": "bob"}
+            return [
+                {
+                    "session_id": session_id,
+                    "user_id": owner,
+                    "model": "m",
+                    "created_at": datetime.now(UTC),
+                }
+                for session_id, owner in owners.items()
+            ]
+
+    store = ListStore()
+    manager = _manager_with_store(store)
+
+    listed = await manager.list_sessions(user_id="dev")
+
+    assert store.seen_user_id == "dev"
+
+    listed_ids = {row["session_id"] for row in listed}
+    assert listed_ids == {"s1", "s2"}
diff --git a/tests/unit/test_session_persistence.py b/tests/unit/test_session_persistence.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bddb10f875812df5534add23569eb4d1fe803d4
--- /dev/null
+++ b/tests/unit/test_session_persistence.py
@@ -0,0 +1,31 @@
+"""Unit tests for the optional durable session store abstraction."""
+
+import pytest
+
+from agent.core.session_persistence import NoopSessionStore, _safe_message_doc
+
+
+@pytest.mark.asyncio
+async def test_noop_store_keeps_local_cli_and_tests_db_free():
+    """Writes to the no-op store succeed yet nothing can be read back."""
+    store = NoopSessionStore()
+    identity = {"session_id": "s1", "user_id": "u1", "model": "m"}
+    hello = {"role": "user", "content": "hello"}
+
+    await store.init()
+    await store.upsert_session(**identity)
+    await store.save_snapshot(**identity, messages=[hello])
+
+    loaded = await store.load_session("s1")
+    assert loaded is None
+    listed = await store.list_sessions("u1")
+    assert listed == []
+    assert await store.append_event("s1", "processing", {}) is None
+    assert await store.try_increment_quota("u1", "2099-01-01", 1) is None
+
+
+def test_unsafe_message_payload_is_replaced_with_marker():
+    """A message whose content is a bare ``object()`` becomes a tool marker."""
+    doc = _safe_message_doc({"role": "assistant", "content": object()})
+    assert doc["role"] == "tool"
+    assert doc["ml_intern_persistence_error"] == "message_too_large_or_invalid"
diff --git a/tests/unit/test_sft_tagger.py b/tests/unit/test_sft_tagger.py
index 2ade0f64d5eb6ff49f61a4814fd78c8bdfe5f085..70d4edd60b280df7b3e989a1aa3ce22927794290 100644
--- a/tests/unit/test_sft_tagger.py
+++ b/tests/unit/test_sft_tagger.py
@@ -79,7 +79,7 @@ def test_outcome_ongoing():
 
 def test_outcome_doom_loop_and_context():
     events = [
-        _ev("tool_log", {"tool": "system", "log": "Doom loop detected — injecting corrective prompt"}),
+        _ev("tool_log", {"tool": "system", "log": "Doom loop detected"}),
         _ev("compacted", {"old_tokens": 100, "new_tokens": 50}),
         _ev("turn_complete", {"history_size": 10}),
     ]
diff --git a/tests/unit/test_thinking_history.py b/tests/unit/test_thinking_history.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ef4b2f61a023ac9899311f0e133dfe4438aa398
--- /dev/null
+++ b/tests/unit/test_thinking_history.py
@@ -0,0 +1,299 @@
+from types import SimpleNamespace
+
+import pytest
+from litellm import ChatCompletionMessageToolCall, Message
+
+from agent.core import agent_loop
+from agent.core.agent_loop import (
+    LLMResult,
+    _call_llm_streaming,
+    _assistant_message_from_result,
+    _extract_thinking_state,
+)
+
+
+def test_extract_thinking_state_from_litellm_message():
+    """Thinking fields set directly on a litellm Message are returned as-is."""
+    expected_blocks = [{"type": "thinking", "thinking": "reasoned"}]
+    assistant_msg = Message(
+        role="assistant",
+        content="working",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+    blocks, reasoning = _extract_thinking_state(assistant_msg)
+    assert blocks == expected_blocks
+    assert reasoning == "reasoned"
+
+
+def test_extract_thinking_state_from_provider_fields():
+    """Thinking state is also read from ``provider_specific_fields``."""
+    provider_fields = {
+        "thinking_blocks": [{"type": "thinking", "thinking": "reasoned"}],
+        "reasoning_content": "reasoned",
+    }
+    carrier = SimpleNamespace(provider_specific_fields=provider_fields)
+
+    blocks, reasoning = _extract_thinking_state(carrier)
+
+    assert blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert reasoning == "reasoned"
+
+
+def test_assistant_message_from_result_preserves_thinking_with_tool_calls():
+    """An Anthropic result keeps its thinking fields next to the tool calls."""
+    bash_call = ChatCompletionMessageToolCall(
+        id="call_1",
+        type="function",
+        function={"name": "bash", "arguments": '{"command": "date"}'},
+    )
+    llm_result = LLMResult(
+        content=None,
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="tool_calls",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+
+    rebuilt = _assistant_message_from_result(
+        llm_result, model_name="anthropic/claude-opus-4-6", tool_calls=[bash_call]
+    )
+
+    # Tool calls and both thinking fields must survive the conversion.
+    assert rebuilt.tool_calls == [bash_call]
+    assert rebuilt.thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert rebuilt.reasoning_content == "reasoned"
+
+
+def test_assistant_message_from_result_strips_non_anthropic_reasoning_content():
+    """For a non-Anthropic model name the thinking fields are not carried over."""
+    llm_result = LLMResult(
+        content=None,
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="tool_calls",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+    qwen_model = "openai/Qwen/Qwen3-Next-80B-A3B-Instruct"
+
+    rebuilt = _assistant_message_from_result(llm_result, model_name=qwen_model)
+
+    # getattr with a default: the attributes may be missing rather than None.
+    assert getattr(rebuilt, "thinking_blocks", None) is None
+    assert getattr(rebuilt, "reasoning_content", None) is None
+
+
+def test_assistant_message_from_result_omits_absent_thinking_fields():
+    """A result without thinking data yields a message without thinking fields."""
+    plain_result = LLMResult(
+        content="done",
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="stop",
+    )
+
+    rebuilt = _assistant_message_from_result(
+        plain_result, model_name="anthropic/claude-opus-4-6"
+    )
+
+    assert rebuilt.content == "done"
+    assert getattr(rebuilt, "thinking_blocks", None) is None
+    assert getattr(rebuilt, "reasoning_content", None) is None
+
+
+@pytest.mark.asyncio
+async def test_streaming_call_rebuilds_anthropic_thinking_state(monkeypatch):
+    """Thinking state from the rebuilt stream message lands on the result."""
+    model = "anthropic/claude-opus-4-6"
+    text_delta = SimpleNamespace(content="done", tool_calls=None)
+    text_choice = SimpleNamespace(delta=text_delta, finish_reason="stop")
+    usage_chunk = SimpleNamespace(choices=[], usage=SimpleNamespace(total_tokens=3))
+
+    async def fake_stream():
+        # One content chunk followed by a usage-only chunk.
+        yield SimpleNamespace(choices=[text_choice])
+        yield usage_chunk
+
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+
+    def fake_chunk_builder(chunks, **_kwargs):
+        # Both streamed chunks must be forwarded to the builder.
+        assert len(chunks) == 2
+        rebuilt = Message(
+            role="assistant",
+            content="done",
+            thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+            reasoning_content="reasoned",
+        )
+        return SimpleNamespace(choices=[SimpleNamespace(message=rebuilt)])
+
+    events = []
+
+    async def send_event(event):
+        events.append(event)
+
+    # Stand-in session carrying config, cancellation flag and event sink.
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name=model),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    # Route the loop's LLM call and chunk rebuild through the fakes above.
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fake_chunk_builder)
+
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": model},
+    )
+
+    # The streamed deltas carry no thinking fields; only the builder's message does.
+    assert result.content == "done"
+    assert result.thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert result.reasoning_content == "reasoned"
+
+
+@pytest.mark.asyncio
+async def test_streaming_call_rebuilds_anthropic_delta_thinking_state(monkeypatch):
+    """Thinking text and signature streamed in separate deltas are rebuilt."""
+    model = "anthropic/claude-opus-4-7"
+
+    def thinking_chunk(thinking, signature):
+        # Chunk whose delta carries a single thinking block and no content.
+        return SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(
+                        content=None,
+                        tool_calls=None,
+                        thinking_blocks=[
+                            {
+                                "type": "thinking",
+                                "thinking": thinking,
+                                "signature": signature,
+                            }
+                        ],
+                    ),
+                    finish_reason=None,
+                )
+            ],
+        )
+
+    async def fake_stream():
+        # Delta 1: thinking text with an empty signature.
+        yield thinking_chunk("reasoned", "")
+        # Delta 2: empty thinking text with the signature.
+        yield thinking_chunk("", "signed")
+        # Delta 3: content delta carrying finish_reason "stop".
+        yield SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(content="done", tool_calls=None),
+                    finish_reason="stop",
+                )
+            ],
+        )
+        # Final chunk: usage only, no choices.
+        yield SimpleNamespace(choices=[], usage=SimpleNamespace(total_tokens=3))
+
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+
+    def fake_chunk_builder(chunks, **_kwargs):
+        # All four streamed chunks must reach the builder.
+        assert len(chunks) == 4
+        # Hand back the merged block: text from delta 1, signature from delta 2.
+        return SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    message=Message(
+                        role="assistant",
+                        content="done",
+                        thinking_blocks=[
+                            {
+                                "type": "thinking",
+                                "thinking": "reasoned",
+                                "signature": "signed",
+                            }
+                        ],
+                        reasoning_content="reasoned",
+                    )
+                )
+            ]
+        )
+
+    events = []
+
+    async def send_event(event):
+        events.append(event)
+
+    # Stand-in session carrying config, cancellation flag and event sink.
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name=model),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    # Route the loop's LLM call and chunk rebuild through the fakes above.
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fake_chunk_builder)
+
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": model},
+    )
+
+    # The result must expose the single merged, signed thinking block.
+    assert result.content == "done"
+    assert result.thinking_blocks == [
+        {"type": "thinking", "thinking": "reasoned", "signature": "signed"}
+    ]
+    assert result.reasoning_content == "reasoned"
+
+
+@pytest.mark.asyncio
+async def test_streaming_call_skips_chunk_rebuild_for_non_anthropic(monkeypatch):
+    """A non-Anthropic model must not trigger ``stream_chunk_builder``."""
+    model = "openai/Qwen/Qwen3"
+    only_delta = SimpleNamespace(content="done", tool_calls=None)
+    only_choice = SimpleNamespace(delta=only_delta, finish_reason="stop")
+
+    async def fake_stream():
+        yield SimpleNamespace(choices=[only_choice])
+
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+
+    def fail_chunk_builder(*_args, **_kwargs):
+        # Raises if the loop ever calls the builder.
+        raise AssertionError("stream_chunk_builder should not run")
+
+    events = []
+
+    async def send_event(event):
+        events.append(event)
+
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name=model),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fail_chunk_builder)
+
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": model},
+    )
+
+    assert result.content == "done"
+    assert result.thinking_blocks is None
+    assert result.reasoning_content is None
diff --git a/tests/unit/test_user_quotas.py b/tests/unit/test_user_quotas.py
index 581179b07ae5f3eedde3ada6cff0df038e3e8fed..4475b1ebd52aa719c5fc1d91ede88212adbb9cbe 100644
--- a/tests/unit/test_user_quotas.py
+++ b/tests/unit/test_user_quotas.py
@@ -15,6 +15,7 @@ if str(_BACKEND_DIR) not in sys.path:
     sys.path.insert(0, str(_BACKEND_DIR))
 
 import user_quotas  # noqa: E402
+from agent.core.session_persistence import NoopSessionStore, _reset_store_for_tests  # noqa: E402
 
 
 @pytest.fixture(autouse=True)
@@ -74,6 +75,33 @@ async def test_concurrent_increments_under_lock_do_not_lose_writes():
     assert await user_quotas.get_claude_used_today("race") == 50
 
 
+@pytest.mark.asyncio
+async def test_try_increment_returns_none_at_cap():
+    """Cap of 1: first increment returns 1, the next None, usage stays at 1."""
+    tries = [await user_quotas.try_increment_claude("freebie", 1) for _ in range(2)]
+    assert (tries, await user_quotas.get_claude_used_today("freebie")) == ([1, None], 1)
+
+
+@pytest.mark.asyncio
+async def test_try_increment_delegates_cap_to_enabled_store():
+    """An enabled store decides the cap; ``_claude_counts`` gets no entry."""
+    class StoreAtCap(NoopSessionStore):
+        enabled = True
+
+        async def try_increment_quota(self, user_id: str, day: str, cap: int):
+            # The user id and cap passed by the test must arrive unchanged.
+            assert (user_id, cap) == ("mongo-user", 1)
+            return None
+
+        async def get_quota(self, user_id: str, day: str):
+            return 1
+
+    _reset_store_for_tests(StoreAtCap())
+    assert await user_quotas.try_increment_claude("mongo-user", 1) is None
+    assert await user_quotas.get_claude_used_today("mongo-user") == 1
+    assert "mongo-user" not in user_quotas._claude_counts
+
+
 @pytest.mark.asyncio
 async def test_refund_decrements_and_drops_entry_at_zero():
     await user_quotas.increment_claude("u1")
diff --git a/tests/unit/test_web_search_tool.py b/tests/unit/test_web_search_tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd243447141349b1c67e04d8a5c6367356da6674
--- /dev/null
+++ b/tests/unit/test_web_search_tool.py
@@ -0,0 +1,161 @@
+import json
+
+import pytest
+
+from agent.core.tools import create_builtin_tools
+from agent.tools import web_search_tool
+
+
+class _FakeResponse:
+    """Stand-in for an HTTP response that carries just ``text`` and ``url``."""
+    def __init__(self, text: str, url: str = "https://html.duckduckgo.com/html/?q=x"):
+        self.text, self.url = text, url
+
+
+def _content_block(output: dict):  # "content" of the first dict in output["results"]
+    return next(filter(lambda entry: isinstance(entry, dict), output["results"]))["content"]
+
+
+def test_web_search_extracts_duckduckgo_results_and_filters_domains(monkeypatch):
+    """Result anchors are parsed and the allow/block domain lists are applied."""
+    seen = {}
+
+    def fake_get(url, headers, timeout, allow_redirects):
+        # Record how the tool invoked requests.get.
+        seen["url"] = url
+        seen["user_agent"] = headers["User-Agent"]
+        seen["timeout"] = timeout
+        seen["allow_redirects"] = allow_redirects
+        return _FakeResponse(
+            """
+            <html><body>
+              <a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
+              <a class="result__a" href="https://example.com/blocked">Blocked result</a>
+            </body></html>
+            """,
+            url,
+        )
+
+    monkeypatch.setenv(web_search_tool.WEB_SEARCH_BASE_URL_ENV, "http://search.test/search")
+    monkeypatch.setattr(web_search_tool.requests, "get", fake_get)
+
+    # Filters are passed in URL form with mixed case.
+    output = web_search_tool.execute_web_search(
+        "rust web search",
+        allowed_domains=["https://DOCS.rs/"],
+        blocked_domains=["HTTPS://EXAMPLE.COM"],
+    )
+
+    expected_request = {
+        "url": "http://search.test/search?q=rust+web+search",
+        "user_agent": "clawd-rust-tools/0.1",
+        "timeout": 20,
+        "allow_redirects": True,
+    }
+    assert seen == expected_request
+    assert output["query"] == "rust web search"
+    hits = [{"title": "Reqwest docs", "url": "https://docs.rs/reqwest"}]
+    assert _content_block(output) == hits
+    # The first results entry is expected to contain the sourcing instruction.
+    assert "Include a Sources section" in output["results"][0]
+
+
+def test_web_search_decodes_duckduckgo_redirects():
+    """A ``/l/?uddg=`` redirect href is decoded to the target URL."""
+    parsed = web_search_tool.extract_search_hits(
+        """
+        <a class="result__a"
+           href="/l/?uddg=https%3A%2F%2Fexample.org%2Fpaper%3Fx%3D1&amp;rut=abc">
+          Example Paper
+        </a>
+        """
+    )
+
+    expected = web_search_tool.SearchHit(
+        title="Example Paper", url="https://example.org/paper?x=1"
+    )
+    # The anchor text is expected back without its surrounding whitespace.
+    assert parsed == [expected]
+
+
+def test_web_search_generic_fallback_dedupes_and_rejects_bad_base_url(monkeypatch):
+    """Plain anchors are deduplicated by URL; a malformed base URL raises."""
+    def fake_get(url, headers, timeout, allow_redirects):
+        return _FakeResponse(
+            """
+            <html><body>
+              <a href="https://example.com/one">Example One</a>
+              <a href="https://example.com/one">Duplicate Example One</a>
+              <a href="https://docs.rs/tokio">Tokio Docs</a>
+            </body></html>
+            """,
+            url,
+        )
+
+    env_name = web_search_tool.WEB_SEARCH_BASE_URL_ENV
+    monkeypatch.setattr(web_search_tool.requests, "get", fake_get)
+    monkeypatch.setenv(env_name, "http://search.test/fallback")
+    output = web_search_tool.execute_web_search("generic links")
+    # The second anchor repeats the first URL and is expected to be dropped.
+    assert _content_block(output) == [
+        {"title": "Example One", "url": "https://example.com/one"},
+        {"title": "Tokio Docs", "url": "https://docs.rs/tokio"},
+    ]
+    monkeypatch.setenv(env_name, "://bad-base-url")
+    with pytest.raises(ValueError):
+        web_search_tool.execute_web_search("generic links")
+
+
+@pytest.mark.asyncio
+async def test_web_search_handler_returns_pretty_json(monkeypatch):
+    """Handler validates its arguments and returns the search output as JSON."""
+    to_thread_calls = []
+    handler = web_search_tool.web_search_handler
+
+    async def fake_to_thread(func, /, *args, **kwargs):
+        # Run inline while recording what was handed to asyncio.to_thread.
+        to_thread_calls.append((func, args, kwargs))
+        return func(*args, **kwargs)
+
+    def fake_execute(**kwargs):
+        # Canned payload echoing the requested query.
+        empty = ["No web search results matched the query 'x'.", {"content": []}]
+        return {"query": kwargs["query"], "results": empty, "durationSeconds": 0.1}
+
+    monkeypatch.setattr(web_search_tool, "execute_web_search", fake_execute)
+    monkeypatch.setattr(web_search_tool.asyncio, "to_thread", fake_to_thread)
+
+    # A one-character query is rejected.
+    text, ok = await handler({"query": "x"})
+
+    assert ok is False
+    assert "at least 2 characters" in text
+
+    # Valid query: JSON output, executed via to_thread with the tool call id.
+    text, ok = await handler({"query": "valid query"}, tool_call_id="call_123")
+
+    assert ok is True
+    assert json.loads(text)["query"] == "valid query"
+    recorded_func, _, recorded_kwargs = to_thread_calls[0]
+    assert recorded_func is web_search_tool.execute_web_search
+    assert recorded_kwargs["tool_use_id"] == "call_123"
+
+    # allowed_domains given as a bare string is rejected.
+    text, ok = await handler({"query": "valid query", "allowed_domains": "docs.rs"})
+
+    assert ok is False
+    assert "allowed_domains must be an array of strings" in text
+
+    # A null query is rejected as well.
+    text, ok = await handler({"query": None})
+
+    assert ok is False
+    assert "query string" in text
+
+
+def test_web_search_is_registered_for_llm():
+    """The builtin tool list includes ``web_search`` requiring only ``query``."""
+    specs = create_builtin_tools(local_mode=True)
+    matches = [spec for spec in specs if spec.name == "web_search"]
+    assert matches
+    assert matches[-1].parameters["required"] == ["query"]
diff --git a/uv.lock b/uv.lock
index 3bddba0dc09a053b29c06189c50237fc0fb23de2..73df668c3f519dee440254013cbd65b6ba6b986e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1006,31 +1006,34 @@ wheels = [
 
 [[package]]
 name = "hf-xet"
-version = "1.2.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/40/e2e0a7eb9a51fe8828ba2d47fe22a7e74914ea8a0db68a18c3aa7449c767/hf_xet-1.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b70218dd548e9840224df5638fdc94bd033552963cfa97f9170829381179c813", size = 2717584, upload-time = "2025-10-24T19:04:09.586Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/7d/daf7f8bc4594fdd59a8a596f9e3886133fdc68e675292218a5e4c1b7e834/hf_xet-1.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d40b18769bb9a8bc82a9ede575ce1a44c75eb80e7375a01d76259089529b5dc", size = 3315004, upload-time = "2025-10-24T19:04:00.314Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/ba/45ea2f605fbf6d81c8b21e4d970b168b18a53515923010c312c06cd83164/hf_xet-1.2.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd3a6027d59cfb60177c12d6424e31f4b5ff13d8e3a1247b3a584bf8977e6df5", size = 3222636, upload-time = "2025-10-24T19:03:58.111Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" },
-    { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" },
-    { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" },
-    { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" },
-    { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" },
-    { url = "https://files.pythonhosted.org/packages/46/92/3f7ec4a1b6a65bf45b059b6d4a5d38988f63e193056de2f420137e3c3244/hf_xet-1.2.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d06fa97c8562fb3ee7a378dd9b51e343bc5bc8190254202c9771029152f5e08c", size = 3229054, upload-time = "2025-10-24T19:04:01.949Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/dd/7ac658d54b9fb7999a0ccb07ad863b413cbaf5cf172f48ebcd9497ec7263/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4c1428c9ae73ec0939410ec73023c4f842927f39db09b063b9482dac5a3bb737", size = 3413812, upload-time = "2025-10-24T19:04:24.585Z" },
-    { url = "https://files.pythonhosted.org/packages/92/68/89ac4e5b12a9ff6286a12174c8538a5930e2ed662091dd2572bbe0a18c8a/hf_xet-1.2.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a55558084c16b09b5ed32ab9ed38421e2d87cf3f1f89815764d1177081b99865", size = 3508920, upload-time = "2025-10-24T19:04:26.927Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
+version = "1.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/53/92/ec9ad04d0b5728dca387a45af7bc98fbb0d73b2118759f5f6038b61a57e8/hf_xet-1.4.3.tar.gz", hash = "sha256:8ddedb73c8c08928c793df2f3401ec26f95be7f7e516a7bee2fbb546f6676113", size = 670477, upload-time = "2026-03-31T22:40:07.874Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/72/43/724d307b34e353da0abd476e02f72f735cdd2bc86082dee1b32ea0bfee1d/hf_xet-1.4.3-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7551659ba4f1e1074e9623996f28c3873682530aee0a846b7f2f066239228144", size = 3800935, upload-time = "2026-03-31T22:39:49.618Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/d2/8bee5996b699262edb87dbb54118d287c0e1b2fc78af7cdc41857ba5e3c4/hf_xet-1.4.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:bee693ada985e7045997f05f081d0e12c4c08bd7626dc397f8a7c487e6c04f7f", size = 3558942, upload-time = "2026-03-31T22:39:47.938Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/a1/e993d09cbe251196fb60812b09a58901c468127b7259d2bf0f68bf6088eb/hf_xet-1.4.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:21644b404bb0100fe3857892f752c4d09642586fd988e61501c95bbf44b393a3", size = 4207657, upload-time = "2026-03-31T22:39:39.69Z" },
+    { url = "https://files.pythonhosted.org/packages/64/44/9eb6d21e5c34c63e5e399803a6932fa983cabdf47c0ecbcfe7ea97684b8c/hf_xet-1.4.3-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:987f09cfe418237812896a6736b81b1af02a3a6dcb4b4944425c4c4fca7a7cf8", size = 3986765, upload-time = "2026-03-31T22:39:37.936Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/7b/8ad6f16fdb82f5f7284a34b5ec48645bd575bdcd2f6f0d1644775909c486/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:60cf7fc43a99da0a853345cf86d23738c03983ee5249613a6305d3e57a5dca74", size = 4188162, upload-time = "2026-03-31T22:39:58.382Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/c4/39d6e136cbeea9ca5a23aad4b33024319222adbdc059ebcda5fc7d9d5ff4/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2815a49a7a59f3e2edf0cf113ae88e8cb2ca2a221bf353fb60c609584f4884d4", size = 4424525, upload-time = "2026-03-31T22:40:00.225Z" },
+    { url = "https://files.pythonhosted.org/packages/46/f2/adc32dae6bdbc367853118b9878139ac869419a4ae7ba07185dc31251b76/hf_xet-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:42ee323265f1e6a81b0e11094564fb7f7e0ec75b5105ffd91ae63f403a11931b", size = 3671610, upload-time = "2026-03-31T22:40:10.42Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/19/25d897dcc3f81953e0c2cde9ec186c7a0fee413eb0c9a7a9130d87d94d3a/hf_xet-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:27c976ba60079fb8217f485b9c5c7fcd21c90b0367753805f87cb9f3cdc4418a", size = 3528529, upload-time = "2026-03-31T22:40:09.106Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/36/3e8f85ca9fe09b8de2b2e10c63b3b3353d7dda88a0b3d426dffbe7b8313b/hf_xet-1.4.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5251d5ece3a81815bae9abab41cf7ddb7bcb8f56411bce0827f4a3071c92fdc6", size = 3801019, upload-time = "2026-03-31T22:39:56.651Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/9c/defb6cb1de28bccb7bd8d95f6e60f72a3d3fa4cb3d0329c26fb9a488bfe7/hf_xet-1.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1feb0f3abeacee143367c326a128a2e2b60868ec12a36c225afb1d6c5a05e6d2", size = 3558746, upload-time = "2026-03-31T22:39:54.766Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/bd/8d001191893178ff8e826e46ad5299446e62b93cd164e17b0ffea08832ec/hf_xet-1.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b301fc150290ca90b4fccd079829b84bb4786747584ae08b94b4577d82fb791", size = 4207692, upload-time = "2026-03-31T22:39:46.246Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/48/6790b402803250e9936435613d3a78b9aaeee7973439f0918848dde58309/hf_xet-1.4.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:d972fbe95ddc0d3c0fc49b31a8a69f47db35c1e3699bf316421705741aab6653", size = 3986281, upload-time = "2026-03-31T22:39:44.648Z" },
+    { url = "https://files.pythonhosted.org/packages/51/56/ea62552fe53db652a9099eda600b032d75554d0e86c12a73824bfedef88b/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c5b48db1ee344a805a1b9bd2cda9b6b65fe77ed3787bd6e87ad5521141d317cd", size = 4187414, upload-time = "2026-03-31T22:40:04.951Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/f5/bc1456d4638061bea997e6d2db60a1a613d7b200e0755965ec312dc1ef79/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:22bdc1f5fb8b15bf2831440b91d1c9bbceeb7e10c81a12e8d75889996a5c9da8", size = 4424368, upload-time = "2026-03-31T22:40:06.347Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/76/ab597bae87e1f06d18d3ecb8ed7f0d3c9a37037fc32ce76233d369273c64/hf_xet-1.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:0392c79b7cf48418cd61478c1a925246cf10639f4cd9d94368d8ca1e8df9ea07", size = 3672280, upload-time = "2026-03-31T22:40:16.401Z" },
+    { url = "https://files.pythonhosted.org/packages/62/05/2e462d34e23a09a74d73785dbed71cc5dbad82a72eee2ad60a72a554155d/hf_xet-1.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:681c92a07796325778a79d76c67011764ecc9042a8c3579332b61b63ae512075", size = 3528945, upload-time = "2026-03-31T22:40:14.995Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/9f/9c23e4a447b8f83120798f9279d0297a4d1360bdbf59ef49ebec78fe2545/hf_xet-1.4.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d0da85329eaf196e03e90b84c2d0aca53bd4573d097a75f99609e80775f98025", size = 3805048, upload-time = "2026-03-31T22:39:53.105Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/f8/7aacb8e5f4a7899d39c787b5984e912e6c18b11be136ef13947d7a66d265/hf_xet-1.4.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e23717ce4186b265f69afa66e6f0069fe7efbf331546f5c313d00e123dc84583", size = 3562178, upload-time = "2026-03-31T22:39:51.295Z" },
+    { url = "https://files.pythonhosted.org/packages/df/9a/a24b26dc8a65f0ecc0fe5be981a19e61e7ca963b85e062c083f3a9100529/hf_xet-1.4.3-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc360b70c815bf340ed56c7b8c63aacf11762a4b099b2fe2c9bd6d6068668c08", size = 4212320, upload-time = "2026-03-31T22:39:42.922Z" },
+    { url = "https://files.pythonhosted.org/packages/53/60/46d493db155d2ee2801b71fb1b0fd67696359047fdd8caee2c914cc50c79/hf_xet-1.4.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:39f2d2e9654cd9b4319885733993807aab6de9dfbd34c42f0b78338d6617421f", size = 3991546, upload-time = "2026-03-31T22:39:41.335Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/f5/067363e1c96c6b17256910830d1b54099d06287e10f4ec6ec4e7e08371fc/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:49ad8a8cead2b56051aa84d7fce3e1335efe68df3cf6c058f22a65513885baac", size = 4193200, upload-time = "2026-03-31T22:40:01.936Z" },
+    { url = "https://files.pythonhosted.org/packages/42/4b/53951592882d9c23080c7644542fda34a3813104e9e11fa1a7d82d419cb8/hf_xet-1.4.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7716d62015477a70ea272d2d68cd7cad140f61c52ee452e133e139abfe2c17ba", size = 4429392, upload-time = "2026-03-31T22:40:03.492Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/21/75a6c175b4e79662ad8e62f46a40ce341d8d6b206b06b4320d07d55b188c/hf_xet-1.4.3-cp37-abi3-win_amd64.whl", hash = "sha256:6b591fcad34e272a5b02607485e4f2a1334aebf1bc6d16ce8eb1eb8978ac2021", size = 3677359, upload-time = "2026-03-31T22:40:13.619Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/7c/44314ecd0e89f8b2b51c9d9e5e7a60a9c1c82024ac471d415860557d3cd8/hf_xet-1.4.3-cp37-abi3-win_arm64.whl", hash = "sha256:7c2c7e20bcfcc946dc67187c203463f5e932e395845d098cc2a93f5b67ca0b47", size = 3533664, upload-time = "2026-03-31T22:40:12.152Z" },
 ]
 
 [[package]]
@@ -1108,7 +1111,7 @@ wheels = [
 
 [[package]]
 name = "huggingface-hub"
-version = "1.1.5"
+version = "1.12.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -1117,14 +1120,13 @@ dependencies = [
     { name = "httpx" },
     { name = "packaging" },
     { name = "pyyaml" },
-    { name = "shellingham" },
     { name = "tqdm" },
-    { name = "typer-slim" },
+    { name = "typer" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fb/02/c3d534d7498ba2792da1d2ce56b5d38bbcbcbbba62071c90ee289b408e8d/huggingface_hub-1.1.5.tar.gz", hash = "sha256:40ba5c9a08792d888fde6088920a0a71ab3cd9d5e6617c81a797c657f1fd9968", size = 607199, upload-time = "2025-11-20T15:49:32.809Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/56/52/1b54cb569509c725a32c1315261ac9fd0e6b91bbbf74d86fca10d3376164/huggingface_hub-1.12.0.tar.gz", hash = "sha256:7c3fe85e24b652334e5d456d7a812cd9a071e75630fac4365d9165ab5e4a34b6", size = 763091, upload-time = "2026-04-24T13:32:08.674Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/35/f4/124858007ddf3c61e9b144107304c9152fa80b5b6c168da07d86fe583cc1/huggingface_hub-1.1.5-py3-none-any.whl", hash = "sha256:e88ecc129011f37b868586bbcfae6c56868cae80cd56a79d61575426a3aa0d7d", size = 516000, upload-time = "2025-11-20T15:49:30.926Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/2b/ef03ddb96bd1123503c2bd6932001020292deea649e9bf4caa2cb65a85bf/huggingface_hub-1.12.0-py3-none-any.whl", hash = "sha256:d74939969585ee35748bd66de09baf84099d461bda7287cd9043bfb99b0e424d", size = 646806, upload-time = "2026-04-24T13:32:06.717Z" },
 ]
 
 [[package]]
@@ -1784,6 +1786,7 @@ dependencies = [
     { name = "nbformat" },
     { name = "prompt-toolkit" },
     { name = "pydantic" },
+    { name = "pymongo" },
     { name = "python-dotenv" },
     { name = "requests" },
     { name = "rich" },
@@ -1822,7 +1825,7 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.115.0" },
     { name = "fastmcp", specifier = ">=3.2.0" },
     { name = "httpx", specifier = ">=0.27.0" },
-    { name = "huggingface-hub", specifier = ">=1.0.1" },
+    { name = "huggingface-hub", specifier = ">=1.12.0" },
     { name = "inspect-ai", marker = "extra == 'eval'", specifier = ">=0.3.149" },
     { name = "litellm", specifier = ">=1.83.0" },
     { name = "ml-intern", extras = ["eval", "dev"], marker = "extra == 'all'" },
@@ -1831,8 +1834,9 @@ requires-dist = [
     { name = "pandas", marker = "extra == 'eval'", specifier = ">=2.3.3" },
     { name = "prompt-toolkit", specifier = ">=3.0.0" },
     { name = "pydantic", specifier = ">=2.12.3" },
+    { name = "pymongo", specifier = ">=4.17.0" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
-    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.26.0" },
+    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.2.0" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "requests", specifier = ">=2.33.0" },
     { name = "rich", specifier = ">=13.0.0" },
@@ -2767,6 +2771,67 @@ crypto = [
     { name = "cryptography" },
 ]
 
+[[package]]
+name = "pymongo"
+version = "4.17.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dnspython" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ca/64/50be6fbac9c79fe2e4c17401a467da2d8764d82833d83cec325afe5cab32/pymongo-4.17.0.tar.gz", hash = "sha256:70ffa08ba641468cc068cf46c06b34f01a8ce3489f6411309fcb5ceabe6b2fc0", size = 2523370, upload-time = "2026-04-20T16:39:53.524Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/e2/336d86f221cf1b56b2ed9330d4a3b98f9f38f0b37829ae9a9184617d5419/pymongo-4.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4141e6c6a339789b2974efa00ecd9409101672d77a0e3ee2cc3839eedf8ec4df", size = 874668, upload-time = "2026-04-20T16:37:41.39Z" },
+    { url = "https://files.pythonhosted.org/packages/34/8e/75d3c6c935d187ab59c61e9c15d9aab3f274b563eaf1706e8cae5f508dec/pymongo-4.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e68c76b84e0c132d9dbf9307f12ff8185702328187a87b9aca8c941303873433", size = 875294, upload-time = "2026-04-20T16:37:43.432Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/ec/62e855744489dbcd54fd778aae4d80fa4c4819e8fb228ca0cf6f21a03997/pymongo-4.17.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ba2195d4f386f839a52a23ea1cfd60ffaaba78a3d7841db51b7e433001139918", size = 1496233, upload-time = "2026-04-20T16:37:45.518Z" },
+    { url = "https://files.pythonhosted.org/packages/82/e8/93e4e5e5ce8fdf8929dabeefe24aafa5ce046028eed0dfa8eeb936e72c49/pymongo-4.17.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8446ff4bfcb6ec2a2e50998c860986a1e992136f998b7f53e7a717fb8aa5a0b9", size = 1522927, upload-time = "2026-04-20T16:37:47.492Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/ca/425dc1d21e0f17bdea0072fc463f662f7fa06d2852af52975c9eced3c07c/pymongo-4.17.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2a0d5ac205728c86e0a02192f1aa5f865b0d7d51f8df6101c01a69a7fc620d72", size = 1583468, upload-time = "2026-04-20T16:37:49.221Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/9d/f08b07eeffda1a43c1759f0fa625e88ae12360996eb56d42aad832fa7dff/pymongo-4.17.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:485c8a8eaa4c739f00a331fc73757898ee7c092c214a79e63866ff76aaf282ff", size = 1572787, upload-time = "2026-04-20T16:37:51.061Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/c2/6855a07aafa7b894929af23675b6fb9634800ce43122b76a62f6eeb8da2a/pymongo-4.17.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b2dfcc795f5b9fedbe179a11fdf6051581479d196582a3fe819a92a00e9b9969", size = 1526184, upload-time = "2026-04-20T16:37:53.358Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/05/c952bac7db71c1942ea3559fcd308b49754cc5004b455935fb4000d1f37b/pymongo-4.17.0-cp311-cp311-win32.whl", hash = "sha256:c2292144505fb12156b981bd440f3dc994a883da06ac726c0c8692ccdbc1c510", size = 852621, upload-time = "2026-04-20T16:37:55.28Z" },
+    { url = "https://files.pythonhosted.org/packages/11/c0/c04da9f4c0c6252404598f4e394b862a58a9e866822a70ae261c8a018fdf/pymongo-4.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:2e190827834fce70ecdf9d46796c6dbc0ce08ea87dc2ff5bc6f3f5579b605cb9", size = 867852, upload-time = "2026-04-20T16:37:57.233Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/b2/c7b4870fbeef471e947d3e014676f5910d02e0197074d692ebcf24ec049a/pymongo-4.17.0-cp311-cp311-win_arm64.whl", hash = "sha256:a8f9c40a09bb7d4b9fc8b1da65ecf6efa79bda5cb2756f39d9b6940fac1d19ae", size = 855019, upload-time = "2026-04-20T16:37:58.983Z" },
+    { url = "https://files.pythonhosted.org/packages/98/90/60bcb508840135d5ee46b51b1a950f548338aa8145a8366dbe6639ae51ac/pymongo-4.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53ffa94b2340dbf6b055e09a0090618c60482c158ecfc9565642fc996bf0944", size = 930529, upload-time = "2026-04-20T16:38:00.936Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/e9/313840f1e52c6dfac47f704428cbfbce59956ebe7633bffc92b03f74f0ad/pymongo-4.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6fe0de9d0f6791abce3471230b32b4817bf89d27b1182b6a550e1ec0fa72aa9a", size = 930665, upload-time = "2026-04-20T16:38:02.915Z" },
+    { url = "https://files.pythonhosted.org/packages/78/35/9d3565ea45b1606f635c1e2cd2563c28d66caafdc50f7ad7d979fcd1b363/pymongo-4.17.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e537e95514dae1aaa718f481ec03151a0f0394bcd05f1322896d8fc1330cb729", size = 1762369, upload-time = "2026-04-20T16:38:05.375Z" },
+    { url = "https://files.pythonhosted.org/packages/95/ee/149b0d4b1a11c38bff6f14c23d5814c9b0843fd6dc38ad40596bdb1a62d2/pymongo-4.17.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:37a8385c29881b43eab31f584100fa0eaddedd5607adf010147ba1810118be90", size = 1798044, upload-time = "2026-04-20T16:38:07.195Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/d4/4cee4a7b8d8f6f0550ef6cd2fea42455c5ed619a220cb6ba4fb40d6a5bc8/pymongo-4.17.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f3ee3d241ed77a4fc99ce3cff3b289c3ebce37f61fdd7349d3592c23b82c8784", size = 1878567, upload-time = "2026-04-20T16:38:09.121Z" },
+    { url = "https://files.pythonhosted.org/packages/45/ef/7fe366c84952619ee2f69973566c214775e083dd4df465751912153e4b72/pymongo-4.17.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:9eb5d63a3c518cb0804ed678f5e2b875af032d89a7cf57a57360322cf6a4d222", size = 1864881, upload-time = "2026-04-20T16:38:10.896Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/35/b577d82c6d1be7aee7ac7e249bc86f7847998345042e5f8360de238e177b/pymongo-4.17.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e97e03fa13327c87e3fdc5656acd01e71817f0c1dc3221cd8f30de136bf4ec3", size = 1800349, upload-time = "2026-04-20T16:38:13.589Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/69/dafcf04f66e130ddd91aeb92e7a692480eda46dcd04ec1dbe82c06619e10/pymongo-4.17.0-cp312-cp312-win32.whl", hash = "sha256:6877214bff5f06f6884a9fc8d9016a4a7a5f51f537f5c51ac3a576f93e7dfb32", size = 900518, upload-time = "2026-04-20T16:38:15.541Z" },
+    { url = "https://files.pythonhosted.org/packages/11/35/5c9262a459f988b4eb2605f70815240b77a0d4131136c4326d18f1822b89/pymongo-4.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:9828485f72f63c7d802e0ec41f71906f633c2692621ab3af55ca990186b091b1", size = 920335, upload-time = "2026-04-20T16:38:17.665Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/da/e9c7265ee176faccf4e52c4797837e794d93569a1046f6b19a4acc36e5ad/pymongo-4.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:1195370a77baf003b59b10e91ecc4706297197f0dd9d29c840cc556dc08f7cee", size = 903289, upload-time = "2026-04-20T16:38:19.33Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/6b/c1206879708b94e82fcd8b9653440ec271f79a3674d122192df383047f5a/pymongo-4.17.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:809ec74de3b9148ae43fa8df9faf53470f511c8d384f13b99d6f671f2a379f15", size = 985829, upload-time = "2026-04-20T16:38:21.031Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/cf/bb044ed85160e5c40f568c7c4f4e8ea16f40764ff5d302e5befbe8f6f814/pymongo-4.17.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a431b737816bf4cddd4fa0fcef04e424ad36b7692734a64150f872fb8f3208be", size = 985899, upload-time = "2026-04-20T16:38:23.409Z" },
+    { url = "https://files.pythonhosted.org/packages/74/0a/f6dfd5ea3901e5d6888da8de8ba728971a1d447debab681cfc56f90d1208/pymongo-4.17.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e4fab10f8403169ce92f3cea921609d9ee81107306caae06c08f592d4b8ad2b5", size = 2028569, upload-time = "2026-04-20T16:38:25.343Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/c5/081f59a1c02ae8c0dc73ae58e563838c44eec81aeafa7d0b93a637841c9b/pymongo-4.17.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20323b0b1c1d33770ad1fc68d429c757734ce9ad3594421c3d6618f10572b1b9", size = 2072916, upload-time = "2026-04-20T16:38:27.291Z" },
+    { url = "https://files.pythonhosted.org/packages/31/42/6e41d434297ffe8b30d9c3717916591a4a7be9075a0dcc2fafdfaaaa62ed/pymongo-4.17.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5a5de048e6da5c18e27cc2437e8c15b3b0cdc8385c15b41178b0caa3322a09c2", size = 2173234, upload-time = "2026-04-20T16:38:29.474Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/cf/1e4a7db352ef9485831c7268dfe8402f0117b32a9ad54b16e810699e3617/pymongo-4.17.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dff3de1294fbbc1db0ba6b511f77b8e540601d092538a31312e99c8a91a78b1e", size = 2156784, upload-time = "2026-04-20T16:38:32.134Z" },
+    { url = "https://files.pythonhosted.org/packages/12/10/6195be29962a61ebb5f4bd9e4c7519890b172f7968a0a0d880398c6ddb02/pymongo-4.17.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:faf03e4c2aafd6de626dbd30ba246d369ae33f47f10629d1bbe40f72115027a6", size = 2074446, upload-time = "2026-04-20T16:38:34.004Z" },
+    { url = "https://files.pythonhosted.org/packages/37/48/33410b8819837ed370c738587306bdf060b59cef11823be212f4a07703c5/pymongo-4.17.0-cp313-cp313-win32.whl", hash = "sha256:c9786665926a09630c5d420c79762cfadbff35a9438bcbc4c81a9fb5ab9228b7", size = 948435, upload-time = "2026-04-20T16:38:35.922Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/77/c0ed522f798a286b99acaa7914ed8d9c80ab091f97f57c59ffed72906e5e/pymongo-4.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:5960519b4d7168f1ecdd3ea10c81b2aedeb9423651aca953cfbc8e76705d3b38", size = 972847, upload-time = "2026-04-20T16:38:37.888Z" },
+    { url = "https://files.pythonhosted.org/packages/97/f0/c39480a2db385fde23861d0c8acda41cdaf1d43e46579db72c5c013a2e81/pymongo-4.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:0ff6bd2f735ab5356541e3e57d5b7dbfbc3f2ee1ccb10b6b0f82d58af69d1d8e", size = 951575, upload-time = "2026-04-20T16:38:40.544Z" },
+    { url = "https://files.pythonhosted.org/packages/da/49/2b0250762a89737ed6f9cea238331baca061b89a8ddd10dd17fee52c3970/pymongo-4.17.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:ff5aa3f1c7e3f08eb0e7a016c91ba468b1850ccfd63d9b1f12f56350f4974cef", size = 1040945, upload-time = "2026-04-20T16:38:42.783Z" },
+    { url = "https://files.pythonhosted.org/packages/89/1c/7a9b5447a08be20e84b6e5b17330917e8d6d9507daa3cd099a9309f11ad7/pymongo-4.17.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e816db649ba5d7de0568cf3a9f287a9dc9aad21cf0ca667ab156a7ef47fca0b0", size = 1041187, upload-time = "2026-04-20T16:38:45.358Z" },
+    { url = "https://files.pythonhosted.org/packages/78/a1/71704f61632dfc90407a5834fe5f6132854937c4a3648f6c05c351d85a45/pymongo-4.17.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:12c4fded3a9f1d6a687e36ebd384ac6d00b9b00de1969aa74048e7051ec2a713", size = 2294806, upload-time = "2026-04-20T16:38:47.734Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/b9/aff42be75108b96c2469b1d9329b912c15108f3e7ef32fdc86da8423c330/pymongo-4.17.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2db66aa8dd253a0fc1fad3b0d23d5b3993f7ebde02fbbd7727128debf2853675", size = 2348231, upload-time = "2026-04-20T16:38:50.371Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/30/44c115b8ba1479942c15fd9480eb29a7da0ba68acd56983423ba0deb4a94/pymongo-4.17.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3987e96e7c7be4083d42e8ac2cc6c0d5b78db9973c90fce42ae800b616ca6b20", size = 2467614, upload-time = "2026-04-20T16:38:52.665Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/84/21ee95c8bf0ca7acae7ec7eb365d740bf8fc0156c194baf2c3bdfcb85ec0/pymongo-4.17.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:cee36b3c0d0354f880fa7a7fdcdaf2bb5e542c2281e25c1bfadf8cfe21eba7d2", size = 2445970, upload-time = "2026-04-20T16:38:55.175Z" },
+    { url = "https://files.pythonhosted.org/packages/06/89/081d7f1809d5ca09d1e47e49f2111b245f5694de3a7af32cd3a353a6f43f/pymongo-4.17.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:320b34457b20bbcc79997801f95d25ce00472915ca5241167242b42c4359e027", size = 2348605, upload-time = "2026-04-20T16:38:57.557Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/c3/0d949f9d3f2a341c1f635c398c16615e96f89f51ff424ed81e914cf1a4de/pymongo-4.17.0-cp314-cp314-win32.whl", hash = "sha256:df4a644af9ae132d4bfdb2e9516ea51a615fd881caddfbfbd071cf1354844479", size = 1004119, upload-time = "2026-04-20T16:39:00.309Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/55/5c3a3db1048054c695c75c5964cc8bedc2247fdb5a75ef6fab4ec8bb013e/pymongo-4.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:c797f8a80957134f6dd9690367a0f8f5906d672119af2c6aa55f0c527b656bed", size = 1032314, upload-time = "2026-04-20T16:39:02.665Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/19/e235f39906134cb0ffd5574c5a59c355ef5380f0499644ab94994afbb109/pymongo-4.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:68fca71e05ee5da23a8d73cee8379dfb3d26e609a377cae731d742771ed96946", size = 1007627, upload-time = "2026-04-20T16:39:04.678Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/e0/c4c1a86791415b14c684fa0908f9da96de91594a3fd1fa1b8dc689fbb800/pymongo-4.17.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b4384700cffc3f1dd98e088bc0072dedf6d7d68a230bb4b972665cf69c071c1e", size = 1099151, upload-time = "2026-04-20T16:39:06.969Z" },
+    { url = "https://files.pythonhosted.org/packages/81/4b/69c67f3e23fd9b23b9bedc7ebd23754881cc9d5c5d5b2a9811e96b07f475/pymongo-4.17.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:93641192644fa1ee0f34030e774fd31022a27ad11ba22cb1716142231524f8bd", size = 1099346, upload-time = "2026-04-20T16:39:08.996Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/19/a5208f62f9508a26d73acc69bd3821b8c8adae253679a3c26d2f9652f0d5/pymongo-4.17.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:75bc3aa5b94fdb7138d357ec6ca61cd97e0c79f4f7f0bd3efe9639b15cc50942", size = 2619034, upload-time = "2026-04-20T16:39:11.049Z" },
+    { url = "https://files.pythonhosted.org/packages/77/27/426cba1ec5973082a56d4150798529bfdf4151c31391ed1fbbecb23ef2ac/pymongo-4.17.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50e8f8e23c6df7c6d6929f5e734980b227706e73ee847517c9ba5af90f7fc466", size = 2689939, upload-time = "2026-04-20T16:39:13.617Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/2e/f70993d1255e33f6ee59a4ec4371cc65bff7a7e3fda7d55c3386f25287e8/pymongo-4.17.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:15d3f3d732aecac1f8d481bde4029755615639bd3076f258a2147210aec8515a", size = 2824994, upload-time = "2026-04-20T16:39:16.057Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/eb/87b0e988ba889e1fcc3430c2cfc166b251872c813e92b43174298bee17ff/pymongo-4.17.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c5f62862d0f87be481fa1fe8cb811994486773c94a2b61e509285e3f2890763", size = 2801745, upload-time = "2026-04-20T16:39:18.476Z" },
+    { url = "https://files.pythonhosted.org/packages/67/4c/3f83412d086f682d4d468761d66ddc49cf161e786ea74073045eb4491c60/pymongo-4.17.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64837adbbd72073301af51bb0fc80e3d7707fe5527cea1033ba0320f0b2f881b", size = 2684636, upload-time = "2026-04-20T16:39:20.878Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/d8/b75f6f4ab6c8beb50b0270a4f1e2530b5774f5e116563440e1677ca1820f/pymongo-4.17.0-cp314-cp314t-win32.whl", hash = "sha256:b93b22eedc62598cf5ee9d8c8007a8e9121c50fd88137012d8985500e9dc3151", size = 1056356, upload-time = "2026-04-20T16:39:22.996Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/5e/648c8a238eef18a25ed8a169ea6542d4a860bbec3e95b3d9badac2935c71/pymongo-4.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:3689ea34f6b647c7d1e7bdc60fcfb214b2789ed1359a7fb96569c69f50e5f18f", size = 1090964, upload-time = "2026-04-20T16:39:24.989Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/cb/d9780b66939c4fc1f024bcc7be23a2abcfe06a9745ca8fa76dc73395482e/pymongo-4.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9543d8f84c2e5608565c08ac679774811e6730770d8a645439b073422a4276fb", size = 1058526, upload-time = "2026-04-20T16:39:27.924Z" },
+]
+
 [[package]]
 name = "pyperclip"
 version = "1.11.0"
@@ -3607,16 +3672,18 @@ wheels = [
 ]
 
 [[package]]
-name = "typer-slim"
-version = "0.20.0"
+name = "typer"
+version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "annotated-doc" },
     { name = "click" },
-    { name = "typing-extensions" },
+    { name = "rich" },
+    { name = "shellingham" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8e/45/81b94a52caed434b94da65729c03ad0fb7665fab0f7db9ee54c94e541403/typer_slim-0.20.0.tar.gz", hash = "sha256:9fc6607b3c6c20f5c33ea9590cbeb17848667c51feee27d9e314a579ab07d1a3", size = 106561, upload-time = "2025-10-20T17:03:46.642Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7b/27/ede8cec7596e0041ba7e7b80b47d132562f56ff454313a16f6084e555c9f/typer-0.25.0.tar.gz", hash = "sha256:123eaf9f19bb40fd268310e12a542c0c6b4fab9c98d9d23342a01ff95e3ce930", size = 120150, upload-time = "2026-04-26T08:46:14.767Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5e/dd/5cbf31f402f1cc0ab087c94d4669cfa55bd1e818688b910631e131d74e75/typer_slim-0.20.0-py3-none-any.whl", hash = "sha256:f42a9b7571a12b97dddf364745d29f12221865acef7a2680065f9bb29c7dc89d", size = 47087, upload-time = "2025-10-20T17:03:44.546Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/72/193d4e586ec5a4db834a36bbeb47641a62f951f114ffd0fe5b1b46e8d56f/typer-0.25.0-py3-none-any.whl", hash = "sha256:ac01b48823d3db9a83c9e164338057eadbb1c9957a2a6b4eeb486669c560b5dc", size = 55993, upload-time = "2026-04-26T08:46:15.889Z" },
 ]
 
 [[package]]