ml-intern

Sleeping

App Files Files Community

lewtun HF Staff OpenAI Codex committed on May 1

Commit

a8e0e2c

2 Parent(s): 1c68712 77324b8

Deploy 2026-05-01

Browse files

Co-authored-by: OpenAI Codex <codex@openai.com>

Files changed (24) hide show

agent/core/agent_loop.py +244 -26
agent/core/approval_policy.py +11 -0
agent/core/cost_estimation.py +278 -0
agent/core/session.py +37 -0
agent/core/session_persistence.py +12 -0
agent/main.py +28 -6
agent/prompts/system_prompt_v3.yaml +1 -1
agent/tools/docs_tools.py +1 -1
backend/models.py +19 -0
backend/routes/agent.py +21 -0
backend/session_manager.py +94 -0
frontend/src/components/Chat/ToolCallGroup.tsx +20 -1
frontend/src/components/Layout/AppLayout.tsx +2 -0
frontend/src/components/YoloControl.tsx +155 -0
frontend/src/hooks/useAgentChat.ts +27 -2
frontend/src/lib/sse-chat-transport.ts +13 -1
frontend/src/store/agentStore.ts +33 -0
frontend/src/store/sessionStore.ts +45 -0
frontend/src/types/agent.ts +4 -0
frontend/src/types/events.ts +4 -0
tests/unit/test_agent_model_gating.py +45 -0
tests/unit/test_auto_approval_policy.py +185 -0
tests/unit/test_cost_estimation.py +58 -0
tests/unit/test_session_manager_persistence.py +73 -0

agent/core/agent_loop.py CHANGED Viewed

@@ -19,6 +19,11 @@ from litellm import (
 from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
 from agent.messaging.gateway import NotificationGateway
 from agent.core import telemetry
 from agent.core.doom_loop import check_for_doom_loop
@@ -110,13 +115,39 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
     return True, None
-def _needs_approval(
     tool_name: str, tool_args: dict, config: Config | None = None
 ) -> bool:
-    """Check if a tool call requires user approval before execution."""
-    # Yolo mode: skip all approvals
-    if config and config.yolo_mode:
-        return False
     # If args are malformed, skip approval (validation error will be shown later)
     args_valid, _ = _validate_tool_args(tool_args)
@@ -127,8 +158,10 @@ def _needs_approval(
         return True
     if tool_name == "hf_jobs":
-        operation = tool_args.get("operation", "")
-        if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
             return False
         # Check if this is a CPU-only job
@@ -180,6 +213,143 @@ def _needs_approval(
     return False
 # -- LLM retry constants --------------------------------------------------
 _MAX_LLM_RETRIES = 3
 _LLM_RETRY_DELAYS = [5, 15, 30]  # seconds between retries
@@ -1063,29 +1233,49 @@ class Handlers:
                 if session.is_cancelled:
                     break
-                # Separate good tools into approval-required vs auto-execute
-                approval_required_tools: list[tuple[ToolCall, str, dict]] = []
-                non_approval_tools: list[tuple[ToolCall, str, dict]] = []
                 for tc, tool_name, tool_args in good_tools:
-                    if _needs_approval(tool_name, tool_args, session.config):
-                        approval_required_tools.append((tc, tool_name, tool_args))
                     else:
-                        non_approval_tools.append((tc, tool_name, tool_args))
                 # Execute non-approval tools (in parallel when possible)
                 if non_approval_tools:
                     # 1. Validate args upfront
                     parsed_tools: list[
-                        tuple[ToolCall, str, dict, bool, str]
                     ] = []
-                    for tc, tool_name, tool_args in non_approval_tools:
                         args_valid, error_msg = _validate_tool_args(tool_args)
                         parsed_tools.append(
-                            (tc, tool_name, tool_args, args_valid, error_msg)
                         )
                     # 2. Send all tool_call events upfront (so frontend shows them all)
-                    for tc, tool_name, tool_args, args_valid, _ in parsed_tools:
                         if args_valid:
                             await session.send_event(
                                 Event(
@@ -1103,11 +1293,14 @@ class Handlers:
                         tc: ToolCall,
                         name: str,
                         args: dict,
                         valid: bool,
                         err: str,
                     ) -> tuple[ToolCall, str, dict, str, bool]:
                         if not valid:
                             return (tc, name, args, err, False)
                         out, ok = await session.tool_router.call_tool(
                             name, args, session=session, tool_call_id=tc.id
                         )
@@ -1115,8 +1308,8 @@ class Handlers:
                     gather_task = asyncio.ensure_future(asyncio.gather(
                         *[
-                            _exec_tool(tc, name, args, valid, err)
-                            for tc, name, args, valid, err in parsed_tools
                         ]
                     ))
                     cancel_task = asyncio.ensure_future(session._cancelled.wait())
@@ -1133,7 +1326,7 @@ class Handlers:
                         except asyncio.CancelledError:
                             pass
                         # Notify frontend that in-flight tools were cancelled
-                        for tc, name, _args, valid, _ in parsed_tools:
                             if valid:
                                 await session.send_event(Event(
                                     event_type="tool_state_change",
@@ -1171,7 +1364,8 @@ class Handlers:
                 if approval_required_tools:
                     # Prepare batch approval data
                     tools_data = []
-                    for tc, tool_name, tool_args in approval_required_tools:
                         # Resolve sandbox file paths for hf_jobs scripts so the
                         # frontend can display & edit the actual file content.
                         if tool_name == "hf_jobs" and isinstance(tool_args.get("script"), str):
@@ -1181,20 +1375,42 @@ class Handlers:
                             if resolved:
                                 tool_args = {**tool_args, "script": resolved}
-                        tools_data.append({
                             "tool": tool_name,
                             "arguments": tool_args,
                             "tool_call_id": tc.id,
-                        })
                     await session.send_event(Event(
                         event_type="approval_required",
-                        data={"tools": tools_data, "count": len(tools_data)},
                     ))
                     # Store all approval-requiring tools (ToolCall objects for execution)
                     session.pending_approval = {
-                        "tool_calls": [tc for tc, _, _ in approval_required_tools],
                     }
                     # Return early - wait for EXEC_APPROVAL operation
@@ -1384,6 +1600,8 @@ class Handlers:
                 )
             )
             output, success = await session.tool_router.call_tool(
                 tool_name, tool_args, session=session, tool_call_id=tc.id
             )

 from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
+from agent.core.approval_policy import (
+    is_scheduled_operation,
+    normalize_tool_operation,
+)
+from agent.core.cost_estimation import CostEstimate, estimate_tool_cost
 from agent.messaging.gateway import NotificationGateway
 from agent.core import telemetry
 from agent.core.doom_loop import check_for_doom_loop
     return True, None
+_IMMEDIATE_HF_JOB_RUNS = {"run", "uv"}
+@dataclass(frozen=True)
+class ApprovalDecision:
+    # Immutable result of classifying one tool call for approval.
+    # True when the call must wait for a human decision.
+    requires_approval: bool
+    # True when a call that would normally need approval was let through by YOLO.
+    auto_approved: bool = False
+    # True when YOLO was on but this call was still routed to manual approval.
+    auto_approval_blocked: bool = False
+    # Human-readable explanation that accompanies a blocked auto-approval.
+    block_reason: str | None = None
+    # Cost estimate copied from the estimator; None when no estimate was made.
+    estimated_cost_usd: float | None = None
+    # Session cap remaining at decision time; None when no cap applies.
+    remaining_cap_usd: float | None = None
+    # Copied from the estimator; gates whether the estimate is recorded as spend.
+    billable: bool = False
+def _operation(tool_args: dict) -> str:
+    return normalize_tool_operation(tool_args.get("operation"))
+def _is_immediate_hf_job_run(tool_name: str, tool_args: dict) -> bool:
+    return tool_name == "hf_jobs" and _operation(tool_args) in _IMMEDIATE_HF_JOB_RUNS
+def _is_scheduled_hf_job_run(tool_name: str, tool_args: dict) -> bool:
+    return tool_name == "hf_jobs" and is_scheduled_operation(_operation(tool_args))
+def _is_budgeted_auto_approval_target(tool_name: str, tool_args: dict) -> bool:
+    return tool_name == "sandbox_create" or _is_immediate_hf_job_run(tool_name, tool_args)
+def _base_needs_approval(
     tool_name: str, tool_args: dict, config: Config | None = None
 ) -> bool:
+    """Check if a tool call requires approval before YOLO policy is applied."""
     # If args are malformed, skip approval (validation error will be shown later)
     args_valid, _ = _validate_tool_args(tool_args)
         return True
     if tool_name == "hf_jobs":
+        operation = _operation(tool_args)
+        if is_scheduled_operation(operation):
+            return True
+        if operation not in _IMMEDIATE_HF_JOB_RUNS:
             return False
         # Check if this is a CPU-only job
     return False
+def _needs_approval(
+    tool_name: str, tool_args: dict, config: Config | None = None
+) -> bool:
+    """Legacy sync approval predicate used by tests and CLI display helpers."""
+    if _is_scheduled_hf_job_run(tool_name, tool_args):
+        return True
+    if config and config.yolo_mode:
+        return False
+    return _base_needs_approval(tool_name, tool_args, config)
+def _session_auto_approval_enabled(session: Session | None) -> bool:
+    return bool(session and getattr(session, "auto_approval_enabled", False))
+def _effective_yolo_enabled(session: Session | None, config: Config | None) -> bool:
+    return bool((config and config.yolo_mode) or _session_auto_approval_enabled(session))
+def _remaining_budget_after_reservations(
+    session: Session | None, reserved_spend_usd: float
+) -> float | None:
+    """Return the session cap left after recorded and reserved spend.
+
+    Returns ``None`` when there is no session or no cap is configured;
+    otherwise the remaining amount, floored at zero and rounded to 4 places.
+    """
+    cap_value = (
+        getattr(session, "auto_approval_cost_cap_usd", None) if session else None
+    )
+    if cap_value is None:
+        return None
+    cap = float(cap_value or 0.0)
+    already_spent = float(
+        getattr(session, "auto_approval_estimated_spend_usd", 0.0) or 0.0
+    )
+    headroom = cap - already_spent - reserved_spend_usd
+    return round(max(0.0, headroom), 4)
+def _budget_block_reason(
+    estimate: CostEstimate,
+    *,
+    remaining_cap_usd: float | None,
+) -> str | None:
+    """Explain why an estimate cannot be auto-approved, or return ``None``.
+
+    A missing estimate always blocks (using the estimator's own reason when
+    it supplied one). A present estimate blocks only when a cap exists and
+    the estimate is strictly greater than what remains of it.
+    """
+    cost = estimate.estimated_cost_usd
+    if cost is None:
+        return estimate.block_reason or "Could not estimate the cost safely."
+    over_cap = remaining_cap_usd is not None and cost > remaining_cap_usd
+    if not over_cap:
+        return None
+    return (
+        f"Estimated cost ${cost:.2f} exceeds "
+        f"remaining YOLO cap ${remaining_cap_usd:.2f}."
+    )
+async def _approval_decision(
+    tool_name: str,
+    tool_args: dict,
+    session: Session,
+    *,
+    reserved_spend_usd: float = 0.0,
+) -> ApprovalDecision:
+    """Return the approval decision for one parsed tool call.
+
+    Checks run in this order:
+      1. Scheduled HF jobs always require manual approval.
+      2. With session-level auto-approval on, budgeted targets are cost
+         estimated and blocked when the estimate is missing or exceeds the
+         remaining cap (after subtracting ``reserved_spend_usd``).
+      3. Otherwise any enabled YOLO mode auto-approves calls that would
+         normally need approval.
+      4. Otherwise the base approval rule decides.
+
+    ``reserved_spend_usd`` is the amount the caller has already earmarked
+    for earlier calls in the same batch.
+    """
+    config = session.config
+    base_requires_approval = _base_needs_approval(tool_name, tool_args, config)
+    # Scheduled jobs are recurring/unbounded enough that YOLO never bypasses
+    # the human confirmation, including legacy config.yolo_mode.
+    if _is_scheduled_hf_job_run(tool_name, tool_args):
+        return ApprovalDecision(
+            requires_approval=True,
+            # Only flagged as "blocked" when some YOLO mode was actually on.
+            auto_approval_blocked=_effective_yolo_enabled(session, config),
+            block_reason="Scheduled HF jobs always require manual approval.",
+        )
+    yolo_enabled = _effective_yolo_enabled(session, config)
+    budgeted_target = _is_budgeted_auto_approval_target(tool_name, tool_args)
+    # Cost caps are a session-scoped web policy. Legacy config.yolo_mode
+    # remains uncapped for CLI/headless, except for scheduled jobs above.
+    session_yolo_enabled = _session_auto_approval_enabled(session)
+    if yolo_enabled and budgeted_target and session_yolo_enabled:
+        estimate = await estimate_tool_cost(tool_name, tool_args, session=session)
+        remaining = _remaining_budget_after_reservations(session, reserved_spend_usd)
+        reason = _budget_block_reason(estimate, remaining_cap_usd=remaining)
+        if reason:
+            # Missing estimate or over the cap: hand the call back to a human.
+            return ApprovalDecision(
+                requires_approval=True,
+                auto_approval_blocked=True,
+                block_reason=reason,
+                estimated_cost_usd=estimate.estimated_cost_usd,
+                remaining_cap_usd=remaining,
+                billable=estimate.billable,
+            )
+        if base_requires_approval:
+            # Within budget and would normally need approval: auto-approve.
+            return ApprovalDecision(
+                requires_approval=False,
+                auto_approved=True,
+                estimated_cost_usd=estimate.estimated_cost_usd,
+                remaining_cap_usd=remaining,
+                billable=estimate.billable,
+            )
+        # Within budget and never needed approval: still carry the estimate,
+        # but do not mark the call as auto-approved.
+        return ApprovalDecision(
+            requires_approval=False,
+            estimated_cost_usd=estimate.estimated_cost_usd,
+            remaining_cap_usd=remaining,
+            billable=estimate.billable,
+        )
+    if base_requires_approval and yolo_enabled:
+        return ApprovalDecision(requires_approval=False, auto_approved=True)
+    return ApprovalDecision(requires_approval=base_requires_approval)
+def _record_estimated_spend(session: Session, decision: ApprovalDecision) -> None:
+    if not decision.billable or decision.estimated_cost_usd is None:
+        return
+    if hasattr(session, "add_auto_approval_estimated_spend"):
+        session.add_auto_approval_estimated_spend(decision.estimated_cost_usd)
+    else:
+        session.auto_approval_estimated_spend_usd = round(
+            float(getattr(session, "auto_approval_estimated_spend_usd", 0.0) or 0.0)
+            + float(decision.estimated_cost_usd),
+            4,
+        )
+async def _record_manual_approved_spend_if_needed(
+    session: Session,
+    tool_name: str,
+    tool_args: dict,
+) -> None:
+    if not _session_auto_approval_enabled(session):
+        return
+    if not _is_budgeted_auto_approval_target(tool_name, tool_args):
+        return
+    estimate = await estimate_tool_cost(tool_name, tool_args, session=session)
+    _record_estimated_spend(
+        session,
+        ApprovalDecision(
+            requires_approval=False,
+            billable=estimate.billable,
+            estimated_cost_usd=estimate.estimated_cost_usd,
+        ),
+    )
 # -- LLM retry constants --------------------------------------------------
 _MAX_LLM_RETRIES = 3
 _LLM_RETRY_DELAYS = [5, 15, 30]  # seconds between retries
                 if session.is_cancelled:
                     break
+                # Separate good tools into approval-required vs auto-execute.
+                # Track reserved spend while classifying a batch so two
+                # auto-approved jobs in one model response cannot jointly
+                # exceed the remaining session cap.
+                approval_required_tools: list[
+                    tuple[ToolCall, str, dict, ApprovalDecision]
+                ] = []
+                non_approval_tools: list[
+                    tuple[ToolCall, str, dict, ApprovalDecision]
+                ] = []
+                reserved_auto_spend_usd = 0.0
                 for tc, tool_name, tool_args in good_tools:
+                    decision = await _approval_decision(
+                        tool_name,
+                        tool_args,
+                        session,
+                        reserved_spend_usd=reserved_auto_spend_usd,
+                    )
+                    if decision.requires_approval:
+                        approval_required_tools.append((tc, tool_name, tool_args, decision))
                     else:
+                        non_approval_tools.append((tc, tool_name, tool_args, decision))
+                        if (
+                            decision.auto_approved
+                            and decision.billable
+                            and decision.estimated_cost_usd is not None
+                        ):
+                            reserved_auto_spend_usd += decision.estimated_cost_usd
                 # Execute non-approval tools (in parallel when possible)
                 if non_approval_tools:
                     # 1. Validate args upfront
                     parsed_tools: list[
+                        tuple[ToolCall, str, dict, ApprovalDecision, bool, str]
                     ] = []
+                    for tc, tool_name, tool_args, decision in non_approval_tools:
                         args_valid, error_msg = _validate_tool_args(tool_args)
                         parsed_tools.append(
+                            (tc, tool_name, tool_args, decision, args_valid, error_msg)
                         )
                     # 2. Send all tool_call events upfront (so frontend shows them all)
+                    for tc, tool_name, tool_args, _decision, args_valid, _ in parsed_tools:
                         if args_valid:
                             await session.send_event(
                                 Event(
                         tc: ToolCall,
                         name: str,
                         args: dict,
+                        decision: ApprovalDecision,
                         valid: bool,
                         err: str,
                     ) -> tuple[ToolCall, str, dict, str, bool]:
                         if not valid:
                             return (tc, name, args, err, False)
+                        if decision.billable:
+                            _record_estimated_spend(session, decision)
                         out, ok = await session.tool_router.call_tool(
                             name, args, session=session, tool_call_id=tc.id
                         )
                     gather_task = asyncio.ensure_future(asyncio.gather(
                         *[
+                            _exec_tool(tc, name, args, decision, valid, err)
+                            for tc, name, args, decision, valid, err in parsed_tools
                         ]
                     ))
                     cancel_task = asyncio.ensure_future(session._cancelled.wait())
                         except asyncio.CancelledError:
                             pass
                         # Notify frontend that in-flight tools were cancelled
+                        for tc, name, _args, _decision, valid, _ in parsed_tools:
                             if valid:
                                 await session.send_event(Event(
                                     event_type="tool_state_change",
                 if approval_required_tools:
                     # Prepare batch approval data
                     tools_data = []
+                    blocked_payloads = []
+                    for tc, tool_name, tool_args, decision in approval_required_tools:
                         # Resolve sandbox file paths for hf_jobs scripts so the
                         # frontend can display & edit the actual file content.
                         if tool_name == "hf_jobs" and isinstance(tool_args.get("script"), str):
                             if resolved:
                                 tool_args = {**tool_args, "script": resolved}
+                        tool_payload = {
                             "tool": tool_name,
                             "arguments": tool_args,
                             "tool_call_id": tc.id,
+                        }
+                        if decision.auto_approval_blocked:
+                            tool_payload.update(
+                                {
+                                    "auto_approval_blocked": True,
+                                    "block_reason": decision.block_reason,
+                                    "estimated_cost_usd": decision.estimated_cost_usd,
+                                    "remaining_cap_usd": decision.remaining_cap_usd,
+                                }
+                            )
+                            blocked_payloads.append(tool_payload)
+                        tools_data.append(tool_payload)
+                    event_data = {"tools": tools_data, "count": len(tools_data)}
+                    if blocked_payloads:
+                        first = blocked_payloads[0]
+                        event_data.update(
+                            {
+                                "auto_approval_blocked": True,
+                                "block_reason": first.get("block_reason"),
+                                "estimated_cost_usd": first.get("estimated_cost_usd"),
+                                "remaining_cap_usd": first.get("remaining_cap_usd"),
+                            }
+                        )
                     await session.send_event(Event(
                         event_type="approval_required",
+                        data=event_data,
                     ))
                     # Store all approval-requiring tools (ToolCall objects for execution)
                     session.pending_approval = {
+                        "tool_calls": [tc for tc, _, _, _ in approval_required_tools],
                     }
                     # Return early - wait for EXEC_APPROVAL operation
                 )
             )
+            await _record_manual_approved_spend_if_needed(session, tool_name, tool_args)
             output, success = await session.tool_router.call_tool(
                 tool_name, tool_args, session=session, tool_call_id=tc.id
             )

agent/core/approval_policy.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""Shared predicates for approval-gated tool operations."""
+from typing import Any
+def normalize_tool_operation(operation: Any) -> str:
+    """Return ``operation`` as a stripped, lower-cased string.
+
+    Falsy values (``None``, ``""``, ``0``) normalize to the empty string;
+    anything else is passed through ``str()`` first.
+    """
+    return str(operation or "").strip().lower()
+def is_scheduled_operation(operation: Any) -> bool:
+    """Return True when the normalized operation starts with ``"scheduled "``.
+
+    The trailing space in the prefix is significant: a bare ``"scheduled"``
+    does not match, because normalization strips trailing whitespace.
+    """
+    return normalize_tool_operation(operation).startswith("scheduled ")

agent/core/cost_estimation.py ADDED Viewed

	@@ -0,0 +1,278 @@

+"""Conservative cost estimates for auto-approved infrastructure actions."""
+import os
+import re
+import time
+from dataclasses import dataclass
+from typing import Any
+import httpx
+OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
+JOBS_HARDWARE_URL = f"{OPENID_PROVIDER_URL}/api/jobs/hardware"
+JOBS_PRICE_CACHE_TTL_S = 6 * 60 * 60
+DEFAULT_JOB_TIMEOUT_HOURS = 0.5
+DEFAULT_SANDBOX_RESERVATION_HOURS = 1.0
+# Static fallback prices are intentionally conservative enough for a budget
+# guard. The live /api/jobs/hardware catalog wins whenever it is reachable.
+HF_JOBS_PRICE_USD_PER_HOUR: dict[str, float] = {
+    "cpu-basic": 0.05,
+    "cpu-upgrade": 0.25,
+    "cpu-performance": 0.50,
+    "cpu-xl": 1.00,
+    "t4-small": 0.60,
+    "t4-medium": 0.90,
+    "l4x1": 1.00,
+    "l4x4": 4.00,
+    "l40sx1": 2.00,
+    "l40sx4": 8.00,
+    "l40sx8": 16.00,
+    "a10g-small": 1.00,
+    "a10g-large": 2.00,
+    "a10g-largex2": 4.00,
+    "a10g-largex4": 8.00,
+    "a100-large": 4.00,
+    "a100x4": 16.00,
+    "a100x8": 32.00,
+    "h200": 10.00,
+    "h200x2": 20.00,
+    "h200x4": 40.00,
+    "h200x8": 80.00,
+    "inf2x6": 6.00,
+}
+SPACE_PRICE_USD_PER_HOUR: dict[str, float] = {
+    "cpu-basic": 0.0,
+    "cpu-upgrade": 0.05,
+    "cpu-performance": 0.50,
+    "cpu-xl": 1.00,
+    "t4-small": 0.60,
+    "t4-medium": 0.90,
+    "l4x1": 1.00,
+    "l4x4": 4.00,
+    "l40sx1": 2.00,
+    "l40sx4": 8.00,
+    "l40sx8": 16.00,
+    "a10g-small": 1.00,
+    "a10g-large": 2.00,
+    "a10g-largex2": 4.00,
+    "a10g-largex4": 8.00,
+    "a100-large": 4.00,
+    "a100x4": 16.00,
+    "a100x8": 32.00,
+    "h200": 10.00,
+    "h200x2": 20.00,
+    "h200x4": 40.00,
+    "h200x8": 80.00,
+    "inf2x6": 6.00,
+}
+_DURATION_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*([smhd]?)\s*$", re.IGNORECASE)
+_PRICE_RE = re.compile(r"(\d+(?:\.\d+)?)")
+_jobs_price_cache: tuple[float, dict[str, float]] | None = None
+@dataclass(frozen=True)
+class CostEstimate:
+    """Estimated cost for a tool call.
+    ``estimated_cost_usd=None`` means the call may be billable but we could not
+    estimate it safely, so auto-approval should fall back to a human decision.
+    """
+    # Estimated dollars for the call; None when no safe estimate exists.
+    estimated_cost_usd: float | None
+    # Whether the call is expected to cost money. The estimators in this
+    # module set it to ``price > 0`` when a price is known and to True when
+    # no estimate could be produced.
+    billable: bool
+    # Human-readable explanation supplied when the estimate is None.
+    block_reason: str | None = None
+    # Hardware flavor the estimate refers to ("existing" for a reused sandbox).
+    label: str | None = None
+def parse_timeout_hours(value: Any, *, default_hours: float = DEFAULT_JOB_TIMEOUT_HOURS) -> float | None:
+    """Parse HF timeout values into hours.
+
+    Strings accept ``s``, ``m``, ``h``, or ``d`` suffixes (no suffix means
+    seconds). Numeric values are treated as seconds, matching the Hub
+    client's typed timeout parameter.
+
+    Returns ``default_hours`` when ``value`` is ``None`` or ``""``. Returns
+    ``None`` when the value cannot be read as a positive, finite duration:
+    booleans, unsupported types, malformed strings, zero or negative
+    amounts, NaN, infinity, and integers too large for a float.
+    """
+    import math
+
+    if value is None or value == "":
+        return default_hours
+    # bool is an int subclass; True/False is never a meaningful timeout.
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, int | float):
+        try:
+            amount = float(value)
+        except OverflowError:
+            # Integers beyond float range cannot be priced; defer to a human.
+            return None
+        unit = "s"
+    elif isinstance(value, str):
+        match = _DURATION_RE.match(value)
+        if not match:
+            return None
+        amount = float(match.group(1))
+        unit = match.group(2).lower() or "s"
+    else:
+        return None
+    # Non-finite amounts would turn into an infinite (or NaN) cost estimate
+    # instead of the "could not estimate" fallback.
+    if not math.isfinite(amount) or amount <= 0:
+        return None
+    if unit == "s":
+        return amount / 3600
+    if unit == "m":
+        return amount / 60
+    if unit == "h":
+        return amount
+    if unit == "d":
+        return amount * 24
+    return None
+def _extract_flavor(item: dict[str, Any]) -> str | None:
+    """Return the first non-empty string stored under a known flavor key.
+
+    Keys are probed in priority order; ``None`` means none of them held a
+    non-empty string.
+    """
+    flavor_keys = ("flavor", "name", "id", "value", "hardware", "hardware_flavor")
+    candidates = (item.get(key) for key in flavor_keys)
+    return next((c for c in candidates if isinstance(c, str) and c), None)
+def _coerce_price(value: Any) -> float | None:
+    """Coerce a catalog price field into a non-negative, finite float.
+
+    Accepts ints, floats, and strings containing a number (thousands
+    separators are removed first). Returns ``None`` for ``None``, booleans,
+    unsupported types, strings without a number, negative values, NaN,
+    infinity, and integers too large for a float.
+    """
+    if isinstance(value, bool) or value is None:
+        return None
+    if isinstance(value, int | float):
+        try:
+            price = float(value)
+        except OverflowError:
+            return None
+    elif isinstance(value, str):
+        cleaned = value.replace(",", "")
+        match = _PRICE_RE.search(cleaned)
+        if not match:
+            return None
+        # The pattern only captures digits, so a leading minus sign would be
+        # dropped silently. Treat "-1.50" / "-$1.50" / "$-1.50" like the
+        # numeric branch treats negatives: not a usable price.
+        prefix = cleaned[: match.start()].replace("$", "").rstrip()
+        if prefix.endswith("-"):
+            return None
+        price = float(match.group(1))
+    else:
+        return None
+    # The chained comparison is False for NaN and excludes +inf.
+    return price if 0 <= price < float("inf") else None
+def _extract_hourly_price(item: dict[str, Any]) -> float | None:
+    """Find an hourly price in ``item``, searching nested pricing dicts.
+
+    Direct price keys are tried first, in order. If none yields a usable
+    price, the ``pricing``, ``billing`` and ``cost`` sub-dicts are searched
+    recursively in that order. Returns ``None`` when nothing is found.
+    """
+    direct_keys = (
+        "price",
+        "price_usd",
+        "priceUsd",
+        "price_per_hour",
+        "pricePerHour",
+        "hourly_price",
+        "hourlyPrice",
+        "usd_per_hour",
+        "usdPerHour",
+    )
+    for direct_key in direct_keys:
+        found = _coerce_price(item.get(direct_key))
+        if found is not None:
+            return found
+    for container_key in ("pricing", "billing", "cost"):
+        container = item.get(container_key)
+        if not isinstance(container, dict):
+            continue
+        found = _extract_hourly_price(container)
+        if found is not None:
+            return found
+    return None
+def _iter_hardware_items(payload: Any):
+    """Yield every dict in ``payload`` that carries a recognizable flavor.
+
+    Lists are walked element by element. A dict is yielded itself when
+    ``_extract_flavor`` finds a flavor in it, and its ``hardware``,
+    ``flavors``, ``items``, ``data`` and ``jobs`` children are then searched
+    recursively. Any other payload type yields nothing.
+    """
+    if isinstance(payload, list):
+        for item in payload:
+            yield from _iter_hardware_items(item)
+    elif isinstance(payload, dict):
+        # A dict can be both an item and a container, so it is yielded
+        # before its children are descended into.
+        if _extract_flavor(payload):
+            yield payload
+        for key in ("hardware", "flavors", "items", "data", "jobs"):
+            child = payload.get(key)
+            if child is not None:
+                yield from _iter_hardware_items(child)
+def _parse_jobs_price_catalog(payload: Any) -> dict[str, float]:
+    """Build a ``flavor -> hourly price`` map from a hardware catalog payload.
+
+    Items without a flavor or without a usable price are skipped; when a
+    flavor appears more than once, the last price seen wins.
+    """
+    flavor_price_pairs = (
+        (_extract_flavor(entry), _extract_hourly_price(entry))
+        for entry in _iter_hardware_items(payload)
+    )
+    return {
+        flavor: price
+        for flavor, price in flavor_price_pairs
+        if flavor and price is not None
+    }
+async def hf_jobs_price_catalog() -> dict[str, float]:
+    """Return HF Jobs hourly prices, preferring the live catalog.
+
+    Live prices fetched from ``JOBS_HARDWARE_URL`` are layered over the
+    static table, so flavors missing from the live catalog keep their static
+    price. The merged result is cached for ``JOBS_PRICE_CACHE_TTL_S``.
+
+    When the fetch fails or yields no prices, the static table alone is
+    returned and cached only briefly, so a transient outage does not pin the
+    fallback prices for the full TTL.
+
+    Returns a fresh dict on every call; callers may mutate it freely.
+    """
+    global _jobs_price_cache
+    # How long a fallback-only result may be served before retrying the fetch.
+    fallback_ttl_s = 5 * 60
+    now = time.monotonic()
+    if _jobs_price_cache and now - _jobs_price_cache[0] < JOBS_PRICE_CACHE_TTL_S:
+        return dict(_jobs_price_cache[1])
+    live_prices: dict[str, float] = {}
+    try:
+        async with httpx.AsyncClient(timeout=3.0) as client:
+            response = await client.get(JOBS_HARDWARE_URL)
+            if response.status_code == 200:
+                live_prices = _parse_jobs_price_catalog(response.json())
+    except (httpx.HTTPError, ValueError):
+        # Network/protocol errors and undecodable JSON both mean "no live data".
+        live_prices = {}
+    prices = {**HF_JOBS_PRICE_USD_PER_HOUR, **live_prices}
+    cached_at = now
+    if not live_prices:
+        # Back-date the entry so the unchanged TTL check above expires it
+        # after ``fallback_ttl_s`` instead of the full TTL.
+        cached_at = now - max(0.0, JOBS_PRICE_CACHE_TTL_S - fallback_ttl_s)
+    _jobs_price_cache = (cached_at, prices)
+    return dict(prices)
+async def estimate_hf_job_cost(args: dict[str, Any]) -> CostEstimate:
+    """Estimate the cost of an HF job as hourly price times timeout.
+
+    The flavor is read from ``hardware_flavor``, ``flavor`` or ``hardware``
+    (first truthy value wins) and defaults to ``"cpu-basic"``. The timeout
+    comes from ``args["timeout"]`` via ``parse_timeout_hours``.
+
+    Returns an estimate with ``estimated_cost_usd=None`` and a
+    ``block_reason`` when the timeout cannot be parsed or the flavor has no
+    known price.
+    """
+    flavor = str(
+        args.get("hardware_flavor")
+        or args.get("flavor")
+        or args.get("hardware")
+        or "cpu-basic"
+    )
+    timeout_hours = parse_timeout_hours(args.get("timeout"))
+    if timeout_hours is None:
+        return CostEstimate(
+            estimated_cost_usd=None,
+            billable=True,
+            block_reason=f"Could not parse HF job timeout: {args.get('timeout')!r}.",
+            label=flavor,
+        )
+    # The timeout is validated first, so a bad timeout never triggers the
+    # catalog lookup.
+    prices = await hf_jobs_price_catalog()
+    price = prices.get(flavor)
+    if price is None:
+        return CostEstimate(
+            estimated_cost_usd=None,
+            billable=True,
+            block_reason=f"No price is available for HF job hardware '{flavor}'.",
+            label=flavor,
+        )
+    return CostEstimate(
+        estimated_cost_usd=round(price * timeout_hours, 4),
+        # Zero-priced hardware is estimated but not counted as billable.
+        billable=price > 0,
+        label=flavor,
+    )
+async def estimate_sandbox_cost(args: dict[str, Any], *, session: Any = None) -> CostEstimate:
+    """Estimate the cost of creating a sandbox on the requested hardware.
+
+    If ``session`` already has a truthy ``sandbox`` attribute, the call is
+    reported as free and non-billable with label ``"existing"``. Otherwise
+    the price is looked up in the static ``SPACE_PRICE_USD_PER_HOUR`` table
+    (hardware defaults to ``"cpu-basic"``) and multiplied by
+    ``DEFAULT_SANDBOX_RESERVATION_HOURS``.
+
+    Returns an estimate with ``estimated_cost_usd=None`` and a
+    ``block_reason`` when the hardware has no known price.
+    """
+    if session is not None and getattr(session, "sandbox", None):
+        return CostEstimate(estimated_cost_usd=0.0, billable=False, label="existing")
+    hardware = str(args.get("hardware") or "cpu-basic")
+    price = SPACE_PRICE_USD_PER_HOUR.get(hardware)
+    if price is None:
+        return CostEstimate(
+            estimated_cost_usd=None,
+            billable=True,
+            block_reason=f"No price is available for sandbox hardware '{hardware}'.",
+            label=hardware,
+        )
+    return CostEstimate(
+        estimated_cost_usd=round(price * DEFAULT_SANDBOX_RESERVATION_HOURS, 4),
+        # Zero-priced hardware is estimated but not counted as billable.
+        billable=price > 0,
+        label=hardware,
+    )
+async def estimate_tool_cost(
+    tool_name: str, args: dict[str, Any], *, session: Any = None
+) -> CostEstimate:
+    """Route a tool call to its cost estimator.
+
+    ``sandbox_create`` and ``hf_jobs`` have dedicated estimators; every
+    other tool is reported as a zero-cost, non-billable call.
+    """
+    match tool_name:
+        case "sandbox_create":
+            return await estimate_sandbox_cost(args, session=session)
+        case "hf_jobs":
+            return await estimate_hf_job_cost(args)
+        case _:
+            return CostEstimate(estimated_cost_usd=0.0, billable=False)

agent/core/session.py CHANGED Viewed

@@ -120,6 +120,9 @@ class Session:
         self.notification_gateway = notification_gateway
         self.notification_destinations = list(notification_destinations or [])
         self.defer_turn_complete_notification = defer_turn_complete_notification
         # Session trajectory logging
         self.logged_events: list[dict] = []
@@ -313,6 +316,40 @@ class Session:
         self.config.model_name = model_name
         self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
     def effective_effort_for(self, model_name: str) -> str | None:
         """Resolve the effort level to actually send for ``model_name``.

         self.notification_gateway = notification_gateway
         self.notification_destinations = list(notification_destinations or [])
         self.defer_turn_complete_notification = defer_turn_complete_notification
+        self.auto_approval_enabled: bool = False
+        self.auto_approval_cost_cap_usd: float | None = None
+        self.auto_approval_estimated_spend_usd: float = 0.0
         # Session trajectory logging
         self.logged_events: list[dict] = []
         self.config.model_name = model_name
         self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
+    def set_auto_approval_policy(
+        self, *, enabled: bool, cost_cap_usd: float | None
+    ) -> None:
+        """Store the auto-approval switch and cost cap on this session.
+
+        ``enabled`` is coerced to ``bool``; ``cost_cap_usd`` is stored as
+        given, including ``None``. The accumulated estimated spend is left
+        untouched.
+        """
+        self.auto_approval_enabled = bool(enabled)
+        self.auto_approval_cost_cap_usd = cost_cap_usd
+    def add_auto_approval_estimated_spend(self, amount_usd: float | None) -> None:
+        """Add ``amount_usd`` to the running estimated-spend total.
+
+        ``None`` and non-positive amounts are ignored. The new total is
+        rounded to 4 decimals.
+        """
+        if amount_usd is None or amount_usd <= 0:
+            return
+        self.auto_approval_estimated_spend_usd = round(
+            self.auto_approval_estimated_spend_usd + float(amount_usd), 4
+        )
+    @property
+    def auto_approval_remaining_usd(self) -> float | None:
+        """Return the budget left under the cost cap.
+
+        Returns ``None`` when no cap is set. Otherwise returns the cap minus
+        the estimated spend, floored at 0.0 and rounded to 4 decimals.
+        """
+        if self.auto_approval_cost_cap_usd is None:
+            return None
+        return round(
+            max(
+                0.0,
+                self.auto_approval_cost_cap_usd
+                - self.auto_approval_estimated_spend_usd,
+            ),
+            4,
+        )
+    def auto_approval_policy_summary(self) -> dict[str, Any]:
+        """Return the auto-approval state as a plain dict.
+
+        Keys: ``enabled``, ``cost_cap_usd``, ``estimated_spend_usd`` (rounded
+        to 4 decimals) and ``remaining_usd`` (from
+        ``auto_approval_remaining_usd``).
+        """
+        return {
+            "enabled": self.auto_approval_enabled,
+            "cost_cap_usd": self.auto_approval_cost_cap_usd,
+            "estimated_spend_usd": round(self.auto_approval_estimated_spend_usd, 4),
+            "remaining_usd": self.auto_approval_remaining_usd,
+        }
     def effective_effort_for(self, model_name: str) -> str | None:
         """Resolve the effort level to actually send for ``model_name``.

agent/core/session_persistence.py CHANGED Viewed

@@ -176,6 +176,9 @@ class MongoSessionStore(NoopSessionStore):
         pending_approval: list[dict[str, Any]] | None = None,
         claude_counted: bool = False,
         notification_destinations: list[str] | None = None,
     ) -> None:
         if not self._ready():
             return
@@ -204,6 +207,9 @@ class MongoSessionStore(NoopSessionStore):
                     "pending_approval": pending_approval or [],
                     "claude_counted": claude_counted,
                     "notification_destinations": notification_destinations or [],
                 },
             },
             upsert=True,
@@ -224,6 +230,9 @@ class MongoSessionStore(NoopSessionStore):
         claude_counted: bool = False,
         created_at: datetime | None = None,
         notification_destinations: list[str] | None = None,
     ) -> None:
         if not self._ready():
             return
@@ -241,6 +250,9 @@ class MongoSessionStore(NoopSessionStore):
             pending_approval=pending_approval,
             claude_counted=claude_counted,
             notification_destinations=notification_destinations,
         )
         ops: list[Any] = []
         for idx, raw in enumerate(messages):

         pending_approval: list[dict[str, Any]] | None = None,
         claude_counted: bool = False,
         notification_destinations: list[str] | None = None,
+        auto_approval_enabled: bool = False,
+        auto_approval_cost_cap_usd: float | None = None,
+        auto_approval_estimated_spend_usd: float = 0.0,
     ) -> None:
         if not self._ready():
             return
                     "pending_approval": pending_approval or [],
                     "claude_counted": claude_counted,
                     "notification_destinations": notification_destinations or [],
+                    "auto_approval_enabled": auto_approval_enabled,
+                    "auto_approval_cost_cap_usd": auto_approval_cost_cap_usd,
+                    "auto_approval_estimated_spend_usd": auto_approval_estimated_spend_usd,
                 },
             },
             upsert=True,
         claude_counted: bool = False,
         created_at: datetime | None = None,
         notification_destinations: list[str] | None = None,
+        auto_approval_enabled: bool = False,
+        auto_approval_cost_cap_usd: float | None = None,
+        auto_approval_estimated_spend_usd: float = 0.0,
     ) -> None:
         if not self._ready():
             return
             pending_approval=pending_approval,
             claude_counted=claude_counted,
             notification_destinations=notification_destinations,
+            auto_approval_enabled=auto_approval_enabled,
+            auto_approval_cost_cap_usd=auto_approval_cost_cap_usd,
+            auto_approval_estimated_spend_usd=auto_approval_estimated_spend_usd,
         )
         ops: list[Any] = []
         for idx, raw in enumerate(messages):

agent/main.py CHANGED Viewed

@@ -21,6 +21,7 @@ import litellm
 from prompt_toolkit import PromptSession
 from agent.config import load_config
 from agent.core.agent_loop import submission_loop
 from agent.core import model_switcher
 from agent.core.hf_tokens import resolve_hf_token
@@ -55,6 +56,20 @@ litellm.suppress_debug_info = True
 CLI_CONFIG_PATH = Path(__file__).parent.parent / "configs" / "cli_agent_config.json"
 def _configure_runtime_logging() -> None:
     """Keep third-party warning spam from punching through the interactive UI."""
     import logging
@@ -375,8 +390,11 @@ async def event_listener(
                 tools_data = event.data.get("tools", []) if event.data else []
                 count = event.data.get("count", 0) if event.data else 0
-                # If yolo mode is active, auto-approve everything
-                if config and config.yolo_mode:
                     approvals = [
                         {
                             "tool_call_id": t.get("tool_call_id", ""),
@@ -1293,14 +1311,18 @@ async def headless_main(
             else:
                 print_tool_log(tool, log)
         elif event.event_type == "approval_required":
-            # Auto-approve everything in headless mode (safety net if yolo_mode
-            # didn't prevent the approval event for some reason)
             tools_data = event.data.get("tools", []) if event.data else []
             approvals = [
                 {
                     "tool_call_id": t.get("tool_call_id", ""),
-                    "approved": True,
-                    "feedback": None,
                 }
                 for t in tools_data
             ]

 from prompt_toolkit import PromptSession
 from agent.config import load_config
+from agent.core.approval_policy import is_scheduled_operation
 from agent.core.agent_loop import submission_loop
 from agent.core import model_switcher
 from agent.core.hf_tokens import resolve_hf_token
 CLI_CONFIG_PATH = Path(__file__).parent.parent / "configs" / "cli_agent_config.json"
+def _is_scheduled_hf_job_tool(tool_info: dict[str, Any]) -> bool:
+    """Return whether ``tool_info`` is an ``hf_jobs`` call with a scheduled operation.
+
+    ``tool_info["arguments"]`` may be a dict or a JSON string. Returns
+    ``False`` when the tool is not ``hf_jobs``, when a string argument is not
+    valid JSON, or when the arguments are not a dict. Otherwise returns the
+    result of ``is_scheduled_operation`` on the ``operation`` argument.
+    """
+    if tool_info.get("tool") != "hf_jobs":
+        return False
+    arguments = tool_info.get("arguments") or {}
+    if isinstance(arguments, str):
+        # Arguments may arrive as a JSON-encoded string; decode before inspecting.
+        try:
+            arguments = json.loads(arguments)
+        except json.JSONDecodeError:
+            return False
+    if not isinstance(arguments, dict):
+        return False
+    return is_scheduled_operation(arguments.get("operation"))
 def _configure_runtime_logging() -> None:
     """Keep third-party warning spam from punching through the interactive UI."""
     import logging
                 tools_data = event.data.get("tools", []) if event.data else []
                 count = event.data.get("count", 0) if event.data else 0
+                # If yolo mode is active, auto-approve everything except
+                # scheduled HF jobs, whose recurring cost stays manual.
+                if config and config.yolo_mode and not any(
+                    _is_scheduled_hf_job_tool(t) for t in tools_data
+                ):
                     approvals = [
                         {
                             "tool_call_id": t.get("tool_call_id", ""),
             else:
                 print_tool_log(tool, log)
         elif event.event_type == "approval_required":
+            # Auto-approve in headless mode, except scheduled HF jobs. Those
+            # are rejected because their recurring cost needs manual approval.
             tools_data = event.data.get("tools", []) if event.data else []
             approvals = [
                 {
                     "tool_call_id": t.get("tool_call_id", ""),
+                    "approved": not _is_scheduled_hf_job_tool(t),
+                    "feedback": (
+                        "Scheduled HF jobs require manual approval."
+                        if _is_scheduled_hf_job_tool(t)
+                        else None
+                    ),
                 }
                 for t in tools_data
             ]

agent/prompts/system_prompt_v3.yaml CHANGED Viewed

@@ -42,7 +42,7 @@ system_prompt: |
   SILENT DATASET SUBSTITUTION: When a requested dataset fails to load, you will silently switch to a different one without telling the user. Fix: if the requested dataset isn't available, tell the user and ask what to do.
-  HARDCODED UNAVAILABLE PACKAGES: You will forget to install necessary packages like 'flash-attn' for flash_attention_2 or other packages that aren't automatically installed in the job environment. Fix: install necessary packages before running the job.
   SCOPE-CHANGING FIXES: Avoid at all costs! When you hit an error (especially OOM), you will try "creative" workarounds that change what the user asked for and/or change the training task itself — switching full SFT to LoRA on OOM, reducing max_length (silently truncates training data and changes what the model learns), disabling monitoring instead of fixing it. Do not do this. Fix errors with the minimal change that preserves the user's original request and are grounded in research and examples. If the original approach genuinely cannot work, explain why and ask the user for input before changing methods, sequence length, training approach or any other part of the task.

   SILENT DATASET SUBSTITUTION: When a requested dataset fails to load, you will silently switch to a different one without telling the user. Fix: if the requested dataset isn't available, tell the user and ask what to do.
+  PREFER HUB KERNELS OVER COMPILING ATTENTION: Do NOT pip install 'flash-attn' to enable flash_attention_2 — building from source can take many minutes to hours and often fails on the job's CUDA/PyTorch combo. Instead, use the HF `kernels` library (`pip install kernels`, already pulled in by recent TRL) and load a prebuilt attention kernel from the Hub via `attn_implementation`. Examples: `AutoModelForCausalLM.from_pretrained(..., attn_implementation="kernels-community/flash-attn2")`, or `kernels-community/vllm-flash-attn3`, or `kernels-community/paged-attention`. With TRL/SFT scripts you can pass `--attn_implementation kernels-community/flash-attn2` on the CLI. Search additional kernels at https://huggingface.co/models?other=kernel. Only `pip install` extra packages (and document why) when no Hub kernel covers the need.
   SCOPE-CHANGING FIXES: Avoid at all costs! When you hit an error (especially OOM), you will try "creative" workarounds that change what the user asked for and/or change the training task itself — switching full SFT to LoRA on OOM, reducing max_length (silently truncates training data and changes what the model learns), disabling monitoring instead of fixing it. Do not do this. Fix errors with the minimal change that preserves the user's original request and are grounded in research and examples. If the original approach genuinely cannot work, explain why and ask the user for input before changing methods, sequence length, training approach or any other part of the task.

agent/tools/docs_tools.py CHANGED Viewed

@@ -932,7 +932,7 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
                     "• argilla — Data annotation, feedback, and human-in-the-loop workflows.\n"
                     "• distilabel — Synthetic data generation and distillation pipelines.\n"
                     "• microsoft-azure — Azure deployment and integration guides.\n"
-                    "• kernels — Lightweight execution environments and notebook-style workflows.\n"
                     "• google-cloud — GCP deployment and serving workflows.\n"
                 ),
             },

                     "• argilla — Data annotation, feedback, and human-in-the-loop workflows.\n"
                     "• distilabel — Synthetic data generation and distillation pipelines.\n"
                     "• microsoft-azure — Azure deployment and integration guides.\n"
+                    "• kernels — Load prebuilt compute kernels (E.g. flash-attn2) from the Hub via `attn_implementation`; avoids compiling flash-attn from source.\n"
                     "• google-cloud — GCP deployment and serving workflows.\n"
                 ),
             },

backend/models.py CHANGED Viewed

@@ -76,6 +76,15 @@ class PendingApprovalTool(BaseModel):
     arguments: dict[str, Any] = {}
 class SessionInfo(BaseModel):
     """Session metadata."""
@@ -89,6 +98,9 @@ class SessionInfo(BaseModel):
     model: str | None = None
     title: str | None = None
     notification_destinations: list[str] = Field(default_factory=list)
 class SessionNotificationsRequest(BaseModel):
@@ -97,6 +109,13 @@ class SessionNotificationsRequest(BaseModel):
     destinations: list[str]
 class HealthResponse(BaseModel):
     """Health check response."""

     arguments: dict[str, Any] = {}
+class SessionAutoApprovalInfo(BaseModel):
+    """Per-session auto-approval budget state.
+
+    The defaults describe a disabled policy with no cost cap, no recorded
+    spend and no remaining amount.
+    """
+    enabled: bool = False
+    cost_cap_usd: float | None = None
+    estimated_spend_usd: float = 0.0
+    remaining_usd: float | None = None
 class SessionInfo(BaseModel):
     """Session metadata."""
     model: str | None = None
     title: str | None = None
     notification_destinations: list[str] = Field(default_factory=list)
+    auto_approval: SessionAutoApprovalInfo = Field(
+        default_factory=SessionAutoApprovalInfo
+    )
 class SessionNotificationsRequest(BaseModel):
     destinations: list[str]
+class SessionYoloRequest(BaseModel):
+    """Update a session's auto-approval policy.
+
+    ``enabled`` is required. ``cost_cap_usd`` is optional and must be
+    non-negative when given.
+    """
+    enabled: bool
+    cost_cap_usd: float | None = Field(default=None, ge=0)
 class HealthResponse(BaseModel):
     """Health check response."""

backend/routes/agent.py CHANGED Viewed

@@ -26,6 +26,7 @@ from models import (
     SessionInfo,
     SessionNotificationsRequest,
     SessionResponse,
     SubmitRequest,
     TruncateRequest,
 )
@@ -498,6 +499,26 @@ async def set_session_notifications(
     }
 @router.get("/user/quota")
 async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
     """Return the user's plan tier and today's premium-model quota state."""

     SessionInfo,
     SessionNotificationsRequest,
     SessionResponse,
+    SessionYoloRequest,
     SubmitRequest,
     TruncateRequest,
 )
     }
+@router.patch("/session/{session_id}/yolo")
+async def set_session_yolo(
+    session_id: str,
+    body: SessionYoloRequest,
+    user: dict = Depends(get_current_user),
+) -> dict:
+    """Update the session-scoped auto-approval policy.
+
+    Checks the caller's access to the session, then forwards the request to
+    ``session_manager.update_session_auto_approval``. A ``ValueError`` from
+    the manager is returned as HTTP 400. The response is the session id
+    merged with the summary dict the manager returns.
+    """
+    await _check_session_access(session_id, user)
+    try:
+        summary = await session_manager.update_session_auto_approval(
+            session_id,
+            enabled=body.enabled,
+            cost_cap_usd=body.cost_cap_usd,
+            # True only when the request body explicitly included the field,
+            # so an omitted cap can be told apart from an explicit null.
+            cap_provided="cost_cap_usd" in body.model_fields_set,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    return {"session_id": session_id, **summary}
 @router.get("/user/quota")
 async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
     """Return the user's plan tier and today's premium-model quota state."""

backend/session_manager.py CHANGED Viewed

@@ -116,6 +116,7 @@ class SessionCapacityError(Exception):
 # and per-request overhead.
 MAX_SESSIONS: int = 200
 MAX_SESSIONS_PER_USER: int = 10
 class SessionManager:
@@ -297,6 +298,20 @@ class SessionManager:
             return "ended"
         return "idle"
     async def _start_agent_session(
         self,
         *,
@@ -370,6 +385,20 @@ class SessionManager:
                 notification_destinations=list(
                     agent_session.session.notification_destinations
                 ),
             )
         except Exception as e:
             logger.warning(
@@ -451,6 +480,14 @@ class SessionManager:
         self._restore_pending_approval(session, meta.get("pending_approval") or [])
         session.turn_count = int(meta.get("turn_count") or 0)
         created_at = meta.get("created_at")
         if not isinstance(created_at, datetime):
@@ -883,6 +920,43 @@ class SessionManager:
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
         return True
     def get_session_owner(self, session_id: str) -> str | None:
         """Get the user_id that owns a session, or None if session doesn't exist."""
         agent_session = self.sessions.get(session_id)
@@ -925,6 +999,7 @@ class SessionManager:
             "notification_destinations": list(
                 agent_session.session.notification_destinations
             ),
         }
     def set_notification_destinations(
@@ -991,6 +1066,25 @@ class SessionManager:
                         "model": row.get("model"),
                         "title": row.get("title"),
                         "notification_destinations": row.get("notification_destinations") or [],
                     }
                 )
             return results

 # and per-request overhead.
 MAX_SESSIONS: int = 200
 MAX_SESSIONS_PER_USER: int = 10
+DEFAULT_YOLO_COST_CAP_USD: float = 5.0
 class SessionManager:
             return "ended"
         return "idle"
+    @staticmethod
+    def _auto_approval_summary(session: Session) -> dict[str, Any]:
+        """Return the auto-approval summary dict for ``session``.
+
+        Uses ``session.auto_approval_policy_summary()`` when the session has
+        that method. Otherwise builds the dict from the session's
+        ``auto_approval_*`` attributes, falling back to a disabled policy with
+        no cap and zero spend for any that are missing.
+        """
+        if hasattr(session, "auto_approval_policy_summary"):
+            return session.auto_approval_policy_summary()
+        # Fallback for session objects without the summary method.
+        cap = getattr(session, "auto_approval_cost_cap_usd", None)
+        estimated = float(getattr(session, "auto_approval_estimated_spend_usd", 0.0) or 0.0)
+        remaining = None if cap is None else round(max(0.0, float(cap) - estimated), 4)
+        return {
+            "enabled": bool(getattr(session, "auto_approval_enabled", False)),
+            "cost_cap_usd": cap,
+            "estimated_spend_usd": round(estimated, 4),
+            "remaining_usd": remaining,
+        }
     async def _start_agent_session(
         self,
         *,
                 notification_destinations=list(
                     agent_session.session.notification_destinations
                 ),
+                auto_approval_enabled=bool(
+                    getattr(agent_session.session, "auto_approval_enabled", False)
+                ),
+                auto_approval_cost_cap_usd=getattr(
+                    agent_session.session, "auto_approval_cost_cap_usd", None
+                ),
+                auto_approval_estimated_spend_usd=float(
+                    getattr(
+                        agent_session.session,
+                        "auto_approval_estimated_spend_usd",
+                        0.0,
+                    )
+                    or 0.0
+                ),
             )
         except Exception as e:
             logger.warning(
         self._restore_pending_approval(session, meta.get("pending_approval") or [])
         session.turn_count = int(meta.get("turn_count") or 0)
+        session.auto_approval_enabled = bool(meta.get("auto_approval_enabled", False))
+        raw_cap = meta.get("auto_approval_cost_cap_usd")
+        session.auto_approval_cost_cap_usd = (
+            float(raw_cap) if isinstance(raw_cap, int | float) else None
+        )
+        session.auto_approval_estimated_spend_usd = float(
+            meta.get("auto_approval_estimated_spend_usd") or 0.0
+        )
         created_at = meta.get("created_at")
         if not isinstance(created_at, datetime):
         await self.persist_session_snapshot(agent_session, runtime_state="idle")
         return True
+    async def update_session_auto_approval(
+        self,
+        session_id: str,
+        *,
+        enabled: bool,
+        cost_cap_usd: float | None,
+        cap_provided: bool = False,
+    ) -> dict[str, Any]:
+        """Apply a new auto-approval policy to a session and persist it.
+
+        Cap resolution:
+          * Enabling with no cap supplied (``cap_provided`` false and
+            ``cost_cap_usd`` ``None``): keep the session's existing cap, or
+            use ``DEFAULT_YOLO_COST_CAP_USD`` if it has none.
+          * Enabling with an explicit ``None`` cap: use
+            ``DEFAULT_YOLO_COST_CAP_USD``.
+          * Enabling with a numeric cap: use it as given.
+          * Disabling without ``cap_provided``: keep the session's existing cap.
+          * Disabling with ``cap_provided``: store the given cap, which may be
+            ``None``.
+
+        Args:
+            session_id: Key into ``self.sessions``.
+            enabled: Whether auto-approval should be on.
+            cost_cap_usd: Requested cap, or ``None``.
+            cap_provided: Whether the caller explicitly supplied a cap.
+
+        Returns:
+            The summary dict from ``_auto_approval_summary``.
+
+        Raises:
+            ValueError: If the session is unknown or not active.
+        """
+        agent_session = self.sessions.get(session_id)
+        if not agent_session or not agent_session.is_active:
+            raise ValueError("Session not found or inactive")
+        session = agent_session.session
+        if enabled:
+            if not cap_provided and cost_cap_usd is None:
+                cost_cap_usd = getattr(
+                    session, "auto_approval_cost_cap_usd", None
+                )
+                if cost_cap_usd is None:
+                    cost_cap_usd = DEFAULT_YOLO_COST_CAP_USD
+            elif cost_cap_usd is None:
+                cost_cap_usd = DEFAULT_YOLO_COST_CAP_USD
+        else:
+            if not cap_provided:
+                cost_cap_usd = getattr(session, "auto_approval_cost_cap_usd", None)
+        if hasattr(session, "set_auto_approval_policy"):
+            session.set_auto_approval_policy(
+                enabled=enabled,
+                cost_cap_usd=cost_cap_usd,
+            )
+        else:
+            # Session objects without the setter get the attributes assigned directly.
+            session.auto_approval_enabled = bool(enabled)
+            session.auto_approval_cost_cap_usd = cost_cap_usd
+        await self.persist_session_snapshot(agent_session)
+        return self._auto_approval_summary(session)
     def get_session_owner(self, session_id: str) -> str | None:
         """Get the user_id that owns a session, or None if session doesn't exist."""
         agent_session = self.sessions.get(session_id)
             "notification_destinations": list(
                 agent_session.session.notification_destinations
             ),
+            "auto_approval": self._auto_approval_summary(agent_session.session),
         }
     def set_notification_destinations(
                         "model": row.get("model"),
                         "title": row.get("title"),
                         "notification_destinations": row.get("notification_destinations") or [],
+                        "auto_approval": {
+                            "enabled": bool(row.get("auto_approval_enabled", False)),
+                            "cost_cap_usd": row.get("auto_approval_cost_cap_usd"),
+                            "estimated_spend_usd": float(
+                                row.get("auto_approval_estimated_spend_usd") or 0.0
+                            ),
+                            "remaining_usd": (
+                                None
+                                if row.get("auto_approval_cost_cap_usd") is None
+                                else round(
+                                    max(
+                                        0.0,
+                                        float(row.get("auto_approval_cost_cap_usd") or 0.0)
+                                        - float(row.get("auto_approval_estimated_spend_usd") or 0.0),
+                                    ),
+                                    4,
+                                )
+                            ),
+                        },
                     }
                 )
             return results

frontend/src/components/Chat/ToolCallGroup.tsx CHANGED Viewed

@@ -1,5 +1,5 @@
 import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
-import { Box, Stack, Typography, Chip, Button, TextField, IconButton, Link, CircularProgress } from '@mui/material';
 import CheckCircleOutlineIcon from '@mui/icons-material/CheckCircleOutline';
 import ErrorOutlineIcon from '@mui/icons-material/ErrorOutline';
 import OpenInNewIcon from '@mui/icons-material/OpenInNew';
@@ -502,6 +502,7 @@ function InlineApproval({
 }) {
   const [feedback, setFeedback] = useState('');
   const args = input as Record<string, unknown> | undefined;
   const { setPanel, getEditedScript } = useAgentStore();
   const { setRightPanelOpen, setLeftSidebarOpen } = useLayoutStore();
   const hasEditedScript = !!getEditedScript(toolCallId);
@@ -521,6 +522,24 @@ function InlineApproval({
   return (
     <Box sx={{ px: 1.5, py: 1.5, borderTop: '1px solid var(--tool-border)' }}>
       {toolName === 'sandbox_create' && args && (() => {
         const hw = String(args.hardware || 'cpu-basic');
         const cost = costLabel(hw);

 import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { Alert, Box, Stack, Typography, Chip, Button, TextField, IconButton, Link, CircularProgress } from '@mui/material';
 import CheckCircleOutlineIcon from '@mui/icons-material/CheckCircleOutline';
 import ErrorOutlineIcon from '@mui/icons-material/ErrorOutline';
 import OpenInNewIcon from '@mui/icons-material/OpenInNew';
 }) {
   const [feedback, setFeedback] = useState('');
   const args = input as Record<string, unknown> | undefined;
+  const autoApproval = useAgentStore((state) => state.budgetBlocks[toolCallId]);
   const { setPanel, getEditedScript } = useAgentStore();
   const { setRightPanelOpen, setLeftSidebarOpen } = useLayoutStore();
   const hasEditedScript = !!getEditedScript(toolCallId);
   return (
     <Box sx={{ px: 1.5, py: 1.5, borderTop: '1px solid var(--tool-border)' }}>
+      {autoApproval && (
+        <Alert
+          severity="warning"
+          sx={{
+            mb: 1.5,
+            py: 0.5,
+            bgcolor: 'rgba(245,158,11,0.08)',
+            border: '1px solid rgba(245,158,11,0.18)',
+            color: 'var(--text)',
+            '& .MuiAlert-icon': { color: 'var(--accent-yellow)' },
+          }}
+        >
+          <Typography variant="body2" sx={{ fontSize: '0.72rem' }}>
+            YOLO paused: {autoApproval.reason || 'manual approval required.'}
+          </Typography>
+        </Alert>
+      )}
       {toolName === 'sandbox_create' && args && (() => {
         const hw = String(args.hardware || 'cpu-basic');
         const cost = costLabel(hw);

frontend/src/components/Layout/AppLayout.tsx CHANGED Viewed

@@ -24,6 +24,7 @@ import SessionSidebar from '@/components/SessionSidebar/SessionSidebar';
 import SessionChat from '@/components/SessionChat';
 import CodePanel from '@/components/CodePanel/CodePanel';
 import WelcomeScreen from '@/components/WelcomeScreen/WelcomeScreen';
 import { apiFetch } from '@/utils/api';
 const DRAWER_WIDTH = 260;
@@ -252,6 +253,7 @@ export default function AppLayout() {
           </Box>
           <Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
             <IconButton
               onClick={toggleTheme}
               size="small"

 import SessionChat from '@/components/SessionChat';
 import CodePanel from '@/components/CodePanel/CodePanel';
 import WelcomeScreen from '@/components/WelcomeScreen/WelcomeScreen';
+import YoloControl from '@/components/YoloControl';
 import { apiFetch } from '@/utils/api';
 const DRAWER_WIDTH = 260;
           </Box>
           <Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
+            <YoloControl />
             <IconButton
               onClick={toggleTheme}
               size="small"

frontend/src/components/YoloControl.tsx ADDED Viewed

	@@ -0,0 +1,155 @@

+import { useEffect, useMemo, useState } from 'react';
+import {
+  Button,
+  Dialog,
+  DialogActions,
+  DialogContent,
+  DialogTitle,
+  TextField,
+  Tooltip,
+  Typography,
+} from '@mui/material';
+import BoltOutlinedIcon from '@mui/icons-material/BoltOutlined';
+import { useSessionStore } from '@/store/sessionStore';
+import { apiFetch } from '@/utils/api';
+const DEFAULT_CAP_USD = 5;
+/**
+ * Format a dollar amount for display.
+ *
+ * null/undefined render as 'uncapped'. Values of 100 or more are shown with
+ * no decimals; smaller values are shown with two decimals, with a trailing
+ * ".00" dropped.
+ */
+function money(value: number | null | undefined): string {
+  if (value === null || value === undefined) return 'uncapped';
+  if (value >= 100) return `$${value.toFixed(0)}`;
+  return `$${value.toFixed(2).replace(/\.00$/, '')}`;
+}
+/**
+ * Toggle button and budget dialog for the active session's auto-approval
+ * ("YOLO") policy. State is read from the session store and changed via
+ * PATCH /api/session/{id}/yolo.
+ */
+export default function YoloControl() {
+  const { sessions, activeSessionId, updateSessionYolo } = useSessionStore();
+  const activeSession = useMemo(
+    () => sessions.find((s) => s.id === activeSessionId) || null,
+    [sessions, activeSessionId],
+  );
+  const [dialogOpen, setDialogOpen] = useState(false);
+  const [capInput, setCapInput] = useState(String(DEFAULT_CAP_USD));
+  const [busy, setBusy] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+  const enabled = Boolean(activeSession?.autoApprovalEnabled);
+  const disabled = !activeSessionId || activeSession?.expired || busy;
+  const remaining = activeSession?.autoApprovalRemainingUsd ?? null;
+  const cap = activeSession?.autoApprovalCostCapUsd ?? null;
+  // Re-seed the cap field when the active session or its stored cap changes.
+  useEffect(() => {
+    if (!activeSession) return;
+    setCapInput(String(activeSession.autoApprovalCostCapUsd ?? DEFAULT_CAP_USD));
+  }, [activeSession?.id, activeSession?.autoApprovalCostCapUsd]); // eslint-disable-line react-hooks/exhaustive-deps
+  // Send the policy to the backend; returns the response payload, or null on failure.
+  async function patchPolicy(nextEnabled: boolean, nextCap?: number) {
+    if (!activeSessionId) return null;
+    setBusy(true);
+    setError(null);
+    try {
+      const body: Record<string, unknown> = { enabled: nextEnabled };
+      // Only include the cap when the caller supplied one.
+      if (nextCap !== undefined) body.cost_cap_usd = nextCap;
+      const response = await apiFetch(`/api/session/${activeSessionId}/yolo`, {
+        method: 'PATCH',
+        body: JSON.stringify(body),
+      });
+      if (!response.ok) {
+        throw new Error(await response.text());
+      }
+      const data = await response.json();
+      updateSessionYolo(activeSessionId, data);
+      return data;
+    } catch {
+      setError('Could not update YOLO settings.');
+      return null;
+    } finally {
+      setBusy(false);
+    }
+  }
+  // Disable when on; when off, enable with the current (or default) cap and open the dialog.
+  const handleToggle = async () => {
+    if (disabled) return;
+    if (enabled) {
+      await patchPolicy(false);
+      return;
+    }
+    const nextCap = cap ?? DEFAULT_CAP_USD;
+    const updated = await patchPolicy(true, nextCap);
+    if (updated) {
+      setCapInput(String(updated.cost_cap_usd ?? nextCap));
+      setDialogOpen(true);
+    }
+  };
+  const handleSaveCap = async () => {
+    // Number('') is 0, so a cleared field must be rejected explicitly;
+    // otherwise Save would silently store a $0 cap.
+    const trimmed = capInput.trim();
+    const parsed = Number(trimmed);
+    if (trimmed === '' || !Number.isFinite(parsed) || parsed < 0) {
+      setError('Enter a non-negative dollar amount.');
+      return;
+    }
+    const updated = await patchPolicy(true, parsed);
+    if (updated) setDialogOpen(false);
+  };
+  return (
+    <>
+      <Tooltip title={enabled ? 'Disable session YOLO auto-approval' : 'Enable session YOLO auto-approval'}>
+        <span>
+          <Button
+            size="small"
+            variant={enabled ? 'contained' : 'outlined'}
+            disabled={disabled}
+            onClick={handleToggle}
+            startIcon={<BoltOutlinedIcon sx={{ fontSize: 16 }} />}
+            sx={{
+              minWidth: { xs: 74, md: 116 },
+              height: 32,
+              px: { xs: 1, md: 1.25 },
+              borderRadius: '8px',
+              textTransform: 'none',
+              fontSize: '0.72rem',
+              whiteSpace: 'nowrap',
+              bgcolor: enabled ? 'var(--accent-yellow)' : 'transparent',
+              color: enabled ? '#111' : 'text.secondary',
+              borderColor: enabled ? 'var(--accent-yellow)' : 'divider',
+              '&:hover': {
+                bgcolor: enabled ? 'var(--accent-yellow)' : 'action.hover',
+                borderColor: 'var(--accent-yellow)',
+              },
+            }}
+          >
+            {enabled ? `YOLO ${money(remaining)}` : 'YOLO'}
+          </Button>
+        </span>
+      </Tooltip>
+      <Dialog open={dialogOpen} onClose={() => setDialogOpen(false)} maxWidth="xs" fullWidth>
+        <DialogTitle sx={{ pb: 1 }}>YOLO Budget</DialogTitle>
+        <DialogContent sx={{ display: 'flex', flexDirection: 'column', gap: 1.5, pt: 1 }}>
+          <Typography variant="body2" color="text.secondary">
+            Auto-approval is active for this session. Scheduled HF jobs still require approval.
+          </Typography>
+          <TextField
+            autoFocus
+            label="Session cap (USD)"
+            type="number"
+            size="small"
+            value={capInput}
+            onChange={(e) => setCapInput(e.target.value)}
+            inputProps={{ min: 0, step: 0.5 }}
+            error={Boolean(error)}
+            helperText={error || `Estimated spend: ${money(activeSession?.autoApprovalEstimatedSpendUsd ?? 0)} of ${money(cap)}`}
+          />
+        </DialogContent>
+        <DialogActions>
+          <Button onClick={() => setDialogOpen(false)} sx={{ textTransform: 'none' }}>
+            Close
+          </Button>
+          <Button onClick={handleSaveCap} disabled={busy} variant="contained" sx={{ textTransform: 'none' }}>
+            Save
+          </Button>
+        </DialogActions>
+      </Dialog>
+    </>
+  );
+}

frontend/src/hooks/useAgentChat.ts CHANGED Viewed

@@ -36,7 +36,7 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
   const isActiveRef = useRef(isActive);
   isActiveRef.current = isActive;
-  const { setNeedsAttention } = useSessionStore();
   // Helper: update this session's state (mirrors to globals if active)
   const updateSession = useAgentStore.getState().updateSession;
@@ -186,6 +186,20 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
         if (!tools.length) return;
         setNeedsAttention(sessionId, true);
         updateSession(sessionId, { activityStatus: { type: 'waiting-approval' } });
         // Build panel data for this session's pending approval
@@ -480,6 +494,9 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
             );
             if (pendingIds.size > 0) setNeedsAttention(sessionId, true);
           }
           return { data, pendingIds, info };
         }
         return { data, pendingIds, info: null };
@@ -562,7 +579,15 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
             return true;
           } else if (et === 'approval_required') {
             sideChannel.onApprovalRequired(
-              (event.data?.tools || []) as Array<{ tool: string; arguments: Record<string, unknown>; tool_call_id: string }>,
             );
             stopReconnect();
             const result = await hydrateMessages();

   const isActiveRef = useRef(isActive);
   isActiveRef.current = isActive;
+  const { setNeedsAttention, updateSessionYolo } = useSessionStore();
   // Helper: update this session's state (mirrors to globals if active)
   const updateSession = useAgentStore.getState().updateSession;
         if (!tools.length) return;
         setNeedsAttention(sessionId, true);
+        const store = useAgentStore.getState();
+        for (const tool of tools) {
+          store.setToolBudgetBlock(
+            tool.tool_call_id,
+            tool.auto_approval_blocked
+              ? {
+                  reason: tool.block_reason ?? null,
+                  estimatedCostUsd: tool.estimated_cost_usd ?? null,
+                  remainingCapUsd: tool.remaining_cap_usd ?? null,
+                }
+              : null,
+          );
+        }
         updateSession(sessionId, { activityStatus: { type: 'waiting-approval' } });
         // Build panel data for this session's pending approval
             );
             if (pendingIds.size > 0) setNeedsAttention(sessionId, true);
           }
+          if (info.auto_approval) {
+            updateSessionYolo(sessionId, info.auto_approval);
+          }
           return { data, pendingIds, info };
         }
         return { data, pendingIds, info: null };
             return true;
           } else if (et === 'approval_required') {
             sideChannel.onApprovalRequired(
+              (event.data?.tools || []) as Array<{
+                tool: string;
+                arguments: Record<string, unknown>;
+                tool_call_id: string;
+                auto_approval_blocked?: boolean;
+                block_reason?: string | null;
+                estimated_cost_usd?: number | null;
+                remaining_cap_usd?: number | null;
+              }>,
             );
             stopReconnect();
             const result = await hydrateMessages();

frontend/src/lib/sse-chat-transport.ts CHANGED Viewed

@@ -26,7 +26,15 @@ export interface SideChannelCallbacks {
   onToolLog: (tool: string, log: string, agentId?: string, label?: string) => void;
   onConnectionChange: (connected: boolean) => void;
   onSessionDead: (sessionId: string) => void;
-  onApprovalRequired: (tools: Array<{ tool: string; arguments: Record<string, unknown>; tool_call_id: string }>) => void;
   onToolCallPanel: (tool: string, args: Record<string, unknown>) => void;
   onToolOutputPanel: (tool: string, toolCallId: string, output: string, success: boolean) => void;
   onStreaming: () => void;
@@ -236,6 +244,10 @@ function createEventToChunkStream(sideChannel: SideChannelCallbacks): TransformS
             tool: string;
             arguments: Record<string, unknown>;
             tool_call_id: string;
           }>;
           if (!tools) break;

   onToolLog: (tool: string, log: string, agentId?: string, label?: string) => void;
   onConnectionChange: (connected: boolean) => void;
   onSessionDead: (sessionId: string) => void;
+  onApprovalRequired: (tools: Array<{
+    tool: string;
+    arguments: Record<string, unknown>;
+    tool_call_id: string;
+    auto_approval_blocked?: boolean;
+    block_reason?: string | null;
+    estimated_cost_usd?: number | null;
+    remaining_cap_usd?: number | null;
+  }>) => void;
   onToolCallPanel: (tool: string, args: Record<string, unknown>) => void;
   onToolOutputPanel: (tool: string, toolCallId: string, output: string, success: boolean) => void;
   onStreaming: () => void;
             tool: string;
             arguments: Record<string, unknown>;
             tool_call_id: string;
+            auto_approval_blocked?: boolean;
+            block_reason?: string | null;
+            estimated_cost_usd?: number | null;
+            remaining_cap_usd?: number | null;
           }>;
           if (!tools) break;

frontend/src/store/agentStore.ts CHANGED Viewed

@@ -50,6 +50,12 @@ export interface JobsUpgradeState {
   namespace?: string | null;
 }
 export type ActivityStatus =
   | { type: 'idle' }
   | { type: 'thinking' }
@@ -145,6 +151,9 @@ interface AgentStore {
   // Tool rejected states (tool_call_id -> true if rejected by user) - persisted across renders
   rejectedTools: Record<string, boolean>;
   // ── Per-session actions ─────────────────────────────────────────────
   /** Update a session's state. If it's the active session, also update flat state. */
@@ -196,6 +205,9 @@ interface AgentStore {
   setToolRejected: (toolCallId: string, isRejected: boolean) => void;
   getToolRejected: (toolCallId: string) => boolean | undefined;
 }
 /**
@@ -300,6 +312,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
   trackioDashboards: loadTrackioDashboards(),
   toolErrors: loadToolErrors(),
   rejectedTools: loadRejectedTools(),
   // ── Per-session state management ──────────────────────────────────
@@ -529,4 +542,24 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
   },
   getToolRejected: (toolCallId) => get().rejectedTools[toolCallId],
 }));

   namespace?: string | null;
 }
+export interface ToolBudgetBlockState {
   // Display metadata attached to a tool call whose auto-approval was blocked.
   // Every field is optional and may be null.
   // Text explaining why auto-approval was blocked.
+  reason?: string | null;
   // Estimated cost of the tool call in USD.
+  estimatedCostUsd?: number | null;
   // Remaining cap in USD reported alongside the block.
+  remainingCapUsd?: number | null;
+}
 export type ActivityStatus =
   | { type: 'idle' }
   | { type: 'thinking' }
   // Tool rejected states (tool_call_id -> true if rejected by user) - persisted across renders
   rejectedTools: Record<string, boolean>;
+  // Tool budget-block metadata (tool_call_id -> display metadata) - transient UI state
+  budgetBlocks: Record<string, ToolBudgetBlockState>;
   // ── Per-session actions ─────────────────────────────────────────────
   /** Update a session's state. If it's the active session, also update flat state. */
   setToolRejected: (toolCallId: string, isRejected: boolean) => void;
   getToolRejected: (toolCallId: string) => boolean | undefined;
+  setToolBudgetBlock: (toolCallId: string, block: ToolBudgetBlockState | null) => void;
+  getToolBudgetBlock: (toolCallId: string) => ToolBudgetBlockState | undefined;
 }
 /**
   trackioDashboards: loadTrackioDashboards(),
   toolErrors: loadToolErrors(),
   rejectedTools: loadRejectedTools(),
+  budgetBlocks: {},
   // ── Per-session state management ──────────────────────────────────
   },
   getToolRejected: (toolCallId) => get().rejectedTools[toolCallId],
+  // ── Tool Budget Blocks ───────────────────────────────────────────────
+  setToolBudgetBlock: (toolCallId, block) => {
     // Store or clear the budget-block metadata for one tool call.
     // A falsy `block` removes the entry; otherwise the entry is replaced.
     // Both paths build a new `budgetBlocks` object instead of mutating state.
+    set((state) => {
+      if (!block) {
         // Copy first so the delete never touches the current state object.
+        const next = { ...state.budgetBlocks };
+        delete next[toolCallId];
+        return { budgetBlocks: next };
+      }
+      return {
+        budgetBlocks: {
+          ...state.budgetBlocks,
+          [toolCallId]: block,
+        },
+      };
+    });
+  },
+  getToolBudgetBlock: (toolCallId) => get().budgetBlocks[toolCallId],
 }));

frontend/src/store/sessionStore.ts CHANGED Viewed

@@ -27,7 +27,19 @@ interface SessionStore {
     created_at: string;
     is_active?: boolean;
     pending_approval?: unknown[] | null;
   }>) => void;
   /** Atomically swap a session's id in the list + both localStorage caches.
    *  Used when we rehydrate an expired session into a freshly-created backend
    *  session — preserves title, timestamps, and messages. */
@@ -47,6 +59,10 @@ export const useSessionStore = create<SessionStore>()(
           createdAt: new Date().toISOString(),
           isActive: true,
           needsAttention: false,
         };
         set((state) => ({
           sessions: [...state.sessions, newSession],
@@ -93,12 +109,21 @@ export const useSessionStore = create<SessionStore>()(
             if (!id) continue;
             const existing = byId.get(id);
             if (existing) {
               const updated = {
                 ...existing,
                 title: server.title || existing.title,
                 isActive: server.is_active ?? existing.isActive,
                 needsAttention: Boolean(server.pending_approval?.length) || existing.needsAttention,
                 expired: false,
               };
               const idx = merged.findIndex((s) => s.id === id);
               if (idx >= 0) merged[idx] = updated;
@@ -112,6 +137,10 @@ export const useSessionStore = create<SessionStore>()(
               isActive: server.is_active ?? true,
               needsAttention: Boolean(server.pending_approval?.length),
               expired: false,
             };
             merged.push(newSession);
             byId.set(id, newSession);
@@ -123,6 +152,22 @@ export const useSessionStore = create<SessionStore>()(
         });
       },
       renameSession: (oldId: string, newId: string) => {
         if (oldId === newId) return;
         moveMessages(oldId, newId);

     created_at: string;
     is_active?: boolean;
     pending_approval?: unknown[] | null;
+    auto_approval?: {
+      enabled?: boolean;
+      cost_cap_usd?: number | null;
+      estimated_spend_usd?: number;
+      remaining_usd?: number | null;
+    } | null;
   }>) => void;
+  updateSessionYolo: (id: string, policy: {
+    enabled: boolean;
+    cost_cap_usd?: number | null;
+    estimated_spend_usd?: number;
+    remaining_usd?: number | null;
+  }) => void;
   /** Atomically swap a session's id in the list + both localStorage caches.
    *  Used when we rehydrate an expired session into a freshly-created backend
    *  session — preserves title, timestamps, and messages. */
           createdAt: new Date().toISOString(),
           isActive: true,
           needsAttention: false,
+          autoApprovalEnabled: false,
+          autoApprovalCostCapUsd: null,
+          autoApprovalEstimatedSpendUsd: 0,
+          autoApprovalRemainingUsd: null,
         };
         set((state) => ({
           sessions: [...state.sessions, newSession],
             if (!id) continue;
             const existing = byId.get(id);
             if (existing) {
+              const auto = server.auto_approval;
               const updated = {
                 ...existing,
                 title: server.title || existing.title,
                 isActive: server.is_active ?? existing.isActive,
                 needsAttention: Boolean(server.pending_approval?.length) || existing.needsAttention,
                 expired: false,
+                ...(auto
+                  ? {
+                      autoApprovalEnabled: Boolean(auto.enabled),
+                      autoApprovalCostCapUsd: auto.cost_cap_usd ?? null,
+                      autoApprovalEstimatedSpendUsd: auto.estimated_spend_usd ?? 0,
+                      autoApprovalRemainingUsd: auto.remaining_usd ?? null,
+                    }
+                  : {}),
               };
               const idx = merged.findIndex((s) => s.id === id);
               if (idx >= 0) merged[idx] = updated;
               isActive: server.is_active ?? true,
               needsAttention: Boolean(server.pending_approval?.length),
               expired: false,
+              autoApprovalEnabled: Boolean(server.auto_approval?.enabled),
+              autoApprovalCostCapUsd: server.auto_approval?.cost_cap_usd ?? null,
+              autoApprovalEstimatedSpendUsd: server.auto_approval?.estimated_spend_usd ?? 0,
+              autoApprovalRemainingUsd: server.auto_approval?.remaining_usd ?? null,
             };
             merged.push(newSession);
             byId.set(id, newSession);
         });
       },
+      updateSessionYolo: (id, policy) => {
         // Overwrite the auto-approval fields of the session whose id matches.
         // Other sessions are returned unchanged.
         // A missing cap or remaining value becomes null; a missing spend becomes 0.
+        set((state) => ({
+          sessions: state.sessions.map((s) =>
+            s.id === id
+              ? {
+                  ...s,
+                  autoApprovalEnabled: policy.enabled,
+                  autoApprovalCostCapUsd: policy.cost_cap_usd ?? null,
+                  autoApprovalEstimatedSpendUsd: policy.estimated_spend_usd ?? 0,
+                  autoApprovalRemainingUsd: policy.remaining_usd ?? null,
+                }
+              : s,
+          ),
+        }));
+      },
       renameSession: (oldId: string, newId: string) => {
         if (oldId === newId) return;
         moveMessages(oldId, newId);

frontend/src/types/agent.ts CHANGED Viewed

@@ -21,6 +21,10 @@ export interface SessionMeta {
    *  disables input until the user chooses to restore-with-summary or
    *  start fresh. */
   expired?: boolean;
 }
 export interface ToolApproval {

    *  disables input until the user chooses to restore-with-summary or
    *  start fresh. */
   expired?: boolean;
+  autoApprovalEnabled?: boolean;
+  autoApprovalCostCapUsd?: number | null;
+  autoApprovalEstimatedSpendUsd?: number;
+  autoApprovalRemainingUsd?: number | null;
 }
 export interface ToolApproval {

frontend/src/types/events.ts CHANGED Viewed

@@ -68,6 +68,10 @@ export interface ApprovalToolItem {
   tool: string;
   arguments: Record<string, unknown>;
   tool_call_id: string;
 }
 export interface TurnCompleteEventData {

   tool: string;
   arguments: Record<string, unknown>;
   tool_call_id: string;
+  auto_approval_blocked?: boolean;
+  block_reason?: string | null;
+  estimated_cost_usd?: number | null;
+  remaining_cap_usd?: number | null;
 }
 export interface TurnCompleteEventData {

tests/unit/test_agent_model_gating.py CHANGED Viewed

@@ -127,3 +127,48 @@ async def test_user_quota_response_uses_premium_fields_only(monkeypatch):
         "premium_daily_cap": 5,
         "premium_remaining": 3,
     }

         "premium_daily_cap": 5,
         "premium_remaining": 3,
     }
+@pytest.mark.asyncio
+async def test_set_session_yolo_calls_manager_with_cap_presence(monkeypatch):
     # Calls `agent.set_session_yolo` with an explicit cap and checks that the
     # session manager receives `cap_provided=True` along with the cap value.
     # Both the access check and the manager method are replaced with fakes.
+    async def fake_check_session_access(session_id, user, request=None):
+        assert session_id == "s1"
+        assert user["user_id"] == "u1"
+        return object()
     # Records every (session_id, kwargs) pair passed to the fake manager method.
+    calls = []
+    async def fake_update_session_auto_approval(session_id, **kwargs):
+        calls.append((session_id, kwargs))
+        return {
+            "enabled": kwargs["enabled"],
+            "cost_cap_usd": 7.5,
+            "estimated_spend_usd": 0.0,
+            "remaining_usd": 7.5,
+        }
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    monkeypatch.setattr(
+        agent.session_manager,
+        "update_session_auto_approval",
+        fake_update_session_auto_approval,
+    )
+    response = await agent.set_session_yolo(
+        "s1",
+        agent.SessionYoloRequest(enabled=True, cost_cap_usd=7.5),
+        {"user_id": "u1"},
+    )
     # The response carries the values returned by the fake manager method.
+    assert response["enabled"] is True
+    assert response["remaining_usd"] == 7.5
     # Exactly one manager call, with the cap flagged as explicitly provided.
+    assert calls == [
+        (
+            "s1",
+            {
+                "enabled": True,
+                "cost_cap_usd": 7.5,
+                "cap_provided": True,
+            },
+        )
+    ]

tests/unit/test_auto_approval_policy.py ADDED Viewed

	@@ -0,0 +1,185 @@

+from types import SimpleNamespace
+import pytest
+from agent.config import Config
+from agent.core import agent_loop
+from agent.core.cost_estimation import CostEstimate
+def _config(**overrides):
     # Build a validated Config from fixed test defaults.
     # `overrides` is unpacked last, so it replaces any default with the same key.
+    data = {
+        "model_name": "moonshotai/Kimi-K2.6",
+        "confirm_cpu_jobs": True,
+        "auto_file_upload": False,
+        "yolo_mode": False,
+        **overrides,
+    }
+    return Config.model_validate(data)
+def _session(*, cap=5.0, spent=0.0, enabled=True):
     # Lightweight session stand-in exposing only the attributes set below:
     # a default config, the three auto-approval fields, and `sandbox=None`.
     # Defaults: auto-approval enabled, cap 5.0, nothing spent.
+    return SimpleNamespace(
+        config=_config(),
+        auto_approval_enabled=enabled,
+        auto_approval_cost_cap_usd=cap,
+        auto_approval_estimated_spend_usd=spent,
+        sandbox=None,
+    )
+@pytest.mark.asyncio
+async def test_session_yolo_auto_approves_non_costed_approval_tool():
     # With the default session (auto-approval enabled, cap 5.0), an
     # `hf_repo_files` upload must come back auto-approved with no approval needed.
     # No cost estimator is stubbed in this test.
+    decision = await agent_loop._approval_decision(
+        "hf_repo_files",
+        {"operation": "upload", "path": "README.md"},
+        _session(),
+    )
+    assert decision.requires_approval is False
+    assert decision.auto_approved is True
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "operation",
+    ["scheduled run", "scheduled uv", "scheduled  run"],
+)
+async def test_scheduled_hf_jobs_always_require_manual_approval(operation):
     # Scheduled `hf_jobs` operations must still require approval with both
     # session auto-approval and `config.yolo_mode` switched on.
     # The third parameter contains two spaces between the words on purpose.
+    session = _session()
+    session.config.yolo_mode = True
+    decision = await agent_loop._approval_decision(
+        "hf_jobs",
+        {"operation": operation, "script": "print(1)"},
+        session,
+    )
+    assert decision.requires_approval is True
+    assert decision.auto_approval_blocked is True
+    assert "Scheduled HF jobs" in decision.block_reason
     # `_needs_approval` must agree when given only the config.
+    assert agent_loop._needs_approval("hf_jobs", {"operation": operation}, session.config)
+@pytest.mark.asyncio
+async def test_immediate_hf_job_under_cap_auto_runs(monkeypatch):
     # Estimator stubbed to 2.0 USD; the session has cap 5.0 and 1.0 already spent.
     # The immediate `run` job is expected to be auto-approved.
+    async def fake_estimate(*args, **kwargs):
+        return CostEstimate(estimated_cost_usd=2.0, billable=True)
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    decision = await agent_loop._approval_decision(
+        "hf_jobs",
+        {"operation": "run", "hardware_flavor": "a10g-large", "timeout": "1h"},
+        _session(cap=5.0, spent=1.0),
+    )
+    assert decision.requires_approval is False
+    assert decision.auto_approved is True
     # The stubbed estimate is surfaced on the decision.
+    assert decision.estimated_cost_usd == 2.0
+@pytest.mark.asyncio
+async def test_immediate_hf_job_over_cap_falls_back_to_approval(monkeypatch):
     # Estimator stubbed to 2.0 USD; the session has cap 5.0 and 4.0 already spent.
     # The decision must require approval and report 1.0 USD remaining.
+    async def fake_estimate(*args, **kwargs):
+        return CostEstimate(estimated_cost_usd=2.0, billable=True)
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    decision = await agent_loop._approval_decision(
+        "hf_jobs",
+        {"operation": "run", "hardware_flavor": "a10g-large", "timeout": "1h"},
+        _session(cap=5.0, spent=4.0),
+    )
+    assert decision.requires_approval is True
+    assert decision.auto_approval_blocked is True
+    assert "exceeds" in decision.block_reason
+    assert decision.remaining_cap_usd == 1.0
+@pytest.mark.asyncio
+async def test_unknown_cost_falls_back_to_approval(monkeypatch):
     # The stubbed estimator returns a billable estimate with no price and a
     # block reason. The decision must require approval and keep the cost as None.
+    async def fake_estimate(*args, **kwargs):
+        return CostEstimate(
+            estimated_cost_usd=None,
+            billable=True,
+            block_reason="No price is available.",
+        )
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    decision = await agent_loop._approval_decision(
+        "sandbox_create",
+        {"hardware": "mystery-gpu"},
+        _session(),
+    )
+    assert decision.requires_approval is True
+    assert decision.auto_approval_blocked is True
+    assert decision.estimated_cost_usd is None
+@pytest.mark.asyncio
+async def test_batch_reservation_blocks_second_over_budget_job(monkeypatch):
     # Two identical jobs estimated at 3.0 USD each against a 5.0 USD cap.
     # The second call passes the first job's estimate as `reserved_spend_usd`,
     # so it must require approval with 2.0 USD remaining.
+    async def fake_estimate(*args, **kwargs):
+        return CostEstimate(estimated_cost_usd=3.0, billable=True)
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    session = _session(cap=5.0, spent=0.0)
+    first = await agent_loop._approval_decision(
+        "hf_jobs",
+        {"operation": "run", "hardware_flavor": "a10g-large"},
+        session,
+        reserved_spend_usd=0.0,
+    )
+    second = await agent_loop._approval_decision(
+        "hf_jobs",
+        {"operation": "run", "hardware_flavor": "a10g-large"},
+        session,
+        reserved_spend_usd=first.estimated_cost_usd or 0.0,
+    )
+    assert first.requires_approval is False
+    assert second.requires_approval is True
+    assert second.remaining_cap_usd == 2.0
+@pytest.mark.asyncio
+async def test_manual_approval_does_not_record_spend_when_session_yolo_disabled(monkeypatch):
     # With session auto-approval disabled, recording a manually approved call
     # must not invoke the estimator and must leave the recorded spend at 0.0.
     # `called` flips to True if the fake estimator is ever invoked.
+    called = False
+    async def fake_estimate(*args, **kwargs):
+        nonlocal called
+        called = True
+        return CostEstimate(estimated_cost_usd=2.0, billable=True)
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    session = _session(enabled=False, cap=5.0, spent=0.0)
+    await agent_loop._record_manual_approved_spend_if_needed(
+        session,
+        "sandbox_create",
+        {"hardware": "a10g-large"},
+    )
+    assert called is False
+    assert session.auto_approval_estimated_spend_usd == 0.0
+@pytest.mark.asyncio
+async def test_manual_approval_records_spend_when_session_yolo_enabled(monkeypatch):
     # With session auto-approval enabled, a manually approved call adds the
     # stubbed 1.25 USD estimate to the 0.5 USD already spent, giving 1.75.
+    async def fake_estimate(*args, **kwargs):
+        return CostEstimate(estimated_cost_usd=1.25, billable=True)
+    monkeypatch.setattr(agent_loop, "estimate_tool_cost", fake_estimate)
+    session = _session(enabled=True, cap=5.0, spent=0.5)
+    await agent_loop._record_manual_approved_spend_if_needed(
+        session,
+        "sandbox_create",
+        {"hardware": "a10g-large"},
+    )
+    assert session.auto_approval_estimated_spend_usd == 1.75

tests/unit/test_cost_estimation.py ADDED Viewed

	@@ -0,0 +1,58 @@

+from types import SimpleNamespace
+import pytest
+from agent.core import cost_estimation
+def test_parse_timeout_hours_common_units():
     # Pins the expected conversions of `parse_timeout_hours` to hours:
     # None -> 0.5, "30m" -> 0.5, "3h" -> 3, the number 3600 -> 1, and an
     # unparseable string -> None.
+    assert cost_estimation.parse_timeout_hours(None) == 0.5
+    assert cost_estimation.parse_timeout_hours("30m") == 0.5
+    assert cost_estimation.parse_timeout_hours("3h") == 3
+    assert cost_estimation.parse_timeout_hours(3600) == 1
+    assert cost_estimation.parse_timeout_hours("not-a-duration") is None
+@pytest.mark.asyncio
+async def test_estimate_hf_job_cost_uses_catalog_price(monkeypatch):
     # The catalog is stubbed to price "a100-large" at 4.0; with an "8h" timeout
     # the expected estimate is 32.0 and the job is billable.
+    async def fake_catalog():
+        return {"a100-large": 4.0}
+    monkeypatch.setattr(cost_estimation, "hf_jobs_price_catalog", fake_catalog)
+    estimate = await cost_estimation.estimate_hf_job_cost(
+        {"hardware_flavor": "a100-large", "timeout": "8h"}
+    )
+    assert estimate.estimated_cost_usd == 32.0
+    assert estimate.billable is True
+@pytest.mark.asyncio
+async def test_estimate_hf_job_cost_blocks_unknown_price(monkeypatch):
     # With an empty stubbed catalog the flavor has no price. The estimate must
     # have no cost, stay billable, and carry a "No price" block reason.
+    async def fake_catalog():
+        return {}
+    monkeypatch.setattr(cost_estimation, "hf_jobs_price_catalog", fake_catalog)
+    estimate = await cost_estimation.estimate_hf_job_cost(
+        {"hardware_flavor": "mystery-gpu", "timeout": "30m"}
+    )
+    assert estimate.estimated_cost_usd is None
+    assert estimate.billable is True
+    assert "No price" in estimate.block_reason
+@pytest.mark.asyncio
+async def test_estimate_sandbox_cost_is_zero_for_existing_or_cpu_basic():
     # Two cases must both give a zero, non-billable estimate:
     # a session that already has a sandbox object, and "cpu-basic" hardware.
+    existing = await cost_estimation.estimate_sandbox_cost(
+        {"hardware": "a100-large"},
+        session=SimpleNamespace(sandbox=object()),
+    )
+    cpu = await cost_estimation.estimate_sandbox_cost({"hardware": "cpu-basic"})
+    assert existing.estimated_cost_usd == 0.0
+    assert existing.billable is False
+    assert cpu.estimated_cost_usd == 0.0
+    assert cpu.billable is False

tests/unit/test_session_manager_persistence.py CHANGED Viewed

@@ -27,6 +27,23 @@ class FakeRuntimeSession:
         self.turn_count = 0
         self.config = SimpleNamespace(model_name=model)
         self.notification_destinations = []
 class RestoreStore(NoopSessionStore):
@@ -85,6 +102,24 @@ def _runtime_agent_session(
     )
 def _install_fake_runtime(manager: SessionManager) -> asyncio.Event:
     stop = asyncio.Event()
     manager.run_calls = 0  # type: ignore[attr-defined]
@@ -204,6 +239,34 @@ async def test_lazy_restore_preserves_pending_approval_tool_calls():
         await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_list_sessions_dev_uses_store_dev_visibility():
     class ListStore(NoopSessionStore):
@@ -221,6 +284,9 @@ async def test_list_sessions_dev_uses_store_dev_visibility():
                         "user_id": "alice",
                         "model": "m",
                         "created_at": datetime.now(UTC),
                     },
                     {
                         "session_id": "s2",
@@ -238,3 +304,10 @@ async def test_list_sessions_dev_uses_store_dev_visibility():
     assert store.seen_user_id == "dev"
     assert {session["session_id"] for session in sessions} == {"s1", "s2"}

         self.turn_count = 0
         self.config = SimpleNamespace(model_name=model)
         self.notification_destinations = []
+        self.auto_approval_enabled = False
+        self.auto_approval_cost_cap_usd = None
+        self.auto_approval_estimated_spend_usd = 0.0
+    def auto_approval_policy_summary(self):
         # Return the fake session's auto-approval state as a dict.
         # `remaining_usd` is None when no cap is set; otherwise it is the cap
         # minus the estimated spend, floored at 0.
+        cap = self.auto_approval_cost_cap_usd
+        remaining = None if cap is None else max(0, cap - self.auto_approval_estimated_spend_usd)
+        return {
+            "enabled": self.auto_approval_enabled,
+            "cost_cap_usd": cap,
+            "estimated_spend_usd": self.auto_approval_estimated_spend_usd,
+            "remaining_usd": remaining,
+        }
+    def set_auto_approval_policy(self, *, enabled, cost_cap_usd):
         # Store the enabled flag and cap as given; the estimated spend is untouched.
+        self.auto_approval_enabled = enabled
+        self.auto_approval_cost_cap_usd = cost_cap_usd
 class RestoreStore(NoopSessionStore):
     )
+@pytest.mark.asyncio
+async def test_update_session_auto_approval_defaults_to_five_dollars():
     # Enabling auto-approval with no cap supplied (`cap_provided=False`) must
     # produce a summary with a 5.0 USD cap and 5.0 USD remaining.
+    manager = _manager_with_store(NoopSessionStore())
+    existing = _runtime_agent_session("s1", user_id="owner")
+    manager.sessions["s1"] = existing
+    summary = await manager.update_session_auto_approval(
+        "s1",
+        enabled=True,
+        cost_cap_usd=None,
+        cap_provided=False,
+    )
+    assert summary["enabled"] is True
+    assert summary["cost_cap_usd"] == 5.0
+    assert summary["remaining_usd"] == 5.0
 def _install_fake_runtime(manager: SessionManager) -> asyncio.Event:
     stop = asyncio.Event()
     manager.run_calls = 0  # type: ignore[attr-defined]
         await _cancel_runtime_tasks(manager)
+@pytest.mark.asyncio
+async def test_lazy_restore_preserves_auto_approval_policy():
     # Stored metadata carrying auto-approval fields must reappear on the session
     # returned by `ensure_session_loaded`, with remaining = 5.0 - 1.25 = 3.75.
+    store = RestoreStore(
+        metadata={
+            "session_id": "yolo-session",
+            "user_id": "owner",
+            "model": "test-model",
+            "auto_approval_enabled": True,
+            "auto_approval_cost_cap_usd": 5.0,
+            "auto_approval_estimated_spend_usd": 1.25,
+        }
+    )
+    manager = _manager_with_store(store)
+    stop = _install_fake_runtime(manager)
+    try:
+        restored = await manager.ensure_session_loaded("yolo-session", user_id="owner")
+        assert restored is not None
+        assert restored.session.auto_approval_enabled is True
+        assert restored.session.auto_approval_cost_cap_usd == 5.0
+        assert restored.session.auto_approval_estimated_spend_usd == 1.25
+        assert restored.session.auto_approval_policy_summary()["remaining_usd"] == 3.75
+    finally:
         # Always signal the fake runtime to stop and cancel its tasks, even if
         # an assertion above failed.
+        stop.set()
+        await _cancel_runtime_tasks(manager)
 @pytest.mark.asyncio
 async def test_list_sessions_dev_uses_store_dev_visibility():
     class ListStore(NoopSessionStore):
                         "user_id": "alice",
                         "model": "m",
                         "created_at": datetime.now(UTC),
+                        "auto_approval_enabled": True,
+                        "auto_approval_cost_cap_usd": 5.0,
+                        "auto_approval_estimated_spend_usd": 2.0,
                     },
                     {
                         "session_id": "s2",
     assert store.seen_user_id == "dev"
     assert {session["session_id"] for session in sessions} == {"s1", "s2"}
+    yolo = next(session for session in sessions if session["session_id"] == "s1")
+    assert yolo["auto_approval"] == {
+        "enabled": True,
+        "cost_cap_usd": 5.0,
+        "estimated_spend_usd": 2.0,
+        "remaining_usd": 3.0,
+    }