ml-intern

Sleeping

App Files Files Community

lewtun HF Staff OpenAI Codex committed 22 days ago

Commit

479aaea

2 Parent(s): ade0b7e 2b4c539

Deploy 2026-05-11

Browse files

Co-authored-by: OpenAI Codex <codex@openai.com>

Files changed (30) hide show

agent/config.py +2 -1
agent/core/agent_loop.py +94 -1
agent/core/session.py +2 -0
agent/main.py +64 -12
agent/prompts/system_prompt_v3.yaml +11 -2
agent/tools/jobs_tool.py +7 -3
agent/tools/plan_tool.py +8 -4
agent/tools/sandbox_client.py +11 -8
agent/tools/sandbox_tool.py +38 -18
agent/utils/terminal_display.py +69 -14
backend/dataset_uploads.py +305 -0
backend/models.py +18 -1
backend/routes/agent.py +147 -1
configs/cli_agent_config.json +1 -0
frontend/src/components/Chat/ChatInput.tsx +224 -11
frontend/src/components/SessionChat.tsx +11 -1
frontend/src/hooks/useAgentChat.ts +43 -0
frontend/src/utils/api.ts +70 -18
pyproject.toml +1 -0
tests/unit/test_cli_rendering.py +247 -3
tests/unit/test_config.py +35 -0
tests/unit/test_dataset_uploads.py +465 -0
tests/unit/test_hub_artifacts.py +5 -1
tests/unit/test_no_tool_continuation_guard.py +147 -0
tests/unit/test_sandbox_auto_start.py +107 -0
tests/unit/test_sandbox_private_spaces.py +218 -18
tests/unit/test_sandbox_script_resolution.py +70 -0
tests/unit/test_session_manager_persistence.py +1 -1
tests/unit/test_trackio_space_ids.py +16 -0
uv.lock +2 -0

agent/config.py CHANGED Viewed

@@ -2,7 +2,7 @@ import json
 import os
 import re
 from pathlib import Path
-from typing import Any, Union
 from dotenv import load_dotenv
 from fastmcp.mcp_config import (
@@ -46,6 +46,7 @@ class Config(BaseModel):
     # Permission control parameters
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
     # Reasoning effort *preference* — the ceiling the user wants. The probe
     # on `/model` walks a cascade down from here (``max`` → ``xhigh`` → ``high``

 import os
 import re
 from pathlib import Path
+from typing import Any, Literal, Union
 from dotenv import load_dotenv
 from fastmcp.mcp_config import (
     # Permission control parameters
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
+    tool_runtime: Literal["local", "sandbox"] = "local"
     # Reasoning effort *preference* — the ceiling the user wants. The probe
     # on `/model` walks a cascade down from here (``max`` → ``xhigh`` → ``high``

agent/core/agent_loop.py CHANGED Viewed

@@ -32,7 +32,11 @@ from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import DEFAULT_SESSION_LOG_DIR, Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
-from agent.tools.sandbox_tool import DEFAULT_CPU_SANDBOX_HARDWARE
 logger = logging.getLogger(__name__)
@@ -40,6 +44,43 @@ ToolCall = ChatCompletionMessageToolCall
 _MALFORMED_TOOL_PREFIX = "ERROR: Tool call to '"
 _MALFORMED_TOOL_SUFFIX = "' had malformed JSON arguments"
 def _malformed_tool_name(message: Message) -> str | None:
@@ -1153,6 +1194,7 @@ class Handlers:
         final_response = None
         errored = False
         max_iterations = session.config.max_iterations
         while max_iterations == -1 or iteration < max_iterations:
             # ── Cancellation check: before LLM call ──
@@ -1301,6 +1343,51 @@ class Handlers:
                 # If no tool calls, add assistant message and we're done
                 if not tool_calls:
                     logger.debug(
                         "Agent loop ending: no tool calls. "
                         "finish_reason=%s, token_count=%d, "
@@ -1324,6 +1411,8 @@ class Handlers:
                         final_response = content
                     break
                 # Validate tool call args (one json.loads per call, once)
                 # and split into good vs bad
                 good_tools: list[tuple[ToolCall, str, dict]] = []
@@ -1940,6 +2029,8 @@ class Handlers:
             _ = session.save_and_upload_detached(repo_id)
         session.is_running = False
         await session.send_event(Event(event_type="shutdown"))
         return True
@@ -2023,6 +2114,8 @@ async def submission_loop(
     )
     if session_holder is not None:
         session_holder[0] = session
     logger.info("Agent loop started")
     # Retry any failed uploads from previous sessions (fire-and-forget).

 from agent.core.session import DEFAULT_SESSION_LOG_DIR, Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
+from agent.tools.sandbox_tool import (
+    DEFAULT_CPU_SANDBOX_HARDWARE,
+    start_cpu_sandbox_preload,
+    teardown_session_sandbox,
+)
 logger = logging.getLogger(__name__)
 _MALFORMED_TOOL_PREFIX = "ERROR: Tool call to '"
 _MALFORMED_TOOL_SUFFIX = "' had malformed JSON arguments"
+_NO_TOOL_INCOMPLETE_PLAN_RETRY_LIMIT = 2
+def _unfinished_plan_items(session: Session) -> list[dict[str, str]]:
+    plan = getattr(session, "current_plan", None) or []
+    unfinished: list[dict[str, str]] = []
+    for item in plan:
+        if not isinstance(item, dict):
+            continue
+        status = item.get("status")
+        if status in {"pending", "in_progress"}:
+            unfinished.append(item)
+    return unfinished
+def _format_plan_items_for_guard(items: list[dict[str, str]], limit: int = 4) -> str:
+    formatted = []
+    for item in items[:limit]:
+        item_id = item.get("id") or "?"
+        content = item.get("content") or "(unnamed task)"
+        status = item.get("status") or "unknown"
+        formatted.append(f"{item_id}. {content} [{status}]")
+    if len(items) > limit:
+        formatted.append(f"... and {len(items) - limit} more")
+    return "; ".join(formatted)
+def _no_tool_incomplete_plan_prompt(items: list[dict[str, str]]) -> str:
+    summary = _format_plan_items_for_guard(items)
+    return (
+        "[SYSTEM: CONTINUATION GUARD] Your previous response ended without any "
+        "tool calls, but the task is not complete. The current plan still has "
+        f"unfinished items: {summary}. Do not return control to the user yet. "
+        "Continue from the next unfinished item and make at least one tool call "
+        "now. If you genuinely cannot continue, first use tools to inspect the "
+        "state or verify the blocker."
+    )
 def _malformed_tool_name(message: Message) -> str | None:
         final_response = None
         errored = False
         max_iterations = session.config.max_iterations
+        no_tool_incomplete_plan_retries = 0
         while max_iterations == -1 or iteration < max_iterations:
             # ── Cancellation check: before LLM call ──
                 # If no tool calls, add assistant message and we're done
                 if not tool_calls:
+                    unfinished_plan = _unfinished_plan_items(session)
+                    if (
+                        unfinished_plan
+                        and no_tool_incomplete_plan_retries
+                        < _NO_TOOL_INCOMPLETE_PLAN_RETRY_LIMIT
+                    ):
+                        logger.info(
+                            "No tool calls with unfinished plan; retrying agent turn "
+                            "(attempt %d/%d)",
+                            no_tool_incomplete_plan_retries + 1,
+                            _NO_TOOL_INCOMPLETE_PLAN_RETRY_LIMIT,
+                        )
+                        if content:
+                            assistant_msg = _assistant_message_from_result(
+                                llm_result,
+                                model_name=llm_params.get("model"),
+                            )
+                            session.context_manager.add_message(
+                                assistant_msg, token_count
+                            )
+                        session.context_manager.add_message(
+                            Message(
+                                role="user",
+                                content=_no_tool_incomplete_plan_prompt(
+                                    unfinished_plan
+                                ),
+                            )
+                        )
+                        no_tool_incomplete_plan_retries += 1
+                        await session.send_event(
+                            Event(
+                                event_type="tool_log",
+                                data={
+                                    "tool": "system",
+                                    "log": (
+                                        "Plan still has unfinished items after a "
+                                        "text-only response — retrying instead of "
+                                        "returning to the prompt."
+                                    ),
+                                },
+                            )
+                        )
+                        iteration += 1
+                        continue
                     logger.debug(
                         "Agent loop ending: no tool calls. "
                         "finish_reason=%s, token_count=%d, "
                         final_response = content
                     break
+                no_tool_incomplete_plan_retries = 0
                 # Validate tool call args (one json.loads per call, once)
                 # and split into good vs bad
                 good_tools: list[tuple[ToolCall, str, dict]] = []
             _ = session.save_and_upload_detached(repo_id)
         session.is_running = False
+        if not getattr(session, "local_mode", False):
+            await teardown_session_sandbox(session)
         await session.send_event(Event(event_type="shutdown"))
         return True
     )
     if session_holder is not None:
         session_holder[0] = session
+    if not local_mode:
+        start_cpu_sandbox_preload(session)
     logger.info("Agent loop started")
     # Retry any failed uploads from previous sessions (fire-and-forget).

agent/core/session.py CHANGED Viewed

@@ -99,6 +99,7 @@ class Session:
         self.hf_token: Optional[str] = hf_token
         self.user_id: Optional[str] = user_id
         self.hf_username: Optional[str] = hf_username
         self.persistence_store = persistence_store
         self.tool_router = tool_router
         self.stream = stream
@@ -117,6 +118,7 @@ class Session:
         self.session_id = session_id or str(uuid.uuid4())
         self.config = config
         self.is_running = True
         self._cancelled = asyncio.Event()
         self.pending_approval: Optional[dict[str, Any]] = None
         self.sandbox = None

         self.hf_token: Optional[str] = hf_token
         self.user_id: Optional[str] = user_id
         self.hf_username: Optional[str] = hf_username
+        self.local_mode = local_mode
         self.persistence_store = persistence_store
         self.tool_router = tool_router
         self.stream = stream
         self.session_id = session_id or str(uuid.uuid4())
         self.config = config
         self.is_running = True
+        self.current_plan: list[dict[str, str]] = []
         self._cancelled = asyncio.Event()
         self.pending_approval: Optional[dict[str, Any]] = None
         self.sandbox = None

agent/main.py CHANGED Viewed

@@ -59,6 +59,34 @@ CLI_CONFIG_PATH = Path(__file__).parent.parent / "configs" / "cli_agent_config.j
 logger = logging.getLogger(__name__)
 def _is_scheduled_hf_job_tool(tool_info: dict[str, Any]) -> bool:
     if tool_info.get("tool") != "hf_jobs":
         return False
@@ -957,6 +985,7 @@ async def _handle_slash_command(
         session = session_holder[0] if session_holder else None
         print(f"Model: {config.model_name}")
         print(f"Reasoning effort: {config.reasoning_effort or 'off'}")
         if session:
             print(f"Turns: {session.turn_count}")
             print(f"Context items: {len(session.context_manager.items)}")
@@ -1076,7 +1105,7 @@ async def _handle_share_traces_command(arg: str, config, session) -> None:
     console.print(f"[green]Dataset is now {label}.[/green] {url}")
-async def main(model: str | None = None):
     """Interactive chat with the agent"""
     # Clear screen
@@ -1088,16 +1117,23 @@ async def main(model: str | None = None):
     config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
     if model:
         config.model_name = model
-    # HF token — required for Hub-backed models/tools, but not for local LLMs.
     hf_token = resolve_hf_token()
-    if not hf_token and not is_local_model_id(config.model_name):
         hf_token = await _prompt_and_save_hf_token(prompt_session)
     # Resolve username for banner
     hf_user = _get_hf_user(hf_token)
-    print_banner(model=config.model_name, hf_user=hf_user)
     # Pre-warm the HF router catalog in the background so /model switches
     # don't block on a network fetch.
@@ -1116,8 +1152,10 @@ async def main(model: str | None = None):
     notification_gateway = NotificationGateway(config.messaging)
     await notification_gateway.start()
-    # Create tool router with local mode
-    tool_router = ToolRouter(config.mcpServers, hf_token=hf_token, local_mode=True)
     # Session holder for interrupt/model/status access
     session_holder = [None]
@@ -1131,7 +1169,7 @@ async def main(model: str | None = None):
             session_holder=session_holder,
             hf_token=hf_token,
             user_id=hf_user,
-            local_mode=True,
             stream=True,
             notification_gateway=notification_gateway,
             notification_destinations=config.messaging.default_auto_destinations(),
@@ -1153,6 +1191,8 @@ async def main(model: str | None = None):
     )
     await ready_event.wait()
     submission_id = [0]
     # Mirrors codex-rs/tui/src/bottom_pane/mod.rs:137
@@ -1310,6 +1350,7 @@ async def headless_main(
     model: str | None = None,
     max_iterations: int | None = None,
     stream: bool = True,
 ) -> None:
     """Run a single prompt headlessly and exit."""
     import logging
@@ -1322,11 +1363,13 @@ async def headless_main(
     if model:
         config.model_name = model
     hf_token = resolve_hf_token()
-    if not hf_token and not is_local_model_id(config.model_name):
         print(
-            "ERROR: No HF token found. Set HF_TOKEN or run `huggingface-cli login`.",
             file=sys.stderr,
         )
         sys.exit(1)
@@ -1342,6 +1385,7 @@ async def headless_main(
         config.max_iterations = max_iterations
     print(f"Model: {config.model_name}", file=sys.stderr)
     print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
     print(f"Prompt: {prompt}", file=sys.stderr)
     print("---", file=sys.stderr)
@@ -1349,7 +1393,9 @@ async def headless_main(
     submission_queue: asyncio.Queue = asyncio.Queue()
     event_queue: asyncio.Queue = asyncio.Queue()
-    tool_router = ToolRouter(config.mcpServers, hf_token=hf_token, local_mode=True)
     session_holder: list = [None]
     agent_task = asyncio.create_task(
@@ -1361,7 +1407,7 @@ async def headless_main(
             session_holder=session_holder,
             hf_token=hf_token,
             user_id=hf_user,
-            local_mode=True,
             stream=stream,
             notification_gateway=notification_gateway,
             notification_destinations=config.messaging.default_auto_destinations(),
@@ -1556,6 +1602,11 @@ def cli():
         action="store_true",
         help="Disable token streaming (use non-streaming LLM calls)",
     )
     args = parser.parse_args()
     try:
@@ -1569,10 +1620,11 @@ def cli():
                     model=args.model,
                     max_iterations=max_iter,
                     stream=not args.no_stream,
                 )
             )
         else:
-            asyncio.run(main(model=args.model))
     except KeyboardInterrupt:
         print("\n\nGoodbye!")

 logger = logging.getLogger(__name__)
+def _apply_tool_runtime_override(config: Any, *, sandbox_tools: bool) -> str:
+    if sandbox_tools:
+        config.tool_runtime = "sandbox"
+    return getattr(config, "tool_runtime", "local")
+def _is_local_tool_runtime(config: Any) -> bool:
+    return getattr(config, "tool_runtime", "local") == "local"
+def _tool_runtime_label(local_mode: bool) -> str:
+    return "local filesystem" if local_mode else "HF sandbox"
+async def _wait_for_initial_sandbox_preload(session_holder: list | None) -> None:
+    session = session_holder[0] if session_holder else None
+    task = getattr(session, "sandbox_preload_task", None)
+    if not task:
+        return
+    try:
+        await asyncio.shield(task)
+    except asyncio.CancelledError:
+        raise
+    except Exception:
+        # The sandbox tool will surface the stored preload error on first use.
+        return
 def _is_scheduled_hf_job_tool(tool_info: dict[str, Any]) -> bool:
     if tool_info.get("tool") != "hf_jobs":
         return False
         session = session_holder[0] if session_holder else None
         print(f"Model: {config.model_name}")
         print(f"Reasoning effort: {config.reasoning_effort or 'off'}")
+        print(f"Tool runtime: {_tool_runtime_label(_is_local_tool_runtime(config))}")
         if session:
             print(f"Turns: {session.turn_count}")
             print(f"Context items: {len(session.context_manager.items)}")
     console.print(f"[green]Dataset is now {label}.[/green] {url}")
+async def main(model: str | None = None, sandbox_tools: bool = False):
     """Interactive chat with the agent"""
     # Clear screen
     config = load_config(CLI_CONFIG_PATH, include_user_defaults=True)
     if model:
         config.model_name = model
+    _apply_tool_runtime_override(config, sandbox_tools=sandbox_tools)
+    local_mode = _is_local_tool_runtime(config)
+    # HF token — required for Hub-backed models/tools and sandbox tools, but
+    # not for local LLMs using only local filesystem tools.
     hf_token = resolve_hf_token()
+    if not hf_token and (not is_local_model_id(config.model_name) or not local_mode):
         hf_token = await _prompt_and_save_hf_token(prompt_session)
     # Resolve username for banner
     hf_user = _get_hf_user(hf_token)
+    print_banner(
+        model=config.model_name,
+        hf_user=hf_user,
+        tool_runtime=_tool_runtime_label(local_mode),
+    )
     # Pre-warm the HF router catalog in the background so /model switches
     # don't block on a network fetch.
     notification_gateway = NotificationGateway(config.messaging)
     await notification_gateway.start()
+    # Create tool router with the selected CLI tool runtime.
+    tool_router = ToolRouter(
+        config.mcpServers, hf_token=hf_token, local_mode=local_mode
+    )
     # Session holder for interrupt/model/status access
     session_holder = [None]
             session_holder=session_holder,
             hf_token=hf_token,
             user_id=hf_user,
+            local_mode=local_mode,
             stream=True,
             notification_gateway=notification_gateway,
             notification_destinations=config.messaging.default_auto_destinations(),
     )
     await ready_event.wait()
+    if not local_mode:
+        await _wait_for_initial_sandbox_preload(session_holder)
     submission_id = [0]
     # Mirrors codex-rs/tui/src/bottom_pane/mod.rs:137
     model: str | None = None,
     max_iterations: int | None = None,
     stream: bool = True,
+    sandbox_tools: bool = False,
 ) -> None:
     """Run a single prompt headlessly and exit."""
     import logging
     if model:
         config.model_name = model
+    _apply_tool_runtime_override(config, sandbox_tools=sandbox_tools)
+    local_mode = _is_local_tool_runtime(config)
     hf_token = resolve_hf_token()
+    if not hf_token and (not is_local_model_id(config.model_name) or not local_mode):
         print(
+            "ERROR: No HF token found. Set HF_TOKEN or run `hf auth login`.",
             file=sys.stderr,
         )
         sys.exit(1)
         config.max_iterations = max_iterations
     print(f"Model: {config.model_name}", file=sys.stderr)
+    print(f"Tool runtime: {_tool_runtime_label(local_mode)}", file=sys.stderr)
     print(f"Max iterations: {config.max_iterations}", file=sys.stderr)
     print(f"Prompt: {prompt}", file=sys.stderr)
     print("---", file=sys.stderr)
     submission_queue: asyncio.Queue = asyncio.Queue()
     event_queue: asyncio.Queue = asyncio.Queue()
+    tool_router = ToolRouter(
+        config.mcpServers, hf_token=hf_token, local_mode=local_mode
+    )
     session_holder: list = [None]
     agent_task = asyncio.create_task(
             session_holder=session_holder,
             hf_token=hf_token,
             user_id=hf_user,
+            local_mode=local_mode,
             stream=stream,
             notification_gateway=notification_gateway,
             notification_destinations=config.messaging.default_auto_destinations(),
         action="store_true",
         help="Disable token streaming (use non-streaming LLM calls)",
     )
+    parser.add_argument(
+        "--sandbox-tools",
+        action="store_true",
+        help="Use HF Space sandbox tools instead of local filesystem tools",
+    )
     args = parser.parse_args()
     try:
                     model=args.model,
                     max_iterations=max_iter,
                     stream=not args.no_stream,
+                    sandbox_tools=args.sandbox_tools,
                 )
             )
         else:
+            asyncio.run(main(model=args.model, sandbox_tools=args.sandbox_tools))
     except KeyboardInterrupt:
         print("\n\nGoodbye!")

agent/prompts/system_prompt_v3.yaml CHANGED Viewed

@@ -66,7 +66,7 @@ system_prompt: |
     report_to="trackio"
     run_name="<descriptive-run-name>"          # e.g. "sft_qwen3-4b_lr2e-5_bs128"
     project="<descriptive-project-name>"       # keeps related runs grouped so you can compare them
-    trackio_space_id="<username>/mlintern-<8-char-id>"   # creates a public dashboard Space
   `project` and `trackio_space_id` can also be set via TRACKIO_PROJECT / TRACKIO_SPACE_ID env vars.
   Alerts are how iterations decide what to change. Use trackio.alert(title, text, level) at every decision point in training. Levels:
@@ -102,9 +102,18 @@ system_prompt: |
   # When submitting a training job
   Before calling hf_jobs, output a pre-flight check:
     - Reference implementation: [which example you based this on]
     - Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
     - push_to_hub=True and hub_model_id set
     - timeout: [value] (based on: [model size] on [hardware])
     - Trackio monitoring included and deploying metrics to a public Space
@@ -127,7 +136,7 @@ system_prompt: |
   Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
-  Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
   # When a task has 3+ steps

     report_to="trackio"
     run_name="<descriptive-run-name>"          # e.g. "sft_qwen3-4b_lr2e-5_bs128"
     project="<descriptive-project-name>"       # keeps related runs grouped so you can compare them
+    trackio_space_id="<username>/ml-intern-<8-char-id>"  # creates a public dashboard Space
   `project` and `trackio_space_id` can also be set via TRACKIO_PROJECT / TRACKIO_SPACE_ID env vars.
   Alerts are how iterations decide what to change. Use trackio.alert(title, text, level) at every decision point in training. Levels:
   # When submitting a training job
+  Never pass a local machine path to hf_jobs.script, such as /Users/..., /home/..., /fsx/..., or a repo checkout path. HF Jobs runs in a fresh cloud environment where local files do not exist. For hf_jobs.script, use exactly one of:
+    - inline Python source code
+    - a file already written in the session sandbox, e.g. /app/train.py, ./train.py, or train.py
+    - a public/raw URL
+  If you wrote or tested a script locally, read the file content and submit it inline, or write it into the sandbox first.
+  GPU preflight is mandatory before hf_jobs when the job will run on GPU, or when the script loads a model, uses CUDA, bf16/fp16, quantization, flash attention, or torch.compile. First create a GPU sandbox with sandbox_create (t4-small minimum; choose larger hardware when VRAM requires it), run a tiny smoke test there using the same imports, model-loading path, training entrypoint, and a tiny dataset/subset, then fix failures before submitting. If you skip GPU sandbox preflight, state why before calling hf_jobs.
   Before calling hf_jobs, output a pre-flight check:
     - Reference implementation: [which example you based this on]
     - Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
+    - GPU sandbox smoke test: [hardware and result, or explicitly not applicable because ...]
     - push_to_hub=True and hub_model_id set
     - timeout: [value] (based on: [model size] on [hardware])
     - Trackio monitoring included and deploying metrics to a public Space
   Do NOT call sandbox_create before normal CPU work. Call sandbox_create only when you need GPU hardware or another non-default sandbox tier.
+  Use a GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16/fp16, quantization, flash attention, torch.compile, or model loading. CPU sandboxes cannot test GPU code paths. If the available sandbox tiers cannot fit the full model path, test the largest useful smoke path, state what was not covered, and submit one HF job first.
   # When a task has 3+ steps

agent/tools/jobs_tool.py CHANGED Viewed

@@ -1112,11 +1112,14 @@ HF_JOBS_TOOL_SPEC = {
         "- You MUST have called github_find_examples + github_read_file to find a working reference implementation. "
         "Scripts based on your internal knowledge WILL use outdated APIs and fail.\n"
         "- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
         "- Training config MUST include push_to_hub=True and hub_model_id. "
         "Job storage is EPHEMERAL — all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
         "- Include trackio monitoring and provide the dashboard URL to the user. "
         "When the script uses report_to='trackio', also pass `trackio_space_id` "
-        "(e.g. '<username>/mlintern-<8char>') and `trackio_project` as tool args — "
         "they are injected as TRACKIO_SPACE_ID/TRACKIO_PROJECT env vars and let the UI embed the live dashboard.\n\n"
         "BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
         "Only then submit the remaining jobs. Never submit all at once — if there's a bug, all jobs fail.\n\n"
@@ -1157,8 +1160,9 @@ HF_JOBS_TOOL_SPEC = {
             "script": {
                 "type": "string",
                 "description": (
-                    "Python code or sandbox file path (e.g. '/app/train.py') or URL. "
                     "Triggers Python mode. For ML training: base this on a working example found via github_find_examples, not on internal knowledge. "
                     "Mutually exclusive with 'command'."
                 ),
             },
@@ -1204,7 +1208,7 @@ HF_JOBS_TOOL_SPEC = {
                 "type": "string",
                 "description": (
                     "Optional. The HF Space hosting the trackio dashboard for this run "
-                    "(e.g. '<username>/mlintern-<8char>', under YOUR HF namespace). "
                     "Injected as TRACKIO_SPACE_ID env var and used by the UI to embed "
                     "the live dashboard. Set this whenever the script uses "
                     "report_to='trackio'. The Space is auto-created and seeded with the "

         "- You MUST have called github_find_examples + github_read_file to find a working reference implementation. "
         "Scripts based on your internal knowledge WILL use outdated APIs and fail.\n"
         "- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
+        "- If the job runs on GPU, or the script loads a model, uses CUDA, bf16/fp16, quantization, flash attention, "
+        "or torch.compile, you MUST create a GPU sandbox with sandbox_create first, run a tiny smoke test there, "
+        "and fix failures before submitting. If skipped, state why before calling hf_jobs.\n"
         "- Training config MUST include push_to_hub=True and hub_model_id. "
         "Job storage is EPHEMERAL — all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
         "- Include trackio monitoring and provide the dashboard URL to the user. "
         "When the script uses report_to='trackio', also pass `trackio_space_id` "
+        "(e.g. '<username>/ml-intern-<8char>') and `trackio_project` as tool args — "
         "they are injected as TRACKIO_SPACE_ID/TRACKIO_PROJECT env vars and let the UI embed the live dashboard.\n\n"
         "BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
         "Only then submit the remaining jobs. Never submit all at once — if there's a bug, all jobs fail.\n\n"
             "script": {
                 "type": "string",
                 "description": (
+                    "Python code, sandbox file path (e.g. '/app/train.py', './train.py', or bare 'train.py'), or URL. "
                     "Triggers Python mode. For ML training: base this on a working example found via github_find_examples, not on internal knowledge. "
+                    "For GPU/model-loading training scripts, smoke-test in a GPU sandbox before submission. "
                     "Mutually exclusive with 'command'."
                 ),
             },
                 "type": "string",
                 "description": (
                     "Optional. The HF Space hosting the trackio dashboard for this run "
+                    "(e.g. '<username>/ml-intern-<8char>', under YOUR HF namespace). "
                     "Injected as TRACKIO_SPACE_ID env var and used by the UI to embed "
                     "the live dashboard. Set this whenever the script uses "
                     "report_to='trackio'. The Space is auto-created and seeded with the "

agent/tools/plan_tool.py CHANGED Viewed

@@ -54,20 +54,24 @@ class PlanTool:
                     "isError": True,
                 }
-        # Store the raw todos structure in memory
-        _current_plan = todos
         # Emit plan update event if session is available
         if self.session:
             await self.session.send_event(
                 Event(
                     event_type="plan_update",
-                    data={"plan": todos},
                 )
             )
         # Format only for display using terminal_display utility
-        formatted_output = format_plan_tool_output(todos)
         return {
             "formatted": formatted_output,

                     "isError": True,
                 }
+        # Store a session-scoped copy so the runtime can tell whether a
+        # text-only model response is trying to stop while work remains.
+        stored_todos = [dict(todo) for todo in todos]
+        _current_plan = stored_todos
+        if self.session is not None:
+            self.session.current_plan = stored_todos
         # Emit plan update event if session is available
         if self.session:
             await self.session.send_event(
                 Event(
                     event_type="plan_update",
+                    data={"plan": stored_todos},
                 )
             )
         # Format only for display using terminal_display utility
+        formatted_output = format_plan_tool_output(stored_todos)
         return {
             "formatted": formatted_output,

agent/tools/sandbox_client.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # /// script
 # requires-python = ">=3.10"
-# dependencies = ["huggingface_hub>=0.20.0", "httpx>=0.27.0"]
 # ///
 """
 Sandbox Tools — Agent-native primitives for HF Space dev-mode sandboxes.
@@ -615,18 +615,19 @@ class Sandbox:
         kwargs = {
             "from_id": template,
             "to_id": space_id,
             "private": private,
-            "hardware": hardware,
         }
         if sleep_time is not None:
-            kwargs["sleep_time"] = sleep_time
-        api.duplicate_space(**kwargs)
         _log(f"Space created: https://huggingface.co/spaces/{space_id}")
         _check_cancel()
-        # ``duplicate_space`` sends hardware and sleepTimeSeconds in the
         # initial create request. Avoid a second /hardware call: deployed HF
         # OAuth tokens can 401 on that endpoint for a just-created private
         # Space even though duplication itself succeeded. We rely on the
@@ -775,21 +776,23 @@ class Sandbox:
             f"Last status: {last_status}, last error: {last_err}"
         )
-    def delete(self):
         """Delete the Space. Only works if this Sandbox created it."""
         if not self._owns_space:
             raise RuntimeError(
                 f"This Sandbox did not create {self.space_id}. "
                 f"Use self._hf_api.delete_repo() directly if you're sure."
             )
-        print(f"Deleting sandbox: {self.space_id}...")
         self._hf_api.delete_repo(self.space_id, repo_type="space")
         # Clear ownership so a second cleanup call (e.g. delete_session +
         # _run_session.finally both fire) early-returns instead of retrying
         # a 404 delete and emitting a spurious ERROR log.
         self._owns_space = False
         self._client.close()
-        print("Deleted.")
     def pause(self):
         """Pause the Space (stops billing, preserves state)."""

 #!/usr/bin/env python3
 # /// script
 # requires-python = ">=3.10"
+# dependencies = ["huggingface_hub>=1.12.0", "httpx>=0.27.0"]
 # ///
 """
 Sandbox Tools — Agent-native primitives for HF Space dev-mode sandboxes.
         kwargs = {
             "from_id": template,
             "to_id": space_id,
+            "repo_type": "space",
             "private": private,
+            "space_hardware": hardware,
         }
         if sleep_time is not None:
+            kwargs["space_sleep_time"] = sleep_time
+        api.duplicate_repo(**kwargs)
         _log(f"Space created: https://huggingface.co/spaces/{space_id}")
         _check_cancel()
+        # ``duplicate_repo`` sends hardware and sleepTimeSeconds in the
         # initial create request. Avoid a second /hardware call: deployed HF
         # OAuth tokens can 401 on that endpoint for a just-created private
         # Space even though duplication itself succeeded. We rely on the
             f"Last status: {last_status}, last error: {last_err}"
         )
+    def delete(self, log: Callable[[str], object] | None = None):
         """Delete the Space. Only works if this Sandbox created it."""
         if not self._owns_space:
             raise RuntimeError(
                 f"This Sandbox did not create {self.space_id}. "
                 f"Use self._hf_api.delete_repo() directly if you're sure."
             )
+        if log:
+            log(f"Deleting sandbox: {self.space_id}...")
         self._hf_api.delete_repo(self.space_id, repo_type="space")
         # Clear ownership so a second cleanup call (e.g. delete_session +
         # _run_session.finally both fire) early-returns instead of retrying
         # a 404 delete and emitting a spurious ERROR log.
         self._owns_space = False
         self._client.close()
+        if log:
+            log("Deleted.")
     def pause(self):
         """Pause the Space (stops billing, preserves state)."""

agent/tools/sandbox_tool.py CHANGED Viewed

@@ -16,6 +16,7 @@ import logging
 import re
 import threading
 import weakref
 from datetime import datetime, timedelta, timezone
 from typing import Any
@@ -58,17 +59,41 @@ def _get_sandbox_create_lock(owner: str) -> asyncio.Lock:
     return lock
 def _looks_like_path(script: str) -> bool:
     """Return True if the script string looks like a file path (not inline code)."""
-    return (
         isinstance(script, str)
         and script.strip() == script
         and not any(c in script for c in "\r\n\0")
-        and (
-            script.startswith("/")
-            or script.startswith("./")
-            or script.startswith("../")
-        )
     )
@@ -303,14 +328,8 @@ async def _create_sandbox_locked(
         )
     )
-    # Thread-safe log callback: posts tool_log events from the worker thread
-    loop = asyncio.get_running_loop()
-    def _log(msg: str) -> None:
-        loop.call_soon_threadsafe(
-            session.event_queue.put_nowait,
-            Event(event_type="tool_log", data={"tool": "sandbox", "log": msg}),
-        )
     # Bridge asyncio cancel event to a threading.Event for the blocking create call.
     # We poll session._cancelled from the main loop in a background task and set
@@ -352,7 +371,7 @@ async def _create_sandbox_locked(
     if cancel_flag.is_set():
         if getattr(sb, "_owns_space", False):
             try:
-                await asyncio.to_thread(sb.delete)
             except Exception as e:
                 logger.warning(
                     "Failed to delete cancelled sandbox %s: %s", sb.space_id, e
@@ -497,6 +516,7 @@ async def teardown_session_sandbox(session: Any) -> None:
             return
         space_id = getattr(sandbox, "space_id", None)
         last_err: Exception | None = None
         for attempt in range(3):
             try:
@@ -505,7 +525,7 @@ async def teardown_session_sandbox(session: Any) -> None:
                     space_id,
                     attempt + 1,
                 )
-                await asyncio.to_thread(sandbox.delete)
                 from agent.core import telemetry
                 await telemetry.record_sandbox_destroy(session, sandbox)
@@ -542,7 +562,7 @@ SANDBOX_CREATE_TOOL_SPEC = {
         "Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
         "If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
         "If you intend to run a training script in this sandbox that uses report_to='trackio', "
-        "pass `trackio_space_id` (e.g. '<username>/mlintern-<8char>') and `trackio_project` so they "
         "are set as TRACKIO_SPACE_ID/TRACKIO_PROJECT secrets in the sandbox and the UI can embed the live dashboard.\n\n"
         "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
     ),
@@ -563,7 +583,7 @@ SANDBOX_CREATE_TOOL_SPEC = {
                 "type": "string",
                 "description": (
                     "Optional. The HF Space hosting the trackio dashboard for runs in this sandbox "
-                    "(e.g. '<username>/mlintern-<8char>', under YOUR HF namespace). Injected as "
                     "TRACKIO_SPACE_ID secret and surfaced to the UI. The Space is auto-created and "
                     "seeded with the trackio dashboard — DO NOT pre-create it via hf_repo_git, "
                     "that produces an empty Space that breaks the embed."

 import re
 import threading
 import weakref
+from collections.abc import Callable
 from datetime import datetime, timedelta, timezone
 from typing import Any
     return lock
+def _session_tool_logger(
+    session: Any, *, tool: str = "sandbox"
+) -> Callable[[str], object] | None:
+    """Build a callback that posts ``tool_log`` events to the session queue.
+    Returns ``None`` when ``session`` has no ``event_queue`` attribute (or it
+    is ``None``). The callback schedules ``put_nowait`` on the event loop that
+    was running when this factory was called, via ``call_soon_threadsafe``.
+    """
+    queue = getattr(session, "event_queue", None)
+    if queue is None:
+        return None
+    running_loop = asyncio.get_running_loop()
+    def _log(msg: str) -> None:
+        payload = {"tool": tool, "log": msg}
+        log_event = Event(event_type="tool_log", data=payload)
+        running_loop.call_soon_threadsafe(queue.put_nowait, log_event)
+    return _log
 def _looks_like_path(script: str) -> bool:
     """Return True if the script string looks like a file path (not inline code)."""
+    if not (
         isinstance(script, str)
         and script.strip() == script
         and not any(c in script for c in "\r\n\0")
+    ):
+        return False
+    if script.startswith("http://") or script.startswith("https://"):
+        return False
+    return (
+        script.startswith("/")
+        or script.startswith("./")
+        or script.startswith("../")
+        or (script.endswith(".py") and not any(c.isspace() for c in script))
     )
         )
     )
+    # Thread-safe log callback: posts tool_log events from worker threads.
+    _log = _session_tool_logger(session) or (lambda msg: None)
     # Bridge asyncio cancel event to a threading.Event for the blocking create call.
     # We poll session._cancelled from the main loop in a background task and set
     if cancel_flag.is_set():
         if getattr(sb, "_owns_space", False):
             try:
+                await asyncio.to_thread(sb.delete, log=_log)
             except Exception as e:
                 logger.warning(
                     "Failed to delete cancelled sandbox %s: %s", sb.space_id, e
             return
         space_id = getattr(sandbox, "space_id", None)
+        delete_log = _session_tool_logger(session)
         last_err: Exception | None = None
         for attempt in range(3):
             try:
                     space_id,
                     attempt + 1,
                 )
+                await asyncio.to_thread(sandbox.delete, log=delete_log)
                 from agent.core import telemetry
                 await telemetry.record_sandbox_destroy(session, sandbox)
         "Common picks: t4-small (16GB VRAM, fits ≤1-3B), a10g-small (24GB, ≤7B), a100-large (80GB, ≤30B). "
         "If the model won't fit, pick larger hardware upfront — OOM on a sandbox wastes time.\n\n"
         "If you intend to run a training script in this sandbox that uses report_to='trackio', "
+        "pass `trackio_space_id` (e.g. '<username>/ml-intern-<8char>') and `trackio_project` so they "
         "are set as TRACKIO_SPACE_ID/TRACKIO_PROJECT secrets in the sandbox and the UI can embed the live dashboard.\n\n"
         "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
     ),
                 "type": "string",
                 "description": (
                     "Optional. The HF Space hosting the trackio dashboard for runs in this sandbox "
+                    "(e.g. '<username>/ml-intern-<8char>', under YOUR HF namespace). Injected as "
                     "TRACKIO_SPACE_ID secret and surfaced to the UI. The Space is auto-created and "
                     "seeded with the trackio dashboard — DO NOT pre-create it via hf_repo_git, "
                     "that produces an empty Space that breaks the embed."

agent/utils/terminal_display.py CHANGED Viewed

@@ -6,6 +6,7 @@ import asyncio
 import re
 from rich.console import Console
 from rich.markdown import Heading, Markdown
 from rich.panel import Panel
 from rich.theme import Theme
@@ -92,7 +93,11 @@ def get_console() -> Console:
 # ── Banner ─────────────────────────────────────────────────────────────
-def print_banner(model: str | None = None, hf_user: str | None = None) -> None:
     """Print particle logo then CRT boot sequence with system info."""
     from agent.utils.particle_logo import run_particle_logo
     from agent.utils.crt_boot import run_boot_sequence
@@ -115,6 +120,7 @@ def print_banner(model: str | None = None, hf_user: str | None = None) -> None:
         (f"{_I}Initializing agent runtime...", gold),
         (f"{_I}  User: {user_label}", dim_gold),
         (f"{_I}  Model: {model_label}", dim_gold),
         (f"{_I}  Tools: loading...", dim_gold),
         ("", ""),
         (f"{_I}/help for commands · /model to switch · /quit to exit", gold),
@@ -446,23 +452,72 @@ def print_yolo_approve(count: int) -> None:
 # ── Help ───────────────────────────────────────────────────────────────
-HELP_TEXT = f"""\
-{_I}[bold]Commands[/bold]
-{_I}  [cyan]/help[/cyan]            Show this help
-{_I}  [cyan]/undo[/cyan]            Undo last turn
-{_I}  [cyan]/compact[/cyan]         Compact context window
-{_I}  [cyan]/resume[/cyan] [index|id|path] Pick up from a log in ./session_logs
-{_I}  [cyan]/model[/cyan] [id]      Show available models or switch
-{_I}  [cyan]/effort[/cyan] [level]  Reasoning effort (minimal|low|medium|high|xhigh|max|off)
-{_I}  [cyan]/yolo[/cyan]            Toggle auto-approve mode
-{_I}  [cyan]/status[/cyan]          Current model & turn count
-{_I}  [cyan]/share-traces[/cyan] [public|private]  Show/flip visibility of your HF trace dataset
-{_I}  [cyan]/quit[/cyan]            Exit"""
 def print_help() -> None:
     _console.print()
-    _console.print(HELP_TEXT)
     _console.print()

 import re
 from rich.console import Console
+from rich.markup import escape
 from rich.markdown import Heading, Markdown
 from rich.panel import Panel
 from rich.theme import Theme
 # ── Banner ─────────────────────────────────────────────────────────────
+def print_banner(
+    model: str | None = None,
+    hf_user: str | None = None,
+    tool_runtime: str | None = None,
+) -> None:
     """Print particle logo then CRT boot sequence with system info."""
     from agent.utils.particle_logo import run_particle_logo
     from agent.utils.crt_boot import run_boot_sequence
         (f"{_I}Initializing agent runtime...", gold),
         (f"{_I}  User: {user_label}", dim_gold),
         (f"{_I}  Model: {model_label}", dim_gold),
+        (f"{_I}  Tool runtime: {tool_runtime or 'local filesystem'}", dim_gold),
         (f"{_I}  Tools: loading...", dim_gold),
         ("", ""),
         (f"{_I}/help for commands · /model to switch · /quit to exit", gold),
 # ── Help ───────────────────────────────────────────────────────────────
+# Help table rows as (command, argument hint, description); an empty
+# argument hint means the command has no arguments column entry.
+HELP_ROWS: tuple[tuple[str, str, str], ...] = (
+    ("/help", "", "Show this help"),
+    ("/undo", "", "Undo last turn"),
+    ("/compact", "", "Compact context window"),
+    ("/resume", "[index|id|path]", "Pick up from ./session_logs"),
+    ("/model", "[id]", "Show available models or switch"),
+    (
+        "/effort",
+        "[level]",
+        "Set reasoning effort preference",
+    ),
+    ("/yolo", "", "Toggle auto-approve mode"),
+    ("/status", "", "Current model & turn count"),
+    (
+        "/share-traces",
+        "[public|private]",
+        "Show or change HF trace visibility",
+    ),
+    ("/quit", "", "Exit"),
+)
+def _help_column_widths(
+    rows: tuple[tuple[str, str, str], ...],
+) -> tuple[int, int]:
+    """Return ``(widest command length, widest args length)`` over ``rows``.
+    An empty ``rows`` yields ``(0, 0)`` rather than raising ``ValueError``,
+    so rendering an empty help table produces just the header line.
+    """
+    return (
+        max((len(command) for command, _, _ in rows), default=0),
+        max((len(args) for _, args, _ in rows), default=0),
+    )
+def _format_help_row(
+    command: str,
+    args: str,
+    description: str,
+    command_width: int,
+    args_width: int,
+) -> str:
+    """Render one help row with the command and args columns padded.
+    Padding is computed from the plain-text lengths, so the Rich markup
+    wrapped around each cell does not disturb alignment. ``command`` and
+    ``args`` are escaped; ``description`` is inserted as given.
+    """
+    pad_after_command = " " * (2 + command_width - len(command))
+    pad_after_args = " " * (2 + args_width - len(args))
+    command_cell = f"[cyan]{escape(command)}[/cyan]"
+    args_cell = ""
+    if args:
+        args_cell = f"[muted]{escape(args)}[/muted]"
+    return f"{_I}  {command_cell}{pad_after_command}{args_cell}{pad_after_args}{description}"
+def format_help_text(rows: tuple[tuple[str, str, str], ...] | None = None) -> str:
+    """Return the help table as Rich markup, one line per command.
+    ``HELP_ROWS`` is used when ``rows`` is ``None``; the first line is the
+    bold "Commands" header.
+    """
+    table = rows if rows is not None else HELP_ROWS
+    command_width, args_width = _help_column_widths(table)
+    lines = [f"{_I}[bold]Commands[/bold]"]
+    for command, args, description in table:
+        lines.append(
+            _format_help_row(command, args, description, command_width, args_width)
+        )
+    return "\n".join(lines)
 def print_help() -> None:
     _console.print()
+    _console.print(format_help_text())
     _console.print()

backend/dataset_uploads.py ADDED Viewed

	@@ -0,0 +1,305 @@

+"""Helpers for session-scoped dataset uploads to the Hugging Face Hub."""
+import asyncio
+import os
+import re
+import uuid
+from dataclasses import dataclass
+from urllib.parse import quote
+from fastapi import HTTPException, UploadFile
+from huggingface_hub import HfApi
+MAX_DATASET_UPLOAD_BYTES = 100 * 1024 * 1024
+ALLOWED_DATASET_EXTENSIONS = {"csv", "json", "jsonl"}
+_SAFE_FILENAME_RE = re.compile(r"[^A-Za-z0-9._-]+")
+_SAFE_NAMESPACE_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")
+@dataclass(frozen=True)
+class DatasetUpload:
+    """Immutable record describing one dataset file pushed to the Hub."""
+    session_id: str
+    repo_id: str
+    repo_type: str
+    private: bool
+    upload_id: str
+    config_name: str
+    filename: str
+    original_filename: str
+    path_in_repo: str
+    size_bytes: int
+    format: str
+    hub_url: str
+    load_dataset_snippet: str
+    def response_payload(self) -> dict[str, str | int | bool]:
+        """Return the fields exposed to API clients as a plain dict.
+        ``original_filename`` is not part of the payload.
+        """
+        exposed_fields = (
+            "session_id",
+            "repo_id",
+            "repo_type",
+            "private",
+            "upload_id",
+            "config_name",
+            "filename",
+            "path_in_repo",
+            "size_bytes",
+            "format",
+            "hub_url",
+            "load_dataset_snippet",
+        )
+        return {name: getattr(self, name) for name in exposed_fields}
+def sanitize_dataset_filename(filename: str | None) -> str:
+    """Reduce ``filename`` to a Hub-safe basename, keeping its extension.
+    Runs of characters outside ``[A-Za-z0-9._-]`` collapse to ``-``, leading
+    and trailing ``.``/``-``/``_`` are trimmed, the stem is cut so stem plus
+    extension fit in 96 characters, and the extension is lower-cased. Missing
+    pieces fall back to ``dataset`` and ``.csv``.
+    """
+    base = os.path.basename(filename or "").strip() or "dataset.csv"
+    cleaned = _SAFE_FILENAME_RE.sub("-", base).strip(".-_") or "dataset.csv"
+    stem, ext = os.path.splitext(cleaned)
+    stem = stem or "dataset"
+    ext = ext or ".csv"
+    stem_budget = 96 - len(ext)
+    stem = stem[:stem_budget].strip(".-_") or "dataset"
+    return stem + ext.lower()
+def display_filename(filename: str | None, fallback: str) -> str:
+    """Return a display form of ``filename``'s basename, or ``fallback``.
+    Characters with code points below 32 are dropped and the result is capped
+    at 160 characters; ``fallback`` is used when nothing remains.
+    """
+    base = os.path.basename(filename or "").strip()
+    if base:
+        visible = "".join(ch for ch in base if ch >= " ")
+        return visible[:160] or fallback
+    return fallback
+def dataset_format_from_filename(filename: str) -> str:
+    """Return the lower-cased extension of ``filename`` without its dot.
+    Raises:
+        HTTPException: 400 when the extension is not an allowed dataset format.
+    """
+    _, dotted = os.path.splitext(filename)
+    extension = dotted.lower().lstrip(".")
+    if extension in ALLOWED_DATASET_EXTENSIONS:
+        return extension
+    raise HTTPException(
+        status_code=400,
+        detail="Only .csv, .json, and .jsonl dataset files are supported.",
+    )
+def session_dataset_repo_id(hf_username: str | None, session_id: str) -> str:
+    """Return the per-session repo id ``<namespace>/ml-intern-<id>-datasets``.
+    ``<id>`` is at most 8 characters of ``session_id`` with every
+    non-alphanumeric run replaced by ``-``; a random 8-hex-char id is used
+    when nothing usable remains.
+    Raises:
+        HTTPException: 400 when ``hf_username`` is empty or does not match the
+            namespace pattern.
+    """
+    namespace = (hf_username or "").strip()
+    if not namespace or not _SAFE_NAMESPACE_RE.fullmatch(namespace):
+        raise HTTPException(
+            status_code=400,
+            detail="Could not determine a valid Hugging Face namespace.",
+        )
+    safe_session_id = re.sub(r"[^A-Za-z0-9]+", "-", session_id).strip("-")
+    # Truncating can leave a trailing "-" (e.g. "abcdefg-hij"[:8]), which
+    # would put "--" in the repo name; Hub repo ids reject "--", so strip
+    # again after the cut.
+    short_id = safe_session_id[:8].strip("-")
+    if not short_id:
+        short_id = uuid.uuid4().hex[:8]
+    return f"{namespace}/ml-intern-{short_id}-datasets"
+async def upload_size_bytes(upload: UploadFile) -> int:
+    """Return the upload's size in bytes and leave its stream at offset 0.
+    The size is read by seeking to the end of ``upload.file``; the blocking
+    file calls run in a worker thread.
+    """
+    stream = upload.file
+    def _measure() -> int:
+        stream.seek(0, os.SEEK_END)
+        length = stream.tell()
+        stream.seek(0)
+        return int(length)
+    return await asyncio.to_thread(_measure)
+async def validate_dataset_upload(upload: UploadFile) -> tuple[str, str, int]:
+    """Validate ``upload`` and return ``(safe filename, format, size in bytes)``.
+    The extension is checked first, then the size.
+    Raises:
+        HTTPException: 400 for an unsupported extension or an empty file,
+            413 when the file exceeds ``MAX_DATASET_UPLOAD_BYTES``.
+    """
+    detected_format = dataset_format_from_filename(upload.filename or "")
+    stored_name = sanitize_dataset_filename(upload.filename)
+    byte_count = await upload_size_bytes(upload)
+    if byte_count > MAX_DATASET_UPLOAD_BYTES:
+        raise HTTPException(
+            status_code=413,
+            detail="Dataset upload exceeds the 100 MB limit.",
+        )
+    if byte_count <= 0:
+        raise HTTPException(status_code=400, detail="Uploaded dataset file is empty.")
+    return stored_name, detected_format, byte_count
+def dataset_hub_url(repo_id: str, path_in_repo: str) -> str:
+    """Return the Hub ``blob/main`` URL of ``path_in_repo`` in dataset ``repo_id``.
+    The path is percent-encoded with ``/`` kept as the separator.
+    """
+    encoded_path = quote(path_in_repo, safe="/")
+    return "/".join(
+        ("https://huggingface.co/datasets", repo_id, "blob/main", encoded_path)
+    )
+def dataset_config_name(upload_id: str) -> str:
+    """Return the dataset config name ``upload_<slug>`` for ``upload_id``.
+    The slug is ``upload_id`` lower-cased with non-alphanumeric runs turned
+    into ``_``, trimmed of ``_`` at both ends and cut to 32 characters;
+    ``dataset`` is used when the slug would be empty.
+    """
+    slug = re.sub(r"[^A-Za-z0-9]+", "_", upload_id).strip("_").lower() or "dataset"
+    return "upload_" + slug[:32]
+def dataset_config_name_from_path(path_in_repo: str) -> str:
+    """Derive the dataset config name for a file path inside the repo.
+    Paths with at least three segments under ``uploads/`` use their second
+    segment (the upload id); any other path uses the file's stem.
+    """
+    segments = path_in_repo.split("/")
+    if len(segments) >= 3 and segments[0] == "uploads":
+        source = segments[1]
+    else:
+        source, _ = os.path.splitext(os.path.basename(path_in_repo))
+    return dataset_config_name(source)
+def is_dataset_upload_path(path_in_repo: str) -> bool:
+    """Return True for ``uploads/<upload_id>/<file>`` paths with an allowed extension.
+    The path must have exactly three non-empty segments, the first being
+    ``uploads``.
+    """
+    segments = path_in_repo.split("/")
+    if len(segments) != 3:
+        return False
+    root, upload_id, file_name = segments
+    if root != "uploads" or not upload_id or not file_name:
+        return False
+    _, dotted = os.path.splitext(path_in_repo)
+    return dotted.lower().lstrip(".") in ALLOWED_DATASET_EXTENSIONS
+def unique_dataset_upload_paths(paths: list[str]) -> list[str]:
+    """Return the dataset upload paths in ``paths``, de-duplicated in first-seen order.
+    Entries rejected by ``is_dataset_upload_path`` are dropped.
+    """
+    # dict.fromkeys keeps insertion order, so the first occurrence wins.
+    return list(
+        dict.fromkeys(path for path in paths if is_dataset_upload_path(path))
+    )
+def load_dataset_snippet(repo_id: str, config_name: str) -> str:
+    """Return a Python snippet that loads the ``train`` split of one config.
+    The snippet imports ``load_dataset`` and passes ``token=True``.
+    """
+    import_line = "from datasets import load_dataset\n\n"
+    call_line = (
+        f'dataset = load_dataset("{repo_id}", "{config_name}", '
+        'split="train", token=True)'
+    )
+    return import_line + call_line
+def dataset_repo_card(repo_id: str, upload_paths: list[str]) -> bytes:
+    """Build the README.md dataset card for ``repo_id`` as UTF-8 bytes.
+    The YAML front matter gets one ``configs`` entry (a single ``train``
+    split) per unique path accepted by ``unique_dataset_upload_paths``; the
+    ``configs`` key is omitted when no path qualifies.
+    """
+    config_lines = []
+    unique_upload_paths = unique_dataset_upload_paths(upload_paths)
+    if unique_upload_paths:
+        config_lines.append("configs:")
+        for path in unique_upload_paths:
+            config_lines.extend(
+                [
+                    f"- config_name: {dataset_config_name_from_path(path)}",
+                    "  data_files:",
+                    "  - split: train",
+                    f'    path: "{path}"',
+                ]
+            )
+    configs = "\n".join(config_lines)
+    # Terminate the block so the closing "---" starts on its own line.
+    if configs:
+        configs = f"{configs}\n"
+    content = f"""---
+tags:
+- ml-intern
+- uploaded-dataset
+{configs}---
+# {repo_id}
+Private dataset files uploaded through ML Intern.
+Files are stored under `uploads/<upload_id>/` and are attached to the
+corresponding ML Intern session context by Hub reference, not by copying file
+contents into the chat.
+Each uploaded file is exposed as its own dataset config so files with different
+schemas can coexist in the same session repo.
+"""
+    return content.encode("utf-8")
+def dataset_context_note(upload: DatasetUpload) -> str:
+    """Return the bracketed ``[SYSTEM: ...]`` note describing ``upload``.
+    The note lists the repo id, config, repo path, original and stored
+    filenames, format, size and Hub URL, followed by the load snippet in a
+    fenced Python block.
+    """
+    # NOTE(review): ``original_filename`` is client-supplied text embedded in
+    # this note; only control characters appear to be stripped upstream —
+    # confirm that is sufficient for the consumer of this message.
+    return f"""[SYSTEM: The user uploaded a dataset file for this session.
+Use this Hugging Face Hub dataset reference when the task needs the uploaded data.
+Do not look for the uploaded file on local disk and do not ask the user to
+upload it again unless this Hub reference fails.
+- Repo ID: {upload.repo_id}
+- Repo type: dataset
+- Dataset config: {upload.config_name}
+- File in repo: {upload.path_in_repo}
+- Original filename: {upload.original_filename}
+- Stored filename: {upload.filename}
+- Format: {upload.format}
+- Size: {upload.size_bytes} bytes
+- Hub URL: {upload.hub_url}
+Load it with:
+```python
+{upload.load_dataset_snippet}
+```
+]"""
+async def push_dataset_upload_to_hub(
+    *,
+    upload: UploadFile,
+    session_id: str,
+    hf_username: str,
+    hf_token: str,
+) -> DatasetUpload:
+    """Validate ``upload`` and push it to the session's private dataset repo.
+    The file is stored at ``uploads/<upload_id>/<safe filename>`` and the
+    repo's README.md is then rewritten so its configs cover the files already
+    in the repo plus the new one. Blocking Hub and file calls run in worker
+    threads.
+    Returns:
+        A ``DatasetUpload`` describing where the file was stored.
+    Raises:
+        HTTPException: from validation or repo-id construction. Errors raised
+            by the Hub client calls are not caught here.
+    """
+    safe_filename, dataset_format, size = await validate_dataset_upload(upload)
+    original_filename = display_filename(upload.filename, safe_filename)
+    # A fresh random id per upload keeps each file under its own directory.
+    upload_id = uuid.uuid4().hex[:12]
+    config_name = dataset_config_name(upload_id)
+    repo_id = session_dataset_repo_id(hf_username, session_id)
+    path_in_repo = f"uploads/{upload_id}/{safe_filename}"
+    hub_url = dataset_hub_url(repo_id, path_in_repo)
+    snippet = load_dataset_snippet(repo_id, config_name)
+    api = HfApi(token=hf_token)
+    await asyncio.to_thread(
+        api.create_repo,
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=True,
+        exist_ok=True,
+    )
+    # Applied on every upload; presumably so a repo that already existed is
+    # also forced private — confirm against the Hub API behaviour.
+    await asyncio.to_thread(
+        api.update_repo_settings,
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=True,
+    )
+    # Listed before the new file is uploaded, so the new path is appended
+    # explicitly when computing the config list.
+    repo_files = await asyncio.to_thread(
+        api.list_repo_files,
+        repo_id=repo_id,
+        repo_type="dataset",
+    )
+    upload_paths = unique_dataset_upload_paths([*repo_files, path_in_repo])
+    await asyncio.to_thread(upload.file.seek, 0)
+    # Reads the whole file into memory before handing it to the Hub client.
+    file_bytes = await asyncio.to_thread(upload.file.read)
+    await asyncio.to_thread(
+        api.upload_file,
+        path_or_fileobj=file_bytes,
+        path_in_repo=path_in_repo,
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message=f"Upload dataset file {safe_filename}",
+    )
+    # The dataset card is written in a second, separate upload call.
+    await asyncio.to_thread(
+        api.upload_file,
+        path_or_fileobj=dataset_repo_card(repo_id, upload_paths),
+        path_in_repo="README.md",
+        repo_id=repo_id,
+        repo_type="dataset",
+        commit_message="Update ML Intern dataset upload configs",
+    )
+    return DatasetUpload(
+        session_id=session_id,
+        repo_id=repo_id,
+        repo_type="dataset",
+        private=True,
+        upload_id=upload_id,
+        config_name=config_name,
+        filename=safe_filename,
+        original_filename=original_filename,
+        path_in_repo=path_in_repo,
+        size_bytes=size,
+        format=dataset_format,
+        hub_url=hub_url,
+        load_dataset_snippet=snippet,
+    )

backend/models.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Pydantic models for API requests and responses."""
 from enum import Enum
-from typing import Any
 from pydantic import BaseModel, Field
@@ -120,6 +120,23 @@ class SessionYoloRequest(BaseModel):
     cost_cap_usd: float | None = Field(default=None, ge=0)
 class HealthResponse(BaseModel):
     """Health check response."""

 """Pydantic models for API requests and responses."""
 from enum import Enum
+from typing import Any, Literal
 from pydantic import BaseModel, Field
     cost_cap_usd: float | None = Field(default=None, ge=0)
+class DatasetUploadResponse(BaseModel):
+    """Response for a dataset file uploaded to the Hub."""
+    # Field names match the keys built by ``DatasetUpload.response_payload()``,
+    # which the upload route unpacks into this model.
+    session_id: str
+    repo_id: str
+    repo_type: Literal["dataset"] = "dataset"
+    private: bool = True
+    upload_id: str
+    config_name: str
+    filename: str
+    path_in_repo: str
+    size_bytes: int
+    # Restricted to the three file extensions listed here.
+    format: Literal["csv", "json", "jsonl"]
+    hub_url: str
+    load_dataset_snippet: str
 class HealthResponse(BaseModel):
     """Health check response."""

backend/routes/agent.py CHANGED Viewed

@@ -21,10 +21,18 @@ from fastapi import (
 )
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import StreamingResponse
-from litellm import acompletion
 from pydantic import ValidationError
 from models import (
     ApprovalRequest,
     HealthResponse,
     LLMHealthResponse,
     SessionInfo,
@@ -58,6 +66,7 @@ PREMIUM_MODEL_IDS = {
     DEFAULT_CLAUDE_MODEL_ID,
     "openai/gpt-5.5",
 }
 def _claude_picker_model_id() -> str:
@@ -203,6 +212,63 @@ def _user_hf_token(user: dict[str, Any] | None) -> str | None:
     return user.get(INTERNAL_HF_TOKEN_KEY)
 async def _check_session_access(
     session_id: str,
     user: dict[str, Any],
@@ -542,6 +608,86 @@ async def set_session_notifications(
     }
 @router.patch("/session/{session_id}/yolo")
 async def set_session_yolo(
     session_id: str,

 )
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import StreamingResponse
+from huggingface_hub.errors import HfHubHTTPError
+from litellm import Message, acompletion
 from pydantic import ValidationError
+from starlette.datastructures import FormData, UploadFile
+from dataset_uploads import (
+    MAX_DATASET_UPLOAD_BYTES,
+    dataset_context_note,
+    push_dataset_upload_to_hub,
+)
 from models import (
     ApprovalRequest,
+    DatasetUploadResponse,
     HealthResponse,
     LLMHealthResponse,
     SessionInfo,
     DEFAULT_CLAUDE_MODEL_ID,
     "openai/gpt-5.5",
 }
+DATASET_UPLOAD_MULTIPART_SLACK_BYTES = 1024 * 1024
 def _claude_picker_model_id() -> str:
     return user.get(INTERNAL_HF_TOKEN_KEY)
+def _reject_oversize_dataset_upload(request: Request) -> None:
+    """Reject the request early when its declared body size is too large.
+    Only the ``content-length`` header is inspected; a missing or non-integer
+    header lets the request through. The limit is the dataset size cap plus
+    the multipart slack allowance.
+    Raises:
+        HTTPException: 413 when the declared length exceeds the limit.
+    """
+    header_value = request.headers.get("content-length")
+    if header_value is None:
+        return
+    try:
+        declared_length = int(header_value)
+    except (TypeError, ValueError):
+        return
+    allowed_length = MAX_DATASET_UPLOAD_BYTES + DATASET_UPLOAD_MULTIPART_SLACK_BYTES
+    if declared_length <= allowed_length:
+        return
+    raise HTTPException(
+        status_code=413,
+        detail="Dataset upload exceeds the 100 MB limit.",
+    )
+def _dataset_upload_file_from_form(form: FormData) -> UploadFile:
+    """Return the single file part of ``form``, which must be named ``file``.
+    Raises:
+        HTTPException: 400 when the form does not hold exactly one file part,
+            or when that part's field name is not ``file``.
+    """
+    file_parts = []
+    for name, part in form.multi_items():
+        if isinstance(part, UploadFile):
+            file_parts.append((name, part))
+    if len(file_parts) != 1:
+        raise HTTPException(
+            status_code=400,
+            detail="Upload exactly one dataset file.",
+        )
+    field_name, upload = file_parts[0]
+    if field_name == "file":
+        return upload
+    raise HTTPException(
+        status_code=400,
+        detail="Missing 'file' upload field.",
+    )
+def _dataset_upload_hub_http_exception(error: HfHubHTTPError) -> HTTPException:
+    """Translate a Hub HTTP error into the ``HTTPException`` to return.
+    401, 403, 404 and 429 keep their status code with a user-facing detail;
+    any other status (or a missing one) becomes a generic 502.
+    """
+    known_details = (
+        (401, "Hugging Face rejected the token used for the dataset upload."),
+        (
+            403,
+            "Hugging Face denied permission to create or write to the dataset repo.",
+        ),
+        (404, "Could not find the Hugging Face namespace or dataset repo."),
+        (429, "Hugging Face Hub rate limit reached while uploading the dataset."),
+    )
+    status_code = getattr(error.response, "status_code", None)
+    for known_code, detail in known_details:
+        if status_code == known_code:
+            return HTTPException(status_code=known_code, detail=detail)
+    return HTTPException(
+        status_code=502,
+        detail="Hugging Face Hub upload failed. Please try again.",
+    )
 async def _check_session_access(
     session_id: str,
     user: dict[str, Any],
     }
+@router.post("/session/{session_id}/datasets", response_model=DatasetUploadResponse)
+async def upload_session_dataset(
+    session_id: str,
+    request: Request,
+    user: dict = Depends(get_current_user),
+) -> DatasetUploadResponse:
+    """Upload a CSV, JSON, or JSONL dataset file to a private Hub dataset for this session.
+    On success a note describing the upload is appended to the session
+    context as a user message and the session snapshot is persisted.
+    Raises:
+        HTTPException: 404 for a missing or inactive session, 409 while the
+            agent is processing or has pending approvals, 401 without a token,
+            400/413 for invalid uploads, a mapped status for Hub HTTP errors,
+            and 502 for any other failure.
+    """
+    file: UploadFile | None = None
+    try:
+        # Check the declared size before any session or form work is done.
+        _reject_oversize_dataset_upload(request)
+        agent_session = await _check_session_access(session_id, user, request)
+        if not agent_session or not agent_session.is_active:
+            raise HTTPException(status_code=404, detail="Session not found")
+        if agent_session.is_processing:
+            raise HTTPException(
+                status_code=409,
+                detail="Cannot upload a dataset while the agent is processing.",
+            )
+        if agent_session.session.pending_approval:
+            raise HTTPException(
+                status_code=409,
+                detail="Approve or reject pending tools before uploading a dataset.",
+            )
+        # Token preference: request token without env fallback, then the
+        # user's stored token, then the request resolver with its defaults.
+        hf_token = (
+            resolve_hf_request_token(request, include_env_fallback=False)
+            or _user_hf_token(user)
+            or resolve_hf_request_token(request)
+        )
+        if not hf_token:
+            raise HTTPException(
+                status_code=401,
+                detail="A Hugging Face token is required to upload datasets.",
+            )
+        form = await request.form(
+            max_files=1,
+            max_fields=1,
+            max_part_size=MAX_DATASET_UPLOAD_BYTES,
+        )
+        file = _dataset_upload_file_from_form(form)
+        hf_username = user.get("username") or agent_session.hf_username
+        uploaded = await push_dataset_upload_to_hub(
+            upload=file,
+            session_id=session_id,
+            hf_username=hf_username,
+            hf_token=hf_token,
+        )
+        agent_session.session.context_manager.add_message(
+            Message(role="user", content=dataset_context_note(uploaded))
+        )
+        await session_manager.persist_session_snapshot(agent_session)
+        logger.info(
+            "Uploaded dataset file %s to %s for session %s",
+            uploaded.filename,
+            uploaded.repo_id,
+            session_id,
+        )
+        return DatasetUploadResponse(**uploaded.response_payload())
+    except HTTPException:
+        # Already the response we want; pass through untouched.
+        raise
+    except HfHubHTTPError as e:
+        logger.warning(
+            "Hub rejected dataset upload for session %s: status=%s request_id=%s",
+            session_id,
+            getattr(e.response, "status_code", None),
+            getattr(e, "request_id", None),
+        )
+        # Chain explicitly so the Hub error is recorded as the cause.
+        raise _dataset_upload_hub_http_exception(e) from e
+    except Exception as e:
+        logger.exception("Dataset upload failed for session %s", session_id)
+        raise HTTPException(
+            status_code=502,
+            detail="Dataset upload failed. Please try again.",
+        ) from e
+    finally:
+        if file is not None:
+            await file.close()
 @router.patch("/session/{session_id}/yolo")
 async def set_session_yolo(
     session_id: str,

configs/cli_agent_config.json CHANGED Viewed

@@ -7,6 +7,7 @@
   "yolo_mode": false,
   "confirm_cpu_jobs": true,
   "auto_file_upload": true,
   "messaging": {
     "enabled": false,
     "auto_event_types": ["approval_required", "error", "turn_complete"],

   "yolo_mode": false,
   "confirm_cpu_jobs": true,
   "auto_file_upload": true,
+  "tool_runtime": "local",
   "messaging": {
     "enabled": false,
     "auto_event_types": ["approval_required", "error", "turn_complete"],

frontend/src/components/Chat/ChatInput.tsx CHANGED Viewed

@@ -11,12 +11,15 @@ import {
   ListItemIcon,
   ListItemText,
   Chip,
   Snackbar,
 } from '@mui/material';
 import ArrowUpwardIcon from '@mui/icons-material/ArrowUpward';
 import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
 import StopIcon from '@mui/icons-material/Stop';
-import { apiFetch } from '@/utils/api';
 import { useUserQuota } from '@/hooks/useUserQuota';
 import ClaudeCapDialog from '@/components/ClaudeCapDialog';
 import JobsUpgradeDialog from '@/components/JobsUpgradeDialog';
@@ -118,18 +121,49 @@ interface ChatInputProps {
   initialModelPath?: string | null;
   onSend: (text: string) => void;
   onStop?: () => void;
   isProcessing?: boolean;
   disabled?: boolean;
   placeholder?: string;
 }
 const isClaudeModel = (m: ModelOption) => isClaudePath(m.modelPath);
 const isPremiumModel = (m: ModelOption) => isPremiumPath(m.modelPath);
 const firstFreeModel = (options: ModelOption[]) => options.find(m => !isPremiumModel(m)) ?? options[0];
-export default function ChatInput({ sessionId, initialModelPath, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
   const [input, setInput] = useState('');
   const inputRef = useRef<HTMLTextAreaElement>(null);
   const [modelOptions, setModelOptions] = useState<ModelOption[]>(DEFAULT_MODEL_OPTIONS);
   const modelOptionsRef = useRef<ModelOption[]>(DEFAULT_MODEL_OPTIONS);
   const sessionIdRef = useRef<string | undefined>(sessionId);
@@ -150,6 +184,11 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
   const updateSessionModel = useSessionStore((s) => s.updateSessionModel);
   const [awaitingTopUp, setAwaitingTopUp] = useState(false);
   const [modelSwitchError, setModelSwitchError] = useState<string | null>(null);
   const lastSentRef = useRef<string>('');
   useEffect(() => {
@@ -216,12 +255,75 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
   }, [disabled, isProcessing]);
   const handleSend = useCallback(() => {
-    if (input.trim() && !disabled) {
       lastSentRef.current = input;
       onSend(input);
       setInput('');
     }
-  }, [input, disabled, onSend]);
   // When the chat transport reports a premium-model quota 429, restore the typed
   // text so the user doesn't lose their message.
@@ -231,6 +333,18 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
     }
   }, [claudeQuotaExhausted]);
   // Refresh the quota display whenever the session changes (user might
   // have started another tab that spent quota).
   useEffect(() => {
@@ -382,9 +496,12 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
         <Box
           className="composer"
           sx={{
-            display: 'flex',
-            gap: '10px',
-            alignItems: 'flex-start',
             bgcolor: 'var(--composer-bg)',
             borderRadius: 'var(--radius-md)',
             p: '12px',
@@ -420,7 +537,7 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
                 }
             }}
             sx={{
-                flex: 1,
                 '& .MuiInputBase-root': {
                     p: 0,
                     backgroundColor: 'transparent',
@@ -431,11 +548,46 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
                 }
             }}
           />
           {isProcessing ? (
             <IconButton
               onClick={onStop}
               sx={{
-                mt: 1,
                 p: 1.5,
                 borderRadius: '10px',
                 color: 'var(--muted-text)',
@@ -455,9 +607,11 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
           ) : (
             <IconButton
               onClick={handleSend}
-              disabled={disabled || !input.trim()}
               sx={{
-                mt: 1,
                 p: 1,
                 borderRadius: '10px',
                 color: 'var(--muted-text)',
@@ -475,6 +629,65 @@ export default function ChatInput({ sessionId, initialModelPath, onSend, onStop,
             </IconButton>
           )}
         </Box>
         {/* Powered By Badge */}
         <Box

   ListItemIcon,
   ListItemText,
   Chip,
+  LinearProgress,
   Snackbar,
+  Tooltip,
 } from '@mui/material';
 import ArrowUpwardIcon from '@mui/icons-material/ArrowUpward';
 import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
 import StopIcon from '@mui/icons-material/Stop';
+import AddIcon from '@mui/icons-material/Add';
+import { apiFetch, apiUpload } from '@/utils/api';
 import { useUserQuota } from '@/hooks/useUserQuota';
 import ClaudeCapDialog from '@/components/ClaudeCapDialog';
 import JobsUpgradeDialog from '@/components/JobsUpgradeDialog';
   initialModelPath?: string | null;
   onSend: (text: string) => void;
   onStop?: () => void;
+  onDatasetUploaded?: () => Promise<boolean> | boolean;
   isProcessing?: boolean;
   disabled?: boolean;
   placeholder?: string;
 }
+/**
+ * JSON body this component reads from a successful dataset upload response.
+ * Only `upload_id`, `filename`, and `repo_id` are consumed by the composer UI
+ * in this file (chip key, chip label / success toast, and Hub link).
+ */
+interface DatasetUploadResponse {
+  session_id: string;
+  repo_id: string;
+  repo_type: 'dataset';
+  private: true;
+  upload_id: string;
+  config_name: string;
+  filename: string;
+  path_in_repo: string;
+  size_bytes: number;
+  format: 'csv' | 'json' | 'jsonl';
+  hub_url: string;
+  load_dataset_snippet: string;
+}
+// Client-side upload size cap (100 * 1024 * 1024 bytes), checked before the request is sent.
+const MAX_DATASET_UPLOAD_BYTES = 100 * 1024 * 1024;
+// Value for the hidden file input's `accept` attribute.
+const DATASET_UPLOAD_ACCEPT = '.csv,.json,.jsonl';
+// Lower-cased extensions (without the dot) accepted by the client-side check.
+const DATASET_UPLOAD_EXTENSIONS = new Set(['csv', 'json', 'jsonl']);
 const isClaudeModel = (m: ModelOption) => isClaudePath(m.modelPath);
 const isPremiumModel = (m: ModelOption) => isPremiumPath(m.modelPath);
 const firstFreeModel = (options: ModelOption[]) => options.find(m => !isPremiumModel(m)) ?? options[0];
+/**
+ * Format a byte count for display using 1024-based units.
+ * Below 1024 the raw count is shown in B; below 1024 * 1024 it is shown in KB;
+ * everything larger (including gigabyte-sized values) is shown in MB.
+ * KB and MB values are rendered with one decimal place.
+ */
+const formatBytes = (bytes: number) => {
+  if (bytes < 1024) return `${bytes} B`;
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
+  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
+};
+/**
+ * Build the huggingface.co dataset page URL for a repo id.
+ * Each `/`-separated segment is URI-encoded on its own so the separators survive.
+ */
+const datasetRepoUrl = (repoId: string) => (
+  `https://huggingface.co/datasets/${repoId.split('/').map(encodeURIComponent).join('/')}`
+);
+export default function ChatInput({ sessionId, initialModelPath, onSend, onStop, onDatasetUploaded, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
   const [input, setInput] = useState('');
   const inputRef = useRef<HTMLTextAreaElement>(null);
+  const fileInputRef = useRef<HTMLInputElement>(null);
   const [modelOptions, setModelOptions] = useState<ModelOption[]>(DEFAULT_MODEL_OPTIONS);
   const modelOptionsRef = useRef<ModelOption[]>(DEFAULT_MODEL_OPTIONS);
   const sessionIdRef = useRef<string | undefined>(sessionId);
   const updateSessionModel = useSessionStore((s) => s.updateSessionModel);
   const [awaitingTopUp, setAwaitingTopUp] = useState(false);
   const [modelSwitchError, setModelSwitchError] = useState<string | null>(null);
+  const [datasetUploadError, setDatasetUploadError] = useState<string | null>(null);
+  const [datasetUploadSuccess, setDatasetUploadSuccess] = useState<string | null>(null);
+  const [uploadedDatasets, setUploadedDatasets] = useState<DatasetUploadResponse[]>([]);
+  const [isUploadingDataset, setIsUploadingDataset] = useState(false);
+  const [datasetUploadProgress, setDatasetUploadProgress] = useState<number | null>(null);
   const lastSentRef = useRef<string>('');
   useEffect(() => {
   }, [disabled, isProcessing]);
   const handleSend = useCallback(() => {
+    if (input.trim() && !disabled && !isUploadingDataset) {
       lastSentRef.current = input;
       onSend(input);
       setInput('');
     }
+  }, [input, disabled, isUploadingDataset, onSend]);
+  // Open the native file picker by clicking the hidden file input, if it is mounted.
+  const handleDatasetUploadClick = useCallback(() => {
+    fileInputRef.current?.click();
+  }, []);
+  // Validate the picked file client-side, upload it to the session's dataset
+  // endpoint with progress reporting, then record the result and notify the
+  // parent via `onDatasetUploaded`. Every failure path surfaces a message through
+  // `setDatasetUploadError`; the uploading/progress state is always reset at the end.
+  const handleDatasetFileChange = useCallback(
+    async (event: React.ChangeEvent<HTMLInputElement>) => {
+      const file = event.target.files?.[0];
+      // Clear the input so picking the same file again still fires a change event.
+      event.target.value = '';
+      if (!file) return;
+      if (!sessionId) {
+        setDatasetUploadError('Start a session before uploading a dataset.');
+        return;
+      }
+      // Take the extension from the last dot only. A name with no dot at all
+      // (for example a file literally called "csv") has no extension and must
+      // be rejected; `split('.').pop()` would have returned the whole name.
+      const lastDot = file.name.lastIndexOf('.');
+      const extension = lastDot === -1 ? '' : file.name.slice(lastDot + 1).toLowerCase();
+      if (!DATASET_UPLOAD_EXTENSIONS.has(extension)) {
+        setDatasetUploadError('Only CSV, JSON, and JSONL dataset files are supported.');
+        return;
+      }
+      if (file.size > MAX_DATASET_UPLOAD_BYTES) {
+        setDatasetUploadError(
+          `Dataset files must be 100 MB or smaller. ${file.name} is ${formatBytes(file.size)}.`
+        );
+        return;
+      }
+      if (file.size === 0) {
+        setDatasetUploadError('Uploaded dataset file is empty.');
+        return;
+      }
+      const formData = new FormData();
+      formData.append('file', file);
+      setIsUploadingDataset(true);
+      setDatasetUploadProgress(0);
+      setDatasetUploadError(null);
+      setDatasetUploadSuccess(null);
+      try {
+        const res = await apiUpload(`/api/session/${sessionId}/datasets`, formData, {
+          onProgress: ({ percent }) => {
+            // Once the bytes are fully sent (or the total is unknown) store null,
+            // which the progress bar renders as indeterminate.
+            setDatasetUploadProgress(percent !== null && percent < 100 ? percent : null);
+          },
+        });
+        if (!res.ok) {
+          setDatasetUploadError(await readApiErrorMessage(res, 'Dataset upload failed.'));
+          return;
+        }
+        const payload = await res.json() as DatasetUploadResponse;
+        // Newest upload first.
+        setUploadedDatasets((previous) => [payload, ...previous]);
+        setDatasetUploadSuccess(`Uploaded ${payload.filename} to ${payload.repo_id}`);
+        await onDatasetUploaded?.();
+      } catch (error) {
+        setDatasetUploadError(
+          error instanceof Error ? error.message : 'Dataset upload failed.'
+        );
+      } finally {
+        setIsUploadingDataset(false);
+        setDatasetUploadProgress(null);
+      }
+    },
+    [sessionId, onDatasetUploaded],
+  );
   // When the chat transport reports a premium-model quota 429, restore the typed
   // text so the user doesn't lose their message.
     }
   }, [claudeQuotaExhausted]);
+  // Auto-dismiss the dataset upload error message after 7 seconds; the cleanup
+  // cancels the pending timer when the message changes or the component unmounts.
+  useEffect(() => {
+    if (!datasetUploadError) return;
+    const timeout = window.setTimeout(() => setDatasetUploadError(null), 7000);
+    return () => window.clearTimeout(timeout);
+  }, [datasetUploadError]);
+  // Auto-dismiss the dataset upload success message after 5 seconds.
+  useEffect(() => {
+    if (!datasetUploadSuccess) return;
+    const timeout = window.setTimeout(() => setDatasetUploadSuccess(null), 5000);
+    return () => window.clearTimeout(timeout);
+  }, [datasetUploadSuccess]);
   // Refresh the quota display whenever the session changes (user might
   // have started another tab that spent quota).
   useEffect(() => {
         <Box
           className="composer"
           sx={{
+            display: 'grid',
+            gridTemplateColumns: 'auto 1fr auto',
+            gridTemplateRows: 'auto auto',
+            columnGap: '10px',
+            rowGap: '4px',
+            alignItems: 'end',
             bgcolor: 'var(--composer-bg)',
             borderRadius: 'var(--radius-md)',
             p: '12px',
                 }
             }}
             sx={{
+                gridColumn: '1 / -1',
                 '& .MuiInputBase-root': {
                     p: 0,
                     backgroundColor: 'transparent',
                 }
             }}
           />
+          <input
+            ref={fileInputRef}
+            type="file"
+            accept={DATASET_UPLOAD_ACCEPT}
+            onChange={handleDatasetFileChange}
+            style={{ display: 'none' }}
+          />
+          <Box sx={{ gridColumn: '1', gridRow: '2', display: 'flex' }}>
+            <Tooltip title="Upload dataset">
+              <span>
+                <IconButton
+                  onClick={handleDatasetUploadClick}
+                  disabled={disabled || isProcessing || isUploadingDataset || !sessionId}
+                  sx={{
+                    p: 1,
+                    borderRadius: '50%',
+                    color: uploadedDatasets.length ? 'var(--accent-yellow)' : 'var(--muted-text)',
+                    transition: 'all 0.2s',
+                    '&:hover': {
+                      color: 'var(--accent-yellow)',
+                      bgcolor: 'var(--hover-bg)',
+                    },
+                    '&.Mui-disabled': {
+                      opacity: 0.3,
+                    },
+                  }}
+                  aria-label="Upload dataset"
+                >
+                  <AddIcon fontSize="small" />
+                </IconButton>
+              </span>
+            </Tooltip>
+          </Box>
           {isProcessing ? (
             <IconButton
               onClick={onStop}
               sx={{
+                gridColumn: '3',
+                gridRow: '2',
+                justifySelf: 'end',
                 p: 1.5,
                 borderRadius: '10px',
                 color: 'var(--muted-text)',
           ) : (
             <IconButton
               onClick={handleSend}
+              disabled={disabled || isUploadingDataset || !input.trim()}
               sx={{
+                gridColumn: '3',
+                gridRow: '2',
+                justifySelf: 'end',
                 p: 1,
                 borderRadius: '10px',
                 color: 'var(--muted-text)',
             </IconButton>
           )}
         </Box>
+        {isUploadingDataset && (
+          <Box sx={{ mt: 1, px: 0.5 }}>
+            <LinearProgress
+              variant={datasetUploadProgress === null ? 'indeterminate' : 'determinate'}
+              value={datasetUploadProgress ?? 0}
+              aria-label="Dataset upload progress"
+              sx={{
+                height: 4,
+                borderRadius: 999,
+                bgcolor: 'rgba(255,255,255,0.08)',
+                '& .MuiLinearProgress-bar': {
+                  borderRadius: 999,
+                  bgcolor: 'var(--accent-yellow)',
+                },
+              }}
+            />
+          </Box>
+        )}
+        {(datasetUploadError || datasetUploadSuccess) && (
+          <Box sx={{ display: 'flex', justifyContent: 'center', mt: 1 }}>
+            <Alert
+              severity={datasetUploadError ? 'error' : 'success'}
+              variant="filled"
+              onClose={() => {
+                setDatasetUploadError(null);
+                setDatasetUploadSuccess(null);
+              }}
+              sx={{ fontSize: '0.8rem', maxWidth: 520, width: '100%' }}
+            >
+              {datasetUploadError ?? datasetUploadSuccess}
+            </Alert>
+          </Box>
+        )}
+        {uploadedDatasets.length > 0 && (
+          <Box sx={{ display: 'flex', flexWrap: 'wrap', gap: 0.75, justifyContent: 'center', mt: 1 }}>
+            {uploadedDatasets.map((dataset) => (
+              <Chip
+                key={dataset.upload_id}
+                size="small"
+                label={`Dataset: ${dataset.filename}`}
+                component="a"
+                href={datasetRepoUrl(dataset.repo_id)}
+                target="_blank"
+                rel="noreferrer"
+                clickable
+                sx={{
+                  maxWidth: '100%',
+                  bgcolor: 'rgba(255,255,255,0.08)',
+                  color: 'var(--text)',
+                  border: '1px solid var(--divider)',
+                  '& .MuiChip-label': {
+                    overflow: 'hidden',
+                    textOverflow: 'ellipsis',
+                  },
+                }}
+              />
+            ))}
+          </Box>
+        )}
         {/* Powered By Badge */}
         <Box

frontend/src/components/SessionChat.tsx CHANGED Viewed

@@ -27,7 +27,16 @@ export default function SessionChat({ sessionId, isActive, onSessionDead }: Sess
   const sessionMeta = sessions.find((s) => s.id === sessionId);
   const isExpired = sessionMeta?.expired === true;
-  const { messages, sendMessage, stop, status, undoLastTurn, editAndRegenerate, approveTools } = useAgentChat({
     sessionId,
     isActive,
     onReady: () => logger.log(`Session ${sessionId} ready`),
@@ -116,6 +125,7 @@ export default function SessionChat({ sessionId, isActive, onSessionDead }: Sess
           initialModelPath={sessionMeta?.model}
           onSend={handleSendMessage}
           onStop={handleStop}
           isProcessing={busy}
           disabled={!isConnected || activityStatus.type === 'waiting-approval'}
           placeholder={

   const sessionMeta = sessions.find((s) => s.id === sessionId);
   const isExpired = sessionMeta?.expired === true;
+  const {
+    messages,
+    sendMessage,
+    stop,
+    status,
+    undoLastTurn,
+    editAndRegenerate,
+    approveTools,
+    refreshMessages,
+  } = useAgentChat({
     sessionId,
     isActive,
     onReady: () => logger.log(`Session ${sessionId} ready`),
           initialModelPath={sessionMeta?.model}
           onSend={handleSendMessage}
           onStop={handleStop}
+          onDatasetUploaded={refreshMessages}
           isProcessing={busy}
           disabled={!isConnected || activityStatus.type === 'waiting-approval'}
           placeholder={

frontend/src/hooks/useAgentChat.ts CHANGED Viewed

@@ -804,6 +804,48 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
     }
   }, [sessionId, chat]);
   return {
     messages: chat.messages,
     sendMessage: chat.sendMessage,
@@ -812,5 +854,6 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
     undoLastTurn,
     editAndRegenerate,
     approveTools,
   };
 }

     }
   }, [sessionId, chat]);
+  // Re-fetch this session's message history and session info from the backend and
+  // replace the chat UI messages with the converted result.
+  // Resolves to true once a non-empty message array was received and processed;
+  // resolves to false on a non-OK messages response, empty or non-array data, or
+  // any thrown error (errors are swallowed, never rethrown).
+  const refreshMessages = useCallback(async () => {
+    try {
+      // Both requests are issued in parallel.
+      const [msgsRes, infoRes] = await Promise.all([
+        apiFetch(`/api/session/${sessionId}/messages`),
+        apiFetch(`/api/session/${sessionId}`),
+      ]);
+      if (!msgsRes.ok) return false;
+      const data = await msgsRes.json();
+      if (!Array.isArray(data) || data.length === 0) return false;
+      saveBackendMessages(sessionId, data);
+      // Session info is optional: a failed info request only skips the
+      // pending-approval and auto-approval updates below.
+      let pendingIds: Set<string> | undefined;
+      if (infoRes.ok) {
+        const info = await infoRes.json();
+        if (info.pending_approval && Array.isArray(info.pending_approval)) {
+          pendingIds = new Set(
+            info.pending_approval.map((t: { tool_call_id: string }) => t.tool_call_id)
+          );
+          if (pendingIds.size > 0) setNeedsAttention(sessionId, true);
+        }
+        if (info.auto_approval) {
+          updateSessionYolo(sessionId, info.auto_approval);
+        }
+      }
+      // The current UI messages are passed to the converter alongside the backend data.
+      const uiMsgs = llmMessagesToUIMessages(
+        data,
+        pendingIds,
+        chatActionsRef.current.messages,
+      );
+      const setMsgs = chatActionsRef.current.setMessages;
+      // Only overwrite UI state when a setter exists and the conversion produced messages.
+      if (setMsgs && uiMsgs.length > 0) {
+        setMsgs(uiMsgs);
+        saveMessages(sessionId, uiMsgs);
+      }
+      return true;
+    } catch {
+      return false;
+    }
+  }, [sessionId, setNeedsAttention, updateSessionYolo]);
   return {
     messages: chat.messages,
     sendMessage: chat.sendMessage,
     undoLastTurn,
     editAndRegenerate,
     approveTools,
+    refreshMessages,
   };
 }

frontend/src/utils/api.ts CHANGED Viewed

@@ -7,15 +7,36 @@
 import { triggerLogin } from '@/hooks/useAuth';
 /** Wrapper around fetch with credentials and common headers. */
 export async function apiFetch(
   path: string,
   options: RequestInit = {}
 ): Promise<Response> {
-  const headers: Record<string, string> = {
-    'Content-Type': 'application/json',
-    ...(options.headers as Record<string, string>),
-  };
   const response = await fetch(path, {
     ...options,
@@ -23,19 +44,50 @@ export async function apiFetch(
     credentials: 'include', // Send cookies with every request
   });
-  // Handle 401 — redirect to login
-  if (response.status === 401) {
-    try {
-      const authStatus = await fetch('/auth/status', { credentials: 'include' });
-      const data = await authStatus.json();
-      if (data.auth_enabled) {
-        triggerLogin();
-        throw new Error('Authentication required — redirecting to login.');
-      }
-    } catch (e) {
-      if (e instanceof Error && e.message.includes('redirecting')) throw e;
-    }
-  }
   return response;
-}

 import { triggerLogin } from '@/hooks/useAuth';
+/**
+ * Progress snapshot passed to `apiUpload`'s `onProgress` callback.
+ * `total` is null when the progress event is not length-computable;
+ * `percent` is a rounded 0-100 value, or null when `total` is null or 0.
+ */
+export interface ApiUploadProgress {
+  loaded: number;
+  total: number | null;
+  percent: number | null;
+}
+/**
+ * React to a 401 response; any other status is a no-op.
+ * Asks `/auth/status` whether auth is enabled and, if so, triggers the login
+ * flow and throws so the caller stops processing the response.
+ * Failures of the status check itself (network error, invalid JSON) are
+ * swallowed, in which case this resolves normally.
+ */
+async function handleUnauthorized(response: Response): Promise<void> {
+  if (response.status !== 401) return;
+  try {
+    const authStatus = await fetch('/auth/status', { credentials: 'include' });
+    const data = await authStatus.json();
+    if (data.auth_enabled) {
+      triggerLogin();
+      throw new Error('Authentication required — redirecting to login.');
+    }
+  } catch (e) {
+    // Only the redirect error thrown above is propagated; it is recognised by
+    // the word "redirecting" in its message.
+    if (e instanceof Error && e.message.includes('redirecting')) throw e;
+  }
+}
 /** Wrapper around fetch with credentials and common headers. */
 export async function apiFetch(
   path: string,
   options: RequestInit = {}
 ): Promise<Response> {
+  const headers = new Headers(options.headers);
+  const isFormData = options.body instanceof FormData;
+  if (!isFormData && !headers.has('Content-Type')) {
+    headers.set('Content-Type', 'application/json');
+  }
   const response = await fetch(path, {
     ...options,
     credentials: 'include', // Send cookies with every request
   });
+  await handleUnauthorized(response);
   return response;
+}
+/**
+ * Convert a raw header block (as passed from `xhr.getAllResponseHeaders()`)
+ * into a `Headers` object. Lines are split on CR/LF runs; a line with no colon,
+ * or with a colon in first position (empty name), is skipped. Names and values
+ * are trimmed, and repeated names are appended rather than overwritten.
+ */
+function headersFromXhr(rawHeaders: string): Headers {
+  const headers = new Headers();
+  rawHeaders.trim().split(/[\r\n]+/).forEach((line) => {
+    const separator = line.indexOf(':');
+    if (separator <= 0) return;
+    headers.append(
+      line.slice(0, separator).trim(),
+      line.slice(separator + 1).trim(),
+    );
+  });
+  return headers;
+}
+/**
+ * POST `formData` to `path` with XMLHttpRequest so upload progress can be reported.
+ *
+ * Resolves with a fetch-style `Response` built from the XHR result for any HTTP
+ * status (callers must check `ok` themselves). Rejects on a network error, on
+ * abort, when `handleUnauthorized` throws for a 401, or when the XHR result
+ * cannot be represented as a `Response`.
+ *
+ * @param path URL to post to.
+ * @param formData Multipart body to send.
+ * @param options.onProgress Optional callback invoked on each upload progress event.
+ */
+export async function apiUpload(
+  path: string,
+  formData: FormData,
+  options: { onProgress?: (progress: ApiUploadProgress) => void } = {},
+): Promise<Response> {
+  return new Promise<Response>((resolve, reject) => {
+    const xhr = new XMLHttpRequest();
+    xhr.open('POST', path);
+    xhr.withCredentials = true;
+    xhr.upload.onprogress = (event) => {
+      const total = event.lengthComputable ? event.total : null;
+      // A null or zero total yields a null percent rather than a division by zero.
+      const percent = total
+        ? Math.min(100, Math.round((event.loaded / total) * 100))
+        : null;
+      options.onProgress?.({ loaded: event.loaded, total, percent });
+    };
+    xhr.onerror = () => reject(new Error('Network error while uploading.'));
+    xhr.onabort = () => reject(new Error('Dataset upload was canceled.'));
+    xhr.onload = () => {
+      try {
+        // The Response constructor throws when a body is supplied for a
+        // null-body status, and for statuses outside 200-599. Without this
+        // guard such an exception would escape the event handler and leave
+        // the promise pending forever.
+        const isNullBodyStatus =
+          xhr.status === 204 || xhr.status === 205 || xhr.status === 304;
+        const response = new Response(isNullBodyStatus ? null : xhr.responseText, {
+          status: xhr.status,
+          statusText: xhr.statusText,
+          headers: headersFromXhr(xhr.getAllResponseHeaders()),
+        });
+        handleUnauthorized(response).then(() => resolve(response)).catch(reject);
+      } catch (error) {
+        reject(error instanceof Error ? error : new Error('Dataset upload failed.'));
+      }
+    };
+    xhr.send(formData);
+  });
+}

pyproject.toml CHANGED Viewed

@@ -28,6 +28,7 @@ dependencies = [
     "websockets>=13.0",
     "apscheduler>=3.10,<4",
     "pymongo>=4.17.0",
 ]
 [project.optional-dependencies]

     "websockets>=13.0",
     "apscheduler>=3.10,<4",
     "pymongo>=4.17.0",
+    "python-multipart>=0.0.20",
 ]
 [project.optional-dependencies]

tests/unit/test_cli_rendering.py CHANGED Viewed

@@ -1,10 +1,12 @@
 """Regression tests for interactive CLI rendering and research model routing."""
 import sys
 from io import StringIO
 from types import SimpleNamespace
 import pytest
 import agent.main as main_mod
 from agent.tools.research_tool import _get_research_model
@@ -29,6 +31,50 @@ def test_non_anthropic_research_model_is_unchanged():
     assert _get_research_model("openai/gpt-5.4") == "openai/gpt-5.4"
 def test_subagent_display_does_not_spawn_background_redraw(monkeypatch):
     calls: list[object] = []
@@ -52,10 +98,11 @@ def test_subagent_display_does_not_spawn_background_redraw(monkeypatch):
 def test_cli_forwards_model_flag_to_interactive_main(monkeypatch):
-    seen: dict[str, str | None] = {}
-    async def fake_main(*, model=None):
         seen["model"] = model
     monkeypatch.setattr(sys, "argv", ["ml-intern", "--model", "openai/gpt-5.5"])
     monkeypatch.setattr(main_mod, "main", fake_main)
@@ -63,6 +110,61 @@ def test_cli_forwards_model_flag_to_interactive_main(monkeypatch):
     main_mod.cli()
     assert seen["model"] == "openai/gpt-5.5"
 @pytest.mark.asyncio
@@ -70,9 +172,10 @@ async def test_interactive_main_applies_model_override_before_banner(monkeypatch
     class StopAfterBanner(Exception):
         pass
-    def fake_banner(*, model=None, hf_user=None):
         assert model == "openai/gpt-5.5"
         assert hf_user == "tester"
         raise StopAfterBanner
     monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
@@ -85,9 +188,150 @@ async def test_interactive_main_applies_model_override_before_banner(monkeypatch
         lambda _path, **_kwargs: SimpleNamespace(
             model_name="moonshotai/Kimi-K2.6",
             mcpServers={},
         ),
     )
     monkeypatch.setattr(main_mod, "print_banner", fake_banner)
     with pytest.raises(StopAfterBanner):
         await main_mod.main(model="openai/gpt-5.5")

 """Regression tests for interactive CLI rendering and research model routing."""
+import asyncio
 import sys
 from io import StringIO
 from types import SimpleNamespace
 import pytest
+from rich.console import Console
 import agent.main as main_mod
 from agent.tools.research_tool import _get_research_model
     assert _get_research_model("openai/gpt-5.4") == "openai/gpt-5.4"
+def test_help_output_keeps_descriptions_aligned(monkeypatch):
+    """Every row printed by ``print_help`` starts its description at the same column.
+
+    The module console is replaced with a colourless, 120-column console that
+    writes to a ``StringIO`` so the rendered text can be inspected.
+    """
+    output = StringIO()
+    console = Console(
+        file=output,
+        color_system=None,
+        theme=terminal_display._THEME,
+        width=120,
+    )
+    monkeypatch.setattr(terminal_display, "_console", console)
+    terminal_display.print_help()
+    lines = [line.rstrip() for line in output.getvalue().splitlines() if line.strip()]
+    description_columns = []
+    for command, args, description in terminal_display.HELP_ROWS:
+        # First rendered line mentioning the command is taken as that row.
+        line = next(line for line in lines if command in line)
+        if args:
+            assert args in line
+        description_columns.append(line.index(description))
+    assert len(set(description_columns)) == 1
+def test_help_output_recomputes_widths_from_rows():
+    """``format_help_text`` keeps descriptions aligned when a longer row is added.
+
+    A synthetic row with a longer command and args is appended to ``HELP_ROWS``
+    and the result is rendered on a colourless, 140-column console.
+    """
+    rows = terminal_display.HELP_ROWS + (
+        ("/longer-command", "[longer-args]", "Synthetic help row"),
+    )
+    output = StringIO()
+    Console(
+        file=output,
+        color_system=None,
+        theme=terminal_display._THEME,
+        width=140,
+    ).print(terminal_display.format_help_text(rows))
+    lines = [line.rstrip() for line in output.getvalue().splitlines() if line.strip()]
+    description_columns = [
+        next(line for line in lines if command in line).index(description)
+        for command, _args, description in rows
+    ]
+    assert len(set(description_columns)) == 1
 def test_subagent_display_does_not_spawn_background_redraw(monkeypatch):
     calls: list[object] = []
 def test_cli_forwards_model_flag_to_interactive_main(monkeypatch):
+    seen: dict[str, object] = {}
+    async def fake_main(*, model=None, sandbox_tools=False):
         seen["model"] = model
+        seen["sandbox_tools"] = sandbox_tools
     monkeypatch.setattr(sys, "argv", ["ml-intern", "--model", "openai/gpt-5.5"])
     monkeypatch.setattr(main_mod, "main", fake_main)
     main_mod.cli()
     assert seen["model"] == "openai/gpt-5.5"
+    assert seen["sandbox_tools"] is False
+def test_cli_forwards_sandbox_flag_to_interactive_main(monkeypatch):
+    """``--sandbox-tools`` without a prompt reaches ``main`` as ``sandbox_tools=True``."""
+    seen: dict[str, object] = {}
+    async def fake_main(*, model=None, sandbox_tools=False):
+        seen["model"] = model
+        seen["sandbox_tools"] = sandbox_tools
+    monkeypatch.setattr(sys, "argv", ["ml-intern", "--sandbox-tools"])
+    monkeypatch.setattr(main_mod, "main", fake_main)
+    main_mod.cli()
+    assert seen == {"model": None, "sandbox_tools": True}
+def test_cli_forwards_sandbox_flag_to_headless_main(monkeypatch):
+    """With a positional prompt, ``cli`` calls ``headless_main`` and forwards the flags.
+
+    ``--sandbox-tools`` must arrive as ``sandbox_tools=True`` and ``--no-stream``
+    as ``stream=False``; options not given stay ``None``.
+    """
+    seen: dict[str, object] = {}
+    async def fake_headless_main(
+        prompt,
+        *,
+        model=None,
+        max_iterations=None,
+        stream=True,
+        sandbox_tools=False,
+    ):
+        seen.update(
+            {
+                "prompt": prompt,
+                "model": model,
+                "max_iterations": max_iterations,
+                "stream": stream,
+                "sandbox_tools": sandbox_tools,
+            }
+        )
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["ml-intern", "--sandbox-tools", "--no-stream", "train a model"],
+    )
+    monkeypatch.setattr(main_mod, "headless_main", fake_headless_main)
+    main_mod.cli()
+    assert seen == {
+        "prompt": "train a model",
+        "model": None,
+        "max_iterations": None,
+        "stream": False,
+        "sandbox_tools": True,
+    }
 @pytest.mark.asyncio
     class StopAfterBanner(Exception):
         pass
+    def fake_banner(*, model=None, hf_user=None, tool_runtime=None):
         assert model == "openai/gpt-5.5"
         assert hf_user == "tester"
+        assert tool_runtime == "local filesystem"
         raise StopAfterBanner
     monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
         lambda _path, **_kwargs: SimpleNamespace(
             model_name="moonshotai/Kimi-K2.6",
             mcpServers={},
+            tool_runtime="local",
         ),
     )
     monkeypatch.setattr(main_mod, "print_banner", fake_banner)
     with pytest.raises(StopAfterBanner):
         await main_mod.main(model="openai/gpt-5.5")
+@pytest.mark.asyncio
+async def test_local_model_local_runtime_skips_hf_token_prompt(monkeypatch):
+    """A ``llamacpp/`` model with the local tool runtime never prompts for an HF token.
+
+    No token is resolvable and the prompt helper is patched to fail if called.
+    The run is cut short by raising from the patched banner, which also checks
+    the values it was given.
+    """
+    class StopAfterBanner(Exception):
+        pass
+    async def fail_prompt(_prompt_session):
+        raise AssertionError("local model with local tools should not prompt")
+    def fake_banner(*, model=None, hf_user=None, tool_runtime=None):
+        assert model == "llamacpp/model"
+        assert hf_user is None
+        assert tool_runtime == "local filesystem"
+        raise StopAfterBanner
+    monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
+    monkeypatch.setattr(main_mod, "PromptSession", lambda: object())
+    monkeypatch.setattr(main_mod, "resolve_hf_token", lambda: None)
+    monkeypatch.setattr(main_mod, "_prompt_and_save_hf_token", fail_prompt)
+    monkeypatch.setattr(main_mod, "_get_hf_user", lambda _token: None)
+    monkeypatch.setattr(
+        main_mod,
+        "load_config",
+        lambda _path, **_kwargs: SimpleNamespace(
+            model_name="llamacpp/model",
+            mcpServers={},
+            tool_runtime="local",
+        ),
+    )
+    monkeypatch.setattr(main_mod, "print_banner", fake_banner)
+    with pytest.raises(StopAfterBanner):
+        await main_mod.main()
+@pytest.mark.asyncio
+async def test_local_model_sandbox_runtime_prompts_for_hf_token(monkeypatch):
+    """A ``llamacpp/`` model run with ``sandbox_tools=True`` does prompt for an HF token.
+
+    The config says ``tool_runtime="local"`` but the ``sandbox_tools`` argument is
+    set; the patched banner expects the "HF sandbox" runtime label and the user
+    resolved from the prompted token.
+    """
+    class StopAfterBanner(Exception):
+        pass
+    prompted = False
+    async def fake_prompt(_prompt_session):
+        nonlocal prompted
+        prompted = True
+        return "hf-token"
+    def fake_banner(*, model=None, hf_user=None, tool_runtime=None):
+        assert model == "llamacpp/model"
+        assert hf_user == "tester"
+        assert tool_runtime == "HF sandbox"
+        raise StopAfterBanner
+    monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
+    monkeypatch.setattr(main_mod, "PromptSession", lambda: object())
+    monkeypatch.setattr(main_mod, "resolve_hf_token", lambda: None)
+    monkeypatch.setattr(main_mod, "_prompt_and_save_hf_token", fake_prompt)
+    monkeypatch.setattr(main_mod, "_get_hf_user", lambda _token: "tester")
+    monkeypatch.setattr(
+        main_mod,
+        "load_config",
+        lambda _path, **_kwargs: SimpleNamespace(
+            model_name="llamacpp/model",
+            mcpServers={},
+            tool_runtime="local",
+        ),
+    )
+    monkeypatch.setattr(main_mod, "print_banner", fake_banner)
+    with pytest.raises(StopAfterBanner):
+        await main_mod.main(sandbox_tools=True)
+    assert prompted is True
+@pytest.mark.asyncio
+async def test_interactive_main_passes_sandbox_runtime_to_tool_router(monkeypatch):
+    """``main(sandbox_tools=True)`` builds the tool router with ``local_mode=False``.
+
+    The fake router records its constructor arguments and raises to stop the
+    run; the resolved HF token must be forwarded as well.
+    """
+    class StopAfterToolRouter(Exception):
+        pass
+    seen: dict[str, object] = {}
+    class FakeGateway:
+        def __init__(self, _config):
+            pass
+        async def start(self):
+            pass
+    class FakeToolRouter:
+        def __init__(self, mcp_servers, *, hf_token=None, local_mode=True):
+            seen["mcp_servers"] = mcp_servers
+            seen["hf_token"] = hf_token
+            seen["local_mode"] = local_mode
+            raise StopAfterToolRouter
+    from agent.core import hf_router_catalog
+    monkeypatch.setattr(main_mod.os, "system", lambda *_args, **_kwargs: 0)
+    monkeypatch.setattr(main_mod, "PromptSession", lambda: object())
+    monkeypatch.setattr(main_mod, "resolve_hf_token", lambda: "hf-token")
+    monkeypatch.setattr(main_mod, "_get_hf_user", lambda _token: "tester")
+    monkeypatch.setattr(main_mod, "print_banner", lambda **_kwargs: None)
+    monkeypatch.setattr(hf_router_catalog, "prewarm", lambda: None)
+    monkeypatch.setattr(
+        main_mod,
+        "load_config",
+        lambda _path, **_kwargs: SimpleNamespace(
+            model_name="llamacpp/model",
+            mcpServers={"server": object()},
+            messaging=SimpleNamespace(default_auto_destinations=lambda: []),
+            tool_runtime="local",
+        ),
+    )
+    monkeypatch.setattr(main_mod, "NotificationGateway", FakeGateway)
+    monkeypatch.setattr(main_mod, "ToolRouter", FakeToolRouter)
+    with pytest.raises(StopAfterToolRouter):
+        await main_mod.main(sandbox_tools=True)
+    assert seen["hf_token"] == "hf-token"
+    assert seen["local_mode"] is False
+@pytest.mark.asyncio
+async def test_initial_sandbox_preload_waits_before_prompt():
+    """``_wait_for_initial_sandbox_preload`` returns only after the preload task has run."""
+    waited = False
+    async def preload():
+        nonlocal waited
+        # Yield once so the task cannot finish before it is awaited.
+        await asyncio.sleep(0)
+        waited = True
+    task = asyncio.create_task(preload())
+    await main_mod._wait_for_initial_sandbox_preload(
+        [SimpleNamespace(sandbox_preload_task=task)]
+    )
+    assert waited is True

tests/unit/test_config.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import json
 from agent import config as config_module
@@ -121,3 +124,35 @@ def test_slack_user_defaults_can_be_disabled(tmp_path, monkeypatch):
     assert not config.messaging.enabled
     assert config.messaging.destinations == {}

 import json
+import pytest
+from pydantic import ValidationError
 from agent import config as config_module
     assert not config.messaging.enabled
     assert config.messaging.destinations == {}
+def test_tool_runtime_defaults_to_local(tmp_path):
+    """A config file that omits ``tool_runtime`` loads with the "local" runtime."""
+    base_file = tmp_path / "config.json"
+    _write_json(base_file, {"model_name": "moonshotai/Kimi-K2.6"})
+    loaded = config_module.load_config(str(base_file))
+    assert loaded.tool_runtime == "local"
+def test_user_config_can_set_sandbox_tool_runtime(tmp_path, monkeypatch):
+    """``tool_runtime`` from the file named by ML_INTERN_CLI_CONFIG is applied
+    when the config is loaded with ``include_user_defaults=True``.
+    """
+    base_file = tmp_path / "config.json"
+    user_file = tmp_path / "user-config.json"
+    _write_json(base_file, {"model_name": "moonshotai/Kimi-K2.6"})
+    _write_json(user_file, {"tool_runtime": "sandbox"})
+    monkeypatch.setenv("ML_INTERN_CLI_CONFIG", str(user_file))
+    loaded = config_module.load_config(
+        str(base_file),
+        include_user_defaults=True,
+    )
+    assert loaded.tool_runtime == "sandbox"
+def test_invalid_tool_runtime_is_rejected(tmp_path):
+    """An unknown ``tool_runtime`` value ("hybrid") fails validation at load time."""
+    base_file = tmp_path / "config.json"
+    payload = {"model_name": "moonshotai/Kimi-K2.6", "tool_runtime": "hybrid"}
+    _write_json(base_file, payload)
+    with pytest.raises(ValidationError):
+        config_module.load_config(str(base_file))

tests/unit/test_dataset_uploads.py ADDED Viewed

	@@ -0,0 +1,465 @@

+import io
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+import httpx
+import pytest
+from fastapi import HTTPException, UploadFile
+from huggingface_hub.errors import HfHubHTTPError
+from starlette.datastructures import FormData
+# Put the ``backend`` directory (three levels above this file, i.e. next to
+# ``tests/``) on sys.path so the bare ``dataset_uploads`` and ``routes`` imports
+# below resolve. The imports must stay after this block, hence the E402 waivers.
+_BACKEND_DIR = Path(__file__).resolve().parent.parent.parent / "backend"
+if str(_BACKEND_DIR) not in sys.path:
+    sys.path.insert(0, str(_BACKEND_DIR))
+import dataset_uploads  # noqa: E402
+from routes import agent  # noqa: E402
+def _upload(filename: str, content: bytes = b"a,b\n1,2\n") -> UploadFile:
+    """Build an in-memory ``UploadFile`` named *filename* holding *content*."""
+    buffer = io.BytesIO(content)
+    return UploadFile(filename=filename, file=buffer)
+def _track_close(upload: UploadFile):
+    """Wrap ``upload.close`` so a test can observe whether it was awaited.
+    Returns a dict whose ``"closed"`` entry becomes True once the wrapper runs;
+    the wrapper still delegates to the original ``close``.
+    """
+    tracker = {"closed": False}
+    real_close = upload.close
+    async def recording_close():
+        tracker["closed"] = True
+        await real_close()
+    upload.close = recording_close
+    return tracker
+def _request(
+    upload: UploadFile | None = None,
+    headers: dict[str, str] | None = None,
+):
+    """Return ``(fake_request, state)`` for driving the upload route.
+    ``state["form_called"]`` records whether ``form()`` was awaited. With no
+    *upload*, awaiting ``form()`` raises AssertionError instead of returning data.
+    """
+    tracker = {"form_called": False}
+    class FakeRequest:
+        def __init__(self):
+            self.headers = headers or {}
+            self.cookies = {}
+        async def form(self, **_kwargs):
+            tracker["form_called"] = True
+            if upload is not None:
+                return FormData([("file", upload)])
+            raise AssertionError("request.form() should not be called")
+    return FakeRequest(), tracker
+def test_sanitize_dataset_filename_strips_paths_and_unsafe_chars():
+    """Directory parts and unsafe characters are dropped, the extension is
+    lower-cased, and an empty name falls back to ``dataset.csv``.
+    """
+    sanitize = dataset_uploads.sanitize_dataset_filename
+    cleaned = sanitize("../../bad file (final).CSV")
+    assert cleaned == "bad-file-final.csv"
+    assert sanitize("") == "dataset.csv"
+def test_dataset_format_rejects_unsupported_extension():
+    """Unsupported and missing extensions both raise HTTPException (400 for ``.txt``)."""
+    detect_format = dataset_uploads.dataset_format_from_filename
+    with pytest.raises(HTTPException) as rejected:
+        detect_format("notes.txt")
+    assert rejected.value.status_code == 400
+    # A name with no extension at all is rejected as well.
+    with pytest.raises(HTTPException):
+        detect_format("notes")
+def test_dataset_repo_card_exposes_each_upload_as_config():
+    """The generated card lists one config per upload, even when a path repeats."""
+    repo_files = [
+        "README.md",
+        "uploads/oldabc/rows.jsonl",
+        "uploads/oldabc/rows.jsonl",
+        "uploads/newdef/table.csv",
+    ]
+    card_bytes = dataset_uploads.dataset_repo_card(
+        "alice/ml-intern-s1-datasets", repo_files
+    )
+    card = card_bytes.decode("utf-8")
+    assert "configs:" in card
+    expected_fragments = (
+        "- config_name: upload_oldabc",
+        '    path: "uploads/oldabc/rows.jsonl"',
+        "- config_name: upload_newdef",
+        '    path: "uploads/newdef/table.csv"',
+    )
+    for fragment in expected_fragments:
+        assert fragment in card
+    # The duplicated path must not produce a duplicated config entry.
+    assert card.count("- config_name: upload_oldabc") == 1
+@pytest.mark.asyncio
+async def test_validate_dataset_upload_rejects_size_over_limit(monkeypatch):
+    """A 4-byte payload against a patched 3-byte limit is rejected with 413."""
+    monkeypatch.setattr(dataset_uploads, "MAX_DATASET_UPLOAD_BYTES", 3)
+    oversized = _upload("rows.csv", b"abcd")
+    try:
+        with pytest.raises(HTTPException) as rejected:
+            await dataset_uploads.validate_dataset_upload(oversized)
+    finally:
+        # Release the in-memory file whether or not validation raised.
+        await oversized.close()
+    assert rejected.value.status_code == 413
+@pytest.mark.asyncio
+async def test_push_dataset_upload_creates_private_repo_and_uploads_file(monkeypatch):
+    """Pin the Hub calls made by ``push_dataset_upload_to_hub`` for one CSV upload.
+    ``HfApi`` is replaced by a recording fake and ``uuid4`` by a fixed value, so the
+    repo id, upload path, regenerated README and returned snippet can be compared
+    exactly.
+    """
+    instances = []
+    class FakeApi:
+        # Records the keyword arguments of every call so they can be asserted below.
+        def __init__(self, token):
+            self.token = token
+            self.create_calls = []
+            self.settings_calls = []
+            self.list_calls = []
+            self.upload_calls = []
+            instances.append(self)
+        def create_repo(self, **kwargs):
+            self.create_calls.append(kwargs)
+        def update_repo_settings(self, **kwargs):
+            self.settings_calls.append(kwargs)
+        def list_repo_files(self, **kwargs):
+            self.list_calls.append(kwargs)
+            # Canned listing: an older upload folder plus a file directly under uploads/.
+            return [
+                "README.md",
+                "uploads/oldupload/old.jsonl",
+                "uploads/notes.txt",
+            ]
+        def upload_file(self, **kwargs):
+            # Every upload except the README must carry the default _upload() payload bytes.
+            if kwargs["path_in_repo"] != "README.md":
+                assert kwargs["path_or_fileobj"] == b"a,b\n1,2\n"
+            self.upload_calls.append(kwargs)
+    monkeypatch.setattr(dataset_uploads, "HfApi", FakeApi)
+    # Fix the generated id; the upload path asserted below contains its first 12 characters.
+    monkeypatch.setattr(
+        dataset_uploads.uuid,
+        "uuid4",
+        lambda: SimpleNamespace(hex="feedfacecafebeef"),
+    )
+    upload = _upload("../Data Set.CSV")
+    try:
+        result = await dataset_uploads.push_dataset_upload_to_hub(
+            upload=upload,
+            session_id="12345678-90ab-cdef-1234-567890abcdef",
+            hf_username="alice",
+            hf_token="hf-token",
+        )
+    finally:
+        await upload.close()
+    api = instances[0]
+    assert api.token == "hf-token"
+    # The expected repo id embeds the leading "12345678" of the session id.
+    assert api.create_calls == [
+        {
+            "repo_id": "alice/ml-intern-12345678-datasets",
+            "repo_type": "dataset",
+            "private": True,
+            "exist_ok": True,
+        }
+    ]
+    assert api.settings_calls == [
+        {
+            "repo_id": "alice/ml-intern-12345678-datasets",
+            "repo_type": "dataset",
+            "private": True,
+        }
+    ]
+    assert api.list_calls == [
+        {
+            "repo_id": "alice/ml-intern-12345678-datasets",
+            "repo_type": "dataset",
+        }
+    ]
+    # The data file is uploaded first (with a sanitized name), then the README.
+    assert [call["path_in_repo"] for call in api.upload_calls] == [
+        "uploads/feedfacecafe/Data-Set.csv",
+        "README.md",
+    ]
+    # The README must list both the older upload and the new one as configs.
+    readme = api.upload_calls[1]["path_or_fileobj"].decode("utf-8")
+    assert "- config_name: upload_oldupload" in readme
+    assert '    path: "uploads/oldupload/old.jsonl"' in readme
+    assert "- config_name: upload_feedfacecafe" in readme
+    assert '    path: "uploads/feedfacecafe/Data-Set.csv"' in readme
+    assert result.repo_id == "alice/ml-intern-12345678-datasets"
+    assert result.config_name == "upload_feedfacecafe"
+    assert result.format == "csv"
+    assert result.load_dataset_snippet == (
+        "from datasets import load_dataset\n\n"
+        'dataset = load_dataset("alice/ml-intern-12345678-datasets", '
+        '"upload_feedfacecafe", split="train", token=True)'
+    )
+@pytest.mark.asyncio
+async def test_upload_route_requires_hf_token_without_parsing_upload(monkeypatch):
+    """With no HF token on the user or in the environment, the route answers 401
+    without parsing the form or closing the upload.
+    """
+    monkeypatch.delenv("HF_TOKEN", raising=False)
+    csv_upload = _upload("rows.csv")
+    close_tracker = _track_close(csv_upload)
+    fake_request, form_tracker = _request(csv_upload)
+    idle_session = SimpleNamespace(
+        is_active=True,
+        is_processing=False,
+        session=SimpleNamespace(pending_approval=None),
+        hf_username="alice",
+    )
+    async def fake_check_session_access(*_args, **_kwargs):
+        return idle_session
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    user_without_token = {"user_id": "u1", "username": "alice"}
+    try:
+        with pytest.raises(HTTPException) as rejected:
+            await agent.upload_session_dataset("s1", fake_request, user_without_token)
+        assert rejected.value.status_code == 401
+        assert form_tracker["form_called"] is False
+        assert close_tracker["closed"] is False
+    finally:
+        await csv_upload.close()
+@pytest.mark.asyncio
+async def test_upload_route_rejects_content_length_before_parsing(monkeypatch):
+    """A Content-Length one byte past the upload limit plus multipart slack yields
+    413 without checking session access, parsing the form, or closing the upload.
+    """
+    csv_upload = _upload("rows.csv")
+    close_tracker = _track_close(csv_upload)
+    too_large = (
+        dataset_uploads.MAX_DATASET_UPLOAD_BYTES
+        + agent.DATASET_UPLOAD_MULTIPART_SLACK_BYTES
+        + 1
+    )
+    fake_request, form_tracker = _request(
+        csv_upload,
+        headers={"content-length": str(too_large)},
+    )
+    async def fake_check_session_access(*_args, **_kwargs):
+        raise AssertionError("session access should not run for oversized uploads")
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    user_with_token = {
+        "user_id": "u1",
+        "username": "alice",
+        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
+    }
+    try:
+        with pytest.raises(HTTPException) as rejected:
+            await agent.upload_session_dataset("s1", fake_request, user_with_token)
+        assert rejected.value.status_code == 413
+        assert form_tracker["form_called"] is False
+        assert close_tracker["closed"] is False
+    finally:
+        await csv_upload.close()
+@pytest.mark.asyncio
+async def test_upload_route_rejects_busy_session_without_parsing_upload(monkeypatch):
+    """A session that is still processing makes the route answer 409 without
+    parsing the form or closing the upload.
+    """
+    upload = _upload("rows.csv")
+    close_state = _track_close(upload)
+    request, request_state = _request(upload)
+    async def fake_check_session_access(*_args, **_kwargs):
+        # ``is_processing=True`` is the condition under test.
+        return SimpleNamespace(
+            is_active=True,
+            is_processing=True,
+            session=SimpleNamespace(pending_approval=None),
+            hf_username="alice",
+        )
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    # try/finally, as in the other rejection tests, so the upload is closed even
+    # when one of the assertions fails.
+    try:
+        with pytest.raises(HTTPException) as exc_info:
+            await agent.upload_session_dataset(
+                "s1",
+                request,
+                {
+                    "user_id": "u1",
+                    "username": "alice",
+                    agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
+                },
+            )
+        assert exc_info.value.status_code == 409
+        assert request_state["form_called"] is False
+        assert close_state["closed"] is False
+    finally:
+        await upload.close()
+@pytest.mark.asyncio
+async def test_upload_route_appends_context_note_and_persists(monkeypatch):
+    """Happy path of the upload route with the Hub push and persistence faked.
+    Checks that the response mirrors the upload metadata, that exactly one
+    user-role message starting with "[SYSTEM:" is added to the context manager,
+    that the session snapshot is persisted, and that the upload is closed.
+    """
+    upload = _upload("rows.jsonl", b'{"text":"hi"}\n')
+    close_state = _track_close(upload)
+    request, request_state = _request(upload)
+    messages = []
+    persisted = []
+    # ``add_message`` appends straight into ``messages`` so the note can be inspected.
+    agent_session = SimpleNamespace(
+        is_active=True,
+        is_processing=False,
+        session=SimpleNamespace(
+            pending_approval=None,
+            context_manager=SimpleNamespace(add_message=messages.append),
+        ),
+        hf_username="alice",
+    )
+    # Canned result returned by the faked Hub push.
+    uploaded = dataset_uploads.DatasetUpload(
+        session_id="s1",
+        repo_id="alice/ml-intern-s1-datasets",
+        repo_type="dataset",
+        private=True,
+        upload_id="abc123",
+        config_name="upload_abc123",
+        filename="rows.jsonl",
+        original_filename="rows.jsonl",
+        path_in_repo="uploads/abc123/rows.jsonl",
+        size_bytes=14,
+        format="jsonl",
+        hub_url="https://huggingface.co/datasets/alice/ml-intern-s1-datasets/blob/main/uploads/abc123/rows.jsonl",
+        load_dataset_snippet='dataset = load_dataset("json")',
+    )
+    async def fake_check_session_access(*_args, **_kwargs):
+        return agent_session
+    async def fake_push_dataset_upload_to_hub(**kwargs):
+        # The route must forward the parsed upload object and the user's token.
+        assert kwargs["upload"] is upload
+        assert kwargs["hf_token"] == "hf-token"
+        return uploaded
+    async def fake_persist_session_snapshot(value):
+        persisted.append(value)
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    monkeypatch.setattr(
+        agent, "push_dataset_upload_to_hub", fake_push_dataset_upload_to_hub
+    )
+    monkeypatch.setattr(
+        agent.session_manager,
+        "persist_session_snapshot",
+        fake_persist_session_snapshot,
+    )
+    response = await agent.upload_session_dataset(
+        "s1",
+        request,
+        {
+            "user_id": "u1",
+            "username": "alice",
+            agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
+        },
+    )
+    assert response.repo_id == uploaded.repo_id
+    assert response.config_name == uploaded.config_name
+    assert response.path_in_repo == uploaded.path_in_repo
+    # Exactly one context note, mentioning the config name and the path in the repo.
+    assert len(messages) == 1
+    assert messages[0].role == "user"
+    assert messages[0].content.startswith("[SYSTEM:")
+    assert uploaded.config_name in messages[0].content
+    assert uploaded.path_in_repo in messages[0].content
+    assert persisted == [agent_session]
+    assert request_state["form_called"] is True
+    assert close_state["closed"] is True
+@pytest.mark.asyncio
+async def test_upload_route_closes_upload_when_hub_upload_fails(monkeypatch):
+    """A generic failure in the Hub push becomes a 502 with a fixed detail, and the
+    parsed upload is still closed.
+    """
+    csv_upload = _upload("rows.csv")
+    close_tracker = _track_close(csv_upload)
+    fake_request, form_tracker = _request(csv_upload)
+    idle_session = SimpleNamespace(
+        is_active=True,
+        is_processing=False,
+        session=SimpleNamespace(pending_approval=None),
+        hf_username="alice",
+    )
+    async def fake_check_session_access(*_args, **_kwargs):
+        return idle_session
+    async def fake_push_dataset_upload_to_hub(**_kwargs):
+        raise RuntimeError("hub unavailable")
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    monkeypatch.setattr(
+        agent,
+        "push_dataset_upload_to_hub",
+        fake_push_dataset_upload_to_hub,
+    )
+    user_with_token = {
+        "user_id": "u1",
+        "username": "alice",
+        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
+    }
+    with pytest.raises(HTTPException) as failure:
+        await agent.upload_session_dataset("s1", fake_request, user_with_token)
+    assert failure.value.status_code == 502
+    assert failure.value.detail == "Dataset upload failed. Please try again."
+    assert form_tracker["form_called"] is True
+    assert close_tracker["closed"] is True
+@pytest.mark.asyncio
+async def test_upload_route_maps_hub_permission_error_safely(monkeypatch):
+    """A 403 ``HfHubHTTPError`` from the Hub push surfaces as HTTP 403 with a fixed
+    detail string that does not echo the server message (which here contains
+    ``hf_secret``); the parsed upload is still closed.
+    """
+    upload = _upload("rows.csv")
+    close_state = _track_close(upload)
+    request, request_state = _request(upload)
+    async def fake_check_session_access(*_args, **_kwargs):
+        return SimpleNamespace(
+            is_active=True,
+            is_processing=False,
+            session=SimpleNamespace(pending_approval=None),
+            hf_username="alice",
+        )
+    async def fake_push_dataset_upload_to_hub(**_kwargs):
+        # Build a Hub error whose message embeds a token-like string.
+        response = httpx.Response(
+            403,
+            request=httpx.Request("POST", "https://huggingface.co/api/datasets"),
+            headers={"x-request-id": "req-123"},
+        )
+        raise HfHubHTTPError(
+            "403 Forbidden: token hf_secret cannot write",
+            response=response,
+            server_message="token hf_secret cannot write",
+        )
+    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
+    monkeypatch.setattr(
+        agent, "push_dataset_upload_to_hub", fake_push_dataset_upload_to_hub
+    )
+    with pytest.raises(HTTPException) as exc_info:
+        await agent.upload_session_dataset(
+            "s1",
+            request,
+            {
+                "user_id": "u1",
+                "username": "alice",
+                agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
+            },
+        )
+    assert exc_info.value.status_code == 403
+    assert exc_info.value.detail == (
+        "Hugging Face denied permission to create or write to the dataset repo."
+    )
+    # The upstream message must not leak into the client-facing detail.
+    assert "hf_secret" not in exc_info.value.detail
+    assert request_state["form_called"] is True
+    assert close_state["closed"] is True

tests/unit/test_hub_artifacts.py CHANGED Viewed

@@ -549,7 +549,7 @@ def test_sitecustomize_caches_lazy_collection_slug_across_bootstraps(
     ]
-def test_sitecustomize_skips_sandbox_space_registration(monkeypatch):
     import huggingface_hub as hub
     from huggingface_hub import HfApi
@@ -579,6 +579,10 @@ def test_sitecustomize_skips_sandbox_space_registration(monkeypatch):
     def fake_add_collection_item(self, **kwargs):
         collection_items.append(kwargs)
     monkeypatch.setattr(HfApi, "upload_file", fake_upload_file)
     monkeypatch.setattr(HfApi, "create_collection", fake_create_collection)
     monkeypatch.setattr(HfApi, "add_collection_item", fake_add_collection_item)

     ]
+def test_sitecustomize_skips_sandbox_space_registration(monkeypatch, tmp_path):
     import huggingface_hub as hub
     from huggingface_hub import HfApi
     def fake_add_collection_item(self, **kwargs):
         collection_items.append(kwargs)
+    monkeypatch.setenv(
+        "ML_INTERN_ARTIFACT_COLLECTION_CACHE",
+        str(tmp_path / "collection-slug.txt"),
+    )
     monkeypatch.setattr(HfApi, "upload_file", fake_upload_file)
     monkeypatch.setattr(HfApi, "create_collection", fake_create_collection)
     monkeypatch.setattr(HfApi, "add_collection_item", fake_add_collection_item)

tests/unit/test_no_tool_continuation_guard.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import asyncio
+import json
+import pytest
+from agent.config import Config
+from agent.core import agent_loop
+from agent.core.agent_loop import Handlers, LLMResult
+from agent.core.session import Session
+from agent.tools.plan_tool import PlanTool
+class FakeToolRouter:
+    """Tool-router stand-in that advertises only ``plan_tool`` and records calls."""
+    def __init__(self):
+        self.calls = []
+    def get_tool_specs_for_llm(self):
+        """Return the single function spec offered to the LLM."""
+        plan_spec = {
+            "name": "plan_tool",
+            "description": "Update plan",
+            "parameters": {"type": "object"},
+        }
+        return [{"type": "function", "function": plan_spec}]
+    async def call_tool(self, name, arguments, session=None, tool_call_id=None):
+        """Record the call; for ``plan_tool`` with a session, store a copy of the todos."""
+        self.calls.append((name, arguments, tool_call_id))
+        updates_plan = name == "plan_tool" and session is not None
+        if updates_plan:
+            session.current_plan = [dict(todo) for todo in arguments["todos"]]
+        return "plan updated", True
+@pytest.mark.asyncio
+async def test_plan_tool_stores_session_scoped_plan():
+    """Executing ``PlanTool`` stores the todos on its session and emits one
+    ``plan_update`` event carrying the same plan.
+    """
+    sent_events = []
+    class FakeSession:
+        current_plan = []
+        async def send_event(self, event):
+            sent_events.append(event)
+    fake_session = FakeSession()
+    todos = [{"id": "1", "content": "Smoke test", "status": "in_progress"}]
+    tool = PlanTool(session=fake_session)
+    outcome = await tool.execute({"todos": todos})
+    assert outcome["isError"] is False
+    assert fake_session.current_plan == todos
+    first_event = sent_events[0]
+    assert first_event.event_type == "plan_update"
+    assert first_event.data == {"plan": todos}
+@pytest.mark.asyncio
+async def test_no_tool_response_retries_when_plan_is_incomplete(monkeypatch):
+    """A text-only LLM reply while plan items are unfinished triggers another LLM call.
+    The LLM is scripted for three calls: a text-only answer, then a ``plan_tool``
+    call completing both todos (after checking that the last message contains
+    "CONTINUATION GUARD"), then the final "Done." answer.
+    """
+    config = Config.model_validate(
+        {"model_name": "openai/test", "save_sessions": False}
+    )
+    event_queue = asyncio.Queue()
+    router = FakeToolRouter()
+    session = Session(
+        event_queue,
+        config,
+        tool_router=router,
+        stream=False,
+    )
+    # Start with an unfinished plan: one item in progress, one pending.
+    session.current_plan = [
+        {
+            "id": "1",
+            "content": "Write and smoke-test training script",
+            "status": "in_progress",
+        },
+        {"id": "2", "content": "Launch full training job", "status": "pending"},
+    ]
+    calls = []
+    async def fake_call_llm_non_streaming(session, messages, tools, llm_params):
+        # ``len(calls)`` after the append selects which scripted reply to return.
+        calls.append(messages)
+        if len(calls) == 1:
+            # First reply: plain text, no tool calls.
+            return LLMResult(
+                content="I should keep going, but I forgot to call a tool.",
+                tool_calls_acc={},
+                token_count=10,
+                finish_reason="stop",
+            )
+        if len(calls) == 2:
+            # Second call: the guard message must be the last message in the prompt.
+            assert "CONTINUATION GUARD" in messages[-1].content
+            return LLMResult(
+                content=None,
+                tool_calls_acc={
+                    0: {
+                        "id": "call_1",
+                        "function": {
+                            "name": "plan_tool",
+                            "arguments": json.dumps(
+                                {
+                                    "todos": [
+                                        {
+                                            "id": "1",
+                                            "content": "Write and smoke-test training script",
+                                            "status": "completed",
+                                        },
+                                        {
+                                            "id": "2",
+                                            "content": "Launch full training job",
+                                            "status": "completed",
+                                        },
+                                    ]
+                                }
+                            ),
+                        },
+                    }
+                },
+                token_count=20,
+                finish_reason="tool_calls",
+            )
+        # Third (and any later) call: final text answer.
+        return LLMResult(
+            content="Done.",
+            tool_calls_acc={},
+            token_count=30,
+            finish_reason="stop",
+        )
+    monkeypatch.setattr(
+        agent_loop, "_resolve_llm_params", lambda *_, **__: {"model": "openai/test"}
+    )
+    monkeypatch.setattr(
+        agent_loop, "_call_llm_non_streaming", fake_call_llm_non_streaming
+    )
+    final = await Handlers.run_agent(session, "continue")
+    assert final == "Done."
+    assert len(calls) == 3
+    assert router.calls[0][0] == "plan_tool"
+    assert all(todo["status"] == "completed" for todo in session.current_plan)
+    # Drain the queue and look for the log entry that mentions the text-only reply.
+    events = []
+    while not event_queue.empty():
+        events.append(await event_queue.get())
+    assert any(
+        event.event_type == "tool_log"
+        and "text-only response" in (event.data or {}).get("log", "")
+        for event in events
+    )

tests/unit/test_sandbox_auto_start.py CHANGED Viewed

@@ -1,7 +1,15 @@
 from types import SimpleNamespace
 from pathlib import Path
 from agent.core.agent_loop import _needs_approval
 from agent.tools.sandbox_tool import get_sandbox_tools
@@ -34,3 +42,102 @@ def test_prompt_and_tool_specs_do_not_require_cpu_sandbox_create():
         in tool_specs["sandbox_create"]
     )
     assert "started automatically for normal CPU work" in tool_specs["bash"]

+import asyncio
 from types import SimpleNamespace
 from pathlib import Path
+import pytest
+from agent.config import Config
+from agent.core import agent_loop
 from agent.core.agent_loop import _needs_approval
+from agent.core.session import OpType
+from agent.core.tools import create_builtin_tools
+from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC
 from agent.tools.sandbox_tool import get_sandbox_tools
         in tool_specs["sandbox_create"]
     )
     assert "started automatically for normal CPU work" in tool_specs["bash"]
+def test_prompt_rejects_local_machine_paths_for_hf_jobs_scripts():
+    """The system prompt forbids local paths for ``hf_jobs.script`` and names the
+    accepted alternatives.
+    """
+    # Decode explicitly: without ``encoding`` the locale default is used, so a
+    # prompt containing non-ASCII text could fail to load on some machines.
+    prompt = Path("agent/prompts/system_prompt_v3.yaml").read_text(encoding="utf-8")
+    assert "Never pass a local machine path to hf_jobs.script" in prompt
+    assert "/fsx/..." in prompt
+    assert "inline Python source code" in prompt
+    assert "a file already written in the session sandbox" in prompt
+def test_prompt_and_hf_jobs_spec_require_gpu_preflight_for_gpu_jobs():
+    """Both the system prompt and the ``hf_jobs`` tool description carry the GPU
+    preflight wording checked below.
+    """
+    # Decode explicitly: without ``encoding`` the locale default is used, so a
+    # prompt containing non-ASCII text could fail to load on some machines.
+    prompt = Path("agent/prompts/system_prompt_v3.yaml").read_text(encoding="utf-8")
+    jobs_description = HF_JOBS_TOOL_SPEC["description"]
+    assert "GPU preflight is mandatory before hf_jobs" in prompt
+    assert "GPU sandbox smoke test" in prompt
+    assert "If you skip GPU sandbox preflight" in prompt
+    assert "you MUST create a GPU sandbox with sandbox_create first" in jobs_description
+    assert "If skipped, state why before calling hf_jobs" in jobs_description
+def test_local_tool_runtime_excludes_sandbox_create():
+    """Local mode keeps the file/shell tools but does not offer ``sandbox_create``."""
+    builtin_tools = create_builtin_tools(local_mode=True)
+    available = {tool.name for tool in builtin_tools}
+    assert {"bash", "read", "write", "edit"}.issubset(available)
+    assert "sandbox_create" not in available
+def test_sandbox_tool_runtime_includes_sandbox_create():
+    """Sandbox mode offers ``sandbox_create`` alongside the file/shell tools."""
+    builtin_tools = create_builtin_tools(local_mode=False)
+    available = {tool.name for tool in builtin_tools}
+    expected = {"sandbox_create", "bash", "read", "write", "edit"}
+    assert expected.issubset(available)
+@pytest.mark.asyncio
+async def test_cli_sandbox_runtime_preloads_and_tears_down_sandbox(monkeypatch):
+    """With ``local_mode=False`` the submission loop starts the CPU sandbox preload
+    for its session and tears that session's sandbox down after a SHUTDOWN operation.
+    Both sandbox hooks are replaced by recorders, so no real sandbox is created.
+    """
+    started = []
+    torn_down = []
+    class FakeToolRouter:
+        # Bare-minimum router: no tools, and a no-op async context manager.
+        tools = {}
+        def get_tool_specs_for_llm(self):
+            return []
+        async def __aenter__(self):
+            return self
+        async def __aexit__(self, exc_type, exc, tb):
+            return None
+    def fake_start_cpu_sandbox_preload(session):
+        started.append(session)
+        return None
+    async def fake_teardown_session_sandbox(session):
+        torn_down.append(session)
+    monkeypatch.setattr(
+        agent_loop, "start_cpu_sandbox_preload", fake_start_cpu_sandbox_preload
+    )
+    monkeypatch.setattr(
+        agent_loop, "teardown_session_sandbox", fake_teardown_session_sandbox
+    )
+    submission_queue = asyncio.Queue()
+    event_queue = asyncio.Queue()
+    # One-element list that the test reads the created session back from.
+    session_holder = [None]
+    config = Config.model_validate(
+        {"model_name": "openai/gpt-5.5", "save_sessions": False}
+    )
+    task = asyncio.create_task(
+        agent_loop.submission_loop(
+            submission_queue,
+            event_queue,
+            config=config,
+            tool_router=FakeToolRouter(),
+            session_holder=session_holder,
+            hf_token="hf-token",
+            user_id="tester",
+            local_mode=False,
+        )
+    )
+    # The first event read from the queue must be "ready"; by then the preload
+    # hook has been called exactly once with the new session.
+    ready = await asyncio.wait_for(event_queue.get(), timeout=1)
+    assert ready.event_type == "ready"
+    assert started == [session_holder[0]]
+    assert session_holder[0].local_mode is False
+    # Ask the loop to stop and wait for the task to finish.
+    await submission_queue.put(
+        SimpleNamespace(
+            operation=SimpleNamespace(op_type=OpType.SHUTDOWN, data=None),
+        )
+    )
+    await asyncio.wait_for(task, timeout=1)
+    assert torn_down == [session_holder[0]]

tests/unit/test_sandbox_private_spaces.py CHANGED Viewed

@@ -13,6 +13,28 @@ def _fail_metadata_update(*args, **kwargs):
     raise AssertionError("sandbox creation should not update Space metadata")
 def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     duplicate_kwargs = {}
     logs: list[str] = []
@@ -22,8 +44,25 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
         def __init__(self, token=None):
             self.token = token
-        def duplicate_space(self, **kwargs):
-            duplicate_kwargs.update(kwargs)
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
@@ -45,12 +84,38 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     Sandbox.create(owner="alice", token="hf-token", log=logs.append)
     assert duplicate_kwargs["private"] is True
-    assert duplicate_kwargs["hardware"] == "cpu-basic"
     assert requested_hardware == []
     assert not any("sleep time" in log for log in logs)
 def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
     runtime_calls = 0
@@ -67,7 +132,16 @@ def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
         def __init__(self, token=None):
             self.token = token
-        def duplicate_space(self, **kwargs):
             pass
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
@@ -107,8 +181,25 @@ def test_sandbox_client_configures_gpu_at_duplication(monkeypatch):
         def __init__(self, token=None):
             self.token = token
-        def duplicate_space(self, **kwargs):
-            duplicate_kwargs.update(kwargs)
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
@@ -137,8 +228,9 @@ def test_sandbox_client_configures_gpu_at_duplication(monkeypatch):
     )
     assert sandbox.space_id.startswith("alice/sandbox-")
-    assert duplicate_kwargs["hardware"] == "t4-small"
-    assert duplicate_kwargs["sleep_time"] == 2700
     assert requested_hardware == []
     assert "Using duplicated Space hardware: t4-small" in logs
     assert "Using duplicated Space sleep time: 2700s" in logs
@@ -153,8 +245,25 @@ def test_sandbox_client_logs_cpu_sleep_time_as_hub_fixed(monkeypatch):
         def __init__(self, token=None):
             self.token = token
-        def duplicate_space(self, **kwargs):
-            duplicate_kwargs.update(kwargs)
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
@@ -180,8 +289,9 @@ def test_sandbox_client_logs_cpu_sleep_time_as_hub_fixed(monkeypatch):
         log=logs.append,
     )
-    assert duplicate_kwargs["hardware"] == "cpu-basic"
-    assert duplicate_kwargs["sleep_time"] == 2700
     assert requested_hardware == []
     assert "Using duplicated Space hardware: cpu-basic" in logs
     assert (
@@ -310,6 +420,71 @@ def test_ensure_sandbox_overrides_private_argument(monkeypatch):
     assert persisted[-1]["sandbox_status"] == "active"
 def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
     active_creates = 0
     max_active_creates = 0
@@ -429,7 +604,7 @@ def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
                 space_id="alice/sandbox-cpu",
                 url="https://huggingface.co/spaces/alice/sandbox-cpu",
                 _owns_space=True,
-                delete=lambda: deleted.append("alice/sandbox-cpu"),
             )
             self.sandbox_hardware = "cpu-basic"
             self.sandbox_preload_task = None
@@ -474,10 +649,11 @@ def test_sandbox_create_replaces_auto_cpu_sandbox(monkeypatch):
 def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     deleted: list[str] = []
     persisted: list[dict] = []
-    async def fake_record_sandbox_destroy(*args, **kwargs):
-        pass
     monkeypatch.setattr(
         telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy
@@ -485,20 +661,28 @@ def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     async def run():
         cancel_event = threading.Event()
         async def preload():
             await asyncio.sleep(0)
         session = SimpleNamespace(
             session_id="s1",
             sandbox=SimpleNamespace(
                 space_id="alice/sandbox-12345678",
                 _owns_space=True,
-                delete=lambda: deleted.append("alice/sandbox-12345678"),
             ),
             sandbox_hardware="cpu-basic",
             sandbox_preload_task=asyncio.create_task(preload()),
             sandbox_preload_cancel_event=cancel_event,
             persistence_store=SimpleNamespace(
                 update_session_fields=lambda session_id, **fields: _record_metadata(
                     session_id, fields
@@ -507,17 +691,33 @@ def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
         )
         await sandbox_tool.teardown_session_sandbox(session)
-        return session, cancel_event
     async def _record_metadata(session_id, fields):
         persisted.append({"session_id": session_id, **fields})
-    session, cancel_event = asyncio.run(run())
     assert cancel_event.is_set()
     assert deleted == ["alice/sandbox-12345678"]
     assert session.sandbox is None
     assert session.sandbox_hardware is None
     assert persisted[-1]["session_id"] == "s1"
     assert persisted[-1]["sandbox_space_id"] is None
     assert persisted[-1]["sandbox_status"] == "destroyed"

     raise AssertionError("sandbox creation should not update Space metadata")
+def _capture_duplicate_repo_call(
+    captured,
+    *,
+    from_id,
+    to_id,
+    repo_type,
+    private,
+    space_hardware,
+    space_sleep_time=None,
+):
+    captured.update(
+        {
+            "from_id": from_id,
+            "to_id": to_id,
+            "repo_type": repo_type,
+            "private": private,
+            "space_hardware": space_hardware,
+            "space_sleep_time": space_sleep_time,
+        }
+    )
 def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
     duplicate_kwargs = {}
     logs: list[str] = []
         def __init__(self, token=None):
             self.token = token
+        def duplicate_repo(
+            self,
+            *,
+            from_id,
+            to_id,
+            repo_type,
+            private,
+            space_hardware,
+            space_sleep_time=None,
+        ):
+            _capture_duplicate_repo_call(
+                duplicate_kwargs,
+                from_id=from_id,
+                to_id=to_id,
+                repo_type=repo_type,
+                private=private,
+                space_hardware=space_hardware,
+                space_sleep_time=space_sleep_time,
+            )
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
     Sandbox.create(owner="alice", token="hf-token", log=logs.append)
+    assert duplicate_kwargs["repo_type"] == "space"
     assert duplicate_kwargs["private"] is True
+    assert duplicate_kwargs["space_hardware"] == "cpu-basic"
     assert requested_hardware == []
     assert not any("sleep time" in log for log in logs)
+def test_sandbox_delete_uses_log_callback_without_stdout(monkeypatch, capsys):
+    deleted: list[tuple[str, str]] = []
+    class FakeApi:
+        def __init__(self, token=None):
+            self.token = token
+        def delete_repo(self, repo_id, repo_type):
+            deleted.append((repo_id, repo_type))
+    monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
+    sandbox = Sandbox("alice/sandbox-12345678", token="hf-token", _owns_space=True)
+    logs: list[str] = []
+    sandbox.delete(log=logs.append)
+    captured = capsys.readouterr()
+    assert captured.out == ""
+    assert captured.err == ""
+    assert deleted == [("alice/sandbox-12345678", "space")]
+    assert logs == ["Deleting sandbox: alice/sandbox-12345678...", "Deleted."]
+    assert sandbox._owns_space is False
 def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
     runtime_calls = 0
         def __init__(self, token=None):
             self.token = token
+        def duplicate_repo(
+            self,
+            *,
+            from_id,
+            to_id,
+            repo_type,
+            private,
+            space_hardware,
+            space_sleep_time=None,
+        ):
             pass
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
         def __init__(self, token=None):
             self.token = token
+        def duplicate_repo(
+            self,
+            *,
+            from_id,
+            to_id,
+            repo_type,
+            private,
+            space_hardware,
+            space_sleep_time=None,
+        ):
+            _capture_duplicate_repo_call(
+                duplicate_kwargs,
+                from_id=from_id,
+                to_id=to_id,
+                repo_type=repo_type,
+                private=private,
+                space_hardware=space_hardware,
+                space_sleep_time=space_sleep_time,
+            )
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
     )
     assert sandbox.space_id.startswith("alice/sandbox-")
+    assert duplicate_kwargs["repo_type"] == "space"
+    assert duplicate_kwargs["space_hardware"] == "t4-small"
+    assert duplicate_kwargs["space_sleep_time"] == 2700
     assert requested_hardware == []
     assert "Using duplicated Space hardware: t4-small" in logs
     assert "Using duplicated Space sleep time: 2700s" in logs
         def __init__(self, token=None):
             self.token = token
+        def duplicate_repo(
+            self,
+            *,
+            from_id,
+            to_id,
+            repo_type,
+            private,
+            space_hardware,
+            space_sleep_time=None,
+        ):
+            _capture_duplicate_repo_call(
+                duplicate_kwargs,
+                from_id=from_id,
+                to_id=to_id,
+                repo_type=repo_type,
+                private=private,
+                space_hardware=space_hardware,
+                space_sleep_time=space_sleep_time,
+            )
         def request_space_hardware(self, space_id, hardware, sleep_time=None):
             requested_hardware.append((space_id, hardware, sleep_time))
         log=logs.append,
     )
+    assert duplicate_kwargs["repo_type"] == "space"
+    assert duplicate_kwargs["space_hardware"] == "cpu-basic"
+    assert duplicate_kwargs["space_sleep_time"] == 2700
     assert requested_hardware == []
     assert "Using duplicated Space hardware: cpu-basic" in logs
     assert (
     assert persisted[-1]["sandbox_status"] == "active"
+def test_cancelled_sandbox_creation_logs_delete_through_tool_log(monkeypatch):
+    deleted: list[str] = []
+    class FakeSession:
+        def __init__(self):
+            self.hf_token = "hf-token"
+            self.sandbox = None
+            self.event_queue = asyncio.Queue()
+            self._cancelled = asyncio.Event()
+        async def send_event(self, event):
+            await self.event_queue.put(event)
+    def fake_create(**kwargs):
+        def delete(log=None):
+            deleted.append("alice/sandbox-12345678")
+            if log:
+                log("Deleting sandbox: alice/sandbox-12345678...")
+                log("Deleted.")
+        return SimpleNamespace(
+            space_id="alice/sandbox-12345678",
+            url="https://huggingface.co/spaces/alice/sandbox-12345678",
+            _owns_space=True,
+            delete=delete,
+        )
+    monkeypatch.setattr(Sandbox, "create", staticmethod(fake_create))
+    async def run():
+        session = FakeSession()
+        cancel_event = threading.Event()
+        cancel_event.set()
+        sb, error = await sandbox_tool._create_sandbox_locked(
+            session,
+            api=SimpleNamespace(),
+            owner="alice",
+            hardware="cpu-basic",
+            cancel_event=cancel_event,
+        )
+        await asyncio.sleep(0)
+        events = []
+        while not session.event_queue.empty():
+            events.append(await session.event_queue.get())
+        return sb, error, events
+    sb, error, events = asyncio.run(run())
+    assert sb is None
+    assert error == "Sandbox creation cancelled by user."
+    assert deleted == ["alice/sandbox-12345678"]
+    assert [
+        event.data
+        for event in events
+        if event.event_type == "tool_log"
+        and event.data
+        and event.data.get("log")
+        in {"Deleting sandbox: alice/sandbox-12345678...", "Deleted."}
+    ] == [
+        {"tool": "sandbox", "log": "Deleting sandbox: alice/sandbox-12345678..."},
+        {"tool": "sandbox", "log": "Deleted."},
+    ]
 def test_sandbox_creation_is_serialized_per_owner(monkeypatch):
     active_creates = 0
     max_active_creates = 0
                 space_id="alice/sandbox-cpu",
                 url="https://huggingface.co/spaces/alice/sandbox-cpu",
                 _owns_space=True,
+                delete=lambda log=None: deleted.append("alice/sandbox-cpu"),
             )
             self.sandbox_hardware = "cpu-basic"
             self.sandbox_preload_task = None
 def test_teardown_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     deleted: list[str] = []
+    destroyed: list[str] = []
     persisted: list[dict] = []
+    async def fake_record_sandbox_destroy(session, sandbox, *args, **kwargs):
+        destroyed.append(sandbox.space_id)
     monkeypatch.setattr(
         telemetry, "record_sandbox_destroy", fake_record_sandbox_destroy
     async def run():
         cancel_event = threading.Event()
+        event_queue = asyncio.Queue()
         async def preload():
             await asyncio.sleep(0)
+        def delete(log=None):
+            deleted.append("alice/sandbox-12345678")
+            if log:
+                log("Deleting sandbox: alice/sandbox-12345678...")
+                log("Deleted.")
         session = SimpleNamespace(
             session_id="s1",
             sandbox=SimpleNamespace(
                 space_id="alice/sandbox-12345678",
                 _owns_space=True,
+                delete=delete,
             ),
             sandbox_hardware="cpu-basic",
             sandbox_preload_task=asyncio.create_task(preload()),
             sandbox_preload_cancel_event=cancel_event,
+            event_queue=event_queue,
             persistence_store=SimpleNamespace(
                 update_session_fields=lambda session_id, **fields: _record_metadata(
                     session_id, fields
         )
         await sandbox_tool.teardown_session_sandbox(session)
+        await asyncio.sleep(0)
+        events = []
+        while not event_queue.empty():
+            events.append(await event_queue.get())
+        return session, cancel_event, events
     async def _record_metadata(session_id, fields):
         persisted.append({"session_id": session_id, **fields})
+    session, cancel_event, events = asyncio.run(run())
     assert cancel_event.is_set()
     assert deleted == ["alice/sandbox-12345678"]
+    assert destroyed == ["alice/sandbox-12345678"]
     assert session.sandbox is None
     assert session.sandbox_hardware is None
+    assert [
+        event.data
+        for event in events
+        if event.event_type == "tool_log"
+        and event.data
+        and event.data.get("log")
+        in {"Deleting sandbox: alice/sandbox-12345678...", "Deleted."}
+    ] == [
+        {"tool": "sandbox", "log": "Deleting sandbox: alice/sandbox-12345678..."},
+        {"tool": "sandbox", "log": "Deleted."},
+    ]
     assert persisted[-1]["session_id"] == "s1"
     assert persisted[-1]["sandbox_space_id"] is None
     assert persisted[-1]["sandbox_status"] == "destroyed"

tests/unit/test_sandbox_script_resolution.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from types import SimpleNamespace
+import pytest
+from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC
+from agent.tools.sandbox_tool import resolve_sandbox_script
+class FakeSandbox:
+    def __init__(self):
+        self.read_paths = []
+    def read(self, path, *, limit):
+        self.read_paths.append((path, limit))
+        return SimpleNamespace(
+            success=True,
+            output="1\tprint('training')\n2\tprint('done')",
+            error="",
+        )
+@pytest.mark.asyncio
+async def test_resolve_sandbox_script_accepts_bare_python_filename():
+    sandbox = FakeSandbox()
+    content, error = await resolve_sandbox_script(sandbox, "train_smollm2.py")
+    assert error is None
+    assert content == "print('training')\nprint('done')"
+    assert sandbox.read_paths == [("train_smollm2.py", 100_000)]
+@pytest.mark.asyncio
+async def test_resolve_sandbox_script_accepts_relative_python_path():
+    sandbox = FakeSandbox()
+    content, error = await resolve_sandbox_script(sandbox, "scripts/train.py")
+    assert error is None
+    assert content == "print('training')\nprint('done')"
+    assert sandbox.read_paths == [("scripts/train.py", 100_000)]
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "script",
+    [
+        "https://example.com/train.py",
+        "http://example.com/train.py",
+        "train_smollm2.py --epochs 1",
+        "print('hello')",
+    ],
+)
+async def test_resolve_sandbox_script_ignores_non_path_scripts(script):
+    sandbox = FakeSandbox()
+    content, error = await resolve_sandbox_script(sandbox, script)
+    assert content is None
+    assert error is None
+    assert sandbox.read_paths == []
+def test_hf_jobs_script_description_mentions_bare_python_filenames():
+    script_description = HF_JOBS_TOOL_SPEC["parameters"]["properties"]["script"][
+        "description"
+    ]
+    assert "bare 'train.py'" in script_description
+    assert "smoke-test in a GPU sandbox before submission" in script_description

tests/unit/test_session_manager_persistence.py CHANGED Viewed

@@ -207,7 +207,7 @@ async def test_close_cancels_preload_and_deletes_owned_sandbox(monkeypatch):
     session.sandbox = SimpleNamespace(
         space_id="owner/sandbox-12345678",
         _owns_space=True,
-        delete=lambda: deleted.append("owner/sandbox-12345678"),
     )
     session.sandbox_hardware = "cpu-basic"
     session.sandbox_preload_cancel_event = preload_cancel_event

     session.sandbox = SimpleNamespace(
         space_id="owner/sandbox-12345678",
         _owns_space=True,
+        delete=lambda log=None: deleted.append("owner/sandbox-12345678"),
     )
     session.sandbox_hardware = "cpu-basic"
     session.sandbox_preload_cancel_event = preload_cancel_event

tests/unit/test_trackio_space_ids.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import json
+from pathlib import Path
+from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC
+from agent.tools.sandbox_tool import SANDBOX_CREATE_TOOL_SPEC
+def test_trackio_space_examples_use_hyphenated_ml_intern_prefix():
+    prompt = Path("agent/prompts/system_prompt_v3.yaml").read_text()
+    tool_specs = json.dumps([HF_JOBS_TOOL_SPEC, SANDBOX_CREATE_TOOL_SPEC])
+    legacy_prefix = "ml" + "intern"
+    assert "<username>/ml-intern-<8-char-id>" in prompt
+    assert "<username>/ml-intern-<8char>" in tool_specs
+    assert legacy_prefix not in prompt
+    assert legacy_prefix not in tool_specs

uv.lock CHANGED Viewed

@@ -1788,6 +1788,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "pymongo" },
     { name = "python-dotenv" },
     { name = "requests" },
     { name = "rich" },
     { name = "thefuzz" },
@@ -1840,6 +1841,7 @@ requires-dist = [
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.2.0" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "requests", specifier = ">=2.33.0" },
     { name = "rich", specifier = ">=13.0.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.12" },

     { name = "pydantic" },
     { name = "pymongo" },
     { name = "python-dotenv" },
+    { name = "python-multipart" },
     { name = "requests" },
     { name = "rich" },
     { name = "thefuzz" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.2.0" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
+    { name = "python-multipart", specifier = ">=0.0.20" },
     { name = "requests", specifier = ">=2.33.0" },
     { name = "rich", specifier = ">=13.0.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.15.12" },