akseljoonas HF Staff commited on
Commit
a33baef
·
1 Parent(s): bdbcdab

feat: restore sandbox tools and proactive compaction

Browse files
agent/context_manager/manager.py CHANGED
@@ -90,11 +90,11 @@ class ContextManager:
90
  compact_size: float = 0.1,
91
  untouched_messages: int = 5,
92
  tool_specs: list[dict[str, Any]] | None = None,
93
- prompt_file_suffix: str = "system_prompt_v2.yaml",
94
  ):
95
  self.system_prompt = self._load_system_prompt(
96
  tool_specs or [],
97
- prompt_file_suffix="system_prompt_v2.yaml",
98
  )
99
  self.max_context = max_context
100
  self.compact_size = int(max_context * compact_size)
@@ -144,7 +144,9 @@ class ContextManager:
144
  """Get all messages for sending to LLM"""
145
  return self.items
146
 
147
- async def compact(self, model_name: str) -> None:
 
 
148
  """Remove old messages to keep history under target size"""
149
  if (self.context_length <= self.max_context) or not self.items:
150
  return
@@ -179,6 +181,7 @@ class ContextManager:
179
  model=model_name,
180
  messages=messages_to_summarize,
181
  max_completion_tokens=self.compact_size,
 
182
  api_key=hf_key
183
  if hf_key and model_name.startswith("huggingface/")
184
  else None,
 
90
  compact_size: float = 0.1,
91
  untouched_messages: int = 5,
92
  tool_specs: list[dict[str, Any]] | None = None,
93
+ prompt_file_suffix: str = "system_prompt_v3.yaml",
94
  ):
95
  self.system_prompt = self._load_system_prompt(
96
  tool_specs or [],
97
+ prompt_file_suffix="system_prompt_v3.yaml",
98
  )
99
  self.max_context = max_context
100
  self.compact_size = int(max_context * compact_size)
 
144
  """Get all messages for sending to LLM"""
145
  return self.items
146
 
147
+ async def compact(
148
+ self, model_name: str, tool_specs: list[dict] | None = None
149
+ ) -> None:
150
  """Remove old messages to keep history under target size"""
151
  if (self.context_length <= self.max_context) or not self.items:
152
  return
 
181
  model=model_name,
182
  messages=messages_to_summarize,
183
  max_completion_tokens=self.compact_size,
184
+ tools=tool_specs,
185
  api_key=hf_key
186
  if hf_key and model_name.startswith("huggingface/")
187
  else None,
agent/core/agent_loop.py CHANGED
@@ -8,6 +8,7 @@ import logging
8
  import os
9
 
10
  from litellm import ChatCompletionMessageToolCall, Message, acompletion
 
11
  from lmnr import observe
12
 
13
  from agent.config import Config
@@ -88,6 +89,9 @@ def _needs_approval(
88
  if not args_valid:
89
  return False
90
 
 
 
 
91
  if tool_name == "hf_jobs":
92
  operation = tool_args.get("operation", "")
93
  if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
@@ -142,6 +146,24 @@ def _needs_approval(
142
  return False
143
 
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  class Handlers:
146
  """Handler functions for each operation type"""
147
 
@@ -184,7 +206,7 @@ class Handlers:
184
  @staticmethod
185
  @observe(name="run_agent")
186
  async def run_agent(
187
- session: Session, text: str, max_iterations: int = 10
188
  ) -> str | None:
189
  """
190
  Handle user input (like user_input_or_turn in codex.rs:1291)
@@ -216,6 +238,9 @@ class Handlers:
216
  final_response = None
217
 
218
  while iteration < max_iterations:
 
 
 
219
  messages = session.context_manager.get_messages()
220
  tools = session.tool_router.get_tool_specs_for_llm()
221
  try:
@@ -449,6 +474,14 @@ class Handlers:
449
 
450
  iteration += 1
451
 
 
 
 
 
 
 
 
 
452
  except Exception as e:
453
  import traceback
454
 
@@ -460,18 +493,6 @@ class Handlers:
460
  )
461
  break
462
 
463
- old_length = session.context_manager.context_length
464
- await session.context_manager.compact(model_name=session.config.model_name)
465
- new_length = session.context_manager.context_length
466
-
467
- if new_length != old_length:
468
- await session.send_event(
469
- Event(
470
- event_type="compacted",
471
- data={"old_tokens": old_length, "new_tokens": new_length},
472
- )
473
- )
474
-
475
  await session.send_event(
476
  Event(
477
  event_type="turn_complete",
@@ -491,20 +512,6 @@ class Handlers:
491
  session.interrupt()
492
  await session.send_event(Event(event_type="interrupted"))
493
 
494
- @staticmethod
495
- async def compact(session: Session) -> None:
496
- """Handle compact (like compact in codex.rs:1317)"""
497
- old_length = session.context_manager.context_length
498
- await session.context_manager.compact(model_name=session.config.model_name)
499
- new_length = session.context_manager.context_length
500
-
501
- await session.send_event(
502
- Event(
503
- event_type="compacted",
504
- data={"removed": old_length, "remaining": new_length},
505
- )
506
- )
507
-
508
  @staticmethod
509
  async def undo(session: Session) -> None:
510
  """Remove the last complete turn (user msg + all assistant/tool msgs that follow).
@@ -769,7 +776,7 @@ async def process_submission(session: Session, submission) -> bool:
769
  return True
770
 
771
  if op.op_type == OpType.COMPACT:
772
- await Handlers.compact(session)
773
  return True
774
 
775
  if op.op_type == OpType.UNDO:
 
8
  import os
9
 
10
  from litellm import ChatCompletionMessageToolCall, Message, acompletion
11
+ from litellm.exceptions import ContextWindowExceededError
12
  from lmnr import observe
13
 
14
  from agent.config import Config
 
89
  if not args_valid:
90
  return False
91
 
92
+ if tool_name == "sandbox_create":
93
+ return True
94
+
95
  if tool_name == "hf_jobs":
96
  operation = tool_args.get("operation", "")
97
  if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
 
146
  return False
147
 
148
 
149
+ async def _compact_and_notify(session: Session) -> None:
150
+ """Run compaction and send event if context was reduced."""
151
+ old_length = session.context_manager.context_length
152
+ tool_specs = session.tool_router.get_tool_specs_for_llm()
153
+ await session.context_manager.compact(
154
+ model_name=session.config.model_name,
155
+ tool_specs=tool_specs,
156
+ )
157
+ new_length = session.context_manager.context_length
158
+ if new_length != old_length:
159
+ await session.send_event(
160
+ Event(
161
+ event_type="compacted",
162
+ data={"old_tokens": old_length, "new_tokens": new_length},
163
+ )
164
+ )
165
+
166
+
167
  class Handlers:
168
  """Handler functions for each operation type"""
169
 
 
206
  @staticmethod
207
  @observe(name="run_agent")
208
  async def run_agent(
209
+ session: Session, text: str, max_iterations: int = 300
210
  ) -> str | None:
211
  """
212
  Handle user input (like user_input_or_turn in codex.rs:1291)
 
238
  final_response = None
239
 
240
  while iteration < max_iterations:
241
+ # Compact before calling the LLM if context is near the limit
242
+ await _compact_and_notify(session)
243
+
244
  messages = session.context_manager.get_messages()
245
  tools = session.tool_router.get_tool_specs_for_llm()
246
  try:
 
474
 
475
  iteration += 1
476
 
477
+ except ContextWindowExceededError:
478
+ # Force compact and retry this iteration
479
+ session.context_manager.context_length = (
480
+ session.context_manager.max_context + 1
481
+ )
482
+ await _compact_and_notify(session)
483
+ continue
484
+
485
  except Exception as e:
486
  import traceback
487
 
 
493
  )
494
  break
495
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  await session.send_event(
497
  Event(
498
  event_type="turn_complete",
 
512
  session.interrupt()
513
  await session.send_event(Event(event_type="interrupted"))
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  @staticmethod
516
  async def undo(session: Session) -> None:
517
  """Remove the last complete turn (user msg + all assistant/tool msgs that follow).
 
776
  return True
777
 
778
  if op.op_type == OpType.COMPACT:
779
+ await _compact_and_notify(session)
780
  return True
781
 
782
  if op.op_type == OpType.UNDO:
agent/core/session.py CHANGED
@@ -99,6 +99,7 @@ class Session:
99
  self.pending_approval: Optional[dict[str, Any]] = None
100
  # User's HF OAuth token — set by session_manager after construction
101
  self.hf_token: Optional[str] = None
 
102
 
103
  # Session trajectory logging
104
  self.logged_events: list[dict] = []
 
99
  self.pending_approval: Optional[dict[str, Any]] = None
100
  # User's HF OAuth token — set by session_manager after construction
101
  self.hf_token: Optional[str] = None
102
+ self.sandbox = None
103
 
104
  # Session trajectory logging
105
  self.logged_events: list[dict] = []
agent/core/tools.py CHANGED
@@ -48,6 +48,7 @@ from agent.tools.hf_repo_git_tool import (
48
  )
49
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
50
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
 
51
 
52
  # NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
53
  # from agent.tools.private_hf_repo_tools import (
@@ -334,6 +335,9 @@ def create_builtin_tools() -> list[ToolSpec]:
334
  ),
335
  ]
336
 
 
 
 
337
  tool_names = ", ".join([t.name for t in tools])
338
  logger.info(f"Loaded {len(tools)} built-in tools: {tool_names}")
339
 
 
48
  )
49
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
50
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
51
+ from agent.tools.sandbox_tool import get_sandbox_tools
52
 
53
  # NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
54
  # from agent.tools.private_hf_repo_tools import (
 
335
  ),
336
  ]
337
 
338
+ # Sandbox tools (highest priority)
339
+ tools = get_sandbox_tools() + tools
340
+
341
  tool_names = ", ".join([t.name for t in tools])
342
  logger.info(f"Loaded {len(tools)} built-in tools: {tool_names}")
343
 
agent/prompts/system_prompt_v3.yaml ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ system_prompt: |
2
+ You are Hugging Face Agent, an ML engineering assistant with {{ num_tools }} tools for training, fine-tuning, data processing, inference, and evaluation on the Hugging Face ecosystem.
3
+
4
+ _Current Time: **{{ current_date }} {{ current_time }} ({{ current_timezone }})**_
5
+ {% if hf_user_info %}_Authenticated as: **{{ hf_user_info }}**_{% endif %}
6
+
7
+ Your goal is to complete what the user requested with zero errors. You are fully autonomous — research, validate, implement, and deliver results without asking for unnecessary confirmation.
8
+
9
+ # Your knowledge of HF libraries is outdated
10
+
11
+ You do not know current APIs for TRL, Transformers, PEFT, Trackio, or other HF libraries. Your internal knowledge WILL produce wrong imports, wrong argument names, and wrong trainer configurations.
12
+
13
+ Before writing any ML implementation code (training, fine-tuning, inference, data processing), ground yourself in current working code:
14
+
15
+ github_find_examples → github_read_file → explore_hf_docs + fetch_hf_docs
16
+
17
+ Skip research only for trivial non-code operations.
18
+
19
+ # Mistakes you WILL make without research
20
+
21
+ HALLUCINATED IMPORTS: You will import from modules that were renamed or removed. Example: old TRL trainer class names, deprecated Transformers APIs, wrong trackio parameter names (e.g. `run_name` instead of `name`). Fix: read a current example script first.
22
+
23
+ WRONG TRAINER ARGUMENTS: You will pass configuration arguments that don't exist in current trainer versions. Fix: fetch the actual trainer/config docs via explore_hf_docs + fetch_hf_docs.
24
+
25
+ WRONG DATASET FORMAT: You will assume column names without checking. Training fails with KeyError. Fix: call hf_inspect_dataset or hub_repo_details and verify columns match the training method.
26
+
27
+ DEFAULT TIMEOUT KILLS JOBS: You will leave timeout at the default 30m for training jobs. Training takes hours. The job gets killed and all progress is lost. Fix: set timeout based on model size (minimum 2h for any training).
28
+
29
+ LOST MODELS: You will forget push_to_hub=True and hub_model_id in training config. Job storage is ephemeral — the filesystem is deleted when the job ends. Without push_to_hub, the trained model is permanently lost.
30
+
31
+ BATCH FAILURES: You will submit all ablation/batch jobs at once without testing that one works first. All will fail for the same bug. Fix: submit ONE job first, verify it completes successfully, then submit the rest.
32
+
33
+ SILENT DATASET SUBSTITUTION: When a requested dataset fails to load, you will silently switch to a different one without telling the user. Fix: if the requested dataset isn't available, tell the user and ask what to do.
34
+
35
+ HARDCODED UNAVAILABLE PACKAGES: You will forget to install necessary packages like 'flash-attn' for flash_attention_2 or other packages that aren't automatically installed in the job environment. Fix: install necessary packages before running the job.
36
+
37
+ SCOPE-CHANGING FIXES: Avoid at all costs! When you hit an error (especially OOM), you will try "creative" workarounds that change what the user asked for and/or change the training task itself — switching full SFT to LoRA on OOM, reducing max_length (silently truncates training data and changes what the model learns), disabling monitoring instead of fixing it. Do not do this. Fix errors with the minimal change that preserves the user's original request and is grounded in research and examples. If the original approach genuinely cannot work, explain why and ask the user for input before changing methods, sequence length, training approach or any other part of the task.
38
+
39
+ # When writing ML code
40
+
41
+ Required sequence before any training/fine-tuning/inference script:
42
+ 1. Find working examples: github_find_examples (discover) → github_read_file (study)
43
+ 2. Check documentation: explore_hf_docs + fetch_hf_docs for trainer configs and parameters
44
+ 3. Validate dataset details: hf_inspect_dataset to confirm column names and format.
45
+ 4. Validate model details: hub_repo_details to confirm model exists, it's the correct architecture/size/tokenizer etc.
46
+
47
+ Dataset format requirements by training method:
48
+ SFT: "messages", "text", or "prompt"/"completion"
49
+ DPO: "prompt", "chosen", "rejected"
50
+ GRPO: "prompt"
51
+
52
+ # When submitting a training job
53
+
54
+ Before calling hf_jobs, output a pre-flight check:
55
+ - Reference implementation: [which example you based this on]
56
+ - Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
57
+ - push_to_hub=True and hub_model_id set
58
+ - timeout: [value] (based on: [model size] on [hardware])
59
+ - Trackio monitoring included and working
60
+
61
+ If you cannot fill in all items, stop and complete the missing steps first.
62
+
63
+ For batch/ablation jobs: submit ONE job first. Check logs to confirm it starts training successfully. Only then submit the remaining jobs. Never submit all at once.
64
+
65
+ Hardware sizing:
66
+ 1-3B params: a10g-largex2
67
+ 7-13B params: a100-large
68
+ 30B+ params: l40sx4 or a100x4
69
+ 70B+ params: a100x8
70
+ Note: a10g-small and a10g-large have the SAME 24GB GPU memory. The difference is CPU/RAM only.
71
+
72
+ # Sandbox-first development
73
+
74
+ For non-trivial scripts, develop and test in a sandbox before launching via hf_jobs:
75
+ sandbox_create → install deps → write script → test with small run → fix errors → launch via hf_jobs at scale
76
+
77
+ Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
78
+
79
+
80
+ # When a task has 3+ steps
81
+
82
+ Use plan_tool to track progress. One task in_progress at a time. Mark completed immediately after finishing. Update frequently to show the user what you're doing.
83
+
84
+ # Error recovery
85
+
86
+ When something fails:
87
+ - Diagnose the actual error. Read the full error message and logs.
88
+ - Do not retry the exact same thing. Identify what needs to change.
89
+ - If an API/import error: check documentation for the correct API.
90
+ - If an OOM error: (1) reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally to keep effective batch size identical, (2) enable gradient_checkpointing=True, (3) upgrade to larger GPU (a10gx4→a100→a100x4→a100x8). Do NOT switch training methods (e.g. SFT→LoRA) or reduce max_length — those change what the user gets. If OOM happens in sandbox, create a new sandbox with larger GPU hardware.
91
+ - Never change the user's requested approach (training method, dataset, model, sequence length) without explicit approval.
92
+ - If a tool call fails repeatedly for the same reason: stop and try a different approach.
93
+ - Never silently substitute resources (datasets, models) — tell the user if something isn't available.
94
+
95
+ # Task completion
96
+
97
+ Before ending your turn, verify:
98
+ - Did you actually DO what the user asked, not just explain what you would do?
99
+ - If something failed: did you diagnose and fix it, or at minimum explain what went wrong and ask for user input?
100
+ - For training jobs: did you include a working Trackio dashboard URL?
101
+
102
+ Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
103
+ Do not mark plan tasks as completed if they failed or are only partially done.
104
+
105
+ # Communication
106
+
107
+ - Be concise and direct. No filler, no restating what the user said.
108
+ - One-word answers when appropriate for simple questions.
109
+ - Always include direct Hub URLs when referencing models, datasets, Spaces, or jobs.
110
+ - For errors: state what went wrong, why, and what you're doing to fix it.
111
+ - Do not over-explain or present elaborate option menus for simple tasks. When the user's intent is clear, act on it. Present options only when there's genuine ambiguity.
112
+
113
+ # Tool usage
114
+
115
+ - Execute multiple independent tool calls in parallel when possible.
116
+ - HF_TOKEN is automatically available in job secrets — no need to include it explicitly.
117
+ - For training monitoring: include Trackio in the script and provide the dashboard URL.
118
+ - For private/gated datasets: HF_TOKEN is needed — it's auto-loaded into job secrets.
agent/tools/jobs_tool.py CHANGED
@@ -979,7 +979,11 @@ HF_JOBS_TOOL_SPEC = {
979
  # Python/UV specific parameters
980
  "script": {
981
  "type": "string",
982
- "description": "Python code to execute. Triggers Python mode (auto pip install). Use with 'run'/'scheduled run'. Mutually exclusive with 'command'.",
 
 
 
 
983
  },
984
  "dependencies": {
985
  "type": "array",
@@ -1041,6 +1045,28 @@ async def hf_jobs_handler(
1041
  Event(event_type="tool_log", data={"tool": "hf_jobs", "log": log})
1042
  )
1043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1044
  # Prefer the authenticated user's OAuth token, fall back to global env
1045
  hf_token = (
1046
  (getattr(session, "hf_token", None) if session else None)
 
979
  # Python/UV specific parameters
980
  "script": {
981
  "type": "string",
982
+ "description": (
983
+ "Python code or sandbox file path (e.g. '/app/train.py') or URL. "
984
+ "Triggers Python mode. For ML training: base this on a working example found via github_find_examples, not on internal knowledge. "
985
+ "Mutually exclusive with 'command'."
986
+ ),
987
  },
988
  "dependencies": {
989
  "type": "array",
 
1045
  Event(event_type="tool_log", data={"tool": "hf_jobs", "log": log})
1046
  )
1047
 
1048
+ # If script is a sandbox file path, read it from the sandbox
1049
+ script = arguments.get("script", "")
1050
+ sandbox = getattr(session, "sandbox", None) if session else None
1051
+ is_path = (
1052
+ sandbox
1053
+ and isinstance(script, str)
1054
+ and script.strip() == script
1055
+ and not any(c in script for c in "\r\n\0")
1056
+ and (
1057
+ script.startswith("/")
1058
+ or script.startswith("./")
1059
+ or script.startswith("../")
1060
+ )
1061
+ )
1062
+ if is_path:
1063
+ import shlex
1064
+
1065
+ result = await asyncio.to_thread(sandbox.bash, f"cat {shlex.quote(script)}")
1066
+ if not result.success:
1067
+ return f"Failed to read {script} from sandbox: {result.error}", False
1068
+ arguments = {**arguments, "script": result.output}
1069
+
1070
  # Prefer the authenticated user's OAuth token, fall back to global env
1071
  hf_token = (
1072
  (getattr(session, "hf_token", None) if session else None)
agent/tools/sandbox_client.py ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = ["huggingface_hub>=0.20.0", "httpx>=0.27.0"]
5
+ # ///
6
+ """
7
+ Sandbox Tools — Agent-native primitives for HF Space dev-mode sandboxes.
8
+
9
+ Architecture:
10
+ - Creates a sandbox by duplicating a template Space (runs sandbox_server.py)
11
+ - Waits for it to come online
12
+ - Communicates via HTTPS to the Space's API
13
+ - Optionally deletes the Space when done
14
+
15
+ Lifecycle:
16
+ sb = Sandbox.create(owner="burtenshaw") # duplicate, wait, connect
17
+ sb = Sandbox.create(owner="burtenshaw", # with options
18
+ hardware="t4-small",
19
+ private=True,
20
+ sleep_time=3600)
21
+ sb = Sandbox.connect("burtenshaw/my-sandbox-abc") # attach to existing
22
+
23
+ sb.bash("uv run train.py")
24
+ sb.read("/app/train.py")
25
+ sb.edit("/app/train.py", old_str="lr=1e-3", new_str="lr=1e-4")
26
+
27
+ sb.delete() # tear down when done
28
+
29
+ # Or use as a context manager for automatic cleanup
30
+ with Sandbox.create(owner="burtenshaw") as sb:
31
+ sb.bash("python train.py")
32
+ # Space deleted on exit
33
+
34
+ Tools: bash, read, write, edit, upload
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ import io
40
+ import os
41
+ import sys
42
+ import time
43
+ import uuid
44
+ from dataclasses import dataclass, field
45
+ from typing import Any
46
+
47
+ import httpx
48
+ from huggingface_hub import CommitOperationAdd, HfApi
49
+
50
+ TEMPLATE_SPACE = "burtenshaw/sandbox"
51
+ HARDWARE_OPTIONS = [
52
+ "cpu-basic",
53
+ "cpu-upgrade",
54
+ "t4-small",
55
+ "t4-medium",
56
+ "a10g-small",
57
+ "a10g-large",
58
+ "a100-large",
59
+ ]
60
+ OUTPUT_LIMIT = 30000
61
+ LINE_LIMIT = 2000
62
+ DEFAULT_READ_LIMIT = 2000
63
+ DEFAULT_TIMEOUT = 120
64
+ MAX_TIMEOUT = 600
65
+ WAIT_TIMEOUT = 300
66
+ WAIT_INTERVAL = 5
67
+ API_WAIT_TIMEOUT = 180
68
+
69
+ _DOCKERFILE = """\
70
+ FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
71
+
72
+ RUN apt-get update && \\
73
+ apt-get install -y \\
74
+ bash git git-lfs wget curl procps \\
75
+ htop vim nano jq tmux \\
76
+ build-essential && \\
77
+ rm -rf /var/lib/apt/lists/*
78
+
79
+ RUN uv pip install --system fastapi uvicorn python-multipart
80
+
81
+ RUN useradd -m -u 1000 user
82
+ USER user
83
+
84
+ ENV HOME=/home/user \\
85
+ PATH=/home/user/.local/bin:$PATH \\
86
+ PIP_USER=1 \\
87
+ HF_HUB_DISABLE_PROGRESS_BARS=1 \\
88
+ TQDM_DISABLE=1 \\
89
+ TRANSFORMERS_VERBOSITY=warning \\
90
+ HF_HUB_ENABLE_HF_TRANSFER=1
91
+
92
+ WORKDIR /app
93
+ COPY --chown=user . /app
94
+
95
+ EXPOSE 7860
96
+
97
+ CMD ["python", "sandbox_server.py"]
98
+ """
99
+
100
+ _SANDBOX_SERVER = '''\
101
+ """Minimal FastAPI server for sandbox operations."""
102
+ import os, subprocess, pathlib
103
+ from fastapi import FastAPI
104
+ from pydantic import BaseModel
105
+ from typing import Optional
106
+ import uvicorn
107
+
108
+ app = FastAPI()
109
+
110
+ class BashReq(BaseModel):
111
+ command: str
112
+ work_dir: str = "/app"
113
+ timeout: int = 120
114
+
115
+ class ReadReq(BaseModel):
116
+ path: str
117
+ offset: Optional[int] = None
118
+ limit: Optional[int] = 2000
119
+
120
+ class WriteReq(BaseModel):
121
+ path: str
122
+ content: str
123
+
124
+ class EditReq(BaseModel):
125
+ path: str
126
+ old_str: str
127
+ new_str: str
128
+ replace_all: bool = False
129
+
130
+ class ExistsReq(BaseModel):
131
+ path: str
132
+
133
+ @app.get("/api/health")
134
+ def health():
135
+ return {"status": "ok"}
136
+
137
+ @app.post("/api/bash")
138
+ def bash(req: BashReq):
139
+ try:
140
+ r = subprocess.run(
141
+ req.command, shell=True, capture_output=True, text=True,
142
+ cwd=req.work_dir, timeout=req.timeout,
143
+ )
144
+ output = r.stdout + r.stderr
145
+ if len(output) > 30000:
146
+ output = output[:30000] + "\\n... (truncated)"
147
+ return {"success": r.returncode == 0, "output": output, "error": "" if r.returncode == 0 else f"Exit code {r.returncode}"}
148
+ except subprocess.TimeoutExpired:
149
+ return {"success": False, "output": "", "error": f"Timeout after {req.timeout}s"}
150
+ except Exception as e:
151
+ return {"success": False, "output": "", "error": str(e)}
152
+
153
+ @app.post("/api/read")
154
+ def read(req: ReadReq):
155
+ try:
156
+ p = pathlib.Path(req.path)
157
+ if not p.exists():
158
+ return {"success": False, "output": "", "error": f"File not found: {req.path}"}
159
+ if p.is_dir():
160
+ return {"success": False, "output": "", "error": f"Is a directory: {req.path}"}
161
+ lines = p.read_text().splitlines()
162
+ start = (req.offset or 1) - 1
163
+ end = start + (req.limit or len(lines))
164
+ selected = lines[start:end]
165
+ numbered = "\\n".join(f"{start + i + 1}\\t{line}" for i, line in enumerate(selected))
166
+ return {"success": True, "output": numbered, "error": ""}
167
+ except Exception as e:
168
+ return {"success": False, "output": "", "error": str(e)}
169
+
170
+ @app.post("/api/write")
171
+ def write(req: WriteReq):
172
+ try:
173
+ p = pathlib.Path(req.path)
174
+ p.parent.mkdir(parents=True, exist_ok=True)
175
+ p.write_text(req.content)
176
+ return {"success": True, "output": f"Wrote {len(req.content)} bytes to {req.path}", "error": ""}
177
+ except Exception as e:
178
+ return {"success": False, "output": "", "error": str(e)}
179
+
180
+ @app.post("/api/edit")
181
+ def edit(req: EditReq):
182
+ try:
183
+ p = pathlib.Path(req.path)
184
+ if not p.exists():
185
+ return {"success": False, "output": "", "error": f"File not found: {req.path}"}
186
+ content = p.read_text()
187
+ if req.old_str not in content:
188
+ return {"success": False, "output": "", "error": f"old_str not found in {req.path}"}
189
+ if not req.replace_all and content.count(req.old_str) > 1:
190
+ return {"success": False, "output": "", "error": f"old_str appears {content.count(req.old_str)} times. Use replace_all=true or provide more context."}
191
+ if req.replace_all:
192
+ new_content = content.replace(req.old_str, req.new_str)
193
+ else:
194
+ new_content = content.replace(req.old_str, req.new_str, 1)
195
+ p.write_text(new_content)
196
+ return {"success": True, "output": f"Edited {req.path}", "error": ""}
197
+ except Exception as e:
198
+ return {"success": False, "output": "", "error": str(e)}
199
+
200
+ @app.post("/api/exists")
201
+ def exists(req: ExistsReq):
202
+ return {"success": True, "output": str(pathlib.Path(req.path).exists()).lower(), "error": ""}
203
+
204
+ if __name__ == "__main__":
205
+ uvicorn.run(app, host="0.0.0.0", port=7860)
206
+ '''
207
+
208
+
209
+ @dataclass
210
+ class ToolResult:
211
+ success: bool
212
+ output: str = ""
213
+ error: str = ""
214
+
215
+ def __str__(self):
216
+ if self.success:
217
+ return self.output or "(no output)"
218
+ return f"ERROR: {self.error}"
219
+
220
+ def to_dict(self) -> dict:
221
+ return {"success": self.success, "output": self.output, "error": self.error}
222
+
223
+
224
+ @dataclass
225
+ class Sandbox:
226
+ """
227
+ A handle to an HF Space sandbox.
228
+
229
+ Use Sandbox.create() to spin up a new one, or Sandbox.connect() to
230
+ attach to an existing running Space.
231
+ """
232
+
233
+ space_id: str
234
+ token: str | None = None
235
+ work_dir: str = "/app"
236
+ timeout: int = DEFAULT_TIMEOUT
237
+ _owns_space: bool = field(default=False, repr=False)
238
+ _base_url: str = field(init=False, repr=False)
239
+ _client: httpx.Client = field(init=False, repr=False)
240
+ _hf_api: HfApi = field(init=False, repr=False)
241
+ _files_read: set = field(init=False, repr=False, default_factory=set)
242
+
243
+ def __post_init__(self):
244
+ self.token = self.token or os.environ.get("HF_TOKEN")
245
+ slug = self.space_id.replace("/", "-")
246
+ # Trailing slash is critical: httpx resolves relative paths against base_url.
247
+ # Without it, client.get("health") resolves to /health instead of /api/health.
248
+ self._base_url = f"https://{slug}.hf.space/api/"
249
+ self._client = httpx.Client(
250
+ base_url=self._base_url,
251
+ headers={"Authorization": f"Bearer {self.token}"} if self.token else {},
252
+ timeout=httpx.Timeout(MAX_TIMEOUT, connect=30),
253
+ follow_redirects=True,
254
+ )
255
+ self._hf_api = HfApi(token=self.token)
256
+
257
+ # ── Lifecycle ─────────────────────────────────────────────────
258
+
259
    @classmethod
    def create(
        cls,
        owner: str,
        *,
        name: str | None = None,
        template: str = TEMPLATE_SPACE,
        hardware: str = "cpu-basic",
        private: bool = False,
        sleep_time: int | None = None,
        token: str | None = None,
        wait_timeout: int = WAIT_TIMEOUT,
    ) -> Sandbox:
        """
        Create a new sandbox by duplicating the template Space.

        Generates a unique space name, duplicates the template, uploads the
        sandbox server files (triggering a rebuild), waits for the Space to
        come online, then returns a connected Sandbox.

        Args:
            owner: HF username or org (e.g. "burtenshaw").
            name: Base name for the space. Defaults to "sandbox".
                A unique suffix is always appended.
            template: Source Space to duplicate (default: burtenshaw/sandbox).
            hardware: Hardware tier (cpu-basic, t4-small, etc.).
            private: Whether the Space should be private.
            sleep_time: Auto-sleep after N seconds of inactivity.
            token: HF API token. Falls back to HF_TOKEN env var.
            wait_timeout: Max seconds to wait for Space to start (default: 300).

        Returns:
            A Sandbox instance connected to the running Space.

        Raises:
            RuntimeError: If the Space build or runtime fails.
            TimeoutError: If the Space does not reach RUNNING within wait_timeout.
        """
        token = token or os.environ.get("HF_TOKEN")
        api = HfApi(token=token)

        # Random hex suffix prevents collisions across repeated creations.
        base = name or "sandbox"
        suffix = uuid.uuid4().hex[:8]
        space_id = f"{owner}/{base}-{suffix}"

        print(f"Creating sandbox: {space_id} (from {template})...")

        kwargs = {
            "from_id": template,
            "to_id": space_id,
            "private": private,
            "hardware": hardware,
        }
        # Only forward sleep_time when explicitly requested; the Hub API
        # applies its own default otherwise.
        if sleep_time is not None:
            kwargs["sleep_time"] = sleep_time

        api.duplicate_space(**kwargs)
        print(f"Space created: https://huggingface.co/spaces/{space_id}")

        # Upload sandbox server and Dockerfile (triggers rebuild)
        cls._setup_server(space_id, api)

        # Wait for it to come online (rebuild + start)
        print(f"Waiting for Space to start (timeout: {wait_timeout}s)...")
        deadline = time.time() + wait_timeout
        while time.time() < deadline:
            runtime = api.get_space_runtime(space_id)
            if runtime.stage == "RUNNING":
                print(f"Space is running (hardware: {runtime.hardware})")
                break
            if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
                raise RuntimeError(
                    f"Space failed to start: {runtime.stage}. "
                    f"Check https://huggingface.co/spaces/{space_id}"
                )
            print(f" {runtime.stage}...")
            time.sleep(WAIT_INTERVAL)
        else:
            # while/else: the loop exhausted the deadline without breaking.
            raise TimeoutError(
                f"Space did not start within {wait_timeout}s. "
                f"Check https://huggingface.co/spaces/{space_id}"
            )

        # Wait for the API server to be responsive (non-fatal: the Space is
        # RUNNING, so we hand back the sandbox even if the health check lags).
        sb = cls(space_id=space_id, token=token, _owns_space=True)
        try:
            sb._wait_for_api(timeout=API_WAIT_TIMEOUT)
        except TimeoutError as e:
            print(
                f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
            )
        return sb
346
+
347
    @staticmethod
    def _setup_server(space_id: str, api: HfApi) -> None:
        """Upload embedded sandbox server + Dockerfile to the Space (single commit).

        Pushing both files in one create_commit call triggers exactly one
        Space rebuild instead of two.
        """
        print(f"Uploading sandbox server to {space_id}...")
        api.create_commit(
            repo_id=space_id,
            repo_type="space",
            operations=[
                CommitOperationAdd(
                    path_in_repo="sandbox_server.py",
                    path_or_fileobj=io.BytesIO(_SANDBOX_SERVER.encode()),
                ),
                CommitOperationAdd(
                    path_in_repo="Dockerfile",
                    path_or_fileobj=io.BytesIO(_DOCKERFILE.encode()),
                ),
            ],
            commit_message="Setup sandbox server",
        )
        print("Server files uploaded, rebuild triggered.")
367
+
368
+ @classmethod
369
+ def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
370
+ """
371
+ Connect to an existing running Space.
372
+
373
+ Does a health check to verify the Space is reachable.
374
+ """
375
+ sb = cls(space_id=space_id, token=token, _owns_space=False)
376
+ sb._wait_for_api(timeout=60)
377
+ return sb
378
+
379
    def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT):
        """Poll the health endpoint until the server responds.

        Retries every 3 seconds until a 200 response or the deadline.
        The last HTTP status and last exception are included in the
        TimeoutError message to aid debugging.

        Raises:
            TimeoutError: If no 200 response arrives within `timeout` seconds.
        """
        deadline = time.time() + timeout
        last_err = None
        last_status = None
        while time.time() < deadline:
            try:
                resp = self._client.get("health", timeout=10)
                last_status = resp.status_code
                if resp.status_code == 200:
                    print(f"API is responsive at {self._base_url}")
                    return
            except Exception as e:
                # Connection errors are expected while the Space boots;
                # remember the latest one for the failure message.
                last_err = e
            time.sleep(3)
        raise TimeoutError(
            f"Sandbox API at {self._base_url} not responding after {timeout}s. "
            f"Last status: {last_status}, last error: {last_err}"
        )
398
+
399
    def delete(self):
        """Delete the Space. Only works if this Sandbox created it.

        Raises:
            RuntimeError: If this instance merely connected to the Space
                (``_owns_space`` is False) rather than creating it.
        """
        if not self._owns_space:
            raise RuntimeError(
                f"This Sandbox did not create {self.space_id}. "
                f"Use self._hf_api.delete_repo() directly if you're sure."
            )
        print(f"Deleting sandbox: {self.space_id}...")
        self._hf_api.delete_repo(self.space_id, repo_type="space")
        # Close the HTTP client too — the remote endpoint no longer exists.
        self._client.close()
        print("Deleted.")
410
+
411
    def pause(self):
        """Pause the Space (stops billing, preserves state). Resume with restart()."""
        self._hf_api.pause_space(self.space_id)
414
+
415
    def restart(self):
        """Restart the Space and block until its API responds again."""
        self._hf_api.restart_space(self.space_id)
        self._wait_for_api()
419
+
420
    @property
    def url(self) -> str:
        """Public URL of the Space on the Hugging Face Hub."""
        return f"https://huggingface.co/spaces/{self.space_id}"
424
+
425
    @property
    def status(self) -> str:
        """Current Space stage (RUNNING, BUILDING, PAUSED, etc.).

        Fetched live from the Hub on every access — not cached.
        """
        return self._hf_api.get_space_runtime(self.space_id).stage
429
+
430
    def __enter__(self) -> Sandbox:
        """Context-manager entry: the sandbox itself is the managed resource."""
        return self
432
+
433
    def __exit__(self, *exc):
        """Context-manager exit: delete the Space if we created it, then close HTTP state."""
        if self._owns_space:
            try:
                self.delete()
            except Exception as e:
                # Best-effort cleanup: report to stderr, never mask the
                # exception propagating out of the with-block.
                print(f"Warning: failed to delete sandbox: {e}", file=sys.stderr)
        # NOTE(review): delete() already closes the client on success, so this
        # can be a second close — presumably a harmless no-op; confirm
        # httpx.Client.close() is idempotent.
        self._client.close()
440
+
441
+ # ── HTTP plumbing ─────────────────────────────────────────────
442
+
443
    def _call(
        self, endpoint: str, payload: dict, timeout: float | None = None
    ) -> ToolResult:
        """POST `payload` to a sandbox endpoint and normalize the reply.

        All transport failures are converted into a failed ToolResult
        rather than raised, so callers only ever handle ToolResult.

        Args:
            endpoint: Server route (leading slash stripped for httpx base_url).
            payload: JSON body for the request.
            timeout: Per-call timeout; falls back to self.timeout.
        """
        # Strip leading slash for correct httpx base_url resolution
        endpoint = endpoint.lstrip("/")
        try:
            resp = self._client.post(
                endpoint,
                json=payload,
                timeout=timeout or self.timeout,
            )
            data = resp.json()
            if resp.status_code == 200:
                # Server reports success/output/error in the JSON body;
                # default success=True when the field is absent.
                return ToolResult(
                    success=data.get("success", True),
                    output=data.get("output", ""),
                    error=data.get("error", ""),
                )
            return ToolResult(
                success=False,
                error=data.get("error", f"HTTP {resp.status_code}"),
            )
        except httpx.TimeoutException:
            return ToolResult(
                success=False, error=f"Timeout after {timeout or self.timeout}s"
            )
        except httpx.ConnectError:
            # Most likely cause: the Space is paused, sleeping, or rebuilding.
            return ToolResult(
                success=False,
                error=f"Cannot connect to sandbox. Is {self.space_id} running? Status: {self.status}",
            )
        except Exception as e:
            return ToolResult(success=False, error=str(e))
476
+
477
+ # ── Tools ─────────────────────────────────────────────────────
478
+
479
    def bash(
        self,
        command: str,
        *,
        work_dir: str | None = None,
        timeout: int | None = None,
        description: str | None = None,
    ) -> ToolResult:
        """Run a shell command in the sandbox via the /bash endpoint.

        Args:
            command: Shell command to execute.
            work_dir: Working directory; defaults to self.work_dir.
            timeout: Seconds before the server kills the command, capped
                at MAX_TIMEOUT. Also used as the HTTP request timeout.
            description: Human-readable label accepted for tool-schema
                parity; not forwarded to the server (unused here).
        """
        return self._call(
            "bash",
            {
                "command": command,
                "work_dir": work_dir or self.work_dir,
                "timeout": min(timeout or self.timeout, MAX_TIMEOUT),
            },
            timeout=timeout,
        )
496
+
497
    def read(
        self, path: str, *, offset: int | None = None, limit: int | None = None
    ) -> ToolResult:
        """Read a file from the sandbox, recording the path for write/edit gating.

        When no limit is given, a full-file read (offset is None) defaults to
        DEFAULT_READ_LIMIT lines; a ranged read leaves the limit to the server.
        """
        # Track reads so write()/edit() can enforce read-before-modify.
        self._files_read.add(path)
        return self._call(
            "read",
            {
                "path": path,
                "offset": offset,
                "limit": limit or (DEFAULT_READ_LIMIT if offset is None else None),
            },
        )
509
+
510
    def write(self, path: str, content: str) -> ToolResult:
        """Create or overwrite a file in the sandbox.

        Enforces read-before-overwrite: if the file exists remotely but has
        not been read this session, the write is refused so the agent cannot
        blindly clobber unseen content.
        """
        if path not in self._files_read:
            # Only consult the server when our local read-set can't vouch
            # for the path.
            check = self._call("exists", {"path": path})
            if check.success and check.output == "true":
                return ToolResult(
                    success=False,
                    error=(
                        f"File {path} exists but has not been read this session. "
                        f"Read it first, or use sandbox_edit for targeted changes."
                    ),
                )
        result = self._call("write", {"path": path, "content": content})
        if result.success:
            # A successful write makes the content "known", so later
            # writes/edits to the same path are allowed.
            self._files_read.add(path)
        return result
525
+
526
    def edit(
        self, path: str, old_str: str, new_str: str, *, replace_all: bool = False
    ) -> ToolResult:
        """Exact-string replacement edit of a sandbox file.

        Local preconditions (checked in this order, before any network call):
        the two strings must differ, and the file must have been read this
        session. Uniqueness of old_str is enforced server-side.
        """
        if old_str == new_str:
            return ToolResult(success=False, error="old_str and new_str are identical.")
        if path not in self._files_read:
            return ToolResult(
                success=False,
                error=f"File {path} has not been read this session. Read it first.",
            )
        return self._call(
            "edit",
            {
                "path": path,
                "old_str": old_str,
                "new_str": new_str,
                "replace_all": replace_all,
            },
        )
545
+
546
+ # ── Tool schemas & dispatch ───────────────────────────────────
547
+
548
    # JSON-schema tool specs, keyed by tool name. Keys double as both the
    # method names on this class (see call_tool) and the sandbox server
    # endpoints; tool_definitions() flattens this for the LLM.
    TOOLS = {
        "bash": {
            "description": (
                "Run a shell command in the remote sandbox and return stdout/stderr.\n"
                "\n"
                "Commands run in a shell at the working directory (default /app). "
                "Each invocation is independent — use files in /app to persist state.\n"
                "\n"
                "AVOID using bash for operations covered by specialized tools:\n"
                "- File reading: use read (not cat/head/tail)\n"
                "- File editing: use edit (not sed/awk)\n"
                "- File writing: use write (not echo/cat <<EOF)\n"
                "\n"
                "For long-running tasks, background them:\n"
                " nohup uv run train.py > /app/train.log 2>&1 &\n"
                "Then check with read on the log file.\n"
                "\n"
                "Chain dependent commands with &&. Independent commands should be "
                "separate bash calls (they can run in parallel).\n"
                "\n"
                "Timeout default 120s, max 600s."
            ),
            "parameters": {
                "type": "object",
                "required": ["command"],
                "additionalProperties": False,
                "properties": {
                    "command": {
                        "type": "string",
                        "description": "The shell command to execute.",
                    },
                    "description": {
                        "type": "string",
                        "description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
                    },
                    "work_dir": {
                        "type": "string",
                        "description": "Working directory (default: /app).",
                    },
                    "timeout": {
                        "type": "integer",
                        "description": "Timeout in seconds (default: 120, max: 600).",
                    },
                },
            },
        },
        "read": {
            "description": (
                "Read file contents with line numbers (cat -n format).\n"
                "\n"
                "Returns the first 2000 lines by default. For large files, use offset/limit "
                "to read a specific range. Line numbers always match the original file.\n"
                "\n"
                "Lines longer than 2000 chars are truncated.\n"
                "Cannot read directories — use bash with 'ls' instead."
            ),
            "parameters": {
                "type": "object",
                "required": ["path"],
                "additionalProperties": False,
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute path to the file to read.",
                    },
                    "offset": {
                        "type": "integer",
                        "description": "Start from this line (1-based). Only if file is too large.",
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Number of lines to read. Only if file is too large.",
                    },
                },
            },
        },
        "write": {
            "description": (
                "Create or overwrite a file. Creates parent directories as needed.\n"
                "\n"
                "For existing files, you MUST read the file first (system enforced). "
                "Prefer edit for modifications."
            ),
            "parameters": {
                "type": "object",
                "required": ["path", "content"],
                "additionalProperties": False,
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute path to the file to write.",
                    },
                    "content": {
                        "type": "string",
                        "description": "Complete file content.",
                    },
                },
            },
        },
        "edit": {
            "description": (
                "Targeted edit via exact string replacement.\n"
                "\n"
                "Rules:\n"
                "- old_str must appear EXACTLY once (unless replace_all is true).\n"
                "- Include enough context in old_str for uniqueness.\n"
                "- old_str and new_str must differ.\n"
                "- Preserve indentation exactly.\n"
                "- To delete code, set new_str to empty string.\n"
                "- File MUST have been read this session (system enforced).\n"
                "- Do NOT include line number prefixes in old_str/new_str.\n"
                "\n"
                "Use replace_all=true for batch operations like variable renaming."
            ),
            "parameters": {
                "type": "object",
                "required": ["path", "old_str", "new_str"],
                "additionalProperties": False,
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Absolute path to the file.",
                    },
                    "old_str": {
                        "type": "string",
                        "description": "Exact text to find (must differ from new_str).",
                    },
                    "new_str": {"type": "string", "description": "Replacement text."},
                    "replace_all": {
                        "type": "boolean",
                        "description": "Replace all occurrences (default: false).",
                        "default": False,
                    },
                },
            },
        },
    }
685
+
686
+ @classmethod
687
+ def tool_definitions(cls) -> list[dict]:
688
+ return [{"name": name, **spec} for name, spec in cls.TOOLS.items()]
689
+
690
+ def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
691
+ dispatch = {
692
+ "bash": lambda a: self.bash(
693
+ a["command"],
694
+ work_dir=a.get("work_dir"),
695
+ timeout=a.get("timeout"),
696
+ description=a.get("description"),
697
+ ),
698
+ "read": lambda a: self.read(
699
+ a["path"],
700
+ offset=a.get("offset"),
701
+ limit=a.get("limit"),
702
+ ),
703
+ "write": lambda a: self.write(a["path"], a["content"]),
704
+ "edit": lambda a: self.edit(
705
+ a["path"],
706
+ a["old_str"],
707
+ a["new_str"],
708
+ replace_all=a.get("replace_all", False),
709
+ ),
710
+ }
711
+ fn = dispatch.get(name)
712
+ if not fn:
713
+ return ToolResult(success=False, error=f"Unknown tool: {name}")
714
+ return fn(arguments)
agent/tools/sandbox_tool.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sandbox tools β€” expose the Sandbox client as agent tools.
3
+
4
+ 5 tools total:
5
+ sandbox_create β€” explicit sandbox creation (requires approval)
6
+ bash, read, write, edit β€” operations on the sandbox
7
+
8
+ If any operation tool is called without an active sandbox,
9
+ a cpu-basic sandbox is auto-created (no approval needed).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ import os
16
+ from typing import Any
17
+
18
+ from huggingface_hub import HfApi, SpaceHardware
19
+
20
+ from agent.core.session import Event
21
+ from agent.tools.sandbox_client import Sandbox
22
+
23
+ # ── Tool name mapping (short agent names → Sandbox client names) ──────
24
+
25
+
26
async def _ensure_sandbox(
    session: Any, hardware: str = "cpu-basic", **create_kwargs
) -> tuple[Sandbox | None, str | None]:
    """
    Ensure a sandbox exists on the session. Auto-creates with given hardware if needed.

    Args:
        session: Agent session object; read for a ``.sandbox`` attribute and
            used to emit ``tool_log`` events via ``send_event``.
        hardware: Space hardware tier, used only when auto-creating.
        **create_kwargs: Extra keyword args forwarded to Sandbox.create().

    Returns:
        (sandbox, error_message) — one will be None.
    """
    # Fast path: reuse the sandbox already attached to this session.
    if session and getattr(session, "sandbox", None):
        return session.sandbox, None

    if not session:
        return None, "No session available."

    token = os.environ.get("HF_TOKEN")
    if not token:
        return None, "HF_TOKEN environment variable not set. Cannot create sandbox."

    # Derive the owning namespace from the token's identity.
    api = HfApi(token=token)
    user_info = api.whoami()
    owner = user_info.get("name", user_info.get("user", ""))
    if not owner:
        return None, "Could not determine HF username from token."

    await session.send_event(
        Event(
            event_type="tool_log",
            data={
                "tool": "sandbox",
                "log": f"Auto-creating sandbox for {owner} ({hardware})...",
            },
        )
    )

    # Sandbox.create blocks on Hub polling, so run it off the event loop.
    kwargs = {"owner": owner, "hardware": hardware, "token": token, **create_kwargs}
    sb = await asyncio.to_thread(Sandbox.create, **kwargs)
    session.sandbox = sb

    await session.send_event(
        Event(
            event_type="tool_log",
            data={"tool": "sandbox", "log": f"Sandbox ready: {sb.space_id} ({sb.url})"},
        )
    )

    return sb, None
73
+
74
+
75
+ # ── sandbox_create tool ──────────────────────────────────────────────
76
+
77
# Tool schema for explicit sandbox creation. The hardware enum is generated
# from huggingface_hub.SpaceHardware so it tracks the Hub's current tiers.
SANDBOX_CREATE_TOOL_SPEC = {
    "name": "sandbox_create",
    "description": (
        "Create a persistent remote Linux environment for developing and testing scripts.\n\n"
        "Workflow: sandbox_create → write script → pip install → test with small run → fix errors → hf_jobs at scale.\n"
        "The sandbox persists across tool calls within the session. pip install works out of the box.\n\n"
        "Use this when: you need to develop, test, and iterate on scripts before launching via hf_jobs. "
        "Especially for training scripts where you need to verify imports, test on a small subset, and fix errors interactively.\n\n"
        "Skip this when: the task is a simple one-shot operation (status check, resource search, quick data query), "
        "or the script is copied from a verified working example with minimal changes.\n\n"
        "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
        "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
        "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
    ),
    "parameters": {
        "type": "object",
        "required": [],
        "additionalProperties": False,
        "properties": {
            "hardware": {
                "type": "string",
                "enum": [e.value for e in SpaceHardware],
                "description": "Hardware tier for the sandbox (default: cpu-basic)",
            },
            "private": {
                "type": "boolean",
                "description": "If true, create a private Space",
            },
        },
    },
}
108
+
109
+
110
async def sandbox_create_handler(
    args: dict[str, Any], session: Any = None
) -> tuple[str, bool]:
    """Handle sandbox_create tool calls.

    Idempotent: if the session already has a sandbox, report it instead of
    creating a second one.

    Returns:
        (message, success) — message is shown to the model.
    """
    # If sandbox already exists, return its info
    if session and getattr(session, "sandbox", None):
        sb = session.sandbox
        return (
            f"Sandbox already active: {sb.space_id}\n"
            f"URL: {sb.url}\n"
            f"Use bash/read/write/edit to interact with it."
        ), True

    hardware = args.get("hardware", "cpu-basic")
    # Forward "private" only when the caller supplied it, so Sandbox.create's
    # own default applies otherwise.
    create_kwargs = {}
    if "private" in args:
        create_kwargs["private"] = args["private"]

    try:
        sb, error = await _ensure_sandbox(session, hardware=hardware, **create_kwargs)
    except Exception as e:
        return f"Failed to create sandbox: {e}", False

    if error:
        return error, False

    return (
        f"Sandbox created: {sb.space_id}\n"
        f"URL: {sb.url}\n"
        f"Hardware: {hardware}\n"
        f"Use bash/read/write/edit to interact with it."
    ), True
142
+
143
+
144
+ def _make_tool_handler(sandbox_tool_name: str):
145
+ """Factory: create a handler for a sandbox operation tool."""
146
+
147
+ async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
148
+ # Auto-create sandbox if not present
149
+ try:
150
+ sb, error = await _ensure_sandbox(session)
151
+ except Exception as e:
152
+ return f"Failed to auto-create sandbox: {e}", False
153
+
154
+ if error:
155
+ return error, False
156
+
157
+ try:
158
+ result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
159
+ if result.success:
160
+ return result.output or "(no output)", True
161
+ else:
162
+ error_msg = result.error or "Unknown error"
163
+ output = result.output
164
+ if output:
165
+ return f"{output}\n\nERROR: {error_msg}", False
166
+ return f"ERROR: {error_msg}", False
167
+ except Exception as e:
168
+ return f"Sandbox operation failed: {e}", False
169
+
170
+ return handler
171
+
172
+
173
def get_sandbox_tools():
    """Return all 5 sandbox ToolSpecs (sandbox_create + 4 operation tools).

    The first entry is the explicit-creation tool (requires approval); the
    rest are the bash/read/write/edit operation tools, which auto-execute
    and auto-create a sandbox on first use.
    """
    # Imported here to avoid a circular import at module load time.
    from agent.core.tools import ToolSpec

    # sandbox_create (explicit creation, requires approval)
    tools = [
        ToolSpec(
            name=SANDBOX_CREATE_TOOL_SPEC["name"],
            description=SANDBOX_CREATE_TOOL_SPEC["description"],
            parameters=SANDBOX_CREATE_TOOL_SPEC["parameters"],
            handler=sandbox_create_handler,
        )
    ]

    # Operation tools (auto-execute, no approval needed).
    # Iterate items() rather than keys() + lookup (one pass, clearer).
    for name, spec in Sandbox.TOOLS.items():
        tools.append(
            ToolSpec(
                name=name,
                description=spec["description"],
                parameters=spec["parameters"],
                handler=_make_tool_handler(name),
            )
        )

    return tools