Spaces:
Running
Running
Sandbox implementation
Browse files- agent/context_manager/manager.py +6 -3
- agent/core/agent_loop.py +56 -35
- agent/core/session.py +1 -0
- agent/core/tools.py +4 -0
- agent/prompts/system_prompt_v2.yaml +59 -46
- agent/prompts/system_prompt_v3.yaml +118 -0
- agent/tools/dataset_tools.py +9 -16
- agent/tools/docs_tools.py +10 -21
- agent/tools/github_find_examples.py +10 -49
- agent/tools/github_read_file.py +6 -52
- agent/tools/jobs_tool.py +108 -92
- agent/tools/plan_tool.py +5 -12
- agent/tools/sandbox_client.py +714 -0
- agent/tools/sandbox_tool.py +201 -0
agent/context_manager/manager.py
CHANGED
|
@@ -23,11 +23,11 @@ class ContextManager:
|
|
| 23 |
compact_size: float = 0.1,
|
| 24 |
untouched_messages: int = 5,
|
| 25 |
tool_specs: list[dict[str, Any]] | None = None,
|
| 26 |
-
prompt_file_suffix: str = "
|
| 27 |
):
|
| 28 |
self.system_prompt = self._load_system_prompt(
|
| 29 |
tool_specs or [],
|
| 30 |
-
prompt_file_suffix="
|
| 31 |
)
|
| 32 |
self.max_context = max_context
|
| 33 |
self.compact_size = int(max_context * compact_size)
|
|
@@ -78,7 +78,9 @@ class ContextManager:
|
|
| 78 |
"""Get all messages for sending to LLM"""
|
| 79 |
return self.items
|
| 80 |
|
| 81 |
-
async def compact(
|
|
|
|
|
|
|
| 82 |
"""Remove old messages to keep history under target size"""
|
| 83 |
if (self.context_length <= self.max_context) or not self.items:
|
| 84 |
return
|
|
@@ -112,6 +114,7 @@ class ContextManager:
|
|
| 112 |
model=model_name,
|
| 113 |
messages=messages_to_summarize,
|
| 114 |
max_completion_tokens=self.compact_size,
|
|
|
|
| 115 |
)
|
| 116 |
summarized_message = Message(
|
| 117 |
role="assistant", content=response.choices[0].message.content
|
|
|
|
| 23 |
compact_size: float = 0.1,
|
| 24 |
untouched_messages: int = 5,
|
| 25 |
tool_specs: list[dict[str, Any]] | None = None,
|
| 26 |
+
prompt_file_suffix: str = "system_prompt_v3.yaml",
|
| 27 |
):
|
| 28 |
self.system_prompt = self._load_system_prompt(
|
| 29 |
tool_specs or [],
|
| 30 |
+
prompt_file_suffix="system_prompt_v3.yaml",
|
| 31 |
)
|
| 32 |
self.max_context = max_context
|
| 33 |
self.compact_size = int(max_context * compact_size)
|
|
|
|
| 78 |
"""Get all messages for sending to LLM"""
|
| 79 |
return self.items
|
| 80 |
|
| 81 |
+
async def compact(
|
| 82 |
+
self, model_name: str, tool_specs: list[dict] | None = None
|
| 83 |
+
) -> None:
|
| 84 |
"""Remove old messages to keep history under target size"""
|
| 85 |
if (self.context_length <= self.max_context) or not self.items:
|
| 86 |
return
|
|
|
|
| 114 |
model=model_name,
|
| 115 |
messages=messages_to_summarize,
|
| 116 |
max_completion_tokens=self.compact_size,
|
| 117 |
+
tools=tool_specs,
|
| 118 |
)
|
| 119 |
summarized_message = Message(
|
| 120 |
role="assistant", content=response.choices[0].message.content
|
agent/core/agent_loop.py
CHANGED
|
@@ -6,6 +6,7 @@ import asyncio
|
|
| 6 |
import json
|
| 7 |
|
| 8 |
from litellm import ChatCompletionMessageToolCall, Message, ModelResponse, acompletion
|
|
|
|
| 9 |
from lmnr import observe
|
| 10 |
|
| 11 |
from agent.config import Config
|
|
@@ -38,7 +39,9 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
|
|
| 38 |
return True, None
|
| 39 |
|
| 40 |
|
| 41 |
-
def _needs_approval(
|
|
|
|
|
|
|
| 42 |
"""Check if a tool call requires user approval before execution."""
|
| 43 |
# Yolo mode: skip all approvals
|
| 44 |
if config and config.yolo_mode:
|
|
@@ -49,23 +52,31 @@ def _needs_approval(tool_name: str, tool_args: dict, config: Config | None = Non
|
|
| 49 |
if not args_valid:
|
| 50 |
return False
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
if tool_name == "hf_jobs":
|
| 53 |
operation = tool_args.get("operation", "")
|
| 54 |
if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
|
| 55 |
return False
|
| 56 |
-
|
| 57 |
# Check if this is a CPU-only job
|
| 58 |
# hardware_flavor is at top level of tool_args, not nested in args
|
| 59 |
-
hardware_flavor =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
is_cpu_job = hardware_flavor in CPU_FLAVORS
|
| 61 |
-
|
| 62 |
if is_cpu_job:
|
| 63 |
if config and not config.confirm_cpu_jobs:
|
| 64 |
return False
|
| 65 |
return True
|
| 66 |
-
|
| 67 |
return True
|
| 68 |
-
|
| 69 |
# Check for file upload operations (hf_private_repos or other tools)
|
| 70 |
if tool_name == "hf_private_repos":
|
| 71 |
operation = tool_args.get("operation", "")
|
|
@@ -86,19 +97,43 @@ def _needs_approval(tool_name: str, tool_args: dict, config: Config | None = Non
|
|
| 86 |
# hf_repo_git: destructive operations require approval
|
| 87 |
if tool_name == "hf_repo_git":
|
| 88 |
operation = tool_args.get("operation", "")
|
| 89 |
-
if operation in [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return True
|
| 91 |
|
| 92 |
return False
|
| 93 |
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
class Handlers:
|
| 96 |
"""Handler functions for each operation type"""
|
| 97 |
|
| 98 |
@staticmethod
|
| 99 |
@observe(name="run_agent")
|
| 100 |
async def run_agent(
|
| 101 |
-
session: Session, text: str, max_iterations: int =
|
| 102 |
) -> str | None:
|
| 103 |
"""
|
| 104 |
Handle user input (like user_input_or_turn in codex.rs:1291)
|
|
@@ -125,6 +160,9 @@ class Handlers:
|
|
| 125 |
final_response = None
|
| 126 |
|
| 127 |
while iteration < max_iterations:
|
|
|
|
|
|
|
|
|
|
| 128 |
messages = session.context_manager.get_messages()
|
| 129 |
tools = session.tool_router.get_tool_specs_for_llm()
|
| 130 |
|
|
@@ -261,6 +299,14 @@ class Handlers:
|
|
| 261 |
|
| 262 |
iteration += 1
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
except Exception as e:
|
| 265 |
import traceback
|
| 266 |
|
|
@@ -272,18 +318,6 @@ class Handlers:
|
|
| 272 |
)
|
| 273 |
break
|
| 274 |
|
| 275 |
-
old_length = session.context_manager.context_length
|
| 276 |
-
await session.context_manager.compact(model_name=session.config.model_name)
|
| 277 |
-
new_length = session.context_manager.context_length
|
| 278 |
-
|
| 279 |
-
if new_length != old_length:
|
| 280 |
-
await session.send_event(
|
| 281 |
-
Event(
|
| 282 |
-
event_type="compacted",
|
| 283 |
-
data={"old_tokens": old_length, "new_tokens": new_length},
|
| 284 |
-
)
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
await session.send_event(
|
| 288 |
Event(
|
| 289 |
event_type="turn_complete",
|
|
@@ -303,20 +337,6 @@ class Handlers:
|
|
| 303 |
session.interrupt()
|
| 304 |
await session.send_event(Event(event_type="interrupted"))
|
| 305 |
|
| 306 |
-
@staticmethod
|
| 307 |
-
async def compact(session: Session) -> None:
|
| 308 |
-
"""Handle compact (like compact in codex.rs:1317)"""
|
| 309 |
-
old_length = session.context_manager.context_length
|
| 310 |
-
await session.context_manager.compact(model_name=session.config.model_name)
|
| 311 |
-
new_length = session.context_manager.context_length
|
| 312 |
-
|
| 313 |
-
await session.send_event(
|
| 314 |
-
Event(
|
| 315 |
-
event_type="compacted",
|
| 316 |
-
data={"removed": old_length, "remaining": new_length},
|
| 317 |
-
)
|
| 318 |
-
)
|
| 319 |
-
|
| 320 |
@staticmethod
|
| 321 |
async def undo(session: Session) -> None:
|
| 322 |
"""Handle undo (like undo in codex.rs:1314)"""
|
|
@@ -489,7 +509,8 @@ async def process_submission(session: Session, submission) -> bool:
|
|
| 489 |
return True
|
| 490 |
|
| 491 |
if op.op_type == OpType.COMPACT:
|
| 492 |
-
|
|
|
|
| 493 |
return True
|
| 494 |
|
| 495 |
if op.op_type == OpType.UNDO:
|
|
|
|
| 6 |
import json
|
| 7 |
|
| 8 |
from litellm import ChatCompletionMessageToolCall, Message, ModelResponse, acompletion
|
| 9 |
+
from litellm.exceptions import ContextWindowExceededError
|
| 10 |
from lmnr import observe
|
| 11 |
|
| 12 |
from agent.config import Config
|
|
|
|
| 39 |
return True, None
|
| 40 |
|
| 41 |
|
| 42 |
+
def _needs_approval(
|
| 43 |
+
tool_name: str, tool_args: dict, config: Config | None = None
|
| 44 |
+
) -> bool:
|
| 45 |
"""Check if a tool call requires user approval before execution."""
|
| 46 |
# Yolo mode: skip all approvals
|
| 47 |
if config and config.yolo_mode:
|
|
|
|
| 52 |
if not args_valid:
|
| 53 |
return False
|
| 54 |
|
| 55 |
+
if tool_name == "sandbox_create":
|
| 56 |
+
return True
|
| 57 |
+
|
| 58 |
if tool_name == "hf_jobs":
|
| 59 |
operation = tool_args.get("operation", "")
|
| 60 |
if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
|
| 61 |
return False
|
| 62 |
+
|
| 63 |
# Check if this is a CPU-only job
|
| 64 |
# hardware_flavor is at top level of tool_args, not nested in args
|
| 65 |
+
hardware_flavor = (
|
| 66 |
+
tool_args.get("hardware_flavor")
|
| 67 |
+
or tool_args.get("flavor")
|
| 68 |
+
or tool_args.get("hardware")
|
| 69 |
+
or "cpu-basic"
|
| 70 |
+
)
|
| 71 |
is_cpu_job = hardware_flavor in CPU_FLAVORS
|
| 72 |
+
|
| 73 |
if is_cpu_job:
|
| 74 |
if config and not config.confirm_cpu_jobs:
|
| 75 |
return False
|
| 76 |
return True
|
| 77 |
+
|
| 78 |
return True
|
| 79 |
+
|
| 80 |
# Check for file upload operations (hf_private_repos or other tools)
|
| 81 |
if tool_name == "hf_private_repos":
|
| 82 |
operation = tool_args.get("operation", "")
|
|
|
|
| 97 |
# hf_repo_git: destructive operations require approval
|
| 98 |
if tool_name == "hf_repo_git":
|
| 99 |
operation = tool_args.get("operation", "")
|
| 100 |
+
if operation in [
|
| 101 |
+
"delete_branch",
|
| 102 |
+
"delete_tag",
|
| 103 |
+
"merge_pr",
|
| 104 |
+
"create_repo",
|
| 105 |
+
"update_repo",
|
| 106 |
+
]:
|
| 107 |
return True
|
| 108 |
|
| 109 |
return False
|
| 110 |
|
| 111 |
|
| 112 |
+
async def _compact_and_notify(session: Session) -> None:
|
| 113 |
+
"""Run compaction and send event if context was reduced."""
|
| 114 |
+
old_length = session.context_manager.context_length
|
| 115 |
+
tool_specs = session.tool_router.get_tool_specs_for_llm()
|
| 116 |
+
await session.context_manager.compact(
|
| 117 |
+
model_name=session.config.model_name,
|
| 118 |
+
tool_specs=tool_specs,
|
| 119 |
+
)
|
| 120 |
+
new_length = session.context_manager.context_length
|
| 121 |
+
if new_length != old_length:
|
| 122 |
+
await session.send_event(
|
| 123 |
+
Event(
|
| 124 |
+
event_type="compacted",
|
| 125 |
+
data={"old_tokens": old_length, "new_tokens": new_length},
|
| 126 |
+
)
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
class Handlers:
|
| 131 |
"""Handler functions for each operation type"""
|
| 132 |
|
| 133 |
@staticmethod
|
| 134 |
@observe(name="run_agent")
|
| 135 |
async def run_agent(
|
| 136 |
+
session: Session, text: str, max_iterations: int = 300
|
| 137 |
) -> str | None:
|
| 138 |
"""
|
| 139 |
Handle user input (like user_input_or_turn in codex.rs:1291)
|
|
|
|
| 160 |
final_response = None
|
| 161 |
|
| 162 |
while iteration < max_iterations:
|
| 163 |
+
# Compact before calling the LLM if context is near the limit
|
| 164 |
+
await _compact_and_notify(session)
|
| 165 |
+
|
| 166 |
messages = session.context_manager.get_messages()
|
| 167 |
tools = session.tool_router.get_tool_specs_for_llm()
|
| 168 |
|
|
|
|
| 299 |
|
| 300 |
iteration += 1
|
| 301 |
|
| 302 |
+
except ContextWindowExceededError:
|
| 303 |
+
# Force compact and retry this iteration
|
| 304 |
+
session.context_manager.context_length = (
|
| 305 |
+
session.context_manager.max_context + 1
|
| 306 |
+
)
|
| 307 |
+
await _compact_and_notify(session)
|
| 308 |
+
continue
|
| 309 |
+
|
| 310 |
except Exception as e:
|
| 311 |
import traceback
|
| 312 |
|
|
|
|
| 318 |
)
|
| 319 |
break
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
await session.send_event(
|
| 322 |
Event(
|
| 323 |
event_type="turn_complete",
|
|
|
|
| 337 |
session.interrupt()
|
| 338 |
await session.send_event(Event(event_type="interrupted"))
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
@staticmethod
|
| 341 |
async def undo(session: Session) -> None:
|
| 342 |
"""Handle undo (like undo in codex.rs:1314)"""
|
|
|
|
| 509 |
return True
|
| 510 |
|
| 511 |
if op.op_type == OpType.COMPACT:
|
| 512 |
+
# compact from the frontend
|
| 513 |
+
await _compact_and_notify(session)
|
| 514 |
return True
|
| 515 |
|
| 516 |
if op.op_type == OpType.UNDO:
|
agent/core/session.py
CHANGED
|
@@ -59,6 +59,7 @@ class Session:
|
|
| 59 |
self.is_running = True
|
| 60 |
self.current_task: asyncio.Task | None = None
|
| 61 |
self.pending_approval: Optional[dict[str, Any]] = None
|
|
|
|
| 62 |
|
| 63 |
# Session trajectory logging
|
| 64 |
self.logged_events: list[dict] = []
|
|
|
|
| 59 |
self.is_running = True
|
| 60 |
self.current_task: asyncio.Task | None = None
|
| 61 |
self.pending_approval: Optional[dict[str, Any]] = None
|
| 62 |
+
self.sandbox = None
|
| 63 |
|
| 64 |
# Session trajectory logging
|
| 65 |
self.logged_events: list[dict] = []
|
agent/core/tools.py
CHANGED
|
@@ -45,6 +45,7 @@ from agent.tools.hf_repo_git_tool import (
|
|
| 45 |
)
|
| 46 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 47 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
|
|
|
| 48 |
|
| 49 |
# NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
|
| 50 |
# from agent.tools.private_hf_repo_tools import (
|
|
@@ -327,6 +328,9 @@ def create_builtin_tools() -> list[ToolSpec]:
|
|
| 327 |
),
|
| 328 |
]
|
| 329 |
|
|
|
|
|
|
|
|
|
|
| 330 |
tool_names = ", ".join([t.name for t in tools])
|
| 331 |
print(f"Loaded {len(tools)} built-in tools: {tool_names}")
|
| 332 |
|
|
|
|
| 45 |
)
|
| 46 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 47 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 48 |
+
from agent.tools.sandbox_tool import get_sandbox_tools
|
| 49 |
|
| 50 |
# NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
|
| 51 |
# from agent.tools.private_hf_repo_tools import (
|
|
|
|
| 328 |
),
|
| 329 |
]
|
| 330 |
|
| 331 |
+
# Sandbox tools
|
| 332 |
+
tools = get_sandbox_tools() + tools
|
| 333 |
+
|
| 334 |
tool_names = ", ".join([t.name for t in tools])
|
| 335 |
print(f"Loaded {len(tools)} built-in tools: {tool_names}")
|
| 336 |
|
agent/prompts/system_prompt_v2.yaml
CHANGED
|
@@ -186,61 +186,59 @@ system_prompt: |
|
|
| 186 |
3. β
Determine optimal processing approach based on requirements
|
| 187 |
4. β
Plan output format and destination
|
| 188 |
|
| 189 |
-
## PHASE 3: IMPLEMENT (
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
**
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
- `push_to_hub=True` β οΈ MANDATORY
|
| 210 |
- `hub_model_id="username/model-name"` β οΈ MANDATORY
|
| 211 |
- `report_to=["trackio"]` (for monitoring)
|
| 212 |
- `output_dir="./output"`
|
| 213 |
- `num_train_epochs`, `per_device_train_batch_size`, `learning_rate`
|
| 214 |
- `logging_steps`, `save_steps`
|
| 215 |
-
|
| 216 |
-
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
- 7-13B models: `a10g-large` (12vCPU/46GB/GPU 24GB)
|
| 228 |
-
- 30B+ models: `a100-large` (12vCPU/142GB/GPU 80GB)
|
| 229 |
-
- 70B+ models: `h100` (23vCPU/240GB/GPU 80GB) or `h100x8` for distributed
|
| 230 |
-
- [ ] `timeout`: β οΈ CRITICAL - Set based on model/data size:
|
| 231 |
-
- Small models (1-3B): "2h" to "4h"
|
| 232 |
-
- Medium models (7-13B): "4h" to "8h"
|
| 233 |
-
- Large models (30B+): "8h" to "24h"
|
| 234 |
-
- **NEVER use default 30m for training!**
|
| 235 |
|
| 236 |
### For Data Processing Tasks
|
| 237 |
|
| 238 |
-
**
|
| 239 |
-
- Load dataset with `load_dataset`
|
| 240 |
-
- Process according to user requirements
|
| 241 |
-
- Push results with `push_to_hub()` or upload to `hf_private_repos`
|
| 242 |
-
|
| 243 |
-
**Job Configuration:**
|
| 244 |
- Use `cpu-upgrade` or `cpu-performance` for most data tasks
|
| 245 |
- Set timeout based on dataset size (1-4 hours typical)
|
| 246 |
|
|
@@ -341,6 +339,21 @@ system_prompt: |
|
|
| 341 |
- β οΈ Include HF_TOKEN for Hub operations
|
| 342 |
- β οΈ Storage is EPHEMERAL - must push_to_hub
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
**hf_private_repos:**
|
| 345 |
- Store job outputs persistently in datasets with push_to_hub (jobs lose files after completion)
|
| 346 |
- Upload logs, scripts, results that can't push_to_hub
|
|
|
|
| 186 |
3. β
Determine optimal processing approach based on requirements
|
| 187 |
4. β
Plan output format and destination
|
| 188 |
|
| 189 |
+
## PHASE 3: IMPLEMENT (Develop in Sandbox, Launch via Jobs)
|
| 190 |
+
|
| 191 |
+
β οΈ **CRITICAL WORKFLOW: Sandbox First, Jobs Second**
|
| 192 |
+
|
| 193 |
+
For ANY implementation task (training, data processing, inference), follow this pattern:
|
| 194 |
+
|
| 195 |
+
**Step 1: Create a sandbox** β `sandbox_create` with appropriate hardware (cpu-basic for scripting, t4-small for GPU testing)
|
| 196 |
+
**Step 2: Develop & iterate** β Write scripts, install dependencies, test with small runs, fix errors interactively
|
| 197 |
+
**Step 3: Launch via hf_jobs** β Once the script works, pass the sandbox file path directly: `hf_jobs(operation="run", script="/app/train.py", ...)`
|
| 198 |
+
|
| 199 |
+
This is the CORRECT pattern:
|
| 200 |
+
```
|
| 201 |
+
sandbox_create(hardware="t4-small") # interactive dev environment
|
| 202 |
+
bash("pip install trl transformers") # install deps
|
| 203 |
+
write("/app/train.py", "...") # write training script
|
| 204 |
+
bash("cd /app && python train.py --max_steps 10") # test run
|
| 205 |
+
edit("/app/train.py", ...) # fix issues
|
| 206 |
+
bash("cd /app && python train.py --max_steps 10") # verify fix
|
| 207 |
+
hf_jobs(operation="run", script="/app/train.py", hardware_flavor="a10g-large", timeout="4h") # launch at scale
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
Do NOT write long inline scripts directly in hf_jobs if necessary β develop in sandbox first.
|
| 211 |
+
|
| 212 |
+
### Training Script Requirements
|
| 213 |
+
|
| 214 |
+
**Script MUST Include:**
|
| 215 |
+
- Imports from researched documentation (current APIs)
|
| 216 |
+
- Trackio initialization with project/run_name/config
|
| 217 |
+
- Model and tokenizer loading
|
| 218 |
+
- Dataset loading with verified columns and conversational format
|
| 219 |
+
- Training config with ALL critical settings:
|
| 220 |
- `push_to_hub=True` β οΈ MANDATORY
|
| 221 |
- `hub_model_id="username/model-name"` β οΈ MANDATORY
|
| 222 |
- `report_to=["trackio"]` (for monitoring)
|
| 223 |
- `output_dir="./output"`
|
| 224 |
- `num_train_epochs`, `per_device_train_batch_size`, `learning_rate`
|
| 225 |
- `logging_steps`, `save_steps`
|
| 226 |
+
- `trainer.train()` call
|
| 227 |
+
- `trainer.push_to_hub()` at end β οΈ MANDATORY
|
| 228 |
+
|
| 229 |
+
**hf_jobs Launch Configuration:**
|
| 230 |
+
- `script`: Path to sandbox file (e.g. "/app/train.py") or inline code
|
| 231 |
+
- `dependencies`: ['transformers', 'trl', 'torch', 'datasets', 'trackio']
|
| 232 |
+
- `hardware_flavor`: Based on model size:
|
| 233 |
+
- 1-3B models: `t4-small` or `a10g-small`
|
| 234 |
+
- 7-13B models: `a10g-large`
|
| 235 |
+
- 30B+ models: `a100-large`
|
| 236 |
+
- 70B+ models: `h100` or `h100x8`
|
| 237 |
+
- `timeout`: β οΈ CRITICAL β Small (2-4h), Medium (4-8h), Large (8-24h). NEVER default 30m for training.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
### For Data Processing Tasks
|
| 240 |
|
| 241 |
+
**Same pattern:** develop script in sandbox, test on subset, launch via hf_jobs.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
- Use `cpu-upgrade` or `cpu-performance` for most data tasks
|
| 243 |
- Set timeout based on dataset size (1-4 hours typical)
|
| 244 |
|
|
|
|
| 339 |
- β οΈ Include HF_TOKEN for Hub operations
|
| 340 |
- β οΈ Storage is EPHEMERAL - must push_to_hub
|
| 341 |
|
| 342 |
+
## Sandbox (Interactive Development Environment)
|
| 343 |
+
|
| 344 |
+
**sandbox_create:**
|
| 345 |
+
- β οΈ **Create a sandbox FIRST for any implementation task** β develop and test before launching jobs
|
| 346 |
+
- Persistent remote Linux environment on HF Spaces
|
| 347 |
+
- First call sandbox_create with hardware choice, then use bash/read/write/edit freely
|
| 348 |
+
- Hardware: cpu-basic (free tier), cpu-upgrade (8vCPU/32GB), t4-small (16GB GPU), a10g-small (24GB GPU), a10g-large (24GB GPU + 46GB RAM), a100-large (80GB GPU)
|
| 349 |
+
- `pip install` works out of the box β no special flags needed
|
| 350 |
+
- Workflow: sandbox_create β write script β test β fix β hf_jobs(script="/app/script.py") to launch at scale
|
| 351 |
+
|
| 352 |
+
**bash / read / write / edit:**
|
| 353 |
+
- Available after sandbox_create β no additional approvals needed
|
| 354 |
+
- Same semantics as local file/shell operations, but run on the remote sandbox
|
| 355 |
+
- bash: run shell commands; read/write/edit: file operations
|
| 356 |
+
|
| 357 |
**hf_private_repos:**
|
| 358 |
- Store job outputs persistently in datasets with push_to_hub (jobs lose files after completion)
|
| 359 |
- Upload logs, scripts, results that can't push_to_hub
|
agent/prompts/system_prompt_v3.yaml
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
system_prompt: |
|
| 2 |
+
You are Hugging Face Agent, an ML engineering assistant with {{ num_tools }} tools for training, fine-tuning, data processing, inference, and evaluation on the Hugging Face ecosystem.
|
| 3 |
+
|
| 4 |
+
_Current Time: **{{ current_date }} {{ current_time }} ({{ current_timezone }})**_
|
| 5 |
+
{% if hf_user_info %}_Authenticated as: **{{ hf_user_info }}**_{% endif %}
|
| 6 |
+
|
| 7 |
+
Your goal is to complete what the user requested with zero errors. You are fully autonomous β research, validate, implement, and deliver results without asking for unnecessary confirmation.
|
| 8 |
+
|
| 9 |
+
# Your knowledge of HF libraries is outdated
|
| 10 |
+
|
| 11 |
+
You do not know current APIs for TRL, Transformers, PEFT, Trackio, or other HF libraries. Your internal knowledge WILL produce wrong imports, wrong argument names, and wrong trainer configurations.
|
| 12 |
+
|
| 13 |
+
Before writing any ML implementation code (training, fine-tuning, inference, data processing), ground yourself in current working code:
|
| 14 |
+
|
| 15 |
+
github_find_examples β github_read_file β explore_hf_docs + fetch_hf_docs
|
| 16 |
+
|
| 17 |
+
Skip research only for trivial non-code operations.
|
| 18 |
+
|
| 19 |
+
# Mistakes you WILL make without research
|
| 20 |
+
|
| 21 |
+
HALLUCINATED IMPORTS: You will import from modules that were renamed or removed. Example: old TRL trainer class names, deprecated Transformers APIs, wrong trackio parameter names (e.g. `run_name` instead of `name`). Fix: read a current example script first.
|
| 22 |
+
|
| 23 |
+
WRONG TRAINER ARGUMENTS: You will pass configuration arguments that don't exist in current trainer versions. Fix: fetch the actual trainer/config docs via explore_hf_docs + fetch_hf_docs.
|
| 24 |
+
|
| 25 |
+
WRONG DATASET FORMAT: You will assume column names without checking. Training fails with KeyError. Fix: call hf_inspect_dataset or hub_repo_details and verify columns match the training method.
|
| 26 |
+
|
| 27 |
+
DEFAULT TIMEOUT KILLS JOBS: You will leave timeout at the default 30m for training jobs. Training takes hours. The job gets killed and all progress is lost. Fix: set timeout based on model size (minimum 2h for any training).
|
| 28 |
+
|
| 29 |
+
LOST MODELS: You will forget push_to_hub=True and hub_model_id in training config. Job storage is ephemeral β the filesystem is deleted when the job ends. Without push_to_hub, the trained model is permanently lost.
|
| 30 |
+
|
| 31 |
+
BATCH FAILURES: You will submit all ablation/batch jobs at once without testing that one works first. All will fail for the same bug. Fix: submit ONE job first, verify it completes successfully, then submit the rest.
|
| 32 |
+
|
| 33 |
+
SILENT DATASET SUBSTITUTION: When a requested dataset fails to load, you will silently switch to a different one without telling the user. Fix: if the requested dataset isn't available, tell the user and ask what to do.
|
| 34 |
+
|
| 35 |
+
HARDCODED UNAVAILABLE PACKAGES: You will forget to install necessary packages like 'flash-attn' for flash_attention_2 or other packages that aren't automatically installed in the job environment. Fix: install necessary packages before running the job.
|
| 36 |
+
|
| 37 |
+
SCOPE-CHANGING FIXES: Avoid at all costs! When you hit an error (especially OOM), you will try "creative" workarounds that change what the user asked for and/or change the training task itself β switching full SFT to LoRA on OOM, reducing max_length (silently truncates training data and changes what the model learns), disabling monitoring instead of fixing it. Do not do this. Fix errors with the minimal change that preserves the user's original request and are grounded in research and examples. If the original approach genuinely cannot work, explain why and ask the user for input before changing methods, sequence length, training approach or any other part of the task.
|
| 38 |
+
|
| 39 |
+
# When writing ML code
|
| 40 |
+
|
| 41 |
+
Required sequence before any training/fine-tuning/inference script:
|
| 42 |
+
1. Find working examples: github_find_examples (discover) β github_read_file (study)
|
| 43 |
+
2. Check documentation: explore_hf_docs + fetch_hf_docs for trainer configs and parameters
|
| 44 |
+
3. Validate dataset details: hf_inspect_dataset to confirm column names and format.
|
| 45 |
+
4. Validate model details: hub_repo_details to confirm model exists, it's the correct architecture/size/tokenizer etc.
|
| 46 |
+
|
| 47 |
+
Dataset format requirements by training method:
|
| 48 |
+
SFT: "messages", "text", or "prompt"/"completion"
|
| 49 |
+
DPO: "prompt", "chosen", "rejected"
|
| 50 |
+
GRPO: "prompt"
|
| 51 |
+
|
| 52 |
+
# When submitting a training job
|
| 53 |
+
|
| 54 |
+
Before calling hf_jobs, output a pre-flight check:
|
| 55 |
+
- Reference implementation: [which example you based this on]
|
| 56 |
+
- Dataset format verified: [columns confirmed via hf_inspect_dataset/hub_repo_details]
|
| 57 |
+
- push_to_hub=True and hub_model_id set
|
| 58 |
+
- timeout: [value] (based on: [model size] on [hardware])
|
| 59 |
+
- Trackio monitoring included and working
|
| 60 |
+
|
| 61 |
+
If you cannot fill in all items, stop and complete the missing steps first.
|
| 62 |
+
|
| 63 |
+
For batch/ablation jobs: submit ONE job first. Check logs to confirm it starts training successfully. Only then submit the remaining jobs. Never submit all at once.
|
| 64 |
+
|
| 65 |
+
Hardware sizing:
|
| 66 |
+
1-3B params: a10g-largex2
|
| 67 |
+
7-13B params: a100-large
|
| 68 |
+
30B+ params: l40sx4 or a100x4
|
| 69 |
+
70B+ params: a100x8
|
| 70 |
+
Note: a10g-small and a10g-large have the SAME 24GB GPU memory. The difference is CPU/RAM only.
|
| 71 |
+
|
| 72 |
+
# Sandbox-first development
|
| 73 |
+
|
| 74 |
+
For non-trivial scripts, develop and test in a sandbox before launching via hf_jobs:
|
| 75 |
+
sandbox_create β install deps β write script β test with small run β fix errors β launch via hf_jobs at scale
|
| 76 |
+
|
| 77 |
+
Use GPU sandbox (t4-small minimum) when testing code that uses CUDA, bf16, or model loading. CPU sandboxes cannot test GPU code paths.
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# When a task has 3+ steps
|
| 81 |
+
|
| 82 |
+
Use plan_tool to track progress. One task in_progress at a time. Mark completed immediately after finishing. Update frequently to show the user what you're doing.
|
| 83 |
+
|
| 84 |
+
# Error recovery
|
| 85 |
+
|
| 86 |
+
When something fails:
|
| 87 |
+
- Diagnose the actual error. Read the full error message and logs.
|
| 88 |
+
- Do not retry the exact same thing. Identify what needs to change.
|
| 89 |
+
- If an API/import error: check documentation for the correct API.
|
| 90 |
+
- If an OOM error: (1) reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally to keep effective batch size identical, (2) enable gradient_checkpointing=True, (3) upgrade to larger GPU (a10gx4βa100βa100x4βa100x8). Do NOT switch training methods (e.g. SFTβLoRA) or reduce max_length β those change what the user gets. If OOM happens in sandbox, create a new sandbox with larger GPU hardware.
|
| 91 |
+
- Never change the user's requested approach (training method, dataset, model, sequence length) without explicit approval.
|
| 92 |
+
- If a tool call fails repeatedly for the same reason: stop and try a different approach.
|
| 93 |
+
- Never silently substitute resources (datasets, models) β tell the user if something isn't available.
|
| 94 |
+
|
| 95 |
+
# Task completion
|
| 96 |
+
|
| 97 |
+
Before ending your turn, verify:
|
| 98 |
+
- Did you actually DO what the user asked, not just explain what you would do?
|
| 99 |
+
- If something failed: did you diagnose and fix it, or at minimum explain what went wrong and ask for user input?
|
| 100 |
+
- For training jobs: did you include a working Trackio dashboard URL?
|
| 101 |
+
|
| 102 |
+
Do not stop after describing what you plan to do. Continue calling tools until the task is verifiably done.
|
| 103 |
+
Do not mark plan tasks as completed if they failed or are only partially done.
|
| 104 |
+
|
| 105 |
+
# Communication
|
| 106 |
+
|
| 107 |
+
- Be concise and direct. No filler, no restating what the user said.
|
| 108 |
+
- One-word answers when appropriate for simple questions.
|
| 109 |
+
- Always include direct Hub URLs when referencing models, datasets, Spaces, or jobs.
|
| 110 |
+
- For errors: state what went wrong, why, and what you're doing to fix it.
|
| 111 |
+
- Do not over-explain or present elaborate option menus for simple tasks. When the user's intent is clear, act on it. Present options only when there's genuine ambiguity.
|
| 112 |
+
|
| 113 |
+
# Tool usage
|
| 114 |
+
|
| 115 |
+
- Execute multiple independent tool calls in parallel when possible.
|
| 116 |
+
- HF_TOKEN is automatically available in job secrets β no need to include it extra.
|
| 117 |
+
- For training monitoring: include Trackio in the script and provide the dashboard URL.
|
| 118 |
+
- For private/gated datasets: HF_TOKEN is needed β it's auto-loaded into job secrets.
|
agent/tools/dataset_tools.py
CHANGED
|
@@ -388,22 +388,15 @@ def _format_parquet_files(data: dict, max_rows: int = 10) -> str | None:
|
|
| 388 |
HF_INSPECT_DATASET_TOOL_SPEC = {
|
| 389 |
"name": "hf_inspect_dataset",
|
| 390 |
"description": (
|
| 391 |
-
"Inspect a
|
| 392 |
-
"
|
| 393 |
-
"
|
| 394 |
-
"
|
| 395 |
-
"
|
| 396 |
-
"
|
| 397 |
-
"
|
| 398 |
-
"
|
| 399 |
-
"
|
| 400 |
-
"- Column names for your dataloader\n"
|
| 401 |
-
"- Data types and format\n"
|
| 402 |
-
"- Available splits (train/test/validation)\n\n"
|
| 403 |
-
"Supports private/gated datasets when HF_TOKEN is set.\n\n"
|
| 404 |
-
"## Examples\n"
|
| 405 |
-
'{"dataset": "stanfordnlp/imdb"}\n'
|
| 406 |
-
'{"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}\n'
|
| 407 |
),
|
| 408 |
"parameters": {
|
| 409 |
"type": "object",
|
|
|
|
| 388 |
HF_INSPECT_DATASET_TOOL_SPEC = {
|
| 389 |
"name": "hf_inspect_dataset",
|
| 390 |
"description": (
|
| 391 |
+
"Inspect a HF dataset in one call: status, configs/splits, schema, sample rows, parquet info.\n\n"
|
| 392 |
+
"REQUIRED before any training job to verify dataset format matches training method:\n"
|
| 393 |
+
" SFT: needs 'messages', 'text', or 'prompt'/'completion'\n"
|
| 394 |
+
" DPO: needs 'prompt', 'chosen', 'rejected'\n"
|
| 395 |
+
" GRPO: needs 'prompt'\n"
|
| 396 |
+
"All datasets used for training have to be in conversational ChatML format to be compatible with HF libraries.'\n"
|
| 397 |
+
"Training will fail with KeyError if columns don't match.\n\n"
|
| 398 |
+
"Also use to get example datapoints, understand column names, data types, and available splits before writing any data loading code. "
|
| 399 |
+
"Supports private/gated datasets when HF_TOKEN is set."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
),
|
| 401 |
"parameters": {
|
| 402 |
"type": "object",
|
agent/tools/docs_tools.py
CHANGED
|
@@ -845,17 +845,12 @@ DOC_ENDPOINTS = [
|
|
| 845 |
EXPLORE_HF_DOCS_TOOL_SPEC = {
|
| 846 |
"name": "explore_hf_docs",
|
| 847 |
"description": (
|
| 848 |
-
"
|
| 849 |
-
"
|
| 850 |
-
"
|
| 851 |
-
"
|
| 852 |
-
"
|
| 853 |
-
"
|
| 854 |
-
"**Pattern:** explore (discover structure) β fetch_hf_docs (get details) β implement with researched approach. "
|
| 855 |
-
"Returns: Sidebar navigation with titles, URLs, and glimpses of all pages in the selected documentation. "
|
| 856 |
-
"**Then:** Use fetch_hf_docs with specific URLs from results to get full content. "
|
| 857 |
-
"**Critical for reliability:** Never implement based on internal knowledge without checking current docs first - APIs change frequently."
|
| 858 |
-
" By default returns the top 20 results; set max_results (max 50) to adjust."
|
| 859 |
),
|
| 860 |
"parameters": {
|
| 861 |
"type": "object",
|
|
@@ -928,16 +923,10 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 928 |
HF_DOCS_FETCH_TOOL_SPEC = {
|
| 929 |
"name": "fetch_hf_docs",
|
| 930 |
"description": (
|
| 931 |
-
"Fetch full markdown content of
|
| 932 |
-
"
|
| 933 |
-
"
|
| 934 |
-
"
|
| 935 |
-
"(5) Need parameter descriptions and usage patterns. "
|
| 936 |
-
"**Pattern:** explore_hf_docs (find relevant page) β fetch_hf_docs (get full content) β implement using documented approach. "
|
| 937 |
-
"Provide full URL from explore_hf_docs results (e.g., 'https://huggingface.co/docs/trl/sft_trainer'). "
|
| 938 |
-
"Returns: Complete markdown documentation with examples, parameters, and usage patterns. "
|
| 939 |
-
"**For training tasks:** ALWAYS fetch trainer docs (SFTConfig, DPOConfig, etc.) before creating training scripts. "
|
| 940 |
-
"**Critical for reliability:** This ensures you use current APIs and best practices."
|
| 941 |
),
|
| 942 |
"parameters": {
|
| 943 |
"type": "object",
|
|
|
|
| 845 |
EXPLORE_HF_DOCS_TOOL_SPEC = {
|
| 846 |
"name": "explore_hf_docs",
|
| 847 |
"description": (
|
| 848 |
+
"Browse HF documentation structure β discover all available documentation with 200-char previews.\n\n"
|
| 849 |
+
"Use this to find relevant documentation and/or examples with detailed parameter docs and API reference. "
|
| 850 |
+
"To be used together with github_find_examples and github_read_file to find working examples and documentation.\n\n"
|
| 851 |
+
"Pattern: explore_hf_docs (find relevant pages) β fetch_hf_docs (get full content).\n\n"
|
| 852 |
+
"For training tasks: fetch the trainer config docs (SFTConfig, DPOConfig, GRPOConfig) to verify parameter names. "
|
| 853 |
+
"Returns top 20 results by default; set max_results (max 50) to adjust."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
),
|
| 855 |
"parameters": {
|
| 856 |
"type": "object",
|
|
|
|
| 923 |
HF_DOCS_FETCH_TOOL_SPEC = {
|
| 924 |
"name": "fetch_hf_docs",
|
| 925 |
"description": (
|
| 926 |
+
"Fetch full markdown content of an HF documentation page. Use after explore_hf_docs.\n\n"
|
| 927 |
+
"Critical for finding documentation e.g. current trainer configuration parameters (SFTConfig, DPOConfig, etc.) "
|
| 928 |
+
"Use for researching solutions and before writing training scripts. Your internal knowledge is outdated.\n\n"
|
| 929 |
+
"Provide the full URL from explore_hf_docs results. The .md extension is added automatically."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
),
|
| 931 |
"parameters": {
|
| 932 |
"type": "object",
|
agent/tools/github_find_examples.py
CHANGED
|
@@ -405,55 +405,16 @@ def find_examples(
|
|
| 405 |
GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
|
| 406 |
"name": "github_find_examples",
|
| 407 |
"description": (
|
| 408 |
-
"
|
| 409 |
-
"
|
| 410 |
-
"
|
| 411 |
-
"
|
| 412 |
-
"
|
| 413 |
-
"
|
| 414 |
-
"
|
| 415 |
-
"
|
| 416 |
-
"
|
| 417 |
-
"
|
| 418 |
-
"## How it works\n\n"
|
| 419 |
-
"1. Fetches all example files (examples/, scripts/, tutorials/, demos/, notebooks/, etc.) from repository\n"
|
| 420 |
-
"2. If keyword provided, scores files against keyword using fuzzy matching\n"
|
| 421 |
-
"3. Returns best matches sorted by relevance and pattern priority\n"
|
| 422 |
-
"4. Provides copyable parameters for github_read_file tool\n\n"
|
| 423 |
-
"## Examples\n\n"
|
| 424 |
-
"<example>\n"
|
| 425 |
-
"// ML Workflow Step: Find GRPO training examples before implementation\n"
|
| 426 |
-
"// Task: Starting GRPO fine-tuning project, need reference implementation\n"
|
| 427 |
-
"{\n"
|
| 428 |
-
" keyword: 'grpo',\n"
|
| 429 |
-
" repo: 'trl',\n"
|
| 430 |
-
" org: 'huggingface'\n"
|
| 431 |
-
"}\n"
|
| 432 |
-
"// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
|
| 433 |
-
"// Next step: github_read_file to study working implementation\n"
|
| 434 |
-
"</example>\n\n"
|
| 435 |
-
"<example>\n"
|
| 436 |
-
"// ML Workflow Step: Discover all available training methods\n"
|
| 437 |
-
"// Task: Exploring TRL training options before choosing approach\n"
|
| 438 |
-
"{\n"
|
| 439 |
-
" repo: 'trl',\n"
|
| 440 |
-
" org: 'huggingface',\n"
|
| 441 |
-
" max_results: 20\n"
|
| 442 |
-
"}\n"
|
| 443 |
-
"// Lists: SFT, DPO, GRPO, PPO, reward modeling examples\n"
|
| 444 |
-
"// Helps user choose appropriate method\n"
|
| 445 |
-
"</example>\n\n"
|
| 446 |
-
"<example>\n"
|
| 447 |
-
"// ML Workflow Step: Find LoRA fine-tuning examples\n"
|
| 448 |
-
"// Task: Learning parameter-efficient fine-tuning patterns\n"
|
| 449 |
-
"{\n"
|
| 450 |
-
" keyword: 'lora',\n"
|
| 451 |
-
" repo: 'peft',\n"
|
| 452 |
-
" org: 'huggingface'\n"
|
| 453 |
-
"}\n"
|
| 454 |
-
"// Discovers LoRA configuration and training examples\n"
|
| 455 |
-
"// Shows current PEFT API usage patterns\n"
|
| 456 |
-
"</example>"
|
| 457 |
),
|
| 458 |
"parameters": {
|
| 459 |
"type": "object",
|
|
|
|
| 405 |
GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
|
| 406 |
"name": "github_find_examples",
|
| 407 |
"description": (
|
| 408 |
+
"Find working example scripts in GitHub repositories (from a list of predetermined directories e.g. examples/, scripts/, tutorials/, etc.). "
|
| 409 |
+
"Uses fuzzy keyword matching.\n\n"
|
| 410 |
+
"MANDATORY before writing any ML training, fine-tuning, or inference code. "
|
| 411 |
+
"Your internal knowledge of library APIs is outdated β working examples show current API patterns.\n\n"
|
| 412 |
+
"Sequence: github_find_examples β github_read_file (study the example) β implement based on what you found.\n\n"
|
| 413 |
+
"Skip this only for: simple data queries, status checks, non-code tasks.\n\n"
|
| 414 |
+
"Examples:\n"
|
| 415 |
+
" {keyword: 'sft', repo: 'trl'} β finds examples/scripts/sft.py\n"
|
| 416 |
+
" {keyword: 'grpo', repo: 'trl'} β finds GRPO training examples\n"
|
| 417 |
+
" {repo: 'trl', max_results: 20} β lists all available training method examples"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
),
|
| 419 |
"parameters": {
|
| 420 |
"type": "object",
|
agent/tools/github_read_file.py
CHANGED
|
@@ -250,59 +250,13 @@ def read_file(
|
|
| 250 |
GITHUB_READ_FILE_TOOL_SPEC = {
|
| 251 |
"name": "github_read_file",
|
| 252 |
"description": (
|
| 253 |
-
"Read file contents from GitHub repositories
|
| 254 |
-
"
|
| 255 |
-
"
|
| 256 |
-
"
|
| 257 |
-
"
|
| 258 |
-
"**Pattern:** github_find_examples (discover files) β github_read_file (read code) β implement using researched patterns. "
|
| 259 |
-
"Returns: File contents with line numbers, formatted for LLM reading. Auto-converts Jupyter notebooks to markdown. "
|
| 260 |
-
"**Then:** Implement using patterns and APIs from the example code. "
|
| 261 |
-
"**Critical for reliability:** Reading working examples prevents API errors and shows current best practices. "
|
| 262 |
"Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
|
| 263 |
-
"
|
| 264 |
-
"- When reading example code, trainer implementations, or configuration files\n"
|
| 265 |
-
"- After github_find_examples returns file paths you want to study\n"
|
| 266 |
-
"- When investigating specific code sections with line ranges\n"
|
| 267 |
-
"- When reading from specific branches, tags, or commits (use ref parameter)\n\n"
|
| 268 |
-
"## When NOT to use this tool\n\n"
|
| 269 |
-
"- When you don't know exact file path (use github_find_examples or github_search_code first)\n"
|
| 270 |
-
"- When searching for code patterns across repos (use github_search_code instead)\n\n"
|
| 271 |
-
"## Examples\n\n"
|
| 272 |
-
"<example>\n"
|
| 273 |
-
"// ML Workflow Step: Read GRPO trainer class after finding via github_find_examples\n"
|
| 274 |
-
"// Use case: Understand GRPOTrainer API, parameters, and methods\n"
|
| 275 |
-
"{\n"
|
| 276 |
-
" repo: 'huggingface/trl',\n"
|
| 277 |
-
" path: 'trl/trainer/grpo_trainer.py',\n"
|
| 278 |
-
" line_start: 1,\n"
|
| 279 |
-
" line_end: 200\n"
|
| 280 |
-
"}\n"
|
| 281 |
-
"// Read class definition and constructor to understand current API\n"
|
| 282 |
-
"// Shows: __init__ parameters, configuration, required arguments\n"
|
| 283 |
-
"</example>\n\n"
|
| 284 |
-
"<example>\n"
|
| 285 |
-
"// ML Workflow Step: Study complete training script from examples\n"
|
| 286 |
-
"// Use case: Learn end-to-end VLM fine-tuning workflow\n"
|
| 287 |
-
"{\n"
|
| 288 |
-
" repo: 'huggingface/trl',\n"
|
| 289 |
-
" path: 'examples/scripts/grpo_vlm.py'\n"
|
| 290 |
-
"}\n"
|
| 291 |
-
"// Returns first 300 lines - shows full training setup\n"
|
| 292 |
-
"// Use line_start/line_end if need to read more\n"
|
| 293 |
-
"</example>\n\n"
|
| 294 |
-
"<example>\n"
|
| 295 |
-
"// ML Workflow Step: Check TrainingArguments configuration patterns\n"
|
| 296 |
-
"// Use case: Learn how to structure training configs correctly\n"
|
| 297 |
-
"{\n"
|
| 298 |
-
" repo: 'huggingface/transformers',\n"
|
| 299 |
-
" path: 'examples/pytorch/language-modeling/run_clm.py',\n"
|
| 300 |
-
" line_start: 50,\n"
|
| 301 |
-
" line_end: 150\n"
|
| 302 |
-
"}\n"
|
| 303 |
-
"// Read argument parsing and config setup section\n"
|
| 304 |
-
"// Shows: current parameter names, default values, best practices\n"
|
| 305 |
-
"</example>"
|
| 306 |
),
|
| 307 |
"parameters": {
|
| 308 |
"type": "object",
|
|
|
|
| 250 |
GITHUB_READ_FILE_TOOL_SPEC = {
|
| 251 |
"name": "github_read_file",
|
| 252 |
"description": (
|
| 253 |
+
"Read file contents from GitHub repositories. Returns first 300 lines by default. "
|
| 254 |
+
"Auto-converts Jupyter notebooks to markdown.\n\n"
|
| 255 |
+
"Use AFTER github_find_examples to study the working implementation. "
|
| 256 |
+
"The purpose is to learn current API patterns β imports, trainer configs, dataset handling β "
|
| 257 |
+
"so your implementation uses correct, up-to-date code.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
"Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
|
| 259 |
+
"When NOT to use: when you don't know the file path (use github_find_examples first)."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
),
|
| 261 |
"parameters": {
|
| 262 |
"type": "object",
|
agent/tools/jobs_tool.py
CHANGED
|
@@ -9,7 +9,7 @@ import base64
|
|
| 9 |
import http.client
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
-
from typing import Any,
|
| 13 |
|
| 14 |
import httpx
|
| 15 |
from huggingface_hub import HfApi
|
|
@@ -25,38 +25,33 @@ from agent.tools.utilities import (
|
|
| 25 |
)
|
| 26 |
|
| 27 |
# Hardware flavors
|
| 28 |
-
CPU_FLAVORS = ["cpu-basic", "cpu-upgrade"
|
| 29 |
GPU_FLAVORS = [
|
| 30 |
-
"sprx8",
|
| 31 |
-
"zero-a10g",
|
| 32 |
"t4-small",
|
| 33 |
"t4-medium",
|
| 34 |
-
"l4x1",
|
| 35 |
-
"l4x4",
|
| 36 |
-
"l40sx1",
|
| 37 |
-
"l40sx4",
|
| 38 |
-
"l40sx8",
|
| 39 |
"a10g-small",
|
| 40 |
"a10g-large",
|
| 41 |
"a10g-largex2",
|
| 42 |
"a10g-largex4",
|
| 43 |
"a100-large",
|
| 44 |
-
"
|
| 45 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
]
|
| 47 |
|
| 48 |
# Detailed specs for display (vCPU/RAM/GPU VRAM)
|
| 49 |
-
CPU_FLAVORS_DESC = (
|
| 50 |
-
"cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
|
| 51 |
-
)
|
| 52 |
GPU_FLAVORS_DESC = (
|
| 53 |
"t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
|
| 54 |
-
"
|
| 55 |
-
"l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
|
| 56 |
-
"a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
|
| 57 |
"a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
|
| 58 |
-
"a100-large(12vCPU/142GB/GPU 80GB),
|
| 59 |
-
"
|
|
|
|
| 60 |
)
|
| 61 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 62 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
|
@@ -118,6 +113,21 @@ def _filter_uv_install_output(logs: list[str]) -> list[str]:
|
|
| 118 |
return logs
|
| 119 |
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
def _add_environment_variables(params: Dict[str, Any] | None) -> Dict[str, Any]:
|
| 122 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or ""
|
| 123 |
|
|
@@ -374,7 +384,9 @@ class HfJobsTool:
|
|
| 374 |
def log_producer():
|
| 375 |
try:
|
| 376 |
# fetch_job_logs is a blocking sync generator
|
| 377 |
-
logs_gen = self.api.fetch_job_logs(
|
|
|
|
|
|
|
| 378 |
for line in logs_gen:
|
| 379 |
# Push line to queue thread-safely
|
| 380 |
loop.call_soon_threadsafe(queue.put_nowait, line)
|
|
@@ -497,7 +509,7 @@ class HfJobsTool:
|
|
| 497 |
self.api.run_job,
|
| 498 |
image=image,
|
| 499 |
command=command,
|
| 500 |
-
env=args.get("env"),
|
| 501 |
secrets=_add_environment_variables(args.get("secrets")),
|
| 502 |
flavor=args.get("hardware_flavor", "cpu-basic"),
|
| 503 |
timeout=args.get("timeout", "30m"),
|
|
@@ -715,7 +727,7 @@ To verify, call this tool with `{{"operation": "inspect", "job_id": "{job_id}"}}
|
|
| 715 |
image=image,
|
| 716 |
command=command,
|
| 717 |
schedule=schedule,
|
| 718 |
-
env=args.get("env"),
|
| 719 |
secrets=_add_environment_variables(args.get("secrets")),
|
| 720 |
flavor=args.get("hardware_flavor", "cpu-basic"),
|
| 721 |
timeout=args.get("timeout", "30m"),
|
|
@@ -875,56 +887,31 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "scheduled_
|
|
| 875 |
HF_JOBS_TOOL_SPEC = {
|
| 876 |
"name": "hf_jobs",
|
| 877 |
"description": (
|
| 878 |
-
"Execute Python scripts or Docker containers on HF cloud infrastructure
|
| 879 |
-
"
|
| 880 |
-
"
|
| 881 |
-
"
|
| 882 |
-
"
|
| 883 |
-
"
|
| 884 |
-
"
|
| 885 |
-
"
|
| 886 |
-
"
|
| 887 |
-
"
|
| 888 |
-
"
|
| 889 |
-
"
|
| 890 |
-
"
|
| 891 |
-
"
|
| 892 |
-
"
|
| 893 |
-
"
|
| 894 |
-
"
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
"
|
| 898 |
-
"
|
| 899 |
-
"
|
| 900 |
-
"
|
| 901 |
-
"
|
| 902 |
-
"
|
| 903 |
-
"β DON'T poll logs automatically\n"
|
| 904 |
-
"β DON'T wait for completion\n"
|
| 905 |
-
"β DON'T check status unless user asks\n\n"
|
| 906 |
-
"**For Training Tasks:**\n"
|
| 907 |
-
"β’ ALWAYS research TRL docs first: explore_hf_docs('trl') β fetch_hf_docs(<trainer_url>)\n"
|
| 908 |
-
"β’ ALWAYS validate dataset format with hub_repo_details (SFT needs messages/text, DPO needs chosen/rejected)\n"
|
| 909 |
-
"β’ ALWAYS include Trackio monitoring in script (explore_hf_docs('trackio'))\n"
|
| 910 |
-
"β’ ALWAYS enable push_to_hub=True in training config\n"
|
| 911 |
-
"β’ Set timeout 2-8h for training (NOT default 30m)\n"
|
| 912 |
-
"β’ Confirm model/dataset choices with user before submitting\n\n"
|
| 913 |
-
"**Examples:**\n\n"
|
| 914 |
-
"**Training - Fine-tune LLM:**\n"
|
| 915 |
-
"{'operation': 'run', 'script': '# Training script with TRL\\nfrom trl import SFTConfig, SFTTrainer\\nfrom transformers import AutoModelForCausalLM\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B\")\\n# ... researched implementation from docs ...\\ntrainer.train()\\ntrainer.push_to_hub(\"user-name/my-model\")', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a10g-large', 'timeout': '4h'}\n\n"
|
| 916 |
-
"**Data Processing:**\n"
|
| 917 |
-
"{'operation': 'run', 'script': 'from datasets import load_dataset\\nds = load_dataset(\"data\")\\n# process...\\nds.push_to_hub(\"user/processed\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-upgrade', 'timeout': '2h'}\n\n"
|
| 918 |
-
"**Scheduled Daily Job:**\n"
|
| 919 |
-
"{'operation': 'scheduled run', 'schedule': '@daily', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-basic'}\n\n"
|
| 920 |
-
"**Docker Mode:**\n"
|
| 921 |
-
"{'operation': 'run', 'image': 'pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime', 'command': ['python', 'train.py', '--epochs', '10'], 'hardware_flavor': 'a100-large'}\n\n"
|
| 922 |
-
"**Monitor Operations:**\n"
|
| 923 |
-
"{'operation': 'ps'} - List all jobs\n"
|
| 924 |
-
"{'operation': 'logs', 'job_id': 'xxx'} - Stream logs (only when user requests)\n"
|
| 925 |
-
"{'operation': 'inspect', 'job_id': 'xxx'} - Get job details\n"
|
| 926 |
-
"{'operation': 'cancel', 'job_id': 'xxx'} - Stop job\n\n"
|
| 927 |
-
"β οΈ CRITICAL: Files created during execution are DELETED when job finishes. MUST push_to_hub() all outputs (models, datasets, artifacts) in script. For logs/scripts, use hf_private_repos after completion."
|
| 928 |
),
|
| 929 |
"parameters": {
|
| 930 |
"type": "object",
|
|
@@ -944,58 +931,65 @@ HF_JOBS_TOOL_SPEC = {
|
|
| 944 |
"scheduled suspend",
|
| 945 |
"scheduled resume",
|
| 946 |
],
|
| 947 |
-
"description":
|
| 948 |
-
"Operation to execute. Valid values: [run, ps, logs, inspect, cancel, "
|
| 949 |
-
"scheduled run, scheduled ps, scheduled inspect, scheduled delete, "
|
| 950 |
-
"scheduled suspend, scheduled resume]"
|
| 951 |
-
),
|
| 952 |
},
|
| 953 |
-
# Python/UV specific parameters
|
| 954 |
"script": {
|
| 955 |
"type": "string",
|
| 956 |
-
"description":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
},
|
| 958 |
"dependencies": {
|
| 959 |
"type": "array",
|
| 960 |
"items": {"type": "string"},
|
| 961 |
-
"description":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 962 |
},
|
| 963 |
-
# Docker specific parameters
|
| 964 |
"image": {
|
| 965 |
"type": "string",
|
| 966 |
-
"description": "Docker image.
|
| 967 |
},
|
| 968 |
"command": {
|
| 969 |
"type": "array",
|
| 970 |
"items": {"type": "string"},
|
| 971 |
-
"description": "Command to execute as list.
|
| 972 |
},
|
| 973 |
-
# Hardware and environment
|
| 974 |
"hardware_flavor": {
|
| 975 |
"type": "string",
|
| 976 |
-
"description":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
},
|
| 978 |
"timeout": {
|
| 979 |
"type": "string",
|
| 980 |
-
"description":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
},
|
| 982 |
"env": {
|
| 983 |
"type": "object",
|
| 984 |
-
"description": "Environment variables
|
| 985 |
},
|
| 986 |
-
# Job management parameters
|
| 987 |
"job_id": {
|
| 988 |
"type": "string",
|
| 989 |
-
"description": "Job ID
|
| 990 |
},
|
| 991 |
-
# Scheduled job parameters
|
| 992 |
"scheduled_job_id": {
|
| 993 |
"type": "string",
|
| 994 |
-
"description": "Scheduled job ID. Required for:
|
| 995 |
},
|
| 996 |
"schedule": {
|
| 997 |
"type": "string",
|
| 998 |
-
"description": "
|
| 999 |
},
|
| 1000 |
},
|
| 1001 |
"required": ["operation"],
|
|
@@ -1015,6 +1009,28 @@ async def hf_jobs_handler(
|
|
| 1015 |
Event(event_type="tool_log", data={"tool": "hf_jobs", "log": log})
|
| 1016 |
)
|
| 1017 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1018 |
# Get token and namespace from HF token
|
| 1019 |
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 1020 |
namespace = HfApi(token=hf_token).whoami().get("name") if hf_token else None
|
|
|
|
| 9 |
import http.client
|
| 10 |
import os
|
| 11 |
import re
|
| 12 |
+
from typing import Any, Awaitable, Callable, Dict, Literal, Optional
|
| 13 |
|
| 14 |
import httpx
|
| 15 |
from huggingface_hub import HfApi
|
|
|
|
| 25 |
)
|
| 26 |
|
| 27 |
# Hardware flavors
|
| 28 |
+
CPU_FLAVORS = ["cpu-basic", "cpu-upgrade"]
|
| 29 |
GPU_FLAVORS = [
|
|
|
|
|
|
|
| 30 |
"t4-small",
|
| 31 |
"t4-medium",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
"a10g-small",
|
| 33 |
"a10g-large",
|
| 34 |
"a10g-largex2",
|
| 35 |
"a10g-largex4",
|
| 36 |
"a100-large",
|
| 37 |
+
"a100x4",
|
| 38 |
+
"a100x8",
|
| 39 |
+
"l4x1",
|
| 40 |
+
"l4x4",
|
| 41 |
+
"l40sx1",
|
| 42 |
+
"l40sx4",
|
| 43 |
+
"l40sx8",
|
| 44 |
]
|
| 45 |
|
| 46 |
# Detailed specs for display (vCPU/RAM/GPU VRAM)
|
| 47 |
+
CPU_FLAVORS_DESC = "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB)"
|
|
|
|
|
|
|
| 48 |
GPU_FLAVORS_DESC = (
|
| 49 |
"t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
|
| 50 |
+
"a10g-small(4vCPU/15GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
|
|
|
|
|
|
|
| 51 |
"a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
|
| 52 |
+
"a100-large(12vCPU/142GB/GPU 80GB), a100x4(48vCPU/568GB/GPU 320GB), a100x8(96vCPU/1136GB/GPU 640GB), "
|
| 53 |
+
"l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
|
| 54 |
+
"l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB)"
|
| 55 |
)
|
| 56 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 57 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
|
|
|
| 113 |
return logs
|
| 114 |
|
| 115 |
|
| 116 |
+
_DEFAULT_ENV = {
|
| 117 |
+
"HF_HUB_DISABLE_PROGRESS_BARS": "1",
|
| 118 |
+
"TQDM_DISABLE": "1",
|
| 119 |
+
"TRANSFORMERS_VERBOSITY": "warning",
|
| 120 |
+
"HF_HUB_ENABLE_HF_TRANSFER": "1",
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _add_default_env(params: Dict[str, Any] | None) -> Dict[str, Any]:
|
| 125 |
+
"""Inject default env vars for clean, agent-friendly output."""
|
| 126 |
+
result = dict(_DEFAULT_ENV)
|
| 127 |
+
result.update(params or {}) # user-provided values override defaults
|
| 128 |
+
return result
|
| 129 |
+
|
| 130 |
+
|
| 131 |
def _add_environment_variables(params: Dict[str, Any] | None) -> Dict[str, Any]:
|
| 132 |
token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or ""
|
| 133 |
|
|
|
|
| 384 |
def log_producer():
|
| 385 |
try:
|
| 386 |
# fetch_job_logs is a blocking sync generator
|
| 387 |
+
logs_gen = self.api.fetch_job_logs(
|
| 388 |
+
job_id=job_id, namespace=namespace
|
| 389 |
+
)
|
| 390 |
for line in logs_gen:
|
| 391 |
# Push line to queue thread-safely
|
| 392 |
loop.call_soon_threadsafe(queue.put_nowait, line)
|
|
|
|
| 509 |
self.api.run_job,
|
| 510 |
image=image,
|
| 511 |
command=command,
|
| 512 |
+
env=_add_default_env(args.get("env")),
|
| 513 |
secrets=_add_environment_variables(args.get("secrets")),
|
| 514 |
flavor=args.get("hardware_flavor", "cpu-basic"),
|
| 515 |
timeout=args.get("timeout", "30m"),
|
|
|
|
| 727 |
image=image,
|
| 728 |
command=command,
|
| 729 |
schedule=schedule,
|
| 730 |
+
env=_add_default_env(args.get("env")),
|
| 731 |
secrets=_add_environment_variables(args.get("secrets")),
|
| 732 |
flavor=args.get("hardware_flavor", "cpu-basic"),
|
| 733 |
timeout=args.get("timeout", "30m"),
|
|
|
|
| 887 |
HF_JOBS_TOOL_SPEC = {
|
| 888 |
"name": "hf_jobs",
|
| 889 |
"description": (
|
| 890 |
+
"Execute Python scripts or Docker containers on HF cloud infrastructure.\n\n"
|
| 891 |
+
"Two modes (mutually exclusive): Python mode (script + dependencies) or Docker mode (command + image). "
|
| 892 |
+
"Provide exactly ONE of 'script' or 'command'.\n\n"
|
| 893 |
+
"BEFORE submitting training/fine-tuning jobs:\n"
|
| 894 |
+
"- You MUST have called github_find_examples + github_read_file to find a working reference implementation. "
|
| 895 |
+
"Scripts based on your internal knowledge WILL use outdated APIs and fail.\n"
|
| 896 |
+
"- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
|
| 897 |
+
"- Training config MUST include push_to_hub=True and hub_model_id. "
|
| 898 |
+
"Job storage is EPHEMERAL β all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
|
| 899 |
+
"- Include trackio monitoring and provide the dashboard URL to the user.\n\n"
|
| 900 |
+
"BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
|
| 901 |
+
"Only then submit the remaining jobs. Never submit all at once β if there's a bug, all jobs fail.\n\n"
|
| 902 |
+
"Operations: run, ps, logs, inspect, cancel, scheduled run/ps/inspect/delete/suspend/resume.\n\n"
|
| 903 |
+
f"Hardware: CPU: {CPU_FLAVORS_DESC}. GPU: {GPU_FLAVORS_DESC}.\n"
|
| 904 |
+
"Common picks: t4-small ($0.60/hr, 1-3B), a10g-large ($2/hr, 7-13B), a100-large ($4/hr, 30B+), h100 ($6/hr, 70B+). "
|
| 905 |
+
"Note: a10g-small and a10g-large have the SAME 24GB GPU β the difference is CPU/RAM only.\n\n"
|
| 906 |
+
"OOM RECOVERY: When a training job fails with CUDA OOM:\n"
|
| 907 |
+
"1. Reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally (keep effective batch size identical)\n"
|
| 908 |
+
"2. Enable gradient_checkpointing=True\n"
|
| 909 |
+
"3. Upgrade to larger GPU (a10gβa100βh100)\n"
|
| 910 |
+
"Do NOT switch training methods (e.g. full SFT to LoRA) or reduce max_length β those change what the user gets and require explicit approval.\n\n"
|
| 911 |
+
"Examples:\n"
|
| 912 |
+
"Training: {'operation': 'run', 'script': '/app/train.py', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a100-large', 'timeout': '8h'}\n"
|
| 913 |
+
"Monitor: {'operation': 'ps'}, {'operation': 'logs', 'job_id': 'xxx'}, {'operation': 'cancel', 'job_id': 'xxx'}"
|
| 914 |
+
"Docker: {'operation': 'run', 'command': ['duckdb', '-c', 'select 1 + 2'], 'image': 'duckdb/duckdb', 'hardware_flavor': 'cpu-basic', 'timeout': '1h'}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 915 |
),
|
| 916 |
"parameters": {
|
| 917 |
"type": "object",
|
|
|
|
| 931 |
"scheduled suspend",
|
| 932 |
"scheduled resume",
|
| 933 |
],
|
| 934 |
+
"description": "Operation to execute.",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 935 |
},
|
|
|
|
| 936 |
"script": {
|
| 937 |
"type": "string",
|
| 938 |
+
"description": (
|
| 939 |
+
"Python code or sandbox file path (e.g. '/app/train.py') or URL. "
|
| 940 |
+
"Triggers Python mode. For ML training: base this on a working example found via github_find_examples, not on internal knowledge. "
|
| 941 |
+
"Mutually exclusive with 'command'."
|
| 942 |
+
),
|
| 943 |
},
|
| 944 |
"dependencies": {
|
| 945 |
"type": "array",
|
| 946 |
"items": {"type": "string"},
|
| 947 |
+
"description": (
|
| 948 |
+
"Pip packages to install. Include ALL required packages. "
|
| 949 |
+
"Common training set: ['transformers', 'trl', 'torch', 'datasets', 'trackio', 'accelerate']. "
|
| 950 |
+
"Only used with 'script'."
|
| 951 |
+
),
|
| 952 |
},
|
|
|
|
| 953 |
"image": {
|
| 954 |
"type": "string",
|
| 955 |
+
"description": "Docker image. Optional β auto-selected if not provided. Use with 'command'.",
|
| 956 |
},
|
| 957 |
"command": {
|
| 958 |
"type": "array",
|
| 959 |
"items": {"type": "string"},
|
| 960 |
+
"description": "Command to execute as list. Triggers Docker mode. Mutually exclusive with 'script'.",
|
| 961 |
},
|
|
|
|
| 962 |
"hardware_flavor": {
|
| 963 |
"type": "string",
|
| 964 |
+
"description": (
|
| 965 |
+
"Hardware type. Sizing guide: 1-3B params β t4-small/a10g-small, "
|
| 966 |
+
"7-13B β a10g-large, 30B+ β a100-large, 70B+ β h100/h100x8. "
|
| 967 |
+
f"All options: CPU: {CPU_FLAVORS}. GPU: {GPU_FLAVORS}."
|
| 968 |
+
),
|
| 969 |
},
|
| 970 |
"timeout": {
|
| 971 |
"type": "string",
|
| 972 |
+
"description": (
|
| 973 |
+
"Maximum job runtime. MUST be >2h for any training job β default 30m kills training mid-run. "
|
| 974 |
+
"Guidelines: 1-3B models: 3-4h, 7-13B: 6-8h, 30B+: 12-24h. "
|
| 975 |
+
"Use 30m-1h only for quick data processing or inference tasks. Default: '30m'."
|
| 976 |
+
),
|
| 977 |
},
|
| 978 |
"env": {
|
| 979 |
"type": "object",
|
| 980 |
+
"description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
|
| 981 |
},
|
|
|
|
| 982 |
"job_id": {
|
| 983 |
"type": "string",
|
| 984 |
+
"description": "Job ID. Required for: logs, inspect, cancel.",
|
| 985 |
},
|
|
|
|
| 986 |
"scheduled_job_id": {
|
| 987 |
"type": "string",
|
| 988 |
+
"description": "Scheduled job ID. Required for: scheduled inspect/delete/suspend/resume.",
|
| 989 |
},
|
| 990 |
"schedule": {
|
| 991 |
"type": "string",
|
| 992 |
+
"description": "Cron schedule or preset (@hourly, @daily, @weekly, @monthly). Required for: scheduled run.",
|
| 993 |
},
|
| 994 |
},
|
| 995 |
"required": ["operation"],
|
|
|
|
| 1009 |
Event(event_type="tool_log", data={"tool": "hf_jobs", "log": log})
|
| 1010 |
)
|
| 1011 |
|
| 1012 |
+
# If script is a sandbox file path, read it from the sandbox
|
| 1013 |
+
script = arguments.get("script", "")
|
| 1014 |
+
sandbox = getattr(session, "sandbox", None) if session else None
|
| 1015 |
+
is_path = (
|
| 1016 |
+
sandbox
|
| 1017 |
+
and isinstance(script, str)
|
| 1018 |
+
and script.strip() == script
|
| 1019 |
+
and not any(c in script for c in "\r\n\0")
|
| 1020 |
+
and (
|
| 1021 |
+
script.startswith("/")
|
| 1022 |
+
or script.startswith("./")
|
| 1023 |
+
or script.startswith("../")
|
| 1024 |
+
)
|
| 1025 |
+
)
|
| 1026 |
+
if is_path:
|
| 1027 |
+
import shlex
|
| 1028 |
+
|
| 1029 |
+
result = await asyncio.to_thread(sandbox.bash, f"cat {shlex.quote(script)}")
|
| 1030 |
+
if not result.success:
|
| 1031 |
+
return f"Failed to read {script} from sandbox: {result.error}", False
|
| 1032 |
+
arguments = {**arguments, "script": result.output}
|
| 1033 |
+
|
| 1034 |
# Get token and namespace from HF token
|
| 1035 |
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN")
|
| 1036 |
namespace = HfApi(token=hf_token).whoami().get("name") if hf_token else None
|
agent/tools/plan_tool.py
CHANGED
|
@@ -85,18 +85,11 @@ def get_current_plan() -> List[Dict[str, str]]:
|
|
| 85 |
PLAN_TOOL_SPEC = {
|
| 86 |
"name": "plan_tool",
|
| 87 |
"description": (
|
| 88 |
-
"
|
| 89 |
-
"
|
| 90 |
-
"
|
| 91 |
-
"
|
| 92 |
-
"
|
| 93 |
-
"**Pattern:** Create plan at start β Mark in_progress when starting task β Mark completed immediately after finishing β User sees clear progress. "
|
| 94 |
-
"Each call replaces entire plan (full list required). "
|
| 95 |
-
"**Critical for reliability:** Exactly ONE task in_progress at a time (not zero, not multiple). "
|
| 96 |
-
"Mark tasks completed IMMEDIATELY after finishing - don't batch completions. "
|
| 97 |
-
"**For long-running tasks:** Update plan after each major step to keep user informed. "
|
| 98 |
-
"**Only mark completed when:** Task fully accomplished, no errors, all requirements met. "
|
| 99 |
-
"Keep tasks pending if blocked/errors occur - create new task to resolve blockers."
|
| 100 |
),
|
| 101 |
"parameters": {
|
| 102 |
"type": "object",
|
|
|
|
| 85 |
PLAN_TOOL_SPEC = {
|
| 86 |
"name": "plan_tool",
|
| 87 |
"description": (
|
| 88 |
+
"Track progress on multi-step tasks with a todo list (pending/in_progress/completed).\n\n"
|
| 89 |
+
"Use for tasks with 3+ steps. Each call replaces the entire plan (send full list).\n\n"
|
| 90 |
+
"Rules: exactly ONE task in_progress at a time. Mark completed immediately after finishing. "
|
| 91 |
+
"Only mark completed when the task fully succeeded β keep in_progress if there are errors. "
|
| 92 |
+
"Update frequently so the user sees progress."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
),
|
| 94 |
"parameters": {
|
| 95 |
"type": "object",
|
agent/tools/sandbox_client.py
ADDED
|
@@ -0,0 +1,714 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# /// script
|
| 3 |
+
# requires-python = ">=3.10"
|
| 4 |
+
# dependencies = ["huggingface_hub>=0.20.0", "httpx>=0.27.0"]
|
| 5 |
+
# ///
|
| 6 |
+
"""
|
| 7 |
+
Sandbox Tools β Agent-native primitives for HF Space dev-mode sandboxes.
|
| 8 |
+
|
| 9 |
+
Architecture:
|
| 10 |
+
- Creates a sandbox by duplicating a template Space (runs sandbox_server.py)
|
| 11 |
+
- Waits for it to come online
|
| 12 |
+
- Communicates via HTTPS to the Space's API
|
| 13 |
+
- Optionally deletes the Space when done
|
| 14 |
+
|
| 15 |
+
Lifecycle:
|
| 16 |
+
sb = Sandbox.create(owner="burtenshaw") # duplicate, wait, connect
|
| 17 |
+
sb = Sandbox.create(owner="burtenshaw", # with options
|
| 18 |
+
hardware="t4-small",
|
| 19 |
+
private=True,
|
| 20 |
+
sleep_time=3600)
|
| 21 |
+
sb = Sandbox.connect("burtenshaw/my-sandbox-abc") # attach to existing
|
| 22 |
+
|
| 23 |
+
sb.bash("uv run train.py")
|
| 24 |
+
sb.read("/app/train.py")
|
| 25 |
+
sb.edit("/app/train.py", old_str="lr=1e-3", new_str="lr=1e-4")
|
| 26 |
+
|
| 27 |
+
sb.delete() # tear down when done
|
| 28 |
+
|
| 29 |
+
# Or use as a context manager for automatic cleanup
|
| 30 |
+
with Sandbox.create(owner="burtenshaw") as sb:
|
| 31 |
+
sb.bash("python train.py")
|
| 32 |
+
# Space deleted on exit
|
| 33 |
+
|
| 34 |
+
Tools: bash, read, write, edit, upload
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
from __future__ import annotations
|
| 38 |
+
|
| 39 |
+
import io
|
| 40 |
+
import os
|
| 41 |
+
import sys
|
| 42 |
+
import time
|
| 43 |
+
import uuid
|
| 44 |
+
from dataclasses import dataclass, field
|
| 45 |
+
from typing import Any
|
| 46 |
+
|
| 47 |
+
import httpx
|
| 48 |
+
from huggingface_hub import CommitOperationAdd, HfApi
|
| 49 |
+
|
| 50 |
+
TEMPLATE_SPACE = "burtenshaw/sandbox"
|
| 51 |
+
HARDWARE_OPTIONS = [
|
| 52 |
+
"cpu-basic",
|
| 53 |
+
"cpu-upgrade",
|
| 54 |
+
"t4-small",
|
| 55 |
+
"t4-medium",
|
| 56 |
+
"a10g-small",
|
| 57 |
+
"a10g-large",
|
| 58 |
+
"a100-large",
|
| 59 |
+
]
|
| 60 |
+
OUTPUT_LIMIT = 30000
|
| 61 |
+
LINE_LIMIT = 2000
|
| 62 |
+
DEFAULT_READ_LIMIT = 2000
|
| 63 |
+
DEFAULT_TIMEOUT = 120
|
| 64 |
+
MAX_TIMEOUT = 600
|
| 65 |
+
WAIT_TIMEOUT = 300
|
| 66 |
+
WAIT_INTERVAL = 5
|
| 67 |
+
API_WAIT_TIMEOUT = 180
|
| 68 |
+
|
| 69 |
+
_DOCKERFILE = """\
|
| 70 |
+
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
| 71 |
+
|
| 72 |
+
RUN apt-get update && \\
|
| 73 |
+
apt-get install -y \\
|
| 74 |
+
bash git git-lfs wget curl procps \\
|
| 75 |
+
htop vim nano jq tmux \\
|
| 76 |
+
build-essential && \\
|
| 77 |
+
rm -rf /var/lib/apt/lists/*
|
| 78 |
+
|
| 79 |
+
RUN uv pip install --system fastapi uvicorn python-multipart
|
| 80 |
+
|
| 81 |
+
RUN useradd -m -u 1000 user
|
| 82 |
+
USER user
|
| 83 |
+
|
| 84 |
+
ENV HOME=/home/user \\
|
| 85 |
+
PATH=/home/user/.local/bin:$PATH \\
|
| 86 |
+
PIP_USER=1 \\
|
| 87 |
+
HF_HUB_DISABLE_PROGRESS_BARS=1 \\
|
| 88 |
+
TQDM_DISABLE=1 \\
|
| 89 |
+
TRANSFORMERS_VERBOSITY=warning \\
|
| 90 |
+
HF_HUB_ENABLE_HF_TRANSFER=1
|
| 91 |
+
|
| 92 |
+
WORKDIR /app
|
| 93 |
+
COPY --chown=user . /app
|
| 94 |
+
|
| 95 |
+
EXPOSE 7860
|
| 96 |
+
|
| 97 |
+
CMD ["python", "sandbox_server.py"]
|
| 98 |
+
"""
|
| 99 |
+
|
| 100 |
+
_SANDBOX_SERVER = '''\
|
| 101 |
+
"""Minimal FastAPI server for sandbox operations."""
|
| 102 |
+
import os, subprocess, pathlib
|
| 103 |
+
from fastapi import FastAPI
|
| 104 |
+
from pydantic import BaseModel
|
| 105 |
+
from typing import Optional
|
| 106 |
+
import uvicorn
|
| 107 |
+
|
| 108 |
+
app = FastAPI()
|
| 109 |
+
|
| 110 |
+
class BashReq(BaseModel):
|
| 111 |
+
command: str
|
| 112 |
+
work_dir: str = "/app"
|
| 113 |
+
timeout: int = 120
|
| 114 |
+
|
| 115 |
+
class ReadReq(BaseModel):
|
| 116 |
+
path: str
|
| 117 |
+
offset: Optional[int] = None
|
| 118 |
+
limit: Optional[int] = 2000
|
| 119 |
+
|
| 120 |
+
class WriteReq(BaseModel):
|
| 121 |
+
path: str
|
| 122 |
+
content: str
|
| 123 |
+
|
| 124 |
+
class EditReq(BaseModel):
|
| 125 |
+
path: str
|
| 126 |
+
old_str: str
|
| 127 |
+
new_str: str
|
| 128 |
+
replace_all: bool = False
|
| 129 |
+
|
| 130 |
+
class ExistsReq(BaseModel):
|
| 131 |
+
path: str
|
| 132 |
+
|
| 133 |
+
@app.get("/api/health")
|
| 134 |
+
def health():
|
| 135 |
+
return {"status": "ok"}
|
| 136 |
+
|
| 137 |
+
@app.post("/api/bash")
|
| 138 |
+
def bash(req: BashReq):
|
| 139 |
+
try:
|
| 140 |
+
r = subprocess.run(
|
| 141 |
+
req.command, shell=True, capture_output=True, text=True,
|
| 142 |
+
cwd=req.work_dir, timeout=req.timeout,
|
| 143 |
+
)
|
| 144 |
+
output = r.stdout + r.stderr
|
| 145 |
+
if len(output) > 30000:
|
| 146 |
+
output = output[:30000] + "\\n... (truncated)"
|
| 147 |
+
return {"success": r.returncode == 0, "output": output, "error": "" if r.returncode == 0 else f"Exit code {r.returncode}"}
|
| 148 |
+
except subprocess.TimeoutExpired:
|
| 149 |
+
return {"success": False, "output": "", "error": f"Timeout after {req.timeout}s"}
|
| 150 |
+
except Exception as e:
|
| 151 |
+
return {"success": False, "output": "", "error": str(e)}
|
| 152 |
+
|
| 153 |
+
@app.post("/api/read")
|
| 154 |
+
def read(req: ReadReq):
|
| 155 |
+
try:
|
| 156 |
+
p = pathlib.Path(req.path)
|
| 157 |
+
if not p.exists():
|
| 158 |
+
return {"success": False, "output": "", "error": f"File not found: {req.path}"}
|
| 159 |
+
if p.is_dir():
|
| 160 |
+
return {"success": False, "output": "", "error": f"Is a directory: {req.path}"}
|
| 161 |
+
lines = p.read_text().splitlines()
|
| 162 |
+
start = (req.offset or 1) - 1
|
| 163 |
+
end = start + (req.limit or len(lines))
|
| 164 |
+
selected = lines[start:end]
|
| 165 |
+
numbered = "\\n".join(f"{start + i + 1}\\t{line}" for i, line in enumerate(selected))
|
| 166 |
+
return {"success": True, "output": numbered, "error": ""}
|
| 167 |
+
except Exception as e:
|
| 168 |
+
return {"success": False, "output": "", "error": str(e)}
|
| 169 |
+
|
| 170 |
+
@app.post("/api/write")
|
| 171 |
+
def write(req: WriteReq):
|
| 172 |
+
try:
|
| 173 |
+
p = pathlib.Path(req.path)
|
| 174 |
+
p.parent.mkdir(parents=True, exist_ok=True)
|
| 175 |
+
p.write_text(req.content)
|
| 176 |
+
return {"success": True, "output": f"Wrote {len(req.content)} bytes to {req.path}", "error": ""}
|
| 177 |
+
except Exception as e:
|
| 178 |
+
return {"success": False, "output": "", "error": str(e)}
|
| 179 |
+
|
| 180 |
+
@app.post("/api/edit")
|
| 181 |
+
def edit(req: EditReq):
|
| 182 |
+
try:
|
| 183 |
+
p = pathlib.Path(req.path)
|
| 184 |
+
if not p.exists():
|
| 185 |
+
return {"success": False, "output": "", "error": f"File not found: {req.path}"}
|
| 186 |
+
content = p.read_text()
|
| 187 |
+
if req.old_str not in content:
|
| 188 |
+
return {"success": False, "output": "", "error": f"old_str not found in {req.path}"}
|
| 189 |
+
if not req.replace_all and content.count(req.old_str) > 1:
|
| 190 |
+
return {"success": False, "output": "", "error": f"old_str appears {content.count(req.old_str)} times. Use replace_all=true or provide more context."}
|
| 191 |
+
if req.replace_all:
|
| 192 |
+
new_content = content.replace(req.old_str, req.new_str)
|
| 193 |
+
else:
|
| 194 |
+
new_content = content.replace(req.old_str, req.new_str, 1)
|
| 195 |
+
p.write_text(new_content)
|
| 196 |
+
return {"success": True, "output": f"Edited {req.path}", "error": ""}
|
| 197 |
+
except Exception as e:
|
| 198 |
+
return {"success": False, "output": "", "error": str(e)}
|
| 199 |
+
|
| 200 |
+
@app.post("/api/exists")
|
| 201 |
+
def exists(req: ExistsReq):
|
| 202 |
+
return {"success": True, "output": str(pathlib.Path(req.path).exists()).lower(), "error": ""}
|
| 203 |
+
|
| 204 |
+
if __name__ == "__main__":
|
| 205 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 206 |
+
'''
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@dataclass
class ToolResult:
    """Outcome of a single sandbox tool invocation."""

    success: bool
    output: str = ""
    error: str = ""

    def __str__(self):
        """Human-readable form: the output on success, the error otherwise."""
        if not self.success:
            return f"ERROR: {self.error}"
        return self.output if self.output else "(no output)"

    def to_dict(self) -> dict:
        """Serialize to a plain dict (e.g. for JSON payloads)."""
        return {
            "success": self.success,
            "output": self.output,
            "error": self.error,
        }
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
@dataclass
|
| 225 |
+
class Sandbox:
|
| 226 |
+
"""
|
| 227 |
+
A handle to an HF Space sandbox.
|
| 228 |
+
|
| 229 |
+
Use Sandbox.create() to spin up a new one, or Sandbox.connect() to
|
| 230 |
+
attach to an existing running Space.
|
| 231 |
+
"""
|
| 232 |
+
|
| 233 |
+
space_id: str
|
| 234 |
+
token: str | None = None
|
| 235 |
+
work_dir: str = "/app"
|
| 236 |
+
timeout: int = DEFAULT_TIMEOUT
|
| 237 |
+
_owns_space: bool = field(default=False, repr=False)
|
| 238 |
+
_base_url: str = field(init=False, repr=False)
|
| 239 |
+
_client: httpx.Client = field(init=False, repr=False)
|
| 240 |
+
_hf_api: HfApi = field(init=False, repr=False)
|
| 241 |
+
_files_read: set = field(init=False, repr=False, default_factory=set)
|
| 242 |
+
|
| 243 |
+
def __post_init__(self):
|
| 244 |
+
self.token = self.token or os.environ.get("HF_TOKEN")
|
| 245 |
+
slug = self.space_id.replace("/", "-")
|
| 246 |
+
# Trailing slash is critical: httpx resolves relative paths against base_url.
|
| 247 |
+
# Without it, client.get("health") resolves to /health instead of /api/health.
|
| 248 |
+
self._base_url = f"https://{slug}.hf.space/api/"
|
| 249 |
+
self._client = httpx.Client(
|
| 250 |
+
base_url=self._base_url,
|
| 251 |
+
headers={"Authorization": f"Bearer {self.token}"} if self.token else {},
|
| 252 |
+
timeout=httpx.Timeout(MAX_TIMEOUT, connect=30),
|
| 253 |
+
follow_redirects=True,
|
| 254 |
+
)
|
| 255 |
+
self._hf_api = HfApi(token=self.token)
|
| 256 |
+
|
| 257 |
+
# ββ Lifecycle βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 258 |
+
|
| 259 |
+
@classmethod
|
| 260 |
+
def create(
|
| 261 |
+
cls,
|
| 262 |
+
owner: str,
|
| 263 |
+
*,
|
| 264 |
+
name: str | None = None,
|
| 265 |
+
template: str = TEMPLATE_SPACE,
|
| 266 |
+
hardware: str = "cpu-basic",
|
| 267 |
+
private: bool = False,
|
| 268 |
+
sleep_time: int | None = None,
|
| 269 |
+
token: str | None = None,
|
| 270 |
+
wait_timeout: int = WAIT_TIMEOUT,
|
| 271 |
+
) -> Sandbox:
|
| 272 |
+
"""
|
| 273 |
+
Create a new sandbox by duplicating the template Space.
|
| 274 |
+
|
| 275 |
+
Generates a unique space name, duplicates the template, waits for it
|
| 276 |
+
to come online, then returns a connected Sandbox.
|
| 277 |
+
|
| 278 |
+
Args:
|
| 279 |
+
owner: HF username or org (e.g. "burtenshaw").
|
| 280 |
+
name: Base name for the space. Defaults to "sandbox".
|
| 281 |
+
A unique suffix is always appended.
|
| 282 |
+
template: Source Space to duplicate (default: burtenshaw/sandbox).
|
| 283 |
+
hardware: Hardware tier (cpu-basic, t4-small, etc.).
|
| 284 |
+
private: Whether the Space should be private.
|
| 285 |
+
sleep_time: Auto-sleep after N seconds of inactivity.
|
| 286 |
+
token: HF API token. Falls back to HF_TOKEN env var.
|
| 287 |
+
wait_timeout: Max seconds to wait for Space to start (default: 300).
|
| 288 |
+
|
| 289 |
+
Returns:
|
| 290 |
+
A Sandbox instance connected to the running Space.
|
| 291 |
+
"""
|
| 292 |
+
token = token or os.environ.get("HF_TOKEN")
|
| 293 |
+
api = HfApi(token=token)
|
| 294 |
+
|
| 295 |
+
base = name or "sandbox"
|
| 296 |
+
suffix = uuid.uuid4().hex[:8]
|
| 297 |
+
space_id = f"{owner}/{base}-{suffix}"
|
| 298 |
+
|
| 299 |
+
print(f"Creating sandbox: {space_id} (from {template})...")
|
| 300 |
+
|
| 301 |
+
kwargs = {
|
| 302 |
+
"from_id": template,
|
| 303 |
+
"to_id": space_id,
|
| 304 |
+
"private": private,
|
| 305 |
+
"hardware": hardware,
|
| 306 |
+
}
|
| 307 |
+
if sleep_time is not None:
|
| 308 |
+
kwargs["sleep_time"] = sleep_time
|
| 309 |
+
|
| 310 |
+
api.duplicate_space(**kwargs)
|
| 311 |
+
print(f"Space created: https://huggingface.co/spaces/{space_id}")
|
| 312 |
+
|
| 313 |
+
# Upload sandbox server and Dockerfile (triggers rebuild)
|
| 314 |
+
cls._setup_server(space_id, api)
|
| 315 |
+
|
| 316 |
+
# Wait for it to come online (rebuild + start)
|
| 317 |
+
print(f"Waiting for Space to start (timeout: {wait_timeout}s)...")
|
| 318 |
+
deadline = time.time() + wait_timeout
|
| 319 |
+
while time.time() < deadline:
|
| 320 |
+
runtime = api.get_space_runtime(space_id)
|
| 321 |
+
if runtime.stage == "RUNNING":
|
| 322 |
+
print(f"Space is running (hardware: {runtime.hardware})")
|
| 323 |
+
break
|
| 324 |
+
if runtime.stage in ("RUNTIME_ERROR", "BUILD_ERROR"):
|
| 325 |
+
raise RuntimeError(
|
| 326 |
+
f"Space failed to start: {runtime.stage}. "
|
| 327 |
+
f"Check https://huggingface.co/spaces/{space_id}"
|
| 328 |
+
)
|
| 329 |
+
print(f" {runtime.stage}...")
|
| 330 |
+
time.sleep(WAIT_INTERVAL)
|
| 331 |
+
else:
|
| 332 |
+
raise TimeoutError(
|
| 333 |
+
f"Space did not start within {wait_timeout}s. "
|
| 334 |
+
f"Check https://huggingface.co/spaces/{space_id}"
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
# Wait for the API server to be responsive (non-fatal)
|
| 338 |
+
sb = cls(space_id=space_id, token=token, _owns_space=True)
|
| 339 |
+
try:
|
| 340 |
+
sb._wait_for_api(timeout=API_WAIT_TIMEOUT)
|
| 341 |
+
except TimeoutError as e:
|
| 342 |
+
print(
|
| 343 |
+
f"Warning: API health check timed out ({e}), but Space is RUNNING. Continuing."
|
| 344 |
+
)
|
| 345 |
+
return sb
|
| 346 |
+
|
| 347 |
+
@staticmethod
|
| 348 |
+
def _setup_server(space_id: str, api: HfApi) -> None:
|
| 349 |
+
"""Upload embedded sandbox server + Dockerfile to the Space (single commit)."""
|
| 350 |
+
print(f"Uploading sandbox server to {space_id}...")
|
| 351 |
+
api.create_commit(
|
| 352 |
+
repo_id=space_id,
|
| 353 |
+
repo_type="space",
|
| 354 |
+
operations=[
|
| 355 |
+
CommitOperationAdd(
|
| 356 |
+
path_in_repo="sandbox_server.py",
|
| 357 |
+
path_or_fileobj=io.BytesIO(_SANDBOX_SERVER.encode()),
|
| 358 |
+
),
|
| 359 |
+
CommitOperationAdd(
|
| 360 |
+
path_in_repo="Dockerfile",
|
| 361 |
+
path_or_fileobj=io.BytesIO(_DOCKERFILE.encode()),
|
| 362 |
+
),
|
| 363 |
+
],
|
| 364 |
+
commit_message="Setup sandbox server",
|
| 365 |
+
)
|
| 366 |
+
print("Server files uploaded, rebuild triggered.")
|
| 367 |
+
|
| 368 |
+
@classmethod
|
| 369 |
+
def connect(cls, space_id: str, *, token: str | None = None) -> Sandbox:
|
| 370 |
+
"""
|
| 371 |
+
Connect to an existing running Space.
|
| 372 |
+
|
| 373 |
+
Does a health check to verify the Space is reachable.
|
| 374 |
+
"""
|
| 375 |
+
sb = cls(space_id=space_id, token=token, _owns_space=False)
|
| 376 |
+
sb._wait_for_api(timeout=60)
|
| 377 |
+
return sb
|
| 378 |
+
|
| 379 |
+
def _wait_for_api(self, timeout: int = API_WAIT_TIMEOUT):
|
| 380 |
+
"""Poll the health endpoint until the server responds."""
|
| 381 |
+
deadline = time.time() + timeout
|
| 382 |
+
last_err = None
|
| 383 |
+
last_status = None
|
| 384 |
+
while time.time() < deadline:
|
| 385 |
+
try:
|
| 386 |
+
resp = self._client.get("health", timeout=10)
|
| 387 |
+
last_status = resp.status_code
|
| 388 |
+
if resp.status_code == 200:
|
| 389 |
+
print(f"API is responsive at {self._base_url}")
|
| 390 |
+
return
|
| 391 |
+
except Exception as e:
|
| 392 |
+
last_err = e
|
| 393 |
+
time.sleep(3)
|
| 394 |
+
raise TimeoutError(
|
| 395 |
+
f"Sandbox API at {self._base_url} not responding after {timeout}s. "
|
| 396 |
+
f"Last status: {last_status}, last error: {last_err}"
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
def delete(self):
|
| 400 |
+
"""Delete the Space. Only works if this Sandbox created it."""
|
| 401 |
+
if not self._owns_space:
|
| 402 |
+
raise RuntimeError(
|
| 403 |
+
f"This Sandbox did not create {self.space_id}. "
|
| 404 |
+
f"Use self._hf_api.delete_repo() directly if you're sure."
|
| 405 |
+
)
|
| 406 |
+
print(f"Deleting sandbox: {self.space_id}...")
|
| 407 |
+
self._hf_api.delete_repo(self.space_id, repo_type="space")
|
| 408 |
+
self._client.close()
|
| 409 |
+
print("Deleted.")
|
| 410 |
+
|
| 411 |
+
def pause(self):
|
| 412 |
+
"""Pause the Space (stops billing, preserves state)."""
|
| 413 |
+
self._hf_api.pause_space(self.space_id)
|
| 414 |
+
|
| 415 |
+
def restart(self):
|
| 416 |
+
"""Restart the Space."""
|
| 417 |
+
self._hf_api.restart_space(self.space_id)
|
| 418 |
+
self._wait_for_api()
|
| 419 |
+
|
| 420 |
+
@property
|
| 421 |
+
def url(self) -> str:
|
| 422 |
+
"""Public URL of the Space."""
|
| 423 |
+
return f"https://huggingface.co/spaces/{self.space_id}"
|
| 424 |
+
|
| 425 |
+
@property
|
| 426 |
+
def status(self) -> str:
|
| 427 |
+
"""Current Space stage (RUNNING, BUILDING, PAUSED, etc.)."""
|
| 428 |
+
return self._hf_api.get_space_runtime(self.space_id).stage
|
| 429 |
+
|
| 430 |
+
def __enter__(self) -> Sandbox:
|
| 431 |
+
return self
|
| 432 |
+
|
| 433 |
+
def __exit__(self, *exc):
|
| 434 |
+
if self._owns_space:
|
| 435 |
+
try:
|
| 436 |
+
self.delete()
|
| 437 |
+
except Exception as e:
|
| 438 |
+
print(f"Warning: failed to delete sandbox: {e}", file=sys.stderr)
|
| 439 |
+
self._client.close()
|
| 440 |
+
|
| 441 |
+
# ββ HTTP plumbing βββββββββββββββββββββββββββββββββββββββββββββ
|
| 442 |
+
|
| 443 |
+
def _call(
|
| 444 |
+
self, endpoint: str, payload: dict, timeout: float | None = None
|
| 445 |
+
) -> ToolResult:
|
| 446 |
+
# Strip leading slash for correct httpx base_url resolution
|
| 447 |
+
endpoint = endpoint.lstrip("/")
|
| 448 |
+
try:
|
| 449 |
+
resp = self._client.post(
|
| 450 |
+
endpoint,
|
| 451 |
+
json=payload,
|
| 452 |
+
timeout=timeout or self.timeout,
|
| 453 |
+
)
|
| 454 |
+
data = resp.json()
|
| 455 |
+
if resp.status_code == 200:
|
| 456 |
+
return ToolResult(
|
| 457 |
+
success=data.get("success", True),
|
| 458 |
+
output=data.get("output", ""),
|
| 459 |
+
error=data.get("error", ""),
|
| 460 |
+
)
|
| 461 |
+
return ToolResult(
|
| 462 |
+
success=False,
|
| 463 |
+
error=data.get("error", f"HTTP {resp.status_code}"),
|
| 464 |
+
)
|
| 465 |
+
except httpx.TimeoutException:
|
| 466 |
+
return ToolResult(
|
| 467 |
+
success=False, error=f"Timeout after {timeout or self.timeout}s"
|
| 468 |
+
)
|
| 469 |
+
except httpx.ConnectError:
|
| 470 |
+
return ToolResult(
|
| 471 |
+
success=False,
|
| 472 |
+
error=f"Cannot connect to sandbox. Is {self.space_id} running? Status: {self.status}",
|
| 473 |
+
)
|
| 474 |
+
except Exception as e:
|
| 475 |
+
return ToolResult(success=False, error=str(e))
|
| 476 |
+
|
| 477 |
+
# ββ Tools βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 478 |
+
|
| 479 |
+
def bash(
|
| 480 |
+
self,
|
| 481 |
+
command: str,
|
| 482 |
+
*,
|
| 483 |
+
work_dir: str | None = None,
|
| 484 |
+
timeout: int | None = None,
|
| 485 |
+
description: str | None = None,
|
| 486 |
+
) -> ToolResult:
|
| 487 |
+
return self._call(
|
| 488 |
+
"bash",
|
| 489 |
+
{
|
| 490 |
+
"command": command,
|
| 491 |
+
"work_dir": work_dir or self.work_dir,
|
| 492 |
+
"timeout": min(timeout or self.timeout, MAX_TIMEOUT),
|
| 493 |
+
},
|
| 494 |
+
timeout=timeout,
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
def read(
|
| 498 |
+
self, path: str, *, offset: int | None = None, limit: int | None = None
|
| 499 |
+
) -> ToolResult:
|
| 500 |
+
self._files_read.add(path)
|
| 501 |
+
return self._call(
|
| 502 |
+
"read",
|
| 503 |
+
{
|
| 504 |
+
"path": path,
|
| 505 |
+
"offset": offset,
|
| 506 |
+
"limit": limit or (DEFAULT_READ_LIMIT if offset is None else None),
|
| 507 |
+
},
|
| 508 |
+
)
|
| 509 |
+
|
| 510 |
+
def write(self, path: str, content: str) -> ToolResult:
|
| 511 |
+
if path not in self._files_read:
|
| 512 |
+
check = self._call("exists", {"path": path})
|
| 513 |
+
if check.success and check.output == "true":
|
| 514 |
+
return ToolResult(
|
| 515 |
+
success=False,
|
| 516 |
+
error=(
|
| 517 |
+
f"File {path} exists but has not been read this session. "
|
| 518 |
+
f"Read it first, or use sandbox_edit for targeted changes."
|
| 519 |
+
),
|
| 520 |
+
)
|
| 521 |
+
result = self._call("write", {"path": path, "content": content})
|
| 522 |
+
if result.success:
|
| 523 |
+
self._files_read.add(path)
|
| 524 |
+
return result
|
| 525 |
+
|
| 526 |
+
def edit(
|
| 527 |
+
self, path: str, old_str: str, new_str: str, *, replace_all: bool = False
|
| 528 |
+
) -> ToolResult:
|
| 529 |
+
if old_str == new_str:
|
| 530 |
+
return ToolResult(success=False, error="old_str and new_str are identical.")
|
| 531 |
+
if path not in self._files_read:
|
| 532 |
+
return ToolResult(
|
| 533 |
+
success=False,
|
| 534 |
+
error=f"File {path} has not been read this session. Read it first.",
|
| 535 |
+
)
|
| 536 |
+
return self._call(
|
| 537 |
+
"edit",
|
| 538 |
+
{
|
| 539 |
+
"path": path,
|
| 540 |
+
"old_str": old_str,
|
| 541 |
+
"new_str": new_str,
|
| 542 |
+
"replace_all": replace_all,
|
| 543 |
+
},
|
| 544 |
+
)
|
| 545 |
+
|
| 546 |
+
# ββ Tool schemas & dispatch βββββββββββββββββββββββββββββββββββ
|
| 547 |
+
|
| 548 |
+
TOOLS = {
|
| 549 |
+
"bash": {
|
| 550 |
+
"description": (
|
| 551 |
+
"Run a shell command in the remote sandbox and return stdout/stderr.\n"
|
| 552 |
+
"\n"
|
| 553 |
+
"Commands run in a shell at the working directory (default /app). "
|
| 554 |
+
"Each invocation is independent β use files in /app to persist state.\n"
|
| 555 |
+
"\n"
|
| 556 |
+
"AVOID using bash for operations covered by specialized tools:\n"
|
| 557 |
+
"- File reading: use read (not cat/head/tail)\n"
|
| 558 |
+
"- File editing: use edit (not sed/awk)\n"
|
| 559 |
+
"- File writing: use write (not echo/cat <<EOF)\n"
|
| 560 |
+
"\n"
|
| 561 |
+
"For long-running tasks, background them:\n"
|
| 562 |
+
" nohup uv run train.py > /app/train.log 2>&1 &\n"
|
| 563 |
+
"Then check with read on the log file.\n"
|
| 564 |
+
"\n"
|
| 565 |
+
"Chain dependent commands with &&. Independent commands should be "
|
| 566 |
+
"separate bash calls (they can run in parallel).\n"
|
| 567 |
+
"\n"
|
| 568 |
+
"Timeout default 120s, max 600s."
|
| 569 |
+
),
|
| 570 |
+
"parameters": {
|
| 571 |
+
"type": "object",
|
| 572 |
+
"required": ["command"],
|
| 573 |
+
"additionalProperties": False,
|
| 574 |
+
"properties": {
|
| 575 |
+
"command": {
|
| 576 |
+
"type": "string",
|
| 577 |
+
"description": "The shell command to execute.",
|
| 578 |
+
},
|
| 579 |
+
"description": {
|
| 580 |
+
"type": "string",
|
| 581 |
+
"description": "Short description (5-10 words, active voice). E.g. 'Install dependencies', 'Run training script'.",
|
| 582 |
+
},
|
| 583 |
+
"work_dir": {
|
| 584 |
+
"type": "string",
|
| 585 |
+
"description": "Working directory (default: /app).",
|
| 586 |
+
},
|
| 587 |
+
"timeout": {
|
| 588 |
+
"type": "integer",
|
| 589 |
+
"description": "Timeout in seconds (default: 120, max: 600).",
|
| 590 |
+
},
|
| 591 |
+
},
|
| 592 |
+
},
|
| 593 |
+
},
|
| 594 |
+
"read": {
|
| 595 |
+
"description": (
|
| 596 |
+
"Read file contents with line numbers (cat -n format).\n"
|
| 597 |
+
"\n"
|
| 598 |
+
"Returns the first 2000 lines by default. For large files, use offset/limit "
|
| 599 |
+
"to read a specific range. Line numbers always match the original file.\n"
|
| 600 |
+
"\n"
|
| 601 |
+
"Lines longer than 2000 chars are truncated.\n"
|
| 602 |
+
"Cannot read directories β use bash with 'ls' instead."
|
| 603 |
+
),
|
| 604 |
+
"parameters": {
|
| 605 |
+
"type": "object",
|
| 606 |
+
"required": ["path"],
|
| 607 |
+
"additionalProperties": False,
|
| 608 |
+
"properties": {
|
| 609 |
+
"path": {
|
| 610 |
+
"type": "string",
|
| 611 |
+
"description": "Absolute path to the file to read.",
|
| 612 |
+
},
|
| 613 |
+
"offset": {
|
| 614 |
+
"type": "integer",
|
| 615 |
+
"description": "Start from this line (1-based). Only if file is too large.",
|
| 616 |
+
},
|
| 617 |
+
"limit": {
|
| 618 |
+
"type": "integer",
|
| 619 |
+
"description": "Number of lines to read. Only if file is too large.",
|
| 620 |
+
},
|
| 621 |
+
},
|
| 622 |
+
},
|
| 623 |
+
},
|
| 624 |
+
"write": {
|
| 625 |
+
"description": (
|
| 626 |
+
"Create or overwrite a file. Creates parent directories as needed.\n"
|
| 627 |
+
"\n"
|
| 628 |
+
"For existing files, you MUST read the file first (system enforced). "
|
| 629 |
+
"Prefer edit for modifications."
|
| 630 |
+
),
|
| 631 |
+
"parameters": {
|
| 632 |
+
"type": "object",
|
| 633 |
+
"required": ["path", "content"],
|
| 634 |
+
"additionalProperties": False,
|
| 635 |
+
"properties": {
|
| 636 |
+
"path": {
|
| 637 |
+
"type": "string",
|
| 638 |
+
"description": "Absolute path to the file to write.",
|
| 639 |
+
},
|
| 640 |
+
"content": {
|
| 641 |
+
"type": "string",
|
| 642 |
+
"description": "Complete file content.",
|
| 643 |
+
},
|
| 644 |
+
},
|
| 645 |
+
},
|
| 646 |
+
},
|
| 647 |
+
"edit": {
|
| 648 |
+
"description": (
|
| 649 |
+
"Targeted edit via exact string replacement.\n"
|
| 650 |
+
"\n"
|
| 651 |
+
"Rules:\n"
|
| 652 |
+
"- old_str must appear EXACTLY once (unless replace_all is true).\n"
|
| 653 |
+
"- Include enough context in old_str for uniqueness.\n"
|
| 654 |
+
"- old_str and new_str must differ.\n"
|
| 655 |
+
"- Preserve indentation exactly.\n"
|
| 656 |
+
"- To delete code, set new_str to empty string.\n"
|
| 657 |
+
"- File MUST have been read this session (system enforced).\n"
|
| 658 |
+
"- Do NOT include line number prefixes in old_str/new_str.\n"
|
| 659 |
+
"\n"
|
| 660 |
+
"Use replace_all=true for batch operations like variable renaming."
|
| 661 |
+
),
|
| 662 |
+
"parameters": {
|
| 663 |
+
"type": "object",
|
| 664 |
+
"required": ["path", "old_str", "new_str"],
|
| 665 |
+
"additionalProperties": False,
|
| 666 |
+
"properties": {
|
| 667 |
+
"path": {
|
| 668 |
+
"type": "string",
|
| 669 |
+
"description": "Absolute path to the file.",
|
| 670 |
+
},
|
| 671 |
+
"old_str": {
|
| 672 |
+
"type": "string",
|
| 673 |
+
"description": "Exact text to find (must differ from new_str).",
|
| 674 |
+
},
|
| 675 |
+
"new_str": {"type": "string", "description": "Replacement text."},
|
| 676 |
+
"replace_all": {
|
| 677 |
+
"type": "boolean",
|
| 678 |
+
"description": "Replace all occurrences (default: false).",
|
| 679 |
+
"default": False,
|
| 680 |
+
},
|
| 681 |
+
},
|
| 682 |
+
},
|
| 683 |
+
},
|
| 684 |
+
}
|
| 685 |
+
|
| 686 |
+
@classmethod
def tool_definitions(cls) -> list[dict]:
    """Return every tool spec as a flat dict, tagged with its tool name.

    Each entry merges ``{"name": <tool name>}`` with the spec stored in
    ``cls.TOOLS``, which is the shape the agent tool registry expects.
    """
    definitions: list[dict] = []
    for tool_name, spec in cls.TOOLS.items():
        entry = {"name": tool_name}
        entry.update(spec)
        definitions.append(entry)
    return definitions
| 689 |
+
|
| 690 |
+
def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
    """Dispatch a named tool invocation to the matching sandbox operation.

    Args:
        name: One of ``bash``, ``read``, ``write``, ``edit``.
        arguments: Tool arguments as parsed from the model's tool call.

    Returns:
        The ToolResult from the underlying operation, or a failure
        ToolResult when *name* is not a known tool.
    """
    if name == "bash":
        return self.bash(
            arguments["command"],
            work_dir=arguments.get("work_dir"),
            timeout=arguments.get("timeout"),
            description=arguments.get("description"),
        )
    if name == "read":
        return self.read(
            arguments["path"],
            offset=arguments.get("offset"),
            limit=arguments.get("limit"),
        )
    if name == "write":
        return self.write(arguments["path"], arguments["content"])
    if name == "edit":
        return self.edit(
            arguments["path"],
            arguments["old_str"],
            arguments["new_str"],
            replace_all=arguments.get("replace_all", False),
        )
    return ToolResult(success=False, error=f"Unknown tool: {name}")
agent/tools/sandbox_tool.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sandbox tools β expose the Sandbox client as agent tools.
|
| 3 |
+
|
| 4 |
+
5 tools total:
|
| 5 |
+
sandbox_create β explicit sandbox creation (requires approval)
|
| 6 |
+
bash, read, write, edit β operations on the sandbox
|
| 7 |
+
|
| 8 |
+
If any operation tool is called without an active sandbox,
|
| 9 |
+
a cpu-basic sandbox is auto-created (no approval needed).
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import asyncio
|
| 15 |
+
import os
|
| 16 |
+
from typing import Any
|
| 17 |
+
|
| 18 |
+
from huggingface_hub import HfApi, SpaceHardware
|
| 19 |
+
|
| 20 |
+
from agent.core.session import Event
|
| 21 |
+
from agent.tools.sandbox_client import Sandbox
|
| 22 |
+
|
| 23 |
+
# ── Sandbox lifecycle helpers ──────────────────────────────────────────
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
async def _ensure_sandbox(
|
| 27 |
+
session: Any, hardware: str = "cpu-basic", **create_kwargs
|
| 28 |
+
) -> tuple[Sandbox | None, str | None]:
|
| 29 |
+
"""
|
| 30 |
+
Ensure a sandbox exists on the session. Auto-creates with given hardware if needed.
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
(sandbox, error_message) β one will be None.
|
| 34 |
+
"""
|
| 35 |
+
if session and getattr(session, "sandbox", None):
|
| 36 |
+
return session.sandbox, None
|
| 37 |
+
|
| 38 |
+
if not session:
|
| 39 |
+
return None, "No session available."
|
| 40 |
+
|
| 41 |
+
token = os.environ.get("HF_TOKEN")
|
| 42 |
+
if not token:
|
| 43 |
+
return None, "HF_TOKEN environment variable not set. Cannot create sandbox."
|
| 44 |
+
|
| 45 |
+
api = HfApi(token=token)
|
| 46 |
+
user_info = api.whoami()
|
| 47 |
+
owner = user_info.get("name", user_info.get("user", ""))
|
| 48 |
+
if not owner:
|
| 49 |
+
return None, "Could not determine HF username from token."
|
| 50 |
+
|
| 51 |
+
await session.send_event(
|
| 52 |
+
Event(
|
| 53 |
+
event_type="tool_log",
|
| 54 |
+
data={
|
| 55 |
+
"tool": "sandbox",
|
| 56 |
+
"log": f"Auto-creating sandbox for {owner} ({hardware})...",
|
| 57 |
+
},
|
| 58 |
+
)
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
kwargs = {"owner": owner, "hardware": hardware, "token": token, **create_kwargs}
|
| 62 |
+
sb = await asyncio.to_thread(Sandbox.create, **kwargs)
|
| 63 |
+
session.sandbox = sb
|
| 64 |
+
|
| 65 |
+
await session.send_event(
|
| 66 |
+
Event(
|
| 67 |
+
event_type="tool_log",
|
| 68 |
+
data={"tool": "sandbox", "log": f"Sandbox ready: {sb.space_id} ({sb.url})"},
|
| 69 |
+
)
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
return sb, None
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── sandbox_create tool ────────────────────────────────────────────────

# Tool spec for explicit sandbox creation. Consumed by get_sandbox_tools()
# below, which wraps it (together with sandbox_create_handler) in a ToolSpec.
# The ``parameters`` section is a JSON-Schema object describing the tool's
# accepted arguments.
SANDBOX_CREATE_TOOL_SPEC: dict[str, Any] = {
    "name": "sandbox_create",
    "description": (
        "Create a persistent remote Linux environment for developing and testing scripts.\n\n"
        "Workflow: sandbox_create → write script → pip install → test with small run → fix errors → hf_jobs at scale.\n"
        "The sandbox persists across tool calls within the session. pip install works out of the box.\n\n"
        "Use this when: you need to develop, test, and iterate on scripts before launching via hf_jobs. "
        "Especially for training scripts where you need to verify imports, test on a small subset, and fix errors interactively.\n\n"
        "Skip this when: the task is a simple one-shot operation (status check, resource search, quick data query), "
        "or the script is copied from a verified working example with minimal changes.\n\n"
        "For ML code that uses CUDA, bf16, or model loading: use GPU hardware (t4-small minimum). "
        "CPU sandboxes cannot run GPU code paths — your test will not catch GPU-related errors.\n\n"
        # The hardware list is rendered from the SpaceHardware enum at import
        # time so the prompt always matches what the Hub actually offers.
        "Hardware: " + ", ".join([e.value for e in SpaceHardware]) + ".\n"
    ),
    "parameters": {
        "type": "object",
        # Both parameters are optional: the handler defaults to cpu-basic.
        "required": [],
        "additionalProperties": False,
        "properties": {
            "hardware": {
                "type": "string",
                "enum": [e.value for e in SpaceHardware],
                "description": "Hardware tier for the sandbox (default: cpu-basic)",
            },
            "private": {
                "type": "boolean",
                "description": "If true, create a private Space",
            },
        },
    },
}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
async def sandbox_create_handler(
    args: dict[str, Any], session: Any = None
) -> tuple[str, bool]:
    """Handle sandbox_create tool calls."""
    # Creating a second sandbox is a no-op: report the active one instead.
    existing = getattr(session, "sandbox", None) if session else None
    if existing:
        message = "\n".join(
            [
                f"Sandbox already active: {existing.space_id}",
                f"URL: {existing.url}",
                "Use bash/read/write/edit to interact with it.",
            ]
        )
        return message, True

    hardware = args.get("hardware", "cpu-basic")
    # Only forward "private" when the caller supplied it, so Sandbox.create
    # keeps its own default otherwise.
    create_kwargs = {"private": args["private"]} if "private" in args else {}

    try:
        sb, error = await _ensure_sandbox(session, hardware=hardware, **create_kwargs)
    except Exception as e:
        return f"Failed to create sandbox: {e}", False

    if error:
        return error, False

    message = "\n".join(
        [
            f"Sandbox created: {sb.space_id}",
            f"URL: {sb.url}",
            f"Hardware: {hardware}",
            "Use bash/read/write/edit to interact with it.",
        ]
    )
    return message, True
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _make_tool_handler(sandbox_tool_name: str):
|
| 145 |
+
"""Factory: create a handler for a sandbox operation tool."""
|
| 146 |
+
|
| 147 |
+
async def handler(args: dict[str, Any], session: Any = None) -> tuple[str, bool]:
|
| 148 |
+
# Auto-create sandbox if not present
|
| 149 |
+
try:
|
| 150 |
+
sb, error = await _ensure_sandbox(session)
|
| 151 |
+
except Exception as e:
|
| 152 |
+
return f"Failed to auto-create sandbox: {e}", False
|
| 153 |
+
|
| 154 |
+
if error:
|
| 155 |
+
return error, False
|
| 156 |
+
|
| 157 |
+
try:
|
| 158 |
+
result = await asyncio.to_thread(sb.call_tool, sandbox_tool_name, args)
|
| 159 |
+
if result.success:
|
| 160 |
+
return result.output or "(no output)", True
|
| 161 |
+
else:
|
| 162 |
+
error_msg = result.error or "Unknown error"
|
| 163 |
+
output = result.output
|
| 164 |
+
if output:
|
| 165 |
+
return f"{output}\n\nERROR: {error_msg}", False
|
| 166 |
+
return f"ERROR: {error_msg}", False
|
| 167 |
+
except Exception as e:
|
| 168 |
+
return f"Sandbox operation failed: {e}", False
|
| 169 |
+
|
| 170 |
+
return handler
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def get_sandbox_tools():
    """Return all 5 sandbox ToolSpecs (sandbox_create + 4 operation tools).

    Returns:
        list of ToolSpec: the explicit ``sandbox_create`` tool followed by
        one operation tool per entry in ``Sandbox.TOOLS``.
    """
    # Local import avoids a module-level cycle with agent.core.tools.
    from agent.core.tools import ToolSpec

    tools = []

    # sandbox_create (explicit creation, requires approval)
    tools.append(
        ToolSpec(
            name=SANDBOX_CREATE_TOOL_SPEC["name"],
            description=SANDBOX_CREATE_TOOL_SPEC["description"],
            parameters=SANDBOX_CREATE_TOOL_SPEC["parameters"],
            handler=sandbox_create_handler,
        )
    )

    # Operation tools (auto-execute, no approval needed).
    # Iterate items() directly instead of .keys() plus a second lookup.
    for name, spec in Sandbox.TOOLS.items():
        tools.append(
            ToolSpec(
                name=name,
                description=spec["description"],
                parameters=spec["parameters"],
                handler=_make_tool_handler(name),
            )
        )

    return tools
|