akseljoonas HF Staff commited on
Commit
b63e2df
·
1 Parent(s): b9c2325

reworked the hf-jobs tool

Browse files
Files changed (1) hide show
  1. agent/tools/jobs_tool.py +212 -189
agent/tools/jobs_tool.py CHANGED
@@ -46,13 +46,11 @@ ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
46
  # Operation names
47
  OperationType = Literal[
48
  "run",
49
- "uv",
50
  "ps",
51
  "logs",
52
  "inspect",
53
  "cancel",
54
  "scheduled run",
55
- "scheduled uv",
56
  "scheduled ps",
57
  "scheduled inspect",
58
  "scheduled delete",
@@ -104,6 +102,8 @@ def _build_uv_command(
104
  if script_args:
105
  parts.extend(script_args)
106
 
 
 
107
  return parts
108
 
109
 
@@ -124,8 +124,6 @@ def _wrap_inline_script(
124
 
125
  def _ensure_hf_transfer_dependency(deps: list[str] | None) -> list[str]:
126
  """Ensure hf-transfer is included in the dependencies list"""
127
- if deps is None:
128
- return ["hf-transfer"]
129
 
130
  if isinstance(deps, list):
131
  deps_copy = deps.copy() # Don't modify the original
@@ -170,7 +168,7 @@ def _job_info_to_dict(job_info) -> Dict[str, Any]:
170
  "createdAt": job_info.created_at.isoformat(),
171
  "dockerImage": job_info.docker_image,
172
  "spaceId": job_info.space_id,
173
- "flavor": job_info.flavor,
174
  "owner": {"name": job_info.owner.name},
175
  }
176
 
@@ -209,7 +207,7 @@ def _scheduled_job_info_to_dict(scheduled_job_info) -> Dict[str, Any]:
209
  "dockerImage": job_spec.docker_image,
210
  "spaceId": job_spec.space_id,
211
  "command": job_spec.command or [],
212
- "flavor": job_spec.flavor or "cpu-basic",
213
  },
214
  }
215
 
@@ -224,7 +222,8 @@ class HfJobsTool:
224
  async def execute(self, params: Dict[str, Any]) -> ToolResult:
225
  """Execute the specified operation"""
226
  operation = params.get("operation")
227
- args = params.get("args", {})
 
228
 
229
  # If no operation provided, return error
230
  if not operation:
@@ -242,8 +241,6 @@ class HfJobsTool:
242
  # Route to appropriate handler
243
  if operation == "run":
244
  return await self._run_job(args)
245
- elif operation == "uv":
246
- return await self._run_uv_job(args)
247
  elif operation == "ps":
248
  return await self._list_jobs(args)
249
  elif operation == "logs":
@@ -254,8 +251,6 @@ class HfJobsTool:
254
  return await self._cancel_job(args)
255
  elif operation == "scheduled run":
256
  return await self._scheduled_run(args)
257
- elif operation == "scheduled uv":
258
- return await self._scheduled_uv(args)
259
  elif operation == "scheduled ps":
260
  return await self._list_scheduled_jobs(args)
261
  elif operation == "scheduled inspect":
@@ -270,8 +265,8 @@ class HfJobsTool:
270
  return {
271
  "formatted": f'Unknown operation: "{operation}"\n\n'
272
  "Available operations:\n"
273
- "- run, uv, ps, logs, inspect, cancel\n"
274
- "- scheduled run, scheduled uv, scheduled ps, scheduled inspect, "
275
  "scheduled delete, scheduled suspend, scheduled resume\n\n"
276
  "Call this tool with no operation for full usage instructions.",
277
  "totalResults": 0,
@@ -322,94 +317,69 @@ class HfJobsTool:
322
  return final_status, all_logs
323
 
324
  async def _run_job(self, args: Dict[str, Any]) -> ToolResult:
325
- """Run a job using HfApi.run_job()"""
326
  try:
327
- job = await _async_call(
328
- self.api.run_job,
329
- image=args.get("image", "python:3.12"),
330
- command=args.get("command"),
331
- env=_add_environment_variables(args.get("env")),
332
- secrets=_add_environment_variables(args.get("secrets")),
333
- flavor=args.get("flavor", "cpu-basic"),
334
- timeout=args.get("timeout", "30m"),
335
- namespace=args.get("namespace") or self.namespace,
336
- )
337
-
338
- # Wait for completion and stream logs
339
- print(f"Job started: {job.url}")
340
- print("Streaming logs...\n---\n")
341
-
342
- final_status, all_logs = await self._wait_for_job_completion(
343
- job_id=job.id,
344
- namespace=args.get("namespace") or self.namespace,
345
- )
346
-
347
- # Format all logs for the agent
348
- log_text = "\n".join(all_logs) if all_logs else "(no logs)"
349
 
350
- response = f"""Job completed!
 
 
 
 
351
 
352
- **Job ID:** {job.id}
353
- **Final Status:** {final_status}
354
- **View at:** {job.url}
 
355
 
356
- **Logs:**
357
- ```
358
- {log_text}
359
- ```"""
360
- return {"formatted": response, "totalResults": 1, "resultsShared": 1}
 
 
 
 
 
 
 
361
 
362
- except Exception as e:
363
- raise Exception(f"Failed to run job: {str(e)}")
 
364
 
365
- async def _run_uv_job(self, args: Dict[str, Any]) -> ToolResult:
366
- """Run UV job with inline script support (no local files needed)"""
367
- try:
368
- script = args.get("script")
369
- if not script:
370
- raise ValueError("script is required")
371
-
372
- # Get dependencies and ensure hf-transfer is included
373
- deps = (
374
- args.get("with_deps")
375
- or args.get("dependencies")
376
- or args.get("packages")
377
- )
378
- deps = _ensure_hf_transfer_dependency(deps)
379
-
380
- # Resolve the command based on script type (URL, inline, or file)
381
- command = _resolve_uv_command(
382
- script=script,
383
- with_deps=deps,
384
- python=args.get("python"),
385
- script_args=args.get("script_args"),
386
- )
387
 
388
- # Use run_job with UV image instead of run_uv_job
389
  job = await _async_call(
390
  self.api.run_job,
391
- image=UV_DEFAULT_IMAGE,
392
  command=command,
393
  env=_add_environment_variables(args.get("env")),
394
  secrets=_add_environment_variables(args.get("secrets")),
395
- flavor=args.get("flavor") or args.get("hardware") or "cpu-basic",
396
  timeout=args.get("timeout", "30m"),
397
- namespace=args.get("namespace") or self.namespace,
398
  )
399
 
400
  # Wait for completion and stream logs
401
- print(f"UV Job started: {job.url}")
402
  print("Streaming logs...\n---\n")
403
 
404
  final_status, all_logs = await self._wait_for_job_completion(
405
  job_id=job.id,
406
- namespace=args.get("namespace") or self.namespace,
407
  )
408
 
409
  # Format all logs for the agent
410
  log_text = "\n".join(all_logs) if all_logs else "(no logs)"
411
 
412
- response = f"""UV Job completed!
413
 
414
  **Job ID:** {job.id}
415
  **Final Status:** {final_status}
@@ -422,13 +392,11 @@ class HfJobsTool:
422
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
423
 
424
  except Exception as e:
425
- raise Exception(f"Failed to run UV job: {str(e)}")
426
 
427
  async def _list_jobs(self, args: Dict[str, Any]) -> ToolResult:
428
  """List jobs using HfApi.list_jobs()"""
429
- jobs_list = await _async_call(
430
- self.api.list_jobs, namespace=args.get("namespace") or self.namespace
431
- )
432
 
433
  # Filter jobs
434
  if not args.get("all", False):
@@ -451,7 +419,7 @@ class HfJobsTool:
451
  "resultsShared": 0,
452
  }
453
  return {
454
- "formatted": 'No running jobs found. Use `{"args": {"all": true}}` to show all jobs.',
455
  "totalResults": 0,
456
  "resultsShared": 0,
457
  }
@@ -476,9 +444,7 @@ class HfJobsTool:
476
 
477
  try:
478
  # Fetch logs (returns generator, convert to list)
479
- logs_gen = self.api.fetch_job_logs(
480
- job_id=job_id, namespace=args.get("namespace") or self.namespace
481
- )
482
  logs = await _async_call(list, logs_gen)
483
 
484
  if not logs:
@@ -522,7 +488,7 @@ class HfJobsTool:
522
  job = await _async_call(
523
  self.api.inspect_job,
524
  job_id=jid,
525
- namespace=args.get("namespace") or self.namespace,
526
  )
527
  jobs.append(_job_info_to_dict(job))
528
  except Exception as e:
@@ -551,108 +517,93 @@ class HfJobsTool:
551
  await _async_call(
552
  self.api.cancel_job,
553
  job_id=job_id,
554
- namespace=args.get("namespace") or self.namespace,
555
  )
556
 
557
  response = f"""✓ Job {job_id} has been cancelled.
558
 
559
- To verify, call this tool with `{{"operation": "inspect", "args": {{"job_id": "{job_id}"}}}}`"""
560
 
561
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
562
 
563
  async def _scheduled_run(self, args: Dict[str, Any]) -> ToolResult:
564
- """Create scheduled job using HfApi.create_scheduled_job()"""
565
  try:
566
- scheduled_job = await _async_call(
567
- self.api.create_scheduled_job,
568
- image=args.get("image", "python:3.12"),
569
- command=args.get("command"),
570
- schedule=args.get("schedule"),
571
- env=_add_environment_variables(args.get("env")),
572
- secrets=_add_environment_variables(args.get("secrets")),
573
- flavor=args.get("flavor", "cpu-basic"),
574
- timeout=args.get("timeout", "30m"),
575
- namespace=args.get("namespace") or self.namespace,
576
- )
577
-
578
- scheduled_dict = _scheduled_job_info_to_dict(scheduled_job)
579
-
580
- response = f"""✓ Scheduled job created successfully!
581
-
582
- **Scheduled Job ID:** {scheduled_dict["id"]}
583
- **Schedule:** {scheduled_dict["schedule"]}
584
- **Suspended:** {"Yes" if scheduled_dict.get("suspend") else "No"}
585
- **Next Run:** {scheduled_dict.get("nextRun", "N/A")}
586
 
587
- To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"scheduled_job_id": "{scheduled_dict["id"]}"}}}}`
588
- To list all, call this tool with `{{"operation": "scheduled ps"}}`"""
589
 
590
- return {"formatted": response, "totalResults": 1, "resultsShared": 1}
 
 
 
 
591
 
592
- except Exception as e:
593
- raise Exception(f"Failed to create scheduled job: {str(e)}")
 
 
594
 
595
- async def _scheduled_uv(self, args: Dict[str, Any]) -> ToolResult:
596
- """Create scheduled UV job with inline script support"""
597
- try:
598
- script = args.get("script")
599
- if not script:
600
- raise ValueError("script is required")
 
 
 
 
 
 
601
 
602
- schedule = args.get("schedule")
603
- if not schedule:
604
- raise ValueError("schedule is required")
605
 
606
- # Get dependencies and ensure hf-transfer is included
607
- deps = (
608
- args.get("with_deps")
609
- or args.get("dependencies")
610
- or args.get("packages")
611
- )
612
- deps = _ensure_hf_transfer_dependency(deps)
613
-
614
- # Resolve the command based on script type
615
- command = _resolve_uv_command(
616
- script=script,
617
- with_deps=deps,
618
- python=args.get("python"),
619
- script_args=args.get("script_args"),
620
- )
621
 
622
- # Use create_scheduled_job with UV image
623
  scheduled_job = await _async_call(
624
  self.api.create_scheduled_job,
625
- image=UV_DEFAULT_IMAGE,
626
  command=command,
627
  schedule=schedule,
628
  env=_add_environment_variables(args.get("env")),
629
  secrets=_add_environment_variables(args.get("secrets")),
630
- flavor=args.get("flavor") or args.get("hardware") or "cpu-basic",
631
  timeout=args.get("timeout", "30m"),
632
- namespace=args.get("namespace") or self.namespace,
633
  )
634
 
635
  scheduled_dict = _scheduled_job_info_to_dict(scheduled_job)
636
 
637
- response = f"""✓ Scheduled UV job created successfully!
638
 
639
  **Scheduled Job ID:** {scheduled_dict["id"]}
640
  **Schedule:** {scheduled_dict["schedule"]}
641
  **Suspended:** {"Yes" if scheduled_dict.get("suspend") else "No"}
642
  **Next Run:** {scheduled_dict.get("nextRun", "N/A")}
643
 
644
- To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"scheduled_job_id": "{scheduled_dict["id"]}"}}}}`"""
 
645
 
646
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
647
 
648
  except Exception as e:
649
- raise Exception(f"Failed to create scheduled UV job: {str(e)}")
650
 
651
  async def _list_scheduled_jobs(self, args: Dict[str, Any]) -> ToolResult:
652
  """List scheduled jobs using HfApi.list_scheduled_jobs()"""
653
  scheduled_jobs_list = await _async_call(
654
  self.api.list_scheduled_jobs,
655
- namespace=args.get("namespace") or self.namespace,
656
  )
657
 
658
  # Filter jobs - default: hide suspended jobs unless --all is specified
@@ -672,7 +623,7 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"
672
  "resultsShared": 0,
673
  }
674
  return {
675
- "formatted": 'No active scheduled jobs found. Use `{"args": {"all": true}}` to show suspended jobs.',
676
  "totalResults": 0,
677
  "resultsShared": 0,
678
  }
@@ -698,7 +649,7 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"
698
  scheduled_job = await _async_call(
699
  self.api.inspect_scheduled_job,
700
  scheduled_job_id=scheduled_job_id,
701
- namespace=args.get("namespace") or self.namespace,
702
  )
703
 
704
  scheduled_dict = _scheduled_job_info_to_dict(scheduled_job)
@@ -724,7 +675,7 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"
724
  await _async_call(
725
  self.api.delete_scheduled_job,
726
  scheduled_job_id=scheduled_job_id,
727
- namespace=args.get("namespace") or self.namespace,
728
  )
729
 
730
  return {
@@ -747,12 +698,12 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"
747
  await _async_call(
748
  self.api.suspend_scheduled_job,
749
  scheduled_job_id=scheduled_job_id,
750
- namespace=args.get("namespace") or self.namespace,
751
  )
752
 
753
  response = f"""✓ Scheduled job {scheduled_job_id} has been suspended.
754
 
755
- To resume, call this tool with `{{"operation": "scheduled resume", "args": {{"scheduled_job_id": "{scheduled_job_id}"}}}}`"""
756
 
757
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
758
 
@@ -770,12 +721,12 @@ To resume, call this tool with `{{"operation": "scheduled resume", "args": {{"sc
770
  await _async_call(
771
  self.api.resume_scheduled_job,
772
  scheduled_job_id=scheduled_job_id,
773
- namespace=args.get("namespace") or self.namespace,
774
  )
775
 
776
  response = f"""✓ Scheduled job {scheduled_job_id} has been resumed.
777
 
778
- To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"scheduled_job_id": "{scheduled_job_id}"}}}}`"""
779
 
780
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
781
 
@@ -784,40 +735,35 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "args": {{"
784
  HF_JOBS_TOOL_SPEC = {
785
  "name": "hf_jobs",
786
  "description": (
787
- "Manage Hugging Face CPU/GPU compute jobs. Run commands in Docker containers, "
788
- "execute Python scripts with UV. List, schedule and monitor jobs/logs. "
789
- "Call this tool with no operation for full usage instructions and examples.\n\n"
790
- "## Available Operations\n"
791
- "**Job Management:**\n"
792
- "- run: Run job with Docker image\n"
793
- "- uv: Run Python script with inline dependencies (recommended for Python)\n"
794
- "- ps: List jobs (shows running by default, use args: {'all': true} for all)\n"
795
- "- logs: Fetch job logs (args: {'job_id': 'xxx'})\n"
796
- "- inspect: Get job details (args: {'job_id': 'xxx'})\n"
797
- "- cancel: Cancel running job (args: {'job_id': 'xxx'})\n\n"
798
- "**Scheduled Jobs:**\n"
799
- "- same functionality as Job Management, but recurring periodically\n"
800
- "- schedule: One of '@annually', '@yearly', '@monthly', '@weekly', '@daily', '@hourly', or a CRON schedule expression (e.g., '0 9 * * 1' for 9 AM every Monday).\n"
801
- "- scheduled run/uv/ps/inspect/delete/suspend/resume\n\n"
802
  "## Available Hardware Flavors\n"
803
  "**CPU:** cpu-basic, cpu-upgrade, cpu-performance, cpu-xl\n"
804
  "**GPU:** t4-small, t4-medium, l4x1, l4x4, a10g-small, a10g-large, a10g-largex2, a10g-largex4, a100-large, h100, h100x8\n"
805
  "**Specialized:** inf2x6\n\n"
806
  "## Usage Examples\n"
807
- "**Run Python with UV (recommended):**\n"
808
- "{'operation': 'uv', 'args': {'script': 'import torch\\nprint(torch.cuda.is_available())', "
809
- "'dependencies': ['torch', 'transformers'], 'flavor': 'a10g-small', 'secrets': {'HF_TOKEN': '$HF_TOKEN'}}}\n\n"
810
- "**Run Docker command:**\n"
811
- "{'operation': 'run', 'args': {'image': 'python:3.12', 'command': ['python', '-c', 'print(42)'], 'flavor': 'cpu-basic'}}\n\n"
 
812
  "**List running jobs:**\n"
813
  "{'operation': 'ps'}\n\n"
814
- "## Key Parameters\n"
815
- "- script: Python code (for uv) - can be inline string, URL, or file path\n"
816
- "- dependencies/packages: List of pip packages (for uv)\n"
817
- "- command: Array format ['cmd', 'arg1', 'arg2'] (for run)\n"
818
- "- flavor/hardware: Choose appropriate size (default: cpu-basic)\n"
819
- "- secrets: {'HF_TOKEN': '$HF_TOKEN'} for Hub access\n"
820
- "- timeout: Max runtime (default: '30m')\n\n"
821
  "## Important Notes\n"
822
  "- **CRITICAL: Job files are EPHEMERAL** - ALL files created in HF Jobs (trained models, datasets, outputs, completions etc.) are DELETED when the job completes. You MUST upload any outputs to HF Hub in the script itself (using model.push_to_hub() when training models, dataset.push_to_hub() when creating text based outputs, etc.)."
823
  "- Always pass full script content - no local files available on server\n"
@@ -832,13 +778,11 @@ HF_JOBS_TOOL_SPEC = {
832
  "type": "string",
833
  "enum": [
834
  "run",
835
- "uv",
836
  "ps",
837
  "logs",
838
  "inspect",
839
  "cancel",
840
  "scheduled run",
841
- "scheduled uv",
842
  "scheduled ps",
843
  "scheduled inspect",
844
  "scheduled delete",
@@ -846,22 +790,101 @@ HF_JOBS_TOOL_SPEC = {
846
  "scheduled resume",
847
  ],
848
  "description": (
849
- "Operation to execute. Valid values: [run, uv, ps, logs, inspect, cancel, "
850
- "scheduled run, scheduled uv, scheduled ps, scheduled inspect, scheduled delete, "
851
  "scheduled suspend, scheduled resume]"
852
  ),
853
  },
854
- "args": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  "type": "object",
856
  "description": (
857
- "Operation-specific arguments as a JSON object. "
858
- "Common args: script (for uv), packages/dependencies (array), "
859
- "flavor/hardware (e.g., a10g-large, cpu-basic), command (array), "
860
- "image (string), env (object), secrets (object)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
  ),
862
- "additionalProperties": True,
863
  },
864
  },
 
865
  },
866
  }
867
 
 
46
  # Operation names
47
  OperationType = Literal[
48
  "run",
 
49
  "ps",
50
  "logs",
51
  "inspect",
52
  "cancel",
53
  "scheduled run",
 
54
  "scheduled ps",
55
  "scheduled inspect",
56
  "scheduled delete",
 
102
  if script_args:
103
  parts.extend(script_args)
104
 
105
+ # add defaults
106
+ # parts.extend(["--push_to_hub"])
107
  return parts
108
 
109
 
 
124
 
125
  def _ensure_hf_transfer_dependency(deps: list[str] | None) -> list[str]:
126
  """Ensure hf-transfer is included in the dependencies list"""
 
 
127
 
128
  if isinstance(deps, list):
129
  deps_copy = deps.copy() # Don't modify the original
 
168
  "createdAt": job_info.created_at.isoformat(),
169
  "dockerImage": job_info.docker_image,
170
  "spaceId": job_info.space_id,
171
+ "hardware_flavor": job_info.flavor,
172
  "owner": {"name": job_info.owner.name},
173
  }
174
 
 
207
  "dockerImage": job_spec.docker_image,
208
  "spaceId": job_spec.space_id,
209
  "command": job_spec.command or [],
210
+ "hardware_flavor": job_spec.flavor or "cpu-basic",
211
  },
212
  }
213
 
 
222
  async def execute(self, params: Dict[str, Any]) -> ToolResult:
223
  """Execute the specified operation"""
224
  operation = params.get("operation")
225
+
226
+ args = params
227
 
228
  # If no operation provided, return error
229
  if not operation:
 
241
  # Route to appropriate handler
242
  if operation == "run":
243
  return await self._run_job(args)
 
 
244
  elif operation == "ps":
245
  return await self._list_jobs(args)
246
  elif operation == "logs":
 
251
  return await self._cancel_job(args)
252
  elif operation == "scheduled run":
253
  return await self._scheduled_run(args)
 
 
254
  elif operation == "scheduled ps":
255
  return await self._list_scheduled_jobs(args)
256
  elif operation == "scheduled inspect":
 
265
  return {
266
  "formatted": f'Unknown operation: "{operation}"\n\n'
267
  "Available operations:\n"
268
+ "- run, ps, logs, inspect, cancel\n"
269
+ "- scheduled run, scheduled ps, scheduled inspect, "
270
  "scheduled delete, scheduled suspend, scheduled resume\n\n"
271
  "Call this tool with no operation for full usage instructions.",
272
  "totalResults": 0,
 
317
  return final_status, all_logs
318
 
319
  async def _run_job(self, args: Dict[str, Any]) -> ToolResult:
320
+ """Run a job using HfApi.run_job() - smart detection of Python vs Docker mode"""
321
  try:
322
+ script = args.get("script")
323
+ command = args.get("command")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
+ # Validate mutually exclusive parameters
326
+ if script and command:
327
+ raise ValueError(
328
+ "'script' and 'command' are mutually exclusive. Provide one or the other, not both."
329
+ )
330
 
331
+ if not script and not command:
332
+ raise ValueError(
333
+ "Either 'script' (for Python) or 'command' (for Docker) must be provided."
334
+ )
335
 
336
+ # Python mode: script provided
337
+ if script:
338
+ # Get dependencies and ensure hf-transfer is included
339
+ deps = _ensure_hf_transfer_dependency(args.get("dependencies"))
340
+
341
+ # Resolve the command based on script type (URL, inline, or file)
342
+ command = _resolve_uv_command(
343
+ script=script,
344
+ with_deps=deps,
345
+ python=args.get("python"),
346
+ script_args=args.get("script_args"),
347
+ )
348
 
349
+ # Use UV image unless overridden
350
+ image = args.get("image", UV_DEFAULT_IMAGE)
351
+ job_type = "Python"
352
 
353
+ # Docker mode: command provided
354
+ else:
355
+ image = args.get("image", "python:3.12")
356
+ job_type = "Docker"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
+ # Run the job
359
  job = await _async_call(
360
  self.api.run_job,
361
+ image=image,
362
  command=command,
363
  env=_add_environment_variables(args.get("env")),
364
  secrets=_add_environment_variables(args.get("secrets")),
365
+ flavor=args.get("hardware_flavor", "cpu-basic"),
366
  timeout=args.get("timeout", "30m"),
367
+ namespace=self.namespace,
368
  )
369
 
370
  # Wait for completion and stream logs
371
+ print(f"{job_type} job started: {job.url}")
372
  print("Streaming logs...\n---\n")
373
 
374
  final_status, all_logs = await self._wait_for_job_completion(
375
  job_id=job.id,
376
+ namespace=self.namespace,
377
  )
378
 
379
  # Format all logs for the agent
380
  log_text = "\n".join(all_logs) if all_logs else "(no logs)"
381
 
382
+ response = f"""{job_type} job completed!
383
 
384
  **Job ID:** {job.id}
385
  **Final Status:** {final_status}
 
392
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
393
 
394
  except Exception as e:
395
+ raise Exception(f"Failed to run job: {str(e)}")
396
 
397
  async def _list_jobs(self, args: Dict[str, Any]) -> ToolResult:
398
  """List jobs using HfApi.list_jobs()"""
399
+ jobs_list = await _async_call(self.api.list_jobs, namespace=self.namespace)
 
 
400
 
401
  # Filter jobs
402
  if not args.get("all", False):
 
419
  "resultsShared": 0,
420
  }
421
  return {
422
+ "formatted": 'No running jobs found. Use `{"operation": "ps", "all": true}` to show all jobs.',
423
  "totalResults": 0,
424
  "resultsShared": 0,
425
  }
 
444
 
445
  try:
446
  # Fetch logs (returns generator, convert to list)
447
+ logs_gen = self.api.fetch_job_logs(job_id=job_id, namespace=self.namespace)
 
 
448
  logs = await _async_call(list, logs_gen)
449
 
450
  if not logs:
 
488
  job = await _async_call(
489
  self.api.inspect_job,
490
  job_id=jid,
491
+ namespace=self.namespace,
492
  )
493
  jobs.append(_job_info_to_dict(job))
494
  except Exception as e:
 
517
  await _async_call(
518
  self.api.cancel_job,
519
  job_id=job_id,
520
+ namespace=self.namespace,
521
  )
522
 
523
  response = f"""✓ Job {job_id} has been cancelled.
524
 
525
+ To verify, call this tool with `{{"operation": "inspect", "job_id": "{job_id}"}}`"""
526
 
527
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
528
 
529
  async def _scheduled_run(self, args: Dict[str, Any]) -> ToolResult:
530
+ """Create scheduled job using HfApi.create_scheduled_job() - smart detection of Python vs Docker mode"""
531
  try:
532
+ script = args.get("script")
533
+ command = args.get("command")
534
+ schedule = args.get("schedule")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
+ if not schedule:
537
+ raise ValueError("schedule is required for scheduled jobs")
538
 
539
+ # Validate mutually exclusive parameters
540
+ if script and command:
541
+ raise ValueError(
542
+ "'script' and 'command' are mutually exclusive. Provide one or the other, not both."
543
+ )
544
 
545
+ if not script and not command:
546
+ raise ValueError(
547
+ "Either 'script' (for Python) or 'command' (for Docker) must be provided."
548
+ )
549
 
550
+ # Python mode: script provided
551
+ if script:
552
+ # Get dependencies and ensure hf-transfer is included
553
+ deps = _ensure_hf_transfer_dependency(args.get("dependencies"))
554
+
555
+ # Resolve the command based on script type
556
+ command = _resolve_uv_command(
557
+ script=script,
558
+ with_deps=deps,
559
+ python=args.get("python"),
560
+ script_args=args.get("script_args"),
561
+ )
562
 
563
+ # Use UV image unless overridden
564
+ image = args.get("image", UV_DEFAULT_IMAGE)
565
+ job_type = "Python"
566
 
567
+ # Docker mode: command provided
568
+ else:
569
+ image = args.get("image", "python:3.12")
570
+ job_type = "Docker"
 
 
 
 
 
 
 
 
 
 
 
571
 
572
+ # Create scheduled job
573
  scheduled_job = await _async_call(
574
  self.api.create_scheduled_job,
575
+ image=image,
576
  command=command,
577
  schedule=schedule,
578
  env=_add_environment_variables(args.get("env")),
579
  secrets=_add_environment_variables(args.get("secrets")),
580
+ flavor=args.get("hardware_flavor", "cpu-basic"),
581
  timeout=args.get("timeout", "30m"),
582
+ namespace=self.namespace,
583
  )
584
 
585
  scheduled_dict = _scheduled_job_info_to_dict(scheduled_job)
586
 
587
+ response = f"""✓ Scheduled {job_type} job created successfully!
588
 
589
  **Scheduled Job ID:** {scheduled_dict["id"]}
590
  **Schedule:** {scheduled_dict["schedule"]}
591
  **Suspended:** {"Yes" if scheduled_dict.get("suspend") else "No"}
592
  **Next Run:** {scheduled_dict.get("nextRun", "N/A")}
593
 
594
+ To inspect, call this tool with `{{"operation": "scheduled inspect", "scheduled_job_id": "{scheduled_dict["id"]}"}}`
595
+ To list all, call this tool with `{{"operation": "scheduled ps"}}`"""
596
 
597
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
598
 
599
  except Exception as e:
600
+ raise Exception(f"Failed to create scheduled job: {str(e)}")
601
 
602
  async def _list_scheduled_jobs(self, args: Dict[str, Any]) -> ToolResult:
603
  """List scheduled jobs using HfApi.list_scheduled_jobs()"""
604
  scheduled_jobs_list = await _async_call(
605
  self.api.list_scheduled_jobs,
606
+ namespace=self.namespace,
607
  )
608
 
609
  # Filter jobs - default: hide suspended jobs unless --all is specified
 
623
  "resultsShared": 0,
624
  }
625
  return {
626
+ "formatted": 'No active scheduled jobs found. Use `{"operation": "scheduled ps", "all": true}` to show suspended jobs.',
627
  "totalResults": 0,
628
  "resultsShared": 0,
629
  }
 
649
  scheduled_job = await _async_call(
650
  self.api.inspect_scheduled_job,
651
  scheduled_job_id=scheduled_job_id,
652
+ namespace=self.namespace,
653
  )
654
 
655
  scheduled_dict = _scheduled_job_info_to_dict(scheduled_job)
 
675
  await _async_call(
676
  self.api.delete_scheduled_job,
677
  scheduled_job_id=scheduled_job_id,
678
+ namespace=self.namespace,
679
  )
680
 
681
  return {
 
698
  await _async_call(
699
  self.api.suspend_scheduled_job,
700
  scheduled_job_id=scheduled_job_id,
701
+ namespace=self.namespace,
702
  )
703
 
704
  response = f"""✓ Scheduled job {scheduled_job_id} has been suspended.
705
 
706
+ To resume, call this tool with `{{"operation": "scheduled resume", "scheduled_job_id": "{scheduled_job_id}"}}`"""
707
 
708
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
709
 
 
721
  await _async_call(
722
  self.api.resume_scheduled_job,
723
  scheduled_job_id=scheduled_job_id,
724
+ namespace=self.namespace,
725
  )
726
 
727
  response = f"""✓ Scheduled job {scheduled_job_id} has been resumed.
728
 
729
+ To inspect, call this tool with `{{"operation": "scheduled inspect", "scheduled_job_id": "{scheduled_job_id}"}}`"""
730
 
731
  return {"formatted": response, "totalResults": 1, "resultsShared": 1}
732
 
 
735
  HF_JOBS_TOOL_SPEC = {
736
  "name": "hf_jobs",
737
  "description": (
738
+ "Manage Hugging Face CPU/GPU compute jobs. Run Python scripts with UV or custom Docker commands. "
739
+ "List, schedule and monitor jobs/logs.\n\n"
740
+ "## Operations\n"
741
+ "**Jobs:** run, ps, logs, inspect, cancel\n"
742
+ "**Scheduled Jobs:** scheduled run, scheduled ps, scheduled inspect, scheduled delete, scheduled suspend, scheduled resume\n\n"
743
+ "## 'run' Operation Behavior\n"
744
+ "The 'run' operation automatically detects what you want to do:\n"
745
+ "- If 'script' provided runs a Python script and auto installs dependencies in the env\n"
746
+ "- If 'command' provided runs a custom Docker command (full control)\n"
747
+ "- 'script' and 'command' are MUTUALLY EXCLUSIVE - provide one or the other, not both\n\n"
 
 
 
 
 
748
  "## Available Hardware Flavors\n"
749
  "**CPU:** cpu-basic, cpu-upgrade, cpu-performance, cpu-xl\n"
750
  "**GPU:** t4-small, t4-medium, l4x1, l4x4, a10g-small, a10g-large, a10g-largex2, a10g-largex4, a100-large, h100, h100x8\n"
751
  "**Specialized:** inf2x6\n\n"
752
  "## Usage Examples\n"
753
+ "**Run Python script with dependencies:**\n"
754
+ "{'operation': 'run', 'script': 'import torch\\nprint(torch.cuda.is_available())', 'dependencies': ['torch', 'transformers'], 'hardware_flavor': 'a10g-small'}\n\n"
755
+ "**Run Python with secrets:**\n"
756
+ "{'operation': 'run', 'script': 'from huggingface_hub import HfApi\\napi = HfApi()\\nprint(api.whoami())', 'dependencies': ['huggingface-hub']}\n\n"
757
+ "**Run custom Docker command:**\n"
758
+ "{'operation': 'run', 'image': 'nvidia/cuda:12.0-base', 'command': ['nvidia-smi']}\n\n"
759
  "**List running jobs:**\n"
760
  "{'operation': 'ps'}\n\n"
761
+ "**Get job logs:**\n"
762
+ "{'operation': 'logs', 'job_id': 'xxx'}\n\n"
763
+ "**Cancel job:**\n"
764
+ "{'operation': 'cancel', 'job_id': 'xxx'}\n\n"
765
+ "**Schedule daily Python job:**\n"
766
+ "{'operation': 'scheduled run', 'script': 'print(\"daily task\")', 'schedule': '@daily'}\n\n"
 
767
  "## Important Notes\n"
768
  "- **CRITICAL: Job files are EPHEMERAL** - ALL files created in HF Jobs (trained models, datasets, outputs, completions etc.) are DELETED when the job completes. You MUST upload any outputs to HF Hub in the script itself (using model.push_to_hub() when training models, dataset.push_to_hub() when creating text based outputs, etc.)."
769
  "- Always pass full script content - no local files available on server\n"
 
778
  "type": "string",
779
  "enum": [
780
  "run",
 
781
  "ps",
782
  "logs",
783
  "inspect",
784
  "cancel",
785
  "scheduled run",
 
786
  "scheduled ps",
787
  "scheduled inspect",
788
  "scheduled delete",
 
790
  "scheduled resume",
791
  ],
792
  "description": (
793
+ "Operation to execute. Valid values: [run, ps, logs, inspect, cancel, "
794
+ "scheduled run, scheduled ps, scheduled inspect, scheduled delete, "
795
  "scheduled suspend, scheduled resume]"
796
  ),
797
  },
798
+ # Python/UV specific parameters
799
+ "script": {
800
+ "type": "string",
801
+ "description": (
802
+ "Python code to execute. Can be inline code or a raw GitHub URL. "
803
+ "Auto-uses UV image and builds UV command. "
804
+ "USED with: 'run', 'scheduled run' (triggers Python mode). "
805
+ "MUTUALLY EXCLUSIVE with 'command'. "
806
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
807
+ ),
808
+ },
809
+ "dependencies": {
810
+ "type": "array",
811
+ "items": {"type": "string"},
812
+ "description": (
813
+ "List of pip packages to install. Example: ['torch', 'transformers']. "
814
+ "Only used when 'script' is provided. "
815
+ "USED with: 'run', 'scheduled run' (optional, only with script). "
816
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
817
+ ),
818
+ },
819
+ # Docker specific parameters
820
+ "image": {
821
+ "type": "string",
822
+ "description": (
823
+ "Docker image to use. Default: UV image if 'script' provided, else 'python:3.12'. "
824
+ "Can override the default UV image when using 'script'. "
825
+ "USED with: 'run', 'scheduled run' (optional). "
826
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
827
+ ),
828
+ },
829
+ "command": {
830
+ "type": "array",
831
+ "items": {"type": "string"},
832
+ "description": (
833
+ "Command to execute as array. Example: ['python', '-c', 'print(42)']. "
834
+ "Use this for full Docker control. "
835
+ "USED with: 'run', 'scheduled run' (triggers Docker mode). "
836
+ "MUTUALLY EXCLUSIVE with 'script'. "
837
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
838
+ ),
839
+ },
840
+ # Hardware and environment
841
+ "hardware_flavor": {
842
+ "type": "string",
843
+ "description": (
844
+ "Hardware flavor. CPU: cpu-basic, cpu-upgrade, cpu-performance, cpu-xl. "
845
+ "GPU: t4-small, t4-medium, l4x1, l4x4, a10g-small, a10g-large, a10g-largex2, a10g-largex4, a100-large, h100, h100x8. "
846
+ "Default: cpu-basic. "
847
+ "USED with: 'run', 'uv', 'scheduled run', 'scheduled uv' (optional). "
848
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
849
+ ),
850
+ },
851
+ "secrets": {
852
  "type": "object",
853
  "description": (
854
+ "Secret environment variables. Format: {'KEY': 'VALUE'}. HF_TOKEN is loaded automatically. "
855
+ "USED with: 'run', 'uv', 'scheduled run', 'scheduled uv' (optional). "
856
+ "NOT USED with: 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
857
+ ),
858
+ },
859
+ # Job management parameters
860
+ "job_id": {
861
+ "type": "string",
862
+ "description": (
863
+ "Job ID to operate on. "
864
+ "REQUIRED for: 'logs', 'inspect', 'cancel'. "
865
+ "NOT USED with: 'run', 'uv', 'ps', 'scheduled run/uv/ps/inspect/delete/suspend/resume'."
866
+ ),
867
+ },
868
+ # Scheduled job parameters
869
+ "scheduled_job_id": {
870
+ "type": "string",
871
+ "description": (
872
+ "Scheduled job ID to operate on. "
873
+ "REQUIRED for: 'scheduled inspect', 'scheduled delete', 'scheduled suspend', 'scheduled resume'. "
874
+ "NOT USED with: 'run', 'uv', 'ps', 'logs', 'inspect', 'cancel', 'scheduled run', 'scheduled uv', 'scheduled ps'."
875
+ ),
876
+ },
877
+ "schedule": {
878
+ "type": "string",
879
+ "description": (
880
+ "Cron schedule or preset. Presets: '@hourly', '@daily', '@weekly', '@monthly', '@yearly'. "
881
+ "Cron example: '0 9 * * 1' (9 AM every Monday). "
882
+ "REQUIRED for: 'scheduled run', 'scheduled uv'. "
883
+ "NOT USED with: 'run', 'uv', 'ps', 'logs', 'inspect', 'cancel', 'scheduled ps/inspect/delete/suspend/resume'."
884
  ),
 
885
  },
886
  },
887
+ "required": ["operation"],
888
  },
889
  }
890