| | """ |
| | GPU Tools for SPARKNET |
| | Tools for GPU monitoring and management |
| | """ |
| |
|
| | from typing import Optional |
| | from loguru import logger |
| | from .base_tool import BaseTool, ToolResult |
| | from ..utils.gpu_manager import get_gpu_manager |
| |
|
| |
|
class GPUMonitorTool(BaseTool):
    """Tool for monitoring GPU status, memory usage, and utilization.

    Wraps the project's GPU manager (``get_gpu_manager``) and formats its
    per-GPU info dicts into a human-readable report inside a ``ToolResult``.
    """

    def __init__(self):
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None)
        # Shared manager instance; presumably a process-wide singleton — TODO confirm.
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Monitor GPU status.

        Args:
            gpu_id: Specific GPU ID or None for all GPUs

        Returns:
            ToolResult with GPU information; ``metadata`` holds the raw info
            dict (single GPU) or ``{"gpus": [...]}`` (all GPUs).
        """
        try:
            if gpu_id is not None:
                info = self.gpu_manager.get_gpu_info(gpu_id)
                # The manager signals failure in-band via an "error" key.
                if "error" in info:
                    return ToolResult(
                        success=False,
                        output=None,
                        error=info["error"],
                    )
                return ToolResult(
                    success=True,
                    output=self._format_gpu_info(info),
                    metadata=info,
                )

            all_info = self.gpu_manager.get_all_gpu_info()
            output_lines = [
                self._format_gpu_info(info) for info in all_info if "error" not in info
            ]

            # Fix: previously an empty or all-errored GPU list was silently
            # reported as success with an empty output string. Surface it as
            # a failure with the aggregated per-GPU errors instead.
            if not output_lines:
                errors = "; ".join(
                    str(info["error"]) for info in all_info if "error" in info
                )
                return ToolResult(
                    success=False,
                    output=None,
                    error=errors or "No GPU information available",
                )

            return ToolResult(
                success=True,
                output="\n\n".join(output_lines),
                metadata={"gpus": all_info},
            )

        except Exception as e:
            # Boundary handler: this tool reports failures via ToolResult
            # rather than raising to the caller.
            logger.error(f"GPU monitoring error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {str(e)}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Format a single GPU info dict (memory in bytes) for display."""
        return (
            f"GPU {info['gpu_id']}: {info['name']}\n"
            f"  Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB "
            f"({info['memory_percent']:.1f}% used)\n"
            f"  Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n"
            f"  GPU Utilization: {info['gpu_utilization']}%\n"
            f"  Temperature: {info['temperature']}°C"
        )
|
class GPUSelectTool(BaseTool):
    """Tool that picks the best available GPU by free memory.

    Delegates the choice to the GPU manager and reports the selected
    device (ID plus free-memory summary) through a ``ToolResult``.
    """

    def __init__(self):
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Select the best GPU meeting the free-memory requirement.

        Args:
            min_memory_gb: Minimum required free memory, in GB.

        Returns:
            ToolResult whose metadata carries the chosen ``gpu_id`` and the
            full ``gpu_info`` dict, or a failed result if none qualifies.
        """
        try:
            chosen = self.gpu_manager.select_best_gpu(min_memory_gb)

            # Guard: the manager returns None when no device has enough
            # free memory.
            if chosen is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            details = self.gpu_manager.get_gpu_info(chosen)
            summary = (
                f"Selected GPU {chosen}: {details['name']}\n"
                f"Free Memory: {details['memory_free'] / 1024**3:.2f} GB"
            )
            return ToolResult(
                success=True,
                output=summary,
                metadata={
                    "gpu_id": chosen,
                    "gpu_info": details,
                },
            )

        except Exception as e:
            # Boundary handler: failures are reported via ToolResult, not raised.
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )