Spaces:
Paused
Paused
| import platform | |
| import subprocess | |
| import os | |
| import psutil | |
| import torch | |
| from typing import Optional, Dict, Tuple, Union | |
# Type aliases for the metric dictionaries produced by the GPU info helpers.
NumericValue = Union[int, float]
MetricsDict = Dict[str, NumericValue]  # metric name (e.g. 'memory_used_gb') -> numeric reading
class SystemMonitor:
    """Collects GPU/CPU/RAM status across NVIDIA, Apple Silicon, and AMD hosts."""

    @staticmethod
    def get_nvidia_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:
        """Get NVIDIA GPU information and metrics for GPU 0.

        Prefers ``nvidia-smi`` (full metrics); falls back to PyTorch's CUDA
        allocator counters when nvidia-smi is unavailable or unparseable.

        Returns:
            (gpu_name, metrics, warning): ``metrics`` always carries the keys
            ``memory_used_gb``, ``memory_total_gb``, ``memory_reserved_gb``,
            ``temperature``, ``utilization`` (0.0 placeholders when a source is
            unavailable); ``warning`` is a human-readable note when a fallback
            path was taken, else None.
        """
        metrics = {}
        gpu_name_from_torch = "NVIDIA GPU (name unavailable)"
        warning_message = None  # Set whenever a fallback/degraded path is taken
        try:
            gpu_name_from_torch = f"{torch.cuda.get_device_name(0)}"
        except Exception:
            # If even the name fails, nvidia-smi is highly likely to fail too.
            # Prepare basic PyTorch metrics as the ultimate fallback.
            metrics = {
                'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0,
                'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0,
                # Placeholders keep the key set identical across all paths (UI symmetry).
                'memory_reserved_gb': 0.0,
                'temperature': 0.0,
                'utilization': 0.0
            }
            warning_message = "Could not get GPU name via PyTorch. nvidia-smi likely to fail or has failed. Displaying basic PyTorch memory (application-specific)."
            return gpu_name_from_torch, metrics, warning_message

        # Query memory.used, memory.total, memory.reserved, temperature.gpu, utilization.gpu
        nvidia_smi_common_args = [
            'nvidia-smi',
            '--query-gpu=memory.used,memory.total,memory.reserved,temperature.gpu,utilization.gpu',
            '--format=csv,nounits,noheader'
        ]
        smi_output_str = None
        try:
            # Attempt 1: query GPU 0 explicitly.
            smi_output_str = subprocess.check_output(
                nvidia_smi_common_args + ['--id=0'],
                encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
            )
        except (subprocess.SubprocessError, FileNotFoundError, ValueError):
            try:
                # Attempt 2: query all GPUs (no --id=0) and keep only the first line.
                smi_output_str = subprocess.check_output(
                    nvidia_smi_common_args,
                    encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
                )
                if smi_output_str:
                    smi_output_str = smi_output_str.strip().split('\n')[0]  # GPU 0's row
            except (subprocess.SubprocessError, FileNotFoundError, ValueError):
                # Fallback to basic CUDA info from PyTorch, plus placeholders for the UI.
                metrics = {
                    'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_reserved_gb': 0.0,  # Placeholder
                    'temperature': 0.0,         # Placeholder
                    'utilization': 0.0          # Placeholder
                }
                warning_message = "nvidia-smi failed. GPU Memory Used is PyTorch specific (not total). Other GPU stats unavailable."
                return gpu_name_from_torch, metrics, warning_message

        if smi_output_str:
            parts = smi_output_str.strip().split(',')
            if len(parts) == 5:  # memory.used, memory.total, memory.reserved, temp, util
                memory_used_mib, memory_total_mib, memory_reserved_mib, temp, util = map(float, parts)
                metrics = {
                    'memory_used_gb': memory_used_mib / 1024,
                    'memory_total_gb': memory_total_mib / 1024,
                    'memory_reserved_gb': memory_reserved_mib / 1024,  # nvidia-smi's memory.reserved field
                    'temperature': temp,
                    'utilization': util
                }
            else:
                warning_message = "nvidia-smi output format unexpected. Some GPU stats may be missing or inaccurate."
                # Fallback with placeholders to maintain UI structure.
                metrics = {
                    'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_reserved_gb': 0.0,
                    'temperature': 0.0,
                    'utilization': 0.0
                }
                if len(parts) >= 2:  # Salvage what we can if the format is only partially off.
                    try:
                        metrics['memory_used_gb'] = float(parts[0]) / 1024
                    except ValueError:  # was a bare except; only a parse failure is expected here
                        pass
                    try:
                        metrics['memory_total_gb'] = float(parts[1]) / 1024
                    except ValueError:
                        pass
        else:  # Should have been caught by try-except, but as a final safety
            metrics = {
                'memory_used_gb': 0.0, 'memory_total_gb': 0.0, 'memory_reserved_gb': 0.0,
                'temperature': 0.0, 'utilization': 0.0
            }
            warning_message = "Failed to get any output from nvidia-smi."
        return gpu_name_from_torch, metrics, warning_message
| def get_mac_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]: # Add warning return for consistency | |
| """Get Apple Silicon GPU information without requiring sudo.""" | |
| metrics = {} | |
| warning_message = None | |
| try: | |
| memory = psutil.virtual_memory() | |
| metrics = { | |
| 'memory_total_gb': memory.total / (1024**3), | |
| 'memory_used_gb': memory.used / (1024**3), # This is system RAM, reported as "Unified Memory" | |
| 'utilization': psutil.cpu_percent(), # Still CPU usage as proxy | |
| # Placeholders for Mac to match NVIDIA's output structure for UI symmetry | |
| 'memory_reserved_gb': 0.0, # N/A for unified memory in this context | |
| 'temperature': 0.0 # Not easily available without sudo | |
| } | |
| if metrics['utilization'] == psutil.cpu_percent(): # Check if it's actually CPU util | |
| warning_message = "Mac GPU Load is proxied by CPU Usage." | |
| except Exception as e: | |
| # print(f"Error getting Mac info: {e}") | |
| metrics = { | |
| 'memory_total_gb': 0.0, 'memory_used_gb': 0.0, 'utilization': 0.0, | |
| 'memory_reserved_gb': 0.0, 'temperature': 0.0 | |
| } | |
| warning_message = "Could not retrieve Mac system info." | |
| return "Apple Silicon GPU", metrics, warning_message # Changed name for clarity | |
    def get_amd_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:  # Add warning return
        """Get AMD GPU information.

        Tries ``rocm-smi`` first and scrapes its text output for card 0; on
        failure falls back to Linux sysfs files under
        ``/sys/class/drm/card0/device``. All metric keys are pre-filled with
        0.0 so the returned dict always matches the NVIDIA/Mac paths (UI
        symmetry).

        Returns:
            (name, metrics, warning) — ``name`` embeds the data source that was
            actually used ("rocm-smi", "sysfs", or "unknown" when nothing
            could be read); ``warning`` describes any degraded path taken.
        """
        metrics = {  # Initialize with placeholders for all expected keys for UI symmetry
            'memory_used_gb': 0.0,
            'memory_total_gb': 0.0,
            'memory_reserved_gb': 0.0,  # Typically N/A or not reported by rocm-smi in a 'reserved' sense
            'temperature': 0.0,
            'utilization': 0.0
        }
        warning_message = None
        source = "unknown"  # Flipped to "rocm-smi"/"sysfs" once a reading succeeds
        try:
            # Try rocm-smi first
            try:
                result = subprocess.check_output(['rocm-smi', '--showmeminfo', 'vram', '--showtemp', '--showuse'], encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE)
                # Text-scraping of rocm-smi output (format-dependent; a --json mode
                # would be more robust if available). Expected example lines:
                #   GPU[0] VRAM Usage: 2020M/16368M
                #   GPU[0] Temperature: 34c
                #   GPU[0] GPU Use: 0%
                lines = result.strip().split('\n')
                for line in lines:
                    if line.startswith("GPU[0]"):  # Assuming card 0
                        if "VRAM Usage:" in line:
                            # "used/total" in MiB with an 'M' suffix; convert to GiB.
                            mem_parts = line.split("VRAM Usage:")[1].strip().split('/')
                            metrics['memory_used_gb'] = float(mem_parts[0].replace('M', '')) / 1024
                            metrics['memory_total_gb'] = float(mem_parts[1].replace('M', '')) / 1024
                            source = "rocm-smi"
                        elif "Temperature:" in line:
                            metrics['temperature'] = float(line.split("Temperature:")[1].strip().replace('c', ''))
                            source = "rocm-smi"
                        elif "GPU Use:" in line:
                            metrics['utilization'] = float(line.split("GPU Use:")[1].strip().replace('%', ''))
                            source = "rocm-smi"
                if source != "rocm-smi":  # if parsing failed or fields were missing
                    warning_message = "rocm-smi ran but output parsing failed."
            except (subprocess.SubprocessError, FileNotFoundError, ValueError) as e_rocm:
                warning_message = "rocm-smi not found or failed. "
                # Try sysfs as fallback on Linux
                if platform.system() == "Linux":
                    base_path = "/sys/class/drm/card0/device"  # This assumes card0
                    sysfs_found_any = False
                    # Each reading is attempted independently; a missing/unreadable
                    # file only skips that one metric.
                    try:
                        with open(f"{base_path}/hwmon/hwmon0/temp1_input") as f:  # Check for specific hwmon index
                            metrics['temperature'] = float(f.read().strip()) / 1000  # millidegrees -> °C
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass  # Ignore if specific file not found
                    try:
                        with open(f"{base_path}/mem_info_vram_total") as f:
                            metrics['memory_total_gb'] = int(f.read().strip()) / (1024**3)  # Bytes to GiB
                        with open(f"{base_path}/mem_info_vram_used") as f:
                            metrics['memory_used_gb'] = int(f.read().strip()) / (1024**3)  # Bytes to GiB
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass
                    try:
                        with open(f"{base_path}/gpu_busy_percent") as f:
                            metrics['utilization'] = float(f.read().strip())
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass
                    if sysfs_found_any:
                        source = "sysfs"
                        warning_message += "Using sysfs (may be incomplete)."
                    else:
                        warning_message += "sysfs also failed or provided no data."
                else:
                    warning_message += "Not on Linux, no sysfs fallback."
        except Exception as e_amd_main:  # Catch-all for unforeseen issues in AMD block
            warning_message = (warning_message or "") + " Unexpected error in AMD GPU info gathering."
        return f"AMD GPU ({source})", metrics, warning_message
| def is_amd_gpu() -> bool: # No changes needed here | |
| try: | |
| # Check for rocm-smi first as it's more definitive | |
| rocm_smi_exists = False | |
| try: | |
| subprocess.check_call(['rocm-smi', '-h'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=0.5) | |
| rocm_smi_exists = True | |
| except (subprocess.SubprocessError, FileNotFoundError): | |
| pass # rocm-smi not found or errored | |
| if rocm_smi_exists: | |
| return True | |
| # Fallback to sysfs check if on Linux | |
| if platform.system() == "Linux" and os.path.exists('/sys/class/drm/card0/device/vendor'): | |
| with open('/sys/class/drm/card0/device/vendor', 'r') as f: | |
| return '0x1002' in f.read() # AMD's PCI vendor ID | |
| return False | |
| except: | |
| return False | |
| def get_system_info(cls) -> str: | |
| """Get detailed system status with support for different GPU types.""" | |
| gpu_name_display: Optional[str] = None | |
| metrics: MetricsDict = {} | |
| gpu_warning: Optional[str] = None | |
| try: | |
| # Determine GPU type and get metrics | |
| if torch.cuda.is_available(): # Implies NVIDIA usually | |
| gpu_name_display, metrics, gpu_warning = cls.get_nvidia_gpu_info() | |
| elif platform.system() == "Darwin" and platform.processor() == "arm": # Apple Silicon | |
| gpu_name_display, metrics, gpu_warning = cls.get_mac_gpu_info() | |
| elif cls.is_amd_gpu(): # Check for AMD (works on Linux, might need refinement for Windows if not using PyTorch ROCm) | |
| gpu_name_display, metrics, gpu_warning = cls.get_amd_gpu_info() | |
| else: # No specific GPU detected by these primary checks | |
| # Could add a PyTorch ROCm check here if desired for AMD on Windows/Linux without rocm-smi | |
| # if hasattr(torch, 'rocm_is_available') and torch.rocm_is_available(): | |
| # gpu_name_display = "AMD GPU (via PyTorch ROCm)" | |
| # metrics = { ... basic torch.rocm metrics ... } | |
| pass | |
| # Format GPU info based on available metrics | |
| if gpu_name_display: | |
| gpu_info_lines = [f"🎮 GPU: {gpu_name_display}"] | |
| # Standard memory reporting | |
| if 'memory_used_gb' in metrics and 'memory_total_gb' in metrics: | |
| mem_label = "GPU Memory" | |
| if platform.system() == "Darwin" and platform.processor() == "arm": | |
| mem_label = "Unified Memory" # For Apple Silicon | |
| gpu_info_lines.append( | |
| f"📊 {mem_label}: {metrics.get('memory_used_gb', 0.0):.1f}GB / {metrics.get('memory_total_gb', 0.0):.1f}GB" | |
| ) | |
| # VRAM Reserved (NVIDIA specific from nvidia-smi, or placeholder) | |
| # if 'memory_reserved_gb' in metrics and torch.cuda.is_available() and not (platform.system() == "Darwin"): # Show for NVIDIA, not Mac | |
| # gpu_info_lines.append(f"💾 VRAM Reserved: {metrics.get('memory_reserved_gb', 0.0):.1f}GB") | |
| if 'temperature' in metrics and metrics.get('temperature', 0.0) > 0: # Only show if temp is valid | |
| gpu_info_lines.append(f"🌡️ GPU Temp: {metrics.get('temperature', 0.0):.0f}°C") | |
| if 'utilization' in metrics: | |
| gpu_info_lines.append(f"⚡ GPU Load: {metrics.get('utilization', 0.0):.0f}%") | |
| if gpu_warning: # Display any warning from the GPU info functions | |
| gpu_info_lines.append(f"⚠️ {gpu_warning}") | |
| gpu_section = "\n".join(gpu_info_lines) + "\n" | |
| else: | |
| gpu_section = "🎮 GPU: No dedicated GPU detected or supported\n" | |
| # Get CPU info | |
| cpu_count = psutil.cpu_count(logical=False) # Physical cores | |
| cpu_threads = psutil.cpu_count(logical=True) # Logical processors | |
| cpu_info = f"💻 CPU: {cpu_count or 'N/A'} Cores, {cpu_threads or 'N/A'} Threads\n" | |
| cpu_usage = f"⚡ CPU Usage: {psutil.cpu_percent()}%\n" | |
| # Get RAM info | |
| ram = psutil.virtual_memory() | |
| ram_used_gb = ram.used / (1024**3) | |
| ram_total_gb = ram.total / (1024**3) | |
| ram_info = f"🎯 System RAM: {ram_used_gb:.1f}GB / {ram_total_gb:.1f}GB ({ram.percent}%)" | |
| return f"{gpu_section}{cpu_info}{cpu_usage}{ram_info}" | |
| except Exception as e: | |
| # print(f"Overall error in get_system_info: {e}") | |
| # import traceback; print(traceback.format_exc()) | |
| return f"Error collecting system info: {str(e)}" |