import torch import comfy.model_management from ..core import logger import os import platform def is_jetson() -> bool: """ Determines if the Python environment is running on a Jetson device by checking the device model information or the platform release. """ PROC_DEVICE_MODEL = '' try: with open('/proc/device-tree/model', 'r') as f: PROC_DEVICE_MODEL = f.read().strip() logger.info(f"Device model: {PROC_DEVICE_MODEL}") return "NVIDIA" in PROC_DEVICE_MODEL except Exception as e: # logger.warning(f"JETSON: Could not read /proc/device-tree/model: {e} (If you're not using Jetson, ignore this warning)") # If /proc/device-tree/model is not available, check platform.release() platform_release = platform.release() logger.info(f"Platform release: {platform_release}") if 'tegra' in platform_release.lower(): logger.info("Detected 'tegra' in platform release. Assuming Jetson device.") return True else: logger.info("JETSON: Not detected.") return False IS_JETSON = is_jetson() class CGPUInfo: """ This class is responsible for getting information from GPU (ONLY). """ cuda = False pynvmlLoaded = False jtopLoaded = False cudaAvailable = False torchDevice = 'cpu' cudaDevice = 'cpu' cudaDevicesFound = 0 switchGPU = True switchVRAM = True switchTemperature = True gpus = [] gpusUtilization = [] gpusVRAM = [] gpusTemperature = [] def __init__(self): if IS_JETSON: # Try to import jtop for Jetson devices try: from jtop import jtop self.jtopInstance = jtop() self.jtopInstance.start() self.jtopLoaded = True logger.info('jtop initialized on Jetson device.') except ImportError as e: logger.error('jtop is not installed. ' + str(e)) except Exception as e: logger.error('Could not initialize jtop. ' + str(e)) else: # Try to import pynvml for non-Jetson devices try: import pynvml self.pynvml = pynvml self.pynvml.nvmlInit() self.pynvmlLoaded = True logger.info('pynvml (NVIDIA) initialized.') except ImportError as e: logger.error('pynvml is not installed. ' + str(e)) except Exception as e: logger.error('Could not init pynvml (NVIDIA). ' + str(e)) self.anygpuLoaded = self.pynvmlLoaded or self.jtopLoaded try: self.torchDevice = comfy.model_management.get_torch_device_name(comfy.model_management.get_torch_device()) except Exception as e: logger.error('Could not pick default device. ' + str(e)) if self.pynvmlLoaded and not self.jtopLoaded and not self.deviceGetCount(): logger.warning('No GPU detected, disabling GPU monitoring.') self.anygpuLoaded = False self.pynvmlLoaded = False self.jtopLoaded = False if self.anygpuLoaded: if self.deviceGetCount() > 0: self.cudaDevicesFound = self.deviceGetCount() logger.info(f"GPU/s:") for deviceIndex in range(self.cudaDevicesFound): deviceHandle = self.deviceGetHandleByIndex(deviceIndex) gpuName = self.deviceGetName(deviceHandle, deviceIndex) logger.info(f"{deviceIndex}) {gpuName}") self.gpus.append({ 'index': deviceIndex, 'name': gpuName, }) # Same index as gpus, with default values self.gpusUtilization.append(True) self.gpusVRAM.append(True) self.gpusTemperature.append(True) self.cuda = True logger.info(self.systemGetDriverVersion()) else: logger.warning('No GPU with CUDA detected.') else: logger.warning('No GPU monitoring libraries available.') self.cudaDevice = 'cpu' if self.torchDevice == 'cpu' else 'cuda' self.cudaAvailable = torch.cuda.is_available() if self.cuda and self.cudaAvailable and self.torchDevice == 'cpu': logger.warning('CUDA is available, but torch is using CPU.') def getInfo(self): logger.debug('Getting GPUs info...') return self.gpus def getStatus(self): gpuUtilization = -1 gpuTemperature = -1 vramUsed = -1 vramTotal = -1 vramPercent = -1 gpuType = '' gpus = [] if self.cudaDevice == 'cpu': gpuType = 'cpu' gpus.append({ 'gpu_utilization': -1, 'gpu_temperature': -1, 'vram_total': -1, 'vram_used': -1, 'vram_used_percent': -1, }) else: gpuType = self.cudaDevice if self.anygpuLoaded and self.cuda and self.cudaAvailable: for deviceIndex in range(self.cudaDevicesFound): deviceHandle = self.deviceGetHandleByIndex(deviceIndex) gpuUtilization = -1 vramPercent = -1 vramUsed = -1 vramTotal = -1 gpuTemperature = -1 # GPU Utilization if self.switchGPU and self.gpusUtilization[deviceIndex]: try: gpuUtilization = self.deviceGetUtilizationRates(deviceHandle) except Exception as e: logger.error('Could not get GPU utilization. ' + str(e)) logger.error('Monitor of GPU is turning off.') self.switchGPU = False if self.switchVRAM and self.gpusVRAM[deviceIndex]: try: memory = self.deviceGetMemoryInfo(deviceHandle) vramUsed = memory['used'] vramTotal = memory['total'] # Check if vramTotal is not zero or None if vramTotal and vramTotal != 0: vramPercent = vramUsed / vramTotal * 100 except Exception as e: logger.error('Could not get GPU memory info. ' + str(e)) self.switchVRAM = False # Temperature if self.switchTemperature and self.gpusTemperature[deviceIndex]: try: gpuTemperature = self.deviceGetTemperature(deviceHandle) except Exception as e: logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e)) self.switchTemperature = False gpus.append({ 'gpu_utilization': gpuUtilization, 'gpu_temperature': gpuTemperature, 'vram_total': vramTotal, 'vram_used': vramUsed, 'vram_used_percent': vramPercent, }) return { 'device_type': gpuType, 'gpus': gpus, } def deviceGetCount(self): if self.pynvmlLoaded: return self.pynvml.nvmlDeviceGetCount() elif self.jtopLoaded: # For Jetson devices, we assume there's one GPU return 1 else: return 0 def deviceGetHandleByIndex(self, index): if self.pynvmlLoaded: return self.pynvml.nvmlDeviceGetHandleByIndex(index) elif self.jtopLoaded: return index # On Jetson, index acts as handle else: return 0 def deviceGetName(self, deviceHandle, deviceIndex): if self.pynvmlLoaded: gpuName = 'Unknown GPU' try: gpuName = self.pynvml.nvmlDeviceGetName(deviceHandle) try: gpuName = gpuName.decode('utf-8', errors='ignore') except AttributeError: pass except UnicodeDecodeError as e: gpuName = 'Unknown GPU (decoding error)' logger.error(f"UnicodeDecodeError: {e}") return gpuName elif self.jtopLoaded: # Access the GPU name from self.jtopInstance.gpu try: gpu_info = self.jtopInstance.gpu gpu_name = next(iter(gpu_info.keys())) return gpu_name except Exception as e: logger.error('Could not get GPU name. ' + str(e)) return 'Unknown GPU' else: return '' def systemGetDriverVersion(self): if self.pynvmlLoaded: return f'NVIDIA Driver: {self.pynvml.nvmlSystemGetDriverVersion()}' elif self.jtopLoaded: # No direct method to get driver version from jtop return 'NVIDIA Driver: unknown' else: return 'Driver unknown' def deviceGetUtilizationRates(self, deviceHandle): if self.pynvmlLoaded: return self.pynvml.nvmlDeviceGetUtilizationRates(deviceHandle).gpu elif self.jtopLoaded: # GPU utilization from jtop stats try: gpu_util = self.jtopInstance.stats.get('GPU', -1) return gpu_util except Exception as e: logger.error('Could not get GPU utilization. ' + str(e)) return -1 else: return 0 def deviceGetMemoryInfo(self, deviceHandle): if self.pynvmlLoaded: mem = self.pynvml.nvmlDeviceGetMemoryInfo(deviceHandle) return {'total': mem.total, 'used': mem.used} elif self.jtopLoaded: mem_data = self.jtopInstance.memory['RAM'] total = mem_data['tot'] used = mem_data['used'] return {'total': total, 'used': used} else: return {'total': 1, 'used': 1} def deviceGetTemperature(self, deviceHandle): if self.pynvmlLoaded: return self.pynvml.nvmlDeviceGetTemperature(deviceHandle, self.pynvml.NVML_TEMPERATURE_GPU) elif self.jtopLoaded: try: temperature = self.jtopInstance.stats.get('Temp gpu', -1) return temperature except Exception as e: logger.error('Could not get GPU temperature. ' + str(e)) return -1 else: return 0 def close(self): if self.jtopLoaded and self.jtopInstance is not None: self.jtopInstance.close()