import os
import threading
import warnings
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from trackio.run import Run


pynvml: Any = None
PYNVML_AVAILABLE = False
_nvml_initialized = False
_nvml_lock = threading.Lock()
_energy_baseline: dict[int, float] = {}


def _ensure_pynvml():
    global PYNVML_AVAILABLE, pynvml
    if PYNVML_AVAILABLE:
        return pynvml
    try:
        import pynvml as _pynvml

        pynvml = _pynvml
        PYNVML_AVAILABLE = True
        return pynvml
    except ImportError:
        raise ImportError(
            "nvidia-ml-py is required for GPU monitoring. "
            "Install it with: pip install nvidia-ml-py"
        )


def _init_nvml() -> bool:
    global _nvml_initialized
    with _nvml_lock:
        if _nvml_initialized:
            return True
        try:
            nvml = _ensure_pynvml()
            nvml.nvmlInit()
            _nvml_initialized = True
            return True
        except Exception:
            return False
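
# Note on the deferred import above: importing this module never requires nvidia-ml-py;
# the dependency is only touched once one of the monitoring helpers below actually runs.
# A rough sketch of the resulting behavior when nvidia-ml-py is NOT installed
# (illustrative only; exact behavior depends on the caller):
#
#   gpu_available()        # -> False (the ImportError is swallowed)
#   collect_gpu_metrics()  # -> {}    (NVML initialization fails quietly)
#   log_gpu()              # -> {} when a run is active; raises RuntimeError otherwise
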
def get_gpu_count() -> tuple[int, list[int]]:
    """
    Get the number of GPUs visible to this process and their physical indices.
    Respects the CUDA_VISIBLE_DEVICES environment variable.

    Returns:
        Tuple of (count, physical_indices) where:
        - count: Number of visible GPUs
        - physical_indices: List mapping logical index to physical GPU index.
          e.g., if CUDA_VISIBLE_DEVICES=2,3, this returns (2, [2, 3]),
          meaning logical GPU 0 = physical GPU 2 and logical GPU 1 = physical GPU 3.
    """
    if not _init_nvml():
        return 0, []

    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cuda_visible is not None and cuda_visible.strip():
        try:
            indices = [int(x.strip()) for x in cuda_visible.split(",") if x.strip()]
            return len(indices), indices
        except ValueError:
            pass

    try:
        total = pynvml.nvmlDeviceGetCount()
        return total, list(range(total))
    except Exception:
        return 0, []


def get_all_gpu_count() -> tuple[int, list[int]]:
    """
    Get the total number of physical GPUs on the machine, ignoring CUDA_VISIBLE_DEVICES.

    Returns:
        Tuple of (count, physical_indices) for ALL GPUs on the machine.
        e.g., on a 4-GPU machine this returns (4, [0, 1, 2, 3]) regardless of
        the CUDA_VISIBLE_DEVICES setting.
    """
    if not _init_nvml():
        return 0, []

    try:
        total = pynvml.nvmlDeviceGetCount()
        return total, list(range(total))
    except Exception:
        return 0, []
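
# Illustrative contrast between the two helpers above (a sketch only; the values
# assume a hypothetical 4-GPU machine with nvidia-ml-py installed):
#
#   os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
#   get_gpu_count()      # -> (2, [2, 3]): logical CUDA device 0 is physical GPU 2,
#                        #    logical device 1 is physical GPU 3
#   get_all_gpu_count()  # -> (4, [0, 1, 2, 3]): ignores CUDA_VISIBLE_DEVICES
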
def gpu_available() -> bool:
    """
    Check if GPU monitoring is available.

    Returns True if nvidia-ml-py is installed and at least one NVIDIA GPU is detected.
    This is used for auto-detection of GPU logging.
    """
    try:
        _ensure_pynvml()
        count, _ = get_gpu_count()
        return count > 0
    except ImportError:
        return False
    except Exception:
        return False


def reset_energy_baseline():
    """Reset the energy baseline for all GPUs. Called when a new run starts."""
    global _energy_baseline
    _energy_baseline = {}
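
# Energy accounting sketch (illustrative numbers only): NVML's
# nvmlDeviceGetTotalEnergyConsumption() returns a cumulative counter in millijoules
# since the driver was loaded, so collect_gpu_metrics() below stores the first reading
# per GPU as a baseline and reports the delta in joules.
#
#   first poll:  counter = 5_000_000 mJ -> baseline stored, 0.0 J reported
#   later poll:  counter = 5_900_000 mJ -> (5_900_000 - 5_000_000) / 1000 = 900.0 J
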
def collect_gpu_metrics(device: int | None = None, all_gpus: bool = False) -> dict:
    """
    Collect GPU metrics for visible GPUs.

    Args:
        device: CUDA device index to collect metrics from. If None, collects
            from all GPUs visible to this process (respects CUDA_VISIBLE_DEVICES).
            The device index is the logical CUDA index (0, 1, 2, ...), not the
            physical GPU index.
        all_gpus: If True and device is None, collect metrics for ALL physical GPUs
            on the machine, ignoring CUDA_VISIBLE_DEVICES. Used by GpuMonitor
            to report system-wide GPU metrics in distributed training.

    Returns:
        Dictionary of GPU metrics. Keys use device indices (gpu/0/, gpu/1/, etc.).
    """
    if not _init_nvml():
        return {}

    if all_gpus and device is None:
        gpu_count, visible_gpus = get_all_gpu_count()
    else:
        gpu_count, visible_gpus = get_gpu_count()
    if gpu_count == 0:
        return {}

    if device is not None:
        if device < 0 or device >= gpu_count:
            return {}
        gpu_indices = [(device, visible_gpus[device])]
    else:
        gpu_indices = list(enumerate(visible_gpus))

    metrics = {}
    total_util = 0.0
    total_mem_used_gib = 0.0
    total_power = 0.0
    max_temp = 0.0
    valid_util_count = 0

    for logical_idx, physical_idx in gpu_indices:
        prefix = f"gpu/{logical_idx}"
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_idx)

            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                metrics[f"{prefix}/utilization"] = util.gpu
                metrics[f"{prefix}/memory_utilization"] = util.memory
                total_util += util.gpu
                valid_util_count += 1
            except Exception:
                pass

            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                mem_used_gib = mem.used / (1024**3)  # bytes -> GiB
                mem_total_gib = mem.total / (1024**3)  # bytes -> GiB
                metrics[f"{prefix}/allocated_memory"] = mem_used_gib
                metrics[f"{prefix}/total_memory"] = mem_total_gib
                if mem.total > 0:
                    metrics[f"{prefix}/memory_usage"] = mem.used / mem.total
                total_mem_used_gib += mem_used_gib
            except Exception:
                pass

            try:
                power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
                power_w = power_mw / 1000.0  # milliwatts -> watts
                metrics[f"{prefix}/power"] = power_w
                total_power += power_w
            except Exception:
                pass

            try:
                power_limit_mw = pynvml.nvmlDeviceGetPowerManagementLimit(handle)
                power_limit_w = power_limit_mw / 1000.0  # milliwatts -> watts
                metrics[f"{prefix}/power_limit"] = power_limit_w
                if power_limit_w > 0 and f"{prefix}/power" in metrics:
                    metrics[f"{prefix}/power_percent"] = (
                        metrics[f"{prefix}/power"] / power_limit_w
                    ) * 100
            except Exception:
                pass

            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU
                )
                metrics[f"{prefix}/temp"] = temp
                max_temp = max(max_temp, temp)
            except Exception:
                pass

            try:
                sm_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                metrics[f"{prefix}/sm_clock"] = sm_clock
            except Exception:
                pass

            try:
                mem_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
                metrics[f"{prefix}/memory_clock"] = mem_clock
            except Exception:
                pass

            try:
                fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
                metrics[f"{prefix}/fan_speed"] = fan_speed
            except Exception:
                pass

            try:
                pstate = pynvml.nvmlDeviceGetPerformanceState(handle)
                metrics[f"{prefix}/performance_state"] = pstate
            except Exception:
                pass

            try:
                # NVML reports cumulative energy in millijoules since driver load;
                # subtract the per-run baseline and convert to joules.
                energy_mj = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
                if physical_idx not in _energy_baseline:
                    _energy_baseline[physical_idx] = energy_mj
                energy_consumed_mj = energy_mj - _energy_baseline[physical_idx]
                metrics[f"{prefix}/energy_consumed"] = energy_consumed_mj / 1000.0
            except Exception:
                pass

            try:
                # nvmlDeviceGetPcieThroughput returns KB/s; report MB/s.
                pcie_tx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_TX_BYTES
                )
                pcie_rx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_RX_BYTES
                )
                metrics[f"{prefix}/pcie_tx"] = pcie_tx / 1024.0
                metrics[f"{prefix}/pcie_rx"] = pcie_rx / 1024.0
            except Exception:
                pass

            try:
                throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
                metrics[f"{prefix}/throttle_thermal"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonSwThermalSlowdown)
                )
                metrics[f"{prefix}/throttle_power"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonSwPowerCap)
                )
                metrics[f"{prefix}/throttle_hw_slowdown"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonHwSlowdown)
                )
                metrics[f"{prefix}/throttle_apps"] = int(
                    bool(
                        throttle
                        & pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting
                    )
                )
            except Exception:
                pass

            try:
                ecc_corrected = pynvml.nvmlDeviceGetTotalEccErrors(
                    handle,
                    pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED,
                    pynvml.NVML_VOLATILE_ECC,
                )
                metrics[f"{prefix}/corrected_memory_errors"] = ecc_corrected
            except Exception:
                pass

            try:
                ecc_uncorrected = pynvml.nvmlDeviceGetTotalEccErrors(
                    handle,
                    pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
                    pynvml.NVML_VOLATILE_ECC,
                )
                metrics[f"{prefix}/uncorrected_memory_errors"] = ecc_uncorrected
            except Exception:
                pass

        except Exception:
            continue

    if valid_util_count > 0:
        metrics["gpu/mean_utilization"] = total_util / valid_util_count
    if total_mem_used_gib > 0:
        metrics["gpu/total_allocated_memory"] = total_mem_used_gib
    if total_power > 0:
        metrics["gpu/total_power"] = total_power
    if max_temp > 0:
        metrics["gpu/max_temp"] = max_temp

    return metrics
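
# Shape of the dictionary returned by collect_gpu_metrics(), with illustrative values
# only; exactly which keys appear depends on which NVML queries succeed on the
# installed driver and hardware:
#
#   {
#       "gpu/0/utilization": 87,          # percent
#       "gpu/0/allocated_memory": 31.2,   # GiB
#       "gpu/0/power": 286.4,             # watts
#       "gpu/0/temp": 63,                 # degrees Celsius
#       ...
#       "gpu/mean_utilization": 87.0,
#       "gpu/total_power": 286.4,
#       "gpu/max_temp": 63,
#   }
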
class GpuMonitor:
    """Background thread that periodically logs system-wide GPU metrics to a run."""

    def __init__(self, run: "Run", interval: float = 10.0):
        self._run = run
        self._interval = interval
        self._stop_flag = threading.Event()
        self._thread: "threading.Thread | None" = None

    def start(self):
        count, _ = get_all_gpu_count()
        if count == 0:
            warnings.warn(
                "auto_log_gpu=True but no NVIDIA GPUs detected. GPU logging disabled."
            )
            return

        reset_energy_baseline()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        self._stop_flag.set()
        if self._thread is not None:
            self._thread.join(timeout=2.0)

    def _monitor_loop(self):
        while not self._stop_flag.is_set():
            try:
                metrics = collect_gpu_metrics(all_gpus=True)
                if metrics:
                    self._run.log_system(metrics)
            except Exception:
                pass

            self._stop_flag.wait(timeout=self._interval)
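
# Typical GpuMonitor lifecycle, as a sketch only (the wiring into trackio's
# auto_log_gpu option lives elsewhere; `run` is assumed to be an active trackio Run):
#
#   monitor = GpuMonitor(run, interval=10.0)
#   monitor.start()   # samples all physical GPUs every `interval` seconds
#   ...               # training loop
#   monitor.stop()    # stops the daemon thread (waits up to 2 s for it to exit)
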
def log_gpu(run: "Run | None" = None, device: int | None = None) -> dict:
    """
    Log GPU metrics to the current or specified run as system metrics.

    Args:
        run: Optional Run instance. If None, uses current run from context.
        device: CUDA device index to collect metrics from. If None, collects
            from all GPUs visible to this process (respects CUDA_VISIBLE_DEVICES).

    Returns:
        dict: The GPU metrics that were logged.

    Example:
        ```python
        import trackio

        run = trackio.init(project="my-project")
        trackio.log({"loss": 0.5})
        trackio.log_gpu()  # logs all visible GPUs
        trackio.log_gpu(device=0)  # logs only CUDA device 0
        ```
    """
    from trackio import context_vars

    if run is None:
        run = context_vars.current_run.get()
    if run is None:
        raise RuntimeError("Call trackio.init() before trackio.log_gpu().")

    metrics = collect_gpu_metrics(device=device)
    if metrics:
        run.log_system(metrics)
    return metrics