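"""GPU monitoring utilities for trackio, built on NVML via nvidia-ml-py."""
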
import os
import threading
import warnings
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from trackio.run import Run

pynvml: Any = None
PYNVML_AVAILABLE = False
_nvml_initialized = False
_nvml_lock = threading.Lock()
_energy_baseline: dict[int, float] = {}


def _ensure_pynvml():
    global PYNVML_AVAILABLE, pynvml
    if PYNVML_AVAILABLE:
        return pynvml
    try:
        import pynvml as _pynvml

        pynvml = _pynvml
        PYNVML_AVAILABLE = True
        return pynvml
    except ImportError:
        raise ImportError(
            "nvidia-ml-py is required for GPU monitoring. "
            "Install it with: pip install nvidia-ml-py"
        )


def _init_nvml() -> bool:
    global _nvml_initialized
    with _nvml_lock:
        if _nvml_initialized:
            return True
        try:
            nvml = _ensure_pynvml()
            nvml.nvmlInit()
            _nvml_initialized = True
            return True
        except Exception:
            return False


def _shutdown_nvml():
    global _nvml_initialized
    with _nvml_lock:
        if _nvml_initialized and pynvml is not None:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
            _nvml_initialized = False


def get_gpu_count() -> tuple[int, list[int]]:
    """
    Get the number of GPUs visible to this process and their physical indices.
    Respects the CUDA_VISIBLE_DEVICES environment variable.

    Returns:
        Tuple of (count, physical_indices) where:
        - count: Number of visible GPUs
        - physical_indices: List mapping each logical index to a physical GPU index.
          e.g., with CUDA_VISIBLE_DEVICES=2,3 this returns (2, [2, 3]),
          meaning logical GPU 0 is physical GPU 2 and logical GPU 1 is physical GPU 3.
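
    Example:
        ```python
        count, physical = get_gpu_count()
        for logical, phys in enumerate(physical):
            print(f"CUDA device {logical} -> physical GPU {phys}")
        ```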
    """
    if not _init_nvml():
        return 0, []

    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cuda_visible is not None and cuda_visible.strip():
        try:
            indices = [int(x.strip()) for x in cuda_visible.split(",") if x.strip()]
            return len(indices), indices
        except ValueError:
            pass

    try:
        total = pynvml.nvmlDeviceGetCount()
        return total, list(range(total))
    except Exception:
        return 0, []


def gpu_available() -> bool:
    """
    Check if GPU monitoring is available.

    Returns True if nvidia-ml-py is installed and at least one NVIDIA GPU is detected.
    This is used for auto-detection of GPU logging.
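
    Example:
        ```python
        if gpu_available():
            metrics = collect_gpu_metrics()
        ```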
    """
    try:
        _ensure_pynvml()
        count, _ = get_gpu_count()
        return count > 0
    except ImportError:
        return False
    except Exception:
        return False


def reset_energy_baseline():
    """Reset the energy baseline for all GPUs. Called when a new run starts."""
    global _energy_baseline
    _energy_baseline = {}


def collect_gpu_metrics(device: int | None = None) -> dict:
    """
    Collect GPU metrics for visible GPUs.

    Args:
        device: CUDA device index to collect metrics from. If None, collects
                from all GPUs visible to this process (respects CUDA_VISIBLE_DEVICES).
                The device index is the logical CUDA index (0, 1, 2...), not the
                physical GPU index.

    Returns:
        Dictionary of GPU metrics. Keys use logical device indices (gpu/0/, gpu/1/, etc.),
        which correspond to CUDA device indices, not physical GPU indices.
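
    Example:
        ```python
        metrics = collect_gpu_metrics()          # all visible GPUs
        metrics = collect_gpu_metrics(device=0)  # only CUDA device 0
        # Keys look like "gpu/0/utilization", "gpu/0/allocated_memory", ...
        ```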
    """
    if not _init_nvml():
        return {}

    gpu_count, visible_gpus = get_gpu_count()
    if gpu_count == 0:
        return {}

    if device is not None:
        if device < 0 or device >= gpu_count:
            return {}
        gpu_indices = [(device, visible_gpus[device])]
    else:
        gpu_indices = list(enumerate(visible_gpus))

    metrics = {}
    total_util = 0.0
    total_mem_used_gib = 0.0
    total_power = 0.0
    max_temp = 0.0
    valid_util_count = 0

    for logical_idx, physical_idx in gpu_indices:
        prefix = f"gpu/{logical_idx}"
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_idx)

            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                metrics[f"{prefix}/utilization"] = util.gpu
                metrics[f"{prefix}/memory_utilization"] = util.memory
                total_util += util.gpu
                valid_util_count += 1
            except Exception:
                pass

            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                mem_used_gib = mem.used / (1024**3)
                mem_total_gib = mem.total / (1024**3)
                metrics[f"{prefix}/allocated_memory"] = mem_used_gib
                metrics[f"{prefix}/total_memory"] = mem_total_gib
                if mem.total > 0:
                    metrics[f"{prefix}/memory_usage"] = mem.used / mem.total
                total_mem_used_gib += mem_used_gib
            except Exception:
                pass

            try:
                power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
                power_w = power_mw / 1000.0
                metrics[f"{prefix}/power"] = power_w
                total_power += power_w
            except Exception:
                pass

            try:
                power_limit_mw = pynvml.nvmlDeviceGetPowerManagementLimit(handle)
                power_limit_w = power_limit_mw / 1000.0
                metrics[f"{prefix}/power_limit"] = power_limit_w
                if power_limit_w > 0 and f"{prefix}/power" in metrics:
                    metrics[f"{prefix}/power_percent"] = (
                        metrics[f"{prefix}/power"] / power_limit_w
                    ) * 100
            except Exception:
                pass

            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU
                )
                metrics[f"{prefix}/temp"] = temp
                max_temp = max(max_temp, temp)
            except Exception:
                pass

            try:
                sm_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                metrics[f"{prefix}/sm_clock"] = sm_clock
            except Exception:
                pass

            try:
                mem_clock = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM)
                metrics[f"{prefix}/memory_clock"] = mem_clock
            except Exception:
                pass

            try:
                fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
                metrics[f"{prefix}/fan_speed"] = fan_speed
            except Exception:
                pass

            try:
                pstate = pynvml.nvmlDeviceGetPerformanceState(handle)
                metrics[f"{prefix}/performance_state"] = pstate
            except Exception:
                pass

            try:
                # NVML reports cumulative energy in millijoules; log the joules
                # consumed since the baseline captured when the run started.
                energy_mj = pynvml.nvmlDeviceGetTotalEnergyConsumption(handle)
                if logical_idx not in _energy_baseline:
                    _energy_baseline[logical_idx] = energy_mj
                energy_consumed_mj = energy_mj - _energy_baseline[logical_idx]
                metrics[f"{prefix}/energy_consumed"] = energy_consumed_mj / 1000.0
            except Exception:
                pass

            try:
                # NVML reports PCIe throughput in KB/s; convert to MB/s.
                pcie_tx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_TX_BYTES
                )
                pcie_rx = pynvml.nvmlDeviceGetPcieThroughput(
                    handle, pynvml.NVML_PCIE_UTIL_RX_BYTES
                )
                metrics[f"{prefix}/pcie_tx"] = pcie_tx / 1024.0
                metrics[f"{prefix}/pcie_rx"] = pcie_rx / 1024.0
            except Exception:
                pass

            try:
                throttle = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
                metrics[f"{prefix}/throttle_thermal"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonSwThermalSlowdown)
                )
                metrics[f"{prefix}/throttle_power"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonSwPowerCap)
                )
                metrics[f"{prefix}/throttle_hw_slowdown"] = int(
                    bool(throttle & pynvml.nvmlClocksThrottleReasonHwSlowdown)
                )
                metrics[f"{prefix}/throttle_apps"] = int(
                    bool(
                        throttle
                        & pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting
                    )
                )
            except Exception:
                pass

            try:
                ecc_corrected = pynvml.nvmlDeviceGetTotalEccErrors(
                    handle,
                    pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED,
                    pynvml.NVML_VOLATILE_ECC,
                )
                metrics[f"{prefix}/corrected_memory_errors"] = ecc_corrected
            except Exception:
                pass

            try:
                ecc_uncorrected = pynvml.nvmlDeviceGetTotalEccErrors(
                    handle,
                    pynvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
                    pynvml.NVML_VOLATILE_ECC,
                )
                metrics[f"{prefix}/uncorrected_memory_errors"] = ecc_uncorrected
            except Exception:
                pass

        except Exception:
            continue

    if valid_util_count > 0:
        metrics["gpu/mean_utilization"] = total_util / valid_util_count
    if total_mem_used_gib > 0:
        metrics["gpu/total_allocated_memory"] = total_mem_used_gib
    if total_power > 0:
        metrics["gpu/total_power"] = total_power
    if max_temp > 0:
        metrics["gpu/max_temp"] = max_temp

    return metrics


class GpuMonitor:
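    """
    Background thread that periodically collects GPU metrics and logs them
    to a run via `Run.log_system`.

    Example (sketch; assumes `run` was returned by `trackio.init()`):
        ```python
        monitor = GpuMonitor(run, interval=10.0)
        monitor.start()
        # ... training loop ...
        monitor.stop()
        ```
    """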
    def __init__(self, run: "Run", interval: float = 10.0):
        self._run = run
        self._interval = interval
        self._stop_flag = threading.Event()
        self._thread: "threading.Thread | None" = None

    def start(self):
        count, _ = get_gpu_count()
        if count == 0:
            warnings.warn(
                "auto_log_gpu=True but no NVIDIA GPUs detected. GPU logging disabled."
            )
            return

        reset_energy_baseline()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        self._stop_flag.set()
        if self._thread is not None:
            self._thread.join(timeout=2.0)

    def _monitor_loop(self):
        while not self._stop_flag.is_set():
            try:
                metrics = collect_gpu_metrics()
                if metrics:
                    self._run.log_system(metrics)
            except Exception:
                pass

            self._stop_flag.wait(timeout=self._interval)


def log_gpu(run: "Run | None" = None, device: int | None = None) -> dict:
    """
    Log GPU metrics to the current or specified run as system metrics.

    Args:
        run: Optional Run instance. If None, uses current run from context.
        device: CUDA device index to collect metrics from. If None, collects
                from all GPUs visible to this process (respects CUDA_VISIBLE_DEVICES).

    Returns:
        dict: The GPU metrics that were logged.

    Example:
        ```python
        import trackio

        run = trackio.init(project="my-project")
        trackio.log({"loss": 0.5})
        trackio.log_gpu()  # logs all visible GPUs
        trackio.log_gpu(device=0)  # logs only CUDA device 0
        ```
    """
    from trackio import context_vars

    if run is None:
        run = context_vars.current_run.get()
        if run is None:
            raise RuntimeError("Call trackio.init() before trackio.log_gpu().")

    metrics = collect_gpu_metrics(device=device)
    if metrics:
        run.log_system(metrics)
    return metrics