| |
| """Shared RunPod GPU runtime preflight.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import subprocess |
| import sys |
| from typing import Any |
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the preflight script.

    Args:
        argv: Argument list to parse. ``None`` lets ``argparse`` fall back to
            ``sys.argv[1:]``.

    Returns:
        Namespace with a single attribute, ``context`` (defaults to
        ``"RunPod"``), the label used in error messages.
    """
    cli = argparse.ArgumentParser(description="Validate RunPod GPU runtime.")
    context_option = {
        "default": "RunPod",
        "help": "Short label printed in error messages.",
    }
    cli.add_argument("--context", **context_option)
    namespace = cli.parse_args(argv)
    return namespace
|
|
|
|
def detect_gpu_visibility() -> bool:
    """Report whether ``nvidia-smi`` lists at least one GPU.

    Returns:
        ``True`` only when ``nvidia-smi`` exits with status 0 and prints a
        non-blank GPU name list; ``False`` otherwise, including when the
        command cannot be run or does not finish within 10 seconds.
    """
    command = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=False,
            timeout=10,
        )
    except Exception:
        # Best-effort probe: any failure to run the tool counts as "not visible".
        return False

    if completed.returncode != 0:
        return False
    return completed.stdout.strip() != ""
|
|
|
|
def probe_torch(torch_module: Any | None = None) -> dict[str, Any]:
    """Collect PyTorch/CUDA facts and run a tiny on-device smoke test.

    Only device index 0 is inspected.

    Args:
        torch_module: Object to use in place of ``torch`` (useful for tests).
            When ``None``, ``torch`` is imported on demand.

    Returns:
        A dict with the keys ``torch_version``, ``cuda_version``,
        ``cuda_available``, ``device_count``, ``device_name``,
        ``total_memory_gb``, ``capability_tag``, ``supported_arches`` and
        ``smoke_error``. The device fields keep their ``None``/``[]``
        defaults when CUDA is unavailable or no device is reported.
        ``smoke_error`` is ``None`` on success and a non-empty string when
        querying the device or running the smoke test raised.
    """
    torch = torch_module
    if torch is None:
        import torch  # imported lazily, only when no module was injected

    cuda_available = bool(torch.cuda.is_available())
    device_count = int(torch.cuda.device_count())
    probe: dict[str, Any] = {
        "torch_version": str(torch.__version__),
        "cuda_version": getattr(torch.version, "cuda", None),
        "cuda_available": cuda_available,
        "device_count": device_count,
        "device_name": None,
        "total_memory_gb": None,
        "capability_tag": None,
        "supported_arches": [],
        "smoke_error": None,
    }

    if not cuda_available or device_count <= 0:
        return probe

    # The arch list is optional information; failing to read it is not fatal.
    supported_arches: list[str] = []
    if hasattr(torch.cuda, "get_arch_list"):
        try:
            supported_arches = list(torch.cuda.get_arch_list())
        except Exception:
            supported_arches = []
    probe["supported_arches"] = supported_arches

    # Device queries can raise even though is_available() returned True, so
    # report that through smoke_error instead of crashing the preflight.
    try:
        major, minor = torch.cuda.get_device_capability(0)
        props = torch.cuda.get_device_properties(0)
        device_name = torch.cuda.get_device_name(0)
    except Exception as exc:
        probe["smoke_error"] = f"device query failed: {str(exc) or repr(exc)}"
        return probe

    probe.update(
        {
            "device_name": device_name,
            "total_memory_gb": round(props.total_memory / 1e9, 1),
            "capability_tag": f"sm_{major}{minor}",
        }
    )

    try:
        sample = torch.tensor([1.0], device="cuda")
        sample = sample + 1
        _ = float(sample.sum().item())
        torch.cuda.synchronize()
    except Exception as exc:
        # str(exc) is "" for message-less exceptions; an empty string would be
        # falsy and make the failure look like a pass to callers.
        probe["smoke_error"] = str(exc) or repr(exc)

    return probe
|
|
|
|
def _sm_version(tag: str) -> tuple[int, int] | None:
    """Parse a plain ``sm_XY`` tag into ``(major, minor)``; ``None`` otherwise."""
    prefix, sep, digits = tag.partition("_")
    if prefix != "sm" or not sep or len(digits) < 2:
        return None
    if not (digits.isascii() and digits.isdigit()):
        # Suffixed (e.g. ``sm_90a``) or malformed tags only match exactly.
        return None
    # The last digit is the minor revision; the rest is the major version.
    return int(digits[:-1]), int(digits[-1])


def _capability_supported(capability_tag: str, supported_arches: list[str]) -> bool:
    """Tell whether a build's arch list contains code runnable on the device."""
    if capability_tag in supported_arches:
        return True
    device = _sm_version(capability_tag)
    if device is None:
        return False
    device_major, device_minor = device
    # Binary code built for X.y runs on devices X.z with z >= y, so an exact
    # string match is stricter than necessary (e.g. sm_86 code runs on sm_89).
    return any(
        built is not None and built[0] == device_major and built[1] <= device_minor
        for built in map(_sm_version, supported_arches)
    )


def evaluate_runtime(*, gpu_visible: bool, probe: dict[str, Any]) -> tuple[bool, str | None]:
    """Decide whether the probed GPU runtime is usable.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Dict providing the keys ``cuda_available``, ``capability_tag``,
            ``supported_arches`` and ``smoke_error``.

    Returns:
        ``(True, None)`` when no problem was found, otherwise
        ``(False, message)`` for the first failing check: GPU visible but
        CUDA unavailable, device capability not covered by the build's arch
        list, or a recorded smoke-test error.
    """
    if gpu_visible and not probe["cuda_available"]:
        return False, "GPU is visible to nvidia-smi but PyTorch CUDA is unavailable"

    # The arch check only runs when both sides of the comparison are known.
    if probe["cuda_available"] and probe["capability_tag"] and probe["supported_arches"]:
        if not _capability_supported(probe["capability_tag"], probe["supported_arches"]):
            supported = " ".join(probe["supported_arches"])
            return (
                False,
                f"GPU capability {probe['capability_tag']} is not supported by this PyTorch build "
                f"(supported: {supported})",
            )

    if probe["smoke_error"]:
        return False, f"CUDA smoke test failed: {probe['smoke_error']}"

    return True, None
|
|
|
|
def print_probe(*, gpu_visible: bool, probe: dict[str, Any]) -> None:
    """Print a human-readable summary of the probe to standard output.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Dict providing the keys ``torch_version``, ``cuda_version``,
            ``cuda_available``, ``device_count``, ``device_name``,
            ``total_memory_gb``, ``capability_tag``, ``supported_arches`` and
            ``smoke_error``. The GPU, arch-list and smoke-error lines are
            printed only when their values are truthy.
    """
    always_shown = (
        ("torch", "torch_version"),
        ("torch.version.cuda", "cuda_version"),
        ("CUDA available", "cuda_available"),
        ("device_count", "device_count"),
    )
    for label, key in always_shown:
        print(f" {label}: {probe[key]}")
    print(f" nvidia-smi GPU visible: {gpu_visible}")

    name = probe["device_name"]
    if name:
        vram = probe["total_memory_gb"]
        capability = probe["capability_tag"]
        print(f" GPU: {name}, VRAM: {vram} GB, capability: {capability}")

    arches = probe["supported_arches"]
    if arches:
        joined = " ".join(arches)
        print(f" PyTorch CUDA arch list: {joined}")

    failure = probe["smoke_error"]
    if failure:
        print(f" CUDA smoke error: {failure}")
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Run the preflight: probe, print the report, and evaluate it.

    Args:
        argv: Command-line arguments forwarded to ``parse_args``.

    Returns:
        ``0`` when ``evaluate_runtime`` accepts the runtime, ``1`` after
        printing the error message otherwise.
    """
    options = parse_args(argv)
    visible = detect_gpu_visibility()
    report = probe_torch()
    print_probe(gpu_visible=visible, probe=report)

    healthy, reason = evaluate_runtime(gpu_visible=visible, probe=report)
    if healthy:
        return 0

    print(f" ERROR: {reason}")
    print(
        f" {options.context} GPU runtime is not usable with the current PyTorch/CUDA stack. "
        "Use a supported NVIDIA architecture or a newer compatible template."
    )
    return 1
|
|
|
|
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|
|