#!/usr/bin/env python3 """Shared RunPod GPU runtime preflight.""" from __future__ import annotations import argparse import subprocess import sys from typing import Any def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Validate RunPod GPU runtime.") parser.add_argument( "--context", default="RunPod", help="Short label printed in error messages.", ) return parser.parse_args(argv) def detect_gpu_visibility() -> bool: try: result = subprocess.run( ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], capture_output=True, text=True, check=False, timeout=10, ) except Exception: return False return result.returncode == 0 and bool(result.stdout.strip()) def probe_torch(torch_module: Any | None = None) -> dict[str, Any]: torch = torch_module if torch is None: import torch as torch # type: ignore[no-redef] cuda_available = bool(torch.cuda.is_available()) device_count = int(torch.cuda.device_count()) probe: dict[str, Any] = { "torch_version": str(torch.__version__), "cuda_version": getattr(torch.version, "cuda", None), "cuda_available": cuda_available, "device_count": device_count, "device_name": None, "total_memory_gb": None, "capability_tag": None, "supported_arches": [], "smoke_error": None, } if not cuda_available or device_count <= 0: return probe major, minor = torch.cuda.get_device_capability(0) props = torch.cuda.get_device_properties(0) supported_arches = [] if hasattr(torch.cuda, "get_arch_list"): try: supported_arches = list(torch.cuda.get_arch_list()) except Exception: supported_arches = [] probe.update( { "device_name": torch.cuda.get_device_name(0), "total_memory_gb": round(props.total_memory / 1e9, 1), "capability_tag": f"sm_{major}{minor}", "supported_arches": supported_arches, } ) try: sample = torch.tensor([1.0], device="cuda") sample = sample + 1 _ = float(sample.sum().item()) torch.cuda.synchronize() except Exception as exc: # pragma: no cover - exercised via runtime probe["smoke_error"] = str(exc) return probe def evaluate_runtime(*, gpu_visible: bool, probe: dict[str, Any]) -> tuple[bool, str | None]: if gpu_visible and not probe["cuda_available"]: return False, "GPU is visible to nvidia-smi but PyTorch CUDA is unavailable" if probe["cuda_available"] and probe["capability_tag"] and probe["supported_arches"]: if probe["capability_tag"] not in probe["supported_arches"]: supported = " ".join(probe["supported_arches"]) return ( False, f"GPU capability {probe['capability_tag']} is not supported by this PyTorch build " f"(supported: {supported})", ) if probe["smoke_error"]: return False, f"CUDA smoke test failed: {probe['smoke_error']}" return True, None def print_probe(*, gpu_visible: bool, probe: dict[str, Any]) -> None: print(f" torch: {probe['torch_version']}") print(f" torch.version.cuda: {probe['cuda_version']}") print(f" CUDA available: {probe['cuda_available']}") print(f" device_count: {probe['device_count']}") print(f" nvidia-smi GPU visible: {gpu_visible}") if probe["device_name"]: print( " GPU: " f"{probe['device_name']}, VRAM: {probe['total_memory_gb']} GB, capability: {probe['capability_tag']}" ) if probe["supported_arches"]: print(f" PyTorch CUDA arch list: {' '.join(probe['supported_arches'])}") if probe["smoke_error"]: print(f" CUDA smoke error: {probe['smoke_error']}") def main(argv: list[str] | None = None) -> int: args = parse_args(argv) gpu_visible = detect_gpu_visibility() probe = probe_torch() print_probe(gpu_visible=gpu_visible, probe=probe) ok, message = evaluate_runtime(gpu_visible=gpu_visible, probe=probe) if not ok: print(f" ERROR: {message}") print( f" {args.context} GPU runtime is not usable with the current PyTorch/CUDA stack. " "Use a supported NVIDIA architecture or a newer compatible template." ) return 1 return 0 if __name__ == "__main__": raise SystemExit(main())