jw-search / deploy /runpod /check_gpu_runtime.py
G Davies
Deploy JW Search Docker Space
722bda8 verified
#!/usr/bin/env python3
"""Shared RunPod GPU runtime preflight."""
from __future__ import annotations
import argparse
import subprocess
import sys
from typing import Any
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the preflight script's command-line options.

    Args:
        argv: Argument list to parse. ``None`` lets ``argparse`` fall back to
            ``sys.argv[1:]``.

    Returns:
        Namespace with a single ``context`` attribute (default ``"RunPod"``).
    """
    cli = argparse.ArgumentParser(description="Validate RunPod GPU runtime.")
    # The label is only used to prefix the failure hint printed by the caller.
    context_option = {
        "default": "RunPod",
        "help": "Short label printed in error messages.",
    }
    cli.add_argument("--context", **context_option)
    namespace = cli.parse_args(argv)
    return namespace
def detect_gpu_visibility() -> bool:
    """Report whether ``nvidia-smi`` lists at least one GPU.

    Returns:
        ``True`` only when ``nvidia-smi`` exits with status 0 and prints a
        non-blank GPU name list. Any failure to run the command (missing
        binary, timeout, other error) is treated as "not visible".
    """
    query = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
    try:
        completed = subprocess.run(
            query,
            capture_output=True,
            text=True,
            check=False,
            timeout=10,
        )
    except Exception:
        # Best-effort probe: an unusable nvidia-smi simply means "no GPU seen".
        return False

    if completed.returncode != 0:
        return False
    listed_names = completed.stdout.strip()
    return bool(listed_names)
def probe_torch(torch_module: Any | None = None) -> dict[str, Any]:
    """Collect PyTorch/CUDA facts about device 0 and run a tiny CUDA smoke test.

    Args:
        torch_module: Object to use in place of the ``torch`` package. When
            ``None`` the real ``torch`` is imported lazily inside the function.

    Returns:
        Dict with the keys ``torch_version``, ``cuda_version``,
        ``cuda_available``, ``device_count``, ``device_name``,
        ``total_memory_gb``, ``capability_tag``, ``supported_arches`` and
        ``smoke_error``. The device-specific entries keep their empty defaults
        when CUDA is unavailable or no device is reported.
    """
    th = torch_module
    if th is None:
        import torch as th  # type: ignore[no-redef]

    has_cuda = bool(th.cuda.is_available())
    n_devices = int(th.cuda.device_count())

    report: dict[str, Any] = {
        "torch_version": str(th.__version__),
        "cuda_version": getattr(th.version, "cuda", None),
        "cuda_available": has_cuda,
        "device_count": n_devices,
        "device_name": None,
        "total_memory_gb": None,
        "capability_tag": None,
        "supported_arches": [],
        "smoke_error": None,
    }
    if not (has_cuda and n_devices > 0):
        # Nothing device-specific to inspect; return the defaults.
        return report

    cc_major, cc_minor = th.cuda.get_device_capability(0)
    device_props = th.cuda.get_device_properties(0)

    # get_arch_list may be missing or may fail; an empty list then stands in.
    arch_list: list[Any] = []
    list_arches = getattr(th.cuda, "get_arch_list", None)
    if list_arches is not None:
        try:
            arch_list = list(list_arches())
        except Exception:
            arch_list = []

    report["device_name"] = th.cuda.get_device_name(0)
    report["total_memory_gb"] = round(device_props.total_memory / 1e9, 1)
    report["capability_tag"] = f"sm_{cc_major}{cc_minor}"
    report["supported_arches"] = arch_list

    # Smoke test: allocate on the GPU, run one op, read the value back and
    # synchronize. Any failure is recorded rather than raised.
    try:
        scratch = th.tensor([1.0], device="cuda")
        scratch = scratch + 1
        float(scratch.sum().item())
        th.cuda.synchronize()
    except Exception as err:  # pragma: no cover - exercised via runtime
        report["smoke_error"] = str(err)

    return report
def _split_arch(tag: Any) -> tuple[str, int, int, str] | None:
    """Parse ``sm_89`` / ``compute_90`` / ``sm_90a`` into (kind, major, minor, suffix)."""
    if not isinstance(tag, str):
        return None
    kind, sep, rest = tag.partition("_")
    if not sep or kind not in ("sm", "compute"):
        return None
    digits = rest.rstrip("abcdefghijklmnopqrstuvwxyz")
    suffix = rest[len(digits):]
    # The last digit is the minor version; everything before it is the major
    # (so "120" -> 12.0). At least two digits are required.
    if len(digits) < 2 or not (digits.isascii() and digits.isdigit()):
        return None
    return kind, int(digits[:-1]), int(digits[-1]), suffix


def _arch_covers_device(capability_tag: str, supported_arches: list[str]) -> bool:
    """Return True when some build target in the list can serve the device."""
    if capability_tag in supported_arches:
        return True
    device = _split_arch(capability_tag)
    if device is None:
        # Unknown tag format: keep the strict exact-match verdict.
        return False
    _, dev_major, dev_minor, _ = device
    for entry in supported_arches:
        parsed = _split_arch(entry)
        if parsed is None:
            continue
        kind, major, minor, suffix = parsed
        if suffix:
            # Suffixed targets (e.g. sm_90a) are treated as matching only the
            # exact same capability number.
            if (major, minor) == (dev_major, dev_minor):
                return True
            continue
        # Per NVIDIA's compatibility rules, a cubin runs on devices of the
        # same major version with an equal or higher minor version.
        if kind == "sm" and major == dev_major and minor <= dev_minor:
            return True
        # PTX can be JIT-compiled for any equal or newer capability; the smoke
        # test recorded in the probe still has the final say.
        if kind == "compute" and (major, minor) <= (dev_major, dev_minor):
            return True
    return False


def evaluate_runtime(*, gpu_visible: bool, probe: dict[str, Any]) -> tuple[bool, str | None]:
    """Decide whether the probed GPU runtime is usable.

    Checks, in order: CUDA availability when ``nvidia-smi`` sees a GPU,
    architecture coverage of the PyTorch build, and the CUDA smoke-test result.

    The architecture check is compatibility-aware rather than an exact string
    match: a device such as ``sm_89`` is accepted on a build that lists
    ``sm_86`` (same major, lower minor) or carries suitable ``compute_XY`` PTX.
    Exact membership alone wrongly rejected such devices.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Dict providing ``cuda_available``, ``capability_tag``,
            ``supported_arches`` and ``smoke_error``.

    Returns:
        ``(True, None)`` when no problem is found, otherwise ``(False, reason)``.
        Note that a host with neither a visible GPU nor CUDA yields
        ``(True, None)``, as it did before this change.
    """
    if gpu_visible and not probe["cuda_available"]:
        return False, "GPU is visible to nvidia-smi but PyTorch CUDA is unavailable"

    capability_tag = probe["capability_tag"]
    supported_arches = probe["supported_arches"]
    if probe["cuda_available"] and capability_tag and supported_arches:
        if not _arch_covers_device(capability_tag, supported_arches):
            supported = " ".join(supported_arches)
            return (
                False,
                f"GPU capability {capability_tag} is not supported by this PyTorch build "
                f"(supported: {supported})",
            )

    if probe["smoke_error"]:
        return False, f"CUDA smoke test failed: {probe['smoke_error']}"
    return True, None
def print_probe(*, gpu_visible: bool, probe: dict[str, Any]) -> None:
    """Print a human-readable summary of the probe to stdout.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Dict with the keys produced by the torch probe. The GPU, arch
            list and smoke-error lines are printed only when their values are
            truthy.
    """
    # Always-present header lines.
    print(f" torch: {probe['torch_version']}")
    print(f" torch.version.cuda: {probe['cuda_version']}")
    print(f" CUDA available: {probe['cuda_available']}")
    print(f" device_count: {probe['device_count']}")
    print(f" nvidia-smi GPU visible: {gpu_visible}")

    # Optional device details.
    if probe["device_name"]:
        gpu_line = (
            " GPU: "
            f"{probe['device_name']}, VRAM: {probe['total_memory_gb']} GB, capability: {probe['capability_tag']}"
        )
        print(gpu_line)

    arches = probe["supported_arches"]
    if arches:
        print(f" PyTorch CUDA arch list: {' '.join(arches)}")

    if probe["smoke_error"]:
        print(f" CUDA smoke error: {probe['smoke_error']}")
def main(argv: list[str] | None = None) -> int:
    """Run the preflight: probe, print the summary, and report the verdict.

    Args:
        argv: Command-line arguments; ``None`` defers to ``sys.argv``.

    Returns:
        ``0`` when the runtime evaluates as usable, ``1`` otherwise (after
        printing the error and a remediation hint).
    """
    options = parse_args(argv)

    visible = detect_gpu_visibility()
    report = probe_torch()
    print_probe(gpu_visible=visible, probe=report)

    usable, reason = evaluate_runtime(gpu_visible=visible, probe=report)
    if usable:
        return 0

    print(f" ERROR: {reason}")
    hint = (
        f" {options.context} GPU runtime is not usable with the current PyTorch/CUDA stack. "
        "Use a supported NVIDIA architecture or a newer compatible template."
    )
    print(hint)
    return 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())