Spaces:

jw-tools
/

jw-search

Running on CPU Upgrade

jw-search/deploy/runpod/check_gpu_runtime.py

G Davies

Deploy JW Search Docker Space

722bda8 verified 9 days ago

4.61 kB

	#!/usr/bin/env python3
	"""Shared RunPod GPU runtime preflight."""

	from __future__ import annotations

	import argparse
	import subprocess
	import sys
	from typing import Any


	def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
	    """Parse the command-line options of the GPU runtime preflight.

	    Args:
	        argv: Argument strings to parse. ``None`` lets argparse fall back
	            to the process arguments.

	    Returns:
	        Namespace with a single ``context`` attribute (default
	        ``"RunPod"``), a short label printed in error messages.
	    """
	    parser = argparse.ArgumentParser(description="Validate RunPod GPU runtime.")
	    parser.add_argument(
	        "--context",
	        default="RunPod",
	        help="Short label printed in error messages.",
	    )
	    return parser.parse_args(argv)


	def detect_gpu_visibility() -> bool:
	    """Report whether ``nvidia-smi`` can list at least one GPU.

	    This is a best-effort check: any failure to run the tool is treated as
	    "no GPU visible" rather than raised.

	    Returns:
	        ``True`` only when ``nvidia-smi`` exits with status 0 and prints a
	        non-blank GPU name list; ``False`` otherwise.
	    """
	    try:
	        result = subprocess.run(
	            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
	            capture_output=True,
	            text=True,
	            check=False,
	            timeout=10,
	        )
	    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
	        # OSError: binary missing or not executable.
	        # SubprocessError: includes TimeoutExpired after the 10 s limit.
	        # UnicodeDecodeError: output could not be decoded in text mode.
	        return False
	    return result.returncode == 0 and bool(result.stdout.strip())


	def probe_torch(torch_module: Any | None = None) -> dict[str, Any]:
	    """Collect PyTorch/CUDA facts for device 0 and run a tiny CUDA smoke test.

	    Args:
	        torch_module: Object to use in place of the ``torch`` package (for
	            example a stub). When ``None``, ``torch`` is imported here, so
	            an ``ImportError`` propagates if it is not installed.

	    Returns:
	        Dict with the keys ``torch_version``, ``cuda_version``,
	        ``cuda_available``, ``device_count``, ``device_name``,
	        ``total_memory_gb``, ``capability_tag``, ``supported_arches`` and
	        ``smoke_error``. The device fields keep their empty defaults
	        (``None`` / ``[]``) when CUDA is unavailable or no device is
	        reported. ``smoke_error`` holds the exception text if the
	        on-device tensor operation failed, otherwise ``None``.
	    """
	    torch = torch_module
	    if torch is None:
	        import torch  # type: ignore[no-redef]

	    cuda_available = bool(torch.cuda.is_available())
	    device_count = int(torch.cuda.device_count())
	    probe: dict[str, Any] = {
	        "torch_version": str(torch.__version__),
	        "cuda_version": getattr(torch.version, "cuda", None),
	        "cuda_available": cuda_available,
	        "device_count": device_count,
	        "device_name": None,
	        "total_memory_gb": None,
	        "capability_tag": None,
	        "supported_arches": [],
	        "smoke_error": None,
	    }

	    # Nothing device-specific can be queried without a usable CUDA device.
	    if not cuda_available or device_count <= 0:
	        return probe

	    major, minor = torch.cuda.get_device_capability(0)
	    props = torch.cuda.get_device_properties(0)

	    supported_arches: list[str] = []
	    if hasattr(torch.cuda, "get_arch_list"):
	        # The arch list is informational; a failure to read it must not
	        # abort the probe.
	        try:
	            supported_arches = list(torch.cuda.get_arch_list())
	        except Exception:
	            supported_arches = []

	    probe.update(
	        {
	            "device_name": torch.cuda.get_device_name(0),
	            "total_memory_gb": round(props.total_memory / 1e9, 1),
	            "capability_tag": f"sm_{major}{minor}",
	            "supported_arches": supported_arches,
	        }
	    )

	    # Smoke test: allocate on the device, run one kernel, read the result
	    # back and synchronize. Any failure is recorded, not raised.
	    try:
	        sample = torch.tensor([1.0], device="cuda")
	        sample = sample + 1
	        _ = float(sample.sum().item())
	        torch.cuda.synchronize()
	    except Exception as exc:  # pragma: no cover - exercised via runtime
	        probe["smoke_error"] = str(exc)

	    return probe


	def _parse_sm_tag(tag: Any) -> tuple[int, int] | None:
	    """Split a plain ``sm_XY`` tag into ``(major, minor)``, else ``None``."""
	    if not isinstance(tag, str) or not tag.startswith("sm_"):
	        return None
	    digits = tag[3:]
	    # Suffixed variants such as "sm_90a" are deliberately not parsed, so
	    # they can only ever match by exact string equality.
	    if len(digits) < 2 or not (digits.isascii() and digits.isdigit()):
	        return None
	    return int(digits[:-1]), int(digits[-1])


	def _arch_is_supported(capability_tag: str, supported_arches: list[str]) -> bool:
	    """Return whether a build with these arches has binaries for the device."""
	    if capability_tag in supported_arches:
	        return True
	    device = _parse_sm_tag(capability_tag)
	    if device is None:
	        return False
	    device_major, device_minor = device
	    # CUDA binaries are compatible within a major revision: code built for
	    # sm_XY runs on a device sm_XZ when Z >= Y (e.g. sm_86 code on sm_89).
	    return any(
	        built is not None and built[0] == device_major and built[1] <= device_minor
	        for built in map(_parse_sm_tag, supported_arches)
	    )


	def evaluate_runtime(*, gpu_visible: bool, probe: dict[str, Any]) -> tuple[bool, str | None]:
	    """Decide whether the probed runtime is usable.

	    Checks run in this order and the first failure wins:

	    1. ``nvidia-smi`` sees a GPU but PyTorch reports CUDA unavailable.
	    2. The device capability is not covered by the PyTorch build's arch
	       list. This check is skipped when either value is missing or empty.
	       An arch covers the device on an exact match, or when it is a plain
	       ``sm_`` entry of the same major revision with a minor revision not
	       greater than the device's.
	    3. The CUDA smoke test recorded an error.

	    Args:
	        gpu_visible: Whether ``nvidia-smi`` listed a GPU.
	        probe: Mapping with at least ``cuda_available``,
	            ``capability_tag``, ``supported_arches`` and ``smoke_error``.

	    Returns:
	        ``(True, None)`` when usable, otherwise ``(False, reason)``.
	    """
	    if gpu_visible and not probe["cuda_available"]:
	        return False, "GPU is visible to nvidia-smi but PyTorch CUDA is unavailable"

	    capability_tag = probe["capability_tag"]
	    supported_arches = probe["supported_arches"]
	    if probe["cuda_available"] and capability_tag and supported_arches:
	        if not _arch_is_supported(capability_tag, supported_arches):
	            supported = " ".join(supported_arches)
	            return (
	                False,
	                f"GPU capability {capability_tag} is not supported by this PyTorch build "
	                f"(supported: {supported})",
	            )

	    if probe["smoke_error"]:
	        return False, f"CUDA smoke test failed: {probe['smoke_error']}"

	    return True, None


	def print_probe(*, gpu_visible: bool, probe: dict[str, Any]) -> None:
	    """Print a human-readable summary of the runtime probe to stdout.

	    The version, availability and device-count lines are always printed.
	    The GPU, arch-list and smoke-error lines appear only when the
	    corresponding probe value is truthy.

	    Args:
	        gpu_visible: Whether ``nvidia-smi`` listed a GPU.
	        probe: Mapping holding the keys read below.
	    """
	    always_shown = (
	        (" torch: ", "torch_version"),
	        (" torch.version.cuda: ", "cuda_version"),
	        (" CUDA available: ", "cuda_available"),
	        (" device_count: ", "device_count"),
	    )
	    for prefix, key in always_shown:
	        print(f"{prefix}{probe[key]}")
	    print(f" nvidia-smi GPU visible: {gpu_visible}")

	    name = probe["device_name"]
	    if name:
	        vram = probe["total_memory_gb"]
	        tag = probe["capability_tag"]
	        print(" GPU: " + f"{name}, VRAM: {vram} GB, capability: {tag}")

	    arches = probe["supported_arches"]
	    if arches:
	        print(" PyTorch CUDA arch list: " + " ".join(arches))

	    error = probe["smoke_error"]
	    if error:
	        print(f" CUDA smoke error: {error}")


	def main(argv: list[str] | None = None) -> int:
	    """Run the GPU runtime preflight and return a process exit status.

	    Probes ``nvidia-smi`` and PyTorch, prints the findings, then evaluates
	    them. On failure the reason and a remediation hint, labelled with the
	    ``--context`` value, are printed as well.

	    Args:
	        argv: Command-line arguments; ``None`` defers to argparse's default.

	    Returns:
	        ``0`` when the runtime is usable, ``1`` otherwise.
	    """
	    args = parse_args(argv)
	    gpu_visible = detect_gpu_visibility()
	    probe = probe_torch()
	    print_probe(gpu_visible=gpu_visible, probe=probe)

	    ok, message = evaluate_runtime(gpu_visible=gpu_visible, probe=probe)
	    if ok:
	        return 0

	    print(f" ERROR: {message}")
	    print(
	        f" {args.context} GPU runtime is not usable with the current PyTorch/CUDA stack. "
	        "Use a supported NVIDIA architecture or a newer compatible template."
	    )
	    return 1


	if __name__ == "__main__":
	    # Hand main()'s status code to the interpreter as the exit code.
	    sys.exit(main())