Spaces:

jw-tools
/

jw-search

Running on CPU Upgrade

File size: 4,614 Bytes

722bda8

#!/usr/bin/env python3
"""Shared RunPod GPU runtime preflight."""

from __future__ import annotations

import argparse
import subprocess
import sys
from typing import Any


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the preflight command-line options.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse fall back to
            the process arguments.

    Returns:
        Namespace with a single ``context`` attribute (default ``"RunPod"``),
        the label used when reporting an unusable runtime.
    """
    cli = argparse.ArgumentParser(description="Validate RunPod GPU runtime.")
    context_option = {
        "default": "RunPod",
        "help": "Short label printed in error messages.",
    }
    cli.add_argument("--context", **context_option)
    namespace = cli.parse_args(argv)
    return namespace


def detect_gpu_visibility() -> bool:
    """Report whether ``nvidia-smi`` lists at least one GPU.

    Runs ``nvidia-smi --query-gpu=name --format=csv,noheader`` with a
    10-second timeout. Any exception while launching or waiting for the
    command (missing binary, timeout, ...) is treated as "no GPU visible".

    Returns:
        ``True`` only when the command exits with status 0 and prints
        non-whitespace output; ``False`` otherwise.
    """
    query = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
    try:
        completed = subprocess.run(
            query,
            capture_output=True,
            text=True,
            check=False,
            timeout=10,
        )
    except Exception:
        # Best-effort probe: a failure to run the tool just means "not visible".
        return False

    if completed.returncode != 0:
        return False
    listed_gpus = completed.stdout.strip()
    return bool(listed_gpus)


def probe_torch(torch_module: Any | None = None) -> dict[str, Any]:
    """Collect PyTorch/CUDA facts and run a tiny CUDA smoke test.

    Args:
        torch_module: Object to use in place of ``torch``. When ``None`` the
            real ``torch`` package is imported inside the function.

    Returns:
        Dict with the keys ``torch_version``, ``cuda_version``,
        ``cuda_available``, ``device_count``, ``device_name``,
        ``total_memory_gb``, ``capability_tag``, ``supported_arches`` and
        ``smoke_error``. The device-specific entries keep their empty
        defaults (``None`` / ``[]``) when CUDA is unavailable or no device is
        reported; otherwise they describe device 0. ``smoke_error`` holds the
        exception text if the smoke test raised, else ``None``.
    """
    if torch_module is None:
        # Deferred import: only needed when no module is injected.
        import torch as backend  # type: ignore[no-redef]
    else:
        backend = torch_module

    has_cuda = bool(backend.cuda.is_available())
    n_devices = int(backend.cuda.device_count())
    report: dict[str, Any] = dict(
        torch_version=str(backend.__version__),
        cuda_version=getattr(backend.version, "cuda", None),
        cuda_available=has_cuda,
        device_count=n_devices,
        device_name=None,
        total_memory_gb=None,
        capability_tag=None,
        supported_arches=[],
        smoke_error=None,
    )

    if not (has_cuda and n_devices > 0):
        return report

    cc_major, cc_minor = backend.cuda.get_device_capability(0)
    device_props = backend.cuda.get_device_properties(0)

    # The arch list is optional: tolerate a missing or failing accessor.
    arch_list = []
    if hasattr(backend.cuda, "get_arch_list"):
        try:
            arch_list = list(backend.cuda.get_arch_list())
        except Exception:
            arch_list = []

    report["device_name"] = backend.cuda.get_device_name(0)
    # Decimal gigabytes (bytes / 1e9), rounded to one decimal place.
    report["total_memory_gb"] = round(device_props.total_memory / 1e9, 1)
    report["capability_tag"] = f"sm_{cc_major}{cc_minor}"
    report["supported_arches"] = arch_list

    # Smoke test: allocate on the device, run one op, read the result back.
    try:
        one = backend.tensor([1.0], device="cuda")
        two = one + 1
        float(two.sum().item())
        backend.cuda.synchronize()
    except Exception as failure:  # pragma: no cover - exercised via runtime
        report["smoke_error"] = str(failure)

    return report


def _parse_sm_arch(tag: Any) -> tuple[int, int, bool] | None:
    """Split an ``sm_XY`` tag into ``(major, minor, arch_specific)``.

    The last digit is the minor revision and the digits before it the major
    (``sm_86`` -> 8.6, ``sm_120`` -> 12.0). A trailing lowercase suffix such
    as the ``a`` in ``sm_90a`` sets ``arch_specific``. Returns ``None`` for
    anything that does not follow this shape (e.g. ``compute_80``).
    """
    if not isinstance(tag, str) or not tag.startswith("sm_"):
        return None
    body = tag[len("sm_"):]
    digits = body.rstrip("abcdefghijklmnopqrstuvwxyz")
    if len(digits) < 2 or not (digits.isascii() and digits.isdigit()):
        return None
    return int(digits[:-1]), int(digits[-1]), digits != body


def _capability_is_supported(capability_tag: str, supported_arches: Any) -> bool:
    """Return whether a build with ``supported_arches`` can run on the device.

    An exact tag match always counts. Otherwise a plain ``sm_XY`` entry counts
    when it shares the device's major revision and its minor revision is not
    newer than the device's (CUDA binary compatibility within a major
    revision). Suffixed, architecture-specific entries only count for the
    exact same major.minor.
    """
    if capability_tag in supported_arches:
        return True
    device = _parse_sm_arch(capability_tag)
    if device is None:
        return False
    dev_major, dev_minor, _ = device
    for arch in supported_arches:
        built = _parse_sm_arch(arch)
        if built is None:
            continue
        major, minor, arch_specific = built
        if major != dev_major:
            continue
        if arch_specific:
            if minor == dev_minor:
                return True
        elif minor <= dev_minor:
            return True
    return False


def evaluate_runtime(*, gpu_visible: bool, probe: dict[str, Any]) -> tuple[bool, str | None]:
    """Decide whether the probed PyTorch/CUDA runtime is usable.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Probe dict; reads ``cuda_available``, ``capability_tag``,
            ``supported_arches`` and ``smoke_error``.

    Returns:
        ``(True, None)`` when no problem is found, otherwise ``(False, reason)``.
        Checks run in this order: GPU visible but CUDA unavailable in PyTorch;
        device capability not covered by the build's arch list; smoke test
        error.
    """
    if gpu_visible and not probe["cuda_available"]:
        return False, "GPU is visible to nvidia-smi but PyTorch CUDA is unavailable"

    if probe["cuda_available"] and probe["capability_tag"] and probe["supported_arches"]:
        # Exact tag membership alone would reject devices that run cubins built
        # for an older minor revision of the same major (e.g. sm_89 on sm_86).
        if not _capability_is_supported(probe["capability_tag"], probe["supported_arches"]):
            supported = " ".join(probe["supported_arches"])
            return (
                False,
                f"GPU capability {probe['capability_tag']} is not supported by this PyTorch build "
                f"(supported: {supported})",
            )

    if probe["smoke_error"]:
        return False, f"CUDA smoke test failed: {probe['smoke_error']}"

    return True, None


def print_probe(*, gpu_visible: bool, probe: dict[str, Any]) -> None:
    """Print a human-readable summary of the probe to stdout.

    Args:
        gpu_visible: Whether ``nvidia-smi`` reported a GPU.
        probe: Probe dict; the version/availability entries are always
            printed, while the device line, arch list and smoke error are
            printed only when their values are truthy.
    """
    always_shown = (
        ("torch", "torch_version"),
        ("torch.version.cuda", "cuda_version"),
        ("CUDA available", "cuda_available"),
        ("device_count", "device_count"),
    )
    for label, key in always_shown:
        print(f"  {label}: {probe[key]}")
    print(f"  nvidia-smi GPU visible: {gpu_visible}")

    gpu_name = probe["device_name"]
    if gpu_name:
        vram = probe["total_memory_gb"]
        capability = probe["capability_tag"]
        print(f"  GPU: {gpu_name}, VRAM: {vram} GB, capability: {capability}")

    arches = probe["supported_arches"]
    if arches:
        joined = " ".join(arches)
        print(f"  PyTorch CUDA arch list: {joined}")

    smoke_failure = probe["smoke_error"]
    if smoke_failure:
        print(f"  CUDA smoke error: {smoke_failure}")


def main(argv: list[str] | None = None) -> int:
    """Run the preflight: probe, print the summary, and report the verdict.

    Args:
        argv: Argument strings forwarded to ``parse_args``.

    Returns:
        ``0`` when the runtime is judged usable, ``1`` otherwise (after
        printing the reason and a remediation hint to stdout).
    """
    options = parse_args(argv)
    smi_sees_gpu = detect_gpu_visibility()
    report = probe_torch()
    print_probe(gpu_visible=smi_sees_gpu, probe=report)

    usable, reason = evaluate_runtime(gpu_visible=smi_sees_gpu, probe=report)
    if usable:
        return 0

    print(f"  ERROR: {reason}")
    print(
        f"  {options.context} GPU runtime is not usable with the current PyTorch/CUDA stack. "
        "Use a supported NVIDIA architecture or a newer compatible template."
    )
    return 1


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())