# File size: 945 Bytes
#
# a544dd6

"""GPU preflight diagnostics for Nomen-AI training.

Run before smoke/SFT/DPO to fail early if CUDA or VRAM is unavailable.
"""
import sys
import torch


def main(min_vram_gb: float = 14.0) -> None:
    """Print GPU diagnostics and abort unless a large-enough CUDA device exists.

    Prints the torch version, CUDA availability, and the current device's
    name, total VRAM, and compute capability, then prints
    ``GPU_PREFLIGHT_PASS`` once every check has succeeded.

    Args:
        min_vram_gb: Smallest acceptable total VRAM in decimal gigabytes
            (bytes / 1e9). Defaults to 14, which admits T4-class
            (nominal 15GB) cards.

    Raises:
        SystemExit: If CUDA is unavailable, or if the current device reports
            less than ``min_vram_gb`` of total memory.
    """
    print('torch:', torch.__version__)
    cuda_ok = torch.cuda.is_available()
    print('cuda_available:', cuda_ok)
    if not cuda_ok:
        raise SystemExit('ERROR: CUDA is not available. Use a Colab GPU/T4 runtime or Docker with NVIDIA runtime.')

    device = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(device)
    # Decimal gigabytes (1e9 bytes), not GiB; the threshold uses the same unit.
    total_gb = props.total_memory / 1e9
    print('gpu_name:', torch.cuda.get_device_name(device))
    print('total_vram_gb:', round(total_gb, 2))
    print('compute_capability:', f'{props.major}.{props.minor}')
    if total_gb < min_vram_gb:
        # Report the threshold actually enforced so the message never
        # disagrees with the check.
        raise SystemExit(
            f'ERROR: VRAM {total_gb:.1f}GB is below the required '
            f'{min_vram_gb:g}GB minimum. Use T4/A10G or larger.'
        )
    print('GPU_PREFLIGHT_PASS')


# Run the preflight only when this file is executed as a script, so that
# importing the module does not trigger the checks.
if __name__ == '__main__':
    main()