File size: 2,473 Bytes
#!/usr/bin/env python3
"""
Check NCCL availability for distributed training.
Run from repo root (no torchrun needed):
uv run python scripts/check_nccl.py
# or: python scripts/check_nccl.py
"""
import sys
import subprocess
def main():
    """Print a CUDA / NCCL environment report and suggest next steps.

    Reports the Python interpreter, the PyTorch build (version, CUDA, cuDNN),
    visible CUDA devices, whether PyTorch reports NCCL support, and, when the
    tool is usable, the driver/GPU line from ``nvidia-smi``.

    Returns:
        int: 0 when both CUDA and NCCL are reported available by PyTorch,
        1 otherwise (including when PyTorch cannot be imported). Intended to
        be used as the process exit status.
    """
    print("=== NCCL / CUDA environment check ===\n")

    # Python / PyTorch
    print(f"Python: {sys.executable}")

    # Imported lazily so a missing PyTorch yields a diagnosis, not a traceback.
    try:
        import torch
        import torch.distributed as dist
    except ImportError as exc:
        print(f"PyTorch: not importable ({exc})")
        _report_nvidia_smi()
        print("\n--- What to do next ---")
        print(" PyTorch could not be imported. Install it first: uv sync --extra gpu (and use GPU index)")
        return 1

    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA compiled with: {torch.version.cuda or 'N/A'}")
    print(f"cuDNN: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'N/A'}")

    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if cuda_available:
        # NOTE: this prints the same torch.version.cuda value as the
        # "CUDA compiled with" line above.
        print(f"CUDA runtime: {torch.version.cuda}")
        device_count = torch.cuda.device_count()
        print(f"Device count: {device_count}")
        for i in range(device_count):
            print(f" [{i}] {torch.cuda.get_device_name(i)}")

    # NCCL via PyTorch. Check is_available() first: builds without the
    # distributed package may not expose is_nccl_available() at all.
    nccl_available = bool(dist.is_available() and dist.is_nccl_available())
    print(f"\nNCCL available (PyTorch): {nccl_available}")

    # Optional: nvidia-smi
    _report_nvidia_smi()

    print("\n--- What to do next ---")
    if not cuda_available:
        print(" CUDA not available. Install PyTorch with CUDA: uv sync --extra gpu (and use GPU index)")
    elif not nccl_available:
        print(" NCCL not available in this PyTorch build. Use a CUDA-enabled wheel (e.g. cu128).")
        print(" Reinstall: uv sync --extra gpu (with GPU index in pyproject)")
    else:
        print(" NCCL is available in PyTorch. To verify it actually runs (multi-process), use:")
        print(" torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu")
        print(" If that segfaults, use gloo: NANOCHAT_DDP_BACKEND=gloo torchrun ...")

    return 0 if (cuda_available and nccl_available) else 1


def _report_nvidia_smi():
    """Print the driver version and GPU name reported by ``nvidia-smi``.

    The probe is best-effort: every failure mode (tool missing, not
    runnable, timed out, non-zero exit) is reported on stdout and never
    raised, so the rest of the report still runs.
    """
    cmd = ["nvidia-smi", "--query-gpu=driver_version,name", "--format=csv,noheader"]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except FileNotFoundError:
        print("\nnvidia-smi: not found (optional)")
        return
    except subprocess.TimeoutExpired:
        print("\nnvidia-smi: timeout (optional)")
        return
    except OSError as exc:
        # e.g. PermissionError: the binary exists but cannot be executed.
        print(f"\nnvidia-smi: could not run ({exc}) (optional)")
        return

    if result.returncode != 0:
        # Surface the failure instead of silently dropping it.
        print(f"\nnvidia-smi: exited with code {result.returncode} (optional)")
        detail = (result.stderr or "").strip() or (result.stdout or "").strip()
        if detail:
            print(f" {detail.splitlines()[0]}")
        return

    listing = result.stdout.strip()
    if listing:
        print("\nnvidia-smi (driver / GPU):")
        for line in listing.splitlines():
            print(f" {line}")
# Script entry point: run the check and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|