| |
| """ |
| Check NCCL availability for distributed training. |
| Run from repo root (no torchrun needed): |
| uv run python scripts/check_nccl.py |
| # or: python scripts/check_nccl.py |
| """ |
| import sys |
| import subprocess |
|
|
def _report_nvidia_smi():
    """Print driver version and GPU name as reported by ``nvidia-smi``.

    This probe is best-effort: a missing binary, a timeout, or a non-zero
    exit status is reported on stdout and never raised to the caller.
    """
    try:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=driver_version,name", "--format=csv,noheader"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        print("\nnvidia-smi: not found (optional)")
        return
    except subprocess.TimeoutExpired:
        print("\nnvidia-smi: timeout (optional)")
        return

    if out.returncode != 0:
        # Previously a failing nvidia-smi produced no output at all, which hid
        # driver problems from the person running the check.
        print(f"\nnvidia-smi: failed with exit code {out.returncode} (optional)")
        return

    report = out.stdout.strip()
    if report:
        print("\nnvidia-smi (driver / GPU):")
        for line in report.split("\n"):
            print(f" {line}")


def main() -> int:
    """Print a CUDA / NCCL environment report and suggest next steps.

    Returns:
        0 when both CUDA and NCCL are available in the installed PyTorch,
        1 otherwise (including when PyTorch cannot be imported).
    """
    print("=== NCCL / CUDA environment check ===\n")

    print(f"Python: {sys.executable}")
    # torch is imported lazily so that a missing install yields guidance
    # instead of a traceback before anything has been printed.
    try:
        import torch
    except ImportError as exc:
        print(f"PyTorch: not importable ({exc})")
        print("\n--- What to do next ---")
        print(" PyTorch is not installed in this environment. Install it with CUDA: uv sync --extra gpu")
        return 1

    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA compiled with: {torch.version.cuda or 'N/A'}")
    print(f"cuDNN: {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else 'N/A'}")

    cuda_available = torch.cuda.is_available()
    print(f"CUDA available: {cuda_available}")
    if cuda_available:
        print(f"CUDA runtime: {torch.version.cuda}")
        device_count = torch.cuda.device_count()
        print(f"Device count: {device_count}")
        for i in range(device_count):
            print(f" [{i}] {torch.cuda.get_device_name(i)}")

    import torch.distributed as dist

    # When the distributed package is not compiled in, torch.distributed
    # exposes nothing beyond is_available(), so it must be checked first.
    nccl_available = bool(dist.is_available() and dist.is_nccl_available())
    print(f"\nNCCL available (PyTorch): {nccl_available}")

    _report_nvidia_smi()

    print("\n--- What to do next ---")
    if not cuda_available:
        print(" CUDA not available. Install PyTorch with CUDA: uv sync --extra gpu (and use GPU index)")
    elif not nccl_available:
        print(" NCCL not available in this PyTorch build. Use a CUDA-enabled wheel (e.g. cu128).")
        print(" Reinstall: uv sync --extra gpu (with GPU index in pyproject)")
    else:
        print(" NCCL is available in PyTorch. To verify it actually runs (multi-process), use:")
        print(" torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu")
        print(" If that segfaults, use gloo: NANOCHAT_DDP_BACKEND=gloo torchrun ...")

    return 0 if (cuda_available and nccl_available) else 1
|
|
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|
|