#!/usr/bin/env bash
# Try running the 8-GPU test with NCCL plus common workaround env vars
# (intended to avoid SIGSEGV on some A100/DGX nodes).
#
# Usage (from repo root): bash scripts/try_nccl_8gpu.sh
#
# Environment overrides:
#   NCCL_SOCKET_IFNAME  interface NCCL uses for socket traffic (default: lo)
#   NCCL_DEBUG          NCCL log verbosity (default: WARN)
#
# If it still crashes, fall back to the gloo backend:
#   NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
set -euo pipefail

# Move to the parent of this script's directory (the repo root) so that
# `-m scripts.test_8gpu` resolves regardless of the caller's cwd.
cd -- "$(dirname -- "$0")/.."

# Fail early with a clear message instead of a bare "command not found"
# at the very end of the script.
if ! command -v torchrun >/dev/null 2>&1; then
  printf 'error: torchrun not found on PATH\n' >&2
  exit 127
fi

# Reduce chance of NCCL SIGSEGV on single-node A100/DGX.
# These two are forced on purpose: they are the workaround being tried.
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1

# Force socket interface: lo = loopback (single-node); override with eth0 or
# your main interface by setting NCCL_SOCKET_IFNAME before calling this script.
export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"

# Deterministic GPU order.
export CUDA_DEVICE_ORDER=PCI_BUS_ID

# NCCL log level: defaults to WARN; set NCCL_DEBUG=INFO for more detail
# when debugging.
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

echo "Trying NCCL with: NCCL_IB_DISABLE=1 NCCL_P2P_DISABLE=1 NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME CUDA_DEVICE_ORDER=PCI_BUS_ID"

# Last command: its exit status becomes the script's exit status.
torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu