#!/bin/bash
#
# Try running the 8-GPU test with the NCCL backend plus common workaround
# environment variables (to avoid SIGSEGV on some A100/DGX nodes).
#
# Usage (from the repo root):
#   bash scripts/try_nccl_8gpu.sh [extra args forwarded to scripts.test_8gpu]
#
# If it still crashes, fall back to gloo:
#   NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
#
# Environment:
#   NCCL_SOCKET_IFNAME  Network interface NCCL should use (default: lo).
#   NCCL_DEBUG          NCCL log verbosity, e.g. INFO or WARN (default: WARN).

set -euo pipefail

# Run from the repository root (the parent of this script's directory) so that
# `-m scripts.test_8gpu` resolves regardless of the caller's working directory.
cd -- "$(dirname -- "$0")/.." || {
  printf 'error: cannot cd to repo root from %s\n' "$0" >&2
  exit 1
}

# Fail early with a clear message instead of a bare "command not found".
if ! command -v torchrun >/dev/null 2>&1; then
  printf 'error: torchrun not found in PATH\n' >&2
  exit 127
fi

# Reduce the chance of an NCCL SIGSEGV on single-node A100/DGX.
# These two are forced on unconditionally; they are the point of this script.
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1

# Force the socket interface: lo = loopback (single-node); override with eth0
# or your main interface if needed.
export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"

# Deterministic GPU order.
export CUDA_DEVICE_ORDER=PCI_BUS_ID

# Optional: verbose NCCL logging (set to INFO or WARN to debug).
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

echo "Trying NCCL with: NCCL_IB_DISABLE=1 NCCL_P2P_DISABLE=1 NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME CUDA_DEVICE_ORDER=PCI_BUS_ID"

# Any extra command-line arguments are passed through to the test module;
# with no arguments this is identical to the plain invocation.
torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu "$@"