#!/bin/bash
# Try running 8-GPU test with NCCL and common workaround env vars (to avoid SIGSEGV on some A100/DGX nodes).
# Run from repo root:  bash scripts/try_nccl_8gpu.sh
# If it still crashes, use gloo:  NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu

# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Run from the repo root (the parent of this script's directory) so that
# `-m scripts.test_8gpu` resolves regardless of the caller's working directory.
repo_root="$(dirname -- "$0")/.."
cd -- "$repo_root" || {
  printf 'error: cannot cd to repo root: %s\n' "$repo_root" >&2
  exit 1
}

# Reduce chance of NCCL SIGSEGV on single-node A100/DGX.
# These two are forced on purpose: they are the workaround this script exists to try.
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1
# Force socket interface: lo = loopback (single-node), or use eth0 / your main interface.
# Honors a value already present in the environment.
export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"
# Deterministic GPU order
export CUDA_DEVICE_ORDER=PCI_BUS_ID
# NCCL log verbosity: defaults to WARN; set NCCL_DEBUG=INFO in the environment for more detail.
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

echo "Trying NCCL with: NCCL_IB_DISABLE=1 NCCL_P2P_DISABLE=1 NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME CUDA_DEVICE_ORDER=PCI_BUS_ID"

# exec replaces this shell with torchrun, so signals sent to the script's PID
# (Ctrl-C, a scheduler's SIGTERM) and the final exit status belong to torchrun
# itself rather than to an intermediate bash process.
# Any arguments given to this script are forwarded to scripts.test_8gpu.
exec torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu "$@"