ksjpswaroop
/

nanochat-eos

Model card Files Files and versions

nanochat-eos / scripts /try_nccl_8gpu.sh

ksjpswaroop's picture

Upload folder using huggingface_hub

50ebd92 verified 4 months ago

history blame contribute delete

930 Bytes

	#!/bin/bash
	# Launch the 8-process torchrun test (scripts.test_8gpu) on NCCL with common
	# workaround env vars, to avoid SIGSEGV on some A100/DGX nodes.
	#
	# Usage:
	#   bash scripts/try_nccl_8gpu.sh
	# The script changes to the parent of its own directory (the repo root) before
	# launching, so the module path scripts.test_8gpu resolves from there.
	#
	# Environment overrides (used as-is when already set and non-empty):
	#   NCCL_SOCKET_IFNAME  interface NCCL uses for sockets (default: lo)
	#   NCCL_DEBUG          NCCL log level, e.g. INFO or WARN (default: WARN)
	#
	# If it still crashes, use gloo:
	#   NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu

	# Strict mode: abort on errors, on unset variables, and on failed pipeline stages.
	set -euo pipefail

	# Move to the repo root (one level above this script's directory).
	# `--` keeps a path that begins with '-' from being parsed as an option.
	cd -- "$(dirname -- "$0")/.." || {
		printf 'error: cannot change to repo root (parent of %s)\n' "$(dirname -- "$0")" >&2
		exit 1
	}

	# Reduce chance of NCCL SIGSEGV on single-node A100/DGX.
	# These two are always forced to 1 by this script.
	export NCCL_IB_DISABLE=1
	export NCCL_P2P_DISABLE=1
	# Force socket interface: lo = loopback (single-node), or use eth0 / your main interface.
	export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-lo}"
	# Deterministic GPU order.
	export CUDA_DEVICE_ORDER=PCI_BUS_ID
	# Optional: verbose NCCL (set to INFO or WARN to debug).
	export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

	# Fail early with a readable message instead of a bare "command not found".
	# 127 is the status the shell itself uses for a missing command.
	if ! command -v torchrun >/dev/null 2>&1; then
		printf 'error: torchrun not found on PATH (is the PyTorch environment activated?)\n' >&2
		exit 127
	fi

	# Report the values actually exported so the banner cannot drift from the settings above.
	printf 'Trying NCCL with: NCCL_IB_DISABLE=%s NCCL_P2P_DISABLE=%s NCCL_SOCKET_IFNAME=%s CUDA_DEVICE_ORDER=%s\n' \
		"$NCCL_IB_DISABLE" "$NCCL_P2P_DISABLE" "$NCCL_SOCKET_IFNAME" "$CUDA_DEVICE_ORDER"

	# exec replaces this shell with torchrun, so signals sent to this PID reach the
	# launcher directly and its exit status becomes the script's exit status.
	exec torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu