nanochat-eos / scripts /update_driver_instructions.sh
ksjpswaroop's picture
Upload folder using huggingface_hub
50ebd92 verified
#!/bin/bash
# Print driver-update instructions for this system.
#
# Usage (from the repo root): bash scripts/update_driver_instructions.sh
#
# Does not run sudo or modify the system.
# Exit status: 1 when nvidia-smi is not on PATH.
#
# pipefail is not enabled: with it, a failing nvidia-smi query would abort the
# script under `set -e` instead of falling back to "unknown" below.
set -e

echo "=== NVIDIA driver update instructions ==="
echo ""

# Without nvidia-smi nothing can be queried. The diagnostic goes to stderr so
# that stdout carries only the report/instructions.
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "nvidia-smi not found. Install NVIDIA driver or run on a GPU node." >&2
  exit 1
fi

# Installed driver version: first line of the query output; empty on failure.
DRIVER=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n 1)

# "CUDA Version: X.Y" as printed by plain nvidia-smi; sed -n ... p emits only
# lines where the pattern matched, so no separate grep is needed.
CUDA_MAX=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([0-9.]*\).*/\1/p' | head -n 1)

echo "Current driver: ${DRIVER:-unknown}"
echo "Max CUDA (driver): ${CUDA_MAX:-unknown}"
echo ""
# Print the instruction set that matches this platform:
#   - DGX managed by Base Command Manager (cm-chroot-sw-img is on PATH),
#   - standalone DGX (/etc/dgx-release or /etc/nvidia-dgx/release exists),
#   - any other machine.
#
# Environment:
#   NVIDIA_DRIVER_BRANCH  driver branch used in the printed package names
#                         (digits only; default: 550).
driver_branch=${NVIDIA_DRIVER_BRANCH:-550}

# The value ends up inside commands the user may copy into a root shell, so
# accept nothing but digits.
case "$driver_branch" in
  *[!0-9]*)
    echo "NVIDIA_DRIVER_BRANCH must be a driver branch number such as 550 or 570 (got '${driver_branch}')." >&2
    exit 2
    ;;
esac

if [ -f /etc/dgx-release ] || [ -f /etc/nvidia-dgx/release ]; then
  echo "DGX system detected."
  echo ""
  if command -v cm-chroot-sw-img >/dev/null 2>&1; then
    echo "Base Command Manager (BCM) detected. Upgrade via the BaseOS image:"
    echo " 1. Enter image: cm-chroot-sw-img /cm/images/baseos-image/"
    echo " 2. Inside chroot: apt update && apt upgrade && apt autoremove && exit"
    echo " 3. Change driver (in chroot): install nvidia-driver-${driver_branch}-server + nvidia-fabricmanager-${driver_branch} (DGX A100 needs Fabric Manager)."
    echo " See: docs/A100_AND_GPT3_PATH.md section 'Update drivers' (BCM / fabric)"
    echo " Docs: https://docs.nvidia.com/dgx/baseos-on-bcm-install-guide/upgrading-baseos-bcm.html"
  else
    echo "Use NVIDIA datacenter driver and, on DGX A100, nvidia-fabricmanager-<branch> (same as driver)."
    echo " DGX OS guide: https://docs.nvidia.com/dgx/dgx-os-6-user-guide/additional_software.html"
    echo " Driver download: https://www.nvidia.com/Download/index.aspx (Tesla / A100, Linux 64-bit)"
    echo ""
    echo "Example (standalone DGX):"
    echo " sudo apt update"
    echo " sudo apt install -y nvidia-driver-${driver_branch}-server nvidia-fabricmanager-${driver_branch}"
    echo " # If 'held broken packages': try nvidia-headless-${driver_branch}-server, or CUDA repo (Option B in docs)"
    echo " sudo reboot"
  fi
else
  echo "Non-DGX. Options:"
  echo " PPA: sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt update && sudo apt install nvidia-driver-${driver_branch}"
  echo " .run: Download from https://www.nvidia.com/Download/index.aspx then run the installer"
  echo " sudo reboot # after install"
fi

# Branches older than 570 cannot report CUDA 12.8 in nvidia-smi; say so rather
# than let the user discover it after a reboot.
if [ "$driver_branch" -lt 570 ]; then
  echo ""
  echo "Note: driver branch ${driver_branch} predates CUDA 12.8; nvidia-smi reports CUDA Version >= 12.8 only from branch 570 on."
  echo " To print these instructions for that branch: NVIDIA_DRIVER_BRANCH=570 bash scripts/update_driver_instructions.sh"
fi
# Closing checklist, one output line per argument: a blank separator, the
# post-reboot nvidia-smi check, and the NCCL retest command.
printf '%s\n' \
  "" \
  "After reboot, check: nvidia-smi (aim for CUDA Version >= 12.8 for PyTorch cu128)" \
  "Then retest NCCL: bash scripts/try_nccl_8gpu.sh"