#!/bin/bash
#
# Print driver-update instructions for this system.
#
# Usage (from the repo root):
#   bash scripts/update_driver_instructions.sh
#
# This script only prints text: it does not run sudo or modify the system.
#
# Exit status:
#   0  instructions were printed
#   1  nvidia-smi is not on PATH

# -e: stop on unexpected command failures; -u: unset variables are errors.
# pipefail is intentionally NOT enabled: the nvidia-smi queries below run at
# the head of pipelines, and a failing query should fall through to the
# "unknown" fallback instead of aborting before any instructions are shown.
set -eu

# Print the driver version of the first GPU listed by nvidia-smi.
# Prints nothing if the query produces no output.
query_driver_version() {
  nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null \
    | head -n 1
}

# Print the "CUDA Version: X.Y" value from the default nvidia-smi output.
# Prints nothing if no such field is present.
query_max_cuda_version() {
  nvidia-smi 2>/dev/null \
    | sed -n 's/.*CUDA Version: \([0-9.]*\).*/\1/p' \
    | head -n 1
}

# Succeed when either DGX release marker file exists.
is_dgx() {
  [[ -f /etc/dgx-release || -f /etc/nvidia-dgx/release ]]
}

# Instructions for a DGX whose image is managed through cm-chroot-sw-img.
print_dgx_bcm_instructions() {
  printf '%s\n' \
    "Base Command Manager (BCM) detected. Upgrade via the BaseOS image:" \
    " 1. Enter image: cm-chroot-sw-img /cm/images/baseos-image/" \
    " 2. Inside chroot: apt update && apt upgrade && apt autoremove && exit" \
    " 3. Change driver (in chroot): install nvidia-driver-550-server + nvidia-fabricmanager-550 (DGX A100 needs Fabric Manager)." \
    " See: docs/A100_AND_GPT3_PATH.md section 'Update drivers' (BCM / fabric)" \
    " Docs: https://docs.nvidia.com/dgx/baseos-on-bcm-install-guide/upgrading-baseos-bcm.html"
}

# Instructions for a DGX without cm-chroot-sw-img on PATH.
print_dgx_standalone_instructions() {
  # NOTE(review): the package-name suffix in the first message had been lost
  # ("nvidia-fabricmanager- (same as driver)"); restored here as a <version>
  # placeholder. Confirm the intended wording.
  printf '%s\n' \
    "Use NVIDIA datacenter driver and, on DGX A100, nvidia-fabricmanager-<version> (same as driver)." \
    " DGX OS guide: https://docs.nvidia.com/dgx/dgx-os-6-user-guide/additional_software.html" \
    " Driver download: https://www.nvidia.com/Download/index.aspx (Tesla / A100, Linux 64-bit)" \
    "" \
    "Example (standalone DGX):" \
    " sudo apt update" \
    " sudo apt install -y nvidia-driver-550-server nvidia-fabricmanager-550" \
    " # If 'held broken packages': try nvidia-headless-550-server, or CUDA repo (Option B in docs)" \
    " sudo reboot"
}

# Instructions for any machine without a DGX release marker file.
print_non_dgx_instructions() {
  printf '%s\n' \
    "Non-DGX. Options:" \
    " PPA: sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt update && sudo apt install nvidia-driver-550" \
    " .run: Download from https://www.nvidia.com/Download/index.aspx then run the installer" \
    " sudo reboot # after install"
}

# Entry point: report the current driver state, then print the instruction
# set that matches the detected platform.
main() {
  local driver cuda_max

  printf '%s\n' "=== NVIDIA driver update instructions ===" ""

  if ! command -v nvidia-smi &>/dev/null; then
    # Diagnostic, not instructions: send it to stderr.
    printf '%s\n' \
      "nvidia-smi not found. Install NVIDIA driver or run on a GPU node." >&2
    exit 1
  fi

  driver=$(query_driver_version)
  cuda_max=$(query_max_cuda_version)

  printf '%s\n' \
    "Current driver: ${driver:-unknown}" \
    "Max CUDA (driver): ${cuda_max:-unknown}" \
    ""

  if is_dgx; then
    printf '%s\n' "DGX system detected." ""
    if command -v cm-chroot-sw-img &>/dev/null; then
      print_dgx_bcm_instructions
    else
      print_dgx_standalone_instructions
    fi
  else
    print_non_dgx_instructions
  fi

  printf '%s\n' \
    "" \
    "After reboot, check: nvidia-smi (aim for CUDA Version >= 12.8 for PyTorch cu128)" \
    "Then retest NCCL: bash scripts/try_nccl_8gpu.sh"
}

main