#!/bin/bash
#
# Print instructions for updating the NVIDIA driver on this machine.
#
# The script only inspects the host and prints advice; it does not install
# or change anything. It reports the driver version and the maximum CUDA
# version shown by nvidia-smi, then prints upgrade steps for one of three
# cases: a DGX managed by Base Command Manager (BCM), a standalone DGX, or a
# non-DGX host.
#
# Environment:
#   DRIVER_BRANCH  Numeric driver branch used in the example package names
#                  (default: 550).
#
# Exit status:
#   0  instructions were printed
#   1  nvidia-smi is not on PATH, or DRIVER_BRANCH is not a number

# pipefail is deliberately NOT enabled: the version queries below end in
# `head`, and an nvidia-smi failure must degrade to "unknown" rather than
# abort the script.
set -eu

# CUDA version the closing advice tells the user to aim for, and the first
# driver branch whose nvidia-smi reports at least that version (see the
# driver table in the CUDA Toolkit release notes).
readonly TARGET_CUDA="12.8"
readonly TARGET_CUDA_MIN_BRANCH=570

DRIVER_BRANCH="${DRIVER_BRANCH:-550}"

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeeds when one of the DGX release marker files exists.
is_dgx() {
  [[ -f /etc/dgx-release || -f /etc/nvidia-dgx/release ]]
}

# Print the installed driver version and the maximum CUDA version that
# nvidia-smi reports, falling back to "unknown" when a query yields nothing.
print_current_versions() {
  local driver cuda_max

  # stderr is silenced on purpose: a broken driver should show up as
  # "unknown" below, not as raw nvidia-smi errors.
  # Only the first line is kept, so multi-GPU hosts print a single version.
  driver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n 1)
  cuda_max=$(nvidia-smi 2>/dev/null \
    | sed -n 's/.*CUDA Version: \([0-9.]*\).*/\1/p' \
    | head -n 1)

  printf '%s\n' \
    "Current driver: ${driver:-unknown}" \
    "Max CUDA (driver): ${cuda_max:-unknown}" \
    ""
}

# Upgrade steps for a DGX whose software image is managed by BCM.
print_dgx_bcm_steps() {
  printf '%s\n' \
    "Base Command Manager (BCM) detected. Upgrade via the BaseOS image:" \
    " 1. Enter image: cm-chroot-sw-img /cm/images/baseos-image/" \
    " 2. Inside chroot: apt update && apt upgrade && apt autoremove && exit" \
    " 3. Change driver (in chroot): install nvidia-driver-${DRIVER_BRANCH}-server + nvidia-fabricmanager-${DRIVER_BRANCH} (DGX A100 needs Fabric Manager)." \
    " See: docs/A100_AND_GPT3_PATH.md section 'Update drivers' (BCM / fabric)" \
    " Docs: https://docs.nvidia.com/dgx/baseos-on-bcm-install-guide/upgrading-baseos-bcm.html"
}

# Upgrade steps for a DGX that is not managed by BCM.
print_dgx_standalone_steps() {
  printf '%s\n' \
    "Use NVIDIA datacenter driver and, on DGX A100, nvidia-fabricmanager-<branch> (same as driver)." \
    " DGX OS guide: https://docs.nvidia.com/dgx/dgx-os-6-user-guide/additional_software.html" \
    " Driver download: https://www.nvidia.com/Download/index.aspx (Tesla / A100, Linux 64-bit)" \
    "" \
    "Example (standalone DGX):" \
    " sudo apt update" \
    " sudo apt install -y nvidia-driver-${DRIVER_BRANCH}-server nvidia-fabricmanager-${DRIVER_BRANCH}" \
    " # If 'held broken packages': try nvidia-headless-${DRIVER_BRANCH}-server, or CUDA repo (Option B in docs)" \
    " sudo reboot"
}

# Upgrade options for any host without a DGX release marker.
print_non_dgx_steps() {
  printf '%s\n' \
    "Non-DGX. Options:" \
    " PPA: sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt update && sudo apt install nvidia-driver-${DRIVER_BRANCH}" \
    " .run: Download from https://www.nvidia.com/Download/index.aspx then run the installer" \
    " sudo reboot # after install"
}

# Closing advice: what to verify after the reboot and which test to rerun.
print_followup() {
  printf '%s\n' \
    "" \
    "After reboot, check: nvidia-smi (aim for CUDA Version >= ${TARGET_CUDA} for PyTorch cu128)"

  # Branches older than TARGET_CUDA_MIN_BRANCH report a lower CUDA version
  # in nvidia-smi, so say so instead of leaving the advice contradictory.
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if (( 10#${DRIVER_BRANCH} < TARGET_CUDA_MIN_BRANCH )); then
    printf '%s\n' \
      " Note: driver branch ${DRIVER_BRANCH} reports a CUDA Version below ${TARGET_CUDA}; branch ${TARGET_CUDA_MIN_BRANCH} or newer is needed to reach it (set DRIVER_BRANCH=${TARGET_CUDA_MIN_BRANCH} to print those package names)."
  fi

  printf '%s\n' "Then retest NCCL: bash scripts/try_nccl_8gpu.sh"
}

main() {
  [[ "${DRIVER_BRANCH}" =~ ^[0-9]+$ ]] \
    || die "DRIVER_BRANCH must be a number (got '${DRIVER_BRANCH}')."

  printf '%s\n' "=== NVIDIA driver update instructions ===" ""

  command -v nvidia-smi >/dev/null 2>&1 \
    || die "nvidia-smi not found. Install NVIDIA driver or run on a GPU node."

  print_current_versions

  if is_dgx; then
    printf '%s\n' "DGX system detected." ""
    # The presence of the cm-chroot-sw-img tool is used as the BCM marker.
    if command -v cm-chroot-sw-img >/dev/null 2>&1; then
      print_dgx_bcm_steps
    else
      print_dgx_standalone_steps
    fi
  else
    print_non_dgx_steps
  fi

  print_followup
}

main "$@"