File size: 2,464 Bytes
50ebd92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/bash
# Print NVIDIA driver-update instructions tailored to this machine.
#
# Usage (from the repo root):
#   bash scripts/update_driver_instructions.sh
#
# The script only prints text: it does not run sudo or modify the system.
# It reports the installed driver version and the CUDA version shown by
# nvidia-smi, then prints one of three sets of instructions:
#   * DGX managed by Base Command Manager (cm-chroot-sw-img is on PATH)
#   * standalone DGX (/etc/dgx-release or /etc/nvidia-dgx/release exists)
#   * any other machine
#
# Environment:
#   NVIDIA_DRIVER_BRANCH  driver branch used in the suggested package names
#                         (default: 550).
#
# Exit status: 0 after printing instructions, 1 if nvidia-smi is not on PATH.

# pipefail is deliberately NOT enabled: when an nvidia-smi query fails, the
# pipelines below must still succeed so the script prints "unknown" instead
# of aborting under `set -e`.
set -e

# Print the driver version reported for the first GPU (empty on failure).
query_driver_version() {
  nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null \
    | head -n 1
}

# Print the "CUDA Version: X.Y" value from the nvidia-smi banner (empty on
# failure). `sed -n ... p` emits only lines where the substitution matched,
# so no separate grep is needed.
query_max_cuda_version() {
  nvidia-smi 2>/dev/null \
    | sed -n 's/.*CUDA Version: \([0-9.]*\).*/\1/p' \
    | head -n 1
}

# Succeed when one of the DGX release marker files exists.
is_dgx() {
  [[ -f /etc/dgx-release || -f /etc/nvidia-dgx/release ]]
}

# Instructions for a DGX whose image is managed through Base Command Manager.
# $1 - driver branch to name in the package suggestions.
print_bcm_instructions() {
  local branch=$1
  printf '%s\n' \
    "Base Command Manager (BCM) detected. Upgrade via the BaseOS image:" \
    "  1. Enter image: cm-chroot-sw-img /cm/images/baseos-image/" \
    "  2. Inside chroot: apt update && apt upgrade && apt autoremove && exit" \
    "  3. Change driver (in chroot): install nvidia-driver-${branch}-server + nvidia-fabricmanager-${branch} (DGX A100 needs Fabric Manager)." \
    "  See: docs/A100_AND_GPT3_PATH.md section 'Update drivers' (BCM / fabric)" \
    "  Docs: https://docs.nvidia.com/dgx/baseos-on-bcm-install-guide/upgrading-baseos-bcm.html"
}

# Instructions for a DGX that is not managed through Base Command Manager.
# $1 - driver branch to name in the package suggestions.
print_standalone_dgx_instructions() {
  local branch=$1
  printf '%s\n' \
    "Use NVIDIA datacenter driver and, on DGX A100, nvidia-fabricmanager-<branch> (same as driver)." \
    "  DGX OS guide: https://docs.nvidia.com/dgx/dgx-os-6-user-guide/additional_software.html" \
    "  Driver download: https://www.nvidia.com/Download/index.aspx (Tesla / A100, Linux 64-bit)" \
    "" \
    "Example (standalone DGX):" \
    "  sudo apt update" \
    "  sudo apt install -y nvidia-driver-${branch}-server nvidia-fabricmanager-${branch}" \
    "  # If 'held broken packages': try nvidia-headless-${branch}-server, or CUDA repo (Option B in docs)" \
    "  sudo reboot"
}

# Instructions for machines without a DGX release marker file.
# $1 - driver branch to name in the package suggestions.
print_generic_instructions() {
  local branch=$1
  printf '%s\n' \
    "Non-DGX. Options:" \
    "  PPA:   sudo add-apt-repository ppa:graphics-drivers/ppa && sudo apt update && sudo apt install nvidia-driver-${branch}" \
    "  .run:  Download from https://www.nvidia.com/Download/index.aspx then run the installer" \
    "  sudo reboot  # after install"
}

main() {
  # NOTE(review): the closing hint aims for CUDA >= 12.8, which I believe
  # requires a newer driver branch than 550 - confirm the intended default
  # against NVIDIA's CUDA/driver compatibility table.
  local driver_branch=${NVIDIA_DRIVER_BRANCH:-550}
  local driver cuda_max

  printf '%s\n' "=== NVIDIA driver update instructions ===" ""

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    # Diagnostics go to stderr so they are not mixed into captured output.
    printf '%s\n' \
      "nvidia-smi not found. Install NVIDIA driver or run on a GPU node." >&2
    exit 1
  fi

  driver=$(query_driver_version)
  cuda_max=$(query_max_cuda_version)

  printf '%s\n' \
    "Current driver: ${driver:-unknown}" \
    "Max CUDA (driver): ${cuda_max:-unknown}" \
    ""

  if is_dgx; then
    printf '%s\n' "DGX system detected." ""
    if command -v cm-chroot-sw-img >/dev/null 2>&1; then
      print_bcm_instructions "$driver_branch"
    else
      print_standalone_dgx_instructions "$driver_branch"
    fi
  else
    print_generic_instructions "$driver_branch"
  fi

  printf '%s\n' \
    "" \
    "After reboot, check: nvidia-smi  (aim for CUDA Version >= 12.8 for PyTorch cu128)" \
    "Then retest NCCL:    bash scripts/try_nccl_8gpu.sh"
}

main