#!/usr/bin/env bash
# Shared helpers for accelerate launch scripts.
#######################################
# Print the number of GPU worker processes to launch.
# Resolution order (the first source yielding a positive count wins):
#   1. NUM_GPUS                  - explicit override, echoed as given.
#   2. CUDA_VISIBLE_DEVICES      - number of non-empty comma-separated entries.
#   3. torch.cuda.device_count() - queried through `python`, when on PATH.
#   4. nvidia-smi -L             - number of "GPU <n>" lines.
#   5. 1                         - fallback when nothing else answered.
# Globals:   NUM_GPUS (read), CUDA_VISIBLE_DEVICES (read)
# Arguments: none
# Outputs:   the count on stdout
#######################################
detect_num_gpus() {
  # Declared local so the scratch array does not leak into the sourcing shell.
  local -a visible_devs=()
  local dev
  local visible_count=0
  local torch_count
  local smi_count

  # Explicit override always wins.
  if [[ -n "${NUM_GPUS:-}" ]]; then
    echo "${NUM_GPUS}"
    return
  fi

  # Container / scheduler often expose only a subset via CUDA_VISIBLE_DEVICES.
  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    IFS=',' read -ra visible_devs <<< "${CUDA_VISIBLE_DEVICES}"
    for dev in "${visible_devs[@]}"; do
      # Ignore blanks so "0, 1" and stray commas ("0,,1") count correctly.
      dev="${dev// /}"
      if [[ -n "${dev}" ]]; then
        visible_count=$((visible_count + 1))
      fi
    done
    if [[ "${visible_count}" -gt 0 ]]; then
      echo "${visible_count}"
      return
    fi
  fi

  # torch.cuda.device_count() reflects what this process can actually use.
  if command -v python >/dev/null 2>&1; then
    # Best effort: a missing torch or a failing interpreter just falls through.
    torch_count="$(python - <<'PY' 2>/dev/null || true
import torch
print(torch.cuda.device_count())
PY
)"
    if [[ "${torch_count}" =~ ^[0-9]+$ ]] && [[ "${torch_count}" -gt 0 ]]; then
      echo "${torch_count}"
      return
    fi
  fi

  if command -v nvidia-smi >/dev/null 2>&1; then
    # Count only "GPU <n>" lines: other output (indented MIG sub-entries,
    # diagnostic text) must not inflate the result. `|| true` keeps a failing
    # nvidia-smi or a zero-match grep from aborting callers under
    # `set -e -o pipefail`.
    smi_count="$(nvidia-smi -L 2>/dev/null | grep -c '^GPU [0-9]' || true)"
    if [[ "${smi_count}" =~ ^[0-9]+$ ]] && [[ "${smi_count}" -gt 0 ]]; then
      echo "${smi_count}"
      return
    fi
  fi

  echo 1
}
#######################################
# Print the `--num_processes <N>` argument pair for a launch command,
# where N is whatever detect_num_gpus reports.
# Arguments: none
# Outputs:   "--num_processes <N>" on stdout
#######################################
launch_num_processes_flag() {
  local process_total
  process_total="$(detect_num_gpus)"
  # `--` stops printf from treating the leading dashes as its own option.
  printf -- '--num_processes %s\n' "${process_total}"
}
#######################################
# Print the accelerate config file name to use for this launch.
# Globals:   ACCELERATE_CONFIG (read) - when non-empty it is echoed as-is.
# Arguments: none
# Outputs:   config file name on stdout
#
# Default is native PyTorch DDP (MULTI_GPU); no DeepSpeed install required.
# DeepSpeed ZeRO (student sharding, teacher colocated):
#   ACCELERATE_CONFIG=default_config_zero2.yaml bash scripts/train_opd_7b_chartqa_deepspeed.sh
# Optional ZeRO-0 (no sharding): set ACCELERATE_CONFIG=default_config_deepspeed.yaml
#######################################
resolve_accelerate_config() {
  local gpu_total
  local chosen="default_config.yaml"

  # A caller-supplied config short-circuits any detection.
  if [[ -n "${ACCELERATE_CONFIG:-}" ]]; then
    echo "${ACCELERATE_CONFIG}"
    return
  fi

  # Only hosts with 8 or more GPUs switch to the dedicated 8-GPU config.
  gpu_total="$(detect_num_gpus)"
  if [[ "${gpu_total}" -ge 8 ]]; then
    chosen="default_config_8gpu.yaml"
  fi
  echo "${chosen}"
}
#######################################
# Print a human-readable summary of the launch that is about to happen:
# the process count, the accelerate config, and the raw GPU visibility
# reported by CUDA_VISIBLE_DEVICES, nvidia-smi and torch (when available).
# Globals:   CUDA_VISIBLE_DEVICES (read)
# Arguments: none
# Outputs:   the summary on stdout
#######################################
print_launch_plan() {
  local num_gpus
  local accel_config

  num_gpus="$(detect_num_gpus)"
  # Reuse the count just detected instead of probing again:
  # resolve_accelerate_config calls detect_num_gpus itself, and that honours
  # a non-empty NUM_GPUS. The assignment lives only in this command
  # substitution's subshell, so the caller's environment is untouched.
  accel_config="$(NUM_GPUS="${num_gpus}"; resolve_accelerate_config)"

  echo "============================================================"
  echo "Launch plan: --num_processes ${num_gpus}"
  echo "accelerate config: ${accel_config} (DDP/MULTI_GPU unless overridden)"
  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
  fi
  # The probes below are diagnostic only; their failures are ignored.
  if command -v nvidia-smi >/dev/null 2>&1; then
    echo "nvidia-smi -L:"
    nvidia-smi -L 2>/dev/null || true
  fi
  if command -v python >/dev/null 2>&1; then
    python - <<'PY' 2>/dev/null || true
import torch
print(f"torch.cuda.device_count()={torch.cuda.device_count()}")
PY
  fi
  echo "============================================================"
}
|