File size: 2,833 Bytes
7451c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env bash
# Shared helpers for accelerate launch scripts.

#######################################
# Print the number of GPUs a launch should use.
# Resolution order (first match wins):
#   1. NUM_GPUS, echoed verbatim when non-empty.
#   2. Number of non-empty comma-separated entries in CUDA_VISIBLE_DEVICES.
#   3. torch.cuda.device_count() reported by `python`, if a positive integer.
#   4. Number of "GPU <n>..." lines printed by `nvidia-smi -L`, if positive.
#   5. Fallback: 1.
# Globals:   NUM_GPUS (read), CUDA_VISIBLE_DEVICES (read)
# Arguments: none
# Outputs:   the count on stdout
#######################################
detect_num_gpus() {
  # Explicit override always wins.
  if [[ -n "${NUM_GPUS:-}" ]]; then
    echo "${NUM_GPUS}"
    return
  fi

  # Container / scheduler often expose only a subset via CUDA_VISIBLE_DEVICES.
  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    local visible_count=0
    local entry
    # Keep the scratch array local so sourcing scripts do not inherit it.
    local -a visible_devs=()
    IFS=',' read -ra visible_devs <<< "${CUDA_VISIBLE_DEVICES}"
    for entry in "${visible_devs[@]}"; do
      # Tolerate "0, 1" style lists and skip empty slots such as "0,,1".
      entry="${entry// /}"
      if [[ -n "${entry}" ]]; then
        visible_count=$((visible_count + 1))
      fi
    done
    if [[ "${visible_count}" -gt 0 ]]; then
      echo "${visible_count}"
      return
    fi
  fi

  # torch.cuda.device_count() reflects what this process can actually use.
  if command -v python >/dev/null 2>&1; then
    local torch_count
    # Best effort: a missing torch or a failing interpreter yields an empty
    # string, which the numeric check below rejects.
    torch_count="$(python - <<'PY' 2>/dev/null || true
import torch
print(torch.cuda.device_count())
PY
)"
    if [[ "${torch_count}" =~ ^[0-9]+$ ]] && [[ "${torch_count}" -gt 0 ]]; then
      echo "${torch_count}"
      return
    fi
  fi

  if command -v nvidia-smi >/dev/null 2>&1; then
    local smi_count
    # Count only the per-GPU lines ("GPU 0: ..."). Counting every line would
    # also count driver error text or indented sub-device lines as GPUs.
    # grep -c exits 1 when it prints 0, hence the `|| true`.
    smi_count="$(nvidia-smi -L 2>/dev/null | grep -c '^GPU [0-9]' || true)"
    if [[ "${smi_count}" =~ ^[0-9]+$ ]] && [[ "${smi_count}" -gt 0 ]]; then
      echo "${smi_count}"
      return
    fi
  fi

  # Nothing detected: fall back to a single process.
  echo 1
}

# Print the `--num_processes <N>` flag, where N is whatever detect_num_gpus
# writes to stdout.
launch_num_processes_flag() {
  printf '%s\n' "--num_processes $(detect_num_gpus)"
}

# Print the accelerate config file name to use.
#
# A non-empty ACCELERATE_CONFIG is echoed as-is and GPU detection is skipped.
# Otherwise the choice depends on the count from detect_num_gpus:
#   8 or more -> default_config_8gpu.yaml
#   otherwise -> default_config.yaml
#
# The defaults are native PyTorch DDP (MULTI_GPU) configs, so no DeepSpeed
# install is required. DeepSpeed variants are opt-in via the override:
#   ZeRO-2 (student sharding, teacher colocated):
#     ACCELERATE_CONFIG=default_config_zero2.yaml bash scripts/train_opd_7b_chartqa_deepspeed.sh
#   ZeRO-0 (no sharding):
#     ACCELERATE_CONFIG=default_config_deepspeed.yaml
resolve_accelerate_config() {
  # Caller-supplied config short-circuits everything else.
  if [[ -n "${ACCELERATE_CONFIG:-}" ]]; then
    echo "${ACCELERATE_CONFIG}"
    return
  fi

  local gpu_total
  gpu_total="$(detect_num_gpus)"

  # Start from the generic config and upgrade only for 8+ GPUs.
  local chosen_config="default_config.yaml"
  if [[ "${gpu_total}" -ge 8 ]]; then
    chosen_config="default_config_8gpu.yaml"
  fi
  printf '%s\n' "${chosen_config}"
}

# Print a human-readable summary of the upcoming launch to stdout:
#   - the process count from detect_num_gpus,
#   - the config from resolve_accelerate_config,
#   - CUDA_VISIBLE_DEVICES when it is non-empty,
#   - `nvidia-smi -L` output when the tool is available,
#   - torch.cuda.device_count() when `python` is available.
# The two probes are best-effort: their stderr is discarded and failures are
# ignored.
print_launch_plan() {
  local rule="============================================================"
  local gpu_total
  local config_name
  gpu_total="$(detect_num_gpus)"
  config_name="$(resolve_accelerate_config)"

  printf '%s\n' "${rule}"
  printf '%s\n' "Launch plan: --num_processes ${gpu_total}"
  printf '%s\n' "accelerate config: ${config_name} (DDP/MULTI_GPU unless overridden)"

  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    printf '%s\n' "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}"
  fi

  if command -v nvidia-smi >/dev/null 2>&1; then
    printf '%s\n' "nvidia-smi -L:"
    nvidia-smi -L 2>/dev/null || true
  fi

  if command -v python >/dev/null 2>&1; then
    # Same two-line probe, passed with -c instead of on stdin.
    local torch_probe='import torch
print(f"torch.cuda.device_count()={torch.cuda.device_count()}")'
    python -c "${torch_probe}" 2>/dev/null || true
  fi

  printf '%s\n' "${rule}"
}