#!/usr/bin/env bash

set -euo pipefail

# NOTE: environment activation is handled by ensure_gaudi_env below, which
# guards against a missing `module` command and respects LWM_AUTO_HABANA
# instead of activating unconditionally before the toggle is checked.

# Resolve repository root (script lives in task2/)
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Shared constants for Habana Gaudi support.
TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"

# Try multiple possible locations for the Gaudi environment
POSSIBLE_GAUDI_PATHS=(
  "${HOME}/mamba/envs/${TARGET_GAUDI_ENV}"
  "${HOME}/.conda/envs/${TARGET_GAUDI_ENV}"
  "/packages/envs/${TARGET_GAUDI_ENV}"
  "${CONDA_PREFIX}/../${TARGET_GAUDI_ENV}"
)

TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
  if [[ -d "$path" ]]; then
    TARGET_GAUDI_PREFIX="$path"
    break
  fi
done

TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"

# Ensure Habana Gaudi environment when available so subshell picks up the right
# python/pip executables. Controlled by LWM_AUTO_HABANA (defaults to enabled).
ensure_gaudi_env() {
  if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
    echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
    return
  fi

  if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
    echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
    return
  fi

  if [[ ! -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
    echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
    return
  fi

  echo "[DEBUG] Attempting to activate Gaudi environment..."
  
  if command -v module >/dev/null 2>&1; then
    echo "[DEBUG] Loading mamba module..."
    module load mamba/latest 2>&1 | grep -v "^$" || true
  else
    echo "[DEBUG] module command not available, skipping module load"
  fi

  local activated="0"

  if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
    echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
    # shellcheck disable=SC1091
    if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via direct path"
    fi
  fi

  if [[ "${activated}" != "1" ]]; then
    echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
    # shellcheck disable=SC1091
    if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via conda"
    else
      echo "[DEBUG] Failed to activate Gaudi environment"
    fi
  fi
  
  if [[ "${activated}" == "1" ]]; then
    echo "[DEBUG] Gaudi environment activated successfully"
  fi
}
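
# Illustrative invocations of the LWM_AUTO_HABANA toggle (a sketch; the
# script filename below is assumed, substitute this file's actual name):
#
#   # Skip auto-activation when the right env is already active:
#   LWM_AUTO_HABANA=0 bash task2/run_benchmark.sh
#
#   # Default behaviour, attempt activation if needed:
#   LWM_AUTO_HABANA=1 bash task2/run_benchmark.sh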

ensure_gaudi_env

# Use the resolved Gaudi environment's python directly to avoid version
# conflicts (resolved via POSSIBLE_GAUDI_PATHS above rather than a path
# hard-coded to one user's home directory).
if [[ -n "${TARGET_GAUDI_PREFIX}" && -f "${TARGET_GAUDI_PREFIX}/bin/python" ]]; then
  GAUDI_PYTHON="${TARGET_GAUDI_PREFIX}/bin/python"
  # Add project root to PYTHONPATH for proper imports
  export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
  PYTHON_CMD=("${GAUDI_PYTHON}")
  echo "[DEBUG] Using Gaudi python: ${GAUDI_PYTHON}"
  echo "[DEBUG] PYTHONPATH: ${PYTHONPATH}"
else
  # Fall back to whatever python is on PATH
  PYTHON_CMD=(python)
  echo "[DEBUG] Using system python (Gaudi python not found)"
fi

# Tune for the Habana Gaudi environment: HPU memory is ample (96GB), but host
# CPU memory is constrained.
# Shrink the memmap cache to prevent OOM (25 files × 64MB = 1.6GB is too much
# for concurrent processes).
export LWM_SPECTRO_MEMMAP_CACHE_SIZE="${LWM_SPECTRO_MEMMAP_CACHE_SIZE:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
# Prevent user site-packages from interfering with conda environment
export PYTHONNOUSERSITE=1

# Generate a single timestamp for this entire benchmark run
# This ensures all models/sizes share the same output directory
export LWM_RUN_TIMESTAMP="${LWM_RUN_TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"
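
# Example: pin the timestamp so multiple launches share one output directory
# (a sketch; script name assumed):
#
#   export LWM_RUN_TIMESTAMP=20250101_120000
#   bash task2/run_benchmark.sh --models lwm
#   bash task2/run_benchmark.sh --models resnet18   # lands in the same run dir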

# Run the Task 2 joint SNR+Mobility benchmark across the default backbones.
# Optimized defaults for Habana Gaudi environment with 8 cores and 96GB memory
echo "
==================================================
Task 2 joint SNR+Mobility benchmark launcher
=================================================="
echo "[INFO] Run timestamp: ${LWM_RUN_TIMESTAMP}"
echo "[INFO] Memmap cache size: ${LWM_SPECTRO_MEMMAP_CACHE_SIZE}"
echo "[INFO] OMP threads: ${OMP_NUM_THREADS}"
# Reduce batch size to prevent OOM (16 instead of 32)
BATCH_SIZE="${TASK2_BENCH_BATCH_SIZE:-16}"
MAX_SAMPLES_PER_CONFIG="${TASK2_BENCH_MAX_SAMPLES_PER_CONFIG:-128}"
EPOCHS="${TASK2_BENCH_EPOCHS:-100}"
NUM_WORKERS="${TASK2_BENCH_NUM_WORKERS:-0}"
TRAINABLE_LAYERS="${TASK2_BENCH_TRAINABLE_LAYERS:-2}"
LWM_TRAINABLE_LAYERS="${TASK2_BENCH_LWM_TRAINABLE_LAYERS:-2}"
VAL_PER_CLASS="${TASK2_BENCH_VAL_PER_CLASS:-128}"
TEST_PER_CLASS="${TASK2_BENCH_TEST_PER_CLASS:-128}"
EVAL_EVERY="${TASK2_BENCH_EVAL_EVERY:-2}"
TRAIN_SIZE_DEFAULT="${TASK2_BENCH_TRAIN_SIZES:-"5 10 20 50 100 200 400"}"
IFS=' ' read -r -a TRAIN_SIZES <<< "${TRAIN_SIZE_DEFAULT}"
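# e.g. TASK2_BENCH_TRAIN_SIZES="10 50" restricts the sweep to two per-class
# training-set sizes (space-separated, as parsed above).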
NUM_SHARDS="${TASK2_BENCH_NUM_SHARDS:-4}"
SHARD_OVERRIDE="${TASK2_BENCH_SHARD_INDEX:-}"
if [[ -n "${SHARD_OVERRIDE}" ]]; then
  SHARD_INDICES=("${SHARD_OVERRIDE}")
else
  if [[ "${NUM_SHARDS}" -lt 1 ]]; then
    NUM_SHARDS=1
  fi
  SHARD_INDICES=()
  for ((idx=0; idx<NUM_SHARDS; idx++)); do
    SHARD_INDICES+=("${idx}")
  done
fi
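
# Sharding sketch: with TASK2_BENCH_NUM_SHARDS=4 and no override, the loops
# below visit shard indices 0 1 2 3 in sequence; TASK2_BENCH_SHARD_INDEX pins
# a single shard, e.g. to spread shards across separate job submissions
# (script name assumed):
#
#   TASK2_BENCH_NUM_SHARDS=4 TASK2_BENCH_SHARD_INDEX=2 bash task2/run_benchmark.sh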
echo "[INFO] Benchmark batch size: ${BATCH_SIZE}"
echo "[INFO] Benchmark max samples per config: ${MAX_SAMPLES_PER_CONFIG}"
echo "[INFO] Benchmark epochs: ${EPOCHS}"
echo "[INFO] Benchmark num workers: ${NUM_WORKERS}"
echo "[INFO] Benchmark eval every: ${EVAL_EVERY} epochs"
echo "[INFO] Benchmark val/test per class: ${VAL_PER_CLASS}/${TEST_PER_CLASS}"
echo "[INFO] Benchmark train sizes: ${TRAIN_SIZES[*]}"
if [[ "${NUM_SHARDS}" -gt 1 ]]; then
  echo "[INFO] Dataset shards: ${NUM_SHARDS} (indices: ${SHARD_INDICES[*]})"
fi

run_benchmark() {
  "${PYTHON_CMD[@]}" "${ROOT_DIR}/task2/train_joint_snr_mobility.py" \
    --data-root spectrograms \
    --cities city_10_austin \
    --comm LTE \
    --benchmark \
    --batch-size "${BATCH_SIZE}" \
    --epochs "${EPOCHS}" \
    --max-samples-per-config "${MAX_SAMPLES_PER_CONFIG}" \
    --num-workers "${NUM_WORKERS}" \
    --eval-every "${EVAL_EVERY}" \
    --trainable-layers "${TRAINABLE_LAYERS}" \
    --lwm-trainable-layers "${LWM_TRAINABLE_LAYERS}" \
    --val-per-class "${VAL_PER_CLASS}" \
    --test-per-class "${TEST_PER_CLASS}" \
    "$@"
}
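
# For reference, with the built-in defaults (before any env overrides) each
# run_benchmark call expands to roughly:
#
#   <python> task2/train_joint_snr_mobility.py \
#     --data-root spectrograms --cities city_10_austin --comm LTE --benchmark \
#     --batch-size 16 --epochs 100 --max-samples-per-config 128 \
#     --num-workers 0 --eval-every 2 --trainable-layers 2 \
#     --lwm-trainable-layers 2 --val-per-class 128 --test-per-class 128 \
#     [extra args...]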

SPLIT_TRAIN_SIZES="${TASK2_BENCH_SPLIT_TRAIN_SIZES:-1}"
explicit_train_sizes=0
models_cli=0
for arg in "$@"; do
  if [[ "$arg" == "--train-sizes" ]]; then
    explicit_train_sizes=1
  fi
  if [[ "$arg" == "--models" ]]; then
    models_cli=1
  fi
done
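
# Pass-through sketch: supplying --train-sizes yourself, or setting
# TASK2_BENCH_SPLIT_TRAIN_SIZES=0, skips the per-size loop below and defers
# the size sweep to a single trainer invocation per shard/model, e.g.
# (script name and trainer argument syntax assumed):
#
#   bash task2/run_benchmark.sh --train-sizes 50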

if [[ ${models_cli} -eq 0 ]]; then
  if [[ -n "${TASK2_BENCH_MODEL:-}" ]]; then
    MODELS_TO_RUN=("${TASK2_BENCH_MODEL}")
  else
    MODELS_TO_RUN=("lwm" "ieee_cnn" "resnet18" "efficientnet_b0" "mobilenet_v3_small")
  fi
fi
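
# Model selection examples (script name assumed):
#
#   # Pick one backbone via env var (keeps the per-model loop, one entry):
#   TASK2_BENCH_MODEL=ieee_cnn bash task2/run_benchmark.sh
#
#   # Or pass --models on the CLI, which bypasses MODELS_TO_RUN entirely:
#   bash task2/run_benchmark.sh --models efficientnet_b0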

if [[ "${SPLIT_TRAIN_SIZES}" != "0" && ${explicit_train_sizes} -eq 0 ]]; then
  for size in "${TRAIN_SIZES[@]}"; do
    echo ""
    echo "=========================================="
    echo "Benchmarking train size: ${size} per class"
    echo "=========================================="
    for shard_idx in "${SHARD_INDICES[@]}"; do
      echo "  [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
      shard_args=(--num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}")
      if [[ ${models_cli} -eq 1 ]]; then
        run_benchmark --train-sizes "${size}" "${shard_args[@]}" "$@"
      else
        for model in "${MODELS_TO_RUN[@]}"; do
          echo "    -> Model: ${model}"
          run_benchmark --models "${model}" --train-sizes "${size}" "${shard_args[@]}" "$@"
        done
      fi
    done
  done
else
  if [[ ${models_cli} -eq 1 ]]; then
    for shard_idx in "${SHARD_INDICES[@]}"; do
      echo ""
      echo "=========================================="
      echo "Benchmarking shard: ${shard_idx}/${NUM_SHARDS}"
      echo "=========================================="
      run_benchmark --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
    done
  else
    for model in "${MODELS_TO_RUN[@]}"; do
      echo ""
      echo "=========================================="
      echo "Benchmarking model: ${model}"
      echo "=========================================="
      for shard_idx in "${SHARD_INDICES[@]}"; do
        echo "  [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
        run_benchmark --models "${model}" --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
      done
    done
  fi
fi