#!/usr/bin/env bash
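
# Best-effort eager activation. This runs before `set -u` is enabled below
# because conda/mamba activation scripts may reference unset variables;
# ensure_gaudi_env retries with fallbacks if this pass fails.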
module load mamba/latest
source activate gaudi-pytorch-diffusion-1.22.0.740

set -euo pipefail

# Repository root, resolved relative to this script's location.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"

# Candidate prefixes for the Gaudi environment; the first existing one wins.
# CONDA_PREFIX may be unset, so default it to avoid tripping `set -u`.
POSSIBLE_GAUDI_PATHS=(
    "${HOME}/mamba/envs/${TARGET_GAUDI_ENV}"
    "${HOME}/.conda/envs/${TARGET_GAUDI_ENV}"
    "/packages/envs/${TARGET_GAUDI_ENV}"
    "${CONDA_PREFIX:-}/../${TARGET_GAUDI_ENV}"
)

TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
    if [[ -d "$path" ]]; then
        TARGET_GAUDI_PREFIX="$path"
        break
    fi
done

# With an empty prefix this becomes "/bin/activate", which the -f checks below reject.
TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"

ensure_gaudi_env() {
    if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
        echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
        return
    fi

    if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
        echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
        return
    fi

    if [[ ! -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
        echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
        return
    fi

    echo "[DEBUG] Attempting to activate Gaudi environment..."

    if command -v module >/dev/null 2>&1; then
        echo "[DEBUG] Loading mamba module..."
        module load mamba/latest 2>&1 | grep -v "^$" || true
    else
        echo "[DEBUG] module command not available, skipping module load"
    fi

    local activated="0"

    if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
        echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
        # Activation scripts may reference unset variables; relax nounset
        # around the source so it cannot abort the script under `set -u`.
        set +u
        if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
            activated="1"
            echo "[DEBUG] Successfully activated via direct path"
        fi
        set -u
    fi

    if [[ "${activated}" != "1" ]]; then
        echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
        set +u
        if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
            activated="1"
            echo "[DEBUG] Successfully activated via conda"
        else
            echo "[DEBUG] Failed to activate Gaudi environment"
        fi
        set -u
    fi

    if [[ "${activated}" == "1" ]]; then
        echo "[DEBUG] Gaudi environment activated successfully"
    fi
}

ensure_gaudi_env
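
# Tip: set LWM_AUTO_HABANA=0 to skip auto-activation when the job environment
# is already prepared.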

# Use the interpreter from the discovered prefix; fall back to the default
# mamba location under the caller's HOME instead of a hard-coded user path.
GAUDI_PYTHON="${TARGET_GAUDI_PREFIX:-${HOME}/mamba/envs/${TARGET_GAUDI_ENV}}/bin/python"
if [[ -x "${GAUDI_PYTHON}" ]]; then
    # Expose the repository root so the task modules import cleanly.
    export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
    PYTHON_CMD=("${GAUDI_PYTHON}")
    echo "[DEBUG] Using Gaudi python: ${GAUDI_PYTHON}"
    echo "[DEBUG] PYTHONPATH: ${PYTHONPATH}"
else
    PYTHON_CMD=(python)
    echo "[DEBUG] Using system python (Gaudi python not found)"
fi
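
# Runtime tuning defaults; override any of these from the calling environment.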
export LWM_SPECTRO_MEMMAP_CACHE_SIZE="${LWM_SPECTRO_MEMMAP_CACHE_SIZE:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"

# Keep user site-packages from shadowing the environment's packages.
export PYTHONNOUSERSITE=1

# One timestamp per launcher invocation (reused if already set).
export LWM_RUN_TIMESTAMP="${LWM_RUN_TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"

echo "
==================================================
Task 2 joint SNR+Mobility benchmark launcher
=================================================="
echo "[INFO] Run timestamp: ${LWM_RUN_TIMESTAMP}"
echo "[INFO] Memmap cache size: ${LWM_SPECTRO_MEMMAP_CACHE_SIZE}"
echo "[INFO] OMP threads: ${OMP_NUM_THREADS}"
BATCH_SIZE="${TASK2_BENCH_BATCH_SIZE:-16}"
MAX_SAMPLES_PER_CONFIG="${TASK2_BENCH_MAX_SAMPLES_PER_CONFIG:-128}"
EPOCHS="${TASK2_BENCH_EPOCHS:-100}"
NUM_WORKERS="${TASK2_BENCH_NUM_WORKERS:-0}"
TRAINABLE_LAYERS="${TASK2_BENCH_TRAINABLE_LAYERS:-2}"
LWM_TRAINABLE_LAYERS="${TASK2_BENCH_LWM_TRAINABLE_LAYERS:-2}"
VAL_PER_CLASS="${TASK2_BENCH_VAL_PER_CLASS:-128}"
TEST_PER_CLASS="${TASK2_BENCH_TEST_PER_CLASS:-128}"
EVAL_EVERY="${TASK2_BENCH_EVAL_EVERY:-2}"
TRAIN_SIZE_DEFAULT="${TASK2_BENCH_TRAIN_SIZES:-"5 10 20 50 100 200 400"}"
IFS=' ' read -r -a TRAIN_SIZES <<< "${TRAIN_SIZE_DEFAULT}"
NUM_SHARDS="${TASK2_BENCH_NUM_SHARDS:-4}"
SHARD_OVERRIDE="${TASK2_BENCH_SHARD_INDEX:-}"
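
# Example (illustrative values): a short smoke test over two shards.
#   TASK2_BENCH_EPOCHS=5 TASK2_BENCH_TRAIN_SIZES="5 10" TASK2_BENCH_NUM_SHARDS=2 \
#       bash <this-script>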

# Run only the shard named by TASK2_BENCH_SHARD_INDEX when set; otherwise
# iterate over all NUM_SHARDS shard indices.
if [[ -n "${SHARD_OVERRIDE}" ]]; then
    SHARD_INDICES=("${SHARD_OVERRIDE}")
else
    if [[ "${NUM_SHARDS}" -lt 1 ]]; then
        NUM_SHARDS=1
    fi
    SHARD_INDICES=()
    for ((idx=0; idx<NUM_SHARDS; idx++)); do
        SHARD_INDICES+=("${idx}")
    done
fi

echo "[INFO] Benchmark batch size: ${BATCH_SIZE}"
echo "[INFO] Benchmark max samples per config: ${MAX_SAMPLES_PER_CONFIG}"
echo "[INFO] Benchmark epochs: ${EPOCHS}"
echo "[INFO] Benchmark num workers: ${NUM_WORKERS}"
echo "[INFO] Benchmark eval every: ${EVAL_EVERY} epochs"
echo "[INFO] Benchmark val/test per class: ${VAL_PER_CLASS}/${TEST_PER_CLASS}"
echo "[INFO] Benchmark train sizes: ${TRAIN_SIZES[*]}"
if [[ "${NUM_SHARDS}" -gt 1 ]]; then
    echo "[INFO] Dataset shards: ${NUM_SHARDS} (indices: ${SHARD_INDICES[*]})"
fi
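
# Launch one training run; callers append model/size/shard flags, which are
# forwarded to the trainer after the fixed benchmark arguments.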
run_benchmark() {
    "${PYTHON_CMD[@]}" "${ROOT_DIR}/task2/train_joint_snr_mobility.py" \
        --data-root spectrograms \
        --cities city_10_austin \
        --comm LTE \
        --benchmark \
        --batch-size "${BATCH_SIZE}" \
        --epochs "${EPOCHS}" \
        --max-samples-per-config "${MAX_SAMPLES_PER_CONFIG}" \
        --num-workers "${NUM_WORKERS}" \
        --eval-every "${EVAL_EVERY}" \
        --trainable-layers "${TRAINABLE_LAYERS}" \
        --lwm-trainable-layers "${LWM_TRAINABLE_LAYERS}" \
        --val-per-class "${VAL_PER_CLASS}" \
        --test-per-class "${TEST_PER_CLASS}" \
        "$@"
}
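# e.g. run_benchmark --models lwm --train-sizes 10 --num-shards 4 --shard-index 0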

SPLIT_TRAIN_SIZES="${TASK2_BENCH_SPLIT_TRAIN_SIZES:-1}"

# Detect whether the caller already passed --train-sizes or --models so the
# dispatch below does not duplicate or override explicit CLI choices.
explicit_train_sizes=0
models_cli=0
for arg in "$@"; do
    if [[ "$arg" == "--train-sizes" ]]; then
        explicit_train_sizes=1
    fi
    if [[ "$arg" == "--models" ]]; then
        models_cli=1
    fi
done

if [[ ${models_cli} -eq 0 ]]; then
    if [[ -n "${TASK2_BENCH_MODEL:-}" ]]; then
        MODELS_TO_RUN=("${TASK2_BENCH_MODEL}")
    else
        MODELS_TO_RUN=("lwm" "ieee_cnn" "resnet18" "efficientnet_b0" "mobilenet_v3_small")
    fi
fi
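
# Dispatch. By default each (train size, shard, model) combination gets its
# own trainer invocation; with TASK2_BENCH_SPLIT_TRAIN_SIZES=0 or an explicit
# --train-sizes flag, train-size handling is left to the trainer itself.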
if [[ "${SPLIT_TRAIN_SIZES}" != "0" && ${explicit_train_sizes} -eq 0 ]]; then
    for size in "${TRAIN_SIZES[@]}"; do
        echo ""
        echo "=========================================="
        echo "Benchmarking train size: ${size} per class"
        echo "=========================================="
        for shard_idx in "${SHARD_INDICES[@]}"; do
            echo " [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
            shard_args=(--num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}")
            if [[ ${models_cli} -eq 1 ]]; then
                run_benchmark --train-sizes "${size}" "${shard_args[@]}" "$@"
            else
                for model in "${MODELS_TO_RUN[@]}"; do
                    echo " -> Model: ${model}"
                    run_benchmark --models "${model}" --train-sizes "${size}" "${shard_args[@]}" "$@"
                done
            fi
        done
    done
else
    if [[ ${models_cli} -eq 1 ]]; then
        for shard_idx in "${SHARD_INDICES[@]}"; do
            echo ""
            echo "=========================================="
            echo "Benchmarking shard: ${shard_idx}/${NUM_SHARDS}"
            echo "=========================================="
            run_benchmark --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
        done
    else
        for model in "${MODELS_TO_RUN[@]}"; do
            echo ""
            echo "=========================================="
            echo "Benchmarking model: ${model}"
            echo "=========================================="
            for shard_idx in "${SHARD_INDICES[@]}"; do
                echo " [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
                run_benchmark --models "${model}" --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
            done
        done
    fi
fi