#!/usr/bin/env bash
# Best-effort eager activation of the Gaudi stack before strict mode is enabled;
# ensure_gaudi_env below re-checks the environment and reports diagnostics.
module load mamba/latest
source activate gaudi-pytorch-diffusion-1.22.0.740
set -euo pipefail
# Resolve repository root (script lives in task2/)
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Shared constants for Habana Gaudi support.
TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"
# Try multiple possible locations for the Gaudi environment
POSSIBLE_GAUDI_PATHS=(
    "${HOME}/mamba/envs/${TARGET_GAUDI_ENV}"
    "${HOME}/.conda/envs/${TARGET_GAUDI_ENV}"
    "/packages/envs/${TARGET_GAUDI_ENV}"
    "${CONDA_PREFIX:-}/../${TARGET_GAUDI_ENV}"
)
TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
    if [[ -d "$path" ]]; then
        TARGET_GAUDI_PREFIX="$path"
        break
    fi
done
TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"
# Activate the Habana Gaudi environment when available so subsequent commands pick
# up the right python/pip executables. Controlled by LWM_AUTO_HABANA (defaults to enabled).
ensure_gaudi_env() {
    if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
        echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
        return
    fi
    if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
        echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
        return
    fi
    if [[ ! -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
        echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
        return
    fi
    echo "[DEBUG] Attempting to activate Gaudi environment..."
    if command -v module >/dev/null 2>&1; then
        echo "[DEBUG] Loading mamba module..."
        module load mamba/latest 2>&1 | grep -v "^$" || true
    else
        echo "[DEBUG] module command not available, skipping module load"
    fi
    local activated="0"
    if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
        echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
        # shellcheck disable=SC1091
        if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
            activated="1"
            echo "[DEBUG] Successfully activated via direct path"
        fi
    fi
    if [[ "${activated}" != "1" ]]; then
        echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
        # shellcheck disable=SC1091
        if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
            activated="1"
            echo "[DEBUG] Successfully activated via conda"
        else
            echo "[DEBUG] Failed to activate Gaudi environment"
        fi
    fi
    if [[ "${activated}" == "1" ]]; then
        echo "[DEBUG] Gaudi environment activated successfully"
    fi
}
ensure_gaudi_env
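# Example (assumed usage, based on the LWM_AUTO_HABANA check above): skip the
# automatic activation and rely on an environment you activated yourself:
#   LWM_AUTO_HABANA=0 bash task2/run_task2_joint_benchmark.sh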
# Use the resolved Gaudi environment's python directly to avoid version conflicts.
GAUDI_PYTHON="${TARGET_GAUDI_PREFIX}/bin/python"
if [[ -n "${TARGET_GAUDI_PREFIX}" && -f "${GAUDI_PYTHON}" ]]; then
    # Add project root to PYTHONPATH for proper imports
    export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
    PYTHON_CMD=("${GAUDI_PYTHON}")
    echo "[DEBUG] Using Gaudi python: ${GAUDI_PYTHON}"
    echo "[DEBUG] PYTHONPATH: ${PYTHONPATH}"
else
    # Fall back to whichever python is on PATH.
    PYTHON_CMD=(python)
    echo "[DEBUG] Using system python (Gaudi python not found)"
fi
# Tune for the Habana Gaudi environment (96GB of HPU memory, but host CPU memory is constrained).
# Reduce the memmap cache size to prevent OOM: 25 files × 64MB = 1.6GB, which is too much for concurrent processes.
export LWM_SPECTRO_MEMMAP_CACHE_SIZE="${LWM_SPECTRO_MEMMAP_CACHE_SIZE:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"
# Prevent user site-packages from interfering with conda environment
export PYTHONNOUSERSITE=1
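# Sketch of how these knobs can be overridden per run (the values below are
# illustrative only, not recommendations):
#   LWM_SPECTRO_MEMMAP_CACHE_SIZE=4 OMP_NUM_THREADS=2 MKL_NUM_THREADS=2 \
#     bash task2/run_task2_joint_benchmark.sh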
# Generate a single timestamp for this entire benchmark run
# This ensures all models/sizes share the same output directory
export LWM_RUN_TIMESTAMP="${LWM_RUN_TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"
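# Hypothetical example: to have several launches write into the same output
# directory, export the timestamp yourself before starting each job:
#   export LWM_RUN_TIMESTAMP=20250101_120000
#   bash task2/run_task2_joint_benchmark.sh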
# Run the Task 2 joint SNR+Mobility benchmark across the default backbones.
# Optimized defaults for Habana Gaudi environment with 8 cores and 96GB memory
echo "
==================================================
Task 2 joint SNR+Mobility benchmark launcher
=================================================="
echo "[INFO] Run timestamp: ${LWM_RUN_TIMESTAMP}"
echo "[INFO] Memmap cache size: ${LWM_SPECTRO_MEMMAP_CACHE_SIZE}"
echo "[INFO] OMP threads: ${OMP_NUM_THREADS}"
# Reduce batch size to prevent OOM (16 instead of 32)
BATCH_SIZE="${TASK2_BENCH_BATCH_SIZE:-16}"
MAX_SAMPLES_PER_CONFIG="${TASK2_BENCH_MAX_SAMPLES_PER_CONFIG:-128}"
EPOCHS="${TASK2_BENCH_EPOCHS:-100}"
NUM_WORKERS="${TASK2_BENCH_NUM_WORKERS:-0}"
TRAINABLE_LAYERS="${TASK2_BENCH_TRAINABLE_LAYERS:-2}"
LWM_TRAINABLE_LAYERS="${TASK2_BENCH_LWM_TRAINABLE_LAYERS:-2}"
VAL_PER_CLASS="${TASK2_BENCH_VAL_PER_CLASS:-128}"
TEST_PER_CLASS="${TASK2_BENCH_TEST_PER_CLASS:-128}"
EVAL_EVERY="${TASK2_BENCH_EVAL_EVERY:-2}"
TRAIN_SIZE_DEFAULT="${TASK2_BENCH_TRAIN_SIZES:-"5 10 20 50 100 200 400"}"
IFS=' ' read -r -a TRAIN_SIZES <<< "${TRAIN_SIZE_DEFAULT}"
NUM_SHARDS="${TASK2_BENCH_NUM_SHARDS:-4}"
SHARD_OVERRIDE="${TASK2_BENCH_SHARD_INDEX:-}"
if [[ -n "${SHARD_OVERRIDE}" ]]; then
    SHARD_INDICES=("${SHARD_OVERRIDE}")
else
    if [[ "${NUM_SHARDS}" -lt 1 ]]; then
        NUM_SHARDS=1
    fi
    SHARD_INDICES=()
    for ((idx=0; idx<NUM_SHARDS; idx++)); do
        SHARD_INDICES+=("${idx}")
    done
fi
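# Example shard overrides (assumed usage of the variables above):
#   TASK2_BENCH_NUM_SHARDS=8 bash task2/run_task2_joint_benchmark.sh   # iterate shards 0..7
#   TASK2_BENCH_SHARD_INDEX=2 bash task2/run_task2_joint_benchmark.sh  # run only shard 2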
echo "[INFO] Benchmark batch size: ${BATCH_SIZE}"
echo "[INFO] Benchmark max samples per config: ${MAX_SAMPLES_PER_CONFIG}"
echo "[INFO] Benchmark epochs: ${EPOCHS}"
echo "[INFO] Benchmark num workers: ${NUM_WORKERS}"
echo "[INFO] Benchmark eval every: ${EVAL_EVERY} epochs"
echo "[INFO] Benchmark val/test per class: ${VAL_PER_CLASS}/${TEST_PER_CLASS}"
echo "[INFO] Benchmark train sizes: ${TRAIN_SIZES[*]}"
if [[ "${NUM_SHARDS}" -gt 1 ]]; then
echo "[INFO] Dataset shards: ${NUM_SHARDS} (indices: ${SHARD_INDICES[*]})"
fi
run_benchmark() {
    "${PYTHON_CMD[@]}" "${ROOT_DIR}/task2/train_joint_snr_mobility.py" \
        --data-root spectrograms \
        --cities city_10_austin \
        --comm LTE \
        --benchmark \
        --batch-size "${BATCH_SIZE}" \
        --epochs "${EPOCHS}" \
        --max-samples-per-config "${MAX_SAMPLES_PER_CONFIG}" \
        --num-workers "${NUM_WORKERS}" \
        --eval-every "${EVAL_EVERY}" \
        --trainable-layers "${TRAINABLE_LAYERS}" \
        --lwm-trainable-layers "${LWM_TRAINABLE_LAYERS}" \
        --val-per-class "${VAL_PER_CLASS}" \
        --test-per-class "${TEST_PER_CLASS}" \
        "$@"
}
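# run_benchmark forwards any extra arguments straight to train_joint_snr_mobility.py,
# so additional flags can be appended to this script. A minimal sketch (the flag
# names shown are the ones this script already uses below):
#   bash task2/run_task2_joint_benchmark.sh --models lwm --train-sizes 50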
SPLIT_TRAIN_SIZES="${TASK2_BENCH_SPLIT_TRAIN_SIZES:-1}"
explicit_train_sizes=0
models_cli=0
for arg in "$@"; do
    if [[ "$arg" == "--train-sizes" ]]; then
        explicit_train_sizes=1
    fi
    if [[ "$arg" == "--models" ]]; then
        models_cli=1
    fi
done
if [[ ${models_cli} -eq 0 ]]; then
    if [[ -n "${TASK2_BENCH_MODEL:-}" ]]; then
        MODELS_TO_RUN=("${TASK2_BENCH_MODEL}")
    else
        MODELS_TO_RUN=("lwm" "ieee_cnn" "resnet18" "efficientnet_b0" "mobilenet_v3_small")
    fi
fi
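# The model list can also be pinned via the environment instead of the CLI
# (example value taken from the default list above):
#   TASK2_BENCH_MODEL=resnet18 bash task2/run_task2_joint_benchmark.sh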
if [[ "${SPLIT_TRAIN_SIZES}" != "0" && ${explicit_train_sizes} -eq 0 ]]; then
for size in "${TRAIN_SIZES[@]}"; do
echo ""
echo "=========================================="
echo "Benchmarking train size: ${size} per class"
echo "=========================================="
for shard_idx in "${SHARD_INDICES[@]}"; do
echo " [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
shard_args=(--num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}")
if [[ ${models_cli} -eq 1 ]]; then
run_benchmark --train-sizes "${size}" "${shard_args[@]}" "$@"
else
for model in "${MODELS_TO_RUN[@]}"; do
echo " -> Model: ${model}"
run_benchmark --models "${model}" --train-sizes "${size}" "${shard_args[@]}" "$@"
done
fi
done
done
else
if [[ ${models_cli} -eq 1 ]]; then
for shard_idx in "${SHARD_INDICES[@]}"; do
echo ""
echo "=========================================="
echo "Benchmarking shard: ${shard_idx}/${NUM_SHARDS}"
echo "=========================================="
run_benchmark --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
done
else
for model in "${MODELS_TO_RUN[@]}"; do
echo ""
echo "=========================================="
echo "Benchmarking model: ${model}"
echo "=========================================="
for shard_idx in "${SHARD_INDICES[@]}"; do
echo " [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
run_benchmark --models "${model}" --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
done
done
fi
fi
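# End-to-end example of a smaller sweep (values are illustrative only):
#   TASK2_BENCH_EPOCHS=20 TASK2_BENCH_TRAIN_SIZES="5 10" TASK2_BENCH_NUM_SHARDS=1 \
#     bash task2/run_task2_joint_benchmark.sh --models lwm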