#!/usr/bin/env bash
# Best-effort early activation; ensure_gaudi_env below retries with full diagnostics.
if command -v module >/dev/null 2>&1; then
  module load mamba/latest || true
fi
source activate gaudi-pytorch-diffusion-1.22.0.740 2>/dev/null || true

set -euo pipefail

# Resolve the repository root (this script lives in task2/).
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Shared constants for Habana Gaudi support.
TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"

# Try multiple possible locations for the Gaudi environment.
# ${CONDA_PREFIX:-} guards against an "unbound variable" abort under set -u
# when no conda environment is active.
POSSIBLE_GAUDI_PATHS=(
  "${HOME}/mamba/envs/${TARGET_GAUDI_ENV}"
  "${HOME}/.conda/envs/${TARGET_GAUDI_ENV}"
  "/packages/envs/${TARGET_GAUDI_ENV}"
  "${CONDA_PREFIX:-}/../${TARGET_GAUDI_ENV}"
)
TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
  if [[ -d "${path}" ]]; then
    TARGET_GAUDI_PREFIX="${path}"
    break
  fi
done
TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"

# Ensure the Habana Gaudi environment when available so subshells pick up the
# right python/pip executables. Controlled by LWM_AUTO_HABANA (defaults to enabled).
ensure_gaudi_env() {
  if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
    echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
    return
  fi
  if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
    echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
    return
  fi

  echo "[DEBUG] Attempting to activate Gaudi environment..."
  if command -v module >/dev/null 2>&1; then
    echo "[DEBUG] Loading mamba module..."
    module load mamba/latest 2>&1 | grep -v "^$" || true
  else
    echo "[DEBUG] module command not available, skipping module load"
  fi

  local activated="0"
  if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
    echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
    # shellcheck disable=SC1091
    if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via direct path"
    fi
  else
    # Do not return early here: the conda fallback below does not need the
    # activate script, so it should still get a chance to run.
    echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
  fi
  if [[ "${activated}" != "1" ]]; then
    echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
    # shellcheck disable=SC1091
    if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via conda"
    else
      echo "[DEBUG] Failed to activate Gaudi environment"
    fi
  fi
  if [[ "${activated}" == "1" ]]; then
    echo "[DEBUG] Gaudi environment activated successfully"
  fi
}

ensure_gaudi_env

# Use the resolved Gaudi environment's python directly to avoid version
# conflicts (derived from TARGET_GAUDI_PREFIX rather than a hard-coded home path).
GAUDI_PYTHON="${TARGET_GAUDI_PREFIX}/bin/python"
if [[ -n "${TARGET_GAUDI_PREFIX}" && -x "${GAUDI_PYTHON}" ]]; then
  # Add the project root to PYTHONPATH for proper imports.
  export PYTHONPATH="${ROOT_DIR}:${PYTHONPATH:-}"
  PYTHON_CMD=("${GAUDI_PYTHON}")
  echo "[DEBUG] Using Gaudi python: ${GAUDI_PYTHON}"
  echo "[DEBUG] PYTHONPATH: ${PYTHONPATH}"
else
  # Fall back to the system python.
  PYTHON_CMD=(python)
  echo "[DEBUG] Using system python (Gaudi python not found)"
fi

# Optimize for the Habana Gaudi node (96GB of HPU memory, but constrained CPU memory).
# Shrink the memmap cache to prevent host OOM: 25 files x 64MB is roughly 1.6GB,
# too much when several benchmark processes run concurrently.
export LWM_SPECTRO_MEMMAP_CACHE_SIZE="${LWM_SPECTRO_MEMMAP_CACHE_SIZE:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-4}"

# Prevent user site-packages from interfering with the conda environment.
export PYTHONNOUSERSITE=1

# Generate a single timestamp for this entire benchmark run so that all
# models/sizes share the same output directory.
export LWM_RUN_TIMESTAMP="${LWM_RUN_TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"
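# Optional sanity probe (a minimal sketch, not part of the original launcher):
# verify the selected interpreter can import habana_frameworks, the Habana
# PyTorch bridge, before committing to a long run. LWM_SKIP_HPU_CHECK is a
# hypothetical opt-out knob introduced here, not one of the original settings.
if [[ "${LWM_SKIP_HPU_CHECK:-0}" != "1" ]]; then
  if "${PYTHON_CMD[@]}" -c 'import habana_frameworks.torch.core' >/dev/null 2>&1; then
    echo "[DEBUG] habana_frameworks import OK (HPU stack available)"
  else
    echo "[WARN] habana_frameworks not importable; runs may silently fall back to CPU"
  fi
fi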
# Run the Task 2 joint SNR+Mobility benchmark across the default backbones.
# Defaults are tuned for a Habana Gaudi node with 8 cores and 96GB of memory.
echo ""
echo "=================================================="
echo "Task 2 joint SNR+Mobility benchmark launcher"
echo "=================================================="
echo "[INFO] Run timestamp: ${LWM_RUN_TIMESTAMP}"
echo "[INFO] Memmap cache size: ${LWM_SPECTRO_MEMMAP_CACHE_SIZE}"
echo "[INFO] OMP threads: ${OMP_NUM_THREADS}"

# Batch size is halved (16 instead of 32) to prevent OOM.
BATCH_SIZE="${TASK2_BENCH_BATCH_SIZE:-16}"
MAX_SAMPLES_PER_CONFIG="${TASK2_BENCH_MAX_SAMPLES_PER_CONFIG:-128}"
EPOCHS="${TASK2_BENCH_EPOCHS:-100}"
NUM_WORKERS="${TASK2_BENCH_NUM_WORKERS:-0}"
TRAINABLE_LAYERS="${TASK2_BENCH_TRAINABLE_LAYERS:-2}"
LWM_TRAINABLE_LAYERS="${TASK2_BENCH_LWM_TRAINABLE_LAYERS:-2}"
VAL_PER_CLASS="${TASK2_BENCH_VAL_PER_CLASS:-128}"
TEST_PER_CLASS="${TASK2_BENCH_TEST_PER_CLASS:-128}"
EVAL_EVERY="${TASK2_BENCH_EVAL_EVERY:-2}"
TRAIN_SIZE_DEFAULT="${TASK2_BENCH_TRAIN_SIZES:-"5 10 20 50 100 200 400"}"
IFS=' ' read -r -a TRAIN_SIZES <<< "${TRAIN_SIZE_DEFAULT}"

NUM_SHARDS="${TASK2_BENCH_NUM_SHARDS:-4}"
SHARD_OVERRIDE="${TASK2_BENCH_SHARD_INDEX:-}"
if [[ -n "${SHARD_OVERRIDE}" ]]; then
  SHARD_INDICES=("${SHARD_OVERRIDE}")
else
  if [[ "${NUM_SHARDS}" -lt 1 ]]; then
    NUM_SHARDS=1
  fi
  SHARD_INDICES=()
  for ((idx = 0; idx < NUM_SHARDS; idx++)); do
    SHARD_INDICES+=("${idx}")
  done
fi

# run_benchmark forwards the tuned knobs above plus any caller flags to the
# Python entrypoint. ASSUMPTION: the module path task2.benchmark and the flag
# names below are reconstructed from the variables above, not confirmed by the
# source; adjust them to the real CLI.
run_benchmark() {
  "${PYTHON_CMD[@]}" -m task2.benchmark \
    --batch-size "${BATCH_SIZE}" \
    --max-samples-per-config "${MAX_SAMPLES_PER_CONFIG}" \
    --epochs "${EPOCHS}" \
    --num-workers "${NUM_WORKERS}" \
    --trainable-layers "${TRAINABLE_LAYERS}" \
    --lwm-trainable-layers "${LWM_TRAINABLE_LAYERS}" \
    --val-per-class "${VAL_PER_CLASS}" \
    --test-per-class "${TEST_PER_CLASS}" \
    --eval-every "${EVAL_EVERY}" \
    "$@"
}

# Detect whether the caller pinned --models on the command line; otherwise
# sweep the default backbones. ASSUMPTION: the default list ("lwm") is a
# placeholder reconstructed from the LWM_* naming above.
# shellcheck disable=SC2206
MODELS_TO_RUN=(${TASK2_BENCH_MODELS:-lwm})
models_cli=0
for arg in "$@"; do
  if [[ "${arg}" == "--models" ]]; then
    models_cli=1
  fi
done

# ASSUMPTION: TASK2_BENCH_PER_SIZE is a reconstructed knob selecting whether
# each train size gets its own invocation (the per-size branch below survives
# only partially in the source).
PER_SIZE_MODE="${TASK2_BENCH_PER_SIZE:-1}"
if [[ "${PER_SIZE_MODE}" == "1" ]]; then
  for shard_idx in "${SHARD_INDICES[@]}"; do
    shard_args=(--num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}")
    for size in "${TRAIN_SIZES[@]}"; do
      echo ""
      echo "=========================================="
      echo "Train size ${size} (shard ${shard_idx}/${NUM_SHARDS})"
      echo "=========================================="
      if [[ ${models_cli} -eq 1 ]]; then
        run_benchmark --train-sizes "${size}" "${shard_args[@]}" "$@"
      else
        for model in "${MODELS_TO_RUN[@]}"; do
          echo "  -> Model: ${model}"
          run_benchmark --models "${model}" --train-sizes "${size}" "${shard_args[@]}" "$@"
        done
      fi
    done
  done
else
  if [[ ${models_cli} -eq 1 ]]; then
    for shard_idx in "${SHARD_INDICES[@]}"; do
      echo ""
      echo "=========================================="
      echo "Benchmarking shard: ${shard_idx}/${NUM_SHARDS}"
      echo "=========================================="
      run_benchmark --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
    done
  else
    for model in "${MODELS_TO_RUN[@]}"; do
      echo ""
      echo "=========================================="
      echo "Benchmarking model: ${model}"
      echo "=========================================="
      for shard_idx in "${SHARD_INDICES[@]}"; do
        echo "  [INFO] Shard ${shard_idx}/${NUM_SHARDS}"
        run_benchmark --models "${model}" --num-shards "${NUM_SHARDS}" --shard-index "${shard_idx}" "$@"
      done
    done
  fi
fi
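# Example invocations (illustrative; the script filename below is a stand-in
# for however this launcher is actually named under task2/):
#   TASK2_BENCH_SHARD_INDEX=0 bash task2/run_task2_benchmark.sh --models lwm
#   TASK2_BENCH_TRAIN_SIZES="10 50" TASK2_BENCH_EPOCHS=20 bash task2/run_task2_benchmark.sh
echo ""
echo "[INFO] All benchmark runs finished for timestamp ${LWM_RUN_TIMESTAMP}"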