# lwm-spectro / task1/run_task1_all_models.sh
# Uploaded by wi-lab via huggingface_hub (commit 6a0e1d3, verified)
#!/usr/bin/env bash
# Task 1 benchmark driver: trains all configured backbones via
# task1/train_mcs_models.py, preferring a Habana Gaudi (HPU) environment.
#
# Best-effort bootstrap. These two lines deliberately run BEFORE `set -e`:
# on machines without the cluster module system, `module` / `activate` are
# absent and their failure must not abort the script.
module load mamba/latest
source activate gaudi-pytorch-diffusion-1.22.0.740
set -euo pipefail

# Resolve repository root (script lives in task1/)
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Run Task 1 benchmark covering all configured backbones.
# Default device is 'auto' which will detect HPU, then CUDA, then CPU
# Override with --device hpu/cuda/cpu if needed
# All paths now default to project root, so we don't need to specify them

# Shared constants for Habana Gaudi support.
TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"

# Try multiple possible locations for the Gaudi environment.
# BUGFIX: `set -u` is already active here, so CONDA_PREFIX and HOME must be
# expanded with a ':-' default — an unset CONDA_PREFIX previously aborted
# the whole script while building this array.
POSSIBLE_GAUDI_PATHS=(
  "${HOME:-}/mamba/envs/${TARGET_GAUDI_ENV}"
  "${HOME:-}/.conda/envs/${TARGET_GAUDI_ENV}"
  "/packages/envs/${TARGET_GAUDI_ENV}"
  "${CONDA_PREFIX:-}/../${TARGET_GAUDI_ENV}"
)

# Pick the first candidate directory that exists; empty when none was found.
TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
  if [[ -d "$path" ]]; then
    TARGET_GAUDI_PREFIX="$path"
    break
  fi
done
# NOTE: when TARGET_GAUDI_PREFIX is empty this is just "/bin/activate";
# consumers must check existence before sourcing it.
TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"
# Activate the Habana Gaudi conda env in the current shell, if possible.
# Reads globals: LWM_AUTO_HABANA, CONDA_DEFAULT_ENV, TARGET_GAUDI_ENV,
# TARGET_GAUDI_ACTIVATE. Always returns 0 (best-effort; never aborts).
ensure_gaudi_env() {
# Opt-out switch: LWM_AUTO_HABANA=0 disables all automatic activation.
if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
return
fi
# Already inside the target conda env — nothing to do.
if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
return
fi
# No activate script was discovered by the prefix probe above; bail out.
if [[ ! -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
return
fi
echo "[DEBUG] Attempting to activate Gaudi environment..."
# Best-effort: load the mamba module so conda tooling is on PATH on cluster
# nodes; failures are tolerated (trailing || true).
if command -v module >/dev/null 2>&1; then
echo "[DEBUG] Loading mamba module..."
module load mamba/latest 2>&1 | grep -v "^$" || true
else
echo "[DEBUG] module command not available, skipping module load"
fi
local activated="0"
# Strategy 1: source the env's own bin/activate directly.
# (Existence re-checked here even though checked above; harmless.)
if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
# shellcheck disable=SC1091
if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
activated="1"
echo "[DEBUG] Successfully activated via direct path"
fi
fi
# Strategy 2: fall back to the legacy `source activate <env>` interface.
if [[ "${activated}" != "1" ]]; then
echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
# shellcheck disable=SC1091
if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
activated="1"
echo "[DEBUG] Successfully activated via conda"
else
echo "[DEBUG] Failed to activate Gaudi environment"
fi
fi
if [[ "${activated}" == "1" ]]; then
echo "[DEBUG] Gaudi environment activated successfully"
fi
}
# Run the activation attempt once, up front, so everything below sees the
# mutated environment (CONDA_PREFIX, CONDA_DEFAULT_ENV, PATH).
ensure_gaudi_env
# Resolve python interpreter. Prefer the activated environment even if a module
# pre-set PYTHON points elsewhere.
# Probe one interpreter candidate: succeed (and record it as the single
# element of the global PYTHON_CMD array) only when it exists and can
# `import torch`. Returns non-zero otherwise, leaving PYTHON_CMD untouched.
#   $1 - bare command name (looked up on PATH) or an explicit path.
try_python() {
  local candidate="$1"
  [[ -n "${candidate}" ]] || return 1
  case "${candidate}" in
    */*)
      # Explicit path: must be an executable file.
      [[ -x "${candidate}" ]] || return 1
      ;;
    *)
      # Bare name: must resolve on PATH.
      command -v "${candidate}" >/dev/null 2>&1 || return 1
      ;;
  esac
  # Only accept interpreters that actually ship torch.
  "${candidate}" -c "import torch" >/dev/null 2>&1 || return 1
  PYTHON_CMD=("${candidate}")
  return 0
}
PYTHON_CMD=()
# After ensure_gaudi_env, if we're in the target environment, use its python
# directly. This has highest priority to ensure we use the correct Gaudi Python.
# BUGFIX: require a non-empty TARGET_GAUDI_PREFIX. Previously, when no Gaudi
# prefix was found ("") and CONDA_PREFIX was unset, the "" == "" comparison
# matched and the script could force "/bin/python" without any torch check.
if [[ -n "${TARGET_GAUDI_PREFIX}" ]] && \
   { [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]] || [[ "${CONDA_PREFIX:-}" == "${TARGET_GAUDI_PREFIX}" ]]; }; then
  echo "[DEBUG] Gaudi environment active: ${CONDA_DEFAULT_ENV:-unknown}"
  if [[ -x "${TARGET_GAUDI_PREFIX}/bin/python" ]]; then
    PYTHON_CMD=("${TARGET_GAUDI_PREFIX}/bin/python")
    echo "[DEBUG] Forcing use of Gaudi Python: ${TARGET_GAUDI_PREFIX}/bin/python"
  fi
fi
# Next priority: explicit CONDA prefix (current shell)
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${CONDA_PREFIX:-}" ]]; then
  try_python "${CONDA_PREFIX}/bin/python" || true
fi
# Virtualenv support
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${VIRTUAL_ENV:-}" ]]; then
  try_python "${VIRTUAL_ENV}/bin/python" || true
fi
# Try known Habana Gaudi environment path explicitly (same empty-prefix
# guard as above so we never probe a bare "/bin/python").
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${TARGET_GAUDI_PREFIX}" && -x "${TARGET_GAUDI_PREFIX}/bin/python" ]]; then
  try_python "${TARGET_GAUDI_PREFIX}/bin/python" || true
fi
# Try common conda locations using the active env name
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${CONDA_DEFAULT_ENV:-}" && "${CONDA_DEFAULT_ENV}" != "base" ]]; then
  if [[ -n "${HOME:-}" ]]; then
    try_python "${HOME}/mamba/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
      try_python "${HOME}/.conda/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    fi
  fi
  if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${MAMBA_ROOT_PREFIX:-}" ]]; then
    try_python "${MAMBA_ROOT_PREFIX}/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
  fi
  if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
    # Derive the conda root from whatever python is on PATH and look for the
    # named env beneath it (<root>/envs/<env>/bin/python).
    base_python="$(command -v python 2>/dev/null || true)"
    if [[ -n "${base_python}" ]]; then
      base_root="$(dirname "$(dirname "${base_python}")")"
      try_python "${base_root}/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    fi
  fi
fi
# PATH lookup (python3 preferred over python)
if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  try_python "$(command -v python3 2>/dev/null || true)" || true
fi
if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  try_python "$(command -v python 2>/dev/null || true)" || true
fi
# Lowest priority: PYTHON env var if set
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${PYTHON:-}" ]]; then
  echo "[WARN] Falling back to PYTHON=${PYTHON}"
  try_python "${PYTHON}" || true
fi
# Hard failure: no torch-capable interpreter anywhere.
if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  echo "[ERROR] Could not find a python interpreter with torch installed." >&2
  echo "[ERROR] Activate the Habana environment (module load mamba/latest; source activate gaudi-pytorch-diffusion-1.22.0.740)." >&2
  exit 1
fi
# Ignore user site-packages to avoid version conflicts with Habana PyTorch
export PYTHONNOUSERSITE=1

# Debug: report which interpreter won the resolution above. Fall back to the
# raw PYTHON_CMD entry when `command -v` cannot resolve it (explicit paths).
resolved_path="$(command -v "${PYTHON_CMD[0]}" 2>/dev/null || true)"
python_path="${resolved_path:-${PYTHON_CMD[0]}}"
echo "[DEBUG] Using Python: ${python_path}"
python_version="$("${PYTHON_CMD[@]}" --version)"
echo "[DEBUG] Python version: ${python_version}"
echo "[DEBUG] PYTHONNOUSERSITE=${PYTHONNOUSERSITE}"
# Run models one at a time to avoid OOM issues
# You can pass --models to override this behavior
# BUGFIX: detect the flag per-argument instead of substring-matching "$*",
# which falsely triggered on any argument merely *containing* "--models"
# (e.g. a path or a --models-something flag).
user_picked_models=0
for arg in "$@"; do
  if [[ "${arg}" == "--models" || "${arg}" == --models=* ]]; then
    user_picked_models=1
    break
  fi
done

if [[ "${user_picked_models}" -eq 1 ]]; then
  # User specified models, run as-is
  "${PYTHON_CMD[@]}" "${ROOT_DIR}/task1/train_mcs_models.py" \
    --cities city_10_austin \
    --comm-types LTE \
    "$@"
else
  # Run each model separately
  for model in lwm resnet18 efficientnet_b0 mobilenet_v3_small simple_cnn; do
    echo ""
    echo "=========================================="
    echo "Training model: ${model}"
    echo "=========================================="
    "${PYTHON_CMD[@]}" "${ROOT_DIR}/task1/train_mcs_models.py" \
      --cities city_10_austin \
      --comm-types LTE \
      --models "${model}" \
      "$@"
  done
fi