wi-lab committed on
Commit
6a0e1d3
·
verified ·
1 Parent(s): a48635b

Upload task1/run_task1_all_models.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. task1/run_task1_all_models.sh +223 -0
task1/run_task1_all_models.sh ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Bootstrap the Habana Gaudi conda environment BEFORE enabling strict mode,
# so a host without the `module` system (or without conda) does not abort or
# spew errors here. ensure_gaudi_env below retries activation with fallbacks.
if command -v module >/dev/null 2>&1; then
  module load mamba/latest
fi
# Legacy-style conda activation; failure is tolerated deliberately because
# ensure_gaudi_env re-attempts activation with more fallbacks.
source activate gaudi-pytorch-diffusion-1.22.0.740 2>/dev/null || true

set -euo pipefail

# Resolve repository root (script lives in task1/)
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
10
# Run Task 1 benchmark covering all configured backbones.
# Default device is 'auto' which will detect HPU, then CUDA, then CPU
# Override with --device hpu/cuda/cpu if needed
# All paths now default to project root, so we don't need to specify them

# Shared constants for Habana Gaudi support.
TARGET_GAUDI_ENV="gaudi-pytorch-diffusion-1.22.0.740"

# Try multiple possible locations for the Gaudi environment.
# NOTE: CONDA_PREFIX (and, in exotic batch contexts, HOME) may be unset; with
# `set -u` active a bare ${CONDA_PREFIX} here would abort the whole script,
# so default them to empty with ${VAR:-}.
POSSIBLE_GAUDI_PATHS=(
  "${HOME:-}/mamba/envs/${TARGET_GAUDI_ENV}"
  "${HOME:-}/.conda/envs/${TARGET_GAUDI_ENV}"
  "/packages/envs/${TARGET_GAUDI_ENV}"
  "${CONDA_PREFIX:-}/../${TARGET_GAUDI_ENV}"
)

# First existing candidate wins; stays empty when no env dir is present.
TARGET_GAUDI_PREFIX=""
for path in "${POSSIBLE_GAUDI_PATHS[@]}"; do
  if [[ -d "$path" ]]; then
    TARGET_GAUDI_PREFIX="$path"
    break
  fi
done

# NOTE(review): conda-created envs usually lack bin/activate (that is a
# virtualenv convention); existence is checked before sourcing below.
TARGET_GAUDI_ACTIVATE="${TARGET_GAUDI_PREFIX}/bin/activate"
35
+
36
# Ensure Habana Gaudi environment when available so subshell picks up the right
# python/pip executables. Controlled by LWM_AUTO_HABANA (defaults to enabled).
ensure_gaudi_env() {
  if [[ "${LWM_AUTO_HABANA:-1}" != "1" ]]; then
    echo "[DEBUG] Auto Gaudi activation disabled (LWM_AUTO_HABANA=0)"
    return
  fi

  if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]]; then
    echo "[DEBUG] Already in Gaudi environment: ${CONDA_DEFAULT_ENV}"
    return
  fi

  # A missing bin/activate must NOT abort: conda-created envs generally do not
  # ship one (that is a virtualenv convention), so fall through and let the
  # `source activate <name>` path below have a chance instead of returning
  # early and never trying conda at all.
  if [[ ! -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
    echo "[DEBUG] Gaudi environment not found at ${TARGET_GAUDI_ACTIVATE}"
  fi

  echo "[DEBUG] Attempting to activate Gaudi environment..."

  if command -v module >/dev/null 2>&1; then
    echo "[DEBUG] Loading mamba module..."
    # Drop blank lines from module chatter; tolerate a failing module load.
    module load mamba/latest 2>&1 | grep -v "^$" || true
  else
    echo "[DEBUG] module command not available, skipping module load"
  fi

  local activated="0"

  # Preferred path: source the env's own activate script when it exists.
  if [[ -f "${TARGET_GAUDI_ACTIVATE}" ]]; then
    echo "[DEBUG] Trying direct activation: source ${TARGET_GAUDI_ACTIVATE}"
    # shellcheck disable=SC1091
    if source "${TARGET_GAUDI_ACTIVATE}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via direct path"
    fi
  fi

  # Fallback: old-style `source activate <name>` resolved by conda itself.
  if [[ "${activated}" != "1" ]]; then
    echo "[DEBUG] Trying conda activate: source activate ${TARGET_GAUDI_ENV}"
    # shellcheck disable=SC1091
    if source activate "${TARGET_GAUDI_ENV}" 2>&1; then
      activated="1"
      echo "[DEBUG] Successfully activated via conda"
    else
      echo "[DEBUG] Failed to activate Gaudi environment"
    fi
  fi

  if [[ "${activated}" == "1" ]]; then
    echo "[DEBUG] Gaudi environment activated successfully"
  fi
}

ensure_gaudi_env
91
+
92
# Resolve python interpreter. Prefer the activated environment even if a module
# pre-set PYTHON points elsewhere.
#
# try_python CANDIDATE
#   CANDIDATE is either a bare command name (looked up on PATH) or a path
#   containing a slash (must be an executable file). A candidate qualifies
#   only when it can `import torch`; on success it is stored as the single
#   element of the global PYTHON_CMD array and the function returns 0.
try_python() {
  local candidate="$1"
  [[ -n "${candidate}" ]] || return 1

  case "${candidate}" in
    */*)
      # Explicit path: require an executable file.
      [[ -x "${candidate}" ]] || return 1
      ;;
    *)
      # Bare name: require a PATH hit.
      command -v "${candidate}" >/dev/null 2>&1 || return 1
      ;;
  esac

  # Accept only interpreters that actually have torch installed.
  if "${candidate}" -c "import torch" >/dev/null 2>&1; then
    PYTHON_CMD=("${candidate}")
    return 0
  fi

  return 1
}
117
+
118
PYTHON_CMD=()

# After ensure_gaudi_env, if we're in the target environment, use its python
# directly. This has highest priority to ensure we use the correct Gaudi
# Python. The prefix comparison is guarded on a non-empty
# TARGET_GAUDI_PREFIX: when no Gaudi env was found, both sides could be empty
# and match spuriously, logging a misleading "environment active" message.
if [[ "${CONDA_DEFAULT_ENV:-}" == "${TARGET_GAUDI_ENV}" ]] || [[ -n "${TARGET_GAUDI_PREFIX}" && "${CONDA_PREFIX:-}" == "${TARGET_GAUDI_PREFIX}" ]]; then
  echo "[DEBUG] Gaudi environment active: ${CONDA_DEFAULT_ENV:-unknown}"
  if [[ -x "${TARGET_GAUDI_PREFIX}/bin/python" ]]; then
    PYTHON_CMD=("${TARGET_GAUDI_PREFIX}/bin/python")
    echo "[DEBUG] Forcing use of Gaudi Python: ${TARGET_GAUDI_PREFIX}/bin/python"
  fi
fi

# Highest priority: explicit CONDA prefix (current shell)
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${CONDA_PREFIX:-}" ]]; then
  try_python "${CONDA_PREFIX}/bin/python" || true
fi

# Virtualenv support
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${VIRTUAL_ENV:-}" ]]; then
  try_python "${VIRTUAL_ENV}/bin/python" || true
fi

# Try known Habana Gaudi environment path explicitly
if [[ ${#PYTHON_CMD[@]} -eq 0 && -x "${TARGET_GAUDI_PREFIX}/bin/python" ]]; then
  try_python "${TARGET_GAUDI_PREFIX}/bin/python" || true
fi

# Try common conda locations using the active env name
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${CONDA_DEFAULT_ENV:-}" && "${CONDA_DEFAULT_ENV}" != "base" ]]; then
  if [[ -n "${HOME:-}" ]]; then
    try_python "${HOME}/mamba/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
      try_python "${HOME}/.conda/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    fi
  fi

  if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${MAMBA_ROOT_PREFIX:-}" ]]; then
    try_python "${MAMBA_ROOT_PREFIX}/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
  fi

  # Derive the conda root from whichever python is first on PATH and look for
  # the active env under its envs/ directory.
  if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
    base_python="$(command -v python 2>/dev/null || true)"
    if [[ -n "${base_python}" ]]; then
      base_root="$(dirname "$(dirname "${base_python}")")"
      try_python "${base_root}/envs/${CONDA_DEFAULT_ENV}/bin/python" || true
    fi
  fi
fi

# PATH lookup (python3 preferred over python)
if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  try_python "$(command -v python3 2>/dev/null || true)" || true
fi

if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  try_python "$(command -v python 2>/dev/null || true)" || true
fi

# Lowest priority: PYTHON env var if set
if [[ ${#PYTHON_CMD[@]} -eq 0 && -n "${PYTHON:-}" ]]; then
  echo "[WARN] Falling back to PYTHON=${PYTHON}"
  try_python "${PYTHON}" || true
fi

if [[ ${#PYTHON_CMD[@]} -eq 0 ]]; then
  echo "[ERROR] Could not find a python interpreter with torch installed." >&2
  echo "[ERROR] Activate the Habana environment (module load mamba/latest; source activate gaudi-pytorch-diffusion-1.22.0.740)." >&2
  exit 1
fi
187
+
188
# Ignore user site-packages to avoid version conflicts with Habana PyTorch
export PYTHONNOUSERSITE=1

# Debug: report which interpreter won the resolution above. PATH lookup can
# fail for absolute paths, so fall back to the raw command string.
resolved_python="$(command -v "${PYTHON_CMD[0]}" 2>/dev/null || true)"
[[ -n "${resolved_python}" ]] || resolved_python="${PYTHON_CMD[0]}"

echo "[DEBUG] Using Python: ${resolved_python}"
# Capture the version in a separate assignment so a failing interpreter still
# aborts under `set -e` (an inline substitution inside echo would not).
interpreter_version="$("${PYTHON_CMD[@]}" --version)"
echo "[DEBUG] Python version: ${interpreter_version}"
echo "[DEBUG] PYTHONNOUSERSITE=${PYTHONNOUSERSITE}"
201
+
202
# Run models one at a time to avoid OOM issues.
# You can pass --models to override this behavior.
#
# Detect an explicit --models flag by scanning the actual arguments instead of
# substring-matching "$*": the old check also fired when some other argument
# merely contained the text "--models" (e.g. a value or --models-foo).
user_picked_models=0
for arg in "$@"; do
  case "${arg}" in
    --models|--models=*)
      user_picked_models=1
      break
      ;;
  esac
done

if [[ "${user_picked_models}" -eq 1 ]]; then
  # User specified models, run as-is
  "${PYTHON_CMD[@]}" "${ROOT_DIR}/task1/train_mcs_models.py" \
    --cities city_10_austin \
    --comm-types LTE \
    "$@"
else
  # Run each model separately to keep peak memory down
  for model in lwm resnet18 efficientnet_b0 mobilenet_v3_small simple_cnn; do
    echo ""
    echo "=========================================="
    echo "Training model: ${model}"
    echo "=========================================="
    "${PYTHON_CMD[@]}" "${ROOT_DIR}/task1/train_mcs_models.py" \
      --cities city_10_austin \
      --comm-types LTE \
      --models "${model}" \
      "$@"
  done
fi