#!/usr/bin/env bash
#
# Deploys an ASCAD training worker on this machine. The script installs
# system packages, configures optional HuggingFace / W&B credentials, fetches
# the training pipeline and the ASCAD dataset, and finally starts the worker
# agent inside a detached screen session.
#
# Usage:
#   deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]
#
# Environment:
#   TQ_AUTH_USER, TQ_AUTH_PASS  Used when auth_user / auth_pass are not given
#                               as positional arguments.
#   VAST_INSTANCE_ID            Exported for the worker; "unknown" if unset.

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ── Arguments ────────────────────────────────────────────────────────────────
# The first two arguments are mandatory; the shell prints the usage text and
# exits if either is missing or empty.
usage_text="Usage: deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]"
SERVER_URL="${1:?${usage_text}}"
WORKER_ID="${2:?${usage_text}}"
unset usage_text

# Optional tokens: an empty value disables the corresponding login step.
HF_TOKEN="${3:-}"
WANDB_TOKEN="${4:-}"

# Optional credentials: positional argument first, then the TQ_AUTH_*
# environment variable, otherwise empty.
AUTH_USER="${5:-${TQ_AUTH_USER:-}}"
AUTH_PASS="${6:-${TQ_AUTH_PASS:-}}"

# ── Fixed locations ──────────────────────────────────────────────────────────
HF_REPO="lemousehunter/ascad-training-pipeline"
ASCAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
PIPELINE_DIR="/root/ascad-training-pipeline"
DATA_DIR="/root/ascad_data"
LOG_FILE="/root/worker.log"

#######################################
# Prints the deployment banner.
# Globals:   SERVER_URL, WORKER_ID, AUTH_USER (read)
# Arguments: none
# Outputs:   six banner lines on stdout
#######################################
print_deploy_banner() {
  # Pick exactly one auth description. The previous
  # "${AUTH_USER:+...}${AUTH_USER:-disabled}" expansion appended the user name
  # a second time whenever AUTH_USER was set.
  local auth_status="disabled"
  if [[ -n "${AUTH_USER}" ]]; then
    auth_status="enabled (user=${AUTH_USER})"
  fi

  echo "============================================"
  echo " ASCAD Worker Deployment"
  echo " Server: ${SERVER_URL}"
  echo " Worker ID: ${WORKER_ID}"
  echo " Auth: ${auth_status}"
  echo "============================================"
}

print_deploy_banner

# ── Step 1: system packages ──────────────────────────────────────────────────
echo "[1/6] Installing system packages..."

# The index update and the install are separate statements on purpose: in
# `a && b` a failure of `a` does not trigger `set -e`, so a failed update used
# to fall through to the " Done." message.
apt-get update -qq

# Stay quiet on success, but show apt's output if the install fails instead of
# discarding it.
if ! apt_output="$(apt-get install -y -qq git wget unzip screen 2>&1)"; then
  printf '%s\n' "${apt_output}" >&2
  echo " Failed to install system packages." >&2
  exit 1
fi
echo " Done."

# ── Step 2: credentials ──────────────────────────────────────────────────────
# Both logins are optional and are skipped when the matching token is empty.
echo "[2/6] Configuring credentials..."

if [[ -n "${HF_TOKEN}" ]]; then
  pip install -q huggingface_hub
  # Pass the token through the environment rather than pasting it into the
  # Python source: a token containing a quote can no longer break or alter the
  # program, and the token stays out of python3's argument list.
  DEPLOY_HF_TOKEN="${HF_TOKEN}" python3 -c '
import os
from huggingface_hub import login
login(token=os.environ["DEPLOY_HF_TOKEN"])
'
  echo " HuggingFace: logged in"
fi

if [[ -n "${WANDB_TOKEN}" ]]; then
  pip install -q wandb
  # stderr stays suppressed as before, so report a failed login explicitly
  # instead of letting `set -e` end the script without any message.
  if ! wandb login "${WANDB_TOKEN}" 2>/dev/null; then
    echo " W&B: login failed" >&2
    exit 1
  fi
  echo " W&B: logged in"
fi

# ── Step 3: training pipeline ────────────────────────────────────────────────
echo "[3/6] Downloading training pipeline from HuggingFace..."

if [[ -d "${PIPELINE_DIR}/.git" ]]; then
  # Only an actual git checkout can be updated with `git pull`.
  echo " Pipeline directory exists, pulling latest..."
  git -C "${PIPELINE_DIR}" pull
else
  # A directory created by snapshot_download below is not a git checkout, so
  # re-running the script used to abort in `git pull`. Run the snapshot
  # download again instead, which also covers the first-time case.
  if [[ -d "${PIPELINE_DIR}" ]]; then
    echo " Pipeline directory exists, refreshing snapshot..."
  fi
  pip install -q huggingface_hub
  # Repo id and target directory are passed through the environment so they
  # are never interpreted as Python source.
  DEPLOY_HF_REPO="${HF_REPO}" DEPLOY_PIPELINE_DIR="${PIPELINE_DIR}" python3 -c '
import os
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=os.environ["DEPLOY_HF_REPO"],
    repo_type="model",
    local_dir=os.environ["DEPLOY_PIPELINE_DIR"],
)
print(" Downloaded.")
'
fi

# ── Step 4: Python dependencies ──────────────────────────────────────────────
# requirements.txt is resolved relative to the pipeline directory, so switch
# into it first; the script stays in that directory afterwards.
printf '%s\n' "[4/6] Installing Python dependencies..."
cd "${PIPELINE_DIR}"
pip install --quiet --requirement requirements.txt
printf '%s\n' " Done."

# ── Step 5: ASCAD dataset ────────────────────────────────────────────────────
echo "[5/6] Downloading ASCAD dataset..."
mkdir -p "${DATA_DIR}"

ascad_h5="${DATA_DIR}/ASCAD_data/ASCAD_databases/ASCAD.h5"
ascad_zip="${DATA_DIR}/ASCAD_data.zip"

if [[ -f "${ascad_h5}" ]]; then
  echo " ASCAD dataset already exists, skipping download."
else
  echo " Downloading ASCAD_data.zip (~1.5 GB)..."
  wget -q --show-progress -O "${ascad_zip}" "${ASCAD_URL}"
  echo " Extracting..."
  # Extract by absolute path instead of `cd ... && unzip`: a failed `cd` in an
  # && list is not caught by `set -e`, and the following `rm` would then run
  # in the wrong directory. -o overwrites leftovers of an earlier, interrupted
  # extraction instead of prompting.
  unzip -q -o "${ascad_zip}" -d "${DATA_DIR}"
  rm -f -- "${ascad_zip}"
  if [[ ! -f "${ascad_h5}" ]]; then
    echo " Warning: ${ascad_h5} not found after extraction." >&2
  fi
  echo " Done."
fi

# Informational only: the raw traces are optional and never fetched here.
# NOTE(review): this looks directly under DATA_DIR, while ASCAD.h5 is expected
# under ASCAD_data/ASCAD_databases/ — confirm where the raw traces file lives.
if [[ ! -f "${DATA_DIR}/ATMega8515_raw_traces.h5" ]]; then
  echo " Note: ATMega8515_raw_traces.h5 not found."
  echo " The MTAN model uses the global window from raw traces."
  echo " Download separately if needed for Experiment 2."
fi

# ── Step 6: start the worker agent ───────────────────────────────────────────
echo "[6/6] Starting worker agent..."
echo " Log file: ${LOG_FILE}"

# Exported so the screen session started below inherits it.
export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

#######################################
# Builds the shell command line that runs the worker agent.
# Every value is quoted with printf %q, so spaces, quotes or shell
# metacharacters in URLs, ids, paths or credentials reach the agent verbatim
# instead of breaking (or injecting into) the `bash -c` string.
# Globals:   SERVER_URL, WORKER_ID, DATA_DIR, PIPELINE_DIR, LOG_FILE,
#            AUTH_USER, AUTH_PASS (all read)
# Arguments: none
# Outputs:   the command text for `bash -c` on stdout
#######################################
build_worker_command() {
  local -a agent_cmd=(
    python3 -m orchestrator.worker.agent
    --server-url "${SERVER_URL}"
    --worker-id "${WORKER_ID}"
    --data-dir "${DATA_DIR}"
    --pipeline-dir "${PIPELINE_DIR}"
  )
  # The auth flags are passed only when both user and password are present.
  # NOTE(review): --auth-pass is visible in the process list; if the agent
  # also honours TQ_AUTH_USER / TQ_AUTH_PASS the flags could be dropped —
  # confirm against the agent.
  if [[ -n "${AUTH_USER}" && -n "${AUTH_PASS}" ]]; then
    agent_cmd+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
  fi
  agent_cmd+=(--forward-logs "${LOG_FILE}")

  local quoted_cmd
  printf -v quoted_cmd '%q ' "${agent_cmd[@]}"

  # TQ_AUTH_* are exported inside the session (possibly empty), as before.
  printf 'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q\n' "${AUTH_USER}" "${AUTH_PASS}"
  # Agent stdout and stderr are shown in the session and appended to the log.
  printf 'cd %q && %s2>&1 | tee -a %q\n' \
    "${PIPELINE_DIR}" "${quoted_cmd}" "${LOG_FILE}"
}

# Run the agent in a detached screen session named "worker".
screen -dmS worker bash -c "$(build_worker_command)"

# Closing summary: how to reach the screen session and where the log lives.
cat <<EOF

============================================
 Worker deployed successfully!
 Screen session: worker
 Attach: screen -r worker
 Log: tail -f ${LOG_FILE}
============================================
EOF