ascad-training-pipeline / scripts /deploy_worker.sh
lemousehunter
feat: LMIC-TSBN model + persistence fixes across restarts
cbb6546
#!/usr/bin/env bash
# ============================================================================
# deploy_worker.sh — Deploy a worker agent on a Vast.ai GPU instance
# ============================================================================
# This script is run via SSH on a freshly provisioned Vast.ai instance.
# It installs dependencies, downloads the training pipeline and ASCAD dataset,
# and starts the worker agent pointing to the queue server.
#
# Usage:
# ssh -p <port> root@<host> 'bash -s' < scripts/deploy_worker.sh \
# <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]
#
# Or copy and run directly on the instance:
# bash deploy_worker.sh http://queue-server:8080 worker-001 hf_xxx wandb_xxx admin s3cret
#
# Arguments:
# $1 — Queue server URL (e.g., http://1.2.3.4:8080)
# $2 — Unique worker ID (e.g., worker-38090)
# $3 — HuggingFace token (optional, for model uploads)
# $4 — Weights & Biases token (optional, for experiment tracking)
# $5 — Auth username for queue server (optional, or set TQ_AUTH_USER)
# $6 — Auth password for queue server (optional, or set TQ_AUTH_PASS)
# ============================================================================
# Strict mode: stop on failing commands, unset variables and broken pipelines.
set -euo pipefail
# ── Arguments ────────────────────────────────────────────────────────────────
usage_line='Usage: deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]'
# The two mandatory arguments abort the script with the usage line when they
# are missing or empty.
SERVER_URL="${1:?${usage_line}}"
WORKER_ID="${2:?${usage_line}}"
# Optional tokens default to the empty string.
HF_TOKEN="${3:-}"
WANDB_TOKEN="${4:-}"
# Auth credentials: positional argument first, then the TQ_AUTH_* environment
# variables, then empty.
AUTH_USER="${5:-}"
AUTH_USER="${AUTH_USER:-${TQ_AUTH_USER:-}}"
AUTH_PASS="${6:-}"
AUTH_PASS="${AUTH_PASS:-${TQ_AUTH_PASS:-}}"
# ── Configuration ────────────────────────────────────────────────────────────
# Remote sources.
HF_REPO="lemousehunter/ascad-training-pipeline"
ASCAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
# Locations on the instance.
PIPELINE_DIR="/root/ascad-training-pipeline"
DATA_DIR="/root/ascad_data"
LOG_FILE="/root/worker.log"
#######################################
# Render the auth status shown in the deployment banner.
# Arguments: $1 - auth username (may be empty or omitted)
# Outputs:   "enabled (user=<name>)" when a username is given, else "disabled"
#######################################
describe_auth() {
  if [[ -n "${1:-}" ]]; then
    printf 'enabled (user=%s)' "$1"
  else
    printf 'disabled'
  fi
}

echo "============================================"
echo " ASCAD Worker Deployment"
echo " Server: ${SERVER_URL}"
echo " Worker ID: ${WORKER_ID}"
# An explicit branch is required here: chaining ${VAR:+a}${VAR:-b} would append
# the username itself after the "enabled" text whenever VAR is set.
echo " Auth: $(describe_auth "${AUTH_USER}")"
echo "============================================"
# ── Step 1: System packages ──────────────────────────────────────────────────
echo "[1/6] Installing system packages..."
# The two apt-get calls are separate statements on purpose: inside an
# `a && b` list a failure of `a` does not trigger `set -e`, so a failed update
# would go unnoticed.
# stdin is detached and the frontend forced non-interactive because this script
# is usually fed to `bash -s` on stdin; anything that reads stdin would swallow
# the remaining script text.
# stderr is left visible so a failed install can be diagnosed.
DEBIAN_FRONTEND=noninteractive apt-get update -qq < /dev/null
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git wget unzip screen \
  < /dev/null > /dev/null
echo " Done."
# ── Step 2: Configure credentials ───────────────────────────────────────────
# Both logins are optional and only run when the matching token is non-empty.
echo "[2/6] Configuring credentials..."
if [[ -n "${HF_TOKEN}" ]]; then
  pip install -q huggingface_hub
  # Hand the token to Python through the environment instead of splicing it
  # into the source text: a quote or backslash in the value can then neither
  # break nor inject into the snippet, and the token stays out of the argv.
  # A dedicated variable name is used so the library's own environment lookup
  # is not affected.
  DEPLOY_HF_TOKEN="${HF_TOKEN}" python3 -c '
import os
from huggingface_hub import login
login(token=os.environ["DEPLOY_HF_TOKEN"])
'
  echo " HuggingFace: logged in"
fi
if [[ -n "${WANDB_TOKEN}" ]]; then
  pip install -q wandb
  # stderr is discarded here; a non-zero exit still aborts the script via
  # `set -e`.
  wandb login "${WANDB_TOKEN}" 2>/dev/null
  echo " W&B: logged in"
fi
# ── Step 3: Clone training pipeline ─────────────────────────────────────────
echo "[3/6] Downloading training pipeline from HuggingFace..."
if [[ -d "${PIPELINE_DIR}/.git" ]]; then
  # Only a real git checkout can be pulled.
  echo " Pipeline directory exists, pulling latest..."
  git -C "${PIPELINE_DIR}" pull
else
  # Covers both a first deployment and a re-run on a directory that an earlier
  # snapshot download created: such a directory is not a git repository, so
  # `git pull` there would fail and abort the whole deployment.
  if [[ -d "${PIPELINE_DIR}" ]]; then
    echo " Pipeline directory exists, refreshing snapshot..."
  fi
  pip install -q huggingface_hub
  # Repo id and target directory are passed through the environment so the
  # Python source contains no shell-interpolated text.
  DEPLOY_HF_REPO="${HF_REPO}" DEPLOY_PIPELINE_DIR="${PIPELINE_DIR}" python3 -c '
import os
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=os.environ["DEPLOY_HF_REPO"],
    repo_type="model",
    local_dir=os.environ["DEPLOY_PIPELINE_DIR"],
)
print(" Downloaded.")
'
fi
# ── Step 4: Install Python dependencies ─────────────────────────────────────
printf '%s\n' "[4/6] Installing Python dependencies..."
# Switch into the pipeline directory; requirements.txt is read from there.
cd "${PIPELINE_DIR}"
pip install --quiet --requirement requirements.txt
printf '%s\n' " Done."
# ── Step 5: Download ASCAD dataset ──────────────────────────────────────────
echo "[5/6] Downloading ASCAD dataset..."
mkdir -p "${DATA_DIR}"
if [[ -f "${DATA_DIR}/ASCAD_data/ASCAD_databases/ASCAD.h5" ]]; then
  echo " ASCAD dataset already exists, skipping download."
else
  echo " Downloading ASCAD_data.zip (~1.5 GB)..."
  wget -q --show-progress -O "${DATA_DIR}/ASCAD_data.zip" "${ASCAD_URL}"
  echo " Extracting..."
  # `cd` is its own statement so that `set -e` aborts if it fails; in a
  # `cd ... && unzip` list a failed cd is ignored and the cleanup below would
  # run in the wrong directory.
  cd "${DATA_DIR}"
  # -o overwrites leftovers of an interrupted extraction instead of prompting.
  # stdin is detached because this script is usually fed to `bash -s` on stdin,
  # and a prompt would consume the remaining script text.
  unzip -q -o ASCAD_data.zip < /dev/null
  rm -f ASCAD_data.zip
  echo " Done."
fi
# Also check for raw traces (needed for per-byte window extraction)
# NOTE(review): this looks directly in DATA_DIR, whereas ASCAD.h5 above is
# expected under ASCAD_data/ASCAD_databases/ — confirm where the raw traces
# file is supposed to live.
if [[ ! -f "${DATA_DIR}/ATMega8515_raw_traces.h5" ]]; then
  echo " Note: ATMega8515_raw_traces.h5 not found."
  echo " The MTAN model uses the global window from raw traces."
  echo " Download separately if needed for Experiment 2."
fi
# ── Step 6: Start worker agent ──────────────────────────────────────────────
echo "[6/6] Starting worker agent..."
echo " Log file: ${LOG_FILE}"
# Export Vast.ai instance ID if available
export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

#######################################
# Emit the shell script that the screen session runs.
# The agent command line is assembled as an array and serialised with
# printf %q, so values containing quotes, spaces or `$` reach the agent
# unchanged instead of breaking (or injecting into) the bash -c string.
# Globals:   SERVER_URL, WORKER_ID, DATA_DIR, PIPELINE_DIR, LOG_FILE,
#            AUTH_USER, AUTH_PASS (all read)
# Arguments: none
# Outputs:   two-line script on stdout: the TQ_AUTH_* export, then
#            `cd <pipeline> && <agent command> 2>&1 | tee -a <log>`
#######################################
build_worker_script() {
  local -a argv=(
    python3 -m orchestrator.worker.agent
    --server-url "${SERVER_URL}"
    --worker-id "${WORKER_ID}"
    --data-dir "${DATA_DIR}"
    --pipeline-dir "${PIPELINE_DIR}"
  )
  # Auth flags are passed only when both halves of the credential are present.
  if [[ -n "${AUTH_USER}" && -n "${AUTH_PASS}" ]]; then
    argv+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
  fi
  argv+=(--forward-logs "${LOG_FILE}")

  local quoted_cmd
  printf -v quoted_cmd '%q ' "${argv[@]}"
  printf 'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q\n' "${AUTH_USER}" "${AUTH_PASS}"
  # quoted_cmd already ends with a space, hence "%s2>&1".
  printf 'cd %q && %s2>&1 | tee -a %q\n' \
    "${PIPELINE_DIR}" "${quoted_cmd}" "${LOG_FILE}"
}

# Start the worker in a screen session for persistence
screen -dmS worker bash -c "$(build_worker_script)"
# Closing summary: how to reach the screen session and where the log lives.
cat <<EOF

============================================
 Worker deployed successfully!
 Screen session: worker
 Attach: screen -r worker
 Log: tail -f ${LOG_FILE}
============================================
EOF