#!/usr/bin/env bash
# ============================================================================
# deploy_worker.sh — Deploy a worker agent on a Vast.ai GPU instance
# ============================================================================
# This script is run via SSH on a freshly provisioned Vast.ai instance.
# It installs dependencies, downloads the training pipeline and ASCAD dataset,
# and starts the worker agent pointing to the queue server.
#
# Usage:
#   ssh -p PORT root@HOST 'bash -s' < scripts/deploy_worker.sh \
#       SERVER_URL WORKER_ID [hf_token] [wandb_token] [auth_user] [auth_pass]
#
# Or copy and run directly on the instance:
#   bash deploy_worker.sh http://queue-server:8080 worker-001 hf_xxx wandb_xxx admin s3cret
#
# Arguments:
#   $1 — Queue server URL (e.g., http://1.2.3.4:8080)
#   $2 — Unique worker ID (e.g., worker-38090)
#   $3 — HuggingFace token (optional, for model uploads)
#   $4 — Weights & Biases token (optional, for experiment tracking)
#   $5 — Auth username for queue server (optional, or set TQ_AUTH_USER)
#   $6 — Auth password for queue server (optional, or set TQ_AUTH_PASS)
#
# All work happens inside functions invoked from main() on the last line.
# When the script is streamed over stdin ('bash -s'), this makes bash parse
# the whole file before any child process gets a chance to read stdin.
# ============================================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
readonly USAGE="Usage: deploy_worker.sh SERVER_URL WORKER_ID [hf_token] [wandb_token] [auth_user] [auth_pass]"
readonly PIPELINE_DIR="/root/ascad-training-pipeline"
readonly DATA_DIR="/root/ascad_data"
readonly HF_REPO="lemousehunter/ascad-training-pipeline"
readonly ASCAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
readonly LOG_FILE="/root/worker.log"

# Print an error message to stderr and abort the deployment.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the deployment banner (server, worker ID, auth status).
print_banner() {
  # Exactly one of the two states is shown; auth is keyed on the username.
  local auth_status="disabled"
  if [[ -n "${AUTH_USER}" ]]; then
    auth_status="enabled (user=${AUTH_USER})"
  fi

  echo "============================================"
  echo " ASCAD Worker Deployment"
  echo " Server: ${SERVER_URL}"
  echo " Worker ID: ${WORKER_ID}"
  echo " Auth: ${auth_status}"
  echo "============================================"
}

# ── Step 1: System packages ──────────────────────────────────────────────────
# Install git, wget, unzip and screen via apt-get.
install_system_packages() {
  echo "[1/6] Installing system packages..."
  # Two separate commands: in 'a && b' a failure of 'a' does not trip set -e.
  apt-get update -qq
  # Only stdout is silenced so that installation errors remain visible.
  apt-get install -y -qq git wget unzip screen > /dev/null
  echo " Done."
}

# ── Step 2: Configure credentials ───────────────────────────────────────────
# Log in to HuggingFace and/or Weights & Biases when a token was supplied.
configure_credentials() {
  echo "[2/6] Configuring credentials..."

  if [[ -n "${HF_TOKEN}" ]]; then
    pip install -q huggingface_hub
    # The token travels through the environment rather than being spliced
    # into Python source, so quotes/backslashes in it cannot break the code.
    HF_TOKEN="${HF_TOKEN}" python3 -c \
      'import os; from huggingface_hub import login; login(token=os.environ["HF_TOKEN"])'
    echo " HuggingFace: logged in"
  fi

  if [[ -n "${WANDB_TOKEN}" ]]; then
    pip install -q wandb
    # stderr is silenced as before; report a failure explicitly instead of
    # letting set -e abort without any message.
    wandb login "${WANDB_TOKEN}" 2>/dev/null || die "W&B login failed"
    echo " W&B: logged in"
  fi
}

# ── Step 3: Download training pipeline ──────────────────────────────────────
# Fetch the pipeline into PIPELINE_DIR: 'git pull' for an existing git
# checkout, otherwise a HuggingFace snapshot download.
download_pipeline() {
  echo "[3/6] Downloading training pipeline from HuggingFace..."

  # A directory produced by snapshot_download is not a git checkout, so only
  # pull when a .git directory is present; otherwise (re-)run the download.
  if [[ -d "${PIPELINE_DIR}/.git" ]]; then
    echo " Pipeline directory exists, pulling latest..."
    git -C "${PIPELINE_DIR}" pull
  else
    pip install -q huggingface_hub
    # Repo id and target directory are passed as argv, not interpolated.
    python3 -c '
import sys
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=sys.argv[1],
    repo_type="model",
    local_dir=sys.argv[2],
)
print(" Downloaded.")
' "${HF_REPO}" "${PIPELINE_DIR}"
  fi
}

# ── Step 4: Install Python dependencies ─────────────────────────────────────
# Install the pipeline's requirements.txt from inside PIPELINE_DIR.
install_python_deps() {
  echo "[4/6] Installing Python dependencies..."
  cd "${PIPELINE_DIR}" || die "cannot cd to ${PIPELINE_DIR}"
  pip install -q -r requirements.txt
  echo " Done."
}

# ── Step 5: Download ASCAD dataset ──────────────────────────────────────────
# Download and extract ASCAD_data.zip into DATA_DIR unless ASCAD.h5 exists.
download_dataset() {
  local archive="${DATA_DIR}/ASCAD_data.zip"

  echo "[5/6] Downloading ASCAD dataset..."
  mkdir -p "${DATA_DIR}"

  if [[ -f "${DATA_DIR}/ASCAD_data/ASCAD_databases/ASCAD.h5" ]]; then
    echo " ASCAD dataset already exists, skipping download."
  else
    echo " Downloading ASCAD_data.zip (~1.5 GB)..."
    wget -q --show-progress -O "${archive}" "${ASCAD_URL}"
    echo " Extracting..."
    # -o overwrites leftovers from an earlier interrupted extraction instead
    # of prompting, which would block a non-interactive deployment.
    unzip -q -o "${archive}" -d "${DATA_DIR}"
    rm -f -- "${archive}"
    echo " Done."
  fi

  # Also check for raw traces (needed for per-byte window extraction).
  # NOTE(review): this looks in DATA_DIR itself, not under ASCAD_data/ —
  # confirm where the raw traces file is expected to live.
  if [[ ! -f "${DATA_DIR}/ATMega8515_raw_traces.h5" ]]; then
    echo " Note: ATMega8515_raw_traces.h5 not found."
    echo " The MTAN model uses the global window from raw traces."
    echo " Download separately if needed for Experiment 2."
  fi
}

# ── Step 6: Start worker agent ──────────────────────────────────────────────
# Launch orchestrator.worker.agent in a detached screen session named
# "worker", with its output appended to LOG_FILE.
start_worker() {
  echo "[6/6] Starting worker agent..."
  echo " Log file: ${LOG_FILE}"

  # Export Vast.ai instance ID if available
  export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

  # Build the agent command as an array so each value stays a single word.
  local -a agent_cmd=(
    python3 -m orchestrator.worker.agent
    --server-url "${SERVER_URL}"
    --worker-id "${WORKER_ID}"
    --data-dir "${DATA_DIR}"
    --pipeline-dir "${PIPELINE_DIR}"
  )
  # Auth flags are only passed when both user and password are present.
  # NOTE(review): --auth-pass exposes the password in the process list; the
  # TQ_AUTH_* variables are exported below as well — confirm whether the
  # agent reads them, in which case the flags could be dropped.
  if [[ -n "${AUTH_USER}" && -n "${AUTH_PASS}" ]]; then
    agent_cmd+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
  fi
  agent_cmd+=(--forward-logs "${LOG_FILE}")

  # screen needs one command string for 'bash -c'; %q shell-quotes every
  # value so quotes or spaces in URLs/credentials cannot alter the command.
  local agent_cmd_quoted inner_script
  printf -v agent_cmd_quoted '%q ' "${agent_cmd[@]}"
  printf -v inner_script \
    'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q; cd %q && %s 2>&1 | tee -a %q' \
    "${AUTH_USER}" "${AUTH_PASS}" "${PIPELINE_DIR}" \
    "${agent_cmd_quoted}" "${LOG_FILE}"

  # Start the worker in a screen session for persistence
  screen -dmS worker bash -c "${inner_script}"

  echo ""
  echo "============================================"
  echo " Worker deployed successfully!"
  echo " Screen session: worker"
  echo " Attach: screen -r worker"
  echo " Log: tail -f ${LOG_FILE}"
  echo "============================================"
}

# Parse positional arguments into globals and run the six deployment steps.
main() {
  # ── Arguments ──────────────────────────────────────────────────────────────
  SERVER_URL="${1:?${USAGE}}"
  WORKER_ID="${2:?${USAGE}}"
  HF_TOKEN="${3:-}"
  WANDB_TOKEN="${4:-}"
  AUTH_USER="${5:-${TQ_AUTH_USER:-}}"
  AUTH_PASS="${6:-${TQ_AUTH_PASS:-}}"

  print_banner
  install_system_packages
  configure_credentials
  download_pipeline
  install_python_deps
  download_dataset
  start_worker
}

main "$@"