#!/bin/bash
# ============================================================================
# ASCAD Training Worker - Instance Setup Script
# ============================================================================
# Run this ONCE after a Vast.ai instance boots with the pre-baked Docker
# image (tensorflow/tensorflow:2.16.2-gpu). It handles everything that
# cannot be baked into the image:
#
#   1. Install pip dependencies (binary wheels only — fast)
#   2. Pull latest pipeline code from HuggingFace
#   3. Download & extract the ASCAD dataset (if not already present)
#   4. Verify GPU availability
#   5. Optionally start the worker agent
#
# Usage:
#   # Minimal (just set up the environment):
#   bash setup.sh
#
#   # Full (set up + start worker agent):
#   bash setup.sh --start-worker \
#       --server-url http://ORCH_IP:8080 \
#       --worker-id worker-a100-1 \
#       --auth-user admin \
#       --auth-pass SECRET \
#       --wandb-key YOUR_WANDB_KEY
#
# Environment variables (alternative to flags):
#   TQ_SERVER_URL   - Orchestrator URL
#   TQ_AUTH_USER    - Auth username
#   TQ_AUTH_PASS    - Auth password
#   WORKER_ID       - Unique worker ID
#   WANDB_API_KEY   - W&B API key
#   HF_TOKEN        - HuggingFace token (optional, for private repos)
# ============================================================================

set -euo pipefail

# Print the help text. Kept as a here-doc (instead of slicing "$0" with
# head/tail) so it cannot drift when the header comment changes.
usage() {
  cat <<'EOF'
ASCAD Training Worker - Instance Setup Script

Usage:
  # Minimal (just set up the environment):
  bash setup.sh

  # Full (set up + start worker agent):
  bash setup.sh --start-worker \
      --server-url http://ORCH_IP:8080 \
      --worker-id worker-a100-1 \
      --auth-user admin \
      --auth-pass SECRET \
      --wandb-key YOUR_WANDB_KEY

Options:
  --start-worker        Start the worker agent after setup
  --server-url URL      Orchestrator URL
  --worker-id ID        Unique worker ID
  --auth-user USER      Auth username
  --auth-pass PASS      Auth password
  --wandb-key KEY       W&B API key
  --data-dir DIR        Dataset directory (default: /root/ascad_data)
  --pipeline-dir DIR    Pipeline checkout directory
                        (default: /root/ascad-training-pipeline)
  --skip-data           Do not download the ASCAD dataset
  -h, --help            Show this help and exit

Environment variables (alternative to flags):
  TQ_SERVER_URL   - Orchestrator URL
  TQ_AUTH_USER    - Auth username
  TQ_AUTH_PASS    - Auth password
  WORKER_ID       - Unique worker ID
  WANDB_API_KEY   - W&B API key
  HF_TOKEN        - HuggingFace token (optional, for private repos)
EOF
}

# Abort with a readable message when a value-taking flag is the last argument.
# Without this, "$2" trips `set -u` and the user only sees "unbound variable".
#   $1 - flag name, $2 - number of arguments remaining (flag included)
require_value() {
  if (( $2 < 2 )); then
    echo "Missing value for $1" >&2
    exit 1
  fi
}

# ── Parse arguments ─────────────────────────────────────────────────────────
# Flags override the environment-variable defaults below.
START_WORKER=false
SERVER_URL="${TQ_SERVER_URL:-}"
AUTH_USER="${TQ_AUTH_USER:-admin}"
AUTH_PASS="${TQ_AUTH_PASS:-}"
WORKER_ID_ARG="${WORKER_ID:-worker-$(hostname)}"
WANDB_KEY="${WANDB_API_KEY:-}"
DATA_DIR="/root/ascad_data"
PIPELINE_DIR="/root/ascad-training-pipeline"
SKIP_DATA=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    --start-worker) START_WORKER=true; shift ;;
    --server-url)   require_value "$1" "$#"; SERVER_URL="$2"; shift 2 ;;
    --worker-id)    require_value "$1" "$#"; WORKER_ID_ARG="$2"; shift 2 ;;
    --auth-user)    require_value "$1" "$#"; AUTH_USER="$2"; shift 2 ;;
    --auth-pass)    require_value "$1" "$#"; AUTH_PASS="$2"; shift 2 ;;
    --wandb-key)    require_value "$1" "$#"; WANDB_KEY="$2"; shift 2 ;;
    --data-dir)     require_value "$1" "$#"; DATA_DIR="$2"; shift 2 ;;
    --pipeline-dir) require_value "$1" "$#"; PIPELINE_DIR="$2"; shift 2 ;;
    --skip-data)    SKIP_DATA=true; shift ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

# Fail fast: check the worker's required settings before spending minutes on
# pip installs and a multi-gigabyte download.
if [[ "$START_WORKER" == true ]]; then
  if [[ -z "$SERVER_URL" ]]; then
    echo " ERROR: --server-url is required to start the worker" >&2
    exit 1
  fi
  if [[ -z "$AUTH_PASS" ]]; then
    echo " ERROR: --auth-pass is required to start the worker" >&2
    exit 1
  fi
fi

# ── Logging ─────────────────────────────────────────────────────────────────
# From here on, everything written to stdout/stderr is also copied to LOG_FILE.
LOG_FILE="/root/setup.log"
exec > >(tee "$LOG_FILE") 2>&1

BOLD=$'\033[1m'
GREEN=$'\033[32m'
YELLOW=$'\033[33m'
RESET=$'\033[0m'

# step N "message" - print a numbered, highlighted step header.
step() {
  printf '\n%s[%s/%s]%s %s%s%s\n' \
    "${BOLD}${GREEN}" "$1" "$TOTAL_STEPS" "$RESET" "$BOLD" "$2" "$RESET"
}
# warn "message" - print a highlighted, non-fatal warning.
warn() { printf ' %s⚠ %s%s\n' "$YELLOW" "$1" "$RESET"; }
# ok "message" - print a success line.
ok() { printf ' ✓ %s\n' "$1"; }

# ensure_tool TOOL [PACKAGE] - make sure TOOL is on PATH, installing PACKAGE
# (defaults to TOOL) with apt-get when it is missing. Returns non-zero, with a
# message, if the installation fails.
ensure_tool() {
  local tool=$1
  local pkg=${2:-$1}
  if command -v "$tool" >/dev/null 2>&1; then
    return 0
  fi
  warn "'${tool}' not found - installing '${pkg}' with apt-get"
  # Refresh package lists first; a stale or empty index is a common reason
  # for `apt-get install` to fail. Best-effort: install reports real failures.
  apt-get update -qq >/dev/null 2>&1 || true
  if ! DEBIAN_FRONTEND=noninteractive apt-get install -y -qq "$pkg" >/dev/null 2>&1; then
    echo " ERROR: could not install '${pkg}'" >&2
    return 1
  fi
}

TOTAL_STEPS=4
if [[ "$START_WORKER" == true ]]; then
  TOTAL_STEPS=5
fi

echo "============================================"
echo " ASCAD Training Worker - Setup"
echo " $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
echo "============================================"

# ── Step 1: Install pip dependencies ────────────────────────────────────────
step 1 "Installing pip dependencies (binary wheels)..."
STARTED=$(date +%s)
# pipefail (set above) makes a pip failure abort the script despite `| tail`.
pip3 install --quiet --no-cache-dir --only-binary :all: \
  scipy \
  scikit-learn \
  wandb \
  huggingface_hub \
  websocket-client \
  2>&1 | tail -3
ELAPSED=$(( $(date +%s) - STARTED ))
ok "Done in ${ELAPSED}s"

# ── Step 2: Pull latest code from HuggingFace ──────────────────────────────
step 2 "Pulling pipeline code from HuggingFace..."
STARTED=$(date +%s)
# The target directory is passed as argv instead of being spliced into the
# Python source, so paths containing quotes cannot break or inject code.
# huggingface_hub reads HF_TOKEN from the environment on its own.
# `grep -v` exits 1 when it prints nothing; `|| true` keeps that from
# aborting the script under `set -eo pipefail` after a silent download.
python3 -c '
import sys
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="lemousehunter/ascad-training-pipeline",
    repo_type="model",
    local_dir=sys.argv[1],
)
' "$PIPELINE_DIR" 2>&1 | { grep -v '^$' || true; }
ELAPSED=$(( $(date +%s) - STARTED ))
ok "Code at ${PIPELINE_DIR} (${ELAPSED}s)"

# Clear any stale bytecode. -prune stops find from descending into the
# directories it is about to delete.
if find "$PIPELINE_DIR" -type d -name "__pycache__" -prune -exec rm -rf -- {} +; then
  ok "Cleared __pycache__"
else
  warn "Could not clear every __pycache__ directory"
fi

# ── Step 3: Download ASCAD dataset ──────────────────────────────────────────
DATASET_FILE="${DATA_DIR}/ASCAD_data/ASCAD_databases/ATMega8515_raw_traces.h5"

if [[ "$SKIP_DATA" == true ]]; then
  step 3 "Skipping dataset download (--skip-data)"
elif [[ -f "$DATASET_FILE" ]]; then
  step 3 "Dataset already present, skipping download."
  ok "$DATASET_FILE exists ($(du -sh "$DATASET_FILE" | cut -f1))"
else
  step 3 "Downloading ASCAD dataset (~4.2 GB)..."
  ensure_tool wget
  ensure_tool unzip
  STARTED=$(date +%s)
  mkdir -p "$DATA_DIR"

  DOWNLOAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
  ZIP_PATH="${DATA_DIR}/ASCAD_data.zip"
  wget --progress=bar:force:noscroll -O "$ZIP_PATH" "$DOWNLOAD_URL" 2>&1

  DL_ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Downloaded in ${DL_ELAPSED}s"

  echo " Extracting..."
  # Extract with -d rather than `cd`, so the script's working directory is
  # left untouched for the remaining steps.
  unzip -o "$ZIP_PATH" -d "$DATA_DIR"
  rm -f -- "$ZIP_PATH"

  if [[ -f "$DATASET_FILE" ]]; then
    ok "Dataset ready at ${DATASET_FILE}"
  else
    warn "Archive extracted, but ${DATASET_FILE} was not found"
  fi
  ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Total data step: ${ELAPSED}s"
fi

# ── Step 4: Verify GPU ─────────────────────────────────────────────────────
step 4 "Verifying GPU..."
# Exits non-zero (aborting the script via `set -e`) when TensorFlow sees no GPU.
python3 - <<'PY'
import sys
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f' GPU detected: {len(gpus)} device(s)')
    for g in gpus:
        print(f' {g}')
else:
    print(' WARNING: No GPU detected!')
    sys.exit(1)
PY
ok "GPU verified"

# ── Step 5 (optional): Start worker agent ──────────────────────────────────
if [[ "$START_WORKER" == true ]]; then
  step 5 "Starting worker agent..."

  # Login to W&B if a key was provided. The key is handed over through the
  # environment, never spliced into the Python source. A failed login is
  # reported but does not stop the worker from starting.
  if [[ -n "$WANDB_KEY" ]]; then
    if WANDB_API_KEY="$WANDB_KEY" python3 -c \
      'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' 2>/dev/null; then
      ok "W&B logged in"
    else
      warn "W&B login failed - continuing without it"
    fi
  fi

  ensure_tool screen

  # Build the command line with %q so that every value reaches the agent as
  # exactly one argument, whatever characters it contains.
  # NOTE(review): the password is still visible in the agent's argv because
  # --auth-pass is the only interface used here; confirm whether the agent can
  # read it from the environment instead.
  printf -v WORKER_CMD \
    'cd %q && python3 -m orchestrator.worker.agent --server-url %q --worker-id %q --data-dir %q --pipeline-dir %q --auth-user %q --auth-pass %q --forward-logs /root/worker.log 2>&1 | tee -a /root/worker.log' \
    "$PIPELINE_DIR" "$SERVER_URL" "$WORKER_ID_ARG" "$DATA_DIR" \
    "$PIPELINE_DIR" "$AUTH_USER" "$AUTH_PASS"

  # Start the worker detached, in a screen session named "worker".
  screen -dmS worker bash -c "$WORKER_CMD"

  ok "Worker started in screen session 'worker'"
  echo " Use 'screen -r worker' to attach"
fi

# ── Summary ─────────────────────────────────────────────────────────────────
echo ""
echo "============================================"
echo " Setup complete!"
echo " $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
echo "============================================"
echo ""
echo " Pipeline: ${PIPELINE_DIR}"
echo " Data: ${DATA_DIR}"
echo " Log: ${LOG_FILE}"
if [[ "$START_WORKER" == true ]]; then
  echo " Worker: screen -r worker"
  echo " Worker log: /root/worker.log"
fi
echo ""
echo " Quick start (if worker not started):"
echo " cd ${PIPELINE_DIR}"
echo " python3 -m orchestrator.worker.agent \\"
echo " --server-url http://ORCH_IP:8080 \\"
echo " --worker-id ${WORKER_ID_ARG} \\"
echo " --auth-user admin --auth-pass SECRET"
echo ""