File size: 6,429 Bytes

#!/usr/bin/env bash
# ============================================================================
# deploy_worker.sh — Deploy a worker agent on a Vast.ai GPU instance
# ============================================================================
# This script is run via SSH on a freshly provisioned Vast.ai instance.
# It installs dependencies, downloads the training pipeline and ASCAD dataset,
# and starts the worker agent pointing to the queue server.
#
# Usage:
#   ssh -p <port> root@<host> 'bash -s' < scripts/deploy_worker.sh \
#       <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]
#
# Or copy and run directly on the instance:
#   bash deploy_worker.sh http://queue-server:8080 worker-001 hf_xxx wandb_xxx admin s3cret
#
# Arguments:
#   $1 — Queue server URL (e.g., http://1.2.3.4:8080)
#   $2 — Unique worker ID (e.g., worker-38090)
#   $3 — HuggingFace token (optional, for model uploads)
#   $4 — Weights & Biases token (optional, for experiment tracking)
#   $5 — Auth username for queue server (optional, or set TQ_AUTH_USER)
#   $6 — Auth password for queue server (optional, or set TQ_AUTH_PASS)
# ============================================================================

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ── Arguments ────────────────────────────────────────────────────────────────
# The first two positional parameters are mandatory: expansion aborts with the
# usage text when either is missing or empty. The tokens default to empty, and
# the queue-server credentials fall back to TQ_AUTH_USER / TQ_AUTH_PASS when
# the corresponding positional parameter is missing or empty.
usage_text='Usage: deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]'
SERVER_URL="${1:?${usage_text}}"
WORKER_ID="${2:?${usage_text}}"
HF_TOKEN="${3:-}"
WANDB_TOKEN="${4:-}"

AUTH_USER="${TQ_AUTH_USER:-}"
if [ -n "${5:-}" ]; then
    AUTH_USER="$5"
fi

AUTH_PASS="${TQ_AUTH_PASS:-}"
if [ -n "${6:-}" ]; then
    AUTH_PASS="$6"
fi

# ── Configuration ────────────────────────────────────────────────────────────
# Fixed install locations and download sources used by the steps below.
# Everything the script writes lives under one root directory.
INSTALL_ROOT="/root"

# Local paths
PIPELINE_DIR="${INSTALL_ROOT}/ascad-training-pipeline"
DATA_DIR="${INSTALL_ROOT}/ascad_data"
LOG_FILE="${INSTALL_ROOT}/worker.log"

# Remote sources
HF_REPO="lemousehunter/ascad-training-pipeline"
ASCAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"

#######################################
# Describe whether queue-server authentication is configured.
# Arguments: $1 - auth user name (may be empty or omitted)
# Outputs:   "enabled (user=<name>)" when a name is given, else "disabled"
#            (written to stdout without a trailing newline)
#######################################
auth_summary() {
    local user="${1:-}"
    if [ -n "${user}" ]; then
        printf 'enabled (user=%s)' "${user}"
    else
        printf 'disabled'
    fi
}

# Deployment banner. The auth line goes through auth_summary because chaining
# ${AUTH_USER:+...} with ${AUTH_USER:-disabled} prints the user name a second
# time whenever it is set (":-" expands to the value itself when non-empty).
echo "============================================"
echo "  ASCAD Worker Deployment"
echo "  Server:    ${SERVER_URL}"
echo "  Worker ID: ${WORKER_ID}"
echo "  Auth:      $(auth_summary "${AUTH_USER:-}")"
echo "============================================"

# ── Step 1: System packages ──────────────────────────────────────────────────
# Installs the command-line tools this script relies on: git, wget, unzip and
# screen.
echo "[1/6] Installing system packages..."
# The two apt-get calls are separate statements on purpose: in an `a && b`
# list a failure of `a` is exempt from `set -e`, so a failed update would fall
# through to "Done." with nothing installed.
# stdin comes from /dev/null so that, when this script is itself being read
# from stdin (`bash -s`), apt/dpkg cannot consume the remaining script text.
apt-get update -qq < /dev/null
# Only stdout is silenced; error output stays visible so that a failure (which
# aborts the script under `set -e`) can be diagnosed.
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git wget unzip screen \
    < /dev/null > /dev/null
echo "  Done."

# ── Step 2: Configure credentials ───────────────────────────────────────────
# Both logins are optional and only run when the matching token was supplied.
echo "[2/6] Configuring credentials..."
if [ -n "${HF_TOKEN}" ]; then
    pip install -q huggingface_hub
    # Hand the token to Python through the environment instead of pasting it
    # into the source text: a quote or backslash in the token would otherwise
    # break (or inject into) the generated Python code. A script-specific
    # variable name is used rather than one the library itself might read.
    DEPLOY_HF_TOKEN="${HF_TOKEN}" python3 -c '
import os
from huggingface_hub import login
login(token=os.environ["DEPLOY_HF_TOKEN"])
'
    echo "  HuggingFace: logged in"
fi

if [ -n "${WANDB_TOKEN}" ]; then
    pip install -q wandb
    # The CLI's stderr is still discarded, so report the failure explicitly;
    # otherwise `set -e` would abort the script without any message.
    if ! wandb login "${WANDB_TOKEN}" 2>/dev/null; then
        echo "  ERROR: W&B login failed (rejected token or wandb CLI error)." >&2
        exit 1
    fi
    echo "  W&B: logged in"
fi

# ── Step 3: Fetch training pipeline ─────────────────────────────────────────
# Fetches the pipeline into PIPELINE_DIR. A first run creates the directory
# with snapshot_download, which does not produce a git checkout, so `git pull`
# is only attempted when the directory really is a git repository. Otherwise
# the snapshot download is run again, so re-running this script does not abort
# on a failing `git pull`.
echo "[3/6] Downloading training pipeline from HuggingFace..."
if [ -d "${PIPELINE_DIR}/.git" ]; then
    echo "  Pipeline directory exists, pulling latest..."
    git -C "${PIPELINE_DIR}" pull
else
    if [ -d "${PIPELINE_DIR}" ]; then
        echo "  Pipeline directory exists, refreshing snapshot..."
    fi
    pip install -q huggingface_hub
    # Repo id and target directory are passed through the environment rather
    # than interpolated into the Python source text.
    DEPLOY_HF_REPO="${HF_REPO}" DEPLOY_PIPELINE_DIR="${PIPELINE_DIR}" python3 -c '
import os
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=os.environ["DEPLOY_HF_REPO"],
    repo_type="model",
    local_dir=os.environ["DEPLOY_PIPELINE_DIR"],
)
print("  Downloaded.")
'
fi

# ── Step 4: Install Python dependencies ─────────────────────────────────────
# Changes into the pipeline directory and installs the packages listed in its
# requirements.txt.
printf '%s\n' "[4/6] Installing Python dependencies..."
cd -- "${PIPELINE_DIR}"
pip install --quiet --requirement requirements.txt
printf '%s\n' "  Done."

# ── Step 5: Download ASCAD dataset ──────────────────────────────────────────
# Downloads and extracts ASCAD_data.zip into DATA_DIR unless ASCAD.h5 is
# already present from an earlier run.
echo "[5/6] Downloading ASCAD dataset..."
mkdir -p "${DATA_DIR}"

ascad_db_dir="${DATA_DIR}/ASCAD_data/ASCAD_databases"
ascad_zip="${DATA_DIR}/ASCAD_data.zip"

if [ -f "${ascad_db_dir}/ASCAD.h5" ]; then
    echo "  ASCAD dataset already exists, skipping download."
else
    echo "  Downloading ASCAD_data.zip (~1.5 GB)..."
    wget -q --show-progress -O "${ascad_zip}" "${ASCAD_URL}"
    echo "  Extracting..."
    # -o overwrites leftovers of an interrupted earlier extraction instead of
    # prompting (a prompt would block, or read from the piped script, in a
    # non-interactive run); -d extracts into DATA_DIR without changing the
    # working directory.
    unzip -q -o "${ascad_zip}" -d "${DATA_DIR}"
    rm -f -- "${ascad_zip}"
    # Fail loudly if the archive layout is not what the check above expects.
    if [ ! -f "${ascad_db_dir}/ASCAD.h5" ]; then
        echo "  ERROR: ${ascad_db_dir}/ASCAD.h5 not found after extraction." >&2
        exit 1
    fi
    echo "  Done."
fi

# Also check for raw traces (needed for per-byte window extraction).
# Look in DATA_DIR itself and next to ASCAD.h5, so the note is not printed when
# the file sits inside the extracted tree.
# NOTE(review): confirm which of the two locations the worker actually reads.
if [ ! -f "${DATA_DIR}/ATMega8515_raw_traces.h5" ] \
    && [ ! -f "${ascad_db_dir}/ATMega8515_raw_traces.h5" ]; then
    echo "  Note: ATMega8515_raw_traces.h5 not found."
    echo "  The MTAN model uses the global window from raw traces."
    echo "  Download separately if needed for Experiment 2."
fi

# ── Step 6: Start worker agent ──────────────────────────────────────────────
echo "[6/6] Starting worker agent..."
echo "  Log file: ${LOG_FILE}"

# Export Vast.ai instance ID if available
export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

#######################################
# Print the shell script that is run inside the screen session.
# Every value is escaped with printf %q, so quotes, spaces or shell
# metacharacters in credentials, URL, worker ID or paths reach the agent
# verbatim instead of breaking (or injecting into) the `bash -c` string.
# Globals:   SERVER_URL, WORKER_ID, DATA_DIR, PIPELINE_DIR, LOG_FILE,
#            AUTH_USER, AUTH_PASS (all read)
# Outputs:   two lines on stdout: an export of TQ_AUTH_USER/TQ_AUTH_PASS, then
#            the `cd ... && python3 -m orchestrator.worker.agent ...` pipeline
#######################################
build_worker_script() {
    local -a agent_cmd=(
        python3 -m orchestrator.worker.agent
        --server-url "${SERVER_URL}"
        --worker-id "${WORKER_ID}"
        --data-dir "${DATA_DIR}"
        --pipeline-dir "${PIPELINE_DIR}"
    )
    # Auth flags are only passed when both user and password are available.
    if [ -n "${AUTH_USER}" ] && [ -n "${AUTH_PASS}" ]; then
        agent_cmd+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
    fi
    agent_cmd+=(--forward-logs "${LOG_FILE}")

    local agent_quoted
    printf -v agent_quoted '%q ' "${agent_cmd[@]}"   # leaves a trailing space

    printf 'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q\n' "${AUTH_USER}" "${AUTH_PASS}"
    printf 'cd %q && %s2>&1 | tee -a %q\n' \
        "${PIPELINE_DIR}" "${agent_quoted}" "${LOG_FILE}"
}

# Start the worker in a detached screen session for persistence.
# NOTE(review): screen returns as soon as the session is created, so the
# success banner below does not prove the agent itself started; check the log.
screen -dmS worker bash -c "$(build_worker_script)"

echo ""
echo "============================================"
echo "  Worker deployed successfully!"
echo "  Screen session: worker"
echo "  Attach: screen -r worker"
echo "  Log: tail -f ${LOG_FILE}"
echo "============================================"