File size: 8,423 Bytes

#!/bin/bash
# ============================================================================
# ASCAD Training Worker - Instance Setup Script
# ============================================================================
# Run this ONCE after a Vast.ai instance boots with the pre-baked Docker
# image (tensorflow/tensorflow:2.16.2-gpu).  It handles everything that
# cannot be baked into the image:
#
#   1. Install pip dependencies (binary wheels only — fast)
#   2. Pull latest pipeline code from HuggingFace
#   3. Download & extract the ASCAD dataset (if not already present)
#   4. Verify GPU availability
#   5. Optionally start the worker agent
#
# Usage:
#   # Minimal (just set up the environment; bracketed flags are optional):
#   bash setup.sh [--skip-data] [--data-dir DIR] [--pipeline-dir DIR]
#
#   # Full (set up + start worker agent):
#   bash setup.sh --start-worker \
#       --server-url http://ORCH_IP:8080 \
#       --worker-id worker-a100-1 \
#       --auth-user admin \
#       --auth-pass SECRET \
#       --wandb-key YOUR_WANDB_KEY
#
# Environment variables (alternative to flags):
#   TQ_SERVER_URL   - Orchestrator URL
#   TQ_AUTH_USER    - Auth username
#   TQ_AUTH_PASS    - Auth password
#   WORKER_ID       - Unique worker ID
#   WANDB_API_KEY   - W&B API key
#   HF_TOKEN        - HuggingFace token (optional, for private repos)
# ============================================================================

set -euo pipefail

# ── Parse arguments ─────────────────────────────────────────────────────────
# Defaults are taken from the environment (see the header comment); any flag
# given on the command line overrides the corresponding default.
START_WORKER=false
SERVER_URL="${TQ_SERVER_URL:-}"
AUTH_USER="${TQ_AUTH_USER:-admin}"
AUTH_PASS="${TQ_AUTH_PASS:-}"
# Fall back to bash's own $HOSTNAME when the `hostname` binary is unavailable,
# so a minimal image does not abort here under `set -e`.
DEFAULT_HOST="$(hostname 2>/dev/null || printf '%s' "${HOSTNAME:-unknown}")"
WORKER_ID_ARG="${WORKER_ID:-worker-${DEFAULT_HOST}}"
WANDB_KEY="${WANDB_API_KEY:-}"
DATA_DIR="/root/ascad_data"
PIPELINE_DIR="/root/ascad-training-pipeline"
SKIP_DATA=false

# Print the header comment block of this script as help text.
# The block is located by content (first run of '#' lines after the shebang)
# instead of by fixed line numbers, so editing the header cannot truncate it.
print_usage() {
    awk '
        /^#!/ && !in_header { next }
        /^#/ { in_header = 1; sub(/^# ?/, ""); print; next }
        in_header { exit }
    ' "$0"
}

# Parse command-line flags into the globals declared above.
# Arguments: the script's positional parameters ("$@").
# Exits 0 after printing help, 1 on an unknown flag or a flag missing its value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --start-worker) START_WORKER=true; shift ;;
            --skip-data)    SKIP_DATA=true; shift ;;
            --server-url|--worker-id|--auth-user|--auth-pass|--wandb-key|--data-dir|--pipeline-dir)
                # Without this check `set -u` would die with "$2: unbound variable".
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                case "$1" in
                    --server-url)   SERVER_URL="$2" ;;
                    --worker-id)    WORKER_ID_ARG="$2" ;;
                    --auth-user)    AUTH_USER="$2" ;;
                    --auth-pass)    AUTH_PASS="$2" ;;
                    --wandb-key)    WANDB_KEY="$2" ;;
                    --data-dir)     DATA_DIR="$2" ;;
                    --pipeline-dir) PIPELINE_DIR="$2" ;;
                esac
                shift 2
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown argument: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# ── Logging ─────────────────────────────────────────────────────────────────
# From here on, stdout and stderr are both piped through `tee`, so everything
# is shown on the terminal and also written to LOG_FILE.
LOG_FILE="/root/setup.log"
exec > >(tee "$LOG_FILE") 2>&1

# ANSI colour codes, stored as literal backslash sequences; they are expanded
# at print time by printf's %b.
BOLD="\033[1m"
GREEN="\033[32m"
YELLOW="\033[33m"
RESET="\033[0m"

# step N MESSAGE — print a "[N/TOTAL_STEPS] MESSAGE" section heading.
step() {
    printf '%b\n' "\n${BOLD}${GREEN}[$1/$TOTAL_STEPS]${RESET} ${BOLD}$2${RESET}"
}

# warn MESSAGE — print an indented warning in yellow.
warn() {
    printf '%b\n' "  ${YELLOW}⚠ $1${RESET}"
}

# ok MESSAGE — print an indented success line.
ok() {
    printf '%b\n' "  ✓ $1"
}

# The worker launch is an extra, fifth step.
TOTAL_STEPS=4
if $START_WORKER; then
    TOTAL_STEPS=5
fi

printf '%s\n' \
    "============================================" \
    "  ASCAD Training Worker - Setup" \
    "  $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "============================================"

# ── Step 1: Install pip dependencies ────────────────────────────────────────
step 1 "Installing pip dependencies (binary wheels)..."
STARTED=$(date +%s)

# Packages installed on top of the base image.
PIP_PACKAGES=(
    scipy
    scikit-learn
    wandb
    huggingface_hub
    websocket-client
)

# --only-binary :all: refuses source builds; only the last three lines of
# pip's output are shown.
pip3 install --quiet --no-cache-dir --only-binary :all: "${PIP_PACKAGES[@]}" 2>&1 \
    | tail -3

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Done in ${ELAPSED}s"

# ── Step 2: Pull latest code from HuggingFace ──────────────────────────────
step 2 "Pulling pipeline code from HuggingFace..."
STARTED=$(date +%s)

# The target directory is handed to Python through the environment rather than
# being spliced into the source text, so a path containing quotes or
# backslashes cannot break (or inject into) the Python snippet.
#
# `grep -v` exits 1 when it selects no lines; as the last pipeline stage that
# would abort the script under `set -e` whenever the download prints nothing.
# `|| true` neutralises only grep's status — with `pipefail` a failing python3
# still fails the pipeline.
ASCAD_PIPELINE_DIR="$PIPELINE_DIR" python3 - <<'PY' 2>&1 | { grep -v '^$' || true; }
import os

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id='lemousehunter/ascad-training-pipeline',
    repo_type='model',
    local_dir=os.environ['ASCAD_PIPELINE_DIR'],
    # An unset or empty HF_TOKEN means "no explicit token".
    token=os.environ.get('HF_TOKEN') or None,
)
PY

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Code at ${PIPELINE_DIR} (${ELAPSED}s)"

# Clear any stale bytecode. -prune stops find from descending into a directory
# that is about to be removed; the cleanup stays best-effort (|| true).
find "${PIPELINE_DIR}" -type d -name "__pycache__" -prune -exec rm -rf -- {} + 2>/dev/null || true
ok "Cleared __pycache__"

# ── Step 3: Download ASCAD dataset ──────────────────────────────────────────
# The download is skipped when --skip-data is given or the raw-traces file is
# already present under DATA_DIR.
DATASET_FILE="${DATA_DIR}/ASCAD_data/ASCAD_databases/ATMega8515_raw_traces.h5"

if $SKIP_DATA; then
    step 3 "Skipping dataset download (--skip-data)"
elif [[ -f "$DATASET_FILE" ]]; then
    step 3 "Dataset already present, skipping download."
    ok "$DATASET_FILE exists ($(du -sh "$DATASET_FILE" | cut -f1))"
else
    step 3 "Downloading ASCAD dataset (~4.2 GB)..."
    STARTED=$(date +%s)
    mkdir -p "$DATA_DIR"

    DOWNLOAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
    ZIP_PATH="${DATA_DIR}/ASCAD_data.zip"

    # Remove a partial archive on failure so it cannot be mistaken for a
    # complete download and does not waste disk space.
    if ! wget --progress=bar:force:noscroll -O "$ZIP_PATH" "$DOWNLOAD_URL" 2>&1; then
        rm -f -- "$ZIP_PATH"
        echo "  ERROR: dataset download failed: $DOWNLOAD_URL" >&2
        exit 1
    fi

    DL_ELAPSED=$(( $(date +%s) - STARTED ))
    ok "Downloaded in ${DL_ELAPSED}s"

    echo "  Extracting..."
    # Extract with -d instead of `cd`: changing the working directory here
    # would affect only this branch and would break relative --pipeline-dir /
    # --data-dir paths used by later steps.
    unzip -o "$ZIP_PATH" -d "$DATA_DIR"
    rm -f -- "$ZIP_PATH"

    # Only report success if the archive really contained the expected file.
    if [[ ! -f "$DATASET_FILE" ]]; then
        echo "  ERROR: expected dataset file not found after extraction: $DATASET_FILE" >&2
        exit 1
    fi
    ok "Dataset ready at ${DATASET_FILE}"

    ELAPSED=$(( $(date +%s) - STARTED ))
    ok "Total data step: ${ELAPSED}s"
fi

# ── Step 4: Verify GPU ─────────────────────────────────────────────────────
# Ask TensorFlow for its physical GPU devices; the Python snippet exits
# non-zero when none are found, which aborts the script under `set -e`.
step 4 "Verifying GPU..."
python3 - <<'PY'
import sys

import tensorflow as tf

devices = tf.config.list_physical_devices('GPU')
if not devices:
    print('  WARNING: No GPU detected!')
    sys.exit(1)

print(f'  GPU detected: {len(devices)} device(s)')
for device in devices:
    print(f'    {device}')
PY
ok "GPU verified"

# ── Step 5 (optional): Start worker agent ──────────────────────────────────
# Runs only with --start-worker. Requires a server URL and an auth password,
# optionally logs in to W&B, then launches the agent in a detached `screen`
# session named "worker" with its output appended to the worker log.
if $START_WORKER; then
    step 5 "Starting worker agent..."

    if [[ -z "$SERVER_URL" ]]; then
        echo "  ERROR: --server-url is required to start the worker" >&2
        exit 1
    fi
    if [[ -z "$AUTH_PASS" ]]; then
        echo "  ERROR: --auth-pass is required to start the worker" >&2
        exit 1
    fi

    # Login to W&B if key provided. The key reaches Python via the environment
    # instead of being spliced into the source text, and success is reported
    # only when the login command actually succeeded. A failed login stays
    # non-fatal, as before.
    if [[ -n "$WANDB_KEY" ]]; then
        if WANDB_API_KEY="$WANDB_KEY" python3 -c \
            'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' 2>/dev/null; then
            ok "W&B logged in"
        else
            warn "W&B login failed; continuing without it"
        fi
    fi

    # Install screen if not present. Package lists are refreshed first because
    # container images commonly ship without them, and a failure is reported
    # instead of silently killing the script under `set -e`.
    if ! command -v screen >/dev/null 2>&1; then
        if ! { apt-get update -qq && DEBIAN_FRONTEND=noninteractive apt-get install -y -qq screen; } >/dev/null 2>&1; then
            echo "  ERROR: 'screen' is not installed and could not be installed via apt-get" >&2
            exit 1
        fi
    fi

    WORKER_LOG="/root/worker.log"

    # Build the agent command as an array and shell-quote every word with %q,
    # so values containing spaces or quotes survive the `bash -c` re-parse
    # unchanged and cannot inject extra commands.
    # NOTE(review): the password is still visible in the process list because
    # it is passed as --auth-pass; whether the agent can read it from the
    # environment instead is not visible from this script — confirm.
    WORKER_ARGV=(
        python3 -m orchestrator.worker.agent
        --server-url "$SERVER_URL"
        --worker-id "$WORKER_ID_ARG"
        --data-dir "$DATA_DIR"
        --pipeline-dir "$PIPELINE_DIR"
        --auth-user "$AUTH_USER"
        --auth-pass "$AUTH_PASS"
        --forward-logs "$WORKER_LOG"
    )
    printf -v WORKER_CMD '%q ' "${WORKER_ARGV[@]}"
    printf -v WORKER_LAUNCH 'cd %q && %s 2>&1 | tee -a %q' \
        "$PIPELINE_DIR" "$WORKER_CMD" "$WORKER_LOG"

    # Start worker in a screen session
    screen -dmS worker bash -c "$WORKER_LAUNCH"

    # `screen -dmS` returns immediately; give the agent a moment and check the
    # session still exists before claiming success. `|| true` because
    # `screen -ls` may exit non-zero even when it lists sessions.
    sleep 1
    SCREEN_SESSIONS="$(screen -ls 2>/dev/null || true)"
    if [[ "$SCREEN_SESSIONS" == *".worker"* ]]; then
        ok "Worker started in screen session 'worker'"
        echo "  Use 'screen -r worker' to attach"
    else
        warn "Worker session exited right after launch; check ${WORKER_LOG}"
    fi
fi

# ── Summary ─────────────────────────────────────────────────────────────────
# Final report: completion time, key paths, and a copy-paste command for
# starting the worker by hand.
cat <<EOF

============================================
  Setup complete!
  $(date -u '+%Y-%m-%d %H:%M:%S UTC')
============================================

  Pipeline:  ${PIPELINE_DIR}
  Data:      ${DATA_DIR}
  Log:       ${LOG_FILE}
EOF

if $START_WORKER; then
    printf '%s\n' \
        "  Worker:    screen -r worker" \
        "  Worker log: /root/worker.log"
fi

# In an unquoted here-document "\\" yields one literal backslash, which gives
# the trailing line-continuation characters of the printed command.
cat <<EOF

  Quick start (if worker not started):
    cd ${PIPELINE_DIR}
    python3 -m orchestrator.worker.agent \\
        --server-url http://ORCH_IP:8080 \\
        --worker-id ${WORKER_ID_ARG} \\
        --auth-user admin --auth-pass SECRET

EOF