#!/usr/bin/env bash
# ============================================================================
# deploy_worker.sh — Deploy a worker agent on a Vast.ai GPU instance
# ============================================================================
# This script is run via SSH on a freshly provisioned Vast.ai instance.
# It installs dependencies, downloads the training pipeline and ASCAD dataset,
# and starts the worker agent pointing to the queue server.
#
# Usage:
# ssh -p <port> root@<host> 'bash -s' < scripts/deploy_worker.sh \
# <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]
#
# Or copy and run directly on the instance:
# bash deploy_worker.sh http://queue-server:8080 worker-001 hf_xxx wandb_xxx admin s3cret
#
# Arguments:
#   $1 — Queue server URL (e.g., http://1.2.3.4:8080)
#   $2 — Unique worker ID (e.g., worker-38090)
#   $3 — HuggingFace token (optional, for model uploads)
#   $4 — Weights & Biases token (optional, for experiment tracking)
#   $5 — Auth username for queue server (optional, or set TQ_AUTH_USER)
#   $6 — Auth password for queue server (optional, or set TQ_AUTH_PASS)
# ============================================================================
# Strict mode: stop on failing commands, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ── Arguments ────────────────────────────────────────────────────────────────
# $1 and $2 are mandatory: when either is missing or empty, the ${N:?} expansion
# prints the usage text on stderr and the (non-interactive) shell exits.
usage_text='Usage: deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]'
SERVER_URL="${1:?${usage_text}}"
WORKER_ID="${2:?${usage_text}}"

# Optional service tokens; an empty value disables the matching login step.
HF_TOKEN="${3:-}"
WANDB_TOKEN="${4:-}"

# Queue-server credentials: the positional argument wins, then the TQ_AUTH_*
# environment variable, otherwise empty.
AUTH_USER="${5:-${TQ_AUTH_USER:-}}"
AUTH_PASS="${6:-${TQ_AUTH_PASS:-}}"
# ── Configuration ────────────────────────────────────────────────────────────
# Remote sources: the HuggingFace repo id of the training pipeline and the
# download URL of the ASCAD dataset archive.
HF_REPO='lemousehunter/ascad-training-pipeline'
ASCAD_URL='https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e'

# Locations on the instance: pipeline checkout, dataset directory, worker log.
PIPELINE_DIR='/root/ascad-training-pipeline'
DATA_DIR='/root/ascad_data'
LOG_FILE='/root/worker.log'
#######################################
# Print the deployment banner.
# Globals:   SERVER_URL, WORKER_ID, AUTH_USER (read)
# Outputs:   six banner lines on stdout
#######################################
print_deploy_banner() {
  # The auth status is computed once. Concatenating "${AUTH_USER:+...}" with
  # "${AUTH_USER:-disabled}" printed the user name a second time whenever a
  # user was configured ("enabled (user=admin)admin").
  local auth_status="disabled"
  if [[ -n "${AUTH_USER:-}" ]]; then
    auth_status="enabled (user=${AUTH_USER})"
  fi

  printf '%s\n' \
    "============================================" \
    " ASCAD Worker Deployment" \
    " Server: ${SERVER_URL}" \
    " Worker ID: ${WORKER_ID}" \
    " Auth: ${auth_status}" \
    "============================================"
}

print_deploy_banner
# ── Step 1: System packages ──────────────────────────────────────────────────
# Installs git, wget, unzip and screen via apt.
#
# The exit status is checked explicitly: in "update && install" under `set -e`
# a failing `apt-get update` does not abort the script (only the last command
# of an && list triggers errexit), so the install would be skipped silently.
# Output is captured instead of discarded so it can be shown when something
# goes wrong. stdin is detached because the script itself may arrive on stdin
# (`bash -s` usage in the header), and DEBIAN_FRONTEND=noninteractive keeps
# package configuration from prompting.
echo "[1/6] Installing system packages..."
apt_status=0
apt_output="$(
  {
    apt-get update -qq &&
      DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git wget unzip screen
  } 2>&1 </dev/null
)" || apt_status=$?
if ((apt_status != 0)); then
  printf ' ERROR: system package installation failed (exit %d):\n%s\n' \
    "${apt_status}" "${apt_output}" >&2
  exit "${apt_status}"
fi
echo " Done."
# ── Step 2: Configure credentials ────────────────────────────────────────────
#######################################
# Log in to HuggingFace and/or Weights & Biases when a token was supplied.
# Globals:   HF_TOKEN, WANDB_TOKEN (read)
# Outputs:   progress lines on stdout, error message on stderr
# Returns:   0 on success (or nothing to do), the failing command's status
#            otherwise (1 for a failed W&B login)
#######################################
configure_credentials() {
  echo "[2/6] Configuring credentials..."

  if [[ -n "${HF_TOKEN:-}" ]]; then
    pip install -q huggingface_hub || return
    # The token is handed over through the environment rather than being
    # spliced into the Python source: a quote or backslash in the value can no
    # longer break (or inject into) the snippet, and the token stays out of
    # the process argument list.
    DEPLOY_HF_TOKEN="${HF_TOKEN}" python3 -c \
      'import os; from huggingface_hub import login; login(token=os.environ["DEPLOY_HF_TOKEN"])' ||
      return
    echo " HuggingFace: logged in"
  fi

  if [[ -n "${WANDB_TOKEN:-}" ]]; then
    pip install -q wandb || return
    # stderr stays suppressed as before, but a failed login is now reported
    # instead of ending the script without any message.
    if ! wandb login "${WANDB_TOKEN}" 2>/dev/null; then
      echo " ERROR: W&B login failed (run 'wandb login' manually for details)" >&2
      return 1
    fi
    echo " W&B: logged in"
  fi
}

configure_credentials
# ── Step 3: Download training pipeline ───────────────────────────────────────
# Fetches the pipeline from the HuggingFace Hub into PIPELINE_DIR.
#
# `git pull` is only used when PIPELINE_DIR really is a git checkout. A
# directory created by snapshot_download has no .git, so on a second run the
# previous "directory exists -> git pull" branch failed and aborted the
# deployment. In every other case snapshot_download is (re-)run against the
# same local_dir.
echo "[3/6] Downloading training pipeline from HuggingFace..."
if [[ -d "${PIPELINE_DIR}/.git" ]]; then
  echo " Pipeline directory exists, pulling latest..."
  git -C "${PIPELINE_DIR}" pull
else
  if [[ -d "${PIPELINE_DIR}" ]]; then
    echo " Pipeline directory exists, refreshing snapshot..."
  fi
  pip install -q huggingface_hub
  # Repo id and target directory are passed as arguments (sys.argv) instead of
  # being interpolated into the Python source.
  python3 -c '
import sys
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=sys.argv[1],
    repo_type="model",
    local_dir=sys.argv[2],
)
print(" Downloaded.")
' "${HF_REPO}" "${PIPELINE_DIR}"
fi
# ── Step 4: Install Python dependencies ──────────────────────────────────────
# pip runs from inside the pipeline directory and reads its requirements.txt;
# the script stays in that directory afterwards.
printf '%s\n' "[4/6] Installing Python dependencies..."
cd "${PIPELINE_DIR}"
pip install --quiet --requirement requirements.txt
printf '%s\n' " Done."
# ── Step 5: Download ASCAD dataset ───────────────────────────────────────────
# Downloads and unpacks the dataset archive into DATA_DIR unless the extracted
# ASCAD.h5 database is already present.
echo "[5/6] Downloading ASCAD dataset..."
mkdir -p "${DATA_DIR}"

ascad_db="${DATA_DIR}/ASCAD_data/ASCAD_databases/ASCAD.h5"
if [[ ! -f "${ascad_db}" ]]; then
  echo " Downloading ASCAD_data.zip (~1.5 GB)..."
  wget -q --show-progress -O "${DATA_DIR}/ASCAD_data.zip" "${ASCAD_URL}"
  echo " Extracting..."
  # Unpack inside DATA_DIR, then drop the archive (paths relative to DATA_DIR).
  cd "${DATA_DIR}" && unzip -q ASCAD_data.zip
  rm -f ASCAD_data.zip
  echo " Done."
else
  echo " ASCAD dataset already exists, skipping download."
fi

# Informational only: the raw-trace file is not downloaded by this script.
raw_traces="${DATA_DIR}/ATMega8515_raw_traces.h5"
if [[ ! -f "${raw_traces}" ]]; then
  printf '%s\n' \
    " Note: ATMega8515_raw_traces.h5 not found." \
    " The MTAN model uses the global window from raw traces." \
    " Download separately if needed for Experiment 2."
fi
# ── Step 6: Start worker agent ───────────────────────────────────────────────
echo "[6/6] Starting worker agent..."
echo " Log file: ${LOG_FILE}"

# Export Vast.ai instance ID if available
export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

#######################################
# Emit the shell command line that the detached screen session executes.
# Every value is escaped with printf %q, so URLs, ids, paths and credentials
# containing quotes, spaces or other shell metacharacters reach the agent
# unchanged. Wrapping them in literal single quotes broke on any value that
# itself contained a single quote.
# Globals:   SERVER_URL, WORKER_ID, DATA_DIR, PIPELINE_DIR, LOG_FILE,
#            AUTH_USER, AUTH_PASS (read)
# Outputs:   the command text on stdout (intended for `bash -c`)
#######################################
build_worker_command() {
  local -a agent_cmd=(
    python3 -m orchestrator.worker.agent
    --server-url "${SERVER_URL}"
    --worker-id "${WORKER_ID}"
    --data-dir "${DATA_DIR}"
    --pipeline-dir "${PIPELINE_DIR}"
  )
  # Auth flags are only passed when both user and password are non-empty.
  # NOTE(review): the password is also exported as TQ_AUTH_PASS below; if the
  # agent reads that variable, the --auth-pass flag (visible in `ps`) could be
  # dropped — confirm against the agent's CLI.
  if [[ -n "${AUTH_USER}" && -n "${AUTH_PASS}" ]]; then
    agent_cmd+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
  fi
  agent_cmd+=(--forward-logs "${LOG_FILE}")

  printf 'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q\n' "${AUTH_USER}" "${AUTH_PASS}"
  printf 'cd %q && ' "${PIPELINE_DIR}"
  printf '%q ' "${agent_cmd[@]}"
  printf '2>&1 | tee -a %q\n' "${LOG_FILE}"
}

# Start the worker in a detached screen session so it survives the SSH logout.
worker_cmd="$(build_worker_command)"
screen -dmS worker bash -c "${worker_cmd}"
#######################################
# Print the closing summary: screen session name, how to attach to it, and
# how to follow the worker log.
# Globals:   LOG_FILE (read)
# Outputs:   a blank line followed by six summary lines on stdout
#######################################
print_deploy_summary() {
  printf '%s\n' \
    "" \
    "============================================" \
    " Worker deployed successfully!" \
    " Screen session: worker" \
    " Attach: screen -r worker" \
    " Log: tail -f ${LOG_FILE}" \
    "============================================"
}

# The lone "|" that followed the last echo was removed: a pipe with no command
# after it is a syntax error at end of file.
print_deploy_summary