lemousehunter

feat: LMIC-TSBN model + persistence fixes across restarts

cbb6546 24 days ago

6.43 kB

	#!/usr/bin/env bash
	# ============================================================================
	# deploy_worker.sh — Deploy a worker agent on a Vast.ai GPU instance
	# ============================================================================
	# This script is run via SSH on a freshly provisioned Vast.ai instance.
	# It installs dependencies, downloads the training pipeline and ASCAD dataset,
	# and starts the worker agent pointing to the queue server.
	#
	# Usage:
	# ssh -p <port> root@<host> 'bash -s' < scripts/deploy_worker.sh \
	# <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]
	#
	# Or copy and run directly on the instance:
	# bash deploy_worker.sh http://queue-server:8080 worker-001 hf_xxx wandb_xxx admin s3cret
	#
	# Arguments:
	# $1 — Queue server URL (e.g., http://1.2.3.4:8080)
	# $2 — Unique worker ID (e.g., worker-38090)
	# $3 — HuggingFace token (optional, for model uploads)
	# $4 — Weights & Biases token (optional, for experiment tracking)
	# $5 — Auth username for queue server (optional, or set TQ_AUTH_USER)
	# $6 — Auth password for queue server (optional, or set TQ_AUTH_PASS)
	# ============================================================================

	# Strict mode: stop on command failure, on use of an unset variable, and on a
	# failure anywhere inside a pipeline.
	set -euo pipefail

	# ── Arguments ────────────────────────────────────────────────────────────────
	# $1 and $2 are mandatory; when either is missing or empty the shell aborts and
	# prints the usage text. $3-$6 are optional and default to the empty string.
	USAGE_TEXT="Usage: deploy_worker.sh <server_url> <worker_id> [hf_token] [wandb_token] [auth_user] [auth_pass]"
	SERVER_URL="${1:?${USAGE_TEXT}}"
	WORKER_ID="${2:?${USAGE_TEXT}}"
	HF_TOKEN="${3:-}"
	WANDB_TOKEN="${4:-}"

	# Queue-server credentials: a non-empty positional argument wins, otherwise
	# fall back to the TQ_AUTH_USER / TQ_AUTH_PASS environment variables.
	AUTH_USER="${5:-}"
	if [ -z "${AUTH_USER}" ]; then
	  AUTH_USER="${TQ_AUTH_USER:-}"
	fi
	AUTH_PASS="${6:-}"
	if [ -z "${AUTH_PASS}" ]; then
	  AUTH_PASS="${TQ_AUTH_PASS:-}"
	fi

	# ── Configuration ────────────────────────────────────────────────────────────
	# Fixed paths and download sources used by the steps below.
	PIPELINE_DIR="/root/ascad-training-pipeline"
	DATA_DIR="/root/ascad_data"
	HF_REPO="lemousehunter/ascad-training-pipeline"
	ASCAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
	LOG_FILE="/root/worker.log"

	#######################################
	# Render the "Auth:" value shown in the deployment banner.
	# Arguments: $1 - auth user name (may be empty or omitted)
	# Outputs:   "enabled (user=<name>)" when a name is given, else "disabled"
	#######################################
	auth_status_label() {
	  if [ -n "${1:-}" ]; then
	    printf 'enabled (user=%s)' "$1"
	  else
	    printf 'disabled'
	  fi
	}

	echo "============================================"
	echo " ASCAD Worker Deployment"
	echo " Server: ${SERVER_URL}"
	echo " Worker ID: ${WORKER_ID}"
	# A single either/or choice: chaining ${VAR:+...} with ${VAR:-...} would append
	# the user name a second time whenever a user is configured.
	echo " Auth: $(auth_status_label "${AUTH_USER:-}")"
	echo "============================================"

	# ── Step 1: System packages ──────────────────────────────────────────────────
	# Install the command-line tools this script relies on: git, wget, unzip, screen.
	echo "[1/6] Installing system packages..."
	# Two separate statements: in `a && b` a failure of `a` is exempt from `set -e`,
	# so a failed index update would skip the install and still report " Done.".
	apt-get update -qq
	# stdout is discarded to keep the output short, but stderr stays visible so a
	# failed install can be diagnosed. stdin comes from /dev/null so nothing run by
	# apt can consume the rest of this script when it is fed in via `bash -s`.
	DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git wget unzip screen \
	  > /dev/null < /dev/null
	echo " Done."

	# ── Step 2: Configure credentials ───────────────────────────────────────────
	# Log in to HuggingFace and/or Weights & Biases, each only when its token was
	# supplied on the command line.
	echo "[2/6] Configuring credentials..."
	if [ -n "${HF_TOKEN}" ]; then
	  pip install -q huggingface_hub
	  # Pass the token through the environment instead of splicing it into Python
	  # source: a token containing a quote would otherwise break (or alter) the code.
	  HF_TOKEN="${HF_TOKEN}" python3 -c \
	    'import os; from huggingface_hub import login; login(token=os.environ["HF_TOKEN"])'
	  echo " HuggingFace: logged in"
	fi

	if [ -n "${WANDB_TOKEN}" ]; then
	  pip install -q wandb
	  # stderr from wandb is discarded, so report a failed login explicitly rather
	  # than letting `set -e` end the script without any message.
	  if ! wandb login "${WANDB_TOKEN}" 2>/dev/null; then
	    echo " W&B: login failed" >&2
	    exit 1
	  fi
	  echo " W&B: logged in"
	fi

	# ── Step 3: Clone training pipeline ─────────────────────────────────────────
	# Fetch (or refresh) the training pipeline into PIPELINE_DIR.
	echo "[3/6] Downloading training pipeline from HuggingFace..."
	if [ -d "${PIPELINE_DIR}/.git" ]; then
	  # Only a real git checkout can be pulled.
	  echo " Pipeline directory exists, pulling latest..."
	  git -C "${PIPELINE_DIR}" pull
	else
	  # A directory created by a previous snapshot download is not a git
	  # repository, so `git pull` there would abort every re-run of this script.
	  # Run the snapshot download again instead.
	  if [ -d "${PIPELINE_DIR}" ]; then
	    echo " Pipeline directory exists, refreshing snapshot..."
	  fi
	  pip install -q huggingface_hub
	  # Values reach Python through the environment (no shell-to-Python string
	  # splicing), and the code is a single line so it does not depend on how
	  # this script happens to be indented.
	  HF_REPO="${HF_REPO}" PIPELINE_DIR="${PIPELINE_DIR}" python3 -c \
	    'import os; from huggingface_hub import snapshot_download; snapshot_download(repo_id=os.environ["HF_REPO"], repo_type="model", local_dir=os.environ["PIPELINE_DIR"]); print(" Downloaded.")'
	fi

	# ── Step 4: Install Python dependencies ─────────────────────────────────────
	# Install the pipeline's requirements.txt from inside the pipeline directory;
	# the working directory stays there afterwards.
	printf '%s\n' "[4/6] Installing Python dependencies..."
	cd "${PIPELINE_DIR}"
	pip install --quiet --requirement requirements.txt
	printf '%s\n' " Done."

	# ── Step 5: Download ASCAD dataset ──────────────────────────────────────────
	# Download and unpack the ASCAD archive into DATA_DIR unless ASCAD.h5 is
	# already present.
	echo "[5/6] Downloading ASCAD dataset..."
	mkdir -p "${DATA_DIR}"

	ascad_zip="${DATA_DIR}/ASCAD_data.zip"
	if [ -f "${DATA_DIR}/ASCAD_data/ASCAD_databases/ASCAD.h5" ]; then
	  echo " ASCAD dataset already exists, skipping download."
	else
	  echo " Downloading ASCAD_data.zip (~1.5 GB)..."
	  wget -q --show-progress -O "${ascad_zip}" "${ASCAD_URL}"
	  echo " Extracting..."
	  # -o overwrites without asking: after an interrupted earlier extraction unzip
	  # would otherwise prompt and read its answer from stdin, which is this very
	  # script when it is run via `bash -s`. -d avoids changing the working
	  # directory as a side effect.
	  unzip -o -q "${ascad_zip}" -d "${DATA_DIR}"
	  rm -f -- "${ascad_zip}"
	  echo " Done."
	fi

	# Also check for raw traces (needed for per-byte window extraction)
	# NOTE(review): this looks directly under DATA_DIR, while ASCAD.h5 above is
	# expected under ASCAD_data/ASCAD_databases/ — confirm which location the
	# worker actually reads the raw traces from.
	if [ ! -f "${DATA_DIR}/ATMega8515_raw_traces.h5" ]; then
	  echo " Note: ATMega8515_raw_traces.h5 not found."
	  echo " The MTAN model uses the global window from raw traces."
	  echo " Download separately if needed for Experiment 2."
	fi

	# ── Step 6: Start worker agent ──────────────────────────────────────────────
	# Launch the worker agent inside a detached screen session named "worker",
	# with its combined stdout/stderr appended to LOG_FILE.
	echo "[6/6] Starting worker agent..."
	echo " Log file: ${LOG_FILE}"

	# Export Vast.ai instance ID if available
	export VAST_INSTANCE_ID="${VAST_INSTANCE_ID:-unknown}"

	#######################################
	# Print the shell script that the screen session executes.
	# Globals:   SERVER_URL, WORKER_ID, DATA_DIR, PIPELINE_DIR, LOG_FILE,
	#            AUTH_USER, AUTH_PASS (all read)
	# Outputs:   the script text on stdout; every value is quoted with printf %q,
	#            so spaces and quote characters in URLs or passwords survive the
	#            extra `bash -c` parsing round
	#######################################
	build_worker_session_script() {
	  local -a agent_cmd=(
	    python3 -m orchestrator.worker.agent
	    --server-url "${SERVER_URL:-}"
	    --worker-id "${WORKER_ID:-}"
	    --data-dir "${DATA_DIR:-}"
	    --pipeline-dir "${PIPELINE_DIR:-}"
	  )
	  # Auth flags are passed only when both user and password are present.
	  if [ -n "${AUTH_USER:-}" ] && [ -n "${AUTH_PASS:-}" ]; then
	    agent_cmd+=(--auth-user "${AUTH_USER}" --auth-pass "${AUTH_PASS}")
	  fi
	  agent_cmd+=(--forward-logs "${LOG_FILE:-}")

	  local agent_cmd_quoted
	  printf -v agent_cmd_quoted '%q ' "${agent_cmd[@]}"

	  # The pipe must be a real, unescaped `|`: written as `\|` the inner shell
	  # treats it as a literal word, the agent receives "|", "tee", ... as
	  # arguments and nothing is written to the log file.
	  printf 'export TQ_AUTH_USER=%q TQ_AUTH_PASS=%q\n' "${AUTH_USER:-}" "${AUTH_PASS:-}"
	  printf 'cd %q && %s2>&1 | tee -a %q\n' \
	    "${PIPELINE_DIR:-}" "${agent_cmd_quoted}" "${LOG_FILE:-}"
	}

	# Start the worker in a screen session for persistence
	screen -dmS worker bash -c "$(build_worker_session_script)"

	# Closing banner: an empty line, then how to attach to the screen session and
	# how to follow the log file.
	printf '%s\n' \
	  "" \
	  "============================================" \
	  " Worker deployed successfully!" \
	  " Screen session: worker" \
	  " Attach: screen -r worker" \
	  " Log: tail -f ${LOG_FILE}" \
	  "============================================"