lemousehunter
feat: LMIC-TSBN model + persistence fixes across restarts
cbb6546
#!/bin/bash
# ============================================================================
# ASCAD Training Worker - Instance Setup Script
# ============================================================================
# Run this ONCE after a Vast.ai instance boots with the pre-baked Docker
# image (tensorflow/tensorflow:2.16.2-gpu). It handles everything that
# cannot be baked into the image:
#
# 1. Install pip dependencies (binary wheels only — fast)
# 2. Pull latest pipeline code from HuggingFace
# 3. Download & extract the ASCAD dataset (if not already present)
# 4. Verify GPU availability
# 5. Optionally start the worker agent
#
# Usage:
# # Minimal (just set up the environment):
# bash setup.sh
#
# # Full (set up + start worker agent):
# bash setup.sh --start-worker \
# --server-url http://ORCH_IP:8080 \
# --worker-id worker-a100-1 \
# --auth-user admin \
# --auth-pass SECRET \
# --wandb-key YOUR_WANDB_KEY
#
# Environment variables (alternative to flags):
# TQ_SERVER_URL - Orchestrator URL
# TQ_AUTH_USER - Auth username
# TQ_AUTH_PASS - Auth password
# WORKER_ID - Unique worker ID
# WANDB_API_KEY - W&B API key
# HF_TOKEN - HuggingFace token (optional, for private repos)
# ============================================================================
set -euo pipefail

# ── Parse arguments ─────────────────────────────────────────────────────────
# Defaults are taken from the environment variables listed in the header and
# can be overridden by the command-line flags handled below.
START_WORKER=false
SERVER_URL="${TQ_SERVER_URL:-}"
AUTH_USER="${TQ_AUTH_USER:-admin}"
AUTH_PASS="${TQ_AUTH_PASS:-}"
# Fall back to bash's own $HOSTNAME when the `hostname` binary is missing, so
# a minimal image does not abort here under `set -e`.
WORKER_ID_ARG="${WORKER_ID:-worker-$(hostname 2>/dev/null || printf '%s' "${HOSTNAME:-unknown}")}"
WANDB_KEY="${WANDB_API_KEY:-}"
DATA_DIR="/root/ascad_data"
PIPELINE_DIR="/root/ascad-training-pipeline"
SKIP_DATA=false

# require_value FLAG ARGC
#   Exit with a readable message when FLAG is the last word on the command
#   line (ARGC < 2). Without this check, reading "$2" trips `set -u` and the
#   user only sees an "unbound variable" error.
require_value() {
  if (( $2 < 2 )); then
    echo "Missing value for $1" >&2
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --start-worker) START_WORKER=true; shift ;;
    --server-url)   require_value "$1" "$#"; SERVER_URL="$2"; shift 2 ;;
    --worker-id)    require_value "$1" "$#"; WORKER_ID_ARG="$2"; shift 2 ;;
    --auth-user)    require_value "$1" "$#"; AUTH_USER="$2"; shift 2 ;;
    --auth-pass)    require_value "$1" "$#"; AUTH_PASS="$2"; shift 2 ;;
    --wandb-key)    require_value "$1" "$#"; WANDB_KEY="$2"; shift 2 ;;
    --data-dir)     require_value "$1" "$#"; DATA_DIR="$2"; shift 2 ;;
    --pipeline-dir) require_value "$1" "$#"; PIPELINE_DIR="$2"; shift 2 ;;
    --skip-data)    SKIP_DATA=true; shift ;;
    -h|--help)
      # Print the usage portion of this file's header comment.
      head -35 "$0" | tail -30
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done
# ── Logging ─────────────────────────────────────────────────────────────────
# From here on, everything written to stdout or stderr is shown as usual and
# also copied into LOG_FILE. tee is run without -a, so each run starts a
# fresh log.
LOG_FILE="/root/setup.log"
exec > >(tee "$LOG_FILE") 2>&1

# ANSI colour codes, stored with a literal backslash; the helpers expand them
# with printf's %b.
BOLD="\033[1m"
GREEN="\033[32m"
YELLOW="\033[33m"
RESET="\033[0m"

# step N MESSAGE — print a highlighted "[N/TOTAL_STEPS] MESSAGE" heading.
# MESSAGE is printed with %s, so backslashes in it are never treated as
# escape sequences.
step() {
  printf '\n%b[%s/%s]%b %b%s%b\n' \
    "${BOLD}${GREEN}" "$1" "$TOTAL_STEPS" "$RESET" "$BOLD" "$2" "$RESET"
}

# warn MESSAGE — print a yellow warning line.
warn() { printf ' %b⚠ %s%b\n' "$YELLOW" "$1" "$RESET"; }

# ok MESSAGE — print a success line.
ok() { printf ' ✓ %s\n' "$1"; }

# Four steps always run; starting the worker adds a fifth. The flag is
# compared as a string instead of being executed as a command.
TOTAL_STEPS=4
if [[ "${START_WORKER:-false}" == true ]]; then TOTAL_STEPS=5; fi

echo "============================================"
echo " ASCAD Training Worker - Setup"
echo " $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
echo "============================================"
# ── Step 1: Install pip dependencies ────────────────────────────────────────
step 1 "Installing pip dependencies (binary wheels)..."
STARTED=$(date +%s)

# Options and package list are kept in arrays so the command line stays
# readable; --only-binary :all: restricts pip to prebuilt wheels.
pip_options=(--quiet --no-cache-dir --only-binary :all:)
pip_packages=(
  scipy
  scikit-learn
  wandb
  huggingface_hub
  websocket-client
)

# Only the last three lines of pip's combined output are shown.
pip3 install "${pip_options[@]}" "${pip_packages[@]}" 2>&1 | tail -3

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Done in ${ELAPSED}s"
# ── Step 2: Pull latest code from HuggingFace ──────────────────────────────
step 2 "Pulling pipeline code from HuggingFace..."
STARTED=$(date +%s)

# The destination directory is passed to Python through the environment
# instead of being pasted into the source text, so a path containing quotes
# or backslashes cannot break (or inject code into) the snippet.
#
# grep -v '^$' only drops blank lines from the output, but it exits 1 when it
# has nothing left to print. Under `set -o pipefail` that would abort the
# script after a successful, silent download, so its status is neutralised.
# A python3 failure still fails the pipeline.
PIPELINE_DIR="$PIPELINE_DIR" python3 -c '
import os
os.environ["HF_TOKEN"] = os.environ.get("HF_TOKEN", "")
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="lemousehunter/ascad-training-pipeline",
    repo_type="model",
    local_dir=os.environ["PIPELINE_DIR"],
)
' 2>&1 | { grep -v '^$' || true; }

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Code at ${PIPELINE_DIR} (${ELAPSED}s)"

# Clear any stale bytecode. -prune stops find from descending into a
# directory it is about to delete; the cleanup stays best-effort.
find "${PIPELINE_DIR}" -type d -name "__pycache__" -prune -exec rm -rf -- {} + 2>/dev/null || true
ok "Cleared __pycache__"
# ── Step 3: Download ASCAD dataset ──────────────────────────────────────────
# The download is skipped on request (--skip-data) or when the extracted
# trace file is already present under DATA_DIR.
DATASET_FILE="${DATA_DIR}/ASCAD_data/ASCAD_databases/ATMega8515_raw_traces.h5"
if $SKIP_DATA; then
  step 3 "Skipping dataset download (--skip-data)"
elif [ -f "$DATASET_FILE" ]; then
  step 3 "Dataset already present, skipping download."
  ok "$DATASET_FILE exists ($(du -sh "$DATASET_FILE" | cut -f1))"
else
  step 3 "Downloading ASCAD dataset (~4.2 GB)..."
  STARTED=$(date +%s)
  mkdir -p "$DATA_DIR"
  DOWNLOAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
  ARCHIVE="${DATA_DIR}/ASCAD_data.zip"
  wget --progress=bar:force:noscroll -O "$ARCHIVE" "$DOWNLOAD_URL" 2>&1
  DL_ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Downloaded in ${DL_ELAPSED}s"
  echo " Extracting..."
  # Extract with -d instead of cd-ing into DATA_DIR: the script's working
  # directory stays where it was, so a relative --pipeline-dir still resolves
  # correctly in the later steps.
  unzip -o "$ARCHIVE" -d "$DATA_DIR"
  rm -f "$ARCHIVE"
  # Report success only if the expected file actually appeared.
  if [ -f "$DATASET_FILE" ]; then
    ok "Dataset ready at ${DATASET_FILE}"
  else
    warn "Archive extracted, but ${DATASET_FILE} was not found"
  fi
  ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Total data step: ${ELAPSED}s"
fi
# ── Step 4: Verify GPU ─────────────────────────────────────────────────────
step 4 "Verifying GPU..."
# Ask TensorFlow which GPUs it can see. The snippet exits with status 1 when
# the list is empty, which stops this script because of `set -e`.
python3 - <<'PY'
import sys
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print(' WARNING: No GPU detected!')
    sys.exit(1)

print(f' GPU detected: {len(gpus)} device(s)')
for g in gpus:
    print(f' {g}')
PY
ok "GPU verified"
# ── Step 5 (optional): Start worker agent ──────────────────────────────────
if $START_WORKER; then
  step 5 "Starting worker agent..."
  if [ -z "$SERVER_URL" ]; then
    echo " ERROR: --server-url is required to start the worker" >&2
    exit 1
  fi
  if [ -z "$AUTH_PASS" ]; then
    echo " ERROR: --auth-pass is required to start the worker" >&2
    exit 1
  fi

  # Login to W&B if key provided. The key reaches Python through the
  # environment rather than being pasted into the source text. A failed login
  # is still non-fatal, but it is now reported instead of being announced as
  # a success.
  if [ -n "$WANDB_KEY" ]; then
    if WANDB_API_KEY="$WANDB_KEY" python3 -c \
      'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' 2>/dev/null; then
      ok "W&B logged in"
    else
      warn "W&B login failed; continuing without it"
    fi
  fi

  # Install screen if not present. The package index is refreshed first
  # (best-effort) because an image may ship without apt lists; install errors
  # are left visible so a failure here is explained in the log.
  if ! command -v screen >/dev/null 2>&1; then
    apt-get update -qq >/dev/null 2>&1 || true
    apt-get install -y -qq screen >/dev/null
  fi

  # Start worker in a screen session. Every value is handed over as its own
  # argv entry and expanded inside the session via "$@", so URLs, IDs or
  # passwords containing quotes, spaces or other shell metacharacters are
  # passed through verbatim instead of being re-parsed by a shell.
  worker_cmd=(
    python3 -m orchestrator.worker.agent
    --server-url "$SERVER_URL"
    --worker-id "$WORKER_ID_ARG"
    --data-dir "$DATA_DIR"
    --pipeline-dir "$PIPELINE_DIR"
    --auth-user "$AUTH_USER"
    --auth-pass "$AUTH_PASS"
    --forward-logs /root/worker.log
  )
  # Inside the session: $1 is the directory to enter; after `shift`, "$@" is
  # the worker command line.
  screen -dmS worker bash -c \
    'cd "$1" && shift && "$@" 2>&1 | tee -a /root/worker.log' \
    worker "$PIPELINE_DIR" "${worker_cmd[@]}"

  ok "Worker started in screen session 'worker'"
  echo " Use 'screen -r worker' to attach"
fi
# ── Summary ─────────────────────────────────────────────────────────────────
# Closing report: completion banner, the locations in use, how to reach the
# worker when one was started, and a command template for starting it by hand.
{
  printf '%s\n' \
    "" \
    "============================================" \
    " Setup complete!" \
    " $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "============================================" \
    "" \
    " Pipeline: ${PIPELINE_DIR}" \
    " Data: ${DATA_DIR}" \
    " Log: ${LOG_FILE}"

  if $START_WORKER; then
    printf '%s\n' \
      " Worker: screen -r worker" \
      " Worker log: /root/worker.log"
  fi

  printf '%s\n' \
    "" \
    " Quick start (if worker not started):" \
    " cd ${PIPELINE_DIR}" \
    " python3 -m orchestrator.worker.agent \\" \
    " --server-url http://ORCH_IP:8080 \\" \
    " --worker-id ${WORKER_ID_ARG} \\" \
    " --auth-user admin --auth-pass SECRET" \
    ""
}