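#!/bin/bash
#
# ASCAD training worker - machine setup.
#
# Prepares a machine for the ASCAD training pipeline:
#   1. installs the Python dependencies with pip (binary wheels only),
#   2. pulls the pipeline code from HuggingFace,
#   3. downloads and extracts the ASCAD dataset (unless it is already
#      present or --skip-data is given),
#   4. verifies that TensorFlow can see a GPU,
#   5. optionally (--start-worker) launches the worker agent in a detached
#      screen session named "worker".
#
# Options:
#   --start-worker        Start the worker agent after setup.
#   --server-url URL      Orchestrator URL            (env: TQ_SERVER_URL)
#   --worker-id ID        Worker identifier           (env: WORKER_ID,
#                         default: worker-<hostname>)
#   --auth-user USER      Auth user                   (env: TQ_AUTH_USER,
#                         default: admin)
#   --auth-pass PASS      Auth password               (env: TQ_AUTH_PASS)
#   --wandb-key KEY       Weights & Biases API key    (env: WANDB_API_KEY)
#   --data-dir DIR        Dataset directory   (default: /root/ascad_data)
#   --pipeline-dir DIR    Pipeline directory
#                         (default: /root/ascad-training-pipeline)
#   --skip-data           Skip the dataset download.
#   -h, --help            Show help and exit.
#
# All output is also written to /root/setup.log.
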
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration defaults.
# Environment variables provide the initial values; command-line flags
# parsed below override them.
# ---------------------------------------------------------------------------
START_WORKER=false
SERVER_URL="${TQ_SERVER_URL:-}"
AUTH_USER="${TQ_AUTH_USER:-admin}"
AUTH_PASS="${TQ_AUTH_PASS:-}"
WORKER_ID_ARG="${WORKER_ID:-worker-$(hostname)}"
WANDB_KEY="${WANDB_API_KEY:-}"
DATA_DIR="/root/ascad_data"
PIPELINE_DIR="/root/ascad-training-pipeline"
SKIP_DATA=false

# Print the command-line help to stdout.
usage() {
  cat <<EOF
Usage: ${0##*/} [options]

Options:
  --start-worker        Start the worker agent once setup has finished.
  --server-url URL      Orchestrator URL (env: TQ_SERVER_URL).
  --worker-id ID        Worker identifier (env: WORKER_ID;
                        default: worker-<hostname>).
  --auth-user USER      Auth user (env: TQ_AUTH_USER; default: admin).
  --auth-pass PASS      Auth password (env: TQ_AUTH_PASS).
  --wandb-key KEY       Weights & Biases API key (env: WANDB_API_KEY).
  --data-dir DIR        Dataset directory (default: /root/ascad_data).
  --pipeline-dir DIR    Pipeline directory
                        (default: /root/ascad-training-pipeline).
  --skip-data           Skip the dataset download.
  -h, --help            Show this help and exit.
EOF
}

# Abort with a readable message when a value-taking option is the last word
# on the command line. Without this, "$2" trips `set -u` with a cryptic
# "unbound variable" error.
#   $1 - option name, $2 - number of arguments still to be parsed
require_value() {
  if (( $2 < 2 )); then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

# Parse the command line into the configuration globals declared above.
# Exits 0 after printing help, 1 on an unknown or incomplete option.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --start-worker)
        START_WORKER=true
        shift
        ;;
      --skip-data)
        SKIP_DATA=true
        shift
        ;;
      --server-url)
        require_value "$1" "$#"
        SERVER_URL="$2"
        shift 2
        ;;
      --worker-id)
        require_value "$1" "$#"
        WORKER_ID_ARG="$2"
        shift 2
        ;;
      --auth-user)
        require_value "$1" "$#"
        AUTH_USER="$2"
        shift 2
        ;;
      --auth-pass)
        require_value "$1" "$#"
        AUTH_PASS="$2"
        shift 2
        ;;
      --wandb-key)
        require_value "$1" "$#"
        WANDB_KEY="$2"
        shift 2
        ;;
      --data-dir)
        require_value "$1" "$#"
        DATA_DIR="$2"
        shift 2
        ;;
      --pipeline-dir)
        require_value "$1" "$#"
        PIPELINE_DIR="$2"
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Fail fast: the worker cannot start without a server URL and a password, so
# report that before spending time on installs and a multi-GB download.
validate_config() {
  if [[ "$START_WORKER" == true ]]; then
    if [[ -z "$SERVER_URL" ]]; then
      echo "ERROR: --server-url is required to start the worker" >&2
      exit 1
    fi
    if [[ -z "$AUTH_PASS" ]]; then
      echo "ERROR: --auth-pass is required to start the worker" >&2
      exit 1
    fi
  fi
}

parse_args "$@"
validate_config

# Mirror everything the script prints from here on (stdout and stderr) into
# LOG_FILE while still showing it on the terminal. tee is run without -a, so
# the log is overwritten on every run.
LOG_FILE="/root/setup.log"
exec > >(tee "$LOG_FILE") 2>&1

# ANSI colour codes, kept as backslash escapes; the helpers below expand them
# with printf's %b.
BOLD="\033[1m"
GREEN="\033[32m"
YELLOW="\033[33m"
RESET="\033[0m"

# Output helpers. The message text is printed with %s, so backslashes inside
# it (e.g. in a path) are shown verbatim rather than interpreted as escapes.
# NOTE(review): the "β" marker in warn/ok looks like a mis-encoded symbol;
# it is reproduced unchanged - confirm the intended glyph.

# step N MESSAGE - print a bold "[N/TOTAL_STEPS] MESSAGE" heading preceded by
# a blank line. Reads the global TOTAL_STEPS at call time.
step() {
  printf '\n%b[%s/%s]%b %b%s%b\n' \
    "${BOLD}${GREEN}" "$1" "$TOTAL_STEPS" "${RESET}" \
    "${BOLD}" "$2" "${RESET}"
}

# warn MESSAGE - print MESSAGE in yellow.
warn() {
  printf ' %bβ %s%b\n' "${YELLOW}" "$1" "${RESET}"
}

# ok MESSAGE - print MESSAGE as a plain status line.
ok() {
  printf ' β %s\n' "$1"
}

# Number of steps shown in the "[n/N]" headings: four setup steps, plus one
# more when the worker is started at the end.
# START_WORKER is compared as a string rather than executed as a command.
TOTAL_STEPS=4
if [[ "${START_WORKER:-false}" == true ]]; then
  TOTAL_STEPS=5
fi

# Start-up banner with the current UTC time.
echo "============================================"
echo " ASCAD Training Worker - Setup"
echo " $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
echo "============================================"

# ---- Step 1: Python dependencies -------------------------------------------
step 1 "Installing pip dependencies (binary wheels)..."
STARTED=$(date +%s)

PIP_PACKAGES=(
  scipy
  scikit-learn
  wandb
  huggingface_hub
  websocket-client
)

# Installed through `python3 -m pip` so the packages land in the same
# interpreter that the later `python3` steps of this script import them from.
# --only-binary :all: refuses source distributions, so nothing is compiled.
# Only the last three lines of pip's output are shown; because pipefail is
# enabled at the top of the script, a pip failure still aborts the run even
# though `tail` is the final stage of the pipeline.
python3 -m pip install --quiet --no-cache-dir --only-binary :all: \
  "${PIP_PACKAGES[@]}" \
  2>&1 | tail -3

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Done in ${ELAPSED}s"

# ---- Step 2: pipeline code -------------------------------------------------
step 2 "Pulling pipeline code from HuggingFace..."
STARTED=$(date +%s)

# The target directory is handed to Python as an argument (sys.argv[1])
# instead of being pasted into the source text, so a path containing quotes
# cannot break or alter the Python code.
#
# grep only strips blank lines from the output. It exits 1 when it prints
# nothing at all, which under pipefail + set -e would abort the script after
# a perfectly successful but silent download; that one status is tolerated.
# A failure of python3 itself still fails the pipeline.
python3 -c '
import os
import sys

# NOTE(review): sets HF_TOKEN to an empty string when it is unset; kept from
# the original - confirm it is still needed.
os.environ["HF_TOKEN"] = os.environ.get("HF_TOKEN", "")
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="lemousehunter/ascad-training-pipeline",
    repo_type="model",
    local_dir=sys.argv[1],
)
' "${PIPELINE_DIR}" 2>&1 | { grep -v "^$" || [[ $? -eq 1 ]]; }

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Code at ${PIPELINE_DIR} (${ELAPSED}s)"

# Remove bytecode caches from the pulled tree. -prune stops find from trying
# to descend into a directory it is about to delete. This is best-effort: a
# failure is reported as a warning and does not stop the setup.
if find "${PIPELINE_DIR}" -type d -name "__pycache__" -prune \
  -exec rm -rf -- {} + 2>/dev/null; then
  ok "Cleared __pycache__"
else
  warn "Could not clear every __pycache__ directory under ${PIPELINE_DIR}"
fi

# ---- Step 3: ASCAD dataset -------------------------------------------------
# The raw-traces file is used as the marker for "dataset already installed".
DATASET_FILE="${DATA_DIR}/ASCAD_data/ASCAD_databases/ATMega8515_raw_traces.h5"

if [[ "$SKIP_DATA" == true ]]; then
  step 3 "Skipping dataset download (--skip-data)"
elif [[ -f "$DATASET_FILE" ]]; then
  step 3 "Dataset already present, skipping download."
  ok "$DATASET_FILE exists ($(du -sh "$DATASET_FILE" | cut -f1))"
else
  step 3 "Downloading ASCAD dataset (~4.2 GB)..."
  STARTED=$(date +%s)
  mkdir -p "$DATA_DIR"

  DOWNLOAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
  DATASET_ZIP="${DATA_DIR}/ASCAD_data.zip"
  wget --progress=bar:force:noscroll -O "$DATASET_ZIP" "$DOWNLOAD_URL" 2>&1

  DL_ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Downloaded in ${DL_ELAPSED}s"

  echo " Extracting..."
  # Extract with -d instead of cd-ing into DATA_DIR, so the working directory
  # of the rest of the script is left untouched.
  unzip -o "$DATASET_ZIP" -d "$DATA_DIR"
  rm -f -- "$DATASET_ZIP"

  # Only claim success if the archive really contained the expected file.
  if [[ ! -f "$DATASET_FILE" ]]; then
    echo " ERROR: archive extracted but ${DATASET_FILE} is missing" >&2
    exit 1
  fi
  ok "Dataset ready at ${DATASET_FILE}"

  ELAPSED=$(( $(date +%s) - STARTED ))
  ok "Total data step: ${ELAPSED}s"
fi

# ---- Step 4: GPU check -----------------------------------------------------
step 4 "Verifying GPU..."

# Ask TensorFlow for its visible GPUs. The Python process exits 1 when there
# are none, which aborts the setup because of `set -e`.
# The heredoc delimiter is quoted so the shell never expands anything inside
# the Python source.
# NOTE(review): the block structure was reconstructed - the exit is placed in
# the no-GPU branch, otherwise "GPU verified" below could never be reached.
python3 - <<'PY'
import sys

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f' GPU detected: {len(gpus)} device(s)')
    for g in gpus:
        print(f' {g}')
else:
    print(' WARNING: No GPU detected!')
    sys.exit(1)
PY
ok "GPU verified"

# ---- Step 5 (optional): start the worker agent -----------------------------
if [[ "$START_WORKER" == true ]]; then
  step 5 "Starting worker agent..."

  if [[ -z "$SERVER_URL" ]]; then
    echo " ERROR: --server-url is required to start the worker" >&2
    exit 1
  fi
  if [[ -z "$AUTH_PASS" ]]; then
    echo " ERROR: --auth-pass is required to start the worker" >&2
    exit 1
  fi

  # Optional Weights & Biases login. The key is passed through the
  # environment and read with os.environ, so it is never spliced into the
  # Python source (where a quote in the key would break the code). A failed
  # login is reported as a warning and does not stop the worker from starting.
  if [[ -n "$WANDB_KEY" ]]; then
    if WANDB_API_KEY="$WANDB_KEY" python3 -c \
      'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' \
      2>/dev/null; then
      ok "W&B logged in"
    else
      warn "W&B login failed; continuing without it"
    fi
  fi

  # The worker runs inside a detached screen session; install screen if it
  # is missing and say so explicitly when that is not possible.
  if ! command -v screen >/dev/null 2>&1; then
    if ! apt-get install -y -qq screen >/dev/null 2>&1; then
      echo " ERROR: 'screen' is not installed and apt-get could not install it" >&2
      exit 1
    fi
  fi

  # Every value is passed to the inner shell as a positional parameter rather
  # than interpolated into the command string, so spaces or quotes in a path,
  # URL or password cannot change the command that is run.
  #   $1       - directory to run the agent from
  #   $2 ...   - arguments forwarded to the agent unchanged
  screen -dmS worker bash -c '
    cd "$1" || exit 1
    shift
    python3 -m orchestrator.worker.agent "$@" 2>&1 | tee -a /root/worker.log
  ' worker-launcher "${PIPELINE_DIR}" \
    --server-url "${SERVER_URL}" \
    --worker-id "${WORKER_ID_ARG}" \
    --data-dir "${DATA_DIR}" \
    --pipeline-dir "${PIPELINE_DIR}" \
    --auth-user "${AUTH_USER}" \
    --auth-pass "${AUTH_PASS}" \
    --forward-logs /root/worker.log

  ok "Worker started in screen session 'worker'"
  echo " Use 'screen -r worker' to attach"
fi

# Print the closing summary: completion time, the locations used, how to
# reach the worker (when it was started) and a manual quick-start command.
# Reads the globals PIPELINE_DIR, DATA_DIR, LOG_FILE, WORKER_ID_ARG and
# START_WORKER.
print_summary() {
  echo ""
  echo "============================================"
  echo " Setup complete!"
  echo " $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
  echo "============================================"
  echo ""
  echo " Pipeline: ${PIPELINE_DIR}"
  echo " Data: ${DATA_DIR}"
  echo " Log: ${LOG_FILE}"
  # START_WORKER is compared as a string rather than executed as a command.
  if [[ "${START_WORKER:-false}" == true ]]; then
    echo " Worker: screen -r worker"
    echo " Worker log: /root/worker.log"
  fi
  echo ""
  echo " Quick start (if worker not started):"
  echo " cd ${PIPELINE_DIR}"
  echo " python3 -m orchestrator.worker.agent \\"
  echo " --server-url http://ORCH_IP:8080 \\"
  echo " --worker-id ${WORKER_ID_ARG} \\"
  echo " --auth-user admin --auth-pass SECRET"
  echo ""
}

print_summary