#!/bin/bash
#
# Onstart script for an ASCAD training worker.
#
# Steps performed, in order:
#   1. Install Python dependencies with pip3.
#   2. Pull the pipeline code from HuggingFace into
#      /root/ascad-training-pipeline.
#   3. Download and extract the ASCAD dataset into /root/ascad_data
#      (skipped when the trace file is already present).
#   4. Report which GPUs TensorFlow can see.
#   5. Start the worker agent in a detached screen session named "worker".
#
# Environment variables read by this script:
#   HF_TOKEN, WANDB_API_KEY, TQ_SERVER_URL, WORKER_ID,
#   TQ_AUTH_USER, TQ_AUTH_PASS

# Abort on command failure, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Mirror everything this script prints (stdout and stderr) into a log file
# while still showing it on the original stdout.
setup_log=/root/worker_setup.log
exec > >(tee "$setup_log") 2>&1

# Start banner with a UTC timestamp.
printf '%s\n' \
  "============================================" \
  " ASCAD Training Worker - Onstart" \
  " $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
  "============================================"

# ── Step 1/4: Python dependencies ──
# The package list is kept in an array so entries can be added or removed
# one per line without touching the pip invocation.
echo "[1/4] Installing pip dependencies..."
pip_packages=(
  h5py
  numpy
  scipy
  scikit-learn
  requests
  pyyaml
  wandb
  huggingface_hub
)
pip3 install --quiet --no-cache-dir "${pip_packages[@]}"
echo " Done."

# ── Step 2/4: pipeline code ──
# Downloads a snapshot of the pipeline repository into
# /root/ascad-training-pipeline. The Python snippet is single-quoted so the
# shell performs no expansion inside it.
echo "[2/4] Pulling pipeline code from HuggingFace..."
python3 -c '
import os

# Make sure HF_TOKEN exists in the environment (empty when not provided)
# before huggingface_hub is imported.
os.environ.setdefault("HF_TOKEN", "")

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="lemousehunter/ascad-training-pipeline",
    repo_type="model",
    local_dir="/root/ascad-training-pipeline",
)
print(" Code downloaded successfully.")
'

# ── Step 3/4: ASCAD dataset ──
# DATASET_DIR is also passed to the worker agent later in this script.
DATASET_DIR="/root/ascad_data"
DATASET_FILE="${DATASET_DIR}/ATMega8515_raw_traces.h5"

# Download only when the expected trace file is missing, so re-running the
# script does not fetch the archive again.
if [[ ! -f "$DATASET_FILE" ]]; then
  echo "[3/4] Downloading ASCAD dataset (~4.2 GB)..."
  mkdir -p "$DATASET_DIR"
  dataset_zip="${DATASET_DIR}/ascad.zip"

  # On a failed download, remove the partial archive and exit with wget's
  # own status instead of leaving a truncated zip behind.
  wget -q --show-progress -O "$dataset_zip" \
    "https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e" \
    || {
      wget_status=$?
      rm -f -- "$dataset_zip"
      echo " ERROR: dataset download failed (wget exit ${wget_status})" >&2
      exit "$wget_status"
    }

  echo " Extracting..."
  # Extract with -d instead of cd-ing into the directory, so the script's
  # working directory is the same whether or not a download happened.
  unzip -o "$dataset_zip" -d "$DATASET_DIR"
  rm -f -- "$dataset_zip"

  # Only claim success when the expected file is really there. This is a
  # warning rather than a hard failure because the worker is given the
  # directory, not the file path.
  if [[ -f "$DATASET_FILE" ]]; then
    echo " Dataset ready at ${DATASET_FILE}"
  else
    echo " WARNING: extraction finished but ${DATASET_FILE} was not found" >&2
  fi
else
  echo "[3/4] Dataset already present, skipping download."
fi

# ── Step 4/4: GPU visibility ──
# Asks TensorFlow which physical GPU devices it can see and prints either
# the device list or a warning. The Python snippet is single-quoted so the
# shell performs no expansion inside it.
echo "[4/4] Checking GPU..."
python3 -c '
import tensorflow as tf

devices = tf.config.list_physical_devices("GPU")
if not devices:
    print(" WARNING: No GPU detected!")
else:
    print(f" GPU detected: {len(devices)} device(s)")
    for device in devices:
        print(f" {device}")
'

# Completion banner with a UTC timestamp, framed by blank lines.
printf '%s\n' \
  "" \
  "============================================" \
  " Setup complete. Starting worker agent." \
  " $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
  "============================================" \
  ""

# ── Work from the pipeline checkout ──
cd /root/ascad-training-pipeline

# ── Optional Weights & Biases login ──
# Runs only when WANDB_API_KEY is set and non-empty. The key is handed to
# Python through the environment rather than spliced into the source text:
# a key containing a quote can no longer break or alter the snippet, and
# the key does not appear in the process argument list.
# The login is best-effort: wandb's stderr output is discarded and a
# failure only produces a warning, so the worker still starts.
if [[ -n "${WANDB_API_KEY:-}" ]]; then
  WANDB_API_KEY="$WANDB_API_KEY" python3 -c \
    'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' \
    2>/dev/null \
    || echo "WARNING: wandb login failed; continuing without it." >&2
fi

# ── Launch the worker agent in a detached screen session ──
# Environment:
#   TQ_SERVER_URL  required, must be non-empty
#   TQ_AUTH_PASS   required (may be empty, but must be set)
#   TQ_AUTH_USER   optional, defaults to "admin"
#   WORKER_ID      optional, defaults to "worker-<hostname>"
# Fail with a clear message before anything is launched.
: "${TQ_SERVER_URL:?TQ_SERVER_URL must be set to the server URL}"
: "${TQ_AUTH_PASS?TQ_AUTH_PASS must be set}"

# Resolve the default worker id here, in the outer shell. Embedding
# \$(hostname) inside the single-quoted inner command would pass the
# literal text "worker-$(hostname)" to the agent.
worker_id="${WORKER_ID:-worker-$(hostname)}"

# Build the agent command as an array, then shell-quote every element with
# %q so values containing spaces or quotes survive the extra parsing pass
# done by `bash -c`.
# NOTE(review): the password is still passed as a command-line argument and
# is therefore visible in the process list; avoiding that would need the
# agent to accept it another way.
agent_argv=(
  python3 -m orchestrator.worker.agent
  --server-url "$TQ_SERVER_URL"
  --worker-id "$worker_id"
  --data-dir "$DATASET_DIR"
  --pipeline-dir /root/ascad-training-pipeline
  --auth-user "${TQ_AUTH_USER:-admin}"
  --auth-pass "$TQ_AUTH_PASS"
  --forward-logs /root/worker.log
)
printf -v agent_cmd '%q ' "${agent_argv[@]}"

# Inside the session: run the agent from the pipeline checkout and append
# its combined stdout/stderr to /root/worker.log.
screen -dmS worker bash -c \
  "cd /root/ascad-training-pipeline && ${agent_cmd} 2>&1 | tee -a /root/worker.log"

echo "Worker started in screen session 'worker'."
echo "Use 'screen -r worker' to attach."