File size: 4,068 Bytes
283a882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb6546
283a882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb6546
 
283a882
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/bin/bash
# ============================================================================
# ASCAD Training Worker - Vast.ai Onstart Script
# ============================================================================
# Base image: tensorflow/tensorflow:2.16.2-gpu
# This script is passed as --onstart to vastai create instance.
# It installs deps, pulls code from HF, downloads data, and starts the worker.
#
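# Env vars (set via --env):
#   TQ_SERVER_URL   - Orchestrator server URL (e.g., http://IP:PORT); required
#   TQ_AUTH_USER    - Auth username (optional; defaults to "admin")
#   TQ_AUTH_PASS    - Auth password; required
#   WORKER_ID       - Unique worker identifier (optional; a default is used when unset)
#   HF_TOKEN        - HuggingFace token for code download (optional; empty when unset)
#   WANDB_API_KEY   - W&B API key for logging (optional; W&B login is skipped when empty)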
# ============================================================================

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# Mirror everything this script prints (stdout and stderr) into a setup log
# while still emitting it on the original stdout.
exec > >(tee /root/worker_setup.log) 2>&1

# Fail fast on the variables the worker launch at the end of this script
# expands without a default. Without this check a missing value would only
# abort the script (via `set -u`) after the dependency install and the
# multi-gigabyte dataset download had already completed.
: "${TQ_SERVER_URL:?TQ_SERVER_URL must be set to the orchestrator server URL}"
: "${TQ_AUTH_PASS?TQ_AUTH_PASS must be set}"

# Startup banner with a UTC timestamp.
banner_rule="============================================"
printf '%s\n' "$banner_rule"
printf '%s\n' "  ASCAD Training Worker - Onstart"
printf '  %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
printf '%s\n' "$banner_rule"

# ── 1. Install pip dependencies (~30s) ───────────────────────────────────
# Python packages installed on top of the base image.
pip_packages=(
    h5py
    numpy
    scipy
    scikit-learn
    requests
    pyyaml
    wandb
    huggingface_hub
)

printf '%s\n' "[1/4] Installing pip dependencies..."
pip3 install --quiet --no-cache-dir "${pip_packages[@]}"

printf '%s\n' "  Done."

# ── 2. Pull latest code from HuggingFace (~10s) ─────────────────────────
# Downloads a snapshot of the pipeline repository into
# /root/ascad-training-pipeline. The Python program is fed through a quoted
# heredoc so the shell performs no expansion on it.
echo "[2/4] Pulling pipeline code from HuggingFace..."
python3 - <<'PY'
import os

# Make sure HF_TOKEN exists in the environment (empty when it was not provided)
# before huggingface_hub is imported.
os.environ.setdefault('HF_TOKEN', '')

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id='lemousehunter/ascad-training-pipeline',
    repo_type='model',
    local_dir='/root/ascad-training-pipeline',
)
print('  Code downloaded successfully.')
PY

# ── 3. Download ASCAD dataset if not present (~5-10 min) ─────────────────
# DATASET_DIR is also passed to the worker agent later in this script.
# DATASET_FILE is the marker used to decide whether the download can be skipped.
DATASET_DIR="/root/ascad_data"
DATASET_FILE="${DATASET_DIR}/ATMega8515_raw_traces.h5"

if [ ! -f "$DATASET_FILE" ]; then
    echo "[3/4] Downloading ASCAD dataset (~4.2 GB)..."
    mkdir -p "$DATASET_DIR"
    dataset_zip="${DATASET_DIR}/ascad.zip"

    # Handle failures explicitly so a partial multi-gigabyte archive is not
    # left on disk when the download or extraction aborts.
    if ! wget -q --show-progress -O "$dataset_zip" \
        "https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"; then
        rm -f -- "$dataset_zip"
        echo "  ERROR: dataset download failed." >&2
        exit 1
    fi

    echo "  Extracting..."
    # Extract with -d instead of cd so the working directory of the rest of
    # the script is not changed as a side effect.
    if ! unzip -o "$dataset_zip" -d "$DATASET_DIR"; then
        rm -f -- "$dataset_zip"
        echo "  ERROR: dataset extraction failed." >&2
        exit 1
    fi
    rm -f -- "$dataset_zip"

    # Only claim success when the expected file actually exists; otherwise
    # warn (without aborting) so a changed archive layout is visible in the log.
    if [ -f "$DATASET_FILE" ]; then
        echo "  Dataset ready at ${DATASET_FILE}"
    else
        echo "  WARNING: extraction finished but ${DATASET_FILE} was not found; check the archive layout under ${DATASET_DIR}." >&2
    fi
else
    echo "[3/4] Dataset already present, skipping download."
fi

# ── 4. Verify GPU and start worker ──────────────────────────────────────
# Informational only: lists the GPUs TensorFlow can see and prints a warning
# when there are none. A missing GPU does not stop the script.
printf '%s\n' "[4/4] Checking GPU..."
python3 - <<'PY'
import tensorflow as tf

devices = tf.config.list_physical_devices('GPU')
if not devices:
    print('  WARNING: No GPU detected!')
else:
    print(f'  GPU detected: {len(devices)} device(s)')
    for dev in devices:
        print(f'    {dev}')
PY

# Completion banner with a UTC timestamp, framed by blank lines.
done_rule="============================================"
printf '\n'
printf '%s\n' "$done_rule"
printf '%s\n' "  Setup complete. Starting worker agent."
printf '  %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
printf '%s\n' "$done_rule"
printf '\n'

# Start worker in a screen session so it survives SSH disconnects
cd /root/ascad-training-pipeline

# Login to W&B if key provided. The key is read from the environment inside
# Python rather than spliced into the source text, so a key containing quote
# characters cannot break or alter the command. Login is best-effort: errors
# are deliberately silenced and do not stop the worker from starting.
if [ -n "${WANDB_API_KEY:-}" ]; then
    WANDB_API_KEY="$WANDB_API_KEY" \
        python3 -c 'import os, wandb; wandb.login(key=os.environ["WANDB_API_KEY"])' \
        2>/dev/null || true
fi

# All values are handed to the screen session through its environment and
# expanded by the inner shell inside double quotes. The inner script is
# single-quoted here, so nothing is interpolated into shell source.
#
# The WORKER_ID default is resolved here in the outer shell. Previously the
# escaped `\$(hostname)` landed inside single quotes in the inner script, so
# the worker ID became the literal text `worker-$(hostname)` instead of the
# machine's host name.
TQ_SERVER_URL="${TQ_SERVER_URL:?TQ_SERVER_URL must be set to the orchestrator server URL}" \
TQ_AUTH_USER="${TQ_AUTH_USER:-admin}" \
TQ_AUTH_PASS="${TQ_AUTH_PASS?TQ_AUTH_PASS must be set}" \
WORKER_ID="${WORKER_ID:-worker-$(hostname)}" \
DATASET_DIR="${DATASET_DIR}" \
screen -dmS worker bash -c '
    cd /root/ascad-training-pipeline && \
    python3 -m orchestrator.worker.agent \
        --server-url "$TQ_SERVER_URL" \
        --worker-id "$WORKER_ID" \
        --data-dir "$DATASET_DIR" \
        --pipeline-dir /root/ascad-training-pipeline \
        --auth-user "$TQ_AUTH_USER" \
        --auth-pass "$TQ_AUTH_PASS" \
        --forward-logs /root/worker.log \
    2>&1 | tee -a /root/worker.log
'

echo "Worker started in screen session 'worker'."
echo "Use 'screen -r worker' to attach."