File size: 8,423 Bytes
283a882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb6546
283a882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbb6546
 
283a882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/bin/bash
# ============================================================================
# ASCAD Training Worker - Instance Setup Script
# ============================================================================
# Run this ONCE after a Vast.ai instance boots with the pre-baked Docker
# image (tensorflow/tensorflow:2.16.2-gpu).  It handles everything that
# cannot be baked into the image:
#
#   1. Install pip dependencies (binary wheels only — fast)
#   2. Pull latest pipeline code from HuggingFace
#   3. Download & extract the ASCAD dataset (if not already present)
#   4. Verify GPU availability
#   5. Optionally start the worker agent
#
# Usage:
#   # Minimal (just set up the environment):
#   bash setup.sh
#
#   # Full (set up + start worker agent):
#   bash setup.sh --start-worker \
#       --server-url http://ORCH_IP:8080 \
#       --worker-id worker-a100-1 \
#       --auth-user admin \
#       --auth-pass SECRET \
#       --wandb-key YOUR_WANDB_KEY
#
# Environment variables (alternative to flags):
#   TQ_SERVER_URL   - Orchestrator URL
#   TQ_AUTH_USER    - Auth username
#   TQ_AUTH_PASS    - Auth password
#   WORKER_ID       - Unique worker ID
#   WANDB_API_KEY   - W&B API key
#   HF_TOKEN        - HuggingFace token (optional, for private repos)
# ============================================================================

set -euo pipefail

# ── Parse arguments ─────────────────────────────────────────────────────────
# Defaults are taken from the environment where a variable is documented in
# the header; command-line flags override them.
# NOTE: secrets passed as flags (--auth-pass, --wandb-key) are visible in the
# process list; the TQ_AUTH_PASS / WANDB_API_KEY environment variables avoid that.
START_WORKER=false
SERVER_URL="${TQ_SERVER_URL:-}"
AUTH_USER="${TQ_AUTH_USER:-admin}"
AUTH_PASS="${TQ_AUTH_PASS:-}"
# Fall back to bash's own HOSTNAME if the `hostname` binary is unavailable, so
# a missing tool does not abort the script under `set -e`.
WORKER_ID_ARG="${WORKER_ID:-worker-$(hostname 2>/dev/null || echo "${HOSTNAME:-unknown}")}"
WANDB_KEY="${WANDB_API_KEY:-}"
DATA_DIR="/root/ascad_data"
PIPELINE_DIR="/root/ascad-training-pipeline"
SKIP_DATA=false

# Print the comment header at the top of this script (everything after the
# shebang up to the first non-comment line) as the help text.
usage() {
    awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# need_value FLAG [VALUE...] — abort with a clear message when FLAG is the
# last word on the command line, instead of tripping `set -u` on "$2".
need_value() {
    if [[ $# -lt 2 ]]; then
        echo "Missing value for argument: $1" >&2
        exit 1
    fi
}

# parse_args ARGS... — apply command-line flags to the global settings above.
# Exits 0 after printing help, exits 1 on an unknown flag or a missing value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --start-worker)     START_WORKER=true; shift ;;
            --server-url)       need_value "$@"; SERVER_URL="$2"; shift 2 ;;
            --worker-id)        need_value "$@"; WORKER_ID_ARG="$2"; shift 2 ;;
            --auth-user)        need_value "$@"; AUTH_USER="$2"; shift 2 ;;
            --auth-pass)        need_value "$@"; AUTH_PASS="$2"; shift 2 ;;
            --wandb-key)        need_value "$@"; WANDB_KEY="$2"; shift 2 ;;
            --data-dir)         need_value "$@"; DATA_DIR="$2"; shift 2 ;;
            --pipeline-dir)     need_value "$@"; PIPELINE_DIR="$2"; shift 2 ;;
            --skip-data)        SKIP_DATA=true; shift ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown argument: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# ── Logging ─────────────────────────────────────────────────────────────────
# From here on, both stdout and stderr are piped through `tee`, which writes
# them to the log file (truncating it first) and echoes them to the original
# stdout.
LOG_FILE=/root/setup.log
exec &> >(tee "${LOG_FILE}")

# ANSI colour sequences used by the helpers below, stored as real ESC bytes so
# they can be printed with a plain '%s'.
BOLD=$'\033[1m'
GREEN=$'\033[32m'
YELLOW=$'\033[33m'
RESET=$'\033[0m'

# step N MESSAGE — print a blank line followed by a "[N/TOTAL_STEPS] MESSAGE"
# heading. TOTAL_STEPS is read at call time, so it only has to be set before
# the first call.
# warn MESSAGE   — print an indented, yellow warning line.
# ok MESSAGE     — print an indented success line.
# Messages go through '%s', so backslashes in them are printed literally.
step() {
    printf '\n%s[%s/%s]%s %s%s%s\n' \
        "${BOLD}${GREEN}" "$1" "${TOTAL_STEPS:-?}" "$RESET" "$BOLD" "$2" "$RESET"
}
warn() { printf '  %s⚠ %s%s\n' "$YELLOW" "$1" "$RESET"; }
ok()   { printf '  ✓ %s\n' "$1"; }

# Four steps always run; starting the worker adds a fifth.
if $START_WORKER; then
    TOTAL_STEPS=5
else
    TOTAL_STEPS=4
fi

# Opening banner with the current UTC time.
banner_rule="============================================"
printf '%s\n' \
    "$banner_rule" \
    "  ASCAD Training Worker - Setup" \
    "  $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "$banner_rule"

# ── Step 1: Install pip dependencies ────────────────────────────────────────
# `--only-binary :all:` restricts pip to pre-built wheels. pip's combined
# stdout/stderr is trimmed to its last three lines.
step 1 "Installing pip dependencies (binary wheels)..."
STARTED=$(date +%s)

pip_packages=(
    scipy
    scikit-learn
    wandb
    huggingface_hub
    websocket-client
)
pip3 install --quiet --no-cache-dir --only-binary :all: "${pip_packages[@]}" 2>&1 \
    | tail -3

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Done in ${ELAPSED}s"

# ── Step 2: Pull latest code from HuggingFace ──────────────────────────────
step 2 "Pulling pipeline code from HuggingFace..."
STARTED=$(date +%s)

# The target directory is handed to Python through the environment instead of
# being spliced into the program text, so a path containing quotes or
# backslashes cannot break (or inject code into) the inline program.
#
# `grep -v` exits 1 when it prints nothing (e.g. progress bars disabled).
# Without `|| true` that would abort a successful download under
# `set -e -o pipefail`. A python3 failure still fails the pipeline, because
# pipefail reports the non-zero status of any stage.
PIPELINE_DIR="$PIPELINE_DIR" python3 -c '
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="lemousehunter/ascad-training-pipeline",
    repo_type="model",
    local_dir=os.environ["PIPELINE_DIR"],
    # An unset or empty HF_TOKEN means: no explicit token.
    token=os.environ.get("HF_TOKEN") or None,
)
' 2>&1 | { grep -v '^$' || true; }

ELAPSED=$(( $(date +%s) - STARTED ))
ok "Code at ${PIPELINE_DIR} (${ELAPSED}s)"

# Clear any stale bytecode. -prune stops find from descending into directories
# that are about to be removed; the cleanup stays best-effort on purpose.
find "${PIPELINE_DIR}" -type d -name "__pycache__" -prune -exec rm -rf -- {} + 2>/dev/null || true
ok "Cleared __pycache__"

# ── Step 3: Download ASCAD dataset ──────────────────────────────────────────
# The presence of this one file decides whether the download is skipped.
DATASET_FILE="${DATA_DIR}/ASCAD_data/ASCAD_databases/ATMega8515_raw_traces.h5"

if $SKIP_DATA; then
    step 3 "Skipping dataset download (--skip-data)"
elif [[ -f "$DATASET_FILE" ]]; then
    step 3 "Dataset already present, skipping download."
    ok "$DATASET_FILE exists ($(du -sh "$DATASET_FILE" | cut -f1))"
else
    step 3 "Downloading ASCAD dataset (~4.2 GB)..."
    STARTED=$(date +%s)
    mkdir -p "$DATA_DIR"

    DOWNLOAD_URL="https://www.data.gouv.fr/api/1/datasets/r/e7ab6f9e-79bf-431f-a5ed-faf0ebe9b08e"
    ARCHIVE="${DATA_DIR}/ASCAD_data.zip"
    # Remove a partial archive on failure so it cannot be mistaken for a
    # complete download later.
    if ! wget --progress=bar:force:noscroll -O "$ARCHIVE" "$DOWNLOAD_URL" 2>&1; then
        rm -f -- "$ARCHIVE"
        echo "  ERROR: dataset download failed" >&2
        exit 1
    fi

    DL_ELAPSED=$(( $(date +%s) - STARTED ))
    ok "Downloaded in ${DL_ELAPSED}s"

    echo "  Extracting..."
    # Extract with -d rather than cd-ing into DATA_DIR: the working directory
    # stays untouched, so relative --data-dir / --pipeline-dir values keep
    # resolving the same way for the rest of the script.
    unzip -o "$ARCHIVE" -d "$DATA_DIR"
    rm -f -- "$ARCHIVE"

    # Only claim success if the expected file actually came out of the archive.
    if [[ -f "$DATASET_FILE" ]]; then
        ok "Dataset ready at ${DATASET_FILE}"
    else
        warn "Extraction finished but ${DATASET_FILE} was not found"
    fi

    ELAPSED=$(( $(date +%s) - STARTED ))
    ok "Total data step: ${ELAPSED}s"
fi

# ── Step 4: Verify GPU ─────────────────────────────────────────────────────
# Ask TensorFlow for its visible GPU devices. The Python program exits 1 when
# the list is empty, which stops this script because `set -e` is active.
step 4 "Verifying GPU..."
python3 - <<'PY'
import sys
import tensorflow as tf

devices = tf.config.list_physical_devices('GPU')
if not devices:
    print('  WARNING: No GPU detected!')
    sys.exit(1)

print(f'  GPU detected: {len(devices)} device(s)')
for dev in devices:
    print(f'    {dev}')
PY
ok "GPU verified"

# ── Step 5 (optional): Start worker agent ──────────────────────────────────
if $START_WORKER; then
    step 5 "Starting worker agent..."

    if [[ -z "$SERVER_URL" ]]; then
        echo "  ERROR: --server-url is required to start the worker" >&2
        exit 1
    fi
    if [[ -z "$AUTH_PASS" ]]; then
        echo "  ERROR: --auth-pass is required to start the worker" >&2
        exit 1
    fi

    # Login to W&B if key provided. The key reaches Python only through the
    # environment (never spliced into the program text), and the outcome is
    # reported honestly: a failed login is a warning, not a fatal error.
    if [[ -n "$WANDB_KEY" ]]; then
        if WANDB_API_KEY="$WANDB_KEY" python3 -c \
            'import os, wandb; raise SystemExit(0 if wandb.login(key=os.environ["WANDB_API_KEY"]) else 1)' \
            2>/dev/null; then
            ok "W&B logged in"
        else
            warn "W&B login failed; continuing without it"
        fi
    fi

    # Install screen if not present. If the plain install fails (for example
    # because the image ships without apt package lists), refresh the lists
    # and retry once before giving up with a readable error.
    if ! command -v screen >/dev/null 2>&1; then
        if ! apt-get install -y -qq screen >/dev/null 2>&1 \
            && ! { apt-get update -qq && apt-get install -y -qq screen; } >/dev/null 2>&1; then
            echo "  ERROR: could not install 'screen' via apt-get" >&2
            exit 1
        fi
    fi

    # Build the worker command as an array and shell-quote every word with
    # %q, so values containing spaces or quotes survive the trip through
    # `bash -c` unchanged.
    agent_cmd=(
        python3 -m orchestrator.worker.agent
        --server-url "$SERVER_URL"
        --worker-id "$WORKER_ID_ARG"
        --data-dir "$DATA_DIR"
        --pipeline-dir "$PIPELINE_DIR"
        --auth-user "$AUTH_USER"
        --auth-pass "$AUTH_PASS"
        --forward-logs /root/worker.log
    )
    printf -v agent_cmd_quoted '%q ' "${agent_cmd[@]}"
    printf -v pipeline_dir_quoted '%q' "$PIPELINE_DIR"

    # Start worker in a detached screen session; its output is appended to
    # /root/worker.log as well as shown inside the session.
    screen -dmS worker bash -c \
        "cd ${pipeline_dir_quoted} && ${agent_cmd_quoted} 2>&1 | tee -a /root/worker.log"

    ok "Worker started in screen session 'worker'"
    echo "  Use 'screen -r worker' to attach"
fi

# ── Summary ─────────────────────────────────────────────────────────────────
# Closing report: finish time, the locations used, and a copy-paste command
# for starting the worker by hand.
summary_rule="============================================"
printf '%s\n' \
    "" \
    "$summary_rule" \
    "  Setup complete!" \
    "  $(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "$summary_rule" \
    "" \
    "  Pipeline:  ${PIPELINE_DIR}" \
    "  Data:      ${DATA_DIR}" \
    "  Log:       ${LOG_FILE}"
if $START_WORKER; then
    printf '%s\n' \
        "  Worker:    screen -r worker" \
        "  Worker log: /root/worker.log"
fi
printf '%s\n' \
    "" \
    "  Quick start (if worker not started):" \
    "    cd ${PIPELINE_DIR}" \
    '    python3 -m orchestrator.worker.agent \' \
    '        --server-url http://ORCH_IP:8080 \' \
    "        --worker-id ${WORKER_ID_ARG} \\" \
    '        --auth-user admin --auth-pass SECRET' \
    ""