File size: 13,938 Bytes
e605733
 
 
 
 
 
04f3861
e605733
 
 
 
 
 
 
 
 
0e3516b
e605733
0e3516b
e605733
 
 
0e3516b
e605733
 
 
 
 
0e3516b
e605733
 
 
 
 
 
 
0e3516b
331afc9
e605733
0e3516b
e605733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f3e3fa
e605733
9f3e3fa
e605733
 
 
 
9f3e3fa
 
 
 
e605733
 
 
9f3e3fa
 
e605733
9f3e3fa
 
e605733
5e564f3
9f3e3fa
e605733
04f3861
9f3e3fa
04f3861
5e564f3
04f3861
9f3e3fa
 
 
 
 
 
 
 
 
 
e605733
 
9f3e3fa
 
 
e605733
 
 
 
 
9f3e3fa
e605733
 
 
 
 
 
 
 
 
 
04f3861
 
 
e605733
 
04f3861
e605733
04f3861
 
 
 
 
 
e605733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04f3861
 
 
e605733
 
 
 
04f3861
 
e605733
04f3861
 
e605733
 
04f3861
 
 
e605733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
#!/bin/bash
#===============================================================================
# Apollo Training Server - Complete Automated Setup
#===============================================================================
# This script sets up a fresh server for Apollo training:
# 1. Installs ClickHouse, Neo4j, Python dependencies
# 2. Downloads epochs 844-846 from Hugging Face
# 3. Ingests all data into databases
# 4. Generates training cache (fully offline)
# 5. Ready to train!
#
# Usage:
#   huggingface-cli login  # or export HF_TOKEN=your_token
#   hf download --repo-type model zirobtc/oracle --local-dir ./apollo
#   cd apollo && chmod +x install.sh && source install.sh
#===============================================================================

# NOTE: per the usage header this script is meant to be `source`d, so `set -e`
# (and any failure in `error`) affects the *caller's* shell session too.
set -e

# Colors
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
BLUE='\033[0;34m'; CYAN='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'

# Logging helpers. Informational messages go to stdout; diagnostics
# (warn/error) go to stderr so they survive `> logfile` redirection.
log() { echo -e "${BLUE}[$(date +%H:%M:%S)]${NC} $1"; }
success() { echo -e "${GREEN}[✓]${NC} $1"; }
warn() { echo -e "${YELLOW}[!]${NC} $1" >&2; }
error() { echo -e "${RED}[✗]${NC} $1" >&2; exit 1; }
header() { echo -e "\n${CYAN}${BOLD}══════════════════════════════════════════════════════════${NC}"; echo -e "${CYAN}${BOLD}  $1${NC}"; echo -e "${CYAN}${BOLD}══════════════════════════════════════════════════════════${NC}\n"; }

#===============================================================================
# Configuration
#===============================================================================
# All paths are derived from the directory the script is launched from.
APOLLO_DIR="$(pwd)"
DATA_DIR="${APOLLO_DIR}/data"
CACHE_DIR="${DATA_DIR}/cache"
PUMP_FUN_DIR="${DATA_DIR}/pump_fun"

# Epochs to download + ingest, and parallelism for cache generation.
# CACHE_WORKERS may be overridden from the environment before sourcing.
EPOCHS=(844 845 846)
CACHE_WORKERS="${CACHE_WORKERS:-8}"

# Database connection settings. Each honors a pre-set environment variable
# and falls back to the original hard-coded default, so existing callers
# see identical behavior.
export CLICKHOUSE_HOST="${CLICKHOUSE_HOST:-localhost}"
export CLICKHOUSE_PORT="${CLICKHOUSE_PORT:-9000}"
export CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT:-8123}"
export CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
export CLICKHOUSE_DATABASE="${CLICKHOUSE_DATABASE:-default}"

export NEO4J_URI="${NEO4J_URI:-bolt://localhost:7687}"
export NEO4J_USER="${NEO4J_USER:-neo4j}"
export NEO4J_PASSWORD="${NEO4J_PASSWORD:-neo4j123}"

echo ""
echo "╔════════════════════════════════════════════════════════════════╗"
echo "║              🚀 Apollo Training Server Setup 🚀                ║"
echo "╚════════════════════════════════════════════════════════════════╝"
echo ""
echo "  📁 Directory: ${APOLLO_DIR}"
echo "  📅 Epochs:    ${EPOCHS[*]}"
echo "  🔧 Workers:   ${CACHE_WORKERS}"
echo ""

#===============================================================================
# Step 1: System Dependencies
#===============================================================================
header "Step 1/7: Installing System Dependencies"

log "Updating package list..."
sudo apt update -qq

log "Installing base packages..."
sudo apt install -y -qq \
    curl wget gnupg apt-transport-https ca-certificates dirmngr \
    pkg-config libudev-dev build-essential \
    python3 python3-pip python3-venv \
    htop tmux unzip pigz pv \
    openjdk-11-jre-headless

# Rust toolchain (needed to build some dependencies from source).
if ! command -v cargo &> /dev/null; then
    log "Installing Rust..."
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
    # Quoted: $HOME may contain spaces/special characters on some systems.
    source "$HOME/.cargo/env"
fi

success "Base dependencies installed"

#===============================================================================
# Step 2: Install ClickHouse
#===============================================================================
header "Step 2/7: Installing ClickHouse"

if ! command -v clickhouse-server &> /dev/null; then
    log "Adding ClickHouse repository..."
    # NOTE: apt-key is deprecated on modern Debian/Ubuntu; kept (with || true)
    # for compatibility with the images this script targets.
    sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 2>/dev/null || true
    echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
    sudo apt update -qq

    log "Installing ClickHouse..."
    sudo DEBIAN_FRONTEND=noninteractive apt install -y clickhouse-server clickhouse-client
else
    log "ClickHouse already installed"
fi

# Configure for high performance
log "Configuring ClickHouse..."
sudo mkdir -p /etc/clickhouse-server/config.d/
cat << 'CHXML' | sudo tee /etc/clickhouse-server/config.d/apollo.xml > /dev/null
<?xml version="1.0"?>
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>0.8</max_server_memory_usage_to_ram_ratio>
    <max_threads>32</max_threads>
    <listen_host>0.0.0.0</listen_host>
</clickhouse>
CHXML

log "Starting ClickHouse..."
sudo systemctl enable clickhouse-server 2>/dev/null || true
sudo systemctl start clickhouse-server 2>/dev/null || sudo clickhouse-server --daemon

# Poll for readiness instead of a fixed `sleep 3`: startup time varies by
# host, and a too-short sleep made the health check below flaky.
CH_READY=0
for _ in {1..30}; do
    if clickhouse-client --query "SELECT 1" &>/dev/null; then
        CH_READY=1
        break
    fi
    sleep 1
done

if [ "$CH_READY" -eq 1 ]; then
    success "ClickHouse is running"
else
    warn "ClickHouse may need manual start: sudo clickhouse-server --daemon"
fi

#===============================================================================
# Step 3: Install Neo4j
#===============================================================================
header "Step 3/7: Installing Neo4j"

if ! command -v neo4j &> /dev/null; then
    log "Adding Neo4j repository..."
    # Only the gpg write into /usr/share/keyrings needs root; wget does not.
    wget -qO - https://debian.neo4j.com/neotechnology.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/neo4j.gpg 2>/dev/null || true
    echo "deb [signed-by=/usr/share/keyrings/neo4j.gpg] https://debian.neo4j.com stable latest" | sudo tee /etc/apt/sources.list.d/neo4j.list
    sudo apt update -qq

    log "Installing Neo4j..."
    sudo apt install -y neo4j
fi

# Overwrite the stock config with Apollo's memory and listen settings.
log "Configuring Neo4j..."
sudo tee /etc/neo4j/neo4j.conf > /dev/null << 'NEOCONF'
server.default_listen_address=0.0.0.0
server.bolt.listen_address=:7687
server.http.listen_address=:7474
server.memory.heap.initial_size=4g
server.memory.heap.max_size=16g
server.memory.pagecache.size=8g
dbms.security.auth_enabled=true
NEOCONF

log "Setting Neo4j password..."
# Quoted to survive spaces/special chars; || true because this fails
# harmlessly when the password was already initialized on a prior run.
sudo neo4j-admin dbms set-initial-password "${NEO4J_PASSWORD}" 2>/dev/null || true

log "Starting Neo4j..."
sudo systemctl enable neo4j 2>/dev/null || true
sudo systemctl start neo4j 2>/dev/null || neo4j start
sleep 5

success "Neo4j configured (password: ${NEO4J_PASSWORD})"

#===============================================================================
# Step 4: Python Environment
#===============================================================================
header "Step 4/7: Setting up Python Environment"

cd "$APOLLO_DIR"

# Create the virtualenv once; subsequent runs reuse it.
if [[ ! -d venv ]]; then
    log "Creating virtual environment..."
    python3 -m venv venv
fi

log "Activating environment..."
source venv/bin/activate

log "Upgrading pip..."
pip install -q --upgrade pip

log "Installing PyTorch with CUDA..."
pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

log "Installing requirements..."
pip install -q -r requirements.txt
pip install -q Pillow requests huggingface_hub

success "Python environment ready"

#===============================================================================
# Step 5+6: Download, Ingest, Delete (one epoch at a time to save disk)
#===============================================================================
header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"

cd "$APOLLO_DIR"
source venv/bin/activate

log "Processing epochs one at a time to minimize disk usage..."
log "Each epoch: ~20GB download → ingest → delete"
echo ""

for epoch in "${EPOCHS[@]}"; do
    EPOCH_DIR="${PUMP_FUN_DIR}/epoch_${epoch}"

    log "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    log "Processing epoch ${epoch}..."

    # Step 1: Download. `error` exits the script — a missing epoch makes the
    # rest of the pipeline meaningless.
    log "  [1/3] Downloading epoch ${epoch}..."
    python scripts/download_epoch_artifacts.py --epoch "$epoch" || {
        error "Failed to download epoch ${epoch}. Cannot continue."
    }

    # Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
    log "  [2/3] Ingesting epoch ${epoch} into databases..."
    python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
        error "Ingestion failed for epoch ${epoch}. Cannot continue."
    }

    # Step 3: Delete parquet files to free disk space.
    # `--` stops option parsing and `${VAR:?}` aborts if the path is somehow
    # empty, so this can never degenerate into `rm -rf /` or `rm -rf ""`.
    log "  [3/3] Cleaning up epoch ${epoch} parquet files..."
    rm -rf -- "${EPOCH_DIR:?}"

    # Show progress (best-effort: fall back to 0 if ClickHouse is unreachable)
    CURRENT_MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
    CURRENT_TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
    log "  Progress: ${CURRENT_MINTS} mints, ${CURRENT_TRADES} trades"
    log "  Disk free: $(df -h . | awk 'NR==2{print $4}')"
done

# Final verification
log ""
log "Verifying final data..."
MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
log "  📊 Mints:  ${MINTS}"
log "  📊 Trades: ${TRADES}"

success "All epochs processed and ingested"

#===============================================================================
# Step 7: Generate Training Cache
#===============================================================================
header "Step 7/7: Generating Training Cache (Offline Mode)"

cd "$APOLLO_DIR"
source venv/bin/activate
mkdir -p "$CACHE_DIR"

log "Generating balanced cache with ${CACHE_WORKERS} workers (context mode)..."
log "Target: ~15,000 balanced cache files across all classes"
log "⏳ This may take 1-3 hours depending on data size..."
echo ""

python scripts/cache_dataset.py \
    --output_dir "$CACHE_DIR" \
    --num_workers "$CACHE_WORKERS" \
    --cache_mode context \
    --context_length 4096 \
    --horizons_seconds 30 60 120 240 420 \
    --quantiles 0.1 0.5 0.9 \
    --min_trades 10 || {
    warn "Cache generation had errors - check logs"
}

# Count produced samples with find rather than parsing `ls` output (SC2012):
# `ls | wc -l` miscounts filenames containing newlines and exits non-zero
# when the glob matches nothing.
CACHE_COUNT=$(find "$CACHE_DIR" -maxdepth 1 -name 'sample_*.pt' 2>/dev/null | wc -l)
success "Cache complete: ${CACHE_COUNT} samples"

#===============================================================================
# Create Helper Files
#===============================================================================
log "Creating configuration files..."

# .env file
cat << ENVFILE > "${APOLLO_DIR}/.env"
CLICKHOUSE_HOST=${CLICKHOUSE_HOST}
CLICKHOUSE_PORT=${CLICKHOUSE_PORT}
CLICKHOUSE_HTTP_PORT=${CLICKHOUSE_HTTP_PORT}
CLICKHOUSE_USER=${CLICKHOUSE_USER}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD}
CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE}
NEO4J_URI=${NEO4J_URI}
NEO4J_USER=${NEO4J_USER}
NEO4J_PASSWORD=${NEO4J_PASSWORD}
APOLLO_CACHE_DIR=${CACHE_DIR}
ENVFILE

# Training launch script
cat << 'TRAINSH' > "${APOLLO_DIR}/train_launch.sh"
#!/bin/bash
cd "$(dirname "$0")"
source venv/bin/activate
source .env 2>/dev/null || true

echo "๐Ÿš€ Starting Apollo training..."
echo "   GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')"
echo ""

accelerate launch train.py \
    --epochs 10 \
    --batch_size 8 \
    --grad_accum_steps 2 \
    --learning_rate 1e-4 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --mixed_precision bf16 \
    --max_seq_len 4096 \
    --horizons_seconds 30 60 120 240 420 \
    --quantiles 0.1 0.5 0.9 \
    --ohlc_stats_path ./data/ohlc_stats.npz \
    --num_workers 4 \
    --pin_memory \
    --val_split 0.1 \
    --val_every 2000 \
    --save_every 2000 \
    --log_every 50 \
    "$@"
TRAINSH
chmod +x "${APOLLO_DIR}/train_launch.sh"

# Status check script
cat << 'STATUSSH' > "${APOLLO_DIR}/check_status.sh"
#!/bin/bash
echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•"
echo "        Apollo Server Status"
echo "โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•"
echo ""
echo "๐Ÿ“Š System:"
echo "   CPU: $(nproc) cores"
echo "   RAM: $(free -h | awk '/^Mem:/{print $2}') total"
echo "   Disk: $(df -h . | awk 'NR==2{print $4}') free"
echo ""
if command -v nvidia-smi &> /dev/null; then
    echo "๐ŸŽฎ GPU:"
    nvidia-smi --query-gpu=name,memory.used,memory.total --format=csv,noheader
    echo ""
fi
echo "๐Ÿ’พ ClickHouse:"
clickhouse-client --query "SELECT 'Mints: ' || toString(count()) FROM mints" 2>/dev/null || echo "   Not running"
clickhouse-client --query "SELECT 'Trades: ' || toString(count()) FROM trades" 2>/dev/null || echo ""
echo ""
echo "๐Ÿ“ฆ Cache:"
echo "   Files: $(ls -1 ./data/cache/sample_*.pt 2>/dev/null | wc -l)"
echo "   Size: $(du -sh ./data/cache 2>/dev/null | cut -f1 || echo 'N/A')"
echo ""
STATUSSH
chmod +x "${APOLLO_DIR}/check_status.sh"

#===============================================================================
# Done!
#===============================================================================
# Final summary: same text as before, emitted via a single here-doc
# (variables expand exactly as they did with the echo sequence).
cat << DONE

╔════════════════════════════════════════════════════════════════╗
║                   🎉 Setup Complete! 🎉                        ║
╚════════════════════════════════════════════════════════════════╝

  📦 Cache: ${CACHE_COUNT} samples ready
  📁 Location: ${CACHE_DIR}

  🚀 To start training:

     ./train_launch.sh

  📊 To check status:

     ./check_status.sh

DONE
success "Apollo is ready to train!"