File size: 18,215 Bytes
e605733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
#!/bin/bash
#===============================================================================
# Apollo Training Server - Complete Setup Script
#===============================================================================
# This script sets up a fresh server for Apollo training from scratch:
#
# 1. Installs system dependencies (ClickHouse, Neo4j, Python)
# 2. Installs Python requirements (including PyTorch built for CUDA 12.1)
# 3. Downloads epochs 844-850 from Hugging Face
# 4. Ingests all data into ClickHouse and Neo4j
# 5. Generates training cache files
# 6. Launches training
#
# Usage:
#   export HF_TOKEN="your_huggingface_token"
#   chmod +x scripts/setup_fresh_server.sh
#   ./scripts/setup_fresh_server.sh
#
# Or run specific steps:
#   ./scripts/setup_fresh_server.sh --step install-deps
#   ./scripts/setup_fresh_server.sh --step download-epochs
#   ./scripts/setup_fresh_server.sh --step ingest-epochs
#   ./scripts/setup_fresh_server.sh --step generate-cache
#   ./scripts/setup_fresh_server.sh --step train
#===============================================================================

set -e  # Exit on error

# ANSI color escape sequences; they are interpreted by `echo -e` in the
# logging helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'  # reset to the terminal default

# Logging helpers. Each takes the message as $1 and prints it with a colored
# tag. Because of `echo -e`, backslash escapes inside the message are
# interpreted as well.
#   log_info / log_success / log_step -> stdout
#   log_warn / log_error              -> stderr, so diagnostics stay visible
#                                        when stdout is redirected or captured
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1" >&2; }
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }

# Prints $1 as a section banner, framed by rule lines and blank lines.
log_step() {
    echo -e "\n${CYAN}========================================${NC}"
    echo -e "${CYAN}  $1${NC}"
    echo -e "${CYAN}========================================${NC}\n"
}

#===============================================================================
# Configuration
#===============================================================================
# Repository root: keep a caller-supplied APOLLO_DIR, otherwise use the parent
# of the directory that contains this script.
if [[ -z "${APOLLO_DIR:-}" ]]; then
    APOLLO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
fi

# Data locations. DATA_DIR and CACHE_DIR can be overridden from the
# environment; PUMP_FUN_DIR is always derived from DATA_DIR.
: "${DATA_DIR:=${APOLLO_DIR}/data}"
: "${CACHE_DIR:=${DATA_DIR}/cache}"
PUMP_FUN_DIR="${DATA_DIR}/pump_fun"

# Default epochs to download and ingest
EPOCHS=({844..850})

# ClickHouse connection settings (exported so child processes see them)
: "${CLICKHOUSE_HOST:=localhost}"
: "${CLICKHOUSE_PORT:=9000}"
: "${CLICKHOUSE_HTTP_PORT:=8123}"
: "${CLICKHOUSE_USER:=default}"
: "${CLICKHOUSE_PASSWORD:=}"
: "${CLICKHOUSE_DATABASE:=default}"
export CLICKHOUSE_HOST CLICKHOUSE_PORT CLICKHOUSE_HTTP_PORT
export CLICKHOUSE_USER CLICKHOUSE_PASSWORD CLICKHOUSE_DATABASE

# Neo4j connection settings (exported as well)
: "${NEO4J_URI:=bolt://localhost:7687}"
: "${NEO4J_USER:=neo4j}"
: "${NEO4J_PASSWORD:=apollo2024}"
export NEO4J_URI NEO4J_USER NEO4J_PASSWORD

# Cache generation: worker count, and an optional cap on the number of
# samples (empty means all samples)
: "${CACHE_WORKERS:=8}"
: "${MAX_CACHE_SAMPLES:=}"

# Training hyper-parameters handed to the training launcher
: "${BATCH_SIZE:=16}"
: "${NUM_EPOCHS:=7}"

#===============================================================================
# Parse Arguments
#===============================================================================
# Command-line state:
#   STEP          - single pipeline step to run ("" means the full pipeline)
#   SKIP_CONFIRM  - "true" when --yes/-y was given, otherwise "false"
#   EPOCHS        - replaced by the comma-separated list given to --epochs
STEP=""
SKIP_CONFIRM=false

while [[ $# -gt 0 ]]; do
    case "$1" in
        --step)
            # Without this check `shift 2` fails when the value is missing and
            # `set -e` terminates the script without any message.
            if [[ $# -lt 2 ]]; then
                log_error "Option --step requires a value"
                exit 1
            fi
            STEP="$2"
            shift 2
            ;;
        --yes|-y)
            SKIP_CONFIRM=true
            shift
            ;;
        --epochs)
            if [[ $# -lt 2 ]]; then
                log_error "Option --epochs requires a value"
                exit 1
            fi
            # Split "X,Y,Z" on commas into the EPOCHS array
            IFS=',' read -ra EPOCHS <<< "$2"
            shift 2
            ;;
        --help|-h)
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Options:"
            echo "  --step STEP    Run only specific step:"
            echo "                   install-deps, download-epochs, ingest-epochs,"
            echo "                   generate-cache, train, all (default)"
            echo "  --epochs X,Y,Z Comma-separated list of epochs (default: 844-850)"
            echo "  --yes, -y      Skip confirmation prompts"
            echo "  --help, -h     Show this help message"
            exit 0
            ;;
        *)
            log_error "Unknown option: $1"
            exit 1
            ;;
    esac
done

#===============================================================================
# Step 1: Install System Dependencies
#===============================================================================
install_dependencies() {
    # Installs and starts the system services used by the pipeline.
    #
    # Installs base packages, ClickHouse and Neo4j through apt, writes
    # /etc/clickhouse-server/config.d/apollo.xml and overwrites
    # /etc/neo4j/neo4j.conf, enables and starts both services, and attempts
    # to set the Neo4j password to $NEO4J_PASSWORD.
    #
    # Globals:  OS (written); sourcing /etc/os-release also defines its
    #           variables (ID, ...) in the global scope.
    #           NEO4J_PASSWORD (read)
    # Requires: sudo and an apt-based distribution
    # Exits:    1 when the OS cannot be detected, apt-get is unavailable, or
    #           ClickHouse does not answer a query within 30 seconds
    log_step "Step 1: Installing System Dependencies"

    # Detect OS
    if [ -f /etc/os-release ]; then
        . /etc/os-release
        OS=$ID
    else
        log_error "Cannot detect OS. Please install dependencies manually."
        exit 1
    fi

    log_info "Detected OS: $OS"

    # Every install step below goes through apt, so fail early with a clear
    # message instead of a bare "command not found" on other distributions.
    if ! command -v apt-get &> /dev/null; then
        log_error "apt-get not found (detected OS: $OS). Please install dependencies manually."
        exit 1
    fi

    # Update package list
    log_info "Updating package list..."
    sudo apt-get update -qq

    # Install basic dependencies
    log_info "Installing basic dependencies..."
    sudo apt-get install -y -qq \
        curl wget git build-essential \
        python3 python3-pip python3-venv \
        htop tmux unzip pigz pv \
        apt-transport-https ca-certificates gnupg

    # Install ClickHouse
    if ! command -v clickhouse-server &> /dev/null; then
        log_info "Installing ClickHouse..."
        # Key import is best-effort: failures are ignored on purpose
        sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 2>/dev/null || true
        echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
        sudo apt-get update -qq
        sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-server clickhouse-client
    else
        log_info "ClickHouse already installed"
    fi

    # Configure and start ClickHouse
    # NOTE(review): listen_host 0.0.0.0 makes the server listen on all
    # interfaces - confirm that this is intended for the deployment network.
    log_info "Configuring ClickHouse..."
    sudo mkdir -p /etc/clickhouse-server/config.d/
    cat << 'EOF' | sudo tee /etc/clickhouse-server/config.d/apollo.xml > /dev/null
<?xml version="1.0"?>
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>0.8</max_server_memory_usage_to_ram_ratio>
    <max_memory_usage>200000000000</max_memory_usage>
    <max_threads>32</max_threads>
    <listen_host>0.0.0.0</listen_host>
</clickhouse>
EOF

    sudo systemctl enable clickhouse-server
    sudo systemctl start clickhouse-server || sudo systemctl restart clickhouse-server

    # Verify ClickHouse: poll once per second (up to 30 attempts) instead of
    # relying on a fixed sleep, so a slow start-up is not reported as a failure.
    local clickhouse_ready=false
    local attempt
    for ((attempt = 1; attempt <= 30; attempt++)); do
        if clickhouse-client --query "SELECT 1" &>/dev/null; then
            clickhouse_ready=true
            break
        fi
        sleep 1
    done

    if [ "$clickhouse_ready" = true ]; then
        log_success "ClickHouse is running"
    else
        log_error "ClickHouse failed to start"
        exit 1
    fi

    # Install Neo4j
    if ! command -v neo4j &> /dev/null; then
        log_info "Installing Neo4j..."
        # Key import is best-effort: failures are ignored on purpose
        wget -O - https://debian.neo4j.com/neotechnology.gpg.key 2>/dev/null | sudo apt-key add - 2>/dev/null || true
        echo 'deb https://debian.neo4j.com stable latest' | sudo tee /etc/apt/sources.list.d/neo4j.list
        sudo apt-get update -qq
        sudo apt-get install -y neo4j
    else
        log_info "Neo4j already installed"
    fi

    # Configure and start Neo4j (this replaces the whole neo4j.conf)
    log_info "Configuring Neo4j..."
    sudo tee /etc/neo4j/neo4j.conf > /dev/null << EOF
dbms.default_listen_address=0.0.0.0
dbms.connector.bolt.listen_address=:7687
dbms.connector.http.listen_address=:7474
dbms.memory.heap.initial_size=4g
dbms.memory.heap.max_size=16g
dbms.memory.pagecache.size=8g
dbms.security.auth_enabled=true
EOF

    sudo systemctl enable neo4j
    sudo systemctl start neo4j || sudo systemctl restart neo4j
    sleep 5

    # Set Neo4j password (first time setup)
    # NOTE(review): any failure of this request is ignored, so the script
    # cannot tell whether the password was applied. This looks like the legacy
    # HTTP password endpoint - confirm it exists on the installed Neo4j version.
    log_info "Setting Neo4j password..."
    curl -s -X POST "http://localhost:7474/user/neo4j/password" \
        -H "Content-Type: application/json" \
        -d "{\"password\":\"${NEO4J_PASSWORD}\"}" \
        -u neo4j:neo4j 2>/dev/null || true

    log_success "System dependencies installed"
}

#===============================================================================
# Step 2: Install Python Dependencies
#===============================================================================
install_python_deps() {
    # Prepares the project's Python environment inside $APOLLO_DIR/venv:
    # creates the virtualenv when missing, activates it, upgrades pip, then
    # installs PyTorch (cu121 wheel index), requirements.txt and a few extras.
    log_step "Step 2: Installing Python Dependencies"

    cd "$APOLLO_DIR"

    # Bootstrap the virtualenv on first run only
    [[ -d "venv" ]] || {
        log_info "Creating Python virtual environment..."
        python3 -m venv venv
    }

    # From here on `pip` resolves to the virtualenv's pip
    . venv/bin/activate

    log_info "Upgrading pip..."
    pip install --upgrade pip -q

    # PyTorch comes from the CUDA 12.1 wheel index
    log_info "Installing PyTorch with CUDA support..."
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 -q

    log_info "Installing project requirements..."
    pip install -r requirements.txt -q

    # Extras that requirements.txt might not list
    pip install Pillow requests -q

    log_success "Python dependencies installed"
}

#===============================================================================
# Step 3: Download Epochs
#===============================================================================
download_epochs() {
    # Downloads every epoch listed in EPOCHS via
    # scripts/download_epoch_artifacts.py.
    #
    # An epoch whose directory under $PUMP_FUN_DIR already holds more than 10
    # *.parquet files is treated as downloaded and skipped. A failed download
    # is logged and does not stop the loop; the failed epochs are listed in a
    # warning at the end.
    #
    # Globals: APOLLO_DIR, PUMP_FUN_DIR, EPOCHS, HF_TOKEN (all read)
    log_step "Step 3: Downloading Epochs ${EPOCHS[*]}"

    cd "$APOLLO_DIR"
    source venv/bin/activate

    # Check for HF token
    if [ -z "$HF_TOKEN" ]; then
        log_warn "HF_TOKEN not set. Some downloads may fail."
        log_info "Set it with: export HF_TOKEN=your_token"
    fi

    local epoch epoch_dir parquet_count
    local -a failed_epochs=()

    # Download each epoch
    for epoch in "${EPOCHS[@]}"; do
        log_info "Downloading epoch ${epoch}..."

        # Check if already downloaded. Count with find rather than parsing
        # `ls` output; an empty or missing directory simply counts as 0.
        epoch_dir="${PUMP_FUN_DIR}/epoch_${epoch}"
        if [ -d "$epoch_dir" ]; then
            parquet_count=$(find "$epoch_dir" -maxdepth 1 -name '*.parquet' 2>/dev/null | wc -l)
            if [ "$parquet_count" -gt 10 ]; then
                log_info "Epoch ${epoch} already downloaded (${parquet_count} files), skipping..."
                continue
            fi
        fi

        # Download using existing script; --token is passed only when
        # HF_TOKEN is non-empty.
        if ! python scripts/download_epoch_artifacts.py --epoch "$epoch" ${HF_TOKEN:+--token "$HF_TOKEN"}; then
            log_warn "Failed to download epoch ${epoch}, continuing..."
            failed_epochs+=("$epoch")
        fi
    done

    # Still best-effort, but make the overall outcome visible
    if [ "${#failed_epochs[@]}" -gt 0 ]; then
        log_warn "Download failed for ${#failed_epochs[@]} epoch(s): ${failed_epochs[*]}"
    fi

    log_success "Epoch downloads complete"
}

#===============================================================================
# Step 4: Ingest Epochs into ClickHouse and Neo4j
#===============================================================================
ingest_epochs() {
    # Feeds every epoch in EPOCHS to scripts/ingest_epoch.py (with
    # --merge-neo4j), then logs the row counts of the ClickHouse `mints` and
    # `trades` tables. Epochs without a directory under $PUMP_FUN_DIR are
    # skipped; a failed ingestion is logged and the loop moves on.
    log_step "Step 4: Ingesting Epochs into ClickHouse and Neo4j"

    cd "$APOLLO_DIR"
    . venv/bin/activate

    for epoch in "${EPOCHS[@]}"; do
        log_info "Ingesting epoch ${epoch}..."

        EPOCH_DIR="${PUMP_FUN_DIR}/epoch_${epoch}"
        if [[ -d "$EPOCH_DIR" ]]; then
            # Best-effort: a failing epoch must not abort the remaining ones
            if ! python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j; then
                log_warn "Failed to ingest epoch ${epoch}, continuing..."
            fi

            # Optional: reclaim disk space once an epoch has been ingested.
            # Uncomment to delete the downloaded files:
            # log_info "Cleaning up epoch ${epoch} files..."
            # rm -rf "$EPOCH_DIR"
        else
            log_warn "Epoch ${epoch} not found at ${EPOCH_DIR}, skipping..."
        fi
    done

    # Sanity check: report table sizes, falling back to "0" when a query fails
    log_info "Verifying data ingestion..."
    MINT_COUNT=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
    TRADE_COUNT=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
    log_info "  Mints: ${MINT_COUNT}"
    log_info "  Trades: ${TRADE_COUNT}"

    log_success "Epoch ingestion complete"
}

#===============================================================================
# Step 5: Generate Training Cache
#===============================================================================
generate_cache() {
    # Builds the training cache in $CACHE_DIR with scripts/cache_parallel.py
    # and then runs scripts/validate_cache_v2.py on it.
    #
    # When more than 1000 sample_*.pt files already exist, the user is asked
    # for confirmation unless SKIP_CONFIRM is "true"; declining returns 0
    # without generating anything.
    #
    # Globals: APOLLO_DIR, CACHE_DIR, CACHE_WORKERS, MAX_CACHE_SAMPLES,
    #          SKIP_CONFIRM (all read)
    # Exits:   1 when cache generation fails (validation failure is ignored)
    log_step "Step 5: Generating Training Cache"

    cd "$APOLLO_DIR"
    source venv/bin/activate

    # Create cache directory
    mkdir -p "$CACHE_DIR"

    # Check if cache already exists. find is used instead of `ls <glob>`:
    # expanding hundreds of thousands of file names onto ls's command line
    # exceeds the kernel's argument-size limit, and the hidden error would
    # make the count silently come out as 0.
    local existing_cache
    existing_cache=$(find "$CACHE_DIR" -maxdepth 1 -name 'sample_*.pt' 2>/dev/null | wc -l)
    if [ "$existing_cache" -gt 1000 ]; then
        log_warn "Found ${existing_cache} existing cache files"
        if [ "$SKIP_CONFIRM" = false ]; then
            read -p "Continue caching (will add to existing)? [y/N] " -n 1 -r
            echo
            if [[ ! $REPLY =~ ^[Yy]$ ]]; then
                log_info "Skipping cache generation"
                return 0
            fi
        fi
    fi

    # Generate cache using parallel script
    log_info "Generating cache with ${CACHE_WORKERS} workers..."
    log_info "This may take several hours for 230k+ samples..."

    # Build the argument list as an array so that a CACHE_DIR containing
    # spaces reaches the script as a single argument.
    local -a cache_args=(--output_dir "$CACHE_DIR" --num_workers "$CACHE_WORKERS")
    if [ -n "$MAX_CACHE_SAMPLES" ]; then
        cache_args+=(--max_samples "$MAX_CACHE_SAMPLES")
    fi

    python scripts/cache_parallel.py "${cache_args[@]}" || {
        log_error "Cache generation failed"
        exit 1
    }

    # Validate cache (informational only: a failing validation does not abort)
    log_info "Validating cache..."
    python scripts/validate_cache_v2.py --cache_dir "$CACHE_DIR" --sample_size 100 || true

    local final_cache
    final_cache=$(find "$CACHE_DIR" -maxdepth 1 -name 'sample_*.pt' 2>/dev/null | wc -l)
    log_success "Cache generation complete: ${final_cache} samples"
}

#===============================================================================
# Step 6: Launch Training
#===============================================================================
launch_training() {
    # Starts training through `accelerate launch train.py` from $APOLLO_DIR.
    #
    # Refuses to start when fewer than 100 sample_*.pt files exist in
    # $CACHE_DIR.
    #
    # Globals: APOLLO_DIR, CACHE_DIR, BATCH_SIZE, NUM_EPOCHS (all read)
    # Exits:   1 when the cache holds fewer than 100 samples
    log_step "Step 6: Launching Training"

    cd "$APOLLO_DIR"
    source venv/bin/activate

    # Check cache exists. find is used instead of `ls <glob>`: with a few
    # hundred thousand cache files the expanded glob exceeds the kernel's
    # argument-size limit, ls fails, and the hidden error would make the count
    # come out as 0, so training would be refused despite a full cache.
    local cache_count
    cache_count=$(find "$CACHE_DIR" -maxdepth 1 -name 'sample_*.pt' 2>/dev/null | wc -l)
    if [ "$cache_count" -lt 100 ]; then
        log_error "Not enough cache files (${cache_count}). Run cache generation first."
        exit 1
    fi

    log_info "Starting training with ${cache_count} cached samples..."
    log_info "  Batch size: ${BATCH_SIZE}"
    log_info "  Epochs: ${NUM_EPOCHS}"
    # Falls back to N/A when nvidia-smi is missing or fails
    log_info "  GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')"

    # Launch training. Only the epoch count and batch size are configurable
    # from the environment; the remaining hyper-parameters are fixed here.
    accelerate launch train.py \
        --epochs "$NUM_EPOCHS" \
        --batch_size "$BATCH_SIZE" \
        --grad_accum_steps 4 \
        --learning_rate 1e-4 \
        --warmup_ratio 0.1 \
        --max_grad_norm 1.0 \
        --mixed_precision bf16 \
        --max_seq_len 8192 \
        --horizons_seconds 60 180 300 600 1800 3600 7200 \
        --quantiles 0.1 0.5 0.9 \
        --num_workers 16 \
        --pin_memory \
        --val_split 0.1 \
        --val_every 5000 \
        --save_every 5000 \
        --log_every 100
}

#===============================================================================
# Step 7: Create Environment File
#===============================================================================
create_env_file() {
    # Writes the effective connection settings and paths to $APOLLO_DIR/.env.
    #
    # The file contains the ClickHouse and Neo4j passwords and the Hugging
    # Face token, so it is created with owner-only permissions (mode 600).
    #
    # Globals: APOLLO_DIR, DATA_DIR, CACHE_DIR, CLICKHOUSE_*, NEO4J_*,
    #          HF_TOKEN (all read)
    log_info "Creating .env file..."

    local env_file="${APOLLO_DIR}/.env"

    # The subshell keeps the restrictive umask from leaking into the caller.
    (
        umask 077
        cat << EOF > "$env_file"
# ClickHouse
CLICKHOUSE_HOST=${CLICKHOUSE_HOST}
CLICKHOUSE_PORT=${CLICKHOUSE_PORT}
CLICKHOUSE_HTTP_PORT=${CLICKHOUSE_HTTP_PORT}
CLICKHOUSE_USER=${CLICKHOUSE_USER}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD}
CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE}

# Neo4j
NEO4J_URI=${NEO4J_URI}
NEO4J_USER=${NEO4J_USER}
NEO4J_PASSWORD=${NEO4J_PASSWORD}

# Paths
APOLLO_DATA_DIR=${DATA_DIR}
APOLLO_CACHE_DIR=${CACHE_DIR}

# Hugging Face (set your token here)
HF_TOKEN=${HF_TOKEN:-}
EOF
    )
    # umask only affects newly created files; tighten a pre-existing .env too.
    chmod 600 "$env_file"

    log_success "Environment file created at ${APOLLO_DIR}/.env"
}

#===============================================================================
# Main Execution
#===============================================================================
main() {
    echo ""
    echo "╔═══════════════════════════════════════════════════════════════╗"
    echo "β•‘         Apollo Training Server - Complete Setup               β•‘"
    echo "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•"
    echo ""
    echo "  Apollo Directory: ${APOLLO_DIR}"
    echo "  Data Directory:   ${DATA_DIR}"
    echo "  Cache Directory:  ${CACHE_DIR}"
    echo "  Epochs:           ${EPOCHS[*]}"
    echo ""

    # Check for HF_TOKEN
    if [ -z "$HF_TOKEN" ]; then
        log_warn "HF_TOKEN not set. Downloads may fail."
        log_info "Set it with: export HF_TOKEN=your_huggingface_token"
        echo ""
    fi

    # Run specific step or all steps
    case "$STEP" in
        install-deps)
            install_dependencies
            install_python_deps
            create_env_file
            ;;
        download-epochs)
            download_epochs
            ;;
        ingest-epochs)
            ingest_epochs
            ;;
        generate-cache)
            generate_cache
            ;;
        train)
            launch_training
            ;;
        ""|all)
            # Run all steps
            if [ "$SKIP_CONFIRM" = false ]; then
                echo "This will run the complete setup pipeline:"
                echo "  1. Install system dependencies (ClickHouse, Neo4j)"
                echo "  2. Install Python dependencies"
                echo "  3. Download epochs ${EPOCHS[*]}"
                echo "  4. Ingest data into databases"
                echo "  5. Generate training cache"
                echo "  6. Launch training"
                echo ""
                read -p "Continue? [y/N] " -n 1 -r
                echo
                if [[ ! $REPLY =~ ^[Yy]$ ]]; then
                    log_info "Aborted."
                    exit 0
                fi
            fi

            install_dependencies
            install_python_deps
            create_env_file
            download_epochs
            ingest_epochs
            generate_cache
            launch_training
            ;;
        *)
            log_error "Unknown step: $STEP"
            echo "Valid steps: install-deps, download-epochs, ingest-epochs, generate-cache, train, all"
            exit 1
            ;;
    esac

    echo ""
    log_success "Setup complete!"
    echo ""
    echo "Useful commands:"
    echo "  source venv/bin/activate    # Activate Python environment"
    echo "  ./scripts/check_status.sh   # Check system status"
    echo "  accelerate launch train.py  # Start training"
    echo ""
}

# Run main. The top-level option-parsing loop has already shifted every
# argument away, so "$@" is empty here; main() selects its work from $STEP.
main "$@"