#!/bin/bash
#===============================================================================
# Apollo Training Server - Complete Setup Script
#===============================================================================
# This script sets up a fresh server for Apollo training from scratch:
#
#   1. Installs system dependencies (ClickHouse, Neo4j, Python tooling)
#   2. Installs Python requirements (including CUDA-enabled PyTorch wheels)
#      and writes a .env file with the connection settings
#   3. Downloads epochs 844-850 from Hugging Face
#   4. Ingests all data into ClickHouse and Neo4j
#   5. Generates training cache files
#   6. Launches training
#
# Usage:
#   export HF_TOKEN="your_huggingface_token"
#   chmod +x scripts/setup_fresh_server.sh
#   ./scripts/setup_fresh_server.sh
#
# Or run specific steps:
#   ./scripts/setup_fresh_server.sh --step install-deps
#   ./scripts/setup_fresh_server.sh --step download-epochs
#   ./scripts/setup_fresh_server.sh --step ingest-epochs
#   ./scripts/setup_fresh_server.sh --step generate-cache
#   ./scripts/setup_fresh_server.sh --step train
#
# Environment overrides (all optional): APOLLO_DIR, DATA_DIR, CACHE_DIR,
# CLICKHOUSE_*, NEO4J_*, CACHE_WORKERS, MAX_CACHE_SAMPLES, BATCH_SIZE,
# NUM_EPOCHS, HF_TOKEN.
#===============================================================================

set -e # Exit on error

# Colors (escape sequences are interpreted by `echo -e` in the log helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Logging helpers. Warnings and errors go to stderr so they remain visible
# when stdout is redirected or captured.
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1" >&2; }
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
log_step() {
  echo -e "\n${CYAN}========================================${NC}"
  echo -e "${CYAN} $1${NC}"
  echo -e "${CYAN}========================================${NC}\n"
}

#===============================================================================
# Configuration
#===============================================================================

# Project root defaults to the parent of the directory containing this script.
APOLLO_DIR="${APOLLO_DIR:-$(cd "$(dirname "$0")/.." && pwd)}"
DATA_DIR="${DATA_DIR:-${APOLLO_DIR}/data}"
CACHE_DIR="${CACHE_DIR:-${DATA_DIR}/cache}"
PUMP_FUN_DIR="${DATA_DIR}/pump_fun"

# Epochs to download and ingest (overridable with --epochs)
EPOCHS=(844 845 846 847 848 849 850)

# ClickHouse settings (exported so the Python scripts can read them)
export CLICKHOUSE_HOST="${CLICKHOUSE_HOST:-localhost}"
export CLICKHOUSE_PORT="${CLICKHOUSE_PORT:-9000}"
export CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT:-8123}"
export CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
export CLICKHOUSE_DATABASE="${CLICKHOUSE_DATABASE:-default}"

# Neo4j settings
# NOTE(review): the default password is hard-coded; override NEO4J_PASSWORD
# on any host reachable from an untrusted network.
export NEO4J_URI="${NEO4J_URI:-bolt://localhost:7687}"
export NEO4J_USER="${NEO4J_USER:-neo4j}"
export NEO4J_PASSWORD="${NEO4J_PASSWORD:-apollo2024}"

# Caching settings
CACHE_WORKERS="${CACHE_WORKERS:-8}"
MAX_CACHE_SAMPLES="${MAX_CACHE_SAMPLES:-}" # Empty = all samples

# Training settings
BATCH_SIZE="${BATCH_SIZE:-16}"
NUM_EPOCHS="${NUM_EPOCHS:-7}"

#===============================================================================
# Helpers
#===============================================================================

#######################################
# Count directory entries matching a glob pattern.
# The glob is expanded inside the shell rather than handed to `ls`, so the
# count stays correct even when the matches would exceed the kernel's
# argument-size limit for exec (hundreds of thousands of cache files).
# The function body is a subshell so nullglob/IFS changes do not leak.
# Arguments: $1 - directory, $2 - glob pattern (e.g. 'sample_*.pt')
# Outputs:   number of matches on stdout (0 if the directory is missing)
#######################################
count_matching_files() (
  shopt -s nullglob
  IFS=
  # shellcheck disable=SC2206 # $2 is intentionally unquoted so it globs
  matches=("$1"/$2)
  echo "${#matches[@]}"
)

#######################################
# cd into the project root and activate its virtual environment.
# Exits with a clear message if the venv has not been created yet.
# Globals: APOLLO_DIR (read)
#######################################
enter_project_env() {
  cd "$APOLLO_DIR"
  if [[ ! -f venv/bin/activate ]]; then
    log_error "Virtual environment not found at ${APOLLO_DIR}/venv. Run: $0 --step install-deps"
    exit 1
  fi
  # shellcheck disable=SC1091
  source venv/bin/activate
}

#######################################
# Poll ClickHouse until it answers a trivial query.
# Returns: 0 once `SELECT 1` succeeds, 1 after about 30 seconds of failures.
#######################################
wait_for_clickhouse() {
  local attempt
  for ((attempt = 0; attempt < 30; attempt++)); do
    if clickhouse-client --query "SELECT 1" &> /dev/null; then
      return 0
    fi
    sleep 1
  done
  return 1
}

#===============================================================================
# Parse Arguments
#===============================================================================
STEP=""
SKIP_CONFIRM=false

# Print the command-line help text.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Options:"
  echo " --step STEP Run only specific step:"
  echo " install-deps, download-epochs, ingest-epochs,"
  echo " generate-cache, train, all (default)"
  echo " --epochs X,Y,Z Comma-separated list of epochs (default: 844-850)"
  echo " --yes, -y Skip confirmation prompts"
  echo " --help, -h Show this help message"
}

#######################################
# Parse command-line options.
# Globals:   STEP, SKIP_CONFIRM, EPOCHS (written)
# Arguments: the script's positional parameters
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --step)
        # Without this check `shift 2` fails and set -e exits silently.
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires an argument"
          exit 1
        fi
        STEP="$2"
        shift 2
        ;;
      --yes | -y)
        SKIP_CONFIRM=true
        shift
        ;;
      --epochs)
        if [[ $# -lt 2 ]]; then
          log_error "Option $1 requires an argument"
          exit 1
        fi
        IFS=',' read -ra EPOCHS <<< "$2"
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        log_error "Unknown option: $1"
        exit 1
        ;;
    esac
  done
}

#===============================================================================
# Step 1: Install System Dependencies
#===============================================================================

#######################################
# Install and configure ClickHouse and Neo4j plus base packages via apt.
# Requires sudo. Overwrites /etc/clickhouse-server/config.d/apollo.xml and
# /etc/neo4j/neo4j.conf, then restarts both services.
# Globals: NEO4J_PASSWORD (read)
#######################################
install_dependencies() {
  log_step "Step 1: Installing System Dependencies"

  # Detect OS. os-release is sourced in a subshell so its variables
  # (ID, NAME, VERSION, ...) do not leak into this script.
  local os_id
  if [[ -f /etc/os-release ]]; then
    os_id=$(. /etc/os-release && echo "$ID")
  else
    log_error "Cannot detect OS. Please install dependencies manually."
    exit 1
  fi
  log_info "Detected OS: $os_id"

  # Everything below is apt-based; fail with a clear message elsewhere.
  if ! command -v apt-get &> /dev/null; then
    log_error "apt-get not found (OS: $os_id). Please install dependencies manually."
    exit 1
  fi

  # Update package list
  log_info "Updating package list..."
  sudo apt-get update -qq

  # Install basic dependencies
  log_info "Installing basic dependencies..."
  sudo apt-get install -y -qq \
    curl wget git build-essential \
    python3 python3-pip python3-venv \
    htop tmux unzip pigz pv \
    apt-transport-https ca-certificates gnupg

  # Install ClickHouse
  # NOTE(review): apt-key is deprecated on current Debian/Ubuntu and its
  # failure is deliberately ignored here; consider a signed-by keyring.
  if ! command -v clickhouse-server &> /dev/null; then
    log_info "Installing ClickHouse..."
    sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 2> /dev/null || true
    echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
    sudo apt-get update -qq
    sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-server clickhouse-client
  else
    log_info "ClickHouse already installed"
  fi

  # Configure and start ClickHouse
  # NOTE(review): the XML element names below were reconstructed around the
  # values 0.8 / 200000000000 / 32 / 0.0.0.0; confirm them against the
  # intended configuration. listen_host 0.0.0.0 exposes the server on all
  # interfaces, and the default user has an empty password unless
  # CLICKHOUSE_PASSWORD is configured on the server side.
  log_info "Configuring ClickHouse..."
  sudo mkdir -p /etc/clickhouse-server/config.d/
  cat << 'EOF' | sudo tee /etc/clickhouse-server/config.d/apollo.xml > /dev/null
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>0.8</max_server_memory_usage_to_ram_ratio>
    <max_memory_usage>200000000000</max_memory_usage>
    <max_threads>32</max_threads>
    <listen_host>0.0.0.0</listen_host>
</clickhouse>
EOF

  sudo systemctl enable clickhouse-server
  # restart (not start) so an already-running server picks up the new config
  sudo systemctl restart clickhouse-server

  # Verify ClickHouse (poll instead of a fixed sleep; startup time varies)
  if wait_for_clickhouse; then
    log_success "ClickHouse is running"
  else
    log_error "ClickHouse failed to start"
    exit 1
  fi

  # Install Neo4j
  if ! command -v neo4j &> /dev/null; then
    log_info "Installing Neo4j..."
    wget -O - https://debian.neo4j.com/neotechnology.gpg.key 2> /dev/null | sudo apt-key add - 2> /dev/null || true
    echo 'deb https://debian.neo4j.com stable latest' | sudo tee /etc/apt/sources.list.d/neo4j.list
    sudo apt-get update -qq
    sudo apt-get install -y neo4j
  else
    log_info "Neo4j already installed"
  fi

  # Configure and start Neo4j (replaces the whole neo4j.conf)
  log_info "Configuring Neo4j..."
  sudo tee /etc/neo4j/neo4j.conf > /dev/null << 'EOF'
dbms.default_listen_address=0.0.0.0
dbms.connector.bolt.listen_address=:7687
dbms.connector.http.listen_address=:7474
dbms.memory.heap.initial_size=4g
dbms.memory.heap.max_size=16g
dbms.memory.pagecache.size=8g
dbms.security.auth_enabled=true
EOF

  sudo systemctl enable neo4j
  # restart (not start) so an already-running server picks up the new config
  sudo systemctl restart neo4j
  sleep 5

  # Set Neo4j password (first time setup). Best effort: failures are ignored.
  # NOTE(review): this REST endpoint looks like a legacy API; with the
  # "latest" package the call may be a silent no-op. Verify the password
  # actually changed, or use neo4j-admin's set-initial-password instead.
  log_info "Setting Neo4j password..."
  curl -s -X POST "http://localhost:7474/user/neo4j/password" \
    -H "Content-Type: application/json" \
    -d "{\"password\":\"${NEO4J_PASSWORD}\"}" \
    -u neo4j:neo4j 2> /dev/null || true

  log_success "System dependencies installed"
}

#===============================================================================
# Step 2: Install Python Dependencies
#===============================================================================

#######################################
# Create the project venv if needed and install PyTorch plus requirements.
# Globals: APOLLO_DIR (read)
#######################################
install_python_deps() {
  log_step "Step 2: Installing Python Dependencies"

  cd "$APOLLO_DIR"

  # Create virtual environment if it doesn't exist
  if [[ ! -d "venv" ]]; then
    log_info "Creating Python virtual environment..."
    python3 -m venv venv
  fi

  # Activate virtual environment
  # shellcheck disable=SC1091
  source venv/bin/activate

  # Upgrade pip
  log_info "Upgrading pip..."
  pip install --upgrade pip -q

  # Install PyTorch with CUDA
  log_info "Installing PyTorch with CUDA support..."
  pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 -q

  # Install requirements
  log_info "Installing project requirements..."
  pip install -r requirements.txt -q

  # Install additional dependencies that might be missing
  pip install Pillow requests -q

  log_success "Python dependencies installed"
}

#===============================================================================
# Step 3: Download Epochs
#===============================================================================

#######################################
# Download every epoch in EPOCHS via scripts/download_epoch_artifacts.py.
# An epoch whose directory already holds more than 10 parquet files is
# skipped. A failed download is logged and the loop continues.
# Globals: EPOCHS, PUMP_FUN_DIR, HF_TOKEN (read)
#######################################
download_epochs() {
  log_step "Step 3: Downloading Epochs ${EPOCHS[*]}"

  enter_project_env

  # Check for HF token; pass it to the downloader only when set
  local -a token_args=()
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log_warn "HF_TOKEN not set. Some downloads may fail."
    log_info "Set it with: export HF_TOKEN=your_token"
  else
    token_args=(--token "$HF_TOKEN")
  fi

  # Download each epoch
  local epoch epoch_dir parquet_count
  for epoch in "${EPOCHS[@]}"; do
    log_info "Downloading epoch ${epoch}..."

    # Check if already downloaded (count is 0 for a missing/empty directory)
    epoch_dir="${PUMP_FUN_DIR}/epoch_${epoch}"
    parquet_count=$(count_matching_files "$epoch_dir" '*.parquet')
    if ((parquet_count > 10)); then
      log_info "Epoch ${epoch} already downloaded (${parquet_count} files), skipping..."
      continue
    fi

    # Download using existing script
    python scripts/download_epoch_artifacts.py --epoch "$epoch" "${token_args[@]}" || {
      log_warn "Failed to download epoch ${epoch}, continuing..."
    }
  done

  log_success "Epoch downloads complete"
}

#===============================================================================
# Step 4: Ingest Epochs into ClickHouse and Neo4j
#===============================================================================

#######################################
# Run scripts/ingest_epoch.py for every downloaded epoch, then report the
# row counts of the `mints` and `trades` tables. Missing epochs and failed
# ingestions are logged and skipped.
# Globals: EPOCHS, PUMP_FUN_DIR (read)
#######################################
ingest_epochs() {
  log_step "Step 4: Ingesting Epochs into ClickHouse and Neo4j"

  enter_project_env

  # Ingest each epoch
  local epoch epoch_dir
  for epoch in "${EPOCHS[@]}"; do
    log_info "Ingesting epoch ${epoch}..."

    epoch_dir="${PUMP_FUN_DIR}/epoch_${epoch}"
    if [[ ! -d "$epoch_dir" ]]; then
      log_warn "Epoch ${epoch} not found at ${epoch_dir}, skipping..."
      continue
    fi

    # Run ingestion script
    python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
      log_warn "Failed to ingest epoch ${epoch}, continuing..."
    }

    # Clean up downloaded files to save space (optional)
    # Uncomment if you want to delete after ingestion:
    # log_info "Cleaning up epoch ${epoch} files..."
    # rm -rf -- "${epoch_dir:?}"
  done

  # Verify data (a failing query is reported as a count of 0)
  log_info "Verifying data ingestion..."
  local mint_count trade_count
  mint_count=$(clickhouse-client --query "SELECT count() FROM mints" 2> /dev/null || echo "0")
  trade_count=$(clickhouse-client --query "SELECT count() FROM trades" 2> /dev/null || echo "0")
  log_info " Mints: ${mint_count}"
  log_info " Trades: ${trade_count}"

  log_success "Epoch ingestion complete"
}

#===============================================================================
# Step 5: Generate Training Cache
#===============================================================================

#######################################
# Generate sample_*.pt cache files with scripts/cache_parallel.py and run the
# validator afterwards. Asks for confirmation (unless --yes) when more than
# 1000 cache files already exist.
# Globals: CACHE_DIR, CACHE_WORKERS, MAX_CACHE_SAMPLES, SKIP_CONFIRM (read)
#######################################
generate_cache() {
  log_step "Step 5: Generating Training Cache"

  enter_project_env

  # Create cache directory
  mkdir -p "$CACHE_DIR"

  # Check if cache already exists
  local existing_cache
  existing_cache=$(count_matching_files "$CACHE_DIR" 'sample_*.pt')
  if ((existing_cache > 1000)); then
    log_warn "Found ${existing_cache} existing cache files"
    if [[ "$SKIP_CONFIRM" = false ]]; then
      read -p "Continue caching (will add to existing)? [y/N] " -n 1 -r
      echo
      if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        log_info "Skipping cache generation"
        return 0
      fi
    fi
  fi

  # Generate cache using parallel script
  log_info "Generating cache with ${CACHE_WORKERS} workers..."
  log_info "This may take several hours for 230k+ samples..."

  # Build the arguments as an array so paths containing spaces stay intact.
  local -a cache_args=(--output_dir "$CACHE_DIR" --num_workers "$CACHE_WORKERS")
  if [[ -n "$MAX_CACHE_SAMPLES" ]]; then
    cache_args+=(--max_samples "$MAX_CACHE_SAMPLES")
  fi

  python scripts/cache_parallel.py "${cache_args[@]}" || {
    log_error "Cache generation failed"
    exit 1
  }

  # Validate cache (informational only; a validator failure is not fatal)
  log_info "Validating cache..."
  python scripts/validate_cache_v2.py --cache_dir "$CACHE_DIR" --sample_size 100 || true

  local final_cache
  final_cache=$(count_matching_files "$CACHE_DIR" 'sample_*.pt')
  log_success "Cache generation complete: ${final_cache} samples"
}

#===============================================================================
# Step 6: Launch Training
#===============================================================================

#######################################
# Launch train.py through `accelerate launch`. Refuses to start when fewer
# than 100 cache files are present.
# Globals: CACHE_DIR, BATCH_SIZE, NUM_EPOCHS (read)
#######################################
launch_training() {
  log_step "Step 6: Launching Training"

  enter_project_env

  # Check cache exists
  local cache_count
  cache_count=$(count_matching_files "$CACHE_DIR" 'sample_*.pt')
  if ((cache_count < 100)); then
    log_error "Not enough cache files (${cache_count}). Run cache generation first."
    exit 1
  fi

  log_info "Starting training with ${cache_count} cached samples..."
  log_info " Batch size: ${BATCH_SIZE}"
  log_info " Epochs: ${NUM_EPOCHS}"
  log_info " GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2> /dev/null || echo 'N/A')"

  # Launch training
  accelerate launch train.py \
    --epochs "$NUM_EPOCHS" \
    --batch_size "$BATCH_SIZE" \
    --grad_accum_steps 4 \
    --learning_rate 1e-4 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --mixed_precision bf16 \
    --max_seq_len 8192 \
    --horizons_seconds 60 180 300 600 1800 3600 7200 \
    --quantiles 0.1 0.5 0.9 \
    --num_workers 16 \
    --pin_memory \
    --val_split 0.1 \
    --val_every 5000 \
    --save_every 5000 \
    --log_every 100
}

#===============================================================================
# Step 7: Create Environment File
#===============================================================================

#######################################
# Write ${APOLLO_DIR}/.env with the connection settings. Runs as part of the
# install-deps step. The file contains the Neo4j password and the Hugging
# Face token, so it is created (and forced to be) readable by the owner only.
# Globals: APOLLO_DIR, DATA_DIR, CACHE_DIR, CLICKHOUSE_*, NEO4J_*,
#          HF_TOKEN (read)
#######################################
create_env_file() {
  local env_file="${APOLLO_DIR}/.env"

  log_info "Creating .env file..."

  (
    umask 077
    cat << EOF > "$env_file"
# ClickHouse
CLICKHOUSE_HOST=${CLICKHOUSE_HOST}
CLICKHOUSE_PORT=${CLICKHOUSE_PORT}
CLICKHOUSE_HTTP_PORT=${CLICKHOUSE_HTTP_PORT}
CLICKHOUSE_USER=${CLICKHOUSE_USER}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD}
CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE}

# Neo4j
NEO4J_URI=${NEO4J_URI}
NEO4J_USER=${NEO4J_USER}
NEO4J_PASSWORD=${NEO4J_PASSWORD}

# Paths
APOLLO_DATA_DIR=${DATA_DIR}
APOLLO_CACHE_DIR=${CACHE_DIR}

# Hugging Face (set your token here)
HF_TOKEN=${HF_TOKEN:-}
EOF
  )
  # umask only affects newly created files; tighten a pre-existing one too.
  chmod 600 "$env_file"

  log_success "Environment file created at ${env_file}"
}

#===============================================================================
# Main Execution
#===============================================================================

#######################################
# Print the banner, then run the step selected with --step (or the whole
# pipeline after confirmation).
# Globals: STEP, SKIP_CONFIRM, EPOCHS, APOLLO_DIR, DATA_DIR, CACHE_DIR,
#          HF_TOKEN (read)
#######################################
main() {
  echo ""
  echo "╔═══════════════════════════════════════════════════════════════╗"
  echo "║ Apollo Training Server - Complete Setup ║"
  echo "╚═══════════════════════════════════════════════════════════════╝"
  echo ""
  echo " Apollo Directory: ${APOLLO_DIR}"
  echo " Data Directory: ${DATA_DIR}"
  echo " Cache Directory: ${CACHE_DIR}"
  echo " Epochs: ${EPOCHS[*]}"
  echo ""

  # Check for HF_TOKEN
  if [[ -z "${HF_TOKEN:-}" ]]; then
    log_warn "HF_TOKEN not set. Downloads may fail."
    log_info "Set it with: export HF_TOKEN=your_huggingface_token"
    echo ""
  fi

  # Run specific step or all steps
  case "$STEP" in
    install-deps)
      install_dependencies
      install_python_deps
      create_env_file
      ;;
    download-epochs)
      download_epochs
      ;;
    ingest-epochs)
      ingest_epochs
      ;;
    generate-cache)
      generate_cache
      ;;
    train)
      launch_training
      ;;
    "" | all)
      # Run all steps
      if [[ "$SKIP_CONFIRM" = false ]]; then
        echo "This will run the complete setup pipeline:"
        echo " 1. Install system dependencies (ClickHouse, Neo4j)"
        echo " 2. Install Python dependencies"
        echo " 3. Download epochs ${EPOCHS[*]}"
        echo " 4. Ingest data into databases"
        echo " 5. Generate training cache"
        echo " 6. Launch training"
        echo ""
        read -p "Continue? [y/N] " -n 1 -r
        echo
        if [[ ! $REPLY =~ ^[Yy]$ ]]; then
          log_info "Aborted."
          exit 0
        fi
      fi

      install_dependencies
      install_python_deps
      create_env_file
      download_epochs
      ingest_epochs
      generate_cache
      launch_training
      ;;
    *)
      log_error "Unknown step: $STEP"
      echo "Valid steps: install-deps, download-epochs, ingest-epochs, generate-cache, train, all"
      exit 1
      ;;
  esac

  echo ""
  log_success "Setup complete!"
  echo ""
  echo "Useful commands:"
  echo " source venv/bin/activate # Activate Python environment"
  echo " ./scripts/check_status.sh # Check system status"
  echo " accelerate launch train.py # Start training"
  echo ""
}

# Parse options, then run
parse_args "$@"
main "$@"