# oracle/scripts/setup_fresh_server.sh
# (uploaded via huggingface_hub, revision e605733 — web-page residue converted
# to comments so the file parses as shell)
#!/bin/bash
#===============================================================================
# Apollo Training Server - Complete Setup Script
#===============================================================================
# This script sets up a fresh server for Apollo training from scratch:
#
# 1. Installs system dependencies (ClickHouse, Neo4j, Python, CUDA)
# 2. Installs Python requirements
# 3. Downloads epochs 844-850 from Hugging Face
# 4. Ingests all data into ClickHouse and Neo4j
# 5. Generates training cache files
# 6. Launches training
#
# Usage:
# export HF_TOKEN="your_huggingface_token"
# chmod +x scripts/setup_fresh_server.sh
# ./scripts/setup_fresh_server.sh
#
# Or run specific steps:
# ./scripts/setup_fresh_server.sh --step install-deps
# ./scripts/setup_fresh_server.sh --step download-epochs
# ./scripts/setup_fresh_server.sh --step ingest-epochs
# ./scripts/setup_fresh_server.sh --step generate-cache
# ./scripts/setup_fresh_server.sh --step train
#===============================================================================
set -e # Exit on error

# ANSI color codes used by the log helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Logging helpers: print a colored tag followed by the message.
# printf '%b\n' interprets the escape sequences exactly like `echo -e`.
log_info()    { printf '%b\n' "${BLUE}[INFO]${NC} $1"; }
log_success() { printf '%b\n' "${GREEN}[βœ“]${NC} $1"; }
log_warn()    { printf '%b\n' "${YELLOW}[WARN]${NC} $1"; }
log_error()   { printf '%b\n' "${RED}[ERROR]${NC} $1"; }
# Three-line banner used to announce each pipeline step.
log_step() {
  printf '%b\n' "\n${CYAN}========================================${NC}"
  printf '%b\n' "${CYAN} $1${NC}"
  printf '%b\n' "${CYAN}========================================${NC}\n"
}
#===============================================================================
# Configuration
#===============================================================================
# Repo root: resolved from this script's own location unless overridden.
APOLLO_DIR="${APOLLO_DIR:-$(cd "$(dirname "$0")/.." && pwd)}"
DATA_DIR="${DATA_DIR:-${APOLLO_DIR}/data}"
CACHE_DIR="${CACHE_DIR:-${DATA_DIR}/cache}"
PUMP_FUN_DIR="${DATA_DIR}/pump_fun"

# Epochs to download and ingest
EPOCHS=(844 845 846 847 848 849 850)

# ClickHouse settings (':=' assigns the default only when unset/empty)
: "${CLICKHOUSE_HOST:=localhost}"
: "${CLICKHOUSE_PORT:=9000}"
: "${CLICKHOUSE_HTTP_PORT:=8123}"
: "${CLICKHOUSE_USER:=default}"
: "${CLICKHOUSE_PASSWORD:=}"
: "${CLICKHOUSE_DATABASE:=default}"
export CLICKHOUSE_HOST CLICKHOUSE_PORT CLICKHOUSE_HTTP_PORT \
  CLICKHOUSE_USER CLICKHOUSE_PASSWORD CLICKHOUSE_DATABASE

# Neo4j settings
: "${NEO4J_URI:=bolt://localhost:7687}"
: "${NEO4J_USER:=neo4j}"
: "${NEO4J_PASSWORD:=apollo2024}"
export NEO4J_URI NEO4J_USER NEO4J_PASSWORD

# Caching settings
CACHE_WORKERS="${CACHE_WORKERS:-8}"
MAX_CACHE_SAMPLES="${MAX_CACHE_SAMPLES:-}" # Empty = all samples

# Training settings
BATCH_SIZE="${BATCH_SIZE:-16}"
NUM_EPOCHS="${NUM_EPOCHS:-7}"
#===============================================================================
# Parse Arguments
#===============================================================================
STEP=""
SKIP_CONFIRM=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --step)
            # Fail with a clear message when the value is missing, instead of
            # letting `shift 2` abort under set -e with a cryptic shift error.
            [[ $# -ge 2 ]] || { log_error "--step requires a value"; exit 1; }
            STEP="$2"
            shift 2
            ;;
        --yes|-y)
            SKIP_CONFIRM=true
            shift
            ;;
        --epochs)
            [[ $# -ge 2 ]] || { log_error "--epochs requires a value"; exit 1; }
            # Split the comma-separated list into the EPOCHS array.
            IFS=',' read -ra EPOCHS <<< "$2"
            shift 2
            ;;
        --help|-h)
            echo "Usage: $0 [OPTIONS]"
            echo ""
            echo "Options:"
            echo " --step STEP Run only specific step:"
            echo " install-deps, download-epochs, ingest-epochs,"
            echo " generate-cache, train, all (default)"
            echo " --epochs X,Y,Z Comma-separated list of epochs (default: 844-850)"
            echo " --yes, -y Skip confirmation prompts"
            echo " --help, -h Show this help message"
            exit 0
            ;;
        *)
            log_error "Unknown option: $1"
            exit 1
            ;;
    esac
done
#===============================================================================
# Step 1: Install System Dependencies
#===============================================================================
# Install and configure all system-level services (ClickHouse, Neo4j) plus
# base packages. Requires passwordless sudo; exits non-zero on hard failures.
install_dependencies() {
log_step "Step 1: Installing System Dependencies"
# Detect OS by sourcing the standard os-release file ($ID becomes e.g. "ubuntu").
if [ -f /etc/os-release ]; then
. /etc/os-release
OS=$ID
else
log_error "Cannot detect OS. Please install dependencies manually."
exit 1
fi
log_info "Detected OS: $OS"
# NOTE(review): script assumes an apt-based distro below regardless of $OS — confirm.
# Update package list
log_info "Updating package list..."
sudo apt-get update -qq
# Install basic dependencies
log_info "Installing basic dependencies..."
sudo apt-get install -y -qq \
curl wget git build-essential \
python3 python3-pip python3-venv \
htop tmux unzip pigz pv \
apt-transport-https ca-certificates gnupg
# Install ClickHouse from the official repo; key import failures are ignored
# (|| true) so a pre-trusted key does not abort the run. apt-key is deprecated
# on newer distros — TODO consider signed-by keyring instead.
if ! command -v clickhouse-server &> /dev/null; then
log_info "Installing ClickHouse..."
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 2>/dev/null || true
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update -qq
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-server clickhouse-client
else
log_info "ClickHouse already installed"
fi
# Drop a config override: memory caps, thread limit, listen on all interfaces.
log_info "Configuring ClickHouse..."
sudo mkdir -p /etc/clickhouse-server/config.d/
cat << 'EOF' | sudo tee /etc/clickhouse-server/config.d/apollo.xml > /dev/null
<?xml version="1.0"?>
<clickhouse>
<max_server_memory_usage_to_ram_ratio>0.8</max_server_memory_usage_to_ram_ratio>
<max_memory_usage>200000000000</max_memory_usage>
<max_threads>32</max_threads>
<listen_host>0.0.0.0</listen_host>
</clickhouse>
EOF
sudo systemctl enable clickhouse-server
# `start` fails if already running, hence the `restart` fallback.
sudo systemctl start clickhouse-server || sudo systemctl restart clickhouse-server
sleep 3
# Verify ClickHouse responds to a trivial query before continuing.
if clickhouse-client --query "SELECT 1" &>/dev/null; then
log_success "ClickHouse is running"
else
log_error "ClickHouse failed to start"
exit 1
fi
# Install Neo4j from the official Debian repo (same || true key tolerance).
if ! command -v neo4j &> /dev/null; then
log_info "Installing Neo4j..."
wget -O - https://debian.neo4j.com/neotechnology.gpg.key 2>/dev/null | sudo apt-key add - 2>/dev/null || true
echo 'deb https://debian.neo4j.com stable latest' | sudo tee /etc/apt/sources.list.d/neo4j.list
sudo apt-get update -qq
sudo apt-get install -y neo4j
else
log_info "Neo4j already installed"
fi
# Overwrite neo4j.conf wholesale: bind all interfaces, fix heap/pagecache sizes.
log_info "Configuring Neo4j..."
sudo tee /etc/neo4j/neo4j.conf > /dev/null << EOF
dbms.default_listen_address=0.0.0.0
dbms.connector.bolt.listen_address=:7687
dbms.connector.http.listen_address=:7474
dbms.memory.heap.initial_size=4g
dbms.memory.heap.max_size=16g
dbms.memory.pagecache.size=8g
dbms.security.auth_enabled=true
EOF
sudo systemctl enable neo4j
sudo systemctl start neo4j || sudo systemctl restart neo4j
sleep 5
# Change the default neo4j/neo4j credentials via the HTTP API; best-effort
# (|| true) because the call fails once the password was already changed.
log_info "Setting Neo4j password..."
curl -s -X POST "http://localhost:7474/user/neo4j/password" \
-H "Content-Type: application/json" \
-d "{\"password\":\"${NEO4J_PASSWORD}\"}" \
-u neo4j:neo4j 2>/dev/null || true
log_success "System dependencies installed"
}
#===============================================================================
# Step 2: Install Python Dependencies
#===============================================================================
# Set up the project virtualenv and install all Python dependencies
# (PyTorch with CUDA 12.1 wheels, requirements.txt, plus Pillow/requests).
install_python_deps() {
    log_step "Step 2: Installing Python Dependencies"
    cd "$APOLLO_DIR"

    # Create the virtual environment only on first run.
    if [[ ! -d venv ]]; then
        log_info "Creating Python virtual environment..."
        python3 -m venv venv
    fi
    source venv/bin/activate

    log_info "Upgrading pip..."
    pip install --upgrade pip -q

    log_info "Installing PyTorch with CUDA support..."
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 -q

    log_info "Installing project requirements..."
    pip install -r requirements.txt -q

    # Extra packages occasionally missing from requirements.txt.
    pip install Pillow requests -q

    log_success "Python dependencies installed"
}
#===============================================================================
# Step 3: Download Epochs
#===============================================================================
# Download each epoch's artifacts from Hugging Face, skipping epochs that
# already have more than 10 parquet files on disk. Individual failures warn
# and continue so one bad epoch does not abort the pipeline.
download_epochs() {
    log_step "Step 3: Downloading Epochs ${EPOCHS[*]}"
    cd "$APOLLO_DIR"
    source venv/bin/activate

    # Warn early when HF_TOKEN is absent (gated repos will fail).
    if [ -z "$HF_TOKEN" ]; then
        log_warn "HF_TOKEN not set. Some downloads may fail."
        log_info "Set it with: export HF_TOKEN=your_token"
    fi

    local parquets token_args
    for epoch in "${EPOCHS[@]}"; do
        log_info "Downloading epoch ${epoch}..."
        EPOCH_DIR="${PUMP_FUN_DIR}/epoch_${epoch}"

        # Count parquet files with a glob instead of parsing `ls` output
        # (robust against spaces/newlines in names; also covers the
        # "directory missing or empty" case, where the count is 0).
        parquets=("$EPOCH_DIR"/*.parquet)
        [ -e "${parquets[0]}" ] || parquets=()
        PARQUET_COUNT=${#parquets[@]}
        if [ "$PARQUET_COUNT" -gt 10 ]; then
            log_info "Epoch ${epoch} already downloaded (${PARQUET_COUNT} files), skipping..."
            continue
        fi

        # Pass --token only when HF_TOKEN is set; an array keeps the
        # quoting correct even if the token ever contains spaces.
        token_args=()
        if [ -n "${HF_TOKEN:-}" ]; then
            token_args=(--token "$HF_TOKEN")
        fi
        python scripts/download_epoch_artifacts.py --epoch "$epoch" "${token_args[@]}" || {
            log_warn "Failed to download epoch ${epoch}, continuing..."
        }
    done
    log_success "Epoch downloads complete"
}
#===============================================================================
# Step 4: Ingest Epochs into ClickHouse and Neo4j
#===============================================================================
# Ingest every downloaded epoch into ClickHouse and Neo4j, then report
# row counts as a sanity check. Per-epoch failures warn and continue.
ingest_epochs() {
    log_step "Step 4: Ingesting Epochs into ClickHouse and Neo4j"
    cd "$APOLLO_DIR"
    source venv/bin/activate

    for epoch in "${EPOCHS[@]}"; do
        log_info "Ingesting epoch ${epoch}..."
        EPOCH_DIR="${PUMP_FUN_DIR}/epoch_${epoch}"
        # Skip epochs that were never downloaded.
        if [ ! -d "$EPOCH_DIR" ]; then
            log_warn "Epoch ${epoch} not found at ${EPOCH_DIR}, skipping..."
            continue
        fi
        python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
            log_warn "Failed to ingest epoch ${epoch}, continuing..."
        }
        # Clean up downloaded files to save space (optional)
        # Uncomment if you want to delete after ingestion:
        # log_info "Cleaning up epoch ${epoch} files..."
        # rm -rf "$EPOCH_DIR"
    done

    # Report what actually landed; fall back to "0" when the query fails.
    log_info "Verifying data ingestion..."
    MINT_COUNT=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
    TRADE_COUNT=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
    log_info " Mints: ${MINT_COUNT}"
    log_info " Trades: ${TRADE_COUNT}"
    log_success "Epoch ingestion complete"
}
#===============================================================================
# Step 5: Generate Training Cache
#===============================================================================
# Build the training cache with the parallel caching script, then validate a
# sample of it. Prompts before adding to a large existing cache unless --yes.
generate_cache() {
    log_step "Step 5: Generating Training Cache"
    cd "$APOLLO_DIR"
    source venv/bin/activate
    mkdir -p "$CACHE_DIR"

    # Count existing cache files with a glob instead of parsing `ls` output.
    local existing=("$CACHE_DIR"/sample_*.pt)
    [ -e "${existing[0]}" ] || existing=()
    EXISTING_CACHE=${#existing[@]}
    if [ "$EXISTING_CACHE" -gt 1000 ]; then
        log_warn "Found ${EXISTING_CACHE} existing cache files"
        if [ "$SKIP_CONFIRM" = false ]; then
            read -p "Continue caching (will add to existing)? [y/N] " -n 1 -r
            echo
            if [[ ! $REPLY =~ ^[Yy]$ ]]; then
                log_info "Skipping cache generation"
                return 0
            fi
        fi
    fi

    log_info "Generating cache with ${CACHE_WORKERS} workers..."
    log_info "This may take several hours for 230k+ samples..."
    # Build the argument list as an array so paths containing spaces survive
    # (the old space-joined string relied on unquoted word-splitting).
    local cache_args=(--output_dir "$CACHE_DIR" --num_workers "$CACHE_WORKERS")
    if [ -n "$MAX_CACHE_SAMPLES" ]; then
        cache_args+=(--max_samples "$MAX_CACHE_SAMPLES")
    fi
    python scripts/cache_parallel.py "${cache_args[@]}" || {
        log_error "Cache generation failed"
        exit 1
    }

    # Validation is best-effort: a validation failure should not kill setup.
    log_info "Validating cache..."
    python scripts/validate_cache_v2.py --cache_dir "$CACHE_DIR" --sample_size 100 || true

    local final=("$CACHE_DIR"/sample_*.pt)
    [ -e "${final[0]}" ] || final=()
    FINAL_CACHE=${#final[@]}
    log_success "Cache generation complete: ${FINAL_CACHE} samples"
}
#===============================================================================
# Step 6: Launch Training
#===============================================================================
# Launch training via accelerate after verifying that enough cached samples
# exist. Exits non-zero when the cache looks empty or too small.
launch_training() {
    log_step "Step 6: Launching Training"
    cd "$APOLLO_DIR"
    source venv/bin/activate

    # Count cached samples with a glob instead of parsing `ls` output.
    local cached=("$CACHE_DIR"/sample_*.pt)
    [ -e "${cached[0]}" ] || cached=()
    CACHE_COUNT=${#cached[@]}
    if [ "$CACHE_COUNT" -lt 100 ]; then
        log_error "Not enough cache files (${CACHE_COUNT}). Run cache generation first."
        exit 1
    fi

    log_info "Starting training with ${CACHE_COUNT} cached samples..."
    log_info " Batch size: ${BATCH_SIZE}"
    log_info " Epochs: ${NUM_EPOCHS}"
    log_info " GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')"

    # Hyperparameters below are fixed except batch size / epoch count,
    # which come from the BATCH_SIZE / NUM_EPOCHS env vars.
    accelerate launch train.py \
        --epochs "$NUM_EPOCHS" \
        --batch_size "$BATCH_SIZE" \
        --grad_accum_steps 4 \
        --learning_rate 1e-4 \
        --warmup_ratio 0.1 \
        --max_grad_norm 1.0 \
        --mixed_precision bf16 \
        --max_seq_len 8192 \
        --horizons_seconds 60 180 300 600 1800 3600 7200 \
        --quantiles 0.1 0.5 0.9 \
        --num_workers 16 \
        --pin_memory \
        --val_split 0.1 \
        --val_every 5000 \
        --save_every 5000 \
        --log_every 100
}
#===============================================================================
# Step 7: Create Environment File
#===============================================================================
# Write the connection settings to ${APOLLO_DIR}/.env so later tooling can
# load them. The file holds secrets (DB passwords, HF token), so it is
# restricted to owner read/write.
create_env_file() {
    log_info "Creating .env file..."
    cat << EOF > "${APOLLO_DIR}/.env"
# ClickHouse
CLICKHOUSE_HOST=${CLICKHOUSE_HOST}
CLICKHOUSE_PORT=${CLICKHOUSE_PORT}
CLICKHOUSE_HTTP_PORT=${CLICKHOUSE_HTTP_PORT}
CLICKHOUSE_USER=${CLICKHOUSE_USER}
CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD}
CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE}
# Neo4j
NEO4J_URI=${NEO4J_URI}
NEO4J_USER=${NEO4J_USER}
NEO4J_PASSWORD=${NEO4J_PASSWORD}
# Paths
APOLLO_DATA_DIR=${DATA_DIR}
APOLLO_CACHE_DIR=${CACHE_DIR}
# Hugging Face (set your token here)
HF_TOKEN=${HF_TOKEN:-}
EOF
    # Lock down permissions: the file contains credentials.
    chmod 600 "${APOLLO_DIR}/.env"
    log_success "Environment file created at ${APOLLO_DIR}/.env"
}
#===============================================================================
# Main Execution
#===============================================================================
# Top-level dispatcher: prints a summary banner, then runs either the single
# step named by --step or the full install→download→ingest→cache→train
# pipeline (with a confirmation prompt unless --yes was given).
main() {
echo ""
echo "╔═══════════════════════════════════════════════════════════════╗"
echo "β•‘ Apollo Training Server - Complete Setup β•‘"
echo "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•"
echo ""
echo " Apollo Directory: ${APOLLO_DIR}"
echo " Data Directory: ${DATA_DIR}"
echo " Cache Directory: ${CACHE_DIR}"
echo " Epochs: ${EPOCHS[*]}"
echo ""
# Warn once up front if the Hugging Face token is missing.
if [ -z "$HF_TOKEN" ]; then
log_warn "HF_TOKEN not set. Downloads may fail."
log_info "Set it with: export HF_TOKEN=your_huggingface_token"
echo ""
fi
# Dispatch on the --step value parsed earlier; empty string means "all".
case "$STEP" in
install-deps)
install_dependencies
install_python_deps
create_env_file
;;
download-epochs)
download_epochs
;;
ingest-epochs)
ingest_epochs
;;
generate-cache)
generate_cache
;;
train)
launch_training
;;
""|all)
# Full pipeline: confirm first unless --yes/-y was passed.
if [ "$SKIP_CONFIRM" = false ]; then
echo "This will run the complete setup pipeline:"
echo " 1. Install system dependencies (ClickHouse, Neo4j)"
echo " 2. Install Python dependencies"
echo " 3. Download epochs ${EPOCHS[*]}"
echo " 4. Ingest data into databases"
echo " 5. Generate training cache"
echo " 6. Launch training"
echo ""
read -p "Continue? [y/N] " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
log_info "Aborted."
exit 0
fi
fi
install_dependencies
install_python_deps
create_env_file
download_epochs
ingest_epochs
generate_cache
launch_training
;;
*)
log_error "Unknown step: $STEP"
echo "Valid steps: install-deps, download-epochs, ingest-epochs, generate-cache, train, all"
exit 1
;;
esac
echo ""
log_success "Setup complete!"
echo ""
echo "Useful commands:"
echo " source venv/bin/activate # Activate Python environment"
echo " ./scripts/check_status.sh # Check system status"
echo " accelerate launch train.py # Start training"
echo ""
}
# Entry point — forward the CLI args so `main` can report/act on them.
main "$@"