#!/bin/bash
# Multi-GPU AMP Flow Matching Training Launch Script
# This script launches distributed training across 4 H100 GPUs.
#
# Fail fast: abort on unhandled command errors (-e), unset variables (-u),
# and failures in any pipeline stage (-o pipefail), so a broken step cannot
# silently fall through to the "Training Complete" summary below.
set -euo pipefail

echo "=== Launching Multi-GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using 4 H100 GPUs for distributed training"
echo "Using ALL available peptide embeddings and UniProt data"
echo "EXTENDED TRAINING: 5000 iterations with CFG support"
echo ""
# --- Preflight checks ----------------------------------------------------
# Verify every artifact the training run depends on, failing early with a
# pointer to the script that produces the missing piece.
echo "Checking required files..."

# Compressor/decompressor weights come from compressor_with_embeddings.py.
if [ ! -f "final_compressor_model.pth" ]; then
  echo "❌ Missing final_compressor_model.pth"
  echo "Please run compressor_with_embeddings.py first"
  exit 1
fi

if [ ! -f "final_decompressor_model.pth" ]; then
  echo "❌ Missing final_decompressor_model.pth"
  echo "Please run compressor_with_embeddings.py first"
  exit 1
fi

# Peptide embeddings come from final_sequence_encoder.py.
EMBED_DIR="/data2/edwardsun/flow_project/peptide_embeddings"
if [ ! -d "$EMBED_DIR" ]; then
  echo "❌ Missing $EMBED_DIR/ directory"
  echo "Please run final_sequence_encoder.py first"
  exit 1
fi

# Prefer the consolidated embedding file; warn (don't fail) if absent,
# since individual per-peptide files are an accepted fallback.
if [ ! -f "$EMBED_DIR/all_peptide_embeddings.pt" ]; then
  echo "⚠️ Warning: all_peptide_embeddings.pt not found"
  echo "Will use individual embedding files instead"
else
  echo "✓ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# Ensure at least one .pt file exists. Use compgen -G glob matching rather
# than parsing ls output (ShellCheck SC2012): it is quiet, safe with
# unusual filenames, and returns non-zero when the glob matches nothing.
if ! compgen -G "$EMBED_DIR/*.pt" > /dev/null; then
  echo "❌ No .pt files found in $EMBED_DIR/ directory"
  echo "Please run final_sequence_encoder.py first"
  exit 1
fi

echo "✓ All required files found!"
echo ""
# --- Distributed-training environment ------------------------------------
# NCCL settings for multi-GPU communication:
#   NCCL_DEBUG=INFO      - verbose NCCL logging (aids debugging launches)
#   NCCL_IB_DISABLE=0    - keep the InfiniBand transport enabled
#   NCCL_P2P_DISABLE=0   - keep GPU peer-to-peer transport enabled
export NCCL_DEBUG=INFO \
       NCCL_IB_DISABLE=0 \
       NCCL_P2P_DISABLE=0
# --- Announce run configuration ------------------------------------------
echo "Starting distributed training with torchrun..."
# Quoted heredoc: literal text, no expansion; output matches the original
# echo-per-line banner byte for byte (trailing blank line included).
cat <<'EOF'
Configuration (FULL DATA TRAINING):
 - Number of GPUs: 4
 - Batch size per GPU: 64
 - Total batch size: 256
 - Total iterations: 5,000
 - Data: ALL peptide embeddings + ALL UniProt data
 - Estimated time: ~30-45 minutes (4x faster than single GPU)

EOF
# --- Launch distributed training -----------------------------------------
# torchrun spawns 4 worker processes (one per GPU) on this single node,
# rendezvousing via localhost:29500.
# Check the exit status explicitly: previously a failed run fell through to
# the "Training Complete" summary and the script still exited 0.
if ! torchrun \
    --nproc_per_node=4 \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=localhost \
    --master_port=29500 \
    amp_flow_training_multi_gpu.py; then
  echo "❌ Distributed training failed (torchrun returned non-zero)" >&2
  exit 1
fi
# --- Post-run summary -----------------------------------------------------
# List the expected output artifacts and suggested follow-up steps.
# (Removed a stray trailing '|' after the final echo, which left an
# unterminated pipeline — a syntax error at end of file.)
echo ""
echo "=== Training Complete with FULL DATA ==="
echo "Check for output files:"
echo " - amp_flow_model_final_full_data.pth (final model with full data)"
echo " - amp_flow_checkpoint_full_data_step_*.pth (checkpoints)"
echo ""
echo "Next steps:"
echo "1. Test the model: python generate_amps.py"
echo "2. If successful, increase iterations for full training"
echo "3. Implement reflow for 1-step generation"
echo "4. Add conditioning for toxicity"