#!/bin/bash
# Multi-GPU AMP Flow Matching Training Launch Script
# This script launches distributed training across 4 H100 GPUs.
#
# Fail fast: abort on unhandled command errors (-e), unset variables (-u),
# and failures in any pipeline stage (-o pipefail), so a broken step cannot
# silently fall through to the "Training Complete" summary below.
set -euo pipefail

echo "=== Launching Multi-GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using 4 H100 GPUs for distributed training"
echo "Using ALL available peptide embeddings and UniProt data"
echo "EXTENDED TRAINING: 5000 iterations with CFG support"
echo ""
# --- Preflight checks ----------------------------------------------------
# Verify every artifact the training run depends on, failing early with a
# pointer to the script that produces the missing piece.
echo "Checking required files..."

# Compressor/decompressor weights come from compressor_with_embeddings.py.
if [ ! -f "final_compressor_model.pth" ]; then
  echo "❌ Missing final_compressor_model.pth"
  echo "Please run compressor_with_embeddings.py first"
  exit 1
fi

if [ ! -f "final_decompressor_model.pth" ]; then
  echo "❌ Missing final_decompressor_model.pth"
  echo "Please run compressor_with_embeddings.py first"
  exit 1
fi

# Peptide embeddings come from final_sequence_encoder.py.
EMBED_DIR="/data2/edwardsun/flow_project/peptide_embeddings"
if [ ! -d "$EMBED_DIR" ]; then
  echo "❌ Missing $EMBED_DIR/ directory"
  echo "Please run final_sequence_encoder.py first"
  exit 1
fi

# Prefer the consolidated embedding file; warn (don't fail) if absent,
# since individual per-peptide files are an accepted fallback.
if [ ! -f "$EMBED_DIR/all_peptide_embeddings.pt" ]; then
  echo "⚠️ Warning: all_peptide_embeddings.pt not found"
  echo "Will use individual embedding files instead"
else
  echo "✓ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# Ensure at least one .pt file exists. Use compgen -G glob matching rather
# than parsing ls output (ShellCheck SC2012): it is quiet, safe with
# unusual filenames, and returns non-zero when the glob matches nothing.
if ! compgen -G "$EMBED_DIR/*.pt" > /dev/null; then
  echo "❌ No .pt files found in $EMBED_DIR/ directory"
  echo "Please run final_sequence_encoder.py first"
  exit 1
fi

echo "✓ All required files found!"
echo ""
# --- Distributed-training environment ------------------------------------
# NCCL settings for multi-GPU communication:
#   NCCL_DEBUG=INFO      - verbose NCCL logging (aids debugging launches)
#   NCCL_IB_DISABLE=0    - keep the InfiniBand transport enabled
#   NCCL_P2P_DISABLE=0   - keep GPU peer-to-peer transport enabled
export NCCL_DEBUG=INFO \
       NCCL_IB_DISABLE=0 \
       NCCL_P2P_DISABLE=0
# --- Announce run configuration ------------------------------------------
echo "Starting distributed training with torchrun..."
# Quoted heredoc: literal text, no expansion; output matches the original
# echo-per-line banner byte for byte (trailing blank line included).
cat <<'EOF'
Configuration (FULL DATA TRAINING):
 - Number of GPUs: 4
 - Batch size per GPU: 64
 - Total batch size: 256
 - Total iterations: 5,000
 - Data: ALL peptide embeddings + ALL UniProt data
 - Estimated time: ~30-45 minutes (4x faster than single GPU)

EOF
# --- Launch distributed training -----------------------------------------
# torchrun spawns 4 worker processes (one per GPU) on this single node,
# rendezvousing via localhost:29500.
# Check the exit status explicitly: previously a failed run fell through to
# the "Training Complete" summary and the script still exited 0.
if ! torchrun \
    --nproc_per_node=4 \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=localhost \
    --master_port=29500 \
    amp_flow_training_multi_gpu.py; then
  echo "❌ Distributed training failed (torchrun returned non-zero)" >&2
  exit 1
fi
# --- Post-run summary -----------------------------------------------------
# List the expected output artifacts and suggested follow-up steps.
# (Removed a stray trailing '|' after the final echo, which left an
# unterminated pipeline — a syntax error at end of file.)
echo ""
echo "=== Training Complete with FULL DATA ==="
echo "Check for output files:"
echo " - amp_flow_model_final_full_data.pth (final model with full data)"
echo " - amp_flow_checkpoint_full_data_step_*.pth (checkpoints)"
echo ""
echo "Next steps:"
echo "1. Test the model: python generate_amps.py"
echo "2. If successful, increase iterations for full training"
echo "3. Implement reflow for 1-step generation"
echo "4. Add conditioning for toxicity"