#!/usr/bin/env bash
#
# Launch multi-GPU AMP flow-matching training (4 H100s, torchrun) on the
# full peptide-embedding + UniProt dataset.
#
# Strict mode: abort on any command failure, unset variable, or failed
# pipeline stage — otherwise the script would happily print the
# "Training Complete" banner even after torchrun dies.
set -euo pipefail

echo "=== Launching Multi-GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using 4 H100 GPUs for distributed training"
echo "Using ALL available peptide embeddings and UniProt data"
echo "EXTENDED TRAINING: 5000 iterations with CFG support"
echo ""
| |
|
| | |
echo "Checking required files..."

# Embedding artifacts live outside the repo on shared storage.
readonly EMB_DIR="/data2/edwardsun/flow_project/peptide_embeddings"

#######################################
# Abort with a clear message when a prerequisite file is absent.
# Arguments: $1 - required file path
#            $2 - script that produces it
# Outputs:   error text on stderr
# Returns:   exits 1 on missing file
#######################################
require_file() {
  local path=$1 producer=$2
  if [[ ! -f "$path" ]]; then
    echo "❌ Missing $path" >&2
    echo "Please run $producer first" >&2
    exit 1
  fi
}

require_file "final_compressor_model.pth" "compressor_with_embeddings.py"
require_file "final_decompressor_model.pth" "compressor_with_embeddings.py"

if [[ ! -d "$EMB_DIR/" ]]; then
  echo "❌ Missing $EMB_DIR/ directory" >&2
  echo "Please run final_sequence_encoder.py first" >&2
  exit 1
fi

# The consolidated tensor is preferred but optional — the trainer can fall
# back to per-peptide embedding files.
if [[ ! -f "$EMB_DIR/all_peptide_embeddings.pt" ]]; then
  echo "⚠️ Warning: all_peptide_embeddings.pt not found"
  echo "Will use individual embedding files instead"
else
  echo "✅ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# At least one .pt embedding file must exist for training to proceed.
if [[ -z "$(ls -A "$EMB_DIR"/*.pt 2>/dev/null)" ]]; then
  echo "❌ No .pt files found in $EMB_DIR/ directory" >&2
  echo "Please run final_sequence_encoder.py first" >&2
  exit 1
fi

echo "✅ All required files found!"
echo ""
| |
|
| | |
# NCCL tuning for single-node multi-GPU training:
#   DEBUG=INFO        — verbose collective logs for diagnosing hangs
#   IB_DISABLE=0      — keep InfiniBand transport enabled
#   P2P_DISABLE=0     — keep direct GPU peer-to-peer transfers enabled
export NCCL_DEBUG=INFO NCCL_IB_DISABLE=0 NCCL_P2P_DISABLE=0
| |
|
| | |
echo "Starting distributed training with torchrun..."
# Quoted here-doc: emit the run configuration verbatim, no expansions.
cat <<'EOF'
Configuration (FULL DATA TRAINING):
 - Number of GPUs: 4
 - Batch size per GPU: 64
 - Total batch size: 256
 - Total iterations: 5,000
 - Data: ALL peptide embeddings + ALL UniProt data
 - Estimated time: ~30-45 minutes (4x faster than single GPU)

EOF
| |
|
| | |
| | torchrun \ |
| | --nproc_per_node=4 \ |
| | --nnodes=1 \ |
| | --node_rank=0 \ |
| | --master_addr=localhost \ |
| | --master_port=29500 \ |
| | amp_flow_training_multi_gpu.py |
| |
|
# Quoted here-doc keeps the checkpoint glob (*) literal and avoids
# a long chain of echo statements.
cat <<'EOF'

=== Training Complete with FULL DATA ===
Check for output files:
 - amp_flow_model_final_full_data.pth (final model with full data)
 - amp_flow_checkpoint_full_data_step_*.pth (checkpoints)

Next steps:
1. Test the model: python generate_amps.py
2. If successful, increase iterations for full training
3. Implement reflow for 1-step generation
4. Add conditioning for toxicity
EOF