#!/bin/bash
# Optimized Single GPU AMP Flow Matching Training Launch Script with FULL DATA
# This script launches optimized training on GPU 3 using ALL available data
# Features: Mixed precision (BF16), increased batch size, H100 optimizations
echo "=== Launching Optimized Single GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using GPU 3 for training (other GPUs are busy)"
echo "Using ALL available peptide embeddings and UniProt data"
echo "OVERNIGHT TRAINING: 15000 iterations with CFG support and H100 optimizations"
echo ""
# Check if required files exist
echo "Checking required files..."
if [ ! -f "final_compressor_model.pth" ]; then
echo "β Missing final_compressor_model.pth"
echo "Please run compressor_with_embeddings.py first"
exit 1
fi
if [ ! -f "final_decompressor_model.pth" ]; then
echo "β Missing final_decompressor_model.pth"
echo "Please run compressor_with_embeddings.py first"
exit 1
fi
if [ ! -d "/data2/edwardsun/flow_project/peptide_embeddings/" ]; then
echo "β Missing /data2/edwardsun/flow_project/peptide_embeddings/ directory"
echo "Please run final_sequence_encoder.py first"
exit 1
fi
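# Extra sanity check (an addition, mirroring the checks above): make sure the
# training entry point launched at the end of this script is actually present.
if [ ! -f "amp_flow_training_single_gpu_full_data.py" ]; then
    echo "❌ Missing amp_flow_training_single_gpu_full_data.py"
    exit 1
fi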
# Check for full data files
if [ ! -f "/data2/edwardsun/flow_project/peptide_embeddings/all_peptide_embeddings.pt" ]; then
echo "β οΈ Warning: all_peptide_embeddings.pt not found"
echo "Will use individual embedding files instead"
else
echo "β Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi
if [ ! -f "/data2/edwardsun/flow_project/test_uniprot_processed/uniprot_processed_data.json" ]; then
echo "β Missing /data2/edwardsun/flow_project/test_uniprot_processed/uniprot_processed_data.json"
echo "This contains ALL UniProt data for CFG training"
exit 1
else
echo "β Found uniprot_processed_data.json (3.4GB - ALL UniProt data)"
fi
echo "β All required files found!"
echo ""
# Set CUDA device to GPU 3
export CUDA_VISIBLE_DEVICES=3
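# Note: with CUDA_VISIBLE_DEVICES=3, the process sees physical GPU 3 as cuda:0,
# so the training script should address the device as cuda:0 internally.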
# Enable H100 optimizations
export TORCH_CUDNN_V8_API_ENABLED=1
export TORCH_CUDNN_V8_API_DISABLED=0
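# TORCH_CUDNN_V8_API_ENABLED=1 opts into the cuDNN v8 frontend on older PyTorch
# builds that gate it behind this flag; on newer builds where v8 is the default,
# TORCH_CUDNN_V8_API_DISABLED=0 simply keeps it enabled, so the pair is a
# belt-and-braces setting rather than a contradiction.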
echo "=== Optimized Training Configuration ==="
echo " - GPU: 3 (CUDA_VISIBLE_DEVICES=3)"
echo " - Batch size: 96 (optimized based on profiling)"
echo " - Total iterations: 6,000"
echo " - Mixed precision: BF16 (H100 optimized)"
echo " - Learning rate: 4e-4 -> 2e-4 (cosine annealing)"
echo " - Warmup steps: 5,000"
echo " - Gradient clipping: 1.0"
echo " - Weight decay: 0.01"
echo " - Data workers: 16"
echo " - CFG dropout: 15%"
echo " - Validation: Every 10,000 steps"
echo " - Checkpoints: Every 1,000 epochs"
echo " - Estimated time: ~8-10 hours (overnight training)"
echo ""
# Check GPU memory and capabilities
echo "Checking GPU capabilities..."
nvidia-smi -i 3 --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits | while IFS=, read -r name total free; do
    echo " GPU: $name"
    echo " Total memory: ${total}MB"
    echo " Free memory: ${free}MB"
    echo " Available: $((free * 100 / total))%"
done
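# Optional pre-launch guard (a sketch; the 20000MB floor is an assumption, tune it
# to your model): abort if GPU 3 has too little free memory for batch size 96 at BF16.
FREE_MB=$(nvidia-smi -i 3 --query-gpu=memory.free --format=csv,noheader,nounits)
if [ "${FREE_MB:-0}" -lt 20000 ]; then
    echo "⚠️ Only ${FREE_MB}MB free on GPU 3; training may hit OOM. Aborting."
    exit 1
fi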
echo ""
# Launch optimized training
echo "Starting optimized single GPU training on GPU 3 with FULL DATA..."
echo ""
# Launch training with optional wandb logging
# Uncomment the following line if you want to use wandb logging:
# python amp_flow_training_single_gpu_full_data.py --use_wandb
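# For unattended overnight runs, detaching and logging to a file is another option
# (illustrative only; the log filename is arbitrary):
# nohup python amp_flow_training_single_gpu_full_data.py > flow_training_full_data.log 2>&1 &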
# Standard training without wandb
python amp_flow_training_single_gpu_full_data.py
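# Surface failures instead of printing the completion banner unconditionally
# (a small addition; propagates the trainer's own exit status).
TRAIN_STATUS=$?
if [ $TRAIN_STATUS -ne 0 ]; then
    echo "❌ Training exited with status $TRAIN_STATUS; see the traceback above."
    exit $TRAIN_STATUS
fi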
echo ""
echo "=== Optimized Overnight Training Complete with FULL DATA ==="
echo "Check for output files:"
echo " - amp_flow_model_best_optimized.pth (best validation model)"
echo " - amp_flow_model_final_optimized.pth (final model)"
echo " - amp_flow_checkpoint_optimized_step_*.pth (checkpoints every 1000 epochs)"
echo ""
echo "Training optimizations applied:"
echo " β Mixed precision (BF16) for ~30-50% speedup"
echo " β Increased batch size (128) for better H100 utilization"
echo " β Optimized learning rate schedule with proper warmup"
echo " β Gradient clipping for training stability"
echo " β CFG dropout for better guidance"
echo " β Validation monitoring and early stopping"
echo " β PyTorch 2.x compilation for speedup"
echo ""
echo "Next steps:"
echo "1. Test the optimized model: python generate_amps.py"
echo "2. Compare performance with previous model"
echo "3. Implement reflow for 1-step generation"
echo "4. Add conditioning for toxicity"
echo "5. Fine-tune on specific AMP properties" |