FlowAMP / launch_full_data_training.sh

Initial FlowAMP upload: Complete project with all essential files

370f342 5 months ago

4.42 kB

	#!/bin/bash

	# Launcher for the single-GPU AMP flow-matching training run on the full
	# dataset. The rest of this script verifies the model/data files, pins the
	# job to GPU 3 through CUDA_VISIBLE_DEVICES, and then starts
	# amp_flow_training_single_gpu_full_data.py.
	#
	# NOTE(review): the banner text below is static; the iteration count and the
	# optimizations it mentions are not passed to the Python script from here, so
	# confirm them against the training script's own configuration.

	printf '%s\n' \
	  "=== Launching Optimized Single GPU AMP Flow Matching Training with FULL DATA ===" \
	  "Using GPU 3 for training (other GPUs are busy)" \
	  "Using ALL available peptide embeddings and UniProt data" \
	  "OVERNIGHT TRAINING: 15000 iterations with CFG support and H100 optimizations" \
	  ""

	# Pre-flight: make sure every model/data artifact the training run needs is
	# present before anything expensive starts.
	echo "Checking required files..."

	# Print a two-line report (what is missing, then a hint) and stop the
	# launcher with exit status 1.
	#   $1 - "missing" message
	#   $2 - hint line shown underneath
	abort_missing() {
	  echo "$1"
	  echo "$2"
	  exit 1
	}

	# Compressor / decompressor weights are looked up relative to the current
	# working directory.
	[ -f "final_compressor_model.pth" ] ||
	  abort_missing "❌ Missing final_compressor_model.pth" \
	    "Please run compressor_with_embeddings.py first"

	[ -f "final_decompressor_model.pth" ] ||
	  abort_missing "❌ Missing final_decompressor_model.pth" \
	    "Please run compressor_with_embeddings.py first"

	[ -d "/data2/edwardsun/flow_project/peptide_embeddings/" ] ||
	  abort_missing "❌ Missing /data2/edwardsun/flow_project/peptide_embeddings/ directory" \
	    "Please run final_sequence_encoder.py first"

	# The combined embeddings file is optional: its absence only produces a
	# warning and the launcher keeps going.
	if [ -f "/data2/edwardsun/flow_project/peptide_embeddings/all_peptide_embeddings.pt" ]; then
	  echo "✓ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
	else
	  echo "⚠️ Warning: all_peptide_embeddings.pt not found"
	  echo "Will use individual embedding files instead"
	fi

	# The processed UniProt JSON is mandatory.
	[ -f "/data2/edwardsun/flow_project/test_uniprot_processed/uniprot_processed_data.json" ] ||
	  abort_missing "❌ Missing /data2/edwardsun/flow_project/test_uniprot_processed/uniprot_processed_data.json" \
	    "This contains ALL UniProt data for CFG training"
	echo "✓ Found uniprot_processed_data.json (3.4GB - ALL UniProt data)"

	echo "✓ All required files found!"
	echo ""

	# Restrict CUDA to device index 3 for everything started from this shell.
	export CUDA_VISIBLE_DEVICES=3

	# cuDNN v8 API switches exported for the training process
	# (described as "H100 optimizations" by the original author).
	export TORCH_CUDNN_V8_API_ENABLED=1 TORCH_CUDNN_V8_API_DISABLED=0

	# Human-readable summary of the run.
	# NOTE(review): these values are static text and are not passed to the
	# Python script from this launcher; verify them against the training
	# script's actual configuration.
	printf '%s\n' \
	  "=== Optimized Training Configuration ===" \
	  " - GPU: 3 (CUDA_VISIBLE_DEVICES=3)" \
	  " - Batch size: 96 (optimized based on profiling)" \
	  " - Total iterations: 6,000" \
	  " - Mixed precision: BF16 (H100 optimized)" \
	  " - Learning rate: 4e-4 -> 2e-4 (cosine annealing)" \
	  " - Warmup steps: 5,000" \
	  " - Gradient clipping: 1.0" \
	  " - Weight decay: 0.01" \
	  " - Data workers: 16" \
	  " - CFG dropout: 15%" \
	  " - Validation: Every 10,000 steps" \
	  " - Checkpoints: Every 1,000 epochs" \
	  " - Estimated time: ~8-10 hours (overnight training)" \
	  ""

	# Check GPU memory and capabilities
	echo "Checking GPU capabilities..."

	# Print name, total/free memory (MB) and the free-memory percentage for each
	# GPU line returned by nvidia-smi. Purely informational: a missing
	# nvidia-smi binary or unparsable memory values never abort the launcher.
	# Outputs: one four-line report per GPU on stdout; warnings on stderr.
	# Returns: 0 when nvidia-smi is unavailable, otherwise the pipeline status.
	# NOTE(review): the query has no device filter, so every GPU nvidia-smi
	# reports is listed, not only the one selected through
	# CUDA_VISIBLE_DEVICES - confirm that is intended.
	report_gpu_memory() {
	  local gpu_name mem_total mem_free

	  if ! command -v nvidia-smi >/dev/null 2>&1; then
	    echo " Warning: nvidia-smi not found; skipping GPU memory report" >&2
	    return 0
	  fi

	  nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits |
	    while IFS=, read -r gpu_name mem_total mem_free; do
	      # CSV fields are separated by ", " - drop the padding around numbers.
	      mem_total="${mem_total//[[:space:]]/}"
	      mem_free="${mem_free//[[:space:]]/}"

	      echo " GPU: $gpu_name"
	      echo " Total memory: ${mem_total}MB"
	      echo " Free memory: ${mem_free}MB"

	      # Only do the arithmetic on plain positive integers: this avoids a
	      # division by zero and errors on placeholder values such as "[N/A]".
	      # The 10# prefix keeps leading zeros from being read as octal.
	      if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_free" =~ ^[0-9]+$ ]] &&
	        (( 10#$mem_total > 0 )); then
	        echo " Available: $(( 10#$mem_free * 100 / 10#$mem_total ))%"
	      else
	        echo " Available: unknown"
	      fi
	    done
	}

	report_gpu_memory

	echo ""

	# Launch optimized training
	echo "Starting optimized single GPU training on GPU 3 with FULL DATA..."
	echo ""

	# Launch training with optional wandb logging
	# Uncomment the following line if you want to use wandb logging:
	# python amp_flow_training_single_gpu_full_data.py --use_wandb

	# Standard training without wandb.
	# The exit status is captured so that a crashed or interrupted run is not
	# followed by the "Training Complete" summary and a zero exit code.
	python amp_flow_training_single_gpu_full_data.py
	train_status=$?
	if [ "$train_status" -ne 0 ]; then
	  echo "" >&2
	  echo "❌ Training failed (exit code ${train_status}); see the output above" >&2
	  exit "$train_status"
	fi

	# NOTE(review): the summary below is static text. It says batch size 128
	# and "checkpoints every 1000 epochs", while the configuration printed
	# earlier in this script says batch size 96 and the checkpoint file name
	# uses "step" - confirm against the training script which values are right.
	echo ""
	echo "=== Optimized Overnight Training Complete with FULL DATA ==="
	echo "Check for output files:"
	echo " - amp_flow_model_best_optimized.pth (best validation model)"
	echo " - amp_flow_model_final_optimized.pth (final model)"
	echo " - amp_flow_checkpoint_optimized_step_*.pth (checkpoints every 1000 epochs)"
	echo ""
	echo "Training optimizations applied:"
	echo " ✓ Mixed precision (BF16) for ~30-50% speedup"
	echo " ✓ Increased batch size (128) for better H100 utilization"
	echo " ✓ Optimized learning rate schedule with proper warmup"
	echo " ✓ Gradient clipping for training stability"
	echo " ✓ CFG dropout for better guidance"
	echo " ✓ Validation monitoring and early stopping"
	echo " ✓ PyTorch 2.x compilation for speedup"
	echo ""
	echo "Next steps:"
	echo "1. Test the optimized model: python generate_amps.py"
	echo "2. Compare performance with previous model"
	echo "3. Implement reflow for 1-step generation"
	echo "4. Add conditioning for toxicity"
	echo "5. Fine-tune on specific AMP properties"