#!/usr/bin/env bash
#
# Launch multi-GPU AMP flow-matching training (4 H100s, torchrun) on the
# full peptide-embedding + UniProt dataset.
#
# Strict mode: abort on any command failure, unset variable, or failed
# pipeline stage — otherwise the script would happily print the
# "Training Complete" banner even after torchrun dies.
set -euo pipefail

echo "=== Launching Multi-GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using 4 H100 GPUs for distributed training"
echo "Using ALL available peptide embeddings and UniProt data"
echo "EXTENDED TRAINING: 5000 iterations with CFG support"
echo ""
| |
|
| | |
echo "Checking required files..."

# Embedding artifacts live outside the repo on shared storage.
readonly EMB_DIR="/data2/edwardsun/flow_project/peptide_embeddings"

#######################################
# Abort with a clear message when a prerequisite file is absent.
# Arguments: $1 - required file path
#            $2 - script that produces it
# Outputs:   error text on stderr
# Returns:   exits 1 on missing file
#######################################
require_file() {
  local path=$1 producer=$2
  if [[ ! -f "$path" ]]; then
    echo "❌ Missing $path" >&2
    echo "Please run $producer first" >&2
    exit 1
  fi
}

require_file "final_compressor_model.pth" "compressor_with_embeddings.py"
require_file "final_decompressor_model.pth" "compressor_with_embeddings.py"

if [[ ! -d "$EMB_DIR/" ]]; then
  echo "❌ Missing $EMB_DIR/ directory" >&2
  echo "Please run final_sequence_encoder.py first" >&2
  exit 1
fi

# The consolidated tensor is preferred but optional — the trainer can fall
# back to per-peptide embedding files.
if [[ ! -f "$EMB_DIR/all_peptide_embeddings.pt" ]]; then
  echo "⚠️ Warning: all_peptide_embeddings.pt not found"
  echo "Will use individual embedding files instead"
else
  echo "✅ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# At least one .pt embedding file must exist for training to proceed.
if [[ -z "$(ls -A "$EMB_DIR"/*.pt 2>/dev/null)" ]]; then
  echo "❌ No .pt files found in $EMB_DIR/ directory" >&2
  echo "Please run final_sequence_encoder.py first" >&2
  exit 1
fi

echo "✅ All required files found!"
echo ""
| |
|
| | |
# NCCL tuning for single-node multi-GPU training:
#   DEBUG=INFO        — verbose collective logs for diagnosing hangs
#   IB_DISABLE=0      — keep InfiniBand transport enabled
#   P2P_DISABLE=0     — keep direct GPU peer-to-peer transfers enabled
export NCCL_DEBUG=INFO NCCL_IB_DISABLE=0 NCCL_P2P_DISABLE=0
| |
|
| | |
echo "Starting distributed training with torchrun..."
# Quoted here-doc: emit the run configuration verbatim, no expansions.
cat <<'EOF'
Configuration (FULL DATA TRAINING):
 - Number of GPUs: 4
 - Batch size per GPU: 64
 - Total batch size: 256
 - Total iterations: 5,000
 - Data: ALL peptide embeddings + ALL UniProt data
 - Estimated time: ~30-45 minutes (4x faster than single GPU)

EOF
| |
|
| | |
| | torchrun \ |
| | --nproc_per_node=4 \ |
| | --nnodes=1 \ |
| | --node_rank=0 \ |
| | --master_addr=localhost \ |
| | --master_port=29500 \ |
| | amp_flow_training_multi_gpu.py |
| |
|
# Quoted here-doc keeps the checkpoint glob (*) literal and avoids
# a long chain of echo statements.
cat <<'EOF'

=== Training Complete with FULL DATA ===
Check for output files:
 - amp_flow_model_final_full_data.pth (final model with full data)
 - amp_flow_checkpoint_full_data_step_*.pth (checkpoints)

Next steps:
1. Test the model: python generate_amps.py
2. If successful, increase iterations for full training
3. Implement reflow for 1-step generation
4. Add conditioning for toxicity
EOF