FlowFinal / src /generate_amps.py

Add generate_amps.py

37158e8 verified 6 months ago

17.4 kB

	import torch
	import torch.nn.functional as F
	import numpy as np
	from tqdm import tqdm
	import os
	from datetime import datetime
	#d Import torchdiffeq for proper ODE solving
	try:
	from torchdiffeq import odeint
	TORCHDIFFEQ_AVAILABLE = True
	print("✓ torchdiffeq available for proper ODE solving")
	except ImportError:
	TORCHDIFFEQ_AVAILABLE = False
	print("⚠️ torchdiffeq not available, using manual Euler integration")

	# Import your components
	from compressor_with_embeddings import Compressor, Decompressor
	from final_flow_model import AMPFlowMatcherCFGConcat, AMPProtFlowPipelineCFG

	class AMPGenerator:
	"""
	Generate AMP samples using trained ProtFlow model.
	"""

	def __init__(self, model_path, device='cuda'):
	self.device = device

	# Load models
	self._load_models(model_path)

	# Load preprocessing statistics
	self.stats = torch.load('normalization_stats.pt', map_location=device)

	def _load_models(self, model_path):
	"""Load trained models."""
	print("Loading trained models...")

	# Load compressor and decompressor
	self.compressor = Compressor().to(self.device)
	self.decompressor = Decompressor().to(self.device)

	self.compressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_compressor_model.pth', map_location=self.device))
	self.decompressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_decompressor_model.pth', map_location=self.device))

	# Load flow matching model with CFG
	self.flow_model = AMPFlowMatcherCFGConcat(
	hidden_dim=480,
	compressed_dim=80, # 1280 // 16
	n_layers=12,
	n_heads=16,
	dim_ff=3072,
	max_seq_len=25,
	use_cfg=True
	).to(self.device)

	checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)

	# Handle PyTorch compilation wrapper
	state_dict = checkpoint['flow_model_state_dict']
	new_state_dict = {}

	for key, value in state_dict.items():
	# Remove _orig_mod prefix if present
	if key.startswith('_orig_mod.'):
	new_key = key[10:] # Remove '_orig_mod.' prefix
	else:
	new_key = key
	new_state_dict[new_key] = value

	self.flow_model.load_state_dict(new_state_dict)

	print(f"✓ All models loaded successfully from step {checkpoint['step']}!")
	print(f" Loss at checkpoint: {checkpoint['loss']:.6f}")

	# Initialize ODE solving capabilities
	if TORCHDIFFEQ_AVAILABLE:
	print("✓ Enhanced with proper ODE solving (torchdiffeq)")
	else:
	print("⚠️ Using fallback Euler integration")

	def _create_ode_func(self, cfg_scale=7.5):
	"""Create ODE function for torchdiffeq integration."""

	def ode_func(t, x):
	"""
	ODE function: dx/dt = v_theta(x, t)

	Args:
	t: scalar time (single float)
	x: state tensor [BLD] (flattened)
	Returns:
	dx/dt: derivative [BLD] (flattened)
	"""
	# Reshape x back to [B, L, D]
	batch_size, seq_len, dim = self.current_shape
	x = x.view(batch_size, seq_len, dim)

	# Create time tensor for batch
	t_tensor = torch.full((batch_size,), t, device=self.device, dtype=x.dtype)

	# Compute vector field with CFG
	if cfg_scale > 0:
	# With AMP condition
	amp_labels = torch.full((batch_size,), 0, device=self.device) # 0 = AMP
	vt_cond = self.flow_model(x, t_tensor, labels=amp_labels)

	# Without condition (mask)
	mask_labels = torch.full((batch_size,), 2, device=self.device) # 2 = Mask
	vt_uncond = self.flow_model(x, t_tensor, labels=mask_labels)

	# CFG interpolation
	vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
	else:
	# No CFG, use mask label
	mask_labels = torch.full((batch_size,), 2, device=self.device)
	vt = self.flow_model(x, t_tensor, labels=mask_labels)

	# Return flattened derivative
	return vt.view(-1)

	return ode_func

	def generate_amps(self, num_samples=100, num_steps=25, batch_size=32, cfg_scale=7.5,
	ode_method='dopri5', rtol=1e-5, atol=1e-6):
	"""
	Generate AMP samples using flow matching with CFG and improved ODE solving.

	Args:
	num_samples: Number of AMP samples to generate
	num_steps: Number of ODE solving steps (25 for good quality, 1 for reflow)
	batch_size: Batch size for generation
	cfg_scale: CFG guidance scale (higher = stronger conditioning)
	ode_method: ODE solver method ('dopri5', 'rk4', 'euler', 'adaptive_heun')
	rtol: Relative tolerance for adaptive solvers
	atol: Absolute tolerance for adaptive solvers
	"""
	method_str = f"{ode_method} ODE solver" if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler' else "manual Euler integration"
	print(f"Generating {num_samples} AMP samples with {method_str} (CFG scale: {cfg_scale})...")
	if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler':
	print(f" Method: {ode_method}, rtol={rtol}, atol={atol}")

	self.flow_model.eval()
	self.compressor.eval()
	self.decompressor.eval()

	all_generated = []

	with torch.no_grad():
	for i in tqdm(range(0, num_samples, batch_size), desc="Generating with improved ODE"):
	current_batch = min(batch_size, num_samples - i)

	# Sample random noise (starting point at t=1)
	eps = torch.randn(current_batch, 25, 80, device=self.device) # [B, L', COMP_DIM]

	# Choose ODE solving method
	if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler':
	# Use proper ODE solver
	try:
	# Store shape for ODE function
	self.current_shape = eps.shape

	# Create ODE function
	ode_func = self._create_ode_func(cfg_scale=cfg_scale)

	# Time span: from t=1 (noise) to t=0 (data)
	t_span = torch.tensor([1.0, 0.0], device=self.device, dtype=eps.dtype)

	# Flatten initial condition for torchdiffeq
	y0 = eps.view(-1)

	# Solve ODE with proper adaptive solver
	if ode_method in ['dopri5', 'adaptive_heun']:
	# Adaptive solvers
	solution = odeint(
	ode_func, y0, t_span,
	method=ode_method,
	rtol=rtol,
	atol=atol,
	options={'max_num_steps': 1000}
	)
	else:
	# Fixed-step solvers
	solution = odeint(
	ode_func, y0, t_span,
	method=ode_method,
	options={'step_size': 0.04} # 1/25 for 25 steps
	)

	# Get final solution (at t=0)
	xt = solution[-1].view(self.current_shape)

	except Exception as e:
	print(f"⚠️ ODE solving failed for batch {i//batch_size + 1}: {e}")
	print("Falling back to Euler method...")
	# Fall through to Euler method
	xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)
	else:
	# Use manual Euler integration (original method)
	xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)

	# Decompress to get embeddings
	decompressed = self.decompressor(xt) # [B, L, ESM_DIM]

	# Apply reverse preprocessing
	m, s, mn, mx = self.stats['mean'], self.stats['std'], self.stats['min'], self.stats['max']
	decompressed = decompressed * (mx - mn + 1e-8) + mn
	decompressed = decompressed * s + m

	all_generated.append(decompressed.cpu())

	# Concatenate all batches
	generated_embeddings = torch.cat(all_generated, dim=0)

	print(f"✓ Generated {generated_embeddings.shape[0]} AMP embeddings")
	print(f" Shape: {generated_embeddings.shape}")
	print(f" Stats - Mean: {generated_embeddings.mean():.4f}, Std: {generated_embeddings.std():.4f}")

	return generated_embeddings

	def _generate_with_euler(self, eps, current_batch, cfg_scale, num_steps):
	"""Fallback Euler integration method (original implementation)."""
	xt = eps.clone()
	amp_labels = torch.full((current_batch,), 0, device=self.device) # 0 = AMP
	mask_labels = torch.full((current_batch,), 2, device=self.device) # 2 = Mask

	for step in range(num_steps):
	t = torch.ones(current_batch, device=self.device) * (1.0 - step/num_steps)

	# CFG: Generate with condition and without condition
	if cfg_scale > 0:
	# With AMP condition
	vt_cond = self.flow_model(xt, t, labels=amp_labels)

	# Without condition (mask)
	vt_uncond = self.flow_model(xt, t, labels=mask_labels)

	# CFG interpolation
	vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
	else:
	# No CFG, use mask label
	vt = self.flow_model(xt, t, labels=mask_labels)

	# Euler step for backward integration (t: 1 -> 0)
	dt = -1.0 / num_steps
	xt = xt + vt * dt

	return xt

	def compare_ode_methods(self, num_samples=20, cfg_scale=7.5):
	"""
	Compare different ODE solving methods for quality assessment.
	"""
	if not TORCHDIFFEQ_AVAILABLE:
	print("⚠️ torchdiffeq not available, cannot compare ODE methods")
	return self.generate_amps(num_samples=num_samples, cfg_scale=cfg_scale)

	methods = ['euler', 'rk4', 'dopri5', 'adaptive_heun']
	results = {}

	print("🔬 Comparing ODE solving methods...")

	for method in methods:
	print(f"\n--- Testing {method} ---")
	try:
	start_time = torch.cuda.Event(enable_timing=True)
	end_time = torch.cuda.Event(enable_timing=True)

	start_time.record()
	embeddings = self.generate_amps(
	num_samples=num_samples,
	batch_size=10,
	cfg_scale=cfg_scale,
	ode_method=method
	)
	end_time.record()

	torch.cuda.synchronize()
	elapsed_time = start_time.elapsed_time(end_time) / 1000.0 # Convert to seconds

	results[method] = {
	'embeddings': embeddings,
	'time': elapsed_time,
	'mean': embeddings.mean().item(),
	'std': embeddings.std().item(),
	'success': True
	}

	print(f"✓ {method}: {elapsed_time:.2f}s, mean={embeddings.mean():.4f}, std={embeddings.std():.4f}")

	except Exception as e:
	print(f"❌ {method} failed: {e}")
	results[method] = {'success': False, 'error': str(e)}

	return results

	def generate_with_reflow(self, num_samples=100):
	"""
	Generate AMP samples using 1-step reflow (if you have reflow model).
	"""
	print(f"Generating {num_samples} AMP samples with 1-step reflow...")

	# This would use the reflow implementation
	# For now, just use 1-step generation
	return self.generate_amps(num_samples=num_samples, num_steps=1, batch_size=32)

	def main():
	"""Main generation function."""
	print("=== AMP Generation Pipeline with CFG ===")

	# Use the best model from training (lowest validation loss: 0.017183)
	model_path = '/data2/edwardsun/flow_checkpoints/amp_flow_model_best_optimized.pth'

	# Check if checkpoint exists
	try:
	checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
	print(f"✓ Found best model at step {checkpoint['step']} with loss {checkpoint['loss']:.6f}")
	print(f" Global step: {checkpoint['global_step']}")
	print(f" Total samples: {checkpoint['total_samples']:,}")
	except:
	print(f"❌ Best model not found: {model_path}")
	print("Please train the flow matching model first using amp_flow_training.py")
	return

	# Initialize generator
	generator = AMPGenerator(model_path, device='cuda')

	# Test ODE methods comparison if available
	if TORCHDIFFEQ_AVAILABLE:
	print("\n🔬 Comparing ODE solving methods...")
	comparison_results = generator.compare_ode_methods(num_samples=10, cfg_scale=7.5)

	# Use best method for generation
	best_method = 'dopri5' # Recommended method
	print(f"\n🚀 Using {best_method} for main generation...")
	else:
	best_method = 'euler'
	print("\n⚠️ Using fallback Euler integration...")

	# Generate samples with different CFG scales using improved ODE solving
	print("\n1. Generating with CFG scale 0.0 (no conditioning)...")
	samples_no_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=0.0, ode_method=best_method)

	print("\n2. Generating with CFG scale 3.0 (weak conditioning)...")
	samples_weak_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=3.0, ode_method=best_method)

	print("\n3. Generating with CFG scale 7.5 (strong conditioning)...")
	samples_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=7.5, ode_method=best_method)

	print("\n4. Generating with CFG scale 15.0 (very strong conditioning)...")
	samples_very_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=15.0, ode_method=best_method)

	# Create output directory if it doesn't exist
	output_dir = '/data2/edwardsun/generated_samples'
	os.makedirs(output_dir, exist_ok=True)

	# Get today's date for filename
	today = datetime.now().strftime('%Y%m%d')

	# Save generated samples with date
	torch.save(samples_no_cfg, os.path.join(output_dir, f'generated_amps_best_model_no_cfg_{today}.pt'))
	torch.save(samples_weak_cfg, os.path.join(output_dir, f'generated_amps_best_model_weak_cfg_{today}.pt'))
	torch.save(samples_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_strong_cfg_{today}.pt'))
	torch.save(samples_very_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_very_strong_cfg_{today}.pt'))

	print("\n✓ Generation complete!")
	print(f"Generated samples saved (Date: {today}):")
	print(f" - generated_amps_best_model_no_cfg_{today}.pt (no conditioning)")
	print(f" - generated_amps_best_model_weak_cfg_{today}.pt (weak CFG)")
	print(f" - generated_amps_best_model_strong_cfg_{today}.pt (strong CFG)")
	print(f" - generated_amps_best_model_very_strong_cfg_{today}.pt (very strong CFG)")

	print("\nCFG Analysis:")
	print(" - CFG scale 0.0: No conditioning, generates diverse sequences")
	print(" - CFG scale 3.0: Weak AMP conditioning")
	print(" - CFG scale 7.5: Strong AMP conditioning (recommended)")
	print(" - CFG scale 15.0: Very strong AMP conditioning (may be too restrictive)")

	print("\nNext steps:")
	print("1. Decode embeddings back to sequences using ESM-2 decoder")
	print("2. Evaluate with ProtFlow metrics (FPD, MMD, ESM-2 perplexity)")
	print("3. Compare sequences generated with different CFG scales")
	print("4. Evaluate AMP properties (antimicrobial activity, toxicity)")
	if TORCHDIFFEQ_AVAILABLE:
	print(f"5. ✓ Enhanced generation with {best_method} ODE solver")
	else:
	print("5. Install torchdiffeq for improved ODE solving: pip install torchdiffeq")

	if __name__ == "__main__":
	main()