import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import os
from datetime import datetime

# Import torchdiffeq for proper ODE solving
try:
    from torchdiffeq import odeint
    TORCHDIFFEQ_AVAILABLE = True
    print("✓ torchdiffeq available for proper ODE solving")
except ImportError:
    TORCHDIFFEQ_AVAILABLE = False
    print("⚠️ torchdiffeq not available, using manual Euler integration")

# Import project components
from compressor_with_embeddings import Compressor, Decompressor
from final_flow_model import AMPFlowMatcherCFGConcat, AMPProtFlowPipelineCFG


class AMPGenerator:
    """Generate AMP samples with a trained ProtFlow flow-matching model."""

    def __init__(self, model_path, device='cuda'):
        self.device = device
        # Load models
        self._load_models(model_path)
        # Load preprocessing statistics (keys: 'mean', 'std', 'min', 'max')
        self.stats = torch.load('normalization_stats.pt', map_location=device)

    def _load_models(self, model_path):
        """Load the trained compressor, decompressor, and flow model."""
        print("Loading trained models...")

        # Load compressor and decompressor
        self.compressor = Compressor().to(self.device)
        self.decompressor = Decompressor().to(self.device)
        self.compressor.load_state_dict(torch.load(
            '/data2/edwardsun/flow_amp/models/final_compressor_model.pth',
            map_location=self.device))
        self.decompressor.load_state_dict(torch.load(
            '/data2/edwardsun/flow_amp/models/final_decompressor_model.pth',
            map_location=self.device))

        # Load flow matching model with CFG
        self.flow_model = AMPFlowMatcherCFGConcat(
            hidden_dim=480,
            compressed_dim=80,  # 1280 // 16
            n_layers=12,
            n_heads=16,
            dim_ff=3072,
            max_seq_len=25,
            use_cfg=True
        ).to(self.device)

        checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)

        # Strip the torch.compile wrapper prefix ('_orig_mod.') if present
        state_dict = checkpoint['flow_model_state_dict']
        new_state_dict = {}
        for key, value in state_dict.items():
            new_key = key[len('_orig_mod.'):] if key.startswith('_orig_mod.') else key
            new_state_dict[new_key] = value
        self.flow_model.load_state_dict(new_state_dict)

        print(f"✓ All models loaded successfully from step {checkpoint['step']}!")
        print(f"  Loss at checkpoint: {checkpoint['loss']:.6f}")

        # Report ODE solving capabilities
        if TORCHDIFFEQ_AVAILABLE:
            print("✓ Enhanced with proper ODE solving (torchdiffeq)")
        else:
            print("⚠️ Using fallback Euler integration")

    def _create_ode_func(self, cfg_scale=7.5):
        """Create the vector-field callable that torchdiffeq integrates."""
        def ode_func(t, x):
            """
            ODE function: dx/dt = v_theta(x, t).

            Args:
                t: scalar time (0-dim tensor)
                x: flattened state tensor [B*L*D]
            Returns:
                dx/dt: flattened derivative [B*L*D]
            """
            # Reshape x back to [B, L, D]
            batch_size, seq_len, dim = self.current_shape
            x = x.view(batch_size, seq_len, dim)

            # Broadcast the scalar time across the batch
            t_tensor = torch.full((batch_size,), t, device=self.device, dtype=x.dtype)

            # Compute the vector field, with classifier-free guidance if requested
            if cfg_scale > 0:
                # Conditional pass (label 0 = AMP)
                amp_labels = torch.full((batch_size,), 0, device=self.device)
                vt_cond = self.flow_model(x, t_tensor, labels=amp_labels)
                # Unconditional pass (label 2 = mask)
                mask_labels = torch.full((batch_size,), 2, device=self.device)
                vt_uncond = self.flow_model(x, t_tensor, labels=mask_labels)
                # CFG combination: v = v_uncond + s * (v_cond - v_uncond)
                vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
            else:
                # No CFG: use the unconditional (mask) label only
                mask_labels = torch.full((batch_size,), 2, device=self.device)
                vt = self.flow_model(x, t_tensor, labels=mask_labels)

            # torchdiffeq expects the derivative in the same flattened shape as the state
            return vt.view(-1)
        return ode_func
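
    # Note on the flattened-state convention above: torchdiffeq's `odeint`
    # integrates a single tensor state, so [B, L, D] batches are flattened to
    # 1-D before solving and reshaped afterwards via `self.current_shape`.
    # A toy round-trip illustrating the contract (shapes here are illustrative):
    #   y0 = torch.randn(B, L, D).view(-1)                   # flatten
    #   sol = odeint(lambda t, y: -y, y0, torch.tensor([1.0, 0.0]))
    #   x_final = sol[-1].view(B, L, D)                      # unflatten at t=0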

    def generate_amps(self, num_samples=100, num_steps=25, batch_size=32, cfg_scale=7.5,
                      ode_method='dopri5', rtol=1e-5, atol=1e-6):
        """
        Generate AMP samples using flow matching with CFG and improved ODE solving.

        Args:
            num_samples: Number of AMP samples to generate
            num_steps: Number of ODE solving steps (25 for good quality, 1 for reflow)
            batch_size: Batch size for generation
            cfg_scale: CFG guidance scale (higher = stronger conditioning)
            ode_method: ODE solver method ('dopri5', 'rk4', 'euler', 'adaptive_heun')
            rtol: Relative tolerance for adaptive solvers
            atol: Absolute tolerance for adaptive solvers
        """
        use_odeint = TORCHDIFFEQ_AVAILABLE and ode_method != 'euler'
        method_str = f"{ode_method} ODE solver" if use_odeint else "manual Euler integration"
        print(f"Generating {num_samples} AMP samples with {method_str} (CFG scale: {cfg_scale})...")
        if use_odeint:
            print(f"  Method: {ode_method}, rtol={rtol}, atol={atol}")

        self.flow_model.eval()
        self.compressor.eval()
        self.decompressor.eval()

        all_generated = []
        with torch.no_grad():
            for i in tqdm(range(0, num_samples, batch_size), desc="Generating with improved ODE"):
                current_batch = min(batch_size, num_samples - i)
                # Sample random noise (starting point at t=1)
                eps = torch.randn(current_batch, 25, 80, device=self.device)  # [B, L', COMP_DIM]

                if use_odeint:
                    try:
                        # Store shape so the ODE function can unflatten the state
                        self.current_shape = eps.shape
                        ode_func = self._create_ode_func(cfg_scale=cfg_scale)
                        # Time span: from t=1 (noise) to t=0 (data)
                        t_span = torch.tensor([1.0, 0.0], device=self.device, dtype=eps.dtype)
                        # Flatten initial condition for torchdiffeq
                        y0 = eps.view(-1)
                        if ode_method in ['dopri5', 'adaptive_heun']:
                            # Adaptive solvers, controlled by rtol/atol
                            solution = odeint(
                                ode_func, y0, t_span,
                                method=ode_method,
                                rtol=rtol,
                                atol=atol,
                                options={'max_num_steps': 1000}
                            )
                        else:
                            # Fixed-step solvers: step_size = 1/25 for 25 steps
                            solution = odeint(
                                ode_func, y0, t_span,
                                method=ode_method,
                                options={'step_size': 0.04}
                            )
                        # The final state of the trajectory is the sample at t=0
                        xt = solution[-1].view(self.current_shape)
                    except Exception as e:
                        print(f"⚠️ ODE solving failed for batch {i//batch_size + 1}: {e}")
                        print("Falling back to Euler method...")
                        xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)
                else:
                    # Manual Euler integration (original method)
                    xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)

                # Decompress back to ESM-sized embeddings
                decompressed = self.decompressor(xt)  # [B, L, ESM_DIM]

                # Undo preprocessing: invert min-max scaling, then standardization
                m, s, mn, mx = self.stats['mean'], self.stats['std'], self.stats['min'], self.stats['max']
                decompressed = decompressed * (mx - mn + 1e-8) + mn
                decompressed = decompressed * s + m
                all_generated.append(decompressed.cpu())

        # Concatenate all batches
        generated_embeddings = torch.cat(all_generated, dim=0)
        print(f"✓ Generated {generated_embeddings.shape[0]} AMP embeddings")
        print(f"  Shape: {generated_embeddings.shape}")
        print(f"  Stats - Mean: {generated_embeddings.mean():.4f}, Std: {generated_embeddings.std():.4f}")
        return generated_embeddings
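
    # Illustrative call patterns for the method above (no extra API assumed):
    #   gen.generate_amps(num_samples=100, ode_method='dopri5')  # adaptive solver
    #   gen.generate_amps(num_samples=100, ode_method='rk4')     # fixed-step solver
    #   gen.generate_amps(num_samples=100, num_steps=1)          # 1-step (reflow-style)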

    def _generate_with_euler(self, eps, current_batch, cfg_scale, num_steps):
        """Fallback Euler integration (original implementation)."""
        xt = eps.clone()
        amp_labels = torch.full((current_batch,), 0, device=self.device)   # 0 = AMP
        mask_labels = torch.full((current_batch,), 2, device=self.device)  # 2 = mask
        # Integrate backward in time from t=1 (noise) to t=0 (data)
        dt = -1.0 / num_steps
        for step in range(num_steps):
            t = torch.ones(current_batch, device=self.device) * (1.0 - step / num_steps)
            if cfg_scale > 0:
                # Conditional and unconditional passes, then CFG combination
                vt_cond = self.flow_model(xt, t, labels=amp_labels)
                vt_uncond = self.flow_model(xt, t, labels=mask_labels)
                vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
            else:
                # No CFG: unconditional (mask) label only
                vt = self.flow_model(xt, t, labels=mask_labels)
            # Euler step
            xt = xt + vt * dt
        return xt
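
    # A minimal sketch of the classifier-free-guidance blend used in both
    # integration paths above. This helper is illustrative and not called by
    # the original pipeline: with scale s, the guided field is
    # v = v_uncond + s * (v_cond - v_uncond); s=0 recovers the purely
    # unconditional field and s=1 the purely conditional one.
    @staticmethod
    def _cfg_combine(vt_uncond, vt_cond, cfg_scale):
        """Blend unconditional and conditional vector fields (illustrative)."""
        return vt_uncond + cfg_scale * (vt_cond - vt_uncond)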

    def compare_ode_methods(self, num_samples=20, cfg_scale=7.5):
        """Compare ODE solving methods on speed and output statistics."""
        if not TORCHDIFFEQ_AVAILABLE:
            print("⚠️ torchdiffeq not available, cannot compare ODE methods")
            return self.generate_amps(num_samples=num_samples, cfg_scale=cfg_scale)

        methods = ['euler', 'rk4', 'dopri5', 'adaptive_heun']
        results = {}
        print("🔬 Comparing ODE solving methods...")
        for method in methods:
            print(f"\n--- Testing {method} ---")
            try:
                # Time each method with CUDA events
                start_time = torch.cuda.Event(enable_timing=True)
                end_time = torch.cuda.Event(enable_timing=True)
                start_time.record()
                embeddings = self.generate_amps(
                    num_samples=num_samples,
                    batch_size=10,
                    cfg_scale=cfg_scale,
                    ode_method=method
                )
                end_time.record()
                torch.cuda.synchronize()
                elapsed_time = start_time.elapsed_time(end_time) / 1000.0  # ms -> s

                results[method] = {
                    'embeddings': embeddings,
                    'time': elapsed_time,
                    'mean': embeddings.mean().item(),
                    'std': embeddings.std().item(),
                    'success': True
                }
                print(f"✓ {method}: {elapsed_time:.2f}s, mean={embeddings.mean():.4f}, std={embeddings.std():.4f}")
            except Exception as e:
                print(f"❌ {method} failed: {e}")
                results[method] = {'success': False, 'error': str(e)}
        return results
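
    # Example of inspecting the comparison results built above (illustrative):
    #   results = gen.compare_ode_methods(num_samples=10)
    #   for name, r in results.items():
    #       if r.get('success'):
    #           print(name, f"{r['time']:.2f}s", r['mean'], r['std'])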

    def generate_with_reflow(self, num_samples=100):
        """
        Generate AMP samples with 1-step reflow (requires a reflow-trained model).

        For now this simply runs single-step generation with the current model.
        """
        print(f"Generating {num_samples} AMP samples with 1-step reflow...")
        return self.generate_amps(num_samples=num_samples, num_steps=1, batch_size=32)
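

# A minimal sketch of the normalization that `generate_amps` inverts. The
# forward direction below is an assumption inferred from the reverse code
# (standardize with mean/std, then min-max scale); the stats values are
# placeholders, not the contents of normalization_stats.pt.
def _demo_normalization_roundtrip():
    x = torch.randn(4, 25, 1280)
    m, s = torch.tensor(0.1), torch.tensor(2.0)
    mn, mx = torch.tensor(-3.0), torch.tensor(3.0)
    z = ((x - m) / s - mn) / (mx - mn + 1e-8)    # assumed forward preprocessing
    x_rec = (z * (mx - mn + 1e-8) + mn) * s + m  # reverse, as in generate_amps
    assert torch.allclose(x, x_rec, atol=1e-4)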


def main():
    """Main generation function."""
    print("=== AMP Generation Pipeline with CFG ===")

    # Use the best model from training (lowest validation loss: 0.017183)
    model_path = '/data2/edwardsun/flow_checkpoints/amp_flow_model_best_optimized.pth'

    # Check that the checkpoint exists and is loadable
    try:
        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
        print(f"✓ Found best model at step {checkpoint['step']} with loss {checkpoint['loss']:.6f}")
        print(f"  Global step: {checkpoint['global_step']}")
        print(f"  Total samples: {checkpoint['total_samples']:,}")
    except Exception:
        print(f"❌ Best model not found: {model_path}")
        print("Please train the flow matching model first using amp_flow_training.py")
        return

    # Initialize generator
    generator = AMPGenerator(model_path, device='cuda')

    # Compare ODE methods if torchdiffeq is available
    if TORCHDIFFEQ_AVAILABLE:
        print("\n🔬 Comparing ODE solving methods...")
        comparison_results = generator.compare_ode_methods(num_samples=10, cfg_scale=7.5)
        best_method = 'dopri5'  # Recommended method
        print(f"\n🚀 Using {best_method} for main generation...")
    else:
        best_method = 'euler'
        print("\n⚠️ Using fallback Euler integration...")

    # Generate samples at several CFG scales with the chosen ODE solver
    print("\n1. Generating with CFG scale 0.0 (no conditioning)...")
    samples_no_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=0.0, ode_method=best_method)

    print("\n2. Generating with CFG scale 3.0 (weak conditioning)...")
    samples_weak_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=3.0, ode_method=best_method)

    print("\n3. Generating with CFG scale 7.5 (strong conditioning)...")
    samples_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=7.5, ode_method=best_method)

    print("\n4. Generating with CFG scale 15.0 (very strong conditioning)...")
    samples_very_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=15.0, ode_method=best_method)

    # Create output directory if it doesn't exist
    output_dir = '/data2/edwardsun/generated_samples'
    os.makedirs(output_dir, exist_ok=True)

    # Save generated samples, stamped with today's date
    today = datetime.now().strftime('%Y%m%d')
    torch.save(samples_no_cfg, os.path.join(output_dir, f'generated_amps_best_model_no_cfg_{today}.pt'))
    torch.save(samples_weak_cfg, os.path.join(output_dir, f'generated_amps_best_model_weak_cfg_{today}.pt'))
    torch.save(samples_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_strong_cfg_{today}.pt'))
    torch.save(samples_very_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_very_strong_cfg_{today}.pt'))

    print("\n✓ Generation complete!")
    print(f"Generated samples saved (Date: {today}):")
    print(f"  - generated_amps_best_model_no_cfg_{today}.pt (no conditioning)")
    print(f"  - generated_amps_best_model_weak_cfg_{today}.pt (weak CFG)")
    print(f"  - generated_amps_best_model_strong_cfg_{today}.pt (strong CFG)")
    print(f"  - generated_amps_best_model_very_strong_cfg_{today}.pt (very strong CFG)")

    print("\nCFG Analysis:")
    print("  - CFG scale 0.0: No conditioning, generates diverse sequences")
    print("  - CFG scale 3.0: Weak AMP conditioning")
    print("  - CFG scale 7.5: Strong AMP conditioning (recommended)")
    print("  - CFG scale 15.0: Very strong AMP conditioning (may be too restrictive)")

    print("\nNext steps:")
    print("1. Decode embeddings back to sequences using the ESM-2 decoder")
    print("2. Evaluate with ProtFlow metrics (FPD, MMD, ESM-2 perplexity)")
    print("3. Compare sequences generated with different CFG scales")
    print("4. Evaluate AMP properties (antimicrobial activity, toxicity)")
    if TORCHDIFFEQ_AVAILABLE:
        print(f"5. ✓ Enhanced generation with {best_method} ODE solver")
    else:
        print("5. Install torchdiffeq for improved ODE solving: pip install torchdiffeq")


if __name__ == "__main__":
    main()