File size: 9,517 Bytes

370f342

import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import os
from datetime import datetime

# Import your components
from compressor_with_embeddings import Compressor, Decompressor
from final_flow_model import AMPFlowMatcherCFGConcat, AMPProtFlowPipelineCFG

class AMPGenerator:
    """
    Generate AMP samples using trained ProtFlow model.
    """
    
    def __init__(self, model_path, device='cuda'):
        self.device = device
        
        # Load models
        self._load_models(model_path)
        
        # Load preprocessing statistics
        self.stats = torch.load('normalization_stats.pt', map_location=device)
        
    def _load_models(self, model_path):
        """Load trained models."""
        print("Loading trained models...")
        
        # Load compressor and decompressor
        self.compressor = Compressor().to(self.device)
        self.decompressor = Decompressor().to(self.device)
        
        self.compressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_compressor_model.pth', map_location=self.device))
        self.decompressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_decompressor_model.pth', map_location=self.device))
        
        # Load flow matching model with CFG
        self.flow_model = AMPFlowMatcherCFGConcat(
            hidden_dim=480,
            compressed_dim=80,  # 1280 // 16
            n_layers=12,
            n_heads=16,
            dim_ff=3072,
            max_seq_len=25,
            use_cfg=True
        ).to(self.device)
        
        checkpoint = torch.load(model_path, map_location=self.device)
        
        # Handle PyTorch compilation wrapper
        state_dict = checkpoint['flow_model_state_dict']
        new_state_dict = {}
        
        for key, value in state_dict.items():
            # Remove _orig_mod prefix if present
            if key.startswith('_orig_mod.'):
                new_key = key[10:]  # Remove '_orig_mod.' prefix
            else:
                new_key = key
            new_state_dict[new_key] = value
        
        self.flow_model.load_state_dict(new_state_dict)
        
        print(f"✓ All models loaded successfully from step {checkpoint['step']}!")
        print(f"  Loss at checkpoint: {checkpoint['loss']:.6f}")
        
    def generate_amps(self, num_samples=100, num_steps=25, batch_size=32, cfg_scale=7.5):
        """
        Generate AMP samples using flow matching with CFG.
        
        Args:
            num_samples: Number of AMP samples to generate
            num_steps: Number of ODE solving steps (25 for good quality, 1 for reflow)
            batch_size: Batch size for generation
            cfg_scale: CFG guidance scale (higher = stronger conditioning)
        """
        print(f"Generating {num_samples} AMP samples with {num_steps} steps (CFG scale: {cfg_scale})...")
        
        self.flow_model.eval()
        self.compressor.eval()
        self.decompressor.eval()
        
        all_generated = []
        
        with torch.no_grad():
            for i in tqdm(range(0, num_samples, batch_size), desc="Generating"):
                current_batch = min(batch_size, num_samples - i)
                
                # Sample random noise
                eps = torch.randn(current_batch, 25, 80, device=self.device)  # [B, L', COMP_DIM]
                
                # ODE solving steps with CFG
                xt = eps.clone()
                amp_labels = torch.full((current_batch,), 0, device=self.device)  # 0 = AMP
                mask_labels = torch.full((current_batch,), 2, device=self.device)  # 2 = Mask
                
                for step in range(num_steps):
                    t = torch.ones(current_batch, device=self.device) * (1.0 - step/num_steps)
                    
                    # CFG: Generate with condition and without condition
                    if cfg_scale > 0:
                        # With AMP condition
                        vt_cond = self.flow_model(xt, t, labels=amp_labels)
                        
                        # Without condition (mask)
                        vt_uncond = self.flow_model(xt, t, labels=mask_labels)
                        
                        # CFG interpolation
                        vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
                    else:
                        # No CFG, use mask label
                        vt = self.flow_model(xt, t, labels=mask_labels)
                    
                    # Euler step for backward integration (t: 1 -> 0)
                    # Use negative dt to integrate backward from noise to data
                    dt = -1.0 / num_steps
                    xt = xt + vt * dt
                
                # Decompress to get embeddings
                decompressed = self.decompressor(xt)  # [B, L, ESM_DIM]
                
                # Apply reverse preprocessing
                m, s, mn, mx = self.stats['mean'], self.stats['std'], self.stats['min'], self.stats['max']
                decompressed = decompressed * (mx - mn + 1e-8) + mn
                decompressed = decompressed * s + m
                
                all_generated.append(decompressed.cpu())
        
        # Concatenate all batches
        generated_embeddings = torch.cat(all_generated, dim=0)
        
        print(f"✓ Generated {generated_embeddings.shape[0]} AMP embeddings")
        print(f"  Shape: {generated_embeddings.shape}")
        print(f"  Stats - Mean: {generated_embeddings.mean():.4f}, Std: {generated_embeddings.std():.4f}")
        
        return generated_embeddings
    
    def generate_with_reflow(self, num_samples=100):
        """
        Generate AMP samples using 1-step reflow (if you have reflow model).
        """
        print(f"Generating {num_samples} AMP samples with 1-step reflow...")
        
        # This would use the reflow implementation
        # For now, just use 1-step generation
        return self.generate_amps(num_samples=num_samples, num_steps=1, batch_size=32)

def main():
    """Main generation function."""
    print("=== AMP Generation Pipeline with CFG ===")
    
    # Use the best model from training
    model_path = '/data2/edwardsun/flow_amp/checkpoints/amp_flow_model_best_optimized.pth'
    
    # Check if checkpoint exists
    try:
        checkpoint = torch.load(model_path, map_location='cpu')
        print(f"✓ Found best model at step {checkpoint['step']} with loss {checkpoint['loss']:.6f}")
        print(f"  Global step: {checkpoint['global_step']}")
        print(f"  Total samples: {checkpoint['total_samples']:,}")
    except:
        print(f"❌ Best model not found: {model_path}")
        print("Please train the flow matching model first using amp_flow_training.py")
        return
    
    # Initialize generator
    generator = AMPGenerator(model_path, device='cuda')
    
    # Generate samples with different CFG scales
    print("\n1. Generating with CFG scale 0.0 (no conditioning)...")
    samples_no_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=0.0)
    
    print("\n2. Generating with CFG scale 3.0 (weak conditioning)...")
    samples_weak_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=3.0)
    
    print("\n3. Generating with CFG scale 7.5 (strong conditioning)...")
    samples_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=7.5)
    
    print("\n4. Generating with CFG scale 15.0 (very strong conditioning)...")
    samples_very_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=15.0)
    
    # Create output directory if it doesn't exist
    output_dir = '/data2/edwardsun/generated_samples'
    os.makedirs(output_dir, exist_ok=True)
    
    # Get today's date for filename
    today = datetime.now().strftime('%Y%m%d')
    
    # Save generated samples with date
    torch.save(samples_no_cfg, os.path.join(output_dir, f'generated_amps_best_model_no_cfg_{today}.pt'))
    torch.save(samples_weak_cfg, os.path.join(output_dir, f'generated_amps_best_model_weak_cfg_{today}.pt'))
    torch.save(samples_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_strong_cfg_{today}.pt'))
    torch.save(samples_very_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_very_strong_cfg_{today}.pt'))
    
    print("\n✓ Generation complete!")
    print(f"Generated samples saved (Date: {today}):")
    print(f"  - generated_amps_best_model_no_cfg_{today}.pt (no conditioning)")
    print(f"  - generated_amps_best_model_weak_cfg_{today}.pt (weak CFG)")
    print(f"  - generated_amps_best_model_strong_cfg_{today}.pt (strong CFG)")
    print(f"  - generated_amps_best_model_very_strong_cfg_{today}.pt (very strong CFG)")
    
    print("\nCFG Analysis:")
    print("  - CFG scale 0.0: No conditioning, generates diverse sequences")
    print("  - CFG scale 3.0: Weak AMP conditioning")
    print("  - CFG scale 7.5: Strong AMP conditioning (recommended)")
    print("  - CFG scale 15.0: Very strong AMP conditioning (may be too restrictive)")
    
    print("\nNext steps:")
    print("1. Decode embeddings back to sequences using ESM-2 decoder")
    print("2. Evaluate AMP properties (antimicrobial activity, toxicity)")
    print("3. Compare sequences generated with different CFG scales")
    print("4. Implement conditioning for specific properties")

if __name__ == "__main__":
    main()