import pdb
import torch
import torch.nn.functional as F
import torch.nn as nn
import pytorch_lightning as pl
import time
from transformers import AutoModel, AutoConfig, AutoTokenizer
import xgboost as xgb
import esm

from flow_matching.path import MixtureDiscreteProbPath
from flow_matching.path.scheduler import PolynomialConvexScheduler
from flow_matching.solver import MixtureDiscreteEulerSolver
from flow_matching.utils import ModelWrapper
from flow_matching.loss import MixturePathGeneralizedKL

from models.peptide_models import CNNModel
from modules.bindevaluator_modules import *

def parse_motifs(motif: str) -> list:
    parts = motif.split(',')
    result = []

    for part in parts:
        part = part.strip()
        if '-' in part:
            start, end = map(int, part.split('-'))
            result.extend(range(start, end + 1))
        else:
            result.append(int(part))

    # result = [pos-1 for pos in result]
    print(f'Target Motifs: {result}')
    return torch.tensor(result)

class BindEvaluator(pl.LightningModule):
    def __init__(self, n_layers, d_model, d_hidden, n_head,
                 d_k, d_v, d_inner, dropout=0.2,
                 learning_rate=0.00001, max_epochs=15, kl_weight=1):
        super(BindEvaluator, self).__init__()

        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
        self.esm_model.eval()
        # freeze all the esm_model parameters
        for param in self.esm_model.parameters():
            param.requires_grad = False

        self.repeated_module = RepeatedModule3(n_layers, d_model, d_hidden,
                                               n_head, d_k, d_v, d_inner, dropout=dropout)

        self.final_attention_layer = MultiHeadAttentionSequence(n_head, d_model,
                                                                d_k, d_v, dropout=dropout)

        self.final_ffn = FFN(d_model, d_inner, dropout=dropout)

        self.output_projection_prot = nn.Linear(d_model, 1)

        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.kl_weight = kl_weight

        self.classification_threshold = nn.Parameter(torch.tensor(0.5))  # Initial threshold
        self.historical_memory = 0.9
        self.class_weights = torch.tensor([3.000471363174231, 0.5999811490272925])  # binding_site weights, non-bidning site weights

    def forward(self, binder_tokens, target_tokens):
        peptide_sequence = self.esm_model(**binder_tokens).last_hidden_state
        protein_sequence = self.esm_model(**target_tokens).last_hidden_state

        prot_enc, sequence_enc, sequence_attention_list, prot_attention_list, \
            seq_prot_attention_list, seq_prot_attention_list = self.repeated_module(peptide_sequence,
                                                                                    protein_sequence)

        prot_enc, final_prot_seq_attention = self.final_attention_layer(prot_enc, sequence_enc, sequence_enc)

        prot_enc = self.final_ffn(prot_enc)

        prot_enc = self.output_projection_prot(prot_enc)

        return prot_enc

    def get_probs(self, x_t, target_sequence):
        '''
        Inputs:
        - xt: Shape (bsz, seq_len)
        - target_sequence: Shape (1, tgt_len)
        '''
        # pdb.set_trace()
        target_sequence = target_sequence.repeat(x_t.shape[0], 1)
        binder_attention_mask = torch.ones_like(x_t)
        target_attention_mask = torch.ones_like(target_sequence)

        binder_attention_mask[:, 0] = binder_attention_mask[:, -1] = 0
        target_attention_mask[:, 0] = target_attention_mask[:, -1] = 0

        binder_tokens = {'input_ids': x_t, 'attention_mask': binder_attention_mask.to(x_t.device)}
        target_tokens = {'input_ids': target_sequence, 'attention_mask': target_attention_mask.to(target_sequence.device)}
        
        logits = self.forward(binder_tokens, target_tokens).squeeze(-1)
        # pdb.set_trace()
        logits[:, 0] = logits[:, -1] = -100 # float('-inf')
        probs = torch.sigmoid(logits)

        return probs    # shape (bsz, tgt_len)

    def motif_score(self, x_t, target_sequence, motifs):
        probs = self.get_probs(x_t, target_sequence)
        motif_probs = probs[:, motifs]
        motif_score = motif_probs.sum(dim=-1) / len(motifs)
        # pdb.set_trace()
        return motif_score

    def non_motif_score(self, x_t, target_sequence, motifs):
        probs = self.get_probs(x_t, target_sequence)
        non_motif_probs = probs[:, [i for i in range(probs.shape[1]) if i not in motifs]]
        mask = non_motif_probs >= 0.5
        count = mask.sum(dim=-1)

        non_motif_score = torch.where(count > 0, (non_motif_probs * mask).sum(dim=-1) / count, torch.zeros_like(count))

        return non_motif_score

    def scoring(self, x_t, target_sequence, motifs, penalty=False):
        probs = self.get_probs(x_t, target_sequence)
        motif_probs = probs[:, motifs]
        motif_score = motif_probs.sum(dim=-1) / len(motifs)
        # pdb.set_trace()

        if penalty:
            non_motif_probs = probs[:, [i for i in range(probs.shape[1]) if i not in motifs]]
            mask = non_motif_probs >= 0.5
            count = mask.sum(dim=-1)
            # non_motif_score = 1 - torch.where(count > 0, (non_motif_probs * mask).sum(dim=-1) / count, torch.zeros_like(count))
            non_motif_score = count / target_sequence.shape[1]
            return motif_score, 1 - non_motif_score
        else:
            return motif_score

class MotifModel(nn.Module):
    def __init__(self, bindevaluator, target_sequence, motifs, penalty=False):
        super(MotifModel, self).__init__()
        self.bindevaluator = bindevaluator
        self.target_sequence = target_sequence
        self.motifs = motifs
        self.penalty = penalty
    
    def forward(self, x):
        return self.bindevaluator.scoring(x, self.target_sequence, self.motifs, self.penalty)

class UnpooledBindingPredictor(nn.Module):
    def __init__(self, 
                 esm_model_name="facebook/esm2_t33_650M_UR50D",
                 hidden_dim=512,
                 kernel_sizes=[3, 5, 7],
                 n_heads=8,
                 n_layers=3,
                 dropout=0.1,
                 freeze_esm=True):
        super().__init__()
        
        # Define binding thresholds
        self.tight_threshold = 7.5    # Kd/Ki/IC50 ≤ ~30nM
        self.weak_threshold = 6.0     # Kd/Ki/IC50 > 1μM
        
        # Load ESM model for computing embeddings on the fly
        self.esm_model = AutoModel.from_pretrained(esm_model_name)
        self.config = AutoConfig.from_pretrained(esm_model_name)
        
        # Freeze ESM parameters if needed
        if freeze_esm:
            for param in self.esm_model.parameters():
                param.requires_grad = False
        
        # Get ESM hidden size
        esm_dim = self.config.hidden_size
        
        # Output channels for CNN layers
        output_channels_per_kernel = 64
        
        # CNN layers for handling variable length sequences
        self.protein_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])
        
        self.binder_conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=esm_dim,
                out_channels=output_channels_per_kernel,
                kernel_size=k,
                padding='same'
            ) for k in kernel_sizes
        ])
        
        # Calculate total features after convolution and pooling
        total_features_per_seq = output_channels_per_kernel * len(kernel_sizes) * 2
        
        # Project to same dimension after CNN processing
        self.protein_projection = nn.Linear(total_features_per_seq, hidden_dim)
        self.binder_projection = nn.Linear(total_features_per_seq, hidden_dim)
        
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.binder_norm = nn.LayerNorm(hidden_dim)
        
        # Cross attention blocks with layer norm
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])
        
        # Prediction heads
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        
        # Regression head
        self.regression_head = nn.Linear(hidden_dim, 1)
        
        # Classification head (3 classes: tight, medium, loose binding)
        self.classification_head = nn.Linear(hidden_dim, 3)
        
    def get_binding_class(self, affinity):
        """Convert affinity values to class indices
        0: tight binding (>= 7.5)
        1: medium binding (6.0-7.5)
        2: weak binding (< 6.0)
        """
        if isinstance(affinity, torch.Tensor):
            tight_mask = affinity >= self.tight_threshold
            weak_mask = affinity < self.weak_threshold
            medium_mask = ~(tight_mask | weak_mask)
            
            classes = torch.zeros_like(affinity, dtype=torch.long)
            classes[medium_mask] = 1
            classes[weak_mask] = 2
            return classes
        else:
            if affinity >= self.tight_threshold:
                return 0  # tight binding
            elif affinity < self.weak_threshold:
                return 2  # weak binding
            else:
                return 1  # medium binding
    
    def compute_embeddings(self, input_ids, attention_mask=None):
        """Compute ESM embeddings on the fly"""
        esm_outputs = self.esm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # Get the unpooled last hidden states (batch_size x seq_length x hidden_size)
        return esm_outputs.last_hidden_state
    
    def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None):
        """Process a sequence through CNN layers and pooling"""
        # Transpose for CNN: [batch_size, hidden_size, seq_length]
        x = unpooled_emb.transpose(1, 2)
        
        # Apply CNN layers and collect outputs
        conv_outputs = []
        for conv in conv_layers:
            conv_out = F.relu(conv(x))
            conv_outputs.append(conv_out)
        
        # Concatenate along channel dimension
        conv_output = torch.cat(conv_outputs, dim=1)
        
        # Global pooling (both max and average)
        # If attention mask is provided, use it to create a proper mask for pooling
        if attention_mask is not None:
            # Create a mask for pooling (1 for valid positions, 0 for padding)
            # Expand mask to match conv_output channels
            expanded_mask = attention_mask.unsqueeze(1).expand(-1, conv_output.size(1), -1)
            
            # Apply mask (set padding to large negative value for max pooling)
            masked_output = conv_output.clone()
            masked_output = masked_output.masked_fill(expanded_mask == 0, float('-inf'))
            
            # Max pooling along sequence dimension
            max_pooled = torch.max(masked_output, dim=2)[0]
            
            # Average pooling (sum divided by number of valid positions)
            sum_pooled = torch.sum(conv_output * expanded_mask, dim=2)
            valid_positions = torch.sum(expanded_mask, dim=2)
            valid_positions = torch.clamp(valid_positions, min=1.0)  # Avoid division by zero
            avg_pooled = sum_pooled / valid_positions
        else:
            # If no mask, use standard pooling
            max_pooled = torch.max(conv_output, dim=2)[0]
            avg_pooled = torch.mean(conv_output, dim=2)
        
        # Concatenate the pooled features
        pooled = torch.cat([max_pooled, avg_pooled], dim=1)
        
        return pooled
        
    def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None):
        # Compute embeddings on the fly using the ESM model
        protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask)
        binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask)
        
        # Process protein and binder sequences through CNN layers
        protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask)
        binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask)
        
        # Project to same dimension
        protein = self.protein_norm(self.protein_projection(protein_features))
        binder = self.binder_norm(self.binder_projection(binder_features))
        
        # Reshape for attention: from [batch_size, hidden_dim] to [1, batch_size, hidden_dim]
        protein = protein.unsqueeze(0)
        binder = binder.unsqueeze(0)
        
        # Cross attention layers
        for layer in self.cross_attention_layers:
            # Protein attending to binder
            attended_protein = layer['attention'](
                protein, binder, binder
            )[0]
            protein = layer['norm1'](protein + attended_protein)
            protein = layer['norm2'](protein + layer['ffn'](protein))
            
            # Binder attending to protein
            attended_binder = layer['attention'](
                binder, protein, protein
            )[0]
            binder = layer['norm1'](binder + attended_binder)
            binder = layer['norm2'](binder + layer['ffn'](binder))
        
        # Remove sequence dimension
        protein_pool = protein.squeeze(0)
        binder_pool = binder.squeeze(0)
        
        # Concatenate both representations
        combined = torch.cat([protein_pool, binder_pool], dim=-1)
        
        # Shared features
        shared_features = self.shared_head(combined)
        
        regression_output = self.regression_head(shared_features)
        # classification_logits = self.classification_head(shared_features)
        
        # return regression_output, classification_logits
        return regression_output

class ImprovedBindingPredictor(nn.Module):
    def __init__(self, 
                 esm_dim=1280,
                 smiles_dim=1280,
                 hidden_dim=512,
                 n_heads=8,
                 n_layers=5,
                 dropout=0.1):
        super().__init__()
        
        # Define binding thresholds
        self.tight_threshold = 7.5    # Kd/Ki/IC50 ≤ ~30nM
        self.weak_threshold = 6.0     # Kd/Ki/IC50 > 1μM
        
        # Project to same dimension
        self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
        self.protein_projection = nn.Linear(esm_dim, hidden_dim)
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.smiles_norm = nn.LayerNorm(hidden_dim)
        
        # Cross attention blocks with layer norm
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])
        
        # Prediction heads
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        
        # Regression head
        self.regression_head = nn.Linear(hidden_dim, 1)
        
        # Classification head (3 classes: tight, medium, loose binding)
        self.classification_head = nn.Linear(hidden_dim, 3)
        
    def get_binding_class(self, affinity):
        """Convert affinity values to class indices
        0: tight binding (>= 7.5)
        1: medium binding (6.0-7.5)
        2: weak binding (< 6.0)
        """
        if isinstance(affinity, torch.Tensor):
            tight_mask = affinity >= self.tight_threshold
            weak_mask = affinity < self.weak_threshold
            medium_mask = ~(tight_mask | weak_mask)
            
            classes = torch.zeros_like(affinity, dtype=torch.long)
            classes[medium_mask] = 1
            classes[weak_mask] = 2
            return classes
        else:
            if affinity >= self.tight_threshold:
                return 0  # tight binding
            elif affinity < self.weak_threshold:
                return 2  # weak binding
            else:
                return 1  # medium binding
        
    def forward(self, protein_emb, binder_emb):
        
        protein = self.protein_norm(self.protein_projection(protein_emb))
        smiles = self.smiles_norm(self.smiles_projection(binder_emb))
        
        protein = protein.transpose(0, 1)
        smiles = smiles.transpose(0, 1)
        
        # Cross attention layers
        for layer in self.cross_attention_layers:
            # Protein attending to SMILES
            attended_protein = layer['attention'](
                protein, smiles, smiles
            )[0]
            protein = layer['norm1'](protein + attended_protein)
            protein = layer['norm2'](protein + layer['ffn'](protein))
            
            # SMILES attending to protein
            attended_smiles = layer['attention'](
                smiles, protein, protein
            )[0]
            smiles = layer['norm1'](smiles + attended_smiles)
            smiles = layer['norm2'](smiles + layer['ffn'](smiles))
        
        # Get sequence-level representations
        protein_pool = torch.mean(protein, dim=0)
        smiles_pool = torch.mean(smiles, dim=0)
        
        # Concatenate both representations
        combined = torch.cat([protein_pool, smiles_pool], dim=-1)
        
        # Shared features
        shared_features = self.shared_head(combined)
        
        regression_output = self.regression_head(shared_features)
        
        return regression_output

class PooledAffinityModel(nn.Module):
    def __init__(self, affinity_predictor, target_sequence):
        super(PooledAffinityModel, self).__init__()
        self.affinity_predictor = affinity_predictor
        self.target_sequence = target_sequence
        self.esm_model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.target_sequence.device)
        for param in self.esm_model.parameters():
            param.requires_grad = False
    
    def compute_embeddings(self, input_ids, attention_mask=None):
        """Compute ESM embeddings on the fly"""
        esm_outputs = self.esm_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        # Get the unpooled last hidden states (batch_size x seq_length x hidden_size)
        return esm_outputs.last_hidden_state
    
    def forward(self, x):
        target_sequence = self.target_sequence.repeat(x.shape[0], 1)

        protein_emb = self.compute_embeddings(input_ids=target_sequence)
        binder_emb = self.compute_embeddings(input_ids=x)
        return self.affinity_predictor(protein_emb=protein_emb, binder_emb=binder_emb).squeeze(-1)  

class AffinityModel(nn.Module):
    def __init__(self, affinity_predictor, target_sequence):
        super(AffinityModel, self).__init__()
        self.affinity_predictor = affinity_predictor
        self.target_sequence = target_sequence
    
    def forward(self, x):
        target_sequence = self.target_sequence.repeat(x.shape[0], 1)
        affinity = self.affinity_predictor(protein_input_ids=target_sequence, binder_input_ids=x).squeeze(-1) 
        return affinity / 10

class HemolysisModel:
    def __init__(self, device):
        self.predictor = xgb.Booster(model_file='../classifier_ckpt/wt_hemolysis.json')
        
        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.model.eval()

        self.device = device
    
    def generate_embeddings(self, sequences):
        """Generate ESM embeddings for protein sequences"""
        with torch.no_grad():
            embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
            embeddings = embeddings.cpu().numpy()
        
        return embeddings
    
    def get_scores(self, input_seqs):
        scores = np.ones(len(input_seqs))
        features = self.generate_embeddings(input_seqs)
        
        if len(features) == 0:
            return scores
        
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
        
        features = xgb.DMatrix(features)
        
        probs = self.predictor.predict(features)
        # return the probability of it being not hemolytic
        return torch.from_numpy(scores - probs).to(self.device)
    
    def __call__(self, input_seqs: list):
        scores = self.get_scores(input_seqs)
        return scores

# ======================== MLP =========================================
# Still need mean pooling along lengths
class MaskedMeanPool(nn.Module):
    def forward(self, X, M):  # X: (B,L,H), M: (B,L)
        Mf = M.unsqueeze(-1).float()
        denom = Mf.sum(dim=1).clamp(min=1.0)
        return (X * Mf).sum(dim=1) / denom  # (B,H)

class MLPClassifier(nn.Module):
    def __init__(self, in_dim, hidden=512, dropout=0.1):
        super().__init__()
        self.pool = MaskedMeanPool()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        )
    def forward(self, X, M):
        z = self.pool(X, M)
        return self.net(z).squeeze(-1)  # logits
# ======================== MLP =========================================

class NonfoulingModel:
    def __init__(self, device):
        ckpt = torch.load('../classifier_ckpt/wt_nonfouling.pt', weights_only=False, map_location=device)
        best_params = ckpt["best_params"]
        self.predictor = MLPClassifier(in_dim=1280, hidden=int(best_params["hidden"]), dropout=float(best_params.get("dropout", 0.1)))
        self.predictor.load_state_dict(ckpt["state_dict"])
        self.predictor = self.predictor.to(device)
        self.predictor.eval()

        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.model.eval()

        self.device = device
    
    def get_scores(self, input_ids, attention_mask):
        with torch.no_grad():
            features = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        scores = self.predictor(features, attention_mask)
        return scores
    
    def __call__(self, input_ids):
        attention_mask = torch.ones_like(input_ids).to(self.device)
        scores = self.get_scores(input_ids, attention_mask)
        return 1.0 / (1.0 + torch.exp(-scores))
    
class SolubilityModel:
    def __init__(self, device):
        # change model path
        self.predictor = xgb.Booster(model_file='../classifier_ckpt/best_model_solubility.json')
        
        self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
        self.model.eval()

        self.device = device
    
    def generate_embeddings(self, sequences):
        """Generate ESM embeddings for protein sequences"""
        with torch.no_grad():
            embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
            embeddings = embeddings.cpu().numpy()
        
        return embeddings
    
    def get_scores(self, input_seqs: list):
        scores = np.zeros(len(input_seqs))
        features = self.generate_embeddings(input_seqs)
        
        if len(features) == 0:
            return scores
        
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
        
        features = xgb.DMatrix(features)
        
        scores = self.predictor.predict(features)
        return torch.from_numpy(scores).to(self.device)
    
    def __call__(self, input_seqs: list):
        scores = self.get_scores(input_seqs)
        return scores

class SolubilityModelNew:
    def __init__(self, device):
        self.hydro_ids = torch.tensor([5, 7, 4, 12, 20, 18, 22, 14], device=device)
        self.device = device

    def get_scores(self, x):
        a = x[:, 1:-1]
        mask = (a.unsqueeze(-1) == self.hydro_ids).any(dim=-1)
        ratios = mask.float().mean(dim=1)
        return 1 - ratios
    
    def __call__(self, input_seqs: list):
        scores = self.get_scores(input_seqs)
        return scores

class PeptideCNN(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
        super().__init__()
        self.conv1 = nn.Conv1d(input_dim, hidden_dims[0], kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=5, padding=1)
        self.fc = nn.Linear(hidden_dims[1], output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.predictor = nn.Linear(output_dim, 1)  # For regression/classification

        self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
        self.esm_model.eval()

    def forward(self, input_ids, attention_mask=None, return_features=False):
        with torch.no_grad():
            x = self.esm_model(input_ids, attention_mask).last_hidden_state
        # x shape: (B, L, input_dim)
        x = x.permute(0, 2, 1)  # Reshape to (B, input_dim, L) for Conv1d
        x = nn.functional.relu(self.conv1(x))
        x = self.dropout(x)
        x = nn.functional.relu(self.conv2(x))
        x = self.dropout(x)
        x = x.permute(0, 2, 1)  # Reshape back to (B, L, hidden_dims[1])
        
        # Global average pooling over the sequence dimension (L)
        x = x.mean(dim=1)  # Shape: (B, hidden_dims[1])
        
        features = self.fc(x)  # features shape: (B, output_dim)
        if return_features:
            return features
        return self.predictor(features)  # Output shape: (B, 1)


# -----------------------------
# Model definition (must match training)
# -----------------------------
class TransformerRegressor(nn.Module):
    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        # M: True = keep token, False = padding
        pad_mask = ~M
        Z = self.proj(X)
        Z = self.enc(Z, src_key_padding_mask=pad_mask)
        Mf = M.unsqueeze(-1).float()
        denom = Mf.sum(dim=1).clamp(min=1.0)
        pooled = (Z * Mf).sum(dim=1) / denom
        return self.head(pooled).squeeze(-1)


def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
    # In your training code, transformer uses fixed architecture values (d_model/nhead/layers/ff/dropout).
    # (See build_model in finetune_nn_cv.py :contentReference[oaicite:2]{index=2})
    if model_name != "transformer":
        raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
    return TransformerRegressor(
        in_dim=in_dim,
        d_model=384,
        nhead=4,
        layers=1,
        ff=512,
        dropout=0.1521676463658988,
    )

def _clean_state_dict(state_dict: dict) -> dict:
    cleaned = {}
    for k, v in state_dict.items():
        if k.startswith("module."):
            k = k[len("module.") :]
        if k.startswith("model."):
            k = k[len("model.") :]
        cleaned[k] = v
    return cleaned

class HalfLifeModel:
    """
    Loads:
      - ESM2 encoder to generate *unpooled* token embeddings (per residue)
      - Your fine-tuned TransformerRegressor from final_model.pt

    By default, __call__ returns "hours":
      - if ckpt['target_col'] == 'log_label' -> expm1(pred)
      - else -> raw pred
    """

    def __init__(
        self,
        device,
        ckpt_path = "../classifier_ckpt/wt_halflife.pt",
    ):
        self.device = device

        # --- load NN checkpoint (saved by your finetune script) ---
        ckpt = torch.load(ckpt_path, map_location="cpu")
        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")

        self.best_params = ckpt.get("best_params", {})
        self.in_dim = int(ckpt.get("in_dim"))
        self.target_col = ckpt.get("target_col", "label")  # 'log_label' or 'label'

        # --- build + load regressor ---
        self.regressor = build_model(model_name="transformer", in_dim=self.in_dim, params=self.best_params)
        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
        self.regressor.to(self.device)
        self.regressor.eval()

        # --- ESM2 embedding model ---
        self.emb_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.device)
        self.emb_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")

        # sanity: ESM2 hidden size should match training in_dim
        esm_hidden = int(self.emb_model.config.hidden_size)
        if esm_hidden != self.in_dim:
            raise ValueError(
                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
                f"Did you train on a different embedding model/dimension than facebook/esm2_t33_650M_UR50D?"
            )

    @torch.no_grad()
    def _embed_unpooled_batch(self, sequences):
        out = self.emb_model(input_ids=sequences)
        hs = out.last_hidden_state  # (B, T, H)

        per_seq = []
        lengths = []

        for i in range(hs.shape[0]):
            emb = hs[i, 1:-1, :]    # (L, H)
            per_seq.append(emb)
            lengths.append(int(emb.shape[0]))

        Lmax = max(lengths) if lengths else 0
        H = hs.shape[-1]
        X = hs.new_zeros((len(sequences), Lmax, H), dtype=torch.float32)
        M = torch.zeros((len(sequences), Lmax), dtype=torch.bool, device=self.device)

        for i, emb in enumerate(per_seq):
            L = emb.shape[0]
            if L == 0:
                continue
            X[i, :L, :] = emb.to(torch.float32)
            M[i, :L] = True

        return X, M

    @torch.no_grad()
    def predict_raw(self, input_seqs):
        """
        Returns the regressor output in the same space as training target_col:
          - if trained on log_label -> returns log1p(hours)
          - if trained on label     -> returns hours (or whatever label scale was)
        """
        if len(input_seqs) == 0:
            return np.array([], dtype=np.float32)

        X, M = self._embed_unpooled_batch(input_seqs)
        yhat = self.regressor(X, M).detach().cpu().numpy().astype(np.float32)  # (B,)
        # pdb.set_trace()
        return yhat

    def predict_hours(self, input_seqs) -> np.ndarray:
        """
        If your model was trained on log_label, convert back to hours via expm1.
        Otherwise returns raw predictions.
        """
        raw = self.predict_raw(input_seqs)
        if self.target_col == "log_label":
            return np.expm1(raw).astype(np.float32)
        return raw.astype(np.float32)

    def __call__(self, input_seqs) -> np.ndarray:
        return torch.from_numpy(self.predict_hours(input_seqs)).to(self.device)


def load_bindevaluator(checkpoint_path, device):
    bindevaluator = BindEvaluator.load_from_checkpoint(checkpoint_path, n_layers=8, d_model=128, d_hidden=128, n_head=8, d_k=64, d_v=128, d_inner=64).to(device)
    bindevaluator.eval()
    for param in bindevaluator.parameters():
        param.requires_grad = False

    return bindevaluator


def load_solver(checkpoint_path, vocab_size, device):
    lr = 1e-4
    epochs = 200
    embed_dim = 512
    hidden_dim = 256
    epsilon = 1e-3
    batch_size = 256
    warmup_epochs = epochs // 10
    device = 'cuda:0'
    

    probability_denoiser = CNNModel(alphabet_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
    probability_denoiser.load_state_dict(torch.load(checkpoint_path, map_location=device, weights_only=False))
    probability_denoiser.eval()
    for param in probability_denoiser.parameters():
        param.requires_grad = False

    # instantiate a convex path object
    scheduler = PolynomialConvexScheduler(n=2.0)
    path = MixtureDiscreteProbPath(scheduler=scheduler)

    class WrappedModel(ModelWrapper):
        def forward(self, x: torch.Tensor, t: torch.Tensor, **extras):
            return torch.softmax(self.model(x, t), dim=-1)

    wrapped_probability_denoiser = WrappedModel(probability_denoiser)
    solver = MixtureDiscreteEulerSolver(model=wrapped_probability_denoiser, path=path, vocabulary_size=vocab_size)

    return solver


def load_pooled_affinity_predictor(checkpoint_path, device):
    """Load trained model from checkpoint."""
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    model = ImprovedBindingPredictor().to(device)
    
    # Load the trained weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Set to evaluation mode
    
    return model

def load_affinity_predictor(checkpoint_path, device):
    """Load trained model from checkpoint."""
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    model = UnpooledBindingPredictor(
        esm_model_name="facebook/esm2_t33_650M_UR50D",
        hidden_dim=384,
        kernel_sizes=[3, 5, 7],
        n_heads=8,
        n_layers=4,
        dropout=0.14561457009902096,
        freeze_esm=True
    ).to(device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval() 
    
    return model