Tong Chen committed
Commit d2693e0 · 1 Parent(s): f9d1b81
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. peptide/ckpt/PepReDi_base.pt +3 -0
  2. peptide/ckpt/PepReDi_v1.pt +3 -0
  3. peptide/ckpt/PepReDi_v2.pt +3 -0
  4. peptide/ckpt/PepReDi_v3.pt +3 -0
  5. peptide/classifier_ckpt/best_model_half_life.pth +3 -0
  6. peptide/classifier_ckpt/best_model_hemolysis.json +0 -0
  7. peptide/classifier_ckpt/best_model_nonfouling.json +0 -0
  8. peptide/classifier_ckpt/best_model_solubility.json +0 -0
  9. peptide/classifier_ckpt/binding_affinity_pooled.pt +3 -0
  10. peptide/classifier_ckpt/binding_affinity_unpooled.pt +3 -0
  11. peptide/data/test/data-00000-of-00001.arrow +3 -0
  12. peptide/data/test/dataset_info.json +15 -0
  13. peptide/data/test/state.json +13 -0
  14. peptide/data/train/data-00000-of-00001.arrow +3 -0
  15. peptide/data/train/dataset_info.json +15 -0
  16. peptide/data/train/state.json +13 -0
  17. peptide/data/val/data-00000-of-00001.arrow +3 -0
  18. peptide/data/val/dataset_info.json +15 -0
  19. peptide/data/val/state.json +13 -0
  20. peptide/generation.py +213 -0
  21. peptide/moo.py +284 -0
  22. peptide/new_coupling.py +226 -0
  23. peptide/peptide_classifiers.py +568 -0
  24. peptide/rectified_datasets/v1/dataset_dict.json +1 -0
  25. peptide/rectified_datasets/v1/test/data-00000-of-00001.arrow +3 -0
  26. peptide/rectified_datasets/v1/test/dataset_info.json +28 -0
  27. peptide/rectified_datasets/v1/test/state.json +13 -0
  28. peptide/rectified_datasets/v1/train/data-00000-of-00001.arrow +3 -0
  29. peptide/rectified_datasets/v1/train/dataset_info.json +28 -0
  30. peptide/rectified_datasets/v1/train/state.json +13 -0
  31. peptide/rectified_datasets/v1/validation/data-00000-of-00001.arrow +3 -0
  32. peptide/rectified_datasets/v1/validation/dataset_info.json +28 -0
  33. peptide/rectified_datasets/v1/validation/state.json +13 -0
  34. peptide/rectified_datasets/v2/dataset_dict.json +1 -0
  35. peptide/rectified_datasets/v2/test/data-00000-of-00001.arrow +3 -0
  36. peptide/rectified_datasets/v2/test/dataset_info.json +28 -0
  37. peptide/rectified_datasets/v2/test/state.json +13 -0
  38. peptide/rectified_datasets/v2/train/data-00000-of-00001.arrow +3 -0
  39. peptide/rectified_datasets/v2/train/dataset_info.json +28 -0
  40. peptide/rectified_datasets/v2/train/state.json +13 -0
  41. peptide/rectified_datasets/v2/validation/data-00000-of-00001.arrow +3 -0
  42. peptide/rectified_datasets/v2/validation/dataset_info.json +28 -0
  43. peptide/rectified_datasets/v2/validation/state.json +13 -0
  44. peptide/rectified_datasets/v3/dataset_dict.json +1 -0
  45. peptide/rectified_datasets/v3/test/data-00000-of-00001.arrow +3 -0
  46. peptide/rectified_datasets/v3/test/dataset_info.json +28 -0
  47. peptide/rectified_datasets/v3/test/state.json +13 -0
  48. peptide/rectified_datasets/v3/train/data-00000-of-00001.arrow +3 -0
  49. peptide/rectified_datasets/v3/train/dataset_info.json +28 -0
  50. peptide/rectified_datasets/v3/train/state.json +13 -0
peptide/ckpt/PepReDi_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b6437edee514bd7adb9aacea776f3dd97c59ebc7b4928b390eafdd87eaeb8c9
+ size 344474053
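
Each checkpoint in this commit is stored as a Git LFS pointer rather than the raw weights: three plain-text lines giving the spec version, the sha256 object ID, and the payload size in bytes (the actual .pt payload is fetched with `git lfs pull`). A minimal Python sketch for reading that pointer metadata; the helper name is ours, not part of the repo:

def parse_lfs_pointer(path):
    # A Git LFS pointer is plain text: "version ...", "oid sha256:...", "size ...".
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# parse_lfs_pointer("peptide/ckpt/PepReDi_base.pt")
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': 'sha256:3b6437ed...', 'size': '344474053'}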
peptide/ckpt/PepReDi_v1.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b72e98ce0752e6c4805179b37f09c2e3824d22db0113e165988ee4017d6f3d39
+ size 344457840
peptide/ckpt/PepReDi_v2.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6de709e0c6ce9356d37dfc9b6238249c129ee2fd30ad6d03da3b35d5f5a5f8ad
+ size 344457840
peptide/ckpt/PepReDi_v3.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7456b4d156e922a4a499573df4a0cf228315c5540bc67b842af274f605561f79
+ size 344457840
peptide/classifier_ckpt/best_model_half_life.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f80f1b20e90ba30503804c738aad4b3bb253424ff2e6e8a86c8e13a2fa1669f9
+ size 2623795199
peptide/classifier_ckpt/best_model_hemolysis.json ADDED
The diff for this file is too large to render. See raw diff
 
peptide/classifier_ckpt/best_model_nonfouling.json ADDED
The diff for this file is too large to render. See raw diff
 
peptide/classifier_ckpt/best_model_solubility.json ADDED
The diff for this file is too large to render. See raw diff
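
The three best_model_*.json files above are XGBoost boosters saved in XGBoost's native JSON format; peptide_classifiers.py (further down in this diff) loads them exactly this way. A minimal loading sketch, assuming the repo-relative path:

import xgboost as xgb

booster = xgb.Booster(model_file="peptide/classifier_ckpt/best_model_hemolysis.json")
# booster.predict(xgb.DMatrix(features)) then yields per-sequence probabilities.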
 
peptide/classifier_ckpt/binding_affinity_pooled.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91f60e417dfa64277e433b5bc841060d295b43f2d9c19b277b954ce447b44949
+ size 211324073
peptide/classifier_ckpt/binding_affinity_unpooled.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc28ae9f09b981b07547a773ca2e07f241cb08b3b8aa901e66627ff153f3aa8b
+ size 2731670995
peptide/data/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f3a4dde7bb38d2ae4aae44265f1beef5df22f36d21f17e6813641e090cc679c
+ size 82440
peptide/data/test/dataset_info.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/data/test/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "ae4d0541bd157aeb",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/data/train/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:646801034bcc9683b219f5d3195c4f4ce6551c5ecbc1b1bcbdfd8a7027d8a49e
+ size 641784
peptide/data/train/dataset_info.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/data/train/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "7b63856b107c2d5c",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/data/val/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f3a4dde7bb38d2ae4aae44265f1beef5df22f36d21f17e6813641e090cc679c
+ size 82440
peptide/data/val/dataset_info.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids": {
+       "feature": {
+         "dtype": "int32",
+         "_type": "Value"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/data/val/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "ae4d0541bd157aeb",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
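
The three split directories above follow the layout written by the `datasets` library's save_to_disk (one .arrow shard plus dataset_info.json and state.json, with a single int32 "input_ids" list feature), so a split should load back directly. A minimal sketch, assuming the repo-relative path:

from datasets import load_from_disk

train = load_from_disk("peptide/data/train")
print(train)                          # Dataset({features: ['input_ids'], num_rows: ...})
print(train[0]["input_ids"][:10])     # tokenized peptide prefix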
peptide/generation.py ADDED
@@ -0,0 +1,213 @@
+ import argparse
+ import math
+ import torch
+ import torch.nn as nn
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+
+ # --- Model Architecture ---
+ def modulate(x, shift, scale):
+     """
+     Modulates the input tensor x with a shift and scale.
+     """
+     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+ class TimestepEmbedder(nn.Module):
+     """
+     Embeds a continuous scalar timestep t in [0, 1] into a vector representation.
+     """
+     def __init__(self, hidden_size):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(1, hidden_size, bias=True),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+
+     def forward(self, t):
+         # t is shape (batch_size,); it needs to be (batch_size, 1) for the Linear layer.
+         return self.mlp(t.unsqueeze(-1))
+
+ class DiTBlock(nn.Module):
+     """
+     A single block of the Diffusion Transformer.
+     """
+     def __init__(self, hidden_size, n_heads):
+         super().__init__()
+         self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.attn = nn.MultiheadAttention(hidden_size, n_heads, batch_first=True)
+         self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.mlp = nn.Sequential(
+             nn.Linear(hidden_size, 4 * hidden_size),
+             nn.GELU(),
+             nn.Linear(4 * hidden_size, hidden_size)
+         )
+         self.adaLN_modulation = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+
+     def forward(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+         x_norm1 = modulate(self.norm1(x), shift_msa, scale_msa)
+         attn_output, _ = self.attn(x_norm1, x_norm1, x_norm1)
+         x = x + gate_msa.unsqueeze(1) * attn_output
+         x_norm2 = modulate(self.norm2(x), shift_mlp, scale_mlp)
+         mlp_output = self.mlp(x_norm2)
+         x = x + gate_mlp.unsqueeze(1) * mlp_output
+         return x
+
+ class MDLM(nn.Module):
+     """
+     Masked Diffusion Language Model (MDLM) using a DiT backbone.
+     """
+     def __init__(self, vocab_size, seq_len, model_dim, n_heads, n_layers):
+         super().__init__()
+         self.vocab_size = vocab_size
+         self.seq_len = seq_len
+         self.model_dim = model_dim
+         self.mask_token_id = vocab_size  # Use vocab_size as the ID for the mask token
+
+         self.token_embedder = nn.Embedding(vocab_size + 1, model_dim)  # +1 for the mask token
+         self.pos_embedder = nn.Parameter(torch.randn(1, seq_len, model_dim))
+         self.time_embedder = TimestepEmbedder(model_dim)
+
+         self.transformer_blocks = nn.ModuleList([
+             DiTBlock(model_dim, n_heads) for _ in range(n_layers)
+         ])
+
+         self.final_norm = nn.LayerNorm(model_dim)
+         self.lm_head = nn.Linear(model_dim, vocab_size)
+
+     def forward(self, x, t):
+         seq_len = x.shape[1]
+         x_embed = self.token_embedder(x) + self.pos_embedder[:, :seq_len, :]
+         t_embed = self.time_embedder(t)
+         for block in self.transformer_blocks:
+             x_embed = block(x_embed, t_embed)
+         x_embed = self.final_norm(x_embed)
+         logits = self.lm_head(x_embed)
+         return logits
+
+ # --- Generation Function ---
+
+ def generate_samples(model, device, num_samples, seq_len, steps, temperature):
+     """
+     Generates samples by starting from a random sequence and progressively refining it.
+     """
+     model.eval()
+
+     # Start with a completely random sequence of tokens
+     shape = (num_samples, seq_len)
+     x = torch.randint(0, model.vocab_size, shape, dtype=torch.long, device=device)
+
+     # Cosine schedule determines how many tokens we *keep* from the previous step.
+     # It goes from 0 (keep none) to seq_len (keep all).
+     keep_schedule = torch.cos(torch.linspace(math.pi / 2, 0, steps, device=device)) * seq_len
+     keep_schedule = torch.round(keep_schedule).long()
+
+     with torch.no_grad():
+         progress_bar = tqdm(range(steps), desc="Generating Samples")
+         for i in progress_bar:
+             # Time `t` should go from 0 (pure noise) up to 1 (pure data)
+             t_continuous = torch.full((num_samples,), i / steps, device=device)
+
+             logits = model(x, t_continuous)
+
+             # Apply temperature scaling to control diversity
+             scaled_logits = logits / temperature
+             probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
+
+             # Sample a full new sequence from the model's prediction
+             sampled_tokens = torch.multinomial(probs.view(-1, model.vocab_size), 1).view(shape)
+
+             # For the last step, the new sample is our final result
+             if i == steps - 1:
+                 x = sampled_tokens
+                 break
+
+             # Determine which tokens from the *newly sampled sequence* to keep, based on confidence
+             confidence = torch.gather(probs, 2, sampled_tokens.unsqueeze(-1)).squeeze(-1)
+
+             # Find the indices of the most confident tokens to keep
+             num_to_keep = keep_schedule[i]
+             _, indices_to_keep = torch.topk(confidence, num_to_keep, largest=True, dim=-1)
+
+             # Create a mask for the tokens we are keeping
+             keep_mask = torch.zeros_like(x, dtype=torch.bool).scatter_(1, indices_to_keep, True)
+
+             # The next sequence `x` is a mix:
+             # - Where keep_mask is True, we use the new, confident sampled_tokens.
+             # - Where keep_mask is False, we keep the tokens from the previous step `x`.
+             x = torch.where(keep_mask, sampled_tokens, x)
+
+     return x
+
+ # --- Main Execution ---
+
+ def main(args):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     print(f"Loading checkpoint from {args.checkpoint}...")
+     try:
+         checkpoint = torch.load(args.checkpoint, map_location=device, weights_only=False)
+         model_args = checkpoint['args']
+     except FileNotFoundError:
+         print(f"Error: Checkpoint file not found at {args.checkpoint}")
+         return
+     except Exception as e:
+         print(f"Error loading checkpoint: {e}")
+         return
+
+     print("Initializing model...")
+     model = MDLM(
+         vocab_size=model_args.vocab_size,
+         seq_len=model_args.seq_len,
+         model_dim=model_args.model_dim,
+         n_heads=model_args.n_heads,
+         n_layers=model_args.n_layers
+     ).to(device)
+
+     model.load_state_dict(checkpoint['model_state_dict'])
+     print("Model loaded successfully.")
+
+     gen_len = args.gen_len if args.gen_len is not None else model_args.seq_len
+     if gen_len > model_args.seq_len:
+         raise ValueError(f"Requested generation length ({gen_len}) is greater than the model's max length ({model_args.seq_len}).")
+     print(f"Generating sequences of length {gen_len}.")
+
+     generated_tokens = generate_samples(
+         model=model,
+         device=device,
+         num_samples=args.num_samples,
+         seq_len=gen_len,
+         steps=args.gen_steps,
+         temperature=args.temperature
+     )
+
+     print("Decoding and saving samples...")
+     tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+
+     with open(args.output_file, 'w') as f:
+         for sample_tokens in generated_tokens:
+             sequence = tokenizer.decode(sample_tokens.tolist(), skip_special_tokens=False)
+             # Strip whitespace plus the leading "<cls>" and trailing "<eos>" markers (5 characters each).
+             clean_sequence = sequence.replace(" ", "")[5:-5]
+             f.write(clean_sequence + "\n")
+             print(clean_sequence)
+
+     print(f"Generation complete. {args.num_samples} sequences saved to {args.output_file}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate samples from a trained ReDi (MDLM) model starting from random noise.")
+
+     parser.add_argument("--checkpoint", type=str, required=True, help="Path to the model checkpoint file.")
+     parser.add_argument("--num_samples", type=int, default=128, help="Number of samples to generate.")
+     parser.add_argument("--output_file", type=str, default="./generated_peptides.txt", help="File to save the generated peptide sequences.")
+     parser.add_argument("--gen_steps", type=int, default=16, help="Number of steps for the progressive refinement process.")
+     parser.add_argument("--gen_len", type=int, default=None, help="Desired length of the generated sequences. Defaults to the model's maximum trained length.")
+     parser.add_argument("--temperature", type=float, default=1.0, help="Sampling temperature. >1 increases diversity, <1 decreases it.")
+
+     args = parser.parse_args()
+     main(args)
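
The sampler above freezes a growing set of high-confidence tokens per step, and the cosine keep-schedule controls how many. A quick way to see the schedule's shape, using the script's default of 16 steps and an assumed sequence length of 50:

import math
import torch

steps, seq_len = 16, 50
keep = torch.round(torch.cos(torch.linspace(math.pi / 2, 0, steps)) * seq_len).long()
print(keep.tolist())
# Starts near 0 (almost every position is resampled) and ends at seq_len
# (the whole sequence is frozen): early steps explore, late steps refine.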
peptide/moo.py ADDED
@@ -0,0 +1,284 @@
+ import argparse
+ import math
+ import random
+ from collections import Counter
+ import csv
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+
+ from peptide_classifiers import *
+
+
+ # --- Model Architecture (Must match the trained model) ---
+ def modulate(x, shift, scale):
+     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+ class TimestepEmbedder(nn.Module):
+     def __init__(self, hidden_size):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(1, hidden_size, bias=True), nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+     def forward(self, t):
+         return self.mlp(t.unsqueeze(-1))
+
+ class DiTBlock(nn.Module):
+     def __init__(self, hidden_size, n_heads):
+         super().__init__()
+         self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.attn = nn.MultiheadAttention(hidden_size, n_heads, batch_first=True)
+         self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.mlp = nn.Sequential(
+             nn.Linear(hidden_size, 4 * hidden_size), nn.GELU(),
+             nn.Linear(4 * hidden_size, hidden_size)
+         )
+         self.adaLN_modulation = nn.Sequential(
+             nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+     def forward(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+         x_norm1 = modulate(self.norm1(x), shift_msa, scale_msa)
+         attn_output, _ = self.attn(x_norm1, x_norm1, x_norm1)
+         x = x + gate_msa.unsqueeze(1) * attn_output
+         x_norm2 = modulate(self.norm2(x), shift_mlp, scale_mlp)
+         mlp_output = self.mlp(x_norm2)
+         x = x + gate_mlp.unsqueeze(1) * mlp_output
+         return x
+
+ class MDLM(nn.Module):
+     def __init__(self, vocab_size, seq_len, model_dim, n_heads, n_layers):
+         super().__init__()
+         self.vocab_size = vocab_size
+         self.seq_len = seq_len
+         self.model_dim = model_dim
+         self.mask_token_id = vocab_size
+         self.token_embedder = nn.Embedding(vocab_size + 1, model_dim)
+         self.pos_embedder = nn.Parameter(torch.randn(1, seq_len, model_dim))
+         self.time_embedder = TimestepEmbedder(model_dim)
+         self.transformer_blocks = nn.ModuleList([DiTBlock(model_dim, n_heads) for _ in range(n_layers)])
+         self.final_norm = nn.LayerNorm(model_dim)
+         self.lm_head = nn.Linear(model_dim, vocab_size)
+     def forward(self, x, t):
+         seq_len = x.shape[1]
+         x_embed = self.token_embedder(x) + self.pos_embedder[:, :seq_len, :]
+         t_embed = self.time_embedder(t)
+         for block in self.transformer_blocks:
+             x_embed = block(x_embed, t_embed)
+         x_embed = self.final_norm(x_embed)
+         logits = self.lm_head(x_embed)
+         return logits
+
+ class MOGGenerator:
+     def __init__(self, model, device, objectives, args):
+         self.model = model
+         self.device = device
+         self.objectives = objectives
+         self.args = args
+         self.num_objectives = len(objectives)
+
+     def _get_scores(self, x_batch):
+         """Calculates the normalized scores for a batch of sequences."""
+         scores = []
+         for obj_func in self.objectives:
+             scores.append(obj_func(x_batch.to(self.device)))
+         return torch.stack(scores, dim=0)
+
+     def _barker_g(self, u):
+         """Barker balancing function."""
+         return u / (1 + u)
+
+     def generate(self):
+         """Main generation loop."""
+         shape = (self.args.num_samples, self.args.gen_len + 2)
+         x = torch.randint(5, self.model.vocab_size, shape, dtype=torch.long, device=self.device)
+         x[:, 0] = 0
+         x[:, -1] = 2
+
+         if self.args.weights is None:
+             weights = torch.full((self.num_objectives,), 1 / self.num_objectives, device=self.device).view(-1, 1)
+         else:
+             weights = torch.tensor(self.args.weights, device=self.device).view(-1, 1)
+             if len(weights) != self.num_objectives:
+                 raise ValueError("Number of weights must match number of objectives.")
+         print(f"Weights: {weights}")
+
+         if self.args.min_threshold is not None:
+             min_threshold = torch.tensor(self.args.min_threshold, device=self.device)
+         else:
+             min_threshold = None
+
+         total_optimization_steps = self.args.optimization_steps * self.args.gen_len
+
+         with torch.no_grad():
+             for t in tqdm(range(total_optimization_steps), desc="MOG Generation"):
+                 # Anneal guidance strength
+                 eta_t = self.args.eta_min + (self.args.eta_max - self.args.eta_min) * (t / (total_optimization_steps - 1))
+                 # eta_t = 0.5 * (self.args.eta_min + self.args.eta_max)
+                 # Choose a random position to mutate
+                 mut_idx = random.randint(1, self.args.gen_len)
+
+                 # Determine the generation timestep
+                 # We cycle through the timesteps to ensure all are visited
+                 generation_step = t % self.args.optimization_steps
+                 time_t = torch.full((self.args.num_samples,), (generation_step / self.args.optimization_steps), device=self.device)
+
+                 # Get proposal distribution from ReDi model for the chosen position
+                 logits = self.model(x, time_t)
+                 probs = F.softmax(logits, dim=-1)
+                 pos_probs = probs[:, mut_idx, :]
+                 # Zero out each sample's current token so we don't evaluate the same token again
+                 pos_probs[torch.arange(self.args.num_samples, device=self.device), x[:, mut_idx]] = 0
+
+                 # Prune candidate vocabulary using top-p sampling
+                 sorted_probs, sorted_indices = torch.sort(pos_probs, descending=True)
+                 cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+                 remove_mask = cumulative_probs > self.args.top_p
+                 remove_mask[..., 1:] = remove_mask[..., :-1].clone()
+                 remove_mask[..., 0] = 0
+
+                 # Get the set of candidate tokens for each sample in the batch
+                 candidate_tokens_list = []
+                 for i in range(self.args.num_samples):
+                     sample_mask = remove_mask[i]
+                     candidates = sorted_indices[i, ~sample_mask]
+                     candidate_tokens_list.append(candidates)
+
+                 # Get current scores
+                 current_scores = self._get_scores(x)
+                 w_current = torch.exp(eta_t * torch.min(weights * current_scores, dim=0).values)
+
+                 # Evaluate all candidate tokens for each sample
+                 final_proposal_tokens = []
+                 for i in range(self.args.num_samples):
+                     candidates = candidate_tokens_list[i]
+                     candidates = torch.tensor([token for token in candidates if token not in [0, 1, 2, 3]], device=candidates.device)
+                     num_candidates = len(candidates)
+
+                     # Create a batch of proposed sequences for the current sample
+                     x_prop_batch = x[i].repeat(num_candidates, 1)
+                     x_prop_batch[:, mut_idx] = candidates
+
+                     # Evaluate all proposals
+                     proposal_scores = self._get_scores(x_prop_batch)
+                     proposal_s_omega = torch.min(weights * proposal_scores, dim=0).values
+                     w_proposal = torch.exp(eta_t * proposal_s_omega)
+
+                     # Get ReDi probabilities for the candidates
+                     redi_probs = pos_probs[i, candidates]
+
+                     # Calculate unnormalized guided probabilities
+                     tilde_q = redi_probs * self._barker_g(w_proposal / w_current[i])
+
+                     # Normalize and sample the final token
+                     final_probs = tilde_q / (torch.sum(tilde_q) + 1e-9)
+
+                     index = torch.multinomial(final_probs, 1).item()
+                     if torch.sum(weights.squeeze(1) * proposal_scores[:, index]) >= torch.sum(weights.squeeze(1) * current_scores[:, i]):
+                         final_token = candidates[index]
+                         print(f"Previous Weighted Sum: {torch.sum(weights.squeeze(1) * current_scores[:, i])}")
+                         print(f"Previous Scores: {current_scores[:, i]}")
+
+                         print(f"New Weighted Sum: {torch.sum(weights.squeeze(1) * proposal_scores[:, index])}")
+                         print(f"New Scores: {proposal_scores[:, index]}")
+                     else:
+                         final_token = x[i][mut_idx]
+                         # final_token = candidates[index]
+
+                     final_proposal_tokens.append(final_token)
+
+                 # Update the sequences with the chosen tokens
+                 x[torch.arange(self.args.num_samples), mut_idx] = torch.stack(final_proposal_tokens)
+
+             scores = self._get_scores(x)
+
+         return x
+
+ # --- Main Execution ---
+ def main(args):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     target = args.target
+     tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+     target_sequence = tokenizer(target, return_tensors='pt')['input_ids'].to(device)
+
+     affinity_predictor = load_affinity_predictor('/scratch/pranamlab/tong/ReDi_discrete/peptides/classifier_ckpt/binding_affinity_unpooled.pt', device)
+     affinity_model = AffinityModel(affinity_predictor, target_sequence)
+     hemolysis_model = HemolysisModel(device=device)
+     nonfouling_model = NonfoulingModel(device=device)
+     solubility_model = SolubilityModel(device=device)
+     halflife_model = HalfLifeModel(device=device)
+
+     print(f"Loading checkpoint from {args.checkpoint}...")
+     try:
+         checkpoint = torch.load(args.checkpoint, map_location=device, weights_only=False)
+         model_args = checkpoint['args']
+     except Exception as e:
+         print(f"Error loading checkpoint: {e}")
+         return
+
+     print("Initializing model...")
+     model = MDLM(
+         vocab_size=model_args.vocab_size,
+         seq_len=model_args.seq_len,
+         model_dim=model_args.model_dim,
+         n_heads=model_args.n_heads,
+         n_layers=model_args.n_layers
+     ).to(device)
+     model.load_state_dict(checkpoint['model_state_dict'])
+     print("Model loaded successfully.")
+
+     # List of all objective functions
+     OBJECTIVE_FUNCTIONS = [hemolysis_model, nonfouling_model, solubility_model, halflife_model, affinity_model]
+
+     mog_generator = MOGGenerator(model, device, OBJECTIVE_FUNCTIONS, args)
+
+     hemolysis = []
+     nonfouling = []
+     solubility = []
+     halflife = []
+     affinity = []
+
+     for _ in range(args.num_batches):
+         generated_tokens = mog_generator.generate()
+         final_scores = mog_generator._get_scores(generated_tokens).detach().cpu().numpy()
+
+         with open(args.output_file, 'a', newline='') as f:
+             writer = csv.writer(f)
+
+             for i in range(args.num_samples):
+                 sample_tokens = generated_tokens[i]
+                 print(sample_tokens)
+                 sequence_str = tokenizer.decode(sample_tokens.tolist(), skip_special_tokens=False).replace(" ", "")[5:-5]
+
+                 scores = final_scores[:, i]
+
+                 writer.writerow([sequence_str] + scores.tolist())
+
+                 print([sequence_str] + scores.tolist())
+
+     print("Generation complete.")
+
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Multi-Objective Generation with LBP-MOG-ReDi (Single Mutation).")
+
+     parser.add_argument("--checkpoint", type=str, required=True, help="Path to the trained ReDi model checkpoint.")
+     parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to generate.")
+     parser.add_argument("--num_batches", type=int, default=10, help="Number of batches to generate.")
+     parser.add_argument("--output_file", type=str, default="./mog_peptides.txt", help="File to save the generated sequences.")
+     parser.add_argument("--gen_len", type=int, default=50, help="Length of the sequences to generate.")
+     parser.add_argument("--optimization_steps", type=int, default=16, help="Number of passes over the sequence.")
+     parser.add_argument("--weights", type=float, nargs='+', required=False, help="Weights for the objectives (e.g., 0.5 0.5).")
+     parser.add_argument("--min_threshold", type=float, nargs='+', required=False, help="Minimum threshold for the objectives (e.g., 0.2 0.2).")
+     parser.add_argument("--eta_min", type=float, default=1.0, help="Minimum guidance strength for annealing.")
+     parser.add_argument("--eta_max", type=float, default=20.0, help="Maximum guidance strength for annealing.")
+     parser.add_argument("--top_p", type=float, default=0.9, help="Top-p for pruning candidate tokens.")
+
+     parser.add_argument("--target", type=str, required=True)
+     args = parser.parse_args()
+     main(args)
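
The guided proposal in MOGGenerator.generate multiplies each candidate's denoiser probability by the Barker factor g(w_proposal / w_current) with g(u) = u / (1 + u), so candidates that raise the exponentiated weighted-min objective gain mass without ever zeroing out the model prior. A toy numeric sketch (all values hypothetical):

import torch

def barker_g(u):
    # Barker balancing function: g(u) = u / (1 + u).
    return u / (1 + u)

redi_probs = torch.tensor([0.5, 0.3, 0.2])  # denoiser proposals for 3 candidates
w_proposal = torch.tensor([1.2, 0.8, 2.0])  # exp(eta * weighted-min score) per candidate
w_current = torch.tensor(1.0)               # same quantity for the current sequence

tilde_q = redi_probs * barker_g(w_proposal / w_current)
final_probs = tilde_q / (tilde_q.sum() + 1e-9)
print(final_probs)  # the best-scoring candidate gains probability mass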
peptide/new_coupling.py ADDED
@@ -0,0 +1,226 @@
+ import argparse
+ import math
+ import os
+ from collections import defaultdict
+
+ import torch
+ import torch.nn as nn
+ from tqdm import tqdm
+ from datasets import Dataset, DatasetDict
+
+ # --- Model Architecture (Must match the trained model) ---
+ def modulate(x, shift, scale):
+     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+ class TimestepEmbedder(nn.Module):
+     def __init__(self, hidden_size):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(1, hidden_size, bias=True), nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+     def forward(self, t):
+         return self.mlp(t.unsqueeze(-1))
+
+ class DiTBlock(nn.Module):
+     def __init__(self, hidden_size, n_heads):
+         super().__init__()
+         self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.attn = nn.MultiheadAttention(hidden_size, n_heads, batch_first=True)
+         self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.mlp = nn.Sequential(
+             nn.Linear(hidden_size, 4 * hidden_size), nn.GELU(),
+             nn.Linear(4 * hidden_size, hidden_size)
+         )
+         self.adaLN_modulation = nn.Sequential(
+             nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+     def forward(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+         x_norm1 = modulate(self.norm1(x), shift_msa, scale_msa)
+         attn_output, _ = self.attn(x_norm1, x_norm1, x_norm1)
+         x = x + gate_msa.unsqueeze(1) * attn_output
+         x_norm2 = modulate(self.norm2(x), shift_mlp, scale_mlp)
+         mlp_output = self.mlp(x_norm2)
+         x = x + gate_mlp.unsqueeze(1) * mlp_output
+         return x
+
+ class MDLM(nn.Module):
+     def __init__(self, vocab_size, seq_len, model_dim, n_heads, n_layers):
+         super().__init__()
+         self.vocab_size = vocab_size
+         self.seq_len = seq_len
+         self.model_dim = model_dim
+         self.mask_token_id = vocab_size
+         self.token_embedder = nn.Embedding(vocab_size + 1, model_dim)
+         self.pos_embedder = nn.Parameter(torch.randn(1, seq_len, model_dim))
+         self.time_embedder = TimestepEmbedder(model_dim)
+         self.transformer_blocks = nn.ModuleList([DiTBlock(model_dim, n_heads) for _ in range(n_layers)])
+         self.final_norm = nn.LayerNorm(model_dim)
+         self.lm_head = nn.Linear(model_dim, vocab_size)
+     def forward(self, x, t):
+         seq_len = x.shape[1]
+         x_embed = self.token_embedder(x) + self.pos_embedder[:, :seq_len, :]
+         t_embed = self.time_embedder(t)
+         for block in self.transformer_blocks:
+             x_embed = block(x_embed, t_embed)
+         x_embed = self.final_norm(x_embed)
+         logits = self.lm_head(x_embed)
+         return logits
+
+ # --- Generation & Utility Functions ---
+
+ def generate_x1_from_x0(model, device, x0_batch, steps, temperature):
+     model.eval()
+     x = x0_batch.clone()
+     num_samples, seq_len = x.shape
+     keep_schedule = torch.cos(torch.linspace(math.pi / 2, 0, steps, device=device)) * seq_len
+     keep_schedule = torch.round(keep_schedule).long()
+     with torch.no_grad():
+         for i in range(steps):
+             t_continuous = torch.full((num_samples,), 1.0 - (i / steps), device=device)
+             logits = model(x, t_continuous)
+             scaled_logits = logits / temperature
+             probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
+             sampled_tokens = torch.multinomial(probs.view(-1, model.vocab_size), 1).view(x.shape)
+             if i == steps - 1:
+                 x = sampled_tokens
+                 break
+             confidence = torch.gather(probs, 2, sampled_tokens.unsqueeze(-1)).squeeze(-1)
+             num_to_keep = keep_schedule[i]
+             _, indices_to_keep = torch.topk(confidence, num_to_keep, largest=True, dim=-1)
+             keep_mask = torch.zeros_like(x, dtype=torch.bool).scatter_(1, indices_to_keep, True)
+             x = torch.where(keep_mask, sampled_tokens, x)
+     return x
+
+ def is_sample_valid(sample_x1):
+     """
+     Returns False if any special token [0, 1, 2, 3] appears in the middle of the sequence.
+     """
+     middle_sequence = sample_x1[1:-1]
+     invalid_tokens = {0, 1, 2, 3}
+     for token in middle_sequence:
+         if token in invalid_tokens:
+             return False
+     return True
+
+ def create_prebatched_dataset(dataset, max_tokens_per_batch=500):
+     """
+     Groups samples into batches and restructures the dataset.
+     Each row in the new dataset is a complete batch.
+     """
+     # Group samples by their length
+     data_by_length = defaultdict(list)
+     for sample in dataset:
+         length = len(sample['input_ids_x1'])
+         data_by_length[length].append(sample)
+
+     # Create the actual batches
+     batched_data = {'input_ids_x0': [], 'input_ids_x1': []}
+     for length, samples in data_by_length.items():
+         samples_per_batch = max(1, max_tokens_per_batch // length)
+         for i in range(0, len(samples), samples_per_batch):
+             batch_samples = samples[i:i + samples_per_batch]
+
+             batch_x0 = [s['input_ids_x0'] for s in batch_samples]
+             batch_x1 = [s['input_ids_x1'] for s in batch_samples]
+
+             batched_data['input_ids_x0'].append(batch_x0)
+             batched_data['input_ids_x1'].append(batch_x1)
+
+     return Dataset.from_dict(batched_data)
+
+ # --- Main Execution ---
+
+ def main(args):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     print(f"Loading checkpoint from {args.checkpoint}...")
+     try:
+         checkpoint = torch.load(args.checkpoint, map_location=device, weights_only=False)
+         model_args = checkpoint['args']
+     except Exception as e:
+         print(f"Error loading checkpoint: {e}")
+         return
+
+     print("Initializing model...")
+     model = MDLM(
+         vocab_size=model_args.vocab_size,
+         seq_len=model_args.seq_len,
+         model_dim=model_args.model_dim,
+         n_heads=model_args.n_heads,
+         n_layers=model_args.n_layers
+     ).to(device)
+     model.load_state_dict(checkpoint['model_state_dict'])
+     print("Model loaded successfully.")
+
+     all_x0 = []
+     all_x1 = []
+
+     # 1. Generate samples for each length
+     for length in range(args.min_len, args.max_len + 1):
+         print(f"Generating {args.samples_per_len} valid samples for length {length}...")
+         valid_samples_count = 0
+         pbar = tqdm(total=args.samples_per_len)
+         while valid_samples_count < args.samples_per_len:
+             remaining = args.samples_per_len - valid_samples_count
+             batch_size = min(args.batch_size, remaining)
+
+             shape = (batch_size, length)
+             x0_batch = torch.randint(0, model.vocab_size, shape, dtype=torch.long, device=device)
+             x1_batch = generate_x1_from_x0(model, device, x0_batch, args.gen_steps, args.temperature)
+
+             # 2. Perform sanity check on each sample
+             for x0, x1 in zip(x0_batch, x1_batch):
+                 if is_sample_valid(x1.tolist()):
+                     all_x0.append(x0.cpu().tolist())
+                     all_x1.append(x1.cpu().tolist())
+                     valid_samples_count += 1
+                     pbar.update(1)
+                     if valid_samples_count >= args.samples_per_len:
+                         break
+         pbar.close()
+
+     # 3. Create dataset and split
+     print("Splitting dataset...")
+     rectified_data = {'input_ids_x0': all_x0, 'input_ids_x1': all_x1}
+     dataset = Dataset.from_dict(rectified_data)
+     train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
+     valid_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)
+     final_dataset_dict = DatasetDict({
+         'train': train_test_split['train'],
+         'validation': valid_test_split['train'],
+         'test': valid_test_split['test']
+     })
+
+     # 4. Pre-batch each split
+     print("Pre-batching splits...")
+     batched_dataset_dict = DatasetDict()
+     for split_name, split_dataset in final_dataset_dict.items():
+         print(f"Processing {split_name} split...")
+         batched_dataset_dict[split_name] = create_prebatched_dataset(split_dataset)
+
+     # 5. Save the final dataset
+     output_path = f"{args.output_path}/v{args.version}"
+     print(f"Saving new batched dataset to {output_path}...")
+     batched_dataset_dict.save_to_disk(output_path)
+
+     print("Rectification complete.")
+     print(f"Train on this by updating your training script's dataset path to '{output_path}'.")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate a rectified dataset with variable lengths and pre-batching.")
+
+     parser.add_argument("--checkpoint", type=str, required=True)
+     parser.add_argument("--output_path", type=str, default="./rectified_datasets")
+     parser.add_argument("--version", type=str, default='1')
+     parser.add_argument("--samples_per_len", type=int, default=10000)
+     parser.add_argument("--min_len", type=int, default=6)
+     parser.add_argument("--max_len", type=int, default=49)
+     parser.add_argument("--gen_steps", type=int, default=16)
+     parser.add_argument("--temperature", type=float, default=1.0)
+     parser.add_argument("--batch_size", type=int, default=128)
+
+     args = parser.parse_args()
+     main(args)
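
Because create_prebatched_dataset stores one ready-made batch per row (grouped by length, at most max_tokens_per_batch tokens each), a consumer iterates rows directly instead of re-collating; a minimal sketch, assuming the default output path from the arguments above:

import torch
from datasets import load_from_disk

ds = load_from_disk("./rectified_datasets/v1")["train"]
row = ds[0]                             # one pre-built batch
x0 = torch.tensor(row["input_ids_x0"])  # shape: (batch, length) — noise side
x1 = torch.tensor(row["input_ids_x1"])  # shape: (batch, length) — denoised side
print(x0.shape, x1.shape)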
peptide/peptide_classifiers.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ import pytorch_lightning as pl
5
+ import time
6
+ from transformers import AutoModel, AutoConfig, AutoTokenizer
7
+ import xgboost as xgb
8
+ import esm
9
+
10
+ class UnpooledBindingPredictor(nn.Module):
11
+ def __init__(self,
12
+ esm_model_name="facebook/esm2_t33_650M_UR50D",
13
+ hidden_dim=512,
14
+ kernel_sizes=[3, 5, 7],
15
+ n_heads=8,
16
+ n_layers=3,
17
+ dropout=0.1,
18
+ freeze_esm=True):
19
+ super().__init__()
20
+
21
+ # Define binding thresholds
22
+ self.tight_threshold = 7.5 # Kd/Ki/IC50 ≤ ~30nM
23
+ self.weak_threshold = 6.0 # Kd/Ki/IC50 > 1μM
24
+
25
+ # Load ESM model for computing embeddings on the fly
26
+ self.esm_model = AutoModel.from_pretrained(esm_model_name)
27
+ self.config = AutoConfig.from_pretrained(esm_model_name)
28
+
29
+ # Freeze ESM parameters if needed
30
+ if freeze_esm:
31
+ for param in self.esm_model.parameters():
32
+ param.requires_grad = False
33
+
34
+ # Get ESM hidden size
35
+ esm_dim = self.config.hidden_size
36
+
37
+ # Output channels for CNN layers
38
+ output_channels_per_kernel = 64
39
+
40
+ # CNN layers for handling variable length sequences
41
+ self.protein_conv_layers = nn.ModuleList([
42
+ nn.Conv1d(
43
+ in_channels=esm_dim,
44
+ out_channels=output_channels_per_kernel,
45
+ kernel_size=k,
46
+ padding='same'
47
+ ) for k in kernel_sizes
48
+ ])
49
+
50
+ self.binder_conv_layers = nn.ModuleList([
51
+ nn.Conv1d(
52
+ in_channels=esm_dim,
53
+ out_channels=output_channels_per_kernel,
54
+ kernel_size=k,
55
+ padding='same'
56
+ ) for k in kernel_sizes
57
+ ])
58
+
59
+ # Calculate total features after convolution and pooling
60
+ total_features_per_seq = output_channels_per_kernel * len(kernel_sizes) * 2
61
+
62
+ # Project to same dimension after CNN processing
63
+ self.protein_projection = nn.Linear(total_features_per_seq, hidden_dim)
64
+ self.binder_projection = nn.Linear(total_features_per_seq, hidden_dim)
65
+
66
+ self.protein_norm = nn.LayerNorm(hidden_dim)
67
+ self.binder_norm = nn.LayerNorm(hidden_dim)
68
+
69
+ # Cross attention blocks with layer norm
70
+ self.cross_attention_layers = nn.ModuleList([
71
+ nn.ModuleDict({
72
+ 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
73
+ 'norm1': nn.LayerNorm(hidden_dim),
74
+ 'ffn': nn.Sequential(
75
+ nn.Linear(hidden_dim, hidden_dim * 4),
76
+ nn.ReLU(),
77
+ nn.Dropout(dropout),
78
+ nn.Linear(hidden_dim * 4, hidden_dim)
79
+ ),
80
+ 'norm2': nn.LayerNorm(hidden_dim)
81
+ }) for _ in range(n_layers)
82
+ ])
83
+
84
+ # Prediction heads
85
+ self.shared_head = nn.Sequential(
86
+ nn.Linear(hidden_dim * 2, hidden_dim),
87
+ nn.ReLU(),
88
+ nn.Dropout(dropout),
89
+ )
90
+
91
+ # Regression head
92
+ self.regression_head = nn.Linear(hidden_dim, 1)
93
+
94
+ # Classification head (3 classes: tight, medium, loose binding)
95
+ self.classification_head = nn.Linear(hidden_dim, 3)
96
+
97
+ def get_binding_class(self, affinity):
98
+ """Convert affinity values to class indices
99
+ 0: tight binding (>= 7.5)
100
+ 1: medium binding (6.0-7.5)
101
+ 2: weak binding (< 6.0)
102
+ """
103
+ if isinstance(affinity, torch.Tensor):
104
+ tight_mask = affinity >= self.tight_threshold
105
+ weak_mask = affinity < self.weak_threshold
106
+ medium_mask = ~(tight_mask | weak_mask)
107
+
108
+ classes = torch.zeros_like(affinity, dtype=torch.long)
109
+ classes[medium_mask] = 1
110
+ classes[weak_mask] = 2
111
+ return classes
112
+ else:
113
+ if affinity >= self.tight_threshold:
114
+ return 0 # tight binding
115
+ elif affinity < self.weak_threshold:
116
+ return 2 # weak binding
117
+ else:
118
+ return 1 # medium binding
119
+
120
+ def compute_embeddings(self, input_ids, attention_mask=None):
121
+ """Compute ESM embeddings on the fly"""
122
+ esm_outputs = self.esm_model(
123
+ input_ids=input_ids,
124
+ attention_mask=attention_mask,
125
+ return_dict=True
126
+ )
127
+
128
+ # Get the unpooled last hidden states (batch_size x seq_length x hidden_size)
129
+ return esm_outputs.last_hidden_state
130
+
131
+ def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None):
132
+ """Process a sequence through CNN layers and pooling"""
133
+ # Transpose for CNN: [batch_size, hidden_size, seq_length]
134
+ x = unpooled_emb.transpose(1, 2)
135
+
136
+ # Apply CNN layers and collect outputs
137
+ conv_outputs = []
138
+ for conv in conv_layers:
139
+ conv_out = F.relu(conv(x))
140
+ conv_outputs.append(conv_out)
141
+
142
+ # Concatenate along channel dimension
143
+ conv_output = torch.cat(conv_outputs, dim=1)
144
+
145
+ # Global pooling (both max and average)
146
+ # If attention mask is provided, use it to create a proper mask for pooling
147
+ if attention_mask is not None:
148
+ # Create a mask for pooling (1 for valid positions, 0 for padding)
149
+ # Expand mask to match conv_output channels
150
+ expanded_mask = attention_mask.unsqueeze(1).expand(-1, conv_output.size(1), -1)
151
+
152
+ # Apply mask (set padding to large negative value for max pooling)
153
+ masked_output = conv_output.clone()
154
+ masked_output = masked_output.masked_fill(expanded_mask == 0, float('-inf'))
155
+
156
+ # Max pooling along sequence dimension
157
+ max_pooled = torch.max(masked_output, dim=2)[0]
158
+
159
+ # Average pooling (sum divided by number of valid positions)
160
+ sum_pooled = torch.sum(conv_output * expanded_mask, dim=2)
161
+ valid_positions = torch.sum(expanded_mask, dim=2)
162
+ valid_positions = torch.clamp(valid_positions, min=1.0) # Avoid division by zero
163
+ avg_pooled = sum_pooled / valid_positions
164
+ else:
165
+ # If no mask, use standard pooling
166
+ max_pooled = torch.max(conv_output, dim=2)[0]
167
+ avg_pooled = torch.mean(conv_output, dim=2)
168
+
169
+ # Concatenate the pooled features
170
+ pooled = torch.cat([max_pooled, avg_pooled], dim=1)
171
+
172
+ return pooled
173
+
174
+ def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None):
175
+ # Compute embeddings on the fly using the ESM model
176
+ protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask)
177
+ binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask)
178
+
179
+ # Process protein and binder sequences through CNN layers
180
+ protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask)
181
+ binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask)
182
+
183
+ # Project to same dimension
184
+ protein = self.protein_norm(self.protein_projection(protein_features))
185
+ binder = self.binder_norm(self.binder_projection(binder_features))
186
+
187
+ # Reshape for attention: from [batch_size, hidden_dim] to [1, batch_size, hidden_dim]
188
+ protein = protein.unsqueeze(0)
189
+ binder = binder.unsqueeze(0)
190
+
191
+ # Cross attention layers
192
+ for layer in self.cross_attention_layers:
193
+ # Protein attending to binder
194
+ attended_protein = layer['attention'](
195
+ protein, binder, binder
196
+ )[0]
197
+ protein = layer['norm1'](protein + attended_protein)
198
+ protein = layer['norm2'](protein + layer['ffn'](protein))
199
+
200
+ # Binder attending to protein
201
+ attended_binder = layer['attention'](
202
+ binder, protein, protein
203
+ )[0]
204
+ binder = layer['norm1'](binder + attended_binder)
205
+ binder = layer['norm2'](binder + layer['ffn'](binder))
206
+
207
+ # Remove sequence dimension
208
+ protein_pool = protein.squeeze(0)
209
+ binder_pool = binder.squeeze(0)
210
+
211
+ # Concatenate both representations
212
+ combined = torch.cat([protein_pool, binder_pool], dim=-1)
213
+
214
+ # Shared features
215
+ shared_features = self.shared_head(combined)
216
+
217
+ regression_output = self.regression_head(shared_features)
218
+ # classification_logits = self.classification_head(shared_features)
219
+
220
+ # return regression_output, classification_logits
221
+ return regression_output
222
+
223
+ class ImprovedBindingPredictor(nn.Module):
224
+ def __init__(self,
225
+ esm_dim=1280,
226
+ smiles_dim=1280,
227
+ hidden_dim=512,
228
+ n_heads=8,
229
+ n_layers=5,
230
+ dropout=0.1):
231
+ super().__init__()
232
+
233
+ # Define binding thresholds
234
+ self.tight_threshold = 7.5 # Kd/Ki/IC50 ≤ ~30nM
235
+ self.weak_threshold = 6.0 # Kd/Ki/IC50 > 1μM
236
+
237
+ # Project to same dimension
238
+ self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
239
+ self.protein_projection = nn.Linear(esm_dim, hidden_dim)
240
+ self.protein_norm = nn.LayerNorm(hidden_dim)
241
+ self.smiles_norm = nn.LayerNorm(hidden_dim)
242
+
243
+ # Cross attention blocks with layer norm
244
+ self.cross_attention_layers = nn.ModuleList([
245
+ nn.ModuleDict({
246
+ 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
247
+ 'norm1': nn.LayerNorm(hidden_dim),
248
+ 'ffn': nn.Sequential(
249
+ nn.Linear(hidden_dim, hidden_dim * 4),
250
+ nn.ReLU(),
251
+ nn.Dropout(dropout),
252
+ nn.Linear(hidden_dim * 4, hidden_dim)
253
+ ),
254
+ 'norm2': nn.LayerNorm(hidden_dim)
255
+ }) for _ in range(n_layers)
256
+ ])
257
+
258
+ # Prediction heads
259
+ self.shared_head = nn.Sequential(
260
+ nn.Linear(hidden_dim * 2, hidden_dim),
261
+ nn.ReLU(),
262
+ nn.Dropout(dropout),
263
+ )
264
+
265
+ # Regression head
266
+ self.regression_head = nn.Linear(hidden_dim, 1)
267
+
268
+ # Classification head (3 classes: tight, medium, loose binding)
269
+ self.classification_head = nn.Linear(hidden_dim, 3)
270
+
271
+ def get_binding_class(self, affinity):
272
+ """Convert affinity values to class indices
273
+ 0: tight binding (>= 7.5)
274
+ 1: medium binding (6.0-7.5)
275
+ 2: weak binding (< 6.0)
276
+ """
277
+ if isinstance(affinity, torch.Tensor):
278
+ tight_mask = affinity >= self.tight_threshold
279
+ weak_mask = affinity < self.weak_threshold
280
+ medium_mask = ~(tight_mask | weak_mask)
281
+
282
+ classes = torch.zeros_like(affinity, dtype=torch.long)
283
+ classes[medium_mask] = 1
284
+ classes[weak_mask] = 2
285
+ return classes
286
+ else:
287
+ if affinity >= self.tight_threshold:
288
+ return 0 # tight binding
289
+ elif affinity < self.weak_threshold:
290
+ return 2 # weak binding
291
+ else:
292
+ return 1 # medium binding
293
+
294
+ def forward(self, protein_emb, binder_emb):
295
+
296
+ protein = self.protein_norm(self.protein_projection(protein_emb))
297
+ smiles = self.smiles_norm(self.smiles_projection(binder_emb))
298
+
299
+ protein = protein.transpose(0, 1)
300
+ smiles = smiles.transpose(0, 1)
301
+
302
+ # Cross attention layers
303
+ for layer in self.cross_attention_layers:
304
+ # Protein attending to SMILES
305
+ attended_protein = layer['attention'](
306
+ protein, smiles, smiles
307
+ )[0]
308
+ protein = layer['norm1'](protein + attended_protein)
309
+ protein = layer['norm2'](protein + layer['ffn'](protein))
310
+
311
+ # SMILES attending to protein
312
+ attended_smiles = layer['attention'](
313
+ smiles, protein, protein
314
+ )[0]
315
+ smiles = layer['norm1'](smiles + attended_smiles)
316
+ smiles = layer['norm2'](smiles + layer['ffn'](smiles))
317
+
318
+ # Get sequence-level representations
319
+ protein_pool = torch.mean(protein, dim=0)
320
+ smiles_pool = torch.mean(smiles, dim=0)
321
+
322
+ # Concatenate both representations
323
+ combined = torch.cat([protein_pool, smiles_pool], dim=-1)
324
+
325
+ # Shared features
326
+ shared_features = self.shared_head(combined)
327
+
328
+ regression_output = self.regression_head(shared_features)
329
+
330
+ return regression_output
331
+
332
+ class PooledAffinityModel(nn.Module):
333
+ def __init__(self, affinity_predictor, target_sequence):
334
+ super(PooledAffinityModel, self).__init__()
335
+ self.affinity_predictor = affinity_predictor
336
+ self.target_sequence = target_sequence
337
+ self.esm_model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.target_sequence.device)
338
+ for param in self.esm_model.parameters():
339
+ param.requires_grad = False
340
+
341
+ def compute_embeddings(self, input_ids, attention_mask=None):
342
+ """Compute ESM embeddings on the fly"""
343
+ esm_outputs = self.esm_model(
344
+ input_ids=input_ids,
345
+ attention_mask=attention_mask,
346
+ return_dict=True
347
+ )
348
+
349
+ # Get the unpooled last hidden states (batch_size x seq_length x hidden_size)
350
+ return esm_outputs.last_hidden_state
351
+
352
+ def forward(self, x):
353
+ target_sequence = self.target_sequence.repeat(x.shape[0], 1)
354
+
355
+ protein_emb = self.compute_embeddings(input_ids=target_sequence)
356
+ binder_emb = self.compute_embeddings(input_ids=x)
357
+ return self.affinity_predictor(protein_emb=protein_emb, binder_emb=binder_emb).squeeze(-1)
358
+
359
+ class AffinityModel(nn.Module):
360
+ def __init__(self, affinity_predictor, target_sequence):
361
+ super(AffinityModel, self).__init__()
362
+ self.affinity_predictor = affinity_predictor
363
+ self.target_sequence = target_sequence
364
+
365
+ def forward(self, x):
366
+ target_sequence = self.target_sequence.repeat(x.shape[0], 1)
367
+ affinity = self.affinity_predictor(protein_input_ids=target_sequence, binder_input_ids=x).squeeze(-1)
368
+ return affinity / 10
369
+
370
+ class HemolysisModel:
371
+ def __init__(self, device):
372
+ self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_hemolysis.json')
373
+
374
+ self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
375
+ self.model.eval()
376
+
377
+ self.device = device
378
+
379
+ def generate_embeddings(self, sequences):
380
+ """Generate ESM embeddings for protein sequences"""
381
+ with torch.no_grad():
382
+ embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
383
+ embeddings = embeddings.cpu().numpy()
384
+
385
+ return embeddings
386
+
387
+ def get_scores(self, input_seqs):
388
+ scores = np.ones(len(input_seqs))
389
+ features = self.generate_embeddings(input_seqs)
390
+
391
+ if len(features) == 0:
392
+ return scores
393
+
394
+ features = np.nan_to_num(features, nan=0.)
395
+ features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
396
+
397
+ features = xgb.DMatrix(features)
398
+
399
+ probs = self.predictor.predict(features)
400
+ # return the probability of it being not hemolytic
401
+ return torch.from_numpy(scores - probs).to(self.device)
402
+
403
+ def __call__(self, input_seqs: list):
404
+ scores = self.get_scores(input_seqs)
405
+ return scores
406
+
407
+ class NonfoulingModel:
408
+ def __init__(self, device):
409
+ # change model path
410
+ self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_nonfouling.json')
411
+
412
+ self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
413
+ self.model.eval()
414
+
415
+ self.device = device
416
+
417
+ def generate_embeddings(self, sequences):
418
+ """Generate ESM embeddings for protein sequences"""
419
+ with torch.no_grad():
420
+ embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
421
+ embeddings = embeddings.cpu().numpy()
422
+
423
+ return embeddings
424
+
425
+ def get_scores(self, input_seqs):
426
+ scores = np.zeros(len(input_seqs))
427
+ features = self.generate_embeddings(input_seqs)
428
+
429
+ if len(features) == 0:
430
+ return scores
431
+
432
+ features = np.nan_to_num(features, nan=0.)
433
+ features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
434
+
435
+ features = xgb.DMatrix(features)
436
+
437
+ scores = self.predictor.predict(features)
438
+ return torch.from_numpy(scores).to(self.device)
439
+
440
+ def __call__(self, input_seqs: list):
441
+ scores = self.get_scores(input_seqs)
442
+ return scores
443
+
444
+ class SolubilityModel:
+     def __init__(self, device):
+         # XGBoost classifier over mean-pooled ESM-2 embeddings.
+         self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_solubility.json')
+ 
+         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
+         self.model.eval()
+ 
+         self.device = device
+ 
+     def generate_embeddings(self, sequences):
+         """Generate mean-pooled ESM embeddings for a batch of token ids."""
+         with torch.no_grad():
+             embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
+         return embeddings.cpu().numpy()
+ 
+     def get_scores(self, input_seqs):
+         # Default score of 0 when no embeddings are available.
+         scores = np.zeros(len(input_seqs))
+         features = self.generate_embeddings(input_seqs)
+ 
+         if len(features) == 0:
+             return torch.from_numpy(scores).to(self.device)
+ 
+         features = np.nan_to_num(features, nan=0.)
+         features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+         features = xgb.DMatrix(features)
+ 
+         # P(soluble) straight from the classifier.
+         scores = self.predictor.predict(features)
+         return torch.from_numpy(scores).to(self.device)
+ 
+     def __call__(self, input_seqs):
+         return self.get_scores(input_seqs)
+ 
+ class PeptideCNN(nn.Module):
+     def __init__(self, input_dim, hidden_dims, output_dim, dropout_rate):
+         super().__init__()
+         self.conv1 = nn.Conv1d(input_dim, hidden_dims[0], kernel_size=3, padding=1)
+         # Note: kernel_size=5 with padding=1 shrinks the sequence length by 2.
+         # This is harmless here because the output is mean-pooled over length.
+         self.conv2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=5, padding=1)
+         self.fc = nn.Linear(hidden_dims[1], output_dim)
+         self.dropout = nn.Dropout(dropout_rate)
+         self.predictor = nn.Linear(output_dim, 1)  # regression/classification head
+ 
+         self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
+         self.esm_model.eval()
+ 
+     def forward(self, input_ids, attention_mask=None, return_features=False):
+         # ESM-2 acts as a frozen feature extractor.
+         with torch.no_grad():
+             x = self.esm_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
+         # x shape: (B, L, input_dim)
+         x = x.permute(0, 2, 1)  # reshape to (B, input_dim, L) for Conv1d
+         x = nn.functional.relu(self.conv1(x))
+         x = self.dropout(x)
+         x = nn.functional.relu(self.conv2(x))
+         x = self.dropout(x)
+         x = x.permute(0, 2, 1)  # reshape back to (B, L', hidden_dims[1])
+ 
+         # Global average pooling over the sequence dimension
+         x = x.mean(dim=1)  # shape: (B, hidden_dims[1])
+ 
+         features = self.fc(x)  # shape: (B, output_dim)
+         if return_features:
+             return features
+         return self.predictor(features)  # shape: (B, 1)
+ 
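A quick shape check of the convolutional path, shown as a sketch rather than part of the commit; the ESM embedding is replaced by a random tensor so the 650M model need not be loaded, and the dimensions mirror HalfLifeModel below (input_dim=1280, hidden_dims=[640, 320]):

    import torch
    import torch.nn as nn

    conv1 = nn.Conv1d(1280, 640, kernel_size=3, padding=1)
    conv2 = nn.Conv1d(640, 320, kernel_size=5, padding=1)

    x = torch.randn(2, 50, 1280).permute(0, 2, 1)  # (B, input_dim, L)
    x = conv2(conv1(x))
    print(x.shape)  # torch.Size([2, 320, 48]): L shrinks 50 -> 48, then pooling removes L anyway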
+ class HalfLifeModel:
+     def __init__(self, device):
+         input_dim = 1280  # hidden size of esm2_t33_650M_UR50D
+         hidden_dims = [input_dim // 2, input_dim // 4]
+         output_dim = input_dim // 8
+         dropout_rate = 0.3
+         self.model = PeptideCNN(input_dim, hidden_dims, output_dim, dropout_rate).to(device)
+         self.model.load_state_dict(torch.load('./classifier_ckpt/best_model_half_life.pth', map_location=device, weights_only=False))
+         self.model.eval()
+ 
+     def __call__(self, x):
+         prediction = self.model(x, return_features=False)
+         # Clamp the raw prediction to [0, 2], then rescale to [0, 1].
+         half_life = torch.clamp(prediction.squeeze(-1), max=2.0, min=0.0)
+         return half_life / 2
+ 
+ 
+ def load_bindevaluator(checkpoint_path, device):
+     bindevaluator = BindEvaluator.load_from_checkpoint(checkpoint_path, n_layers=8, d_model=128, d_hidden=128, n_head=8, d_k=64, d_v=128, d_inner=64).to(device)
+     bindevaluator.eval()
+     # Freeze all parameters: the evaluator is used purely as a scoring function.
+     for param in bindevaluator.parameters():
+         param.requires_grad = False
+ 
+     return bindevaluator
+ 
+ 
+ def load_pooled_affinity_predictor(checkpoint_path, device):
+     """Load the trained pooled binding predictor from a checkpoint."""
+     checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+ 
+     model = ImprovedBindingPredictor().to(device)
+ 
+     # Load the trained weights and switch to evaluation mode.
+     model.load_state_dict(checkpoint['model_state_dict'])
+     model.eval()
+ 
+     return model
+ 
+ def load_affinity_predictor(checkpoint_path, device):
+     """Load the trained unpooled binding predictor from a checkpoint."""
+     checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+ 
+     model = UnpooledBindingPredictor(
+         esm_model_name="facebook/esm2_t33_650M_UR50D",
+         hidden_dim=384,
+         kernel_sizes=[3, 5, 7],
+         n_heads=8,
+         n_layers=4,
+         dropout=0.14561457009902096,
+         freeze_esm=True
+     ).to(device)
+ 
+     model.load_state_dict(checkpoint['model_state_dict'])
+     model.eval()
+ 
+     return model
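A sketch of how the loaders and scorers above might be wired together at startup; the checkpoint paths and variable names here are placeholders, not files referenced by this code:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Hypothetical paths -- substitute the real checkpoints from classifier_ckpt/.
    pooled_predictor = load_pooled_affinity_predictor("path/to/pooled_checkpoint.pt", device)
    unpooled_predictor = load_affinity_predictor("path/to/unpooled_checkpoint.pt", device)
    half_life_scorer = HalfLifeModel(device)
    hemolysis_scorer = HemolysisModel(device)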
peptide/rectified_datasets/v1/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation", "test"]}
peptide/rectified_datasets/v1/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:087edd095dcd714192f1d4ef341b1894bcee3fb03d5453c2b04d8ce031589318
+ size 19749472
peptide/rectified_datasets/v1/test/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v1/test/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "118d550fe7101754",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v1/train/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b4af46986df1e635f0dff8e3f00c4d4c06aac26161836ca694299d1cc0bd20f
+ size 157859216
peptide/rectified_datasets/v1/train/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v1/train/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "a5ddb0c42fb68c3f",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v1/validation/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:409939a66eebfac56a884440065ec2e6cd1b81632fede0d4e2156cd56600a2b8
+ size 19725216
peptide/rectified_datasets/v1/validation/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v1/validation/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "3a37666e1156a9e6",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
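These rectified dataset directories follow the standard Hugging Face `datasets` on-disk layout (`dataset_dict.json` plus per-split Arrow shards and `state.json`), so each version can be reloaded with `load_from_disk`. A sketch, assuming the repo root as the working directory:

    from datasets import load_from_disk

    ds = load_from_disk("peptide/rectified_datasets/v1")
    print(ds)                    # DatasetDict with train / validation / test splits
    print(ds["train"].features)  # input_ids_x0, input_ids_x1: nested lists of int64 token ids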
peptide/rectified_datasets/v2/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation", "test"]}
peptide/rectified_datasets/v2/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd85592cab50715eb0268f061670b7a98d3871a7aa1baefd40c6a23055f489d6
+ size 19749472
peptide/rectified_datasets/v2/test/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v2/test/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "03f0e67bb58fcf47",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v2/train/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26bea1ec7f7db609a4949a6209bc5536003f7ab169f1e03d5a256db81af434bd
+ size 157859216
peptide/rectified_datasets/v2/train/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v2/train/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "c41975ecd76982be",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v2/validation/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff40262e105e2c748e8e5ca8f89b5882af643f099227d65e6c4b13da8d328094
+ size 19725216
peptide/rectified_datasets/v2/validation/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v2/validation/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "39ddf61d20fce77a",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v3/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "validation", "test"]}
peptide/rectified_datasets/v3/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d6169a4b613ec0ac0c7fdcda92ec031f6eab8a2d0cf451dd744b780ea096825
+ size 19749472
peptide/rectified_datasets/v3/test/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v3/test/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "f6aed185a066dd98",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
peptide/rectified_datasets/v3/train/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a7828b0d21a1d8103347e7bdc4438caf3dd9f1d1bf1158fcec31e1de0d9bcea
+ size 157859216
peptide/rectified_datasets/v3/train/dataset_info.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "input_ids_x0": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     },
+     "input_ids_x1": {
+       "feature": {
+         "feature": {
+           "dtype": "int64",
+           "_type": "Value"
+         },
+         "_type": "List"
+       },
+       "_type": "List"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
peptide/rectified_datasets/v3/train/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "448b61e862d72291",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }