vladimir.manuylov committed
Commit bd082dc · 1 Parent(s): 8e21d42

initial commit

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Jeremy Wohlwend, Gabriele Corso, Saro Passaro
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: ProtoBind Diff
- emoji: 🏢
+ emoji: 💊
  colorFrom: yellow
  colorTo: gray
  sdk: gradio
@@ -11,4 +11,31 @@ license: mit
  short_description: Structure-free target-specific molecule generation
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## A Structure-Free Diffusion Language Model for Protein Sequence-Conditioned Ligand Design
+
+ <a href="https://www.biorxiv.org/content/10.1101/2025.06.16.659955v1">
+ <img
+ src="https://img.shields.io/badge/bioRxiv-paper-blue?logo=biorxiv&logoColor=white"
+ alt="Paper on bioRxiv"
+ />
+ </a>
+ <a href="https://github.com/gero-science/ProtoBind-Diff">
+ <img
+ src="https://img.shields.io/badge/GitHub-code-black?logo=github&logoColor=white"
+ alt="View on GitHub"
+ />
+ </a>
+
+ ## Citation
+
+ ```bibtex
+ @article{Mistryukova2025.06.16.659955,
+ author = {Mistryukova, Lukia and Manuilov, Vladimir and Avchaciov, Konstantin and Fedichev, Peter O.},
+ title = {ProtoBind-Diff: A Structure-Free Diffusion Language Model for Protein Sequence-Conditioned Ligand Design},
+ year = {2025},
+ journal = {bioRxiv}
+ }
+ ```
+
+ ## License
+ The code and model weights are released under MIT license. See the [LICENSE](LICENSE) file for details.
app.py ADDED
@@ -0,0 +1,146 @@
1
+ # app.py
2
+ # --- IMPORTS ---
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ import lightning.pytorch as pl
10
+ from protobind_diff.esm_inference import get_esm_embedding
11
+ from protobind_diff.model import ModelGenerator
12
+ from protobind_diff.data_loader import InferenceDataset
13
+ from huggingface_hub import hf_hub_download
14
+
15
+ # Hugging Face Hub details
16
+ REPO_ID = "ai-gero/ProtoBind-Diff"
17
+ MODEL_FILENAME = "model.ckpt"
18
+ TOKENIZER_FILENAME = "tokenizer_smiles_diffusion.json"
19
+
20
+
21
+ def generate_smiles_for_sequence(protein_sequence: str, num_samples: int):
22
+ """
23
+ The main prediction function that runs the full pipeline.
24
+ """
25
+ if not protein_sequence:
26
+ raise gr.Error("Protein sequence cannot be empty.")
27
+ protein_sequence = re.sub(r"[^A-Z]", "", protein_sequence.upper())
28
+ if len(protein_sequence) < 10:
29
+ raise gr.Error("Protein sequence is too short.")
30
+
31
+ embedding = get_esm_embedding(
32
+ protein_sequence,
33
+ 'esm2_t33_650M_UR50D',
34
+ device
35
+ ).to(dtype=torch.bfloat16)
36
+ n_batches = num_samples // 10
37
+ dataset = InferenceDataset(embedding, batch_size=10, n_batches=n_batches)
38
+ loader = DataLoader(dataset, batch_size=None)
39
+
40
+ trainer = pl.Trainer(
41
+ accelerator="auto",
42
+ devices=1,
43
+ logger=False,
44
+ precision="16-mixed" if device == "cuda" else "32-true"
45
+ )
46
+
47
+ predictions_batches = trainer.predict(model=protobind_model, dataloaders=loader)
48
+
49
+ all_smiles = [smi for batch in predictions_batches for smi in batch[0]]
50
+ unique_smiles = list(set(all_smiles))
51
+
52
+ return ",\n".join(unique_smiles)
53
+
54
+
55
+ # --- GRADIO APP DEFINITION ---
56
+
57
+ # Load models on app startup
58
+ device = "cuda" if torch.cuda.is_available() else "cpu"
59
+ tokenizer_path = hf_hub_download(
60
+ repo_id=REPO_ID,
61
+ filename=TOKENIZER_FILENAME,
62
+ )
63
+ ckpt_path = hf_hub_download(
64
+ repo_id=REPO_ID,
65
+ filename=MODEL_FILENAME,
66
+ )
67
+ protobind_model = ModelGenerator.load_from_checkpoint(
68
+ ckpt_path,
69
+ map_location=device,
70
+ tokenizer_path=tokenizer_path,
71
+ seq_embedding_dim=1280,
72
+ load=True,
73
+ )
74
+ protobind_model.eval()
75
+ protobind_model.to(device)
76
+ # Define the UI
77
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
78
+ gr.Markdown(
79
+ """
80
+ # ProtoBind-Diff: Protein-Conditioned Ligand Generation
81
+ This Space demonstrates **ProtoBind-Diff**, a diffusion model for generating novel drug-like molecules (ligands)
82
+ conditioned on a target protein sequence. Provide a protein's amino acid sequence to generate potential binding molecules in SMILES format.
83
+ """
84
+ )
85
+
86
+ with gr.Row():
87
+ with gr.Column(scale=2):
88
+ protein_sequence = gr.Textbox(
89
+ lines=10,
90
+ label="Protein Amino Acid Sequence",
91
+ placeholder="Enter your protein sequence here (e.g., MGY...)"
92
+ )
93
+ num_samples = gr.Slider(
94
+ minimum=10,
95
+ maximum=200,
96
+ value=50,
97
+ step=10,
98
+ label="Generation Attempts",
99
+ info=(
100
+ "Upper limit on generation attempts. Duplicates and invalid molecules "
101
+ "are discarded, so the final count of unique molecules may be lower. "
102
+ "More attempts increase runtime but can improve diversity."
103
+ )
104
+ )
105
+ submit_btn = gr.Button("Generate Molecules", variant="primary")
106
+
107
+ with gr.Column(scale=3):
108
+ output_smiles = gr.Textbox(
109
+ lines=15,
110
+ label="Generated SMILES",
111
+ info="A list of unique, valid SMILES strings generated for the target protein.",
112
+ interactive=True
113
+ )
114
+
115
+ gr.Markdown("### Examples")
116
+ gr.Examples(
117
+ examples=[
118
+ ["MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFEHQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQHLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHKRIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS",
119
+ 50],
120
+ ["MDILCEENTSLSSTTNSLMQLNDDTRLYSNDFNSGEANTSDAFNWTVDSENRTNLSCEGCLSPSCLSLLHLQEKNWSALLTAVVIILTIAGNILVIMAVSLEKKLQNATNYFLMSLAIADMLLGFLVMPVSMLTILYGYRWPLPSKLCAVWIYLDVLFSTASIMHLCAISLDRYVAIQNPIHHSRFNSRTKAFLKIIAVWTISVGISMPIPVFGLQDDSKVFKEGSCLLADDNFVLIGSFVSFFIPLTIMVITYFLTIKSLQKEATLCVSDLGTRAKLASFSFLPQSSLSSEKLFQRSIHREPGSYTGRRTMQSISNEQKACKVLGIVFFLFVVMWCPFFITNIMAVICKESCNEDVIGALLNVFVWIGYLSSAVNPLVYTLFNKTYRSAFSRYIQCQYKENKKPLQLILVNTIPALAYKSSQLQMGQKKNSKQDAKTTDNDCSMVALGKQHSEEASKDNSDGVNEKVSCV",
121
+ 100]
122
+ ],
123
+ inputs=[protein_sequence, num_samples],
124
+ outputs=output_smiles,
125
+ fn=generate_smiles_for_sequence,
126
+ cache_examples=False,
127
+ )
128
+
129
+ gr.Markdown(
130
+ """
131
+ ---
132
+ *Model developed by Gero AI. For more details, check out the [original repository](https://github.com/gero-science/ProtoBind-Diff).*
133
+ """
134
+ )
135
+
136
+ submit_btn.click(
137
+ fn=generate_smiles_for_sequence,
138
+ inputs=[protein_sequence, num_samples],
139
+ outputs=output_smiles
140
+ )
141
+
142
+ # Launch the app
143
+ if __name__ == "__main__":
144
+ demo.launch(share=True)
145
+
146
+
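Because `demo` is a plain Gradio Blocks app, the deployed Space can also be queried programmatically. Below is a minimal, hypothetical client-side sketch using `gradio_client`; the Space id and the endpoint name are assumptions (they are not stated in this commit), so inspect `client.view_api()` for the actual values.

```python
# Hypothetical client call to the deployed Space; the Space id and api_name are assumptions.
from gradio_client import Client

client = Client("ai-gero/ProtoBind-Diff")            # assumed Space id
smiles = client.predict(
    "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMV",       # protein sequence (shortened illustration)
    50,                                              # generation attempts
    api_name="/generate_smiles_for_sequence",        # assumed endpoint name
)
print(smiles)
```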
protobind_diff/__init__.py ADDED
File without changes
protobind_diff/data_loader.py ADDED
@@ -0,0 +1,761 @@
1
+ # Data loader for the protobind-diff.
2
+ # This version only supports ProtobindMaskedDiffusion with SMILES and ESM-2 protein encodings.
3
+ import os.path
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+ from enum import Enum
8
+ from typing import Dict, List, Tuple, Optional, Union
9
+ from zipfile import ZipFile
10
+
11
+ import lightning.pytorch as pl
12
+ import numpy as np
13
+ import torch
14
+ import pandas as pd
15
+ from torch.utils.data import Dataset, DataLoader
16
+ from torch.nn.utils.rnn import pad_sequence
17
+ from tqdm.auto import tqdm
18
+
19
+ from .ligands.smiles_tokenizer import ChemformerTokenizer
20
+ from .ligands.rdkit_utils import randomize_smiles_rotated, cluster_fpsim2
21
+
22
+ logger = logging.getLogger("lightning")
23
+
24
+
25
+ class SplittingMethod(Enum):
26
+ # enum that describes various train/val/test splitting methods.
27
+ RANDOM = 1
28
+
29
+
30
+ def split_at_random(df: pd.DataFrame, valid_fraction=0.1, test_fraction=0.1, seed=777):
31
+ """Randomly splits a DataFrame into training, validation, and test sets.
32
+
33
+ Args:
34
+ df (pd.DataFrame): The DataFrame to split.
35
+ valid_fraction (float): The fraction of the data to allocate to the validation set.
36
+ test_fraction (float): The fraction of the data to allocate to the test set.
37
+ seed (int): The random seed for shuffling to ensure reproducibility.
38
+
39
+ Returns:
40
+ Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the
41
+ training, validation, and test DataFrames.
42
+ """
43
+ df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
44
+ valid_size = int(len(df) * valid_fraction)
45
+ test_size = int(len(df) * test_fraction)
46
+ train_size = len(df) - valid_size - test_size
47
+ train_df = df[:train_size]
48
+ valid_df = df[train_size:train_size + valid_size]
49
+ test_df = df[train_size + valid_size:]
50
+ return train_df, valid_df, test_df
51
+
52
+
53
+ class RandomizedSmilesDataset(object):
54
+ """Creates a dataset of tokenized SMILES strings, with an option for on-the-fly randomization.
55
+
56
+ This dataset maps integer indices to SMILES strings and provides tokenized
57
+ representations. It can randomize SMILES strings during data retrieval to
58
+ augment the training data.
59
+
60
+ Attributes:
61
+ smiles (pd.Series): A series of SMILES strings indexed by integers.
62
+ tokenizer (ChemformerTokenizer): The tokenizer for converting SMILES to tokens.
63
+ randomize (bool): If True, applies SMILES randomization at retrieval time.
64
+ """
65
+ def __init__(self, smiles: dict, tokenizer: ChemformerTokenizer,
66
+ randomize: bool = True):
67
+ self.smiles = pd.Series(data=smiles.keys(), index=smiles.values()).sort_index()
68
+ assert len(self.smiles) == self.smiles.index[-1] + 1, (f"{len(self.smiles)}"
69
+ f" {self.smiles.index[:5]} {self.smiles.index[-5:]}")
70
+ self.tokenizer = tokenizer
71
+ self.randomize = randomize
72
+ logger.info(f"Molecular dataset initialized: RandomizedSmilesDataset {type(self.tokenizer)}"
73
+ f" random: {self.randomize}")
74
+
75
+ def __len__(self):
76
+ return len(self.smiles)
77
+
78
+ def __getitem__(self, item):
79
+ smi = self.smiles[item]
80
+ if self.randomize:
81
+ smi = randomize_smiles_rotated(smi)
82
+ mol = self.tokenizer.encode(smi)[0]
83
+ return mol
84
+
85
+ @classmethod
86
+ def from_json(cls, path, **kwargs):
87
+ with open(path) as f:
88
+ categorical_mappings = json.load(f)
89
+ smiles = categorical_mappings['smiles']
90
+ loaded = cls(smiles, **kwargs)
91
+ return loaded
92
+
93
+
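A minimal usage sketch for the class above, assuming the tokenizer JSON and the `categorical_mappings.json` file expected in the data directory; the paths below are placeholders.

```python
# Sketch only: paths are placeholders for files expected in the data directory.
from protobind_diff.ligands.smiles_tokenizer import ChemformerTokenizer

tokenizer = ChemformerTokenizer(filename="data/tokenizer_smiles_diffusion.json")
dataset = RandomizedSmilesDataset.from_json(
    "data/categorical_mappings.json",   # contains the 'smiles' string-to-id mapping
    tokenizer=tokenizer,
    randomize=True,                     # re-randomize the SMILES string on every access
)
tokens = dataset[0]                     # token ids for (a randomized form of) SMILES #0
```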
94
+ class RandomizedBatchSampler(torch.utils.data.Sampler):
95
+ """A batch sampler that minimizes padding while maximizing batch randomness.
96
+
97
+ To achieve this, the sampler employs a two-level shuffling strategy:
98
+ 1. The data is first sorted by sequence length and grouped into buckets.
99
+ 2. Within each bucket, the sample indices are shuffled.
100
+ 3. Batches are created by slicing across the globally sorted list of indices,
101
+ which keeps sequence lengths within a batch similar.
102
+ 4. The order of these batches is then shuffled to ensure randomness across epochs.
103
+
104
+ This approach balances the trade-off between minimizing padding (by batching
105
+ similar-length sequences) and maintaining randomness required for effective training.
106
+ """
107
+
108
+ def __init__(self, sequence_length: np.ndarray, shuffle: bool, batch_volume: int,
109
+ generator: torch.Generator = None, num_ranges: int = 150, batch_size: int = 128):
110
+ """Initializes the RandomizedBatchSampler.
111
+
112
+ Args:
113
+ sequence_length (np.ndarray): An array of sequence lengths for each item in the dataset.
114
+ shuffle (bool): If True, shuffle batches and indices within length buckets.
115
+ batch_volume (int): The maximum total number of elements (seq_len^2) per batch.
116
+ generator (torch.Generator, optional): PyTorch random number generator. Defaults to None.
117
+ num_ranges (int): The number of buckets to partition the sequence lengths into.
118
+ batch_size (int): The maximum number of samples per batch.
119
+ """
120
+ self.shuffle = shuffle
121
+ # For val/test (i.e. when we don't shuffle) we can fit more batches in memory as we don't need grads.
122
+ batch_volume_factor = 1 if shuffle else 2
123
+ self.batch_volume = batch_volume * batch_volume_factor
124
+ assert max(sequence_length) ** 2 < self.batch_volume, \
125
+ f"Cannot fit sequence {max(sequence_length)=} to {batch_volume=}"
126
+
127
+ if generator is None:
128
+ self.generator = self._init_generator()
129
+ else:
130
+ self.generator = generator
131
+ self.num_ranges = num_ranges
132
+ self.sequence_length = sequence_length
133
+ self.sequence_length_2 = self.sequence_length ** 2
134
+ self.batch_size = batch_size
135
+
136
+ bins = np.linspace(np.min(sequence_length), np.max(sequence_length) + 1, num_ranges)
137
+ digit_bins = np.digitize(sequence_length, bins=bins, right=True)
138
+ self.sequence_length_buckets = [torch.tensor(np.where(digit_bins == i)[0],
139
+ dtype=torch.int32) for i in range(num_ranges)]
140
+ self._prepared_batches = None
141
+
142
+ def _get_sliced_batches(self):
143
+ if self.shuffle:
144
+ # reshuffle the sequence length buckets.
145
+ for i in range(len(self.sequence_length_buckets)):
146
+ self.sequence_length_buckets[i] = self.sequence_length_buckets[i][torch.randperm(
147
+ len(self.sequence_length_buckets[i]), generator=self.generator)]
148
+
149
+ current_batch = []
150
+ current_batch_volume = 0
151
+ current_batch_size = 0
152
+ for i in range(self.num_ranges):
153
+ for idx in self.sequence_length_buckets[i]:
154
+ if (current_batch_volume + self.sequence_length_2[idx] >= self.batch_volume
155
+ or current_batch_size >= self.batch_size):
156
+ yield current_batch
157
+ current_batch = []
158
+ current_batch_volume = 0
159
+ current_batch_size = 0
160
+ current_batch.append(idx.item())
161
+ current_batch_volume += self.sequence_length_2[idx]
162
+ current_batch_size += 1
163
+ if len(current_batch) > 0:
164
+ yield current_batch
165
+
166
+ @staticmethod
167
+ def _init_generator():
168
+ seed = int(torch.empty((), dtype=torch.int64).random_().item())
169
+ generator = torch.Generator()
170
+ generator.manual_seed(seed)
171
+ return generator
172
+
173
+ @property
174
+ def _length(self):
175
+ if self._prepared_batches is None:
176
+ self._prepared_batches = list(self._get_sliced_batches())
177
+ return len(self._prepared_batches)
178
+
179
+ def __len__(self):
180
+ return self._length
181
+
182
+ def __iter__(self):
183
+ if self.shuffle:
184
+ # Then get the batches and serve them in random order
185
+ if self._prepared_batches is None:
186
+ self._prepared_batches = list(self._get_sliced_batches())
187
+ for batch_idx in torch.randperm(self._length, generator=self.generator):
188
+ yield self._prepared_batches[batch_idx]
189
+ self._prepared_batches = None # Destroy _prepared_batches to recreate it again in __len__
190
+ else:
191
+ for batch in self._get_sliced_batches():
192
+ yield batch
193
+
194
+
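A toy illustration of what the sampler above yields: indices of similar-length sequences are batched together, and only the order of whole batches is randomized. The numbers are made up for illustration.

```python
# Toy example (made-up lengths): short and long sequences end up in separate batches.
import numpy as np

lengths = np.array([120, 125, 130, 480, 495, 500])          # per-sample sequence lengths
sampler = RandomizedBatchSampler(sequence_length=lengths, shuffle=True,
                                 batch_volume=800_000, batch_size=3)
for batch in sampler:
    print(batch)   # [0, 1, 2] and [3, 4, 5], with the batch order shuffled between epochs
```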
195
+ class ProtobindDataModule(pl.LightningDataModule):
196
+ """PyTorch Lightning DataModule for Protobind-diffusion datasets.
197
+
198
+ This module handles the loading, processing, and batching of protein-ligand
199
+ data. It is designed to work with ESM-2 protein embeddings and tokenized
200
+ SMILES representations for ligands. The module manages data splitting,
201
+ feature loading, and provides DataLoaders with an efficient batching
202
+ strategy to minimize padding.
203
+
204
+ Key Features:
205
+ - Loads pre-computed ESM-2 protein embeddings.
206
+ - Utilizes tokenized SMILES for ligands via `ChemformerTokenizer`.
207
+ - Implements a `RandomizedBatchSampler` to create efficient, low-padding batches.
208
+ - Handles dataset splitting into training, validation, and test sets.
209
+ """
210
+ MASK_VALUE = 0
211
+
212
+ def __init__(self, *,
213
+ data_dir: Path,
214
+ exp_dir: Path,
215
+ splitting_method: SplittingMethod,
216
+ batch_volume: int,
217
+ num_workers: int,
218
+ sequence_type: str = 'esm_zip',
219
+ esm_model_name: str = "esm2_t33_650M_UR50D",
220
+ max_size_batch: int = 16,
221
+ dataset_params: Optional[dict] = None,
222
+ float_type: str = 'float32'):
223
+ super().__init__()
224
+ """Initializes the ProtobindDataModule.
225
+
226
+ Args:
227
+ data_dir (Path): The directory containing the raw dataset files (e.g., data.csv, embeddings).
228
+ exp_dir (Path): The directory to save experiment artifacts, including data splits.
229
+ splitting_method (SplittingMethod): The method for splitting data (e.g., RANDOM).
230
+ batch_volume (int): The target batch volume for the RandomizedBatchSampler.
231
+ num_workers (int): The number of workers for the DataLoader.
232
+ sequence_type (str): The type of protein sequence data. Must be 'esm_zip'.
233
+ esm_model_name (str): The specific ESM model name for embeddings.
234
+ max_size_batch (int): The maximum number of samples in a batch.
235
+ dataset_params (Optional[dict]): Parameters for the underlying molecular dataset.
236
+ float_type (str): The floating-point precision to use.
237
+ """
238
+ self.csv_path = data_dir / "data.csv"
239
+ self.categorical_mappings_path = data_dir / "categorical_mappings.json"
240
+
241
+ # Validate sequence type - only allow ESM variants
242
+ if sequence_type not in ['esm_zip']:
243
+ raise ValueError(f"DataModule only supports the 'esm_zip' sequence type, got: {sequence_type}")
244
+
245
+ # directory structure:
246
+ # output_dir / split / exp_dir_prefix
247
+ self.exp_dir: Path = Path(exp_dir)
248
+ self.split_dir: Path = self.exp_dir.parent
249
+ self.exp_data_dir: Path = self.split_dir.parent
250
+ self.data_dir = data_dir
251
+
252
+ if dataset_params is None:
253
+ dataset_params = {}
254
+
255
+ # Create simplified SMILES dataloader
256
+ self.molecular_dataloader = MolecularDataloaderSMILES(
257
+ data_dir=data_dir,
258
+ dataset_options=dataset_params,
259
+ )
260
+
261
+ self.float_type = float_type
262
+ self.batch_volume = batch_volume
263
+ self.max_size_batch = max_size_batch
264
+ self.num_workers = num_workers
265
+ self.splitting_method = splitting_method
266
+ self.esm_model_name = esm_model_name
267
+
268
+ # Only support ESM embeddings (float type data)
269
+ self.sequence_dtype = getattr(torch, self.float_type)
270
+
271
+ # Will be initialized in setup()
272
+ self.train_dataset: Optional[torch.utils.data.Dataset] = None
273
+ self.val_dataset: Optional[torch.utils.data.Dataset] = None
274
+ self.test_dataset: Optional[torch.utils.data.Dataset] = None
275
+
276
+ self.datasets: Dict[str, pd.DataFrame] = {}
277
+ self.torch_datasets: Dict[str, torch.utils.data.Dataset] = {}
278
+
279
+ @staticmethod
280
+ def _read_df(csv_path: Path) -> pd.DataFrame:
281
+ _use_columns = ['smiles', 'sequence', 'log_IC50', 'log_Ki', 'log_Kd', 'log_EC50', 'label', 'split',
282
+ 'cluster_smi']
283
+ df = pd.read_csv(csv_path, nrows=1)
284
+ _use_columns = df.columns.intersection(_use_columns)
285
+
286
+ dtypes = {"smiles": int, "sequence": int, "log_IC50": float,
287
+ "log_Ki": float, "log_Kd": float, "log_EC50": float,
288
+ "label": float, "split": str, "cluster_smi": str}
289
+
290
+ df = pd.read_csv(csv_path, dtype=dtypes, usecols=_use_columns)
291
+ return df
292
+
293
+ @staticmethod
294
+ def _read_df_and_compute_sequence_lengths(csv_path: Path, length_dict: dict) -> pd.DataFrame:
295
+ # to reduce RAM, load only the necessary columns
296
+ df = ProtobindDataModule._read_df(csv_path)
297
+ df['sequence_length'] = df["sequence"].map(length_dict)
298
+
299
+ # sort by sequence length to increase the batching efficiency.
300
+ df.sort_values(by="sequence_length", inplace=True)
301
+ return df
302
+
303
+ def check_splits_exist(self):
304
+ """ Checks whether the train/valid/test split files already exist """
305
+ if (self.split_dir / "train.csv").exists():
306
+ assert (self.split_dir / "valid.csv").exists()
307
+ assert (self.split_dir / "test.csv").exists()
308
+ logger.info(f"train.csv/valid.csv/test.csv exist, "
309
+ f"no new splits will be created for {self.splitting_method}")
310
+ return True
311
+
312
+ return False
313
+
314
+ def prepare_data_split(self, seed=777, valid_fraction=0.1, test_fraction=0.1):
315
+ """ Create train.csv, val.csv and test.csv in the experiment dir """
316
+
317
+ if self.check_splits_exist():
318
+ return
319
+
320
+ # Check that data exists
321
+ for path in [self.csv_path, self.categorical_mappings_path]:
322
+ if not path.exists():
323
+ raise FileNotFoundError(
324
+ f"Could not find {path}. Please download the data.")
325
+
326
+ # load label data
327
+ data_df = pd.read_csv(self.csv_path)
328
+
329
+ # add clusters
330
+ distance_data = list(self.csv_path.parent.glob('all_smiles_sparse_*.npz'))
331
+ if len(distance_data) > 0:
332
+ logger.info(f"Calculating clusters for SMILES and distance data {distance_data[0]}")
333
+ clusters_smi = cluster_fpsim2(distance_data[0])
334
+ len_ = len(data_df)
335
+ data_df = data_df.merge(pd.Series(clusters_smi, name='cluster_smi'), left_on='smiles', right_index=True)
336
+ assert data_df.shape[0] == len_, (f"Failed to merge clusters, {len_=} {data_df.shape=}"
337
+ f" {clusters_smi.min()} {clusters_smi.max()}")
338
+ else:
339
+ raise FileNotFoundError(f'Could not find any all_smiles_sparse_*.npz in {str(self.csv_path.parent)}')
340
+
341
+ # Create splits
342
+ if self.splitting_method == SplittingMethod.RANDOM:
343
+ train, valid, test = split_at_random(data_df, valid_fraction=valid_fraction,
344
+ test_fraction=test_fraction, seed=seed)
345
+ else:
346
+ raise NotImplementedError(
347
+ f"Splitting method {self.splitting_method} is not implemented in simplified version.")
348
+
349
+ train.to_csv(self.split_dir / "train.csv", index=False)
350
+ valid.to_csv(self.split_dir / "valid.csv", index=False)
351
+ test.to_csv(self.split_dir / "test.csv", index=False)
352
+
353
+ def prepare_data(self, **kwargs):
354
+
355
+ if kwargs.get('load', False):
356
+ return
357
+
358
+ if self.exp_dir.exists():
359
+ logger.info(f"Experiment directory {self.exp_dir} already exists. All existing files "
360
+ f" will be kept. To create new data/split remove {self.exp_data_dir} or {self.split_dir}")
361
+ self.exp_dir.mkdir(parents=True, exist_ok=True)
362
+
363
+ # Make train-test split
364
+ default_split_kwargs = {'seed': 777,
365
+ 'valid_fraction': 0.1,
366
+ 'test_fraction': 0.1,
367
+ }
368
+ # update from kwargs
369
+ for key in default_split_kwargs.keys():
370
+ if key in kwargs:
371
+ default_split_kwargs[key] = kwargs[key]
372
+ # Create new split or skip if exist
373
+ self.prepare_data_split(**default_split_kwargs)
374
+
375
+ # Prepare smiles (simplified - only tokenized smiles)
376
+ self.molecular_dataloader.prepare_molecular_features()
377
+
378
+ def setup(self, stage=None):
379
+ """Loads and prepares the datasets for a given stage.
380
+
381
+ This method is called by PyTorch Lightning. It performs the following steps:
382
+ 1. Loads molecular features (tokenized SMILES).
383
+ 2. Loads protein features (pre-computed ESM embeddings).
384
+ 3. Loads data splits (train/val/test) from CSV files.
385
+ 4. Initializes the PyTorch Datasets for each split.
386
+
387
+ Args:
388
+ stage (str, optional): The stage to setup ('fit', 'validate', 'test', 'predict').
389
+ """
390
+ logger.info("Loading molecular features")
391
+
392
+ # Load molecular features (simplified - only SMILES)
393
+ self.molecular_dataloader.load_molecular_features()
394
+
395
+ # Load protein features (only ESM embeddings)
396
+ logger.info(f"Loading protein features {self.esm_model_name}")
397
+ prot_embbeding_pt = self.data_dir / f'all_prots_{self.esm_model_name}.pt'
398
+
399
+ if prot_embbeding_pt.exists():
400
+ self.idx_to_sequence_data = torch.load(prot_embbeding_pt, map_location='cpu', weights_only=False)
401
+ length_dict = {idx: emb.shape[0] for idx, emb in self.idx_to_sequence_data.items()}
402
+ self.sequence_embedding_dim = next(iter(self.idx_to_sequence_data.values())).shape[1]
403
+ else:
404
+ raise FileNotFoundError(
405
+ f"Packed proteins `all_prots_{self.esm_model_name}.pt` is not found in {self.data_dir}")
406
+
407
+ # load data. Use integer dtypes for categorical features and float for labels.
408
+ logger.info("Loading activity table")
409
+
410
+ self.datasets = dict(zip(["train", "val", "test"],
411
+ [self._read_df_and_compute_sequence_lengths(self.split_dir / f"{split}.csv",
412
+ length_dict)
413
+ for split in ["train", "valid", "test"]]))
414
+
415
+
416
+ # initialise self.train_dataset, self.val_dataset, self.test_dataset
417
+ for ds in ['train', 'val', 'test']:
418
+ df_ds = self.datasets[ds]
419
+ assert len(df_ds) > 0, f"{ds=} is empty"
420
+ ds_proto = self.create_dataset(df_ds)
421
+ ds_proto._is_train = (ds == 'train')
422
+ self.torch_datasets[ds] = ds_proto
423
+
424
+ def create_dataset(self, df, **kwargs):
425
+ dataset_kwargs = self.molecular_dataloader.dataset_kwargs
426
+ dataset_class = DatasetMolecularEmbeddings
427
+
428
+ cluster_smi = None
429
+ sample_smiles = dataset_kwargs.get('sample_smiles', False)
430
+ if sample_smiles:
431
+ cluster_smi = df['cluster_smi'].values
432
+
433
+ logger.info(f"Creating dataset: using {dataset_class=}")
434
+ ds_proto = dataset_class(
435
+ sequence_embedding=(self.idx_to_sequence_data),
436
+ smiles_embeddings=self.molecular_dataloader.get_features(),
437
+ sequences=df['sequence'].values,
438
+ sequences_length=df['sequence_length'].values,
439
+ smiles=df['smiles'].values,
440
+ dtype=self.float_type,
441
+ cluster_smi=cluster_smi,
442
+ **dataset_kwargs,
443
+ **kwargs,
444
+ )
445
+ return ds_proto
446
+
447
+ def get_dataloader(self, dataset, shuffle, use_sampler=True, pin_memory=True):
448
+ if use_sampler:
449
+ sampler = RandomizedBatchSampler(sequence_length=dataset.sequences_length,
450
+ shuffle=shuffle,
451
+ batch_volume=self.batch_volume,
452
+ batch_size=self.max_size_batch)
453
+ return DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.collate_fn,
454
+ num_workers=self.num_workers, pin_memory=pin_memory)
455
+ else:
456
+ return DataLoader(dataset=dataset, collate_fn=dataset.collate_fn, batch_size=self.max_size_batch,
457
+ num_workers=self.num_workers, pin_memory=pin_memory, shuffle=shuffle)
458
+
459
+ def train_dataloader(self, use_sampler=True, shuffle=True):
460
+ return self.get_dataloader(self.torch_datasets['train'], shuffle=shuffle, use_sampler=use_sampler)
461
+
462
+ def val_dataloader(self, use_sampler=True, shuffle=False):
463
+ return self.get_dataloader(self.torch_datasets['val'], shuffle=shuffle, use_sampler=use_sampler)
464
+
465
+ def test_dataloader(self, use_sampler=True, shuffle=False):
466
+ return self.get_dataloader(self.torch_datasets['test'], shuffle=shuffle, use_sampler=use_sampler)
467
+
468
+ def predict_dataloader(self, dataset='test', use_sampler=False, shuffle=False):
469
+ return self.get_dataloader(self.torch_datasets[dataset], shuffle=shuffle, use_sampler=use_sampler)
470
+
471
+ def get_smiles_embedding_dim(self):
472
+ return self.molecular_dataloader.embedding_size
473
+
474
+ def get_sequence_embedding_dim(self):
475
+ return self.sequence_embedding_dim
476
+
477
+
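A hypothetical end-to-end setup of the DataModule above. The directory layout is a placeholder, and the data files named in the comments (including an `all_smiles_sparse_*.npz` fingerprint file used for clustering) must already be present for `prepare_data` to succeed.

```python
# Hypothetical setup; paths and sizes are placeholders, not values from this commit.
from pathlib import Path

dm = ProtobindDataModule(
    data_dir=Path("data"),                    # needs data.csv, categorical_mappings.json,
                                              # all_prots_esm2_t33_650M_UR50D.pt, tokenizer JSON,
                                              # and an all_smiles_sparse_*.npz file
    exp_dir=Path("experiments/random/run1"),  # train/valid/test CSVs go to the parent directory
    splitting_method=SplittingMethod.RANDOM,
    batch_volume=300_000,                     # max sum of sequence_length**2 per batch
    num_workers=4,
)
dm.prepare_data()
dm.setup()
train_loader = dm.train_dataloader()
```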
478
+ class DatasetNumpy(Dataset):
479
+ """ Dataset for feeding model with sequences and ligands embeddings """
480
+
481
+ def __init__(self, *, sequence_embedding: Tuple[np.array, np.array],
482
+ smiles_embeddings: np.ndarray,
483
+ sequences: np.ndarray,
484
+ sequences_length: np.ndarray,
485
+ smiles: np.ndarray,
486
+ dtype='float16',
487
+ **kwargs,
488
+ ):
489
+ """
490
+ Args:
491
+ sequence_embedding: embedding for sequences - 1 per each sequence
492
+ smiles_embeddings: embedding for smiles - 1 per each smile
493
+ sequences: sequence label in the dataset - 1 per sample
494
+ sequences_length: sequence length in the dataset - 1 per sample
495
+ smiles: smile label in the dataset - 1 per sample
496
+ """
497
+ assert len(sequences) == len(sequences_length), f"{len(sequences)=} {len(sequences_length)=}"
498
+ assert len(sequences) == len(smiles), f"{len(sequences)=} {len(smiles)=}"
499
+
500
+ self.data_sequence = sequence_embedding
501
+ self.smiles_embeddings = self.init_smiles_embeddings(smiles_embeddings)
502
+ self.sequences_length = sequences_length
503
+ self.sequences = sequences
504
+ self.smiles = smiles
505
+ self.float_type = getattr(torch, dtype)
506
+
507
+ # Only support ESM embeddings (float type)
508
+ self.sequence_dtype = self.float_type
509
+ self._is_train = False # this attribute is assigned during ProtobindDataModule.setup()
510
+
511
+ # SMILES SAMPLER
512
+ sample_smiles = kwargs.get('sample_smiles', False)
513
+ self.cluster_smiles = kwargs.get('cluster_smi', None)
514
+ self.smiles_to_cluster = None
515
+ if sample_smiles:
516
+ self.group_smiles(self.cluster_smiles)
517
+ self.get_smiles_id = self._smiles_id_sample
518
+ else:
519
+ self.get_smiles_id = self._smiles_id_as_ind
520
+
521
+ def init_smiles_embeddings(self, smiles_embeddings):
522
+ return smiles_embeddings
523
+
524
+ def group_smiles(self, clusters):
525
+ """ For each sequence, group similar SMILES into a list for random sampling during training """
526
+
527
+ len_ = len(self.sequences)
528
+ df = pd.DataFrame(data={'smiles': self.smiles, 'sequence': self.sequences, 'cluster_smi': clusters,
529
+ 'sequences_length': self.sequences_length}
530
+ ).groupby(['cluster_smi', 'sequence', 'sequences_length'], as_index=False).agg(list)
531
+ self.smiles_to_cluster = df['smiles'].values
532
+ self.sequences = df['sequence'].values
533
+ self.cluster_smiles = df['cluster_smi'].values
534
+ self.sequences_length = df['sequences_length'].values
535
+ logger.info(f"Sampling from similar smiles is ON, dataset size reduced from {len_} to {len(self.sequences)}")
536
+
537
+ def _smiles_id_as_ind(self, idx: int) -> int:
538
+ """ Get the SMILES id from the array self.smiles """
539
+ return self.smiles[idx]
540
+
541
+ def _smiles_id_sample(self, idx) -> int:
542
+ """ Sample a SMILES id from the grouped SMILES of the same cluster """
543
+ return np.random.choice(self.smiles_to_cluster[idx])
544
+
545
+ def __len__(self) -> int:
546
+ # the number of entries in the dataset
547
+ return len(self.sequences)
548
+
549
+ def __getitem__(self, idx) -> Tuple[np.ndarray, np.ndarray, int]:
550
+
551
+ seq_id = self.sequences[idx]
552
+ smi_id = self.get_smiles_id(idx)
553
+
554
+ return (self.parametrize_sequence(seq_id),
555
+ self.parametrize_smiles(smi_id),
556
+ self.sequences_length[idx])
557
+
558
+ def parametrize_smiles(self, smiles_id: int) -> np.array:
559
+ return self.smiles_embeddings[smiles_id]
560
+
561
+ def parametrize_sequence(self, sequence_id: int) -> np.array:
562
+ return self.data_sequence[sequence_id]
563
+
564
+ @staticmethod
565
+ def _collate_fn_pack(batch):
566
+ """ Unzip dataset samples into tuples of sequences, smiles and sequence_lengths """
567
+ return zip(*batch)
568
+
569
+ def _pad_sequence(self, sequences: List[np.ndarray]) -> torch.Tensor:
570
+ return pad_sequence([torch.tensor(s, dtype=self.sequence_dtype) for s in sequences], batch_first=True,
571
+ padding_value=ProtobindDataModule.MASK_VALUE)
572
+
573
+ def collate_fn(self, batch: Tuple[np.ndarray, np.ndarray, int ]) -> Tuple[
574
+ Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
575
+ """Collates samples into a single batch, padding sequences to the same length.
576
+
577
+ Args:
578
+ batch : A tuple of samples, where each sample is the output of `__getitem__`.
579
+
580
+ Returns:
581
+ Tuple: A tuple containing batched tensors:
582
+ - ((torch.Tensor, torch.Tensor)): A tuple of padded protein sequences
583
+ and a tensor of their original lengths.
584
+ - (torch.Tensor): A batch of SMILES embeddings.
585
+ """
586
+
587
+ sequences, smiles, sequence_lengths = self._collate_fn_pack(batch)
588
+
589
+ padded_sequences = self._pad_sequence(sequences)
590
+
591
+ return ((padded_sequences, torch.tensor(sequence_lengths, dtype=torch.int32)),
592
+ torch.tensor(np.array(smiles), dtype=self.float_type))
593
+
594
+
595
+ class DatasetMolecularEmbeddings(DatasetNumpy):
596
+ """A dataset for masked diffusion models using protein embeddings and tokenized SMILES.
597
+
598
+ This class extends `DatasetNumpy` to handle variable-length, tokenized SMILES
599
+ representations from a `RandomizedSmilesDataset`. It overrides methods for
600
+ SMILES parameterization and batch collation to support this token-based approach,
601
+ which is required for diffusion models.
602
+ """
603
+
604
+ def parametrize_smiles(self, smiles_id: int) -> Tuple[np.array, int]:
605
+ mol = self.smiles_embeddings[smiles_id]
606
+ return mol, len(mol)
607
+
608
+ def __getitem__(self, idx) -> Tuple[np.ndarray, np.array, int, int, int, int]:
609
+ """Retrieves a single data sample with tokenized SMILES.
610
+
611
+ Unlike the parent class, this method returns the tokenized SMILES
612
+ and its length instead of a fixed-size embedding.
613
+ """
614
+ seq_id = self.sequences[idx]
615
+ smi_id = self.smiles[idx]
616
+ return (self.parametrize_sequence(seq_id),) + self.parametrize_smiles(smi_id) + (
617
+ self.sequences_length[idx], seq_id, smi_id)
618
+
619
+ def init_smiles_embeddings(self, smiles_embeddings):
620
+ if isinstance(smiles_embeddings, RandomizedSmilesDataset):
621
+ return smiles_embeddings
622
+ else:
623
+ raise ValueError("This version only supports RandomizedSmilesDataset")
624
+
625
+ def collate_fn(self, batch: List[Tuple[np.ndarray, np.array, int, int, int, int]]) -> Tuple[
626
+ Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor],
627
+ torch.Tensor, torch.Tensor]:
628
+
629
+ """Collates samples into a batch, padding both protein and SMILES sequences.
630
+
631
+ Args:
632
+ batch (list): A list of samples, where each sample is the output of __getitem__.
633
+
634
+ Returns:
635
+ Tuple: A tuple containing the final batched tensors for the model:
636
+ - ((torch.Tensor, torch.Tensor)): Padded protein sequences and their lengths.
637
+ - ((torch.Tensor, torch.Tensor)): Padded tokenized SMILES and their lengths.
638
+ - (torch.Tensor): A batch of sequence IDs.
639
+ - (torch.Tensor): A batch of SMILES IDs.
640
+ """
641
+
642
+ sequences, atom, atom_lengths, sequence_lengths, seq_id, smi_id \
643
+ = self._collate_fn_pack(batch)
644
+
645
+ padded_sequences = self._pad_sequence(sequences) # pad protein sequences
646
+ padded_atom = pad_sequence([s.to(dtype=self.float_type) for s in atom], batch_first=True)
647
+ atom_lengths = torch.tensor(atom_lengths, dtype=torch.int32)
648
+
649
+ return ((padded_sequences, torch.tensor(sequence_lengths, dtype=torch.int32)),
650
+ (padded_atom, atom_lengths),
651
+ torch.tensor(seq_id, dtype=torch.int32),
652
+ torch.tensor(smi_id, dtype=torch.int32),
653
+ )
654
+
655
+
656
+ class MolecularDataloaderSMILES(object):
657
+ """
658
+ molecular dataloader that only supports tokenized SMILES
659
+ with ChemformerTokenizer for masked diffusion models.
660
+ """
661
+
662
+ def __init__(self, *,
663
+ data_dir: Path,
664
+ dataset_options: Optional[dict] = None):
665
+ """
666
+ Args:
667
+ data_dir: path to data folder containing tokenizer files and dict with all smiles and fasta sequences
668
+ dataset_options: dictionary with additional parameters used to create pytorch Dataset
669
+ """
670
+ self.data_dir = data_dir
671
+ if dataset_options is None:
672
+ logger.info('Setting tokenizer file name to tokenizer_smiles_diffusion.json')
673
+ dataset_options = {'tokenizer_json_name': 'tokenizer_smiles_diffusion'}
674
+ self.dataset_options = dataset_options
675
+
676
+ self.tokenizer_path = self.data_dir / f"{dataset_options['tokenizer_json_name']}.json"
677
+ self.tokenizer = ChemformerTokenizer(filename=str(self.tokenizer_path))
678
+ self.randomize = dataset_options.get('randomize', False)
679
+ self.smiles_embedding_dim = 1 # For tokenized SMILES, embedding dim is 1
680
+ self.baseline_dim = 0 # this version doesn't support baseline features
681
+
682
+ def prepare_molecular_features(self):
683
+ """Prepare molecular features"""
684
+ if not self.tokenizer_path.exists():
685
+ raise FileNotFoundError(
686
+ f"Could not find tokenizer at {self.tokenizer_path}. Please ensure the tokenizer file exists.")
687
+ logger.info(f"Found ChemformerTokenizer at {self.tokenizer_path}")
688
+
689
+ def load_molecular_features(self):
690
+ """Load molecular features - loads SMILES mappings"""
691
+ categorical_mappings_path = self.data_dir / 'categorical_mappings.json'
692
+ if not categorical_mappings_path.exists():
693
+ raise FileNotFoundError(f"categorical_mappings.json not found in data_dir: {self.data_dir}")
694
+
695
+ self.smiles_dataset = RandomizedSmilesDataset.from_json(
696
+ categorical_mappings_path,
697
+ tokenizer=self.tokenizer,
698
+ randomize=self.randomize
699
+ )
700
+
701
+ def get_features(self):
702
+ """Get the SMILES dataset for tokenized molecular features"""
703
+ return self.smiles_dataset
704
+
705
+ @property
706
+ def dataset_kwargs(self):
707
+ """Return dataset options for creating pytorch datasets"""
708
+ return self.dataset_options
709
+
710
+ @property
711
+ def embedding_size(self):
712
+ """Get embedding size for tokenized SMILES"""
713
+ return self.smiles_embedding_dim
714
+
715
+
716
+ class InferenceDataset(Dataset):
717
+ """Creates a dataset for running inference on a single protein embedding.
718
+
719
+ This utility dataset repeatedly yields the same batch, created by expanding
720
+ a single input embedding. It's designed for generating a large number of
721
+ ligand samples for one protein target without a traditional dataset structure.
722
+ """
723
+ def __init__(self, embedding: torch.Tensor, batch_size: int, n_batches: int):
724
+ """Initializes the inference dataset.
725
+
726
+ Args:
727
+ embedding (torch.Tensor): The single protein embedding tensor to be used.
728
+ batch_size (int): The number of times to repeat the embedding in each batch.
729
+ n_batches (int): The total number of identical batches the dataset should yield.
730
+ """
731
+ self.embedding_single = embedding
732
+ self.batch_size = batch_size
733
+ self.n_batches = n_batches
734
+ self.seq_len = embedding.shape[1]
735
+
736
+ def __len__(self) -> int:
737
+ return self.n_batches
738
+
739
+ def __getitem__(self, idx: int) -> Tuple:
740
+ """Generates a full batch ready for model inference.
741
+
742
+ Note: This method ignores the `idx` argument and always returns the same
743
+ batch, which is constructed by expanding the stored protein embedding.
744
+ It includes dummy values to match the data structure expected by the model.
745
+
746
+ Returns:
747
+ Tuple: A tuple containing pre-batched tensors:
748
+ - ((torch.Tensor, torch.Tensor)): Expanded protein embeddings and their lengths.
749
+ - (torch.Tensor): A dummy NaN tensor (placeholder for SMILES).
750
+ - (torch.Tensor): A batch of placeholder sequence IDs (-1).
751
+ - (torch.Tensor): A dummy NaN tensor (placeholder for smiles IDs).
752
+ """
753
+ embedding = self.embedding_single.expand(self.batch_size, -1, -1).contiguous()
754
+ lengths = torch.full((self.batch_size,), self.seq_len, dtype=torch.int32)
755
+ seq_ids = torch.full((self.batch_size,), -1, dtype=torch.int32) # seq_ids don't exist for new sequences
756
+ return (
757
+ (embedding, lengths),
758
+ torch.tensor(float('nan')),
759
+ seq_ids,
760
+ torch.tensor(float('nan')),
761
+ )
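A short sketch of how the inference dataset above is consumed (a dummy tensor stands in for real ESM output): every item is already a full batch, so the DataLoader is used with `batch_size=None`, exactly as in `app.py`.

```python
# Sketch with a dummy embedding; shapes follow the esm2_t33_650M_UR50D default (dim 1280).
import torch
from torch.utils.data import DataLoader

embedding = torch.randn(1, 128, 1280)                              # (1, seq_len, esm_dim)
dataset = InferenceDataset(embedding, batch_size=10, n_batches=5)
loader = DataLoader(dataset, batch_size=None)                      # items are pre-batched
(prot_emb, lengths), _, seq_ids, _ = next(iter(loader))
# prot_emb: (10, 128, 1280); lengths: ten 128s; seq_ids: ten -1 placeholders
```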
protobind_diff/decoder_rope.py ADDED
@@ -0,0 +1,769 @@
1
+ import math
2
+ from math import pi
3
+ import typing
4
+ from typing import Tuple, Optional, Literal
5
+
6
+ from einops import rearrange, repeat
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.amp import autocast
12
+ from torch.nn import Module, ModuleList
13
+ from torch import nn, einsum, broadcast_tensors, Tensor
14
+
15
+
16
+
17
+ #################################################################################
18
+ # Rotary Encoding #
19
+ #################################################################################
20
+
21
+ # helper functions
22
+
23
+ def exists(val):
24
+ return val is not None
25
+
26
+
27
+ def default(val, d):
28
+ return val if exists(val) else d
29
+
30
+ def slice_at_dim(t, dim_slice: slice, *, dim):
31
+ dim += (t.ndim if dim < 0 else 0)
32
+ colons = [slice(None)] * t.ndim
33
+ colons[dim] = dim_slice
34
+ return t[tuple(colons)]
35
+
36
+ # rotary embedding helper functions
37
+
38
+ def rotate_half(x):
39
+ """Splits the last dimension of a tensor, swaps halves, and negates the first half."""
40
+ x = rearrange(x, '... (d r) -> ... d r', r=2)
41
+ x1, x2 = x.unbind(dim=-1)
42
+ x = torch.stack((-x2, x1), dim=-1)
43
+ return rearrange(x, '... d r -> ... (d r)')
44
+
45
+
46
+ @autocast('cuda', enabled=False)
47
+ def apply_rotary_emb(
48
+ freqs,
49
+ t,
50
+ start_index=0,
51
+ scale=1.,
52
+ seq_dim=-2,
53
+ freqs_seq_dim=None
54
+ ):
55
+ """Applies rotary positional embeddings to a given tensor.
56
+
57
+ Args:
58
+ freqs (torch.Tensor): The rotary frequencies.
59
+ t (torch.Tensor): The tensor to apply embeddings to (e.g., queries or keys).
60
+ start_index (int): The feature dimension index to start applying rotations from.
61
+ scale (float): A scaling factor, used for xPos.
62
+ seq_dim (int): The sequence dimension of the input tensor `t`.
63
+ freqs_seq_dim (Optional[int]): The sequence dimension of the freqs tensor.
64
+ """
65
+ dtype = t.dtype
66
+
67
+ if not exists(freqs_seq_dim):
68
+ if freqs.ndim == 2 or t.ndim == 3:
69
+ freqs_seq_dim = 0
70
+
71
+ if t.ndim == 3 or exists(freqs_seq_dim):
72
+ seq_len = t.shape[seq_dim]
73
+ freqs = slice_at_dim(freqs, slice(-seq_len, None), dim=freqs_seq_dim)
74
+
75
+ rot_dim = freqs.shape[-1]
76
+ end_index = start_index + rot_dim
77
+
78
+ assert rot_dim <= t.shape[
79
+ -1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
80
+
81
+ # Split t into three parts: left, middle (to be transformed), and right
82
+ t_left = t[..., :start_index]
83
+ t_middle = t[..., start_index:end_index]
84
+ t_right = t[..., end_index:]
85
+
86
+ # Apply rotary embeddings without modifying t in place
87
+ t_transformed = (t_middle * freqs.cos() * scale) + (rotate_half(t_middle) * freqs.sin() * scale)
88
+
89
+ out = torch.cat((t_left, t_transformed, t_right), dim=-1)
90
+
91
+ return out.type(dtype)
92
+
93
+
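For reference, the transform above is the standard RoPE rotation written in "rotate-half" form: for each feature pair \((x_{2i}, x_{2i+1})\) at position \(m\) with frequency \(\theta_i\),

$$
\begin{pmatrix} x_{2i}' \\ x_{2i+1}' \end{pmatrix}
=
\begin{pmatrix} \cos m\theta_i & -\sin m\theta_i \\ \sin m\theta_i & \cos m\theta_i \end{pmatrix}
\begin{pmatrix} x_{2i} \\ x_{2i+1} \end{pmatrix},
$$

which is what `t_middle * freqs.cos() + rotate_half(t_middle) * freqs.sin()` computes, since `rotate_half` maps \((x_{2i}, x_{2i+1}) \mapsto (-x_{2i+1}, x_{2i})\); the optional xPos `scale` simply multiplies both terms.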
94
+ # learned rotation helpers
95
+
96
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
97
+ if exists(freq_ranges):
98
+ rotations = einsum('..., f -> ... f', rotations, freq_ranges)
99
+ rotations = rearrange(rotations, '... r f -> ... (r f)')
100
+
101
+ rotations = repeat(rotations, '... n -> ... (n r)', r=2)
102
+ return apply_rotary_emb(rotations, t, start_index=start_index)
103
+
104
+
105
+ # classes
106
+
107
+ class RotaryEmbedding(Module):
108
+ """
109
+ original paper: https://arxiv.org/abs/2104.09864
110
+ rescale rotary embeddings to longer sequence length without fine-tuning
111
+ code source: https://github.com/lucidrains/rotary-embedding-torch
112
+ """
113
+
114
+ def __init__(
115
+ self,
116
+ dim,
117
+ custom_freqs: Tensor | None = None,
118
+ freqs_for: Literal['lang', 'pixel', 'constant'] = 'lang',
119
+ theta=10000,
120
+ max_freq=10,
121
+ num_freqs=1,
122
+ learned_freq=False,
123
+ use_xpos=False,
124
+ xpos_scale_base=512,
125
+ interpolate_factor=1.,
126
+ theta_rescale_factor=1.,
127
+ seq_before_head_dim=False,
128
+ cache_if_possible=True,
129
+ cache_max_seq_len=8192
130
+ ):
131
+ super().__init__()
132
+ """Initializes the RotaryEmbedding module.
133
+
134
+ Args:
135
+ dim (int): The feature dimension to apply rotary embeddings to.
136
+ custom_freqs ([Tensor]): An optional tensor of custom frequencies.
137
+ freqs_for : The method for generating
138
+ frequencies. 'lang' is standard for transformers.
139
+ theta (int): A core hyperparameter for frequency calculation.
140
+ learned_freq (bool): If True, the frequencies are trainable parameters.
141
+ use_xpos (bool): If True, enables the xPos (extrapolatable) variant.
142
+ interpolate_factor (float): A factor for positional interpolation, which
143
+ can help with length generalization.
144
+ cache_if_possible (bool): If True, caches calculated frequencies for efficiency.
145
+ """
146
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
147
+
148
+ self.freqs_for = freqs_for
149
+
150
+ if exists(custom_freqs):
151
+ freqs = custom_freqs
152
+ elif freqs_for == 'lang':
153
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
154
+ elif freqs_for == 'pixel':
155
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
156
+ elif freqs_for == 'constant':
157
+ freqs = torch.ones(num_freqs).float()
158
+
159
+ self.cache_if_possible = cache_if_possible
160
+ self.cache_max_seq_len = cache_max_seq_len
161
+
162
+ self.register_buffer('cached_freqs', torch.zeros(cache_max_seq_len, dim), persistent=False)
163
+ self.cached_freqs_seq_len = 0
164
+
165
+ self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)
166
+
167
+ self.learned_freq = learned_freq
168
+
169
+ # dummy for device
170
+
171
+ self.register_buffer('dummy', torch.tensor(0), persistent=False)
172
+
173
+ # default sequence dimension
174
+
175
+ self.seq_before_head_dim = seq_before_head_dim
176
+ self.default_seq_dim = -3 if seq_before_head_dim else -2
177
+
178
+ # interpolation factors
179
+
180
+ assert interpolate_factor >= 1.
181
+ self.interpolate_factor = interpolate_factor
182
+
183
+ # xpos
184
+
185
+ self.use_xpos = use_xpos
186
+
187
+ if not use_xpos:
188
+ return
189
+
190
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
191
+ self.scale_base = xpos_scale_base
192
+
193
+ self.register_buffer('scale', scale, persistent=False)
194
+ self.register_buffer('cached_scales', torch.zeros(cache_max_seq_len, dim), persistent=False)
195
+ self.cached_scales_seq_len = 0
196
+
197
+ # add apply_rotary_emb as static method
198
+
199
+ self.apply_rotary_emb = staticmethod(apply_rotary_emb)
200
+
201
+ @property
202
+ def device(self):
203
+ return self.dummy.device
204
+
205
+ def get_seq_pos(self, seq_len, device, dtype, offset=0):
206
+ return (torch.arange(seq_len, device=device, dtype=dtype) + offset) / self.interpolate_factor
207
+
208
+ def rotate_queries_or_keys(self, t, seq_dim=None, offset=0, scale=None):
209
+ """Applies rotary embeddings to a single tensor (queries or keys).
210
+
211
+ Args:
212
+ t (torch.Tensor): The input tensor (queries or keys).
213
+ seq_dim : The sequence dimension of the tensor.
214
+ offset (int): An offset for the position sequence, used for caching.
215
+ scale (Optional[float]): A scaling factor, required if using xPos.
216
+
217
+ Returns:
218
+ torch.Tensor: The tensor with rotary embeddings applied.
219
+ """
220
+ seq_dim = default(seq_dim, self.default_seq_dim)
221
+
222
+ assert not self.use_xpos or exists(
223
+ scale), 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
224
+
225
+ device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
226
+
227
+ seq = self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset)
228
+
229
+ freqs = self.forward(seq, seq_len=seq_len, offset=offset)
230
+
231
+ if seq_dim == -3:
232
+ freqs = rearrange(freqs, 'n d -> n 1 d')
233
+
234
+ return apply_rotary_emb(freqs, t, scale=default(scale, 1.), seq_dim=seq_dim)
235
+
236
+ def rotate_queries_with_cached_keys(self, q, k, seq_dim=None, offset=0):
237
+ dtype, device, seq_dim = q.dtype, q.device, default(seq_dim, self.default_seq_dim)
238
+
239
+ q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
240
+ assert q_len <= k_len
241
+
242
+ q_scale = k_scale = 1.
243
+
244
+ if self.use_xpos:
245
+ seq = self.get_seq_pos(k_len, dtype=dtype, device=device)
246
+
247
+ q_scale = self.get_scale(seq[-q_len:]).type(dtype)
248
+ k_scale = self.get_scale(seq).type(dtype)
249
+
250
+ rotated_q = self.rotate_queries_or_keys(q, seq_dim=seq_dim, scale=q_scale, offset=k_len - q_len + offset)
251
+ rotated_k = self.rotate_queries_or_keys(k, seq_dim=seq_dim, scale=k_scale ** -1)
252
+
253
+ rotated_q = rotated_q.type(q.dtype)
254
+ rotated_k = rotated_k.type(k.dtype)
255
+
256
+ return rotated_q, rotated_k
257
+
258
+ def rotate_queries_and_keys(self, q, k, seq_dim=None):
259
+ seq_dim = default(seq_dim, self.default_seq_dim)
260
+
261
+ assert self.use_xpos
262
+ device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
263
+
264
+ seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)
265
+
266
+ freqs = self.forward(seq, seq_len=seq_len)
267
+ scale = self.get_scale(seq, seq_len=seq_len).to(dtype)
268
+
269
+ if seq_dim == -3:
270
+ freqs = rearrange(freqs, 'n d -> n 1 d')
271
+ scale = rearrange(scale, 'n d -> n 1 d')
272
+
273
+ rotated_q = apply_rotary_emb(freqs, q, scale=scale, seq_dim=seq_dim)
274
+ rotated_k = apply_rotary_emb(freqs, k, scale=scale ** -1, seq_dim=seq_dim)
275
+
276
+ rotated_q = rotated_q.type(q.dtype)
277
+ rotated_k = rotated_k.type(k.dtype)
278
+
279
+ return rotated_q, rotated_k
280
+
281
+ def get_scale(
282
+ self,
283
+ t: Tensor,
284
+ seq_len = None,
285
+ offset=0
286
+ ):
287
+ assert self.use_xpos
288
+
289
+ should_cache = (
290
+ self.cache_if_possible and
291
+ exists(seq_len) and
292
+ (offset + seq_len) <= self.cache_max_seq_len
293
+ )
294
+
295
+ if (
296
+ should_cache and \
297
+ exists(self.cached_scales) and \
298
+ (seq_len + offset) <= self.cached_scales_seq_len
299
+ ):
300
+ return self.cached_scales[offset:(offset + seq_len)]
301
+
302
+ scale = 1.
303
+ if self.use_xpos:
304
+ power = (t - len(t) // 2) / self.scale_base
305
+ scale = self.scale ** rearrange(power, 'n -> n 1')
306
+ scale = repeat(scale, 'n d -> n (d r)', r=2)
307
+
308
+ if should_cache and offset == 0:
309
+ self.cached_scales[:seq_len] = scale.detach()
310
+ self.cached_scales_seq_len = seq_len
311
+
312
+ return scale
313
+
314
+ def get_axial_freqs(self, *dims):
315
+ Colon = slice(None)
316
+ all_freqs = []
317
+
318
+ for ind, dim in enumerate(dims):
319
+ if self.freqs_for == 'pixel':
320
+ pos = torch.linspace(-1, 1, steps=dim, device=self.device)
321
+ else:
322
+ pos = torch.arange(dim, device=self.device)
323
+
324
+ freqs = self.forward(pos, seq_len=dim)
325
+
326
+ all_axis = [None] * len(dims)
327
+ all_axis[ind] = Colon
328
+
329
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
330
+ all_freqs.append(freqs[new_axis_slice])
331
+
332
+ all_freqs = broadcast_tensors(*all_freqs)
333
+ return torch.cat(all_freqs, dim=-1)
334
+
335
+ @autocast('cuda', enabled=False)
336
+ def forward(
337
+ self,
338
+ t: Tensor,
339
+ seq_len = None,
340
+ offset=0
341
+ ):
342
+ """Calculates the rotary frequencies for a given sequence of positions.
343
+
344
+ Args:
345
+ t (torch.Tensor): A tensor of position indices.
346
+ seq_len (int): The total sequence length, used for caching.
347
+ offset (int): The starting position offset.
348
+
349
+ Returns:
350
+ torch.Tensor: A tensor of calculated rotation frequencies.
351
+ """
352
+ should_cache = (
353
+ self.cache_if_possible and
354
+ not self.learned_freq and
355
+ exists(seq_len) and
356
+ self.freqs_for != 'pixel' and
357
+ (offset + seq_len) <= self.cache_max_seq_len
358
+ )
359
+
360
+ if (
361
+ should_cache and \
362
+ exists(self.cached_freqs) and \
363
+ (offset + seq_len) <= self.cached_freqs_seq_len
364
+ ):
365
+ return self.cached_freqs[offset:(offset + seq_len)].detach()
366
+
367
+ freqs = self.freqs
368
+
369
+ freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
370
+ freqs = repeat(freqs, '... n -> ... (n r)', r=2)
371
+
372
+ if should_cache and offset == 0:
373
+ self.cached_freqs[:seq_len] = freqs.detach()
374
+ self.cached_freqs_seq_len = seq_len
375
+
376
+ return freqs
377
+
378
+
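A small, hypothetical call pattern for the class above (shapes are made up): with the default `seq_before_head_dim=False`, tensors are expected as `(batch, heads, seq_len, head_dim)`, and only the first `dim` features of each head are rotated.

```python
# Hypothetical shapes, just to show the call pattern; not taken from this repository.
import torch

rope = RotaryEmbedding(dim=32)        # rotate the first 32 features of each 64-dim head
q = torch.randn(2, 8, 256, 64)        # (batch, heads, seq_len, head_dim)
k = torch.randn(2, 8, 256, 64)
q = rope.rotate_queries_or_keys(q)    # positions are now encoded as rotations in q and k
k = rope.rotate_queries_or_keys(k)
```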
379
+ #################################################################################
380
+ # Multi Head Attention #
381
+ #################################################################################
382
+
383
+ class LayerNorm(nn.Module):
384
+ """Implements a Layer Normalization module."""
385
+ def __init__(self, d_model, eps=1e-12):
386
+ """Initializes the LayerNorm module.
387
+
388
+ Args:
389
+ d_model (int): The dimension of the model's features.
390
+ eps (float): A small value added to the variance for numerical stability.
391
+ """
392
+ super(LayerNorm, self).__init__()
393
+ self.gamma = nn.Parameter(torch.ones(d_model))
394
+ self.beta = nn.Parameter(torch.zeros(d_model))
395
+ self.eps = eps
396
+
397
+ def forward(self, x):
398
+ """Applies Layer Normalization to the input tensor along the last dimension.
399
+ Args:
400
+ x (torch.Tensor): The input tensor to normalize.
401
+ Returns:
402
+ torch.Tensor: The normalized tensor.
403
+ """
404
+ mean = x.mean(-1, keepdim=True)
405
+ var = x.var(-1, unbiased=False, keepdim=True)
406
+ # '-1' means last dimension.
407
+
408
+ out = (x - mean) / torch.sqrt(var + self.eps)
409
+ out = self.gamma * out + self.beta
410
+ return out
411
+
412
+
413
+ class PositionwiseFeedForward(nn.Module):
414
+ """Implements the Position-wise Feed-Forward network of a Transformer block."""
415
+
416
+ def __init__(self, d_model, hidden, drop_prob=0.1):
417
+ """Initializes the PositionwiseFeedForward module.
418
+
419
+ Args:
420
+ d_model (int): The input and output dimension of the layer.
421
+ hidden (int): The dimension of the inner hidden layer.
422
+ drop_prob (float): The probability for the dropout layer.
423
+ """
424
+ super(PositionwiseFeedForward, self).__init__()
425
+ self.linear1 = nn.Linear(d_model, hidden)
426
+ self.linear2 = nn.Linear(hidden, d_model)
427
+ self.relu = nn.ReLU()
428
+ self.dropout = nn.Dropout(p=drop_prob)
429
+
430
+ def forward(self, x):
431
+ """Passes the input through the feed-forward network.
432
+ The process is: Linear -> ReLU -> Dropout -> Linear.
433
+ Args:
434
+ x (torch.Tensor): The input tensor.
435
+ Returns:
436
+ torch.Tensor: The output tensor.
437
+ """
438
+ x = self.linear1(x)
439
+ x = self.relu(x)
440
+ x = self.dropout(x)
441
+ x = self.linear2(x)
442
+ return x
443
+
444
+
445
+ class ScaleDotProductAttention(nn.Module):
446
+
447
+ def __init__(self):
448
+ super(ScaleDotProductAttention, self).__init__()
449
+ self.softmax = nn.Softmax(dim=-1)
450
+
451
+ def forward(self, q, k, v, mask=None, e=1e-12):
452
+ """
453
+ Performs the Scaled Dot-Product Attention calculation.
454
+
455
+ Args:
456
+ q (torch.Tensor): The query tensor.
457
+ k (torch.Tensor): The key tensor.
458
+ v (torch.Tensor): The value tensor.
459
+ mask (torch.Tensor, optional): A mask to prevent attention to
460
+ certain positions. Defaults to None.
461
+
462
+ Returns:
463
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing the attention
464
+ output and the attention scores.
465
+ """
466
+ batch_size, head, length, d_tensor = k.size()
467
+ k_t = k.transpose(2, 3) # transpose
468
+ score = (q @ k_t) / math.sqrt(d_tensor) # scaled dot product
469
+ if mask is not None:
470
+ score = score.masked_fill(mask == 0, -10000)
471
+ score = self.softmax(score)
472
+ v = score @ v
473
+ return v, score
474
+
475
+
476
+ class MultiHeadAttention(nn.Module):
477
+ """Implements a Multi-Head Attention layer with optional Rotary Position Embeddings."""
478
+
479
+ def __init__(self, d_model, n_head):
480
+ """Initializes the MultiHeadAttention layer.
481
+
482
+ Args:
483
+ d_model (int): The total dimension of the model.
484
+ n_head (int): The number of attention heads. d_model must be divisible by n_head.
485
+ """
486
+ super(MultiHeadAttention, self).__init__()
487
+ self.n_head = n_head
488
+ self.attention = ScaleDotProductAttention()
489
+ self.w_q = nn.Linear(d_model, d_model)
490
+ self.w_k = nn.Linear(d_model, d_model)
491
+ self.w_v = nn.Linear(d_model, d_model)
492
+ self.w_concat = nn.Linear(d_model, d_model)
493
+
494
+ self.rotary_emb = RotaryEmbedding(dim=d_model // n_head)
495
+
496
+ def forward(self, q, k, v, mask=None, apply_rotary=False):
497
+ """Performs the forward pass for multi-head attention.
498
+
499
+ Args:
500
+ q (torch.Tensor): The query tensor.
501
+ k (torch.Tensor): The key tensor.
502
+ v (torch.Tensor): The value tensor.
503
+ mask (torch.Tensor, optional): An attention mask. Defaults to None.
504
+ apply_rotary (bool): If True, applies Rotary Position Embeddings to Q and K
505
+ before the attention calculation. Defaults to False.
506
+
507
+ Returns:
508
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing the final output tensor
509
+ and the attention scores.
510
+ """
511
+ q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
512
+ q, k, v = self.split(q), self.split(k), self.split(v)
513
+
514
+ if apply_rotary:
515
+ # add Rotary Positional Embeddings (RoPE)
516
+ # https://arxiv.org/abs/2104.09864
517
+ q = self.rotary_emb.rotate_queries_or_keys(q)
518
+ k = self.rotary_emb.rotate_queries_or_keys(k)
519
+
520
+ out, attention = self.attention(q, k, v, mask=mask)
521
+ out = self.concat(out)
522
+ out = self.w_concat(out)
523
+ return out, attention
524
+
525
+ def split(self, tensor):
526
+ """Splits the last dimension of a tensor into multiple heads."""
527
+ batch_size, length, d_model = tensor.size()
528
+ d_tensor = d_model // self.n_head
529
+ tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)
530
+ return tensor
531
+
532
+ def concat(self, tensor):
533
+ """Concatenates multiple heads back into a single tensor."""
534
+ batch_size, head, length, d_tensor = tensor.size()
535
+ d_model = head * d_tensor
536
+ tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
537
+ return tensor
538
+
539
+
540
+ #################################################################################
541
+ # Embedding Layers #
542
+ #################################################################################
543
+
544
+ class EmbeddingLayer(nn.Module):
545
+ """A simple lookup-based embedding layer with Kaiming uniform initialization."""
546
+ def __init__(self, dim, vocab_dim):
547
+ super().__init__()
548
+ self.embedding = nn.Parameter(torch.empty((vocab_dim, dim)))
549
+ torch.nn.init.kaiming_uniform_(self.embedding, a=math.sqrt(5))
550
+
551
+ def forward(self, x):
552
+ """Looks up the embeddings for the given indices.
553
+ Args:
554
+ x (torch.Tensor): A tensor of integer indices.
555
+ Returns:
556
+ torch.Tensor: The corresponding embedding vectors.
557
+ """
558
+ return self.embedding[x]
559
+
560
+
561
+ class TimestepEmbedder(nn.Module):
562
+ """
563
+ Embeds scalar timesteps into vector representations.
564
+ """
565
+
566
+ def __init__(self, hidden_size, frequency_embedding_size=256):
567
+ """Initializes the TimestepEmbedder.
568
+
569
+ Args:
570
+ hidden_size (int): The final dimension of the timestep embedding.
571
+ frequency_embedding_size (int): The number of frequencies to use for
572
+ the sinusoidal embedding.
573
+ """
574
+ super().__init__()
575
+ self.mlp = nn.Sequential(
576
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
577
+ nn.SiLU(),
578
+ nn.Linear(hidden_size, hidden_size, bias=True))
579
+ self.frequency_embedding_size = frequency_embedding_size
580
+
581
+ @staticmethod
582
+ def timestep_embedding(t, dim, max_period=10000):
583
+ """
584
+ Create sinusoidal timestep embeddings.
585
+ :param t: a 1-D Tensor of N indices, one per batch element.
586
+ These may be fractional.
587
+ :param dim: the dimension of the output.
588
+ :param max_period: controls the minimum frequency of the embeddings.
589
+ :return: an (N, D) Tensor of positional embeddings.
590
+ """
591
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
592
+ half = dim // 2
593
+ freqs = torch.exp(
594
+ - math.log(max_period)
595
+ * torch.arange(start=0, end=half, dtype=torch.float32)
596
+ / half).to(device=t.device)
597
+ args = t[:, None].float() * freqs[None]
598
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
599
+ if dim % 2:
600
+ embedding = torch.cat(
601
+ [embedding,
602
+ torch.zeros_like(embedding[:, :1])], dim=-1)
603
+ return embedding
604
+
605
+ def forward(self, t):
606
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
607
+ t_emb = self.mlp(t_freq)
608
+ return t_emb
609
+
610
+
611
+ #################################################################################
612
+ # Decoder #
613
+ #################################################################################
614
+
615
+ class DecoderLayer(nn.Module):
616
+ """
617
+ code source: https://github.com/hyunwoongko/transformer
618
+ """
619
+
620
+ def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
621
+ """Initializes the DecoderLayer.
622
+
623
+ Args:
624
+ d_model (int): The dimension of the model.
625
+ ffn_hidden (int): The dimension of the hidden layer in the feed-forward network.
626
+ n_head (int): The number of attention heads.
627
+ drop_prob (float): The dropout probability.
628
+ """
629
+ super(DecoderLayer, self).__init__()
630
+
631
+ self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
632
+ self.norm1 = LayerNorm(d_model=d_model)
633
+ self.dropout1 = nn.Dropout(p=drop_prob)
634
+
635
+ self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
636
+ self.norm2 = LayerNorm(d_model=d_model)
637
+ self.dropout2 = nn.Dropout(p=drop_prob)
638
+
639
+ self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
640
+ self.norm3 = LayerNorm(d_model=d_model)
641
+ self.dropout3 = nn.Dropout(p=drop_prob)
642
+
643
+ def forward(self, dec, enc, trg_mask, src_mask, return_attention=False):
644
+ """Performs one forward pass of the decoder layer.
645
+
646
+ Args:
647
+ dec (torch.Tensor): The input tensor from the previous decoder layer.
648
+ enc (torch.Tensor): The output tensor from the encoder (for conditioning).
649
+ trg_mask (torch.Tensor): The mask for the decoder's self-attention.
650
+ src_mask (torch.Tensor): The mask for the cross-attention.
651
+ return_attention (bool): If True, returns the cross-attention weights.
652
+
653
+ Returns:
654
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing the output tensor
655
+ and the attention weights (or None).
656
+ """
657
+ attention = None
658
+
659
+ _x = dec
660
+ x, _ = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask, apply_rotary=True)
661
+ x = self.dropout1(x)
662
+ x = self.norm1(x + _x)
663
+
664
+ if enc is not None:
665
+ _x = x
666
+ if return_attention:
667
+ x, attention = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
668
+ else:
669
+ x, _ = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
670
+ x = self.dropout2(x)
671
+ x = self.norm2(x + _x)
672
+
673
+ _x = x
674
+ x = self.ffn(x)
675
+ x = self.dropout3(x)
676
+ x = self.norm3(x + _x)
677
+ return x, attention
678
+
679
+
680
+ class Decoder_RoPE(nn.Module):
681
+ """A decoder that uses Rotary Position Embeddings (RoPE).
682
+
683
+ This model is designed for a diffusion task, taking a ligand sequence, a
684
+ conditioning protein sequence, and a diffusion timestep (sigma) as input
685
+ to predict the output logits for the ligand.
686
+ """
687
+ def __init__(self,
688
+ vocab_size,
689
+ seq_emb_dim,
690
+ hidden_size: int=640,
691
+ nhead: int=8,
692
+ n_layers: int=4,
693
+ expand_feedforward: int=3,
694
+ dropout: float=0.1):
695
+
696
+ """Args:
697
+ vocab_size (int): The size of the output vocabulary (e.g., ligand tokens).
698
+ seq_emb_dim (int): The dimension of the input sequence embeddings.
699
+ hidden_size (int): The main hidden dimension of the Transformer model.
700
+ nhead (int): The number of attention heads in each DecoderLayer.
701
+ n_layers (int): The number of DecoderLayers to stack.
702
+ expand_feedforward (int): The expansion factor for the feed-forward
703
+ network's hidden layer.
704
+ dropout (float): The dropout probability.
705
+ """
706
+ super().__init__()
707
+
708
+ self.hidden_size = hidden_size
709
+ self.vocab_embed = EmbeddingLayer(self.hidden_size, vocab_size)
710
+ self.linear = nn.Linear(self.hidden_size, vocab_size)
711
+ self.apply_seq_linear = False
712
+
713
+ if seq_emb_dim != self.hidden_size:
714
+ self.apply_seq_linear = True
715
+ self.linear_seq = nn.Linear(seq_emb_dim, self.hidden_size)
716
+
717
+ self.sigma_map = TimestepEmbedder(self.hidden_size)
718
+
719
+ self.layers = nn.ModuleList([DecoderLayer(d_model=self.hidden_size,
720
+ ffn_hidden=self.hidden_size * expand_feedforward,
721
+ n_head=nhead,
722
+ drop_prob=dropout)
723
+ for _ in range(n_layers)])
724
+
725
+ def forward(self,
726
+ ligand: torch.Tensor,
727
+ sigma: torch.Tensor,
728
+ sequence: torch.Tensor,
729
+ sequence_lengths: torch.Tensor,
730
+ lig_padding_mask: Optional[torch.Tensor]=None,
731
+ return_attention: bool=False) -> Tuple[torch.Tensor, torch.Tensor]:
732
+ """Performs the forward pass of the decoder.
733
+
734
+ It processes the ligand sequence conditioned on the protein sequence and the
735
+ diffusion timestep (sigma). The sigma embedding is prepended to the protein
736
+ sequence to form a single conditioning context.
737
+
738
+ Args:
739
+ ligand (torch.Tensor): A batch of ligand token ID tensors.
740
+ sigma (torch.Tensor): A batch of scalar diffusion timesteps.
741
+ sequence (torch.Tensor): A batch of conditioning protein sequence embeddings.
742
+ sequence_lengths (torch.Tensor): The original lengths of the protein sequences.
743
+ lig_padding_mask (Optional[torch.Tensor]): A padding mask for the ligand.
744
+ return_attention (bool): If True, returns the cross-attention weights
745
+ from the last decoder layer.
746
+
747
+ Returns:
748
+ Tuple[torch.Tensor, torch.Tensor]: A tuple of (output_logits, attention_weights).
749
+ """
750
+ ligand = self.vocab_embed(ligand)
751
+ sigma = F.silu(self.sigma_map(sigma)).unsqueeze(1)
752
+ if self.apply_seq_linear:
753
+ sequence = self.linear_seq(sequence)
754
+ condition = torch.cat([sigma, sequence], dim=1)
755
+ sequence_lengths += 1
756
+
757
+ range_tensor = torch.arange(condition.shape[1], device=sequence.device).unsqueeze(0)
758
+ condition_mask = range_tensor < sequence_lengths.unsqueeze(1)
759
+ condition_mask = condition_mask.unsqueeze(1).unsqueeze(2)
760
+ if lig_padding_mask is not None:
761
+ lig_padding_mask = lig_padding_mask.unsqueeze(1).unsqueeze(2)
762
+
763
+ for layer in self.layers:
764
+ ligand, attention = layer(ligand, condition,
765
+ trg_mask=lig_padding_mask, src_mask=condition_mask,
766
+ return_attention=return_attention)
767
+
768
+ output = self.linear(ligand)
769
+ return output, attention
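
The decoder above is the module that the app drives at generation time. A minimal shape-check sketch, not part of the repository: the sizes are illustrative, the import path follows `protobind_diff/decoder_rope.py`, and it assumes the vendored `RotaryEmbedding` defaults defined earlier in this file.

```python
# Hypothetical smoke test for Decoder_RoPE; all dimensions are illustrative only.
import torch
from protobind_diff.decoder_rope import Decoder_RoPE

decoder = Decoder_RoPE(vocab_size=300, seq_emb_dim=1280)   # 1280 = ESM-2 650M width
ligand = torch.randint(0, 300, (2, 170))                   # batch of ligand token ids
sigma = torch.rand(2)                                      # diffusion timesteps
sequence = torch.randn(2, 512, 1280)                       # protein residue embeddings
sequence_lengths = torch.tensor([512, 480])

logits, _ = decoder(ligand, sigma, sequence, sequence_lengths)
print(logits.shape)                                        # torch.Size([2, 170, 300])
```

The sigma embedding is prepended to the protein embeddings, so the cross-attention context has `seq_len + 1` positions, and the returned logits have one row per ligand position over the ligand vocabulary.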
protobind_diff/esm_inference.py ADDED
@@ -0,0 +1,175 @@
1
+ import argparse, sys
2
+ from typing import Optional, Tuple
3
+ from pathlib import Path
4
+ import esm
5
+ import os
6
+ import torch
7
+ import numpy as np
8
+ import re
9
+ from Bio import SeqIO
10
+ from torch.utils.data import Dataset, DataLoader
11
+ import lightning.pytorch as pl
12
+ from protobind_diff.model import ModelGenerator
13
+ from protobind_diff.data_loader import InferenceDataset
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ REPO_ID = "ai-gero/ProtoBind-Diff"
17
+ FILENAME = "model.ckpt"
18
+ TOKENIZER_FILENAME = "tokenizer_smiles_diffusion.json"
19
+
20
+ class ProtobindInference():
21
+ """
22
+ Simplified inference class that only supports ProtobindMaskedDiffusion model.
23
+ """
24
+
25
+ def __init__(self, checkpoint_path, tokenizer_path,
26
+ sequence_embedding_dim, lig_max_length: int=170, nucleus_p: float=0.9,
27
+ eta: float=0.1, sampling_steps: int=250,
28
+ **kwargs):
29
+ self.checkpoint_path = Path(checkpoint_path)
30
+ self.tokenizer_path = Path(tokenizer_path)
31
+ self.sequence_embedding_dim = sequence_embedding_dim
32
+
33
+ # Set up sampler params
34
+ self.lig_max_length = lig_max_length
35
+ self.nucleus_p = nucleus_p
36
+ self.eta = eta
37
+ self.sampling_steps = sampling_steps
38
+
39
+ # Load model
40
+ self.model = self.load_model()
41
+
42
+ def predict_on_dataloader(self, dl, devices=1, accelerator='cuda') -> Tuple[np.ndarray, np.ndarray]:
43
+ if accelerator == 'cuda':
44
+ torch.set_float32_matmul_precision('medium')
45
+ precision = "16-mixed"
46
+ else:
47
+ precision = "32-true"
48
+ trainer = pl.Trainer(precision=precision, use_distributed_sampler=False,
49
+ inference_mode=True, accelerator=accelerator, devices=devices)
50
+ predictions_batches = trainer.predict(model=self.model, dataloaders=dl)
51
+ return predictions_batches
52
+
53
+ def load_model(self):
54
+ """Simplified model loading - only supports ModelGenerator"""
55
+ model = ModelGenerator.load_from_checkpoint(
56
+ self.checkpoint_path,
57
+ tokenizer_path=self.tokenizer_path,
58
+ seq_embedding_dim=self.sequence_embedding_dim,
59
+ load=True,
60
+ )
61
+ model.model_length = self.lig_max_length
62
+ model.nucleus_p = self.nucleus_p
63
+ model.eta = self.eta
64
+ model.sampling_steps = self.sampling_steps
65
+ model.model.eval()
66
+ return model
67
+
68
+ def get_esm_embedding(sequence: str, model_name: str, device: torch.device) -> torch.Tensor:
69
+ """Generates a protein embedding using a pre-trained ESM model.
70
+
71
+ Args:
72
+ sequence (str): The amino acid sequence.
73
+ model_name (str): The name of the ESM model to use.
74
+ device (torch.device): The device to run the model on.
75
+
76
+ Returns:
77
+ torch.Tensor: The final residue-level embedding tensor, with start/end tokens removed.
78
+ """
79
+ model, alphabet = esm.pretrained.load_model_and_alphabet(model_name)
80
+ model.eval()
81
+ number_layers = re.search(r'_t(\d+)_', model_name)
82
+ number_layers = int(number_layers.group(1))
83
+
84
+ model = model.to(device)
85
+ batch_converter = alphabet.get_batch_converter()
86
+ _, _, tokens = batch_converter([("protein", sequence)])
87
+ tokens = tokens.to(device)
88
+ with torch.no_grad():
89
+ out = model(tokens, repr_layers=[number_layers])
90
+ return out["representations"][number_layers][:, 1:-1, :] # [1, seq_len, emb_dim]
91
+
92
+ def download_from_hub_hf(cache: Path, filename) -> Path:
93
+ """
94
+ Fetch file from Hugging Face into `cache`.
95
+ Returns the local path to the file inside HF’s cache structure.
96
+ """
97
+ cache.mkdir(parents=True, exist_ok=True)
98
+ local_path = hf_hub_download(
99
+ repo_id=REPO_ID,
100
+ filename=filename,
101
+ cache_dir=cache,
102
+ )
103
+ return Path(local_path)
104
+
105
+ def main():
106
+ parser = argparse.ArgumentParser()
107
+ parser.add_argument("--sequence", help="Amino acid sequence (1-letter code)")
108
+ parser.add_argument("--output_dir", default="./outputs", help="Output dir for SMILES")
109
+ parser.add_argument("--output", default="generated_smiles.txt", help="Output file for generated SMILES")
110
+ parser.add_argument("--n_batches", type=int, default=5, help="Number of batches to generate for this sequence")
111
+ parser.add_argument("--batch_size", type=int, default=10, help="Max number of generated molecules per batch")
112
+ parser.add_argument("--fasta_file", default="./examples/input.fasta", help="Input FASTA file")
113
+ parser.add_argument("--checkpoint_path", type=str, help="Path to the model checkpoint")
114
+ parser.add_argument('--model_name', type=str, default='esm2_t33_650M_UR50D',
115
+ help="ESM model name. See https://github.com/facebookresearch/esm")
116
+ parser.add_argument('--tokenizer_path', help='Path to tokenizer.json file. If not provided, uses a default path and downloads if needed.')
117
+ parser.add_argument('--cache', type=str, default = "./cache", help='Cache folder for ckpt')
118
+
119
+ parser.add_argument("--sampling_steps", type=int, default=250, help="Number of steps during sampling")
120
+ parser.add_argument("--lig_max_length", type=int, default=170, help="Max length of generated molecules")
121
+ parser.add_argument("--nucleus_p", type=float, default=0.9,
122
+ help="Value of the nucleus sampling parameter. For more details, see https://arxiv.org/abs/2503.00307")
123
+ parser.add_argument("--eta", type=float, default=0.1,
124
+ help="Value of the probability of remasking. For more details, see https://arxiv.org/abs/2503.00307")
125
+
126
+ args = parser.parse_args()
127
+ # An explicit --sequence takes precedence over the default FASTA example path
+ if args.sequence:
128
+ sequence = args.sequence.strip().upper()
129
+ elif args.fasta_file:
130
+ sequence = str(next(SeqIO.parse(args.fasta_file, "fasta")).seq)
131
+ else:
132
+ sys.exit("Error: provide --sequence or --fasta_file")
133
+
134
+ if args.checkpoint_path:
135
+ ckpt_path = Path(args.checkpoint_path)
136
+ else:
137
+ torch.hub.set_dir(args.cache) # for ESM model
138
+ ckpt_path = download_from_hub_hf(Path(args.cache), FILENAME)
139
+
140
+ if args.tokenizer_path:
141
+ tokenizer_path = Path(args.tokenizer_path)
142
+ if not tokenizer_path.exists():
143
+ sys.exit(f"Error: Tokenizer file not found at specified path: {tokenizer_path}")
144
+ else:
145
+ tokenizer_path = download_from_hub_hf(Path(args.cache), TOKENIZER_FILENAME)
146
+
147
+ # Determine the device
148
+ if torch.cuda.is_available():
149
+ device = torch.device("cuda") # Use CUDA if available
150
+ elif torch.backends.mps.is_available():
151
+ device = torch.device("mps") # Use MPS for Apple Silicon if available
152
+ else:
153
+ device = torch.device("cpu") # Fallback to CPU
154
+
155
+ embedding = get_esm_embedding(sequence, args.model_name, device).to(dtype=torch.bfloat16)
156
+ sequence_embedding_dim = embedding.shape[2]
157
+ dataset = InferenceDataset(embedding, batch_size=args.batch_size, n_batches=args.n_batches)
158
+ loader = DataLoader(dataset, batch_size=None)
159
+ model = ProtobindInference(ckpt_path, tokenizer_path, sequence_embedding_dim,
160
+ sampling_steps=args.sampling_steps, nucleus_p=args.nucleus_p,
161
+ eta=args.eta, lig_max_length=args.lig_max_length,)
162
+
163
+ predictions = model.predict_on_dataloader(loader, accelerator=str(device))
164
+
165
+ all_smiles = [smi for batch in predictions for smi in batch[0]]
166
+ out_dir = Path(args.output_dir)
167
+ os.makedirs(out_dir, exist_ok=True)
168
+ with open(out_dir / args.output, "w") as f:
169
+ f.write("SMILES\n")
170
+ for smi in all_smiles:
171
+ f.write(smi + "\n")
172
+
173
+
174
+ if __name__ == "__main__":
175
+ main()
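
For reference, the embedding helper can also be used on its own. A hedged sketch follows: the peptide is arbitrary, the model name matches the script's default, and the call downloads the ESM-2 weights on first use (several GB for the 650M model).

```python
# Hypothetical standalone use of get_esm_embedding; the peptide string is arbitrary.
import torch
from protobind_diff.esm_inference import get_esm_embedding

seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emb = get_esm_embedding(seq, "esm2_t33_650M_UR50D", device)
print(emb.shape)   # (1, len(seq), 1280); start/end tokens are already stripped
```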
protobind_diff/ligands/__init__.py ADDED
File without changes
protobind_diff/ligands/rdkit_utils.py ADDED
@@ -0,0 +1,209 @@
1
+ import importlib
2
+ from typing import Optional, Tuple, Union, List
3
+ import numpy as np
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+ from multiprocessing import Pool
+ import joblib  # used by smiles_to_mols and smiles_to_fps below
7
+ from pathlib import Path
8
+ from rdkit import Chem
9
+
10
+ from FPSim2 import FPSim2Engine
11
+ import rdkit
12
+ from rdkit import Chem, RDLogger
13
+ from rdkit.Chem import DataStructs, Descriptors
14
+ from rdkit.DataStructs import BulkTanimotoSimilarity
15
+ from sklearn.cluster import DBSCAN
16
+ import scipy
17
+ RDLogger.DisableLog('rdApp.*')
18
+
19
+
20
+ class BoostWrapper(object):
21
+ """ Help joblib to deal with boost functions """
22
+ def __init__(self, method_name, module_name):
23
+ self.method_name = method_name
24
+ self.module = importlib.import_module(module_name)
25
+
26
+ @property
27
+ def method(self):
28
+ return getattr(self.module, self.method_name)
29
+
30
+ def __call__(self, *args, **kwargs):
31
+ return self.method(*args, **kwargs)
32
+
33
+
34
+ def cluster_fpsim2(distance_path, smiles_h5_path=None, dist_eps=0.15):
35
+ """ Cluster precomputed FPSim2 distance matrix using DBSCAN algorithm """
36
+ if isinstance(distance_path, str):
37
+ distance_path = Path(distance_path)
38
+
39
+ if smiles_h5_path is None:
40
+ smiles_h5_path = distance_path.parent / 'all_smiles.h5'
41
+ precomputed_indices = FPSim2Engine(smiles_h5_path).fps[:, 0]
42
+ map_precomputed = np.argsort(precomputed_indices) # maps original smiles order to FPSim2 order
43
+
44
+ precomputed_distance = scipy.sparse.load_npz(distance_path)
45
+ db = DBSCAN(eps=dist_eps, min_samples=1, metric='precomputed', n_jobs=-1)
46
+ labels = db.fit_predict(precomputed_distance)
47
+
48
+ # df_ = pd.DataFrame(data=smiles.keys(), index=list(smiles.values()), columns=['SMILES'])
49
+ # df_ = df_.sort_index()
50
+ # df_['cluster'] = labels[map_precomputed]
51
+ return labels[map_precomputed]
52
+
53
+
54
+ def tanimoto_smiles(mol1, mol2, fp='rdkit', bits=2048, radius=2):
55
+
56
+ if isinstance(mol1, str):
57
+ mol1 = Chem.MolFromSmiles(mol1)
58
+ if isinstance(mol2, str):
59
+ mol2 = Chem.MolFromSmiles(mol2)
60
+
61
+ _supported_fps = {
62
+ 'rdkit': Chem.RDKFingerprint,
63
+ 'morgan': Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect,
64
+ 'maccs': Chem.rdMolDescriptors.GetMACCSKeysFingerprint,
65
+ }
66
+ if fp not in _supported_fps:
67
+ raise ValueError(f"Fingerprint {fp} is not supported, available fps {_supported_fps.keys()}")
68
+
69
+ ffp = None
70
+ if fp == 'rdkit':
71
+ ffp = lambda x: _supported_fps[fp](x, fpSize=bits)
72
+ elif fp == 'morgan':
73
+ ffp = lambda x: _supported_fps[fp](x, radius=radius, nBits=bits)
74
+ elif fp == 'maccs':
75
+ ffp = _supported_fps[fp]
76
+
77
+ return rdkit.DataStructs.TanimotoSimilarity(ffp(mol1), ffp(mol2))
78
+
79
+
80
+ def validate_smile(smile):
81
+ try:
82
+ mol = Chem.MolFromSmiles(smile)
83
+ Chem.SanitizeMol(mol)
84
+ return smile
85
+ except Exception:
86
+ return None
87
+
88
+
89
+ def calc_chem_desc(smiles):
90
+ rdkit_features = {'MolWt': rdkit.Chem.Descriptors.MolWt,
91
+ 'MolLogP': rdkit.Chem.Descriptors.MolLogP,
92
+ 'NumRotatableBonds': rdkit.Chem.Descriptors.NumRotatableBonds,
93
+ 'CalcTPSA': rdkit.Chem.rdMolDescriptors.CalcTPSA,
94
+ 'RingCount': rdkit.Chem.Descriptors.RingCount,
95
+ }
96
+ if isinstance(smiles[0], str):
97
+ mols = smiles_to_mols(smiles)
98
+ elif isinstance(smiles[0], rdkit.Chem.rdchem.Mol):
99
+ mols = smiles
100
+ else:
101
+ raise TypeError(f'smiles must be a string or a rdkit.Chem.rdchem.Mol: {type(smiles[0])}')
102
+ res = {}
103
+ for name, func in rdkit_features.items():
104
+ res[name] = np.asarray([func(m) if m is not None else np.nan for m in mols ])
105
+ return pd.DataFrame(res)
106
+
107
+
108
+ def smiles_to_mols(smiles, n_jobs=8):
109
+ if isinstance(smiles, (list, tuple, np.ndarray)):
110
+ pass
111
+ elif isinstance(smiles, pd.Series):
112
+ smiles = smiles.tolist()
113
+ else:
114
+ raise TypeError(f"{type(smiles)=}")
115
+
116
+ assert len(smiles) > 0
117
+ assert isinstance(smiles[0], str), f"expect smiles string, got f{smiles[0]}"
118
+
119
+ mols = joblib.Parallel(n_jobs=n_jobs)(
120
+ joblib.delayed(BoostWrapper('MolFromSmiles', 'rdkit.Chem.rdmolfiles', ))(smi) for smi in smiles)
121
+ return mols
122
+
123
+
124
+ def smiles_to_fps(smiles_or_mols, finger_type='rdkit', n_jobs=8, fp_param=None):
125
+ if isinstance(smiles_or_mols, (list, tuple, np.ndarray)):
126
+ pass
127
+ elif isinstance(smiles_or_mols, pd.Series):
128
+ smiles_or_mols = smiles_or_mols.tolist()
129
+ else:
130
+ raise TypeError(f"{type(smiles_or_mols)=}")
131
+
132
+ assert len(smiles_or_mols) > 0
133
+ assert isinstance(smiles_or_mols[0],
134
+ (str, rdkit.Chem.rdchem.Mol)), f"variable {smiles_or_mols[0]} has type {type(smiles_or_mols[0])}"
135
+
136
+ if isinstance(smiles_or_mols[0], str):
137
+ mols = smiles_to_mols(smiles_or_mols)
138
+ else:
139
+ mols = smiles_or_mols
140
+
141
+ if fp_param is None:
142
+ fp_param = {}
143
+ fp_func, fp_func_name, fp_func_module, fp_params = _find_fingerprint_function(finger_type)
144
+ fp_params.update(fp_param)
145
+ if finger_type == 'morgan':
146
+ fp_func = fp_func(**fp_params).GetFingerprint
147
+ fp_params = {}
148
+ fps = joblib.Parallel(n_jobs=n_jobs, prefer="threads")(
149
+ joblib.delayed(fp_func)(mol, **fp_params) for mol in mols)
150
+ return fps
151
+
152
+
153
+ def _find_fingerprint_function(finger_type: str) -> Tuple[callable, str, str, dict]:
154
+ kwargs = {}
155
+ if finger_type == 'rdkit':
156
+ fp_func_name = 'RDKFingerprint'
157
+ fp_func_module = 'rdkit.Chem'
158
+ elif finger_type == 'maccs':
159
+ fp_func_name = 'GetMACCSKeysFingerprint'
160
+ fp_func_module = 'rdkit.Chem.rdMolDescriptors'
161
+ elif finger_type == 'morgan':
162
+ fp_func_name = 'GetMorganGenerator'
163
+ fp_func_module = 'rdkit.Chem.AllChem'
164
+ kwargs = dict(atomInvariantsGenerator=rdkit.Chem.rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
165
+ radius=2, fpSize=2048, countSimulation=True)
166
+ else:
167
+ raise NotImplementedError(f"Use `rdkit` or `maccs` or `morgan` as fps")
168
+
169
+ fp_func = getattr(importlib.import_module(fp_func_module), fp_func_name)
170
+ return fp_func, fp_func_name, fp_func_module, kwargs
171
+
172
+
173
+ def randomize_smiles_rotated(smiles: str, with_order_reversal: bool = True) -> str:
174
+ """
175
+ Randomize a SMILES string by doing a cyclic rotation of the atomic indices.
176
+
177
+ Adapted from https://github.com/GLambard/SMILES-X/blob/758478663030580a363a9ee61c11f6d6448e18a1/SMILESX/augm.py#L19.
178
+
179
+ The outputs of this function can be reproduced by setting the seed with np.random.seed().
180
+
181
+ Raises:
182
+ InvalidSmiles: for invalid molecules.
183
+
184
+ Args:
185
+ smiles: SMILES string to randomize.
186
+ with_order_reversal: whether to reverse the atom order with 50% chance.
187
+
188
+ Returns:
189
+ Randomized SMILES string.
190
+ """
191
+
192
+ mol = Chem.MolFromSmiles(smiles, sanitize=False)
193
+
194
+ n_atoms = mol.GetNumAtoms()
195
+
196
+ # Generate random values
197
+ rotation_index = np.random.randint(0, n_atoms - 1)
198
+ reverse_order = with_order_reversal and np.random.choice([True, False])
199
+
200
+ # Generate new atom indices order
201
+ atoms = list(range(n_atoms))
202
+ new_atoms_order = (
203
+ atoms[rotation_index % len(atoms) :] + atoms[: rotation_index % len(atoms)]
204
+ )
205
+ if reverse_order:
206
+ new_atoms_order.reverse()
207
+
208
+ mol = Chem.RenumberAtoms(mol, new_atoms_order)
209
+ return Chem.MolToSmiles(mol, canonical=False)
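
A quick sanity check of the validation and similarity helpers above. This is an illustrative sketch only; it assumes RDKit is installed and uses arbitrary example SMILES.

```python
# Hypothetical usage of validate_smile and tanimoto_smiles from rdkit_utils.
from protobind_diff.ligands.rdkit_utils import validate_smile, tanimoto_smiles

print(validate_smile("c1ccccc1O"))     # returns the SMILES itself: phenol parses
print(validate_smile("C1CC"))          # returns None: unclosed ring, invalid
print(tanimoto_smiles("CCO", "CCCO"))  # RDKit-fingerprint Tanimoto similarity in (0, 1]
```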
protobind_diff/ligands/smiles_tokenizer.py ADDED
@@ -0,0 +1,135 @@
1
+ # Taken from https://github.com/MolecularAI/Chemformer/
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from pysmilesutils.tokenize import SMILESTokenizer
4
+
5
+ class ChemformerTokenizer(SMILESTokenizer):
6
+ """
7
+ Tokenizer for the Chemformer.
8
+
9
+ There are a few different features that sets this apart from the `SMILESTokenizer`:
10
+ * It reserves two extra special tokens, "mask" and "sep"
11
+ * It distinguish between chemical and non-chemical tokens
12
+
13
+ :param smiles: A list of SMILES that are used to create the vocabulary for the tokenizer. Defaults to None.
14
+ :param tokens: A list of tokens (strings) that the tokenizer uses when tokenizing SMILES. Defaults to None.
15
+ :param regex_token_patterns: A list of regular expressions that the tokenizer uses when tokenizing SMILES.
16
+ :param beginning_of_smiles_token: Token that is added to beginning of SMILES. Defaults to "^".
17
+ :param end_of_smiles_token: Token that is added to the end of SMILES. Defaults to "&".
18
+ :param padding_token: Token used for padding. Defalts to " ".
19
+ :param unknown_token: Token that is used for unknown ids when decoding encoded data. Defaults to "?".
20
+ :param mask_token: Token that is used by the Masker
21
+ :param sep_token: Token that is used to separate sentences, currently unused
22
+ :param filename: if given and `smiles` is None, load the vocabulary from disc
23
+ :raises: ValueError: If the `encoding_type` is invalid.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ smiles: List[str] = None,
29
+ tokens: List[str] = None,
30
+ regex_token_patterns: List[str] = None,
31
+ beginning_of_smiles_token: str = "^",
32
+ end_of_smiles_token: str = "&",
33
+ padding_token: str = "<PAD>",
34
+ unknown_token: str = "?",
35
+ mask_token: str = "<MASK>",
36
+ sep_token: str = "<SEP>",
37
+ filename: str = None,
38
+ ) -> None:
39
+ self._mask_token = mask_token
40
+ self._sep_token = sep_token
41
+ self._chem_start_idx = 6 # Default: number of special tokens (indices 0-5 are special)
42
+ self._chem_token_idxs: Optional[List[int]] = None
43
+ super().__init__(
44
+ smiles=smiles,
45
+ tokens=tokens,
46
+ regex_token_patterns=regex_token_patterns,
47
+ beginning_of_smiles_token=beginning_of_smiles_token,
48
+ end_of_smiles_token=end_of_smiles_token,
49
+ padding_token=padding_token,
50
+ unknown_token=unknown_token,
51
+ encoding_type="index",
52
+ filename=filename,
53
+ )
54
+
55
+
56
+ @property
57
+ def chem_token_idxs(self) -> List[int]:
58
+ """Returns the indices of the vocabulary that are chemical tokens"""
59
+ if self._chem_token_idxs is None:
60
+ self._chem_token_idxs = list(range(self._chem_start_idx, len(self.vocabulary)))
61
+ return self._chem_token_idxs
62
+
63
+ @property
64
+ def mask_token_id(self):
65
+ """Get the mask token id"""
66
+ return self.vocabulary[self._mask_token]
67
+
68
+ @property
69
+ def vocab_size(self):
70
+ return len(self.vocabulary)
71
+
72
+ @property
73
+ def special_tokens(self) -> Dict[str, str]:
74
+ """Returns a dictionary of non-character tokens"""
75
+ return {
76
+ "start": self._beginning_of_smiles_token,
77
+ "end": self._end_of_smiles_token,
78
+ "pad": self._padding_token,
79
+ "unknown": self._unknown_token,
80
+ "mask": self._mask_token,
81
+ "sep": self._sep_token,
82
+ }
83
+
84
+ def add_tokens(self, tokens: List[str], regex: bool = False, smiles=None) -> None:
85
+ """Adds tokens to the classes list of tokens.
86
+
87
+ The new tokens are added to the front of the token list and take priority over old tokens. Note that the
88
+ vocabulary of the tokenizer is not updated after the tokens are added,
89
+ and must be updated by calling `create_vocabulary_from_smiles`.
90
+
91
+ If `regex` is False, the tokens are interpreted as non-chemical tokens, which distinguish
92
+ them for processing by e.g. the masker.
93
+
94
+ :param tokens: List of tokens to be added.
95
+ :param regex: If `True` the input tokens are treated as
96
+ regular expressions and are added to the list of regular expressions
97
+ instead of token list. Defaults to False.
98
+ :param smiles: If a list of smiles is provided, the vocabulary will be created, defaults to None
99
+
100
+ :raises ValueError: If any of the tokens supplied are already in the list
101
+ of tokens.
102
+ """
103
+ super().add_tokens(tokens, regex, smiles)
104
+ if not regex:
105
+ self._chem_start_idx += len(tokens)
106
+ self._chem_token_idxs = None
107
+
108
+ def _reset_vocabulary(self) -> Dict[str, int]:
109
+ """Create a new tokens vocabulary.
110
+
111
+ :return: New tokens vocabulary
112
+ """
113
+ dict_ = {
114
+ self._padding_token: 0,
115
+ self._unknown_token: 1,
116
+ self._beginning_of_smiles_token: 2,
117
+ self._end_of_smiles_token: 3,
118
+ self._mask_token: 4,
119
+ self._sep_token: 5,
120
+ }
121
+ for token in self._tokens:
122
+ dict_.setdefault(token, len(dict_))
123
+ return dict_
124
+
125
+ def _state_properties(self) -> Dict[str, Any]:
126
+ """Return properties to reconstruct the internal state of the tokenizer"""
127
+ dict_ = super()._state_properties()
128
+ dict_["chem_start_idx"] = self._chem_start_idx
129
+ return dict_
130
+
131
+ def _update_state(self, dict_: Dict[str, Any]) -> None:
132
+ """Update the internal state with properties loaded from disc"""
133
+ super()._update_state(dict_)
134
+ self._chem_start_idx = dict_["chem_start_idx"]
135
+ self._chem_token_idxs = None
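
A minimal sketch of how this tokenizer lays out its vocabulary. It assumes pysmilesutils is installed and that building the vocabulary from a SMILES list behaves as described in the constructor docstring above; the two SMILES are arbitrary.

```python
# Hypothetical inspection of the ChemformerTokenizer special-token layout.
from protobind_diff.ligands.smiles_tokenizer import ChemformerTokenizer

tok = ChemformerTokenizer(smiles=["CCO", "c1ccccc1"])
print(tok.special_tokens)        # {'start': '^', 'end': '&', 'pad': '<PAD>', ...}
print(tok.mask_token_id)         # 4, per the order in _reset_vocabulary
print(tok.vocab_size)            # 6 special tokens + the learned chemical tokens
print(tok.chem_token_idxs[:5])   # indices of the chemical (non-special) tokens
```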
protobind_diff/model.py ADDED
@@ -0,0 +1,411 @@
1
+ from pathlib import Path
2
+ from typing import Tuple, Optional, Dict
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ import lightning.pytorch as pl
7
+ import logging
8
+ import huggingface_hub
9
+
10
+ from .ligands.rdkit_utils import validate_smile, calc_chem_desc, tanimoto_smiles
11
+ from .ligands.smiles_tokenizer import ChemformerTokenizer
12
+ from .noise_schedule import _sample_t, q_xt, _sample_categorical, LogLinearNoise
13
+ from .decoder_rope import Decoder_RoPE
14
+
15
+ logger = logging.getLogger("lightning")
16
+
17
+
18
+ class ModelGenerator(pl.LightningModule):
19
+ """
20
+ ProtoBind-Diff model with SMILES and ESM-2 protein encodings.
21
+ """
22
+ @staticmethod
23
+ def get_exp_dir(
24
+ exp_dir: str | None,
25
+ output_dir: str,
26
+ exp_dir_prefix: str,
27
+ split: str
28
+ ) -> Path:
29
+ """Determines the experiment directory path."""
30
+ if exp_dir:
31
+ return Path(exp_dir)
32
+ return Path(output_dir) / split / exp_dir_prefix
33
+
34
+ def __init__(self, *args, **kwargs):
35
+ """Initializes the Lightning Module, saves hyperparameters, and configures the model."""
36
+ super().__init__()
37
+
38
+ is_load = kwargs['load']
39
+ if not is_load:
40
+ self.save_hyperparameters()
41
+
42
+ self.data_dir = Path(kwargs["data_dir"])
43
+ exp_dir = kwargs.get('exp_dir', None)
44
+ self.exp_dir = self.get_exp_dir(
45
+ exp_dir=exp_dir,
46
+ output_dir=kwargs["output_dir"],
47
+ exp_dir_prefix=kwargs["exp_dir_prefix"],
48
+ split=kwargs["split"]
49
+ )
50
+
51
+ self.configure_model_params(**kwargs)
52
+
53
+ def configure_model_params(self, **kwargs):
54
+ """Parses keyword arguments to configure the model, tokenizer, and training parameters."""
55
+
56
+ self.learning_rate = kwargs.pop('learning_rate')
57
+ self.weight_decay = float(kwargs.pop('weight_decay'))
58
+
59
+ # Decoder params for masked diffusion
60
+ decoder_params = {
61
+ 'nhead': kwargs['num_heads_decoder'],
62
+ 'n_layers': kwargs['num_decoder_layers'],
63
+ 'hidden_size': kwargs['decoder_hidd_dim'],
64
+ 'expand_feedforward': kwargs['expand_feedforward'],
65
+ 'decoder_name': kwargs['decoder_name'],
66
+ }
67
+ # Tokenizer params
68
+ tokenizer_path = kwargs.get('tokenizer_path')
69
+ if tokenizer_path:
70
+ self.tokenizer = ChemformerTokenizer(filename=tokenizer_path)
71
+ else:
72
+ self.tokenizer = ChemformerTokenizer(filename=self.data_dir / f"{kwargs['tokenizer_json_name']}.json")
73
+
74
+ # Masking params
75
+ self.noise = LogLinearNoise()
76
+ self.mask_index = self.tokenizer.mask_token_id
77
+
78
+ # Sampler params
79
+ self.model_length = 170
80
+ self.noise_removal = True
81
+ self.nucleus_p = 0.9
82
+ self.eta = 0.1
83
+ self.sampling_steps = 100
84
+ self.time_conditioning = False
85
+
86
+ self.return_attention = False
87
+
88
+ self.model = ProtobindMaskedDiffusion(
89
+ embedding_dim=kwargs['seq_embedding_dim'],
90
+ mask_index=self.mask_index,
91
+ vocab_size=self.tokenizer.vocab_size,
92
+ decoder_params=decoder_params,
93
+ dropout=kwargs['dropout'],
94
+ )
95
+ self.optimizer = kwargs.get('optimizer', 'Adam')
96
+
97
+ def generate_mols(self, sequence: Tuple[torch.Tensor, torch.Tensor],
98
+ return_attention=False) -> Tuple[np.array, torch.Tensor,np.array]:
99
+ """Generates and validates SMILES strings for a given protein sequence.
100
+
101
+ This method calls the internal sampler, decodes the generated tokens into
102
+ SMILES strings, and filters out any invalid molecules.
103
+
104
+ Args:
105
+ sequence (Tuple[torch.Tensor, torch.Tensor]): The conditioned protein sequence
106
+ embedding and its length.
107
+ return_attention (bool): Whether to return attention maps from the sampler.
108
+
109
+ Returns:
110
+ Tuple[np.array, torch.Tensor, np.array]: A tuple containing the valid SMILES
111
+ strings, corresponding attention maps, and the mask of valid indices.
112
+ """
113
+ samples, attention = self._sample(sequence, return_attention=return_attention)
114
+ text_samples = self.tokenizer.decode(samples.long())
115
+ text_samples = np.array([validate_smile(smile) for smile in text_samples])
116
+
117
+ mask_invalid = (text_samples != None) & (text_samples != '.') & (text_samples != '')
118
+ text_samples = text_samples[mask_invalid]
119
+ if attention is not None:
120
+ attention = attention[mask_invalid]
121
+
122
+ return text_samples, attention, mask_invalid
123
+
124
+ def predict_step(self, batch, batch_idx):
125
+ sequence, smiles, seq_id, smi_id = batch
126
+ gen_samples, attention, mask_invalid = self.generate_mols(
127
+ sequence, return_attention=self.return_attention)
128
+ seq_id = seq_id[mask_invalid]
129
+ return gen_samples, attention, seq_id
130
+
131
+ def training_step(self, batch, batch_idx):
132
+ return self.common_step(batch, "train", batch_idx)
133
+
134
+ def validation_step(self, batch, batch_idx, dataloader_idx=None):
135
+ # dataloader_idx to predict on several validation sets
136
+ return self.common_step(batch, "val", batch_idx, dataloader_idx)
137
+
138
+ def test_step(self, batch, batch_idx, dataloader_idx=0):
139
+ return self.common_step(batch, "test", batch_idx)
140
+
141
+ def common_step(self, batch, description, batch_idx, dataloader_idx=None):
142
+ """Performs a common training, validation, or test step.
143
+
144
+ This method takes a batch, applies noise according to the diffusion
145
+ timestep, runs the model forward, calculates the loss, and logs metrics.
146
+
147
+ Args:
148
+ batch (Tuple): The input batch from the dataloader.
149
+ description (str): The step description (e.g., 'train', 'val').
150
+ batch_idx (int): The index of the batch.
151
+
152
+ Returns:
153
+ torch.Tensor: The calculated loss for the batch.
154
+ """
155
+ sequence, smiles, seq_id, smi_id = batch
156
+
157
+ # Get data and apply noise
158
+ X, length = smiles
159
+ bs = X.shape[0]
160
+ X = X.squeeze(-1)
161
+ padding_mask = (X != 0).float() # 0 is pad token id
162
+ t = _sample_t(X.shape[0], X.device)
163
+ sigma, dsigma = self.noise(t)
164
+ move_chance = 1 - torch.exp(-sigma[:, None])
165
+ xt = q_xt(X, move_chance, self.mask_index)
166
+ xt = xt.unsqueeze(dim=2)
167
+ smiles_t = (xt, length, None)
168
+
169
+ pred_x, _ = self.model(sequence, smiles_t, sigma, padding_mask)
170
+ total_loss = self.loss_mdlm(X.long(), pred_x, sigma, dsigma, padding_mask=None)
171
+
172
+ if batch_idx % 50 == 0:
173
+ tokens = pred_x.argmax(dim=-1) * padding_mask
174
+ true_smiles = self.tokenizer.decode(X.long())
175
+ pred_smiles = [smile for smile in self.tokenizer.decode(tokens)]
176
+ pred_smiles_valid = [validate_smile(smile) for smile in pred_smiles]
177
+
178
+ try:
179
+ tanimoto = np.asarray([tanimoto_smiles(mol_pred, mol_ref) for mol_pred, mol_ref
180
+ in zip(pred_smiles_valid, true_smiles) if mol_pred is not None])
181
+ tanimoto_mean = np.mean(tanimoto) if len(tanimoto) > 0 else 0
182
+ num_mols_valid = len(tanimoto)
183
+ except Exception:
184
+ num_mols_valid = 0
185
+ tanimoto_mean = 0.0
186
+
187
+ self.log(f"{description}_tanimoto", tanimoto_mean, prog_bar=True,
188
+ on_epoch=True, sync_dist=True)
189
+ self.log(f"{description}_perc_of_valid", num_mols_valid / bs * 100, prog_bar=True,
190
+ on_epoch=True, sync_dist=True)
191
+
192
+ self.log(f"{description}_loss", total_loss, prog_bar=True, on_epoch=True,
193
+ sync_dist=True, batch_size=bs)
194
+ return total_loss
195
+
196
+ def configure_optimizers(self):
197
+ if self.weight_decay > 0.:
198
+ optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
199
+ else:
200
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
201
+ return optimizer
202
+
203
+ def loss_mdlm(self, x_0, model_output, sigma, dsigma, padding_mask=None):
204
+ """Loss for SUBS parameterization, continuous time case"""
205
+ log_p_theta = torch.gather(
206
+ input=model_output,
207
+ dim=-1,
208
+ index=x_0[:, :, None]).squeeze(-1)
209
+
210
+ loss = - log_p_theta * (dsigma / torch.expm1(sigma))[:, None]
211
+
212
+ if padding_mask is not None:
213
+ return (loss * padding_mask).sum() / padding_mask.sum()
214
+ return loss.mean()
215
+
216
+ def _sample_prior(self, *batch_dims):
217
+ return self.mask_index * torch.ones(*batch_dims, dtype=torch.int64)
218
+
219
+ def _ddpm_caching_update(self, sequence, x, t, dt, p_x0=None, conf=None,
220
+ return_attention=False):
221
+ attention = None
222
+ if t.ndim > 1:
223
+ t = t.squeeze(-1)
224
+ sigma_t, _ = self.noise(t)
225
+ assert t.ndim == 1
226
+ move_chance_t = t[:, None, None]
227
+ move_chance_s = (t - dt)[:, None, None]
228
+ assert move_chance_t.ndim == 3, move_chance_t.shape
229
+ padding_mask = (x != 0).float()
230
+
231
+ if p_x0 is None:
232
+ p_x0, attention = self.model(sequence, (x.unsqueeze(dim=2), None, None), sigma_t,
233
+ padding_mask, return_attention=return_attention)
234
+ p_x0 = p_x0.exp()
235
+ if self.nucleus_p < 1:
236
+ sorted_probs, sorted_indices = torch.sort(p_x0, descending=True, dim=-1)
237
+ cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
238
+ top_p_mask = cumulative_probs <= self.nucleus_p
239
+ top_p_mask[..., 0] = True
240
+ nucleus_probs = sorted_probs * top_p_mask
241
+ nucleus_probs /= nucleus_probs.sum(dim=-1, keepdim=True)
242
+ p_x0 = torch.zeros_like(p_x0).scatter_(-1, sorted_indices, nucleus_probs)
243
+
244
+ assert move_chance_t.ndim == p_x0.ndim
245
+
246
+ # Use remdm-cap sampler
247
+ alpha_t = (1 - move_chance_t)[0].item()
248
+ alpha_s = (1 - move_chance_s)[0].item()
249
+ if alpha_t > 0:
250
+ sigma = min(self.eta, (1 - alpha_s) / alpha_t)
251
+ else:
252
+ sigma = self.eta
253
+ q_xs = p_x0 * (1 - sigma)
254
+ q_xs[..., self.mask_index] = sigma
255
+ q_xs_2 = p_x0 * ((alpha_s - (1 - sigma) * alpha_t) / (1 - alpha_t))
256
+ q_xs_2[..., self.mask_index] = (1 - alpha_s - sigma * alpha_t) / (1 - alpha_t)
257
+ copy_flag = (x != self.mask_index).to(torch.bool)
258
+ q_xs = torch.where(copy_flag.unsqueeze(-1), q_xs, q_xs_2)
259
+ xs = _sample_categorical(q_xs)
260
+
261
+ if torch.allclose(xs, x) and not self.time_conditioning:
262
+ p_x0_cache = p_x0
263
+ else:
264
+ p_x0_cache = None
265
+
266
+ return p_x0_cache, xs, conf, attention
267
+
268
+ @torch.no_grad()
269
+ def _sample(self, sequence, eps=1e-3, return_attention=False):
270
+ """Generate samples from the model"""
271
+ num_steps = self.sampling_steps
272
+ bs = sequence[0].shape[0]
273
+ x = self._sample_prior(bs, self.model_length).to(self.device)
274
+
275
+ timesteps = torch.linspace(1, eps, num_steps + 1, device=self.device)
276
+ dt = (1 - eps) / num_steps
277
+ p_x0_cache = None
278
+
279
+ min_t = timesteps[-1].item()
280
+ confident_score = - torch.ones_like(x, device=self.device) * torch.inf
281
+
282
+ for i in range(num_steps):
283
+ t = timesteps[i] * torch.ones(bs, 1, device=self.device)
284
+ p_x0_cache, x_next, confident_score, attention = self._ddpm_caching_update(
285
+ sequence, x, t, dt, p_x0=p_x0_cache, conf=confident_score,
286
+ return_attention=return_attention)
287
+
288
+ if (not torch.allclose(x_next, x)):
289
+ p_x0_cache = None
290
+ x = x_next
291
+
292
+ if self.noise_removal:
293
+ t = min_t * torch.ones(bs, 1, device=self.device)
294
+ unet_conditioning = self.noise(t)[0]
295
+ padding_mask = (x != 0).float()
296
+ x, attention = self.model(sequence, (x, None, None), unet_conditioning.squeeze(-1),
297
+ padding_mask, return_attention=return_attention)
298
+ x = x.argmax(dim=-1)
299
+ return x, attention
300
+
301
+
302
+ class ProtobindMaskedDiffusion(nn.Module, huggingface_hub.PyTorchModelHubMixin):
303
+ """The core Protobind-Diff model, which uses a Transformer decoder with RoPE.
304
+
305
+ This model is designed for a masked diffusion task and supports conditioning
306
+ on ESM-2 protein embeddings and generating ligands with a ChemformerTokenizer.
307
+ """
308
+
309
+
310
+ def __init__(self,
311
+ embedding_dim: int,
312
+ mask_index: int,
313
+ vocab_size: int,
314
+ decoder_params: Optional[dict] = None,
315
+ dropout: float = 0.2,
316
+ parametrization_strategy: str = 'subs',
317
+ **kwargs) -> None:
318
+ """Initializes the ProtobindMaskedDiffusion model.
319
+
320
+ Args:
321
+ embedding_dim (int): The dimension of the protein sequence embeddings.
322
+ mask_index (int): The token ID for the MASK token.
323
+ vocab_size (int): The size of the ligand's vocabulary.
324
+ decoder_params (Optional[dict]): A dictionary of parameters for the
325
+ internal Transformer decoder (e.g., nhead, n_layers).
326
+ dropout (float): The dropout rate.
327
+ parametrization_strategy (str): The diffusion parameterization to use.
328
+ Currently only 'subs' is supported.
329
+ """
330
+ super().__init__()
331
+
332
+ self.neg_infinity = -1000000.0
333
+ self.parametrization_strategy = parametrization_strategy
334
+ self.decoder_name = decoder_params.pop('decoder_name')
335
+ expand_feedforward = decoder_params.pop('expand_feedforward')
336
+ self.mask_index = mask_index
337
+
338
+ # Decoder options
339
+ if self.decoder_name == 'decoder_re':
340
+ self.decoder = Decoder_RoPE(vocab_size, embedding_dim, expand_feedforward=expand_feedforward,
341
+ dropout=dropout, **decoder_params)
342
+ else:
343
+ raise ValueError(f"Model only supports decoder with rotary embeddings ('decoder_re'), got: {self.decoder_name}")
344
+
345
+ def forward(self,
346
+ sequence: Tuple[torch.Tensor, torch.Tensor],
347
+ ligands: Tuple[torch.Tensor, torch.Tensor],
348
+ sigma: torch.Tensor,
349
+ mask_ligand: torch.Tensor,
350
+ return_attention: bool = False) -> torch.Tensor:
351
+ """Performs the main forward pass of the diffusion model.
352
+
353
+ Args:
354
+ sequence (Tuple[torch.Tensor, torch.Tensor]): A tuple of the conditioning
355
+ protein sequence embeddings and their lengths.
356
+ ligands (Tuple[torch.Tensor, torch.Tensor]): A tuple
357
+ containing the noised ligand `xt` and its length.
358
+ sigma (torch.Tensor): The diffusion timestep (noise level).
359
+ mask_ligand (torch.Tensor): The padding mask for the ligand.
360
+ return_attention (bool): If True, returns attention weights from the decoder.
361
+
362
+ Returns:
363
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing the final predicted logits
364
+ and the attention weights.
365
+ """
366
+
367
+ sequence, sequence_lengths = sequence
368
+ xt, ligand_lengths, _ = ligands
369
+
370
+ # Decode ligand
371
+ ligand_masked = xt.squeeze(-1).long()
372
+ ligand_decoded, attention = self.decoder(ligand_masked,
373
+ sigma,
374
+ sequence,
375
+ sequence_lengths,
376
+ lig_padding_mask=None,
377
+ return_attention=return_attention)
378
+
379
+ # Apply parametrization
380
+ ligand_decoded = self.parametrization(ligand_decoded, xt)
381
+
382
+ return ligand_decoded, attention
383
+
384
+ def parametrization(self, logits, xt):
385
+ """Applies the chosen parameterization to the model's output logits.
386
+
387
+ The 'subs' strategy modifies the logits to represent the probability
388
+ p(x_{t-1}|x_t), enforcing that unmasked tokens remain unchanged.
389
+
390
+ Args:
391
+ logits (torch.Tensor): The raw output logits from the decoder.
392
+ xt (torch.Tensor): The noised input ligand at timestep t.
393
+
394
+ Returns:
395
+ torch.Tensor: The re-parameterized logits.
396
+ """
397
+ if self.parametrization_strategy == 'subs':
398
+ # log prob at the mask index = - infinity
399
+ logits[:, :, self.mask_index] += self.neg_infinity
400
+
401
+ # Normalize the logits
402
+ logits = logits - torch.logsumexp(logits, dim=-1, keepdim=True)
403
+
404
+ # Apply updates for unmasked tokens
405
+ xt = xt.squeeze(-1)
406
+ unmasked_indices = (xt != self.mask_index)
407
+ logits[unmasked_indices] = self.neg_infinity
408
+ logits[unmasked_indices, xt[unmasked_indices].long()] = 0
409
+ else:
410
+ raise NotImplementedError(f'Parametrization strategy {self.parametrization_strategy} not implemented')
411
+ return logits
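
The nucleus (top-p) filter inside `_ddpm_caching_update` is easy to check in isolation. The following self-contained sketch uses made-up probabilities and the same tensor operations as the sampler above.

```python
# Stand-alone illustration of the top-p (nucleus) filtering used during sampling.
import torch

p = torch.tensor([[0.05, 0.50, 0.30, 0.10, 0.05]])   # toy categorical distribution
nucleus_p = 0.9

sorted_probs, sorted_idx = torch.sort(p, descending=True, dim=-1)
top_p_mask = torch.cumsum(sorted_probs, dim=-1) <= nucleus_p
top_p_mask[..., 0] = True                             # always keep the top token
nucleus = sorted_probs * top_p_mask
nucleus = nucleus / nucleus.sum(dim=-1, keepdim=True)
filtered = torch.zeros_like(p).scatter_(-1, sorted_idx, nucleus)
print(filtered)   # tail probabilities zeroed out, the rest renormalized to sum to 1
```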
protobind_diff/noise_schedule.py ADDED
@@ -0,0 +1,185 @@
1
+ import abc
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ # Flags required to enable jit fusion kernels
7
+ torch._C._jit_set_profiling_mode(False)
8
+ torch._C._jit_set_profiling_executor(False)
9
+ torch._C._jit_override_can_fuse_on_cpu(True)
10
+ torch._C._jit_override_can_fuse_on_gpu(True)
11
+
12
+
13
+
14
+ def _sample_categorical(categorical_probs):
15
+ gumbel_norm = (
16
+ 1e-10
17
+ - (torch.rand_like(categorical_probs) + 1e-10).log())
18
+ return (categorical_probs / gumbel_norm).argmax(dim=-1)
19
+
20
+
21
+ def _unsqueeze(x, reference):
22
+ return x.view(
23
+ * x.shape,
24
+ * ((1,) * (len(reference.shape) - len(x.shape))))
25
+
26
+
27
+ def _sample_t(n, device, antithetic_sampling=True, sampling_eps=1e-3):
28
+ _eps_t = torch.rand(n, device=device)
29
+ if antithetic_sampling:
30
+ offset = torch.arange(n, device=device) / n
31
+ _eps_t = (_eps_t / n + offset) % 1
32
+ t = (1 - sampling_eps) * _eps_t + sampling_eps
33
+ return t
34
+
35
+
36
+ def q_xt(x, move_chance, mask_index):
37
+ """Computes the noisy sample xt.
38
+
39
+ Args:
40
+ x: int torch.Tensor with shape (batch_size,
41
+ diffusion_model_input_length), input.
42
+ move_chance: float torch.Tensor with shape (batch_size, 1).
+ mask_index: int id of the mask token used at corrupted positions.
43
+ """
44
+ move_indices = torch.rand(
45
+ * x.shape, device=x.device) < move_chance
46
+ xt = torch.where(move_indices, mask_index, x)
47
+ return xt
48
+
49
+
50
+ def get_noise(config, dtype=torch.float32):
51
+ if config.noise.type == 'geometric':
52
+ return GeometricNoise(config.noise.sigma_min,
53
+ config.noise.sigma_max)
54
+ elif config.noise.type == 'loglinear':
55
+ return LogLinearNoise()
56
+ elif config.noise.type == 'cosine':
57
+ return CosineNoise()
58
+ elif config.noise.type == 'cosinesqr':
59
+ return CosineSqrNoise()
60
+ elif config.noise.type == 'linear':
61
+ return Linear(config.noise.sigma_min,
62
+ config.noise.sigma_max,
63
+ dtype)
64
+ else:
65
+ raise ValueError(f'{config.noise.type} is not a valid noise')
66
+
67
+
68
+ def binary_discretization(z):
69
+ z_hard = torch.sign(z)
70
+ z_soft = z / torch.norm(z, dim=-1, keepdim=True)
71
+ return z_soft + (z_hard - z_soft).detach()
72
+
73
+
74
+ class Noise(abc.ABC, nn.Module):
75
+ """
76
+ Baseline forward method to get the total + rate of noise at a timestep
77
+ """
78
+ def forward(self, t):
79
+ # Assume time goes from 0 to 1
80
+ return self.total_noise(t), self.rate_noise(t)
81
+
82
+ @abc.abstractmethod
83
+ def rate_noise(self, t):
84
+ """
85
+ Rate of change of noise ie g(t)
86
+ """
87
+ pass
88
+
89
+ @abc.abstractmethod
90
+ def total_noise(self, t):
91
+ """
92
+ Total noise ie \int_0^t g(t) dt + g(0)
93
+ """
94
+ pass
95
+
96
+
97
+ class CosineNoise(Noise):
98
+ def __init__(self, eps=1e-3):
99
+ super().__init__()
100
+ self.eps = eps
101
+
102
+ def rate_noise(self, t):
103
+ cos = (1 - self.eps) * torch.cos(t * torch.pi / 2)
104
+ sin = (1 - self.eps) * torch.sin(t * torch.pi / 2)
105
+ scale = torch.pi / 2
106
+ return scale * sin / (cos + self.eps)
107
+
108
+ def total_noise(self, t):
109
+ cos = torch.cos(t * torch.pi / 2)
110
+ return - torch.log(self.eps + (1 - self.eps) * cos)
111
+
112
+
113
+ class CosineSqrNoise(Noise):
114
+ def __init__(self, eps=1e-3):
115
+ super().__init__()
116
+ self.eps = eps
117
+
118
+ def rate_noise(self, t):
119
+ cos = (1 - self.eps) * (
120
+ torch.cos(t * torch.pi / 2) ** 2)
121
+ sin = (1 - self.eps) * torch.sin(t * torch.pi)
122
+ scale = torch.pi / 2
123
+ return scale * sin / (cos + self.eps)
124
+
125
+ def total_noise(self, t):
126
+ cos = torch.cos(t * torch.pi / 2) ** 2
127
+ return - torch.log(self.eps + (1 - self.eps) * cos)
128
+
129
+
130
+ class Linear(Noise):
131
+ def __init__(self, sigma_min=0, sigma_max=10, dtype=torch.float32):
132
+ super().__init__()
133
+ self.sigma_min = torch.tensor(sigma_min, dtype=dtype)
134
+ self.sigma_max = torch.tensor(sigma_max, dtype=dtype)
135
+
136
+ def rate_noise(self, t):
137
+ return self.sigma_max - self.sigma_min
138
+
139
+ def total_noise(self, t):
140
+ return self.sigma_min + t * (self.sigma_max - self.sigma_min)
141
+
142
+ def importance_sampling_transformation(self, t):
143
+ f_T = torch.log1p(- torch.exp(- self.sigma_max))
144
+ f_0 = torch.log1p(- torch.exp(- self.sigma_min))
145
+ sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
146
+ return (sigma_t - self.sigma_min) / (
147
+ self.sigma_max - self.sigma_min)
148
+
149
+
150
+ class GeometricNoise(Noise):
151
+ def __init__(self, sigma_min=1e-3, sigma_max=1):
152
+ super().__init__()
153
+ self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
154
+
155
+ def rate_noise(self, t):
156
+ return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (
157
+ self.sigmas[1].log() - self.sigmas[0].log())
158
+
159
+ def total_noise(self, t):
160
+ return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
161
+
162
+
163
+ class LogLinearNoise(Noise):
164
+ """Log Linear noise schedule.
165
+
166
+ Built such that 1 - 1/e^(n(t)) interpolates between 0 and 1.
167
+ """
168
+ def __init__(self, eps=1e-3):
169
+ super().__init__()
170
+ self.eps = eps
171
+ self.sigma_max = self.total_noise(torch.tensor(1.0))
172
+ self.sigma_min = self.eps + self.total_noise(torch.tensor(0.0))
173
+
174
+ def rate_noise(self, t):
175
+ return (1 - self.eps) / (1 - (1 - self.eps) * t)
176
+
177
+ def total_noise(self, t):
178
+ return -torch.log1p(-(1 - self.eps) * t)
179
+
180
+ def importance_sampling_transformation(self, t):
181
+ f_T = torch.log1p(- torch.exp(- self.sigma_max))
182
+ f_0 = torch.log1p(- torch.exp(- self.sigma_min))
183
+ sigma_t = - torch.log1p(- torch.exp(t * f_T + (1 - t) * f_0))
184
+ t = - torch.expm1(- sigma_t) / (1 - self.eps)
185
+ return t
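
The schedules above are consumed elsewhere in the package; as a minimal orientation sketch, the snippet below combines `_sample_t`, a `LogLinearNoise` schedule, and `q_xt` to corrupt a token batch. The import path, the mask token id (4), and the `move_chance = 1 - exp(-sigma)` conversion are assumptions for illustration (the conversion follows the `LogLinearNoise` docstring), not code from this commit.

```python
# Hypothetical usage sketch: the module path, the mask id, and the
# sigma -> move_chance conversion are assumptions, not part of this commit.
import torch
from protobind_diff.noise_schedule import LogLinearNoise, _sample_t, q_xt

noise = LogLinearNoise()
x = torch.randint(5, 100, (8, 64))       # batch of token ids (mask id 4 excluded)
t = _sample_t(x.shape[0], x.device)      # stratified timesteps in [eps, 1)
sigma, dsigma = noise(t)                 # total noise and its rate at each t
move_chance = 1 - torch.exp(-sigma)      # per-sequence masking probability
xt = q_xt(x, move_chance[:, None], mask_index=4)
print(xt.shape, (xt == 4).float().mean())  # fraction of tokens replaced by the mask id
```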
pyproject.toml ADDED
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "ProtoBind-Diff"
3
+ version = "0.1.0"
4
+ description = "ProtoBind-Diff: A Structure-Free Diffusion Language Model for Protein Sequence-Conditioned Ligand Design"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10,<3.13"
7
+ license = "MIT AND (Apache-2.0 OR BSD-2-Clause)"
8
+ authors = [
9
+ { name = "Lukia Mistryukova", email = "lukiia.mistriukova@gero.ai" },
10
+ { name = "Vladimir Manuilov", email = "vladimir.manuylov@gero.ai" },
11
+ { name = "Konstantin Avchaciov", email = "ka@gero.ai" },
12
+ { name = "Peter Fedichev", email = "pf@gero.ai" },
13
+ ]
14
+ dependencies = [
15
+ "torch>=2.2",
16
+ "numpy>=1.26,<2.0",
17
+ "lightning>=2.3.0",
18
+ "rdkit>=2024.3.2",
19
+ "requests==2.32.3",
20
+ "pandas>=2.2.2",
21
+ "PyYAML>=6.0",
22
+ "scipy>=1.13.0",
23
+ "scikit-learn>=1.1.0",
24
+ "fair-esm==2.0.0",
25
+ "biopython>=1.80",
26
+ "pysmilesutils @ git+https://github.com/MolecularAI/pysmilesutils.git",
27
+ "FPSim2==0.5.2",
28
+ "huggingface_hub",
29
+ "einops==0.8.0",
30
+ "easydict>=1.11",
31
+ "tensorboard>=2.14.0",
32
+ "rich>=13.5.0"
33
+ ]
34
+
35
+ [project.scripts]
36
+ protobind-train = "protobind_diff.train:main"
37
+ protobind-infer = "protobind_diff.esm_inference:main"
38
+
39
+ [build-system]
40
+ requires = ["setuptools>=61.0"]
41
+ build-backend = "setuptools.build_meta"
42
+
43
+ [tool.setuptools.packages.find]
44
+ include = ["protobind_diff"]
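
For the `[project.scripts]` table above: installing the package (for example with `pip install -e .` from a checkout) exposes `protobind-train` and `protobind-infer` as console commands wired to `protobind_diff.train:main` and `protobind_diff.esm_inference:main`; their command-line options are defined in those modules.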