update

Files changed (7) hide show

.gitattributes +1 -0
README.md +43 -3
data_preprocessing/data.py +235 -0
data_preprocessing/data_split.py +101 -0
mcts.png +3 -0
mdlm.png +0 -0
peptune.png +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+mcts.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,43 @@
----
-license: cc-by-nc-nd-4.0
----

+---
+extra_gated_fields:
+  Name: text
+  Company: text
+  Country: country
+  Specific date: date_picker
+  I want to use this model for:
+    type: select
+    options:
+      - Research
+      - Education
+      - label: Other
+        value: other
+extra_gated_prompt: "PepTune License: https://duke.box.com/s/5ghseh23rpsyou66kg60qr89sxt5twyu"
+extra_gated_heading: Acknowledge license to access the repository
+extra_gated_button_content: Acknowledge license
+---
+<div align="center">
+  <img src="peptune.png" alt="peptune" width="300" height="300">
+</div>
+# PepTune: *De Novo* Generation of Therapeutic Peptides with Multi-Objective-Guided Discrete Diffusion
+Peptide therapeutics, a major class of medicines, have achieved remarkable success across diseases like diabetes and cancer, with landmark examples such as GLP-1 receptor agonists revolutionizing the treatment of type-2 diabetes and obesity. Despite their success, designing peptides that satisfy multiple conflicting objectives, such as binding affinity, solubility, and membrane permeability, remains a major challenge. Classical drug development and target structure-based design methods are ineffective for such tasks, as they fail to optimize global functional properties critical for therapeutic efficacy. Existing generative frameworks are largely limited to continuous spaces, unconditioned outputs, or single-objective guidance, making them unsuitable for discrete sequence optimization across multiple properties. To address this, we present **PepTune**, a multi-objective discrete diffusion model for the simultaneous generation and optimization of therapeutic peptide SMILES. Built on the Masked Discrete Language Model (MDLM) framework, PepTune ensures valid peptide structures with state-dependent masking schedules and penalty-based objectives. To guide the diffusion process, we propose a Monte Carlo Tree Search (MCTS)-based strategy that balances exploration and exploitation to iteratively refine Pareto-optimal sequences. MCTS integrates classifier-based rewards with search-tree expansion, overcoming gradient estimation challenges and data sparsity inherent to discrete spaces. Using PepTune, we generate diverse, chemically-modified peptides optimized for multiple therapeutic properties, including target binding affinity, membrane permeability, solubility, hemolysis, and non-fouling characteristics on various disease-relevant targets. In total, our results demonstrate that MCTS-guided discrete diffusion is a powerful and modular approach for multi-objective sequence design in discrete state spaces.
+## We build our training framework on top of [Masked Discrete Language Model](https://huggingface.co/kuleshov-group/mdlm-owt).
+![Masked Discrete Language Model Framework](mdlm.png)
+## We optimize desired therapeutic properties of generated sequences based on Monte Carlo Tree Search
+![Monte Carlo Tree Search Schematic View](mcts.png)
+## Inference API, datasets, and sequences will be freely accessible to the academic community via a non-commercial license upon publication and provisional patent filing
+## Interactive Demo
+You can try out our peptide visualizer directly in your browser; other property classifiers will be added soon:
+<https://huggingface.co/spaces/ChatterjeeLab/SMILES2PEPTIDE>
+## Usage
+To use this repository, you agree to abide by the [PepTune License](https://duke.box.com/s/5ghseh23rpsyou66kg60qr89sxt5twyu).

data_preprocessing/data.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import sys
+import torch
+from datasets import Dataset, DatasetDict, load_from_disk
+from torch.utils.data import DataLoader
+import os
+from multiprocessing import Pool
+from tqdm import tqdm
+import lightning.pytorch as pl
+sys.path.append('/home/yz927/projects/peptune/scripts/')
+from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+# Tokenizer shared inside each worker process of the multiprocessing pool.
+# It is set by init_pool() and read by standalone_tokenize_function().
+global_tokenizer = None
+def init_pool(tokenizer):
+    """Pool initializer: store ``tokenizer`` in the module-level ``global_tokenizer``."""
+    global global_tokenizer
+    global_tokenizer = tokenizer
+class SequenceDataset:
+    """Tokenize raw sequences in parallel and pack them into length-sorted batches.
+
+    Args:
+        sequences: Raw sequence strings to tokenize.
+        tokenizer: Tokenizer handed to the worker pool; ``TokenizeBatch`` also
+            reads its ``pad_token_id``.
+        max_sequence_length: Upper bound on the token count of a single
+            sequence and on the summed token count of one batch.
+        num_cores: Number of worker processes used for tokenization.
+    """
+    def __init__(self, sequences, tokenizer, max_sequence_length, num_cores=8):
+        self.sequences = sequences
+        self.tokenizer = tokenizer
+        self.max_sequence_length = max_sequence_length
+        # Honour the caller's value; it used to be overwritten with a literal 8.
+        self.num_cores = num_cores
+        self.tokenized_sequences = []
+        self.original_sequences = []
+    def tokenize_sequences(self):
+        """Tokenize ``self.sequences`` in a process pool and keep those that fit.
+
+        Sequences whose tokenization returned ``None`` or whose token count
+        exceeds ``max_sequence_length`` are dropped. Kept results and their
+        source strings are stored in ``tokenized_sequences`` and
+        ``original_sequences`` at matching positions.
+        """
+        print(f"Starting parallel tokenization using {self.num_cores} cores")
+        with Pool(processes=self.num_cores, initializer=init_pool, initargs=(self.tokenizer,)) as pool:
+            results = list(tqdm(
+                pool.imap(standalone_tokenize_function, self.sequences),
+                total=len(self.sequences)
+            ))
+        # Start from empty lists so a repeated call cannot duplicate entries.
+        self.tokenized_sequences = []
+        self.original_sequences = []
+        for result, seq in zip(results, self.sequences):
+            if result is not None and len(result['input_ids'][0]) <= self.max_sequence_length:
+                self.tokenized_sequences.append(result)
+                self.original_sequences.append(seq)
+    def process_sequences(self, batch_size):
+        """Tokenize, sort by length, batch greedily and return a ``Dataset``.
+
+        A batch is closed when adding the next sequence would push the summed
+        token count above ``max_sequence_length`` or when it already holds
+        ``batch_size`` sequences.
+
+        Returns:
+            A ``Dataset`` with one row per batch and the columns
+            ``attention_mask``, ``input_ids`` (padded by ``TokenizeBatch``)
+            and ``labels`` (the original sequence strings of the batch).
+        """
+        self.tokenize_sequences()
+        # Sort (length, index) pairs so that similar lengths share a batch.
+        lengths = sorted(
+            (len(seq['input_ids'][0]), i) for i, seq in enumerate(self.tokenized_sequences)
+        )
+        batches = []
+        sequence_batches = []
+        current_batch = []
+        current_length = 0
+        for length, idx in tqdm(lengths):
+            if current_length + length > self.max_sequence_length or len(current_batch) == batch_size:
+                if current_batch:
+                    batches.append([self.tokenized_sequences[i] for i in current_batch])
+                    sequence_batches.append([self.original_sequences[i] for i in current_batch])
+                current_batch = [idx]
+                current_length = length
+            else:
+                current_batch.append(idx)
+                current_length += length
+        # Flush the trailing, not yet emitted batch.
+        if current_batch:
+            batches.append([self.tokenized_sequences[i] for i in current_batch])
+            sequence_batches.append([self.original_sequences[i] for i in current_batch])
+        token_batch_fn = TokenizeBatch(self.tokenizer)
+        processed_batches = [token_batch_fn(batch) for batch in tqdm(batches)]
+        dataset = Dataset.from_dict({
+            'attention_mask': [batch['attention_mask'] for batch in processed_batches],
+            'input_ids': [batch['input_ids'] for batch in processed_batches],
+            'labels': sequence_batches
+        })
+        return dataset
+class DynamicBatchingDataset(Dataset):
+    """
+    Wrap a dynamically batched Huggingface Datasets object. Every row of the incoming object already
+    holds one complete batch from the previous steps, so items are returned per batch, not per sample.
+    """
+    def __init__(self, dataset_dict):
+        print('Initializing dataset...')
+        masks = [torch.tensor(row) for row in dataset_dict['attention_mask']]
+        ids = [torch.tensor(row) for row in dataset_dict['input_ids']]
+        self.dataset_dict = {
+            'attention_mask': masks,
+            'input_ids': ids,
+            # Original sequences are stored unchanged.
+            'labels': dataset_dict['labels'],
+        }
+    def __len__(self):
+        # One entry per pre-built batch.
+        return len(self.dataset_dict['attention_mask'])
+    def __getitem__(self, idx):
+        columns = self.dataset_dict
+        if isinstance(idx, int):
+            return {name: values[idx] for name, values in columns.items()}
+        if isinstance(idx, list):
+            return {name: [values[i] for i in idx] for name, values in columns.items()}
+        raise ValueError(f"Expected idx to be int or list, but got {type(idx)}")
+    @staticmethod
+    def collate_fn(batch, verbose=False):
+        # Only the first element is used; the data loaders in this file use batch_size=1.
+        first = batch[0]
+        return {key: first[key] for key in ('input_ids', 'attention_mask', 'labels')}
+def standalone_tokenize_function(sequence):
+    """Tokenize one sequence with the module-level ``global_tokenizer``.
+
+    Returns a dict whose ``input_ids`` and ``attention_mask`` values are each
+    wrapped in an outer list, or ``None`` (after printing the error) when
+    tokenization raises.
+    """
+    global global_tokenizer
+    try:
+        encoded = global_tokenizer(sequence)
+        # Add an outer list so each value has the layout [batch_size, sequence_length].
+        wrapped = {key: [encoded[key]] for key in ('input_ids', 'attention_mask')}
+    except Exception as e:
+        print(f"Error tokenizing sequence '{sequence}': {e}")
+        return None
+    return wrapped
+class TokenizeBatch:
+    """Pad the tokenized sequences of one batch and derive their attention mask."""
+    def __init__(self, tokenizer):
+        self.pad_token_id = tokenizer.pad_token_id
+    def __call__(self, batches):
+        pad_id = self.pad_token_id
+        token_rows = []
+        for entry in batches:
+            # Each entry stores its ids under an outer list; take the inner one.
+            token_rows.append(torch.tensor(entry['input_ids'][0]))
+        padded = torch.nn.utils.rnn.pad_sequence(token_rows, batch_first=True, padding_value=pad_id)
+        # Positions equal to the pad id are masked out (0); all others are 1.
+        mask = padded.ne(pad_id).long()
+        return {'input_ids': padded, 'attention_mask': mask}
+class PretrainSequenceDataModule(pl.LightningDataModule):
+    """Data module that tokenizes text files once, saves them to disk and serves pre-built batches."""
+    def __init__(self,
+                 tokenizer,
+                 input_dataset_path,
+                 output_dataset_path,
+                 num_workers,
+                 batch_size,
+                 max_sequence_length=512,):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.input_path = input_dataset_path
+        self.output_path = output_dataset_path
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+        self.max_sequence_length = max_sequence_length
+    @staticmethod
+    def _read_nonempty_lines(path):
+        # Return the stripped, non-blank lines of a text file.
+        with open(path, 'r') as f:
+            return [line.strip() for line in f if line.strip()]
+    def _make_loader(self, dataset):
+        # Rows are already full batches, hence batch_size=1 and the custom collate function.
+        return DataLoader(dataset,
+                        batch_size=1,
+                        shuffle=False,
+                        num_workers=self.num_workers,
+                        collate_fn=DynamicBatchingDataset.collate_fn,
+                        pin_memory=True)
+    def prepare_data(self):
+        # Nothing to do when the processed dataset already exists on disk.
+        if os.path.exists(self.output_path):
+            return
+        print("Loading text files")
+        train_sequences = self._read_nonempty_lines(f"{self.input_path}/train.txt")
+        val_sequences = self._read_nonempty_lines(f"{self.input_path}/val.txt")
+        print("Processing training data")
+        train_dataset = SequenceDataset(train_sequences,
+                                    self.tokenizer,
+                                    self.max_sequence_length)
+        print("Processing validation data")
+        val_dataset = SequenceDataset(val_sequences,
+                                    self.tokenizer,
+                                    self.max_sequence_length)
+        processed_train = train_dataset.process_sequences(self.batch_size)
+        processed_val = val_dataset.process_sequences(self.batch_size)
+        print("Combining datasets")
+        combined_dataset = DatasetDict({
+            'train': processed_train,
+            'val': processed_val,
+        })
+        print(f"Saving dataset to {self.output_path}")
+        combined_dataset.save_to_disk(self.output_path)
+    def setup(self, stage: str):
+        print("Loading processed dataset")
+        stored = load_from_disk(self.output_path)
+        self.train_dataset = DynamicBatchingDataset(stored['train'])
+        self.val_dataset = DynamicBatchingDataset(stored['val'])
+    def train_dataloader(self):
+        print("Creating training dataloader")
+        return self._make_loader(self.train_dataset)
+    def val_dataloader(self):
+        print("Creating validation dataloader")
+        return self._make_loader(self.val_dataset)
+if __name__ == '__main__':
+    # Build the tokenizer from its vocabulary and split files.
+    vocab_file = '/home/st512/peptune/scripts/peptide-mdlm-mcts/tokenizer/new_vocab.txt'
+    splits_file = '/home/st512/peptune/scripts/peptide-mdlm-mcts/tokenizer/new_splits.txt'
+    tokenizer = SMILES_SPE_Tokenizer(vocab_file, splits_file)
+    module_kwargs = dict(
+        tokenizer=tokenizer,
+        input_dataset_path='/home/yz927/projects/peptune/tokens/11M_smiles',
+        output_dataset_path='/home/yz927/projects/peptune/tokenized/11M_smiles_old_tokenizer_no_limit',
+        num_workers=8,
+        batch_size=2000,
+        max_sequence_length=16 * 1000,
+    )
+    dm = PretrainSequenceDataModule(**module_kwargs)
+    # Run the pipeline: preprocess (if needed), load, then build both loaders.
+    dm.prepare_data()
+    dm.setup('fit')
+    dm.train_dataloader()
+    dm.val_dataloader()

data_preprocessing/data_split.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+from collections import defaultdict
+from tqdm import tqdm
+import selfies as sf
+from multiprocessing import Pool, cpu_count
+from functools import partial
+def generate_fingerprint_batch_selfies(selfies_batch):
+    """Compute Morgan fingerprints (radius 2, 2048 bits) for a batch of SELFIES.
+
+    Each string is decoded to SMILES and parsed with RDKit. Entries that
+    cannot be parsed, or that raise during conversion, are skipped.
+
+    Args:
+        selfies_batch: Iterable of SELFIES strings.
+
+    Returns:
+        A tuple ``(fps, valid_selfies)``: a NumPy array built from the
+        per-molecule fingerprint arrays, and the SELFIES strings that produced
+        them, in matching order.
+    """
+    fps = []
+    valid_selfies = []
+    for selfies in tqdm(selfies_batch, desc="Generating fingerprints", leave=False):
+        try:
+            # Convert SELFIES to SMILES then to molecule
+            smiles = sf.decoder(selfies)
+            mol = Chem.MolFromSmiles(smiles)
+            if mol is not None:
+                fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
+                # NOTE(review): ConvertToNumpyArray is expected to resize `arr`
+                # to the fingerprint length - confirm against the RDKit docs.
+                arr = np.zeros((1,))
+                DataStructs.ConvertToNumpyArray(fp, arr)
+                fps.append(arr)
+                valid_selfies.append(selfies)
+        except Exception:
+            # Best effort: skip molecules that fail, but do not swallow
+            # KeyboardInterrupt/SystemExit the way a bare ``except`` did.
+            continue
+    return np.array(fps), valid_selfies
+def process_batch(batch, n_clusters, seed):
+    """Fingerprint one batch of SELFIES and cluster it with MiniBatchKMeans.
+
+    Args:
+        batch: List of SELFIES strings.
+        n_clusters: Requested number of clusters; lowered to the number of
+            valid molecules when the batch is smaller than that.
+        seed: ``random_state`` passed to the clusterer.
+
+    Returns:
+        A list of ``(label, selfies)`` pairs for the valid molecules, or an
+        empty list when no fingerprint could be generated.
+    """
+    fps, valid_selfies = generate_fingerprint_batch_selfies(batch)
+    if len(fps) == 0:
+        return []
+    # The estimator rejects n_clusters > n_samples, which would otherwise
+    # crash on a short final batch or a batch with many invalid molecules.
+    effective_clusters = min(n_clusters, len(fps))
+    clusterer = MiniBatchKMeans(n_clusters=effective_clusters, random_state=seed)
+    clusterer.fit(fps)
+    labels = clusterer.predict(fps)
+    return list(zip(labels, valid_selfies))
+def parallel_clustering_split_selfies(selfies_list, batch_size=10000, n_clusters=1000, train_ratio=0.9, seed=42, n_cores=12):
+    """Split SELFIES into train/validation sets along cluster boundaries.
+
+    The input is cut into batches of ``batch_size``; each batch is clustered
+    independently in a worker process. Whole clusters are then assigned to the
+    training set until it holds at least ``train_ratio`` of all valid
+    molecules; the remaining clusters go to validation.
+
+    Args:
+        selfies_list: List of SELFIES strings.
+        batch_size: Number of strings per clustering batch.
+        n_clusters: Number of clusters requested per batch.
+        train_ratio: Target fraction of molecules in the training split.
+        seed: Seed for NumPy's global RNG and for the clusterers.
+        n_cores: Number of worker processes (previously fixed at 12).
+
+    Returns:
+        A tuple ``(train_selfies, val_selfies)`` of lists.
+    """
+    np.random.seed(seed)
+    # Create batches
+    batches = [selfies_list[i:i + batch_size]
+               for i in range(0, len(selfies_list), batch_size)]
+    # Initialize parallel processing
+    process_batch_partial = partial(process_batch, n_clusters=n_clusters, seed=seed)
+    cluster_assignments = defaultdict(list)
+    with Pool(n_cores) as pool:
+        results = list(tqdm(
+            pool.imap(process_batch_partial, batches),
+            total=len(batches),
+            desc="Processing batches"
+        ))
+    # Combine results
+    # NOTE(review): labels come from independent per-batch fits, so equal label
+    # ids from different batches are grouped together here although they are
+    # not the same cluster - confirm this is intended.
+    for batch_results in results:
+        for label, selfies in batch_results:
+            cluster_assignments[label].append(selfies)
+    # Split into train/val
+    clusters = list(cluster_assignments.values())
+    np.random.shuffle(clusters)
+    train_selfies = []
+    val_selfies = []
+    total_mols = sum(len(cluster) for cluster in clusters)
+    for cluster in tqdm(clusters, desc="Splitting clusters"):
+        # Clusters are never split, so the final ratio is only approximate.
+        if len(train_selfies) / total_mols < train_ratio:
+            train_selfies.extend(cluster)
+        else:
+            val_selfies.extend(cluster)
+    print(f"Final splits: Train={len(train_selfies)}, Validation={len(val_selfies)}")
+    return train_selfies, val_selfies
+# Guard the script body: it starts a multiprocessing pool, and worker processes
+# that re-import this module must not run the whole pipeline again.
+if __name__ == '__main__':
+    input_path = '/home/yz927/projects/peptune/tokens/filtered_peptides_selfies.txt'
+    try:
+        with open(input_path, 'r') as f:
+            selfies_list = [line.strip() for line in f if line.strip()]
+        print(f"Loaded {len(selfies_list)} selfies sequences from file")
+    except FileNotFoundError as e:
+        # Report the actual path; the old message had no placeholder.
+        raise FileNotFoundError(f"Could not find the file at {input_path}") from e
+    except Exception as e:
+        raise Exception(f"Error reading file: {str(e)}") from e
+    train_selfies, val_selfies = parallel_clustering_split_selfies(
+        selfies_list,
+        batch_size=10000,
+        n_clusters=1000,
+        train_ratio=0.8
+    )
+    # Write one SELFIES string per line for each split.
+    with open('/home/yz927/projects/peptune/tokens/11M_selfies/train_selfies.txt', 'w') as f:
+        for line in train_selfies:
+            f.write(f"{line}\n")
+    with open('/home/yz927/projects/peptune/tokens/11M_selfies/val_selfies.txt', 'w') as f:
+        for line in val_selfies:
+            f.write(f"{line}\n")

mcts.png ADDED Viewed

Git LFS Details

SHA256: e63bdc835269660e4b7bda69973bd60611b61045f25c5c07a9baa277e31d2acd
Pointer size: 132 Bytes
Size of remote file: 1.67 MB

mdlm.png ADDED Viewed

peptune.png ADDED Viewed