GleghornLab
/

SYNTERACT

@@ -56,11 +56,9 @@ import torch
 import torch.nn.functional as F
 from transformers import BertForSequenceClassification, BertTokenizer
-model = BertForSequenceClassification.from_pretrained('GleghornLab/SYNTERACT') # load model
 tokenizer = BertTokenizer.from_pretrained('GleghornLab/SYNTERACT') # load tokenizer
-device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # gather device
-model.to(device) # move to device
-model.eval() # put in eval mode
 sequence_a = 'MEKSCSIGNGREQYGWGHGEQCGTQFLECVYRNASMYSVLGDLITYVVFLGATCYAILFGFRLLLSCVRIVLKVVIALFVIRLLLALGSVDITSVSYSG' # Uniprot A1Z8T3
 sequence_b = 'MRLTLLALIGVLCLACAYALDDSENNDQVVGLLDVADQGANHANDGAREARQLGGWGGGWGGRGGWGGRGGWGGRGGWGGRGGWGGGWGGRGGWGGRGGGWYGR' # Uniprot A1Z8H0
@@ -70,7 +68,7 @@ example = sequence_a + ' [SEP] ' + sequence_b # add SEP token
 example = tokenizer(example, return_tensors='pt', padding=False).to(device) # tokenize example
 with torch.no_grad():
-    logits = model(**example).logits.cpu().detach() # get logits from model
 probability = F.softmax(logits, dim=-1) # use softmax to get "confidence" in the prediction
 prediction = probability.argmax(dim=-1) # 0 for no interaction, 1 for interaction
@@ -92,4 +90,137 @@ The [Gleghorn lab](https://www.gleghornlab.com/) is an interdisciplinary researc
 	publisher = {Cold Spring Harbor Laboratory},
 	journal = {bioRxiv}
 }
 ```

 import torch.nn.functional as F
 from transformers import BertForSequenceClassification, BertTokenizer
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # gather device
+model = BertForSequenceClassification.from_pretrained('GleghornLab/SYNTERACT', attn_implementation='sdpa').device.eval() # load model
 tokenizer = BertTokenizer.from_pretrained('GleghornLab/SYNTERACT') # load tokenizer
 sequence_a = 'MEKSCSIGNGREQYGWGHGEQCGTQFLECVYRNASMYSVLGDLITYVVFLGATCYAILFGFRLLLSCVRIVLKVVIALFVIRLLLALGSVDITSVSYSG' # Uniprot A1Z8T3
 sequence_b = 'MRLTLLALIGVLCLACAYALDDSENNDQVVGLLDVADQGANHANDGAREARQLGGWGGGWGGRGGWGGRGGWGGRGGWGGRGGWGGGWGGRGGWGGRGGGWYGR' # Uniprot A1Z8H0
 example = tokenizer(example, return_tensors='pt', padding=False).to(device) # tokenize example
 with torch.no_grad():
+    logits = model(**example).logits.detach().cpu() # get logits from model
 probability = F.softmax(logits, dim=-1) # use softmax to get "confidence" in the prediction
 prediction = probability.argmax(dim=-1) # 0 for no interaction, 1 for interaction
 	publisher = {Cold Spring Harbor Laboratory},
 	journal = {bioRxiv}
 }
+```
+## A simple inference script
+```python
+import torch
+import re
+import argparse
+import pandas as pd
+from transformers import BertForSequenceClassification, BertTokenizer
+from torch.utils.data import Dataset, DataLoader
+from typing import List, Tuple, Dict
+from tqdm.auto import tqdm
+class PairDataset(Dataset):
+    def __init__(self, sequences_a: List[str], sequences_b: List[str]):
+        self.sequences_a = sequences_a
+        self.sequences_b = sequences_b
+    def __len__(self):
+        return len(self.sequences_a)
+    def __getitem__(self, idx: int) -> Tuple[str, str]:
+        return self.sequences_a[idx], self.sequences_b[idx]
+class PairCollator:
+    def __init__(self, tokenizer, max_length=1024):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+    def sanitize_seq(self, seq: str) -> str:
+        seq = ' '.join(list(re.sub(r'[UZOB]', 'X', seq)))
+        return seq
+    def __call__(self, batch: List[Tuple[str, str]]) -> Dict[str, torch.Tensor]:
+        seqs_a, seqs_b, = zip(*batch)
+        seqs = []
+        for a, b in zip(seqs_a, seqs_b):
+            seq = self.sanitize_seq(a) + ' [SEP] ' + self.sanitize_seq(b)
+            seqs.append(seq)
+        seqs = self.tokenizer(seqs, padding='longest', truncation=True, max_length=self.max_length, return_tensors='pt')
+        return {
+            'input_ids': seqs['input_ids'],
+            'attention_mask': seqs['attention_mask'],
+        }
+def main(args):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"Using device: {device}")
+    print(f"Loading model from {args.model_path}")
+    model = BertForSequenceClassification.from_pretrained(args.model_path, attn_implementation="sdpa").eval().to(device)
+    # When using PyTorch >= 2.5.1 on a linux machine, spda attention will greatly speed up inference
+    tokenizer = BertTokenizer.from_pretrained(args.model_path)
+    print(f"Tokenizer loaded")
+    """
+    Load your data into two lists of sequences, where you want the PPI for each pair sequences_a[i], sequences_b[i]
+    We recommend trimmed sequence pairs that sum over 1022 tokens (for the 1024 max length limit of SYNTERACT)
+    We also recommend sorting the sequences by length in descending order, as this will speed up inference by reducing padding
+    Example:
+        from datasets import load_dataset
+        data = load_dataset('Synthyra/NEGATOME', split='combined')
+        # Filter out examples where the total length exceeds 1022
+        data = data.filter(lambda x: len(x['SeqA']) + len(x['SeqB']) <= 1022)
+        # Add a new column 'total_length' that is the sum of lengths of SeqA and SeqB
+        data = data.map(lambda x: {"total_length": len(x['SeqA']) + len(x['SeqB'])})
+        # Sort the dataset by 'total_length' in descending order (longest sequences first)
+        data = data.sort("total_length", reverse=True)
+        # Now retrieve the sorted sequences
+        sequences_a = data['SeqA']
+        sequences_b = data['SeqB']
+    """
+    print("Loading data...")
+    sequences_a = []
+    sequences_b = []
+    print("Creating torch dataset...")
+    pair_dataset = PairDataset(sequences_a, sequences_b)
+    pair_collator = PairCollator(tokenizer, max_length=1024)
+    data_loader = DataLoader(pair_dataset, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=pair_collator)
+    all_seqs_a = []
+    all_seqs_b = []
+    all_probs = []
+    all_preds = []
+    print("Starting inference...")
+    with torch.no_grad():
+        for i, batch in enumerate(tqdm(data_loader, total=len(data_loader), desc="Batches processed")):
+            # Because sequences are sorted, the initial estimate for time will be much longer than the actual time it will take
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            logits = model(input_ids, attention_mask=attention_mask).logits.detach().cpu()
+            prob_of_interaction = torch.softmax(logits, dim=1)[:, 1] # can do 1 - this for no interaction prob
+            pred = torch.argmax(logits, dim=1)
+            # Store results
+            batch_start = i * args.batch_size
+            batch_end = min((i + 1) * args.batch_size, len(sequences_a))
+            all_seqs_a.extend(sequences_a[batch_start:batch_end])
+            all_seqs_b.extend(sequences_b[batch_start:batch_end])
+            all_probs.extend(prob_of_interaction.tolist())
+            all_preds.extend(pred.tolist())
+    # round to 5 decimal places
+    all_probs = [round(prob, 5) for prob in all_probs]
+    # Create dataframe and save to CSV
+    results_df = pd.DataFrame({
+        'sequence_a': all_seqs_a,
+        'sequence_b': all_seqs_b,
+        'probabilities': all_probs,
+        'prediction': all_preds
+    })
+    print(f"Saving results to {args.save_path}")
+    results_df.to_csv(args.save_path, index=False)
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_path', type=str, default='GleghornLab/SYNTERACT')
+    parser.add_argument('--save_path', type=str, default='ppi_predictions.csv')
+    parser.add_argument('--batch_size', type=int, default=2)
+    parser.add_argument('--num_workers', type=int, default=0) # can increase to use multiprocessing for dataloader, 4 is a good value usually
+    args = parser.parse_args()
+    main(args)
 ```