Make it better

Browse files

Files changed (8) hide show

.DS_Store +0 -0
__pycache__/plapt.cpython-312.pyc +0 -0
index.py +65 -0
models/.DS_Store +0 -0
models/affinity_predictor0734-seed2101.onnx +3 -0
plapt.py +171 -0
plapt_cli.py +53 -0
requirements.txt +6 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

__pycache__/plapt.cpython-312.pyc ADDED Viewed

Binary file (9.54 kB). View file

index.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import numpy as np
+import json
+import onnxruntime
+from transformers import BertTokenizer, RobertaTokenizer
+import torch
+def init():
+    """Load the ONNX predictor, both tokenizers and both encoders into globals.
+
+    Must be called once before ``run``. The encoders are needed because the
+    ONNX predictor consumes concatenated pooled encoder embeddings (the same
+    feature layout ``Plapt.predict_affinity`` in plapt.py builds), not token ids.
+    """
+    global session, prot_tokenizer, mol_tokenizer, input_name
+    global prot_encoder, mol_encoder
+    # Local import: the module header only imports the tokenizer classes.
+    from transformers import BertModel, RobertaModel
+    session = onnxruntime.InferenceSession("models/affinity_predictor0734-seed2101.onnx")
+    input_name = session.get_inputs()[0].name
+    prot_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
+    mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+    prot_encoder = BertModel.from_pretrained("Rostlab/prot_bert")
+    mol_encoder = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+def run(raw_data):
+    """Score one protein/ligand pair.
+
+    Args:
+        raw_data: JSON text (or an already-parsed dict) with the keys
+            "protein" (amino-acid sequence) and "smiles" (ligand SMILES).
+
+    Returns:
+        The dict produced by ``convert_to_affinity`` on success, or a JSON
+        string of the form {"error": "..."} when anything fails.
+    """
+    try:
+        # Accept a parsed payload as well as JSON text; json.loads rejects dicts.
+        data = raw_data if isinstance(raw_data, dict) else json.loads(raw_data)
+        prot_seq = data['protein']
+        mol_smiles = data['smiles']
+        # Tokenize protein and molecule
+        prot_tokens = prot_tokenizer(preprocess_sequence(prot_seq),
+                                      padding=True,
+                                      max_length=3200,
+                                      truncation=True,
+                                      return_tensors='pt')
+        mol_tokens = mol_tokenizer(mol_smiles,
+                                    padding=True,
+                                    max_length=278,
+                                    truncation=True,
+                                    return_tensors='pt')
+        # Encode both inputs; squeeze(0) drops the batch axis of size one so
+        # the two pooled vectors can be joined end to end.
+        with torch.no_grad():
+            prot_representation = prot_encoder(**prot_tokens).pooler_output.squeeze(0)
+            mol_representation = mol_encoder(**mol_tokens).pooler_output.squeeze(0)
+        # Combine representations into a single 1-D feature vector
+        features = torch.cat((prot_representation, mol_representation), dim=0)
+        # Run inference on a batch of one
+        affinity_normalized = session.run(None, {input_name: [features.numpy()], 'TrainingMode': np.array(False)})[0][0][0]
+        # Convert to affinity
+        return convert_to_affinity(affinity_normalized)
+    except Exception as e:
+        # Endpoint boundary: report the failure to the caller instead of raising.
+        return json.dumps({"error": str(e)})
+def preprocess_sequence(seq):
+    """Return ``seq`` with U/Z/O/B replaced by X and residues space-separated."""
+    return " ".join("X" if residue in "UZOB" else residue for residue in seq)
+def convert_to_affinity(normalized):
+    """Turn the model's normalized output into affinity values.
+
+    Args:
+        normalized: Scalar model output (Python or NumPy number).
+
+    Returns:
+        dict with "neg_log10_affinity_M" (normalized * scale + mean) and
+        "affinity_uM" (10**6 * 10**-neg_log10_affinity_M), both plain floats.
+    """
+    mean = 6.51286529169358
+    scale = 1.5614094578916633
+    # float() strips NumPy scalar types so the result can go through json.dumps.
+    neg_log10_affinity = float(normalized) * scale + mean
+    return {
+        "neg_log10_affinity_M": neg_log10_affinity,
+        "affinity_uM": (10**6) * (10**(-neg_log10_affinity)),
+    }
+if __name__ == "__main__":
+    # Smoke test, run only when executed as a script so importing the module
+    # has no side effects. init() must come first because run() reads the
+    # globals it sets; the payload is JSON text because run() parses its input.
+    init()
+    print(run(json.dumps({"protein": "MILK", "smiles": "CCO"})))

models/.DS_Store ADDED Viewed

File without changes

models/affinity_predictor0734-seed2101.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bbb242b307274215e542bae5cd524f81d06e6f1102b4cc0cf31042e2a601509c
+size 5924195

plapt.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import torch
+from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
+import re
+import onnxruntime
+import numpy as np
+# Restrict PyTorch to a single intra-op CPU thread.
+torch.set_num_threads(1)
+def flatten_list(nested_list):
+    """Return a flat list of the leaves of ``nested_list``.
+
+    Only ``list`` instances are descended into; tuples and any other
+    containers are kept as single elements.
+    """
+    flattened = []
+    for item in nested_list:
+        flattened += flatten_list(item) if isinstance(item, list) else [item]
+    return flattened
+class PredictionModule:
+    """ONNX-backed predictor that maps feature vectors to binding affinities."""
+    def __init__(self, model_path="models/affinity_predictor0734-seed2101.onnx"):
+        """Open an inference session for the ONNX model at ``model_path``."""
+        self.session = onnxruntime.InferenceSession(model_path)
+        self.input_name = self.session.get_inputs()[0].name
+        # Normalization scaling parameters
+        self.mean = 6.51286529169358
+        self.scale = 1.5614094578916633
+    def convert_to_affinity(self, normalized):
+        """Turn a normalized scalar model output into affinity values.
+
+        Returns:
+            dict with "neg_log10_affinity_M" (normalized * scale + mean) and
+            "affinity_uM" (10**6 * 10**-neg_log10_affinity_M), both plain floats.
+        """
+        # float() strips NumPy scalar types so callers can json.dump the result.
+        neg_log10_affinity = float(normalized) * self.scale + self.mean
+        return {
+            "neg_log10_affinity_M": neg_log10_affinity,
+            "affinity_uM": (10**6) * (10**(-neg_log10_affinity)),
+        }
+    def predict(self, batch_data):
+        """Run predictions on a batch of data.
+
+        Args:
+            batch_data: Iterable of torch tensors, one feature vector each.
+
+        Returns:
+            list with one ``convert_to_affinity`` dict per input tensor.
+        """
+        affinities = []
+        for tensor in batch_data:
+            # The session is invoked once per feature, as a batch of one.
+            feature = tensor.numpy()
+            affinity_normalized = self.session.run(None, {self.input_name: [feature], 'TrainingMode': np.array(False)})[0][0][0]
+            affinities.append(self.convert_to_affinity(affinity_normalized))
+        return affinities
+class Plapt:
+    """Protein-ligand affinity predictor: encoder embeddings fed to an ONNX head."""
+    def __init__(self, prediction_module_path = "models/affinity_predictor0734-seed2101.onnx", caching=True, device='cuda'):
+        """Load tokenizers, encoders and the ONNX prediction module.
+
+        Args:
+            prediction_module_path: Path of the ONNX model given to PredictionModule.
+            caching: When true, predict_affinity reuses features it already built.
+            device: Torch device used when CUDA is available; otherwise 'cpu'.
+        """
+        # Set device for computation
+        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
+        # Load protein tokenizer and encoder
+        self.prot_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
+        self.prot_encoder = BertModel.from_pretrained("Rostlab/prot_bert").to(self.device)
+        # Load molecule tokenizer and encoder
+        self.mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+        self.mol_encoder = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1").to(self.device)
+        self.caching = caching
+        self.cache = {}
+        # Load the prediction module ONNX model
+        self.prediction_module = PredictionModule(prediction_module_path)
+    def set_prediction_module(self, prediction_module_path):
+        """Replace the prediction module with one loaded from the given path."""
+        self.prediction_module = PredictionModule(prediction_module_path)
+    @staticmethod
+    def preprocess_sequence(seq):
+        """Replace U/Z/O/B with X and put a space between residues."""
+        return " ".join(re.sub(r"[UZOB]", "X", seq))
+    def tokenize(self, mol_smiles):
+        """Tokenize one SMILES string or a list of them (max 278 tokens each)."""
+        mol_tokens = self.mol_tokenizer(mol_smiles,
+                                        padding=True,
+                                        max_length=278,
+                                        truncation=True,
+                                        return_tensors='pt')
+        return mol_tokens
+    def tokenize_prot(self, prot_seq):
+        """Preprocess and tokenize one protein sequence (max 3200 tokens)."""
+        prot_tokens = self.prot_tokenizer(self.preprocess_sequence(prot_seq),
+                                          padding=True,
+                                          max_length=3200,
+                                          truncation=True,
+                                          return_tensors='pt')
+        return prot_tokens
+    # Define the batch functions
+    @staticmethod
+    def make_batches(iterable, n=1):
+        """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
+        length = len(iterable)
+        for ndx in range(0, length, n):
+            yield iterable[ndx:min(ndx + n, length)]
+    def predict_affinity(self, prot_seq, mol_smiles, batch_size=2):
+        """Predict the affinity of one protein against several molecules.
+
+        Args:
+            prot_seq: Protein sequence string.
+            mol_smiles: Sequence of SMILES strings.
+            batch_size: Number of molecules encoded per encoder call.
+
+        Returns:
+            list with one affinity dict per molecule, in input order.
+        """
+        prot_tokens = self.tokenize_prot(prot_seq)
+        with torch.no_grad():
+            prot_representation = self.prot_encoder(**prot_tokens.to(self.device)).pooler_output.cpu()
+        prot_representation = prot_representation.squeeze(0)
+        affinities = []
+        for batch in self.make_batches(mol_smiles, batch_size):
+            # The key includes the protein because the cached features embed its
+            # representation; keying on the molecules alone would hand back
+            # stale features for a different protein.
+            batch_key = (prot_seq, tuple(batch))
+            if self.caching and batch_key in self.cache:
+                # Use cached features if available
+                features = self.cache[batch_key]
+            else:
+                # Tokenize and encode the batch, then cache the results
+                mol_tokens = self.tokenize(batch)
+                with torch.no_grad():
+                    mol_representations = self.mol_encoder(**mol_tokens.to(self.device)).pooler_output.cpu()
+                # One feature vector per molecule: protein embedding followed by
+                # that molecule's embedding.
+                features = [torch.cat((prot_representation, mol), dim=0)
+                            for mol in mol_representations]
+                if self.caching:
+                    self.cache[batch_key] = features
+            affinities.extend(self.prediction_module.predict(features))
+        return affinities
+    def score_candidates(self, target_protein, mol_smiles, batch_size=2):
+        """Predict affinities one molecule at a time, without using the cache.
+
+        ``batch_size`` is accepted for signature compatibility but is not used.
+        """
+        target_tokens = self.prot_tokenizer([self.preprocess_sequence(target_protein)],
+                                            padding=True,
+                                            max_length=3200,
+                                            truncation=True,
+                                            return_tensors='pt')
+        with torch.no_grad():
+            target_representation = self.prot_encoder(**target_tokens.to(self.device)).pooler_output.cpu()
+        affinities = []
+        for mol in mol_smiles:
+            mol_tokens = self.mol_tokenizer(mol,
+                                            padding=True,
+                                            max_length=278,
+                                            truncation=True,
+                                            return_tensors='pt')
+            with torch.no_grad():
+                mol_representations = self.mol_encoder(**mol_tokens.to(self.device)).pooler_output.cpu()
+            features = torch.cat((target_representation[0], mol_representations[0]), dim=0)
+            affinities.extend(self.prediction_module.predict([features]))
+        return affinities
+    def get_cached_features(self):
+        """Return every cached feature tensor as a nested Python list."""
+        return [tensor.tolist() for tensor in flatten_list(list(self.cache.values()))]
+    def clear_cache(self):
+        """Drop all cached features."""
+        self.cache = {}

plapt_cli.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import warnings
+import argparse
+import json
+import csv
+import os
+from plapt import Plapt
+# Suppress every warning category for the whole CLI run.
+warnings.filterwarnings("ignore")
+def write_json(results, filename):
+    """Serialize ``results`` as JSON into ``filename``, overwriting it."""
+    with open(filename, 'w') as handle:
+        handle.write(json.dumps(results))
+def write_csv(results, filename):
+    """Write ``results`` to ``filename`` as CSV.
+
+    Dict results become a header row followed by one row of values per
+    result; a key missing from a row is left as an empty cell. Any other
+    result type is written as a single-column row.
+    """
+    with open(filename, 'w', newline='') as csv_file:
+        if results and all(isinstance(result, dict) for result in results):
+            # Ordered union of keys, so rows with differing keys still fit.
+            fieldnames = list(dict.fromkeys(key for result in results for key in result))
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(results)
+        else:
+            writer = csv.writer(csv_file)
+            writer.writerows([result] for result in results)
+def determine_format_and_update_filename(output_arg, format_arg):
+    """Resolve the output path and the format to write it in.
+
+    Args:
+        output_arg: Requested output path, or None/empty for no file output.
+        format_arg: Explicit format ("json" or "csv"), or None.
+
+    Returns:
+        (path, format). Without an output path this is (None, "json"). A
+        path ending in .csv/.json is kept as is and its extension decides
+        the format unless ``format_arg`` is given. Any other path gets
+        ".<format>" appended, where the format defaults to "json".
+    """
+    if not output_arg:
+        return None, "json"
+    ext = os.path.splitext(output_arg)[1]
+    if ext in (".csv", ".json"):
+        return output_arg, format_arg or ext[1:]
+    # Missing or unsupported extension: never report that extension as the
+    # format, since no writer exists for it.
+    chosen_format = format_arg or "json"
+    return f"{output_arg}.{chosen_format}", chosen_format
+def main():
+    """Parse the command line, run the prediction, then save or print it."""
+    cli = argparse.ArgumentParser(description="Predict affinity using Plapt.")
+    cli.add_argument("-t", "--target", nargs="+", required=True, help="The target protein sequence")
+    cli.add_argument("-m", "--smiles", nargs="+", required=True, help="List of SMILES strings")
+    cli.add_argument("-o", "--output", help="Optional output file path")
+    cli.add_argument("-f", "--format", choices=["json", "csv"], help="Optional output file format; required if output is specified without an extension")
+    args = cli.parse_args()
+    predictor = Plapt()
+    # Only the first --target value is used as the protein sequence.
+    results = predictor.predict_affinity(args.target[0], args.smiles)
+    output_path, output_format = determine_format_and_update_filename(args.output, args.format)
+    if not output_path:
+        print(results)
+        return
+    # Look the writer up by format; an unrecognised format writes nothing.
+    writers = {"json": write_json, "csv": write_csv}
+    writer = writers.get(output_format)
+    if writer is not None:
+        writer(results, output_path)
+    print(f"Output written to {output_path}")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+azureml-core
+azureml-defaults
+torch
+transformers
+onnxruntime
+numpy