Spaces:
Sleeping
Sleeping
| import time | |
| import numpy as np | |
| import gradio as gr | |
| import pandas as pd | |
| import torch | |
| from pathlib import Path | |
| from Bio import SeqIO | |
| from tqdm.auto import tqdm | |
| from uuid import uuid4 | |
| from tempfile import TemporaryDirectory | |
| from torch.utils.data import DataLoader | |
| from pathvalidate import sanitize_filename | |
| from conplex_dti.featurizer import MorganFeaturizer, ProtBertFeaturizer | |
| from publish_model import ConPLex_DTI | |
# ---------------------------------------------------------------------------
# Static text and appearance settings handed to the Gradio interface below.
# ---------------------------------------------------------------------------

# Name of the Gradio theme used to render the page.
theme = "Default"

# Heading displayed at the top of the page.
title = "ConPLex: Predicting Drug-Target Interactions"

# Introductory blurb passed as the interface's `description`.
description = """
If you use this interface to make predictions, please let us know (by emailing samsl@mit.edu)!
We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to
our funders that it is being used. Thank you!
"""

# Disabled article text describing D-SCRIPT / Topsy-Turvy. It is commented out
# and therefore not used by this app; the active `article` is defined after it.
# article = """
# <hr>
# <img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/>
# <hr>
# D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences.
# It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact,
# a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage
# in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and,
# since structure is more conserved evolutionarily than sequence, improves generalizability across species.
# <hr>
# Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the
# individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate
# top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based,
# multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by
# incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the
# ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data.
# """

# Text passed as the interface's `article`; documents the expected input file
# layout. NOTE: this is a regular (non-raw) string, so each `\t` below is a
# real tab character at runtime.
article = """
The pairs file should be a tab-separated values file where each row is a candidate pair, formatted as `[protein ID]\t[molecule ID]\t[protein Sequence]\t[molecule SMILES]`
"""
def predict(run_name, model_name, csv_file, progress=gr.Progress()):
    """Score every protein / small-molecule pair in an uploaded TSV file.

    Args:
        run_name: Free-text label for the run. It is sanitized and embedded in
            the name of the output file.
        model_name: Name of the pretrained model; loaded from
            ``samsl/{model_name}``.
        csv_file: Uploaded file object whose ``.name`` attribute is the path of
            a header-less, tab-separated file with four columns: protein ID,
            molecule ID, protein sequence, molecule SMILES.
        progress: Gradio progress tracker used to report per-batch progress.

    Returns:
        Path of a TSV file with the columns ``Protein``, ``Small Molecule``
        and ``Predicted Interaction``, one row per input pair.

    Raises:
        gr.Error: If no file was uploaded, the file has no rows, or any step
            of the pipeline fails. Raising (rather than merely constructing)
            the error is what makes Gradio display it to the user.
    """
    if csv_file is None:
        raise gr.Error("No pairs file was uploaded.")

    try:
        with TemporaryDirectory() as tmpdir:
            run_id = uuid4()
            run_name = sanitize_filename(run_name)
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"
            )

            gr.Info("Loading data...")
            query_df = pd.read_csv(
                csv_file.name,
                sep="\t",
                names=["proteinID", "moleculeID", "proteinSequence", "moleculeSmiles"],
            )
            if query_df.empty:
                raise gr.Error("The pairs file contains no rows.")

            # Loading model
            gr.Info(
                "Loading model -- this may take a while, as the ProtBert language model must be downloaded..."
            )
            # Both featurizers are given the temporary directory as save_dir.
            target_featurizer = ProtBertFeaturizer(save_dir=tmpdir, per_tok=False).to(
                device
            )
            drug_featurizer = MorganFeaturizer(save_dir=tmpdir).to(device)

            gr.Info("Preloading embeddings...")
            drug_featurizer.preload(query_df["moleculeSmiles"].unique())
            target_featurizer.preload(query_df["proteinSequence"].unique())

            model = ConPLex_DTI.from_pretrained(f"samsl/{model_name}")
            model = model.eval().to(device)

            # One (drug features, target features) tuple per input row, in
            # the same order as the rows of query_df.
            dt_feature_pairs = [
                (drug_featurizer(smiles), target_featurizer(sequence))
                for smiles, sequence in zip(
                    query_df["moleculeSmiles"], query_df["proteinSequence"]
                )
            ]
            dloader = DataLoader(dt_feature_pairs, batch_size=1024, shuffle=False)

            progress(0, desc="Starting...")
            preds = []
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                for drug_batch, target_batch in progress.tqdm(dloader):
                    batch_pred = model(drug_batch, target_batch).detach().cpu().numpy()
                    # NOTE(review): atleast_1d guards against a 0-d output in
                    # case the model squeezes a single-pair batch -- the model
                    # code is not visible here; harmless for 1-d outputs.
                    preds.append(np.atleast_1d(batch_pred))
            preds = np.concatenate(preds)

            # Select the ID columns in the same order as the labels assigned
            # below (protein first, then molecule) so they are not swapped.
            results = query_df[["proteinID", "moleculeID"]].copy()
            results["Prediction"] = preds
            results.columns = ["Protein", "Small Molecule", "Predicted Interaction"]

            # Written outside tmpdir so the file still exists after the
            # temporary directory is removed on exit from the `with` block.
            file_path = f"/tmp/conplex_{run_name}_{run_id}.tsv"
            results.to_csv(file_path, sep="\t", index=False, header=True)
            return file_path
    except gr.Error:
        # Already a user-facing error; let it propagate unchanged.
        raise
    except Exception as e:
        # Keep the server-side trace of the message, then surface it in the UI.
        print(e)
        raise gr.Error(str(e)) from e
# Gradio interface wiring: three inputs (run name, model choice, pairs file)
# are passed positionally to `predict`, whose return value (a file path) is
# offered as a downloadable file.
demo = gr.Interface(
    # Page text and appearance.
    title=title,
    description=description,
    article=article,
    theme=theme,
    # Callback and its widgets.
    fn=predict,
    inputs=[
        gr.Textbox(label="Run Name", placeholder="predictions", type="text"),
        gr.Dropdown(
            label="Model",
            choices=["ConPLex_V1_BindingDB"],
            value="ConPLex_V1_BindingDB",
        ),
        gr.File(label="Pairs (.tsv)", file_types=[".tsv"]),
    ],
    outputs=[
        # Disabled alternative output: show the results as a table.
        # gr.DataFrame(
        #     label='Results',
        #     headers=['Protein', 'Small Molecule', 'Predicted Interaction'],
        #     height = 200,
        #     row_count = 20
        # ),
        gr.File(label="Download results", type="file")
    ],
)
if __name__ == "__main__":
    # Enable request queuing (max_size=20) and then start the app.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()