Spaces:

samsl
/

D-SCRIPT

Sleeping

App Files Community

samsl committed on Sep 29, 2023

Commit

c729a21

1 Parent(s): c680ea1

add message asking for notification of use

Browse files

Files changed (1) hide show

app.py +83 -76

app.py CHANGED Viewed

@@ -20,6 +20,9 @@ model_map = {
 theme = "Default"
 title = "D-SCRIPT: Predicting Protein-Protein Interactions"
 description = """
 """
 # article = """
@@ -52,7 +55,6 @@ article = """
 Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to
 translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.
 """
 fold_vocab = {
@@ -81,85 +83,90 @@ fold_vocab = {
 def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
-    run_id = uuid4()
-    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
-    # gr.Info("Loading model...")
-    _ = lm_embed("M", use_cuda = (device.type == "cuda"))
-    model = get_pretrained(model_map[model_name]).to(device)
-    # gr.Info("Loading files...")
     try:
-        seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
-    except ValueError as _:
-        raise gr.Error("Invalid FASTA file - duplicate entry")
-    if Path(pairs_file.name).suffix == ".csv":
-        pairs = pd.read_csv(pairs_file.name)
-    elif Path(pairs_file.name).suffix == ".tsv":
-        pairs = pd.read_csv(pairs_file.name, sep="\t")
-    try:
-        pairs.columns = ["protein1", "protein2"]
-    except ValueError as _:
-        raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'")
-    do_foldseek = False
-    if model_name == "TT3D":
-        do_foldseek = True
-        need_to_translate = set(pairs["protein1"]).union(set(pairs["protein2"]))
-        seqs_to_translate = {k: str(seqs[k].seq) for k in need_to_translate if k in seqs}
-        half_precision = False
-        assert not (half_precision and device=="cpu"), print("Running fp16 on CPU is not supported, yet")
-        gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
-        predictions = get_3di_sequences(
-            seqs_to_translate,
-            model_dir = "Rostlab/ProstT5",
-            report_fn = gr.Info,
-            error_fn = gr.Error,
-            device=device,
-            )
-        foldseek_sequences = predictions_to_dict(predictions)
-        foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab) for k, s in foldseek_sequences.items()}
-        # for k in seqs_to_translate.keys():
-        #     print(seqs_to_translate[k])
-        #     print(len(seqs_to_translate[k]))
-        #     print(foldseek_embeddings[k])
-        #     print(foldseek_embeddings[k].shape)
-    progress(0, desc="Starting...")
-    results = []
-    for i in progress.tqdm(range(len(pairs))):
-        r = pairs.iloc[i]
-        prot1 = r["protein1"]
-        prot2 = r["protein2"]
-        seq1 = str(seqs[prot1].seq)
-        seq2 = str(seqs[prot2].seq)
-        fold1 = foldseek_embeddings[prot1].to(device) if do_foldseek else None
-        fold2 = foldseek_embeddings[prot2].to(device) if do_foldseek else None
-        lm1 = lm_embed(seq1).to(device)
-        lm2 = lm_embed(seq2).to(device)
-        interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item()
-        results.append([prot1, prot2, interaction])
-    results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
-    file_path = f"/tmp/{run_id}.tsv"
-    with open(file_path, "w") as f:
-        results.to_csv(f, sep="\t", index=False, header = True)
-    return results, file_path
 demo = gr.Interface(
     fn=predict,

 theme = "Default"
 title = "D-SCRIPT: Predicting Protein-Protein Interactions"
 description = """
+If you use this interface to make predictions, please let us know (email samsl@mit.edu)!
+We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to
+our funders that it is being used. Thank you!
 """
 # article = """
 Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to
 translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.
 """
 fold_vocab = {
 def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
     try:
+        run_id = uuid4()
+        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+        # gr.Info("Loading model...")
+        _ = lm_embed("M", use_cuda = (device.type == "cuda"))
+        model = get_pretrained(model_map[model_name]).to(device)
+        # gr.Info("Loading files...")
+        try:
+            seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
+        except ValueError as _:
+            raise gr.Error("Invalid FASTA file - duplicate entry")
+        if Path(pairs_file.name).suffix == ".csv":
+            pairs = pd.read_csv(pairs_file.name)
+        elif Path(pairs_file.name).suffix == ".tsv":
+            pairs = pd.read_csv(pairs_file.name, sep="\t")
+        try:
+            pairs.columns = ["protein1", "protein2"]
+        except ValueError as _:
+            raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'")
+        do_foldseek = False
+        if model_name == "TT3D":
+            do_foldseek = True
+            need_to_translate = set(pairs["protein1"]).union(set(pairs["protein2"]))
+            seqs_to_translate = {k: str(seqs[k].seq) for k in need_to_translate if k in seqs}
+            half_precision = False
+            assert not (half_precision and device=="cpu"), print("Running fp16 on CPU is not supported, yet")
+            gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
+            predictions = get_3di_sequences(
+                seqs_to_translate,
+                model_dir = "Rostlab/ProstT5",
+                report_fn = gr.Info,
+                error_fn = gr.Error,
+                device=device,
+                )
+            foldseek_sequences = predictions_to_dict(predictions)
+            foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab) for k, s in foldseek_sequences.items()}
+            # for k in seqs_to_translate.keys():
+            #     print(seqs_to_translate[k])
+            #     print(len(seqs_to_translate[k]))
+            #     print(foldseek_embeddings[k])
+            #     print(foldseek_embeddings[k].shape)
+        progress(0, desc="Starting...")
+        results = []
+        for i in progress.tqdm(range(len(pairs))):
+            r = pairs.iloc[i]
+            prot1 = r["protein1"]
+            prot2 = r["protein2"]
+            seq1 = str(seqs[prot1].seq)
+            seq2 = str(seqs[prot2].seq)
+            fold1 = foldseek_embeddings[prot1].to(device) if do_foldseek else None
+            fold2 = foldseek_embeddings[prot2].to(device) if do_foldseek else None
+            lm1 = lm_embed(seq1).to(device)
+            lm2 = lm_embed(seq2).to(device)
+            interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item()
+            results.append([prot1, prot2, interaction])
+        results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
+        file_path = f"/tmp/{run_id}.tsv"
+        with open(file_path, "w") as f:
+            results.to_csv(f, sep="\t", index=False, header = True)
+        return results, file_path
+    except Exception as e:
+        gr.Error(e)
+        return None, None
 demo = gr.Interface(
     fn=predict,