add message asking for notification of use
Browse files
app.py
CHANGED
|
@@ -20,6 +20,9 @@ model_map = {
|
|
| 20 |
theme = "Default"
|
| 21 |
title = "D-SCRIPT: Predicting Protein-Protein Interactions"
|
| 22 |
description = """
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
|
| 25 |
# article = """
|
|
@@ -52,7 +55,6 @@ article = """
|
|
| 52 |
|
| 53 |
Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to
|
| 54 |
translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.
|
| 55 |
-
|
| 56 |
"""
|
| 57 |
|
| 58 |
fold_vocab = {
|
|
@@ -81,85 +83,90 @@ fold_vocab = {
|
|
| 81 |
|
| 82 |
def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
|
| 83 |
|
| 84 |
-
run_id = uuid4()
|
| 85 |
-
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
|
| 86 |
-
|
| 87 |
-
# gr.Info("Loading model...")
|
| 88 |
-
_ = lm_embed("M", use_cuda = (device.type == "cuda"))
|
| 89 |
-
|
| 90 |
-
model = get_pretrained(model_map[model_name]).to(device)
|
| 91 |
-
|
| 92 |
-
# gr.Info("Loading files...")
|
| 93 |
try:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
raise gr.Error("Invalid FASTA file - duplicate entry")
|
| 97 |
-
|
| 98 |
-
if Path(pairs_file.name).suffix == ".csv":
|
| 99 |
-
pairs = pd.read_csv(pairs_file.name)
|
| 100 |
-
elif Path(pairs_file.name).suffix == ".tsv":
|
| 101 |
-
pairs = pd.read_csv(pairs_file.name, sep="\t")
|
| 102 |
-
try:
|
| 103 |
-
pairs.columns = ["protein1", "protein2"]
|
| 104 |
-
except ValueError as _:
|
| 105 |
-
raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'")
|
| 106 |
-
|
| 107 |
-
do_foldseek = False
|
| 108 |
-
if model_name == "TT3D":
|
| 109 |
-
do_foldseek = True
|
| 110 |
-
|
| 111 |
-
need_to_translate = set(pairs["protein1"]).union(set(pairs["protein2"]))
|
| 112 |
-
seqs_to_translate = {k: str(seqs[k].seq) for k in need_to_translate if k in seqs}
|
| 113 |
-
|
| 114 |
-
half_precision = False
|
| 115 |
-
assert not (half_precision and device=="cpu"), print("Running fp16 on CPU is not supported, yet")
|
| 116 |
-
|
| 117 |
-
gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
|
| 118 |
-
predictions = get_3di_sequences(
|
| 119 |
-
seqs_to_translate,
|
| 120 |
-
model_dir = "Rostlab/ProstT5",
|
| 121 |
-
report_fn = gr.Info,
|
| 122 |
-
error_fn = gr.Error,
|
| 123 |
-
device=device,
|
| 124 |
-
)
|
| 125 |
-
foldseek_sequences = predictions_to_dict(predictions)
|
| 126 |
-
foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab) for k, s in foldseek_sequences.items()}
|
| 127 |
-
|
| 128 |
-
# for k in seqs_to_translate.keys():
|
| 129 |
-
# print(seqs_to_translate[k])
|
| 130 |
-
# print(len(seqs_to_translate[k]))
|
| 131 |
-
# print(foldseek_embeddings[k])
|
| 132 |
-
# print(foldseek_embeddings[k].shape)
|
| 133 |
-
|
| 134 |
-
progress(0, desc="Starting...")
|
| 135 |
-
results = []
|
| 136 |
-
for i in progress.tqdm(range(len(pairs))):
|
| 137 |
-
|
| 138 |
-
r = pairs.iloc[i]
|
| 139 |
-
|
| 140 |
-
prot1 = r["protein1"]
|
| 141 |
-
prot2 = r["protein2"]
|
| 142 |
-
|
| 143 |
-
seq1 = str(seqs[prot1].seq)
|
| 144 |
-
seq2 = str(seqs[prot2].seq)
|
| 145 |
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
return results, file_path
|
| 163 |
|
| 164 |
demo = gr.Interface(
|
| 165 |
fn=predict,
|
|
|
|
| 20 |
theme = "Default"
|
| 21 |
title = "D-SCRIPT: Predicting Protein-Protein Interactions"
|
| 22 |
description = """
|
| 23 |
+
If you use this interface to make predictions, please let us know (email samsl@mit.edu)!
|
| 24 |
+
We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to
|
| 25 |
+
our funders that it is being used. Thank you!
|
| 26 |
"""
|
| 27 |
|
| 28 |
# article = """
|
|
|
|
| 55 |
|
| 56 |
Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to
|
| 57 |
translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.
|
|
|
|
| 58 |
"""
|
| 59 |
|
| 60 |
fold_vocab = {
|
|
|
|
| 83 |
|
| 84 |
def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
    """Score every protein pair in an uploaded pairs file.

    Args:
        model_name: Key into ``model_map`` selecting the pretrained model.
            The value ``"TT3D"`` additionally enables Foldseek (3Di) inputs.
        pairs_file: Uploaded file object; ``pairs_file.name`` must be the path
            of a ``.csv`` or ``.tsv`` file with exactly two columns. The
            columns are renamed to ``protein1`` / ``protein2`` after loading.
        sequence_file: Uploaded file object; ``sequence_file.name`` must be
            the path of a FASTA file with unique record IDs.
        progress: Gradio progress tracker used to report per-pair progress.

    Returns:
        A ``(results, file_path)`` tuple: ``results`` is a DataFrame with the
        columns "Protein 1", "Protein 2" and "Interaction"; ``file_path`` is
        the path of the same table written as a TSV under ``/tmp``.

    Raises:
        gr.Error: For invalid input files, and for any unexpected failure.
            ``gr.Error`` only reaches the user when it is *raised*; merely
            constructing it (as the previous version did) shows nothing.
    """
    try:
        run_id = uuid4()
        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

        # Result is discarded on purpose -- presumably this warms up / loads
        # the language model before the real work starts (TODO confirm).
        _ = lm_embed("M", use_cuda=(device.type == "cuda"))

        model = get_pretrained(model_map[model_name]).to(device)

        # --- Load and validate the uploaded files ---------------------------
        try:
            seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
        except ValueError as err:
            raise gr.Error("Invalid FASTA file - duplicate entry") from err

        suffix = Path(pairs_file.name).suffix
        if suffix == ".csv":
            pairs = pd.read_csv(pairs_file.name)
        elif suffix == ".tsv":
            pairs = pd.read_csv(pairs_file.name, sep="\t")
        else:
            # Without this branch `pairs` would be unbound further down.
            raise gr.Error("Invalid pairs file - must be a .csv or .tsv file")

        try:
            pairs.columns = ["protein1", "protein2"]
        except ValueError as err:
            raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'") from err

        # Fail early, with a readable message, if a pair names a protein that
        # has no sequence; otherwise this surfaces later as a bare KeyError.
        needed_ids = set(pairs["protein1"]).union(pairs["protein2"])
        missing_ids = sorted(str(k) for k in needed_ids if k not in seqs)
        if missing_ids:
            raise gr.Error(
                f"Pairs file references {len(missing_ids)} protein(s) missing from the FASTA file, "
                f"e.g. {', '.join(missing_ids[:5])}"
            )
        sequences = {k: str(seqs[k].seq) for k in needed_ids}

        # --- Optional Foldseek (3Di) inputs, TT3D model only ----------------
        do_foldseek = model_name == "TT3D"
        foldseek_embeddings = {}
        if do_foldseek:
            gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(sequences)} embeddings)...")
            # NOTE(review): gr.Error is an exception class; if
            # get_3di_sequences only calls error_fn(msg) without raising the
            # result, that message is silently dropped -- confirm.
            predictions = get_3di_sequences(
                sequences,
                model_dir = "Rostlab/ProstT5",
                report_fn = gr.Info,
                error_fn = gr.Error,
                device=device,
            )
            foldseek_sequences = predictions_to_dict(predictions)
            foldseek_embeddings = {
                k: one_hot_3di_sequence(s.upper(), fold_vocab)
                for k, s in foldseek_sequences.items()
            }

        # --- Score each pair ------------------------------------------------
        progress(0, desc="Starting...")
        rows = []
        # A list (not a bare zip) so the progress bar can read its length.
        pair_ids = list(zip(pairs["protein1"], pairs["protein2"]))
        for prot1, prot2 in progress.tqdm(pair_ids):
            fold1 = foldseek_embeddings[prot1].to(device) if do_foldseek else None
            fold2 = foldseek_embeddings[prot2].to(device) if do_foldseek else None

            lm1 = lm_embed(sequences[prot1]).to(device)
            lm2 = lm_embed(sequences[prot2]).to(device)

            interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item()

            rows.append([prot1, prot2, interaction])

        results = pd.DataFrame(rows, columns = ["Protein 1", "Protein 2", "Interaction"])

        file_path = f"/tmp/{run_id}.tsv"
        with open(file_path, "w") as f:
            results.to_csv(f, sep="\t", index=False, header = True)

        return results, file_path

    except gr.Error:
        # Already a user-facing error (e.g. the validation failures above);
        # let it through untouched instead of re-wrapping it.
        raise
    except Exception as e:
        # Top-level boundary: surface unexpected failures in the UI rather
        # than returning (None, None) with no visible explanation.
        raise gr.Error(f"Prediction failed: {e}") from e
|
|
|
|
|
|
|
| 170 |
|
| 171 |
demo = gr.Interface(
|
| 172 |
fn=predict,
|