File download and multiple models
Browse files
app.py
CHANGED
|
@@ -5,10 +5,23 @@ from Bio import SeqIO
|
|
| 5 |
from dscript.pretrained import get_pretrained
|
| 6 |
from dscript.language_model import lm_embed
|
| 7 |
from tqdm.auto import tqdm
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
|
| 13 |
if Path(pairs_file.name).suffix == ".csv":
|
| 14 |
pairs = pd.read_csv(pairs_file.name)
|
|
@@ -16,9 +29,11 @@ def predict(sequence_file, pairs_file):
|
|
| 16 |
pairs = pd.read_csv(pairs_file.name, sep="\t")
|
| 17 |
pairs.columns = ["protein1", "protein2"]
|
| 18 |
|
|
|
|
| 19 |
results = []
|
| 20 |
progress = gr.Progress(track_tqdm=True)
|
| 21 |
for i, r in tqdm(pairs.iterrows(), total=len(pairs)):
|
|
|
|
| 22 |
prot1 = r["protein1"]
|
| 23 |
prot2 = r["protein2"]
|
| 24 |
seq1 = str(seqs[prot1].seq)
|
|
@@ -27,20 +42,26 @@ def predict(sequence_file, pairs_file):
|
|
| 27 |
lm2 = lm_embed(seq2)
|
| 28 |
interaction = model.predict(lm1, lm2).item()
|
| 29 |
results.append([prot1, prot2, interaction])
|
| 30 |
-
|
| 31 |
|
| 32 |
results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
demo = gr.Interface(
|
| 37 |
fn=predict,
|
| 38 |
inputs = [
|
|
|
|
| 39 |
gr.File(label="Sequences (.fasta)", file_types = [".fasta"]),
|
| 40 |
gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"])
|
| 41 |
],
|
| 42 |
outputs = [
|
| 43 |
-
gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction'])
|
|
|
|
| 44 |
]
|
| 45 |
)
|
| 46 |
|
|
|
|
| 5 |
from dscript.pretrained import get_pretrained
|
| 6 |
from dscript.language_model import lm_embed
|
| 7 |
from tqdm.auto import tqdm
|
| 8 |
+
from uuid import uuid4
|
| 9 |
|
| 10 |
+
model_map = {
|
| 11 |
+
"D-SCRIPT": "human_v1",
|
| 12 |
+
"Topsy-Turvy": "human_v2"
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
def predict(model, sequence_file, pairs_file):
|
| 16 |
+
|
| 17 |
+
run_id = uuid4()
|
| 18 |
+
|
| 19 |
+
gr.Info("Loading model...")
|
| 20 |
+
_ = lm_embed("M")
|
| 21 |
+
|
| 22 |
+
model = get_pretrained(model_map[model])
|
| 23 |
+
|
| 24 |
+
gr.Info("Loading files...")
|
| 25 |
seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
|
| 26 |
if Path(pairs_file.name).suffix == ".csv":
|
| 27 |
pairs = pd.read_csv(pairs_file.name)
|
|
|
|
| 29 |
pairs = pd.read_csv(pairs_file.name, sep="\t")
|
| 30 |
pairs.columns = ["protein1", "protein2"]
|
| 31 |
|
| 32 |
+
gr.Info("Predicting...")
|
| 33 |
results = []
|
| 34 |
progress = gr.Progress(track_tqdm=True)
|
| 35 |
for i, r in tqdm(pairs.iterrows(), total=len(pairs)):
|
| 36 |
+
gr.Info(f"[{i+1}/{len(pairs)}]")
|
| 37 |
prot1 = r["protein1"]
|
| 38 |
prot2 = r["protein2"]
|
| 39 |
seq1 = str(seqs[prot1].seq)
|
|
|
|
| 42 |
lm2 = lm_embed(seq2)
|
| 43 |
interaction = model.predict(lm1, lm2).item()
|
| 44 |
results.append([prot1, prot2, interaction])
|
| 45 |
+
progress((i, len(pairs)))
|
| 46 |
|
| 47 |
results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
|
| 48 |
|
| 49 |
+
file_path = f"/tmp/{run_id}.tsv"
|
| 50 |
+
with open(file_path, "w") as f:
|
| 51 |
+
results.to_csv(f, sep="\t", index=False, header = True)
|
| 52 |
+
|
| 53 |
+
return results, file_path
|
| 54 |
|
| 55 |
demo = gr.Interface(
|
| 56 |
fn=predict,
|
| 57 |
inputs = [
|
| 58 |
+
gr.Dropdown(label="Model", choices = ["D-SCRIPT", "Topsy-Turvy"], value = "Topsy-Turvy"),
|
| 59 |
gr.File(label="Sequences (.fasta)", file_types = [".fasta"]),
|
| 60 |
gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"])
|
| 61 |
],
|
| 62 |
outputs = [
|
| 63 |
+
gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction']),
|
| 64 |
+
gr.File(label="Download results", type="file")
|
| 65 |
]
|
| 66 |
)
|
| 67 |
|