Spaces:
Running
Running
roni committed on
Commit ·
1694358
1
Parent(s): 5e7a3eb
using metadata instead of fetching it from the internet
Browse files
- app.py +8 -32
- concurrency.py +0 -22
- protein_viz.py +0 -56
app.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
-
from concurrency import execute_multithread
|
| 4 |
from get_index import get_engines
|
| 5 |
-
from protein_viz import get_gene_name, get_protein_name, render_html
|
| 6 |
|
| 7 |
index_repo = "ronig/siamese_protein_index"
|
| 8 |
model_repo = "ronig/protein_search_engine"
|
|
@@ -15,6 +14,7 @@ You can use it to search the full [PDB](https://www.rcsb.org/) database or in a
|
|
| 15 |
"""
|
| 16 |
max_results = 100
|
| 17 |
|
|
|
|
| 18 |
def search_and_display(seq, n_res, index_selection):
|
| 19 |
n_res = int(limit_n_results(n_res))
|
| 20 |
engine = engines[index_selection]
|
|
@@ -47,48 +47,24 @@ def update_dropdown_menu(search_res):
|
|
| 47 |
|
| 48 |
def format_search_results(raw_search_results):
|
| 49 |
formatted_search_results = {}
|
| 50 |
-
for key, value in execute_multithread(
|
| 51 |
-
func=format_search_result,
|
| 52 |
-
inputs=({"raw_result": res} for res in raw_search_results),
|
| 53 |
-
n_workers=len(raw_search_results),
|
| 54 |
-
):
|
| 55 |
formatted_search_results[key] = value
|
| 56 |
return formatted_search_results
|
| 57 |
|
| 58 |
|
| 59 |
-
def format_search_result(raw_result):
|
| 60 |
-
is_pdb = "pdb_name" in raw_result
|
| 61 |
-
if is_pdb:
|
| 62 |
-
key, value = parse_pdb_search_result(raw_result)
|
| 63 |
-
else:
|
| 64 |
-
key, value = parse_fasta_search_result(raw_result)
|
| 65 |
-
return key, value
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def parse_fasta_search_result(raw_result):
|
| 69 |
-
gene = parse_gene_from_fasta_entry(raw_result["description"])
|
| 70 |
-
key = f"Gene: {gene}"
|
| 71 |
-
value = raw_result["score"]
|
| 72 |
-
return key, value
|
| 73 |
-
|
| 74 |
-
|
| 75 |
def parse_pdb_search_result(raw_result):
|
| 76 |
prot = raw_result["pdb_name"]
|
| 77 |
chain = raw_result["chain_id"]
|
| 78 |
value = raw_result["score"]
|
| 79 |
-
|
|
|
|
| 80 |
key = f"PDB: {prot}.{chain}"
|
| 81 |
-
if
|
| 82 |
-
key += f" |
|
| 83 |
return key, value
|
| 84 |
|
| 85 |
|
| 86 |
-
def parse_gene_from_fasta_entry(description):
|
| 87 |
-
after = description.split("GN=")[1]
|
| 88 |
-
gene = after.split(" ")[0]
|
| 89 |
-
return gene
|
| 90 |
-
|
| 91 |
-
|
| 92 |
def switch_viz(new_choice):
|
| 93 |
if new_choice is None:
|
| 94 |
html = ""
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
|
|
|
| 3 |
from get_index import get_engines
|
| 4 |
+
from protein_viz import get_protein_name, render_html
|
| 5 |
|
| 6 |
index_repo = "ronig/siamese_protein_index"
|
| 7 |
model_repo = "ronig/protein_search_engine"
|
|
|
|
| 14 |
"""
|
| 15 |
max_results = 100
|
| 16 |
|
| 17 |
+
|
| 18 |
def search_and_display(seq, n_res, index_selection):
|
| 19 |
n_res = int(limit_n_results(n_res))
|
| 20 |
engine = engines[index_selection]
|
|
|
|
| 47 |
|
| 48 |
def format_search_results(raw_search_results):
|
| 49 |
formatted_search_results = {}
|
| 50 |
+
for res in raw_search_results:
|
| 51 |
+
key, value = parse_pdb_search_result(res)
|
|
|
|
|
|
|
|
|
|
| 52 |
formatted_search_results[key] = value
|
| 53 |
return formatted_search_results
|
| 54 |
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def parse_pdb_search_result(raw_result):
|
| 57 |
prot = raw_result["pdb_name"]
|
| 58 |
chain = raw_result["chain_id"]
|
| 59 |
value = raw_result["score"]
|
| 60 |
+
gene_names = raw_result["genes"]
|
| 61 |
+
species = raw_result["organism"]
|
| 62 |
key = f"PDB: {prot}.{chain}"
|
| 63 |
+
if gene_names is not None:
|
| 64 |
+
key += f" | Genes: {gene_names} | Organism: {species}"
|
| 65 |
return key, value
|
| 66 |
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
def switch_viz(new_choice):
|
| 69 |
if new_choice is None:
|
| 70 |
html = ""
|
concurrency.py
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
import concurrent.futures
|
| 2 |
-
import itertools
|
| 3 |
-
from typing import Callable, Iterable
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def execute_multithread(func: Callable, inputs: Iterable, n_workers):
|
| 7 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
|
| 8 |
-
futures = {
|
| 9 |
-
executor.submit(func, **task)
|
| 10 |
-
for task in itertools.islice(inputs, n_workers)
|
| 11 |
-
}
|
| 12 |
-
|
| 13 |
-
while futures:
|
| 14 |
-
done, futures = concurrent.futures.wait(
|
| 15 |
-
futures, return_when=concurrent.futures.FIRST_COMPLETED
|
| 16 |
-
)
|
| 17 |
-
|
| 18 |
-
for future in done:
|
| 19 |
-
yield future.result()
|
| 20 |
-
|
| 21 |
-
for task in itertools.islice(inputs, len(done)):
|
| 22 |
-
futures.add(executor.submit(func, **task))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
protein_viz.py
CHANGED
|
@@ -30,62 +30,6 @@ def render_html(pdb_id, chain):
|
|
| 30 |
return iframe
|
| 31 |
|
| 32 |
|
| 33 |
-
def get_gene_name(pdb_id, chain_id):
|
| 34 |
-
entity_id = get_polymer_entity_id(chain_id, pdb_id)
|
| 35 |
-
gene_name, species = get_gene_name_from_polymer_entity(
|
| 36 |
-
pdb_id=pdb_id, entity_id=entity_id
|
| 37 |
-
)
|
| 38 |
-
return gene_name, species
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def get_polymer_entity_id(chain_id, pdb_id):
|
| 42 |
-
url = (
|
| 43 |
-
f"https://data.rcsb.org/rest/v1/core/"
|
| 44 |
-
f"polymer_entity_instance/{pdb_id}/{chain_id}"
|
| 45 |
-
)
|
| 46 |
-
response = requests.get(url, timeout=1)
|
| 47 |
-
if response.ok:
|
| 48 |
-
res_data = response.json()
|
| 49 |
-
entity_id = int(
|
| 50 |
-
res_data["rcsb_polymer_entity_instance_container_identifiers"]["entity_id"]
|
| 51 |
-
)
|
| 52 |
-
else:
|
| 53 |
-
entity_id = None
|
| 54 |
-
return entity_id
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def get_gene_name_from_polymer_entity(pdb_id, entity_id):
|
| 58 |
-
gene_name, species = None, None
|
| 59 |
-
if entity_id:
|
| 60 |
-
url = f"https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb_id}/{entity_id}"
|
| 61 |
-
response = requests.get(url, timeout=1)
|
| 62 |
-
if response.ok:
|
| 63 |
-
res_data = response.json()
|
| 64 |
-
uniprot_id = _extract_uniprot_id(res_data)
|
| 65 |
-
source_organism = res_data.get("rcsb_entity_source_organism", [{}])[0]
|
| 66 |
-
gene_name = source_organism.get("rcsb_gene_name", [{}])[0].get("value")
|
| 67 |
-
species = source_organism.get("scientific_name")
|
| 68 |
-
if gene_name is None and uniprot_id is not None:
|
| 69 |
-
gene_name = get_gene_name_from_uniprot(uniprot_id)
|
| 70 |
-
return gene_name, species
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def get_gene_name_from_uniprot(uniprot_id):
|
| 74 |
-
gene_name = None
|
| 75 |
-
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}"
|
| 76 |
-
response = requests.get(url, timeout=1.0)
|
| 77 |
-
if response.ok:
|
| 78 |
-
uniprot_data = response.json()
|
| 79 |
-
gene_name = uniprot_data.get("genes", [{}])[0].get("geneName", {}).get("value")
|
| 80 |
-
return gene_name
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def _extract_uniprot_id(res_data):
|
| 84 |
-
ids = res_data.get("rcsb_polymer_entity_container_identifiers", {})
|
| 85 |
-
uniprot_id = ids.get("uniprot_ids", [None])[0]
|
| 86 |
-
return uniprot_id
|
| 87 |
-
|
| 88 |
-
|
| 89 |
def get_protein_name(pdb_id: str):
|
| 90 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
| 91 |
response = requests.get(url, timeout=1)
|
|
|
|
| 30 |
return iframe
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def get_protein_name(pdb_id: str):
|
| 34 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
| 35 |
response = requests.get(url, timeout=1)
|