Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,6 @@ from langchain_huggingface import ChatHuggingFace
|
|
| 12 |
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
|
| 13 |
from langchain_core.runnables import chain
|
| 14 |
from uuid import uuid4
|
| 15 |
-
import re
|
| 16 |
import matplotlib.pyplot as plt
|
| 17 |
|
| 18 |
from rdkit import Chem
|
|
@@ -32,6 +31,9 @@ from chembl_webresource_client.new_client import new_client
|
|
| 32 |
from tqdm.auto import tqdm
|
| 33 |
import requests
|
| 34 |
import spaces
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 37 |
|
|
@@ -57,6 +59,7 @@ class State(TypedDict):
|
|
| 57 |
which_tool: int
|
| 58 |
props_string: str
|
| 59 |
loop_again: str
|
|
|
|
| 60 |
#(Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"],
|
| 61 |
# Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"])
|
| 62 |
|
|
@@ -394,6 +397,47 @@ def pdb_node(state: State) -> State:
|
|
| 394 |
state["which_tool"] += 1
|
| 395 |
return state
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def first_node(state: State) -> State:
|
| 398 |
'''
|
| 399 |
The first node of the agent. This node receives the input and asks the LLM
|
|
@@ -427,6 +471,7 @@ def first_node(state: State) -> State:
|
|
| 427 |
props_string = ""
|
| 428 |
state["props_string"] = props_string
|
| 429 |
state["loop_again"] = None
|
|
|
|
| 430 |
|
| 431 |
raw_input = state["messages"][-1].content
|
| 432 |
parts = raw_input.split(',')
|
|
@@ -471,7 +516,8 @@ It returns a string containing the protein ID, gene name, organism, and protein
|
|
| 471 |
list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
|
| 472 |
get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
|
| 473 |
pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
|
| 474 |
-
protein, as well as other information such as ligands in the structure
|
|
|
|
| 475 |
'
|
| 476 |
res = chat_model.invoke(prompt)
|
| 477 |
|
|
@@ -538,7 +584,8 @@ It returns a string containing the protein ID, gene name, organism, and protein
|
|
| 538 |
list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
|
| 539 |
get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
|
| 540 |
pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
|
| 541 |
-
protein, as well as other information such as ligands in the structure. \
|
|
|
|
| 542 |
|
| 543 |
res = chat_model.invoke(prompt)
|
| 544 |
|
|
@@ -712,6 +759,7 @@ builder.add_node("uniprot_node", uniprot_node)
|
|
| 712 |
builder.add_node("listbioactives_node", listbioactives_node)
|
| 713 |
builder.add_node("getbioactives_node", getbioactives_node)
|
| 714 |
builder.add_node("pdb_node", pdb_node)
|
|
|
|
| 715 |
|
| 716 |
builder.add_node("loop_node", loop_node)
|
| 717 |
builder.add_node("parser_node", parser_node)
|
|
@@ -724,6 +772,7 @@ builder.add_conditional_edges("first_node", get_chemtool, {
|
|
| 724 |
"list_bioactives_tool": "listbioactives_node",
|
| 725 |
"get_bioactives_tool": "getbioactives_node",
|
| 726 |
"pdb_tool": "pdb_node",
|
|
|
|
| 727 |
None: "parser_node"})
|
| 728 |
|
| 729 |
builder.add_conditional_edges("retry_node", get_chemtool, {
|
|
@@ -731,18 +780,21 @@ builder.add_conditional_edges("retry_node", get_chemtool, {
|
|
| 731 |
"list_bioactives_tool": "listbioactives_node",
|
| 732 |
"get_bioactives_tool": "getbioactives_node",
|
| 733 |
"pdb_tool": "pdb_node",
|
|
|
|
| 734 |
None: "parser_node"})
|
| 735 |
|
| 736 |
builder.add_edge("uniprot_node", "loop_node")
|
| 737 |
builder.add_edge("listbioactives_node", "loop_node")
|
| 738 |
builder.add_edge("getbioactives_node", "loop_node")
|
| 739 |
builder.add_edge("pdb_node", "loop_node")
|
|
|
|
| 740 |
|
| 741 |
builder.add_conditional_edges("loop_node", get_chemtool, {
|
| 742 |
"uniprot_tool": "uniprot_node",
|
| 743 |
"list_bioactives_tool": "listbioactives_node",
|
| 744 |
"get_bioactives_tool": "getbioactives_node",
|
| 745 |
"pdb_tool": "pdb_node",
|
|
|
|
| 746 |
None: "parser_node"})
|
| 747 |
|
| 748 |
builder.add_conditional_edges("parser_node", loop_or_not, {
|
|
@@ -793,6 +845,7 @@ with gr.Blocks(fill_height=True) as forest:
|
|
| 793 |
- calls Chembl to find hits for a given uniprot id and reports number of bioactive molecules in the hit
|
| 794 |
- calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
|
| 795 |
- calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
|
|
|
|
| 796 |
''')
|
| 797 |
|
| 798 |
with gr.Row():
|
|
|
|
| 12 |
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
|
| 13 |
from langchain_core.runnables import chain
|
| 14 |
from uuid import uuid4
|
|
|
|
| 15 |
import matplotlib.pyplot as plt
|
| 16 |
|
| 17 |
from rdkit import Chem
|
|
|
|
| 31 |
from tqdm.auto import tqdm
|
| 32 |
import requests
|
| 33 |
import spaces
|
| 34 |
+
from rcsbapi.search import TextQuery
|
| 35 |
+
import requests
|
| 36 |
+
import itertools
|
| 37 |
|
| 38 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 39 |
|
|
|
|
| 59 |
which_tool: int
|
| 60 |
props_string: str
|
| 61 |
loop_again: str
|
| 62 |
+
which_pdbs: int
|
| 63 |
#(Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"],
|
| 64 |
# Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"])
|
| 65 |
|
|
|
|
| 397 |
state["which_tool"] += 1
|
| 398 |
return state
|
| 399 |
|
| 400 |
+
def find_node(state: State) -> State:
    '''
    Search the Protein Data Bank for entries that match a protein name and
    report their PDB IDs together with the entry titles.

    The protein name is read from state["query_protein"]. Results are paged:
    each call takes up to 10 hits starting at offset state["which_pdbs"], so
    calling the node again for the same query continues with the next 10.

      Args:
        state: the graph state; reads "query_protein", "which_pdbs",
               "props_string" and "which_tool".
      Returns:
        state: the same state with the PDB ID / title listing appended to
               "props_string", "which_pdbs" advanced by 10 when the lookup
               succeeded, and "which_tool" incremented by 1.
    '''
    page_size = 10
    test_protein = state["query_protein"].strip()
    which_pdbs = state["which_pdbs"]

    print(f"find tool using {test_protein}")
    print('===================================================')

    try:
        results = TextQuery(value=test_protein)()

        # islice accepts any iterable, so the search results can be paged
        # directly without wrapping them in a generator first.
        page = itertools.islice(results, which_pdbs, which_pdbs + page_size)

        entries = []
        for pdb in page:
            # A timeout keeps one stalled RCSB request from hanging the graph.
            data = requests.get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb}",
                timeout=30,
            ).json()
            title = data['struct']['title']
            entries.append(f'PDB ID: {pdb}, with title: {title} \n')

        # Report how many entries were actually found instead of always
        # claiming 10, which misleads the LLM when the page is short or empty.
        header = f'{len(entries)} PDBs that match the protein {test_protein} are: \n'
        pdb_string = header + ''.join(entries)
        state["which_pdbs"] = which_pdbs + page_size
    except Exception as err:
        # Best effort: a failed search or lookup contributes nothing to the
        # properties string, but the reason is logged rather than swallowed.
        # The paging offset is left unchanged so the same page can be retried.
        print(f"find tool failed for {test_protein}: {err}")
        pdb_string = ''

    state["props_string"] = state["props_string"] + pdb_string
    state["which_tool"] += 1
    return state
|
| 440 |
+
|
| 441 |
def first_node(state: State) -> State:
|
| 442 |
'''
|
| 443 |
The first node of the agent. This node receives the input and asks the LLM
|
|
|
|
| 471 |
props_string = ""
|
| 472 |
state["props_string"] = props_string
|
| 473 |
state["loop_again"] = None
|
| 474 |
+
state['which_pdbs'] = 0
|
| 475 |
|
| 476 |
raw_input = state["messages"][-1].content
|
| 477 |
parts = raw_input.split(',')
|
|
|
|
| 516 |
list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
|
| 517 |
get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
|
| 518 |
pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
|
| 519 |
+
protein, as well as other information such as ligands in the structure. \
|
| 520 |
+
find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
|
| 521 |
'
|
| 522 |
res = chat_model.invoke(prompt)
|
| 523 |
|
|
|
|
| 584 |
list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
|
| 585 |
get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
|
| 586 |
pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
|
| 587 |
+
protein, as well as other information such as ligands in the structure. \
|
| 588 |
+
find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.'
|
| 589 |
|
| 590 |
res = chat_model.invoke(prompt)
|
| 591 |
|
|
|
|
| 759 |
builder.add_node("listbioactives_node", listbioactives_node)
|
| 760 |
builder.add_node("getbioactives_node", getbioactives_node)
|
| 761 |
builder.add_node("pdb_node", pdb_node)
|
| 762 |
+
builder.add_node("find_node", find_node)
|
| 763 |
|
| 764 |
builder.add_node("loop_node", loop_node)
|
| 765 |
builder.add_node("parser_node", parser_node)
|
|
|
|
| 772 |
"list_bioactives_tool": "listbioactives_node",
|
| 773 |
"get_bioactives_tool": "getbioactives_node",
|
| 774 |
"pdb_tool": "pdb_node",
|
| 775 |
+
"find_tool": "find_node",
|
| 776 |
None: "parser_node"})
|
| 777 |
|
| 778 |
builder.add_conditional_edges("retry_node", get_chemtool, {
|
|
|
|
| 780 |
"list_bioactives_tool": "listbioactives_node",
|
| 781 |
"get_bioactives_tool": "getbioactives_node",
|
| 782 |
"pdb_tool": "pdb_node",
|
| 783 |
+
"find_tool": "find_node",
|
| 784 |
None: "parser_node"})
|
| 785 |
|
| 786 |
builder.add_edge("uniprot_node", "loop_node")
|
| 787 |
builder.add_edge("listbioactives_node", "loop_node")
|
| 788 |
builder.add_edge("getbioactives_node", "loop_node")
|
| 789 |
builder.add_edge("pdb_node", "loop_node")
|
| 790 |
+
# Route find_node back to loop_node, like the other tool nodes. The two node
# names are separate positional arguments, so they are separated by a comma
# (a colon here is a SyntaxError that prevents the module from importing).
builder.add_edge("find_node", "loop_node")
|
| 791 |
|
| 792 |
builder.add_conditional_edges("loop_node", get_chemtool, {
|
| 793 |
"uniprot_tool": "uniprot_node",
|
| 794 |
"list_bioactives_tool": "listbioactives_node",
|
| 795 |
"get_bioactives_tool": "getbioactives_node",
|
| 796 |
"pdb_tool": "pdb_node",
|
| 797 |
+
"find_tool": "find_node",
|
| 798 |
None: "parser_node"})
|
| 799 |
|
| 800 |
builder.add_conditional_edges("parser_node", loop_or_not, {
|
|
|
|
| 845 |
- calls Chembl to find hits for a given uniprot id and reports number of bioactive molecules in the hit
|
| 846 |
- calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
|
| 847 |
- calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
|
| 848 |
+
- calls PDB to find PDB IDs that match a protein name.
|
| 849 |
''')
|
| 850 |
|
| 851 |
with gr.Row():
|