cafierom committed on
Commit
300aa0f
·
verified ·
1 Parent(s): 11b17b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -3
app.py CHANGED
@@ -12,7 +12,6 @@ from langchain_huggingface import ChatHuggingFace
12
  from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
13
  from langchain_core.runnables import chain
14
  from uuid import uuid4
15
- import re
16
  import matplotlib.pyplot as plt
17
 
18
  from rdkit import Chem
@@ -32,6 +31,9 @@ from chembl_webresource_client.new_client import new_client
32
  from tqdm.auto import tqdm
33
  import requests
34
  import spaces
 
 
 
35
 
36
  device = "cuda" if torch.cuda.is_available() else "cpu"
37
 
@@ -57,6 +59,7 @@ class State(TypedDict):
57
  which_tool: int
58
  props_string: str
59
  loop_again: str
 
60
  #(Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"],
61
  # Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"])
62
 
@@ -394,6 +397,47 @@ def pdb_node(state: State) -> State:
394
  state["which_tool"] += 1
395
  return state
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def first_node(state: State) -> State:
398
  '''
399
  The first node of the agent. This node receives the input and asks the LLM
@@ -427,6 +471,7 @@ def first_node(state: State) -> State:
427
  props_string = ""
428
  state["props_string"] = props_string
429
  state["loop_again"] = None
 
430
 
431
  raw_input = state["messages"][-1].content
432
  parts = raw_input.split(',')
@@ -471,7 +516,8 @@ It returns a string containing the protein ID, gene name, organism, and protein
471
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
472
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
473
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
474
- protein, as well as other information such as ligands in the structure.\
 
475
  '
476
  res = chat_model.invoke(prompt)
477
 
@@ -538,7 +584,8 @@ It returns a string containing the protein ID, gene name, organism, and protein
538
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
539
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
540
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
541
- protein, as well as other information such as ligands in the structure. \n'
 
542
 
543
  res = chat_model.invoke(prompt)
544
 
@@ -712,6 +759,7 @@ builder.add_node("uniprot_node", uniprot_node)
712
  builder.add_node("listbioactives_node", listbioactives_node)
713
  builder.add_node("getbioactives_node", getbioactives_node)
714
  builder.add_node("pdb_node", pdb_node)
 
715
 
716
  builder.add_node("loop_node", loop_node)
717
  builder.add_node("parser_node", parser_node)
@@ -724,6 +772,7 @@ builder.add_conditional_edges("first_node", get_chemtool, {
724
  "list_bioactives_tool": "listbioactives_node",
725
  "get_bioactives_tool": "getbioactives_node",
726
  "pdb_tool": "pdb_node",
 
727
  None: "parser_node"})
728
 
729
  builder.add_conditional_edges("retry_node", get_chemtool, {
@@ -731,18 +780,21 @@ builder.add_conditional_edges("retry_node", get_chemtool, {
731
  "list_bioactives_tool": "listbioactives_node",
732
  "get_bioactives_tool": "getbioactives_node",
733
  "pdb_tool": "pdb_node",
 
734
  None: "parser_node"})
735
 
736
  builder.add_edge("uniprot_node", "loop_node")
737
  builder.add_edge("listbioactives_node", "loop_node")
738
  builder.add_edge("getbioactives_node", "loop_node")
739
  builder.add_edge("pdb_node", "loop_node")
 
740
 
741
  builder.add_conditional_edges("loop_node", get_chemtool, {
742
  "uniprot_tool": "uniprot_node",
743
  "list_bioactives_tool": "listbioactives_node",
744
  "get_bioactives_tool": "getbioactives_node",
745
  "pdb_tool": "pdb_node",
 
746
  None: "parser_node"})
747
 
748
  builder.add_conditional_edges("parser_node", loop_or_not, {
@@ -793,6 +845,7 @@ with gr.Blocks(fill_height=True) as forest:
793
  - calls Chembl to find hits for a given uniprot id and reports number of bioactive molecules in the hit
794
  - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
795
  - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
 
796
  ''')
797
 
798
  with gr.Row():
 
12
  from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
13
  from langchain_core.runnables import chain
14
  from uuid import uuid4
 
15
  import matplotlib.pyplot as plt
16
 
17
  from rdkit import Chem
 
31
  from tqdm.auto import tqdm
32
  import requests
33
  import spaces
34
+ from rcsbapi.search import TextQuery
35
+ import requests
36
+ import itertools
37
 
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
 
 
59
  which_tool: int
60
  props_string: str
61
  loop_again: str
62
+ which_pdbs: int
63
  #(Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"],
64
  # Literal["lipinski_tool", "substitution_tool", "pharm_feature_tool"])
65
 
 
397
  state["which_tool"] += 1
398
  return state
399
 
400
def find_node(state: State) -> State:
    '''
    Accepts a protein name and searches the Protein Data Bank for PDB IDs
    that match, along with the entry titles.

    Hits are reported ten at a time. state["which_pdbs"] is the offset of the
    first hit to report; it is advanced by 10 after a successful lookup so a
    later call reports the next ten hits.

      Args:
        state: the agent state. Reads "query_protein" (the protein to query),
               "which_pdbs" (offset into the search hits) and "props_string".
      Returns:
        state: the same state object, with the PDB IDs and titles appended to
               "props_string" and "which_tool" incremented. If the search or
               any title request fails, nothing is appended and "which_pdbs"
               is left unchanged.
    '''
    test_protein = state["query_protein"].strip()
    which_pdbs = state["which_pdbs"]
    current_props_string = state["props_string"]

    print(f"find tool using {test_protein}")
    print('===================================================')

    try:
        query = TextQuery(value=test_protein)
        results = query()

        # islice consumes the result iterator lazily, so only the hits up to
        # the end of the requested page are pulled from the search service.
        page = itertools.islice(results, which_pdbs, which_pdbs + 10)

        entries = []
        for pdb in page:
            # A timeout keeps a stalled RCSB request from hanging the node.
            response = requests.get(
                f"https://data.rcsb.org/rest/v1/core/entry/{pdb}", timeout=30)
            title = response.json()['struct']['title']
            entries.append(f'PDB ID: {pdb}, with title: {title} \n')

        # Report the number of entries actually found; the last page of hits
        # can hold fewer than ten.
        pdb_string = f'{len(entries)} PDBs that match the protein {test_protein} are: \n'
        pdb_string += ''.join(entries)
        state["which_pdbs"] = which_pdbs + 10
    except Exception as err:
        # Best effort: a failed lookup contributes nothing to the properties
        # string, but the reason is logged rather than silently dropped.
        print(f"find tool failed for {test_protein}: {err}")
        pdb_string = ''

    current_props_string += pdb_string
    state["props_string"] = current_props_string
    state["which_tool"] += 1
    return state
440
+
441
  def first_node(state: State) -> State:
442
  '''
443
  The first node of the agent. This node receives the input and asks the LLM
 
471
  props_string = ""
472
  state["props_string"] = props_string
473
  state["loop_again"] = None
474
+ state['which_pdbs'] = 0
475
 
476
  raw_input = state["messages"][-1].content
477
  parts = raw_input.split(',')
 
516
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
517
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
518
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
519
+ protein, as well as other information such as ligands in the structure. \
520
+ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
521
  '
522
  res = chat_model.invoke(prompt)
523
 
 
584
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
585
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
586
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
587
+ protein, as well as other information such as ligands in the structure. \
588
+ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.'
589
 
590
  res = chat_model.invoke(prompt)
591
 
 
759
  builder.add_node("listbioactives_node", listbioactives_node)
760
  builder.add_node("getbioactives_node", getbioactives_node)
761
  builder.add_node("pdb_node", pdb_node)
762
+ builder.add_node("find_node", find_node)
763
 
764
  builder.add_node("loop_node", loop_node)
765
  builder.add_node("parser_node", parser_node)
 
772
  "list_bioactives_tool": "listbioactives_node",
773
  "get_bioactives_tool": "getbioactives_node",
774
  "pdb_tool": "pdb_node",
775
+ "find_tool": "find_node",
776
  None: "parser_node"})
777
 
778
  builder.add_conditional_edges("retry_node", get_chemtool, {
 
780
  "list_bioactives_tool": "listbioactives_node",
781
  "get_bioactives_tool": "getbioactives_node",
782
  "pdb_tool": "pdb_node",
783
+ "find_tool": "find_node",
784
  None: "parser_node"})
785
 
786
  builder.add_edge("uniprot_node", "loop_node")
787
  builder.add_edge("listbioactives_node", "loop_node")
788
  builder.add_edge("getbioactives_node", "loop_node")
789
  builder.add_edge("pdb_node", "loop_node")
790
# Route find_node back to loop_node, like the other tool nodes.
builder.add_edge("find_node", "loop_node")
791
 
792
  builder.add_conditional_edges("loop_node", get_chemtool, {
793
  "uniprot_tool": "uniprot_node",
794
  "list_bioactives_tool": "listbioactives_node",
795
  "get_bioactives_tool": "getbioactives_node",
796
  "pdb_tool": "pdb_node",
797
+ "find_tool": "find_node",
798
  None: "parser_node"})
799
 
800
  builder.add_conditional_edges("parser_node", loop_or_not, {
 
845
  - calls Chembl to find hits for a given uniprot id and reports number of bioactive molecules in the hit
846
  - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
847
  - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
848
+ - calls PDB to find PDB IDs that match a protein name.
849
  ''')
850
 
851
  with gr.Row():