cafierom committed on
Commit
9c28e37
·
verified ·
1 Parent(s): 5930ddc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -52
app.py CHANGED
@@ -656,24 +656,38 @@ def first_node(state: State) -> State:
656
  query_chembl = None
657
  state["query_chembl"] = query_chembl
658
 
659
- prompt = f'For the QUERY_TASK given below, determine if one or two of the tools descibed below \
660
- can complete the task. If so, reply with only the tool names followed by "#". If two tools \
661
- are required, reply with both tool names separated by a comma and followed by "#". \
662
- If the tools cannot complete the task, reply with "None #".\n \
663
- QUERY_TASK: {query_task}.\n \
664
- Tools: \n \
665
- uniprot_tool: this tool takes in the user requested protein and searches UNIPROT for matches. \
666
- It returns a string containing the protein ID, gene name, organism, and protein name.\n \
667
- list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
668
- get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
669
- pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
670
- protein, as well as other information such as ligands in the structure. \
671
- find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
672
- predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
673
- Uses the LightGBM model. \n \
674
- gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list \
675
- of novel molecules generated by the GPT. \
676
- '
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
  res = chat_model.invoke(prompt)
678
 
679
  tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
@@ -719,33 +733,48 @@ def retry_node(state: State) -> State:
719
  query_pdb = state["query_pdb"]
720
  query_smiles = state["query_smiles"]
721
 
722
- prompt = f'You were previously given the QUERY_TASK below, and asked to determine if one \
723
- or two of the tools described below could complete the task. The tool choices did not succeed. \
724
- Please re-examine the tool choices and determine if one or two of the tools described below \
725
- can complete the task. If so, reply with only the tool names followed by "#". If two tools \
726
- are required, reply with both tool names separated by a comma and followed by "#". \
727
- If the tools cannot complete the task, reply with "None #".\n \
728
- The information provided by the user is:\n \
729
- QUERY_PROTEIN: {query_protein}.\n \
730
- QUERY_UP_ID: {query_up_id}.\n \
731
- QUERY_CHEMBL: {query_chembl}.\n \
732
- QUERY_PDB: {query_pdb}.\n \
733
- QUERY_SMILES: {query_smiles}.\n \
734
- The task is: \
735
- QUERY_TASK: {query_task}.\n \
736
- Tool options: \n \
737
- uniprot_tool: this tool takes in the user requested protein and searches UNIPROT for matches. \
738
- It returns a string containing the protein ID, gene name, organism, and protein name.\n \
739
- list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
740
- get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
741
- pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
742
- protein, as well as other information such as ligands in the structure. \n \
743
- find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
744
- predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
745
- Uses the LightGBM model. \n \
746
- gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list \
747
- of novel molecules generated by the GPT. \
748
- '
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749
 
750
  res = chat_model.invoke(prompt)
751
 
@@ -978,13 +1007,19 @@ graph = builder.compile()
978
  @spaces.GPU
979
  def ProteinAgent(task, protein, up_id, chembl_id, pdb_id, smiles):
980
  '''
981
- This Agent can perform several protein-related tasks. It can find UNIPROT IDs for a protein, or, given
982
- a UNIPROT ID it can find Chembl IDs that match. It can find numbers of and lists of bioactive molecules
983
- based on a Chembl ID. It can query the protein databank to find PDB IDs matching a protein name and return
984
- the IDs and titles. It can find a particular PDB ID and report information such as how many chains it contains,
985
- the sequence, and any small molecules or ligands bound in the structure. It can predict the IC50 value of a molecule
986
- based on a Chembl dataset using the LightGBM model. It can generate novel molecules using a finetuned GPT based on
987
- a Chembl dataset.
 
 
 
 
 
 
988
 
989
  Args:
990
  task: the task to carry out
 
656
  query_chembl = None
657
  state["query_chembl"] = query_chembl
658
 
659
+ prompt = f'''
660
+ # For the QUERY_TASK given below, determine if one or two of the tools descibed below
661
+ can complete the task. If so, reply with only the tool names followed by "#". If two tools
662
+ are required, reply with both tool names separated by a comma and followed by "#".
663
+ If the tools cannot complete the task, reply with "None #".
664
+
665
+ ## QUERY_TASK: {query_task}.
666
+
667
+ ## Tools:
668
+
669
+ - uniprot_tool: this tool takes in the user requested protein and searches UNIPROT for matches.
670
+
671
+ - It returns a string containing the uniprot protein ID, gene name, organism, and protein name.
672
+
673
+ - list_bioactives_tool: Accepts a given UNIPROT ID and searches for Chembl IDs and bioactive molecules.
674
+ Returns Chembl IDs and numbers of bioactive molecules.
675
+
676
+ - get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID. Requires a
677
+ chembl ID, so the list_bioactives_tool should be called before this tool.
678
+
679
+ - pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the
680
+ protein, as well as other information such as ligands in the structure.
681
+
682
+ - find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.
683
+
684
+ - predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided
685
+ Uses the LightGBM model for prediction. Requires a Chembl dataset, so the get_bioactives_tool should be called before this tool.
686
+
687
+ - gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list
688
+ of novel molecules generated by the GPT and an image of the molecules. Requires a Chembl dataset, so the get_bioactives_tool
689
+ should be called before this tool.
690
+ '''
691
  res = chat_model.invoke(prompt)
692
 
693
  tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
 
733
  query_pdb = state["query_pdb"]
734
  query_smiles = state["query_smiles"]
735
 
736
+ prompt = f'''
737
+ # You were previously given the QUERY_TASK below, and asked to determine if one
738
+ or two of the tools described below could complete the task. The tool choices did not succeed.
739
+ Please re-examine the tool choices and determine if one or two of the tools described below
740
+ can complete the task. If so, reply with only the tool names followed by "#". If two tools
741
+ are required, reply with both tool names separated by a comma and followed by "#".
742
+ If the tools cannot complete the task, reply with "None #".
743
+
744
+ ## The information provided by the user is:
745
+ - QUERY_PROTEIN: {query_protein}.
746
+ - QUERY_UP_ID: {query_up_id}.
747
+ - QUERY_CHEMBL: {query_chembl}.
748
+ - QUERY_PDB: {query_pdb}.
749
+ - QUERY_SMILES: {query_smiles}.
750
+
751
+ ## The task is:
752
+ - QUERY_TASK: {query_task}.
753
+
754
+ ## Tools:
755
+
756
+ - uniprot_tool: this tool takes in the user requested protein and searches UNIPROT for matches.
757
+
758
+ - It returns a string containing the uniprot protein ID, gene name, organism, and protein name.
759
+
760
+ - list_bioactives_tool: Accepts a given UNIPROT ID and searches for Chembl IDs and bioactive molecules.
761
+ Returns Chembl IDs and numbers of bioactive molecules.
762
+
763
+ - get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID. Requires a
764
+ chembl ID, so the list_bioactives_tool should be called before this tool.
765
+
766
+ - pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the
767
+ protein, as well as other information such as ligands in the structure.
768
+
769
+ - find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.
770
+
771
+ - predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided
772
+ Uses the LightGBM model for prediction. Requires a Chembl dataset, so the get_bioactives_tool should be called before this tool.
773
+
774
+ - gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list
775
+ of novel molecules generated by the GPT and an image of the molecules. Requires a Chembl dataset, so the get_bioactives_tool
776
+ should be called before this tool.
777
+ '''
778
 
779
  res = chat_model.invoke(prompt)
780
 
 
1007
  @spaces.GPU
1008
  def ProteinAgent(task, protein, up_id, chembl_id, pdb_id, smiles):
1009
  '''
1010
+ This Agent can perform several protein-related tasks.
1011
+ 1. It can find UNIPROT IDs for a protein.
1012
+ 2. Given a UNIPROT ID, it can find Chembl IDs that match.
1013
+ 3. It can find numbers of and lists of bioactive molecules based on a Chembl ID.
1014
+ 4. It can query the protein databank to find PDB IDs matching a protein name and return the IDs and titles.
1015
+ 5. It can find a particular PDB ID and report information such as how many chains it contains,
1016
+ the sequence, and any small molecules or ligands bound in the structure.
1017
+ 6. It can predict the IC50 value of a molecule based on a Chembl dataset using the LightGBM model.
1018
+ 7. It can generate novel molecules using a finetuned GPT based on a Chembl dataset.
1019
+
1020
+ If Task 6 or 7 are to be called, a chembl dataset is needed. If a Chembl ID is not provided, then task 2 should be called
1021
+ first to find chembl IDs, then task 3 should be called to collect the dataset based on the ID. If a chembl ID is provided,
1022
+ then task 3 should be called to collect the chembl dataset.
1023
 
1024
  Args:
1025
  task: the task to carry out