Spaces:

cafierom
/

ProteinAgent

Running on Zero

App Files Files Community

cafierom committed on Nov 29, 2025

Commit

e8c9457

verified ·

1 Parent(s): 09f47f9

Update app.py

Browse files

Files changed (1) hide show

app.py +259 -89

app.py CHANGED Viewed

@@ -35,6 +35,15 @@ from rcsbapi.search import TextQuery
 import requests
 import itertools
 device = "cuda" if torch.cuda.is_available() else "cpu"
 hf = HuggingFacePipeline.from_model_id(
@@ -68,19 +77,18 @@ def uniprot_node(state: State) -> State:
   '''
     This tool takes in the user requested protein and searches UNIPROT for matches.
     It returns a string containing the protein ID, gene name, organism, and protein name.
       Args:
         query_protein: the name of the protein to search for.
       Returns:
         protein_string: a string containing the protein ID, gene name, organism, and protein name.
   '''
   print("UNIPROT tool")
   print('===================================================')
   protein_name = state["query_protein"]
   current_props_string = state["props_string"]
   try:
     url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
     response = requests.get(url).text
@@ -91,7 +99,7 @@ def uniprot_node(state: State) -> State:
     prot_df = pd.read_csv(f'{protein_name}_uniprot_ids.tsv', sep='\t')
     prot_human_df = prot_df[prot_df['Organism'] == "Homo sapiens (Human)"]
-    print(f"Found {len(prot_human_df)} Human proteins out of {len(prot_df)} total proteins")
     prot_ids = prot_df['Entry'].tolist()
     prot_ids_human = prot_human_df['Entry'].tolist()
@@ -100,14 +108,14 @@ def uniprot_node(state: State) -> State:
     genes_human = prot_human_df['Gene Names'].tolist()
     organisms = prot_df['Organism'].tolist()
     names = prot_df['Protein names'].tolist()
     names_human = prot_human_df['Protein names'].tolist()
     protein_string = ''
     for id, gene, organism, name in zip(prot_ids, genes, organisms, names):
       protein_string += f'Protein ID: {id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
   except:
     protein_string = 'No proteins found'
@@ -131,7 +139,7 @@ def get_qed(smiles):
 def listbioactives_node(state: State) -> State:
   '''
-    Accepts a UNIPROT ID and searches for bioactive molecules
       Args:
         up_id: the UNIPROT ID of the protein to search for.
       Returns:
@@ -145,14 +153,14 @@ def listbioactives_node(state: State) -> State:
   targets = new_client.target
   bioact = new_client.activity
   try:
     target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
     target_info = pd.DataFrame.from_records(target_info)
     print(target_info)
     if len(target_info) > 0:
       print(f"Found info for Uniprot ID: {up_id}")
     chembl_ids = target_info['target_chembl_id'].tolist()
     chembl_ids = list(set(chembl_ids))
@@ -171,7 +179,7 @@ def listbioactives_node(state: State) -> State:
       len_this_bioacts = len(bioact_chosen)
       len_all_bioacts.append(len_this_bioacts)
       this_bioact_string = f"Lenth of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}"
       bioact_string += this_bioact_string + '\n'
   except:
     bioact_string = 'No bioactives found\n'
@@ -195,68 +203,78 @@ def getbioactives_node(state: State) -> State:
   chembl_id = state["query_chembl"].strip()
   current_props_string = state["props_string"]
-  compounds = new_client.molecule
-  bioact = new_client.activity
-  bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
-      "molecule_chembl_id",
-      "type",
-      "standard_units",
-      "relation",
-      "standard_value",
-  )
-  chembl_ids = []
-  ic50s = []
-  for record in bioact_chosen:
-      if record["standard_units"] == 'nM':
-          chembl_ids.append(record["molecule_chembl_id"])
-          ic50s.append(float(record["standard_value"]))
-  bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
-  bioact_df = pd.DataFrame.from_dict(bioact_dict)
-  bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
-  print(f"Number of records: {len(bioact_df)}")
-  print(bioact_df.shape)
-  compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
-      "molecule_chembl_id",
-      "molecule_structures"
-  )
-  cids_list = []
-  smiles_list = []
-  for record in compounds_provider:
-      cid = record['molecule_chembl_id']
-      cids_list.append(cid)
-      if record['molecule_structures']:
-          if record['molecule_structures']['canonical_smiles']:
-              smile = record['molecule_structures']['canonical_smiles']
-          else:
-              print("No canonical smiles")
-              smile = None
-      else:
-          print('no structures')
-          smile = None
-      smiles_list.append(smile)
-  new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
-  new_df = pd.DataFrame.from_dict(new_dict)
-  total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
-  print(f"number of records: {len(total_bioact_df)}")
-  total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
-  print(f"number of records after removing duplicates: {len(total_bioact_df)}")
-  total_bioact_df.dropna(axis=0, how='any', inplace=True)
-  total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
-  print(f"number of records after dropping Null values: {len(total_bioact_df)}")
-  total_bioact_df.sort_values(by=["IC50s"],inplace=True)
   limit = 50
   if len(total_bioact_df) > limit:
@@ -266,12 +284,149 @@ def getbioactives_node(state: State) -> State:
   for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
     smile = smile.replace('#','~')
     bioact_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
   current_props_string += bioact_string
   state["props_string"] = current_props_string
   state["which_tool"] += 1
   return state
 def get_protein_from_pdb(pdb_id):
   url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
   r = requests.get(url)
@@ -344,12 +499,11 @@ def three_to_one(three_seq):
 def pdb_node(state: State) -> State:
   '''
     Accepts a PDB ID and queries the protein databank for the sequence of the protein, as well as other
-    information such as ligands.
       Args:
         pdb: the PDB ID to query
       Returns:
-        props_string: a string of the
   '''
   test_pdb = state["query_pdb"].strip()
   current_props_string = state["props_string"]
@@ -399,12 +553,11 @@ def pdb_node(state: State) -> State:
 def find_node(state: State) -> State:
   '''
-    Accepts a protein name and searches the protein databank for PDB IDs that match along with the entry titles.
       Args:
         protein_name: the protein to query
       Returns:
-        props_string: a string of the
   '''
   test_protein = state["query_protein"].strip()
   which_pdbs = state["which_pdbs"]
@@ -431,7 +584,7 @@ def find_node(state: State) -> State:
     state["which_pdbs"] = which_pdbs+10
   except:
     pdb_string = ''
   current_props_string += pdb_string
   state["props_string"] = current_props_string
@@ -442,7 +595,6 @@ def first_node(state: State) -> State:
   '''
     The first node of the agent. This node receives the input and asks the LLM
     to determine which is the best tool to use to answer the QUERY TASK.
       Input: the initial prompt from the user. Should contain one or more of the following:
              query_protein: the name of the protein to search for.
              query_up_id: the Uniprot ID of the protein to search for.
@@ -453,7 +605,6 @@ def first_node(state: State) -> State:
              the value should be separated from the name by a ':' and each field should
              be separated from the previous one by a ','.
              All of these values are saved to the state
       Output: the tool choice
   '''
   query_smiles = None
@@ -518,6 +669,10 @@ get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES
 pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
 protein, as well as other information such as ligands in the structure. \
 find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
 '
   res = chat_model.invoke(prompt)
@@ -543,7 +698,7 @@ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning
       tool_choice = (tool1, tool2)
   else:
     tool_choice = (None, None)
   state["tool_choice"] = tool_choice
   state["which_tool"] = 0
   print(f"The chosen tools are: {tool_choice}")
@@ -552,7 +707,7 @@ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning
 def retry_node(state: State) -> State:
   '''
-    If the previous loop of the agent does not get enough information from the
     tools to answer the query, this node is called to retry the previous loop.
       Input: the previous loop of the agent.
       Output: the tool choice
@@ -584,14 +739,19 @@ It returns a string containing the protein ID, gene name, organism, and protein
 list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
 get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
 pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
-protein, as well as other information such as ligands in the structure. \
-find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.'
   res = chat_model.invoke(prompt)
   tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
   tool_choices = tool_choices.split(',')
   if len(tool_choices) == 1:
     tool1 = tool_choices[0].strip()
     if tool1.lower() == 'none':
@@ -622,7 +782,6 @@ def loop_node(state: State) -> State:
   '''
     This node accepts the tool returns and decides if it needs to call another
     tool or go on to the parser node.
       Input: the tool returns.
       Output: the next node to call.
   '''
@@ -633,7 +792,6 @@ def parser_node(state: State) -> State:
     This is the third node in the agent. It receives the output from the tool,
     puts it into a prompt as CONTEXT, and asks the LLM to answer the original
     query.
       Input: the output from the tool.
       Output: the answer to the original query.
   '''
@@ -684,7 +842,6 @@ def reflect_node(state: State) -> State:
   '''
     This is the fourth node of the agent. It receives the LLM's previous answer and
     tries to improve it.
       Input: the LLM's last answer.
       Output: the improved answer.
   '''
@@ -760,6 +917,8 @@ builder.add_node("listbioactives_node", listbioactives_node)
 builder.add_node("getbioactives_node", getbioactives_node)
 builder.add_node("pdb_node", pdb_node)
 builder.add_node("find_node", find_node)
 builder.add_node("loop_node", loop_node)
 builder.add_node("parser_node", parser_node)
@@ -773,6 +932,8 @@ builder.add_conditional_edges("first_node", get_chemtool, {
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
     None: "parser_node"})
 builder.add_conditional_edges("retry_node", get_chemtool, {
@@ -781,6 +942,8 @@ builder.add_conditional_edges("retry_node", get_chemtool, {
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
     None: "parser_node"})
 builder.add_edge("uniprot_node", "loop_node")
@@ -788,6 +951,8 @@ builder.add_edge("listbioactives_node", "loop_node")
 builder.add_edge("getbioactives_node", "loop_node")
 builder.add_edge("pdb_node", "loop_node")
 builder.add_edge("find_node", "loop_node")
 builder.add_conditional_edges("loop_node", get_chemtool, {
     "uniprot_tool": "uniprot_node",
@@ -795,6 +960,8 @@ builder.add_conditional_edges("loop_node", get_chemtool, {
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
     None: "parser_node"})
 builder.add_conditional_edges("parser_node", loop_or_not, {
@@ -827,6 +994,7 @@ def ProteinAgent(task, protein, up_id, chembl_id, pdb_id, smiles):
       reply = c[str(m[0])]['messages']
       if 'assistant' in str(reply):
         reply = str(reply).split("<|assistant|>")[-1].split('#')[0].strip()
         replies.append(reply)
   #check if image exists
   if os.path.exists('Substitution_image.png'):
@@ -846,6 +1014,8 @@ with gr.Blocks(fill_height=True) as forest:
               - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
               - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
               - calls PDB to find PDB IDs that match a protein name.
               ''')
   with gr.Row():

 import requests
 import itertools
+import lightgbm as lgb
+from lightgbm import LGBMRegressor
+import deepchem as dc
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import StandardScaler
+import tensorflow as tf
+import random
+from finetune_gpt import *
 device = "cuda" if torch.cuda.is_available() else "cpu"
 hf = HuggingFacePipeline.from_model_id(
   '''
     This tool takes in the user requested protein and searches UNIPROT for matches.
     It returns a string containing the protein ID, gene name, organism, and protein name.
       Args:
         query_protein: the name of the protein to search for.
       Returns:
         protein_string: a string containing the protein ID, gene name, organism, and protein name.
   '''
   print("UNIPROT tool")
   print('===================================================')
   protein_name = state["query_protein"]
   current_props_string = state["props_string"]
   try:
     url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
     response = requests.get(url).text
     prot_df = pd.read_csv(f'{protein_name}_uniprot_ids.tsv', sep='\t')
     prot_human_df = prot_df[prot_df['Organism'] == "Homo sapiens (Human)"]
+    print(f"Found {len(prot_human_df)} Human proteins out of {len(prot_df)} total proteins")
     prot_ids = prot_df['Entry'].tolist()
     prot_ids_human = prot_human_df['Entry'].tolist()
     genes_human = prot_human_df['Gene Names'].tolist()
     organisms = prot_df['Organism'].tolist()
     names = prot_df['Protein names'].tolist()
     names_human = prot_human_df['Protein names'].tolist()
     protein_string = ''
     for id, gene, organism, name in zip(prot_ids, genes, organisms, names):
       protein_string += f'Protein ID: {id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
   except:
     protein_string = 'No proteins found'
 def listbioactives_node(state: State) -> State:
   '''
+    Accepts a UNIPROT ID and searches for bioactive molecules
       Args:
         up_id: the UNIPROT ID of the protein to search for.
       Returns:
   targets = new_client.target
   bioact = new_client.activity
   try:
     target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
     target_info = pd.DataFrame.from_records(target_info)
     print(target_info)
     if len(target_info) > 0:
       print(f"Found info for Uniprot ID: {up_id}")
     chembl_ids = target_info['target_chembl_id'].tolist()
     chembl_ids = list(set(chembl_ids))
       len_this_bioacts = len(bioact_chosen)
       len_all_bioacts.append(len_this_bioacts)
       this_bioact_string = f"Lenth of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}"
       bioact_string += this_bioact_string + '\n'
   except:
     bioact_string = 'No bioactives found\n'
   chembl_id = state["query_chembl"].strip()
   current_props_string = state["props_string"]
+  #check if f'{chembl_id}_bioactives.csv' exists
+  if os.path.exists(f'{chembl_id}_bioactives.csv'):
+    print(f'Found {chembl_id}_bioactives.csv')
+    total_bioact_df = pd.read_csv(f'{chembl_id}_bioactives.csv')
+    print(f"number of records: {len(total_bioact_df)}")
+  else:
+    compounds = new_client.molecule
+    bioact = new_client.activity
+    bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
+        "molecule_chembl_id",
+        "type",
+        "standard_units",
+        "relation",
+        "standard_value",
+    )
+    chembl_ids = []
+    ic50s = []
+    for record in bioact_chosen:
+        if record["standard_units"] == 'nM':
+            chembl_ids.append(record["molecule_chembl_id"])
+            ic50s.append(float(record["standard_value"]))
+    bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
+    bioact_df = pd.DataFrame.from_dict(bioact_dict)
+    bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
+    print(f"Number of records: {len(bioact_df)}")
+    print(bioact_df.shape)
+    compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
+        "molecule_chembl_id",
+        "molecule_structures"
+    )
+    cids_list = []
+    smiles_list = []
+    for record in compounds_provider:
+        cid = record['molecule_chembl_id']
+        cids_list.append(cid)
+        if record['molecule_structures']:
+            if record['molecule_structures']['canonical_smiles']:
+                smile = record['molecule_structures']['canonical_smiles']
+            else:
+                print("No canonical smiles")
+                smile = None
+        else:
+            print('no structures')
+            smile = None
+        smiles_list.append(smile)
+    new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
+    new_df = pd.DataFrame.from_dict(new_dict)
+    total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
+    print(f"number of records: {len(total_bioact_df)}")
+    total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
+    print(f"number of records after removing duplicates: {len(total_bioact_df)}")
+    total_bioact_df.dropna(axis=0, how='any', inplace=True)
+    total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
+    print(f"number of records after dropping Null values: {len(total_bioact_df)}")
+    total_bioact_df.sort_values(by=["IC50s"],inplace=True)
+    if len(total_bioact_df) > 0:
+      total_bioact_df.to_csv(f'{chembl_id}_bioactives.csv')
   limit = 50
   if len(total_bioact_df) > limit:
   for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
     smile = smile.replace('#','~')
     bioact_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
+  mols = [Chem.MolFromSmiles(smile) for smile in total_bioact_df['SMILES'].to_list()]
+  legends = [f'IC50: {ic50}' for ic50 in total_bioact_df['IC50s'].to_list()]
+  img = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200,200))
+  filename = "Substitution_image.png"
+    # pic = img.data
+    # with open(filename,'wb+') as outf:
+    #   outf.write(pic)
+  img.save(filename)
   current_props_string += bioact_string
   state["props_string"] = current_props_string
   state["which_tool"] += 1
   return state
+def predict_node(state: State) -> State:
+  '''
+    Uses the {chembl_id}_bioactives.csv file written by the get_bioactives node to fit the
+    LightGBM model and predict the IC50 for the query SMILES.
+  '''
+  print("Predict Tool")
+  print('===================================================')
+  current_props_string = state["props_string"]
+  smiles = state["query_smiles"]
+  chembl_id = state["query_chembl"].strip()
+  print(f"in predict node, smiles: {smiles}")
+  try:
+    df = pd.read_csv(f'{chembl_id}_bioactives.csv')
+    #if length of the dataframe is over 2000, take a random sample of 2000 points
+    if len(df) > 2000:
+      df = df.sample(n=2000, random_state=42)
+    y_raw = df["IC50s"].to_list()
+    smiles_list = df["SMILES"].to_list()
+    ions_to_clean = ["[Na+].",".[Na+]","[Cl-].",".[Cl-]","[K+].",".[K+]"]
+    Xa = []
+    y = []
+    for smile, value in zip(smiles_list, y_raw):
+      for ion in ions_to_clean:
+        smile = smile.replace(ion,"")
+      y.append(np.log10(value))
+      Xa.append(smile)
+    mols = [Chem.MolFromSmiles(smile) for smile in Xa]
+    print(f"Number of molecules: {len(mols)}")
+    featurizer=dc.feat.RDKitDescriptors()
+    featname="RDKitDescriptors"
+    f = featurizer.featurize(mols)
+    nan_indicies = np.isnan(f)
+    bad_rows = []
+    for i, row in enumerate(nan_indicies):
+        for item in row:
+            if item == True:
+                if i not in bad_rows:
+                    print(f"Row {i} has a NaN.")
+                    bad_rows.append(i)
+    print(f"Old dimensions are: {f.shape}.")
+    for j,i in enumerate(bad_rows):
+        k=i-j
+        f = np.delete(f,k,axis=0)
+        y = np.delete(y,k,axis=0)
+        Xa = np.delete(Xa,k,axis=0)
+        print(f"Deleting row {k} from arrays.")
+    print(f"New dimensions are: {f.shape}")
+    if f.shape[0] != len(y) or f.shape[0] != len(Xa):
+      raise ValueError("Number of rows in X and y do not match.")
+    X_train, X_test, y_train, y_test = train_test_split(f, y, test_size=0.2, random_state=42)
+    scaler = StandardScaler()
+    X_train = scaler.fit_transform(X_train)
+    X_test = scaler.transform(X_test)
+    model = LGBMRegressor(metric='rmse', max_depth = 50, verbose = -1, num_leaves = 31,
+                          feature_fraction = 0.8, min_data_in_leaf = 20)
+    modelname = "LightGBM Regressor"
+    model.fit(X_train, y_train)
+    train_score = model.score(X_train,y_train)
+    print(f"score for training set: {train_score:.3f}")
+    valid_score = model.score(X_test, y_test)
+    print(f"score for validation set: {valid_score:.3f}")
+    for ion in ions_to_clean:
+      smiles = smiles.replace(ion,"")
+    test_mol = Chem.MolFromSmiles(smiles)
+    test_feat = featurizer.featurize([test_mol])
+    test_feat = scaler.transform(test_feat)
+    prediction = model.predict(test_feat)
+    test_ic50 = 10**(prediction[0])
+    print(f"Predicted IC50: {test_ic50}")
+    prop_string = f"The predicted IC50 value for the test molecule is : {test_ic50:.3f} nM. \
+The Bioactive data was fitted with the LightGMB model, using RDKit descriptors. The trainin score \
+was {train_score:.3f} and the testing score was {valid_score:.3f}. "
+    print(prop_string)
+  except:
+    prop_string = ''
+  current_props_string += prop_string
+  state["props_string"] = current_props_string
+  state["which_tool"] += 1
+  return state
+def gpt_node(state: State) -> State:
+  '''
+    Uses a ChEMBL dataset, previously stored in a CSV file by the get_bioactives node, to
+    fine-tune a GPT model to generate novel molecules for the target protein.
+    Args:
+      chembl_id
+    returns:
+      prop_string: a string of the novel, generated molecules
+  '''
+  print("GPT node")
+  print('===================================================')
+  current_props_string = state["props_string"]
+  chembl_id = state["query_chembl"].strip()
+  print(f"in gpt node, chembl id: {chembl_id}")
+  try:
+    df = pd.read_csv(f'{chembl_id}_bioactives.csv')
+    prop_string, img = finetune_gpt(df, chembl_id)
+    prop_string = prop_string.replace("#","~")
+    filename = "Substitution_image.png"
+    # pic = img.data
+    # with open(filename,'wb+') as outf:
+    #   outf.write(pic)
+    img.save(filename)
+  except:
+    prop_string = ''
+  current_props_string += prop_string
+  state["props_string"] = current_props_string
+  state["which_tool"] += 1
+  return state
 def get_protein_from_pdb(pdb_id):
   url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
   r = requests.get(url)
 def pdb_node(state: State) -> State:
   '''
     Accepts a PDB ID and queries the protein databank for the sequence of the protein, as well as other
+    information such as ligands.
       Args:
         pdb: the PDB ID to query
       Returns:
+        props_string: a string of the
   '''
   test_pdb = state["query_pdb"].strip()
   current_props_string = state["props_string"]
 def find_node(state: State) -> State:
   '''
+    Accepts a protein name and searches the protein databank for PDB IDs that match along with the entry titles.
       Args:
         protein_name: the protein to query
       Returns:
+        props_string: a string of the
   '''
   test_protein = state["query_protein"].strip()
   which_pdbs = state["which_pdbs"]
     state["which_pdbs"] = which_pdbs+10
   except:
     pdb_string = ''
   current_props_string += pdb_string
   state["props_string"] = current_props_string
   '''
     The first node of the agent. This node receives the input and asks the LLM
     to determine which is the best tool to use to answer the QUERY TASK.
       Input: the initial prompt from the user. Should contain one or more of the following:
              query_protein: the name of the protein to search for.
              query_up_id: the Uniprot ID of the protein to search for.
              the value should be separated from the name by a ':' and each field should
              be separated from the previous one by a ','.
              All of these values are saved to the state
       Output: the tool choice
   '''
   query_smiles = None
 pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
 protein, as well as other information such as ligands in the structure. \
 find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
+predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
+Uses the LightGBM model. \n \
+gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list  \
+of novel molecules generated by the GPT. \
 '
   res = chat_model.invoke(prompt)
       tool_choice = (tool1, tool2)
   else:
     tool_choice = (None, None)
   state["tool_choice"] = tool_choice
   state["which_tool"] = 0
   print(f"The chosen tools are: {tool_choice}")
 def retry_node(state: State) -> State:
   '''
+    If the previous loop of the agent does not get enough information from the
     tools to answer the query, this node is called to retry the previous loop.
       Input: the previous loop of the agent.
       Output: the tool choice
 list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
 get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
 pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
+protein, as well as other information such as ligands in the structure. \n \
+find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
+predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
+Uses the LightGBM model. \n \
+gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list  \
+of novel molecules generated by the GPT. \
+'
   res = chat_model.invoke(prompt)
   tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
   tool_choices = tool_choices.split(',')
   if len(tool_choices) == 1:
     tool1 = tool_choices[0].strip()
     if tool1.lower() == 'none':
   '''
     This node accepts the tool returns and decides if it needs to call another
     tool or go on to the parser node.
       Input: the tool returns.
       Output: the next node to call.
   '''
     This is the third node in the agent. It receives the output from the tool,
     puts it into a prompt as CONTEXT, and asks the LLM to answer the original
     query.
       Input: the output from the tool.
       Output: the answer to the original query.
   '''
   '''
     This is the fourth node of the agent. It receives the LLM's previous answer and
     tries to improve it.
       Input: the LLM's last answer.
       Output: the improved answer.
   '''
 builder.add_node("getbioactives_node", getbioactives_node)
 builder.add_node("pdb_node", pdb_node)
 builder.add_node("find_node", find_node)
+builder.add_node("predict_node", predict_node)
+builder.add_node("gpt_node", gpt_node)
 builder.add_node("loop_node", loop_node)
 builder.add_node("parser_node", parser_node)
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
+    "predict_tool": "predict_node",
+    "gpt_tool": "gpt_node",
     None: "parser_node"})
 builder.add_conditional_edges("retry_node", get_chemtool, {
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
+    "predict_tool": "predict_node",
+    "gpt_tool": "gpt_node",
     None: "parser_node"})
 builder.add_edge("uniprot_node", "loop_node")
 builder.add_edge("getbioactives_node", "loop_node")
 builder.add_edge("pdb_node", "loop_node")
 builder.add_edge("find_node", "loop_node")
+builder.add_edge("predict_node", "loop_node")
+builder.add_edge("gpt_node", "loop_node")
 builder.add_conditional_edges("loop_node", get_chemtool, {
     "uniprot_tool": "uniprot_node",
     "get_bioactives_tool": "getbioactives_node",
     "pdb_tool": "pdb_node",
     "find_tool": "find_node",
+    "predict_tool": "predict_node",
+    "gpt_tool": "gpt_node",
     None: "parser_node"})
 builder.add_conditional_edges("parser_node", loop_or_not, {
       reply = c[str(m[0])]['messages']
       if 'assistant' in str(reply):
         reply = str(reply).split("<|assistant|>")[-1].split('#')[0].strip()
+        reply = reply.replace("~","#")
         replies.append(reply)
   #check if image exists
   if os.path.exists('Substitution_image.png'):
               - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
               - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
               - calls PDB to find PDB IDs that match a protein name.
+              - Uses Bioactive molecules to predict IC50 values for novel molecules with a LightGBM model.
+              - Uses Bioactive molecules to generate novel molecules using a fine-tuned GPT.
               ''')
   with gr.Row():