cafierom committed on
Commit
e8c9457
·
verified ·
1 Parent(s): 09f47f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +259 -89
app.py CHANGED
@@ -35,6 +35,15 @@ from rcsbapi.search import TextQuery
35
  import requests
36
  import itertools
37
 
 
 
 
 
 
 
 
 
 
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
 
40
  hf = HuggingFacePipeline.from_model_id(
@@ -68,19 +77,18 @@ def uniprot_node(state: State) -> State:
68
  '''
69
  This tool takes in the user requested protein and searches UNIPROT for matches.
70
  It returns a string scontaining the protein ID, gene name, organism, and protein name.
71
-
72
  Args:
73
  query_protein: the name of the protein to search for.
74
  Returns:
75
  protein_string: a string containing the protein ID, gene name, organism, and protein name.
76
-
77
  '''
78
  print("UNIPROT tool")
79
  print('===================================================')
80
 
81
  protein_name = state["query_protein"]
82
  current_props_string = state["props_string"]
83
-
84
  try:
85
  url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
86
  response = requests.get(url).text
@@ -91,7 +99,7 @@ def uniprot_node(state: State) -> State:
91
 
92
  prot_df = pd.read_csv(f'{protein_name}_uniprot_ids.tsv', sep='\t')
93
  prot_human_df = prot_df[prot_df['Organism'] == "Homo sapiens (Human)"]
94
- print(f"Found {len(prot_human_df)} Human proteins out of {len(prot_df)} total proteins")
95
 
96
  prot_ids = prot_df['Entry'].tolist()
97
  prot_ids_human = prot_human_df['Entry'].tolist()
@@ -100,14 +108,14 @@ def uniprot_node(state: State) -> State:
100
  genes_human = prot_human_df['Gene Names'].tolist()
101
 
102
  organisms = prot_df['Organism'].tolist()
103
-
104
  names = prot_df['Protein names'].tolist()
105
  names_human = prot_human_df['Protein names'].tolist()
106
 
107
  protein_string = ''
108
  for id, gene, organism, name in zip(prot_ids, genes, organisms, names):
109
  protein_string += f'Protein ID: {id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
110
-
111
  except:
112
  protein_string = 'No proteins found'
113
 
@@ -131,7 +139,7 @@ def get_qed(smiles):
131
 
132
  def listbioactives_node(state: State) -> State:
133
  '''
134
- Accepts a UNIPROT ID and searches for bioactive molecules
135
  Args:
136
  up_id: the UNIPROT ID of the protein to search for.
137
  Returns:
@@ -145,14 +153,14 @@ def listbioactives_node(state: State) -> State:
145
 
146
  targets = new_client.target
147
  bioact = new_client.activity
148
-
149
  try:
150
  target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
151
  target_info = pd.DataFrame.from_records(target_info)
152
  print(target_info)
153
  if len(target_info) > 0:
154
  print(f"Found info for Uniprot ID: {up_id}")
155
-
156
  chembl_ids = target_info['target_chembl_id'].tolist()
157
 
158
  chembl_ids = list(set(chembl_ids))
@@ -171,7 +179,7 @@ def listbioactives_node(state: State) -> State:
171
  len_this_bioacts = len(bioact_chosen)
172
  len_all_bioacts.append(len_this_bioacts)
173
  this_bioact_string = f"Lenth of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}"
174
-
175
  bioact_string += this_bioact_string + '\n'
176
  except:
177
  bioact_string = 'No bioactives found\n'
@@ -195,68 +203,78 @@ def getbioactives_node(state: State) -> State:
195
  chembl_id = state["query_chembl"].strip()
196
  current_props_string = state["props_string"]
197
 
198
- compounds = new_client.molecule
199
- bioact = new_client.activity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
202
- "molecule_chembl_id",
203
- "type",
204
- "standard_units",
205
- "relation",
206
- "standard_value",
207
- )
208
-
209
- chembl_ids = []
210
- ic50s = []
211
- for record in bioact_chosen:
212
- if record["standard_units"] == 'nM':
213
- chembl_ids.append(record["molecule_chembl_id"])
214
- ic50s.append(float(record["standard_value"]))
215
-
216
- bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
217
- bioact_df = pd.DataFrame.from_dict(bioact_dict)
218
- bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
219
- print(f"Number of records: {len(bioact_df)}")
220
- print(bioact_df.shape)
221
-
222
-
223
- compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
224
- "molecule_chembl_id",
225
- "molecule_structures"
226
- )
227
-
228
- cids_list = []
229
- smiles_list = []
230
-
231
- for record in compounds_provider:
232
- cid = record['molecule_chembl_id']
233
- cids_list.append(cid)
234
-
235
- if record['molecule_structures']:
236
- if record['molecule_structures']['canonical_smiles']:
237
- smile = record['molecule_structures']['canonical_smiles']
238
- else:
239
- print("No canonical smiles")
240
- smile = None
241
- else:
242
- print('no structures')
243
- smile = None
244
- smiles_list.append(smile)
245
-
246
- new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
247
- new_df = pd.DataFrame.from_dict(new_dict)
248
-
249
- total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
250
- print(f"number of records: {len(total_bioact_df)}")
251
-
252
- total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
253
- print(f"number of records after removing duplicates: {len(total_bioact_df)}")
254
-
255
- total_bioact_df.dropna(axis=0, how='any', inplace=True)
256
- total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
257
- print(f"number of records after dropping Null values: {len(total_bioact_df)}")
258
-
259
- total_bioact_df.sort_values(by=["IC50s"],inplace=True)
260
 
261
  limit = 50
262
  if len(total_bioact_df) > limit:
@@ -266,12 +284,149 @@ def getbioactives_node(state: State) -> State:
266
  for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
267
  smile = smile.replace('#','~')
268
  bioact_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
269
-
 
 
 
 
 
 
 
 
 
270
  current_props_string += bioact_string
271
  state["props_string"] = current_props_string
272
  state["which_tool"] += 1
273
  return state
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  def get_protein_from_pdb(pdb_id):
276
  url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
277
  r = requests.get(url)
@@ -344,12 +499,11 @@ def three_to_one(three_seq):
344
  def pdb_node(state: State) -> State:
345
  '''
346
  Accepts a PDB ID and queires the protein databank for the sequence of the protein, as well as other
347
- information such as ligands.
348
-
349
  Args:
350
  pdb: the PDB ID to query
351
  Returns:
352
- props_string: a string of the
353
  '''
354
  test_pdb = state["query_pdb"].strip()
355
  current_props_string = state["props_string"]
@@ -399,12 +553,11 @@ def pdb_node(state: State) -> State:
399
 
400
  def find_node(state: State) -> State:
401
  '''
402
- Accepts a protein name and searches the protein databack for PDB IDs that match along with the entry titles.
403
-
404
  Args:
405
  protein_name: the protein to query
406
  Returns:
407
- props_string: a string of the
408
  '''
409
  test_protein = state["query_protein"].strip()
410
  which_pdbs = state["which_pdbs"]
@@ -431,7 +584,7 @@ def find_node(state: State) -> State:
431
  state["which_pdbs"] = which_pdbs+10
432
  except:
433
  pdb_string = ''
434
-
435
 
436
  current_props_string += pdb_string
437
  state["props_string"] = current_props_string
@@ -442,7 +595,6 @@ def first_node(state: State) -> State:
442
  '''
443
  The first node of the agent. This node receives the input and asks the LLM
444
  to determine which is the best tool to use to answer the QUERY TASK.
445
-
446
  Input: the initial prompt from the user. should contain only one of more of the following:
447
  query_protein: the name of the protein to search for.
448
  query_up_id: the Uniprot ID of the protein to search for.
@@ -453,7 +605,6 @@ def first_node(state: State) -> State:
453
  the value should be separated from the name by a ':' and each field should
454
  be separated from the previous one by a ','.
455
  All of these values are saved to the state
456
-
457
  Output: the tool choice
458
  '''
459
  query_smiles = None
@@ -518,6 +669,10 @@ get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES
518
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
519
  protein, as well as other information such as ligands in the structure. \
520
  find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
 
 
 
 
521
  '
522
  res = chat_model.invoke(prompt)
523
 
@@ -543,7 +698,7 @@ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning
543
  tool_choice = (tool1, tool2)
544
  else:
545
  tool_choice = (None, None)
546
-
547
  state["tool_choice"] = tool_choice
548
  state["which_tool"] = 0
549
  print(f"The chosen tools are: {tool_choice}")
@@ -552,7 +707,7 @@ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning
552
 
553
  def retry_node(state: State) -> State:
554
  '''
555
- If the previous loop of the agent does not get enough information from the
556
  tools to answer the query, this node is called to retry the previous loop.
557
  Input: the previous loop of the agent.
558
  Output: the tool choice
@@ -584,14 +739,19 @@ It returns a string containing the protein ID, gene name, organism, and protein
584
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
585
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
586
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
587
- protein, as well as other information such as ligands in the structure. \
588
- find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title.'
 
 
 
 
 
589
 
590
  res = chat_model.invoke(prompt)
591
 
592
  tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
593
  tool_choices = tool_choices.split(',')
594
-
595
  if len(tool_choices) == 1:
596
  tool1 = tool_choices[0].strip()
597
  if tool1.lower() == 'none':
@@ -622,7 +782,6 @@ def loop_node(state: State) -> State:
622
  '''
623
  This node accepts the tool returns and decides if it needs to call another
624
  tool or go on to the parser node.
625
-
626
  Input: the tool returns.
627
  Output: the next node to call.
628
  '''
@@ -633,7 +792,6 @@ def parser_node(state: State) -> State:
633
  This is the third node in the agent. It receives the output from the tool,
634
  puts it into a prompt as CONTEXT, and asks the LLM to answer the original
635
  query.
636
-
637
  Input: the output from the tool.
638
  Output: the answer to the original query.
639
  '''
@@ -684,7 +842,6 @@ def reflect_node(state: State) -> State:
684
  '''
685
  This is the fourth node of the agent. It recieves the LLMs previous answer and
686
  tries to improve it.
687
-
688
  Input: the LLMs last answer.
689
  Output: the improved answer.
690
  '''
@@ -760,6 +917,8 @@ builder.add_node("listbioactives_node", listbioactives_node)
760
  builder.add_node("getbioactives_node", getbioactives_node)
761
  builder.add_node("pdb_node", pdb_node)
762
  builder.add_node("find_node", find_node)
 
 
763
 
764
  builder.add_node("loop_node", loop_node)
765
  builder.add_node("parser_node", parser_node)
@@ -773,6 +932,8 @@ builder.add_conditional_edges("first_node", get_chemtool, {
773
  "get_bioactives_tool": "getbioactives_node",
774
  "pdb_tool": "pdb_node",
775
  "find_tool": "find_node",
 
 
776
  None: "parser_node"})
777
 
778
  builder.add_conditional_edges("retry_node", get_chemtool, {
@@ -781,6 +942,8 @@ builder.add_conditional_edges("retry_node", get_chemtool, {
781
  "get_bioactives_tool": "getbioactives_node",
782
  "pdb_tool": "pdb_node",
783
  "find_tool": "find_node",
 
 
784
  None: "parser_node"})
785
 
786
  builder.add_edge("uniprot_node", "loop_node")
@@ -788,6 +951,8 @@ builder.add_edge("listbioactives_node", "loop_node")
788
  builder.add_edge("getbioactives_node", "loop_node")
789
  builder.add_edge("pdb_node", "loop_node")
790
  builder.add_edge("find_node", "loop_node")
 
 
791
 
792
  builder.add_conditional_edges("loop_node", get_chemtool, {
793
  "uniprot_tool": "uniprot_node",
@@ -795,6 +960,8 @@ builder.add_conditional_edges("loop_node", get_chemtool, {
795
  "get_bioactives_tool": "getbioactives_node",
796
  "pdb_tool": "pdb_node",
797
  "find_tool": "find_node",
 
 
798
  None: "parser_node"})
799
 
800
  builder.add_conditional_edges("parser_node", loop_or_not, {
@@ -827,6 +994,7 @@ def ProteinAgent(task, protein, up_id, chembl_id, pdb_id, smiles):
827
  reply = c[str(m[0])]['messages']
828
  if 'assistant' in str(reply):
829
  reply = str(reply).split("<|assistant|>")[-1].split('#')[0].strip()
 
830
  replies.append(reply)
831
  #check if image exists
832
  if os.path.exists('Substitution_image.png'):
@@ -846,6 +1014,8 @@ with gr.Blocks(fill_height=True) as forest:
846
  - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
847
  - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
848
  - calls PDB to find PDB IDs that match a protein name.
 
 
849
  ''')
850
 
851
  with gr.Row():
 
35
  import requests
36
  import itertools
37
 
38
+ import lightgbm as lgb
39
+ from lightgbm import LGBMRegressor
40
+ import deepchem as dc
41
+ from sklearn.model_selection import train_test_split, GridSearchCV
42
+ from sklearn.preprocessing import StandardScaler
43
+ import tensorflow as tf
44
+ import random
45
+ from finetune_gpt import *
46
+
47
  device = "cuda" if torch.cuda.is_available() else "cpu"
48
 
49
  hf = HuggingFacePipeline.from_model_id(
 
77
  '''
78
  This tool takes in the user requested protein and searches UNIPROT for matches.
79
  It returns a string scontaining the protein ID, gene name, organism, and protein name.
 
80
  Args:
81
  query_protein: the name of the protein to search for.
82
  Returns:
83
  protein_string: a string containing the protein ID, gene name, organism, and protein name.
84
+
85
  '''
86
  print("UNIPROT tool")
87
  print('===================================================')
88
 
89
  protein_name = state["query_protein"]
90
  current_props_string = state["props_string"]
91
+
92
  try:
93
  url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
94
  response = requests.get(url).text
 
99
 
100
  prot_df = pd.read_csv(f'{protein_name}_uniprot_ids.tsv', sep='\t')
101
  prot_human_df = prot_df[prot_df['Organism'] == "Homo sapiens (Human)"]
102
+ print(f"Found {len(prot_human_df)} Human proteins out of {len(prot_df)} total proteins")
103
 
104
  prot_ids = prot_df['Entry'].tolist()
105
  prot_ids_human = prot_human_df['Entry'].tolist()
 
108
  genes_human = prot_human_df['Gene Names'].tolist()
109
 
110
  organisms = prot_df['Organism'].tolist()
111
+
112
  names = prot_df['Protein names'].tolist()
113
  names_human = prot_human_df['Protein names'].tolist()
114
 
115
  protein_string = ''
116
  for id, gene, organism, name in zip(prot_ids, genes, organisms, names):
117
  protein_string += f'Protein ID: {id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
118
+
119
  except:
120
  protein_string = 'No proteins found'
121
 
 
139
 
140
  def listbioactives_node(state: State) -> State:
141
  '''
142
+ Accepts a UNIPROT ID and searches for bioactive molecules
143
  Args:
144
  up_id: the UNIPROT ID of the protein to search for.
145
  Returns:
 
153
 
154
  targets = new_client.target
155
  bioact = new_client.activity
156
+
157
  try:
158
  target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
159
  target_info = pd.DataFrame.from_records(target_info)
160
  print(target_info)
161
  if len(target_info) > 0:
162
  print(f"Found info for Uniprot ID: {up_id}")
163
+
164
  chembl_ids = target_info['target_chembl_id'].tolist()
165
 
166
  chembl_ids = list(set(chembl_ids))
 
179
  len_this_bioacts = len(bioact_chosen)
180
  len_all_bioacts.append(len_this_bioacts)
181
  this_bioact_string = f"Lenth of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}"
182
+
183
  bioact_string += this_bioact_string + '\n'
184
  except:
185
  bioact_string = 'No bioactives found\n'
 
203
  chembl_id = state["query_chembl"].strip()
204
  current_props_string = state["props_string"]
205
 
206
+ #check if f'{chembl_id}_bioactives.csv' exists
207
+ if os.path.exists(f'{chembl_id}_bioactives.csv'):
208
+ print(f'Found {chembl_id}_bioactives.csv')
209
+ total_bioact_df = pd.read_csv(f'{chembl_id}_bioactives.csv')
210
+ print(f"number of records: {len(total_bioact_df)}")
211
+ else:
212
+
213
+ compounds = new_client.molecule
214
+ bioact = new_client.activity
215
+
216
+ bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
217
+ "molecule_chembl_id",
218
+ "type",
219
+ "standard_units",
220
+ "relation",
221
+ "standard_value",
222
+ )
223
+
224
+ chembl_ids = []
225
+ ic50s = []
226
+ for record in bioact_chosen:
227
+ if record["standard_units"] == 'nM':
228
+ chembl_ids.append(record["molecule_chembl_id"])
229
+ ic50s.append(float(record["standard_value"]))
230
+
231
+ bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
232
+ bioact_df = pd.DataFrame.from_dict(bioact_dict)
233
+ bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
234
+ print(f"Number of records: {len(bioact_df)}")
235
+ print(bioact_df.shape)
236
+
237
+
238
+ compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
239
+ "molecule_chembl_id",
240
+ "molecule_structures"
241
+ )
242
 
243
+ cids_list = []
244
+ smiles_list = []
245
+
246
+ for record in compounds_provider:
247
+ cid = record['molecule_chembl_id']
248
+ cids_list.append(cid)
249
+
250
+ if record['molecule_structures']:
251
+ if record['molecule_structures']['canonical_smiles']:
252
+ smile = record['molecule_structures']['canonical_smiles']
253
+ else:
254
+ print("No canonical smiles")
255
+ smile = None
256
+ else:
257
+ print('no structures')
258
+ smile = None
259
+ smiles_list.append(smile)
260
+
261
+ new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
262
+ new_df = pd.DataFrame.from_dict(new_dict)
263
+
264
+ total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
265
+ print(f"number of records: {len(total_bioact_df)}")
266
+
267
+ total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
268
+ print(f"number of records after removing duplicates: {len(total_bioact_df)}")
269
+
270
+ total_bioact_df.dropna(axis=0, how='any', inplace=True)
271
+ total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
272
+ print(f"number of records after dropping Null values: {len(total_bioact_df)}")
273
+
274
+ total_bioact_df.sort_values(by=["IC50s"],inplace=True)
275
+
276
+ if len(total_bioact_df) > 0:
277
+ total_bioact_df.to_csv(f'{chembl_id}_bioactives.csv')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  limit = 50
280
  if len(total_bioact_df) > limit:
 
284
  for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
285
  smile = smile.replace('#','~')
286
  bioact_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
287
+
288
+ mols = [Chem.MolFromSmiles(smile) for smile in total_bioact_df['SMILES'].to_list()]
289
+ legends = [f'IC50: {ic50}' for ic50 in total_bioact_df['IC50s'].to_list()]
290
+ img = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200,200))
291
+ filename = "Substitution_image.png"
292
+ # pic = img.data
293
+ # with open(filename,'wb+') as outf:
294
+ # outf.write(pic)
295
+ img.save(filename)
296
+
297
  current_props_string += bioact_string
298
  state["props_string"] = current_props_string
299
  state["which_tool"] += 1
300
  return state
301
 
302
def predict_node(state: State) -> State:
    '''
    Fits a LightGBM regressor to the cached bioactivity data for the current
    ChEMBL ID and predicts the IC50 of the query molecule.

    The training data is read from '{chembl_id}_bioactives.csv' (columns
    "SMILES" and "IC50s"). Molecules are featurized with DeepChem's
    RDKitDescriptors, the target is log10(IC50), and the prediction is
    converted back with 10**x.

    Args:
      state: the agent state. Reads "props_string", "query_smiles" and
        "query_chembl".
    Returns:
      state: the same state, with a summary of the prediction appended to
        "props_string" (nothing is appended if the prediction fails) and
        "which_tool" incremented by one.
    '''
    print("Predict Tool")
    print('===================================================')
    current_props_string = state["props_string"]
    smiles = state["query_smiles"]
    chembl_id = state["query_chembl"].strip()
    print(f"in predict node, smiles: {smiles}")

    # Counter-ion fragments that are stripped from every SMILES string.
    ions_to_clean = ["[Na+].", ".[Na+]", "[Cl-].", ".[Cl-]", "[K+].", ".[K+]"]

    def _strip_ions(smile):
        # Remove each counter-ion fragment in turn.
        for ion in ions_to_clean:
            smile = smile.replace(ion, "")
        return smile

    try:
        # Validate the query molecule first so that no model is trained for a
        # request that cannot be answered.
        if not smiles:
            raise ValueError("no query SMILES was provided")
        test_mol = Chem.MolFromSmiles(_strip_ions(smiles))
        if test_mol is None:
            raise ValueError(f"could not parse query SMILES: {smiles}")

        df = pd.read_csv(f'{chembl_id}_bioactives.csv')
        # If the dataframe has more than 2000 rows, take a random sample of 2000.
        if len(df) > 2000:
            df = df.sample(n=2000, random_state=42)

        mols = []
        y = []
        for smile, value in zip(df["SMILES"].to_list(), df["IC50s"].to_list()):
            # log10 is undefined for zero/negative values; the comparison is
            # also False for NaN, so missing values are skipped here too.
            if not value > 0:
                continue
            if not isinstance(smile, str):
                continue
            mol = Chem.MolFromSmiles(_strip_ions(smile))
            if mol is None:
                # Unparseable SMILES cannot be featurized.
                continue
            mols.append(mol)
            y.append(np.log10(value))
        y = np.asarray(y, dtype=float)
        print(f"Number of molecules: {len(mols)}")

        featurizer = dc.feat.RDKitDescriptors()
        f = featurizer.featurize(mols)
        print(f"Old dimensions are: {f.shape}.")

        # Keep only rows whose descriptors are all finite. NaN or infinite
        # descriptors would otherwise break scaling/fitting.
        good_rows = np.isfinite(f).all(axis=1)
        for i in np.flatnonzero(~good_rows):
            print(f"Row {i} has a NaN.")
        f = f[good_rows]
        y = y[good_rows]
        print(f"New dimensions are: {f.shape}")

        X_train, X_test, y_train, y_test = train_test_split(f, y, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = LGBMRegressor(metric='rmse', max_depth=50, verbose=-1, num_leaves=31,
                              feature_fraction=0.8, min_data_in_leaf=20)
        model.fit(X_train, y_train)

        train_score = model.score(X_train, y_train)
        print(f"score for training set: {train_score:.3f}")

        valid_score = model.score(X_test, y_test)
        print(f"score for validation set: {valid_score:.3f}")

        test_feat = featurizer.featurize([test_mol])
        test_feat = scaler.transform(test_feat)
        prediction = model.predict(test_feat)
        # The model was trained on log10(IC50); undo the transform.
        test_ic50 = 10**(prediction[0])
        print(f"Predicted IC50: {test_ic50}")
        prop_string = (
            f"The predicted IC50 value for the test molecule is : {test_ic50:.3f} nM. "
            "The Bioactive data was fitted with the LightGMB model, using RDKit descriptors. The trainin score "
            f"was {train_score:.3f} and the testing score was {valid_score:.3f}. "
        )
        print(prop_string)

    except Exception as err:
        # Best effort: a failed prediction adds nothing to the context, but the
        # reason is reported instead of being silently discarded.
        print(f"Predict tool failed: {err}")
        prop_string = ''

    current_props_string += prop_string
    state["props_string"] = current_props_string
    state["which_tool"] += 1
    return state
396
+
397
def gpt_node(state: State) -> State:
    '''
    Uses a ChEMBL dataset, previously stored in a CSV file by the
    get_bioactives node, to finetune a GPT model that generates novel
    molecules for the target protein.

    Args:
      state: the agent state. Reads "props_string" and "query_chembl"; the
        dataset is loaded from '{chembl_id}_bioactives.csv'.
    Returns:
      state: the same state, with the string of generated molecules appended
        to "props_string" (nothing is appended if generation fails) and
        "which_tool" incremented by one.
    '''
    print("GPT node")
    print('===================================================')
    current_props_string = state["props_string"]
    chembl_id = state["query_chembl"].strip()
    print(f"in gpt node, chembl id: {chembl_id}")

    prop_string = ''
    try:
        df = pd.read_csv(f'{chembl_id}_bioactives.csv')
        generated_string, img = finetune_gpt(df, chembl_id)
        # '#' is used as a delimiter when the model reply is parsed, so it is
        # swapped for '~' here (the reply handler maps '~' back to '#').
        prop_string = generated_string.replace("#", "~")
    except Exception as err:
        # Best effort: report why generation failed and append nothing.
        print(f"GPT tool failed: {err}")
    else:
        # Saving the picture is kept separate so that a failure here does not
        # throw away the molecules that were already generated.
        try:
            img.save("Substitution_image.png")
        except Exception as err:
            print(f"GPT tool could not save the image: {err}")

    current_props_string += prop_string
    state["props_string"] = current_props_string
    state["which_tool"] += 1
    return state
429
+
430
  def get_protein_from_pdb(pdb_id):
431
  url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
432
  r = requests.get(url)
 
499
  def pdb_node(state: State) -> State:
500
  '''
501
  Accepts a PDB ID and queires the protein databank for the sequence of the protein, as well as other
502
+ information such as ligands.
 
503
  Args:
504
  pdb: the PDB ID to query
505
  Returns:
506
+ props_string: a string of the
507
  '''
508
  test_pdb = state["query_pdb"].strip()
509
  current_props_string = state["props_string"]
 
553
 
554
  def find_node(state: State) -> State:
555
  '''
556
+ Accepts a protein name and searches the protein databack for PDB IDs that match along with the entry titles.
 
557
  Args:
558
  protein_name: the protein to query
559
  Returns:
560
+ props_string: a string of the
561
  '''
562
  test_protein = state["query_protein"].strip()
563
  which_pdbs = state["which_pdbs"]
 
584
  state["which_pdbs"] = which_pdbs+10
585
  except:
586
  pdb_string = ''
587
+
588
 
589
  current_props_string += pdb_string
590
  state["props_string"] = current_props_string
 
595
  '''
596
  The first node of the agent. This node receives the input and asks the LLM
597
  to determine which is the best tool to use to answer the QUERY TASK.
 
598
  Input: the initial prompt from the user. should contain only one of more of the following:
599
  query_protein: the name of the protein to search for.
600
  query_up_id: the Uniprot ID of the protein to search for.
 
605
  the value should be separated from the name by a ':' and each field should
606
  be separated from the previous one by a ','.
607
  All of these values are saved to the state
 
608
  Output: the tool choice
609
  '''
610
  query_smiles = None
 
669
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \n \
670
  protein, as well as other information such as ligands in the structure. \
671
  find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
672
+ predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
673
+ Uses the LightGBM model. \n \
674
+ gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list \
675
+ of novel molecules generated by the GPT. \
676
  '
677
  res = chat_model.invoke(prompt)
678
 
 
698
  tool_choice = (tool1, tool2)
699
  else:
700
  tool_choice = (None, None)
701
+
702
  state["tool_choice"] = tool_choice
703
  state["which_tool"] = 0
704
  print(f"The chosen tools are: {tool_choice}")
 
707
 
708
  def retry_node(state: State) -> State:
709
  '''
710
+ If the previous loop of the agent does not get enough information from the
711
  tools to answer the query, this node is called to retry the previous loop.
712
  Input: the previous loop of the agent.
713
  Output: the tool choice
 
739
  list_bioactives_tool: Accepts a given UNIPROT ID and searches for bioactive molecules \n \
740
  get_bioactives_tool: Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID\n \
741
  pdb_tool: Accepts a PDB ID and queires the protein databank for the number of chains in and sequence of the \
742
+ protein, as well as other information such as ligands in the structure. \n \
743
+ find_tool: Accepts a protein name and seaches for PDB IDs that match, returning the PDB ID and the title. \
744
+ predict_tool: Predicts the IC50 value for the molecule indicated by the SMILES string provided. \
745
+ Uses the LightGBM model. \n \
746
+ gpt_tool: Uses a machine-learning GPT model to generate novel molecules for a chembl dataset. It returns a list \
747
+ of novel molecules generated by the GPT. \
748
+ '
749
 
750
  res = chat_model.invoke(prompt)
751
 
752
  tool_choices = str(res).split('<|assistant|>')[1].split('#')[0].strip()
753
  tool_choices = tool_choices.split(',')
754
+
755
  if len(tool_choices) == 1:
756
  tool1 = tool_choices[0].strip()
757
  if tool1.lower() == 'none':
 
782
  '''
783
  This node accepts the tool returns and decides if it needs to call another
784
  tool or go on to the parser node.
 
785
  Input: the tool returns.
786
  Output: the next node to call.
787
  '''
 
792
  This is the third node in the agent. It receives the output from the tool,
793
  puts it into a prompt as CONTEXT, and asks the LLM to answer the original
794
  query.
 
795
  Input: the output from the tool.
796
  Output: the answer to the original query.
797
  '''
 
842
  '''
843
  This is the fourth node of the agent. It recieves the LLMs previous answer and
844
  tries to improve it.
 
845
  Input: the LLMs last answer.
846
  Output: the improved answer.
847
  '''
 
917
  builder.add_node("getbioactives_node", getbioactives_node)
918
  builder.add_node("pdb_node", pdb_node)
919
  builder.add_node("find_node", find_node)
920
+ builder.add_node("predict_node", predict_node)
921
+ builder.add_node("gpt_node", gpt_node)
922
 
923
  builder.add_node("loop_node", loop_node)
924
  builder.add_node("parser_node", parser_node)
 
932
  "get_bioactives_tool": "getbioactives_node",
933
  "pdb_tool": "pdb_node",
934
  "find_tool": "find_node",
935
+ "predict_tool": "predict_node",
936
+ "gpt_tool": "gpt_node",
937
  None: "parser_node"})
938
 
939
  builder.add_conditional_edges("retry_node", get_chemtool, {
 
942
  "get_bioactives_tool": "getbioactives_node",
943
  "pdb_tool": "pdb_node",
944
  "find_tool": "find_node",
945
+ "predict_tool": "predict_node",
946
+ "gpt_tool": "gpt_node",
947
  None: "parser_node"})
948
 
949
  builder.add_edge("uniprot_node", "loop_node")
 
951
  builder.add_edge("getbioactives_node", "loop_node")
952
  builder.add_edge("pdb_node", "loop_node")
953
  builder.add_edge("find_node", "loop_node")
954
+ builder.add_edge("predict_node", "loop_node")
955
+ builder.add_edge("gpt_node", "loop_node")
956
 
957
  builder.add_conditional_edges("loop_node", get_chemtool, {
958
  "uniprot_tool": "uniprot_node",
 
960
  "get_bioactives_tool": "getbioactives_node",
961
  "pdb_tool": "pdb_node",
962
  "find_tool": "find_node",
963
+ "predict_tool": "predict_node",
964
+ "gpt_tool": "gpt_node",
965
  None: "parser_node"})
966
 
967
  builder.add_conditional_edges("parser_node", loop_or_not, {
 
994
  reply = c[str(m[0])]['messages']
995
  if 'assistant' in str(reply):
996
  reply = str(reply).split("<|assistant|>")[-1].split('#')[0].strip()
997
+ reply = reply.replace("~","#")
998
  replies.append(reply)
999
  #check if image exists
1000
  if os.path.exists('Substitution_image.png'):
 
1014
  - calls Chembl to find a list bioactive molecules for a given chembl id and their IC50 values
1015
  - calls PDB to find the number of chains in a protein, proteins sequences and small molecules in the structure
1016
  - calls PDB to find PDB IDs that match a protein name.
1017
+ - Uses Bioactive molecules to predict IC50 values for novel molecules with a LightGBM model.
1018
+ - Uses Bioactive molecules to generate novel molecules using a fine-tuned GPT.
1019
  ''')
1020
 
1021
  with gr.Row():