cafierom committed on
Commit
706c748
·
verified ·
1 Parent(s): 14d79fa

Upload 4 files

Browse files
app.py CHANGED
@@ -660,6 +660,7 @@ full_tool_descriptions = {
660
  'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
661
  'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
662
  'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
 
663
  'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
664
  substitutes different fragments in that location. Returns a list of novel molecules and their\
665
  QED score (1 is most drug-like, 0 is least drug-like).',
 
660
  'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
661
  'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
662
  'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
663
+ 'structure_node' : 'Generates the 3D structure and chemical formula of the molecule based on the name or smiles string.',
664
  'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
665
  substitutes different fragments in that location. Returns a list of novel molecules and their\
666
  QED score (1 is most drug-like, 0 is least drug-like).',
input_parsing.py CHANGED
@@ -3,7 +3,7 @@ from gliner import GLiNER
3
  import re
4
  import numpy as np
5
  from rdkit import Chem
6
- from modrag_molecule_functions import name_node, smiles_node, related_node
7
  from modrag_task_graphs import get_actives_for_protein, get_predictions_for_protein, dock_from_names
8
  from modrag_protein_functions import uniprot_node, listbioactives_node, getbioactives_node, predict_node, gpt_node, pdb_node, find_node, docking_node, target_node
9
  from modrag_property_functions import substitution_node, lipinski_node, pharmfeature_node
@@ -38,7 +38,7 @@ Find the similarity in the pharmacophores between ibuprofen and aspirin.',
38
  'name_node': 'Find the name of this molecule c1cc(O)ccc1',
39
  'smiles_node': 'Finds SMILES strings for cyclohexane and aspirin',
40
  'related_node': 'Find molecules similar to c1cc(O)ccc1',
41
-
42
  # modrag_task_graphs.py
43
  'get_actives_for_protein': 'Find the bioactive molecules for the protein DNA gyrase.',
44
  'get_predictions_for_protein': 'Predict the IC50 value for c1cc(O)ccc1 in the protein DNA gyrase.',
@@ -251,6 +251,9 @@ def define_tool_hash(tool: str, proteins_list, names_list, diseases_list, smiles
251
  elif tool == 'related_node':
252
  tool_function_hash = {
253
  'related_node': [related_node, [smiles_list]]}
 
 
 
254
  elif tool == 'get_predictions_for_protein':
255
  tool_function_hash = {
256
  'get_predictions_for_protein': [get_predictions_for_protein, [smiles_list, proteins_list[0]]]}
@@ -325,6 +328,9 @@ def define_tool_reqs(tool: str, proteins_list, names_list, diseases_list, smiles
325
  elif tool == 'related_node':
326
  tool_function_reqs = {
327
  'related_node': [[smiles_list], ['SMILES strings']]}
 
 
 
328
  elif tool == 'get_predictions_for_protein':
329
  tool_function_reqs = {
330
  'get_predictions_for_protein': [[smiles_list, proteins_list], ['SMILES strings', 'protein names']]}
 
3
  import re
4
  import numpy as np
5
  from rdkit import Chem
6
+ from modrag_molecule_functions import name_node, smiles_node, related_node, structure_node
7
  from modrag_task_graphs import get_actives_for_protein, get_predictions_for_protein, dock_from_names
8
  from modrag_protein_functions import uniprot_node, listbioactives_node, getbioactives_node, predict_node, gpt_node, pdb_node, find_node, docking_node, target_node
9
  from modrag_property_functions import substitution_node, lipinski_node, pharmfeature_node
 
38
  'name_node': 'Find the name of this molecule c1cc(O)ccc1',
39
  'smiles_node': 'Finds SMILES strings for cyclohexane and aspirin',
40
  'related_node': 'Find molecules similar to c1cc(O)ccc1',
41
+ 'structure_node': 'Find the structure of the molecule with SMILES string c1cc(O)ccc1, or the name Aspirin.',
42
  # modrag_task_graphs.py
43
  'get_actives_for_protein': 'Find the bioactive molecules for the protein DNA gyrase.',
44
  'get_predictions_for_protein': 'Predict the IC50 value for c1cc(O)ccc1 in the protein DNA gyrase.',
 
251
  elif tool == 'related_node':
252
  tool_function_hash = {
253
  'related_node': [related_node, [smiles_list]]}
254
+ elif tool == 'structure_node':
255
+ tool_function_hash = {
256
+ 'structure_node': [structure_node, [smiles_list]]}
257
  elif tool == 'get_predictions_for_protein':
258
  tool_function_hash = {
259
  'get_predictions_for_protein': [get_predictions_for_protein, [smiles_list, proteins_list[0]]]}
 
328
  elif tool == 'related_node':
329
  tool_function_reqs = {
330
  'related_node': [[smiles_list], ['SMILES strings']]}
331
+ elif tool == 'structure_node':
332
+ tool_function_reqs = {
333
+ 'structure_node': [[smiles_list], ['SMILES strings']]}
334
  elif tool == 'get_predictions_for_protein':
335
  tool_function_reqs = {
336
  'get_predictions_for_protein': [[smiles_list, proteins_list], ['SMILES strings', 'protein names']]}
intake_function.py CHANGED
@@ -637,6 +637,7 @@ full_tool_descriptions = {
637
  'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
638
  'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
639
  'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
 
640
  'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
641
  substitutes different fragments in that location. Returns a list of novel molecules and their\
642
  QED score (1 is most drug-like, 0 is least drug-like).',
@@ -717,14 +718,14 @@ def websearch_node(query: str, embed_model, proxy_flag: bool = True) -> (list[st
717
  max_hits = len(scores)
718
  top_hits = []
719
  hits_idx = 0
720
- while hits_idx < 10:
721
  current_hit_idx = np.argmax(scores[0])
722
  current_score = scores[0][current_hit_idx].item()
723
  top_hits.append((titles[current_hit_idx], links[current_hit_idx], current_score))
724
  scores[0][current_hit_idx] = -1
725
  hits_idx += 1
726
 
727
- search_string = f'The top 10 hits for your query are:\n'
728
  i = 0
729
  for title, link, score in top_hits:
730
  search_string += f'{i}. {title}\nLink: {link}\nScore: {score:.3f}\n\n'
 
637
  'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
638
  'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
639
  'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
640
+ 'structure_node' : 'Generates the 3D structure and chemical formula of the molecule based on the name or smiles string.',
641
  'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
642
  substitutes different fragments in that location. Returns a list of novel molecules and their\
643
  QED score (1 is most drug-like, 0 is least drug-like).',
 
718
  max_hits = len(scores)
719
  top_hits = []
720
  hits_idx = 0
721
+ while hits_idx < max_hits:
722
  current_hit_idx = np.argmax(scores[0])
723
  current_score = scores[0][current_hit_idx].item()
724
  top_hits.append((titles[current_hit_idx], links[current_hit_idx], current_score))
725
  scores[0][current_hit_idx] = -1
726
  hits_idx += 1
727
 
728
+ search_string = f'The top {max_hits} hits for your query are:\n'
729
  i = 0
730
  for title, link, score in top_hits:
731
  search_string += f'{i}. {title}\nLink: {link}\nScore: {score:.3f}\n\n'
modrag_molecule_functions.py CHANGED
@@ -10,6 +10,7 @@ import os, re
10
  from rdkit import RDConfig
11
  import pubchempy as pcp
12
  from PIL import Image
 
13
 
14
  def name_node(smiles_list: list[str]) -> (list[str], str):
15
  '''
@@ -118,4 +119,45 @@ def related_node(smiles_list: list[str]) -> (list[list[str]], str, list):
118
  total_similar_list.append([])
119
  all_images.append(None)
120
 
121
- return total_similar_list, related_string, all_images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from rdkit import RDConfig
11
  import pubchempy as pcp
12
  from PIL import Image
13
+ from collections import Counter
14
 
15
  def name_node(smiles_list: list[str]) -> (list[str], str):
16
  '''
 
119
  total_similar_list.append([])
120
  all_images.append(None)
121
 
122
+ return total_similar_list, related_string, all_images
123
+
124
def _hill_formula(atom_counts) -> str:
    '''
    Formats element counts as a chemical formula in Hill order.
      Args:
        atom_counts: a mapping of element symbol to the number of atoms
      Returns:
        the formula string; carbon first, hydrogen second and the rest
        alphabetically when carbon is present, otherwise everything
        alphabetically. A count of 1 is left implicit (C6H6O, not C6H6O1).
    '''
    leading = [symbol for symbol in ('C', 'H') if symbol in atom_counts] if 'C' in atom_counts else []
    ordered = leading + sorted(symbol for symbol in atom_counts if symbol not in leading)
    return ''.join(
        f'{symbol}{atom_counts[symbol] if atom_counts[symbol] > 1 else ""}'
        for symbol in ordered
    )


def structure_node(smiles_list: list[str]) -> tuple[list[str], str, list]:
    '''
    Generates the 3D structure and chemical formula of each molecule from its
    smiles string.
      Args:
        smiles_list: the list of input smiles strings
      Returns:
        all_structures: a list with one entry per input smiles string; each entry
                        is the 3D structure as text, one "symbol x y z" line per
                        atom (hydrogens included), or an empty string when the
                        smiles string could not be parsed or embedded in 3D.
        output_string: a string of the chemical formulae, plus a note for every
                       smiles string that could not be processed.
        all_images: a one-element list holding a grid image of the molecules that
                    were embedded successfully, or None when there were none.
    '''
    print("structure tool")

    all_mols = []
    all_structures = []
    output_string = ''

    for smile in smiles_list:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals an invalid smiles string by returning None;
            # keep all_structures aligned with smiles_list and move on.
            output_string += f'For {smile}: could not parse the SMILES string.\n'
            all_structures.append('')
            continue

        molH = Chem.AddHs(mol)

        # The formula only needs the atom list, so report it even if the 3D
        # embedding below fails.
        atom_freqs = Counter(atom.GetSymbol() for atom in molH.GetAtoms())
        output_string += f'For {smile}: Formula is: {_hill_formula(atom_freqs)}\n'

        # EmbedMolecule returns -1 when no conformer could be generated; asking
        # for the conformer afterwards would raise.
        if AllChem.EmbedMolecule(molH) < 0:
            output_string += f'For {smile}: could not generate 3D coordinates.\n'
            all_structures.append('')
            continue
        AllChem.MMFFOptimizeMolecule(molH)

        # Fetch the conformer once instead of once per atom.
        conformer = molH.GetConformer()
        structure_lines = []
        for atom in molH.GetAtoms():
            pos = conformer.GetAtomPosition(atom.GetIdx())
            structure_lines.append(f'{atom.GetSymbol()} {pos[0]} {pos[1]} {pos[2]}\n')

        all_structures.append(''.join(structure_lines))
        all_mols.append(molH)

    # Only draw when there is something to draw; None mirrors the placeholder
    # related_node uses for a missing image.
    img = Draw.MolsToGridImage(all_mols, molsPerRow=3, subImgSize=(250, 250)) if all_mols else None

    return all_structures, output_string, [img]