Upload 4 files
Browse files
- app.py +1 -0
- input_parsing.py +8 -2
- intake_function.py +3 -2
- modrag_molecule_functions.py +43 -1
app.py
CHANGED
|
@@ -660,6 +660,7 @@ full_tool_descriptions = {
|
|
| 660 |
'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
|
| 661 |
'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
|
| 662 |
'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
|
|
|
|
| 663 |
'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
|
| 664 |
substitutes different fragments in that location. Returns a list of novel molecules and their\
|
| 665 |
QED score (1 is most drug-like, 0 is least drug-like).',
|
|
|
|
| 660 |
'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
|
| 661 |
'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
|
| 662 |
'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
|
| 663 |
+
'structure_node' : 'Generates the 3D structure and chemical formula of the molecule based on the name or smiles string.',
|
| 664 |
'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
|
| 665 |
substitutes different fragments in that location. Returns a list of novel molecules and their\
|
| 666 |
QED score (1 is most drug-like, 0 is least drug-like).',
|
input_parsing.py
CHANGED
|
@@ -3,7 +3,7 @@ from gliner import GLiNER
|
|
| 3 |
import re
|
| 4 |
import numpy as np
|
| 5 |
from rdkit import Chem
|
| 6 |
-
from modrag_molecule_functions import name_node, smiles_node, related_node
|
| 7 |
from modrag_task_graphs import get_actives_for_protein, get_predictions_for_protein, dock_from_names
|
| 8 |
from modrag_protein_functions import uniprot_node, listbioactives_node, getbioactives_node, predict_node, gpt_node, pdb_node, find_node, docking_node, target_node
|
| 9 |
from modrag_property_functions import substitution_node, lipinski_node, pharmfeature_node
|
|
@@ -38,7 +38,7 @@ Find the similarity in the pharmacophores between ibuprofen and aspirin.',
|
|
| 38 |
'name_node': 'Find the name of this molecule c1cc(O)ccc1',
|
| 39 |
'smiles_node': 'Finds SMILES strings for cyclohexane and aspirin',
|
| 40 |
'related_node': 'Find molecules similar to c1cc(O)ccc1',
|
| 41 |
-
|
| 42 |
# modrag_task_graphs.py
|
| 43 |
'get_actives_for_protein': 'Find the bioactive molecules for the protein DNA gyrase.',
|
| 44 |
'get_predictions_for_protein': 'Predict the IC50 value for c1cc(O)ccc1 in the protein DNA gyrase.',
|
|
@@ -251,6 +251,9 @@ def define_tool_hash(tool: str, proteins_list, names_list, diseases_list, smiles
|
|
| 251 |
elif tool == 'related_node':
|
| 252 |
tool_function_hash = {
|
| 253 |
'related_node': [related_node, [smiles_list]]}
|
|
|
|
|
|
|
|
|
|
| 254 |
elif tool == 'get_predictions_for_protein':
|
| 255 |
tool_function_hash = {
|
| 256 |
'get_predictions_for_protein': [get_predictions_for_protein, [smiles_list, proteins_list[0]]]}
|
|
@@ -325,6 +328,9 @@ def define_tool_reqs(tool: str, proteins_list, names_list, diseases_list, smiles
|
|
| 325 |
elif tool == 'related_node':
|
| 326 |
tool_function_reqs = {
|
| 327 |
'related_node': [[smiles_list], ['SMILES strings']]}
|
|
|
|
|
|
|
|
|
|
| 328 |
elif tool == 'get_predictions_for_protein':
|
| 329 |
tool_function_reqs = {
|
| 330 |
'get_predictions_for_protein': [[smiles_list, proteins_list], ['SMILES strings', 'protein names']]}
|
|
|
|
| 3 |
import re
|
| 4 |
import numpy as np
|
| 5 |
from rdkit import Chem
|
| 6 |
+
from modrag_molecule_functions import name_node, smiles_node, related_node, structure_node
|
| 7 |
from modrag_task_graphs import get_actives_for_protein, get_predictions_for_protein, dock_from_names
|
| 8 |
from modrag_protein_functions import uniprot_node, listbioactives_node, getbioactives_node, predict_node, gpt_node, pdb_node, find_node, docking_node, target_node
|
| 9 |
from modrag_property_functions import substitution_node, lipinski_node, pharmfeature_node
|
|
|
|
| 38 |
'name_node': 'Find the name of this molecule c1cc(O)ccc1',
|
| 39 |
'smiles_node': 'Finds SMILES strings for cyclohexane and aspirin',
|
| 40 |
'related_node': 'Find molecules similar to c1cc(O)ccc1',
|
| 41 |
+
'structure_node': 'Find the structure of the molecule with SMILES string c1cc(O)ccc1, or the name Aspirin.',
|
| 42 |
# modrag_task_graphs.py
|
| 43 |
'get_actives_for_protein': 'Find the bioactive molecules for the protein DNA gyrase.',
|
| 44 |
'get_predictions_for_protein': 'Predict the IC50 value for c1cc(O)ccc1 in the protein DNA gyrase.',
|
|
|
|
| 251 |
elif tool == 'related_node':
|
| 252 |
tool_function_hash = {
|
| 253 |
'related_node': [related_node, [smiles_list]]}
|
| 254 |
+
elif tool == 'structure_node':
|
| 255 |
+
tool_function_hash = {
|
| 256 |
+
'structure_node': [structure_node, [smiles_list]]}
|
| 257 |
elif tool == 'get_predictions_for_protein':
|
| 258 |
tool_function_hash = {
|
| 259 |
'get_predictions_for_protein': [get_predictions_for_protein, [smiles_list, proteins_list[0]]]}
|
|
|
|
| 328 |
elif tool == 'related_node':
|
| 329 |
tool_function_reqs = {
|
| 330 |
'related_node': [[smiles_list], ['SMILES strings']]}
|
| 331 |
+
elif tool == 'structure_node':
|
| 332 |
+
tool_function_reqs = {
|
| 333 |
+
'structure_node': [[smiles_list], ['SMILES strings']]}
|
| 334 |
elif tool == 'get_predictions_for_protein':
|
| 335 |
tool_function_reqs = {
|
| 336 |
'get_predictions_for_protein': [[smiles_list, proteins_list], ['SMILES strings', 'protein names']]}
|
intake_function.py
CHANGED
|
@@ -637,6 +637,7 @@ full_tool_descriptions = {
|
|
| 637 |
'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
|
| 638 |
'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
|
| 639 |
'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
|
|
|
|
| 640 |
'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
|
| 641 |
substitutes different fragments in that location. Returns a list of novel molecules and their\
|
| 642 |
QED score (1 is most drug-like, 0 is least drug-like).',
|
|
@@ -717,14 +718,14 @@ def websearch_node(query: str, embed_model, proxy_flag: bool = True) -> (list[st
|
|
| 717 |
max_hits = len(scores)
|
| 718 |
top_hits = []
|
| 719 |
hits_idx = 0
|
| 720 |
-
while hits_idx <
|
| 721 |
current_hit_idx = np.argmax(scores[0])
|
| 722 |
current_score = scores[0][current_hit_idx].item()
|
| 723 |
top_hits.append((titles[current_hit_idx], links[current_hit_idx], current_score))
|
| 724 |
scores[0][current_hit_idx] = -1
|
| 725 |
hits_idx += 1
|
| 726 |
|
| 727 |
-
search_string = f'The top
|
| 728 |
i = 0
|
| 729 |
for title, link, score in top_hits:
|
| 730 |
search_string += f'{i}. {title}\nLink: {link}\nScore: {score:.3f}\n\n'
|
|
|
|
| 637 |
'smiles_node' : 'Queries Pubchem for the smiles string of the molecule based on the name.',
|
| 638 |
'name_node' : 'Queries Pubchem for the name of the molecule based on the smiles string.',
|
| 639 |
'related_node' : 'Queries Pubchem for similar molecules based on the smiles string or name.',
|
| 640 |
+
'structure_node' : 'Generates the 3D structure and chemical formula of the molecule based on the name or smiles string.',
|
| 641 |
'substitution_node' : 'A simple substitution routine that looks for a substituent on a phenyl ring and\
|
| 642 |
substitutes different fragments in that location. Returns a list of novel molecules and their\
|
| 643 |
QED score (1 is most drug-like, 0 is least drug-like).',
|
|
|
|
| 718 |
max_hits = len(scores)
|
| 719 |
top_hits = []
|
| 720 |
hits_idx = 0
|
| 721 |
+
while hits_idx < max_hits:
|
| 722 |
current_hit_idx = np.argmax(scores[0])
|
| 723 |
current_score = scores[0][current_hit_idx].item()
|
| 724 |
top_hits.append((titles[current_hit_idx], links[current_hit_idx], current_score))
|
| 725 |
scores[0][current_hit_idx] = -1
|
| 726 |
hits_idx += 1
|
| 727 |
|
| 728 |
+
search_string = f'The top {max_hits} hits for your query are:\n'
|
| 729 |
i = 0
|
| 730 |
for title, link, score in top_hits:
|
| 731 |
search_string += f'{i}. {title}\nLink: {link}\nScore: {score:.3f}\n\n'
|
modrag_molecule_functions.py
CHANGED
|
@@ -10,6 +10,7 @@ import os, re
|
|
| 10 |
from rdkit import RDConfig
|
| 11 |
import pubchempy as pcp
|
| 12 |
from PIL import Image
|
|
|
|
| 13 |
|
| 14 |
def name_node(smiles_list: list[str]) -> (list[str], str):
|
| 15 |
'''
|
|
@@ -118,4 +119,45 @@ def related_node(smiles_list: list[str]) -> (list[list[str]], str, list):
|
|
| 118 |
total_similar_list.append([])
|
| 119 |
all_images.append(None)
|
| 120 |
|
| 121 |
-
return total_similar_list, related_string, all_images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from rdkit import RDConfig
|
| 11 |
import pubchempy as pcp
|
| 12 |
from PIL import Image
|
| 13 |
+
from collections import Counter
|
| 14 |
|
| 15 |
def name_node(smiles_list: list[str]) -> (list[str], str):
|
| 16 |
'''
|
|
|
|
| 119 |
total_similar_list.append([])
|
| 120 |
all_images.append(None)
|
| 121 |
|
| 122 |
+
return total_similar_list, related_string, all_images
|
| 123 |
+
|
| 124 |
+
def structure_node(smiles_list: list[str]) -> tuple[list[str], str, list]:
    '''
    Generates a 3D structure and chemical formula for each SMILES string.

    Each SMILES string is parsed, explicit hydrogens are added, a 3D
    conformer is embedded and then optimized with the MMFF force field.
    A SMILES string that cannot be parsed or embedded is reported in the
    output string and does not abort the remaining molecules.

      Args:
        smiles_list: the input SMILES strings
      Returns:
        all_structures: one string per input SMILES, holding one
            "symbol x y z" line per atom (hydrogens included); an empty
            string for a molecule that could not be processed
        output_string: one line per input SMILES giving its chemical
            formula in Hill notation, or the reason it was skipped
        all_images: a list holding a single grid image of all successfully
            embedded molecules; an empty list when there is nothing to draw
    '''
    print("structure tool")

    all_mols = []
    all_structures = []
    output_lines = []

    for smile in smiles_list:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals an unparsable string by returning None.
            all_structures.append('')
            output_lines.append(f'For {smile}: could not parse the SMILES string.\n')
            continue

        molH = Chem.AddHs(mol)
        # EmbedMolecule returns -1 when no conformer could be generated;
        # asking for the conformer afterwards would raise.
        if AllChem.EmbedMolecule(molH) < 0:
            all_structures.append('')
            output_lines.append(f'For {smile}: could not generate a 3D structure.\n')
            continue
        # Best-effort relaxation: the embedded coordinates are kept even if
        # the optimization does not converge.
        AllChem.MMFFOptimizeMolecule(molH)

        conformer = molH.GetConformer()
        all_symbols = []
        coord_lines = []
        for atom in molH.GetAtoms():
            symbol = atom.GetSymbol()
            all_symbols.append(symbol)
            pos = conformer.GetAtomPosition(atom.GetIdx())
            coord_lines.append(f'{symbol} {pos[0]} {pos[1]} {pos[2]}\n')

        output_lines.append(f'For {smile}: Formula is: {_hill_formula(all_symbols)}\n')
        all_structures.append(''.join(coord_lines))
        all_mols.append(molH)

    output_string = ''.join(output_lines)

    if not all_mols:
        # Nothing to draw (empty input or every molecule failed).
        return all_structures, output_string, []

    img = Draw.MolsToGridImage(all_mols, molsPerRow=3, subImgSize=(250, 250))

    return all_structures, output_string, [img]


def _hill_formula(symbols) -> str:
    '''
    Builds a Hill-notation formula from an iterable of element symbols.

    With carbon present the order is C, then H, then the remaining elements
    alphabetically; without carbon every element is listed alphabetically.
    A count of one is left implicit (C6H6O rather than C6H6O1).
    '''
    counts = Counter(symbols)
    if 'C' in counts:
        ordered = ['C']
        if 'H' in counts:
            ordered.append('H')
        ordered.extend(sorted(s for s in counts if s not in ('C', 'H')))
    else:
        ordered = sorted(counts)
    return ''.join(f'{s}{counts[s] if counts[s] > 1 else ""}' for s in ordered)
|