MoDrAg2-OpenAI / modrag_protein_functions.py
cafierom's picture
Upload modrag_protein_functions.py
ca59a47 verified
from rdkit import Chem
from rdkit.Chem import AllChem, QED
from rdkit.Chem import Draw
from rdkit.Chem.Draw import MolsToGridImage
from rdkit import rdBase
from rdkit.Chem import rdMolAlign
import os, re
from rdkit import RDConfig
from PIL import Image
import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client
from tqdm.auto import tqdm
import requests, json
from rcsbapi.search import TextQuery
import itertools
import lightgbm as lgb
from lightgbm import LGBMRegressor
import deepchem as dc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import random
from finetune_gpt import *
from dockstring import load_target
from langchain_core.tools import tool
@tool
def uniprot_node(protein_names: list[str], human_flag: bool = False) -> (list[str], str):
'''
This tool takes in the user requested protein and searches UNIPROT for matches.
It returns a string scontaining the protein ID, gene name, organism, and protein name.
Args:
query_protein: the name of the protein to search for.
Returns:
total_ids: a list of UNIPROT IDs for the given protein names.
protein_string: a string containing the protein ID, gene name, organism, and protein name.
'''
print("UNIPROT tool")
print('===================================================')
total_ids = []
protein_string = ''
for protein_name in protein_names:
try:
url = f'https://rest.uniprot.org/uniprotkb/search?query={protein_name}&format=tsv'
response = requests.get(url).text
f = open(f"{protein_name}_uniprot_ids.tsv", "w")
f.write(response)
f.close()
prot_df_raw = pd.read_csv(f'{protein_name}_uniprot_ids.tsv', sep='\t')
if human_flag:
prot_df = prot_df_raw[prot_df_raw['Organism'] == "Homo sapiens (Human)"]
print(f"Found {len(prot_df)} Human proteins out of {len(prot_df_raw)} total proteins")
else:
prot_df = prot_df_raw
prot_ids = prot_df['Entry'].tolist()
genes = prot_df['Gene Names'].tolist()
organisms = prot_df['Organism'].tolist()
names = prot_df['Protein names'].tolist()
sub_ids = []
for id, gene, organism, name in zip(prot_ids, genes, organisms, names):
protein_string += f'Protein {protein_name}, ID: {id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
sub_ids.append(id)
protein_string += '==========================================================================================\n'
total_ids.append(sub_ids)
except:
protein_string += f'No proteins found for {protein_name}'
protein_string += '==========================================================================================\n'
total_ids.append([])
return total_ids, protein_string, None
def get_qed(smiles):
'''
Helper function to compute QED for a given molecule.
Args:
smiles: the input smiles string
Returns:
qed: the QED score of the molecule.
'''
mol = Chem.MolFromSmiles(smiles)
qed = Chem.QED.default(mol)
return qed
@tool
def listbioactives_node(up_ids_list: list[str]) -> (list[int], list[str], str):
'''
Accepts a UNIPROT ID and searches for bioactive molecules
Args:
up_ids_list: the UNIPROT IDs of the proteins to search for.
Returns:
total_bioacts_list: a list of the number of bioactive molecules for each protein
total_chembl_ids_list: a list of the ChEMBL IDs for each protein
bioact_string: a string containing the results of the search.
'''
print("List bioactives tool")
print('===================================================')
total_bioacts_list = []
total_chembl_ids_list = []
bioact_string = ''
for up_id in up_ids_list:
targets = new_client.target
bioact = new_client.activity
try:
target_info = targets.get(target_components__accession=up_id).only("target_chembl_id","organism", "pref_name", "target_type")
target_info = pd.DataFrame.from_records(target_info)
print(target_info)
if len(target_info) > 0:
print(f"Found info for Uniprot ID: {up_id}")
chembl_ids = target_info['target_chembl_id'].tolist()
chembl_ids = list(set(chembl_ids))
print(f"Found {len(chembl_ids)} unique ChEMBL IDs")
len_all_bioacts = []
for chembl_id in chembl_ids:
bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
"molecule_chembl_id",
"type",
"standard_units",
"relation",
"standard_value",
)
len_this_bioacts = len(bioact_chosen)
len_all_bioacts.append(len_this_bioacts)
bioact_string += f"For Uniprot {up_id}: length of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}\n"
bioact_string += f'================================================================================================\n'
total_chembl_ids_list.append(chembl_ids)
total_bioacts_list.append(len_all_bioacts)
except:
bioact_string += f'No bioactives found for Uniprot {up_id}\n'
bioact_string += f'================================================================================================\n'
total_chembl_ids_list.append([])
total_bioacts_list.append([])
return total_bioacts_list, bioact_string, None
@tool
def getbioactives_node(chembl_ids_list: list[str]) -> (list[str], str):
'''
Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID
Args:
chembl_id: the chembl ID to query
Returns:
bioactives_list: a list of the bioactive molecules for each chembl ID
bioactives_string: a string containing the results of the search.
bioactives_images: a list of images for each bioactive molecule.
'''
print("Get bioactives tool")
print('===================================================')
bioactives_list = []
bioactives_images = []
bioactives_string = ''
for chembl_id in chembl_ids_list:
try:
#check if f'{chembl_id}_bioactives.csv' exists
chembl_id = chembl_id.upper()
if os.path.exists(f'{chembl_id}_bioactives.csv'):
print(f'Found {chembl_id}_bioactives.csv')
total_bioact_df = pd.read_csv(f'{chembl_id}_bioactives.csv')
print(f"number of records: {len(total_bioact_df)}")
else:
compounds = new_client.molecule
bioact = new_client.activity
bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
"molecule_chembl_id",
"type",
"standard_units",
"relation",
"standard_value",
)
chembl_ids = []
ic50s = []
for record in bioact_chosen:
if record["standard_units"] == 'nM':
chembl_ids.append(record["molecule_chembl_id"])
ic50s.append(float(record["standard_value"]))
bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
bioact_df = pd.DataFrame.from_dict(bioact_dict)
bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
print(f"Number of records: {len(bioact_df)}")
print(bioact_df.shape)
compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
"molecule_chembl_id",
"molecule_structures"
)
cids_list = []
smiles_list = []
for record in compounds_provider:
cid = record['molecule_chembl_id']
cids_list.append(cid)
if record['molecule_structures']:
if record['molecule_structures']['canonical_smiles']:
smile = record['molecule_structures']['canonical_smiles']
else:
print("No canonical smiles")
smile = None
else:
print('no structures')
smile = None
smiles_list.append(smile)
new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
new_df = pd.DataFrame.from_dict(new_dict)
total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
print(f"number of records: {len(total_bioact_df)}")
total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
print(f"number of records after removing duplicates: {len(total_bioact_df)}")
total_bioact_df.dropna(axis=0, how='any', inplace=True)
total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
print(f"number of records after dropping Null values: {len(total_bioact_df)}")
total_bioact_df.sort_values(by=["IC50s"],inplace=True)
if len(total_bioact_df) > 0:
total_bioact_df.to_csv(f'{chembl_id}_bioactives.csv')
limit = 50
if len(total_bioact_df) > limit:
total_bioact_df = total_bioact_df.iloc[:limit]
bioact_tuple_list = []
bioactives_string += f'Results for top bioactivity (IC50 value) for molecules in ChEMBL ID: {chembl_id}. \n'
for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
bioactives_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
bioact_tuple_list.append((smile, ic50))
bioactives_string += f'=========================================================================================\n'
mols = [Chem.MolFromSmiles(smile) for smile in total_bioact_df['SMILES'].to_list()]
legends = [f'IC50: {ic50}' for ic50 in total_bioact_df['IC50s'].to_list()]
img = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200,200))
bioactives_images.append(img)
bioactives_list.append(bioact_tuple_list)
except:
bioactives_list.append([])
bioactives_string += f'No bioactives found for ChEMBL ID: {chembl_id}\n'
bioactives_string += f'=========================================================================================\n'
bioactives_images.append(None)
img = bioactives_images[0]
try:
img.save('current_image.png')
except:
pic = img.data
with open('current_image.png', 'wb') as f:
f.write(pic)
img = Image.open('current_image.png')
return bioactives_list, bioactives_string, img
@tool
def predict_node(smiles_list_in: list[str], chembl_id: str) -> (list[float],str):
'''
uses the current_bioactives.csv file from the get_bioactives node to fit the
Light GBM model and predict the IC50 for the current smiles.
Args:
smiles_list: the SMILES strings of the molecules to predict
chembl_id: the chembl ID to query
Returns:
preds: a list of predicted IC50 values for the input SMILES
preds_string: a string containing the predicted IC50 values for the input SMILES
'''
print("Predict Tool")
print('===================================================')
# if f'{chembl_id}_bioactives.csv' does not exist, call the bioactives node
if not os.path.exists(f'{chembl_id}_bioactives.csv'):
_, _, _ = getbioactives_node([chembl_id])
try:
chembl_id = chembl_id.upper()
df = pd.read_csv(f'{chembl_id}_bioactives.csv')
#if length of the dataframe is over 2000, take a random sample of 2000 points
if len(df) > 2000:
df = df.sample(n=2000, random_state=42)
y_raw = df["IC50s"].to_list()
smiles_list = df["SMILES"].to_list()
ions_to_clean = ["[Na+].",".[Na+]","[Cl-].",".[Cl-]","[K+].",".[K+]"]
Xa = []
y = []
for smile, value in zip(smiles_list, y_raw):
for ion in ions_to_clean:
smile = smile.replace(ion,"")
y.append(np.log10(value))
Xa.append(smile)
mols = [Chem.MolFromSmiles(smile) for smile in Xa]
print(f"Number of molecules: {len(mols)}")
featurizer=dc.feat.RDKitDescriptors()
featname="RDKitDescriptors"
f = featurizer.featurize(mols)
nan_indicies = np.isnan(f)
bad_rows = []
for i, row in enumerate(nan_indicies):
for item in row:
if item == True:
if i not in bad_rows:
print(f"Row {i} has a NaN.")
bad_rows.append(i)
print(f"Old dimensions are: {f.shape}.")
for j,i in enumerate(bad_rows):
k=i-j
f = np.delete(f,k,axis=0)
y = np.delete(y,k,axis=0)
Xa = np.delete(Xa,k,axis=0)
print(f"Deleting row {k} from arrays.")
print(f"New dimensions are: {f.shape}")
if f.shape[0] != len(y) or f.shape[0] != len(Xa):
raise ValueError("Number of rows in X and y do not match.")
X_train, X_test, y_train, y_test = train_test_split(f, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
model = LGBMRegressor(metric='rmse', max_depth = 50, verbose = -1, num_leaves = 31,
feature_fraction = 0.8, min_data_in_leaf = 20)
modelname = "LightGBM Regressor"
model.fit(X_train, y_train)
train_score = model.score(X_train,y_train)
print(f"score for training set: {train_score:.3f}")
valid_score = model.score(X_test, y_test)
print(f"score for validation set: {valid_score:.3f}")
except:
return [], 'Model training failed, unable to predict.', None
preds = []
preds_string = ''
for smiles in smiles_list_in:
print(f"in predict node, smiles: {smiles}")
try:
for ion in ions_to_clean:
smiles = smiles.replace(ion,"")
test_mol = Chem.MolFromSmiles(smiles)
test_feat = featurizer.featurize([test_mol])
test_feat = scaler.transform(test_feat)
prediction = model.predict(test_feat)
test_ic50 = 10**(prediction[0])
print(f"Predicted IC50 for {smiles}: {test_ic50}")
preds_string += f"The predicted IC50 value for {smiles} is : {test_ic50:.3f} nM.\n"
preds.append(test_ic50)
except:
preds.append(None)
preds_string += f"The prediction for {smiles} failed.\n"
preds_string += f"The Bioactive data was fitted with the LightGMB model, using RDKit descriptors. The training score \
was {train_score:.3f} and the testing score was {valid_score:.3f}. "
return preds, preds_string, None
@tool
def gpt_node(chembl_id: str) -> (list[str], str, Image.Image):
'''
Uses a Chembl dataset, previously stored in a CSV file by the get_bioactives node, to
to finetune a GPT model to generate novel molecules for the target protein.
Args:
chembl_id: the ChEMBL ID to query
returns:
smiles_list: a list of generated SMILES strings
gpt_string: a string containing the results of the GPT finetuning and generation.
img: an image containing the generated molecules.
'''
print("GPT node")
print('===================================================')
# if f'{chembl_id}_bioactives.csv' does not exist, call the bioactives node
chembl_id = chembl_id.upper()
if not os.path.exists(f'{chembl_id}_bioactives.csv'):
_, _, _ = getbioactives_node_func([chembl_id])
try:
df = pd.read_csv(f'{chembl_id}_bioactives.csv')
smiles_list, gpt_string, img = finetune_gpt(df, chembl_id)
except:
gpt_string = ''
smiles_list = []
img = None
return smiles_list, gpt_string, img
def get_protein_from_pdb(pdb_id):
'''
Helper function to get the protein information from the PDB database.
Args:
pdb_id: the PDB ID of the protein
Returns:
r.text: the PDB information as a string
'''
url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
r = requests.get(url)
return r.text
def one_to_three(one_seq):
'''
Converts a one-letter amino acid sequence to a three-letter sequence.
Args:
one_seq: the one-letter amino acid sequence
Returns:
three_seq: the three-letter amino acid sequence
'''
rev_aa_hash = {
'A': 'ALA',
'R': 'ARG',
'N': 'ASN',
'D': 'ASP',
'C': 'CYS',
'Q': 'GLN',
'E': 'GLU',
'G': 'GLY',
'H': 'HIS',
'I': 'ILE',
'L': 'LEU',
'K': 'LYS',
'M': 'MET',
'F': 'PHE',
'P': 'PRO',
'S': 'SER',
'T': 'THR',
'W': 'TRP',
'Y': 'TYR',
'V': 'VAL'
}
try:
three_seq = rev_aa_hash[one_seq]
except:
three_seq = 'X'
return three_seq
def three_to_one(three_seq):
'''
Converts a three-letter amino acid sequence to a one-letter sequence.
Args:
three_seq: the three-letter amino acid sequence
Returns:
one_seq: the one-letter amino acid sequence
'''
aa_hash = {
'ALA': 'A',
'ARG': 'R',
'ASN': 'N',
'ASP': 'D',
'CYS': 'C',
'GLN': 'Q',
'GLU': 'E',
'GLY': 'G',
'HIS': 'H',
'ILE': 'I',
'LEU': 'L',
'LYS': 'K',
'MET': 'M',
'PHE': 'F',
'PRO': 'P',
'SER': 'S',
'THR': 'T',
'TRP': 'W',
'TYR': 'Y',
'VAL': 'V'
}
one_seq = []
for residue in three_seq:
try:
one_seq.append(aa_hash[residue])
except:
one_seq.append('X')
return one_seq
@tool
def pdb_node(test_pdb_list: list[str]) -> (list[str], str):
'''
Accepts a PDB ID and queires the protein databank for the sequence of the protein, as well as other
information such as ligands.
Args:
test_pdb_list: the PDB IDs to query
Returns:
all_seqs: a list of the sequences for each PDB ID
total_pdb_string: a string containing the results of the PDB query.
(collects all ligands but does not return them currently)
'''
print(f"pdb toolS")
print('===================================================')
total_pdb_string = ''
all_seqs = []
all_ligands = []
for test_pdb in test_pdb_list:
try:
pdb_str = get_protein_from_pdb(test_pdb)
chains = {}
other_molecules = {}
#print(pdb_str.split('\n')[0])
for line in pdb_str.split('\n'):
parts = line.split()
try:
if parts[0] == 'SEQRES':
if parts[2] not in chains:
chains[parts[2]] = []
chains[parts[2]].extend(parts[4:])
if parts[0] == 'HETNAM':
j = 1
if parts[1].strip() in ['2','3','4','5','6','7','8','9']:
j = 2
print(parts[j])
if parts[j] not in other_molecules:
other_molecules[parts[j]] = []
other_molecules[parts[j]].extend(parts[2:])
except:
print('Blank line')
chains_ol = {}
for chain in chains:
chains_ol[chain] = three_to_one(chains[chain])
sub_seqs = []
sub_ligands = []
total_pdb_string += f"Chains in PDB ID {test_pdb}: {', '.join(chains.keys())} \n"
for chain in chains_ol:
total_pdb_string += f"Chain {chain}: {''.join(chains_ol[chain])} \n"
sub_seqs.append(''.join(chains_ol[chain]))
print(f"Chain {chain}: {''.join(chains_ol[chain])}")
total_pdb_string += f"Ligands in PDB ID {test_pdb}.\n"
for mol in other_molecules:
total_pdb_string += f"Molecule {mol}: {''.join(other_molecules[mol])} \n"
sub_ligands.append(''.join(other_molecules[mol]))
total_pdb_string += f'=========================================================================================\n'
all_seqs.append(sub_seqs)
all_ligands.append(sub_ligands)
except:
total_pdb_string += f'Failed to get data for PDB ID {test_pdb}\n'
total_pdb_string += f'=========================================================================================\n'
all_seqs.append([])
all_ligands.append([])
return all_seqs, total_pdb_string, None
@tool
def find_node(test_protein_list: list[str]) -> (list[str], str):
'''
Accepts a protein name and searches the protein databack for PDB IDs that match along with the entry titles.
Args:
test_protein_list: the protein names to query
Returns:
total_ids: a list of the PDB IDs for each protein name
pdb_string: a string containing the results of the PDB search.
'''
print(f"PDB search tool")
print('===================================================')
total_ids = []
pdb_string = ''
which_pdbs = 0
for test_protein in test_protein_list:
try:
query = TextQuery(value=test_protein)
results = query()
def pdb_gen():
for rid in results:
yield(rid)
take10 = itertools.islice(pdb_gen(), which_pdbs, which_pdbs+10, 1)
local_ids = []
pdb_string += f'10 PDBs that match the protein {test_protein} are: \n'
for pdb in take10:
data = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb}").json()
title = data['struct']['title']
pdb_string += f'PDB ID: {pdb}, with title: {title} \n'
local_ids.append(pdb)
total_ids.append(local_ids)
except:
pdb_string += f'Failed to get PDB IDs for protein {test_protein}\n'
total_ids.append([])
return total_ids, pdb_string, None
@tool
def docking_node(smiles_list: list[str], query_protein: str) -> (list[float], str):
'''
Docking tool: uses dockstring to dock the molecule into the protein
Args:
smiles_list: the SMILES strings of the molecules to dock
protein: the protein to dock into
Returns:
docking_scores: a list of docking scores for each molecule
docking_string: a string containing the results of the docking.
'''
print("docking tool")
print('===================================================')
cpuCount = os.cpu_count()
print(f"Number of CPUs: {cpuCount}")
print(f'query_protein: {query_protein}')
scores_list = []
scores_string = 'Docking below performed with AutoDock Vina on protein structures from the DUDE database.\n'
for query_smiles in smiles_list:
try:
query_smiles = query_smiles.replace('.[Na+]','').replace('.[Na+]','').replace('.[K+]','').replace('[K+].','').replace('.[Cl-]','').replace('[Cl-].','')
target = load_target(query_protein)
print("===============================================")
print(f"Docking molecule with {cpuCount} cpu cores.")
score, aux = target.dock(query_smiles, num_cpus = cpuCount)
scores_list.append(score)
mol = aux['ligand']
print(f"Docking score: {score}")
print("===============================================")
atoms_list = ""
template = mol
molH = Chem.AddHs(mol)
AllChem.ConstrainedEmbed(molH,template, useTethers=True)
xyz_string = f"{molH.GetNumAtoms()}\n\n"
for atom in molH.GetAtoms():
atoms_list += atom.GetSymbol()
pos = molH.GetConformer().GetAtomPosition(atom.GetIdx())
xyz_string += f"{atom.GetSymbol()} {pos[0]} {pos[1]} {pos[2]}\n"
scores_string += f"Docking score for molecule with SMILES: {query_smiles} is: {score} kcal/mol \n\n"
scores_string += f"pose XYZ structure for molecule with SMILES: {query_smiles} is: \n"
lines = xyz_string.split('\n')
for line in lines[2:]:
scores_string += f'{line}\n'
scores_string += f"=========================================================\n"
except:
print(f"Molecule {query_smiles} could not be docked!")
scores_string = 'Could not dock!'
scores_list.append(None)
return scores_list, scores_string, None
@tool
def target_node(search_descriptors: list[str]):
'''
Accepts a disease name and searches Open Targets for associated targets
Args:
search_descriptor (str): Disease name
Returns:
targets_list (list): List of targets
targets_string (str): String of targets
None
'''
base_url = "https://api.platform.opentargets.org/api/v4/graphql"
disease_query_string = """
query searchEntity($queryString: String!) {
search(queryString: $queryString){
total
hits {
id
entity
description
}
}
}
"""
target_query_string = """
query associatedTargets($efo_id: String!) {
disease(efoId: $efo_id) {
id
name
associatedTargets {
count
rows {
target {
id
approvedSymbol
}
score
}
}
}
}
"""
total_targets_list = []
total_targets_string = ''
for search_descriptor in search_descriptors:
variables = {"queryString": search_descriptor}
r = requests.post(base_url, json={"query": disease_query_string, "variables": variables})
disease_list = []
targets_list = []
if r.status_code == 200:
api_response = json.loads(r.text)
if len(api_response['data']['search']['hits']) > 0:
for hit in api_response['data']['search']['hits']:
if hit['entity'] == 'disease':
disease_list.append(hit['id'])
else:
print('Could not find results.')
if len(disease_list) > 0:
q = requests.post(base_url, json={"query": target_query_string, "variables": {"efo_id": disease_list[0]}})
if q.status_code == 200:
api_response = json.loads(q.text)
for target in api_response['data']['disease']['associatedTargets']['rows']:
targets_list.append(target['target']['approvedSymbol'])
targets_string = f'Possible targets for {search_descriptor} include: \n'
if len(targets_list) > 0:
for i, target in enumerate(targets_list):
targets_string += f'{i+1}. {target}\n'
else:
targets_string = f'No targets found for {search_descriptor}'
total_targets_list.append(targets_list)
total_targets_string += targets_string
return total_targets_list, total_targets_string, None
def getbioactives_node_func(chembl_ids_list: list[str]) -> (list[str], str):
'''
Accepts a Chembl ID and get all bioactives molecule SMILES and IC50s for that ID
Args:
chembl_id: the chembl ID to query
Returns:
bioactives_list: a list of the bioactive molecules for each chembl ID
bioactives_string: a string containing the results of the search.
bioactives_images: a list of images for each bioactive molecule.
'''
print("Get bioactives tool")
print('===================================================')
bioactives_list = []
bioactives_images = []
bioactives_string = ''
for chembl_id in chembl_ids_list:
try:
#check if f'{chembl_id}_bioactives.csv' exists
chembl_id = chembl_id.upper()
if os.path.exists(f'{chembl_id}_bioactives.csv'):
print(f'Found {chembl_id}_bioactives.csv')
total_bioact_df = pd.read_csv(f'{chembl_id}_bioactives.csv')
print(f"number of records: {len(total_bioact_df)}")
else:
compounds = new_client.molecule
bioact = new_client.activity
bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
"molecule_chembl_id",
"type",
"standard_units",
"relation",
"standard_value",
)
chembl_ids = []
ic50s = []
for record in bioact_chosen:
if record["standard_units"] == 'nM':
chembl_ids.append(record["molecule_chembl_id"])
ic50s.append(float(record["standard_value"]))
bioact_dict = {'chembl_ids' : chembl_ids, 'IC50s': ic50s}
bioact_df = pd.DataFrame.from_dict(bioact_dict)
bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
print(f"Number of records: {len(bioact_df)}")
print(bioact_df.shape)
compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
"molecule_chembl_id",
"molecule_structures"
)
cids_list = []
smiles_list = []
for record in compounds_provider:
cid = record['molecule_chembl_id']
cids_list.append(cid)
if record['molecule_structures']:
if record['molecule_structures']['canonical_smiles']:
smile = record['molecule_structures']['canonical_smiles']
else:
print("No canonical smiles")
smile = None
else:
print('no structures')
smile = None
smiles_list.append(smile)
new_dict = {'SMILES': smiles_list, 'chembl_ids_2': cids_list}
new_df = pd.DataFrame.from_dict(new_dict)
total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
print(f"number of records: {len(total_bioact_df)}")
total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep= "last")
print(f"number of records after removing duplicates: {len(total_bioact_df)}")
total_bioact_df.dropna(axis=0, how='any', inplace=True)
total_bioact_df.drop(["chembl_ids_2"],axis=1,inplace=True)
print(f"number of records after dropping Null values: {len(total_bioact_df)}")
total_bioact_df.sort_values(by=["IC50s"],inplace=True)
if len(total_bioact_df) > 0:
total_bioact_df.to_csv(f'{chembl_id}_bioactives.csv')
limit = 50
if len(total_bioact_df) > limit:
total_bioact_df = total_bioact_df.iloc[:limit]
bioact_tuple_list = []
bioactives_string += f'Results for top bioactivity (IC50 value) for molecules in ChEMBL ID: {chembl_id}. \n'
for smile, ic50 in zip(total_bioact_df['SMILES'], total_bioact_df['IC50s']):
bioactives_string += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
bioact_tuple_list.append((smile, ic50))
bioactives_string += f'=========================================================================================\n'
mols = [Chem.MolFromSmiles(smile) for smile in total_bioact_df['SMILES'].to_list()]
legends = [f'IC50: {ic50}' for ic50 in total_bioact_df['IC50s'].to_list()]
img = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200,200))
bioactives_images.append(img)
bioactives_list.append(bioact_tuple_list)
except:
bioactives_list.append([])
bioactives_string += f'No bioactives found for ChEMBL ID: {chembl_id}\n'
bioactives_string += f'=========================================================================================\n'
bioactives_images.append(None)
img = bioactives_images[0]
try:
img.save('current_image.png')
except:
pic = img.data
with open('current_image.png', 'wb') as f:
f.write(pic)
img = Image.open('current_image.png')
return bioactives_list, bioactives_string, img