Spaces:

cafierom
/

MoDrAg2-OpenAI

Sleeping

App Files Files Community

MoDrAg2-OpenAI / modrag_protein_functions.py

cafierom

Upload modrag_protein_functions.py

ca59a47 verified about 2 months ago

raw

history blame contribute delete

31.7 kB

	from rdkit import Chem
	from rdkit.Chem import AllChem, QED
	from rdkit.Chem import Draw
	from rdkit.Chem.Draw import MolsToGridImage
	from rdkit import rdBase
	from rdkit.Chem import rdMolAlign
	import os, re
	from rdkit import RDConfig
	from PIL import Image

	import numpy as np
	import pandas as pd
	from chembl_webresource_client.new_client import new_client
	from tqdm.auto import tqdm
	import requests, json
	from rcsbapi.search import TextQuery
	import itertools

	import lightgbm as lgb
	from lightgbm import LGBMRegressor
	import deepchem as dc
	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.preprocessing import StandardScaler
	import tensorflow as tf
	import random
	from finetune_gpt import *
	from dockstring import load_target
	from langchain_core.tools import tool


	@tool
	def uniprot_node(protein_names: list[str], human_flag: bool = False) -> tuple[list[list[str]], str, None]:
	    '''
	    This tool takes in the user requested proteins and searches UNIPROT for matches.
	    It returns a string containing the protein ID, gene name, organism, and protein name
	    of every match. The raw TSV response for each name is also written to
	    '<protein_name>_uniprot_ids.tsv' in the working directory.
	    Args:
	    protein_names: the names of the proteins to search for.
	    human_flag: if True, keep only matches whose organism is "Homo sapiens (Human)".

	    Returns:
	    total_ids: one list of UNIPROT IDs per requested protein name (empty when the search failed).
	    protein_string: a string containing the protein ID, gene name, organism, and protein name.
	    None: the third element is always None for this tool.

	    '''
	    print("UNIPROT tool")
	    print('===================================================')

	    separator = '==========================================================================================\n'
	    total_ids = []
	    protein_string = ''

	    for protein_name in protein_names:
	        try:
	            # Pass the query through `params` so requests URL-encodes names that
	            # contain spaces, '&', '#', etc.
	            response = requests.get(
	                'https://rest.uniprot.org/uniprotkb/search',
	                params={'query': protein_name, 'format': 'tsv'},
	                timeout=60,
	            )
	            response.raise_for_status()

	            tsv_path = f"{protein_name}_uniprot_ids.tsv"
	            with open(tsv_path, "w", encoding="utf-8") as tsv_file:
	                tsv_file.write(response.text)

	            prot_df_raw = pd.read_csv(tsv_path, sep='\t')
	            if human_flag:
	                prot_df = prot_df_raw[prot_df_raw['Organism'] == "Homo sapiens (Human)"]
	                print(f"Found {len(prot_df)} Human proteins out of {len(prot_df_raw)} total proteins")
	            else:
	                prot_df = prot_df_raw

	            prot_ids = prot_df['Entry'].tolist()
	            genes = prot_df['Gene Names'].tolist()
	            organisms = prot_df['Organism'].tolist()
	            names = prot_df['Protein names'].tolist()

	            report = ''
	            for prot_id, gene, organism, name in zip(prot_ids, genes, organisms, names):
	                report += f'Protein {protein_name}, ID: {prot_id}, Gene: {gene}, Organism: {organism}, Name: {name}\n'
	        except Exception as exc:
	            # Best effort per protein: report the failure and keep the output
	            # lists aligned with the input list.
	            print(f"UNIPROT search failed for {protein_name}: {exc}")
	            protein_string += f'No proteins found for {protein_name}\n'
	            protein_string += separator
	            total_ids.append([])
	        else:
	            protein_string += report
	            protein_string += separator
	            total_ids.append(prot_ids)

	    return total_ids, protein_string, None

	def get_qed(smiles):
	    '''
	    Helper that scores a single molecule with RDKit's default QED.
	    Args:
	    smiles: SMILES string of the molecule to score
	    Returns:
	    the value returned by Chem.QED.default for the molecule parsed from `smiles`.
	    '''
	    return Chem.QED.default(Chem.MolFromSmiles(smiles))

	@tool
	def listbioactives_node(up_ids_list: list[str]) -> tuple[list[list[int]], str, None]:
	    '''
	    Accepts UNIPROT IDs and, for each one, looks up the matching ChEMBL targets and
	    counts the IC50 bioactivity records (relation "=") stored for every target.
	    Args:
	    up_ids_list: the UNIPROT IDs of the proteins to search for.
	    Returns:
	    total_bioacts_list: for each UNIPROT ID, a list with the number of bioactivity records per ChEMBL ID
	    (empty when nothing was found or the lookup failed).
	    bioact_string: a string containing the results of the search, including the ChEMBL IDs.
	    None: the third element is always None for this tool.
	    '''
	    print("List bioactives tool")
	    print('===================================================')

	    separator = '================================================================================================\n'
	    targets = new_client.target
	    bioact = new_client.activity

	    total_bioacts_list = []
	    bioact_string = ''

	    for up_id in up_ids_list:
	        len_all_bioacts = []
	        report = ''
	        try:
	            target_records = targets.get(target_components__accession=up_id).only(
	                "target_chembl_id", "organism", "pref_name", "target_type")
	            target_info = pd.DataFrame.from_records(target_records)
	            print(target_info)

	            if len(target_info) > 0:
	                print(f"Found info for Uniprot ID: {up_id}")
	                # sorted() makes the report order deterministic (a bare set is not).
	                chembl_ids = sorted(set(target_info['target_chembl_id'].tolist()))
	                print(f"Found {len(chembl_ids)} unique ChEMBL IDs")

	                for chembl_id in chembl_ids:
	                    bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
	                        "molecule_chembl_id",
	                        "type",
	                        "standard_units",
	                        "relation",
	                        "standard_value",
	                    )
	                    len_this_bioacts = len(bioact_chosen)
	                    len_all_bioacts.append(len_this_bioacts)
	                    report += f"For Uniprot {up_id}: length of Bioactivities for ChEMBL ID {chembl_id}: {len_this_bioacts}\n"
	        except Exception as exc:
	            print(f"ChEMBL lookup failed for Uniprot {up_id}: {exc}")
	            len_all_bioacts = []
	            report = ''

	        # An empty report means either the lookup failed or no target matched;
	        # always add one entry per input ID so outputs stay aligned with inputs.
	        if not report:
	            report = f'No bioactives found for Uniprot {up_id}\n'
	        bioact_string += report
	        bioact_string += separator
	        total_bioacts_list.append(len_all_bioacts)

	    return total_bioacts_list, bioact_string, None

	@tool
	def getbioactives_node(chembl_ids_list: list[str]) -> tuple[list, str, object]:
	    '''
	    Accepts ChEMBL IDs and gets the bioactive molecule SMILES and IC50s for each ID
	    Args:
	    chembl_ids_list: the ChEMBL IDs to query
	    Returns:
	    bioactives_list: for each ChEMBL ID, a list of (SMILES, IC50) tuples
	    bioactives_string: a string containing the results of the search.
	    img: the grid image of the molecules for the first ChEMBL ID.
	    '''
	    # The implementation lives in the undecorated getbioactives_node_func so that
	    # other nodes can call it as a plain function; delegating here keeps the tool
	    # and the helper from drifting apart.
	    return getbioactives_node_func(chembl_ids_list)

	@tool
	def predict_node(smiles_list_in: list[str], chembl_id: str) -> tuple[list, str, None]:
	    '''
	    uses the <CHEMBL_ID>_bioactives.csv file from the get_bioactives node to fit the
	    Light GBM model and predict the IC50 for the current smiles. If the file does not
	    exist yet, the bioactives are fetched first.
	    Args:
	    smiles_list_in: the SMILES strings of the molecules to predict
	    chembl_id: the chembl ID to query
	    Returns:
	    preds: a list of predicted IC50 values (nM) for the input SMILES; None where a prediction failed
	    preds_string: a string containing the predicted IC50 values for the input SMILES
	    None: the third element is always None for this tool.
	    '''
	    print("Predict Tool")
	    print('===================================================')

	    ions_to_clean = ["[Na+].", ".[Na+]", "[Cl-].", ".[Cl-]", "[K+].", ".[K+]"]

	    try:
	        # Upper-case before looking for the file: the CSV is saved under the
	        # upper-cased ID.
	        chembl_id = chembl_id.upper()
	        csv_path = f'{chembl_id}_bioactives.csv'
	        if not os.path.exists(csv_path):
	            # Call the plain function, not the @tool-wrapped node.
	            getbioactives_node_func([chembl_id])

	        df = pd.read_csv(csv_path)
	        # if length of the dataframe is over 2000, take a random sample of 2000 points
	        if len(df) > 2000:
	            df = df.sample(n=2000, random_state=42)

	        # log10 is undefined for non-positive IC50s; such rows would poison the fit.
	        df = df[df["IC50s"] > 0]
	        y = np.log10(df["IC50s"].to_numpy(dtype=float))

	        cleaned_smiles = []
	        for smile in df["SMILES"].to_list():
	            for ion in ions_to_clean:
	                smile = smile.replace(ion, "")
	            cleaned_smiles.append(smile)

	        mols = [Chem.MolFromSmiles(smile) for smile in cleaned_smiles]
	        print(f"Number of molecules: {len(mols)}")

	        featurizer = dc.feat.RDKitDescriptors()
	        f = featurizer.featurize(mols)
	        print(f"Old dimensions are: {f.shape}.")
	        if f.shape[0] != len(y):
	            raise ValueError("Number of rows in X and y do not match.")

	        # Drop every molecule whose descriptor vector contains a NaN.
	        nan_rows = np.isnan(f).any(axis=1)
	        for i in np.flatnonzero(nan_rows):
	            print(f"Row {i} has a NaN.")
	        f = f[~nan_rows]
	        y = y[~nan_rows]
	        print(f"New dimensions are: {f.shape}")

	        X_train, X_test, y_train, y_test = train_test_split(f, y, test_size=0.2, random_state=42)
	        scaler = StandardScaler()
	        X_train = scaler.fit_transform(X_train)
	        X_test = scaler.transform(X_test)

	        model = LGBMRegressor(metric='rmse', max_depth=50, verbose=-1, num_leaves=31,
	                              feature_fraction=0.8, min_data_in_leaf=20)
	        model.fit(X_train, y_train)

	        train_score = model.score(X_train, y_train)
	        print(f"score for training set: {train_score:.3f}")

	        valid_score = model.score(X_test, y_test)
	        print(f"score for validation set: {valid_score:.3f}")
	    except Exception as exc:
	        print(f"Model training failed: {exc}")
	        return [], 'Model training failed, unable to predict.', None

	    preds = []
	    preds_string = ''

	    for smiles in smiles_list_in:
	        print(f"in predict node, smiles: {smiles}")
	        try:
	            for ion in ions_to_clean:
	                smiles = smiles.replace(ion, "")
	            test_mol = Chem.MolFromSmiles(smiles)
	            test_feat = featurizer.featurize([test_mol])
	            test_feat = scaler.transform(test_feat)
	            prediction = model.predict(test_feat)
	            # The model was trained on log10(IC50); convert back to nM.
	            test_ic50 = 10 ** (prediction[0])
	            print(f"Predicted IC50 for {smiles}: {test_ic50}")
	            preds_string += f"The predicted IC50 value for {smiles} is : {test_ic50:.3f} nM.\n"
	            preds.append(test_ic50)
	        except Exception as exc:
	            print(f"Prediction failed for {smiles}: {exc}")
	            preds.append(None)
	            preds_string += f"The prediction for {smiles} failed.\n"

	    # Adjacent literals instead of a backslash continuation, which embedded the
	    # source indentation in the message.
	    preds_string += (
	        "The Bioactive data was fitted with the LightGBM model, using RDKit descriptors. "
	        f"The training score was {train_score:.3f} and the testing score was {valid_score:.3f}. "
	    )
	    return preds, preds_string, None

	@tool
	def gpt_node(chembl_id: str) -> tuple[list[str], str, Image.Image]:
	    '''
	    Uses a Chembl dataset, previously stored in a CSV file by the get_bioactives node, to
	    finetune a GPT model to generate novel molecules for the target protein. If the CSV
	    file does not exist yet, the bioactives are fetched first.

	    Args:
	    chembl_id: the ChEMBL ID to query
	    returns:
	    smiles_list: a list of generated SMILES strings (empty on failure)
	    gpt_string: a string containing the results of the GPT finetuning and generation (empty on failure).
	    img: an image containing the generated molecules (None on failure).
	    '''
	    print("GPT node")
	    print('===================================================')

	    try:
	        chembl_id = chembl_id.upper()
	        csv_path = f'{chembl_id}_bioactives.csv'
	        # if the bioactives CSV does not exist, call the bioactives node
	        if not os.path.exists(csv_path):
	            getbioactives_node_func([chembl_id])

	        df = pd.read_csv(csv_path)
	        smiles_list, gpt_string, img = finetune_gpt(df, chembl_id)
	    except Exception as exc:
	        # Keep the original failure contract (empty results) but do not hide
	        # the reason from the logs.
	        print(f"GPT node failed for {chembl_id}: {exc}")
	        smiles_list = []
	        gpt_string = ''
	        img = None

	    return smiles_list, gpt_string, img

	def get_protein_from_pdb(pdb_id):
	    '''
	    Helper that downloads the PDB-format file for an entry from files.rcsb.org.
	    Args:
	    pdb_id: the PDB ID of the entry to download
	    Returns:
	    the body of the HTTP response as text (the response status is not checked).
	    '''
	    return requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb").text

	# One-letter -> three-letter codes for the 20 standard amino acids.
	_ONE_TO_THREE = {
	    'A': 'ALA',
	    'R': 'ARG',
	    'N': 'ASN',
	    'D': 'ASP',
	    'C': 'CYS',
	    'Q': 'GLN',
	    'E': 'GLU',
	    'G': 'GLY',
	    'H': 'HIS',
	    'I': 'ILE',
	    'L': 'LEU',
	    'K': 'LYS',
	    'M': 'MET',
	    'F': 'PHE',
	    'P': 'PRO',
	    'S': 'SER',
	    'T': 'THR',
	    'W': 'TRP',
	    'Y': 'TYR',
	    'V': 'VAL',
	}


	def one_to_three(one_seq):
	    '''
	    Converts a single one-letter amino acid code to its three-letter code.
	    The lookup is case-sensitive and works on one residue at a time: anything that
	    is not exactly one of the 20 upper-case standard codes (including multi-letter
	    strings) maps to 'X'.
	    Args:
	    one_seq: the one-letter amino acid code
	    Returns:
	    three_seq: the three-letter amino acid code, or 'X' if the code is not recognised
	    '''
	    try:
	        return _ONE_TO_THREE[one_seq]
	    except (KeyError, TypeError):
	        # KeyError: unknown code; TypeError: unhashable input such as a list.
	        return 'X'

	# Three-letter -> one-letter codes for the 20 standard amino acids.
	_THREE_TO_ONE = {
	    'ALA': 'A',
	    'ARG': 'R',
	    'ASN': 'N',
	    'ASP': 'D',
	    'CYS': 'C',
	    'GLN': 'Q',
	    'GLU': 'E',
	    'GLY': 'G',
	    'HIS': 'H',
	    'ILE': 'I',
	    'LEU': 'L',
	    'LYS': 'K',
	    'MET': 'M',
	    'PHE': 'F',
	    'PRO': 'P',
	    'SER': 'S',
	    'THR': 'T',
	    'TRP': 'W',
	    'TYR': 'Y',
	    'VAL': 'V',
	}


	def three_to_one(three_seq):
	    '''
	    Converts a sequence of three-letter amino acid codes to one-letter codes.
	    The lookup is case-sensitive; any residue that is not one of the 20 upper-case
	    standard codes maps to 'X'.
	    Args:
	    three_seq: an iterable of three-letter amino acid codes
	    Returns:
	    one_seq: a list with one one-letter code per input residue (not a joined string)
	    '''
	    one_seq = []
	    for residue in three_seq:
	        try:
	            one_seq.append(_THREE_TO_ONE[residue])
	        except (KeyError, TypeError):
	            # KeyError: unknown residue; TypeError: unhashable item.
	            one_seq.append('X')
	    return one_seq

	def _parse_pdb_records(pdb_str):
	    # Collect SEQRES residues per chain and HETNAM name fragments per het ID from PDB-format text.
	    chains = {}
	    other_molecules = {}

	    for line in pdb_str.split('\n'):
	        record = line[:6].strip()
	        if record == 'SEQRES':
	            parts = line.split()
	            # parts: SEQRES, serial number, chain ID, residue count, residues...
	            if len(parts) > 2:
	                chains.setdefault(parts[2], []).extend(parts[4:])
	        elif record == 'HETNAM':
	            # HETNAM is a fixed-column record: columns 12-14 hold the het ID and
	            # columns 16-70 the (possibly continued) name. Reading by column
	            # avoids mistaking a continuation number for the het ID.
	            het_id = line[11:14].strip()
	            name_part = line[15:70].strip()
	            if het_id:
	                fragments = other_molecules.setdefault(het_id, [])
	                if name_part:
	                    fragments.append(name_part)

	    return chains, other_molecules


	@tool
	def pdb_node(test_pdb_list: list[str]) -> tuple[list[list[str]], str, None]:
	    '''
	    Accepts PDB IDs and queries the protein databank for the sequence of each protein, as well as other
	    information such as ligands.
	    Args:
	    test_pdb_list: the PDB IDs to query
	    Returns:
	    all_seqs: for each PDB ID, a list with the one-letter sequence of every chain (empty on failure)
	    total_pdb_string: a string containing the results of the PDB query, including the ligand names.
	    None: the third element is always None for this tool.
	    '''

	    print(f"pdb toolS")
	    print('===================================================')

	    separator = '=========================================================================================\n'
	    total_pdb_string = ''
	    all_seqs = []

	    for test_pdb in test_pdb_list:
	        try:
	            chains, other_molecules = _parse_pdb_records(get_protein_from_pdb(test_pdb))
	            # A response without any SEQRES/HETNAM record (e.g. an error page for an
	            # unknown ID) is a failed lookup, not an empty structure.
	            if not chains and not other_molecules:
	                raise LookupError(f"no SEQRES or HETNAM records for {test_pdb}")

	            sub_seqs = []
	            report = f"Chains in PDB ID {test_pdb}: {', '.join(chains.keys())} \n"
	            for chain, residues in chains.items():
	                sequence = ''.join(three_to_one(residues))
	                report += f"Chain {chain}: {sequence} \n"
	                sub_seqs.append(sequence)
	                print(f"Chain {chain}: {sequence}")

	            report += f"Ligands in PDB ID {test_pdb}.\n"
	            for het_id, fragments in other_molecules.items():
	                report += f"Molecule {het_id}: {' '.join(fragments)} \n"
	            report += separator
	        except Exception as exc:
	            print(f"PDB query failed for {test_pdb}: {exc}")
	            total_pdb_string += f'Failed to get data for PDB ID {test_pdb}\n'
	            total_pdb_string += separator
	            all_seqs.append([])
	        else:
	            total_pdb_string += report
	            all_seqs.append(sub_seqs)

	    return all_seqs, total_pdb_string, None

	@tool
	def find_node(test_protein_list: list[str]) -> tuple[list[list[str]], str, None]:
	    '''
	    Accepts protein names and searches the protein databank for PDB IDs that match, along with the entry titles.
	    At most the first 10 search results are reported for each protein name.
	    Args:
	    test_protein_list: the protein names to query
	    Returns:
	    total_ids: for each protein name, a list of the matching PDB IDs (empty on failure)
	    pdb_string: a string containing the results of the PDB search.
	    None: the third element is always None for this tool.
	    '''

	    print(f"PDB search tool")
	    print('===================================================')

	    max_hits = 10
	    total_ids = []
	    pdb_string = ''

	    for test_protein in test_protein_list:
	        try:
	            results = TextQuery(value=test_protein)()

	            local_ids = []
	            report = f'10 PDBs that match the protein {test_protein} are: \n'
	            # islice stops after max_hits without walking the full result set.
	            for pdb in itertools.islice(results, max_hits):
	                data = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb}", timeout=60).json()
	                title = data['struct']['title']
	                report += f'PDB ID: {pdb}, with title: {title} \n'
	                local_ids.append(pdb)
	        except Exception as exc:
	            print(f"PDB search failed for {test_protein}: {exc}")
	            pdb_string += f'Failed to get PDB IDs for protein {test_protein}\n'
	            total_ids.append([])
	        else:
	            pdb_string += report
	            total_ids.append(local_ids)

	    return total_ids, pdb_string, None

	@tool
	def docking_node(smiles_list: list[str], query_protein: str) -> tuple[list, str, None]:
	    '''
	    Docking tool: uses dockstring to dock each molecule into the protein
	    Args:
	    smiles_list: the SMILES strings of the molecules to dock
	    query_protein: the protein to dock into (passed to dockstring's load_target)
	    Returns:
	    scores_list: one docking score per molecule, None where docking failed
	    scores_string: a string containing the docking scores and pose coordinates.
	    None: the third element is always None for this tool.
	    '''
	    print("docking tool")
	    print('===================================================')
	    cpuCount = os.cpu_count()
	    print(f"Number of CPUs: {cpuCount}")

	    print(f'query_protein: {query_protein}')

	    # Counter-ion fragments removed from the SMILES before docking.
	    ions_to_clean = ('.[Na+]', '[Na+].', '.[K+]', '[K+].', '.[Cl-]', '[Cl-].')
	    separator = "=========================================================\n"

	    scores_list = []
	    scores_string = 'Docking below performed with AutoDock Vina on protein structures from the DUDE database.\n'

	    for query_smiles in smiles_list:
	        try:
	            for ion in ions_to_clean:
	                query_smiles = query_smiles.replace(ion, '')

	            target = load_target(query_protein)
	            print("===============================================")
	            print(f"Docking molecule with {cpuCount} cpu cores.")
	            score, aux = target.dock(query_smiles, num_cpus=cpuCount)
	            mol = aux['ligand']
	            print(f"Docking score: {score}")
	            print("===============================================")

	            # Add hydrogens and embed them onto the docked pose so the reported
	            # coordinates include every atom.
	            molH = Chem.AddHs(mol)
	            AllChem.ConstrainedEmbed(molH, mol, useTethers=True)
	            conformer = molH.GetConformer()
	            pose_lines = ''
	            for atom in molH.GetAtoms():
	                pos = conformer.GetAtomPosition(atom.GetIdx())
	                pose_lines += f"{atom.GetSymbol()} {pos[0]} {pos[1]} {pos[2]}\n"
	        except Exception as exc:
	            print(f"Molecule {query_smiles} could not be docked! ({exc})")
	            # Append rather than assign, so earlier results are not lost.
	            scores_string += f"Could not dock molecule with SMILES: {query_smiles}\n"
	            scores_string += separator
	            scores_list.append(None)
	        else:
	            # Record the score only once the whole molecule succeeded, so each
	            # input contributes exactly one entry.
	            scores_list.append(score)
	            scores_string += f"Docking score for molecule with SMILES: {query_smiles} is: {score} kcal/mol \n\n"
	            scores_string += f"pose XYZ structure for molecule with SMILES: {query_smiles} is: \n"
	            scores_string += pose_lines
	            scores_string += '\n'
	            scores_string += separator

	    return scores_list, scores_string, None

	@tool
	def target_node(search_descriptors: list[str]) -> tuple[list[list[str]], str, None]:
	    '''
	    Accepts disease names and searches Open Targets for associated targets.
	    For each name, the first search hit whose entity is 'disease' is used to look up
	    the associated targets.

	    Args:
	    search_descriptors (list): Disease names

	    Returns:
	    total_targets_list (list): For each disease name, a list of approved target symbols (empty if none found)
	    total_targets_string (str): String of targets
	    None
	    '''
	    base_url = "https://api.platform.opentargets.org/api/v4/graphql"

	    disease_query_string = """
	    query searchEntity($queryString: String!) {
	      search(queryString: $queryString){
	        total
	        hits {
	          id
	          entity
	          description
	        }
	      }
	    }
	    """

	    target_query_string = """
	    query associatedTargets($efo_id: String!) {
	      disease(efoId: $efo_id) {
	        id
	        name
	        associatedTargets {
	          count
	          rows {
	            target {
	              id
	              approvedSymbol
	            }
	            score
	          }
	        }
	      }
	    }
	    """
	    total_targets_list = []
	    total_targets_string = ''

	    for search_descriptor in search_descriptors:
	        targets_list = []
	        try:
	            r = requests.post(
	                base_url,
	                json={"query": disease_query_string, "variables": {"queryString": search_descriptor}},
	                timeout=60,
	            )

	            disease_list = []
	            if r.status_code == 200:
	                hits = json.loads(r.text)['data']['search']['hits']
	                disease_list = [hit['id'] for hit in hits if hit['entity'] == 'disease']

	            if not disease_list:
	                print('Could not find results.')
	            else:
	                q = requests.post(
	                    base_url,
	                    json={"query": target_query_string, "variables": {"efo_id": disease_list[0]}},
	                    timeout=60,
	                )
	                if q.status_code == 200:
	                    rows = json.loads(q.text)['data']['disease']['associatedTargets']['rows']
	                    targets_list = [row['target']['approvedSymbol'] for row in rows]
	        except Exception as exc:
	            # Network errors or an unexpected response shape should not abort the
	            # remaining descriptors; report this one as having no targets.
	            print(f"Open Targets query failed for {search_descriptor}: {exc}")
	            targets_list = []

	        if targets_list:
	            targets_string = f'Possible targets for {search_descriptor} include: \n'
	            for i, target in enumerate(targets_list):
	                targets_string += f'{i+1}. {target}\n'
	        else:
	            # Trailing newline keeps consecutive reports from running together.
	            targets_string = f'No targets found for {search_descriptor}\n'

	        total_targets_list.append(targets_list)
	        total_targets_string += targets_string

	    return total_targets_list, total_targets_string, None

	def _fetch_bioactives_df(chembl_id):
	    # Download the IC50 (nM) records and SMILES for one ChEMBL target as a cleaned, IC50-sorted DataFrame.
	    compounds = new_client.molecule
	    bioact = new_client.activity

	    bioact_chosen = bioact.filter(target_chembl_id=chembl_id, type="IC50", relation="=").only(
	        "molecule_chembl_id",
	        "type",
	        "standard_units",
	        "relation",
	        "standard_value",
	    )

	    chembl_ids = []
	    ic50s = []
	    for record in bioact_chosen:
	        # Skip records without a value instead of letting float(None) discard
	        # the whole target.
	        if record["standard_units"] != 'nM' or record["standard_value"] is None:
	            continue
	        chembl_ids.append(record["molecule_chembl_id"])
	        ic50s.append(float(record["standard_value"]))

	    bioact_df = pd.DataFrame({'chembl_ids': chembl_ids, 'IC50s': ic50s})
	    # drop_duplicates returns a new frame; the result must be kept.
	    bioact_df = bioact_df.drop_duplicates(subset=["chembl_ids"], keep="last")
	    print(f"Number of records: {len(bioact_df)}")
	    print(bioact_df.shape)

	    if len(bioact_df) == 0:
	        # Nothing to look up; avoid sending an empty `__in` filter to ChEMBL.
	        return pd.DataFrame({'chembl_ids': [], 'IC50s': [], 'SMILES': []})

	    compounds_provider = compounds.filter(molecule_chembl_id__in=bioact_df["chembl_ids"].to_list()).only(
	        "molecule_chembl_id",
	        "molecule_structures"
	    )

	    cids_list = []
	    smiles_list = []
	    for record in compounds_provider:
	        cids_list.append(record['molecule_chembl_id'])
	        structures = record['molecule_structures']
	        if not structures:
	            print('no structures')
	            smiles_list.append(None)
	        elif not structures['canonical_smiles']:
	            print("No canonical smiles")
	            smiles_list.append(None)
	        else:
	            smiles_list.append(structures['canonical_smiles'])

	    new_df = pd.DataFrame({'SMILES': smiles_list, 'chembl_ids_2': cids_list})

	    total_bioact_df = pd.merge(bioact_df, new_df, left_on='chembl_ids', right_on='chembl_ids_2')
	    print(f"number of records: {len(total_bioact_df)}")

	    total_bioact_df = total_bioact_df.drop_duplicates(subset=["chembl_ids"], keep="last")
	    print(f"number of records after removing duplicates: {len(total_bioact_df)}")

	    total_bioact_df = total_bioact_df.dropna(axis=0, how='any').drop(columns=["chembl_ids_2"])
	    print(f"number of records after dropping Null values: {len(total_bioact_df)}")

	    return total_bioact_df.sort_values(by=["IC50s"])


	def getbioactives_node_func(chembl_ids_list: list[str]) -> tuple[list, str, object]:
	    '''
	    Accepts ChEMBL IDs and gets the bioactive molecule SMILES and IC50s for each ID.
	    Results are cached in '<CHEMBL_ID>_bioactives.csv'; when that file exists it is
	    read instead of querying ChEMBL. Only the 50 records with the lowest IC50 are
	    reported for each ID. The returned image is also written to 'current_image.png'.
	    Args:
	    chembl_ids_list: the ChEMBL IDs to query
	    Returns:
	    bioactives_list: for each ChEMBL ID, a list of (SMILES, IC50) tuples (empty on failure)
	    bioactives_string: a string containing the results of the search.
	    img: the grid image of the molecules for the first ChEMBL ID, or None if there is none.
	    '''
	    print("Get bioactives tool")
	    print('===================================================')

	    separator = '=========================================================================================\n'
	    limit = 50

	    bioactives_list = []
	    bioactives_images = []
	    bioactives_string = ''

	    for chembl_id in chembl_ids_list:
	        try:
	            chembl_id = chembl_id.upper()
	            csv_path = f'{chembl_id}_bioactives.csv'
	            if os.path.exists(csv_path):
	                print(f'Found {chembl_id}_bioactives.csv')
	                total_bioact_df = pd.read_csv(csv_path)
	                print(f"number of records: {len(total_bioact_df)}")
	            else:
	                total_bioact_df = _fetch_bioactives_df(chembl_id)
	                if len(total_bioact_df) > 0:
	                    total_bioact_df.to_csv(csv_path)

	            if len(total_bioact_df) == 0:
	                raise LookupError(f'no usable IC50 records for {chembl_id}')

	            top_df = total_bioact_df.iloc[:limit]
	            bioact_tuple_list = list(zip(top_df['SMILES'], top_df['IC50s']))

	            report = f'Results for top bioactivity (IC50 value) for molecules in ChEMBL ID: {chembl_id}. \n'
	            for smile, ic50 in bioact_tuple_list:
	                report += f'Molecule SMILES: {smile}, IC50 (nM): {ic50}\n'
	            report += separator

	            mols = [Chem.MolFromSmiles(smile) for smile in top_df['SMILES'].to_list()]
	            legends = [f'IC50: {ic50}' for ic50 in top_df['IC50s'].to_list()]
	            img = MolsToGridImage(mols, molsPerRow=5, legends=legends, subImgSize=(200, 200))
	        except Exception as exc:
	            print(f"Could not get bioactives for {chembl_id}: {exc}")
	            bioactives_list.append([])
	            bioactives_string += f'No bioactives found for ChEMBL ID: {chembl_id}\n'
	            bioactives_string += separator
	            bioactives_images.append(None)
	        else:
	            # Commit the three outputs together so they stay aligned per ID.
	            bioactives_list.append(bioact_tuple_list)
	            bioactives_string += report
	            bioactives_images.append(img)

	    # Only the first ChEMBL ID's image is returned.
	    img = bioactives_images[0] if bioactives_images else None
	    if img is None:
	        return bioactives_list, bioactives_string, None

	    try:
	        img.save('current_image.png')
	    except AttributeError:
	        # The drawing call returned an object without .save() — presumably an
	        # IPython display image exposing raw PNG bytes in .data (TODO confirm);
	        # write the bytes out and reopen them as a PIL image.
	        with open('current_image.png', 'wb') as f:
	            f.write(img.data)
	        img = Image.open('current_image.png')

	    return bioactives_list, bioactives_string, img