import base64
import glob
import json
import os
import re
import string
import sys
import urllib
# urllib.parse / urllib.request are used by the scraping helpers in this file;
# import the submodules explicitly instead of relying on another package
# having loaded them as a side effect.
import urllib.parse
import urllib.request
from collections import Counter
from io import BytesIO
from operator import itemgetter

import bs4
import chemicals
import cirpy
import nltk
import numpy as np
import pandas as pd
import pubchempy as pcp
import requests
import rdkit
from PIL import ImageOps
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors, Draw, Crippen

# Atomic numbers used to classify a structure by its element composition.
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}
METAL_ATOM_SET = {
    3, 4, 11, 12, 13,
    *range(19, 31 + 1),
    *range(37, 50 + 1),
    *range(55, 84 + 1),
    *range(87, 114 + 1),
    116,
}

# One entry per line; CeramicOrMetal() compares these against a sorted,
# comma-joined list of element symbols.
with open('data/ceramics_list.txt', 'r') as fp:
    lines = fp.readlines()
CERAMICS_SET = {line.strip() for line in lines}
with open('data/salt_list.txt', 'r') as fp:
    lines = fp.readlines()
SALT_SET = {line.strip() for line in lines}

# Meaning of the integer ``error`` field returned by ResolveChemical().
ERROR_CODES = {
    0: None,
    1: 'Structure could not be determined from the identifier',
    2: 'Invalid SMILES code',
    3: 'Invalid CAS number',
    4: 'Invalid identifier type selected',
}

## not sure if this will be possible on pythonanywhere; use this flag to disable related code blocks
try_dsstox = True
if try_dsstox:
    import selenium
    import selenium.webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    ## NOTE this does not seem very robust
    # NOTE(review): os.uname() only exists on POSIX platforms.
    uname = os.uname()
    if uname.sysname == 'Linux':  # pythonanywhere
        driver_exe = '/usr/local/bin/chromedriver'
    else:  # mac ('Darwin') and the fallback for anything else
        import chromedriver_binary
        driver_exe = chromedriver_binary.chromedriver_filename

## add custom chemical definitions (i.e., to correct confusion between methane and carbon)
db = chemicals.identifiers.get_pubchem_db()
db.load('data/custom_chemicals_db.tsv')

## load experimental and predicted properties
#dfmp_expt = pd.read_excel('PHYSPROP_MP_data.xlsx')
dfmp_expt = pd.read_csv('data/PHYSPROP_MP_data.tsv', sep='\t')
#dfmp_pred = pd.read_excel('DSSTOX_MP_pred_data.xlsx')
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
df_pred = pd.read_csv('data/Comptox_pred_data.tsv', sep='\t')

## OPERA melting point model
import dill as pickle
import sklearn
import sklearn.neighbors
import sklearn.metrics
import padelpy
from functions import weight_func


class opera_data_mp():
    """Container for the pre-trained OPERA melting-point model.

    Everything is loaded once, when the class body executes at import time,
    and stored as class attributes.
    """
    n_neighbors = 5
    #weight_factor = 5e-3
    # Descriptor columns selected (in this order) as model input.
    desc_list = ['SHBd', 'nN', 'maxHBd', 'ATSC1v', 'AATS1i', 'TopoPSA',
                 'nT6Ring', 'nHBDon', 'WTPT-5', 'minHBd', 'nHBint2', 'IC0',
                 'MLFER_S', 'MLFER_BO', 'WTPT-3']
    # SECURITY: unpickling executes arbitrary code; these files must only
    # ever come from a trusted source.
    with open('mp/model-opera-knn.pkl', 'rb') as fp:
        knn_all = pickle.load(fp)
    knn_all.weights = weight_func  # fix weird problem on pythonanywhere...
    with open('mp/model-opera-scalerX.pkl', 'rb') as fp:
        scaler_X = pickle.load(fp)
    with open('mp/model-opera-scalerY.pkl', 'rb') as fp:
        scaler_y = pickle.load(fp)


my_opera_data_mp = opera_data_mp()


def _describe_structure(smiles, cas, get_properties):
    """Compute the outputs that depend only on a parsed structure.

    Returns ``(Mw, LogP, LogP_origin, im64, error)``. ``error`` is 0 on
    success and 2 (invalid SMILES) when RDKit cannot build a molecule, in
    which case every other field is None.
    """
    Mw = LogP = LogP_origin = im64 = None
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # An unparsable string is normally reported by a None return value;
        # this guard covers the case where the call raises instead, so that
        # ``mol`` is always bound below.
        mol = None
    if mol is None:
        return Mw, LogP, LogP_origin, im64, 2

    Mw = Descriptors.MolWt(mol)
    if 'logp' in get_properties:
        LogP, LogP_origin = getLogP(cas, mol)
    im = ImageFromSmiles(smiles)
    # ImageFromSmiles() returns None when drawing fails; Imageto64() cannot
    # handle that, so leave the image empty rather than crash.
    if im is not None:
        im64 = Imageto64(im)
    return Mw, LogP, LogP_origin, im64, 0


def ResolveChemical(chemName, IDtype, debug=False, get_properties=('logp', 'rho', 'mp')):
    """Resolve a chemical identifier into name, CAS, SMILES and properties.

    Parameters:
        chemName: the identifier text (surrounding whitespace is stripped).
        IDtype: 'CAS', 'SMILES' or 'common' (a chemical name).
        debug: when true, the returned tuple also carries the origin of each
            property value.
        get_properties: which optional properties to look up; any of
            'logp', 'rho' (density) and 'mp' (melting point).

    Returns:
        ``(name, cas, smiles, Mw, LogP, rho, mp, im64, error)``, or with
        ``debug`` ``(name, cas, smiles, Mw, LogP, LogP_origin, rho,
        rho_origin, mp, mp_origin, im64, error)``. ``error`` is a key of
        ERROR_CODES. ``im64`` is a base64 PNG data URI of the structure.
        When resolution succeeds but no name or CAS was found, that field
        is the string 'Not found'.
    """
    if get_properties is None:
        get_properties = ('logp', 'rho', 'mp')
    want_rho = 'rho' in get_properties
    want_mp = 'mp' in get_properties

    # remove excess whitespace
    chemName = chemName.strip()

    name = smiles = cas = None
    Mw = LogP = rho = mp = im64 = None
    mp_origin = rho_origin = LogP_origin = None
    error = 0

    if IDtype == 'CAS':
        cas = chemName
        if not is_cas(cas):
            # Fall through to the common return so that ``debug`` is honoured
            # for this failure as well.
            error = 3  # invalid cas
        else:
            smiles = cas2smiles(cas)
            if smiles:
                name = cas2name(cas)
                if want_rho:
                    rho, rho_origin = string2density(cas)
                if want_mp:
                    mp, mp_origin = mol2mp(cas, name, smiles)
                if want_rho and pd.isna(rho) and name:
                    rho, rho_origin = string2density(name)
                Mw, LogP, LogP_origin, im64, error = _describe_structure(
                    smiles, cas, get_properties)
            else:
                error = 1  # no smiles found

    elif IDtype == 'SMILES':
        smiles = chemName
        # NOTE(review): cas is still None at this point, so getLogP() skips
        # its CAS-keyed lookup and falls back to the RDKit estimate. Kept as
        # in the original; confirm whether that is intended.
        Mw, LogP, LogP_origin, im64, error = _describe_structure(
            smiles, cas, get_properties)
        # if SMILES is not valid, skip the other stuff
        if not error:
            name = smiles2name(smiles)
            if name:
                cas = name2cas(name)
                if want_rho:
                    rho, rho_origin = string2density(name)
            if want_rho and pd.isna(rho) and cas:
                rho, rho_origin = string2density(cas)
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)

    elif IDtype == 'common':
        name, _name_origin = name2iupac(chemName)
        if name:
            smiles = name2smiles(name)
            cas = name2cas(name)
        if not smiles and cas:
            smiles = cas2smiles(cas)
        if want_rho:
            # Try CAS, then the resolved name, then the raw input (sometimes
            # iupac names don't work). Each distinct identifier is tried at
            # most once because the lookups are slow.
            tried = set()
            for ident in (cas, name, chemName):
                if not ident or ident in tried or not pd.isna(rho):
                    continue
                tried.add(ident)
                rho, rho_origin = string2density(ident)
        if smiles:
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            Mw, LogP, LogP_origin, im64, error = _describe_structure(
                smiles, cas, get_properties)
        else:
            error = 1

    else:
        ## should never be here
        error = 4  # invalid IDtype selection, probably not possible

    # if we couldn't find a name or CAS (but do have SMILES)
    if not error:
        if not name:
            name = 'Not found'
        if not cas:
            cas = 'Not found'
    if mp is not None:
        mp = float(mp)

    if debug:
        return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin,
                mp, mp_origin, im64, error)
    return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)


def CeramicOrMetal(smiles, mp):
    """Classify a structure as metal and/or ceramic.

    Parameters:
        smiles: SMILES string of the structure.
        mp: melting point, or None when unknown; compared against 700.

    Returns:
        ``(is_metal, is_ceramic)``. A structure is a metal when every atom is
        in METAL_ATOM_SET. Otherwise it is a ceramic when its element
        combination is listed in CERAMICS_SET or SALT_SET, or when it has no
        carbon-carbon bond and ``mp`` exceeds 700. An unparsable or atom-less
        structure yields ``(False, False)``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, False
    atoms = list(mol.GetAtoms())
    if not atoms:
        # An empty set is a subset of everything; do not call "nothing" a metal.
        return False, False

    is_metal = {a.GetAtomicNum() for a in atoms} <= METAL_ATOM_SET
    if is_metal:
        return True, False

    # check composition against list of ceramics/salts
    elements = ','.join(sorted({a.GetSymbol() for a in atoms}))
    if elements in CERAMICS_SET or elements in SALT_SET:
        return False, True

    has_cc_bond = any(
        b.GetBeginAtom().GetAtomicNum() == 6 and b.GetEndAtom().GetAtomicNum() == 6
        for b in mol.GetBonds()
    )
    # if not a metal, no C-C bonds, and mp > 700 (sodium chloride has mp ~ 800), assume ceramic...
    is_ceramic = (not has_cc_bond) and (mp is not None) and bool(mp > 700.)
    return False, is_ceramic


# SMILES that is drawn from a stored SDF file instead of from the SMILES itself.
_MNPC_SMILES = 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]'


#Generates an image of the molecule represented by the SMILES code given.
#Returns None if the image cannot be generated. From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py
def ImageFromSmiles(smiles):
    """Return a 350x350 PIL image of ``smiles``, or None on failure.

    None is returned when ``smiles`` is not a string or when drawing raises
    ValueError.
    """
    if not isinstance(smiles, str):
        return None
    try:
        if smiles == _MNPC_SMILES:
            # Special case: take the structure from the SDF file rather than
            # parsing the SMILES (presumably for a better depiction - confirm).
            mol = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
        else:
            mol = Chem.MolFromSmiles(smiles)
        return Draw.MolToImage(mol, size=(350, 350))
    except ValueError:
        return None


#Trims the image into a box, removing any excess white background.
def wTrim(img):
    """Crop ``img`` to the bounding box of its non-white content.

    The image is inverted so that white becomes zero, which is what
    ``getbbox`` treats as background. No minimum size is enforced. An image
    with no content at all is returned unchanged.
    """
    bbox = ImageOps.invert(img).getbbox()
    if bbox is None:
        # Entirely white image: nothing to crop to.
        return img
    return img.crop(tuple(bbox[:4]))


#Converts a PIL image into its base64 representation.
def Imageto64(img): img = wTrim(img) buf = BytesIO() img.save(buf, format="PNG") pngImageB64String = "data:image/png;base64," pngImageB64String += base64.b64encode(buf.getvalue()).decode("utf-8") return pngImageB64String # function to convert SMILES to name def smiles2name(smiles): name = None # first try chemicals package try: cm = chemicals.search_chemical(smiles) if cm.iupac_name: name = cm.iupac_name elif cm.common_name: name = cm.common_name except: name = None # then try pubchem for compounds if not name: try: compounds = pcp.get_compounds(smiles, namespace='smiles') c = compounds[0] name = c.iupac_name if not name: # have seen empty iupac_name before, try synonyms if this happens name = c.synonyms[0] except: name = None # next try cirpy if not name: try: name = cirpy.resolve(smiles, 'iupac_name') except: name = None if type(name) is list: name = name[0] # finally try it as a pubchem substance if not name: try: compounds = pcp.get_substances(smiles, namespace='smiles') # sometimes there are multiple substances, and multiple synonyms per substance allsyns = [syn for c in compounds for syn in c.iupac_name if cas not in syn] # choose the most common synonym fd = nltk.FreqDist(allsyns) name = fd.most_common(1)[0][0] except: name = None return name # function to convert CAS to SMILES def cas2smiles(cas): smiles = None # first try chemicals package try: cm = chemicals.search_chemical(cas) smiles = cm.smiles except: smiles = None # then try pubchem for compounds if not smiles: try: compounds = pcp.get_compounds(cas, namespace='name') c = compounds[0] smiles = c.isomeric_smiles except: smiles = None # next try cirpy if not smiles: try: smiles = cirpy.resolve(cas, 'smiles') except: smiles = None if type(smiles) is list: smiles = smiles[0] # finally try it as a pubchem substance if not smiles: try: compounds = pcp.get_substances(cas, namespace='name') # sometimes there are multiple substances, and multiple synonyms per substance allsyns = [syn for c in compounds for 
syn in c.isomeric_smiles if cas not in syn] # choose the most common synonym fd = nltk.FreqDist(allsyns) smiles = fd.most_common(1)[0][0] except: smiles = None return smiles # function to convert cas to name def cas2name(cas): name = None #if not is_cas(cas): # name = 'INVALID CAS' # first try chemicals package try: cm = chemicals.search_chemical(cas) if cm.iupac_name: name = cm.iupac_name elif cm.common_name: name = cm.common_name except: name = None # then try cirpy if not name: try: name = cirpy.resolve(cas, 'iupac_name') except: name = None if type(name) is list: name = name[0] # next try pubchem for compounds if not name: try: compounds = pcp.get_compounds(cas, namespace='name') c = compounds[0] name = c.iupac_name if not name: # have seen empty iupac_name before, try synonyms if this happens name = c.synonyms[0] except: name = None return name # function to convert chemical name to iupac name def name2iupac(string): name = None origin = None # try chemicals package try: cm = chemicals.search_chemical(string) if cm.iupac_name: name = cm.iupac_name elif cm.common_name: name = cm.common_name origin = 'chemicals' except: name = None origin = None # try pubchem for compounds if not name: try: compounds = pcp.get_compounds(string, namespace='name') c = compounds[0] name = c.iupac_name if not name: # have seen empty iupac_name before, try synonyms if this happens name = c.synonyms[0] origin = 'PubChem' except: name = None origin = None # next try cirpy if not name: try: #name = cirpy.resolve(string, 'names') name = cirpy.resolve(string, 'iupac_name') if name: origin = 'CIRPY' except: name = None origin = None if type(name) is list: name = name[0] # now try it as a pubchem substance if not name: try: compounds = pcp.get_substances(string, namespace='name') # sometimes there are multiple substances, and multiple synonyms per substance allsyns = [syn for c in compounds for syn in c.synonyms if cas not in syn] # choose the most common synonym fd = nltk.FreqDist(allsyns) 
name = fd.most_common(1)[0][0] origin = 'PubChem/substance' except: name = None origin = None # strip all spaces and try again... if not name: string_strip = re.sub(' ','',string) # first try pubchem for compounds try: compounds = pcp.get_compounds(string_strip, namespace='name') c = compounds[0] name = c.iupac_name if not name: # have seen empty iupac_name before, try synonyms if this happens name = c.synonyms[0] origin = 'PubChem' except: name = None origin = None # next try cirpy if not name: try: #name = cirpy.resolve(string_strip, 'names') name = cirpy.resolve(string, 'iupac_name') if name: origin = 'CIRPY' except: name = None origin = None if type(name) is list: name = name[0] return name, origin # function to convert name to cas def name2cas(name): cas = None # try chemicals package try: cm = chemicals.search_chemical(name) cas = cm.CASs except: cas = None # then try cirpy if not cas: try: cas = cirpy.resolve(name, 'cas') except: cas = None if type(cas) is list: cas.sort(key=lambda s: np.array(s.split('-'), dtype=int).sum()) cas = cas[0] # next try pubchem for compounds if not cas: try: compounds = pcp.get_compounds(name, namespace='name') c = compounds[0] syns = c.synonyms possible_cas = [syn for syn in syns if is_cas(syn)] # if multiple choose option with smallest sum of digits possible_cas.sort(key=lambda s: np.array(s.split('-'), dtype=int).sum()) cas = possible_cas[0] except: cas = None return cas # function to convert name to SMILES def name2smiles(name): smiles = None # first try chemicals package try: cm = chemicals.search_chemical(name) smiles = cm.smiles except: smiles = None # then try pubchem for compounds if not smiles: try: compounds = pcp.get_compounds(name, namespace='name') c = compounds[0] smiles = c.isomeric_smiles except: smiles = None # next try cirpy if not smiles: try: smiles = cirpy.resolve(name, 'smiles') except: smiles = None if type(smiles) is list: smiles = smiles[0] # then try it as a pubchem substance if not smiles: try: 
compounds = pcp.get_substances(name, namespace='name') # sometimes there are multiple substances, and multiple synonyms per substance allsyns = [syn for c in compounds for syn in c.isomeric_smiles if name not in syn] # choose the most common synonym fd = nltk.FreqDist(allsyns) smiles = fd.most_common(1)[0][0] except: smiles = None # finally try to resolve SMILES from name using OPSIN #if not smiles: # try: # with open('opsin.tmp.1', 'w') as fp: # fp.write(name) # os.system('java -jar /Users/robert.elder/software/utils/opsin-2.4.0-jar-with-dependencies.jar -osmi opsin.tmp.1 opsin.tmp.2 &> /dev/null') # with open('opsin.tmp.2') as fp: # smiles = fp.read() # if smiles == '\n': # smiles = None # smiles = smiles.strip() #remove trailing newline # except KeyboardInterrupt: # raise # except: # smiles = None return smiles def check_cas(cas): n1,n2,n3 = cas.split('-') # combine and flip first 2 numbers tmp = ''.join([n1,n2])[::-1] # sum of number*position in string check = sum([i*int(tmp[i-1]) for i in range(1,len(tmp)+1)]) # mod 10 check = check%10 # if these match, then it's a legit cas number return check == int(n3) def is_cas(cas): try: return check_cas(cas) except: return False def string2density(name): rho, rho_origin = None, None # predicted values from TEST (CompTox dashboard) if is_cas(name): mask = df_pred['CASRN'] == name if sum(mask): rho = float(df_pred[mask]['DENSITY_G/CM^3_TEST_PRED']) rho_origin = 'comptox/pred' # try to scrape from PubChem if pd.isna(rho): content = None try: compounds = pcp.get_compounds(name, namespace='name') c = compounds[0] cid = c.cid url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON' #fid = urllib.request.urlopen(url) #webpage = fid.read().decode('utf-8') webpage = requests.get(url).text content = json.loads(webpage) except: pass if content: rho_list = [] for i in content['Record']['Section']: if i['TOCHeading'] == 'Chemical and Physical Properties': for j in i['Section']: if j['TOCHeading'] == 
'Experimental Properties': for k in j['Section']: if k['TOCHeading'] == 'Density': for ii in k['Information']: try: rho_string = ii['Value']['StringWithMarkup'][0]['String'] rho_string = rho_string.replace('Relative density (water = 1): ', '') #print(rho_string) #tmp_rho = re.match('(?:\d+(?:\.\d*)?|\.\d+)',rho_string) m = re.match(r'((?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?',rho_string) groups = m.groups() if len(groups): for g in groups: try: tmp_rho = float(g) rho_list.append(tmp_rho) except: continue except: continue if rho_list: ## remove outliers using interquartile range (IQR) rho_list = np.array(rho_list) q75,q25 = np.percentile(rho_list,[75,25]) intr_qr = q75-q25 hi = q75+(1.5*intr_qr) lo = q25-(1.5*intr_qr) mask = (rho_list <= hi) & (rho_list >= lo) rho_list = rho_list[mask] rho = np.mean(rho_list) rho_origin = 'pubchem' else: rho, rho_origin = None, None # try to scrape from DSSTOX if try_dsstox: if pd.isna(rho): dtxsid = None try: # try to find it via the dsstox dashboard name_urlsafe = urllib.parse.quote(name) url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}' fid = urllib.request.urlopen(url) webpage = fid.read().decode('utf-8') hits = re.findall(r'DTXSID[0-9]+', webpage) if len(hits): dtxsid = hits[0] except: pass if dtxsid: mysoup = None url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}' #print(url) try: options = Options() options.add_argument("--headless") # runs in background instead of showing browser window service = Service(driver_exe) driver = selenium.webdriver.Chrome(service=service, options=options) driver.set_page_load_timeout(15) driver.get(url) webpage = driver.page_source driver.quit() mysoup = bs4.BeautifulSoup(webpage, features='lxml') except KeyboardInterrupt: raise except: pass if mysoup: ifound = None # column of property names rows = mysoup.find_all('div', attrs={'col-id':'property'}) for i,row in enumerate(rows): if 'Density' in 
row.text: ifound = i break if ifound: rows = mysoup.find_all('div', attrs={'col-id':'exavg'}) text = rows[ifound].text value = re.sub(r' \([0-9]*\)', '', text.strip()) try: rho = float(value) rho_origin = 'dsstox/expt' except: rho, rho_origin = None, None if pd.isna(rho): rows = mysoup.find_all('div', attrs={'col-id':'predavg'}) text = rows[ifound].text value = re.sub(r' \([0-9]*\)', '', text.strip()) try: rho = float(value) rho_origin = 'dsstox/pred' except: rho, rho_origin = None, None else: rho, rho_origin = None, None else: rho, rho_origin = None, None if pd.isna(rho): rho,rho_origin = None, None return rho, rho_origin def return_non_duplicate_index(tuples): ##Given a list of sets return index of non_duplicate items ## from https://github.com/curieshicy/JRgui/ ##step 1, create a new tuple, named "new_tuples" new_tuples = [] ##the elements are the sets for i in tuples: for j in i: new_tuples.append(set(j)) ##step 2, create a dictionary storing one to one relationship between new_tuple and old_tuple values = [] for index, item in enumerate(tuples): if len(item) == 1: values.append(index) else: for i in [index]*len(item): values.append(i) keys = [i for i in range(len(new_tuples))] dict_tuples = {} ## {0:0, 1:1, 2:2, 3:3, 4:3, 5:3, 6:4, 7:4, 8:4, 9:5, 10:6, 11:7, 12:8} for i, j in zip(keys, values): dict_tuples[i] = j ##step 3, remove duplicates in sets terminology remove_index = [] for index_1, item in enumerate(new_tuples): ##starting from beginning for index_2 in range(index_1 + 1, len(new_tuples)): ##loop over the rest items if len(item & new_tuples[index_2]) != 0: if len(item)>len(new_tuples[index_2]): remove_index.append(index_2) #indefoirx elif len(item)= lo) mp_list = mp_list[mask] mp = np.mean(mp_list) mp_origin = 'pubchem' else: mp, mp_origin = None, None # try to scrape from DSSTOX website... 
if try_dsstox: if pd.isna(mp): dtxsid = None # try to find it via the dsstox dashboard try: name_urlsafe = urllib.parse.quote(name) url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}' fid = urllib.request.urlopen(url) webpage = fid.read().decode('utf-8') hits = re.findall(r'DTXSID[0-9]+', webpage) if len(hits): dtxsid = hits[0] except: pass if dtxsid: mysoup = None url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}' #print(url) try: #driver = selenium.webdriver.Firefox() #driver.set_page_load_timeout(15) #driver.get(url) #driver_exe = 'chromedriver' #driver_exe = chromedriver_binary.chromedriver_filename options = Options() options.add_argument("--headless") # runs in background instead of showing browser window service = Service(driver_exe) driver = selenium.webdriver.Chrome(service=service, options=options) #driver = selenium.webdriver.Chrome(driver_exe, options=options) driver.set_page_load_timeout(15) driver.get(url) webpage = driver.page_source driver.quit() mysoup = bs4.BeautifulSoup(webpage, features='lxml') except: pass #print("timeout") # column of property names if mysoup: ifound = None rows = mysoup.find_all('div', attrs={'col-id':'property'}) for i,row in enumerate(rows): if 'Melting Point' in row.text: ifound = i break if ifound: rows = mysoup.find_all('div', attrs={'col-id':'exavg'}) text = rows[ifound].text value = re.sub(r' \([0-9]*\)', '', text.strip()) try: mp = float(value) mp_origin = 'dsstox/expt' except: mp, mp_origin = None, None if pd.isna(mp): rows = mysoup.find_all('div', attrs={'col-id':'predavg'}) text = rows[ifound].text value = re.sub(r' \([0-9]*\)', '', text.strip()) try: mp = float(value) mp_origin = 'dsstox/pred' except: mp, mp_origin = None, None else: mp, mp_origin = None, None else: mp, mp_origin = None, None if pd.isna(mp): mp, mp_origin = None, None return mp, mp_origin def smiles2mp(smiles): try: SUPPORTED_ATOM_SET = {6, 7, 8, 9, 16, 17, 35, 53} m = 
Chem.MolFromSmiles(str(smiles)) atom_num_set = set([a.GetAtomicNum() for a in m.GetAtoms()]) if atom_num_set.issubset(SUPPORTED_ATOM_SET): mp = compute_phys_properties(smiles) else: mp = None except: mp = None return mp def smiles2mp_opera(smiles): descs = padelpy.from_smiles(smiles, descriptortypes='mp/descriptors.xml') #dfd = pd.DataFrame(descs,index=[0]) #dfd = dfd.replace('',0).infer_objects(copy=False) #dfd = pd.DataFrame(dfd, dtype=float) dfd = pd.DataFrame(descs, index=[0]).apply(pd.to_numeric, errors="coerce").fillna(0.0).astype(float) X = np.array(dfd[my_opera_data_mp.desc_list]) X_scale = my_opera_data_mp.scaler_X.transform(X) y_pred = my_opera_data_mp.scaler_y.inverse_transform(my_opera_data_mp.knn_all.predict(X_scale)) return y_pred[0][0] def mol2mp(cas, name, smiles): mp, mp_origin = None, None if pd.isna(mp) and smiles: try: mp = smiles2mp_opera(smiles) mp_origin = 'opera/calc' except: mp, mp_origin = None, None if pd.isna(mp) and cas: try: mp, mp_origin = string2mp(cas) except: mp, mp_origin = None, None if pd.isna(mp) and name: try: mp, mp_origin = string2mp(name) except: mp, mp_origin = None, None if pd.isna(mp) and smiles: try: mp = smiles2mp(smiles) mp_origin = 'joback-reid/calc' except: mp, mp_origin = None, None return mp, mp_origin def getLogP(cas,mol): LogP, LogP_origin = None, None if cas: mask = df_pred['CASRN'] == cas if sum(mask): LogP = float(df_pred[mask]['OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED']) LogP_origin = 'comptox/pred' if pd.isna(LogP) and mol: LogP = Crippen.MolLogP(mol) LogP_origin = 'rdkit/calc' return LogP, LogP_origin