| import sys,os,glob,re,string |
| from collections import Counter |
| from operator import itemgetter |
| import nltk |
| import pandas as pd |
| import numpy as np |
| import pubchempy as pcp |
| import cirpy |
| import chemicals |
|
|
| import bs4 |
| import urllib |
| import requests |
| import json |
|
|
# Presumably atomic numbers of the elements regarded as "organic"
# (B, C, N, O, F, P, S, Cl, Br, I) -- not referenced elsewhere in this file.
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}

# Atomic numbers that CeramicOrMetal() treats as metallic.
METAL_ATOM_SET = {
    3, 4, 11, 12, 13,
    *range(19, 31 + 1),
    *range(37, 50 + 1),
    *range(55, 84 + 1),
    *range(87, 114 + 1),
    116,
}

# One entry per line; CeramicOrMetal() compares these against the
# comma-joined, sorted element symbols of a molecule.
with open('data/ceramics_list.txt', 'r') as fp:
    lines = fp.readlines()
CERAMICS_SET = set(map(str.strip, lines))

with open('data/salt_list.txt', 'r') as fp:
    lines = fp.readlines()
SALT_SET = set(map(str.strip, lines))


# Meaning of the integer ``error`` value returned by ResolveChemical().
ERROR_CODES = {
    0: None,
    1: 'Structure could not be determined from the identifier',
    2: 'Invalid SMILES code',
    3: 'Invalid CAS number',
    4: 'Invalid identifier type selected',
}
|
|
| |
# When True, the density / melting-point lookups fall back to scraping the
# EPA CompTox dashboard with a headless Chrome driven by selenium.
try_dsstox = True
if try_dsstox:
    import platform

    import selenium
    import selenium.webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    # os.uname() is only available on Unix; calling it unconditionally made
    # this module fail to import on Windows, so the non-Linux branch below
    # could never be reached there.  Keep ``uname`` defined for backward
    # compatibility and fall back to platform.system() for the OS name.
    uname = os.uname() if hasattr(os, 'uname') else None
    _sysname = uname.sysname if uname is not None else platform.system()

    if _sysname == 'Linux':
        # Linux hosts are expected to have chromedriver installed here.
        driver_exe = '/usr/local/bin/chromedriver'
    else:
        # macOS and every other platform: use the driver bundled with the
        # chromedriver_binary package.
        import chromedriver_binary
        driver_exe = chromedriver_binary.chromedriver_filename
|
|
| from io import BytesIO |
| from PIL import ImageOps |
| import base64 |
|
|
| import rdkit |
| from rdkit.Chem import AllChem as Chem |
| from rdkit.Chem import Descriptors,Draw,Crippen |
|
|
| |
# Identifier database of the ``chemicals`` package, with the project's custom
# TSV loaded into it (used by the chemicals.search_chemical() lookups below).
db = chemicals.identifiers.get_pubchem_db()
db.load('data/custom_chemicals_db.tsv')

# Melting-point table; string2mp() reads its 'CAS' and 'MP' columns.
dfmp_expt = pd.read_csv('data/PHYSPROP_MP_data.tsv', delimiter='\t')

# CompTox prediction table keyed by 'CASRN'; supplies the density,
# melting-point and logP prediction columns read elsewhere in this module.
df_pred = pd.read_csv('data/Comptox_pred_data.tsv', delimiter='\t')
|
|
| |
| import dill as pickle |
| import sklearn |
| import sklearn.neighbors |
| import sklearn.metrics |
| import padelpy |
| from functions import weight_func |
class opera_data_mp():
    """Holder for the pickled kNN melting-point model and its scalers.

    Everything is loaded once, at class-definition time, and exposed as class
    attributes; smiles2mp_opera() reads them through ``my_opera_data_mp``.
    """

    n_neighbors = 5

    # Descriptor columns selected (in this order) from the PaDEL output
    # before scaling and prediction.
    desc_list = [
        'SHBd', 'nN', 'maxHBd', 'ATSC1v', 'AATS1i',
        'TopoPSA', 'nT6Ring', 'nHBDon', 'WTPT-5', 'minHBd',
        'nHBint2', 'IC0', 'MLFER_S', 'MLFER_BO', 'WTPT-3',
    ]

    # NOTE: unpickling executes arbitrary code; these files must be trusted
    # project artifacts, never user-supplied data.
    with open('mp/model-opera-knn.pkl', 'rb') as fp:
        knn_all = pickle.load(fp)
    # The neighbour-weighting callable is re-attached after loading.
    knn_all.weights = weight_func

    with open('mp/model-opera-scalerX.pkl', 'rb') as fp:
        scaler_X = pickle.load(fp)

    with open('mp/model-opera-scalerY.pkl', 'rb') as fp:
        scaler_y = pickle.load(fp)


# Module-level singleton used by smiles2mp_opera().
my_opera_data_mp = opera_data_mp()
|
|
def _structure_properties(smiles, cas, get_properties, logp_func):
    """Derive structure-based values from a SMILES string.

    Returns ``(Mw, LogP, LogP_origin, im64, error)`` where ``error`` is 0 on
    success and 2 when the SMILES cannot be turned into a molecule.
    """
    Mw = LogP = LogP_origin = im64 = None
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Previously ``mol`` stayed unbound here and the following test
        # raised a NameError instead of reporting error code 2.
        mol = None
    if not mol:
        return Mw, LogP, LogP_origin, im64, 2

    Mw = Descriptors.MolWt(mol)
    if 'logp' in get_properties:
        LogP, LogP_origin = logp_func(cas, mol)
    im = ImageFromSmiles(smiles)
    if im is not None:
        # ImageFromSmiles() returns None when drawing fails; encoding None
        # would crash inside wTrim().
        im64 = Imageto64(im)
    return Mw, LogP, LogP_origin, im64, 0


def ResolveChemical(chemName, IDtype, debug=False, get_properties=('logp', 'rho', 'mp')):
    """Resolve a chemical identifier into names, structure and properties.

    Parameters
    ----------
    chemName : str
        The identifier text; surrounding whitespace is stripped.
    IDtype : str
        ``'CAS'``, ``'SMILES'`` or ``'common'``.  Any other value yields
        error code 4.
    debug : bool
        When true, the origin of each property is included in the result.
    get_properties : iterable of str
        Which optional properties to look up: ``'logp'``, ``'rho'``, ``'mp'``.

    Returns
    -------
    tuple
        ``(name, cas, smiles, Mw, LogP, rho, mp, im64, error)`` or, with
        ``debug``, ``(name, cas, smiles, Mw, LogP, LogP_origin, rho,
        rho_origin, mp, mp_origin, im64, error)``.  ``error`` is a key of
        ``ERROR_CODES``.  On success, a missing name or CAS is reported as
        the string ``'Not found'``.
    """
    LogP_func = getLogP

    chemName = chemName.strip()

    name = smiles = cas = None
    Mw = LogP = rho = mp = im64 = None
    mp_origin = rho_origin = LogP_origin = None
    error = 0

    want_rho = 'rho' in get_properties
    want_mp = 'mp' in get_properties

    if IDtype == 'CAS':
        cas = chemName
        if not is_cas(cas):
            # Falls through to the common return below so that ``debug``
            # callers also receive the 12-element tuple.
            error = 3
        else:
            smiles = cas2smiles(cas)
            if smiles:
                name = cas2name(cas)
                if want_rho:
                    rho, rho_origin = string2density(cas)
                if want_mp:
                    mp, mp_origin = mol2mp(cas, name, smiles)
                if want_rho and pd.isna(rho) and name:
                    rho, rho_origin = string2density(name)
                Mw, LogP, LogP_origin, im64, error = _structure_properties(
                    smiles, cas, get_properties, LogP_func)
            else:
                error = 1
    elif IDtype == 'SMILES':
        smiles = chemName
        # NOTE(review): ``cas`` is still None at this point, so LogP for
        # SMILES input never uses the CAS-keyed prediction table.
        Mw, LogP, LogP_origin, im64, error = _structure_properties(
            smiles, cas, get_properties, LogP_func)

        if not error:
            name = smiles2name(smiles)
            if name:
                cas = name2cas(name)
                if want_rho:
                    rho, rho_origin = string2density(name)
            if want_rho and pd.isna(rho) and cas:
                rho, rho_origin = string2density(cas)
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
    elif IDtype == 'common':
        name = chemName
        name, _name_origin = name2iupac(name)
        smiles = name2smiles(name)
        cas = name2cas(name)
        if not smiles:
            smiles = cas2smiles(cas)

        # Density: try the CAS, then the resolved name, then the raw input.
        if want_rho and pd.isna(rho) and cas:
            rho, rho_origin = string2density(cas)
        if want_rho and pd.isna(rho):
            rho, rho_origin = string2density(name)
        if want_rho and pd.isna(rho):
            rho, rho_origin = string2density(chemName)

        if smiles:
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            Mw, LogP, LogP_origin, im64, error = _structure_properties(
                smiles, cas, get_properties, LogP_func)
        else:
            error = 1
    else:
        # Unknown identifier type: nothing has been resolved.
        error = 4

    if not error:
        if not name:
            name = 'Not found'
        if not cas:
            cas = 'Not found'

    if mp is not None:
        mp = float(mp)

    if debug:
        return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp, mp_origin, im64, error)
    return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)
|
|
def CeramicOrMetal(smiles, mp):
    """Classify a structure as metal and/or ceramic.

    Parameters
    ----------
    smiles : str
        SMILES string of the material.
    mp : float or None
        Melting point used by the fallback heuristic (compared against 700;
        presumably degrees Celsius, as elsewhere in this module).

    Returns
    -------
    (is_metal, is_ceramic) : tuple of bool
        ``(False, False)`` when the SMILES cannot be parsed or contains no
        atoms.
    """
    is_metal = False
    is_ceramic = False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: previously an AttributeError on None.
        return is_metal, is_ceramic

    atoms = list(mol.GetAtoms())
    atomic_numbers = {a.GetAtomicNum() for a in atoms}
    if not atomic_numbers:
        # The empty set is a subset of everything, so an atom-less molecule
        # used to be reported as a metal.
        return is_metal, is_ceramic

    # A metal consists exclusively of metallic elements.
    is_metal = atomic_numbers <= METAL_ATOM_SET
    if not is_metal:
        # Element signature, e.g. 'Al,O', matched against the known lists.
        elements = ','.join(sorted({a.GetSymbol() for a in atoms}))
        is_ceramic = elements in CERAMICS_SET or elements in SALT_SET
        if not is_ceramic:
            # Heuristic: no carbon-carbon bond and a high melting point.
            has_cc_bond = any(
                b.GetBeginAtom().GetAtomicNum() == 6 and b.GetEndAtom().GetAtomicNum() == 6
                for b in mol.GetBonds()
            )
            if not has_cc_bond and (mp is not None) and mp > 700.:
                is_ceramic = True
    return is_metal, is_ceramic
|
|
| |
| |
|
|
def ImageFromSmiles(smiles):
    """Draw a 350x350 image of the molecule described by ``smiles``.

    Returns None when ``smiles`` is not a plain ``str`` or when drawing
    raises ``ValueError``.  One specific manganese phthalocyanine SMILES is
    drawn from a stored SDF file instead of being parsed.
    """
    if type(smiles) is not str:
        return None

    mnpc_smiles = 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]'
    try:
        if smiles == mnpc_smiles:
            molecule = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
        else:
            molecule = Chem.MolFromSmiles(smiles)
        return Draw.MolToImage(molecule, size=(350, 350))
    except ValueError:
        return None
|
|
| |
| |
| |
|
|
def wTrim(img):
    """Crop the white border off ``img``.

    The image is inverted so that white becomes black, and the bounding box
    of the remaining non-black pixels is used as the crop rectangle.  A
    completely white image has no such box and is returned unchanged
    (previously this raised a TypeError).
    """
    bbox = ImageOps.invert(img).getbbox()
    if bbox is None:
        return img
    return img.crop(bbox)
|
|
| |
def Imageto64(img):
    """Return ``img`` (white border trimmed) as a base64 PNG data URI."""
    trimmed = wTrim(img)
    with BytesIO() as stream:
        trimmed.save(stream, format="PNG")
        encoded = base64.b64encode(stream.getvalue())
    return "data:image/png;base64," + encoded.decode("utf-8")
|
|
| |
def smiles2name(smiles):
    """Resolve a SMILES string to a chemical name, or None.

    Sources are tried in order until one yields a name: the local
    ``chemicals`` database, PubChem compounds, the CIR resolver (cirpy) and
    finally PubChem substances.  Every lookup is best-effort: any failure
    simply moves on to the next source.
    """
    name = None

    # 1. local chemicals database (IUPAC name preferred over common name)
    try:
        cm = chemicals.search_chemical(smiles)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None

    # 2. PubChem compound record
    if not name:
        try:
            c = pcp.get_compounds(smiles, namespace='smiles')[0]
            name = c.iupac_name or c.synonyms[0]
        except Exception:
            name = None

    # 3. CIR resolver; may return a list of candidates
    if not name:
        try:
            name = cirpy.resolve(smiles, 'iupac_name')
            if isinstance(name, list):
                name = name[0]
        except Exception:
            name = None

    # 4. PubChem substances: most frequent synonym across all hits.
    #    The previous code referenced an undefined ``cas`` variable and a
    #    non-existent ``iupac_name`` attribute, so it could never succeed.
    #    NOTE(review): PubChem may not accept 'smiles' as a substance
    #    namespace, in which case this step still yields None -- confirm.
    if not name:
        try:
            substances = pcp.get_substances(smiles, namespace='smiles')
            synonyms = [syn for s in substances for syn in s.synonyms]
            name = Counter(synonyms).most_common(1)[0][0]
        except Exception:
            name = None
    return name
|
|
| |
def cas2smiles(cas):
    """Resolve a CAS number to a SMILES string, or None.

    Sources are tried in order until one yields a result: the local
    ``chemicals`` database, PubChem compounds, the CIR resolver (cirpy) and
    finally PubChem substances.  Every lookup is best-effort: any failure
    simply moves on to the next source.
    """
    smiles = None

    # 1. local chemicals database
    try:
        smiles = chemicals.search_chemical(cas).smiles
    except Exception:
        smiles = None

    # 2. PubChem compound record
    if not smiles:
        try:
            smiles = pcp.get_compounds(cas, namespace='name')[0].isomeric_smiles
        except Exception:
            smiles = None

    # 3. CIR resolver; may return a list of candidates
    if not smiles:
        try:
            smiles = cirpy.resolve(cas, 'smiles')
            if isinstance(smiles, list):
                smiles = smiles[0]
        except Exception:
            smiles = None

    # 4. PubChem substances.  Substance records carry no SMILES of their own
    #    (the old code read a non-existent ``isomeric_smiles`` attribute and
    #    therefore always failed), so pick the standardized compound that
    #    most substances agree on and use its SMILES.
    if not smiles:
        try:
            substances = pcp.get_substances(cas, namespace='name')
            cids = [s.standardized_cid for s in substances]
            cids = [cid for cid in cids if cid is not None]
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    return smiles
|
|
| |
def cas2name(cas):
    """Resolve a CAS number to a chemical name, or None.

    Tries, in order: the local ``chemicals`` database (IUPAC name, else
    common name), the CIR resolver, then the first PubChem compound hit
    (IUPAC name, else first synonym).  The first source that yields a
    non-empty name wins.
    """
    # --- chemicals package -------------------------------------------------
    try:
        record = chemicals.search_chemical(cas)
        found = record.iupac_name or record.common_name or None
    except:
        found = None
    if found:
        return found

    # --- CIR resolver (may hand back several candidates) ---------------------
    try:
        found = cirpy.resolve(cas, 'iupac_name')
    except:
        found = None
    if type(found) is list:
        found = found[0]
    if found:
        return found

    # --- PubChem compounds -------------------------------------------------
    try:
        first_hit = pcp.get_compounds(cas, namespace='name')[0]
        found = first_hit.iupac_name
        if not found:
            found = first_hit.synonyms[0]
    except:
        found = None
    return found
|
|
| |
def name2iupac(string):
    """Resolve a free-text chemical name to a canonical name.

    Returns ``(name, origin)`` where ``origin`` is one of ``'chemicals'``,
    ``'PubChem'``, ``'CIRPY'`` or ``'PubChem/substance'``; ``(None, None)``
    when nothing is found.  Sources are tried in order, and as a last resort
    the PubChem and CIR lookups are repeated with all spaces removed from the
    input.  Every lookup is best-effort.
    """

    def _pubchem_name(query):
        # First compound hit: IUPAC name, else its first synonym.
        c = pcp.get_compounds(query, namespace='name')[0]
        return c.iupac_name or c.synonyms[0]

    def _cirpy_name(query):
        result = cirpy.resolve(query, 'iupac_name')
        if isinstance(result, list):
            result = result[0]
        return result

    name, origin = None, None

    # 1. local chemicals database
    try:
        cm = chemicals.search_chemical(string)
        name = cm.iupac_name or cm.common_name or None
        if name:
            origin = 'chemicals'
    except Exception:
        name, origin = None, None

    # 2. PubChem compounds
    if not name:
        try:
            name = _pubchem_name(string)
            origin = 'PubChem'
        except Exception:
            name, origin = None, None

    # 3. CIR resolver
    if not name:
        try:
            name = _cirpy_name(string)
            if name:
                origin = 'CIRPY'
        except Exception:
            name, origin = None, None

    # 4. PubChem substances: most frequent synonym that is not itself a CAS
    #    number.  (The previous filter used an undefined ``cas`` variable,
    #    so this step always failed.)
    if not name:
        try:
            substances = pcp.get_substances(string, namespace='name')
            synonyms = [syn for s in substances for syn in s.synonyms if not is_cas(syn)]
            name = Counter(synonyms).most_common(1)[0][0]
            origin = 'PubChem/substance'
        except Exception:
            name, origin = None, None

    # 5. Retry PubChem and CIR with spaces removed from the query.
    if not name:
        string_strip = re.sub(' ', '', string)
        try:
            name = _pubchem_name(string_strip)
            origin = 'PubChem'
        except Exception:
            name, origin = None, None

        if not name:
            try:
                # Previously this retried the unstripped string, repeating
                # step 3 verbatim.
                name = _cirpy_name(string_strip)
                if name:
                    origin = 'CIRPY'
            except Exception:
                name, origin = None, None

    if not name:
        # Never report an origin without a name.
        return None, None
    return name, origin
|
|
| |
def name2cas(name):
    """Resolve a chemical name to a CAS number, or None.

    Tries the local ``chemicals`` database, then the CIR resolver, then the
    synonyms of the first PubChem compound hit.  When a source offers several
    CAS numbers, the one whose three hyphen-separated parts have the smallest
    integer sum is chosen (first one on ties).  Every lookup is best-effort.
    """

    def _part_sum(cas_number):
        return sum(int(part) for part in cas_number.split('-'))

    cas = None

    # 1. local chemicals database
    try:
        cas = chemicals.search_chemical(name).CASs
    except Exception:
        cas = None

    # 2. CIR resolver; a list means several candidates.  The selection is
    #    now inside the try so an empty or malformed list no longer crashes.
    if not cas:
        try:
            cas = cirpy.resolve(name, 'cas')
            if isinstance(cas, list):
                cas = min(cas, key=_part_sum)
        except Exception:
            cas = None

    # 3. PubChem: CAS-shaped synonyms of the first compound hit
    if not cas:
        try:
            synonyms = pcp.get_compounds(name, namespace='name')[0].synonyms
            cas = min((syn for syn in synonyms if is_cas(syn)), key=_part_sum)
        except Exception:
            cas = None
    return cas
|
|
| |
def name2smiles(name):
    """Resolve a chemical name to a SMILES string, or None.

    Sources are tried in order until one yields a result: the local
    ``chemicals`` database, PubChem compounds, the CIR resolver (cirpy) and
    finally PubChem substances.  Every lookup is best-effort: any failure
    simply moves on to the next source.
    """
    smiles = None

    # 1. local chemicals database
    try:
        smiles = chemicals.search_chemical(name).smiles
    except Exception:
        smiles = None

    # 2. PubChem compound record
    if not smiles:
        try:
            smiles = pcp.get_compounds(name, namespace='name')[0].isomeric_smiles
        except Exception:
            smiles = None

    # 3. CIR resolver; may return a list of candidates
    if not smiles:
        try:
            smiles = cirpy.resolve(name, 'smiles')
            if isinstance(smiles, list):
                smiles = smiles[0]
        except Exception:
            smiles = None

    # 4. PubChem substances.  Substance records carry no SMILES of their own
    #    (the old code read a non-existent ``isomeric_smiles`` attribute and
    #    therefore always failed), so pick the standardized compound that
    #    most substances agree on and use its SMILES.
    if not smiles:
        try:
            substances = pcp.get_substances(name, namespace='name')
            cids = [s.standardized_cid for s in substances]
            cids = [cid for cid in cids if cid is not None]
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    return smiles
|
|
def check_cas(cas):
    """Return True if ``cas`` is a well-formed CAS number with a valid check digit.

    A CAS number has the form ``NNNNNNN-NN-N``: two to seven digits, two
    digits, and a single check digit.  The check digit equals the sum of the
    other digits, each multiplied by its position counted from the right
    (starting at 1), modulo 10.

    Strings that do not have this shape return False.  Previously anything
    ``int()`` could parse was accepted, e.g. ``'0-0-0'`` or ``'7732-18-+5'``.
    """
    parts = re.fullmatch(r'([0-9]{2,7})-([0-9]{2})-([0-9])', cas)
    if parts is None:
        return False
    # Digits of the first two groups, right-most first.
    digits = (parts.group(1) + parts.group(2))[::-1]
    checksum = sum(pos * int(d) for pos, d in enumerate(digits, start=1)) % 10
    return checksum == int(parts.group(3))
|
|
def is_cas(cas):
    """Return True when ``cas`` passes check_cas(); any error counts as False."""
    try:
        verdict = check_cas(cas)
    except:
        verdict = False
    return verdict
|
|
def _rho_pubchem(name):
    """Density from the PubChem 'Experimental Properties' section.

    Returns ``(value, 'pubchem')`` or ``(None, None)``.  The leading number
    (or both ends of a leading ``a-b`` range) of every density entry is
    collected, outliers beyond 1.5 interquartile ranges are discarded, and
    the mean of the remainder is returned.  Units in the text are not parsed.
    """
    try:
        cid = pcp.get_compounds(name, namespace='name')[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        # A timeout keeps a stalled connection from hanging the caller.
        content = json.loads(requests.get(url, timeout=15).text)
        sections = content['Record']['Section']
    except Exception:
        return None, None

    values = []
    for top in sections:
        if top.get('TOCHeading') != 'Chemical and Physical Properties':
            continue
        for mid in top.get('Section', []):
            if mid.get('TOCHeading') != 'Experimental Properties':
                continue
            for leaf in mid.get('Section', []):
                if leaf.get('TOCHeading') != 'Density':
                    continue
                for info in leaf.get('Information', []):
                    try:
                        text = info['Value']['StringWithMarkup'][0]['String']
                    except (KeyError, IndexError, TypeError):
                        continue
                    if not isinstance(text, str):
                        continue
                    text = text.replace('Relative density (water = 1): ', '')
                    m = re.match(r'((?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?', text)
                    if m is None:
                        continue
                    values.extend(float(g) for g in m.groups() if g is not None)
    if not values:
        return None, None

    # Interquartile-range outlier rejection, then average.
    values = np.array(values)
    q75, q25 = np.percentile(values, [75, 25])
    intr_qr = q75 - q25
    keep = (values <= q75 + 1.5 * intr_qr) & (values >= q25 - 1.5 * intr_qr)
    return np.mean(values[keep]), 'pubchem'


def _rho_dsstox(name):
    """Density scraped from the EPA CompTox dashboard.

    Returns ``(value, 'dsstox/expt')``, ``(value, 'dsstox/pred')`` or
    ``(None, None)``.  Requires the selenium setup guarded by ``try_dsstox``.
    """
    import urllib.parse
    import urllib.request

    # Step 1: find the DTXSID of the first search hit.
    dtxsid = None
    try:
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url) as fid:
            webpage = fid.read().decode('utf-8')
        hits = re.findall(r'DTXSID[0-9]+', webpage)
        if hits:
            dtxsid = hits[0]
    except Exception:
        pass
    if not dtxsid:
        return None, None

    # Step 2: render the properties page in headless Chrome.
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    mysoup = None
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        service = Service(driver_exe)
        driver = selenium.webdriver.Chrome(service=service, options=options)
        driver.set_page_load_timeout(15)
        driver.get(url)
        mysoup = bs4.BeautifulSoup(driver.page_source, features='lxml')
    except Exception:
        pass
    finally:
        # Always shut the browser down; it used to be left running whenever
        # the page load failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
    if mysoup is None:
        return None, None

    # Step 3: locate the 'Density' row.  Row 0 is a valid match (the old
    # truthiness test ``if ifound:`` skipped it).
    ifound = None
    for i, row in enumerate(mysoup.find_all('div', attrs={'col-id': 'property'})):
        if 'Density' in row.text:
            ifound = i
            break
    if ifound is None:
        return None, None

    # Step 4: prefer the experimental average, else the predicted average.
    for column, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
        cells = mysoup.find_all('div', attrs={'col-id': column})
        try:
            value = float(re.sub(r' \([0-9]*\)', '', cells[ifound].text.strip()))
        except (IndexError, ValueError):
            continue
        if not pd.isna(value):
            return value, origin
    return None, None


def string2density(name):
    """Look up the density of a chemical given its CAS number or name.

    Sources, in order: the local CompTox prediction table (CAS input only;
    column ``DENSITY_G/CM^3_TEST_PRED``), PubChem experimental data, and --
    when ``try_dsstox`` is enabled -- the CompTox dashboard.

    Returns ``(rho, rho_origin)``; both are None when nothing is found.
    """
    rho, rho_origin = None, None

    # 1. local prediction table, keyed by CAS number
    if is_cas(name):
        hits = df_pred.loc[df_pred['CASRN'] == name, 'DENSITY_G/CM^3_TEST_PRED']
        if len(hits):
            # First row wins; float() on a multi-row selection used to raise.
            rho = float(hits.iloc[0])
            rho_origin = 'comptox/pred'

    # 2. PubChem experimental properties
    if pd.isna(rho):
        rho, rho_origin = _rho_pubchem(name)

    # 3. CompTox dashboard (slow: launches a browser)
    if try_dsstox and pd.isna(rho):
        rho, rho_origin = _rho_dsstox(name)

    if pd.isna(rho):
        rho, rho_origin = None, None
    return rho, rho_origin
|
|
def return_non_duplicate_index(tuples):
    """Count, per group, the matches that survive overlap elimination.

    ``tuples`` is a sequence of groups, each group being a sequence of
    matches (iterables of hashable items, e.g. atom indices).  Every pair of
    matches sharing at least one item is compared, earlier against later:
    the smaller one is discarded, and on equal size the later one is.  A
    discarded match still takes part in later comparisons.

    Returns a list of ``[group_index, surviving_count]`` pairs sorted by
    group index; groups with no surviving match are omitted.
    """
    # Flatten to (match-as-set, owning group index).
    flat = [
        (set(match), group_no)
        for group_no, group in enumerate(tuples)
        for match in group
    ]

    discarded = set()
    for first, (first_set, _) in enumerate(flat):
        for second in range(first + 1, len(flat)):
            second_set = flat[second][0]
            if first_set & second_set:
                loser = first if len(first_set) < len(second_set) else second
                discarded.add(loser)

    survivors = Counter(
        group_no
        for position, (_, group_no) in enumerate(flat)
        if position not in discarded
    )
    return [[group_no, survivors[group_no]] for group_no in sorted(survivors)]
|
|
def search_func_groups(smiles):
    """Count functional-group occurrences in a molecule.

    Each SMARTS pattern below is matched against the molecule; overlapping
    matches are resolved by return_non_duplicate_index().

    Returns a list of ``[pattern_index, count]`` pairs, where
    ``pattern_index`` is the position of the pattern in the SMARTS list
    (compute_phys_properties() uses it as a row index into its table).
    """
    smarts = ["[$([CX2H0](=*)=*)]", "[$([CX2H1]#[!#7])]", "[$([CX2H0]#[!#7])]", "[OX2H]-[C]=O", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]",
              "[$([#6X3H0](=[OX1]));!$([#6X3](=[#8X1])~[#8X2]);R]=O", "[CH;D2;$(C-!@C)](=O)", "[OX2H;!$([OX2H]-[#6]=[O]);!$([OX2H]-a)]", "[O;H1;$(O-!@c)]",
              "[#8X2H0;R;!$([#8X2H0]~[#6]=[#8])]", "[$([CX3H0](=[OX1]));!$([CX3](=[OX1])-[OX2]);!R]=O", "[OX2H0;!R;!$([OX2H0]-[#6]=[#8])]",
              "[$([#7X3,#7X3+][!#8])](=[O])~[O-]", "[OX1H0;!$([OX1H0]~[#6X3]);!$([OX1H0]~[#7X3]~[#8])]", "[#7X2H0;R]", "[#7X3H1;R]", "[#7X2H1]",
              "[#7X2H0;!R]","[#6X2]#[#7X1H0]","[NX3H2]", "[NX3H1;!R]", "[#7X3H0;!$([#7](~O)~O)]","[SX2H]","[#16X2H0;!R]","[#16X2H0;R]", "[R;CX3H1,cX3H1]",
              "[$([R;#6X3H0]);!$([R;#6X3H0]=[#8])]","[R;CX4H2]","[R;CX4H]","[R;CX4H0]", "[CX3H2]", "[!R;CX3H1;!$([CX3H1](=O))]",
              "[$([!R;#6X3H0]);!$([!R;#6X3H0]=[#8])]","[CX4H3]","[!R;CX4H2]", "[!R;CX4H]","[!R;CX4H0]","[F]","[Cl]","[Br]", "[I]"]

    molecule = Chem.MolFromSmiles(str(smiles))

    # Parallel lists: the match sets found, and which pattern produced each.
    match_sets = []
    pattern_of = []
    for position, pattern in enumerate(smarts):
        query = Chem.MolFromSmarts(pattern)
        if molecule.HasSubstructMatch(query):
            match_sets.append(molecule.GetSubstructMatches(query))
            pattern_of.append(position)

    # Map the positions reported by the de-duplication step back to
    # positions in ``smarts``.
    return [
        [pattern_of[slot], count]
        for slot, count in return_non_duplicate_index(match_sets)
    ]
|
|
def compute_phys_properties(smiles):
    """Estimate the melting point of a molecule by group contributions.

    The functional groups found by search_func_groups() index the rows of the
    contribution table below; each property is a constant plus the sum of
    ``count * contribution`` over the groups present.  (mol2mp() labels this
    estimate 'joback-reid/calc'.)

    Only the melting point is returned, converted from kelvin to degrees
    Celsius.  The other estimates are still evaluated, to keep the full set
    of formulas in one place, but are not exposed.

    Returns None when a group present in the molecule has no melting-point
    contribution in the table (previously this surfaced as a TypeError).
    """
    # ``math`` is not imported at module level; without this import every
    # formula using math.exp silently evaluated to None.
    import math

    # One row per SMARTS pattern of search_func_groups(); columns are, in
    # order: Tc, Pc, Vc, Tb, Tm, Hfor, Gf, Cpa, Cpb, Cpc, Cpd, Hfus, Hvap,
    # Ya, Yb.  None marks a missing contribution.
    DB = [[0.0026, 0.0028, 36, 26.15, 17.78, 142.14, 136.70, 2.74E+1, -5.57E-2, 1.01E-4, -5.02E-8, 4.720, 2.661, None, None],
          [0.0027, -0.0008,46, 9.20, -11.18, 79.30, 77.71, 2.45E+1, -2.71E-2, 1.11E-4, -6.78E-8, 2.322, 1.155, None, None],
          [0.0020, 0.0016, 37, 27.38, 64.32, 115.51, 109.82, 7.87, 2.01E-2, -8.33E-6, 1.39E-9, 4.151, 3.302, None, None],
          [0.0791, 0.0077, 89, 169.09, 155.50, -426.72,-387.87,2.41E+1, 4.27E-2, 8.04E-5, -6.87E-8, 11.051, 19.537, 1317.23,-2.578],
          [0.0481, 0.0005, 82, 81.10, 53.60, -337.92,-301.95,2.45E+1, 4.02E-2, 4.02E-5, -4.52E-8, 6.959, 9.633, 483.88, -0.966],
          [0.0284, 0.0028, 55, 94.97, 75.97, -164.50,-126.27,3.04E+1, -8.29E-2, 2.36E-4, -1.31E-7, None, 6.645, None, None],
          [0.0379, 0.0030, 82, 72.24, 36.90, -162.03,-143.48,3.09E+1, -3.36E-2, 1.60E-4, -9.88E-8, 3.197, 9.093, 740.92, -1.713],
          [0.0741, 0.0112, 28, 92.88, 44.45, -208.04,-189.20,2.57E+1, -6.91E-2, 1.77E-4, -9.88E-8, 2.406, 16.826, 2173.72,-5.057],
          [0.0240, 0.0184, -25, 76.34, 82.83, -221.65,-197.37,-2.81, 1.11E-1, -1.16E-4, 4.94E-8, 4.490, 12.499, 3018.17,-7.314],
          [0.0098, 0.0048, 13, 31.22, 23.05, -138.16,-98.22, 1.22E+1, -1.26E-2, 6.03E-5, -3.86E-8, 5.879, 4.682, 440.24, -0.953],
          [0.0380, 0.0031, 62, 76.75, 61.20, -133.22,-120.50,6.45, 6.70E-2, -3.57E-5, 2.86E-9, 4.189, 8.972, 340.35, -0.350],
          [0.0168, 0.0015, 18, 22.42, 22.23, -132.22,-105.00,2.55E+1, -6.32E-2, 1.11E-4, -5.48E-8, 1.188, 2.410, 122.09, -0.386],
          [0.0437, 0.0064, 91, 152.54, 127.24, -66.57, -16.83, 2.59E+1, -3.74E-3, 1.29E-4, -8.88E-8, 9.679, 16.738, None, None],
          [0.0143, 0.0101, 36, -10.50, 2.08, -247.61,-250.83,6.82, 1.96E-2, 1.27E-5, -1.78E-8, 3.624, 5.909, 675.24, -1.340],
          [0.0085, 0.0076, 34, 57.55, 68.40, 55.52, 79.93, 8.83, -3.84E-3, 4.35E-5, -2.60E-8, 3.649, 6.528, None, None],
          [0.0130, 0.0114, 29, 52.82, 101.51, 31.65,75.61,1.18E+1, -2.30E-2, 1.07E-4, -6.28E-8, 7.490, 6.930, None, None],
          [None, None, None, 83.08, 68.91, 93.70, 119.66, 5.69, -4.12E-3, 1.28E-4, -8.88E-8, None, 12.169, None, None],
          [0.0255, -0.0099,None, 74.60, None, 23.61, None, None, None, None, None, None, 3.335, None, None],
          [0.0496, -0.0101,91, 125.66, 59.89, 88.43, 89.22, 3.65E+1, -7.33E-2, 1.84E-4, -1.03E-7, 2.414, 12.851, None, None],
          [0.0243, 0.0109, 38, 73.23, 66.89, -2.02, 14.07,2.69E+1, -4.12E-2, 1.64E-4, -9.76E-8, 3.515, 10.788, None, None],
          [0.0295, 0.0077, 35, 50.17, 52.66, 53.47, 89.39,-1.21, 7.62E-2, -4.86E-5, 1.05E-8, 5.009, 6.436, None, None],
          [0.0169, 0.0074, 9, 11.74, 48.84, 123.34, 163.16,-3.11E+1, 2.27E-1, -3.20E-4, 1.46E-7, 4.703, 1.896, None, None],
          [0.0031, 0.0084, 63, 63.56, 20.09, -17.33, -22.99, 3.53E+1, -7.58E-2, 1.85E-4, -1.03E-7, 2.360, 6.884, None, None],
          [0.0119, 0.0049, 54, 68.78, 34.40, 41.87, 33.12, 1.96E+1, -5.61E-3, 4.02E-5, -2.76E-8, 4.130, 6.817, None, None],
          [0.0019, 0.0051, 38, 52.10, 79.93, 39.10, 27.76, 1.67E+1, 4.81E-3, 2.77E-5, -2.11E-8, 1.557, 5.984, None, None],
          [0.0082, 0.0011, 41, 26.73, 8.13, 2.09, 11.30, -2.14, 5.74E-2, -1.64E-6, -1.59E-8, 1.101, 2.544, 259.65, -0.702],
          [0.0143, 0.0008, 32, 31.01, 37.02, 46.43, 54.05, -8.25, 1.01E-1, -1.42E-4, 6.78E-8, 2.394, 3.059, -245.74,0.912],
          [0.0100, 0.0025, 48, 27.15, 7.75, -26.80, -3.68, -6.03, 8.54E-2, -8.00E-6, -1.80E-8, 0.490, 2.398, 307.53, -0.798],
          [0.0122, 0.0004, 38, 21.78, 19.88, 8.67, 40.99, -2.05E+1, 1.62E-1, -1.60E-4, 6.24E-8, 3.243, 1.942, -394.29,1.251],
          [0.0042, 0.0061, 27, 21.32, 60.15, 79.72, 87.88, -9.09E+1, 5.57E-1, -9.00E-4, 4.69E-7, -1.373, 0.644, None, None],
          [0.0113, -0.0028,56, 18.18, -4.32, -9.630, 3.77, 2.36E+1, -3.81E-2, 1.72E-4, -1.03E-7, -0.473, 1.724, 495.01, -1.539],
          [0.0129, -0.0006,46, 24.96, 8.73, 37.97, 48.53, -8.00, 1.05E-1, -9.63E-5, 3.56E-8, 2.691, 2.205, 82.28, -0.242],
          [0.0117, 0.0011, 38, 24.14, 11.14, 83.99, 92.36, -2.81E+1, 2.08E-1, -3.06E-4, 1.46E-7, 3.063, 2.138, None, None],
          [0.0141, -0.0012,65, 23.58, -5.10, -76.45, -43.96, 1.95E+1, -8.08E-3, 1.53E-4, -9.67E-8, 0.908, 2.373, 548.29, -1.719],
          [0.0189, 0.0000, 56, 22.88, 11.27, -20.64, 8.42, -9.09E-1, 9.50E-2, -5.44E-5, 1.19E-8, 2.590, 2.226, 94.16, -0.199],
          [0.0164, 0.0020, 41, 21.74, 12.64, 29.89, 58.36, -2.30E+1, 2.04E-1, -2.65E-4, 1.20E-7, 0.749, 1.691, -322.15,1.187],
          [0.0067, 0.0043, 27, 18.25, 46.43, 82.23, 116.02, -6.62E+1, 4.27E-1, -6.41E-4, 3.01E-7, -1.460, 0.636, -573.56,2.307],
          [0.0111, -0.0057,27, -0.03, -15.78, -251.92,-247.19,2.65E+1, -9.13E-2, 1.91E-4, -1.03E-7, 1.398, -0.670, None, None],
          [0.0105, -0.0049,58, 38.13, 13.55, -71.55,-64.31, 3.33E+1, -9.63E-2, 1.87E-4, -9.96E-8, 2.515, 4.532, 625.45, -1.814],
          [0.0133, 0.0057, 71, 66.86, 43.43, -29.48, -38.06, 2.86E+1, -6.49E-2, 1.36E-4, -7.45E-8, 3.603, 6.582, 738.91, -2.038],
          [0.0068, -0.0034,97, 93.84, 41.69, 21.06, 5.74, 3.21E+1, -6.41E-2, 1.26E-4, -6.87E-8, 2.724, 9.520, 809.55, -2.224]]
    COLUMNS = ('Tc', 'Pc', 'Vc', 'Tb', 'Tm', 'Hfor', 'Gf',
               'Cpa', 'Cpb', 'Cpc', 'Cpd', 'Hfus', 'Hvap', 'Ya', 'Yb')

    # Whole-molecule descriptors, computed on the hydrogen-explicit molecule.
    mol_h = Chem.AddHs(Chem.MolFromSmiles(str(smiles)))
    NoA = float(mol_h.GetNumAtoms())
    MW = float(Descriptors.MolWt(mol_h))
    LogP = float(Descriptors.MolLogP(mol_h))

    # [[table_row, occurrence_count], ...]
    group_counts = search_func_groups(smiles)

    # Per-column contribution sums; None when any group lacks a value.
    S = {}
    for col, label in enumerate(COLUMNS):
        parts = [
            None if DB[row][col] is None else count * DB[row][col]
            for row, count in group_counts
        ]
        S[label] = None if any(p is None for p in parts) else sum(parts)

    def attempt(formula):
        # A formula touching a missing sum (None) or overflowing is reported
        # as None instead of aborting the remaining estimates.
        try:
            return formula()
        except (TypeError, ZeroDivisionError, OverflowError, ValueError):
            return None

    temperature = 310.
    estimates = {
        'BoilingPoint': attempt(lambda: 198.2 + S['Tb']),
        'MeltingPoint': attempt(lambda: 122.5 + S['Tm']),
        'CriticalTemp': attempt(lambda: (S['Tb'] + 198.2)/(0.584 + 0.965*S['Tc'] - S['Tc']**2)),
        'CriticalPress': attempt(lambda: 1./(0.113 + 0.0032*NoA - S['Pc'])**2),
        'CriticalVolume': attempt(lambda: 17.5 + S['Vc']),
        'EnthalpyForm': attempt(lambda: 68.29 + S['Hfor']),
        'GibbsEnergy': attempt(lambda: 53.88 + S['Gf']),
        'HeatCapacity': attempt(lambda: (S['Cpa'] - 37.93) + (S['Cpb'] + 0.210)*temperature + (S['Cpc'] - 3.91*10**(-4))*temperature**2 + (S['Cpd'] + 2.06*10**(-7))*temperature**3),
        'EnthalpyVap': attempt(lambda: 15.30 + S['Hvap']),
        'EnthalpyFus': attempt(lambda: -0.88 + S['Hfus']),
        'LiquidVisco': attempt(lambda: MW*math.exp((S['Ya'] - 597.82)/temperature + S['Yb'] - 11.202)),
        'CrystalSolub_1': attempt(lambda: 10**(0.8 - LogP - 0.01*(S['Tm']+122.5 - 273.15 - 25.))*1000.*MW),
        'CrystalSolub_2': attempt(lambda: 10**(0.5 - LogP - 0.01*(S['Tm']+122.5 - 273.15 - 25.))*1000.*MW),
        'AmorphSolub_1': attempt(lambda: 10**(0.8 - LogP - 0.01*(S['Tm']+122.5 - 273.15 - 25.))*1000.*MW*math.exp((S['Hfus']-0.88)*(S['Tm'] + 122.5 - temperature)*temperature/(S['Tm'] + 122.5)**2/(2.479*temperature/298.))),
        'AmorphSolub_2': attempt(lambda: 10**(0.5 - LogP - 0.01*(S['Tm']+122.5 - 273.15 - 25.))*1000.*MW*math.exp((S['Hfus']-0.88)*(S['Tm'] + 122.5 - temperature)*temperature/(S['Tm'] + 122.5)**2/(2.479*temperature/298.))),
    }

    melting_point = estimates['MeltingPoint']
    if melting_point is None:
        return None
    return melting_point - 273.15
|
|
def _mp_pubchem(name, namespace):
    """Melting point (deg C) from the PubChem 'Experimental Properties' section.

    Returns ``(value, 'pubchem')`` or ``(None, None)``.  Entries that start
    with a number (or ``a-b`` range) followed by C or F are collected,
    Fahrenheit values are converted, outliers beyond 1.5 interquartile ranges
    are discarded, and the mean of the remainder is returned.
    """
    try:
        cid = pcp.get_compounds(name, namespace=namespace)[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        # A timeout keeps a stalled connection from hanging the caller.
        content = json.loads(requests.get(url, timeout=15).text)
        sections = content['Record']['Section']
    except Exception:
        return None, None

    # (pattern, converter to deg C); the third regex group is the unit.
    readers = (
        (r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?C)',
         lambda v: v),
        (r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?F)',
         lambda v: (v - 32) * 5 / 9),
    )

    values = []
    for top in sections:
        if top.get('TOCHeading') != 'Chemical and Physical Properties':
            continue
        for mid in top.get('Section', []):
            if mid.get('TOCHeading') != 'Experimental Properties':
                continue
            for leaf in mid.get('Section', []):
                if leaf.get('TOCHeading') != 'Melting Point':
                    continue
                for info in leaf.get('Information', []):
                    try:
                        text = info['Value']['StringWithMarkup'][0]['String']
                    except (KeyError, IndexError, TypeError):
                        continue
                    if not isinstance(text, str):
                        continue
                    for pattern, to_celsius in readers:
                        m = re.match(pattern, text)
                        if m is None:
                            continue
                        values.extend(
                            to_celsius(float(g)) for g in m.group(1, 2) if g is not None
                        )
    if not values:
        return None, None

    # Interquartile-range outlier rejection, then average.
    values = np.array(values)
    q75, q25 = np.percentile(values, [75, 25])
    intr_qr = q75 - q25
    keep = (values <= q75 + 1.5 * intr_qr) & (values >= q25 - 1.5 * intr_qr)
    return np.mean(values[keep]), 'pubchem'


def _mp_dsstox(name):
    """Melting point scraped from the EPA CompTox dashboard.

    Returns ``(value, 'dsstox/expt')``, ``(value, 'dsstox/pred')`` or
    ``(None, None)``.  Requires the selenium setup guarded by ``try_dsstox``.
    """
    import urllib.parse
    import urllib.request

    # Step 1: find the DTXSID of the first search hit.
    dtxsid = None
    try:
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url) as fid:
            webpage = fid.read().decode('utf-8')
        hits = re.findall(r'DTXSID[0-9]+', webpage)
        if hits:
            dtxsid = hits[0]
    except Exception:
        pass
    if not dtxsid:
        return None, None

    # Step 2: render the properties page in headless Chrome.
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    mysoup = None
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        service = Service(driver_exe)
        driver = selenium.webdriver.Chrome(service=service, options=options)
        driver.set_page_load_timeout(15)
        driver.get(url)
        mysoup = bs4.BeautifulSoup(driver.page_source, features='lxml')
    except Exception:
        pass
    finally:
        # Always shut the browser down; it used to be left running whenever
        # the page load failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
    if mysoup is None:
        return None, None

    # Step 3: locate the 'Melting Point' row.  Row 0 is a valid match (the
    # old truthiness test ``if ifound:`` skipped it).
    ifound = None
    for i, row in enumerate(mysoup.find_all('div', attrs={'col-id': 'property'})):
        if 'Melting Point' in row.text:
            ifound = i
            break
    if ifound is None:
        return None, None

    # Step 4: prefer the experimental average, else the predicted average.
    for column, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
        cells = mysoup.find_all('div', attrs={'col-id': column})
        try:
            value = float(re.sub(r' \([0-9]*\)', '', cells[ifound].text.strip()))
        except (IndexError, ValueError):
            continue
        if not pd.isna(value):
            return value, origin
    return None, None


def string2mp(name, namespace='name'):
    """Look up the melting point (deg C) of a chemical by CAS number or name.

    For CAS input the local sources are tried first: ``chemicals.Tm``
    (kelvin, converted), the PHYSPROP table, then the CompTox prediction
    table.  After that, for any input: PubChem experimental data and -- when
    ``try_dsstox`` is enabled -- the CompTox dashboard.

    ``namespace`` is forwarded to the PubChem compound search.

    Returns ``(mp, mp_origin)``; both are None when nothing is found.
    """
    mp, mp_origin = None, None

    if is_cas(name):
        cas = name

        # 1. chemicals package (returns kelvin)
        mp = chemicals.Tm(cas)
        if mp:
            mp = mp - 273.15
        methods = chemicals.Tm_methods(cas)
        if methods:
            mp_origin = 'chem/' + methods[0]

        # 2. PHYSPROP table; a cell may list several CAS numbers, so the
        #    substring hit is confirmed against the split list.
        if pd.isna(mp):
            mask = dfmp_expt['CAS'].str.contains(cas, regex=False, na=False)
            if mask.any():
                candidates = dfmp_expt.loc[mask, 'CAS'].str.split(', ')
                for label, listed in zip(candidates.index, candidates):
                    if cas in listed:
                        # ``label`` is an index label, hence .loc
                        mp = float(dfmp_expt.loc[label, 'MP'])
                        mp_origin = 'expt'

        # 3. CompTox prediction table
        if pd.isna(mp):
            hits = df_pred.loc[df_pred['CASRN'] == cas, 'MELTING_POINT_DEGC_OPERA_PRED']
            if len(hits):
                mp = float(hits.iloc[0])
                mp_origin = 'comptox/pred'

    # 4. PubChem experimental properties
    if pd.isna(mp):
        mp, mp_origin = _mp_pubchem(name, namespace)

    # 5. CompTox dashboard (slow: launches a browser)
    if try_dsstox and pd.isna(mp):
        mp, mp_origin = _mp_dsstox(name)

    if pd.isna(mp):
        mp, mp_origin = None, None
    return mp, mp_origin
|
|
def smiles2mp(smiles):
    """Group-contribution melting point for ``smiles``, or None.

    The estimate is only attempted when every atom is one of C, N, O, F, S,
    Cl, Br or I; any error along the way also yields None.
    """
    supported = {6, 7, 8, 9, 16, 17, 35, 53}
    try:
        molecule = Chem.MolFromSmiles(str(smiles))
        present = {atom.GetAtomicNum() for atom in molecule.GetAtoms()}
        if not present.issubset(supported):
            return None
        return compute_phys_properties(smiles)
    except:
        return None
|
|
def smiles2mp_opera(smiles):
    """Predict the melting point of ``smiles`` with the pickled kNN model.

    PaDEL descriptors are computed, coerced to floats (unparsable or missing
    values become 0.0), restricted to the model's descriptor list, scaled,
    fed to the kNN regressor, and the prediction is mapped back through the
    target scaler.
    """
    model = my_opera_data_mp

    raw_descriptors = padelpy.from_smiles(smiles, descriptortypes='mp/descriptors.xml')
    frame = pd.DataFrame(raw_descriptors, index=[0])
    frame = frame.apply(pd.to_numeric, errors="coerce")
    frame = frame.fillna(0.0).astype(float)

    features = np.array(frame[model.desc_list])
    scaled_features = model.scaler_X.transform(features)
    scaled_prediction = model.knn_all.predict(scaled_features)
    prediction = model.scaler_y.inverse_transform(scaled_prediction)
    return prediction[0][0]
|
|
def mol2mp(cas, name, smiles):
    """Best available melting point for a chemical.

    Sources are tried in this order until one yields a value: the kNN model
    on the SMILES, string2mp() on the CAS number, string2mp() on the name,
    and finally the group-contribution estimate on the SMILES.  Any source
    that raises is skipped.

    Returns ``(mp, mp_origin)``; both are None when every source fails.
    Previously a failed final estimate was still returned with the origin
    'joback-reid/calc'.
    """
    mp, mp_origin = None, None

    if smiles:
        try:
            mp = smiles2mp_opera(smiles)
            mp_origin = 'opera/calc'
        except Exception:
            mp, mp_origin = None, None

    if pd.isna(mp) and cas:
        try:
            mp, mp_origin = string2mp(cas)
        except Exception:
            mp, mp_origin = None, None

    if pd.isna(mp) and name:
        try:
            mp, mp_origin = string2mp(name)
        except Exception:
            mp, mp_origin = None, None

    if pd.isna(mp) and smiles:
        try:
            mp = smiles2mp(smiles)
            mp_origin = 'joback-reid/calc'
        except Exception:
            mp, mp_origin = None, None

    # Never report an origin without a value.
    if pd.isna(mp):
        mp, mp_origin = None, None
    return mp, mp_origin
|
|
def getLogP(cas, mol):
    """Return ``(LogP, LogP_origin)`` for a chemical.

    The CompTox prediction table is consulted first when a CAS number is
    given (origin ``'comptox/pred'``); otherwise, or when the table has no
    usable value, RDKit's Crippen estimate is computed from ``mol`` (origin
    ``'rdkit/calc'``).  Both elements are None when neither is available.
    """
    LogP, LogP_origin = None, None
    if cas:
        hits = df_pred.loc[df_pred['CASRN'] == cas, 'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED']
        if len(hits):
            # Take the first row: float() on a multi-row selection raises,
            # and on a one-row Series it is deprecated in recent pandas.
            LogP = float(hits.iloc[0])
            LogP_origin = 'comptox/pred'
    if pd.isna(LogP):
        if mol:
            LogP = Crippen.MolLogP(mol)
            LogP_origin = 'rdkit/calc'
        else:
            # A NaN table entry with no molecule to fall back on.
            LogP, LogP_origin = None, None
    return LogP, LogP_origin
|
|
|
|