CHRIS / ChemID.py
Robert Elder
kludge for several specific ceramics
6a4a1e6
import sys,os,glob,re,string
from collections import Counter
from operator import itemgetter
import nltk
import pandas as pd
import numpy as np
import pubchempy as pcp
import cirpy
import chemicals
import bs4
import urllib
import requests
import json
# Atomic numbers of the elements treated as "organic" (B, C, N, O, F, P, S, Cl, Br, I).
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}

# Atomic numbers of the elements treated as metals by CeramicOrMetal().
METAL_ATOM_SET = {
    3, 4, 11, 12, 13,
    *range(19, 31 + 1),
    *range(37, 50 + 1),
    *range(55, 84 + 1),
    *range(87, 114 + 1),
    116,
}

# One entry per line; CeramicOrMetal() looks up a comma-joined, sorted list of
# element symbols in these sets.
with open('data/ceramics_list.txt', 'r') as fp:
    lines = fp.readlines()
CERAMICS_SET = set(map(str.strip, lines))

with open('data/salt_list.txt', 'r') as fp:
    lines = fp.readlines()
SALT_SET = set(map(str.strip, lines))

# Human-readable messages for the integer error code returned by ResolveChemical().
ERROR_CODES = {
    0: None,
    1: 'Structure could not be determined from the identifier',
    2: 'Invalid SMILES code',
    3: 'Invalid CAS number',
    4: 'Invalid identifier type selected',
}
## not sure if this will be possible on pythonanywhere; use this flag to disable related code blocks
try_dsstox = True
if try_dsstox:
    import platform
    import selenium
    import selenium.webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    ## NOTE this does not seem very robust
    # platform.system() works on every OS; os.uname() is Unix-only, so the old
    # "fallback" branch could never be reached on Windows (the import crashed first).
    if platform.system() == 'Linux':
        # pythonanywhere
        driver_exe = '/usr/local/bin/chromedriver'
    else:
        # mac ('Darwin') and any other platform: use the driver shipped with chromedriver_binary
        import chromedriver_binary
        driver_exe = chromedriver_binary.chromedriver_filename
from io import BytesIO
from PIL import ImageOps
import base64
import rdkit
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors,Draw,Crippen
## add custom chemical definitions (i.e., to correct confusion between methane and carbon)
## NOTE: this runs at import time and mutates the database object returned by the chemicals package
db = chemicals.identifiers.get_pubchem_db()
db.load('data/custom_chemicals_db.tsv')
## load experimental and predicted properties
## (paths are relative, so the module must be imported with the project root as working directory)
#dfmp_expt = pd.read_excel('PHYSPROP_MP_data.xlsx')
# experimental melting points; string2mp() reads the 'CAS' and 'MP' columns
dfmp_expt = pd.read_csv('data/PHYSPROP_MP_data.tsv', sep='\t')
#dfmp_pred = pd.read_excel('DSSTOX_MP_pred_data.xlsx')
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
# predicted properties; looked up by the 'CASRN' column in string2density() and string2mp()
df_pred = pd.read_csv('data/Comptox_pred_data.tsv', sep='\t')
## OPERA melting point model
import dill as pickle
import sklearn
import sklearn.neighbors
import sklearn.metrics
import padelpy
from functions import weight_func
class opera_data_mp():
    """Holder for the pickled melting point k-NN model and its scalers.

    Everything is a class attribute, loaded from disk once when the class body
    executes (i.e. at import time). The file handle name ``fp`` also ends up as a
    class attribute as a side effect of the ``with`` statements.
    """
    n_neighbors = 5
    #weight_factor = 5e-3
    # names of the descriptors fed to the model (presumably PaDEL descriptor names, given the padelpy import -- TODO confirm)
    desc_list = ['SHBd', 'nN', 'maxHBd', 'ATSC1v', 'AATS1i', 'TopoPSA', 'nT6Ring', 'nHBDon', 'WTPT-5', 'minHBd', 'nHBint2', 'IC0', 'MLFER_S', 'MLFER_BO', 'WTPT-3']
    # NOTE(review): unpickling executes arbitrary code; these files must come from a trusted source
    with open('mp/model-opera-knn.pkl', 'rb') as fp:
        knn_all = pickle.load(fp)
    # re-attach the weighting callable after unpickling
    knn_all.weights = weight_func # fix weird problem on pythonanywhere...
    with open('mp/model-opera-scalerX.pkl', 'rb') as fp:
        scaler_X = pickle.load(fp)
    with open('mp/model-opera-scalerY.pkl', 'rb') as fp:
        scaler_y = pickle.load(fp)
# module-level instance used to access the loaded model
my_opera_data_mp = opera_data_mp()
def _parse_smiles(smiles):
    """Parse *smiles* with RDKit and return the Mol, or None when parsing fails."""
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # RDKit normally returns None for bad SMILES, but can raise for bad argument types
        return None


def _mol_descriptors(mol, smiles, cas, get_properties, logp_func):
    """Return (Mw, LogP, LogP_origin, im64) for an already-parsed molecule."""
    Mw = Descriptors.MolWt(mol)
    LogP, LogP_origin = None, None
    if 'logp' in get_properties:
        LogP, LogP_origin = logp_func(cas, mol)
    im = ImageFromSmiles(smiles)
    # ImageFromSmiles returns None when drawing fails; do not try to encode that
    im64 = Imageto64(im) if im is not None else None
    return Mw, LogP, LogP_origin, im64


def ResolveChemical(chemName, IDtype, debug=False, get_properties=('logp','rho','mp')):
    """Resolve a chemical identifier into name, CAS, SMILES and basic properties.

    Parameters:
        chemName: the identifier text (surrounding whitespace is stripped).
        IDtype: 'CAS', 'SMILES' or 'common' (a chemical name).
        debug: when True, the origin of LogP, density and melting point is
            included in the returned tuple.
        get_properties: collection of property keys to look up; any of
            'logp', 'rho', 'mp'.

    Returns:
        (name, cas, smiles, Mw, LogP, rho, mp, im64, error), or with debug=True
        (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp,
        mp_origin, im64, error). ``error`` is a key of ERROR_CODES: 0 success,
        1 no structure found, 2 invalid SMILES, 3 invalid CAS, 4 invalid IDtype.
        On success a missing name or CAS is reported as 'Not found'.
    """
    #LogP_func = Crippen.MolLogP
    LogP_func = getLogP
    # remove excess whitespace
    chemName = chemName.strip()
    name = smiles = cas = None
    Mw = LogP = rho = mp = im64 = None
    mp_origin = rho_origin = LogP_origin = None
    error = 0
    want_rho = 'rho' in get_properties
    want_mp = 'mp' in get_properties

    if IDtype == 'CAS':
        cas = chemName
        if not is_cas(cas):
            # fall through to the common return so the tuple shape honours `debug`
            error = 3 #invalid cas
        else:
            smiles = cas2smiles(cas)
            if not smiles:
                error = 1 # no smiles found
            else:
                name = cas2name(cas)
                if want_rho:
                    rho, rho_origin = string2density(cas)
                if want_mp:
                    mp, mp_origin = mol2mp(cas, name, smiles)
                if want_rho and pd.isna(rho) and name:
                    rho, rho_origin = string2density(name)
                mol = _parse_smiles(smiles)
                if mol:
                    Mw, LogP, LogP_origin, im64 = _mol_descriptors(mol, smiles, cas, get_properties, LogP_func)
                else:
                    error = 2 #invalid smiles
    elif IDtype == 'SMILES':
        smiles = chemName
        mol = _parse_smiles(smiles)
        if not mol:
            # if SMILES is not valid, skip the other stuff
            error = 2
        else:
            name = smiles2name(smiles)
            if name:
                cas = name2cas(name)
                if want_rho:
                    rho, rho_origin = string2density(name)
            if want_rho and pd.isna(rho) and cas:
                rho, rho_origin = string2density(cas)
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            # descriptors are computed after the CAS lookup so the LogP function
            # receives the resolved CAS, as it does in the other two branches
            Mw, LogP, LogP_origin, im64 = _mol_descriptors(mol, smiles, cas, get_properties, LogP_func)
    elif IDtype == 'common':
        name, _ = name2iupac(chemName)
        smiles = name2smiles(name)
        cas = name2cas(name)
        if not smiles:
            smiles = cas2smiles(cas)
        if want_rho and cas:
            rho, rho_origin = string2density(cas)
        if want_rho and pd.isna(rho):
            rho, rho_origin = string2density(name)
        if want_rho and pd.isna(rho):
            # try this because sometimes iupac names don't work
            rho, rho_origin = string2density(chemName)
        if not smiles:
            error = 1
        else:
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            mol = _parse_smiles(smiles)
            if mol:
                Mw, LogP, LogP_origin, im64 = _mol_descriptors(mol, smiles, cas, get_properties, LogP_func)
            else:
                error = 2
    else:
        ## should never be here
        error = 4 # invalid IDtype selection, probably not possible

    # if we couldn't find a name or CAS (but do have SMILES)
    if not error:
        if not name:
            name = 'Not found'
        if not cas:
            cas = 'Not found'
    if mp is not None:
        mp = float(mp)
    if debug:
        return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp, mp_origin, im64, error)
    return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)
def CeramicOrMetal(smiles,mp):
    """Classify a structure as metal and/or ceramic.

    Parameters:
        smiles: SMILES string of the material.
        mp: melting point used for the ceramic heuristic, or None if unknown.

    Returns:
        (is_metal, is_ceramic). A structure is a metal when every atom is in
        METAL_ATOM_SET. A non-metal is a ceramic when its element combination
        is listed in CERAMICS_SET or SALT_SET, or when it has no carbon-carbon
        bond and mp > 700 (sodium chloride has mp ~ 800).
    """
    mol = Chem.MolFromSmiles(smiles)
    atomic_numbers = {atom.GetAtomicNum() for atom in mol.GetAtoms()}
    is_metal = atomic_numbers.issubset(METAL_ATOM_SET)
    if is_metal:
        return is_metal, False

    # check composition against list of ceramics/salts
    symbols = {atom.GetSymbol() for atom in mol.GetAtoms()}
    elements = ','.join(sorted(symbols))
    if elements in CERAMICS_SET or elements in SALT_SET:
        return is_metal, True

    # fall back to the heuristic: no C-C bonds and a high melting point
    has_cc_bond = any(
        bond.GetBeginAtom().GetAtomicNum() == 6 and bond.GetEndAtom().GetAtomicNum() == 6
        for bond in mol.GetBonds()
    )
    if has_cc_bond or mp is None:
        return is_metal, False
    if mp > 700.:
        return is_metal, True
    return is_metal, False
def ImageFromSmiles(smiles):
    """Draw the molecule described by *smiles* as a 350x350 image.

    Returns None when *smiles* is not a str or when drawing raises ValueError.
    From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py
    """
    if type(smiles) is not str:
        return None
    # this one complex is read from a stored SDF file instead of being parsed from SMILES
    mnpc_smiles = 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]'
    try:
        if smiles == mnpc_smiles:
            mol = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
        else:
            mol = Chem.MolFromSmiles(smiles)
        return Draw.MolToImage(mol, size=(350, 350))
    except ValueError:
        return None
def wTrim(img):
    """Crop *img* to the bounding box of its non-white content.

    The image is inverted so that white background becomes zero, which is what
    getbbox() treats as empty. A completely white image has no bounding box
    (getbbox() returns None); it is returned unchanged instead of raising.
    No minimum size is enforced.
    """
    bbox = ImageOps.invert(img).getbbox()
    if bbox is None:
        return img
    return img.crop(bbox)
def Imageto64(img):
    """Convert a PIL image into a base64 PNG data URI.

    The image is trimmed with wTrim() first. Returns None when *img* is None,
    which is what ImageFromSmiles() yields when a molecule cannot be drawn.
    """
    if img is None:
        return None
    img = wTrim(img)
    buf = BytesIO()
    img.save(buf, format="PNG")
    return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")
def smiles2name(smiles):
    """Convert a SMILES string to a chemical name, or None if nothing resolves it.

    Sources are tried in order: the chemicals package, PubChem compounds,
    cirpy, and finally PubChem substances. Lookup failures of any source are
    treated as "not found"; KeyboardInterrupt/SystemExit are not swallowed.
    """
    name = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(smiles)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None
    # then try pubchem for compounds
    if not name:
        try:
            compound = pcp.get_compounds(smiles, namespace='smiles')[0]
            # have seen empty iupac_name before, try synonyms if this happens
            name = compound.iupac_name or compound.synonyms[0]
        except Exception:
            name = None
    # next try cirpy
    if not name:
        try:
            name = cirpy.resolve(smiles, 'iupac_name')
        except Exception:
            name = None
        if isinstance(name, list):
            name = name[0] if name else None
    # finally try it as a pubchem substance
    if not name:
        try:
            # NOTE(review): the PubChem substance domain may not accept a 'smiles'
            # namespace, in which case this request fails and name stays None -- confirm
            substances = pcp.get_substances(smiles, namespace='smiles')
            # sometimes there are multiple substances, and multiple synonyms per substance;
            # skip CAS numbers so a registry number is never returned as a name
            # (the previous filter referenced an undefined variable and always failed)
            allsyns = [syn for s in substances for syn in s.synonyms if not is_cas(syn)]
            # choose the most common synonym
            name = Counter(allsyns).most_common(1)[0][0]
        except Exception:
            name = None
    return name
def cas2smiles(cas):
    """Convert a CAS number to a SMILES string, or None if nothing resolves it.

    Sources are tried in order: the chemicals package, PubChem compounds,
    cirpy, and finally PubChem substances. Lookup failures of any source are
    treated as "not found"; KeyboardInterrupt/SystemExit are not swallowed.
    """
    smiles = None
    # first try chemicals package
    try:
        smiles = chemicals.search_chemical(cas).smiles
    except Exception:
        smiles = None
    # then try pubchem for compounds
    if not smiles:
        try:
            smiles = pcp.get_compounds(cas, namespace='name')[0].isomeric_smiles
        except Exception:
            smiles = None
    # next try cirpy
    if not smiles:
        try:
            smiles = cirpy.resolve(cas, 'smiles')
        except Exception:
            smiles = None
        if isinstance(smiles, list):
            smiles = smiles[0] if smiles else None
    # finally try it as a pubchem substance
    if not smiles:
        try:
            substances = pcp.get_substances(cas, namespace='name')
            # Substance records carry no SMILES of their own; vote on the
            # standardized compound each substance maps to and fetch that one.
            cids = []
            for substance in substances:
                try:
                    cid = substance.standard_cid
                except (KeyError, TypeError):
                    cid = None
                if cid is not None:
                    cids.append(cid)
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    return smiles
def cas2name(cas):
    """Convert a CAS number to a chemical name, or None if nothing resolves it.

    Sources are tried in order: the chemicals package, cirpy, then PubChem
    compounds. Lookup failures of any source are treated as "not found";
    KeyboardInterrupt/SystemExit are not swallowed.
    """
    name = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(cas)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None
    # then try cirpy
    if not name:
        try:
            name = cirpy.resolve(cas, 'iupac_name')
        except Exception:
            name = None
        if isinstance(name, list):
            name = name[0] if name else None
    # next try pubchem for compounds
    if not name:
        try:
            compound = pcp.get_compounds(cas, namespace='name')[0]
            # have seen empty iupac_name before, try synonyms if this happens
            name = compound.iupac_name or compound.synonyms[0]
        except Exception:
            name = None
    return name
def _name_from_chemicals(query):
    """Look *query* up in the chemicals package; prefer the IUPAC name."""
    cm = chemicals.search_chemical(query)
    return cm.iupac_name or cm.common_name or None


def _name_from_pubchem_compound(query):
    """Look *query* up as a PubChem compound name."""
    compound = pcp.get_compounds(query, namespace='name')[0]
    # have seen empty iupac_name before, try synonyms if this happens
    return compound.iupac_name or compound.synonyms[0]


def _name_from_cirpy(query):
    """Resolve *query* to an IUPAC name with cirpy (first hit if several)."""
    name = cirpy.resolve(query, 'iupac_name')
    if isinstance(name, list):
        name = name[0] if name else None
    return name


def _name_from_pubchem_substance(query):
    """Return the most common non-CAS synonym among PubChem substances named *query*."""
    substances = pcp.get_substances(query, namespace='name')
    # sometimes there are multiple substances, and multiple synonyms per substance;
    # skip CAS numbers so a registry number is never returned as a name
    allsyns = [syn for s in substances for syn in s.synonyms if not is_cas(syn)]
    return Counter(allsyns).most_common(1)[0][0]


def name2iupac(string):
    """Convert a chemical name to its IUPAC (or best available) name.

    Sources are tried in order: chemicals, PubChem compounds, cirpy, PubChem
    substances; if all fail and the input contains spaces, PubChem compounds
    and cirpy are retried with every space removed.

    Returns:
        (name, origin) where origin is one of 'chemicals', 'PubChem', 'CIRPY',
        'PubChem/substance'; (None, None) when nothing resolves the input.
    """
    # (origin label, lookup function, whether to query with spaces stripped)
    attempts = (
        ('chemicals', _name_from_chemicals, False),
        ('PubChem', _name_from_pubchem_compound, False),
        ('CIRPY', _name_from_cirpy, False),
        ('PubChem/substance', _name_from_pubchem_substance, False),
        # strip all spaces and try again...
        ('PubChem', _name_from_pubchem_compound, True),
        ('CIRPY', _name_from_cirpy, True),
    )
    for origin, lookup, strip_spaces in attempts:
        if strip_spaces:
            if ' ' not in string:
                # stripped query would be identical to one that already failed
                continue
            query = re.sub(' ', '', string)
        else:
            query = string
        try:
            name = lookup(query)
        except Exception:
            name = None
        if name:
            return name, origin
    return None, None
def name2cas(name):
    """Convert a chemical name to a CAS number, or None if nothing resolves it.

    Sources are tried in order: the chemicals package, cirpy, then the
    synonyms of the first PubChem compound hit. When a source offers several
    CAS numbers, the one whose three hyphen-separated fields have the smallest
    numeric sum is chosen (first one wins on ties).
    """
    def _field_sum(cas_number):
        return sum(int(field) for field in cas_number.split('-'))

    cas = None
    # try chemicals package
    try:
        cas = chemicals.search_chemical(name).CASs
    except Exception:
        cas = None
    # then try cirpy
    if not cas:
        try:
            cas = cirpy.resolve(name, 'cas')
        except Exception:
            cas = None
        if isinstance(cas, list):
            # keep only well-formed CAS numbers so the numeric sort key cannot raise
            valid = [c for c in cas if is_cas(c)]
            cas = min(valid, key=_field_sum) if valid else None
    # next try pubchem for compounds
    if not cas:
        try:
            compound = pcp.get_compounds(name, namespace='name')[0]
            possible_cas = [syn for syn in compound.synonyms if is_cas(syn)]
            cas = min(possible_cas, key=_field_sum)
        except Exception:
            cas = None
    return cas
def name2smiles(name):
    """Convert a chemical name to a SMILES string, or None if nothing resolves it.

    Sources are tried in order: the chemicals package, PubChem compounds,
    cirpy, and finally PubChem substances. Lookup failures of any source are
    treated as "not found"; KeyboardInterrupt/SystemExit are not swallowed.
    """
    smiles = None
    # first try chemicals package
    try:
        smiles = chemicals.search_chemical(name).smiles
    except Exception:
        smiles = None
    # then try pubchem for compounds
    if not smiles:
        try:
            smiles = pcp.get_compounds(name, namespace='name')[0].isomeric_smiles
        except Exception:
            smiles = None
    # next try cirpy
    if not smiles:
        try:
            smiles = cirpy.resolve(name, 'smiles')
        except Exception:
            smiles = None
        if isinstance(smiles, list):
            smiles = smiles[0] if smiles else None
    # then try it as a pubchem substance
    if not smiles:
        try:
            substances = pcp.get_substances(name, namespace='name')
            # Substance records carry no SMILES of their own; vote on the
            # standardized compound each substance maps to and fetch that one.
            cids = []
            for substance in substances:
                try:
                    cid = substance.standard_cid
                except (KeyError, TypeError):
                    cid = None
                if cid is not None:
                    cids.append(cid)
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    # NOTE: a further fallback that resolved the name with a local OPSIN jar
    # (java -jar opsin-2.4.0-jar-with-dependencies.jar -osmi) used to follow here
    # but was disabled; it depended on a user-specific install path.
    return smiles
def check_cas(cas):
    """Return True when *cas* is a well-formed CAS number with a valid check digit.

    The expected format is 2-7 digits, a hyphen, 2 digits, a hyphen and one
    check digit (e.g. '7732-18-5'). Strings in any other shape return False.
    The check digit must equal, modulo 10, the sum of the other digits each
    multiplied by its position counted from the right.

    Raises:
        TypeError: if *cas* is not a string.
    """
    if re.fullmatch(r'[0-9]{2,7}-[0-9]{2}-[0-9]', cas) is None:
        return False
    n1, n2, n3 = cas.split('-')
    # combine and flip first 2 numbers so position 1 is the rightmost digit
    digits = (n1 + n2)[::-1]
    # sum of number*position in string, mod 10
    check = sum(pos * int(d) for pos, d in enumerate(digits, start=1)) % 10
    # if these match, then it's a legit cas number
    return check == int(n3)
def is_cas(cas):
    """Return True when *cas* is a valid CAS number, False for anything else.

    Never raises for bad input (None, non-strings, malformed text); such
    values are simply not CAS numbers. KeyboardInterrupt/SystemExit still
    propagate.
    """
    try:
        return check_cas(cas)
    except Exception:
        return False
def _iqr_filtered_mean(values):
    """Return the mean of *values* after removing outliers using the interquartile range (IQR)."""
    arr = np.array(values)
    q75, q25 = np.percentile(arr, [75, 25])
    intr_qr = q75 - q25
    hi = q75 + (1.5 * intr_qr)
    lo = q25 - (1.5 * intr_qr)
    return np.mean(arr[(arr <= hi) & (arr >= lo)])


def _pubchem_density(name):
    """Scrape experimental density values for *name* from PubChem PUG-View.

    Returns the outlier-filtered mean of every leading number (or number
    range) found in the 'Density' entries, or None when nothing usable is found.
    """
    try:
        cid = pcp.get_compounds(name, namespace='name')[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        content = json.loads(requests.get(url, timeout=15).text)
    except Exception:
        return None
    if not isinstance(content, dict):
        return None

    rho_list = []
    # .get() with defaults: error responses and sparse records lack these keys
    for top in content.get('Record', {}).get('Section', []):
        if top.get('TOCHeading') != 'Chemical and Physical Properties':
            continue
        for sub in top.get('Section', []):
            if sub.get('TOCHeading') != 'Experimental Properties':
                continue
            for prop in sub.get('Section', []):
                if prop.get('TOCHeading') != 'Density':
                    continue
                for info in prop.get('Information', []):
                    try:
                        rho_string = info['Value']['StringWithMarkup'][0]['String']
                    except (KeyError, IndexError, TypeError):
                        continue
                    rho_string = rho_string.replace('Relative density (water = 1): ', '')
                    # a number at the start of the text, optionally followed by "-<number>" (a range)
                    m = re.match(r'((?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?', rho_string)
                    if m is None:
                        continue
                    rho_list.extend(float(g) for g in m.groups() if g is not None)
    if not rho_list:
        return None
    return _iqr_filtered_mean(rho_list)


def _dsstox_density(name):
    """Scrape density for *name* from the EPA CompTox dashboard.

    Returns (rho, origin) with origin 'dsstox/expt' or 'dsstox/pred', or
    (None, None) when the substance or a numeric density cannot be found.
    """
    dtxsid = None
    try:
        # try to find it via the dsstox dashboard
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url, timeout=15) as fid:
            webpage = fid.read().decode('utf-8')
        hits = re.findall(r'DTXSID[0-9]+', webpage)
        if hits:
            dtxsid = hits[0]
    except Exception:
        pass
    if not dtxsid:
        return None, None

    mysoup = None
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    driver = None
    try:
        options = Options()
        options.add_argument("--headless") # runs in background instead of showing browser window
        service = Service(driver_exe)
        driver = selenium.webdriver.Chrome(service=service, options=options)
        driver.set_page_load_timeout(15)
        driver.get(url)
        mysoup = bs4.BeautifulSoup(driver.page_source, features='lxml')
    except Exception:
        pass
    finally:
        # always shut the browser down, even when the page load times out
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
    if not mysoup:
        return None, None

    # column of property names
    ifound = None
    for i, row in enumerate(mysoup.find_all('div', attrs={'col-id':'property'})):
        if 'Density' in row.text:
            ifound = i
            break
    # compare against None: index 0 is a valid match
    if ifound is None:
        return None, None

    # prefer the experimental average, fall back to the predicted average
    for col_id, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
        rows = mysoup.find_all('div', attrs={'col-id': col_id})
        try:
            value = re.sub(r' \([0-9]*\)', '', rows[ifound].text.strip())
            rho = float(value)
        except (IndexError, ValueError):
            continue
        if not pd.isna(rho):
            return rho, origin
    return None, None


def string2density(name):
    """Look up the density of a chemical given its name or CAS number.

    Sources are tried in order until one yields a value: the local CompTox
    prediction table (CAS input only), PubChem experimental properties, and,
    when try_dsstox is enabled, the CompTox dashboard.

    Returns:
        (rho, rho_origin); both are None when no density is found. rho_origin
        is one of 'comptox/pred', 'pubchem', 'dsstox/expt', 'dsstox/pred'.
    """
    rho, rho_origin = None, None
    # predicted values from TEST (CompTox dashboard)
    if is_cas(name):
        matches = df_pred.loc[df_pred['CASRN'] == name, 'DENSITY_G/CM^3_TEST_PRED']
        if len(matches):
            # take the first row; float() on a multi-row Series would raise
            rho = float(matches.iloc[0])
            rho_origin = 'comptox/pred'
    # try to scrape from PubChem
    if pd.isna(rho):
        rho = _pubchem_density(name)
        rho_origin = 'pubchem' if rho is not None else None
    # try to scrape from DSSTOX
    if try_dsstox and pd.isna(rho):
        rho, rho_origin = _dsstox_density(name)
    if pd.isna(rho):
        rho, rho_origin = None, None
    return rho, rho_origin
def return_non_duplicate_index(tuples):
    """Count, per group, the atom matches that survive overlap removal.

    from https://github.com/curieshicy/JRgui/

    Parameters:
        tuples: a list with one entry per group; each entry is a sequence of
            matches and each match is a sequence of atom indices.

    Returns:
        A list of [group_position, surviving_match_count] pairs sorted by
        group position. Groups with no surviving match are omitted. When two
        matches share an atom the smaller one is dropped; on equal size the
        later one is dropped.
    """
    # flatten: one set of atoms per match, remembering which group it came from
    match_sets = []
    owner = []
    for group_pos, matches in enumerate(tuples):
        for match in matches:
            match_sets.append(set(match))
            owner.append(group_pos)

    # every overlapping pair loses one member
    total = len(match_sets)
    dropped = set()
    for first, left in enumerate(match_sets):
        for second in range(first + 1, total):
            right = match_sets[second]
            if left & right:
                dropped.add(first if len(left) < len(right) else second)

    survivors = Counter(owner[pos] for pos in range(total) if pos not in dropped)
    return [[group_pos, count] for group_pos, count in sorted(survivors.items())]
def search_func_groups(smiles):
    """Find which functional groups occur in a molecule and how many times.

    from https://github.com/curieshicy/JRgui/

    Each SMARTS pattern below is matched against the molecule; overlapping
    matches are resolved by return_non_duplicate_index().

    Returns:
        A list of [pattern_index, count] pairs, where pattern_index is the
        position of the pattern in the SMARTS list.
    """
    smarts = ["[$([CX2H0](=*)=*)]", "[$([CX2H1]#[!#7])]", "[$([CX2H0]#[!#7])]", "[OX2H]-[C]=O", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]",
              "[$([#6X3H0](=[OX1]));!$([#6X3](=[#8X1])~[#8X2]);R]=O", "[CH;D2;$(C-!@C)](=O)", "[OX2H;!$([OX2H]-[#6]=[O]);!$([OX2H]-a)]", "[O;H1;$(O-!@c)]",
              "[#8X2H0;R;!$([#8X2H0]~[#6]=[#8])]", "[$([CX3H0](=[OX1]));!$([CX3](=[OX1])-[OX2]);!R]=O", "[OX2H0;!R;!$([OX2H0]-[#6]=[#8])]",
              "[$([#7X3,#7X3+][!#8])](=[O])~[O-]", "[OX1H0;!$([OX1H0]~[#6X3]);!$([OX1H0]~[#7X3]~[#8])]", "[#7X2H0;R]", "[#7X3H1;R]", "[#7X2H1]",
              "[#7X2H0;!R]","[#6X2]#[#7X1H0]","[NX3H2]", "[NX3H1;!R]", "[#7X3H0;!$([#7](~O)~O)]","[SX2H]","[#16X2H0;!R]","[#16X2H0;R]", "[R;CX3H1,cX3H1]",
              "[$([R;#6X3H0]);!$([R;#6X3H0]=[#8])]","[R;CX4H2]","[R;CX4H]","[R;CX4H0]", "[CX3H2]", "[!R;CX3H1;!$([CX3H1](=O))]",
              "[$([!R;#6X3H0]);!$([!R;#6X3H0]=[#8])]","[CX4H3]","[!R;CX4H2]", "[!R;CX4H]","[!R;CX4H0]","[F]","[Cl]","[Br]", "[I]"]
    m = Chem.MolFromSmiles(str(smiles))
    matched_patterns = []   # positions in `smarts` of the patterns that matched
    atom_matches = []       # for each matched pattern, the tuples of matched atom positions
    for pattern_index, smart in enumerate(smarts):
        query = Chem.MolFromSmarts(smart)
        if m.HasSubstructMatch(query):
            atom_matches.append(m.GetSubstructMatches(query))
            matched_patterns.append(pattern_index)
    # map positions within the matched subset back to positions in `smarts`
    return [[matched_patterns[pos], count] for pos, count in return_non_duplicate_index(atom_matches)]
def compute_phys_properties(smiles):
    """Estimate the melting point of a molecule with the Joback group-contribution method.

    from https://github.com/curieshicy/JRgui/
    method from: K. G. Joback, R. C. Reid, ESTIMATION OF PURE-COMPONENT PROPERTIES
    FROM GROUP-CONTRIBUTIONS. Chemical Engineering Communications 57, 233-243 (1987).
    this doesn't look very accurate, but it's a start

    Parameters:
        smiles: SMILES string of the molecule.

    Returns:
        122.5 plus the sum of the group Tm increments, shifted by -273.15; or
        None when a matched group has no Tm increment in the table (this case
        used to raise a TypeError). An unparsable SMILES makes
        search_func_groups() raise.

    Only the melting point was ever returned, so the other Joback estimates
    that used to be computed here as unused locals (several of which silently
    failed because `math` was never imported) are no longer evaluated.
    """
    ## one row per SMARTS pattern of search_func_groups() (41 rows), 15 columns per row:
    ## 0 Tc, 1 Pc, 2 Vc, 3 Tb, 4 Tm, 5 Hfor, 6 Gf, 7 Cpa, 8 Cpb, 9 Cpc, 10 Cpd, 11 Hfus, 12 Hvap, 13 Ya, 14 Yb
    DB = [[0.0026, 0.0028, 36, 26.15, 17.78, 142.14, 136.70, 2.74E+1, -5.57E-2, 1.01E-4, -5.02E-8, 4.720, 2.661, None, None],
          [0.0027, -0.0008,46, 9.20, -11.18, 79.30, 77.71, 2.45E+1, -2.71E-2, 1.11E-4, -6.78E-8, 2.322, 1.155, None, None],
          [0.0020, 0.0016, 37, 27.38, 64.32, 115.51, 109.82, 7.87, 2.01E-2, -8.33E-6, 1.39E-9, 4.151, 3.302, None, None],
          [0.0791, 0.0077, 89, 169.09, 155.50, -426.72,-387.87,2.41E+1, 4.27E-2, 8.04E-5, -6.87E-8, 11.051, 19.537, 1317.23,-2.578],
          [0.0481, 0.0005, 82, 81.10, 53.60, -337.92,-301.95,2.45E+1, 4.02E-2, 4.02E-5, -4.52E-8, 6.959, 9.633, 483.88, -0.966],
          [0.0284, 0.0028, 55, 94.97, 75.97, -164.50,-126.27,3.04E+1, -8.29E-2, 2.36E-4, -1.31E-7, None, 6.645, None, None],
          [0.0379, 0.0030, 82, 72.24, 36.90, -162.03,-143.48,3.09E+1, -3.36E-2, 1.60E-4, -9.88E-8, 3.197, 9.093, 740.92, -1.713],
          [0.0741, 0.0112, 28, 92.88, 44.45, -208.04,-189.20,2.57E+1, -6.91E-2, 1.77E-4, -9.88E-8, 2.406, 16.826, 2173.72,-5.057],
          [0.0240, 0.0184, -25, 76.34, 82.83, -221.65,-197.37,-2.81, 1.11E-1, -1.16E-4, 4.94E-8, 4.490, 12.499, 3018.17,-7.314],
          [0.0098, 0.0048, 13, 31.22, 23.05, -138.16,-98.22, 1.22E+1, -1.26E-2, 6.03E-5, -3.86E-8, 5.879, 4.682, 440.24, -0.953],
          [0.0380, 0.0031, 62, 76.75, 61.20, -133.22,-120.50,6.45, 6.70E-2, -3.57E-5, 2.86E-9, 4.189, 8.972, 340.35, -0.350],
          [0.0168, 0.0015, 18, 22.42, 22.23, -132.22,-105.00,2.55E+1, -6.32E-2, 1.11E-4, -5.48E-8, 1.188, 2.410, 122.09, -0.386],
          [0.0437, 0.0064, 91, 152.54, 127.24, -66.57, -16.83, 2.59E+1, -3.74E-3, 1.29E-4, -8.88E-8, 9.679, 16.738, None, None],
          [0.0143, 0.0101, 36, -10.50, 2.08, -247.61,-250.83,6.82, 1.96E-2, 1.27E-5, -1.78E-8, 3.624, 5.909, 675.24, -1.340],
          [0.0085, 0.0076, 34, 57.55, 68.40, 55.52, 79.93, 8.83, -3.84E-3, 4.35E-5, -2.60E-8, 3.649, 6.528, None, None],
          [0.0130, 0.0114, 29, 52.82, 101.51, 31.65,75.61,1.18E+1, -2.30E-2, 1.07E-4, -6.28E-8, 7.490, 6.930, None, None],
          [None, None, None, 83.08, 68.91, 93.70, 119.66, 5.69, -4.12E-3, 1.28E-4, -8.88E-8, None, 12.169, None, None],
          [0.0255, -0.0099,None, 74.60, None, 23.61, None, None, None, None, None, None, 3.335, None, None],
          [0.0496, -0.0101,91, 125.66, 59.89, 88.43, 89.22, 3.65E+1, -7.33E-2, 1.84E-4, -1.03E-7, 2.414, 12.851, None, None],
          [0.0243, 0.0109, 38, 73.23, 66.89, -2.02, 14.07,2.69E+1, -4.12E-2, 1.64E-4, -9.76E-8, 3.515, 10.788, None, None],
          [0.0295, 0.0077, 35, 50.17, 52.66, 53.47, 89.39,-1.21, 7.62E-2, -4.86E-5, 1.05E-8, 5.009, 6.436, None, None],
          [0.0169, 0.0074, 9, 11.74, 48.84, 123.34, 163.16,-3.11E+1, 2.27E-1, -3.20E-4, 1.46E-7, 4.703, 1.896, None, None],
          [0.0031, 0.0084, 63, 63.56, 20.09, -17.33, -22.99, 3.53E+1, -7.58E-2, 1.85E-4, -1.03E-7, 2.360, 6.884, None, None],
          [0.0119, 0.0049, 54, 68.78, 34.40, 41.87, 33.12, 1.96E+1, -5.61E-3, 4.02E-5, -2.76E-8, 4.130, 6.817, None, None],
          [0.0019, 0.0051, 38, 52.10, 79.93, 39.10, 27.76, 1.67E+1, 4.81E-3, 2.77E-5, -2.11E-8, 1.557, 5.984, None, None],
          [0.0082, 0.0011, 41, 26.73, 8.13, 2.09, 11.30, -2.14, 5.74E-2, -1.64E-6, -1.59E-8, 1.101, 2.544, 259.65, -0.702],
          [0.0143, 0.0008, 32, 31.01, 37.02, 46.43, 54.05, -8.25, 1.01E-1, -1.42E-4, 6.78E-8, 2.394, 3.059, -245.74,0.912],
          [0.0100, 0.0025, 48, 27.15, 7.75, -26.80, -3.68, -6.03, 8.54E-2, -8.00E-6, -1.80E-8, 0.490, 2.398, 307.53, -0.798],
          [0.0122, 0.0004, 38, 21.78, 19.88, 8.67, 40.99, -2.05E+1, 1.62E-1, -1.60E-4, 6.24E-8, 3.243, 1.942, -394.29,1.251],
          [0.0042, 0.0061, 27, 21.32, 60.15, 79.72, 87.88, -9.09E+1, 5.57E-1, -9.00E-4, 4.69E-7, -1.373, 0.644, None, None],
          [0.0113, -0.0028,56, 18.18, -4.32, -9.630, 3.77, 2.36E+1, -3.81E-2, 1.72E-4, -1.03E-7, -0.473, 1.724, 495.01, -1.539],
          [0.0129, -0.0006,46, 24.96, 8.73, 37.97, 48.53, -8.00, 1.05E-1, -9.63E-5, 3.56E-8, 2.691, 2.205, 82.28, -0.242],
          [0.0117, 0.0011, 38, 24.14, 11.14, 83.99, 92.36, -2.81E+1, 2.08E-1, -3.06E-4, 1.46E-7, 3.063, 2.138, None, None],
          [0.0141, -0.0012,65, 23.58, -5.10, -76.45, -43.96, 1.95E+1, -8.08E-3, 1.53E-4, -9.67E-8, 0.908, 2.373, 548.29, -1.719],
          [0.0189, 0.0000, 56, 22.88, 11.27, -20.64, 8.42, -9.09E-1, 9.50E-2, -5.44E-5, 1.19E-8, 2.590, 2.226, 94.16, -0.199],
          [0.0164, 0.0020, 41, 21.74, 12.64, 29.89, 58.36, -2.30E+1, 2.04E-1, -2.65E-4, 1.20E-7, 0.749, 1.691, -322.15,1.187],
          [0.0067, 0.0043, 27, 18.25, 46.43, 82.23, 116.02, -6.62E+1, 4.27E-1, -6.41E-4, 3.01E-7, -1.460, 0.636, -573.56,2.307],
          [0.0111, -0.0057,27, -0.03, -15.78, -251.92,-247.19,2.65E+1, -9.13E-2, 1.91E-4, -1.03E-7, 1.398, -0.670, None, None],
          [0.0105, -0.0049,58, 38.13, 13.55, -71.55,-64.31, 3.33E+1, -9.63E-2, 1.87E-4, -9.96E-8, 2.515, 4.532, 625.45, -1.814],
          [0.0133, 0.0057, 71, 66.86, 43.43, -29.48, -38.06, 2.86E+1, -6.49E-2, 1.36E-4, -7.45E-8, 3.603, 6.582, 738.91, -2.038],
          [0.0068, -0.0034,97, 93.84, 41.69, 21.06, 5.74, 3.21E+1, -6.41E-2, 1.26E-4, -6.87E-8, 2.724, 9.520, 809.55, -2.224]]
    tm_column = 4

    tm_contributions = []
    for group_index, count in search_func_groups(smiles):
        increment = DB[group_index][tm_column]
        if increment is None:
            # no Tm increment tabulated for this group: the estimate is undefined
            return None
        tm_contributions.append(count * increment)
    return 122.5 + sum(tm_contributions) - 273.15
def string2mp(name, namespace='name'):
    """Look up a melting point for a chemical identifier.

    Sources are tried in order and the first one that yields a value wins:

    1. CAS numbers only (``is_cas(name)``): the ``chemicals`` package, then
       the ``dfmp_expt`` table, then the ``df_pred`` OPERA prediction table.
    2. The "Melting Point" entries of the compound's PubChem PUG-View record.
    3. Only when ``try_dsstox`` is set: the CompTox dashboard, rendered with
       a headless Chrome driver.

    Values from ``chemicals`` are converted from kelvin and PubChem strings
    are parsed as degrees C (degrees F are converted), so the result is meant
    to be in degrees Celsius. Table and dashboard values are passed through
    unconverted -- presumably already in degrees C; confirm against the data.

    Args:
        name: Identifier to look up (chemical name, CAS number, ...).
        namespace: PubChemPy namespace used to resolve ``name``.

    Returns:
        ``(mp, mp_origin)`` where ``mp_origin`` is one of ``'chem/<method>'``,
        ``'expt'``, ``'comptox/pred'``, ``'pubchem'``, ``'dsstox/expt'`` or
        ``'dsstox/pred'``; ``(None, None)`` when no source has a value.
    """
    mp, mp_origin = None, None
    if is_cas(name):
        mp, mp_origin = _mp_from_cas_sources(name)
    if pd.isna(mp):
        mp, mp_origin = _mp_from_pubchem(name, namespace)
    if try_dsstox and pd.isna(mp):
        mp, mp_origin = _mp_from_dsstox(name)
    # Never report an origin without a value (and never return NaN).
    if pd.isna(mp):
        mp, mp_origin = None, None
    return mp, mp_origin


# Seconds allowed for each remote request / page load made by string2mp.
_MP_HTTP_TIMEOUT = 15

# "<number>[-<number>] °C" / "... °F" at the start of a PubChem value string.
_MP_CELSIUS_RE = re.compile(r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?C)')
_MP_FAHRENHEIT_RE = re.compile(r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?F)')


def _mp_from_cas_sources(cas):
    """Return ``(mp, origin)`` for a CAS number from the offline sources."""
    # chemicals package (reports kelvin)
    mp = chemicals.Tm(cas)
    if mp:
        mp = mp - 273.15
    methods = chemicals.Tm_methods(cas)
    origin = 'chem/' + methods[0] if methods else None
    if not pd.isna(mp):
        return mp, origin

    # Experimental values. Some rows hold multiple comma-separated CAS
    # numbers, so a literal substring pre-filter is followed by an exact match
    # on the split field. Rows with a missing CAS are excluded (na=False)
    # instead of poisoning the boolean mask. As before, the last matching row
    # wins.
    mp, origin = None, None
    mask = dfmp_expt['CAS'].str.contains(cas, regex=False, na=False)
    matches = dfmp_expt[mask]
    for cas_field, value in zip(matches['CAS'], matches['MP']):
        if cas in cas_field.split(', '):
            mp, origin = float(value), 'expt'
    if not pd.isna(mp):
        return mp, origin

    # (A lookup in the Mansouri prediction table, dfmp_pred, used to sit here
    # and is currently disabled.)

    # Predicted values from OPERA (CompTox dashboard export).
    mask = df_pred['CASRN'] == cas
    if mask.any():
        value = df_pred.loc[mask, 'MELTING_POINT_DEGC_OPERA_PRED'].iloc[0]
        return float(value), 'comptox/pred'
    return None, None


def _mp_pubchem_sections(node, heading):
    """Return the child sections of a PUG-View node titled ``heading``."""
    return [s for s in node.get('Section', []) if s.get('TOCHeading') == heading]


def _mp_pubchem_strings(content):
    """Yield the raw "Melting Point" value strings of a PUG-View record."""
    record = content.get('Record', {})
    for props in _mp_pubchem_sections(record, 'Chemical and Physical Properties'):
        for expt in _mp_pubchem_sections(props, 'Experimental Properties'):
            for section in _mp_pubchem_sections(expt, 'Melting Point'):
                for info in section.get('Information', []):
                    try:
                        text = info['Value']['StringWithMarkup'][0]['String']
                    except (KeyError, IndexError, TypeError):
                        # entry without a plain string value; skip it
                        continue
                    if isinstance(text, str):
                        yield text


def _mp_parse_string(text):
    """Parse one PubChem value string into a list of temperatures in deg C."""
    values = []
    m = _MP_CELSIUS_RE.match(text)
    if m is not None:
        # groups 1 and 2 are the bounds of an optional range; 3 is the unit
        values.extend(float(g) for g in m.group(1, 2) if g is not None)
    m = _MP_FAHRENHEIT_RE.match(text)
    if m is not None:
        # (T/F - 32) x 5/9 = T/C
        values.extend((float(g) - 32) * 5 / 9 for g in m.group(1, 2) if g is not None)
    return values


def _mp_from_pubchem(name, namespace):
    """Return ``(mp, origin)`` scraped from the compound's PubChem record."""
    try:
        compounds = pcp.get_compounds(name, namespace=namespace)
        cid = compounds[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        webpage = requests.get(url, timeout=_MP_HTTP_TIMEOUT).text
        content = json.loads(webpage)
    except Exception:
        # best effort: unknown compound, network failure, malformed reply
        return None, None
    if not content or not isinstance(content, dict):
        return None, None

    mp_list = []
    for mp_string in _mp_pubchem_strings(content):
        mp_list.extend(_mp_parse_string(mp_string))
    if not mp_list:
        return None, None

    ## remove outliers using interquartile range (IQR)
    mp_list = np.array(mp_list)
    q75, q25 = np.percentile(mp_list, [75, 25])
    intr_qr = q75 - q25
    hi = q75 + (1.5 * intr_qr)
    lo = q25 - (1.5 * intr_qr)
    mp_list = mp_list[(mp_list <= hi) & (mp_list >= lo)]
    return np.mean(mp_list), 'pubchem'


def _mp_find_dtxsid(name):
    """Return the first DTXSID the CompTox synonym search lists, or None."""
    try:
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url, timeout=_MP_HTTP_TIMEOUT) as fid:
            webpage = fid.read().decode('utf-8')
    except Exception:
        return None
    hit = re.search(r'DTXSID[0-9]+', webpage)
    return hit.group(0) if hit else None


def _mp_fetch_dsstox_soup(dtxsid):
    """Render the CompTox properties page for ``dtxsid``; None on failure."""
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    try:
        options = Options()
        options.add_argument("--headless")  # runs in background instead of showing browser window
        service = Service(driver_exe)
        driver = selenium.webdriver.Chrome(service=service, options=options)
        try:
            driver.set_page_load_timeout(_MP_HTTP_TIMEOUT)
            driver.get(url)
            webpage = driver.page_source
        finally:
            # Shut the browser down even when the page load times out,
            # otherwise every failed lookup leaves a Chrome process behind.
            driver.quit()
        return bs4.BeautifulSoup(webpage, features='lxml')
    except Exception:
        return None


def _mp_dsstox_cell(soup, col_id, row):
    """Return the number in row ``row`` of grid column ``col_id``, or None."""
    cells = soup.find_all('div', attrs={'col-id': col_id})
    if row >= len(cells):
        return None
    # drop the trailing " (<count>)" annotation before parsing
    value = re.sub(r' \([0-9]*\)', '', cells[row].text.strip())
    try:
        return float(value)
    except ValueError:
        return None


def _mp_from_dsstox(name):
    """Return ``(mp, origin)`` scraped from the CompTox dashboard."""
    dtxsid = _mp_find_dtxsid(name)
    if not dtxsid:
        return None, None
    mysoup = _mp_fetch_dsstox_soup(dtxsid)
    if mysoup is None:
        return None, None

    # column of property names
    names = mysoup.find_all('div', attrs={'col-id': 'property'})
    ifound = next((i for i, cell in enumerate(names) if 'Melting Point' in cell.text), None)
    # Compare against None: index 0 is a valid row position.
    if ifound is None:
        return None, None

    # prefer the experimental average, fall back to the predicted average
    for col_id, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
        mp = _mp_dsstox_cell(mysoup, col_id, ifound)
        if not pd.isna(mp):
            return mp, origin
    return None, None
def smiles2mp(smiles):
    """Estimate a melting point from a SMILES string.

    The estimate is delegated to ``compute_phys_properties`` and is only
    attempted when every atom in the molecule is one of C, N, O, F, S, Cl,
    Br or I (atomic numbers 6, 7, 8, 9, 16, 17, 35, 53).

    Args:
        smiles: SMILES string (anything else is passed through ``str``).

    Returns:
        Whatever ``compute_phys_properties(smiles)`` returns -- presumably a
        melting point in degrees C; confirm against that function -- or
        ``None`` when the SMILES cannot be parsed, contains an unsupported
        element, or the calculation fails.
    """
    SUPPORTED_ATOM_SET = {6, 7, 8, 9, 16, 17, 35, 53}
    try:
        m = Chem.MolFromSmiles(str(smiles))
        if m is None:
            # RDKit signals an unparsable SMILES by returning None
            return None
        atom_num_set = {a.GetAtomicNum() for a in m.GetAtoms()}
        if not atom_num_set.issubset(SUPPORTED_ATOM_SET):
            return None
        return compute_phys_properties(smiles)
    except Exception:
        # Best effort: any failure in parsing or in the estimator means
        # "no value". KeyboardInterrupt/SystemExit are deliberately not caught.
        return None
def smiles2mp_opera(smiles):
    """Predict a melting point for ``smiles`` with the ``my_opera_data_mp`` model.

    PaDEL descriptors are computed for the molecule, coerced to floats
    (anything non-numeric or missing becomes 0.0), reduced to the model's
    ``desc_list`` columns, scaled with ``scaler_X``, fed to ``knn_all`` and
    mapped back through ``scaler_y``.

    Returns:
        The single predicted value (element ``[0][0]`` of the un-scaled
        prediction). Units are whatever the model was trained on --
        presumably degrees C; confirm against the training data.
    """
    raw_descriptors = padelpy.from_smiles(smiles, descriptortypes='mp/descriptors.xml')

    # One-row frame; blanks / unparsable descriptor values end up as 0.0.
    descriptor_frame = pd.DataFrame(raw_descriptors, index=[0])
    descriptor_frame = descriptor_frame.apply(pd.to_numeric, errors="coerce")
    descriptor_frame = descriptor_frame.fillna(0.0).astype(float)

    model = my_opera_data_mp
    features = np.array(descriptor_frame[model.desc_list])
    scaled_features = model.scaler_X.transform(features)
    scaled_prediction = model.knn_all.predict(scaled_features)
    prediction = model.scaler_y.inverse_transform(scaled_prediction)
    return prediction[0][0]
def mol2mp(cas, name, smiles):
    """Find a melting point for a molecule, trying each available identifier.

    Sources are tried in order until one yields a value:

    1. ``smiles2mp_opera(smiles)``   -> origin ``'opera/calc'``
    2. ``string2mp(cas)``            -> origin as reported by ``string2mp``
    3. ``string2mp(name)``           -> origin as reported by ``string2mp``
    4. ``smiles2mp(smiles)``         -> origin ``'joback-reid/calc'``

    Every source is best-effort: an exception from one of them just moves on
    to the next.

    Args:
        cas: CAS number, or a falsy value to skip the CAS lookup.
        name: Chemical name, or a falsy value to skip the name lookup.
        smiles: SMILES string, or a falsy value to skip both calculations.

    Returns:
        ``(mp, mp_origin)``; ``(None, None)`` when nothing produced a value.
    """
    mp, mp_origin = None, None

    if smiles:
        try:
            mp, mp_origin = smiles2mp_opera(smiles), 'opera/calc'
        except Exception:
            mp, mp_origin = None, None

    # database / web lookups, CAS number first, then name
    for identifier in (cas, name):
        if pd.isna(mp) and identifier:
            try:
                mp, mp_origin = string2mp(identifier)
            except Exception:
                mp, mp_origin = None, None

    if pd.isna(mp) and smiles:
        try:
            mp, mp_origin = smiles2mp(smiles), 'joback-reid/calc'
        except Exception:
            mp, mp_origin = None, None

    # smiles2mp signals failure by returning None rather than raising, so make
    # sure an origin label is never returned alongside a missing value.
    if pd.isna(mp):
        mp, mp_origin = None, None
    return mp, mp_origin
def getLogP(cas, mol):
    """Return an octanol-water partition coefficient (log P) and its origin.

    The ``df_pred`` table is consulted first when a CAS number is given; if
    that yields nothing (or NaN) and an RDKit molecule is available, the
    Crippen estimate from RDKit is used instead.

    Args:
        cas: CAS number, or a falsy value to skip the table lookup.
        mol: RDKit molecule, or a falsy value to skip the calculation.

    Returns:
        ``(LogP, LogP_origin)`` with ``LogP_origin`` either ``'comptox/pred'``
        or ``'rdkit/calc'``; ``(None, None)`` when neither source was usable.
    """
    LogP, LogP_origin = None, None
    if cas:
        mask = df_pred['CASRN'] == cas
        if mask.any():
            # Take the first matching row explicitly: float() on a Series
            # raises when several rows share the CAS number and is deprecated
            # even for a single-row Series.
            LogP = float(df_pred.loc[mask, 'OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED'].iloc[0])
            LogP_origin = 'comptox/pred'
    if pd.isna(LogP) and mol:
        LogP = Crippen.MolLogP(mol)
        LogP_origin = 'rdkit/calc'
    return LogP, LogP_origin