Spaces:

dmsaylor
/

CHRIS

Running

CHRIS / ChemID.py

Robert Elder

kludge for several specific ceramics

6a4a1e6 about 1 month ago

49.1 kB

	import sys,os,glob,re,string
	from collections import Counter
	from operator import itemgetter
	import nltk
	import pandas as pd
	import numpy as np
	import pubchempy as pcp
	import cirpy
	import chemicals

	import bs4
	import urllib
	import requests
	import json

	# atomic numbers of B, C, N, O, F, P, S, Cl, Br and I
	ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}
	# atomic numbers that CeramicOrMetal treats as metallic elements
	METAL_ATOM_SET = (
	    {3, 4, 11, 12, 13}
	    | set(range(19, 32))
	    | set(range(37, 51))
	    | set(range(55, 85))
	    | set(range(87, 115))
	    | {116}
	)
	# one entry per line; CeramicOrMetal compares these against the
	# comma-joined, sorted element symbols of a molecule
	with open('data/ceramics_list.txt', 'r') as fp:
	    lines = fp.readlines()
	CERAMICS_SET = set(map(str.strip, lines))
	with open('data/salt_list.txt', 'r') as fp:
	    lines = fp.readlines()
	SALT_SET = set(map(str.strip, lines))

	# meaning of the integer `error` value returned by ResolveChemical
	ERROR_CODES = {
	    0: None,
	    1: 'Structure could not be determined from the identifier',
	    2: 'Invalid SMILES code',
	    3: 'Invalid CAS number',
	    4: 'Invalid identifier type selected',
	}

	## not sure if this will be possible on pythonanywhere; use this flag to disable related code blocks
	try_dsstox = True
	if try_dsstox:
	    import platform
	    import selenium
	    import selenium.webdriver
	    from selenium.webdriver.chrome.options import Options
	    from selenium.webdriver.chrome.service import Service
	    # platform.system() is used instead of os.uname(), which is not
	    # available on every platform and would raise before the fallback
	    # branch could be reached.
	    if platform.system() == 'Linux':
	        # pythonanywhere
	        driver_exe = '/usr/local/bin/chromedriver'
	    else:
	        # mac, and fallback for anything else: use the driver shipped
	        # with the chromedriver_binary package
	        import chromedriver_binary
	        driver_exe = chromedriver_binary.chromedriver_filename

	from io import BytesIO
	from PIL import ImageOps
	import base64

	import rdkit
	from rdkit.Chem import AllChem as Chem
	from rdkit.Chem import Descriptors,Draw,Crippen

	## register custom chemical definitions (i.e., to correct confusion between methane and carbon)
	db = chemicals.identifiers.get_pubchem_db()
	db.load('data/custom_chemicals_db.tsv')
	## tab-separated tables of experimental melting points and predicted
	## properties (these replaced earlier .xlsx versions of the same data)
	dfmp_expt, df_pred = (
	    pd.read_csv(path, sep='\t')
	    for path in ('data/PHYSPROP_MP_data.tsv', 'data/Comptox_pred_data.tsv')
	)

	## OPERA melting point model
	import dill as pickle
	import sklearn
	import sklearn.neighbors
	import sklearn.metrics
	import padelpy
	from functions import weight_func
	# Holder for the OPERA melting-point model: a pickled k-nearest-neighbours
	# estimator plus the scalers for its inputs (X) and outputs (y).
	# NOTE(review): indentation is lost in this copy of the file; the statements
	# below presumably form the class body, which would make the loaded objects
	# class attributes of opera_data_mp -- confirm against the original layout.
	# SECURITY: pickle.load executes code embedded in the file it reads, so the
	# files under mp/ must only ever come from a trusted source.
	class opera_data_mp():
	n_neighbors = 5
	#weight_factor = 5e-3
	# names of the descriptors the model consumes (presumably PaDEL descriptor
	# names, given the padelpy import -- confirm)
	desc_list = ['SHBd', 'nN', 'maxHBd', 'ATSC1v', 'AATS1i', 'TopoPSA', 'nT6Ring', 'nHBDon', 'WTPT-5', 'minHBd', 'nHBint2', 'IC0', 'MLFER_S', 'MLFER_BO', 'WTPT-3']
	with open('mp/model-opera-knn.pkl', 'rb') as fp:
	knn_all = pickle.load(fp)
	# the weighting callable is re-attached after unpickling
	knn_all.weights = weight_func # fix weird problem on pythonanywhere...
	with open('mp/model-opera-scalerX.pkl', 'rb') as fp:
	scaler_X = pickle.load(fp)
	with open('mp/model-opera-scalerY.pkl', 'rb') as fp:
	scaler_y = pickle.load(fp)
	# module-level instance of the holder class
	my_opera_data_mp = opera_data_mp()

	def _smiles_to_descriptors(smiles, cas, want_logp):
	    """Compute the structure-derived values used by ResolveChemical.

	    Parameters:
	        smiles: SMILES string to parse with RDKit.
	        cas: CAS number (or None); forwarded to getLogP.
	        want_logp: when true, LogP is obtained from getLogP(cas, mol).

	    Returns:
	        (Mw, LogP, LogP_origin, im64, error). error is 2 when no molecule
	        could be built from the SMILES and 0 otherwise.
	    """
	    Mw = LogP = LogP_origin = im64 = None
	    try:
	        mol = Chem.MolFromSmiles(smiles)
	    except Exception:
	        # without this assignment `mol` would be unbound below
	        mol = None
	    if not mol:
	        return Mw, LogP, LogP_origin, im64, 2
	    Mw = Descriptors.MolWt(mol)
	    if want_logp:
	        LogP, LogP_origin = getLogP(cas, mol)
	    im = ImageFromSmiles(smiles)
	    # ImageFromSmiles returns None when drawing fails; Imageto64 cannot
	    # handle that, so the image is simply left out in that case
	    if im is not None:
	        im64 = Imageto64(im)
	    return Mw, LogP, LogP_origin, im64, 0

	def ResolveChemical(chemName, IDtype, debug=False, get_properties=('logp', 'rho', 'mp')):
	    """Resolve a chemical identifier into names, structure and properties.

	    Parameters:
	        chemName: the identifier text; surrounding whitespace is ignored.
	        IDtype: 'CAS', 'SMILES' or 'common' (a chemical name).
	        debug: when true, the origin of each property is returned as well.
	        get_properties: which of 'logp', 'rho' and 'mp' to look up.

	    Returns:
	        debug=False: (name, cas, smiles, Mw, LogP, rho, mp, im64, error)
	        debug=True:  (name, cas, smiles, Mw, LogP, LogP_origin, rho,
	                      rho_origin, mp, mp_origin, im64, error)
	        error is a key of ERROR_CODES (0 means success). On success a
	        missing name or CAS number is reported as 'Not found'.
	    """
	    # remove excess whitespace
	    chemName = chemName.strip()

	    name = smiles = cas = None
	    Mw = LogP = rho = mp = im64 = None
	    LogP_origin = rho_origin = mp_origin = None
	    error = 0

	    if IDtype == 'CAS':
	        cas = chemName
	        if not is_cas(cas):
	            # invalid cas; falls through to the common return so that the
	            # tuple shape honours `debug` like every other outcome
	            error = 3
	        else:
	            smiles = cas2smiles(cas)
	            if smiles:
	                name = cas2name(cas)
	                if 'rho' in get_properties:
	                    rho, rho_origin = string2density(cas)
	                if 'mp' in get_properties:
	                    mp, mp_origin = mol2mp(cas, name, smiles)
	                if 'rho' in get_properties and pd.isna(rho) and name:
	                    rho, rho_origin = string2density(name)
	                Mw, LogP, LogP_origin, im64, error = _smiles_to_descriptors(
	                    smiles, cas, 'logp' in get_properties)
	            else:
	                error = 1 # no smiles found
	    elif IDtype == 'SMILES':
	        smiles = chemName
	        # cas is still unknown here, so getLogP receives None for it
	        Mw, LogP, LogP_origin, im64, error = _smiles_to_descriptors(
	            smiles, cas, 'logp' in get_properties)

	        # if SMILES is not valid, skip the other stuff
	        if not error:
	            name = smiles2name(smiles)
	            if name:
	                cas = name2cas(name)
	                if 'rho' in get_properties:
	                    rho, rho_origin = string2density(name)
	            if 'rho' in get_properties and pd.isna(rho) and cas:
	                rho, rho_origin = string2density(cas)

	            if 'mp' in get_properties:
	                mp, mp_origin = mol2mp(cas, name, smiles)
	    elif IDtype == 'common':
	        name, _ = name2iupac(chemName)
	        smiles = name2smiles(name)
	        cas = name2cas(name)
	        if not smiles:
	            smiles = cas2smiles(cas)

	        if 'rho' in get_properties:
	            if cas:
	                rho, rho_origin = string2density(cas)
	            if pd.isna(rho):
	                rho, rho_origin = string2density(name)
	            if pd.isna(rho):
	                # try this because sometimes iupac names don't work
	                rho, rho_origin = string2density(chemName)

	        if smiles:
	            if 'mp' in get_properties:
	                mp, mp_origin = mol2mp(cas, name, smiles)
	            Mw, LogP, LogP_origin, im64, error = _smiles_to_descriptors(
	                smiles, cas, 'logp' in get_properties)
	        else:
	            error = 1
	    else:
	        ## should never be here; nothing has been resolved at this point
	        error = 4 # invalid IDtype selection, probably not possible

	    # if we couldn't find a name or CAS (but do have SMILES)
	    if not error:
	        if not name:
	            name = 'Not found'
	        if not cas:
	            cas = 'Not found'

	    if mp is not None:
	        mp = float(mp)

	    if debug:
	        return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp, mp_origin, im64, error)
	    return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)

	def CeramicOrMetal(smiles,mp):
	    """Classify a structure as metal and/or ceramic.

	    Parameters:
	        smiles: SMILES string of the substance.
	        mp: melting point used by the fallback rule, or None.

	    Returns:
	        (is_metal, is_ceramic). A structure whose atoms are all in
	        METAL_ATOM_SET is a metal and is never flagged as a ceramic.
	    """
	    mol = Chem.MolFromSmiles(smiles)
	    atomic_numbers = {atom.GetAtomicNum() for atom in mol.GetAtoms()}
	    is_metal = atomic_numbers.issubset(METAL_ATOM_SET)
	    if is_metal:
	        return is_metal, False

	    # check composition against list of ceramics/salts
	    symbols = sorted({atom.GetSymbol() for atom in mol.GetAtoms()})
	    composition = ','.join(symbols)
	    if composition in CERAMICS_SET or composition in SALT_SET:
	        return is_metal, True

	    # if not a metal, no C-C bonds, and mp > 700 (sodium chloride has mp ~ 800), assume ceramic...
	    has_cc_bond = any(
	        bond.GetBeginAtom().GetAtomicNum() == 6 and bond.GetEndAtom().GetAtomicNum() == 6
	        for bond in mol.GetBonds()
	    )
	    if not has_cc_bond and (mp is not None) and mp > 700.:
	        return is_metal, True
	    return is_metal, False

	#Generates an image of the molecule represented by the SMILES code given.
	#Returns None if the image cannot be generated. From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py

	def ImageFromSmiles(smiles):
	    """Draw a 350x350 image of the molecule described by a SMILES string.

	    Returns None when `smiles` is not a str or when drawing raises
	    ValueError.
	    """
	    if type(smiles) is not str:
	        return None
	    # this one structure is read from a stored SDF file instead of being
	    # built from its SMILES (presumably manganese phthalocyanine, going
	    # by the file name -- confirm)
	    sdf_backed_smiles = 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]'
	    try:
	        if smiles == sdf_backed_smiles:
	            mol = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
	        else:
	            mol = Chem.MolFromSmiles(smiles)
	        return Draw.MolToImage(mol, size=(350, 350))
	    except ValueError:
	        return None

	def wTrim(img):
	    """Crop a PIL image to the bounding box of its non-white content.

	    The image is inverted first so that the white background becomes
	    zero, which is what getbbox() treats as empty. No minimum size is
	    enforced on the result.

	    Parameters:
	        img: PIL image to trim.

	    Returns:
	        The cropped image, or `img` unchanged when it contains nothing
	        but white (getbbox() returns None in that case).
	    """
	    bbox = ImageOps.invert(img).getbbox()
	    if bbox is None:
	        # entirely white image: there is nothing to trim
	        return img
	    return img.crop(bbox)

	#Converts a PIL image into its base64 representation.
	def Imageto64(img):
	    """Return a trimmed PIL image as a base64 PNG data URI string."""
	    buffer = BytesIO()
	    # trim first, then serialise the trimmed image as PNG in memory
	    wTrim(img).save(buffer, format="PNG")
	    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
	    return "data:image/png;base64," + encoded

	# function to convert SMILES to name
	def smiles2name(smiles):
	    """Look up a chemical name for a SMILES string.

	    Sources are tried in order until one yields a name: the chemicals
	    package, PubChem compounds, cirpy, and finally PubChem substances.
	    Lookup failures are swallowed (best effort).

	    Returns:
	        The name as a string, or None when no source produced one.
	    """
	    name = None
	    # first try chemicals package
	    try:
	        cm = chemicals.search_chemical(smiles)
	        name = cm.iupac_name or cm.common_name or None
	    except Exception:
	        name = None
	    # then try pubchem for compounds
	    if not name:
	        try:
	            c = pcp.get_compounds(smiles, namespace='smiles')[0]
	            # have seen empty iupac_name before, try synonyms if this happens
	            name = c.iupac_name or c.synonyms[0]
	        except Exception:
	            name = None
	    # next try cirpy
	    if not name:
	        try:
	            name = cirpy.resolve(smiles, 'iupac_name')
	        except Exception:
	            name = None
	        if isinstance(name, list):
	            name = name[0] if name else None
	    # finally try it as a pubchem substance
	    if not name:
	        try:
	            substances = pcp.get_substances(smiles, namespace='smiles')
	            # sometimes there are multiple substances, and multiple synonyms
	            # per substance; CAS-number synonyms are not names, so skip them
	            # NOTE(review): the PubChem substance endpoint may not accept
	            # SMILES input at all -- confirm this fallback can ever succeed
	            allsyns = [syn for sub in substances for syn in sub.synonyms if not is_cas(syn)]
	            # choose the most common synonym
	            name = nltk.FreqDist(allsyns).most_common(1)[0][0]
	        except Exception:
	            name = None
	    return name

	# function to convert CAS to SMILES
	def cas2smiles(cas):
	    """Look up a SMILES string for a CAS number.

	    Sources are tried in order until one yields a result: the chemicals
	    package, PubChem compounds, cirpy, and finally PubChem substances.
	    Lookup failures are swallowed (best effort).

	    Returns:
	        The SMILES string, or None when no source produced one.
	    """
	    smiles = None
	    # first try chemicals package
	    try:
	        smiles = chemicals.search_chemical(cas).smiles
	    except Exception:
	        smiles = None
	    # then try pubchem for compounds
	    if not smiles:
	        try:
	            smiles = pcp.get_compounds(cas, namespace='name')[0].isomeric_smiles
	        except Exception:
	            smiles = None
	    # next try cirpy
	    if not smiles:
	        try:
	            smiles = cirpy.resolve(cas, 'smiles')
	        except Exception:
	            smiles = None
	        if isinstance(smiles, list):
	            smiles = smiles[0] if smiles else None
	    # finally try it as a pubchem substance
	    if not smiles:
	        try:
	            substances = pcp.get_substances(cas, namespace='name')
	            # a substance record carries no SMILES of its own; take the
	            # compound that most of the matching substances standardize to
	            cids = [sub.standardized_cid for sub in substances if sub.standardized_cid]
	            best_cid = nltk.FreqDist(cids).most_common(1)[0][0]
	            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
	        except Exception:
	            smiles = None
	    return smiles

	# function to convert cas to name
	def cas2name(cas):
	    """Look up a chemical name for a CAS number.

	    Sources are tried in order until one yields a name: the chemicals
	    package, cirpy, then PubChem compounds. Lookup failures are swallowed
	    (best effort). The CAS number itself is not validated here.

	    Returns:
	        The name as a string, or None when no source produced one.
	    """
	    name = None
	    # first try chemicals package
	    try:
	        cm = chemicals.search_chemical(cas)
	        name = cm.iupac_name or cm.common_name or None
	    except Exception:
	        name = None
	    # then try cirpy
	    if not name:
	        try:
	            name = cirpy.resolve(cas, 'iupac_name')
	        except Exception:
	            name = None
	        if isinstance(name, list):
	            name = name[0] if name else None
	    # next try pubchem for compounds
	    if not name:
	        try:
	            c = pcp.get_compounds(cas, namespace='name')[0]
	            # have seen empty iupac_name before, try synonyms if this happens
	            name = c.iupac_name or c.synonyms[0]
	        except Exception:
	            name = None
	    return name

	# function to convert chemical name to iupac name
	def name2iupac(string):
	    """Convert a chemical name to an IUPAC (or other preferred) name.

	    Sources are tried in order until one yields a name: the chemicals
	    package, PubChem compounds, cirpy and PubChem substances. If all of
	    them fail, PubChem compounds and cirpy are tried once more with all
	    spaces removed from the input. Lookup failures are swallowed.

	    Returns:
	        (name, origin). origin is 'chemicals', 'PubChem', 'CIRPY' or
	        'PubChem/substance', and is None whenever no name was found.
	    """
	    def pubchem_compound_name(query):
	        # name of the first PubChem compound hit
	        c = pcp.get_compounds(query, namespace='name')[0]
	        # have seen empty iupac_name before, try synonyms if this happens
	        return c.iupac_name or c.synonyms[0]

	    def cirpy_name(query):
	        # cirpy may return a single name or a list of names
	        result = cirpy.resolve(query, 'iupac_name')
	        if isinstance(result, list):
	            result = result[0] if result else None
	        return result

	    name, origin = None, None
	    # try chemicals package
	    try:
	        cm = chemicals.search_chemical(string)
	        name = cm.iupac_name or cm.common_name or None
	        origin = 'chemicals'
	    except Exception:
	        name, origin = None, None
	    # try pubchem for compounds
	    if not name:
	        try:
	            name = pubchem_compound_name(string)
	            origin = 'PubChem'
	        except Exception:
	            name, origin = None, None
	    # next try cirpy
	    if not name:
	        try:
	            name = cirpy_name(string)
	            if name: origin = 'CIRPY'
	        except Exception:
	            name, origin = None, None
	    # now try it as a pubchem substance
	    if not name:
	        try:
	            substances = pcp.get_substances(string, namespace='name')
	            # sometimes there are multiple substances, and multiple synonyms
	            # per substance; CAS-number synonyms are not names, so skip them
	            allsyns = [syn for sub in substances for syn in sub.synonyms if not is_cas(syn)]
	            # choose the most common synonym
	            name = nltk.FreqDist(allsyns).most_common(1)[0][0]
	            origin = 'PubChem/substance'
	        except Exception:
	            name, origin = None, None
	    # strip all spaces and try again...
	    if not name:
	        string_strip = re.sub(' ','',string)
	        # first try pubchem for compounds
	        try:
	            name = pubchem_compound_name(string_strip)
	            origin = 'PubChem'
	        except Exception:
	            name, origin = None, None
	        # next try cirpy, with the space-stripped string as well
	        if not name:
	            try:
	                name = cirpy_name(string_strip)
	                if name: origin = 'CIRPY'
	            except Exception:
	                name, origin = None, None
	    if not name:
	        # never report an origin without a name
	        origin = None
	    return name, origin

	# function to convert name to cas
	def name2cas(name):
	    """Look up a CAS number for a chemical name.

	    Sources are tried in order until one yields a result: the chemicals
	    package, cirpy, then the synonyms of the first PubChem compound hit.
	    When a source offers several CAS numbers, the one with the smallest
	    sum of its three hyphen-separated numbers is chosen. Lookup failures
	    are swallowed (best effort).

	    Returns:
	        The CAS number as a string, or None when no source produced one.
	    """
	    def field_sum(cas_string):
	        # sort key: sum of the hyphen-separated integer fields
	        return sum(int(field) for field in cas_string.split('-'))

	    cas = None
	    # try chemicals package
	    try:
	        cas = chemicals.search_chemical(name).CASs
	    except Exception:
	        cas = None
	    # then try cirpy
	    if not cas:
	        try:
	            cas = cirpy.resolve(name, 'cas')
	            if isinstance(cas, list):
	                # min() keeps the first of equally ranked candidates
	                cas = min(cas, key=field_sum) if cas else None
	        except Exception:
	            cas = None
	    # next try pubchem for compounds
	    if not cas:
	        try:
	            c = pcp.get_compounds(name, namespace='name')[0]
	            possible_cas = [syn for syn in c.synonyms if is_cas(syn)]
	            cas = min(possible_cas, key=field_sum)
	        except Exception:
	            cas = None
	    return cas

	# function to convert name to SMILES
	def name2smiles(name):
	    """Look up a SMILES string for a chemical name.

	    Sources are tried in order until one yields a result: the chemicals
	    package, PubChem compounds, cirpy, and finally PubChem substances.
	    Lookup failures are swallowed (best effort).

	    Returns:
	        The SMILES string, or None when no source produced one.
	    """
	    smiles = None
	    # first try chemicals package
	    try:
	        smiles = chemicals.search_chemical(name).smiles
	    except Exception:
	        smiles = None
	    # then try pubchem for compounds
	    if not smiles:
	        try:
	            smiles = pcp.get_compounds(name, namespace='name')[0].isomeric_smiles
	        except Exception:
	            smiles = None
	    # next try cirpy
	    if not smiles:
	        try:
	            smiles = cirpy.resolve(name, 'smiles')
	        except Exception:
	            smiles = None
	        if isinstance(smiles, list):
	            smiles = smiles[0] if smiles else None
	    # then try it as a pubchem substance
	    if not smiles:
	        try:
	            substances = pcp.get_substances(name, namespace='name')
	            # a substance record carries no SMILES of its own; take the
	            # compound that most of the matching substances standardize to
	            cids = [sub.standardized_cid for sub in substances if sub.standardized_cid]
	            best_cid = nltk.FreqDist(cids).most_common(1)[0][0]
	            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
	        except Exception:
	            smiles = None
	    # NOTE: a further fallback that resolved the name with a local OPSIN
	    # jar (run through java) used to follow here and is disabled.
	    return smiles

	def check_cas(cas):
	    """Return True when `cas` is a well-formed CAS number with a valid check digit.

	    The expected shape is 2-7 digits, 2 digits and 1 check digit, separated
	    by hyphens. The check digit equals the sum of the preceding digits,
	    each weighted by its position counted from the right, modulo 10.
	    Anything that does not have this shape (including non-strings)
	    yields False.
	    """
	    if not isinstance(cas, str):
	        return False
	    # [0-9] rather than \d so that only ASCII digits are accepted
	    if re.fullmatch(r'[0-9]{2,7}-[0-9]{2}-[0-9]', cas) is None:
	        return False
	    n1, n2, n3 = cas.split('-')
	    # combine and flip first 2 numbers
	    digits = (n1 + n2)[::-1]
	    # sum of number*position in string
	    checksum = sum(position*int(digit) for position, digit in enumerate(digits, start=1))
	    # mod 10; if these match, then it's a legit cas number
	    return checksum % 10 == int(n3)

	def is_cas(cas):
	    """Return the result of check_cas(cas), or False if that check raises.

	    Only ordinary exceptions are converted to False; KeyboardInterrupt
	    and SystemExit are allowed to propagate.
	    """
	    try:
	        return check_cas(cas)
	    except Exception:
	        return False

	# a leading number, optionally followed by "-<number>" for a range such as "1.02-1.05"
	_DENSITY_VALUE_RE = re.compile(r'((?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?')

	def _density_iqr_mean(values):
	    """Mean of `values` after dropping outliers beyond 1.5 IQR of the quartiles."""
	    data = np.array(values)
	    q75, q25 = np.percentile(data, [75, 25])
	    intr_qr = q75 - q25
	    keep = (data <= q75 + 1.5*intr_qr) & (data >= q25 - 1.5*intr_qr)
	    return np.mean(data[keep])

	def _density_strings_from_pubchem(content):
	    """Yield the raw strings listed under Experimental Properties / Density."""
	    if not isinstance(content, dict):
	        return
	    for top in content.get('Record', {}).get('Section', []):
	        if top.get('TOCHeading') != 'Chemical and Physical Properties':
	            continue
	        for middle in top.get('Section', []):
	            if middle.get('TOCHeading') != 'Experimental Properties':
	                continue
	            for leaf in middle.get('Section', []):
	                if leaf.get('TOCHeading') != 'Density':
	                    continue
	                for info in leaf.get('Information', []):
	                    try:
	                        yield info['Value']['StringWithMarkup'][0]['String']
	                    except (KeyError, IndexError, TypeError):
	                        # entries without a plain string value are skipped
	                        continue

	def _density_from_pubchem(name):
	    """Scrape experimental densities from the PubChem record for `name`.

	    Returns (rho, 'pubchem'), or (None, None) when nothing usable is found.
	    """
	    try:
	        cid = pcp.get_compounds(name, namespace='name')[0].cid
	        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
	        content = json.loads(requests.get(url, timeout=15).text)
	    except Exception:
	        return None, None
	    rho_list = []
	    for rho_string in _density_strings_from_pubchem(content):
	        rho_string = rho_string.replace('Relative density (water = 1): ', '')
	        m = _DENSITY_VALUE_RE.match(rho_string)
	        if m is None:
	            continue
	        # the second group is None unless a range was given
	        rho_list.extend(float(g) for g in m.groups() if g is not None)
	    if not rho_list:
	        return None, None
	    ## remove outliers using interquartile range (IQR)
	    return _density_iqr_mean(rho_list), 'pubchem'

	def _density_from_dsstox(name):
	    """Scrape a density from the CompTox (DSSTOX) dashboard for `name`.

	    The experimental average is preferred over the predicted average.
	    Returns (rho, 'dsstox/expt' or 'dsstox/pred'), or (None, None).
	    """
	    # make sure both urllib submodules are actually loaded
	    import urllib.parse
	    import urllib.request

	    # try to find it via the dsstox dashboard
	    try:
	        name_urlsafe = urllib.parse.quote(name)
	        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
	        with urllib.request.urlopen(url, timeout=15) as fid:
	            webpage = fid.read().decode('utf-8')
	        hits = re.findall(r'DTXSID[0-9]+', webpage)
	    except Exception:
	        return None, None
	    if not hits:
	        return None, None
	    dtxsid = hits[0]

	    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
	    try:
	        options = Options()
	        options.add_argument("--headless") # runs in background instead of showing browser window
	        driver = selenium.webdriver.Chrome(service=Service(driver_exe), options=options)
	        try:
	            driver.set_page_load_timeout(15)
	            driver.get(url)
	            webpage = driver.page_source
	        finally:
	            # always shut the browser down, also when the page load fails
	            driver.quit()
	        mysoup = bs4.BeautifulSoup(webpage, features='lxml')
	    except Exception:
	        return None, None

	    # column of property names
	    property_cells = mysoup.find_all('div', attrs={'col-id':'property'})
	    ifound = next((i for i, cell in enumerate(property_cells) if 'Density' in cell.text), None)
	    if ifound is None:
	        return None, None
	    for column, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
	        cells = mysoup.find_all('div', attrs={'col-id':column})
	        try:
	            text = cells[ifound].text
	            # NOTE(review): assumes a cell reads like "1.23 (4)", i.e. a
	            # value followed by a count in parentheses -- confirm on the site
	            value = float(re.sub(r' \([0-9]*\)$', '', text.strip()))
	        except (IndexError, ValueError):
	            continue
	        if not pd.isna(value):
	            return value, origin
	    return None, None

	def string2density(name):
	    """Look up the density of a chemical given its name or CAS number.

	    Sources are tried in order until one yields a value: the predicted
	    values in df_pred (only when `name` is a CAS number), experimental
	    values scraped from PubChem and, if try_dsstox is set, the CompTox
	    dashboard.

	    Returns:
	        (rho, rho_origin). rho_origin is 'comptox/pred', 'pubchem',
	        'dsstox/expt' or 'dsstox/pred'; both are None when nothing is found.
	    """
	    rho, rho_origin = None, None
	    # predicted values from TEST (CompTox dashboard)
	    if is_cas(name):
	        matches = df_pred.loc[df_pred['CASRN'] == name, 'DENSITY_G/CM^3_TEST_PRED']
	        if len(matches):
	            # first matching row, also if the table lists the CAS number twice
	            rho = float(matches.iloc[0])
	            rho_origin = 'comptox/pred'
	    # try to scrape from PubChem
	    if pd.isna(rho):
	        rho, rho_origin = _density_from_pubchem(name)
	    # try to scrape from DSSTOX
	    if try_dsstox and pd.isna(rho):
	        rho, rho_origin = _density_from_dsstox(name)
	    if pd.isna(rho): rho,rho_origin = None, None
	    return rho, rho_origin

	def return_non_duplicate_index(tuples):
	    """Count, per group, the matches that survive overlap removal.

	    ## from https://github.com/curieshicy/JRgui/

	    Parameters:
	        tuples: one entry per group; each entry is a sequence of matches and
	            each match is a sequence of atom positions.

	    Returns:
	        A list of [group_index, surviving_match_count] pairs sorted by group
	        index. Groups without a surviving match are left out.

	    Every pair of matches that shares an atom loses one member: the smaller
	    match, or the later one when both have the same size. Matches that were
	    already discarded still take part in later comparisons.
	    """
	    # flatten: owners[k] is the group that match k came from
	    owners = []
	    atom_sets = []
	    for group, matches in enumerate(tuples):
	        for match in matches:
	            owners.append(group)
	            atom_sets.append(set(match))

	    discarded = set()
	    for first, first_atoms in enumerate(atom_sets):
	        for second in range(first + 1, len(atom_sets)):
	            second_atoms = atom_sets[second]
	            if first_atoms.isdisjoint(second_atoms):
	                continue
	            discarded.add(first if len(first_atoms) < len(second_atoms) else second)

	    survivors = Counter(
	        owner for position, owner in enumerate(owners) if position not in discarded
	    )
	    return [[group, count] for group, count in sorted(survivors.items())]

	def search_func_groups(smiles):
	    """Find the functional groups present in a molecule and count them.

	    ## from https://github.com/curieshicy/JRgui/

	    Parameters:
	        smiles: SMILES of the molecule (converted with str()).

	    Returns:
	        A list of [group_index, count] pairs, where group_index is the
	        position of the group in the SMARTS list below. Overlapping
	        matches are resolved by return_non_duplicate_index.
	    """
	    # NOTE(review): the first pattern read "(=)=", which is not valid SMARTS;
	    # the wildcard atoms ("*") are restored here on the assumption that they
	    # were lost in transcription -- confirm against the JRgui source
	    smarts = ["[$([CX2H0](=*)=*)]", "[$([CX2H1]#[!#7])]", "[$([CX2H0]#[!#7])]", "[OX2H]-[C]=O", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]",
	              "[$([#6X3H0](=[OX1]));!$([#6X3](=[#8X1])~[#8X2]);R]=O", "[CH;D2;$(C-!@C)](=O)", "[OX2H;!$([OX2H]-[#6]=[O]);!$([OX2H]-a)]", "[O;H1;$(O-!@c)]",
	              "[#8X2H0;R;!$([#8X2H0]~[#6]=[#8])]", "[$([CX3H0](=[OX1]));!$([CX3](=[OX1])-[OX2]);!R]=O", "[OX2H0;!R;!$([OX2H0]-[#6]=[#8])]",
	              "[$([#7X3,#7X3+][!#8])](=[O])~[O-]", "[OX1H0;!$([OX1H0]~[#6X3]);!$([OX1H0]~[#7X3]~[#8])]", "[#7X2H0;R]", "[#7X3H1;R]", "[#7X2H1]",
	              "[#7X2H0;!R]","[#6X2]#[#7X1H0]","[NX3H2]", "[NX3H1;!R]", "[#7X3H0;!$([#7](~O)~O)]","[SX2H]","[#16X2H0;!R]","[#16X2H0;R]", "[R;CX3H1,cX3H1]",
	              "[$([R;#6X3H0]);!$([R;#6X3H0]=[#8])]","[R;CX4H2]","[R;CX4H]","[R;CX4H0]", "[CX3H2]", "[!R;CX3H1;!$([CX3H1](=O))]",
	              "[$([!R;#6X3H0]);!$([!R;#6X3H0]=[#8])]","[CX4H3]","[!R;CX4H2]", "[!R;CX4H]","[!R;CX4H0]","[F]","[Cl]","[Br]", "[I]"]
	    m = Chem.MolFromSmiles(str(smiles))
	    tuples = []       # atom positions of the matches, one entry per matched group
	    index_list = []   # position in `smarts` of each entry of `tuples`
	    for index, smart in enumerate(smarts):
	        # compile and match each pattern once; no match gives an empty result
	        matches = m.GetSubstructMatches(Chem.MolFromSmarts(smart))
	        if matches:
	            tuples.append(matches)
	            index_list.append(index)
	    # e.g. [[0, 1], [1, 1], [3, 1], [4, 7], ...], indexed into `tuples`
	    temp = return_non_duplicate_index(tuples)
	    return [[index_list[group], count] for group, count in temp]

	def compute_phys_properties(smiles):
	    """Estimate the melting point of a molecule by group contributions.

	    ## from https://github.com/curieshicy/JRgui/
	    ## method from: K. G. Joback, R. C. Reid, ESTIMATION OF PURE-COMPONENT PROPERTIES FROM GROUP-CONTRIBUTIONS. Chemical Engineering Communications 57, 233-243 (1987).
	    ## this doesn't look very accurate, but it's a start

	    Parameters:
	        smiles: SMILES of the molecule (converted with str()).

	    Returns:
	        122.5 plus the summed melting-point contributions of the groups
	        found by search_func_groups, minus 273.15.

	    Raises:
	        TypeError: if the SMILES cannot be parsed, or if a matched group
	            has no melting-point contribution in the table.
	    """
	    ## one row per functional group, in the order of the SMARTS list in
	    ## search_func_groups (41 rows). Columns:
	    ## Tc, Pc, Vc, Tb, Tm, Hfor, Gf, Cpa, Cpb, Cpc, Cpd, Hfus, Hvap, Ya, Yb
	    ## Only the Tm column is used by this function.
	    DB = [[0.0026, 0.0028, 36, 26.15, 17.78, 142.14, 136.70, 2.74E+1, -5.57E-2, 1.01E-4, -5.02E-8, 4.720, 2.661, None, None],
	          [0.0027, -0.0008,46, 9.20, -11.18, 79.30, 77.71, 2.45E+1, -2.71E-2, 1.11E-4, -6.78E-8, 2.322, 1.155, None, None],
	          [0.0020, 0.0016, 37, 27.38, 64.32, 115.51, 109.82, 7.87, 2.01E-2, -8.33E-6, 1.39E-9, 4.151, 3.302, None, None],
	          [0.0791, 0.0077, 89, 169.09, 155.50, -426.72,-387.87,2.41E+1, 4.27E-2, 8.04E-5, -6.87E-8, 11.051, 19.537, 1317.23,-2.578],
	          [0.0481, 0.0005, 82, 81.10, 53.60, -337.92,-301.95,2.45E+1, 4.02E-2, 4.02E-5, -4.52E-8, 6.959, 9.633, 483.88, -0.966],
	          [0.0284, 0.0028, 55, 94.97, 75.97, -164.50,-126.27,3.04E+1, -8.29E-2, 2.36E-4, -1.31E-7, None, 6.645, None, None],
	          [0.0379, 0.0030, 82, 72.24, 36.90, -162.03,-143.48,3.09E+1, -3.36E-2, 1.60E-4, -9.88E-8, 3.197, 9.093, 740.92, -1.713],
	          [0.0741, 0.0112, 28, 92.88, 44.45, -208.04,-189.20,2.57E+1, -6.91E-2, 1.77E-4, -9.88E-8, 2.406, 16.826, 2173.72,-5.057],
	          [0.0240, 0.0184, -25, 76.34, 82.83, -221.65,-197.37,-2.81, 1.11E-1, -1.16E-4, 4.94E-8, 4.490, 12.499, 3018.17,-7.314],
	          [0.0098, 0.0048, 13, 31.22, 23.05, -138.16,-98.22, 1.22E+1, -1.26E-2, 6.03E-5, -3.86E-8, 5.879, 4.682, 440.24, -0.953],
	          [0.0380, 0.0031, 62, 76.75, 61.20, -133.22,-120.50,6.45, 6.70E-2, -3.57E-5, 2.86E-9, 4.189, 8.972, 340.35, -0.350],
	          [0.0168, 0.0015, 18, 22.42, 22.23, -132.22,-105.00,2.55E+1, -6.32E-2, 1.11E-4, -5.48E-8, 1.188, 2.410, 122.09, -0.386],
	          [0.0437, 0.0064, 91, 152.54, 127.24, -66.57, -16.83, 2.59E+1, -3.74E-3, 1.29E-4, -8.88E-8, 9.679, 16.738, None, None],
	          [0.0143, 0.0101, 36, -10.50, 2.08, -247.61,-250.83,6.82, 1.96E-2, 1.27E-5, -1.78E-8, 3.624, 5.909, 675.24, -1.340],
	          [0.0085, 0.0076, 34, 57.55, 68.40, 55.52, 79.93, 8.83, -3.84E-3, 4.35E-5, -2.60E-8, 3.649, 6.528, None, None],
	          [0.0130, 0.0114, 29, 52.82, 101.51, 31.65,75.61,1.18E+1, -2.30E-2, 1.07E-4, -6.28E-8, 7.490, 6.930, None, None],
	          [None, None, None, 83.08, 68.91, 93.70, 119.66, 5.69, -4.12E-3, 1.28E-4, -8.88E-8, None, 12.169, None, None],
	          [0.0255, -0.0099,None, 74.60, None, 23.61, None, None, None, None, None, None, 3.335, None, None],
	          [0.0496, -0.0101,91, 125.66, 59.89, 88.43, 89.22, 3.65E+1, -7.33E-2, 1.84E-4, -1.03E-7, 2.414, 12.851, None, None],
	          [0.0243, 0.0109, 38, 73.23, 66.89, -2.02, 14.07,2.69E+1, -4.12E-2, 1.64E-4, -9.76E-8, 3.515, 10.788, None, None],
	          [0.0295, 0.0077, 35, 50.17, 52.66, 53.47, 89.39,-1.21, 7.62E-2, -4.86E-5, 1.05E-8, 5.009, 6.436, None, None],
	          [0.0169, 0.0074, 9, 11.74, 48.84, 123.34, 163.16,-3.11E+1, 2.27E-1, -3.20E-4, 1.46E-7, 4.703, 1.896, None, None],
	          [0.0031, 0.0084, 63, 63.56, 20.09, -17.33, -22.99, 3.53E+1, -7.58E-2, 1.85E-4, -1.03E-7, 2.360, 6.884, None, None],
	          [0.0119, 0.0049, 54, 68.78, 34.40, 41.87, 33.12, 1.96E+1, -5.61E-3, 4.02E-5, -2.76E-8, 4.130, 6.817, None, None],
	          [0.0019, 0.0051, 38, 52.10, 79.93, 39.10, 27.76, 1.67E+1, 4.81E-3, 2.77E-5, -2.11E-8, 1.557, 5.984, None, None],
	          [0.0082, 0.0011, 41, 26.73, 8.13, 2.09, 11.30, -2.14, 5.74E-2, -1.64E-6, -1.59E-8, 1.101, 2.544, 259.65, -0.702],
	          [0.0143, 0.0008, 32, 31.01, 37.02, 46.43, 54.05, -8.25, 1.01E-1, -1.42E-4, 6.78E-8, 2.394, 3.059, -245.74,0.912],
	          [0.0100, 0.0025, 48, 27.15, 7.75, -26.80, -3.68, -6.03, 8.54E-2, -8.00E-6, -1.80E-8, 0.490, 2.398, 307.53, -0.798],
	          [0.0122, 0.0004, 38, 21.78, 19.88, 8.67, 40.99, -2.05E+1, 1.62E-1, -1.60E-4, 6.24E-8, 3.243, 1.942, -394.29,1.251],
	          [0.0042, 0.0061, 27, 21.32, 60.15, 79.72, 87.88, -9.09E+1, 5.57E-1, -9.00E-4, 4.69E-7, -1.373, 0.644, None, None],
	          [0.0113, -0.0028,56, 18.18, -4.32, -9.630, 3.77, 2.36E+1, -3.81E-2, 1.72E-4, -1.03E-7, -0.473, 1.724, 495.01, -1.539],
	          [0.0129, -0.0006,46, 24.96, 8.73, 37.97, 48.53, -8.00, 1.05E-1, -9.63E-5, 3.56E-8, 2.691, 2.205, 82.28, -0.242],
	          [0.0117, 0.0011, 38, 24.14, 11.14, 83.99, 92.36, -2.81E+1, 2.08E-1, -3.06E-4, 1.46E-7, 3.063, 2.138, None, None],
	          [0.0141, -0.0012,65, 23.58, -5.10, -76.45, -43.96, 1.95E+1, -8.08E-3, 1.53E-4, -9.67E-8, 0.908, 2.373, 548.29, -1.719],
	          [0.0189, 0.0000, 56, 22.88, 11.27, -20.64, 8.42, -9.09E-1, 9.50E-2, -5.44E-5, 1.19E-8, 2.590, 2.226, 94.16, -0.199],
	          [0.0164, 0.0020, 41, 21.74, 12.64, 29.89, 58.36, -2.30E+1, 2.04E-1, -2.65E-4, 1.20E-7, 0.749, 1.691, -322.15,1.187],
	          [0.0067, 0.0043, 27, 18.25, 46.43, 82.23, 116.02, -6.62E+1, 4.27E-1, -6.41E-4, 3.01E-7, -1.460, 0.636, -573.56,2.307],
	          [0.0111, -0.0057,27, -0.03, -15.78, -251.92,-247.19,2.65E+1, -9.13E-2, 1.91E-4, -1.03E-7, 1.398, -0.670, None, None],
	          [0.0105, -0.0049,58, 38.13, 13.55, -71.55,-64.31, 3.33E+1, -9.63E-2, 1.87E-4, -9.96E-8, 2.515, 4.532, 625.45, -1.814],
	          [0.0133, 0.0057, 71, 66.86, 43.43, -29.48, -38.06, 2.86E+1, -6.49E-2, 1.36E-4, -7.45E-8, 3.603, 6.582, 738.91, -2.038],
	          [0.0068, -0.0034,97, 93.84, 41.69, 21.06, 5.74, 3.21E+1, -6.41E-2, 1.26E-4, -6.87E-8, 2.724, 9.520, 809.55, -2.224]]
	    tm_column = 4

	    # fail early, and with a clear message, on an unparsable SMILES
	    if Chem.MolFromSmiles(str(smiles)) is None:
	        raise TypeError(f'could not parse SMILES {smiles!r}')

	    contributions = []
	    for group_index, count in search_func_groups(smiles):
	        per_group = DB[group_index][tm_column]
	        if per_group is None:
	            # no estimate is possible; TypeError is what subtracting from a
	            # missing melting point would raise anyway
	            raise TypeError(f'no melting point contribution for functional group {group_index}')
	        contributions.append(count*per_group)

	    MeltingPoint = 122.5 + sum(contributions)
	    return MeltingPoint-273.15

	# Patterns for PubChem free-text melting points such as "122.5 °C" or "250-252 °F".
	# Groups: (1) value or lower bound, (2) optional upper bound of a range, (3) unit.
	# The number alternation must use a bare `|`; an escaped `\|` would demand a
	# literal pipe character and never match real values.
	_MP_CELSIUS_RE = re.compile(r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?C)')
	_MP_FAHRENHEIT_RE = re.compile(r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?F)')
	# Trailing parenthesized integer in a CompTox dashboard cell, e.g. "123.4 (5)".
	# NOTE(review): assumes cells read "<value> (<count>)" -- confirm against the live page.
	_DSSTOX_COUNT_RE = re.compile(r' \([0-9]*\)$')
	# Seconds allowed for each web request / page load.
	_MP_HTTP_TIMEOUT = 15

	def _parse_mp_string(mp_string):
	    """Return the melting points (deg C) found at the start of one PubChem text value.

	    A range such as "120-122 °C" contributes both bounds. Fahrenheit values
	    are converted to Celsius. An unrecognized string yields an empty list.
	    """
	    values = []
	    for pattern, to_celsius in (
	        (_MP_CELSIUS_RE, lambda t: t),
	        # (T/F - 32) * 5/9 = T/C
	        (_MP_FAHRENHEIT_RE, lambda t: (t - 32) * 5 / 9),
	    ):
	        m = pattern.match(mp_string)
	        if m is None:
	            continue
	        # group 2 is None when the text holds a single value rather than a range
	        for g in m.group(1, 2):
	            if g is not None:
	                values.append(to_celsius(float(g)))
	    return values

	def _mp_from_cas_tables(cas):
	    """Look up a melting point (deg C) for a CAS number in the local data sources.

	    Sources are tried in order: the ``chemicals`` package, the experimental
	    table ``dfmp_expt`` and the OPERA predictions in ``df_pred``.
	    (A lookup in the Mansouri predicted-value table used to sit between the
	    last two and is currently disabled.)

	    Returns ``(mp, mp_origin)``, or ``(None, None)`` when nothing is found.
	    """
	    # chemicals package (value is converted from Kelvin)
	    tm = chemicals.Tm(cas)
	    if tm is not None and not pd.isna(tm):
	        methods = chemicals.Tm_methods(cas)
	        origin = 'chem/' + methods[0] if methods else None
	        return tm - 273.15, origin

	    # experimental values; some rows have multiple comma-separated CAS numbers,
	    # so a substring hit is confirmed against the individual entries
	    hits = dfmp_expt[dfmp_expt['CAS'].str.contains(cas, regex=False, na=False)]
	    mp = None
	    # pair each CAS field with its own MP value so the lookup does not depend
	    # on the frame's index labels being positional
	    for cas_field, value in zip(hits['CAS'].str.split(', '), hits['MP']):
	        if cas in cas_field:
	            mp = float(value)  # no break: the last matching row wins
	    if not pd.isna(mp):
	        return mp, 'expt'

	    # predicted values from OPERA (CompTox dashboard)
	    rows = df_pred[df_pred['CASRN'] == cas]
	    if len(rows):
	        mp = float(rows['MELTING_POINT_DEGC_OPERA_PRED'].iloc[0])
	        if not pd.isna(mp):
	            return mp, 'comptox/pred'
	    return None, None

	def _pubchem_subsections(parent, heading):
	    """Yield the entries of ``parent['Section']`` whose ``TOCHeading`` equals ``heading``."""
	    for section in parent.get('Section', []):
	        if section.get('TOCHeading') == heading:
	            yield section

	def _mp_from_pubchem(name, namespace):
	    """Scrape experimental melting points (deg C) for ``name`` from PubChem PUG-View.

	    All values listed under "Experimental Properties / Melting Point" are
	    collected, outliers are removed with the 1.5*IQR rule, and the mean of
	    the remainder is returned as ``(mp, 'pubchem')``. Any lookup or
	    download failure yields ``(None, None)``.
	    """
	    try:
	        compounds = pcp.get_compounds(name, namespace=namespace)
	        cid = compounds[0].cid
	        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
	        webpage = requests.get(url, timeout=_MP_HTTP_TIMEOUT).text
	        content = json.loads(webpage)
	    except Exception:
	        # best effort: no hit, network failure or malformed JSON
	        return None, None
	    if not isinstance(content, dict):
	        return None, None

	    mp_list = []
	    record = content.get('Record', {})
	    for props in _pubchem_subsections(record, 'Chemical and Physical Properties'):
	        for expt in _pubchem_subsections(props, 'Experimental Properties'):
	            for mp_section in _pubchem_subsections(expt, 'Melting Point'):
	                for info in mp_section.get('Information', []):
	                    try:
	                        mp_string = info['Value']['StringWithMarkup'][0]['String']
	                    except (KeyError, IndexError, TypeError):
	                        # entry without a plain-text value
	                        continue
	                    if isinstance(mp_string, str):
	                        mp_list.extend(_parse_mp_string(mp_string))
	    if not mp_list:
	        return None, None

	    ## remove outliers using interquartile range (IQR)
	    samples = np.array(mp_list)
	    q75, q25 = np.percentile(samples, [75, 25])
	    intr_qr = q75 - q25
	    keep = (samples <= q75 + 1.5 * intr_qr) & (samples >= q25 - 1.5 * intr_qr)
	    return float(np.mean(samples[keep])), 'pubchem'

	def _mp_from_dsstox(name):
	    """Scrape a melting point for ``name`` from the EPA CompTox (DSSTox) dashboard.

	    The DTXSID is found through the dashboard's synonym search, then the
	    properties page is rendered with headless Chrome. The 'exavg' column is
	    read first (origin ``'dsstox/expt'``), then 'predavg' (``'dsstox/pred'``).
	    Returns ``(None, None)`` on any failure.
	    """
	    # try to find the substance id via the dsstox dashboard
	    try:
	        name_urlsafe = urllib.parse.quote(name)
	        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
	        with urllib.request.urlopen(url, timeout=_MP_HTTP_TIMEOUT) as fid:
	            webpage = fid.read().decode('utf-8')
	    except Exception:
	        return None, None
	    hits = re.findall(r'DTXSID[0-9]+', webpage)
	    if not hits:
	        return None, None
	    dtxsid = hits[0]

	    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
	    try:
	        options = Options()
	        options.add_argument("--headless") # runs in background instead of showing browser window
	        service = Service(driver_exe)
	        driver = selenium.webdriver.Chrome(service=service, options=options)
	        try:
	            driver.set_page_load_timeout(_MP_HTTP_TIMEOUT)
	            driver.get(url)
	            webpage = driver.page_source
	        finally:
	            # always release the browser process, even after a timeout
	            driver.quit()
	        mysoup = bs4.BeautifulSoup(webpage, features='lxml')
	    except Exception:
	        return None, None

	    # column of property names; the same index locates the row in the value columns
	    labels = mysoup.find_all('div', attrs={'col-id':'property'})
	    ifound = next((i for i, cell in enumerate(labels) if 'Melting Point' in cell.text), None)
	    if ifound is None:  # explicit None test: index 0 is a valid hit
	        return None, None
	    for col_id, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
	        cells = mysoup.find_all('div', attrs={'col-id': col_id})
	        if ifound >= len(cells):
	            continue
	        value = _DSSTOX_COUNT_RE.sub('', cells[ifound].text.strip())
	        try:
	            mp = float(value)
	        except ValueError:
	            continue
	        if not pd.isna(mp):
	            return mp, origin
	    return None, None

	def string2mp(name, namespace='name'):
	    """Find a melting point (deg C) for a chemical identifier string.

	    Sources are tried in order until one yields a value:
	      1. local CAS-keyed sources (only when ``is_cas(name)`` is true),
	      2. PubChem experimental properties,
	      3. the CompTox/DSSTox dashboard (only when ``try_dsstox`` is set).

	    Parameters:
	      name      -- identifier to look up (a CAS number or a name)
	      namespace -- PubChem namespace passed to ``pcp.get_compounds``

	    Returns ``(mp, mp_origin)`` where ``mp_origin`` is a short label naming
	    the source, or ``(None, None)`` when no source has a value.
	    """
	    mp, mp_origin = None, None
	    if is_cas(name):
	        mp, mp_origin = _mp_from_cas_tables(name)
	    # try to scrape from PubChem
	    if pd.isna(mp):
	        mp, mp_origin = _mp_from_pubchem(name, namespace)
	    # try to scrape from DSSTOX website...
	    if try_dsstox and pd.isna(mp):
	        mp, mp_origin = _mp_from_dsstox(name)
	    if pd.isna(mp):
	        mp, mp_origin = None, None
	    return mp, mp_origin

	def smiles2mp(smiles):
	    """Estimate a melting point from a SMILES string via ``compute_phys_properties``.

	    The estimate is only attempted when every atom in the molecule is one of
	    C, N, O, F, S, Cl, Br or I (atomic numbers 6, 7, 8, 9, 16, 17, 35, 53).

	    Returns the value from ``compute_phys_properties`` (presumably deg C --
	    TODO confirm against that function), or None when the SMILES cannot be
	    parsed, contains an unsupported element, or the calculation fails.
	    """
	    SUPPORTED_ATOM_SET = {6, 7, 8, 9, 16, 17, 35, 53}
	    try:
	        mol = Chem.MolFromSmiles(str(smiles))
	        if mol is None:
	            # RDKit signals an unparsable SMILES with None
	            return None
	        atom_num_set = {a.GetAtomicNum() for a in mol.GetAtoms()}
	        if not atom_num_set.issubset(SUPPORTED_ATOM_SET):
	            return None
	        return compute_phys_properties(smiles)
	    except Exception:
	        # best effort: any failure means "no estimate", but
	        # KeyboardInterrupt / SystemExit are allowed to propagate
	        return None

	def smiles2mp_opera(smiles):
	    """Predict a melting point from a SMILES string with the model in ``my_opera_data_mp``.

	    PaDEL descriptors are computed for the molecule, reduced to the columns
	    in ``my_opera_data_mp.desc_list``, scaled with ``scaler_X``, passed to the
	    ``knn_all`` estimator, and the prediction is mapped back through
	    ``scaler_y``. Returns the single predicted value.
	    """
	    raw_descriptors = padelpy.from_smiles(smiles, descriptortypes='mp/descriptors.xml')
	    # one-row frame; blank or non-numeric descriptor values become 0.0
	    frame = pd.DataFrame(raw_descriptors, index=[0])
	    frame = frame.apply(pd.to_numeric, errors="coerce")
	    frame = frame.fillna(0.0).astype(float)

	    model = my_opera_data_mp
	    features = np.array(frame[model.desc_list])
	    scaled_features = model.scaler_X.transform(features)
	    scaled_prediction = model.knn_all.predict(scaled_features)
	    prediction = model.scaler_y.inverse_transform(scaled_prediction)
	    return prediction[0][0]

	def mol2mp(cas, name, smiles):
	    """Determine a melting point for a molecule from whichever identifier works.

	    Estimators are tried in this order, skipping any whose identifier is empty:
	      1. ``smiles2mp_opera(smiles)``   -> origin ``'opera/calc'``
	      2. ``string2mp(cas)``            -> origin reported by ``string2mp``
	      3. ``string2mp(name)``           -> origin reported by ``string2mp``
	      4. ``smiles2mp(smiles)``         -> origin ``'joback-reid/calc'``

	    The first estimator that returns a non-missing value wins. An estimator
	    that raises or returns None/NaN is skipped.

	    Returns ``(mp, mp_origin)``, or ``(None, None)`` when every estimator
	    fails. The origin is never reported without a value.
	    """
	    def _opera():
	        return smiles2mp_opera(smiles), 'opera/calc'

	    def _joback_reid():
	        return smiles2mp(smiles), 'joback-reid/calc'

	    attempts = (
	        (smiles, _opera),
	        (cas, lambda: string2mp(cas)),
	        (name, lambda: string2mp(name)),
	        (smiles, _joback_reid),
	    )
	    for identifier, estimate in attempts:
	        if not identifier:
	            continue
	        try:
	            mp, mp_origin = estimate()
	        except Exception:
	            # best effort: a failing estimator just hands over to the next one
	            continue
	        if not pd.isna(mp):
	            return mp, mp_origin
	    return None, None

	def getLogP(cas, mol):
	    """Return an octanol-water partition coefficient (LogP) and its origin.

	    The OPERA prediction stored in ``df_pred`` for ``cas`` is preferred
	    (origin ``'comptox/pred'``). If that is missing and ``mol`` is given,
	    the value is computed with RDKit's ``Crippen.MolLogP``
	    (origin ``'rdkit/calc'``).

	    Returns ``(LogP, LogP_origin)``, or ``(None, None)`` when neither
	    source provides a value.
	    """
	    LogP, LogP_origin = None, None
	    if cas:
	        rows = df_pred[df_pred['CASRN'] == cas]
	        if len(rows):
	            # take the first match explicitly: float() on a Series is
	            # deprecated for one row and raises when several rows match
	            LogP = float(rows['OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED'].iloc[0])
	            LogP_origin = 'comptox/pred'
	    if pd.isna(LogP) and mol:
	        LogP = Crippen.MolLogP(mol)
	        LogP_origin = 'rdkit/calc'
	    if pd.isna(LogP):
	        # never report an origin without a value
	        LogP, LogP_origin = None, None
	    return LogP, LogP_origin