Spaces:

dmsaylor
/

CHRIS

Running

File size: 49,148 Bytes

import sys,os,glob,re,string
from collections import Counter
from operator import itemgetter
import nltk
import pandas as pd
import numpy as np
import pubchempy as pcp
import cirpy
import chemicals

import bs4
import urllib
import requests
import json

# Atomic numbers of B, C, N, O, F, P, S, Cl, Br and I.
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}
# Atomic numbers treated as metals; CeramicOrMetal flags a molecule as a metal
# when every atom's atomic number is in this set.
METAL_ATOM_SET = set([3,4,11,12,13] + list(range(19,31+1)) + list(range(37,50+1)) + list(range(55,84+1)) + list(range(87,114+1)) + [116])
# One entry per line; CeramicOrMetal compares each entry against a
# comma-joined, sorted list of the element symbols present in a molecule.
with open('data/ceramics_list.txt', 'r') as fp:
    lines = fp.readlines()
CERAMICS_SET = {line.strip() for line in lines}
# Same file layout and usage as the ceramics list, but for salts.
with open('data/salt_list.txt', 'r') as fp:
    lines = fp.readlines()
SALT_SET = {line.strip() for line in lines}

# Human-readable message for each integer error code set by ResolveChemical
# (0 means no error).
ERROR_CODES = {0:None, 1:'Structure could not be determined from the identifier', 2:'Invalid SMILES code', 3:'Invalid CAS number', 4:'Invalid identifier type selected'}

## not sure if this will be possible on pythonanywhere; use this flag to disable related code blocks
try_dsstox = True
if try_dsstox:
    import platform
    import selenium
    import selenium.webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    # Pick the chromedriver executable used by string2density for scraping.
    # platform.system() is used rather than os.uname() because os.uname() is
    # not available on Windows, which made the fallback branch unreachable there.
    if platform.system() == 'Linux':
        # pythonanywhere
        driver_exe = '/usr/local/bin/chromedriver'
    else:
        # mac ('Darwin') and any other platform: use the driver shipped with
        # the chromedriver_binary package
        import chromedriver_binary
        driver_exe = chromedriver_binary.chromedriver_filename

from io import BytesIO
from PIL import ImageOps
import base64

import rdkit
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors,Draw,Crippen

## add custom chemical definitions (i.e., to correct confusion between methane and carbon)
## these are loaded into the database object returned by the chemicals package
db = chemicals.identifiers.get_pubchem_db()
db.load('data/custom_chemicals_db.tsv')
## load experimental and predicted properties (tab-separated tables read once at import time)
#dfmp_expt = pd.read_excel('PHYSPROP_MP_data.xlsx')
dfmp_expt = pd.read_csv('data/PHYSPROP_MP_data.tsv', sep='\t')
#dfmp_pred = pd.read_excel('DSSTOX_MP_pred_data.xlsx')
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
## df_pred is queried by string2density via its 'CASRN' and 'DENSITY_G/CM^3_TEST_PRED' columns
df_pred = pd.read_csv('data/Comptox_pred_data.tsv', sep='\t')

## OPERA melting point model
import dill as pickle
import sklearn
import sklearn.neighbors
import sklearn.metrics
import padelpy
from functions import weight_func
class opera_data_mp():
    """Container for the pickled melting-point model objects and their settings.

    Everything is loaded in the class body, i.e. once, when the module is
    imported. The pickle files are read from the local 'mp/' directory; they
    must come from a trusted source because unpickling can execute code.
    """
    # NOTE(review): presumably the neighbour count used with knn_all - confirm against the code that reads it
    n_neighbors = 5
    #weight_factor = 5e-3
    # descriptor names associated with the model
    # NOTE(review): the names look like PaDEL descriptors (padelpy is imported above) - confirm
    desc_list = ['SHBd', 'nN', 'maxHBd', 'ATSC1v', 'AATS1i', 'TopoPSA', 'nT6Ring', 'nHBDon', 'WTPT-5', 'minHBd', 'nHBint2', 'IC0', 'MLFER_S', 'MLFER_BO', 'WTPT-3']
    with open('mp/model-opera-knn.pkl', 'rb') as fp:
        knn_all = pickle.load(fp)
    # re-attach the weighting function after unpickling
    knn_all.weights = weight_func # fix weird problem on pythonanywhere...
    with open('mp/model-opera-scalerX.pkl', 'rb') as fp:
        scaler_X = pickle.load(fp)
    with open('mp/model-opera-scalerY.pkl', 'rb') as fp:
        scaler_y = pickle.load(fp)
# module-level instance of the container above
my_opera_data_mp = opera_data_mp()

def _resolve_parse_smiles(smiles):
    """Parse ``smiles`` with RDKit, returning None instead of raising on failure."""
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        return None


def _resolve_mol_properties(mol, smiles, cas, want_logp):
    """Return (Mw, LogP, LogP_origin, im64) for an already-parsed molecule."""
    Mw = Descriptors.MolWt(mol)
    LogP, LogP_origin = None, None
    if want_logp:
        # getLogP is used here; Crippen.MolLogP was the earlier alternative
        LogP, LogP_origin = getLogP(cas, mol)
    im = ImageFromSmiles(smiles)
    # ImageFromSmiles returns None when the drawing fails; do not try to encode that
    im64 = Imageto64(im) if im is not None else None
    return Mw, LogP, LogP_origin, im64


def ResolveChemical(chemName, IDtype, debug=False, get_properties=('logp','rho','mp')):
    """Resolve a chemical identifier into name, CAS, SMILES and basic properties.

    Parameters:
        chemName: the identifier text (surrounding whitespace is stripped).
        IDtype: 'CAS', 'SMILES' or 'common' (a chemical name).
        debug: when True, the origin of each looked-up property is returned too.
        get_properties: which optional properties to look up; any of
            'logp', 'rho' (density) and 'mp' (melting point).

    Returns:
        debug=False: (name, cas, smiles, Mw, LogP, rho, mp, im64, error)
        debug=True:  (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin,
                      mp, mp_origin, im64, error)
        ``error`` is a key of ERROR_CODES (0 means success). ``im64`` is a
        base64 PNG data URI of the structure, or None.
    """
    # remove excess whitespace
    chemName = chemName.strip()

    name = None
    smiles = None
    cas = None
    Mw = None
    LogP = None
    rho = None
    mp = None
    im64 = None
    mp_origin = None
    rho_origin = None
    LogP_origin = None
    error = 0

    want_logp = 'logp' in get_properties
    want_rho = 'rho' in get_properties
    want_mp = 'mp' in get_properties

    if IDtype == 'CAS':
        cas = chemName
        if not is_cas(cas):
            # fall through to the common return below so that debug=True
            # still yields the 12-element tuple
            error = 3 #invalid cas
        else:
            smiles = cas2smiles(cas)
            if smiles:
                name = cas2name(cas)
                if want_rho:
                    rho, rho_origin = string2density(cas)
                if want_mp:
                    mp, mp_origin = mol2mp(cas, name, smiles)
                if want_rho and pd.isna(rho) and name:
                    rho, rho_origin = string2density(name)
                mol = _resolve_parse_smiles(smiles)
                if mol:
                    Mw, LogP, LogP_origin, im64 = _resolve_mol_properties(mol, smiles, cas, want_logp)
                else:
                    error = 2 #invalid smiles
            else:
                error = 1 # no smiles found
    elif IDtype == 'SMILES':
        smiles = chemName
        mol = _resolve_parse_smiles(smiles)
        if mol:
            # if SMILES is not valid, all of the lookups below are skipped
            name = smiles2name(smiles)
            if name:
                cas = name2cas(name)
                if want_rho:
                    rho, rho_origin = string2density(name)
            if want_rho and pd.isna(rho) and cas:
                rho, rho_origin = string2density(cas)
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            # computed after the CAS lookup so the LogP lookup receives the
            # resolved CAS, as it does in the other two branches
            Mw, LogP, LogP_origin, im64 = _resolve_mol_properties(mol, smiles, cas, want_logp)
        else:
            error = 2
    elif IDtype == 'common':
        name, _name_origin = name2iupac(chemName)
        if name:
            smiles = name2smiles(name)
            cas = name2cas(name)
        if not smiles and cas:
            smiles = cas2smiles(cas)

        if want_rho:
            if cas:
                rho, rho_origin = string2density(cas)
            if pd.isna(rho) and name:
                rho, rho_origin = string2density(name)
            if pd.isna(rho):
                # try this because sometimes iupac names don't work
                rho, rho_origin = string2density(chemName)

        if smiles:
            if want_mp:
                mp, mp_origin = mol2mp(cas, name, smiles)
            mol = _resolve_parse_smiles(smiles)
            if mol:
                Mw, LogP, LogP_origin, im64 = _resolve_mol_properties(mol, smiles, cas, want_logp)
            else:
                error = 2
        else:
            error = 1
    else:
        ## should never be here; every output is still at its None default
        error = 4 # invalid IDtype selection, probably not possible

    # if we couldn't find a name or CAS (but do have SMILES)
    if not error:
        if not name:
            name = 'Not found'
        if not cas:
            cas = 'Not found'

    if mp is not None:
        mp = float(mp)

    if debug:
        return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp, mp_origin, im64, error)
    else:
        return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)

def CeramicOrMetal(smiles,mp):
    """Classify a structure as metal and/or ceramic.

    Parameters:
        smiles: SMILES string of the structure.
        mp: melting point (compared against 700), or None when unknown.

    Returns:
        (is_metal, is_ceramic) booleans. Both are False when the SMILES is
        empty, cannot be parsed, or contains no atoms.
    """
    # metals/ceramics logic
    mol = Chem.MolFromSmiles(smiles) if smiles else None
    if mol is None:
        # MolFromSmiles returns None for unparsable input
        return False, False
    atoms = list(mol.GetAtoms())
    if not atoms:
        # an empty atom set would otherwise count as a subset of the metals
        return False, False

    is_metal = {a.GetAtomicNum() for a in atoms} <= METAL_ATOM_SET
    if is_metal:
        return True, False

    # check composition against list of ceramics/salts
    elements = ','.join(sorted({a.GetSymbol() for a in atoms}))
    if elements in CERAMICS_SET or elements in SALT_SET:
        return False, True

    # look for carbon-carbon bonds
    has_CC_bond = any(
        b.GetBeginAtom().GetAtomicNum() == 6 and b.GetEndAtom().GetAtomicNum() == 6
        for b in mol.GetBonds()
    )
    # if not a metal, no C-C bonds, and mp > 700 (sodium chloride has mp ~ 800), assume ceramic...
    is_ceramic = bool((not has_CC_bond) and (mp is not None) and mp > 700.)
    return False, is_ceramic

#Generates an image of the molecule represented by the SMILES code given.
#Returns None if the image cannot be generated. From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py

def ImageFromSmiles(smiles):
    """Draw the molecule described by ``smiles`` as a 350x350 image.

    Returns None when ``smiles`` is not a str or when drawing raises
    ValueError. One specific SMILES is drawn from a stored SDF file
    instead of being parsed.
    """
    if type(smiles) is not str:
        return None

    # this structure is read from an SDF file rather than built from SMILES
    sdf_backed_smiles = 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]'
    try:
        if smiles == sdf_backed_smiles:
            molecule = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
        else:
            molecule = Chem.MolFromSmiles(smiles)
        return Draw.MolToImage(molecule, size=(350, 350))
    except ValueError:
        return None

#Trims the image to the bounding box of its non-white content, removing any excess white background.
#NOTE: no minimum box size is enforced by the code below. The quality of the images generated by MolToImage depends on the requested size:
#  if the size is too small (300x300), big molecules are too low quality, but if the size is too big (1000*1000), small molecules appear zoomed out.

def wTrim(img):
    """Crop ``img`` to the bounding box of its non-white content.

    The image is inverted so that the white background becomes zero, which
    is what getbbox() treats as empty. When the whole image is background,
    getbbox() returns None and an uncropped copy is returned instead.
    """
    bbox = ImageOps.invert(img).getbbox()
    if bbox is None:
        # nothing but background: there is no box to crop to
        return img.copy()
    return img.crop(bbox)

#Converts a PIL image into its base64 representation.
def Imageto64(img):
    """Convert a PIL image into a base64 PNG data URI.

    The image is trimmed with wTrim first. Returns None when ``img`` is
    None (ImageFromSmiles returns None when it cannot draw a molecule).
    """
    if img is None:
        return None
    img = wTrim(img)
    buf = BytesIO()
    img.save(buf, format="PNG")
    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
    return "data:image/png;base64," + encoded

# function to convert SMILES to name
def smiles2name(smiles):
    """Look up a chemical name for a SMILES string.

    Sources are tried in order until one yields a name: the chemicals
    package, PubChem compounds, cirpy, and finally PubChem substances.
    Lookup failures are swallowed (best effort); returns None when no
    source produces a name.
    """
    name = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(smiles)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None
    # then try pubchem for compounds
    if not name:
        try:
            compounds = pcp.get_compounds(smiles, namespace='smiles')
            c = compounds[0]
            # have seen empty iupac_name before, try synonyms if this happens
            name = c.iupac_name or c.synonyms[0]
        except Exception:
            name = None
    # next try cirpy
    if not name:
        try:
            name = cirpy.resolve(smiles, 'iupac_name')
        except Exception:
            name = None
    if isinstance(name, list):
        name = name[0] if name else None
    # finally try it as a pubchem substance
    if not name:
        try:
            # NOTE(review): PubChem may not accept 'smiles' as a substance
            # namespace, in which case this fallback never succeeds - confirm
            substances = pcp.get_substances(smiles, namespace='smiles')
            # sometimes there are multiple substances, and multiple synonyms per
            # substance; CAS numbers are skipped because they are not names
            allsyns = [syn for s in substances for syn in s.synonyms if not is_cas(syn)]
            # choose the most common synonym
            name = Counter(allsyns).most_common(1)[0][0]
        except Exception:
            name = None
    return name

# function to convert CAS to SMILES
def cas2smiles(cas):
    """Look up a SMILES string for a CAS number.

    Sources are tried in order until one yields a result: the chemicals
    package, PubChem compounds, cirpy, and finally PubChem substances.
    Lookup failures are swallowed (best effort); returns None when no
    source produces a SMILES.
    """
    smiles = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(cas)
        smiles = cm.smiles
    except Exception:
        smiles = None
    # then try pubchem for compounds
    if not smiles:
        try:
            compounds = pcp.get_compounds(cas, namespace='name')
            smiles = compounds[0].isomeric_smiles
        except Exception:
            smiles = None
    # next try cirpy
    if not smiles:
        try:
            smiles = cirpy.resolve(cas, 'smiles')
        except Exception:
            smiles = None
    if isinstance(smiles, list):
        smiles = smiles[0] if smiles else None
    # finally try it as a pubchem substance
    if not smiles:
        try:
            substances = pcp.get_substances(cas, namespace='name')
            # Substance records carry no SMILES themselves; go through the
            # standardized compound each one points to. Several substances
            # may match, so use the compound they most often agree on.
            cids = [s.standard_cid for s in substances if s.standard_cid]
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    return smiles

# function to convert cas to name
def cas2name(cas):
    """Look up a chemical name for a CAS number.

    Sources are tried in order until one yields a name: the chemicals
    package, cirpy, then PubChem compounds. Lookup failures are swallowed
    (best effort); returns None when no source produces a name.
    """
    name = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(cas)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None
    # then try cirpy
    if not name:
        try:
            name = cirpy.resolve(cas, 'iupac_name')
        except Exception:
            name = None
    if isinstance(name, list):
        # guard against an empty list, which used to raise IndexError here
        name = name[0] if name else None
    # next try pubchem for compounds
    if not name:
        try:
            compounds = pcp.get_compounds(cas, namespace='name')
            c = compounds[0]
            # have seen empty iupac_name before, try synonyms if this happens
            name = c.iupac_name or c.synonyms[0]
        except Exception:
            name = None
    return name

# function to convert chemical name to iupac name
def _name2iupac_pubchem(query):
    """Return a name for ``query`` from PubChem compounds, or None on failure."""
    try:
        c = pcp.get_compounds(query, namespace='name')[0]
        # have seen empty iupac_name before, try synonyms if this happens
        return c.iupac_name or c.synonyms[0]
    except Exception:
        return None


def _name2iupac_cirpy(query):
    """Return the IUPAC name cirpy resolves for ``query``, or None on failure."""
    try:
        result = cirpy.resolve(query, 'iupac_name')
    except Exception:
        return None
    if isinstance(result, list):
        result = result[0] if result else None
    return result or None


def name2iupac(string):
    """Convert a chemical name to an IUPAC (or otherwise preferred) name.

    Sources are tried in order until one yields a name: the chemicals
    package, PubChem compounds, cirpy, PubChem substances, and then
    PubChem compounds / cirpy again with all spaces removed from the query.

    Returns:
        (name, origin) where origin is 'chemicals', 'PubChem', 'CIRPY' or
        'PubChem/substance'; (None, None) when nothing was found.
    """
    name = None
    origin = None
    # try chemicals package
    try:
        cm = chemicals.search_chemical(string)
        name = cm.iupac_name or cm.common_name or None
    except Exception:
        name = None
    if name:
        origin = 'chemicals'
    # try pubchem for compounds
    if not name:
        name = _name2iupac_pubchem(string)
        if name:
            origin = 'PubChem'
    # next try cirpy
    if not name:
        name = _name2iupac_cirpy(string)
        if name:
            origin = 'CIRPY'
    # now try it as a pubchem substance
    if not name:
        try:
            substances = pcp.get_substances(string, namespace='name')
            # sometimes there are multiple substances, and multiple synonyms per
            # substance; CAS numbers are skipped because they are not names
            allsyns = [syn for s in substances for syn in s.synonyms if not is_cas(syn)]
            # choose the most common synonym
            name = Counter(allsyns).most_common(1)[0][0]
            origin = 'PubChem/substance'
        except Exception:
            name = None
            origin = None
    # strip all spaces and try again...
    if not name:
        string_strip = string.replace(' ', '')
        # first try pubchem for compounds
        name = _name2iupac_pubchem(string_strip)
        if name:
            origin = 'PubChem'
        # next try cirpy (with the stripped string, like the pubchem retry)
        if not name:
            name = _name2iupac_cirpy(string_strip)
            if name:
                origin = 'CIRPY'
    if not name:
        name, origin = None, None
    return name, origin

# function to convert name to cas
def _name2cas_pick(candidates):
    """Return the valid CAS number with the smallest sum of its numeric parts.

    Entries that are not valid CAS numbers are ignored; returns None when
    none remain. Ties keep the earliest candidate.
    """
    valid = [c for c in candidates if is_cas(c)]
    if not valid:
        return None
    return min(valid, key=lambda s: sum(int(part) for part in s.split('-')))


def name2cas(name):
    """Look up a CAS number for a chemical name.

    Sources are tried in order until one yields a result: the chemicals
    package, cirpy, then the synonyms of the first PubChem compound.
    Lookup failures are swallowed (best effort); returns None when no
    source produces a CAS number.
    """
    cas = None
    # try chemicals package
    try:
        cm = chemicals.search_chemical(name)
        cas = cm.CASs
    except Exception:
        cas = None
    # then try cirpy
    if not cas:
        try:
            cas = cirpy.resolve(name, 'cas')
        except Exception:
            cas = None
    if isinstance(cas, list):
        # if multiple, choose the one with the smallest sum of numeric parts;
        # an empty or malformed list no longer raises here
        cas = _name2cas_pick(cas)
    # next try pubchem for compounds
    if not cas:
        try:
            compounds = pcp.get_compounds(name, namespace='name')
            cas = _name2cas_pick(compounds[0].synonyms)
        except Exception:
            cas = None
    return cas

# function to convert name to SMILES
def name2smiles(name):
    """Look up a SMILES string for a chemical name.

    Sources are tried in order until one yields a result: the chemicals
    package, PubChem compounds, cirpy, and finally PubChem substances.
    Lookup failures are swallowed (best effort); returns None when no
    source produces a SMILES.
    """
    smiles = None
    # first try chemicals package
    try:
        cm = chemicals.search_chemical(name)
        smiles = cm.smiles
    except Exception:
        smiles = None
    # then try pubchem for compounds
    if not smiles:
        try:
            compounds = pcp.get_compounds(name, namespace='name')
            smiles = compounds[0].isomeric_smiles
        except Exception:
            smiles = None
    # next try cirpy
    if not smiles:
        try:
            smiles = cirpy.resolve(name, 'smiles')
        except Exception:
            smiles = None
    if isinstance(smiles, list):
        smiles = smiles[0] if smiles else None
    # then try it as a pubchem substance
    if not smiles:
        try:
            substances = pcp.get_substances(name, namespace='name')
            # Substance records carry no SMILES themselves; go through the
            # standardized compound each one points to. Several substances
            # may match, so use the compound they most often agree on.
            cids = [s.standard_cid for s in substances if s.standard_cid]
            best_cid = Counter(cids).most_common(1)[0][0]
            smiles = pcp.Compound.from_cid(best_cid).isomeric_smiles
        except Exception:
            smiles = None
    # NOTE: a further fallback that resolved the name with a local OPSIN jar
    # was sketched here previously; it is disabled.
    return smiles

def check_cas(cas):
    """Return True when ``cas`` is a well-formed CAS number with a valid check digit.

    A CAS number has the form NNNNNNN-NN-N: 2 to 7 digits, 2 digits and a
    single check digit. The check digit equals the sum of the other digits,
    each multiplied by its position counted from the right, modulo 10.

    Strings that do not have that form return False. A non-string argument
    raises TypeError.
    """
    match = re.fullmatch(r'([0-9]{2,7})-([0-9]{2})-([0-9])', cas)
    if match is None:
        return False
    n1, n2, n3 = match.groups()
    # combine and flip first 2 numbers so positions count from the right
    reversed_digits = (n1 + n2)[::-1]
    # sum of number*position in string, mod 10
    check = sum(pos * int(d) for pos, d in enumerate(reversed_digits, start=1)) % 10
    # if these match, then it's a legit cas number
    return check == int(n3)

def is_cas(cas):
    """Return True when ``cas`` is a valid CAS number, False for anything else.

    Unlike check_cas, this never raises for malformed or non-string input.
    """
    try:
        return check_cas(cas)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed
        return False

# leading number, optionally followed by '-<number>' (a range such as '1.02-1.05')
_DENSITY_VALUE_RE = re.compile(r'((?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?')


def _density_from_comptox_table(cas):
    """Look up the predicted density for a CAS number in the df_pred table."""
    mask = df_pred['CASRN'] == cas
    if not mask.any():
        return None, None
    # take the first matching row; float() on a whole Series fails when
    # several rows share the CAS number
    rho = float(df_pred.loc[mask, 'DENSITY_G/CM^3_TEST_PRED'].iloc[0])
    return rho, 'comptox/pred'


def _density_pubchem_subsections(sections, heading):
    """Yield the child sections of every section whose TOCHeading equals ``heading``."""
    for section in sections or []:
        if section.get('TOCHeading') == heading:
            yield from section.get('Section', [])


def _density_from_pubchem(name):
    """Collect the experimental density values PubChem lists for ``name`` and average them."""
    try:
        cid = pcp.get_compounds(name, namespace='name')[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        content = json.loads(requests.get(url, timeout=30).text)
    except Exception:
        return None, None
    if not isinstance(content, dict):
        return None, None

    top_sections = content.get('Record', {}).get('Section', [])
    properties = _density_pubchem_subsections(top_sections, 'Chemical and Physical Properties')
    experimental = _density_pubchem_subsections(properties, 'Experimental Properties')

    rho_list = []
    for section in experimental:
        if section.get('TOCHeading') != 'Density':
            continue
        for info in section.get('Information', []):
            try:
                rho_string = info['Value']['StringWithMarkup'][0]['String']
            except (KeyError, IndexError, TypeError):
                continue
            rho_string = rho_string.replace('Relative density (water = 1): ', '')
            m = _DENSITY_VALUE_RE.match(rho_string)
            if m is None:
                continue
            # the second group is None when the entry is a single value
            rho_list.extend(float(g) for g in m.groups() if g is not None)
    if not rho_list:
        return None, None

    ## remove outliers using interquartile range (IQR)
    rho_list = np.array(rho_list)
    q75, q25 = np.percentile(rho_list, [75, 25])
    intr_qr = q75 - q25
    hi = q75 + (1.5 * intr_qr)
    lo = q25 - (1.5 * intr_qr)
    rho_list = rho_list[(rho_list <= hi) & (rho_list >= lo)]
    return np.mean(rho_list), 'pubchem'


def _density_from_dsstox(name):
    """Scrape a density for ``name`` from the CompTox dashboard using headless Chrome."""
    import urllib.parse
    import urllib.request

    dtxsid = None
    try:
        # try to find it via the dsstox dashboard
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url, timeout=15) as fid:
            webpage = fid.read().decode('utf-8')
        hits = re.findall(r'DTXSID[0-9]+', webpage)
        if hits:
            dtxsid = hits[0]
    except Exception:
        pass
    if not dtxsid:
        return None, None

    mysoup = None
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    driver = None
    try:
        options = Options()
        options.add_argument("--headless") # runs in background instead of showing browser window
        driver = selenium.webdriver.Chrome(service=Service(driver_exe), options=options)
        driver.set_page_load_timeout(15)
        driver.get(url)
        mysoup = bs4.BeautifulSoup(driver.page_source, features='lxml')
    except Exception:
        mysoup = None
    finally:
        # always shut the browser down, also when the page load timed out
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
    if not mysoup:
        return None, None

    # column of property names
    rows = mysoup.find_all('div', attrs={'col-id':'property'})
    ifound = next((i for i, row in enumerate(rows) if 'Density' in row.text), None)
    if ifound is None:
        # compared against None so that a match in row 0 is not discarded
        return None, None

    # prefer the experimental average, fall back to the predicted average
    for col_id, origin in (('exavg', 'dsstox/expt'), ('predavg', 'dsstox/pred')):
        cells = mysoup.find_all('div', attrs={'col-id': col_id})
        try:
            value = re.sub(r' \([0-9]*\)', '', cells[ifound].text.strip())
            rho = float(value)
        except (IndexError, ValueError):
            continue
        if not pd.isna(rho):
            return rho, origin
    return None, None


def string2density(name):
    """Find a density for a chemical identified by name or CAS number.

    Sources are tried in order until one yields a value: the df_pred table
    (only when ``name`` is a valid CAS number), PubChem experimental
    properties, and - when try_dsstox is enabled - the CompTox dashboard.

    Returns:
        (rho, rho_origin) where rho_origin is 'comptox/pred', 'pubchem',
        'dsstox/expt' or 'dsstox/pred'; (None, None) when nothing was found.
    """
    rho, rho_origin = None, None
    # predicted values from TEST (CompTox dashboard)
    if is_cas(name):
        rho, rho_origin = _density_from_comptox_table(name)
    # try to scrape from PubChem
    if pd.isna(rho):
        rho, rho_origin = _density_from_pubchem(name)
    # try to scrape from DSSTOX
    if try_dsstox and pd.isna(rho):
        rho, rho_origin = _density_from_dsstox(name)
    if pd.isna(rho):
        rho, rho_origin = None, None
    return rho, rho_origin

def return_non_duplicate_index(tuples):
    """Count, per group, the matches that survive overlap removal.

    ``tuples`` is a list of groups; each group is a sequence of matches and
    each match is a sequence of hashable items. Every pair of matches that
    shares an item loses one member: the smaller match, or the later one
    when both have the same size. Matches that have already lost are still
    compared against the remaining ones.

    Returns a list of [group_index, surviving_match_count] pairs sorted by
    group index; groups with no surviving match are omitted.
    """
    ## from https://github.com/curieshicy/JRgui/
    # flatten the groups, remembering which group every match came from
    members = []
    owner = []
    for group_idx, group in enumerate(tuples):
        for match in group:
            members.append(set(match))
            owner.append(group_idx)

    # for every overlapping pair decide which member is dropped
    dropped = set()
    total = len(members)
    for first in range(total):
        for second in range(first + 1, total):
            if members[first].isdisjoint(members[second]):
                continue
            if len(members[first]) < len(members[second]):
                dropped.add(first)
            else:
                dropped.add(second)

    # tally the survivors per originating group
    tally = Counter(owner[pos] for pos in range(total) if pos not in dropped)
    return [[group_idx, tally[group_idx]] for group_idx in sorted(tally)]

def search_func_groups(smiles): ##this is to search functional groups and print out them with numbers
    """Find which functional-group SMARTS patterns occur in a molecule.

    Each pattern in the list below is matched against the molecule; the
    matches are then passed through return_non_duplicate_index to resolve
    overlaps between patterns.

    Returns a list of [pattern_index, count] pairs, where pattern_index is
    the position of the pattern in the SMARTS list below.
    """
    ## from https://github.com/curieshicy/JRgui/
    smarts = ["[$([CX2H0](=*)=*)]", "[$([CX2H1]#[!#7])]", "[$([CX2H0]#[!#7])]", "[OX2H]-[C]=O", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]",
              "[$([#6X3H0](=[OX1]));!$([#6X3](=[#8X1])~[#8X2]);R]=O", "[CH;D2;$(C-!@C)](=O)", "[OX2H;!$([OX2H]-[#6]=[O]);!$([OX2H]-a)]", "[O;H1;$(O-!@c)]", 
              "[#8X2H0;R;!$([#8X2H0]~[#6]=[#8])]", "[$([CX3H0](=[OX1]));!$([CX3](=[OX1])-[OX2]);!R]=O", "[OX2H0;!R;!$([OX2H0]-[#6]=[#8])]", 
              "[$([#7X3,#7X3+][!#8])](=[O])~[O-]", "[OX1H0;!$([OX1H0]~[#6X3]);!$([OX1H0]~[#7X3]~[#8])]", "[#7X2H0;R]",  "[#7X3H1;R]", "[#7X2H1]",
              "[#7X2H0;!R]","[#6X2]#[#7X1H0]","[NX3H2]", "[NX3H1;!R]", "[#7X3H0;!$([#7](~O)~O)]","[SX2H]","[#16X2H0;!R]","[#16X2H0;R]",  "[R;CX3H1,cX3H1]", 
              "[$([R;#6X3H0]);!$([R;#6X3H0]=[#8])]","[R;CX4H2]","[R;CX4H]","[R;CX4H0]", "[CX3H2]", "[!R;CX3H1;!$([CX3H1](=O))]",
              "[$([!R;#6X3H0]);!$([!R;#6X3H0]=[#8])]","[CX4H3]","[!R;CX4H2]", "[!R;CX4H]","[!R;CX4H0]","[F]","[Cl]","[Br]", "[I]"]
    tuples = []
    index_list = []
    m = Chem.MolFromSmiles(str(smiles))
    for index, smart in enumerate(smarts):
        # compile each pattern once and match once; a non-empty result
        # doubles as the "has a match" test
        pattern = Chem.MolFromSmarts(smart)
        matches = m.GetSubstructMatches(pattern) ## this is atom position
        if matches:
            tuples.append(matches)
            index_list.append(index)
    temp = return_non_duplicate_index(tuples) # [[0, 1], [1, 1], [3, 1], [4, 7], [5, 6], [6, 1], [7, 1], [8, 1], [9, 1]]
    # translate positions within the matched subset back to SMARTS list indices
    return [[index_list[pos], count] for pos, count in temp]

def compute_phys_properties(smiles):
    """Estimate the melting point of a molecule by Joback-Reid group contributions.

    The functional groups found by ``search_func_groups`` are weighted by their
    occurrence count and summed using the Tm column of the increment table below.

    Args:
        smiles: SMILES string of the molecule (passed through ``str`` before parsing).

    Returns:
        The estimated melting point in degrees Celsius, or ``None`` when the SMILES
        cannot be parsed or a matched group has no Tm increment in the table.
    """
    ## from https://github.com/curieshicy/JRgui/
    ## method from: K. G. Joback, R. C. Reid, ESTIMATION OF PURE-COMPONENT PROPERTIES FROM GROUP-CONTRIBUTIONS. Chemical Engineering Communications 57, 233-243 (1987).
    ## this doesn't look very accurate, but it's a start
    ##
    ## DB holds one row per SMARTS pattern of search_func_groups (41 rows, same order).
    ## Columns: 0 Tc, 1 Pc, 2 Vc, 3 Tb, 4 Tm, 5 Hfor, 6 Gf, 7 Cpa, 8 Cpb, 9 Cpc, 10 Cpd,
    ##          11 Hfus, 12 Hvap, 13 Ya, 14 Yb.  None marks an increment the table lacks.
    ## Only the Tm column is consumed here; the full table is kept so the remaining
    ## Joback estimates can be derived from it (S = count-weighted column sum):
    ##   Tb   = 198.2 + S(Tb)                     Tm   = 122.5 + S(Tm)
    ##   Tc   = Tb/(0.584 + 0.965*S(Tc) - S(Tc)**2)
    ##   Pc   = 1/(0.113 + 0.0032*n_atoms - S(Pc))**2
    ##   Vc   = 17.5 + S(Vc)                      Hfor = 68.29 + S(Hfor)
    ##   Gf   = 53.88 + S(Gf)                     Hvap = 15.30 + S(Hvap)
    ##   Hfus = -0.88 + S(Hfus)
    ##   Cp   = (S(Cpa) - 37.93) + (S(Cpb) + 0.210)*T + (S(Cpc) - 3.91e-4)*T**2 + (S(Cpd) + 2.06e-7)*T**3
    ##   visc = MW*exp((S(Ya) - 597.82)/T + S(Yb) - 11.202)
    TM_COLUMN = 4
    DB = [[0.0026, 0.0028, 36, 26.15, 17.78, 142.14, 136.70, 2.74E+1, -5.57E-2, 1.01E-4, -5.02E-8, 4.720, 2.661, None, None],
          [0.0027, -0.0008, 46, 9.20, -11.18, 79.30, 77.71, 2.45E+1, -2.71E-2, 1.11E-4, -6.78E-8, 2.322, 1.155, None, None],
          [0.0020, 0.0016, 37, 27.38, 64.32, 115.51, 109.82, 7.87, 2.01E-2, -8.33E-6, 1.39E-9, 4.151, 3.302, None, None],
          [0.0791, 0.0077, 89, 169.09, 155.50, -426.72, -387.87, 2.41E+1, 4.27E-2, 8.04E-5, -6.87E-8, 11.051, 19.537, 1317.23, -2.578],
          [0.0481, 0.0005, 82, 81.10, 53.60, -337.92, -301.95, 2.45E+1, 4.02E-2, 4.02E-5, -4.52E-8, 6.959, 9.633, 483.88, -0.966],
          [0.0284, 0.0028, 55, 94.97, 75.97, -164.50, -126.27, 3.04E+1, -8.29E-2, 2.36E-4, -1.31E-7, None, 6.645, None, None],
          [0.0379, 0.0030, 82, 72.24, 36.90, -162.03, -143.48, 3.09E+1, -3.36E-2, 1.60E-4, -9.88E-8, 3.197, 9.093, 740.92, -1.713],
          [0.0741, 0.0112, 28, 92.88, 44.45, -208.04, -189.20, 2.57E+1, -6.91E-2, 1.77E-4, -9.88E-8, 2.406, 16.826, 2173.72, -5.057],
          [0.0240, 0.0184, -25, 76.34, 82.83, -221.65, -197.37, -2.81, 1.11E-1, -1.16E-4, 4.94E-8, 4.490, 12.499, 3018.17, -7.314],
          [0.0098, 0.0048, 13, 31.22, 23.05, -138.16, -98.22, 1.22E+1, -1.26E-2, 6.03E-5, -3.86E-8, 5.879, 4.682, 440.24, -0.953],
          [0.0380, 0.0031, 62, 76.75, 61.20, -133.22, -120.50, 6.45, 6.70E-2, -3.57E-5, 2.86E-9, 4.189, 8.972, 340.35, -0.350],
          [0.0168, 0.0015, 18, 22.42, 22.23, -132.22, -105.00, 2.55E+1, -6.32E-2, 1.11E-4, -5.48E-8, 1.188, 2.410, 122.09, -0.386],
          [0.0437, 0.0064, 91, 152.54, 127.24, -66.57, -16.83, 2.59E+1, -3.74E-3, 1.29E-4, -8.88E-8, 9.679, 16.738, None, None],
          [0.0143, 0.0101, 36, -10.50, 2.08, -247.61, -250.83, 6.82, 1.96E-2, 1.27E-5, -1.78E-8, 3.624, 5.909, 675.24, -1.340],
          [0.0085, 0.0076, 34, 57.55, 68.40, 55.52, 79.93, 8.83, -3.84E-3, 4.35E-5, -2.60E-8, 3.649, 6.528, None, None],
          [0.0130, 0.0114, 29, 52.82, 101.51, 31.65, 75.61, 1.18E+1, -2.30E-2, 1.07E-4, -6.28E-8, 7.490, 6.930, None, None],
          [None, None, None, 83.08, 68.91, 93.70, 119.66, 5.69, -4.12E-3, 1.28E-4, -8.88E-8, None, 12.169, None, None],
          [0.0255, -0.0099, None, 74.60, None, 23.61, None, None, None, None, None, None, 3.335, None, None],
          [0.0496, -0.0101, 91, 125.66, 59.89, 88.43, 89.22, 3.65E+1, -7.33E-2, 1.84E-4, -1.03E-7, 2.414, 12.851, None, None],
          [0.0243, 0.0109, 38, 73.23, 66.89, -2.02, 14.07, 2.69E+1, -4.12E-2, 1.64E-4, -9.76E-8, 3.515, 10.788, None, None],
          [0.0295, 0.0077, 35, 50.17, 52.66, 53.47, 89.39, -1.21, 7.62E-2, -4.86E-5, 1.05E-8, 5.009, 6.436, None, None],
          [0.0169, 0.0074, 9, 11.74, 48.84, 123.34, 163.16, -3.11E+1, 2.27E-1, -3.20E-4, 1.46E-7, 4.703, 1.896, None, None],
          [0.0031, 0.0084, 63, 63.56, 20.09, -17.33, -22.99, 3.53E+1, -7.58E-2, 1.85E-4, -1.03E-7, 2.360, 6.884, None, None],
          [0.0119, 0.0049, 54, 68.78, 34.40, 41.87, 33.12, 1.96E+1, -5.61E-3, 4.02E-5, -2.76E-8, 4.130, 6.817, None, None],
          [0.0019, 0.0051, 38, 52.10, 79.93, 39.10, 27.76, 1.67E+1, 4.81E-3, 2.77E-5, -2.11E-8, 1.557, 5.984, None, None],
          [0.0082, 0.0011, 41, 26.73, 8.13, 2.09, 11.30, -2.14, 5.74E-2, -1.64E-6, -1.59E-8, 1.101, 2.544, 259.65, -0.702],
          [0.0143, 0.0008, 32, 31.01, 37.02, 46.43, 54.05, -8.25, 1.01E-1, -1.42E-4, 6.78E-8, 2.394, 3.059, -245.74, 0.912],
          [0.0100, 0.0025, 48, 27.15, 7.75, -26.80, -3.68, -6.03, 8.54E-2, -8.00E-6, -1.80E-8, 0.490, 2.398, 307.53, -0.798],
          [0.0122, 0.0004, 38, 21.78, 19.88, 8.67, 40.99, -2.05E+1, 1.62E-1, -1.60E-4, 6.24E-8, 3.243, 1.942, -394.29, 1.251],
          [0.0042, 0.0061, 27, 21.32, 60.15, 79.72, 87.88, -9.09E+1, 5.57E-1, -9.00E-4, 4.69E-7, -1.373, 0.644, None, None],
          [0.0113, -0.0028, 56, 18.18, -4.32, -9.630, 3.77, 2.36E+1, -3.81E-2, 1.72E-4, -1.03E-7, -0.473, 1.724, 495.01, -1.539],
          [0.0129, -0.0006, 46, 24.96, 8.73, 37.97, 48.53, -8.00, 1.05E-1, -9.63E-5, 3.56E-8, 2.691, 2.205, 82.28, -0.242],
          [0.0117, 0.0011, 38, 24.14, 11.14, 83.99, 92.36, -2.81E+1, 2.08E-1, -3.06E-4, 1.46E-7, 3.063, 2.138, None, None],
          [0.0141, -0.0012, 65, 23.58, -5.10, -76.45, -43.96, 1.95E+1, -8.08E-3, 1.53E-4, -9.67E-8, 0.908, 2.373, 548.29, -1.719],
          [0.0189, 0.0000, 56, 22.88, 11.27, -20.64, 8.42, -9.09E-1, 9.50E-2, -5.44E-5, 1.19E-8, 2.590, 2.226, 94.16, -0.199],
          [0.0164, 0.0020, 41, 21.74, 12.64, 29.89, 58.36, -2.30E+1, 2.04E-1, -2.65E-4, 1.20E-7, 0.749, 1.691, -322.15, 1.187],
          [0.0067, 0.0043, 27, 18.25, 46.43, 82.23, 116.02, -6.62E+1, 4.27E-1, -6.41E-4, 3.01E-7, -1.460, 0.636, -573.56, 2.307],
          [0.0111, -0.0057, 27, -0.03, -15.78, -251.92, -247.19, 2.65E+1, -9.13E-2, 1.91E-4, -1.03E-7, 1.398, -0.670, None, None],
          [0.0105, -0.0049, 58, 38.13, 13.55, -71.55, -64.31, 3.33E+1, -9.63E-2, 1.87E-4, -9.96E-8, 2.515, 4.532, 625.45, -1.814],
          [0.0133, 0.0057, 71, 66.86, 43.43, -29.48, -38.06, 2.86E+1, -6.49E-2, 1.36E-4, -7.45E-8, 3.603, 6.582, 738.91, -2.038],
          [0.0068, -0.0034, 97, 93.84, 41.69, 21.06, 5.74, 3.21E+1, -6.41E-2, 1.26E-4, -6.87E-8, 2.724, 9.520, 809.55, -2.224]]

    ## an unparseable SMILES has no estimate
    mol = Chem.MolFromSmiles(str(smiles))
    if mol is None:
        return None

    tm_contributions = []
    for group_index, count in search_func_groups(smiles):
        increment = DB[group_index][TM_COLUMN]
        if increment is None:
            ## the table has no Tm increment for this group, so no estimate is possible
            return None
        tm_contributions.append(count*increment)

    ## 122.5 + sum gives the Joback melting point in kelvin; report it in degC
    melting_point_kelvin = 122.5 + sum(tm_contributions)
    return melting_point_kelvin-273.15

def _parse_melting_point_string(mp_string):
    """Return the degC value(s) found at the start of a PubChem melting-point text.

    Recognizes a leading number or range followed by a Celsius or Fahrenheit unit
    (e.g. "80 °C", "80-82 °C", "176 °F"); Fahrenheit values are converted to Celsius.
    Returns an empty list when the text does not start with such a value.
    """
    patterns = (
        (r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?C)', False),
        (r'(-?(?:\d+(?:\.\d*)?|\.\d+))(?:-((?:\d+(?:\.\d*)?|\.\d+)))?( ?°?F)', True),
    )
    values = []
    for pattern, is_fahrenheit in patterns:
        m = re.match(pattern, mp_string)
        if m is None:
            continue
        # group 1 is the (possibly negative) value, group 2 the optional upper end of a range
        for g in m.group(1, 2):
            if g is None:
                continue
            tmp_mp = float(g)
            if is_fahrenheit:
                # (T/F − 32) × 5/9 = T/C
                tmp_mp = (tmp_mp-32)*5/9
            values.append(tmp_mp)
    return values


def _pubchem_melting_point(name, namespace):
    """Return the mean experimental melting point (degC) listed on PubChem, or None."""
    try:
        compounds = pcp.get_compounds(name, namespace=namespace)
        cid = compounds[0].cid
        url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
        content = json.loads(requests.get(url).text)
    except Exception:
        # best effort: unknown compound, no network, malformed reply, ...
        return None
    if not isinstance(content, dict):
        return None

    def subsections(section, heading):
        # child sections of `section` carrying the given TOC heading
        return [s for s in section.get('Section', []) if s.get('TOCHeading') == heading]

    mp_list = []
    for props in subsections(content.get('Record', {}), 'Chemical and Physical Properties'):
        for expt in subsections(props, 'Experimental Properties'):
            for mp_section in subsections(expt, 'Melting Point'):
                for info in mp_section.get('Information', []):
                    try:
                        mp_string = info['Value']['StringWithMarkup'][0]['String']
                        mp_list.extend(_parse_melting_point_string(mp_string))
                    except (KeyError, IndexError, TypeError):
                        # entry without a usable text value
                        continue
    if not mp_list:
        return None

    ## remove outliers using interquartile range (IQR)
    mp_list = np.array(mp_list)
    q75,q25 = np.percentile(mp_list,[75,25])
    intr_qr = q75-q25
    hi = q75+(1.5*intr_qr)
    lo = q25-(1.5*intr_qr)
    mask = (mp_list <= hi) & (mp_list >= lo)
    return np.mean(mp_list[mask])


def _dsstox_melting_point(name):
    """Scrape (mp, origin) from the EPA CompTox dashboard; (None, None) if nothing is found."""
    # make sure the urllib submodules are loaded; a bare `import urllib` does not guarantee it
    import urllib.parse
    import urllib.request

    # try to find it via the dsstox dashboard
    try:
        name_urlsafe = urllib.parse.quote(name)
        url = f'https://comptox.epa.gov/dashboard/search-results?input_type=synonym_substring&inputs={name_urlsafe}'
        with urllib.request.urlopen(url) as fid:
            webpage = fid.read().decode('utf-8')
        hits = re.findall(r'DTXSID[0-9]+', webpage)
    except Exception:
        return None, None
    if not hits:
        return None, None
    dtxsid = hits[0]

    # the properties page is rendered client-side, so it is loaded in a headless browser
    url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
    try:
        options = Options()
        options.add_argument("--headless") # runs in background instead of showing browser window
        service = Service(driver_exe)
        driver = selenium.webdriver.Chrome(service=service, options=options)
        try:
            driver.set_page_load_timeout(15)
            driver.get(url)
            webpage = driver.page_source
        finally:
            # always release the browser process, including after a page-load timeout
            driver.quit()
        mysoup = bs4.BeautifulSoup(webpage, features='lxml')
    except Exception:
        return None, None

    # column of property names
    ifound = None
    for i,row in enumerate(mysoup.find_all('div', attrs={'col-id':'property'})):
        if 'Melting Point' in row.text:
            ifound = i
            break
    # compare against None: index 0 is a valid row position
    if ifound is None:
        return None, None

    def cell_value(col_id):
        # numeric content of the melting-point row in the given column, or None
        cells = mysoup.find_all('div', attrs={'col-id':col_id})
        try:
            return float(re.sub(r' \([0-9]*\)', '', cells[ifound].text.strip()))
        except (IndexError, ValueError):
            return None

    # prefer the experimental average, fall back to the predicted average
    mp = cell_value('exavg')
    if not pd.isna(mp):
        return mp, 'dsstox/expt'
    mp = cell_value('predavg')
    if not pd.isna(mp):
        return mp, 'dsstox/pred'
    return None, None


def string2mp(name, namespace='name'):
    """Look up a melting point for a chemical identifier.

    Sources are tried in order until one yields a value:
      1. for CAS numbers only: the ``chemicals`` package, then the ``dfmp_expt`` table,
         then the OPERA prediction in ``df_pred``;
      2. experimental melting points listed on PubChem (outliers removed, then averaged);
      3. the EPA CompTox dashboard (only when ``try_dsstox`` is enabled).

    Args:
        name: identifier to look up (CAS number, name, ...).
        namespace: PubChem namespace used to resolve ``name``.

    Returns:
        ``(mp, mp_origin)``: the melting point and a tag naming the source that
        supplied it, or ``(None, None)`` when no source has a value. Values from
        ``chemicals`` and PubChem are converted to degC here; the other sources are
        used as given (presumably degC as well -- confirm for ``dfmp_expt``).
    """
    mp, mp_origin = None, None
    if is_cas(name):
        cas = name
        # try the chemicals package (value is converted from kelvin)
        mp = chemicals.Tm(cas)
        if mp is not None: mp = mp-273.15
        methods = chemicals.Tm_methods(cas)
        if methods: mp_origin = 'chem/'+methods[0]
        # experimental values
        if pd.isna(mp):
            # some rows have multiple comma-separated cas numbers
            mask = dfmp_expt['CAS'].str.contains(cas)
            if sum(mask):
                subdf = dfmp_expt[mask]['CAS'].str.split(', ')
                for index,subcas in zip(subdf.index,subdf):
                    if cas in subcas:
                        # NOTE(review): index labels are used positionally here; this
                        # assumes dfmp_expt has a default RangeIndex -- confirm
                        mp = float(dfmp_expt.iloc[index]['MP'])
                        mp_origin = 'expt'
        # predicted values from OPERA (CompTox dashboard)
        if pd.isna(mp):
            mask = df_pred['CASRN'] == cas
            if sum(mask):
                mp = float(df_pred[mask]['MELTING_POINT_DEGC_OPERA_PRED'].iloc[0])
                mp_origin = 'comptox/pred'
    # try to scrape from PubChem
    if pd.isna(mp):
        mp = _pubchem_melting_point(name, namespace)
        mp_origin = None if pd.isna(mp) else 'pubchem'
    # try to scrape from DSSTOX website...
    if try_dsstox and pd.isna(mp):
        mp, mp_origin = _dsstox_melting_point(name)
    if pd.isna(mp): mp, mp_origin = None, None
    return mp, mp_origin

def smiles2mp(smiles):
    """Estimate a melting point from a SMILES string via ``compute_phys_properties``.

    Only molecules made exclusively of C, N, O, F, S, Cl, Br and I atoms are handled.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The value returned by ``compute_phys_properties``, or ``None`` when the SMILES
        cannot be parsed, contains an unsupported element, or the estimate fails.
    """
    SUPPORTED_ATOM_SET = {6, 7, 8, 9, 16, 17, 35, 53}
    try:
        m = Chem.MolFromSmiles(str(smiles))
        if m is None:
            return None
        atom_num_set = {a.GetAtomicNum() for a in m.GetAtoms()}
        if not atom_num_set.issubset(SUPPORTED_ATOM_SET):
            return None
        return compute_phys_properties(smiles)
    except Exception:
        # best effort: any failure of the estimate means "no value"
        return None

def smiles2mp_opera(smiles):
    """Predict a melting point with the k-nearest-neighbour model in ``my_opera_data_mp``.

    PaDEL descriptors are computed for the molecule (descriptor set
    'mp/descriptors.xml'), non-numeric or missing descriptor values are replaced by
    0.0, and the model's descriptor columns are scaled, fed to the regressor, and the
    prediction is scaled back.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The single predicted value, in the units of the model's target scaler.
    """
    raw_descriptors = padelpy.from_smiles(smiles, descriptortypes='mp/descriptors.xml')

    # one-row frame of floats; anything that does not parse as a number becomes 0.0
    frame = pd.DataFrame(raw_descriptors, index=[0])
    frame = frame.apply(pd.to_numeric, errors="coerce")
    frame = frame.fillna(0.0).astype(float)

    model = my_opera_data_mp
    features = np.array(frame[model.desc_list])
    scaled_features = model.scaler_X.transform(features)
    scaled_prediction = model.knn_all.predict(scaled_features)
    prediction = model.scaler_y.inverse_transform(scaled_prediction)
    return prediction[0][0]

def mol2mp(cas, name, smiles):
    """Determine a melting point from whichever identifiers are available.

    Sources are tried in order until one yields a value:
      1. OPERA model prediction from ``smiles`` (``smiles2mp_opera``);
      2. ``string2mp`` lookup by ``cas``;
      3. ``string2mp`` lookup by ``name``;
      4. Joback-Reid estimate from ``smiles`` (``smiles2mp``).

    Args:
        cas: CAS number, or a falsy value if unknown.
        name: chemical name, or a falsy value if unknown.
        smiles: SMILES string, or a falsy value if unknown.

    Returns:
        ``(mp, mp_origin)``: the melting point and a tag naming its source, or
        ``(None, None)`` when every source fails.
    """
    mp, mp_origin = None, None
    if smiles:
        try:
            mp = smiles2mp_opera(smiles)
            mp_origin = 'opera/calc'
        except Exception:
            mp, mp_origin = None, None
    if pd.isna(mp) and cas:
        try:
            mp, mp_origin = string2mp(cas)
        except Exception:
            mp, mp_origin = None, None
    if pd.isna(mp) and name:
        try:
            mp, mp_origin = string2mp(name)
        except Exception:
            mp, mp_origin = None, None
    if pd.isna(mp) and smiles:
        try:
            mp = smiles2mp(smiles)
            mp_origin = 'joback-reid/calc'
        except Exception:
            mp, mp_origin = None, None
    # a source may report "no value" without raising (smiles2mp returns None);
    # never pair a missing value with an origin tag
    if pd.isna(mp):
        mp, mp_origin = None, None
    return mp, mp_origin

def getLogP(cas,mol):
    """Return the octanol-water partition coefficient (logP) and its origin.

    The OPERA prediction stored in ``df_pred`` is used when ``cas`` is found there;
    otherwise the value is computed from ``mol`` with RDKit's Crippen method.

    Args:
        cas: CAS number to look up in ``df_pred``, or a falsy value to skip the lookup.
        mol: RDKit molecule used for the fallback calculation, or a falsy value.

    Returns:
        ``(LogP, LogP_origin)`` where the origin is 'comptox/pred' or 'rdkit/calc';
        ``(None, None)`` when neither source is available.
    """
    LogP, LogP_origin = None, None
    if cas:
        mask = df_pred['CASRN'] == cas
        if mask.any():
            # take the first matching row: float() on the whole selection fails when
            # several rows share the CAS number
            LogP = float(df_pred[mask]['OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED'].iloc[0])
            LogP_origin = 'comptox/pred'
    if pd.isna(LogP) and mol:
        LogP = Crippen.MolLogP(mol)
        LogP_origin = 'rdkit/calc'
    return LogP, LogP_origin