robert.elder committed on
Commit ·
41b73a8
1
Parent(s): c8e574b
better error checking on selenium codeblocks
Browse files
ChemID.py
CHANGED
|
@@ -39,7 +39,7 @@ dfmp_expt = pd.read_csv('PHYSPROP_MP_data.tsv', sep='\t')
|
|
| 39 |
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
|
| 40 |
df_pred = pd.read_csv('Comptox_pred_data.tsv', sep='\t')
|
| 41 |
|
| 42 |
-
def ResolveChemical(chemName, IDtype):
|
| 43 |
|
| 44 |
#LogP_func = Crippen.MolLogP
|
| 45 |
LogP_func = getLogP
|
|
@@ -52,6 +52,9 @@ def ResolveChemical(chemName, IDtype):
|
|
| 52 |
rho = None
|
| 53 |
mp = None
|
| 54 |
im64 = None
|
|
|
|
|
|
|
|
|
|
| 55 |
error = 0
|
| 56 |
|
| 57 |
if IDtype == 'CAS':
|
|
@@ -156,7 +159,10 @@ def ResolveChemical(chemName, IDtype):
|
|
| 156 |
if not cas:
|
| 157 |
cas = 'Not found'
|
| 158 |
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
#Generates an image of the molecule represented by the SMILES code given.
|
| 162 |
#Returns None if the image cannot be generated. From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py
|
|
@@ -498,7 +504,7 @@ def string2density(name):
|
|
| 498 |
rho = float(df_pred[mask]['DENSITY_G/CM^3_TEST_PRED'])
|
| 499 |
rho_origin = 'pred'
|
| 500 |
# try to scrape from PubChem
|
| 501 |
-
if rho
|
| 502 |
try:
|
| 503 |
content = None
|
| 504 |
compounds = pcp.get_compounds(name, namespace='name')
|
|
@@ -551,7 +557,7 @@ def string2density(name):
|
|
| 551 |
rho, rho_origin = None, None
|
| 552 |
# try to scrape from DSSTOX
|
| 553 |
if try_dsstox:
|
| 554 |
-
if rho
|
| 555 |
dtxsid = None
|
| 556 |
try:
|
| 557 |
# try to find it via the dsstox dashboard
|
|
@@ -565,6 +571,7 @@ def string2density(name):
|
|
| 565 |
except:
|
| 566 |
pass
|
| 567 |
if dtxsid:
|
|
|
|
| 568 |
url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
|
| 569 |
#print(url)
|
| 570 |
try:
|
|
@@ -575,45 +582,46 @@ def string2density(name):
|
|
| 575 |
driver = selenium.webdriver.Chrome(service=service, options=options)
|
| 576 |
driver.set_page_load_timeout(15)
|
| 577 |
driver.get(url)
|
|
|
|
|
|
|
|
|
|
| 578 |
except KeyboardInterrupt:
|
| 579 |
raise
|
| 580 |
except:
|
| 581 |
pass
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
if
|
| 591 |
-
|
| 592 |
-
break
|
| 593 |
-
if ifound:
|
| 594 |
-
rows = mysoup.find_all('div', attrs={'col-id':'exavg'})
|
| 595 |
-
text = rows[ifound].text
|
| 596 |
-
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 597 |
-
try:
|
| 598 |
-
rho = float(value)
|
| 599 |
-
rho_origin = 'expt/dsstox'
|
| 600 |
-
except:
|
| 601 |
-
rho, rho_origin = None, None
|
| 602 |
-
if rho is None:
|
| 603 |
-
rows = mysoup.find_all('div', attrs={'col-id':'predavg'})
|
| 604 |
text = rows[ifound].text
|
| 605 |
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 606 |
try:
|
| 607 |
rho = float(value)
|
| 608 |
-
rho_origin = '
|
| 609 |
except:
|
| 610 |
rho, rho_origin = None, None
|
| 611 |
-
|
| 612 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
else:
|
| 614 |
rho, rho_origin = None, None
|
| 615 |
-
if
|
| 616 |
-
|
|
|
|
| 617 |
return rho, rho_origin
|
| 618 |
|
| 619 |
def return_non_duplicate_index(tuples): ##Given a list of sets return index of non_duplicate items
|
|
@@ -960,6 +968,7 @@ def string2mp(name):
|
|
| 960 |
except:
|
| 961 |
pass
|
| 962 |
if dtxsid:
|
|
|
|
| 963 |
url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
|
| 964 |
#print(url)
|
| 965 |
try:
|
|
@@ -975,39 +984,40 @@ def string2mp(name):
|
|
| 975 |
#driver = selenium.webdriver.Chrome(driver_exe, options=options)
|
| 976 |
driver.set_page_load_timeout(15)
|
| 977 |
driver.get(url)
|
|
|
|
|
|
|
|
|
|
| 978 |
except:
|
| 979 |
pass
|
| 980 |
#print("timeout")
|
| 981 |
-
webpage = driver.page_source
|
| 982 |
-
driver.quit()
|
| 983 |
-
mysoup = bs4.BeautifulSoup(webpage, features='lxml')
|
| 984 |
# column of property names
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 995 |
-
try:
|
| 996 |
-
mp = float(value)
|
| 997 |
-
mp_origin = 'expt/dsstox'
|
| 998 |
-
except:
|
| 999 |
-
mp, mp_origin = None, None
|
| 1000 |
-
if mp is None:
|
| 1001 |
-
rows = mysoup.find_all('div', attrs={'col-id':'predavg'})
|
| 1002 |
text = rows[ifound].text
|
| 1003 |
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 1004 |
try:
|
| 1005 |
mp = float(value)
|
| 1006 |
-
mp_origin = '
|
| 1007 |
except:
|
| 1008 |
mp, mp_origin = None, None
|
| 1009 |
-
|
| 1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
else:
|
| 1012 |
mp, mp_origin = None, None
|
| 1013 |
if mp is not None and np.isnan(mp): mp = None
|
|
|
|
| 39 |
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
|
| 40 |
df_pred = pd.read_csv('Comptox_pred_data.tsv', sep='\t')
|
| 41 |
|
| 42 |
+
def ResolveChemical(chemName, IDtype, debug=False):
|
| 43 |
|
| 44 |
#LogP_func = Crippen.MolLogP
|
| 45 |
LogP_func = getLogP
|
|
|
|
| 52 |
rho = None
|
| 53 |
mp = None
|
| 54 |
im64 = None
|
| 55 |
+
mp_origin = None
|
| 56 |
+
rho_origin = None
|
| 57 |
+
LogP_origin = None
|
| 58 |
error = 0
|
| 59 |
|
| 60 |
if IDtype == 'CAS':
|
|
|
|
| 159 |
if not cas:
|
| 160 |
cas = 'Not found'
|
| 161 |
|
| 162 |
+
if debug:
|
| 163 |
+
return (name, cas, smiles, Mw, LogP, LogP_origin, rho, rho_origin, mp, mp_origin, im64, error)
|
| 164 |
+
else:
|
| 165 |
+
return (name, cas, smiles, Mw, LogP, rho, mp, im64, error)
|
| 166 |
|
| 167 |
#Generates an image of the molecule represented by the SMILES code given.
|
| 168 |
#Returns None if the image cannot be generated. From https://github.com/ronaldo-prata/flask-test/blob/master/functions.py
|
|
|
|
| 504 |
rho = float(df_pred[mask]['DENSITY_G/CM^3_TEST_PRED'])
|
| 505 |
rho_origin = 'pred'
|
| 506 |
# try to scrape from PubChem
|
| 507 |
+
if pd.isna(rho):
|
| 508 |
try:
|
| 509 |
content = None
|
| 510 |
compounds = pcp.get_compounds(name, namespace='name')
|
|
|
|
| 557 |
rho, rho_origin = None, None
|
| 558 |
# try to scrape from DSSTOX
|
| 559 |
if try_dsstox:
|
| 560 |
+
if pd.isna(rho):
|
| 561 |
dtxsid = None
|
| 562 |
try:
|
| 563 |
# try to find it via the dsstox dashboard
|
|
|
|
| 571 |
except:
|
| 572 |
pass
|
| 573 |
if dtxsid:
|
| 574 |
+
mysoup = None
|
| 575 |
url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
|
| 576 |
#print(url)
|
| 577 |
try:
|
|
|
|
| 582 |
driver = selenium.webdriver.Chrome(service=service, options=options)
|
| 583 |
driver.set_page_load_timeout(15)
|
| 584 |
driver.get(url)
|
| 585 |
+
webpage = driver.page_source
|
| 586 |
+
driver.quit()
|
| 587 |
+
mysoup = bs4.BeautifulSoup(webpage, features='lxml')
|
| 588 |
except KeyboardInterrupt:
|
| 589 |
raise
|
| 590 |
except:
|
| 591 |
pass
|
| 592 |
+
if mysoup:
|
| 593 |
+
ifound = None
|
| 594 |
+
# column of property names
|
| 595 |
+
rows = mysoup.find_all('div', attrs={'col-id':'property'})
|
| 596 |
+
for i,row in enumerate(rows):
|
| 597 |
+
if 'Density' in row.text:
|
| 598 |
+
ifound = i
|
| 599 |
+
break
|
| 600 |
+
if ifound:
|
| 601 |
+
rows = mysoup.find_all('div', attrs={'col-id':'exavg'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
text = rows[ifound].text
|
| 603 |
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 604 |
try:
|
| 605 |
rho = float(value)
|
| 606 |
+
rho_origin = 'expt/dsstox'
|
| 607 |
except:
|
| 608 |
rho, rho_origin = None, None
|
| 609 |
+
if pd.isna(rho):
|
| 610 |
+
rows = mysoup.find_all('div', attrs={'col-id':'predavg'})
|
| 611 |
+
text = rows[ifound].text
|
| 612 |
+
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 613 |
+
try:
|
| 614 |
+
rho = float(value)
|
| 615 |
+
rho_origin = 'pred/dsstox'
|
| 616 |
+
except:
|
| 617 |
+
rho, rho_origin = None, None
|
| 618 |
+
else:
|
| 619 |
+
rho, rho_origin = None, None
|
| 620 |
else:
|
| 621 |
rho, rho_origin = None, None
|
| 622 |
+
if pd.isna(rho):
|
| 623 |
+
rho = None
|
| 624 |
+
rho_origin = None
|
| 625 |
return rho, rho_origin
|
| 626 |
|
| 627 |
def return_non_duplicate_index(tuples): ##Given a list of sets return index of non_duplicate items
|
|
|
|
| 968 |
except:
|
| 969 |
pass
|
| 970 |
if dtxsid:
|
| 971 |
+
mysoup = None
|
| 972 |
url = f'https://comptox.epa.gov/dashboard/chemical/properties/{dtxsid}'
|
| 973 |
#print(url)
|
| 974 |
try:
|
|
|
|
| 984 |
#driver = selenium.webdriver.Chrome(driver_exe, options=options)
|
| 985 |
driver.set_page_load_timeout(15)
|
| 986 |
driver.get(url)
|
| 987 |
+
webpage = driver.page_source
|
| 988 |
+
driver.quit()
|
| 989 |
+
mysoup = bs4.BeautifulSoup(webpage, features='lxml')
|
| 990 |
except:
|
| 991 |
pass
|
| 992 |
#print("timeout")
|
|
|
|
|
|
|
|
|
|
| 993 |
# column of property names
|
| 994 |
+
if mysoup:
|
| 995 |
+
ifound = None
|
| 996 |
+
rows = mysoup.find_all('div', attrs={'col-id':'property'})
|
| 997 |
+
for i,row in enumerate(rows):
|
| 998 |
+
if 'Melting Point' in row.text:
|
| 999 |
+
ifound = i
|
| 1000 |
+
break
|
| 1001 |
+
if ifound:
|
| 1002 |
+
rows = mysoup.find_all('div', attrs={'col-id':'exavg'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1003 |
text = rows[ifound].text
|
| 1004 |
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 1005 |
try:
|
| 1006 |
mp = float(value)
|
| 1007 |
+
mp_origin = 'expt/dsstox'
|
| 1008 |
except:
|
| 1009 |
mp, mp_origin = None, None
|
| 1010 |
+
if mp is None:
|
| 1011 |
+
rows = mysoup.find_all('div', attrs={'col-id':'predavg'})
|
| 1012 |
+
text = rows[ifound].text
|
| 1013 |
+
value = re.sub(' \([0-9]*\)', '', text.strip())
|
| 1014 |
+
try:
|
| 1015 |
+
mp = float(value)
|
| 1016 |
+
mp_origin = 'pred/dsstox'
|
| 1017 |
+
except:
|
| 1018 |
+
mp, mp_origin = None, None
|
| 1019 |
+
else:
|
| 1020 |
+
mp, mp_origin = None, None
|
| 1021 |
else:
|
| 1022 |
mp, mp_origin = None, None
|
| 1023 |
if mp is not None and np.isnan(mp): mp = None
|