Robert Elder committed on
Commit ·
d33329a
1
Parent(s): 53e90d9
quantity module + qrf updates
Browse files- .gitignore +3 -0
- ChemID.py +6 -6
- color3_module/colors.py +1 -1
- Comptox_pred_data.tsv → data/Comptox_pred_data.tsv +0 -0
- MnPC.sdf → data/MnPC.sdf +0 -0
- PHYSPROP_MP_data.tsv → data/PHYSPROP_MP_data.tsv +0 -0
- ceramics_list.txt → data/ceramics_list.txt +0 -0
- custom_chemicals_db.tsv → data/custom_chemicals_db.tsv +0 -0
- salt_list.txt → data/salt_list.txt +0 -0
- exposure3_module/exposure.py +1 -1
- qrf/db-D-interp-allT-semiclean.xlsx +3 -0
- qrf_functions.py → qrf/functions.py +20 -5
- qrf/mordred-descriptors.xlsx +3 -0
- qrf/qrf_model_bundle_25.pkl +3 -0
- qrf/qrf_model_bundle_30.pkl +3 -0
- qrf/qrf_model_bundle_35.pkl +3 -0
- qrf_model_bundle_37.pkl → qrf/qrf_model_bundle_37.pkl +2 -2
- qrf/qrf_model_bundle_40.pkl +3 -0
- qrf/qrf_model_bundle_45.pkl +3 -0
- qrf/qrf_model_bundle_50.pkl +3 -0
- qrf/qrf_model_bundle_55.pkl +3 -0
- qrf/qrf_model_bundle_60.pkl +3 -0
- qrf/qrf_model_bundle_65.pkl +3 -0
- qrf/qrf_model_bundle_70.pkl +3 -0
- qrf/qrf_model_bundle_75.pkl +3 -0
- qrf/qrf_parameters_allT.xlsx +3 -0
- qrf/qrf_train.py +72 -0
- qrf_x.xlsx → qrf/qrf_x.xlsx +0 -0
- qrf_y.xlsx → qrf/qrf_y.xlsx +0 -0
- qrf_train.py +0 -36
- quantity_functions.py → quantity_module/functions.py +0 -0
- quantity_module/quantity.py +21 -9
- quantity_module/templates/quantity_index.html +1 -1
- quantity_module/templates/quantity_report.html +7 -0
.gitignore
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
__pycache__/*
|
| 2 |
.idea/*
|
| 3 |
.DS_Store
|
|
|
|
| 1 |
+
polymer_names.tsv.bkup
|
| 2 |
+
quantity_module/data/copy-data.sh
|
| 3 |
+
qrf/copy-data.sh
|
| 4 |
__pycache__/*
|
| 5 |
.idea/*
|
| 6 |
.DS_Store
|
ChemID.py
CHANGED
|
@@ -15,10 +15,10 @@ import json
|
|
| 15 |
|
| 16 |
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}
|
| 17 |
METAL_ATOM_SET = set([3,4,11,12,13] + list(range(19,31+1)) + list(range(37,50+1)) + list(range(55,84+1)) + list(range(87,114+1)) + [116])
|
| 18 |
-
with open('ceramics_list.txt', 'r') as fp:
|
| 19 |
lines = fp.readlines()
|
| 20 |
CERAMICS_SET = {line.strip() for line in lines}
|
| 21 |
-
with open('salt_list.txt', 'r') as fp:
|
| 22 |
lines = fp.readlines()
|
| 23 |
SALT_SET = {line.strip() for line in lines}
|
| 24 |
|
|
@@ -53,13 +53,13 @@ from rdkit.Chem import Descriptors,Draw,Crippen
|
|
| 53 |
|
| 54 |
## add custom chemical definitions (i.e., to correct confusion between methane and carbon)
|
| 55 |
db = chemicals.identifiers.get_pubchem_db()
|
| 56 |
-
db.load('custom_chemicals_db.tsv')
|
| 57 |
## load experimental and predicted properties
|
| 58 |
#dfmp_expt = pd.read_excel('PHYSPROP_MP_data.xlsx')
|
| 59 |
-
dfmp_expt = pd.read_csv('PHYSPROP_MP_data.tsv', sep='\t')
|
| 60 |
#dfmp_pred = pd.read_excel('DSSTOX_MP_pred_data.xlsx')
|
| 61 |
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
|
| 62 |
-
df_pred = pd.read_csv('Comptox_pred_data.tsv', sep='\t')
|
| 63 |
|
| 64 |
## OPERA melting point model
|
| 65 |
import dill as pickle
|
|
@@ -249,7 +249,7 @@ def ImageFromSmiles(smiles):
|
|
| 249 |
if type(smiles) is str:
|
| 250 |
try:
|
| 251 |
if smiles == 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]':
|
| 252 |
-
mol = next(Chem.SDMolSupplier('MnPC.sdf', removeHs=False))
|
| 253 |
image = Draw.MolToImage(mol, size=(350, 350))
|
| 254 |
else:
|
| 255 |
image = Draw.MolToImage(Chem.MolFromSmiles(smiles), size=(350, 350))
|
|
|
|
| 15 |
|
| 16 |
ORGANIC_ATOM_SET = {5, 6, 7, 8, 9, 15, 16, 17, 35, 53}
|
| 17 |
METAL_ATOM_SET = set([3,4,11,12,13] + list(range(19,31+1)) + list(range(37,50+1)) + list(range(55,84+1)) + list(range(87,114+1)) + [116])
|
| 18 |
+
with open('data/ceramics_list.txt', 'r') as fp:
|
| 19 |
lines = fp.readlines()
|
| 20 |
CERAMICS_SET = {line.strip() for line in lines}
|
| 21 |
+
with open('data/salt_list.txt', 'r') as fp:
|
| 22 |
lines = fp.readlines()
|
| 23 |
SALT_SET = {line.strip() for line in lines}
|
| 24 |
|
|
|
|
| 53 |
|
| 54 |
## add custom chemical definitions (i.e., to correct confusion between methane and carbon)
|
| 55 |
db = chemicals.identifiers.get_pubchem_db()
|
| 56 |
+
db.load('data/custom_chemicals_db.tsv')
|
| 57 |
## load experimental and predicted properties
|
| 58 |
#dfmp_expt = pd.read_excel('PHYSPROP_MP_data.xlsx')
|
| 59 |
+
dfmp_expt = pd.read_csv('data/PHYSPROP_MP_data.tsv', sep='\t')
|
| 60 |
#dfmp_pred = pd.read_excel('DSSTOX_MP_pred_data.xlsx')
|
| 61 |
#df_pred = pd.read_excel('Comptox_pred_data.xlsx')
|
| 62 |
+
df_pred = pd.read_csv('data/Comptox_pred_data.tsv', sep='\t')
|
| 63 |
|
| 64 |
## OPERA melting point model
|
| 65 |
import dill as pickle
|
|
|
|
| 249 |
if type(smiles) is str:
|
| 250 |
try:
|
| 251 |
if smiles == 'C1=CC=C2C(=C1)C3=NC4=NC(=NC5=C6C=CC=CC6=C([N-]5)N=C7C8=CC=CC=C8C(=N7)N=C2[N-]3)C9=CC=CC=C94.[Mn+2]':
|
| 252 |
+
mol = next(Chem.SDMolSupplier('data/MnPC.sdf', removeHs=False))
|
| 253 |
image = Draw.MolToImage(mol, size=(350, 350))
|
| 254 |
else:
|
| 255 |
image = Draw.MolToImage(Chem.MolFromSmiles(smiles), size=(350, 350))
|
color3_module/colors.py
CHANGED
|
@@ -4,7 +4,7 @@ import numpy as np
|
|
| 4 |
import pandas as pd
|
| 5 |
from functions import SigFigs, Piringer, WilkeChang, SheetRelease, SheetRates, RatePlot
|
| 6 |
from functions import Piecewise, PowerLaw
|
| 7 |
-
from
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
from functions import SigFigs, Piringer, WilkeChang, SheetRelease, SheetRates, RatePlot
|
| 6 |
from functions import Piecewise, PowerLaw
|
| 7 |
+
from qrf.functions import QRF_Apply, QRF_Ceramic
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
Comptox_pred_data.tsv → data/Comptox_pred_data.tsv
RENAMED
|
File without changes
|
MnPC.sdf → data/MnPC.sdf
RENAMED
|
File without changes
|
PHYSPROP_MP_data.tsv → data/PHYSPROP_MP_data.tsv
RENAMED
|
File without changes
|
ceramics_list.txt → data/ceramics_list.txt
RENAMED
|
File without changes
|
custom_chemicals_db.tsv → data/custom_chemicals_db.tsv
RENAMED
|
File without changes
|
salt_list.txt → data/salt_list.txt
RENAMED
|
File without changes
|
exposure3_module/exposure.py
CHANGED
|
@@ -4,7 +4,7 @@ import pandas as pd
|
|
| 4 |
from flask import render_template, request
|
| 5 |
from functions import SigFigs, Piringer, WilkeChang, SheetRelease, SheetRates, RatePlot
|
| 6 |
from functions import Piecewise, PowerLaw
|
| 7 |
-
from
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
|
|
|
| 4 |
from flask import render_template, request
|
| 5 |
from functions import SigFigs, Piringer, WilkeChang, SheetRelease, SheetRates, RatePlot
|
| 6 |
from functions import Piecewise, PowerLaw
|
| 7 |
+
from qrf.functions import QRF_Apply, QRF_Ceramic
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
qrf/db-D-interp-allT-semiclean.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c5c4399929d0bb5f72e3fee3896058e7accfbf68d63df155e409a4c2b6b87a2
|
| 3 |
+
size 13087796
|
qrf_functions.py → qrf/functions.py
RENAMED
|
@@ -9,11 +9,23 @@ import mordred.descriptors
|
|
| 9 |
import rdkit
|
| 10 |
from rdkit import Chem
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37, worstcase='hi'):
|
| 13 |
-
|
|
|
|
| 14 |
reg, imp, scaler_X, sub_desc_list = pickle.load(f)
|
| 15 |
-
df_X = pd.read_excel('qrf_x.xlsx')
|
| 16 |
-
df_y = pd.read_excel('qrf_y.xlsx')
|
|
|
|
|
|
|
|
|
|
| 17 |
X_all = imp.transform(df_X)
|
| 18 |
X_all_scale = scaler_X.transform(X_all)
|
| 19 |
## use "worst-case" solute values
|
|
@@ -39,7 +51,8 @@ def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37, worstcase='hi'
|
|
| 39 |
|
| 40 |
|
| 41 |
def QRF_Apply(density, polytg, smiles, quantiles=[0.03,0.5,0.97], T=37):
|
| 42 |
-
|
|
|
|
| 43 |
reg, imp, scaler_X, sub_desc_list = pickle.load(f)
|
| 44 |
# get list of descriptors to calculate
|
| 45 |
solute_desc_list = sub_desc_list.copy()
|
|
@@ -67,7 +80,9 @@ def QRF_Apply(density, polytg, smiles, quantiles=[0.03,0.5,0.97], T=37):
|
|
| 67 |
# return 1D array regardless of quantiles setting
|
| 68 |
D_pred = D_pred[0]
|
| 69 |
## domain extrapolation check
|
| 70 |
-
df_X = pd.read_excel('qrf_x.xlsx')
|
|
|
|
|
|
|
| 71 |
X_all = imp.transform(df_X)
|
| 72 |
X_all_scale = scaler_X.transform(X_all)
|
| 73 |
dij = QRF_DomainExtrap(reg, X_all_scale, descs_scale)
|
|
|
|
| 9 |
import rdkit
|
| 10 |
from rdkit import Chem
|
| 11 |
|
| 12 |
+
QRF_T_list = np.array([25,30,35,37,40,45,50,55,60,65,70,75])
|
| 13 |
+
QRF_T_cut = 2.5
|
| 14 |
+
df_QRF = pd.read_excel('qrf/db-D-interp-allT-semiclean.xlsx')
|
| 15 |
+
df_desc = pd.read_excel('qrf/mordred-descriptors.xlsx')
|
| 16 |
+
calc = mordred.Calculator(mordred.descriptors)
|
| 17 |
+
colnames_mordred = [str(d) for d in calc.descriptors]
|
| 18 |
+
df_QRF = pd.merge(df_QRF, df_desc[['Solute_InChIKey',*colnames_mordred]], how='left', on='Solute_InChIKey', suffixes=('', '_dupe'))
|
| 19 |
+
|
| 20 |
def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37, worstcase='hi'):
|
| 21 |
+
nearest_T = QRF_T_list[np.abs(T-QRF_T_list).argmin()]
|
| 22 |
+
with open(f'qrf/qrf_model_bundle_{int(nearest_T)}.pkl','rb') as f:
|
| 23 |
reg, imp, scaler_X, sub_desc_list = pickle.load(f)
|
| 24 |
+
#df_X = pd.read_excel('qrf/qrf_x.xlsx')
|
| 25 |
+
#df_y = pd.read_excel('qrf/qrf_y.xlsx')
|
| 26 |
+
mask_T = (df_QRF['T']>nearest_T-QRF_T_cut) & (df_QRF['T']<nearest_T+QRF_T_cut)
|
| 27 |
+
df_X = df_QRF.loc[mask_T, sub_desc_list]
|
| 28 |
+
df_y = df_QRF.loc[mask_T, 'LogD']
|
| 29 |
X_all = imp.transform(df_X)
|
| 30 |
X_all_scale = scaler_X.transform(X_all)
|
| 31 |
## use "worst-case" solute values
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def QRF_Apply(density, polytg, smiles, quantiles=[0.03,0.5,0.97], T=37):
|
| 54 |
+
nearest_T = QRF_T_list[np.abs(T-QRF_T_list).argmin()]
|
| 55 |
+
with open(f'qrf/qrf_model_bundle_{int(nearest_T)}.pkl','rb') as f:
|
| 56 |
reg, imp, scaler_X, sub_desc_list = pickle.load(f)
|
| 57 |
# get list of descriptors to calculate
|
| 58 |
solute_desc_list = sub_desc_list.copy()
|
|
|
|
| 80 |
# return 1D array regardless of quantiles setting
|
| 81 |
D_pred = D_pred[0]
|
| 82 |
## domain extrapolation check
|
| 83 |
+
#df_X = pd.read_excel('qrf/qrf_x.xlsx')
|
| 84 |
+
mask_T = (df_QRF['T']>nearest_T-QRF_T_cut) & (df_QRF['T']<nearest_T+QRF_T_cut)
|
| 85 |
+
df_X = df_QRF.loc[mask_T, sub_desc_list]
|
| 86 |
X_all = imp.transform(df_X)
|
| 87 |
X_all_scale = scaler_X.transform(X_all)
|
| 88 |
dij = QRF_DomainExtrap(reg, X_all_scale, descs_scale)
|
qrf/mordred-descriptors.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:121f72b88fa46a0f16af6a1244af761ee6b9d679af7ab2e32d545538f8b5c5b5
|
| 3 |
+
size 10251595
|
qrf/qrf_model_bundle_25.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61e7f3f4acd41d0548897c8d00cd41ba9129b4ab39e4d6d03bb9924a56bae417
|
| 3 |
+
size 8024827
|
qrf/qrf_model_bundle_30.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de06c8cfff7b657ed755fd6b3dab6d2a09c742b3a1134d5bec4dc224135bba90
|
| 3 |
+
size 6488642
|
qrf/qrf_model_bundle_35.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebb21d88ccb961223607c258be1b14f45fa67c0dd6fb6a3de41d2ade394b092e
|
| 3 |
+
size 13733182
|
qrf_model_bundle_37.pkl → qrf/qrf_model_bundle_37.pkl
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6df089e005123321d21b33506ccf0fc4df4dafb4d953e1bd5931b92bd2445d7
|
| 3 |
+
size 14843969
|
qrf/qrf_model_bundle_40.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e99f5b40d4c460c174e90fa44f82d10f1a54f024d961499fb08c8d18d3835e5
|
| 3 |
+
size 2773504
|
qrf/qrf_model_bundle_45.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8db0082b872c4ce725e9ebfb54ea2dcd5fd1621e4de13a2020d4842ba18f5753
|
| 3 |
+
size 7288402
|
qrf/qrf_model_bundle_50.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a38bef3650a1adb76ee07c4e9fa09058e16ffe3fc64ff18b31511f94d78a0d29
|
| 3 |
+
size 7743347
|
qrf/qrf_model_bundle_55.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07aaf56f2d7dbeac952097146544c8ffef0a48f05a67b5b9961aca2a5c1c1127
|
| 3 |
+
size 6981710
|
qrf/qrf_model_bundle_60.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:022eba72bc51d42134229f6a9757f07ddafb763a9ae3176dc4a3c08760e3494b
|
| 3 |
+
size 2651150
|
qrf/qrf_model_bundle_65.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c04563ec96989bd0ba4f44d09070cc543546326ca4b9aba8ef6000b8e27894d
|
| 3 |
+
size 15893003
|
qrf/qrf_model_bundle_70.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c4e7d02658d5b048d5a5d8d1de63e1bd892b561f0d0dbba95cf35a9280f48c7
|
| 3 |
+
size 6673668
|
qrf/qrf_model_bundle_75.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36653c20dcb9c7cdcb72f32bd1121cc83d98a174e3969ba0c6dec719d93cac67
|
| 3 |
+
size 2548133
|
qrf/qrf_parameters_allT.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff8d83fd6f4f5efd9352cb33e9a4c48d05d94c4aee1b64b2a3134d847856ae52
|
| 3 |
+
size 5755
|
qrf/qrf_train.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import sklearn
|
| 5 |
+
import sklearn.impute
|
| 6 |
+
import sklearn.metrics
|
| 7 |
+
from quantile_forest import RandomForestQuantileRegressor
|
| 8 |
+
import mordred
|
| 9 |
+
from mordred import Calculator, descriptors
|
| 10 |
+
|
| 11 |
+
#T_target = 37
|
| 12 |
+
#T_cut = 2.5
|
| 13 |
+
qhiv, qlov = 0.97, 0.03
|
| 14 |
+
state = 12345
|
| 15 |
+
|
| 16 |
+
QRF_T_list = np.array([25,30,35,37,40,45,50,55,60,65,70,75])
|
| 17 |
+
QRF_T_cut = 2.5
|
| 18 |
+
df_QRF = pd.read_excel('db-D-interp-allT-semiclean.xlsx')
|
| 19 |
+
df_desc = pd.read_excel('mordred-descriptors.xlsx')
|
| 20 |
+
calc = mordred.Calculator(mordred.descriptors)
|
| 21 |
+
colnames_mordred = [str(d) for d in calc.descriptors]
|
| 22 |
+
df_QRF = pd.merge(df_QRF, df_desc[['Solute_InChIKey',*colnames_mordred]], how='left', on='Solute_InChIKey', suffixes=('', '_dupe'))
|
| 23 |
+
|
| 24 |
+
df_params = pd.read_excel('qrf_parameters_allT.xlsx')
|
| 25 |
+
|
| 26 |
+
for T_target in QRF_T_list:
|
| 27 |
+
print(T_target)
|
| 28 |
+
if 1:
|
| 29 |
+
if T_target == 37:
|
| 30 |
+
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
|
| 31 |
+
params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
|
| 32 |
+
elif T_target == 50:
|
| 33 |
+
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'ATS0m', 'ATSC2dv', 'ATSC6dv', 'ATSC0m', 'ATSC6i', 'BCUTse-1l', 'BCUTp-1h', 'Mp', 'Mi', 'SaasC']
|
| 34 |
+
params = {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000} # best from -19.py and -19-2.py with fout<0.040
|
| 35 |
+
else:
|
| 36 |
+
mask_T = df_params['T']==T_target
|
| 37 |
+
sub_desc_list = df_params.loc[mask_T, 'sub_desc_list'].iloc[0].split('|')
|
| 38 |
+
params = df_params.loc[mask_T, ['bootstrap', 'max_depth', 'max_features', 'max_samples', 'min_samples_leaf', 'min_samples_split', 'n_estimators']].iloc[0].to_dict()
|
| 39 |
+
params['max_samples'] = float(params['max_samples'])
|
| 40 |
+
if 0:
|
| 41 |
+
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
|
| 42 |
+
params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
|
| 43 |
+
|
| 44 |
+
## read data
|
| 45 |
+
#df_X = pd.read_excel('qrf_x.xlsx')
|
| 46 |
+
#df_y = pd.read_excel('qrf_y.xlsx')
|
| 47 |
+
mask_T = (df_QRF['T']>T_target-QRF_T_cut) & (df_QRF['T']<T_target+QRF_T_cut)
|
| 48 |
+
df_X = df_QRF.loc[mask_T, sub_desc_list]
|
| 49 |
+
df_y = df_QRF.loc[mask_T, 'LogD']
|
| 50 |
+
#sub_desc_list = list(df_X.columns)
|
| 51 |
+
|
| 52 |
+
## fit transforms
|
| 53 |
+
imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
|
| 54 |
+
imp.fit(df_X)
|
| 55 |
+
X_all = imp.transform(df_X)
|
| 56 |
+
y_all = np.array(df_y)
|
| 57 |
+
scaler_X = sklearn.preprocessing.StandardScaler().fit(X_all)
|
| 58 |
+
X_all_scale = scaler_X.transform(X_all)
|
| 59 |
+
|
| 60 |
+
reg_all = RandomForestQuantileRegressor(random_state=state, n_jobs=-1, **params)
|
| 61 |
+
reg_all.fit(X_all_scale,y_all)
|
| 62 |
+
|
| 63 |
+
with open(f'qrf_model_bundle_{T_target}.pkl','wb') as f:
|
| 64 |
+
pickle.dump([reg_all,imp,scaler_X,sub_desc_list],f)
|
| 65 |
+
|
| 66 |
+
print(sub_desc_list)
|
| 67 |
+
print(params)
|
| 68 |
+
print(mask_T.sum())
|
| 69 |
+
y_pred = reg_all.predict(X_all_scale)
|
| 70 |
+
print(y_pred.mean(),y_pred.std())
|
| 71 |
+
print()
|
| 72 |
+
|
qrf_x.xlsx → qrf/qrf_x.xlsx
RENAMED
|
File without changes
|
qrf_y.xlsx → qrf/qrf_y.xlsx
RENAMED
|
File without changes
|
qrf_train.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
import pickle
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import sklearn
|
| 5 |
-
import sklearn.impute
|
| 6 |
-
from quantile_forest import RandomForestQuantileRegressor
|
| 7 |
-
|
| 8 |
-
T_target = 37
|
| 9 |
-
T_cut = 5
|
| 10 |
-
qhiv, qlov = 0.97, 0.03
|
| 11 |
-
state = 12345
|
| 12 |
-
|
| 13 |
-
if T_target == 37:
|
| 14 |
-
params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
|
| 15 |
-
|
| 16 |
-
if T_target == 50:
|
| 17 |
-
params = {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000} # best from -19.py and -19-2.py with fout<0.040
|
| 18 |
-
|
| 19 |
-
## read data
|
| 20 |
-
df_X = pd.read_excel('qrf_x.xlsx')
|
| 21 |
-
df_y = pd.read_excel('qrf_y.xlsx')
|
| 22 |
-
sub_desc_list = list(df_X.columns)
|
| 23 |
-
|
| 24 |
-
## fit transforms
|
| 25 |
-
imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
|
| 26 |
-
imp.fit(df_X)
|
| 27 |
-
X_all = imp.transform(df_X)
|
| 28 |
-
y_all = np.array(df_y['LogD'])
|
| 29 |
-
scaler_X = sklearn.preprocessing.StandardScaler().fit(X_all)
|
| 30 |
-
X_all_scale = scaler_X.transform(X_all)
|
| 31 |
-
|
| 32 |
-
reg_all = RandomForestQuantileRegressor(random_state=state, n_jobs=-1, **params)
|
| 33 |
-
reg_all.fit(X_all_scale,y_all)
|
| 34 |
-
|
| 35 |
-
with open(f'qrf_model_bundle_{T_target}.pkl','wb') as f:
|
| 36 |
-
pickle.dump([reg_all,imp,scaler_X,sub_desc_list],f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
quantity_functions.py → quantity_module/functions.py
RENAMED
|
File without changes
|
quantity_module/quantity.py
CHANGED
|
@@ -4,11 +4,11 @@ import pandas as pd
|
|
| 4 |
from flask import render_template, request
|
| 5 |
from functions import SigFigs, HtmlNumber, Piringer, WilkeChang, CdfPlot
|
| 6 |
#from functions import Piecewise, PowerLaw
|
| 7 |
-
from
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
| 11 |
-
from
|
| 12 |
import rdkit
|
| 13 |
from rdkit.Chem import AllChem as Chem
|
| 14 |
|
|
@@ -104,17 +104,23 @@ def exp_post():
|
|
| 104 |
Solvent_MW = Solvent_MWs[Solvent_Name]
|
| 105 |
Solute_MW = MW
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
polymer = request.form['polymer']
|
| 108 |
pIndex = np.argmax(polymers == polymer)
|
| 109 |
|
| 110 |
-
# QRF is
|
| 111 |
-
if polymer == 'Other polymer'
|
| 112 |
use_qrf = True
|
| 113 |
else:
|
| 114 |
use_qrf = False
|
| 115 |
|
| 116 |
if use_qrf:
|
| 117 |
-
quantiles = list(np.linspace(0.
|
|
|
|
| 118 |
if is_ceramic:
|
| 119 |
diff,domain_extrap = QRF_Ceramic(Polymer_Density, Polymer_Tg, quantiles=quantiles, T=T-273.15, worstcase='lo')
|
| 120 |
else:
|
|
@@ -143,13 +149,18 @@ def exp_post():
|
|
| 143 |
else:
|
| 144 |
method = 'wc'
|
| 145 |
if 1:
|
|
|
|
| 146 |
print('Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category')
|
| 147 |
print(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category)
|
| 148 |
print(np.nanquantile(D_dist_noswell, [0.05,0.5,0.95]))
|
| 149 |
print(np.nanquantile(D_dist_swell, [0.05,0.5,0.95]))
|
| 150 |
-
print('M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt')
|
| 151 |
-
print(M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt)
|
| 152 |
print(np.nanquantile(M0_pred, [0.05,0.5,0.95]))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
# Generate the rate plot using matplotlib
|
| 155 |
#pngImageB64String = CdfPlot(M0_pred[~np.isnan(M0_pred)], units=units)
|
|
@@ -172,11 +183,12 @@ def exp_post():
|
|
| 172 |
|
| 173 |
M0_out = SigFigs(np.nanquantile(M0_pred,0.5),6)
|
| 174 |
tau_out = SigFigs(tau,6)
|
|
|
|
| 175 |
|
| 176 |
return render_template('quantity_report.html', show_properties=show_properties, polymers=polymers, pIndex=pIndex,
|
| 177 |
area=Surface_Area, vol=Polymer_Volume, units=units, M=M_expt, M0=M0_out, time=Extraction_Time,
|
| 178 |
solventvol=Solvent_Volume, solventname=Solvent_Name, swelling=Swelling_percent, K=K_expt, T=T, tau=tau_out,
|
| 179 |
chemName=chemName, MW=MW, LogP=LogP, rho=rho, mp=mp, iupac=iupac, cas=cas, smiles=smiles, molImage=molImage, table=table,
|
| 180 |
-
LogP_origin=LogP_origin, rho_origin=rho_origin, mp_origin=mp_origin, ceramic=is_ceramic, methods=[method,round(Polymer_Tg),Polymer_Density],
|
| 181 |
-
mass=mass, density=Polymer_Density)
|
| 182 |
|
|
|
|
| 4 |
from flask import render_template, request
|
| 5 |
from functions import SigFigs, HtmlNumber, Piringer, WilkeChang, CdfPlot
|
| 6 |
#from functions import Piecewise, PowerLaw
|
| 7 |
+
from qrf.functions import QRF_Apply, QRF_Ceramic
|
| 8 |
from . import blueprint
|
| 9 |
from polymers import Polymers, Polymers3
|
| 10 |
from ChemID import *
|
| 11 |
+
from quantity_module.functions import *
|
| 12 |
import rdkit
|
| 13 |
from rdkit.Chem import AllChem as Chem
|
| 14 |
|
|
|
|
| 104 |
Solvent_MW = Solvent_MWs[Solvent_Name]
|
| 105 |
Solute_MW = MW
|
| 106 |
|
| 107 |
+
if units == 'mg':
|
| 108 |
+
mass_units = mass*1e3
|
| 109 |
+
elif units == 'µg':
|
| 110 |
+
mass_units = mass*1e6
|
| 111 |
+
|
| 112 |
polymer = request.form['polymer']
|
| 113 |
pIndex = np.argmax(polymers == polymer)
|
| 114 |
|
| 115 |
+
# QRF is implemented for 25-75 C
|
| 116 |
+
if polymer == 'Other polymer':
|
| 117 |
use_qrf = True
|
| 118 |
else:
|
| 119 |
use_qrf = False
|
| 120 |
|
| 121 |
if use_qrf:
|
| 122 |
+
quantiles = list(np.linspace(0.0,1.0,201))
|
| 123 |
+
#quantiles = list(np.linspace(0.05,0.95,181))
|
| 124 |
if is_ceramic:
|
| 125 |
diff,domain_extrap = QRF_Ceramic(Polymer_Density, Polymer_Tg, quantiles=quantiles, T=T-273.15, worstcase='lo')
|
| 126 |
else:
|
|
|
|
| 149 |
else:
|
| 150 |
method = 'wc'
|
| 151 |
if 1:
|
| 152 |
+
print('DEBUG')
|
| 153 |
print('Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category')
|
| 154 |
print(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category)
|
| 155 |
print(np.nanquantile(D_dist_noswell, [0.05,0.5,0.95]))
|
| 156 |
print(np.nanquantile(D_dist_swell, [0.05,0.5,0.95]))
|
| 157 |
+
print('M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt, method')
|
| 158 |
+
print(M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt, method)
|
| 159 |
print(np.nanquantile(M0_pred, [0.05,0.5,0.95]))
|
| 160 |
+
V1,V2 = get_D_dists(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, 'G2', rng, return_DCs=False, N=N_sample)
|
| 161 |
+
V3 = get_M_dist(V2, M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt=K_expt)
|
| 162 |
+
print(np.nanquantile(V2, [0.05,0.5,0.95]))
|
| 163 |
+
print(np.nanquantile(V3, [0.05,0.5,0.95]))
|
| 164 |
|
| 165 |
# Generate the rate plot using matplotlib
|
| 166 |
#pngImageB64String = CdfPlot(M0_pred[~np.isnan(M0_pred)], units=units)
|
|
|
|
| 183 |
|
| 184 |
M0_out = SigFigs(np.nanquantile(M0_pred,0.5),6)
|
| 185 |
tau_out = SigFigs(tau,6)
|
| 186 |
+
mass_units = SigFigs(mass_units,6)
|
| 187 |
|
| 188 |
return render_template('quantity_report.html', show_properties=show_properties, polymers=polymers, pIndex=pIndex,
|
| 189 |
area=Surface_Area, vol=Polymer_Volume, units=units, M=M_expt, M0=M0_out, time=Extraction_Time,
|
| 190 |
solventvol=Solvent_Volume, solventname=Solvent_Name, swelling=Swelling_percent, K=K_expt, T=T, tau=tau_out,
|
| 191 |
chemName=chemName, MW=MW, LogP=LogP, rho=rho, mp=mp, iupac=iupac, cas=cas, smiles=smiles, molImage=molImage, table=table,
|
| 192 |
+
LogP_origin=LogP_origin, rho_origin=rho_origin, mp_origin=mp_origin, ceramic=is_ceramic, methods=[method,round(Polymer_Tg-273.15),Polymer_Density],
|
| 193 |
+
mass=mass, mass_units=mass_units, density=Polymer_Density)
|
| 194 |
|
quantity_module/templates/quantity_index.html
CHANGED
|
@@ -149,7 +149,7 @@ please see the <a href="{{url_for('.static', filename='RST.html')}}"> RST inform
|
|
| 149 |
<tr><td colspan="2"><h4> Extraction parameters <button type=button class="Info_btn" data-toggle="modal" data-target="#ExtractionModal">ⓘ</button></td></tr> </h4>
|
| 150 |
<tr><th>Device surface area (cm<sup>2</sup>)</th><td> <input name="area" id="area" step="any" value="5.0" min="0.001" type="number" required></td></tr>
|
| 151 |
<tr><th>Duration (hours)</th><td> <input name="time" id="time" step="any" value="24.0" min="0.001" type="number" required></td></tr>
|
| 152 |
-
<tr><th>Temperature (°C)</th><td> <input name="T" id="T" step="any" value="50.0" min="
|
| 153 |
<tr><th>Solvent</th>
|
| 154 |
<td> <select name="solventname" id="solventname">
|
| 155 |
<option value="{{solvents[0]}}" selected>{{solvents[0]}}</option>
|
|
|
|
| 149 |
<tr><td colspan="2"><h4> Extraction parameters <button type=button class="Info_btn" data-toggle="modal" data-target="#ExtractionModal">ⓘ</button></td></tr> </h4>
|
| 150 |
<tr><th>Device surface area (cm<sup>2</sup>)</th><td> <input name="area" id="area" step="any" value="5.0" min="0.001" type="number" required></td></tr>
|
| 151 |
<tr><th>Duration (hours)</th><td> <input name="time" id="time" step="any" value="24.0" min="0.001" type="number" required></td></tr>
|
| 152 |
+
<tr><th>Temperature (°C)</th><td> <input name="T" id="T" step="any" value="50.0" min="25" max="75" type="number" required></td></tr>
|
| 153 |
<tr><th>Solvent</th>
|
| 154 |
<td> <select name="solventname" id="solventname">
|
| 155 |
<option value="{{solvents[0]}}" selected>{{solvents[0]}}</option>
|
quantity_module/templates/quantity_report.html
CHANGED
|
@@ -119,6 +119,13 @@ Swelling = {{swelling}} wt% (used to estimate \( D \))<br>
|
|
| 119 |
|
| 120 |
<p>The progress of the extraction can be expressed through the dimensionless time \( \tau \). For your extraction, \( \tau \) = {{tau}}. Extractions with \( \tau \) ≥ 0.1 result in more accurate estimates of the total quantity, and when \( \tau \) ≥ 1.0 the extracted amount may be used directly as the total quantity if the extraction is diffusion-controlled.</p>
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
<p><button type="button" onclick="javascript:history.back()">Back</button></p>
|
| 123 |
|
| 124 |
</body>
|
|
|
|
| 119 |
|
| 120 |
<p>The progress of the extraction can be expressed through the dimensionless time \( \tau \). For your extraction, \( \tau \) = {{tau}}. Extractions with \( \tau \) ≥ 0.1 result in more accurate estimates of the total quantity, and when \( \tau \) ≥ 1.0 the extracted amount may be used directly as the total quantity if the extraction is diffusion-controlled.</p>
|
| 121 |
|
| 122 |
+
{% if M0>=mass_units %}
|
| 123 |
+
<p>
|
| 124 |
+
<font color="red">The predicted amount ({{M0}} {{units}}) is larger than the device mass ({{mass_units}} {{units}}) due to uncertainty and conservatism in the prediction.
|
| 125 |
+
In this case the device mass may be used as a conservative estimate of the total quantity of this extractable.</font>
|
| 126 |
+
</p>
|
| 127 |
+
{% endif %}
|
| 128 |
+
|
| 129 |
<p><button type="button" onclick="javascript:history.back()">Back</button></p>
|
| 130 |
|
| 131 |
</body>
|