Robert Elder committed on
Commit
faa664b
·
1 Parent(s): a9dd5bb

updates to quantity module; adding qrf_model_bundle trained in environment matching container

Browse files
CHRIS.py CHANGED
@@ -1,5 +1,4 @@
1
  from flask import Flask, render_template
2
- #from functions import weight_func
3
 
4
  app = Flask(__name__)
5
  app.debug = False
 
1
  from flask import Flask, render_template
 
2
 
3
  app = Flask(__name__)
4
  app.debug = False
qrf_functions.py CHANGED
@@ -9,7 +9,7 @@ import mordred.descriptors
9
  import rdkit
10
  from rdkit import Chem
11
 
12
- def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37):
13
  with open(f'qrf_model_bundle_{int(T)}.pkl','rb') as f:
14
  reg, imp, scaler_X, sub_desc_list = pickle.load(f)
15
  df_X = pd.read_excel('qrf_x.xlsx')
@@ -17,8 +17,12 @@ def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37):
17
  X_all = imp.transform(df_X)
18
  X_all_scale = scaler_X.transform(X_all)
19
  ## use "worst-case" solute values
20
- tmpq = np.array([0.95]*len(sub_desc_list))
21
- tmpq[df_X.corrwith(df_y['LogD'])>0] = 0.05 # positive correlations (increase in variable increases D) use low values of variable, negative correlations use high values of variable
 
 
 
 
22
  tmpv = [np.nanquantile(X_all_scale[:,i], q) for i,q in enumerate(tmpq)] # "worst-case" values of scaled descriptors
23
  tmps = [polytg if n == 'Polymer_Tg' else (density if n == 'Polymer_Density' else 0) for i,n in enumerate(sub_desc_list)]
24
  tmps = scaler_X.transform([tmps])[0] # scaled values of polymer descriptors
 
9
  import rdkit
10
  from rdkit import Chem
11
 
12
+ def QRF_Ceramic(density, polytg, quantiles=[0.03,0.5,0.97], T=37, worstcase='hi'):
13
  with open(f'qrf_model_bundle_{int(T)}.pkl','rb') as f:
14
  reg, imp, scaler_X, sub_desc_list = pickle.load(f)
15
  df_X = pd.read_excel('qrf_x.xlsx')
 
17
  X_all = imp.transform(df_X)
18
  X_all_scale = scaler_X.transform(X_all)
19
  ## use "worst-case" solute values
20
+ if worstcase == 'hi':
21
+ tmpq = np.array([0.95]*len(sub_desc_list))
22
+ tmpq[df_X.corrwith(df_y['LogD'])>0] = 0.05 # positive correlations (increase in variable increases D) use low values of variable, negative correlations use high values of variable
23
+ elif worstcase == 'lo':
24
+ tmpq = np.array([0.05]*len(sub_desc_list))
25
+ tmpq[df_X.corrwith(df_y['LogD'])>0] = 0.95 # inverse of above
26
  tmpv = [np.nanquantile(X_all_scale[:,i], q) for i,q in enumerate(tmpq)] # "worst-case" values of scaled descriptors
27
  tmps = [polytg if n == 'Polymer_Tg' else (density if n == 'Polymer_Density' else 0) for i,n in enumerate(sub_desc_list)]
28
  tmps = scaler_X.transform([tmps])[0] # scaled values of polymer descriptors
qrf_model_bundle_37.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5c2b2899461c3597863a6271f0bb37658f053daf1d0cba509b98b445fb45698
3
+ size 15583438
qrf_train.py CHANGED
@@ -5,7 +5,7 @@ import sklearn
5
  import sklearn.impute
6
  from quantile_forest import RandomForestQuantileRegressor
7
 
8
- T_target = 50
9
  T_cut = 5
10
  qhiv, qlov = 0.97, 0.03
11
  state = 12345
 
5
  import sklearn.impute
6
  from quantile_forest import RandomForestQuantileRegressor
7
 
8
+ T_target = 37
9
  T_cut = 5
10
  qhiv, qlov = 0.97, 0.03
11
  state = 12345
quantity_functions.py CHANGED
@@ -15,10 +15,8 @@ import mordred.descriptors
15
 
16
  from functions import PowerLaw, Piringer
17
 
18
- ### TODO arbitrary T
19
-
20
  ## sampling parameters
21
- N_sample = int(1e6)
22
  #rng = np.random.Generator(np.random.PCG64(seed=12345))
23
 
24
  ## list of solvents to include
@@ -32,11 +30,6 @@ use_new = True
32
  T_cut_new = 0.5
33
 
34
  #### read data files
35
- # CHRIS bounds
36
- #with open('quantity_module/data/bounds.pkl', 'rb') as fp:
37
- # params_dict_ub, params_dict_lb, params_dict_ub_band, params_dict_lb_band, params_dict_q95, params_dict_q50, params_dict_q05 = pickle.load(fp)
38
- #with open('quantity_module/data/bounds-50.pkl', 'rb') as fp:
39
- # params_dict_ub_50, params_dict_lb_50, params_dict_ub_band_50, params_dict_lb_band_50, params_dict_q95_50, params_dict_q50_50, params_dict_q05_50 = pickle.load(fp)
40
  # CHRIS parameter distributions
41
  if not use_new:
42
  param_dists = {}
@@ -50,10 +43,6 @@ else:
50
  # other stuff
51
  df_visc = pd.read_excel('quantity_module/data/solvent-viscosity.xlsx')
52
  df_desc = pd.read_excel(f'quantity_module/data/data-descriptors-mordred-numconfs51.xlsx', usecols=['Solute_InChIKey', 'Vabc','VMcGowan'])
53
- #dfp = pd.read_excel('quantity_module/data/db-properties-polymer.xlsx') ## XXX?
54
- #df2 = pd.read_excel('quantity_module/data/db-D.xlsx', sheet_name=1) ## XXX?
55
- #df2['CHRIS Class'] = df2['CHRIS Class'].fillna('none')
56
- #df2['New Class'] = df2['New Class'].fillna('none')
57
  if not use_new:
58
  ## clean data
59
  df_final_37 = pd.read_excel('quantity_module/data/db-D-interp-37-clean.xlsx')
@@ -86,7 +75,7 @@ for solv in df_visc['Solvent_Name']:
86
  mws.append(mw)
87
  df_visc['MW'] = mws
88
  ## selected solvent MWs
89
- SolventMWs = {solv:df_visc.loc[df_visc['Solvent_Name']==solv,'MW'].iloc[0] for solv in solvents}
90
  # linear relation to estimate Vabc when it fails for a molecule
91
  Vabc = df_desc['Vabc']
92
  Vmcg = df_desc['VMcGowan']
@@ -241,7 +230,7 @@ def get_D_dists(w,T,Polymer_Tg,Solvent_Name,Solvent_MW,Solute_MW,CHRIS_category,
241
  else:
242
  Ball = params[1]
243
  A_list = params[2:]
244
- D_list = np.exp([PowerLaw(Solute_MW, Ai, Ball) for Ai in A_list])
245
  else:
246
  D_list = input_Ds
247
  else:
@@ -260,7 +249,7 @@ def get_D_dists(w,T,Polymer_Tg,Solvent_Name,Solvent_MW,Solute_MW,CHRIS_category,
260
  else:
261
  Ball = params[1]
262
  A_list = params[2:]
263
- D_list += list(np.exp([PowerLaw(Solute_MW, Ai, Ball) for Ai in A_list]))
264
  else:
265
  D_list = input_Ds
266
  D_dist_noswell = rng.choice(D_list, N)
 
15
 
16
  from functions import PowerLaw, Piringer
17
 
 
 
18
  ## sampling parameters
19
+ N_sample = int(1e5)
20
  #rng = np.random.Generator(np.random.PCG64(seed=12345))
21
 
22
  ## list of solvents to include
 
30
  T_cut_new = 0.5
31
 
32
  #### read data files
 
 
 
 
 
33
  # CHRIS parameter distributions
34
  if not use_new:
35
  param_dists = {}
 
43
  # other stuff
44
  df_visc = pd.read_excel('quantity_module/data/solvent-viscosity.xlsx')
45
  df_desc = pd.read_excel(f'quantity_module/data/data-descriptors-mordred-numconfs51.xlsx', usecols=['Solute_InChIKey', 'Vabc','VMcGowan'])
 
 
 
 
46
  if not use_new:
47
  ## clean data
48
  df_final_37 = pd.read_excel('quantity_module/data/db-D-interp-37-clean.xlsx')
 
75
  mws.append(mw)
76
  df_visc['MW'] = mws
77
  ## selected solvent MWs
78
+ Solvent_MWs = {solv:df_visc.loc[df_visc['Solvent_Name']==solv,'MW'].iloc[0] for solv in solvents}
79
  # linear relation to estimate Vabc when it fails for a molecule
80
  Vabc = df_desc['Vabc']
81
  Vmcg = df_desc['VMcGowan']
 
230
  else:
231
  Ball = params[1]
232
  A_list = params[2:]
233
+ D_list = np.array([PowerLaw(Solute_MW, Ai, Ball) for Ai in A_list])
234
  else:
235
  D_list = input_Ds
236
  else:
 
249
  else:
250
  Ball = params[1]
251
  A_list = params[2:]
252
+ D_list += [PowerLaw(Solute_MW, Ai, Ball) for Ai in A_list]
253
  else:
254
  D_list = input_Ds
255
  D_dist_noswell = rng.choice(D_list, N)
quantity_module/quantity.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import pandas as pd
4
  from flask import render_template, request
5
  from functions import SigFigs, HtmlNumber, Piringer, WilkeChang, CdfPlot
6
- from functions import Piecewise, PowerLaw
7
  from qrf_functions import QRF_Apply, QRF_Ceramic
8
  from . import blueprint
9
  from polymers import Polymers, Polymers3
@@ -36,7 +36,10 @@ def exp_post():
36
 
37
  Polymer_Tg = float(request.form['Polymer_Tg']) ## NOTE Tg is provided in C
38
  T = float(request.form['T'])
 
 
39
  rng = np.random.Generator(np.random.PCG64(seed=12345))
 
40
 
41
  if T<Polymer_Tg:
42
  return render_template('quantity_temperatureError.html')
@@ -89,43 +92,44 @@ def exp_post():
89
  M_expt = float(request.form['amount']) # amount
90
  units = request.form['units']
91
  mass = float(request.form['mass'])
92
- PolymerDensity = float(request.form['density'])
93
- PolymerVolume = mass / PolymerDensity # vol
94
- SurfaceArea = float(request.form['area']) # area
95
- SolventVolume = float(request.form['solventvol'])
96
- SolventName = request.form['solventname']
97
  Swelling_percent = float(request.form['swelling'])
98
  Swelling_wtfrac = Swelling_percent/100
99
- ExtractionTime = float(request.form['time'])
100
  K_expt = float(request.form['K'])
101
- SolventMW = SolventMWs[SolventName]
102
- SoluteMW = MW
103
 
104
  polymer = request.form['polymer']
105
  pIndex = np.argmax(polymers == polymer)
106
 
107
- use_qrf = False
108
- if polymer == 'Other polymer':
109
  use_qrf = True
 
 
110
 
111
  if use_qrf:
112
  method = 'qrf'
113
- quantiles = list(np.linspace(0,1,101))
 
 
114
  if is_ceramic:
115
- diff,domain_extrap = QRF_Ceramic(PolymerDensity, Polymer_Tg, quantiles=quantiles, T=T)
116
  else:
117
- diff,domain_extrap = QRF_Apply(PolymerDensity, Polymer_Tg, smiles, quantiles=quantiles, T=T)
118
- #diff = diff[2] # upper bound
119
  if domain_extrap:
120
  # outside training domain, default to Wilke-Chang
121
- #diff = Piecewise(MW, params[None])
122
- D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T+273.15, Polymer_Tg+273.15, SolventName, SolventMW, SoluteMW, 'G2', rng, return_DCs=False, N=N_sample)
123
- M0_pred = get_M_dist(D_dist_swell, M_expt, PolymerVolume, SurfaceArea, SolventVolume, ExtractionTime*3600, K_expt=K_expt)
124
  method = 'qrf/wc'
125
  else:
126
- ## TODO implement total quantity prediction with QRF
127
- D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T+273.15, Polymer_Tg+273.15, SolventName, SolventMW, SoluteMW, 'G2', rng, return_DCs=False, N=N_sample, input_Ds=diff)
128
- M0_pred = get_M_dist(D_dist_swell, M_expt, PolymerVolume, SurfaceArea, SolventVolume, ExtractionTime*3600, K_expt=K_expt)
129
  else:
130
  ## use categories
131
  CHRIS_category = categories[pIndex]
@@ -134,19 +138,21 @@ def exp_post():
134
  ## worst-case for a generic polymer --> G2
135
  CHRIS_flag = 'wc'
136
  CHRIS_category = 'G2'
137
- D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T+273.15, Polymer_Tg+273.15, SolventName, SolventMW, SoluteMW, CHRIS_category, rng, return_DCs=False, N=N_sample)
138
- M0_pred = get_M_dist(D_dist_swell, M_expt, PolymerVolume, SurfaceArea, SolventVolume, ExtractionTime*3600, K_expt=K_expt)
139
- if 0:
140
- print('Swelling_wtfrac, T+273.15, Polymer_Tg+273.15, SolventName, SolventMW, SoluteMW, CHRIS_category')
141
- print(Swelling_wtfrac, T+273.15, Polymer_Tg+273.15, SolventName, SolventMW, SoluteMW, CHRIS_category)
142
- print(np.nanquantile(D_dist_swell, [0.05,0.5,0.95]))
143
- print('M_expt, PolymerVolume, SurfaceArea, SolventVolume, ExtractionTime*3600, K_expt')
144
- print(M_expt, PolymerVolume, SurfaceArea, SolventVolume, ExtractionTime*3600, K_expt)
145
- print(np.nanquantile(M0_pred, [0.05,0.5,0.95]))
146
  if CHRIS_flag is None:
147
  method = 'category'
148
  else:
149
  method = 'wc'
 
 
 
 
 
 
 
 
 
150
 
151
  # Generate the rate plot using matplotlib
152
  #pngImageB64String = CdfPlot(M0_pred[~np.isnan(M0_pred)], units=units)
@@ -165,15 +171,15 @@ def exp_post():
165
  table = df_table.style.set_properties(subset=[f'\( M_0 \) ({units})', r'\( M_0 \) (% median)'], **{'text-align': 'right'}).set_table_attributes('border="1"').hide(axis='index').to_html(index=False, escape=False, justify='center')
166
  #print(table)
167
 
168
- tau = np.nanquantile(D_dist_swell,0.5) * (ExtractionTime*3600) / (PolymerVolume/SurfaceArea)**2
169
 
170
  M0_out = SigFigs(np.nanquantile(M0_pred,0.5),6)
171
  tau_out = SigFigs(tau,6)
172
 
173
  return render_template('quantity_report.html', show_properties=show_properties, polymers=polymers, pIndex=pIndex,
174
- area=SurfaceArea, vol=PolymerVolume, units=units, M=M_expt, M0=M0_out, time=ExtractionTime,
175
- solventvol=SolventVolume, solventname=SolventName, swelling=Swelling_percent, K=K_expt, T=T, tau=tau_out,
176
  chemName=chemName, MW=MW, LogP=LogP, rho=rho, mp=mp, iupac=iupac, cas=cas, smiles=smiles, molImage=molImage, table=table,
177
- LogP_origin=LogP_origin, rho_origin=rho_origin, mp_origin=mp_origin, ceramic=is_ceramic, methods=[method,Polymer_Tg,PolymerDensity],
178
- mass=mass, density=PolymerDensity)
179
 
 
3
  import pandas as pd
4
  from flask import render_template, request
5
  from functions import SigFigs, HtmlNumber, Piringer, WilkeChang, CdfPlot
6
+ #from functions import Piecewise, PowerLaw
7
  from qrf_functions import QRF_Apply, QRF_Ceramic
8
  from . import blueprint
9
  from polymers import Polymers, Polymers3
 
36
 
37
  Polymer_Tg = float(request.form['Polymer_Tg']) ## NOTE Tg is provided in C
38
  T = float(request.form['T'])
39
+ Polymer_Tg += 273.15
40
+ T += 273.15
41
  rng = np.random.Generator(np.random.PCG64(seed=12345))
42
+ CHRIS_category = None
43
 
44
  if T<Polymer_Tg:
45
  return render_template('quantity_temperatureError.html')
 
92
  M_expt = float(request.form['amount']) # amount
93
  units = request.form['units']
94
  mass = float(request.form['mass'])
95
+ Polymer_Density = float(request.form['density'])
96
+ Polymer_Volume = mass / Polymer_Density # vol
97
+ Surface_Area = float(request.form['area']) # area
98
+ Solvent_Volume = float(request.form['solventvol'])
99
+ Solvent_Name = request.form['solventname']
100
  Swelling_percent = float(request.form['swelling'])
101
  Swelling_wtfrac = Swelling_percent/100
102
+ Extraction_Time = float(request.form['time'])
103
  K_expt = float(request.form['K'])
104
+ Solvent_MW = Solvent_MWs[Solvent_Name]
105
+ Solute_MW = MW
106
 
107
  polymer = request.form['polymer']
108
  pIndex = np.argmax(polymers == polymer)
109
 
110
+ # QRF is only implemented for 37 and 50 C
111
+ if polymer == 'Other polymer' and round(T) in [310,323]:
112
  use_qrf = True
113
+ else:
114
+ use_qrf = False
115
 
116
  if use_qrf:
117
  method = 'qrf'
118
+ ## XXX fix this so the density of quantiles matches density of distribution??
119
+ #quantiles = list(np.linspace(0,1,101))
120
+ quantiles = list(np.linspace(0.05,0.95,181))
121
  if is_ceramic:
122
+ diff,domain_extrap = QRF_Ceramic(Polymer_Density, Polymer_Tg, quantiles=quantiles, T=T-273.15, worstcase='lo')
123
  else:
124
+ diff,domain_extrap = QRF_Apply(Polymer_Density, Polymer_Tg, smiles, quantiles=quantiles, T=T-273.15)
 
125
  if domain_extrap:
126
  # outside training domain, default to Wilke-Chang
127
+ D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, 'G2', rng, return_DCs=False, N=N_sample)
128
+ M0_pred = get_M_dist(D_dist_swell, M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt=K_expt)
 
129
  method = 'qrf/wc'
130
  else:
131
+ D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, None, rng, return_DCs=False, N=N_sample, input_Ds=diff)
132
+ M0_pred = get_M_dist(D_dist_swell, M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt=K_expt)
 
133
  else:
134
  ## use categories
135
  CHRIS_category = categories[pIndex]
 
138
  ## worst-case for a generic polymer --> G2
139
  CHRIS_flag = 'wc'
140
  CHRIS_category = 'G2'
141
+ D_dist_noswell, D_dist_swell = get_D_dists(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category, rng, return_DCs=False, N=N_sample)
142
+ M0_pred = get_M_dist(D_dist_swell, M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt=K_expt)
 
 
 
 
 
 
 
143
  if CHRIS_flag is None:
144
  method = 'category'
145
  else:
146
  method = 'wc'
147
+ if 1:
148
+ print('Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category')
149
+ print(Swelling_wtfrac, T, Polymer_Tg, Solvent_Name, Solvent_MW, Solute_MW, CHRIS_category)
150
+ print(D_dist_noswell)
151
+ print(np.nanquantile(D_dist_noswell, [0.05,0.5,0.95]))
152
+ print(np.nanquantile(D_dist_swell, [0.05,0.5,0.95]))
153
+ print('M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt')
154
+ print(M_expt, Polymer_Volume, Surface_Area, Solvent_Volume, Extraction_Time*3600, K_expt)
155
+ print(np.nanquantile(M0_pred, [0.05,0.5,0.95]))
156
 
157
  # Generate the rate plot using matplotlib
158
  #pngImageB64String = CdfPlot(M0_pred[~np.isnan(M0_pred)], units=units)
 
171
  table = df_table.style.set_properties(subset=[f'\( M_0 \) ({units})', r'\( M_0 \) (% median)'], **{'text-align': 'right'}).set_table_attributes('border="1"').hide(axis='index').to_html(index=False, escape=False, justify='center')
172
  #print(table)
173
 
174
+ tau = np.nanquantile(D_dist_swell,0.5) * (Extraction_Time*3600) / (Polymer_Volume/Surface_Area)**2
175
 
176
  M0_out = SigFigs(np.nanquantile(M0_pred,0.5),6)
177
  tau_out = SigFigs(tau,6)
178
 
179
  return render_template('quantity_report.html', show_properties=show_properties, polymers=polymers, pIndex=pIndex,
180
+ area=Surface_Area, vol=Polymer_Volume, units=units, M=M_expt, M0=M0_out, time=Extraction_Time,
181
+ solventvol=Solvent_Volume, solventname=Solvent_Name, swelling=Swelling_percent, K=K_expt, T=T, tau=tau_out,
182
  chemName=chemName, MW=MW, LogP=LogP, rho=rho, mp=mp, iupac=iupac, cas=cas, smiles=smiles, molImage=molImage, table=table,
183
+ LogP_origin=LogP_origin, rho_origin=rho_origin, mp_origin=mp_origin, ceramic=is_ceramic, methods=[method,Polymer_Tg,Polymer_Density],
184
+ mass=mass, density=Polymer_Density)
185