Spaces:
Sleeping
Sleeping
| import json,pickle | |
| import numbers | |
| import numpy as np | |
| import pandas as pd | |
| import scipy as sp | |
| from scipy.optimize import bisect | |
| import scipy.special | |
| import rdkit | |
| from rdkit.Chem import AllChem as Chem | |
| import chemicals | |
| import mordred | |
| import mordred.descriptors | |
def Piringer(Mw, Ap, T=310.):
    """Semi-empirical Piringer-type estimate of a diffusion coefficient D(Mw).

    Evaluates ``1e4 * exp(Ap - 0.1351*Mw**(2/3) + 0.003*Mw - 10454/T)``
    (model reference: Toxicol. Sci. 2019, 172 (1), 201-212).

    Args:
        Mw: Molecular weight in g/mol.  May be a scalar or an array-like;
            values above 1100 g/mol are clamped to 1100 as a worst case.
        Ap: Polymer-specific model parameter.
        T: Temperature (the 310 default suggests kelvin -- confirm).

    Returns:
        The model value, with the same shape as ``Mw``.
    """
    # np.minimum clamps element-wise, so arrays/Series work as well as the
    # scalars the previous ``if Mw > 1100.`` guard was limited to.
    Mw = np.minimum(Mw, 1100.)
    return 1e4 * np.exp(Ap - 0.1351 * Mw ** (2. / 3.) + 0.003 * Mw - 10454. / T)
def PowerLaw(Mw, A, B):
    """Power-law model for D(Mw).

    Returns ``exp(A + B*ln(Mw))``, i.e. a straight line with intercept ``A``
    and slope ``B`` in log-log space.
    """
    return np.exp(A + np.log(Mw) * B)
def Polymers():
    """Load polymer names and their classes from the bundled TSV table.

    Returns:
        A ``(polymers, categories)`` pair of NumPy arrays built from the
        'Polymer_Name' and 'New Class' columns of
        'data/polymer_names_noglasses.tsv', in file row order.
    """
    table = pd.read_csv('data/polymer_names_noglasses.tsv', sep='\t')
    # A catch-all 'Other polymer' entry (with class None) used to be appended
    # to both arrays; that variant is currently disabled.
    names, classes = (
        np.array(list(table[column])) for column in ('Polymer_Name', 'New Class')
    )
    return names, classes
# Polymer names and their classes (parallel arrays)
polymers, categories = Polymers()

# Solute table and its row count
soluteData = pd.read_excel('data/soluteData.xlsx')
nSolutes = soluteData.shape[0]

## Solvents to include: all semi-polar and non-polar solvents in
## ISO 10993-18:2020 Table D.1 (except DMSO, which is not used in practice)
df_visc = pd.read_excel('data/solventData.xlsx')
solvents = df_visc["Solvent_Name"].to_list()

## Sampling parameters
N_sample = 100_000
# (a module-level seeded generator is no longer created here; the sampling
# functions below build their own np.random.Generator)

## c distribution parameters
T_cut = 20           # temperature window used when selecting c values / neighbouring data rows
MW_cut = 20          # molecular-weight window used for the same selections
use_new = True       # True: single all-temperature datasets; False: separate 37 / 50 datasets
T_cut_new = 0.5      # temperature tolerance when matching rows / parameter sets to a requested T
#### read data files
# CHRIS parameter distributions
if use_new:
    # one file holding the distributions for all temperatures
    with open('data/param_distribution_allT.json', 'r') as fp:
        param_dists = json.load(fp)
else:
    # legacy layout: one file per temperature, keyed by that temperature
    param_dists = {}
    _legacy_param_files = {
        37: 'data/param_distribution_37.json',
        50: 'data/param_distribution_50.json',
    }
    for _temp_key, _param_path in _legacy_param_files.items():
        with open(_param_path, 'r') as fp:
            param_dists[_temp_key] = json.load(fp)
# other stuff
# Per-solute volume descriptors, keyed by InChIKey
df_desc = pd.read_excel(
    'data/data-descriptors-mordred-numconfs51.xlsx',
    usecols=['Solute_InChIKey', 'Vabc', 'VMcGowan'],
)


def _load_clean_db(path):
    """Read one cleaned D database, shift its temperatures by 273.15 and attach volumes.

    The 'T', 'Polymer_Tg' and 'Polymer_Tm' columns are converted to K, then
    the 'Vabc' / 'VMcGowan' columns of ``df_desc`` are left-merged on
    'Solute_InChIKey'.
    """
    frame = pd.read_excel(path)
    # convert all T to K
    for column in ('T', 'Polymer_Tg', 'Polymer_Tm'):
        frame[column] = frame[column] + 273.15
    # add volumes
    return pd.merge(
        frame,
        df_desc[['Solute_InChIKey', 'Vabc', 'VMcGowan']],
        how='left',
        on='Solute_InChIKey',
        suffixes=('', '_dupe'),
    )


## clean data
if use_new:
    df_final = _load_clean_db('data/db-D-interp-allT-clean.xlsx')
else:
    df_final_37 = _load_clean_db('data/db-D-interp-37-clean.xlsx')
    df_final_50 = _load_clean_db('data/db-D-interp-50-clean.xlsx')
#### solvent-specific viscosity
def _first_value_by_solvent(column):
    """Map every solvent name to the first ``column`` value listed for it in df_visc."""
    return {
        name: df_visc.loc[df_visc['Solvent_Name'] == name, column].iloc[0]
        for name in solvents
    }


# add MW
Solvent_MWs = _first_value_by_solvent('MW')
# densities are read from the solvent table (not derived via string2density)
Solvent_Densities = _first_value_by_solvent('density')
Solvent_PIs = _first_value_by_solvent('polarity index')

# linear relation to estimate Vabc when it fails for a molecule:
# degree-1 fit of Vabc against VMcGowan over the rows where Vabc is available
Vabc = df_desc['Vabc']
Vmcg = df_desc['VMcGowan']
m = Vabc.notna()
popt_V = np.polyfit(Vmcg[m], Vabc[m], 1)
# ---- model: Grunberg–Nissan style with polynomial interaction that vanishes at x=0,1 ----
# fitted to data from R. Belda, J. V. Herráez, O. Diez, Rheological study and
# thermodynamic analysis of the binary system (water/ethanol): Influence of
# concentration. Physics and Chemistry of Liquids 42, 467-479 (2004).
# Layout as read by predict_lneta(n_poly=3, interaction_has_T=True):
#   [Aw, Bw, Ae, Be, a0, a1, a2, b0, b1, b2]
popt_etoh = np.array([
    -6.35036532e+00, 1.86507282e+03,                    # Aw, Bw
    -5.30902320e+00, 1.60463200e+03,                    # Ae, Be
    -1.03040657e+01, 3.05646061e+00, -4.93824317e+00,   # a0..a2
    4.16274239e+03, -1.18411097e+03, 1.69557649e+03,    # b0..b2
])
def predict_lneta(p, T, x, n_poly=3, interaction_has_T=True):
    """Log-viscosity of a binary mixture from a Grunberg-Nissan style model.

    ``ln(eta) = x*ln(eta_e) + (1-x)*ln(eta_w) + x*(1-x)*G`` where each pure
    component follows ``A + B/T`` and ``G`` is a polynomial in ``2x - 1``;
    the ``x*(1-x)`` factor makes the interaction term vanish at x = 0 and 1.

    Args:
        p: Parameter vector ``[Aw, Bw, Ae, Be, a_0..a_{n_poly-1}, b_0..]``;
            the ``b`` coefficients are read only when ``interaction_has_T``.
        T: Temperature (divides the ``B`` and ``b`` terms).
        x: Fraction of the "e" component in [0, 1] (weight fraction in the
            fit this file uses); scalar or 1-D array.
        n_poly: Number of polynomial terms in the interaction.
        interaction_has_T: Add the ``(b . Phi) / T`` contribution to ``G``.

    Returns:
        ``ln(eta)``; for a scalar ``x`` the result has shape ``(1,)``.
    """
    Aw, Bw, Ae, Be = p[:4]
    ln_pure_w = Aw + Bw / T
    ln_pure_e = Ae + Be / T

    # map the fraction from [0, 1] onto [-1, 1] and build the power basis,
    # one row per polynomial order: shape (n_poly, N)
    centred = 2.0 * x - 1.0
    basis = np.vstack([centred ** order for order in range(n_poly)])

    interaction = p[4:4 + n_poly] @ basis
    if interaction_has_T:
        interaction = interaction + (p[4 + n_poly:4 + 2 * n_poly] @ basis) / T

    return x * ln_pure_e + (1 - x) * ln_pure_w + x * (1 - x) * interaction
def get_WC(T, solv, V):
    """Wilke-Chang style diffusion estimate of a solute in a given solvent.

    The solvent's viscosity is evaluated from the correlation named in the
    'Equation' column of its ``df_visc`` row (coefficients in columns
    'A'..'E'), then
    ``D_WC = 7.4e-8 * sqrt(MW * WC_assoc_param) * T / eta / V**0.6``.

    Args:
        T: Temperature (scalar or array/Series); the correlations reference
            298.15, so kelvin is presumably expected -- confirm.
        solv: Solvent name; the first ``df_visc`` row whose 'Solvent_Name'
            matches is used (IndexError if there is none).
        V: Solute volume entering the ``V**0.6`` term.

    Returns:
        ``(D_WC, eta, MW)`` -- the diffusion estimate, the viscosity and the
        solvent molecular weight.  An unrecognised 'Equation' value yields
        ``eta = nan`` (and therefore ``D_WC = nan``) instead of raising.
    """
    params = df_visc[df_visc['Solvent_Name'] == solv].iloc[0]
    equation = params['Equation']
    if equation == '10^A(1/T-1/B)':
        eta = 10 ** (params['A'] * (1 / T - 1 / params['B']))
    elif equation == 'A*exp(B/T)':
        eta = params['A'] * np.exp(params['B'] / T)
    elif equation == 'E*exp(A+B/(T/298.15)+C/(T/298.15)^2+D/(T/298.15)^3)':
        Tr = T / 298.15
        eta = params['E'] * np.exp(params['A'] + params['B'] / Tr + params['C'] / Tr ** 2 + params['D'] / Tr ** 3)
    elif equation == 'A*exp(-0.01*B*(T-298.15))':
        eta = params['A'] * np.exp(-0.01 * params['B'] * (T - 298.15))
    elif equation == 'A+BT/1+CT+DT^2':
        eta = (params['A'] + params['B'] * T) / (1 + params['C'] * T + params['D'] * T ** 2)
    elif equation == 'A+B/T+C/T^2+D/T^3':
        eta = params['A'] + params['B'] / T + params['C'] / T ** 2 + params['D'] / T ** 3
    elif equation == 'A*298.15/T':
        eta = params['A'] * 298.15 / T
    elif equation == 'A*T+B':
        # fixed: the intercept is B (the previous code added A a second time)
        eta = params['A'] * T + params['B']
    elif equation == 'fitted_EtOH':
        # assuming 50% is by volume --> by mass for consistency with fitted model
        # (0.7898 and 1.000 act as the two component densities in that conversion)
        x_mass = 0.5 * 0.7898 / (0.5 * 0.7898 + 0.5 * 1.000)
        eta = np.exp(predict_lneta(popt_etoh, T, x_mass, n_poly=3, interaction_has_T=True))
    else:
        eta = np.nan
    D_WC = 7.4e-8 * (params['MW'] * params['WC_assoc_param']) ** 0.5 * (T) / eta / V ** 0.6
    return D_WC, eta, params['MW']
#### add Wilke-Chang
# The same steps are applied to every loaded table: the legacy layout has a
# 50 C and a 37 C table (processed in that order), the new layout has one.
_wc_frames = (df_final,) if use_new else (df_final_50, df_final_37)
for _frame in _wc_frames:
    # estimate Vabc for those with nan values (linear fit on VMcGowan)
    m = _frame['Vabc'].isna()
    v = np.polyval(popt_V, _frame['VMcGowan'][m])
    _frame.loc[m, 'Vabc'] = v

    T = _frame['T']
    V = _frame['Vabc']
    # one viscosity / Wilke-Chang D / solvent-MW column per solvent
    for solv in solvents:
        D_WC, eta, MW_solvent = get_WC(T, solv, V)
        _frame[f'eta_{solv}'] = eta
        _frame[f'D_WC_{solv}'] = D_WC
        _frame[f'MW_solvent_{solv}'] = MW_solvent
def get_V(smiles):
    """Return the mordred VdwVolumeABC descriptor (Vabc) for a SMILES string.

    When the descriptor value is not a number, Vabc is instead estimated
    from the McGowan volume through the linear fit ``popt_V``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    calculator = mordred.Calculator(
        [mordred.descriptors.VdwVolumeABC, mordred.descriptors.McGowanVolume]
    )
    vdw_abc, mcgowan = list(calculator(molecule).values())
    if isinstance(vdw_abc, numbers.Number):
        return vdw_abc
    # descriptor failed for this molecule: fall back to the fitted relation
    return np.polyval(popt_V, mcgowan)
#### Vrentas-Duda setup
# Solute and polymer parameter sheets, with exact duplicate rows removed
df_vd_solv = pd.read_excel('data/vrentas-duda-params.xlsx', sheet_name='Solutes')
df_vd_solv.drop_duplicates(keep='first', inplace=True, ignore_index=True)  # drop exact duplicates
df_vd_poly = pd.read_excel('data/vrentas-duda-params.xlsx', sheet_name='Polymers')
df_vd_poly.drop_duplicates(keep='first', inplace=True, ignore_index=True)  # drop exact duplicates

# Attach Tg / Tm and class labels to the polymer rows; polymers without a
# class get the literal label 'none'
df_props = pd.read_excel('data/db-polymer-properties-and-categories.xlsx')
df_vd_poly = pd.merge(df_vd_poly, df_props[['Polymer_Name', 'Polymer_Tg', 'Polymer_Tm', 'CHRIS Class', 'New Class']], how='left', on='Polymer_Name')
df_vd_poly['New Class'] = df_vd_poly['New Class'].fillna('none')
df_vd_poly['CHRIS Class'] = df_vd_poly['CHRIS Class'].fillna('none')

## Calculate c
# For every temperature on the grid and every (solute row, polymer row) pair,
# c = K11*(K21-Tg1 + T) / (K12*(K22-Tg2 + T)).
dfs_vd_allT = []
for T in np.arange(100, 800, 20):
    fV_polyT = (df_vd_poly['K12'] * (df_vd_poly['K22-Tg2'] + T))
    # (an optional lower bound of 0.025 on fV_polyT is currently disabled)
    # Iterate names in first-appearance order.  Iterating a set() of strings
    # would make the row order of df_vd_allT depend on hash randomization,
    # and the fixed-seed sampling in get_D_Extract picks rows by position.
    for solvname in df_vd_solv['Solute_Name'].unique():
        df_sol = df_vd_solv[df_vd_solv['Solute_Name'] == solvname]
        for _, row in df_sol.iterrows():
            fV_sol = (row['K11'] * (row['K21-Tg1'] + T))
            c_sol = fV_sol / fV_polyT
            # replicate the solute row once per polymer and place it next to
            # the polymer table so each output row describes one pair
            df_vd_allT = pd.concat([row] * len(df_vd_poly), axis=1, ignore_index=True).T
            df_vd_allT = pd.concat([df_vd_allT, df_vd_poly], axis=1)
            df_vd_allT['c'] = c_sol
            df_vd_allT['T'] = T
            dfs_vd_allT.append(df_vd_allT)
df_vd_allT = pd.concat(dfs_vd_allT, ignore_index=True)
df_vd_allT['T-Tg'] = df_vd_allT['T'] - df_vd_allT['Tg2']
df_vd_allT['T-Tg1'] = df_vd_allT['T'] - df_vd_allT['Tg1']
def get_c_dist(T, Tg, MW):
    """Collect positive c values from df_vd_allT near a given T-Tg and molecular weight.

    Rows are kept when 'c' is present, 'T-Tg' lies within ``T_cut`` of
    ``max(T_cut, T - Tg)``, and 'M1' lies within ``MW_cut`` of ``MW``.

    Returns:
        NumPy array of the selected c values (NaN and non-positive dropped).
    """
    target_dT = max(T_cut, T - Tg)
    has_c = ~pd.isna(df_vd_allT['c'])
    close_dT = np.abs(df_vd_allT['T-Tg'] - target_dT) < T_cut
    close_MW = np.abs(df_vd_allT['M1'] - MW) < MW_cut

    values = np.array(df_vd_allT.loc[has_c & close_dT & close_MW, 'c'])
    values = values[~np.isnan(values)]
    return values[values > 0]
def get_c_dist_cat(T, CHRIS_category, MW):
    """Collect positive c values from df_vd_allT for one polymer class.

    Rows are kept when 'c' is present, 'T-Tg' is positive, 'T' lies within
    ``T_cut`` of ``T``, 'M1' lies within ``MW_cut`` of ``MW``, and
    'New Class' equals ``CHRIS_category``.

    Returns:
        NumPy array of the selected c values (NaN and non-positive dropped).
    """
    conditions = [
        ~pd.isna(df_vd_allT['c']),
        df_vd_allT['T-Tg'] > 0,
        np.abs(df_vd_allT['T'] - T) < T_cut,
        np.abs(df_vd_allT['M1'] - MW) < MW_cut,
        df_vd_allT['New Class'] == CHRIS_category,
    ]
    selected = conditions[0]
    for condition in conditions[1:]:
        selected = selected & condition

    values = np.array(df_vd_allT.loc[selected, 'c'])
    values = values[~np.isnan(values)]
    return values[values > 0]
def _sample_or_explain(rng, values, size, what):
    """Draw ``size`` samples from ``values``; on ValueError, re-raise naming ``what``."""
    try:
        return rng.choice(values, size)
    except ValueError as err:
        raise ValueError(
            f'cannot sample {what} ({np.size(values)} candidate value(s)): {err}'
        ) from err


def get_D_Extract(w,T,Polymer_Tg,Solvent_Name,Solvent_MW,Solute_MW,Solute_Vabc,CHRIS_category,N=10000,return_DCs=False,input_Ds=None):
    """Sample a distribution of diffusion coefficients in a solvent-swollen polymer.

    For each of ``N`` draws the unswollen value ``D0`` is scaled as
    ``ln(D/D0) = c*w / (1 + (c-1)*w) * ln(D_WC / D_ref)``, where ``c`` is
    sampled from the Vrentas-Duda table, ``D_WC`` is the Wilke-Chang
    estimate in the solvent, and ``D_ref`` is a measured D (when
    ``Solute_Vabc`` is None) or ``D0`` itself (otherwise).

    Args:
        w: Solvent fraction entering the swelling formula (presumably the
            solvent uptake of the polymer -- confirm units with callers).
        T: Temperature, compared against 'T' in ``df_final`` and against
            ``Polymer_Tg``.
        Polymer_Tg: Polymer glass-transition temperature (same scale as T).
        Solvent_Name: Solvent name; selects the ``D_WC_<name>`` column or
            the ``get_WC`` correlation.
        Solvent_MW: Solvent molecular weight, used to select c values.
        Solute_MW: Solute molecular weight.
        Solute_Vabc: Solute volume for ``get_WC``; ``None`` switches to
            sampling measured (D, D_WC) pairs from ``df_final``.
        CHRIS_category: Polymer class; with '_hi' / '_lo' appended it keys
            ``param_dists``.
        N: Number of samples.
        return_DCs: Also return the unswollen distribution and the sampled
            inputs.
        input_Ds: Optional pre-computed unswollen D values to sample from
            instead of the ``param_dists`` models.

    Returns:
        ``D_dist_swell`` (array of N values), or when ``return_DCs`` is true
        ``(D_dist_noswell, D_dist_swell, (c, Ds, DWCs, DCs))``.  ``Ds`` and
        ``DCs`` are ``None`` when ``Solute_Vabc`` was given, because no
        measured rows are sampled on that path.

    Raises:
        ValueError: If there is nothing to sample from (no matching c values
            or no D parameter sets for this T / category).
    """
    df_final_T = df_final.loc[np.abs(df_final['T']-T)<T_cut_new]

    # c distribution: class-specific above Tg, falling back to the
    # (T-Tg)-matched distribution when the class has no entries
    if (T <= Polymer_Tg) or (input_Ds is not None):
        cs = get_c_dist(T,Polymer_Tg,Solvent_MW)
    else:
        cs = get_c_dist_cat(T,CHRIS_category,Solvent_MW)
        if not len(cs):
            cs = get_c_dist(T,Polymer_Tg,Solvent_MW)
    # A fresh fixed-seed generator per call; the order of the draws below
    # (c, then measured pairs, then D0) determines the sampled values.
    rng = np.random.Generator(np.random.PCG64(seed=12345))
    c = _sample_or_explain(rng, cs, N, 'c')

    # Only the measured-data path fills these in; predefining them keeps the
    # return_DCs=True return valid when Solute_Vabc is supplied (previously
    # an UnboundLocalError).
    Ds = None
    DCs = None
    if Solute_Vabc is None:
        # NOTE(review): a solute with MW == 50 is grouped with MW > 50 here
        # but uses the '_lo' parameter set below -- confirm the intended side.
        if Solute_MW < 50:
            m50 = df_final_T['MW']<=50
        else:
            m50 = df_final_T['MW']>50
        ## within cutoffs, with at least N closest (by sorting, separating at MW = 50)
        m2 = (np.abs(df_final_T['Polymer_Tg']-Polymer_Tg)<T_cut) & (np.abs(df_final_T['MW']-Solute_MW)<MW_cut) & m50
        if m2.sum()<25:
            vT = df_final_T.loc[m50,'Polymer_Tg']-Polymer_Tg
            vM = df_final_T.loc[m50,'MW']-Solute_MW
            # rows sorted by |dTg| then |dMW|; positions 1..25 are taken
            # NOTE(review): position 0 (the closest row) is skipped -- confirm intent
            m3 = pd.concat([np.abs(vT), np.abs(vM)], axis=1).sort_values(by=['Polymer_Tg', 'MW']).index[1:26]
            m2 = list(set(m2.index[m2]).union(set(m3)))
        # sample whole columns of the stacked array so D and D_WC (and
        # D_CHRIS_q50) of one draw come from the same data row
        if return_DCs:
            Ds,DWCs,DCs = rng.choice([df_final_T.loc[m2,'D'], df_final_T.loc[m2,f'D_WC_{Solvent_Name}'], df_final_T.loc[m2,'D_CHRIS_q50']], N, axis=1)
        else:
            Ds,DWCs = rng.choice([df_final_T.loc[m2,'D'], df_final_T.loc[m2,f'D_WC_{Solvent_Name}']], N, axis=1)
    else:
        DWCs, eta, MW_solvent = get_WC(T, Solvent_Name, Solute_Vabc)

    ## distribution of D_CHRIS
    if input_Ds is None:
        if Solute_MW > 50:
            subkey = f'{CHRIS_category}_hi'
        else:
            subkey = f'{CHRIS_category}_lo'
        # parameter sets whose temperature key (Celsius) is within T_cut_new of T
        allparams = [param_dists[Ti][subkey] for Ti in param_dists if T+T_cut_new >= int(Ti)+273.15 >= T-T_cut_new]
        D_list = []
        for params in allparams:
            if params[0] == 'pir':
                # ['pir', Ap_1, Ap_2, ...]
                A_list = params[1:]
                D_list += [Piringer(Solute_MW, Ai, T) for Ai in A_list]
            else:
                # [<tag>, B, A_1, A_2, ...] with a shared slope B
                Ball = params[1]
                A_list = params[2:]
                D_list += [PowerLaw(Solute_MW, Ai, Ball) for Ai in A_list]
    else:
        D_list = input_Ds
    D_dist_noswell = _sample_or_explain(rng, D_list, N, 'unswollen D')

    if Solute_Vabc is None:
        lnD_D0 = c*w/(1+(c-1)*w) * np.log(DWCs/Ds)
    else:
        lnD_D0 = c*w/(1+(c-1)*w) * np.log(DWCs/D_dist_noswell)
    D_dist_swell = np.exp(np.log(D_dist_noswell)+lnD_D0)
    if return_DCs:
        return D_dist_noswell, D_dist_swell, (c, Ds, DWCs, DCs)
    else:
        return D_dist_swell
def get_D_CHRIS(Solute_MW,CHRIS_category,N=10000):
    """Sample N unswollen diffusion coefficients at T = 310.15 for a polymer class.

    Every parameter set in ``param_dists`` whose temperature key lies within
    ``T_cut_new`` of 310.15 (keys are shifted by 273.15 before comparing)
    contributes one D per stored ``A`` value, through ``Piringer`` for 'pir'
    sets and ``PowerLaw`` otherwise.  The pooled values are then sampled with
    a fixed-seed generator.
    """
    T = 310.15
    rng = np.random.Generator(np.random.PCG64(seed=12345))

    # '_hi' parameter sets apply above MW 50, '_lo' otherwise
    suffix = 'hi' if Solute_MW > 50 else 'lo'
    subkey = f'{CHRIS_category}_{suffix}'

    matching = [
        param_dists[Ti][subkey]
        for Ti in param_dists
        if T + T_cut_new >= int(Ti) + 273.15 >= T - T_cut_new
    ]

    D_list = []
    for params in matching:
        model = params[0]
        if model == 'pir':
            # ['pir', Ap_1, Ap_2, ...]
            D_list.extend(Piringer(Solute_MW, Ai, T) for Ai in params[1:])
        else:
            # [<tag>, B, A_1, A_2, ...] with a shared slope B
            slope = params[1]
            D_list.extend(PowerLaw(Solute_MW, Ai, slope) for Ai in params[2:])
    return rng.choice(D_list, N)