import pickle
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.metrics
import sklearn.preprocessing
from quantile_forest import RandomForestQuantileRegressor
import mordred
from mordred import Calculator, descriptors
#T_target = 37
#T_cut = 2.5
qhiv, qlov = 0.97, 0.03
state = 12345
QRF_T_list = np.array([25,30,35,37,40,45,50,55,60,65,70,75])
QRF_T_cut = 2.5
df_QRF = pd.read_excel('db-D-interp-allT-semiclean.xlsx')
df_desc = pd.read_excel('mordred-descriptors.xlsx')
calc = mordred.Calculator(mordred.descriptors)
colnames_mordred = [str(d) for d in calc.descriptors]
df_QRF = pd.merge(df_QRF, df_desc[['Solute_InChIKey',*colnames_mordred]], how='left', on='Solute_InChIKey', suffixes=('', '_dupe'))
df_params = pd.read_excel('qrf_parameters_allT.xlsx')
for T_target in QRF_T_list:
print(T_target)
if 1:
if T_target == 37:
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
elif T_target == 50:
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'ATS0m', 'ATSC2dv', 'ATSC6dv', 'ATSC0m', 'ATSC6i', 'BCUTse-1l', 'BCUTp-1h', 'Mp', 'Mi', 'SaasC']
params = {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000} # best from -19.py and -19-2.py with fout<0.040
else:
mask_T = df_params['T']==T_target
sub_desc_list = df_params.loc[mask_T, 'sub_desc_list'].iloc[0].split('|')
params = df_params.loc[mask_T, ['bootstrap', 'max_depth', 'max_features', 'max_samples', 'min_samples_leaf', 'min_samples_split', 'n_estimators']].iloc[0].to_dict()
params['max_samples'] = float(params['max_samples'])
if 0:
sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
## read data
#df_X = pd.read_excel('qrf_x.xlsx')
#df_y = pd.read_excel('qrf_y.xlsx')
mask_T = (df_QRF['T']>T_target-QRF_T_cut) & (df_QRF['T']<T_target+QRF_T_cut)
df_X = df_QRF.loc[mask_T, sub_desc_list]
df_y = df_QRF.loc[mask_T, 'LogD']
#sub_desc_list = list(df_X.columns)
## fit transforms
imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df_X)
X_all = imp.transform(df_X)
y_all = np.array(df_y)
scaler_X = sklearn.preprocessing.StandardScaler().fit(X_all)
X_all_scale = scaler_X.transform(X_all)
reg_all = RandomForestQuantileRegressor(random_state=state, n_jobs=-1, **params)
reg_all.fit(X_all_scale,y_all)
with open(f'qrf_model_bundle_{T_target}.pkl','wb') as f:
pickle.dump([reg_all,imp,scaler_X,sub_desc_list],f)
print(sub_desc_list)
print(params)
print(mask_T.sum())
y_pred = reg_all.predict(X_all_scale)
print(y_pred.mean(),y_pred.std())
print()
|