import pickle

import mordred
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.metrics
import sklearn.preprocessing
from mordred import Calculator, descriptors
from quantile_forest import RandomForestQuantileRegressor


# Upper / lower quantile levels. They are not referenced in this part of the
# script (presumably prediction-interval bounds used elsewhere -- confirm).
qhiv = 0.97
qlov = 0.03

# Seed handed to every forest as ``random_state``.
state = 12345

# Target temperatures: one model is trained and pickled per entry.
QRF_T_list = np.array([25, 30, 35, 37, 40, 45, 50, 55, 60, 65, 70, 75])

# Half-width of the open temperature window (T_target - cut, T_target + cut)
# used to select the training rows for each target temperature.
QRF_T_cut = 2.5

# Training table; the script reads its 'T', 'LogD' and 'Solute_InChIKey'
# columns plus whichever descriptor columns each model asks for.
df_QRF = pd.read_excel('db-D-interp-allT-semiclean.xlsx')

# Pre-computed descriptor table, keyed by 'Solute_InChIKey'.
df_desc = pd.read_excel('mordred-descriptors.xlsx')

# The calculator is only used to enumerate the descriptor names, i.e. the
# columns to pull out of ``df_desc``.
calc = mordred.Calculator(mordred.descriptors)
colnames_mordred = [str(d) for d in calc.descriptors]

# Left join: every row of ``df_QRF`` is kept. A descriptor column whose name
# already exists in ``df_QRF`` arrives with a '_dupe' suffix, so the original
# column keeps its name.
df_QRF = df_QRF.merge(
    df_desc[['Solute_InChIKey', *colnames_mordred]],
    how='left',
    on='Solute_InChIKey',
    suffixes=('', '_dupe'),
)

# Per-temperature descriptor subsets and forest hyperparameters.
df_params = pd.read_excel('qrf_parameters_allT.xlsx')


# Hyperparameter columns read from ``df_params`` for each target temperature.
_QRF_PARAM_COLUMNS = [
    'bootstrap',
    'max_depth',
    'max_features',
    'max_samples',
    'min_samples_leaf',
    'min_samples_split',
    'n_estimators',
]

# Hard-coded (descriptor subset, hyperparameters) pairs that take precedence
# over the rows of ``df_params`` for the listed target temperatures.
_QRF_MANUAL_CONFIGS = {
    37: (
        ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8'],
        {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000},
    ),
    50: (
        ['MW', 'Polymer_Tg', 'Polymer_Density', 'ATS0m', 'ATSC2dv', 'ATSC6dv', 'ATSC0m', 'ATSC6i', 'BCUTse-1l', 'BCUTp-1h', 'Mp', 'Mi', 'SaasC'],
        {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000},
    ),
}


def _qrf_config_for(T_target):
    """Return ``(sub_desc_list, params)`` for one target temperature.

    Temperatures listed in ``_QRF_MANUAL_CONFIGS`` use the hard-coded entry;
    every other temperature is looked up in the module-level ``df_params``
    table (first row whose 'T' equals ``T_target``).

    Raises:
        ValueError: if ``df_params`` has no row for ``T_target``.
    """
    manual = _QRF_MANUAL_CONFIGS.get(int(T_target))
    if manual is not None:
        desc_names, forest_params = manual
        # Return copies so callers cannot mutate the shared table.
        return list(desc_names), dict(forest_params)

    mask_params = df_params['T'] == T_target
    if not mask_params.any():
        raise ValueError(f'no row with T == {T_target} in df_params')

    # The descriptor subset is stored as a single '|'-separated string.
    desc_names = df_params.loc[mask_params, 'sub_desc_list'].iloc[0].split('|')
    forest_params = df_params.loc[mask_params, _QRF_PARAM_COLUMNS].iloc[0].to_dict()
    # Force a float: scikit-learn forests interpret an integer ``max_samples``
    # as an absolute sample count and a float as a fraction of the data.
    forest_params['max_samples'] = float(forest_params['max_samples'])
    return desc_names, forest_params


# Train one quantile random forest per target temperature and pickle it
# together with the fitted imputer, scaler and descriptor list.
for T_target in QRF_T_list:
    print(T_target)
    sub_desc_list, params = _qrf_config_for(T_target)

    # Training rows: open window (T_target - QRF_T_cut, T_target + QRF_T_cut).
    mask_T = (df_QRF['T'] > T_target - QRF_T_cut) & (df_QRF['T'] < T_target + QRF_T_cut)
    if not mask_T.any():
        # Fail here with a readable message instead of deep inside sklearn.
        raise ValueError(
            f'no rows of df_QRF fall within {QRF_T_cut} of T_target={T_target}'
        )
    df_X = df_QRF.loc[mask_T, sub_desc_list]
    df_y = df_QRF.loc[mask_T, 'LogD']

    # Mean-impute missing descriptor values, then standardise each column.
    # NOTE(review): by default SimpleImputer drops columns that are entirely
    # NaN, which would leave fewer columns than ``sub_desc_list`` -- confirm
    # this cannot happen for the selected descriptors.
    imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df_X)
    X_all = imp.transform(df_X)
    y_all = np.array(df_y)
    scaler_X = sklearn.preprocessing.StandardScaler().fit(X_all)
    X_all_scale = scaler_X.transform(X_all)

    # Fit on every row of the window (no hold-out split in this script).
    reg_all = RandomForestQuantileRegressor(random_state=state, n_jobs=-1, **params)
    reg_all.fit(X_all_scale, y_all)

    # Bundle order is [model, imputer, scaler, descriptor names]; code that
    # loads the pickle has to unpack it in the same order.
    with open(f'qrf_model_bundle_{T_target}.pkl', 'wb') as f:
        pickle.dump([reg_all, imp, scaler_X, sub_desc_list], f)

    # Summary: configuration, number of training rows, and mean / std of the
    # in-sample predictions (predict() is called with its default arguments).
    print(sub_desc_list)
    print(params)
    print(mask_T.sum())
    y_pred = reg_all.predict(X_all_scale)
    print(y_pred.mean(), y_pred.std())
    print()