# Script: for each temperature in QRF_T_list, pick the descriptor subset and
# forest hyper-parameters to use, then start selecting the rows of the
# measurement table that lie near that temperature.
import pickle
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.metrics
from quantile_forest import RandomForestQuantileRegressor
import mordred
from mordred import Calculator, descriptors

#T_target = 37
#T_cut = 2.5
# presumably the upper/lower quantile levels requested from the quantile
# forest -- not used in the visible part of the script; confirm downstream.
qhiv, qlov = 0.97, 0.03
# presumably a random seed for reproducibility -- confirm where it is consumed.
state = 12345

# Target temperatures the loop below iterates over (units not stated here).
QRF_T_list = np.array([25,30,35,37,40,45,50,55,60,65,70,75])
# Offset subtracted from T_target to form the lower bound of the row mask below.
QRF_T_cut = 2.5

# Measurement table and precomputed descriptor table, both read from Excel.
df_QRF = pd.read_excel('db-D-interp-allT-semiclean.xlsx')
df_desc = pd.read_excel('mordred-descriptors.xlsx')

# The calculator is built only to enumerate descriptor names; nothing is
# computed with it in the visible code.
calc = mordred.Calculator(mordred.descriptors)
colnames_mordred = [str(d) for d in calc.descriptors]

# Left join: every row of df_QRF is kept and gains the descriptor columns of
# the matching Solute_InChIKey. Columns that already exist in df_QRF keep
# their name; the clashing column from df_desc gets the '_dupe' suffix.
df_QRF = pd.merge(df_QRF, df_desc[['Solute_InChIKey',*colnames_mordred]], how='left', on='Solute_InChIKey', suffixes=('', '_dupe'))

# Per-temperature descriptor lists and hyper-parameters for the temperatures
# that are not hard-coded inside the loop.
df_params = pd.read_excel('qrf_parameters_allT.xlsx')

for T_target in QRF_T_list:
    print(T_target)
    # Always-true guard: 37 and 50 use hard-coded settings, every other
    # temperature is looked up in df_params.
    if 1:
        if T_target == 37:
            sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
            params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)
        elif T_target == 50:
            sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'ATS0m', 'ATSC2dv', 'ATSC6dv', 'ATSC0m', 'ATSC6i', 'BCUTse-1l', 'BCUTp-1h', 'Mp', 'Mi', 'SaasC']
            params = {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000} # best from -19.py and -19-2.py with fout<0.040
        else:
            # Use the first df_params row whose 'T' equals this temperature.
            mask_T = df_params['T']==T_target
            # The descriptor names are stored in one cell, separated by '|'.
            sub_desc_list = df_params.loc[mask_T, 'sub_desc_list'].iloc[0].split('|')
            params = df_params.loc[mask_T, ['bootstrap', 'max_depth', 'max_features', 'max_samples', 'min_samples_leaf', 'min_samples_split', 'n_estimators']].iloc[0].to_dict()
            # presumably forces a value read back as the integer 1 to be
            # treated as a fraction of the samples rather than a sample
            # count -- confirm against the estimator that receives params.
            params['max_samples'] = float(params['max_samples'])
    # Disabled override: if enabled it would apply the T_target == 37
    # descriptor list and hyper-parameters to every temperature.
    if 0:
        sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density',
                         'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
        params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000} # best from -18-2.py w fout<0.040 (and 0.045)

    ## read data
    #df_X = pd.read_excel('qrf_x.xlsx')
    #df_y = pd.read_excel('qrf_y.xlsx')
    # Row mask on df_QRF (this rebinds mask_T, which the else-branch above
    # used for df_params): the first condition keeps rows whose 'T' is
    # strictly greater than T_target - QRF_T_cut; the second condition
    # continues past this point.
    mask_T = (df_QRF['T']>T_target-QRF_T_cut) & (df_QRF['T']