File size: 3,473 Bytes
d33329a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pickle

import mordred
import numpy as np
import pandas as pd
import sklearn
import sklearn.impute
import sklearn.metrics
import sklearn.preprocessing
from mordred import Calculator, descriptors
from quantile_forest import RandomForestQuantileRegressor

# Quantile levels; presumably the upper/lower prediction quantiles -- they are
# defined here but not referenced anywhere else in this script.
qhiv = 0.97
qlov = 0.03

# Seed passed as random_state to every forest trained below.
state = 12345

# One model is trained per temperature in QRF_T_list, using the rows whose T
# lies strictly within +/- QRF_T_cut of that temperature.
QRF_T_list = np.array([25, 30, 35, 37, 40, 45, 50, 55, 60, 65, 70, 75])
QRF_T_cut = 2.5

# Measurement table and the precomputed descriptor table.
df_QRF = pd.read_excel('db-D-interp-allT-semiclean.xlsx')
df_desc = pd.read_excel('mordred-descriptors.xlsx')

# Names of every descriptor in the full Mordred descriptor set; used to pick
# the descriptor columns out of df_desc.
calc = mordred.Calculator(mordred.descriptors)
colnames_mordred = list(map(str, calc.descriptors))

# Left join on the solute key keeps every measurement row. A descriptor column
# whose name already exists in df_QRF arrives with a '_dupe' suffix, so the
# original column is never overwritten.
df_QRF = df_QRF.merge(
    df_desc[['Solute_InChIKey', *colnames_mordred]],
    how='left',
    on='Solute_InChIKey',
    suffixes=('', '_dupe'),
)

# Per-temperature descriptor subsets and forest hyper-parameters.
df_params = pd.read_excel('qrf_parameters_allT.xlsx')

def _settings_for_temperature(T_target, df_params):
    """Return ``(sub_desc_list, params)`` for the model trained at *T_target*.

    Temperatures 37 and 50 use the hard-coded, separately tuned settings
    below. Every other temperature is read from the first row of *df_params*
    whose ``T`` column equals *T_target*; there ``sub_desc_list`` is stored as
    a single ``|``-separated string.

    Args:
        T_target: Target temperature of the model being trained.
        df_params: Table with a ``T`` column, a ``sub_desc_list`` column and
            one column per forest hyper-parameter.

    Returns:
        A tuple of the descriptor-name list and the keyword-argument dict for
        ``RandomForestQuantileRegressor``.

    Raises:
        ValueError: If *df_params* has no row for *T_target*.
    """
    if T_target == 37:
        sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'VR2_A', 'ATS0Z', 'AATS5d', 'BCUTv-1h', 'BCUTse-1l', 'Xch-7dv', 'Mp', 'Mi', 'SaasC', 'ETA_epsilon_5', 'fragCpx', 'JGI5', 'JGI8']
        # best from -18-2.py w fout<0.040 (and 0.045)
        params = {'bootstrap': True, 'max_depth': 7, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
        return sub_desc_list, params

    if T_target == 50:
        sub_desc_list = ['MW', 'Polymer_Tg', 'Polymer_Density', 'ATS0m', 'ATSC2dv', 'ATSC6dv', 'ATSC0m', 'ATSC6i', 'BCUTse-1l', 'BCUTp-1h', 'Mp', 'Mi', 'SaasC']
        # best from -19.py and -19-2.py with fout<0.040
        params = {'bootstrap': True, 'max_depth': 6, 'max_features': 0.4, 'max_samples': 1.0, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000}
        return sub_desc_list, params

    rows = df_params.loc[df_params['T'] == T_target]
    if rows.empty:
        # Without this check the .iloc[0] below fails with a bare IndexError
        # that does not say which temperature is missing.
        raise ValueError(f'no parameter row found for T={T_target}')

    sub_desc_list = rows['sub_desc_list'].iloc[0].split('|')
    params = rows[['bootstrap', 'max_depth', 'max_features', 'max_samples', 'min_samples_leaf', 'min_samples_split', 'n_estimators']].iloc[0].to_dict()
    # The forest treats an int max_samples as an absolute row count and a
    # float as a fraction of the training set; force the fractional reading.
    # NOTE(review): presumably the spreadsheet yields an integer 1 -- confirm.
    params['max_samples'] = float(params['max_samples'])
    return sub_desc_list, params


# Train, pickle and summarise one quantile random forest per target temperature.
for T_target in QRF_T_list:
    print(T_target)
    sub_desc_list, params = _settings_for_temperature(T_target, df_params)

    ## read data: rows whose temperature lies strictly inside the window
    mask_T = (df_QRF['T'] > T_target - QRF_T_cut) & (df_QRF['T'] < T_target + QRF_T_cut)
    if not mask_T.any():
        # Fail here with the temperature named instead of deep inside the
        # imputer with a generic "0 samples" message.
        raise ValueError(f'no training rows within {QRF_T_cut} of T={T_target}')
    df_X = df_QRF.loc[mask_T, sub_desc_list]
    df_y = df_QRF.loc[mask_T, 'LogD']

    ## fit transforms: mean-impute missing descriptors, then standardise
    imp = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df_X)
    X_all = imp.transform(df_X)
    y_all = np.array(df_y)
    scaler_X = sklearn.preprocessing.StandardScaler().fit(X_all)
    X_all_scale = scaler_X.transform(X_all)

    reg_all = RandomForestQuantileRegressor(random_state=state, n_jobs=-1, **params)
    reg_all.fit(X_all_scale, y_all)

    # The bundle holds everything needed to reproduce the preprocessing at
    # prediction time, in this fixed order: model, imputer, scaler, columns.
    with open(f'qrf_model_bundle_{T_target}.pkl', 'wb') as f:
        pickle.dump([reg_all, imp, scaler_X, sub_desc_list], f)

    # Summary: settings used, number of training rows, and the mean/std of the
    # predictions on the training data itself.
    print(sub_desc_list)
    print(params)
    print(mask_T.sum())
    y_pred = reg_all.predict(X_all_scale)
    print(y_pred.mean(), y_pred.std())
    print()