| | import os |
| | import numpy as np |
| | import pandas as pd |
| | import mlflow |
| | import shutil |
| | import model_h |
| |
|
| | |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.ensemble import RandomForestClassifier |
| | from imblearn.ensemble import BalancedRandomForestClassifier |
| | import xgboost as xgb |
| | from skopt import BayesSearchCV |
| | from skopt.space import Real, Integer |
| |
|
| | |
| | |
| | |
| | model_only_hosp = False |
| | if model_only_hosp is True: |
| | file_suffix = "_only_hosp" |
| | else: |
| | file_suffix = "_hosp_comm" |
| |
|
| | |
| | |
| | |
| | |
| | fold_patients = np.load( |
| | './data/cohort_info/fold_patients' + file_suffix + '.npy', allow_pickle=True) |
| |
|
| | |
| | train_data_imp = model_h.load_data_for_modelling( |
| | './data/model_data/train_data_cv_imp' + file_suffix + '.pkl') |
| |
|
| | |
| | train_data_no_imp = model_h.load_data_for_modelling( |
| | './data/model_data/train_data_cv_no_imp' + file_suffix + '.pkl') |
| |
|
| | |
| | test_data_imp = model_h.load_data_for_modelling( |
| | './data/model_data/test_data_imp' + file_suffix + '.pkl') |
| | |
| | test_data_no_imp = model_h.load_data_for_modelling( |
| | './data/model_data/test_data_no_imp' + file_suffix + '.pkl') |
| |
|
| | |
| | |
| | cross_val_fold_indices = [] |
| | for fold in fold_patients: |
| | fold_val_ids = train_data_no_imp[train_data_no_imp.StudyId.isin(fold)] |
| | fold_train_ids = train_data_no_imp[~( |
| | train_data_no_imp.StudyId.isin(fold_val_ids.StudyId))] |
| |
|
| | |
| | fold_val_index = fold_val_ids.index |
| | fold_train_index = fold_train_ids.index |
| |
|
| | |
| | cross_val_fold_indices.append((fold_train_index, fold_val_index)) |
| |
|
| | |
| | cols_to_drop = ['StudyId', 'ExacWithin3Months', 'IndexDate'] |
| | features_list = [col for col in train_data_no_imp.columns if col not in cols_to_drop] |
| |
|
| | |
| | |
| | train_features_no_imp = train_data_no_imp[features_list].astype('float') |
| | train_target_no_imp = train_data_no_imp.ExacWithin3Months.astype('float') |
| | |
| | train_features_imp = train_data_imp[features_list].astype('float') |
| | train_target_imp = train_data_imp.ExacWithin3Months.astype('float') |
| |
|
| | |
| | |
| | test_features_no_imp = test_data_no_imp[features_list].astype('float') |
| | test_target_no_imp = test_data_no_imp.ExacWithin3Months.astype('float') |
| | |
| | test_features_imp = test_data_imp[features_list].astype('float') |
| | test_target_imp = test_data_imp.ExacWithin3Months.astype('float') |
| |
|
| | |
| | |
| | if not train_target_no_imp.equals(train_target_imp): |
| | raise ValueError( |
| | 'Target variable is not the same in imputed and non imputed datasets in the train set.') |
| | if not test_target_no_imp.equals(test_target_imp): |
| | raise ValueError( |
| | 'Target variable is not the same in imputed and non imputed datasets in the test set.') |
| | train_target = train_target_no_imp |
| | test_target = test_target_no_imp |
| |
|
| | |
| | for features in [train_features_no_imp, train_features_imp, |
| | test_features_no_imp, test_features_imp]: |
| | for col in features: |
| | features[col] = pd.to_numeric(features[col], errors='coerce') |
| |
|
| | |
| | |
| | |
| | |
| | mlflow.set_tracking_uri("sqlite:///mlruns.db") |
| | mlflow.set_experiment('model_h_drop_1' + file_suffix) |
| |
|
| | |
| | scoring_methods = ['average_precision'] |
| | scale_pos_weight = train_target.value_counts()[0] / train_target.value_counts()[1] |
| |
|
| | |
| | |
| | models = [] |
| | |
| | if model_only_hosp is True: |
| | |
| | models.append((LogisticRegression(), |
| | 'logistic_regression', 'imputed', 'linear')) |
| | |
| | models.append((BalancedRandomForestClassifier(), |
| | 'balanced_random_forest', 'imputed', 'tree')) |
| | |
| | models.append((xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), |
| | 'xgb', 'not_imputed', 'tree')) |
| | if model_only_hosp is False: |
| | |
| | models.append((LogisticRegression(), |
| | 'logistic_regression', 'imputed', 'linear')) |
| | |
| | models.append((RandomForestClassifier(), |
| | 'random_forest', 'imputed', 'tree')) |
| | |
| | models.append((BalancedRandomForestClassifier(), |
| | 'balanced_random_forest', 'imputed', 'tree')) |
| | |
| | models.append((xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), |
| | 'xgb', 'not_imputed', 'tree')) |
| |
|
| | |
| | log_reg_search_spaces = {'penalty': ['l2', None], |
| | 'class_weight': ['balanced', None], |
| | 'max_iter': Integer(50, 300), |
| | 'C': Real(0.001, 10), |
| | } |
| | rf_search_spaces = {'max_depth': Integer(4, 10), |
| | 'n_estimators': Integer(70, 850), |
| | 'min_samples_split': Integer(2, 10), |
| | 'class_weight': ['balanced', None], |
| | } |
| | xgb_search_spaces = {'max_depth': Integer(4, 10), |
| | 'n_estimators': Integer(70, 850), |
| | 'subsample': Real(0.55, 0.95), |
| | 'colsample_bytree': Real(0.55, 0.95), |
| | 'learning_rate': Real(0.05, 0.14), |
| | 'scale_pos_weight': Real(1, scale_pos_weight), |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | with mlflow.start_run(run_name='hyperparameter_tuning_2023_tot_length'): |
| | for scoring_method in scoring_methods: |
| | for model in models: |
| | with mlflow.start_run(run_name=model[1], nested=True): |
| | print(model[1]) |
| | |
| | artifact_dir = './tmp' |
| | os.makedirs(artifact_dir, exist_ok=True) |
| | |
| | shutil.rmtree(artifact_dir) |
| |
|
| | |
| | if (model[1] == 'balanced_random_forest') | (model[1] == 'random_forest'): |
| | opt = BayesSearchCV(model[0], |
| | search_spaces= rf_search_spaces, |
| | n_iter=200, |
| | random_state=0, |
| | cv=cross_val_fold_indices, |
| | scoring=scoring_method, |
| | ) |
| | |
| | np.int = int |
| | opt.fit(train_features_imp, train_target) |
| |
|
| | if model[1] == 'logistic_regression': |
| | opt = BayesSearchCV(model[0], |
| | search_spaces= log_reg_search_spaces, |
| | n_iter=200, |
| | random_state=0, |
| | cv=cross_val_fold_indices, |
| | scoring=scoring_method, |
| | ) |
| | np.int = int |
| | opt.fit(train_features_imp, train_target) |
| |
|
| | if model[1] == 'xgb': |
| | opt = BayesSearchCV(model[0], |
| | search_spaces= xgb_search_spaces, |
| | n_iter=200, |
| | random_state=0, |
| | cv=cross_val_fold_indices, |
| | scoring=scoring_method, |
| | ) |
| | np.int = int |
| | opt.fit(train_features_no_imp, train_target) |
| |
|
| | |
| | print(opt.best_params_) |
| | print(opt.best_score_) |
| |
|
| | |
| | mlflow.log_param('opt_scorer', scoring_method) |
| | mlflow.log_params(opt.best_params_) |
| | mlflow.log_metric("opt_best_score", opt.best_score_) |
| | mlflow.end_run() |
| |
|