| | """Perform cross validation using a variety of algorithms.""" |
| | import os |
| | import pandas as pd |
| | import numpy as np |
| |
|
| | from lenusml import splits, plots |
| |
|
| | |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.model_selection import cross_validate, cross_val_predict |
| | from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier |
| | from interpret.glassbox import ExplainableBoostingClassifier |
| | import lightgbm as lgb |
| | import xgboost as xgb |
| | import mlflow |
| |
|
| |
|
# Locations of the model inputs, the cohort metadata and the model outputs.
data_dir = '../data/models/model1/'
cohort_info_dir = '../data/cohort_info/'
output_dir = '../data/models/model1/output'

# Read the saved fold/patient array and the pickled training frame.
# NOTE(review): both loaders unpickle their input (allow_pickle=True and
# read_pickle), so these files must come from a trusted source.
fold_patients_path = os.path.join(cohort_info_dir, 'fold_patients.npy')
fold_patients = np.load(fold_patients_path, allow_pickle=True)
train_data_path = os.path.join(data_dir, 'train_data_cv.pkl')
train_data = pd.read_pickle(train_data_path)

# Build the custom cross-validation folds, identifying rows by 'StudyId'.
cross_validation_fold_indices = splits.custom_cv_fold_indices(
    fold_patients=fold_patients,
    train_data=train_data,
    id_column='StudyId',
)

# Every column except the identifier and the label is used as a feature.
cols_to_drop = ['StudyId', 'IsExac']
features_list = [name for name in train_data.columns if name not in cols_to_drop]

# Cast the model inputs and the label to float.
features = train_data[features_list].astype('float')
target = train_data['IsExac'].astype('float')

# Number of label-0 rows divided by the number of label-1 rows.
class_counts = target.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
| |
|
# Record runs in a local SQLite-backed MLflow store, under one experiment.
mlflow.set_tracking_uri("sqlite:///mlruns.sqlite")
mlflow.set_experiment('model_drop2')

# Metric names handed to cross_validate as `scoring`; the mean of each one
# across folds is logged to MLflow for every model.
scoring = ['f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall', 'roc_auc',
           'average_precision', 'neg_brier_score']

# scale_pos_weight is already derived from `target` earlier in the script and
# `target` has not changed since, so it is not recomputed here.
| |
|
# Candidate classifiers, each paired with the name used for its MLflow run.
# Every estimator is seeded with random_state=0.
models = [
    (RandomForestClassifier(random_state=0), 'random_forest'),
    (RandomForestClassifier(random_state=0, class_weight='balanced'),
     'random_forest_class_weight'),
    (BalancedBaggingClassifier(random_state=0), 'balanced_bagging'),
    (BalancedRandomForestClassifier(random_state=0), 'balanced_random_forest'),
    (xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss'),
     'xgb'),
    (lgb.LGBMClassifier(random_state=0), 'lgbm'),
    # Same XGBoost settings as 'xgb', plus the class-ratio weight.
    (xgb.XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss',
                       scale_pos_weight=scale_pos_weight), 'xgb_spw'),
    (ExplainableBoostingClassifier(random_state=0), 'ebm'),
]
| |
|
# Scratch directory that receives the plots of the model being evaluated.
# It is created once here because the loop below only empties it.
artifact_dir = './tmp'
os.makedirs(artifact_dir, exist_ok=True)

# One parent run groups a nested child run per candidate model. Each run is
# closed by its own `with` block, so no explicit mlflow.end_run() is issued;
# an extra end_run() inside the nested block could close the parent run early.
with mlflow.start_run(run_name='model_selection'):
    for estimator, model_name in models:
        with mlflow.start_run(run_name=model_name, nested=True):
            # Empty the scratch directory so that only this model's plots
            # are uploaded with this run.
            for filename in os.listdir(artifact_dir):
                os.remove(os.path.join(artifact_dir, filename))

            # Per-fold scores for every metric in `scoring`.
            crossval = cross_validate(estimator, features, target,
                                      cv=cross_validation_fold_indices,
                                      return_estimator=True, scoring=scoring)

            # Out-of-fold predictions; column 1 of predict_proba is kept as
            # the model score. NOTE: this refits the estimator on every fold
            # a second time, on top of the fits done by cross_validate.
            probabilities_cv = cross_val_predict(estimator, features, target,
                                                 cv=cross_validation_fold_indices,
                                                 method='predict_proba')[:, 1]
            model_scores = pd.DataFrame({'model_score': probabilities_cv,
                                         'true_label': target})

            # Log the mean of each metric across the folds.
            for score in scoring:
                mlflow.log_metric(score, crossval['test_' + score].mean())

            # Log every constructor parameter of the estimator.
            for param_name, param_value in estimator.get_params().items():
                mlflow.log_param(param_name, param_value)

            # Write the diagnostic plots into the scratch directory.
            plots.plot_lift_curve(scores=model_scores, savefig=True,
                                  output_dir=artifact_dir, figname='lift_curve.png')
            plots.plot_cumulative_gains_curve(scores=model_scores, savefig=True,
                                              output_dir=artifact_dir,
                                              figname='cumulative_gains_curve.png')

            # NOTE(review): 'postive_class_name' looks like a typo, but it has
            # to match the keyword declared by plots.plot_score_distribution;
            # confirm against that function before renaming it.
            plots.plot_score_distribution(scores=model_scores, postive_class_name='Exac',
                                          negative_class_name='No exac', savefig=True,
                                          output_dir=artifact_dir,
                                          figname='model_score_distribution.png')

            # Upload everything in the scratch directory to the nested run.
            mlflow.log_artifacts(artifact_dir)
| |
|