# Ten-year coronary heart disease (CHD) risk modelling on the Framingham data.
#
# Pipeline: stratified train/test split -> exploratory class shares ->
# preprocessing recipe (type coercion, imputation, normalisation, SMOTE) ->
# racing-based tuning of several candidate models -> final fit on the test
# split -> predictions, metrics and a DALEX / modelStudio explainer.

library(tidyverse)
library(tidymodels)

# Data split ----

chd <- read_csv("data/framingham.csv")

set.seed(1001)
chd_split <- chd |>
  # "1" is listed first so that yardstick treats it as the event level.
  mutate(TenYearCHD = factor(TenYearCHD, levels = c(1, 0))) |>
  initial_split(prop = 0.75, strata = TenYearCHD)

chd_training <- chd_split |> training()
chd_testing <- chd_split |> testing()

cv_fold <- chd_training |>
  vfold_cv(v = 5, repeats = 2)

# Exploratory shares on the training set ----

chd_training |>
  count(male) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(TenYearCHD) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(currentSmoker) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(BPMeds) |>
  mutate(pct = n / sum(n))

# Shares within each outcome class; ungroup so no grouping leaks downstream.
chd_training |>
  count(TenYearCHD, BPMeds) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n)) |>
  ungroup()

chd_training |>
  count(prevalentStroke) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(TenYearCHD, prevalentStroke) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n)) |>
  ungroup()

chd_training |>
  count(prevalentHyp) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(TenYearCHD, prevalentHyp) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n)) |>
  ungroup()

chd_training |>
  count(diabetes) |>
  mutate(pct = n / sum(n))

chd_training |>
  count(TenYearCHD, diabetes) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n)) |>
  ungroup()

chd_training |>
  summary()

# Preprocessing recipe ----

# The recipe is deliberately left untrained: the workflows train it inside
# every resample. TenYearCHD is already a factor from the split step above.
chd_recipe <- recipe(TenYearCHD ~ ., data = chd_training) |>
  step_mutate(
    across(
      .cols = c(
        male, education, currentSmoker, BPMeds,
        prevalentHyp, prevalentStroke, diabetes, cigsPerDay
      ),
      .fns = as.integer
    ),
    skip = FALSE
  ) |>
  step_impute_median(all_integer_predictors()) |>
  step_impute_mean(all_double_predictors()) |>
  step_normalize(all_double_predictors()) |>
  themis::step_smote(TenYearCHD, over_ratio = 0.5)

# Train a copy of the recipe only to inspect the processed training data.
chd_recipe |>
  prep(training = chd_training) |>
  bake(new_data = NULL) |>
  summary()

# Model specifications ----

reglog_spec <- logistic_reg() |>
  set_engine("glm") |>
  set_mode("classification")

dtree_spec <- decision_tree(tree_depth = tune(), min_n = tune()) |>
  set_engine("rpart") |>
  set_mode("classification")

rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_engine("ranger") |>
  set_mode("classification")

xgb_spec <- boost_tree(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_engine("xgboost") |>
  set_mode("classification")

svm_spec <- svm_poly(degree = tune()) |>
  set_engine("kernlab") |>
  set_mode("classification")

mlp_spec <- mlp(hidden_units = tune()) |>
  set_engine("nnet") |>
  set_mode("classification")

# Workflow set and tuning ----

chd_wfset <- workflow_set(
  preproc = list(rec = chd_recipe),
  models = list(
    reglog = reglog_spec,
    # dtree = dtree_spec,
    rf = rf_spec,
    # xgb = xgb_spec,
    # svm = svm_spec,
    mlp = mlp_spec
  ),
  cross = TRUE
)

# "tune_race_anova" comes from finetune, so its own control constructor is
# used here instead of control_grid().
race_ctrl <- finetune::control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)

chd_res <- chd_wfset |>
  workflow_map(
    resamples = cv_fold,
    grid = 25,
    fn = "tune_race_anova",
    control = race_ctrl,
    verbose = TRUE,
    seed = 1001
  )

chd_res

chd_res |>
  autoplot(rank_metric = "roc_auc", metric = "roc_auc", select_best = TRUE) +
  geom_text(
    aes(y = mean - 1 / 50, label = wflow_id),
    angle = 90,
    hjust = 1
  ) +
  lims(y = c(0.6, 0.75)) +
  theme(legend.position = "none")

chd_res |>
  workflowsets::rank_results(rank_metric = "roc_auc", select_best = TRUE) |>
  print(width = Inf)

chd_res |>
  workflowsets::rank_results(rank_metric = "accuracy", select_best = TRUE) |>
  print(width = Inf)

# Final fit ----

# Workflow carried forward to the final fit; named once so the two
# extractions below cannot drift apart.
best_wflow_id <- "rec_reglog"

chd_wf_best <- chd_res |>
  workflowsets::extract_workflow_set_result(best_wflow_id) |>
  select_best(metric = "roc_auc")

chd_wf_fit <- chd_res |>
  workflowsets::extract_workflow(id = best_wflow_id) |>
  finalize_workflow(chd_wf_best) |>
  last_fit(split = chd_split)

chd_wf_fit |>
  collect_metrics()

chd_final_fit <- chd_wf_fit |>
  extract_workflow()

# Predictions ----

# Make a grid to predict the whole space.
# The full cross product of these levels has 2^6 * 4 * 5^8 = 1e8 rows, so
# 1000 points are drawn by sampling every variable independently instead of
# materialising the product first. This is the same uniform distribution over
# the grid; a duplicated row is possible but very unlikely.
grid_levels <- list(
  male = c(0, 1),
  age = seq(32, 70, length.out = 5),
  education = 1:4,
  currentSmoker = c(0, 1),
  cigsPerDay = seq(0, 70, length.out = 5),
  BPMeds = c(0, 1),
  prevalentStroke = c(0, 1),
  prevalentHyp = c(0, 1),
  diabetes = c(0, 1),
  totChol = seq(113, 696, length.out = 5),
  sysBP = seq(83, 295, length.out = 5),
  diaBP = seq(48, 90, length.out = 5),
  BMI = seq(15.96, 56.80, length.out = 5),
  heartRate = seq(45, 143, length.out = 5),
  glucose = seq(40, 394, length.out = 5)
)

set.seed(1001)
grid_cross <- grid_levels |>
  map(\(values) sample(values, size = 1000, replace = TRUE)) |>
  as_tibble()

grid_pred <- grid_cross |>
  bind_cols(predict(chd_final_fit, grid_cross, type = "prob"))

grid_pred |>
  print(width = Inf)

chd_pred <- chd_testing |>
  bind_cols(
    predict(chd_final_fit, chd_testing, type = "prob"),
    predict(chd_final_fit, chd_testing)
  )

chd_pred |>
  print(width = Inf)

# Test-set metrics ----

chd_pred |> conf_mat(TenYearCHD, .pred_class)
chd_pred |> accuracy(TenYearCHD, .pred_class)
chd_pred |> roc_auc(TenYearCHD, .pred_1)
chd_pred |>
  roc_curve(TenYearCHD, .pred_1) |>
  autoplot()
chd_pred |> sensitivity(TenYearCHD, .pred_class)
chd_pred |> specificity(TenYearCHD, .pred_class)
chd_pred |> f_meas(TenYearCHD, .pred_class)

# Model explanation ----

library(DALEX)
library(DALEXtra)

explainer <- chd_final_fit |>
  explain_tidymodels(
    data = chd_training |> select(-TenYearCHD),
    # Numeric 0/1 target in which 1 marks the "1" (CHD) class.
    y = as.numeric(chd_training$TenYearCHD == "1"),
    verbose = TRUE,
    type = "classification"
  )

library(modelStudio)

explainer |>
  modelStudio(facet_dim = c(2, 4))