Spaces:
Build error
Build error
| library(tidyverse) | |
| library(tidymodels) | |
# Load the Framingham data set from disk.
chd <- read_csv("data/framingham.csv")

# Seed the RNG once so the split and the resampling folds are reproducible.
set.seed(1001)

# Recode the outcome as a factor with "1" as the first level, then hold out
# 25% of the rows for testing, stratifying on the outcome.
chd_split <- initial_split(
  mutate(chd, TenYearCHD = factor(TenYearCHD, levels = c(1, 0))),
  prop = 0.75,
  strata = TenYearCHD
)
chd_training <- training(chd_split)
chd_testing <- testing(chd_split)

# 5-fold cross-validation, repeated twice, on the training portion only.
cv_fold <- vfold_cv(chd_training, v = 5, repeats = 2)
# Exploratory frequency tables on the training set.
# One-way tables give each level's count (n) and its share of all rows (pct);
# two-way tables give the share of each level within an outcome class.

# Sex
mutate(count(chd_training, male), pct = n / sum(n))

# Outcome
mutate(count(chd_training, TenYearCHD), pct = n / sum(n))

# Current smoker
mutate(count(chd_training, currentSmoker), pct = n / sum(n))

# Blood-pressure medication: overall, then within each outcome class
mutate(count(chd_training, BPMeds), pct = n / sum(n))
count(chd_training, TenYearCHD, BPMeds) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Prevalent stroke: overall, then within each outcome class
mutate(count(chd_training, prevalentStroke), pct = n / sum(n))
count(chd_training, TenYearCHD, prevalentStroke) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Prevalent hypertension: overall, then within each outcome class
mutate(count(chd_training, prevalentHyp), pct = n / sum(n))
count(chd_training, TenYearCHD, prevalentHyp) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Diabetes: overall, then within each outcome class
mutate(count(chd_training, diabetes), pct = n / sum(n))
count(chd_training, TenYearCHD, diabetes) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Column-wise summary of every variable in the training set
summary(chd_training)
# Frequency table for education.
# NOTE(review): the original statement called mutate(pct = n/sum(n)) directly
# on chd_training with no count() step, so `n` was not a column and the
# statement errored. `education` is assumed to be the intended variable
# because it is treated as categorical elsewhere in this script -- confirm.
chd_training |>
  count(education) |>
  mutate(pct = n / sum(n))
# Preprocessing recipe for all models:
#   1. cast the categorical / count-like predictors to integer,
#   2. impute integer predictors with the median and double predictors with
#      the mean,
#   3. centre and scale the double predictors,
#   4. oversample the minority outcome class with SMOTE (over_ratio = 0.5).
#
# The recipe is deliberately left untrained: it is handed to workflow_set()
# below, and workflows rejects a recipe that has already been prep()ped.
# Leaving it untrained also means the steps are re-estimated inside each
# resample rather than once on the full training set.
#
# TenYearCHD is already a factor with levels c(1, 0) in chd_training (set
# before the initial split), so it is not re-factored here.
chd_recipe <- recipe(TenYearCHD ~ ., data = chd_training) |>
  step_mutate(
    across(
      .cols = c(
        male, education,
        currentSmoker, BPMeds,
        prevalentHyp, prevalentStroke,
        diabetes, cigsPerDay
      ),
      .fns = as.integer
    ),
    skip = FALSE
  ) |>
  step_impute_median(all_integer_predictors()) |>
  step_impute_mean(all_double_predictors()) |>
  step_normalize(all_double_predictors()) |>
  themis::step_smote(TenYearCHD, over_ratio = 0.5)

# Inspect the processed training data. The recipe is trained only for this
# check; bake(new_data = NULL) returns the processed training set and is the
# current replacement for juice().
chd_recipe |>
  prep(training = chd_training) |>
  bake(new_data = NULL) |>
  summary()
# Candidate model specifications. All are classifiers; arguments marked
# tune() are left open for the tuning step.

# Logistic regression fitted with glm (nothing to tune).
reglog_spec <- logistic_reg(mode = "classification", engine = "glm")

# Single decision tree (rpart): tune depth and minimum node size.
dtree_spec <- decision_tree(
  mode = "classification",
  engine = "rpart",
  tree_depth = tune(),
  min_n = tune()
)

# Random forest (ranger) with 1000 trees: tune mtry and minimum node size.
rf_spec <- rand_forest(
  mode = "classification",
  engine = "ranger",
  mtry = tune(),
  trees = 1000,
  min_n = tune()
)

# Gradient boosting (xgboost) with 1000 trees: tune mtry and minimum node size.
xgb_spec <- boost_tree(
  mode = "classification",
  engine = "xgboost",
  mtry = tune(),
  trees = 1000,
  min_n = tune()
)

# Polynomial-kernel SVM (kernlab): tune the polynomial degree.
svm_spec <- svm_poly(
  mode = "classification",
  engine = "kernlab",
  degree = tune()
)

# Single-hidden-layer neural network (nnet): tune the number of hidden units.
mlp_spec <- mlp(
  mode = "classification",
  engine = "nnet",
  hidden_units = tune()
)
# Combine the single recipe with every active model specification into a
# workflow set. Models that are commented out are excluded from the set.
chd_wfset <- workflow_set(
  preproc = list(rec = chd_recipe),
  models = list(
    reglog = reglog_spec,
    # dtree = dtree_spec,
    rf = rf_spec,
    # xgb = xgb_spec,
    # svm = svm_spec,
    mlp = mlp_spec
  ),
  cross = TRUE
)
# Control object for the racing search run through workflow_map() with
# fn = "tune_race_anova". Racing takes its options from
# finetune::control_race(), not from tune's control_grid(); the racing-specific
# settings are left at their defaults here.
#   save_pred     - keep the out-of-sample predictions
#   parallel_over - parallelise over resamples and parameter combinations
#   save_workflow - store the workflow alongside the tuning results
race_ctrl <- finetune::control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)
# Tune every workflow in the set with ANOVA racing over the CV folds,
# using a 25-candidate grid per workflow and a fixed seed.
chd_res <- workflow_map(
  chd_wfset,
  fn = "tune_race_anova",
  resamples = cv_fold,
  grid = 25,
  control = race_ctrl,
  seed = 1001,
  verbose = TRUE
)

# Print the tuning results.
chd_res
# Plot the best configuration of each workflow, ranked by ROC AUC, and label
# each point with its workflow id just below the estimate.
autoplot(
  chd_res,
  rank_metric = "roc_auc",
  metric = "roc_auc",
  select_best = TRUE
) +
  geom_text(aes(y = mean - 1/50, label = wflow_id), angle = 90, hjust = 1) +
  lims(y = c(0.6, 0.75)) +
  theme(legend.position = "none")

# Best configuration per workflow, ranked by ROC AUC (all columns shown).
print(
  workflowsets::rank_results(chd_res, rank_metric = "roc_auc", select_best = TRUE),
  width = Inf
)

# Same table, ranked by accuracy.
print(
  workflowsets::rank_results(chd_res, rank_metric = "accuracy", select_best = TRUE),
  width = Inf
)
# Best configuration (by ROC AUC) of the logistic-regression workflow.
chd_wf_best <- select_best(
  workflowsets::extract_workflow_set_result(chd_res, "rec_reglog"),
  metric = "roc_auc"
)

# Finalise that workflow with the selected configuration, then fit it on the
# training set and evaluate it on the test set of the original split.
chd_wf_fit <- last_fit(
  finalize_workflow(
    workflowsets::extract_workflow(chd_res, id = "rec_reglog"),
    chd_wf_best
  ),
  split = chd_split
)

# Test-set metrics from the final fit.
collect_metrics(chd_wf_fit)

# Fitted workflow, used for all predictions below.
chd_final_fit <- extract_workflow(chd_wf_fit)
# Make a grid of 1000 hypothetical patients covering the predictor space.
#
# Each predictor's candidate levels are listed once and then sampled
# independently, which draws rows uniformly from the Cartesian product of the
# levels. The full product has 2^5 * 4 * 5^8 = 50,000,000 rows, so building it
# with crossing() only to keep 1000 rows costs several GB of memory for no
# benefit. Sampling is with replacement, so a duplicated row is possible but
# very unlikely at this grid size.
set.seed(1001)
grid_cross <- list(
  male = c(0, 1),
  age = seq(32, 70, length.out = 5),
  education = 1:4,
  currentSmoker = c(0, 1),
  cigsPerDay = seq(0, 70, length.out = 5),
  BPMeds = c(0, 1),
  prevalentStroke = c(0, 1),
  prevalentHyp = c(0, 1),
  diabetes = c(0, 1),
  totChol = seq(113, 696, length.out = 5),
  sysBP = seq(83, 295, length.out = 5),
  diaBP = seq(48, 90, length.out = 5),
  BMI = seq(15.96, 56.80, length.out = 5),
  heartRate = seq(45, 143, length.out = 5),
  glucose = seq(40, 394, length.out = 5)
) |>
  # Index with sample.int() rather than calling sample() on the values, so a
  # single-level predictor could never be misread as "sample from 1:x".
  map(\(lvls) lvls[sample.int(length(lvls), size = 1000, replace = TRUE)]) |>
  as_tibble()
# Attach the predicted class probabilities to every grid row.
grid_pred <- bind_cols(
  grid_cross,
  predict(chd_final_fit, grid_cross, type = "prob")
)

# Show the result with all columns.
print(grid_pred, width = Inf)
# Test-set predictions: original columns, class probabilities, and the
# predicted class.
chd_pred <- bind_cols(
  chd_testing,
  predict(chd_final_fit, chd_testing, type = "prob"),
  predict(chd_final_fit, chd_testing)
)
print(chd_pred, width = Inf)

# Hard-class metrics (truth vs. predicted class).
conf_mat(chd_pred, truth = TenYearCHD, estimate = .pred_class)
accuracy(chd_pred, truth = TenYearCHD, estimate = .pred_class)

# Probability-based metrics, using the predicted probability of class "1".
roc_auc(chd_pred, truth = TenYearCHD, .pred_1)
autoplot(roc_curve(chd_pred, truth = TenYearCHD, .pred_1))

# Further hard-class metrics.
sensitivity(chd_pred, truth = TenYearCHD, estimate = .pred_class)
specificity(chd_pred, truth = TenYearCHD, estimate = .pred_class)
f_meas(chd_pred, truth = TenYearCHD, estimate = .pred_class)
| library(DALEX) | |
| library(DALEXtra) | |
# DALEX explainer for the final fitted workflow, built on the training
# predictors with a 0/1 target where 1 means TenYearCHD == "1".
#
# The predict function is given explicitly so the explained score is always
# the probability of class "1", matching the encoding of `y`.
# NOTE(review): the default predict function for two-class models is
# understood to return the second probability column, which with factor
# levels c(1, 0) would be .pred_0 and would invert every explanation --
# confirm against the installed DALEXtra version.
explainer <- chd_final_fit |>
  explain_tidymodels(
    data = chd_training |> select(-TenYearCHD),
    y = as.numeric(chd_training$TenYearCHD == "1"),
    predict_function = function(model, newdata) {
      predict(model, new_data = newdata, type = "prob")$.pred_1
    },
    verbose = TRUE, type = "classification"
  )
| library(modelStudio) | |
# Launch the interactive modelStudio dashboard for the explainer,
# laid out as a 2 x 4 grid of panels.
modelStudio(explainer, facet_dim = c(2, 4))