# MLOps-framinghamCHD / tidymodels-ml.R
# Author: aepstar
# Modeling file
# (copied from commit 48ba82e)
library(tidyverse)
library(tidymodels)
# Load the Framingham data, encode the outcome as a factor (event
# level "1" first), and build a stratified 75/25 train/test split.
chd <- read_csv("data/framingham.csv")

set.seed(1001)
chd_split <- chd |>
  mutate(TenYearCHD = factor(TenYearCHD, levels = c(1, 0))) |>
  initial_split(prop = 0.75, strata = TenYearCHD)

chd_training <- training(chd_split)
chd_testing <- testing(chd_split)

# Resampling scheme for tuning: 5-fold CV, repeated twice.
cv_fold <- vfold_cv(chd_training, v = 5, repeats = 2)
# Exploratory counts of the categorical predictors. The original code
# repeated the same count()/mutate() pipeline eleven times; it is
# factored into two small helpers below.

# Helper: frequency table with percentages for one or more variables.
count_pct <- function(df, ...) {
  df |>
    count(...) |>
    mutate(pct = n / sum(n))
}

# Helper: within-outcome percentages of a predictor vs. TenYearCHD
# (percentages sum to 1 inside each outcome level). ungroup() is added
# so the returned tibble carries no lingering grouping.
count_pct_by_outcome <- function(df, var) {
  df |>
    count(TenYearCHD, {{ var }}) |>
    group_by(TenYearCHD) |>
    mutate(pct = n / sum(n)) |>
    ungroup()
}

# Marginal distributions.
chd_training |> count_pct(male)
chd_training |> count_pct(TenYearCHD)
chd_training |> count_pct(currentSmoker)
chd_training |> count_pct(BPMeds)

# Cross-tabulations against the outcome.
chd_training |> count_pct_by_outcome(BPMeds)
chd_training |> count_pct(prevalentStroke)
chd_training |> count_pct_by_outcome(prevalentStroke)
chd_training |> count_pct(prevalentHyp)
chd_training |> count_pct_by_outcome(prevalentHyp)
chd_training |> count_pct(diabetes)
chd_training |> count_pct_by_outcome(diabetes)
# Numeric summary of the training data (also surfaces the NA counts
# that the imputation steps in the recipe below must handle).
summary(chd_training)
# BUG FIX: removed a stray `chd_training |> mutate(pct = n/sum(n))`
# left over from the count() pipelines above — chd_training has no
# column `n`, so that statement errored at run time.
# Preprocessing recipe, estimated (prepped) on the training set.
# Step order matters: coerce types -> impute -> normalize -> SMOTE.
chd_recipe <- chd_training |>
# NOTE(review): TenYearCHD was already converted to a factor before the
# initial_split() call above, so this mutate() is redundant here —
# harmless, but worth confirming before removal.
mutate(TenYearCHD = factor(TenYearCHD, levels = c(1,0))) |>
recipe(TenYearCHD ~ .) |>
# Coerce the binary/ordinal columns to integer so the median-imputation
# step below selects them via all_integer_predictors().
step_mutate(across(.cols = c(male, education,
currentSmoker, BPMeds,
prevalentHyp, prevalentStroke,
diabetes, cigsPerDay),
.fns = as.integer),
# skip = FALSE: apply this coercion at bake()/predict() time too,
# not only while prepping on training data.
skip = FALSE) |>
step_impute_median(all_integer_predictors()) |>
step_impute_mean(all_double_predictors()) |>
step_normalize(all_double_predictors()) |>
# Oversample the minority class up to a 0.5 minority/majority ratio;
# themis steps skip at bake time by default, so this touches training
# data only.
themis::step_smote(TenYearCHD, over_ratio = 0.5) |>
prep(training = chd_training)
# Sanity check: inspect the fully processed training data.
chd_recipe |>
juice() |> summary()
# Candidate model specifications ------------------------------------

# Plain logistic regression (no tuning parameters).
reglog_spec <- logistic_reg() |>
  set_mode("classification") |>
  set_engine("glm")

# Decision tree; depth and minimum node size are tuned.
dtree_spec <- decision_tree(tree_depth = tune(), min_n = tune()) |>
  set_mode("classification") |>
  set_engine("rpart")

# Random forest of 1000 trees; mtry and min_n are tuned.
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("ranger")

# Gradient-boosted trees; mtry and min_n are tuned.
xgb_spec <- boost_tree(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("xgboost")

# Polynomial-kernel SVM; the polynomial degree is tuned.
svm_spec <- svm_poly(degree = tune()) |>
  set_mode("classification") |>
  set_engine("kernlab")

# Single-hidden-layer neural network; hidden-unit count is tuned.
mlp_spec <- mlp(hidden_units = tune()) |>
  set_mode("classification") |>
  set_engine("nnet")
# Bundle the recipe with each active model spec into a workflow set
# (the commented-out models are kept for easy re-activation).
chd_wfset <- workflow_set(
  preproc = list(rec = chd_recipe),
  models = list(
    reglog = reglog_spec,
    # dtree = dtree_spec,
    rf = rf_spec,
    # xgb = xgb_spec,
    # svm = svm_spec,
    mlp = mlp_spec
  ),
  cross = TRUE
)

# BUG FIX: tune_race_anova() requires a control object created by
# finetune::control_race(); the original control_grid() lacks the
# racing-specific fields (alpha, burn_in, randomize, num_ties) and is
# rejected by the racing functions.
race_ctrl <- finetune::control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)

# Tune every workflow over a 25-point grid with ANOVA racing on the
# repeated CV folds.
chd_res <- chd_wfset |>
  workflow_map(
    fn = "tune_race_anova",
    resamples = cv_fold,
    grid = 25,
    control = race_ctrl,
    verbose = TRUE,
    seed = 1001
  )
# Inspect the tuning results.
print(chd_res)

# Rank workflows by ROC AUC; label each point with its workflow id.
autoplot(chd_res,
         rank_metric = "roc_auc",
         metric = "roc_auc",
         select_best = TRUE) +
  geom_text(aes(y = mean - 1/50, label = wflow_id), angle = 90, hjust = 1) +
  lims(y = c(0.6, 0.75)) +
  theme(legend.position = "none")

# Best configuration per workflow, ranked by ROC AUC and by accuracy.
print(
  workflowsets::rank_results(chd_res, rank_metric = "roc_auc",
                             select_best = TRUE),
  width = Inf
)
print(
  workflowsets::rank_results(chd_res, rank_metric = "accuracy",
                             select_best = TRUE),
  width = Inf
)
# Pick the best logistic-regression configuration (by ROC AUC) from
# the tuning results.
chd_wf_best <- select_best(
  workflowsets::extract_workflow_set_result(chd_res, "rec_reglog"),
  metric = "roc_auc"
)

# Finalize that workflow, refit it on the full training set, and
# evaluate once on the held-out test set.
chd_wf_fit <- workflowsets::extract_workflow(chd_res, id = "rec_reglog") |>
  finalize_workflow(chd_wf_best) |>
  last_fit(split = chd_split)

# Test-set metrics for the finalized workflow.
collect_metrics(chd_wf_fit)

# Fitted workflow used for prediction and explanation below.
chd_final_fit <- extract_workflow(chd_wf_fit)
# Build a grid covering the predictor space, sample 1000 points from
# it, and score them with the final model.
set.seed(1001)
grid_cross <- crossing(
  male = c(0, 1),
  age = seq(32, 70, length.out = 5),
  education = 1:4,
  currentSmoker = c(0, 1),
  cigsPerDay = seq(0, 70, length.out = 5),
  BPMeds = c(0, 1),
  prevalentStroke = c(0, 1),
  prevalentHyp = c(0, 1),
  diabetes = c(0, 1),
  totChol = seq(113, 696, length.out = 5),
  sysBP = seq(83, 295, length.out = 5),
  diaBP = seq(48, 90, length.out = 5),
  BMI = seq(15.96, 56.80, length.out = 5),
  heartRate = seq(45, 143, length.out = 5),
  glucose = seq(40, 394, length.out = 5)
) |>
  slice_sample(n = 1000)

# Attach predicted class probabilities to the sampled grid.
grid_pred <- bind_cols(
  grid_cross,
  predict(chd_final_fit, grid_cross, type = "prob")
)
print(grid_pred, width = Inf)
# Score the held-out test set: class probabilities plus hard class
# predictions, alongside the observed data.
chd_pred <- bind_cols(
  chd_testing,
  predict(chd_final_fit, chd_testing, type = "prob"),
  predict(chd_final_fit, chd_testing)
)
print(chd_pred, width = Inf)

# Test-set performance. ".pred_1" is the probability of the event
# level ("1", the first factor level).
conf_mat(chd_pred, TenYearCHD, .pred_class)
accuracy(chd_pred, TenYearCHD, .pred_class)
roc_auc(chd_pred, TenYearCHD, .pred_1)
autoplot(roc_curve(chd_pred, TenYearCHD, .pred_1))
sensitivity(chd_pred, TenYearCHD, .pred_class)
specificity(chd_pred, TenYearCHD, .pred_class)
f_meas(chd_pred, TenYearCHD, .pred_class)
library(DALEX)
library(DALEXtra)

# Wrap the fitted workflow in a DALEX explainer. `y` is 1 when the
# observed outcome equals the event level "1", 0 otherwise.
explainer <- explain_tidymodels(
  chd_final_fit,
  data = select(chd_training, -TenYearCHD),
  y = as.numeric(chd_training$TenYearCHD == "1"),
  verbose = TRUE,
  type = "classification"
)

library(modelStudio)
# Launch the interactive modelStudio dashboard (2 x 4 panel layout).
modelStudio(explainer, facet_dim = c(2, 4))