File size: 5,916 Bytes
48ba82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
library(tidyverse)
library(tidymodels)

# Load the Framingham data set from disk.
chd <- read_csv("data/framingham.csv")

# Recode the outcome as a factor with `1` as the first level, then make a
# 75/25 train/test split stratified on the outcome. The seed fixes the split.
set.seed(1001)
chd_split <- initial_split(
  mutate(chd, TenYearCHD = factor(TenYearCHD, levels = c(1, 0))),
  prop = 0.75,
  strata = TenYearCHD
)
chd_training <- training(chd_split)
chd_testing <- testing(chd_split)

# Resamples used for tuning: 5-fold cross-validation repeated twice,
# built from the training portion only.
cv_fold <- vfold_cv(chd_training, v = 5, repeats = 2)

# Exploratory frequency tables on the training set. ----
# Each table reports the count `n` and its share `pct`; the two-way tables
# compute `pct` within each outcome class (TenYearCHD).

# Sex.
count(chd_training, male) |>
  mutate(pct = n / sum(n))

# Outcome balance.
count(chd_training, TenYearCHD) |>
  mutate(pct = n / sum(n))

# Current smoker.
count(chd_training, currentSmoker) |>
  mutate(pct = n / sum(n))

# Blood-pressure medication: overall, then within outcome class.
count(chd_training, BPMeds) |>
  mutate(pct = n / sum(n))

count(chd_training, TenYearCHD, BPMeds) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Prevalent stroke: overall, then within outcome class.
count(chd_training, prevalentStroke) |>
  mutate(pct = n / sum(n))

count(chd_training, TenYearCHD, prevalentStroke) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Prevalent hypertension: overall, then within outcome class.
count(chd_training, prevalentHyp) |>
  mutate(pct = n / sum(n))

count(chd_training, TenYearCHD, prevalentHyp) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Diabetes: overall, then within outcome class.
count(chd_training, diabetes) |>
  mutate(pct = n / sum(n))

count(chd_training, TenYearCHD, diabetes) |>
  group_by(TenYearCHD) |>
  mutate(pct = n / sum(n))

# Column-wise summary of the training data.
chd_training |>
  summary()

# NOTE: the statement `chd_training |> mutate(pct = n/sum(n))` that used to
# follow was removed. It was never preceded by `count()`, so there is no `n`
# column; `n` resolved to the function `dplyr::n` and the call failed,
# stopping the script when it is sourced.

# Preprocessing recipe for all models.
#
# Steps, in order:
#   1. cast the categorical / count columns to integer,
#   2. impute integer predictors with the median and double predictors with
#      the mean,
#   3. normalise the double predictors,
#   4. oversample the outcome with SMOTE (over_ratio = 0.5).
#
# The recipe is deliberately left UNTRAINED: it is passed to `workflow_set()`,
# and workflows refuses a recipe that has already been `prep()`-ed. Each
# resample then estimates the steps on its own analysis set.
# `TenYearCHD` is already a factor with levels c(1, 0) in `chd_training`
# (set when the split was created), so it is not re-coded here.
chd_recipe <- recipe(TenYearCHD ~ ., data = chd_training) |>
  step_mutate(
    across(
      .cols = c(
        male, education,
        currentSmoker, BPMeds,
        prevalentHyp, prevalentStroke,
        diabetes, cigsPerDay
      ),
      .fns = as.integer
    ),
    skip = FALSE
  ) |>
  step_impute_median(all_integer_predictors()) |>
  step_impute_mean(all_double_predictors()) |>
  step_normalize(all_double_predictors()) |>
  themis::step_smote(TenYearCHD, over_ratio = 0.5)

# Inspect the processed training data. The recipe is trained only for this
# check; `bake(new_data = NULL)` returns the processed training set and
# replaces the superseded `juice()`.
chd_recipe |>
  prep(training = chd_training) |>
  bake(new_data = NULL) |>
  summary()

# Candidate model specifications (all classification). ----
# Arguments marked `tune()` are left open for the tuning step.

# Logistic regression via glm (nothing to tune).
reglog_spec <- logistic_reg(mode = "classification") |>
  set_engine("glm")

# Single decision tree via rpart.
dtree_spec <- decision_tree(
  mode = "classification",
  tree_depth = tune(),
  min_n = tune()
) |>
  set_engine("rpart")

# Random forest via ranger with 1000 trees.
rf_spec <- rand_forest(
  mode = "classification",
  mtry = tune(),
  min_n = tune(),
  trees = 1000
) |>
  set_engine("ranger")

# Gradient-boosted trees via xgboost with 1000 trees.
xgb_spec <- boost_tree(
  mode = "classification",
  mtry = tune(),
  min_n = tune(),
  trees = 1000
) |>
  set_engine("xgboost")

# Polynomial-kernel SVM via kernlab.
svm_spec <- svm_poly(mode = "classification", degree = tune()) |>
  set_engine("kernlab")

# Single-hidden-layer neural network via nnet.
mlp_spec <- mlp(mode = "classification", hidden_units = tune()) |>
  set_engine("nnet")

# Workflow set: the single recipe crossed with every active model, giving the
# workflow ids "rec_reglog", "rec_rf" and "rec_mlp".
chd_wfset <- workflow_set(
  preproc = list(rec = chd_recipe),
  models = list(
    reglog = reglog_spec,
    rf = rf_spec,
    mlp = mlp_spec
    # Currently disabled candidates:
    # dtree = dtree_spec,
    # xgb = xgb_spec,
    # svm = svm_spec
  ),
  cross = TRUE
)

# Control object for the racing search.
# `tune_race_anova()` needs a `control_race()` object: it carries the
# racing-specific settings (burn-in, tie handling, alpha, ...) that a
# `control_grid()` object does not have. The three options below are the ones
# the script set before; the racing settings keep their defaults.
# Called with an explicit namespace because finetune is not attached here.
race_ctrl <- finetune::control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)

# Tune every workflow in the set on the CV folds with ANOVA racing, starting
# from 25 candidate parameter combinations per workflow. `seed` makes the
# per-workflow tuning reproducible.
chd_res <- chd_wfset |>
  workflow_map(
    fn = "tune_race_anova",
    resamples = cv_fold,
    grid = 25,
    control = race_ctrl,
    verbose = TRUE,
    seed = 1001
  )
chd_res

# Plot the best configuration of each workflow, ranked by ROC AUC.
# Each point is labelled with its workflow id just below the estimate, the
# y-axis is limited to 0.6-0.75 and the legend is hidden.
autoplot(
  chd_res,
  rank_metric = "roc_auc",
  metric = "roc_auc",
  select_best = TRUE
) +
  geom_text(
    mapping = aes(y = mean - 1 / 50, label = wflow_id),
    angle = 90,
    hjust = 1
  ) +
  lims(y = c(0.6, 0.75)) +
  theme(legend.position = "none")

# Best configuration per workflow, ranked by ROC AUC (all columns shown).
print(
  workflowsets::rank_results(
    chd_res,
    rank_metric = "roc_auc",
    select_best = TRUE
  ),
  width = Inf
)

# Same table, ranked by accuracy.
print(
  workflowsets::rank_results(
    chd_res,
    rank_metric = "accuracy",
    select_best = TRUE
  ),
  width = Inf
)

# Best configuration (by ROC AUC) of the logistic-regression workflow.
chd_wf_best <- select_best(
  workflowsets::extract_workflow_set_result(chd_res, "rec_reglog"),
  metric = "roc_auc"
)

# Finalise that workflow with the selected configuration and run it through
# `last_fit()` on the original train/test split.
chd_wf_fit <- last_fit(
  finalize_workflow(
    workflowsets::extract_workflow(chd_res, id = "rec_reglog"),
    chd_wf_best
  ),
  split = chd_split
)

# Metrics reported by `last_fit()`.
collect_metrics(chd_wf_fit)

# Fitted workflow used for all predictions below.
chd_final_fit <- extract_workflow(chd_wf_fit)

# Make a grid to predict the whole space:
# 1000 random points of the predictor grid (every combination of the value
# sets listed below).
#
# The full grid has 100,000,000 combinations, so building it with
# `crossing()` only to keep 1000 rows needs several GB of memory. Drawing each
# predictor independently and uniformly from its value set gives rows that are
# uniformly distributed over the same grid without ever materialising it.
# Rows are drawn with replacement, so an exact duplicate row is possible but
# very unlikely given the size of the grid.
set.seed(1001)
grid_cross <- local({
  n_points <- 1000
  # Uniform draw of `n_points` values from one predictor's value set.
  draw <- function(values) sample(values, size = n_points, replace = TRUE)

  tibble(
    male = draw(c(0, 1)),
    age = draw(seq(32, 70, length.out = 5)),
    education = draw(1:4),
    currentSmoker = draw(c(0, 1)),
    cigsPerDay = draw(seq(0, 70, length.out = 5)),
    BPMeds = draw(c(0, 1)),
    prevalentStroke = draw(c(0, 1)),
    prevalentHyp = draw(c(0, 1)),
    diabetes = draw(c(0, 1)),
    totChol = draw(seq(113, 696, length.out = 5)),
    sysBP = draw(seq(83, 295, length.out = 5)),
    diaBP = draw(seq(48, 90, length.out = 5)),
    BMI = draw(seq(15.96, 56.80, length.out = 5)),
    heartRate = draw(seq(45, 143, length.out = 5)),
    glucose = draw(seq(40, 394, length.out = 5))
  )
})


# Class probabilities for the sampled predictor grid, appended to the grid.
grid_pred <- bind_cols(
  grid_cross,
  predict(chd_final_fit, grid_cross, type = "prob")
)
print(grid_pred, width = Inf)

# Test-set predictions: class probabilities plus the predicted class,
# appended to the test data.
chd_pred <- bind_cols(
  chd_testing,
  predict(chd_final_fit, chd_testing, type = "prob"),
  predict(chd_final_fit, chd_testing)
)

print(chd_pred, width = Inf)

# Test-set performance. ----
# Class-based metrics compare `.pred_class` with the truth; ROC metrics use
# `.pred_1`, the predicted probability of the first outcome level.

# Confusion matrix and accuracy.
conf_mat(chd_pred, TenYearCHD, .pred_class)
accuracy(chd_pred, TenYearCHD, .pred_class)

# Area under the ROC curve and the curve itself.
roc_auc(chd_pred, TenYearCHD, .pred_1)
autoplot(roc_curve(chd_pred, TenYearCHD, .pred_1))

# Sensitivity, specificity and F-measure.
sensitivity(chd_pred, TenYearCHD, .pred_class)
specificity(chd_pred, TenYearCHD, .pred_class)
f_meas(chd_pred, TenYearCHD, .pred_class)

library(DALEX)
library(DALEXtra)

# DALEX explainer for the final workflow, built on the training data.
#
# `y` is 1 when TenYearCHD == "1", so the explainer's predictions must be the
# probability of that same class. For a binary model the default predict
# function of `explain_tidymodels()` takes the SECOND probability column,
# which here is `.pred_0` because the outcome levels are c(1, 0). An explicit
# `predict_function` returning `.pred_1` keeps predictions and `y` aligned.
explainer <- chd_final_fit |>
  explain_tidymodels(
    data = chd_training |> select(-TenYearCHD),
    y = as.numeric(chd_training$TenYearCHD == "1"),
    predict_function = function(model, newdata) {
      predict(model, new_data = newdata, type = "prob")$.pred_1
    },
    verbose = TRUE,
    type = "classification"
  )

library(modelStudio)
# Interactive modelStudio dashboard for the explainer, with a 2 x 4 panel grid.
modelStudio(explainer, facet_dim = c(2, 4))