# MLOps-framinghamCHD / tidymodels-ml.R
# Author: aepstar
# Modeling file
# (copied from commit 48ba82e)
library(tidyverse)
library(tidymodels)
# Load the Framingham data, encode the outcome as a factor (event
# level "1" first), and build a stratified 75/25 train/test split.
chd <- read_csv("data/framingham.csv")

set.seed(1001)
chd_split <- chd |>
  mutate(TenYearCHD = factor(TenYearCHD, levels = c(1, 0))) |>
  initial_split(prop = 0.75, strata = TenYearCHD)

chd_training <- training(chd_split)
chd_testing <- testing(chd_split)

# Resampling scheme for tuning: 5-fold CV, repeated twice.
cv_fold <- vfold_cv(chd_training, v = 5, repeats = 2)
# Exploratory counts of the categorical predictors. The original code
# repeated the same count()/mutate() pipeline eleven times; it is
# factored into two small helpers below.

# Helper: frequency table with percentages for one or more variables.
count_pct <- function(df, ...) {
  df |>
    count(...) |>
    mutate(pct = n / sum(n))
}

# Helper: within-outcome percentages of a predictor vs. TenYearCHD
# (percentages sum to 1 inside each outcome level). ungroup() is added
# so the returned tibble carries no lingering grouping.
count_pct_by_outcome <- function(df, var) {
  df |>
    count(TenYearCHD, {{ var }}) |>
    group_by(TenYearCHD) |>
    mutate(pct = n / sum(n)) |>
    ungroup()
}

# Marginal distributions.
chd_training |> count_pct(male)
chd_training |> count_pct(TenYearCHD)
chd_training |> count_pct(currentSmoker)
chd_training |> count_pct(BPMeds)

# Cross-tabulations against the outcome.
chd_training |> count_pct_by_outcome(BPMeds)
chd_training |> count_pct(prevalentStroke)
chd_training |> count_pct_by_outcome(prevalentStroke)
chd_training |> count_pct(prevalentHyp)
chd_training |> count_pct_by_outcome(prevalentHyp)
chd_training |> count_pct(diabetes)
chd_training |> count_pct_by_outcome(diabetes)
# Numeric summary of the training data (also surfaces the NA counts
# that the imputation steps in the recipe below must handle).
summary(chd_training)
# BUG FIX: removed a stray `chd_training |> mutate(pct = n/sum(n))`
# left over from the count() pipelines above — chd_training has no
# column `n`, so that statement errored at run time.
# Preprocessing recipe, estimated (prepped) on the training set.
# Step order matters: coerce types -> impute -> normalize -> SMOTE.
chd_recipe <- chd_training |>
# NOTE(review): TenYearCHD was already converted to a factor before the
# initial_split() call above, so this mutate() is redundant here —
# harmless, but worth confirming before removal.
mutate(TenYearCHD = factor(TenYearCHD, levels = c(1,0))) |>
recipe(TenYearCHD ~ .) |>
# Coerce the binary/ordinal columns to integer so the median-imputation
# step below selects them via all_integer_predictors().
step_mutate(across(.cols = c(male, education,
currentSmoker, BPMeds,
prevalentHyp, prevalentStroke,
diabetes, cigsPerDay),
.fns = as.integer),
# skip = FALSE: apply this coercion at bake()/predict() time too,
# not only while prepping on training data.
skip = FALSE) |>
step_impute_median(all_integer_predictors()) |>
step_impute_mean(all_double_predictors()) |>
step_normalize(all_double_predictors()) |>
# Oversample the minority class up to a 0.5 minority/majority ratio;
# themis steps skip at bake time by default, so this touches training
# data only.
themis::step_smote(TenYearCHD, over_ratio = 0.5) |>
prep(training = chd_training)
# Sanity check: inspect the fully processed training data.
chd_recipe |>
juice() |> summary()
# Candidate model specifications ------------------------------------

# Plain logistic regression (no tuning parameters).
reglog_spec <- logistic_reg() |>
  set_mode("classification") |>
  set_engine("glm")

# Decision tree; depth and minimum node size are tuned.
dtree_spec <- decision_tree(tree_depth = tune(), min_n = tune()) |>
  set_mode("classification") |>
  set_engine("rpart")

# Random forest of 1000 trees; mtry and min_n are tuned.
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("ranger")

# Gradient-boosted trees; mtry and min_n are tuned.
xgb_spec <- boost_tree(mtry = tune(), min_n = tune(), trees = 1000) |>
  set_mode("classification") |>
  set_engine("xgboost")

# Polynomial-kernel SVM; the polynomial degree is tuned.
svm_spec <- svm_poly(degree = tune()) |>
  set_mode("classification") |>
  set_engine("kernlab")

# Single-hidden-layer neural network; hidden-unit count is tuned.
mlp_spec <- mlp(hidden_units = tune()) |>
  set_mode("classification") |>
  set_engine("nnet")
# Bundle the recipe with each active model spec into a workflow set
# (the commented-out models are kept for easy re-activation).
chd_wfset <- workflow_set(
  preproc = list(rec = chd_recipe),
  models = list(
    reglog = reglog_spec,
    # dtree = dtree_spec,
    rf = rf_spec,
    # xgb = xgb_spec,
    # svm = svm_spec,
    mlp = mlp_spec
  ),
  cross = TRUE
)

# BUG FIX: tune_race_anova() requires a control object created by
# finetune::control_race(); the original control_grid() lacks the
# racing-specific fields (alpha, burn_in, randomize, num_ties) and is
# rejected by the racing functions.
race_ctrl <- finetune::control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)

# Tune every workflow over a 25-point grid with ANOVA racing on the
# repeated CV folds.
chd_res <- chd_wfset |>
  workflow_map(
    fn = "tune_race_anova",
    resamples = cv_fold,
    grid = 25,
    control = race_ctrl,
    verbose = TRUE,
    seed = 1001
  )
# Inspect the tuning results.
print(chd_res)

# Rank workflows by ROC AUC; label each point with its workflow id.
autoplot(chd_res,
         rank_metric = "roc_auc",
         metric = "roc_auc",
         select_best = TRUE) +
  geom_text(aes(y = mean - 1/50, label = wflow_id), angle = 90, hjust = 1) +
  lims(y = c(0.6, 0.75)) +
  theme(legend.position = "none")

# Best configuration per workflow, ranked by ROC AUC and by accuracy.
print(
  workflowsets::rank_results(chd_res, rank_metric = "roc_auc",
                             select_best = TRUE),
  width = Inf
)
print(
  workflowsets::rank_results(chd_res, rank_metric = "accuracy",
                             select_best = TRUE),
  width = Inf
)
# Pick the best logistic-regression configuration (by ROC AUC) from
# the tuning results.
chd_wf_best <- select_best(
  workflowsets::extract_workflow_set_result(chd_res, "rec_reglog"),
  metric = "roc_auc"
)

# Finalize that workflow, refit it on the full training set, and
# evaluate once on the held-out test set.
chd_wf_fit <- workflowsets::extract_workflow(chd_res, id = "rec_reglog") |>
  finalize_workflow(chd_wf_best) |>
  last_fit(split = chd_split)

# Test-set metrics for the finalized workflow.
collect_metrics(chd_wf_fit)

# Fitted workflow used for prediction and explanation below.
chd_final_fit <- extract_workflow(chd_wf_fit)
# Build a grid covering the predictor space, sample 1000 points from
# it, and score them with the final model.
set.seed(1001)
grid_cross <- crossing(
  male = c(0, 1),
  age = seq(32, 70, length.out = 5),
  education = 1:4,
  currentSmoker = c(0, 1),
  cigsPerDay = seq(0, 70, length.out = 5),
  BPMeds = c(0, 1),
  prevalentStroke = c(0, 1),
  prevalentHyp = c(0, 1),
  diabetes = c(0, 1),
  totChol = seq(113, 696, length.out = 5),
  sysBP = seq(83, 295, length.out = 5),
  diaBP = seq(48, 90, length.out = 5),
  BMI = seq(15.96, 56.80, length.out = 5),
  heartRate = seq(45, 143, length.out = 5),
  glucose = seq(40, 394, length.out = 5)
) |>
  slice_sample(n = 1000)

# Attach predicted class probabilities to the sampled grid.
grid_pred <- bind_cols(
  grid_cross,
  predict(chd_final_fit, grid_cross, type = "prob")
)
print(grid_pred, width = Inf)
# Score the held-out test set: class probabilities plus hard class
# predictions, alongside the observed data.
chd_pred <- bind_cols(
  chd_testing,
  predict(chd_final_fit, chd_testing, type = "prob"),
  predict(chd_final_fit, chd_testing)
)
print(chd_pred, width = Inf)

# Test-set performance. ".pred_1" is the probability of the event
# level ("1", the first factor level).
conf_mat(chd_pred, TenYearCHD, .pred_class)
accuracy(chd_pred, TenYearCHD, .pred_class)
roc_auc(chd_pred, TenYearCHD, .pred_1)
autoplot(roc_curve(chd_pred, TenYearCHD, .pred_1))
sensitivity(chd_pred, TenYearCHD, .pred_class)
specificity(chd_pred, TenYearCHD, .pred_class)
f_meas(chd_pred, TenYearCHD, .pred_class)
library(DALEX)
library(DALEXtra)

# Wrap the fitted workflow in a DALEX explainer. `y` is 1 when the
# observed outcome equals the event level "1", 0 otherwise.
explainer <- explain_tidymodels(
  chd_final_fit,
  data = select(chd_training, -TenYearCHD),
  y = as.numeric(chd_training$TenYearCHD == "1"),
  verbose = TRUE,
  type = "classification"
)

library(modelStudio)
# Launch the interactive modelStudio dashboard (2 x 4 panel layout).
modelStudio(explainer, facet_dim = c(2, 4))