XRachel committed on
Commit
51167cf
·
verified ·
1 Parent(s): da75a09

Upload 2 files

Browse files
BankChurn_Version1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
BankChurn_Version1_R.ipynb ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f5a2a1f9",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# ---- Packages (R) ----\n",
11
+ "suppressPackageStartupMessages({\n",
12
+ " library(readr)\n",
13
+ " library(dplyr)\n",
14
+ " library(tidyr)\n",
15
+ " library(ggplot2)\n",
16
+ " library(forcats)\n",
17
+ "})\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "258c5234",
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "# Load data\n",
28
+ "bankChurn <- read_csv(\"./bankChurn.csv\", locale = locale(encoding = \"UTF-8\"))\n",
29
+ "head(bankChurn)\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "0e683c34",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Column names\n",
40
+ "names(bankChurn)\n"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": null,
46
+ "id": "e522e3d6",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# Analysis of numerical data\n",
51
+ "bankChurn %>%\n",
52
+ " select(where(is.numeric)) %>%\n",
53
+ " summary()\n"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "id": "89d7e296",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "# Description of categorical (string) data\n",
64
+ "bankChurn %>%\n",
65
+ " select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
66
+ " summarise(across(everything(), ~ {\n",
67
+ " x <- .x\n",
68
+ " if (!is.factor(x)) x <- as.factor(x)\n",
69
+ " n <- length(x)\n",
70
+ " n_unique <- nlevels(x)\n",
71
+ " n_na <- sum(is.na(x))\n",
72
+ " tab <- table(x, useNA = \"ifany\")\n",
73
+ " top <- names(sort(tab, decreasing = TRUE))[1]\n",
74
+ " top_freq <- as.integer(max(tab))\n",
75
+ " paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
76
+ " \", top=\", top, \", top_freq=\", top_freq)\n",
77
+ " }))\n"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "53ca7bcf",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "# Load external data\n",
88
+ "externalData <- read_csv(\"./ExternalData.csv\", locale = locale(encoding = \"UTF-8\"))\n",
89
+ "head(externalData)\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "cd05fba1",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "# externalData shape\n",
100
+ "dim(externalData)\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "c5f022a3",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "# externalData numeric summary\n",
111
+ "externalData %>%\n",
112
+ " select(where(is.numeric)) %>%\n",
113
+ " summary()\n"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": null,
119
+ "id": "1bffe23b",
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# Description of categorical (string) data\n",
124
+ "externalData %>%\n",
125
+ " select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
126
+ " summarise(across(everything(), ~ {\n",
127
+ " x <- .x\n",
128
+ " if (!is.factor(x)) x <- as.factor(x)\n",
129
+ " n <- length(x)\n",
130
+ " n_unique <- nlevels(x)\n",
131
+ " n_na <- sum(is.na(x))\n",
132
+ " tab <- table(x, useNA = \"ifany\")\n",
133
+ " top <- names(sort(tab, decreasing = TRUE))[1]\n",
134
+ " top_freq <- as.integer(max(tab))\n",
135
+ " paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
136
+ " \", top=\", top, \", top_freq=\", top_freq)\n",
137
+ " }))\n"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "0cd780d3",
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "# Plot distributions (numeric: hist + density; categorical: bar)\n",
148
+ "plot_distribution_like_sample <- function(dataset,\n",
149
+ " cols = 2,\n",
150
+ " rows_per_page = 6,\n",
151
+ " width = 20,\n",
152
+ " height_per_row = 1.2,\n",
153
+ " bins = 30,\n",
154
+ " max_label_len = 18,\n",
155
+ " top_k_cats = 30,\n",
156
+ " missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
157
+ " df <- dataset\n",
158
+ "\n",
159
+ " # unify type for bind_rows\n",
160
+ " long <- dplyr::bind_rows(lapply(names(df), function(nm) {\n",
161
+ " v <- df[[nm]]\n",
162
+ " tibble::tibble(\n",
163
+ " feature = nm,\n",
164
+ " type = if (is.numeric(v)) \"numeric\" else \"categorical\",\n",
165
+ " value = as.character(v)\n",
166
+ " )\n",
167
+ " }))\n",
168
+ "\n",
169
+ " # numeric: convert back + treat missing codes as NA\n",
170
+ " long_num <- long %>%\n",
171
+ " dplyr::filter(type == \"numeric\") %>%\n",
172
+ " dplyr::mutate(value = suppressWarnings(as.numeric(value))) %>%\n",
173
+ " dplyr::mutate(value = ifelse(value %in% missing_codes, NA_real_, value)) %>%\n",
174
+ " dplyr::filter(!is.na(value))\n",
175
+ "\n",
176
+ " # categorical\n",
177
+ " long_cat <- long %>%\n",
178
+ " dplyr::filter(type == \"categorical\") %>%\n",
179
+ " dplyr::mutate(value = ifelse(is.na(value) | value == \"\", \"NaN\", value)) %>%\n",
180
+ " dplyr::group_by(feature) %>%\n",
181
+ " dplyr::mutate(value = forcats::fct_lump_n(factor(value), n = top_k_cats, other_level = \"Other\")) %>%\n",
182
+ " dplyr::ungroup() %>%\n",
183
+ " dplyr::mutate(label = substr(as.character(value), 1, max_label_len))\n",
184
+ "\n",
185
+ " plot_facets_in_pages <- function(data, make_plot, cols, rows_per_page, width, height_per_row) {\n",
186
+ " feats <- unique(data$feature)\n",
187
+ " per_page <- cols * rows_per_page\n",
188
+ " pages <- ceiling(length(feats) / per_page)\n",
189
+ "\n",
190
+ " for (pg in seq_len(pages)) {\n",
191
+ " feats_pg <- feats[((pg - 1) * per_page + 1):min(pg * per_page, length(feats))]\n",
192
+ "\n",
193
+ " options(repr.plot.width = width,\n",
194
+ " repr.plot.height = height_per_row * rows_per_page)\n",
195
+ "\n",
196
+ " print(make_plot(dplyr::filter(data, feature %in% feats_pg), cols))\n",
197
+ " if (pg < pages) message(\"---- Page \", pg, \"/\", pages, \" done ----\")\n",
198
+ " }\n",
199
+ " }\n",
200
+ "\n",
201
+ " make_num_plot <- function(d, cols) {\n",
202
+ " ggplot2::ggplot(d, ggplot2::aes(x = value)) +\n",
203
+ " ggplot2::geom_histogram(ggplot2::aes(y = after_stat(density)), bins = bins) +\n",
204
+ " ggplot2::geom_density() +\n",
205
+ " ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free\") +\n",
206
+ " ggplot2::labs(y = \"Density\", x = \"\") +\n",
207
+ " ggplot2::theme_minimal(base_size = 12) +\n",
208
+ " ggplot2::theme(\n",
209
+ " strip.text = ggplot2::element_text(size = 10),\n",
210
+ " axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
211
+ " )\n",
212
+ " }\n",
213
+ "\n",
214
+ " make_cat_plot <- function(d, cols) {\n",
215
+ " ggplot2::ggplot(d, ggplot2::aes(y = forcats::fct_rev(factor(label)))) +\n",
216
+ " ggplot2::geom_bar() +\n",
217
+ " ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free_y\") +\n",
218
+ " ggplot2::labs(x = \"count\", y = \"\") +\n",
219
+ " ggplot2::theme_minimal(base_size = 12) +\n",
220
+ " ggplot2::theme(\n",
221
+ " strip.text = ggplot2::element_text(size = 10),\n",
222
+ " axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
223
+ " )\n",
224
+ " }\n",
225
+ "\n",
226
+ " plot_facets_in_pages(long_num, make_num_plot, cols, rows_per_page, width, height_per_row)\n",
227
+ " plot_facets_in_pages(long_cat, make_cat_plot, cols, rows_per_page, width, height_per_row)\n",
228
+ "}\n"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": null,
234
+ "id": "ae86a9e2",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "plot_distribution_like_sample(bankChurn, cols = 2, width = 20, height_per_row = 1.2)\n"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "id": "cb60f083",
245
+ "metadata": {},
246
+ "outputs": [],
247
+ "source": [
248
+ "# Numeric vs binary target performance\n",
249
+ "NumVarPerf <- function(df, col, target, truncation = FALSE, bins = 30,\n",
250
+ " missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
251
+ " stopifnot(col %in% names(df), target %in% names(df))\n",
252
+ "\n",
253
+ " validDf <- df %>%\n",
254
+ " select(all_of(c(col, target))) %>%\n",
255
+ " mutate(across(all_of(col), ~ ifelse(.x %in% missing_codes, NA, .x))) %>%\n",
256
+ " filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
257
+ "\n",
258
+ " if (nrow(validDf) == 0) stop(sprintf(\"No valid (non-NA) data for '%s' and '%s'.\", col, target))\n",
259
+ "\n",
260
+ " validRcd <- nrow(validDf) / nrow(df)\n",
261
+ " validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
262
+ "\n",
263
+ " mu <- format(mean(validDf[[col]]), scientific = TRUE, digits = 2)\n",
264
+ " std <- format(sd(validDf[[col]]), scientific = TRUE, digits = 2)\n",
265
+ " minVal <- format(min(validDf[[col]]), scientific = TRUE, digits = 2)\n",
266
+ " maxVal <- format(max(validDf[[col]]), scientific = TRUE, digits = 2)\n",
267
+ "\n",
268
+ " x <- validDf %>% filter(.data[[target]] == 1) %>% pull(.data[[col]])\n",
269
+ " y <- validDf %>% filter(.data[[target]] == 0) %>% pull(.data[[col]])\n",
270
+ "\n",
271
+ " if (truncation) {\n",
272
+ " pcnt95 <- as.numeric(quantile(validDf[[col]], 0.95, na.rm = TRUE))\n",
273
+ " x <- pmin(x, pcnt95)\n",
274
+ " y <- pmin(y, pcnt95)\n",
275
+ " }\n",
276
+ "\n",
277
+ " plotDf <- bind_rows(\n",
278
+ " tibble(value = x, group = \"Attrition\"),\n",
279
+ " tibble(value = y, group = \"Retained\")\n",
280
+ " ) %>%\n",
281
+ " group_by(group) %>%\n",
282
+ " mutate(weight = 100 / n()) %>%\n",
283
+ " ungroup()\n",
284
+ "\n",
285
+ " titleText <- paste0(\n",
286
+ " \"Histogram of \", col,\n",
287
+ " \"\\n\",\n",
288
+ " \"valid pcnt = \", validRcdFmt,\n",
289
+ " \", Mean = \", mu,\n",
290
+ " \", Std = \", std,\n",
291
+ " \", Min = \", minVal,\n",
292
+ " \", Max = \", maxVal\n",
293
+ " )\n",
294
+ "\n",
295
+ " p <- ggplot(plotDf, aes(x = value, weight = weight, fill = group)) +\n",
296
+ " geom_histogram(position = \"identity\", alpha = 0.5, bins = bins) +\n",
297
+ " labs(title = titleText, y = \"% of Dataset in Bin\", x = \"\") +\n",
298
+ " theme_minimal(base_size = 12)\n",
299
+ "\n",
300
+ " print(p)\n",
301
+ "}\n"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "id": "b542b9db",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "NumVarPerf(bankChurn, col = \"AGE\", target = \"CHURN_CUST_IND\", truncation = FALSE, bins = 30)\n"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "id": "7bcf04d2",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "# Remove extreme values (truncation = TRUE).\n"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": null,
327
+ "id": "aa94c1f3",
328
+ "metadata": {},
329
+ "outputs": [],
330
+ "source": [
331
+ "# Categorical vs binary target performance\n",
332
+ "CharVarPerf <- function(df, col, target) {\n",
333
+ " stopifnot(col %in% names(df), target %in% names(df))\n",
334
+ "\n",
335
+ " validDf <- df %>%\n",
336
+ " select(all_of(c(col, target))) %>%\n",
337
+ " filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
338
+ "\n",
339
+ " if (nrow(validDf) == 0) stop(sprintf(\"No valid data for column '%s'.\", col))\n",
340
+ "\n",
341
+ " validRcd <- nrow(validDf) / nrow(df)\n",
342
+ " validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
343
+ "\n",
344
+ " descStats <- validDf %>%\n",
345
+ " mutate(cat = as.character(.data[[col]])) %>%\n",
346
+ " group_by(cat) %>%\n",
347
+ " summarise(\n",
348
+ " percentage = n() / nrow(validDf),\n",
349
+ " churn_rate = mean(.data[[target]]),\n",
350
+ " .groups = \"drop\"\n",
351
+ " ) %>%\n",
352
+ " arrange(churn_rate)\n",
353
+ "\n",
354
+ " max_cr <- max(descStats$churn_rate, na.rm = TRUE)\n",
355
+ " max_pc <- max(descStats$percentage, na.rm = TRUE)\n",
356
+ " scale_factor <- ifelse(max_pc == 0, 1, max_cr / max_pc)\n",
357
+ "\n",
358
+ " p <- ggplot(descStats, aes(x = reorder(cat, churn_rate))) +\n",
359
+ " geom_col(aes(y = percentage * scale_factor), alpha = 0.4) +\n",
360
+ " geom_line(aes(y = churn_rate, group = 1), linewidth = 1) +\n",
361
+ " geom_point(aes(y = churn_rate), size = 2) +\n",
362
+ " scale_y_continuous(\n",
363
+ " name = \"Churn Rate\",\n",
364
+ " sec.axis = sec_axis(~ . / scale_factor, name = \"Percentage\")\n",
365
+ " ) +\n",
366
+ " labs(\n",
367
+ " title = paste0(\"The percentage and churn rate for \", col,\n",
368
+ " \"\\nvalid percentage = \", validRcdFmt),\n",
369
+ " x = col\n",
370
+ " ) +\n",
371
+ " theme_minimal(base_size = 12) +\n",
372
+ " theme(axis.text.x = element_text(angle = 45, hjust = 1))\n",
373
+ "\n",
374
+ " print(p)\n",
375
+ "}\n"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": null,
381
+ "id": "1f8fc91a",
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "# Relationship between GENDER_CD (gender code) and churn status\n",
386
+ "CharVarPerf(bankChurn, col = \"GENDER_CD\", target = \"CHURN_CUST_IND\")\n"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "id": "9bae6508",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "# Relationship between the home-address-information flag (HASNT_HOME_ADDRESS_INF) and churn status\n",
397
+ "CharVarPerf(bankChurn, col = \"HASNT_HOME_ADDRESS_INF\", target = \"CHURN_CUST_IND\")\n"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": null,
403
+ "id": "3c6d08c9",
404
+ "metadata": {},
405
+ "outputs": [],
406
+ "source": [
407
+ "# check null\n",
408
+ "colSums(is.na(bankChurn))\n"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": null,
414
+ "id": "82861964",
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "# preprocess_numeric: 3-sigma clipping + missing value imputation\n",
419
+ "preprocess_numeric <- function(df, col, fill_method = c(\"mean\", \"random\"), truncate = TRUE) {\n",
420
+ " fill_method <- match.arg(fill_method)\n",
421
+ " stopifnot(col %in% names(df))\n",
422
+ "\n",
423
+ " series <- df[[col]]\n",
424
+ "\n",
425
+ " if (truncate) {\n",
426
+ " mu <- mean(series, na.rm = TRUE)\n",
427
+ " std <- sd(series, na.rm = TRUE)\n",
428
+ " upper <- mu + 3 * std\n",
429
+ " lower <- mu - 3 * std\n",
430
+ " series <- pmin(pmax(series, lower), upper)\n",
431
+ " }\n",
432
+ "\n",
433
+ " if (fill_method == \"mean\") {\n",
434
+ " series[is.na(series)] <- mean(series, na.rm = TRUE)\n",
435
+ " } else if (fill_method == \"random\") {\n",
436
+ " valid_values <- series[!is.na(series)]\n",
437
+ " series[is.na(series)] <- sample(valid_values, sum(is.na(series)), replace = TRUE)\n",
438
+ " }\n",
439
+ "\n",
440
+ " df[[col]] <- series\n",
441
+ " df\n",
442
+ "}\n"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": null,
448
+ "id": "12122b10",
449
+ "metadata": {},
450
+ "outputs": [],
451
+ "source": [
452
+ "# (optional) example usage:\n",
453
+ "# bankChurn <- preprocess_numeric(bankChurn, \"AGE\", fill_method = \"mean\", truncate = TRUE)\n"
454
+ ]
455
+ }
456
+ ],
457
+ "metadata": {
458
+ "kernelspec": {
459
+ "display_name": "R",
460
+ "language": "R",
461
+ "name": "ir"
462
+ },
463
+ "language_info": {
464
+ "file_extension": ".r",
465
+ "mimetype": "text/x-r-source",
466
+ "name": "R"
467
+ }
468
+ },
469
+ "nbformat": 4,
470
+ "nbformat_minor": 5
471
+ }