XRachel committed on
Commit
a696af2
·
verified ·
1 Parent(s): 5947371

Upload 2 files

Browse files
BankChurn_Version1.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
BankChurn_Version1_R.ipynb CHANGED
@@ -1,471 +1 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "f5a2a1f9",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "# ---- Packages (R) ----\n",
11
- "suppressPackageStartupMessages({\n",
12
- " library(readr)\n",
13
- " library(dplyr)\n",
14
- " library(tidyr)\n",
15
- " library(ggplot2)\n",
16
- " library(forcats)\n",
17
- "})\n"
18
- ]
19
- },
20
- {
21
- "cell_type": "code",
22
- "execution_count": null,
23
- "id": "258c5234",
24
- "metadata": {},
25
- "outputs": [],
26
- "source": [
27
- "# Load data\n",
28
- "bankChurn <- read_csv(\"./bankChurn.csv\", locale = locale(encoding = \"UTF-8\"))\n",
29
- "head(bankChurn)\n"
30
- ]
31
- },
32
- {
33
- "cell_type": "code",
34
- "execution_count": null,
35
- "id": "0e683c34",
36
- "metadata": {},
37
- "outputs": [],
38
- "source": [
39
- "# Column names\n",
40
- "names(bankChurn)\n"
41
- ]
42
- },
43
- {
44
- "cell_type": "code",
45
- "execution_count": null,
46
- "id": "e522e3d6",
47
- "metadata": {},
48
- "outputs": [],
49
- "source": [
50
- "# Analysis of numerical data\n",
51
- "bankChurn %>%\n",
52
- " select(where(is.numeric)) %>%\n",
53
- " summary()\n"
54
- ]
55
- },
56
- {
57
- "cell_type": "code",
58
- "execution_count": null,
59
- "id": "89d7e296",
60
- "metadata": {},
61
- "outputs": [],
62
- "source": [
63
- "# Description of categorical (string) data\n",
64
- "bankChurn %>%\n",
65
- " select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
66
- " summarise(across(everything(), ~ {\n",
67
- " x <- .x\n",
68
- " if (!is.factor(x)) x <- as.factor(x)\n",
69
- " n <- length(x)\n",
70
- " n_unique <- nlevels(x)\n",
71
- " n_na <- sum(is.na(x))\n",
72
- " tab <- table(x, useNA = \"ifany\")\n",
73
- " top <- names(sort(tab, decreasing = TRUE))[1]\n",
74
- " top_freq <- as.integer(max(tab))\n",
75
- " paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
76
- " \", top=\", top, \", top_freq=\", top_freq)\n",
77
- " }))\n"
78
- ]
79
- },
80
- {
81
- "cell_type": "code",
82
- "execution_count": null,
83
- "id": "53ca7bcf",
84
- "metadata": {},
85
- "outputs": [],
86
- "source": [
87
- "# Load external data\n",
88
- "externalData <- read_csv(\"./ExternalData.csv\", locale = locale(encoding = \"UTF-8\"))\n",
89
- "head(externalData)\n"
90
- ]
91
- },
92
- {
93
- "cell_type": "code",
94
- "execution_count": null,
95
- "id": "cd05fba1",
96
- "metadata": {},
97
- "outputs": [],
98
- "source": [
99
- "# externalData shape\n",
100
- "dim(externalData)\n"
101
- ]
102
- },
103
- {
104
- "cell_type": "code",
105
- "execution_count": null,
106
- "id": "c5f022a3",
107
- "metadata": {},
108
- "outputs": [],
109
- "source": [
110
- "# externalData numeric summary\n",
111
- "externalData %>%\n",
112
- " select(where(is.numeric)) %>%\n",
113
- " summary()\n"
114
- ]
115
- },
116
- {
117
- "cell_type": "code",
118
- "execution_count": null,
119
- "id": "1bffe23b",
120
- "metadata": {},
121
- "outputs": [],
122
- "source": [
123
- "# Description of categorical (string) data\n",
124
- "externalData %>%\n",
125
- " select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
126
- " summarise(across(everything(), ~ {\n",
127
- " x <- .x\n",
128
- " if (!is.factor(x)) x <- as.factor(x)\n",
129
- " n <- length(x)\n",
130
- " n_unique <- nlevels(x)\n",
131
- " n_na <- sum(is.na(x))\n",
132
- " tab <- table(x, useNA = \"ifany\")\n",
133
- " top <- names(sort(tab, decreasing = TRUE))[1]\n",
134
- " top_freq <- as.integer(max(tab))\n",
135
- " paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
136
- " \", top=\", top, \", top_freq=\", top_freq)\n",
137
- " }))\n"
138
- ]
139
- },
140
- {
141
- "cell_type": "code",
142
- "execution_count": null,
143
- "id": "0cd780d3",
144
- "metadata": {},
145
- "outputs": [],
146
- "source": [
147
- "# Plot distributions (numeric: hist + density; categorical: bar)\n",
148
- "plot_distribution_like_sample <- function(dataset,\n",
149
- " cols = 2,\n",
150
- " rows_per_page = 6,\n",
151
- " width = 20,\n",
152
- " height_per_row = 1.2,\n",
153
- " bins = 30,\n",
154
- " max_label_len = 18,\n",
155
- " top_k_cats = 30,\n",
156
- " missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
157
- " df <- dataset\n",
158
- "\n",
159
- " # unify type for bind_rows\n",
160
- " long <- dplyr::bind_rows(lapply(names(df), function(nm) {\n",
161
- " v <- df[[nm]]\n",
162
- " tibble::tibble(\n",
163
- " feature = nm,\n",
164
- " type = if (is.numeric(v)) \"numeric\" else \"categorical\",\n",
165
- " value = as.character(v)\n",
166
- " )\n",
167
- " }))\n",
168
- "\n",
169
- " # numeric: convert back + treat missing codes as NA\n",
170
- " long_num <- long %>%\n",
171
- " dplyr::filter(type == \"numeric\") %>%\n",
172
- " dplyr::mutate(value = suppressWarnings(as.numeric(value))) %>%\n",
173
- " dplyr::mutate(value = ifelse(value %in% missing_codes, NA_real_, value)) %>%\n",
174
- " dplyr::filter(!is.na(value))\n",
175
- "\n",
176
- " # categorical\n",
177
- " long_cat <- long %>%\n",
178
- " dplyr::filter(type == \"categorical\") %>%\n",
179
- " dplyr::mutate(value = ifelse(is.na(value) | value == \"\", \"NaN\", value)) %>%\n",
180
- " dplyr::group_by(feature) %>%\n",
181
- " dplyr::mutate(value = forcats::fct_lump_n(factor(value), n = top_k_cats, other_level = \"Other\")) %>%\n",
182
- " dplyr::ungroup() %>%\n",
183
- " dplyr::mutate(label = substr(as.character(value), 1, max_label_len))\n",
184
- "\n",
185
- " plot_facets_in_pages <- function(data, make_plot, cols, rows_per_page, width, height_per_row) {\n",
186
- " feats <- unique(data$feature)\n",
187
- " per_page <- cols * rows_per_page\n",
188
- " pages <- ceiling(length(feats) / per_page)\n",
189
- "\n",
190
- " for (pg in seq_len(pages)) {\n",
191
- " feats_pg <- feats[((pg - 1) * per_page + 1):min(pg * per_page, length(feats))]\n",
192
- "\n",
193
- " options(repr.plot.width = width,\n",
194
- " repr.plot.height = height_per_row * rows_per_page)\n",
195
- "\n",
196
- " print(make_plot(dplyr::filter(data, feature %in% feats_pg), cols))\n",
197
- " if (pg < pages) message(\"---- Page \", pg, \"/\", pages, \" done ----\")\n",
198
- " }\n",
199
- " }\n",
200
- "\n",
201
- " make_num_plot <- function(d, cols) {\n",
202
- " ggplot2::ggplot(d, ggplot2::aes(x = value)) +\n",
203
- " ggplot2::geom_histogram(ggplot2::aes(y = after_stat(density)), bins = bins) +\n",
204
- " ggplot2::geom_density() +\n",
205
- " ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free\") +\n",
206
- " ggplot2::labs(y = \"Density\", x = \"\") +\n",
207
- " ggplot2::theme_minimal(base_size = 12) +\n",
208
- " ggplot2::theme(\n",
209
- " strip.text = ggplot2::element_text(size = 10),\n",
210
- " axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
211
- " )\n",
212
- " }\n",
213
- "\n",
214
- " make_cat_plot <- function(d, cols) {\n",
215
- " ggplot2::ggplot(d, ggplot2::aes(y = forcats::fct_rev(factor(label)))) +\n",
216
- " ggplot2::geom_bar() +\n",
217
- " ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free_y\") +\n",
218
- " ggplot2::labs(x = \"count\", y = \"\") +\n",
219
- " ggplot2::theme_minimal(base_size = 12) +\n",
220
- " ggplot2::theme(\n",
221
- " strip.text = ggplot2::element_text(size = 10),\n",
222
- " axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
223
- " )\n",
224
- " }\n",
225
- "\n",
226
- " plot_facets_in_pages(long_num, make_num_plot, cols, rows_per_page, width, height_per_row)\n",
227
- " plot_facets_in_pages(long_cat, make_cat_plot, cols, rows_per_page, width, height_per_row)\n",
228
- "}\n"
229
- ]
230
- },
231
- {
232
- "cell_type": "code",
233
- "execution_count": null,
234
- "id": "ae86a9e2",
235
- "metadata": {},
236
- "outputs": [],
237
- "source": [
238
- "plot_distribution_like_sample(bankChurn, cols = 2, width = 20, height_per_row = 1.2)\n"
239
- ]
240
- },
241
- {
242
- "cell_type": "code",
243
- "execution_count": null,
244
- "id": "cb60f083",
245
- "metadata": {},
246
- "outputs": [],
247
- "source": [
248
- "# Numeric vs binary target performance\n",
249
- "NumVarPerf <- function(df, col, target, truncation = FALSE, bins = 30,\n",
250
- " missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
251
- " stopifnot(col %in% names(df), target %in% names(df))\n",
252
- "\n",
253
- " validDf <- df %>%\n",
254
- " select(all_of(c(col, target))) %>%\n",
255
- " mutate(across(all_of(col), ~ ifelse(.x %in% missing_codes, NA, .x))) %>%\n",
256
- " filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
257
- "\n",
258
- " if (nrow(validDf) == 0) stop(sprintf(\"No valid (non-NA) data for '%s' and '%s'.\", col, target))\n",
259
- "\n",
260
- " validRcd <- nrow(validDf) / nrow(df)\n",
261
- " validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
262
- "\n",
263
- " mu <- format(mean(validDf[[col]]), scientific = TRUE, digits = 2)\n",
264
- " std <- format(sd(validDf[[col]]), scientific = TRUE, digits = 2)\n",
265
- " minVal <- format(min(validDf[[col]]), scientific = TRUE, digits = 2)\n",
266
- " maxVal <- format(max(validDf[[col]]), scientific = TRUE, digits = 2)\n",
267
- "\n",
268
- " x <- validDf %>% filter(.data[[target]] == 1) %>% pull(.data[[col]])\n",
269
- " y <- validDf %>% filter(.data[[target]] == 0) %>% pull(.data[[col]])\n",
270
- "\n",
271
- " if (truncation) {\n",
272
- " pcnt95 <- as.numeric(quantile(validDf[[col]], 0.95, na.rm = TRUE))\n",
273
- " x <- pmin(x, pcnt95)\n",
274
- " y <- pmin(y, pcnt95)\n",
275
- " }\n",
276
- "\n",
277
- " plotDf <- bind_rows(\n",
278
- " tibble(value = x, group = \"Attrition\"),\n",
279
- " tibble(value = y, group = \"Retained\")\n",
280
- " ) %>%\n",
281
- " group_by(group) %>%\n",
282
- " mutate(weight = 100 / n()) %>%\n",
283
- " ungroup()\n",
284
- "\n",
285
- " titleText <- paste0(\n",
286
- " \"Histogram of \", col, \"\n",
287
- "\",\n",
288
- " \"valid pcnt = \", validRcdFmt,\n",
289
- " \", Mean = \", mu,\n",
290
- " \", Std = \", std,\n",
291
- " \", Min = \", minVal,\n",
292
- " \", Max = \", maxVal\n",
293
- " )\n",
294
- "\n",
295
- " p <- ggplot(plotDf, aes(x = value, weight = weight, fill = group)) +\n",
296
- " geom_histogram(position = \"identity\", alpha = 0.5, bins = bins) +\n",
297
- " labs(title = titleText, y = \"% of Dataset in Bin\", x = \"\") +\n",
298
- " theme_minimal(base_size = 12)\n",
299
- "\n",
300
- " print(p)\n",
301
- "}\n"
302
- ]
303
- },
304
- {
305
- "cell_type": "code",
306
- "execution_count": null,
307
- "id": "b542b9db",
308
- "metadata": {},
309
- "outputs": [],
310
- "source": [
311
- "NumVarPerf(bankChurn, col = \"AGE\", target = \"CHURN_CUST_IND\", truncation = FALSE, bins = 30)\n"
312
- ]
313
- },
314
- {
315
- "cell_type": "code",
316
- "execution_count": null,
317
- "id": "7bcf04d2",
318
- "metadata": {},
319
- "outputs": [],
320
- "source": [
321
- "# Remove extreme values (truncation=True).\n"
322
- ]
323
- },
324
- {
325
- "cell_type": "code",
326
- "execution_count": null,
327
- "id": "aa94c1f3",
328
- "metadata": {},
329
- "outputs": [],
330
- "source": [
331
- "# Categorical vs binary target performance\n",
332
- "CharVarPerf <- function(df, col, target) {\n",
333
- " stopifnot(col %in% names(df), target %in% names(df))\n",
334
- "\n",
335
- " validDf <- df %>%\n",
336
- " select(all_of(c(col, target))) %>%\n",
337
- " filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
338
- "\n",
339
- " if (nrow(validDf) == 0) stop(sprintf(\"No valid data for column '%s'.\", col))\n",
340
- "\n",
341
- " validRcd <- nrow(validDf) / nrow(df)\n",
342
- " validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
343
- "\n",
344
- " descStats <- validDf %>%\n",
345
- " mutate(cat = as.character(.data[[col]])) %>%\n",
346
- " group_by(cat) %>%\n",
347
- " summarise(\n",
348
- " percentage = n() / nrow(validDf),\n",
349
- " churn_rate = mean(.data[[target]]),\n",
350
- " .groups = \"drop\"\n",
351
- " ) %>%\n",
352
- " arrange(churn_rate)\n",
353
- "\n",
354
- " max_cr <- max(descStats$churn_rate, na.rm = TRUE)\n",
355
- " max_pc <- max(descStats$percentage, na.rm = TRUE)\n",
356
- " scale_factor <- ifelse(max_pc == 0, 1, max_cr / max_pc)\n",
357
- "\n",
358
- " p <- ggplot(descStats, aes(x = reorder(cat, churn_rate))) +\n",
359
- " geom_col(aes(y = percentage * scale_factor), alpha = 0.4) +\n",
360
- " geom_line(aes(y = churn_rate, group = 1), linewidth = 1) +\n",
361
- " geom_point(aes(y = churn_rate), size = 2) +\n",
362
- " scale_y_continuous(\n",
363
- " name = \"Churn Rate\",\n",
364
- " sec.axis = sec_axis(~ . / scale_factor, name = \"Percentage\")\n",
365
- " ) +\n",
366
- " labs(\n",
367
- " title = paste0(\"The percentage and churn rate for \", col, \"\n",
368
- "valid percentage = \", validRcdFmt),\n",
369
- " x = col\n",
370
- " ) +\n",
371
- " theme_minimal(base_size = 12) +\n",
372
- " theme(axis.text.x = element_text(angle = 45, hjust = 1))\n",
373
- "\n",
374
- " print(p)\n",
375
- "}\n"
376
- ]
377
- },
378
- {
379
- "cell_type": "code",
380
- "execution_count": null,
381
- "id": "1f8fc91a",
382
- "metadata": {},
383
- "outputs": [],
384
- "source": [
385
- "# Relationship between GENDER_CD (gender code) and churn status\n",
386
- "CharVarPerf(bankChurn, col = \"GENDER_CD\", target = \"CHURN_CUST_IND\")\n"
387
- ]
388
- },
389
- {
390
- "cell_type": "code",
391
- "execution_count": null,
392
- "id": "9bae6508",
393
- "metadata": {},
394
- "outputs": [],
395
- "source": [
396
- "# Relationship between whether the customer has home address information\n",
397
- "CharVarPerf(bankChurn, col = \"HASNT_HOME_ADDRESS_INF\", target = \"CHURN_CUST_IND\")\n"
398
- ]
399
- },
400
- {
401
- "cell_type": "code",
402
- "execution_count": null,
403
- "id": "3c6d08c9",
404
- "metadata": {},
405
- "outputs": [],
406
- "source": [
407
- "# check null\n",
408
- "colSums(is.na(bankChurn))\n"
409
- ]
410
- },
411
- {
412
- "cell_type": "code",
413
- "execution_count": null,
414
- "id": "82861964",
415
- "metadata": {},
416
- "outputs": [],
417
- "source": [
418
- "# preprocess_numeric: 3-sigma clipping + missing value imputation\n",
419
- "preprocess_numeric <- function(df, col, fill_method = c(\"mean\", \"random\"), truncate = TRUE) {\n",
420
- " fill_method <- match.arg(fill_method)\n",
421
- " stopifnot(col %in% names(df))\n",
422
- "\n",
423
- " series <- df[[col]]\n",
424
- "\n",
425
- " if (truncate) {\n",
426
- " mu <- mean(series, na.rm = TRUE)\n",
427
- " std <- sd(series, na.rm = TRUE)\n",
428
- " upper <- mu + 3 * std\n",
429
- " lower <- mu - 3 * std\n",
430
- " series <- pmin(pmax(series, lower), upper)\n",
431
- " }\n",
432
- "\n",
433
- " if (fill_method == \"mean\") {\n",
434
- " series[is.na(series)] <- mean(series, na.rm = TRUE)\n",
435
- " } else if (fill_method == \"random\") {\n",
436
- " valid_values <- series[!is.na(series)]\n",
437
- " series[is.na(series)] <- sample(valid_values, sum(is.na(series)), replace = TRUE)\n",
438
- " }\n",
439
- "\n",
440
- " df[[col]] <- series\n",
441
- " df\n",
442
- "}\n"
443
- ]
444
- },
445
- {
446
- "cell_type": "code",
447
- "execution_count": null,
448
- "id": "12122b10",
449
- "metadata": {},
450
- "outputs": [],
451
- "source": [
452
- "# (optional) example usage:\n",
453
- "# bankChurn <- preprocess_numeric(bankChurn, \"AGE\", fill_method = \"mean\", truncate = TRUE)\n"
454
- ]
455
- }
456
- ],
457
- "metadata": {
458
- "kernelspec": {
459
- "display_name": "R",
460
- "language": "R",
461
- "name": "ir"
462
- },
463
- "language_info": {
464
- "file_extension": ".r",
465
- "mimetype": "text/x-r-source",
466
- "name": "R"
467
- }
468
- },
469
- "nbformat": 4,
470
- "nbformat_minor": 5
471
- }
 
1
+ {"cells": [{"cell_type": "code", "metadata": {"language": "R"}, "source": ["library(readr)\n", "library(dplyr)\n", "dir.create('artifacts/r/tables', recursive=TRUE, showWarnings=FALSE)\n", "bankChurn <- read_csv('bankChurn.csv')\n", "summary_geo <- bankChurn |> group_by(Geography) |> summarise(churn_rate = mean(Exited))\n", "write_csv(summary_geo, 'artifacts/r/tables/r_churn_geo.csv')\n", "summary_geo\n"], "outputs": [], "execution_count": null}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}