Spaces:

XRachel
/

bc4

Sleeping

App Files Files Community

XRachel committed on Mar 16

Commit

51167cf

verified ·

1 Parent(s): da75a09

Upload 2 files

Browse files

Files changed (2) hide show

BankChurn_Version1.ipynb +0 -0
BankChurn_Version1_R.ipynb +471 -0

BankChurn_Version1.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

BankChurn_Version1_R.ipynb ADDED Viewed

	@@ -0,0 +1,471 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5a2a1f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ---- Packages (R) ----\n",
+    "suppressPackageStartupMessages({\n",
+    "  library(readr)\n",
+    "  library(dplyr)\n",
+    "  library(tidyr)\n",
+    "  library(ggplot2)\n",
+    "  library(forcats)\n",
+    "})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "258c5234",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load data\n",
+    "bankChurn <- read_csv(\"./bankChurn.csv\", locale = locale(encoding = \"UTF-8\"))\n",
+    "head(bankChurn)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e683c34",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column names\n",
+    "names(bankChurn)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e522e3d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Analysis of numerical data\n",
+    "bankChurn %>%\n",
+    "  select(where(is.numeric)) %>%\n",
+    "  summary()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89d7e296",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Description of categorical (string) data\n",
+    "# One summary string per character/factor/logical column: row count,\n",
+    "# number of factor levels, NA count, and the most frequent value\n",
+    "# (NA is counted when present) together with its frequency.\n",
+    "bankChurn %>%\n",
+    "  select(where(function(column) {\n",
+    "    is.character(column) || is.factor(column) || is.logical(column)\n",
+    "  })) %>%\n",
+    "  summarise(across(everything(), function(column) {\n",
+    "    # as.factor() returns factors unchanged, so no type check is needed.\n",
+    "    as_levels <- as.factor(column)\n",
+    "    counts <- table(as_levels, useNA = \"ifany\")\n",
+    "    ranked <- sort(counts, decreasing = TRUE)\n",
+    "    paste0(\n",
+    "      \"n=\", length(as_levels),\n",
+    "      \", unique=\", nlevels(as_levels),\n",
+    "      \", NA=\", sum(is.na(as_levels)),\n",
+    "      \", top=\", names(ranked)[1],\n",
+    "      \", top_freq=\", as.integer(max(counts))\n",
+    "    )\n",
+    "  }))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53ca7bcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load external data\n",
+    "externalData <- read_csv(\"./ExternalData.csv\", locale = locale(encoding = \"UTF-8\"))\n",
+    "head(externalData)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd05fba1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# externalData shape\n",
+    "dim(externalData)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5f022a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# externalData numeric summary\n",
+    "externalData %>%\n",
+    "  select(where(is.numeric)) %>%\n",
+    "  summary()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bffe23b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Description of categorical (string) data\n",
+    "# One summary string per character/factor/logical column: row count,\n",
+    "# number of factor levels, NA count, and the most frequent value\n",
+    "# (NA is counted when present) together with its frequency.\n",
+    "externalData %>%\n",
+    "  select(where(function(column) {\n",
+    "    is.character(column) || is.factor(column) || is.logical(column)\n",
+    "  })) %>%\n",
+    "  summarise(across(everything(), function(column) {\n",
+    "    # as.factor() returns factors unchanged, so no type check is needed.\n",
+    "    as_levels <- as.factor(column)\n",
+    "    counts <- table(as_levels, useNA = \"ifany\")\n",
+    "    ranked <- sort(counts, decreasing = TRUE)\n",
+    "    paste0(\n",
+    "      \"n=\", length(as_levels),\n",
+    "      \", unique=\", nlevels(as_levels),\n",
+    "      \", NA=\", sum(is.na(as_levels)),\n",
+    "      \", top=\", names(ranked)[1],\n",
+    "      \", top_freq=\", as.integer(max(counts))\n",
+    "    )\n",
+    "  }))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cd780d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot distributions (numeric: hist + density; categorical: bar)\n",
+    "#\n",
+    "# Every column of `dataset` is stacked into long form (feature, type, value).\n",
+    "# Numeric columns are drawn as density-scaled histograms with a density curve,\n",
+    "# all other columns as horizontal bar charts; both are faceted by feature and\n",
+    "# printed in pages of cols * rows_per_page facets.\n",
+    "#   cols, rows_per_page    facet grid size of one page.\n",
+    "#   width, height_per_row  written to options(repr.plot.width / repr.plot.height)\n",
+    "#                          before each page is printed; the options stay set.\n",
+    "#   bins                   number of histogram bins.\n",
+    "#   max_label_len          category labels are cut to this many characters.\n",
+    "#   top_k_cats             passed to forcats::fct_lump_n as n; the remaining\n",
+    "#                          levels of a feature are lumped into Other.\n",
+    "#   missing_codes          numeric sentinel values dropped as missing.\n",
+    "plot_distribution_like_sample <- function(dataset,\n",
+    "                                          cols = 2,\n",
+    "                                          rows_per_page = 6,\n",
+    "                                          width = 20,\n",
+    "                                          height_per_row = 1.2,\n",
+    "                                          bins = 30,\n",
+    "                                          max_label_len = 18,\n",
+    "                                          top_k_cats = 30,\n",
+    "                                          missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
+    "  # Stack one column as (feature, type, value). Values go through character\n",
+    "  # so that columns of different types can be row-bound.\n",
+    "  stack_column <- function(col_name) {\n",
+    "    column <- dataset[[col_name]]\n",
+    "    tibble::tibble(\n",
+    "      feature = col_name,\n",
+    "      type = if (is.numeric(column)) \"numeric\" else \"categorical\",\n",
+    "      value = as.character(column)\n",
+    "    )\n",
+    "  }\n",
+    "  stacked <- dplyr::bind_rows(lapply(names(dataset), stack_column))\n",
+    "\n",
+    "  # Numeric features: back to numbers, sentinel codes and NAs removed.\n",
+    "  numeric_long <- dplyr::filter(stacked, type == \"numeric\")\n",
+    "  numeric_long <- dplyr::mutate(\n",
+    "    numeric_long,\n",
+    "    value = suppressWarnings(as.numeric(value)),\n",
+    "    value = ifelse(value %in% missing_codes, NA_real_, value)\n",
+    "  )\n",
+    "  numeric_long <- dplyr::filter(numeric_long, !is.na(value))\n",
+    "\n",
+    "  # Categorical features: missing/empty become \"NaN\", rare levels are lumped\n",
+    "  # per feature, and labels are shortened for the axis.\n",
+    "  categorical_long <- dplyr::filter(stacked, type == \"categorical\")\n",
+    "  categorical_long <- dplyr::mutate(\n",
+    "    categorical_long,\n",
+    "    value = ifelse(is.na(value) | value == \"\", \"NaN\", value)\n",
+    "  )\n",
+    "  categorical_long <- dplyr::group_by(categorical_long, feature)\n",
+    "  categorical_long <- dplyr::mutate(\n",
+    "    categorical_long,\n",
+    "    value = forcats::fct_lump_n(factor(value), n = top_k_cats, other_level = \"Other\")\n",
+    "  )\n",
+    "  categorical_long <- dplyr::ungroup(categorical_long)\n",
+    "  categorical_long <- dplyr::mutate(\n",
+    "    categorical_long,\n",
+    "    label = substr(as.character(value), 1, max_label_len)\n",
+    "  )\n",
+    "\n",
+    "  # Histogram + density curve, one facet per numeric feature.\n",
+    "  build_histograms <- function(page_data, n_cols) {\n",
+    "    facet_text <- ggplot2::theme(\n",
+    "      strip.text = ggplot2::element_text(size = 10),\n",
+    "      axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
+    "    )\n",
+    "    ggplot2::ggplot(page_data, ggplot2::aes(x = value)) +\n",
+    "      ggplot2::geom_histogram(ggplot2::aes(y = after_stat(density)), bins = bins) +\n",
+    "      ggplot2::geom_density() +\n",
+    "      ggplot2::facet_wrap(~ feature, ncol = n_cols, scales = \"free\") +\n",
+    "      ggplot2::labs(y = \"Density\", x = \"\") +\n",
+    "      ggplot2::theme_minimal(base_size = 12) +\n",
+    "      facet_text\n",
+    "  }\n",
+    "\n",
+    "  # Horizontal count bars, one facet per categorical feature.\n",
+    "  build_bars <- function(page_data, n_cols) {\n",
+    "    facet_text <- ggplot2::theme(\n",
+    "      strip.text = ggplot2::element_text(size = 10),\n",
+    "      axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
+    "    )\n",
+    "    ggplot2::ggplot(page_data, ggplot2::aes(y = forcats::fct_rev(factor(label)))) +\n",
+    "      ggplot2::geom_bar() +\n",
+    "      ggplot2::facet_wrap(~ feature, ncol = n_cols, scales = \"free_y\") +\n",
+    "      ggplot2::labs(x = \"count\", y = \"\") +\n",
+    "      ggplot2::theme_minimal(base_size = 12) +\n",
+    "      facet_text\n",
+    "  }\n",
+    "\n",
+    "  # Print the features of `data` page by page using `build_plot`.\n",
+    "  render_pages <- function(data, build_plot) {\n",
+    "    feature_names <- unique(data$feature)\n",
+    "    page_size <- cols * rows_per_page\n",
+    "    n_pages <- ceiling(length(feature_names) / page_size)\n",
+    "    # Page number of every feature, in order of first appearance.\n",
+    "    page_of <- ceiling(seq_along(feature_names) / page_size)\n",
+    "\n",
+    "    for (page in seq_len(n_pages)) {\n",
+    "      on_this_page <- feature_names[page_of == page]\n",
+    "\n",
+    "      options(repr.plot.width = width,\n",
+    "              repr.plot.height = height_per_row * rows_per_page)\n",
+    "\n",
+    "      print(build_plot(dplyr::filter(data, feature %in% on_this_page), cols))\n",
+    "      if (page < n_pages) message(\"---- Page \", page, \"/\", n_pages, \" done ----\")\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  render_pages(numeric_long, build_histograms)\n",
+    "  render_pages(categorical_long, build_bars)\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae86a9e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_distribution_like_sample(bankChurn, cols = 2, width = 20, height_per_row = 1.2)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb60f083",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Numeric vs binary target performance\n",
+    "#\n",
+    "# Overlays the histogram of `col` for rows with target == 1 (Attrition) and\n",
+    "# target == 0 (Retained); each group is weighted so its bars sum to 100.\n",
+    "#   df            data frame containing `col` and `target`.\n",
+    "#   col           name (string) of the column to plot.\n",
+    "#   target        name (string) of the target column, compared against 1 and 0.\n",
+    "#   truncation    if TRUE, cap values at the 95th percentile of the valid rows.\n",
+    "#   bins          number of histogram bins.\n",
+    "#   missing_codes sentinel values of `col` treated as NA before filtering.\n",
+    "# Prints the plot. Stops if no row has both columns non-missing.\n",
+    "NumVarPerf <- function(df, col, target, truncation = FALSE, bins = 30,\n",
+    "                       missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
+    "  stopifnot(col %in% names(df), target %in% names(df))\n",
+    "\n",
+    "  validDf <- df %>%\n",
+    "    select(all_of(c(col, target))) %>%\n",
+    "    mutate(across(all_of(col), ~ ifelse(.x %in% missing_codes, NA, .x))) %>%\n",
+    "    filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
+    "\n",
+    "  if (nrow(validDf) == 0) stop(sprintf(\"No valid (non-NA) data for '%s' and '%s'.\", col, target))\n",
+    "\n",
+    "  # Share of the input rows that survive the missing-value filter.\n",
+    "  validRcd <- nrow(validDf) / nrow(df)\n",
+    "  validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
+    "\n",
+    "  mu <- format(mean(validDf[[col]]), scientific = TRUE, digits = 2)\n",
+    "  std <- format(sd(validDf[[col]]), scientific = TRUE, digits = 2)\n",
+    "  minVal <- format(min(validDf[[col]]), scientific = TRUE, digits = 2)\n",
+    "  maxVal <- format(max(validDf[[col]]), scientific = TRUE, digits = 2)\n",
+    "\n",
+    "  x <- validDf %>% filter(.data[[target]] == 1) %>% pull(.data[[col]])\n",
+    "  y <- validDf %>% filter(.data[[target]] == 0) %>% pull(.data[[col]])\n",
+    "\n",
+    "  if (truncation) {\n",
+    "    pcnt95 <- as.numeric(quantile(validDf[[col]], 0.95, na.rm = TRUE))\n",
+    "    x <- pmin(x, pcnt95)\n",
+    "    y <- pmin(y, pcnt95)\n",
+    "  }\n",
+    "\n",
+    "  # weight = 100 / group size, so bar heights read as percent of the group.\n",
+    "  plotDf <- bind_rows(\n",
+    "    tibble(value = x, group = \"Attrition\"),\n",
+    "    tibble(value = y, group = \"Retained\")\n",
+    "  ) %>%\n",
+    "    group_by(group) %>%\n",
+    "    mutate(weight = 100 / n()) %>%\n",
+    "    ungroup()\n",
+    "\n",
+    "  titleText <- paste0(\n",
+    "    \"Histogram of \", col, \"\\n\",\n",
+    "    \"valid pcnt = \", validRcdFmt,\n",
+    "    \", Mean = \", mu,\n",
+    "    \", Std = \", std,\n",
+    "    \", Min = \", minVal,\n",
+    "    \", Max = \", maxVal\n",
+    "  )\n",
+    "\n",
+    "  p <- ggplot(plotDf, aes(x = value, weight = weight, fill = group)) +\n",
+    "    geom_histogram(position = \"identity\", alpha = 0.5, bins = bins) +\n",
+    "    labs(title = titleText, y = \"% of Dataset in Bin\", x = \"\") +\n",
+    "    theme_minimal(base_size = 12)\n",
+    "\n",
+    "  print(p)\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b542b9db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NumVarPerf(bankChurn, col = \"AGE\", target = \"CHURN_CUST_IND\", truncation = FALSE, bins = 30)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bcf04d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove extreme values (truncation = TRUE).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa94c1f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Categorical vs binary target performance\n",
+    "#\n",
+    "# For each level of `col`, draws its share of the valid rows (bars, secondary\n",
+    "# axis) and the mean of `target` (line and points, primary axis), with levels\n",
+    "# ordered by that mean.\n",
+    "#   df      data frame containing `col` and `target`.\n",
+    "#   col     name (string) of the categorical column.\n",
+    "#   target  name (string) of the target column; its mean per level is plotted\n",
+    "#           as the churn rate (presumably coded 0/1 - confirm for new data).\n",
+    "# Rows where either column is NA are dropped. Stops if no row is left.\n",
+    "# Prints the plot.\n",
+    "CharVarPerf <- function(df, col, target) {\n",
+    "  stopifnot(col %in% names(df), target %in% names(df))\n",
+    "\n",
+    "  validDf <- df %>%\n",
+    "    select(all_of(c(col, target))) %>%\n",
+    "    filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
+    "\n",
+    "  if (nrow(validDf) == 0) stop(sprintf(\"No valid data for column '%s'.\", col))\n",
+    "\n",
+    "  # Share of the input rows that survive the missing-value filter.\n",
+    "  validRcd <- nrow(validDf) / nrow(df)\n",
+    "  validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
+    "\n",
+    "  descStats <- validDf %>%\n",
+    "    mutate(cat = as.character(.data[[col]])) %>%\n",
+    "    group_by(cat) %>%\n",
+    "    summarise(\n",
+    "      percentage = n() / nrow(validDf),\n",
+    "      churn_rate = mean(.data[[target]]),\n",
+    "      .groups = \"drop\"\n",
+    "    ) %>%\n",
+    "    arrange(churn_rate)\n",
+    "\n",
+    "  max_cr <- max(descStats$churn_rate, na.rm = TRUE)\n",
+    "  max_pc <- max(descStats$percentage, na.rm = TRUE)\n",
+    "  # Factor that maps bar heights onto the churn-rate axis. A ratio that is\n",
+    "  # zero (no churn in any level) or not finite would flatten the bars and\n",
+    "  # make the secondary axis divide by zero, so fall back to 1.\n",
+    "  scale_factor <- max_cr / max_pc\n",
+    "  if (!is.finite(scale_factor) || scale_factor <= 0) scale_factor <- 1\n",
+    "\n",
+    "  p <- ggplot(descStats, aes(x = reorder(cat, churn_rate))) +\n",
+    "    geom_col(aes(y = percentage * scale_factor), alpha = 0.4) +\n",
+    "    geom_line(aes(y = churn_rate, group = 1), linewidth = 1) +\n",
+    "    geom_point(aes(y = churn_rate), size = 2) +\n",
+    "    scale_y_continuous(\n",
+    "      name = \"Churn Rate\",\n",
+    "      sec.axis = sec_axis(~ . / scale_factor, name = \"Percentage\")\n",
+    "    ) +\n",
+    "    labs(\n",
+    "      title = paste0(\"The percentage and churn rate for \", col,\n",
+    "                     \"\\nvalid percentage = \", validRcdFmt),\n",
+    "      x = col\n",
+    "    ) +\n",
+    "    theme_minimal(base_size = 12) +\n",
+    "    theme(axis.text.x = element_text(angle = 45, hjust = 1))\n",
+    "\n",
+    "  print(p)\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f8fc91a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relationship between GENDER_CD (gender code) and churn status\n",
+    "CharVarPerf(bankChurn, col = \"GENDER_CD\", target = \"CHURN_CUST_IND\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9bae6508",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relationship between whether the customer has home address information\n",
+    "CharVarPerf(bankChurn, col = \"HASNT_HOME_ADDRESS_INF\", target = \"CHURN_CUST_IND\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c6d08c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check null\n",
+    "colSums(is.na(bankChurn))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82861964",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preprocess_numeric: 3-sigma clipping + missing value imputation\n",
+    "#\n",
+    "#   df          data frame containing `col`.\n",
+    "#   col         name (string) of the numeric column to process.\n",
+    "#   fill_method \"mean\": fill NAs with the mean of the (clipped) values;\n",
+    "#               \"random\": fill NAs by resampling the observed values.\n",
+    "#   truncate    if TRUE, clip values to mean +/- 3 * sd before imputing.\n",
+    "# Returns `df` with `col` replaced by the processed vector.\n",
+    "preprocess_numeric <- function(df, col, fill_method = c(\"mean\", \"random\"), truncate = TRUE) {\n",
+    "  fill_method <- match.arg(fill_method)\n",
+    "  stopifnot(col %in% names(df))\n",
+    "\n",
+    "  series <- df[[col]]\n",
+    "\n",
+    "  if (truncate) {\n",
+    "    mu <- mean(series, na.rm = TRUE)\n",
+    "    std <- sd(series, na.rm = TRUE)\n",
+    "    # sd() is NA with fewer than two observed values; clipping against NA\n",
+    "    # bounds would turn every value into NA, so clip only with usable bounds.\n",
+    "    if (is.finite(mu) && is.finite(std)) {\n",
+    "      upper <- mu + 3 * std\n",
+    "      lower <- mu - 3 * std\n",
+    "      series <- pmin(pmax(series, lower), upper)\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  is_missing <- is.na(series)\n",
+    "\n",
+    "  if (fill_method == \"mean\") {\n",
+    "    series[is_missing] <- mean(series, na.rm = TRUE)\n",
+    "  } else if (fill_method == \"random\" && any(is_missing)) {\n",
+    "    valid_values <- series[!is_missing]\n",
+    "    if (length(valid_values) == 0) {\n",
+    "      warning(sprintf(\"Column '%s' has no observed values to sample from.\", col),\n",
+    "              call. = FALSE)\n",
+    "    } else {\n",
+    "      # Draw positions, not values: sample(x, ...) treats a length-one\n",
+    "      # numeric x as 1:x and would invent values that were never observed.\n",
+    "      picks <- sample.int(length(valid_values), sum(is_missing), replace = TRUE)\n",
+    "      series[is_missing] <- valid_values[picks]\n",
+    "    }\n",
+    "  }\n",
+    "\n",
+    "  df[[col]] <- series\n",
+    "  df\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12122b10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# (optional) example usage:\n",
+    "# bankChurn <- preprocess_numeric(bankChurn, \"AGE\", fill_method = \"mean\", truncate = TRUE)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}