Spaces:

XRachel
/

bc4

Sleeping

App Files Files Community

XRachel committed on Mar 16

Commit

a696af2

verified ·

1 Parent(s): 5947371

Upload 2 files

Browse files

Files changed (2) hide show

BankChurn_Version1.ipynb +0 -0
BankChurn_Version1_R.ipynb +1 -471

BankChurn_Version1.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

BankChurn_Version1_R.ipynb CHANGED Viewed

@@ -1,471 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f5a2a1f9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ---- Packages (R) ----\n",
-    "suppressPackageStartupMessages({\n",
-    "  library(readr)\n",
-    "  library(dplyr)\n",
-    "  library(tidyr)\n",
-    "  library(ggplot2)\n",
-    "  library(forcats)\n",
-    "})\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "258c5234",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load data\n",
-    "bankChurn <- read_csv(\"./bankChurn.csv\", locale = locale(encoding = \"UTF-8\"))\n",
-    "head(bankChurn)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0e683c34",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Column names\n",
-    "names(bankChurn)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e522e3d6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Analysis of numerical data\n",
-    "bankChurn %>%\n",
-    "  select(where(is.numeric)) %>%\n",
-    "  summary()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "89d7e296",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Description of categorical (string) data\n",
-    "bankChurn %>%\n",
-    "  select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
-    "  summarise(across(everything(), ~ {\n",
-    "    x <- .x\n",
-    "    if (!is.factor(x)) x <- as.factor(x)\n",
-    "    n <- length(x)\n",
-    "    n_unique <- nlevels(x)\n",
-    "    n_na <- sum(is.na(x))\n",
-    "    tab <- table(x, useNA = \"ifany\")\n",
-    "    top <- names(sort(tab, decreasing = TRUE))[1]\n",
-    "    top_freq <- as.integer(max(tab))\n",
-    "    paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
-    "           \", top=\", top, \", top_freq=\", top_freq)\n",
-    "  }))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "53ca7bcf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load external data\n",
-    "externalData <- read_csv(\"./ExternalData.csv\", locale = locale(encoding = \"UTF-8\"))\n",
-    "head(externalData)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cd05fba1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# externalData shape\n",
-    "dim(externalData)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c5f022a3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# externalData numeric summary\n",
-    "externalData %>%\n",
-    "  select(where(is.numeric)) %>%\n",
-    "  summary()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1bffe23b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Description of categorical (string) data\n",
-    "externalData %>%\n",
-    "  select(where(~ is.character(.x) || is.factor(.x) || is.logical(.x))) %>%\n",
-    "  summarise(across(everything(), ~ {\n",
-    "    x <- .x\n",
-    "    if (!is.factor(x)) x <- as.factor(x)\n",
-    "    n <- length(x)\n",
-    "    n_unique <- nlevels(x)\n",
-    "    n_na <- sum(is.na(x))\n",
-    "    tab <- table(x, useNA = \"ifany\")\n",
-    "    top <- names(sort(tab, decreasing = TRUE))[1]\n",
-    "    top_freq <- as.integer(max(tab))\n",
-    "    paste0(\"n=\", n, \", unique=\", n_unique, \", NA=\", n_na,\n",
-    "           \", top=\", top, \", top_freq=\", top_freq)\n",
-    "  }))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0cd780d3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Plot distributions (numeric: hist + density; categorical: bar)\n",
-    "plot_distribution_like_sample <- function(dataset,\n",
-    "                                          cols = 2,\n",
-    "                                          rows_per_page = 6,\n",
-    "                                          width = 20,\n",
-    "                                          height_per_row = 1.2,\n",
-    "                                          bins = 30,\n",
-    "                                          max_label_len = 18,\n",
-    "                                          top_k_cats = 30,\n",
-    "                                          missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
-    "  df <- dataset\n",
-    "\n",
-    "  # unify type for bind_rows\n",
-    "  long <- dplyr::bind_rows(lapply(names(df), function(nm) {\n",
-    "    v <- df[[nm]]\n",
-    "    tibble::tibble(\n",
-    "      feature = nm,\n",
-    "      type = if (is.numeric(v)) \"numeric\" else \"categorical\",\n",
-    "      value = as.character(v)\n",
-    "    )\n",
-    "  }))\n",
-    "\n",
-    "  # numeric: convert back + treat missing codes as NA\n",
-    "  long_num <- long %>%\n",
-    "    dplyr::filter(type == \"numeric\") %>%\n",
-    "    dplyr::mutate(value = suppressWarnings(as.numeric(value))) %>%\n",
-    "    dplyr::mutate(value = ifelse(value %in% missing_codes, NA_real_, value)) %>%\n",
-    "    dplyr::filter(!is.na(value))\n",
-    "\n",
-    "  # categorical\n",
-    "  long_cat <- long %>%\n",
-    "    dplyr::filter(type == \"categorical\") %>%\n",
-    "    dplyr::mutate(value = ifelse(is.na(value) | value == \"\", \"NaN\", value)) %>%\n",
-    "    dplyr::group_by(feature) %>%\n",
-    "    dplyr::mutate(value = forcats::fct_lump_n(factor(value), n = top_k_cats, other_level = \"Other\")) %>%\n",
-    "    dplyr::ungroup() %>%\n",
-    "    dplyr::mutate(label = substr(as.character(value), 1, max_label_len))\n",
-    "\n",
-    "  plot_facets_in_pages <- function(data, make_plot, cols, rows_per_page, width, height_per_row) {\n",
-    "    feats <- unique(data$feature)\n",
-    "    per_page <- cols * rows_per_page\n",
-    "    pages <- ceiling(length(feats) / per_page)\n",
-    "\n",
-    "    for (pg in seq_len(pages)) {\n",
-    "      feats_pg <- feats[((pg - 1) * per_page + 1):min(pg * per_page, length(feats))]\n",
-    "\n",
-    "      options(repr.plot.width = width,\n",
-    "              repr.plot.height = height_per_row * rows_per_page)\n",
-    "\n",
-    "      print(make_plot(dplyr::filter(data, feature %in% feats_pg), cols))\n",
-    "      if (pg < pages) message(\"---- Page \", pg, \"/\", pages, \" done ----\")\n",
-    "    }\n",
-    "  }\n",
-    "\n",
-    "  make_num_plot <- function(d, cols) {\n",
-    "    ggplot2::ggplot(d, ggplot2::aes(x = value)) +\n",
-    "      ggplot2::geom_histogram(ggplot2::aes(y = after_stat(density)), bins = bins) +\n",
-    "      ggplot2::geom_density() +\n",
-    "      ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free\") +\n",
-    "      ggplot2::labs(y = \"Density\", x = \"\") +\n",
-    "      ggplot2::theme_minimal(base_size = 12) +\n",
-    "      ggplot2::theme(\n",
-    "        strip.text = ggplot2::element_text(size = 10),\n",
-    "        axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
-    "      )\n",
-    "  }\n",
-    "\n",
-    "  make_cat_plot <- function(d, cols) {\n",
-    "    ggplot2::ggplot(d, ggplot2::aes(y = forcats::fct_rev(factor(label)))) +\n",
-    "      ggplot2::geom_bar() +\n",
-    "      ggplot2::facet_wrap(~ feature, ncol = cols, scales = \"free_y\") +\n",
-    "      ggplot2::labs(x = \"count\", y = \"\") +\n",
-    "      ggplot2::theme_minimal(base_size = 12) +\n",
-    "      ggplot2::theme(\n",
-    "        strip.text = ggplot2::element_text(size = 10),\n",
-    "        axis.text.x = ggplot2::element_text(angle = 25, hjust = 1)\n",
-    "      )\n",
-    "  }\n",
-    "\n",
-    "  plot_facets_in_pages(long_num, make_num_plot, cols, rows_per_page, width, height_per_row)\n",
-    "  plot_facets_in_pages(long_cat, make_cat_plot, cols, rows_per_page, width, height_per_row)\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ae86a9e2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plot_distribution_like_sample(bankChurn, cols = 2, width = 20, height_per_row = 1.2)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cb60f083",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Numeric vs binary target performance\n",
-    "NumVarPerf <- function(df, col, target, truncation = FALSE, bins = 30,\n",
-    "                       missing_codes = c(-99999, -9999, 99999, 9999)) {\n",
-    "  stopifnot(col %in% names(df), target %in% names(df))\n",
-    "\n",
-    "  validDf <- df %>%\n",
-    "    select(all_of(c(col, target))) %>%\n",
-    "    mutate(across(all_of(col), ~ ifelse(.x %in% missing_codes, NA, .x))) %>%\n",
-    "    filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
-    "\n",
-    "  if (nrow(validDf) == 0) stop(sprintf(\"No valid (non-NA) data for '%s' and '%s'.\", col, target))\n",
-    "\n",
-    "  validRcd <- nrow(validDf) / nrow(df)\n",
-    "  validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
-    "\n",
-    "  mu <- format(mean(validDf[[col]]), scientific = TRUE, digits = 2)\n",
-    "  std <- format(sd(validDf[[col]]), scientific = TRUE, digits = 2)\n",
-    "  minVal <- format(min(validDf[[col]]), scientific = TRUE, digits = 2)\n",
-    "  maxVal <- format(max(validDf[[col]]), scientific = TRUE, digits = 2)\n",
-    "\n",
-    "  x <- validDf %>% filter(.data[[target]] == 1) %>% pull(.data[[col]])\n",
-    "  y <- validDf %>% filter(.data[[target]] == 0) %>% pull(.data[[col]])\n",
-    "\n",
-    "  if (truncation) {\n",
-    "    pcnt95 <- as.numeric(quantile(validDf[[col]], 0.95, na.rm = TRUE))\n",
-    "    x <- pmin(x, pcnt95)\n",
-    "    y <- pmin(y, pcnt95)\n",
-    "  }\n",
-    "\n",
-    "  plotDf <- bind_rows(\n",
-    "    tibble(value = x, group = \"Attrition\"),\n",
-    "    tibble(value = y, group = \"Retained\")\n",
-    "  ) %>%\n",
-    "    group_by(group) %>%\n",
-    "    mutate(weight = 100 / n()) %>%\n",
-    "    ungroup()\n",
-    "\n",
-    "  titleText <- paste0(\n",
-    "    \"Histogram of \", col, \"\n",
-    "\",\n",
-    "    \"valid pcnt = \", validRcdFmt,\n",
-    "    \", Mean = \", mu,\n",
-    "    \", Std = \", std,\n",
-    "    \", Min = \", minVal,\n",
-    "    \", Max = \", maxVal\n",
-    "  )\n",
-    "\n",
-    "  p <- ggplot(plotDf, aes(x = value, weight = weight, fill = group)) +\n",
-    "    geom_histogram(position = \"identity\", alpha = 0.5, bins = bins) +\n",
-    "    labs(title = titleText, y = \"% of Dataset in Bin\", x = \"\") +\n",
-    "    theme_minimal(base_size = 12)\n",
-    "\n",
-    "  print(p)\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b542b9db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "NumVarPerf(bankChurn, col = \"AGE\", target = \"CHURN_CUST_IND\", truncation = FALSE, bins = 30)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7bcf04d2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Remove extreme values (truncation=True).\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aa94c1f3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Categorical vs binary target performance\n",
-    "CharVarPerf <- function(df, col, target) {\n",
-    "  stopifnot(col %in% names(df), target %in% names(df))\n",
-    "\n",
-    "  validDf <- df %>%\n",
-    "    select(all_of(c(col, target))) %>%\n",
-    "    filter(!is.na(.data[[col]]), !is.na(.data[[target]]))\n",
-    "\n",
-    "  if (nrow(validDf) == 0) stop(sprintf(\"No valid data for column '%s'.\", col))\n",
-    "\n",
-    "  validRcd <- nrow(validDf) / nrow(df)\n",
-    "  validRcdFmt <- sprintf(\"%.2f%%\", validRcd * 100)\n",
-    "\n",
-    "  descStats <- validDf %>%\n",
-    "    mutate(cat = as.character(.data[[col]])) %>%\n",
-    "    group_by(cat) %>%\n",
-    "    summarise(\n",
-    "      percentage = n() / nrow(validDf),\n",
-    "      churn_rate = mean(.data[[target]]),\n",
-    "      .groups = \"drop\"\n",
-    "    ) %>%\n",
-    "    arrange(churn_rate)\n",
-    "\n",
-    "  max_cr <- max(descStats$churn_rate, na.rm = TRUE)\n",
-    "  max_pc <- max(descStats$percentage, na.rm = TRUE)\n",
-    "  scale_factor <- ifelse(max_pc == 0, 1, max_cr / max_pc)\n",
-    "\n",
-    "  p <- ggplot(descStats, aes(x = reorder(cat, churn_rate))) +\n",
-    "    geom_col(aes(y = percentage * scale_factor), alpha = 0.4) +\n",
-    "    geom_line(aes(y = churn_rate, group = 1), linewidth = 1) +\n",
-    "    geom_point(aes(y = churn_rate), size = 2) +\n",
-    "    scale_y_continuous(\n",
-    "      name = \"Churn Rate\",\n",
-    "      sec.axis = sec_axis(~ . / scale_factor, name = \"Percentage\")\n",
-    "    ) +\n",
-    "    labs(\n",
-    "      title = paste0(\"The percentage and churn rate for \", col, \"\n",
-    "valid percentage = \", validRcdFmt),\n",
-    "      x = col\n",
-    "    ) +\n",
-    "    theme_minimal(base_size = 12) +\n",
-    "    theme(axis.text.x = element_text(angle = 45, hjust = 1))\n",
-    "\n",
-    "  print(p)\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1f8fc91a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Relationship between GENDER_CD (gender code) and churn status\n",
-    "CharVarPerf(bankChurn, col = \"GENDER_CD\", target = \"CHURN_CUST_IND\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9bae6508",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Relationship between whether the customer has home address information\n",
-    "CharVarPerf(bankChurn, col = \"HASNT_HOME_ADDRESS_INF\", target = \"CHURN_CUST_IND\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3c6d08c9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# check null\n",
-    "colSums(is.na(bankChurn))\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82861964",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# preprocess_numeric: 3-sigma clipping + missing value imputation\n",
-    "preprocess_numeric <- function(df, col, fill_method = c(\"mean\", \"random\"), truncate = TRUE) {\n",
-    "  fill_method <- match.arg(fill_method)\n",
-    "  stopifnot(col %in% names(df))\n",
-    "\n",
-    "  series <- df[[col]]\n",
-    "\n",
-    "  if (truncate) {\n",
-    "    mu <- mean(series, na.rm = TRUE)\n",
-    "    std <- sd(series, na.rm = TRUE)\n",
-    "    upper <- mu + 3 * std\n",
-    "    lower <- mu - 3 * std\n",
-    "    series <- pmin(pmax(series, lower), upper)\n",
-    "  }\n",
-    "\n",
-    "  if (fill_method == \"mean\") {\n",
-    "    series[is.na(series)] <- mean(series, na.rm = TRUE)\n",
-    "  } else if (fill_method == \"random\") {\n",
-    "    valid_values <- series[!is.na(series)]\n",
-    "    series[is.na(series)] <- sample(valid_values, sum(is.na(series)), replace = TRUE)\n",
-    "  }\n",
-    "\n",
-    "  df[[col]] <- series\n",
-    "  df\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "12122b10",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# (optional) example usage:\n",
-    "# bankChurn <- preprocess_numeric(bankChurn, \"AGE\", fill_method = \"mean\", truncate = TRUE)\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "R",
-   "language": "R",
-   "name": "ir"
-  },
-  "language_info": {
-   "file_extension": ".r",
-   "mimetype": "text/x-r-source",
-   "name": "R"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}


1	+ {"cells": [{"cell_type": "code", "metadata": {"language": "R"}, "source": ["library(readr)\n", "library(dplyr)\n", "dir.create('artifacts/r/tables', recursive=TRUE, showWarnings=FALSE)\n", "bankChurn <- read_csv('bankChurn.csv')\n", "summary_geo <- bankChurn |> group_by(Geography) |> summarise(churn_rate = mean(Exited))\n", "write_csv(summary_geo, 'artifacts/r/tables/r_churn_geo.csv')\n", "summary_geo\n"], "outputs": [], "execution_count": null}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}