| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import joblib |
| import matplotlib.pyplot as plt |
| import numpy as np |
| import pandas as pd |
| from sklearn.compose import ColumnTransformer |
| from sklearn.impute import SimpleImputer |
| from sklearn.inspection import permutation_importance |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.metrics import roc_auc_score |
| from sklearn.model_selection import train_test_split |
| from sklearn.pipeline import Pipeline |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
|
# Application root: the parent of the directory that contains this file.
APP_DIR = Path(__file__).resolve().parents[1]

# Raw input data and persisted model artifacts.
DATA_PATH = APP_DIR.joinpath("data", "bankChurn.csv")
MODELS_DIR = APP_DIR.joinpath("models")

# Generated outputs, split into figures and tables.
OUT_DIR = APP_DIR.joinpath("outputs")
FIG_DIR = OUT_DIR.joinpath("figures")
TAB_DIR = OUT_DIR.joinpath("tables")
|
|
# Name of the label column.
TARGET = "CHURN_CUST_IND"

# Feature columns treated as categorical (one-hot encoded by the pipeline).
CAT_COLS = ["GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF"]

# Full, ordered list of model input columns.
FEATURES = [
    "AGE",
    "OPEN_ACC_DUR",
    "GENDER_CD",
    "HASNT_HOME_ADDRESS_INF",
    "HASNT_MOBILE_TEL_NUM_INF",
    "LOCAL_CUR_MON_AVG_BAL",
    "LOCAL_FIX_MON_AVG_BAL",
    "LOCAL_SAV_CUR_ALL_BAL",
    "POS_CONSUME_TX_AMT",
    "ATM_ALL_TX_NUM",
    "COUNTER_ALL_TX_NUM",
]

# Every feature that is not categorical, kept in FEATURES order.
NUM_COLS = [name for name in FEATURES if name not in CAT_COLS]
|
|
|
|
def ensure_dirs() -> None:
    """Create the model, figure, and table output directories if absent.

    Parent directories are created as needed, and directories that already
    exist are left alone.
    """
    for directory in (MODELS_DIR, FIG_DIR, TAB_DIR):
        directory.mkdir(parents=True, exist_ok=True)
|
|
|
|
def step1_prepare() -> pd.DataFrame:
    """Load the raw churn CSV, normalise column types, and save the result.

    Reads ``DATA_PATH``, keeps only ``FEATURES`` plus ``TARGET``, converts the
    categorical columns to text and the numeric columns and target to numbers
    (unparseable cells become NaN), drops rows whose target is missing, and
    writes the frame to ``OUT_DIR / "processed_bank_churn.csv"``.

    Returns:
        The cleaned frame, with columns in ``FEATURES + [TARGET]`` order.

    Raises:
        ValueError: If any expected column is absent from the CSV.
    """
    print("=" * 58)
    print("STEP 1/3: Data Preparation")
    print("=" * 58)
    df = pd.read_csv(DATA_PATH)
    keep = FEATURES + [TARGET]
    missing = [c for c in keep if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns: {missing}")

    df = df[keep].copy()
    for c in CAT_COLS:
        # Stringify observed values only. A blanket astype(str) can turn NaN
        # into the literal text "nan", which would then be treated as a real
        # category and never reach a downstream missing-value imputer.
        df[c] = df[c].astype(str).where(df[c].notna())
    for c in NUM_COLS + [TARGET]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Rows without a usable label cannot be used for training or evaluation,
    # and a NaN target cannot be cast to int later on.
    unlabeled = int(df[TARGET].isna().sum())
    if unlabeled:
        df = df.dropna(subset=[TARGET]).reset_index(drop=True)
        print(f"Dropped {unlabeled:,} rows with missing or invalid {TARGET}")

    # Make the step usable on its own, even if the output tree is not there yet.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    processed_path = OUT_DIR / "processed_bank_churn.csv"
    df.to_csv(processed_path, index=False)
    print(f"Rows: {len(df):,} | Cols: {df.shape[1]}")
    print(f"Saved: {processed_path.relative_to(APP_DIR)}")
    return df
|
|
|
|
def build_pipeline() -> Pipeline:
    """Assemble the unfitted preprocessing + logistic-regression pipeline.

    Numeric columns (``NUM_COLS``) are median-imputed and standardised;
    categorical columns (``CAT_COLS``) are imputed with the most frequent
    value and one-hot encoded, ignoring categories unseen during fitting.
    The classifier is a class-weight-balanced logistic regression.

    Returns:
        A ``Pipeline`` with steps named ``"preprocess"`` and ``"model"``.
    """
    num_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    cat_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
    column_prep = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=num_steps), NUM_COLS),
            ("cat", Pipeline(steps=cat_steps), CAT_COLS),
        ]
    )
    classifier = LogisticRegression(max_iter=1500, class_weight="balanced")
    return Pipeline(steps=[("preprocess", column_prep), ("model", classifier)])
|
|
|
|
def step2_train(df: pd.DataFrame) -> tuple[Pipeline, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Train the churn model and write the model, predictions, and importances.

    Splits ``df`` 80/20 (stratified on the target, fixed seed), fits the
    pipeline from ``build_pipeline`` on the training part, and then writes:

    * the fitted pipeline to ``MODELS_DIR / "pipeline.joblib"``,
    * test-set predictions to ``TAB_DIR / "test_predictions.csv"``,
    * permutation importances (ROC-AUC scoring) to
      ``TAB_DIR / "feature_importance.csv"`` and a bar chart to
      ``FIG_DIR / "feature_importance.png"``.

    Args:
        df: Frame containing every column in ``FEATURES`` plus ``TARGET``.

    Returns:
        ``(pipe, X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: If any row has a missing target value.
    """
    print("\n" + "=" * 58)
    print("STEP 2/3: Train Model + Artifacts")
    print("=" * 58)

    # Fail early with a readable message; casting NaN labels to int would
    # otherwise raise an opaque conversion error.
    n_unlabeled = int(df[TARGET].isna().sum())
    if n_unlabeled:
        raise ValueError(
            f"{n_unlabeled} rows have a missing {TARGET}; drop them before training"
        )

    X = df[FEATURES].copy()
    y = df[TARGET].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    proba = pipe.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    auc = float(roc_auc_score(y_test, proba))

    model_path = MODELS_DIR / "pipeline.joblib"
    joblib.dump(pipe, model_path)
    print(f"Saved model: {model_path.relative_to(APP_DIR)}")
    print(f"ROC-AUC: {auc:.4f}")

    pred_df = X_test.copy()
    pred_df["actual"] = y_test.to_numpy()
    pred_df["churn_proba"] = proba
    pred_df["churn_pred"] = pred
    test_pred_path = TAB_DIR / "test_predictions.csv"
    pred_df.to_csv(test_pred_path, index=False)
    print(f"Saved: {test_pred_path.relative_to(APP_DIR)}")

    # Importances come back in X_test column order, which is FEATURES order.
    r = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42, scoring="roc_auc")
    fi = pd.DataFrame({"feature": FEATURES, "importance": r.importances_mean}).sort_values("importance", ascending=False)
    fi_path = TAB_DIR / "feature_importance.csv"
    fi.to_csv(fi_path, index=False)

    fi_fig = FIG_DIR / "feature_importance.png"
    fig, ax = plt.subplots(figsize=(8, 4.5))
    try:
        # Reverse so the most important feature ends up at the top of the chart.
        ax.barh(fi["feature"][::-1], fi["importance"][::-1])
        ax.set_title("Feature Importance (Permutation)")
        ax.set_xlabel("Importance")
        fig.tight_layout()
        fig.savefig(fi_fig, dpi=160)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)
    print(f"Saved: {fi_path.relative_to(APP_DIR)}")
    print(f"Saved: {fi_fig.relative_to(APP_DIR)}")

    return pipe, X_train, y_train, X_test, y_test
|
|
|
|
def step3_finalize(pipe: Pipeline, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """Write the background sample and the model metadata file.

    Saves up to 80 randomly sampled training rows (fixed seed) to
    ``MODELS_DIR / "background_sample.csv"`` and a JSON summary of the feature
    schema, decision threshold, and test-set statistics to
    ``MODELS_DIR / "model_meta.json"``.

    Args:
        pipe: Fitted pipeline used to score ``X_test``.
        X_train: Training features the background sample is drawn from.
        y_train: Training labels; accepted but not used here.
        X_test: Test features scored for the summary statistics.
        y_test: Test labels used for the observed positive rate.
    """
    print("\n" + "=" * 58)
    print("STEP 3/3: Validation + SHAP Background Cache")
    print("=" * 58)

    # Background rows; the banner suggests they feed a SHAP explainer elsewhere.
    n_background = min(80, len(X_train))
    background = X_train.sample(n_background, random_state=42)
    bg_path = MODELS_DIR / "background_sample.csv"
    background.to_csv(bg_path, index=False)

    test_scores = pipe.predict_proba(X_test)[:, 1]
    observed_rate = float(np.mean(y_test))
    mean_score = float(np.mean(test_scores))

    meta = {
        "features": FEATURES,
        "categorical_features": CAT_COLS,
        "numeric_features": NUM_COLS,
        "target": TARGET,
        "threshold": 0.5,
        "positive_rate_test": observed_rate,
        "mean_predicted_proba_test": mean_score,
    }
    meta_path = MODELS_DIR / "model_meta.json"
    payload = json.dumps(meta, indent=2)
    meta_path.write_text(payload, encoding="utf-8")

    for saved in (bg_path, meta_path):
        print(f"Saved: {saved.relative_to(APP_DIR)}")
    print("Pipeline completed successfully.")
|
|
|
|
def main() -> int:
    """Run the three pipeline steps in order and return exit code 0."""
    ensure_dirs()
    prepared = step1_prepare()
    # step2_train returns (pipe, X_train, y_train, X_test, y_test), which is
    # exactly the positional order step3_finalize expects.
    trained = step2_train(prepared)
    step3_finalize(*trained)
    print("DONE")
    return 0
|
|
|
|
# Script entry point: run the pipeline and exit with main()'s return code.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
|
|