"""Train the bank-churn logistic-regression pipeline and write its artifacts.

The script runs three steps:

1. Load the raw CSV, keep the modelling columns, normalise dtypes and save
   the processed table.
2. Fit the preprocessing + logistic-regression pipeline, then save the model,
   the test-set predictions and a permutation feature-importance table/plot.
3. Save a small background sample of the training features and a JSON file
   describing the model inputs.
"""

from __future__ import annotations

import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

APP_DIR = Path(__file__).resolve().parents[1]
DATA_PATH = APP_DIR / "data" / "bankChurn.csv"
MODELS_DIR = APP_DIR / "models"
OUT_DIR = APP_DIR / "outputs"
FIG_DIR = OUT_DIR / "figures"
TAB_DIR = OUT_DIR / "tables"

TARGET = "CHURN_CUST_IND"
# Probability cut-off used for hard predictions; also recorded in model_meta.json.
THRESHOLD = 0.5

FEATURES = [
    "AGE",
    "OPEN_ACC_DUR",
    "GENDER_CD",
    "HASNT_HOME_ADDRESS_INF",
    "HASNT_MOBILE_TEL_NUM_INF",
    "LOCAL_CUR_MON_AVG_BAL",
    "LOCAL_FIX_MON_AVG_BAL",
    "LOCAL_SAV_CUR_ALL_BAL",
    "POS_CONSUME_TX_AMT",
    "ATM_ALL_TX_NUM",
    "COUNTER_ALL_TX_NUM",
]
CAT_COLS = ["GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF"]
NUM_COLS = [c for c in FEATURES if c not in CAT_COLS]


def ensure_dirs() -> None:
    """Create every output directory the script writes into."""
    # OUT_DIR is listed explicitly because step 1 writes directly into it.
    for directory in (MODELS_DIR, OUT_DIR, FIG_DIR, TAB_DIR):
        directory.mkdir(parents=True, exist_ok=True)


def step1_prepare() -> pd.DataFrame:
    """Load the raw CSV, keep the modelling columns and normalise dtypes.

    Categorical columns are converted to strings, numeric columns and the
    target are coerced to numbers (unparseable values become NaN). The result
    is written to ``processed_bank_churn.csv`` under ``OUT_DIR``.

    Returns:
        The processed frame containing ``FEATURES`` plus ``TARGET``.

    Raises:
        ValueError: If any expected column is absent from the CSV.
    """
    print("=" * 58)
    print("STEP 1/3: Data Preparation")
    print("=" * 58)

    df = pd.read_csv(DATA_PATH)
    keep = FEATURES + [TARGET]
    missing = [c for c in keep if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns: {missing}")

    df = df[keep].copy()
    for c in CAT_COLS:
        # Stringify the codes but keep missing cells as NaN. A plain
        # astype(str) would turn them into the literal "nan", which the
        # most-frequent imputer in build_pipeline() cannot recognise as missing.
        df[c] = df[c].astype(str).where(df[c].notna())
    for c in NUM_COLS + [TARGET]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    processed_path = OUT_DIR / "processed_bank_churn.csv"
    df.to_csv(processed_path, index=False)
    print(f"Rows: {len(df):,} | Cols: {df.shape[1]}")
    print(f"Saved: {processed_path.relative_to(APP_DIR)}")
    return df


def build_pipeline() -> Pipeline:
    """Build the unfitted preprocessing + logistic-regression pipeline.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded, with
    categories unseen during fitting ignored at predict time.
    """
    numeric_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    categorical_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, NUM_COLS),
            ("cat", categorical_pipe, CAT_COLS),
        ]
    )
    model = LogisticRegression(max_iter=1500, class_weight="balanced")
    return Pipeline(steps=[("preprocess", preprocess), ("model", model)])


def step2_train(
    df: pd.DataFrame,
) -> tuple[Pipeline, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Fit the pipeline and write the model, predictions and importances.

    Rows whose target is NaN are dropped before the stratified 80/20 split.
    The fitted pipeline, the test-set predictions, and the permutation
    feature-importance table and bar chart are written to disk.

    Args:
        df: Frame containing ``FEATURES`` and ``TARGET``.

    Returns:
        ``(pipe, X_train, y_train, X_test, y_test)``.
    """
    print("\n" + "=" * 58)
    print("STEP 2/3: Train Model + Artifacts")
    print("=" * 58)

    # A NaN target cannot be cast to int and carries no training signal, so
    # such rows are removed instead of letting astype(int) raise.
    labelled = df.dropna(subset=[TARGET])
    n_dropped = len(df) - len(labelled)
    if n_dropped:
        print(f"Dropped {n_dropped:,} rows without a usable {TARGET} label")

    X = labelled[FEATURES].copy()
    y = labelled[TARGET].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    proba = pipe.predict_proba(X_test)[:, 1]
    pred = (proba >= THRESHOLD).astype(int)
    auc = float(roc_auc_score(y_test, proba))

    model_path = MODELS_DIR / "pipeline.joblib"
    joblib.dump(pipe, model_path)
    print(f"Saved model: {model_path.relative_to(APP_DIR)}")
    print(f"ROC-AUC: {auc:.4f}")

    pred_df = X_test.copy()
    pred_df["actual"] = y_test.to_numpy()
    pred_df["churn_proba"] = proba
    pred_df["churn_pred"] = pred
    test_pred_path = TAB_DIR / "test_predictions.csv"
    pred_df.to_csv(test_pred_path, index=False)
    print(f"Saved: {test_pred_path.relative_to(APP_DIR)}")

    r = permutation_importance(
        pipe, X_test, y_test, n_repeats=5, random_state=42, scoring="roc_auc"
    )
    # importances_mean follows the column order of X_test, so take the names
    # from the frame that was actually scored.
    fi = pd.DataFrame(
        {"feature": list(X_test.columns), "importance": r.importances_mean}
    ).sort_values("importance", ascending=False)
    fi_path = TAB_DIR / "feature_importance.csv"
    fi.to_csv(fi_path, index=False)

    fig, ax = plt.subplots(figsize=(8, 4.5))
    # Reverse so the most important feature is drawn at the top of the chart.
    ax.barh(fi["feature"][::-1], fi["importance"][::-1])
    ax.set_title("Feature Importance (Permutation)")
    ax.set_xlabel("Importance")
    fig.tight_layout()
    fi_fig = FIG_DIR / "feature_importance.png"
    fig.savefig(fi_fig, dpi=160)
    plt.close(fig)

    print(f"Saved: {fi_path.relative_to(APP_DIR)}")
    print(f"Saved: {fi_fig.relative_to(APP_DIR)}")
    return pipe, X_train, y_train, X_test, y_test


def step3_finalize(
    pipe: Pipeline,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> None:
    """Write the background sample and the model metadata JSON.

    Args:
        pipe: Fitted pipeline, used to score ``X_test`` for the metadata.
        X_train: Training features; up to 80 rows are sampled from it.
        y_train: Training labels (not used by this step).
        X_test: Test features.
        y_test: Test labels, used for the positive rate in the metadata.
    """
    print("\n" + "=" * 58)
    print("STEP 3/3: Validation + SHAP Background Cache")
    print("=" * 58)

    bg = X_train.sample(min(80, len(X_train)), random_state=42)
    bg_path = MODELS_DIR / "background_sample.csv"
    bg.to_csv(bg_path, index=False)

    proba = pipe.predict_proba(X_test)[:, 1]
    meta = {
        "features": FEATURES,
        "categorical_features": CAT_COLS,
        "numeric_features": NUM_COLS,
        "target": TARGET,
        "threshold": THRESHOLD,
        "positive_rate_test": float(np.mean(y_test)),
        "mean_predicted_proba_test": float(np.mean(proba)),
    }
    meta_path = MODELS_DIR / "model_meta.json"
    meta_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"Saved: {bg_path.relative_to(APP_DIR)}")
    print(f"Saved: {meta_path.relative_to(APP_DIR)}")
    print("Pipeline completed successfully.")


def main() -> int:
    """Run the three steps in order and return the process exit code."""
    ensure_dirs()
    df = step1_prepare()
    pipe, X_train, y_train, X_test, y_test = step2_train(df)
    step3_finalize(pipe, X_train, y_train, X_test, y_test)
    print("DONE")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())