XRachel committed on
Commit
fde2bc0
·
verified ·
1 Parent(s): 194fb23

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/Bank[[:space:]]Churn.png filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# HOME and MPLCONFIGDIR point at locations the unprivileged runtime user can
# write to, so matplotlib can create its cache without warnings or failures.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
    HOME=/home/user \
    MPLCONFIGDIR=/tmp/matplotlib

# Hugging Face Docker Spaces run the container as UID 1000. The app writes
# models/ and outputs/ under /app at runtime, so those paths must be owned by
# that user; otherwise "Run Pipeline" fails with a permission error.
RUN useradd --create-home --uid 1000 user

WORKDIR /app

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /app/requirements.txt

COPY --chown=user:user . /app
RUN mkdir -p /app/models /app/outputs/figures /app/outputs/tables && \
    chown -R user:user /app

USER user

EXPOSE 7860

CMD ["python", "-u", "app.py"]
README.md CHANGED
@@ -1,10 +1,34 @@
1
  ---
2
- title: Bc5
3
- emoji: 📉
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Bank Churn Pro Demo
3
+ emoji: 🏦
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # Bank Churn Pro Demo
12
+
13
+ A Hugging Face Docker Space for bank customer churn analysis with:
14
+
15
+ - Full-screen Bank Churn background UI
16
+ - Pipeline Step 1/2/3 execution log
17
+ - Feature importance chart
18
+ - Churn probability gauge
19
+ - CSV batch prediction
20
+ - SHAP explainability
21
+
22
+ ## Included files
23
+
24
+ - `app.py` - Gradio app
25
+ - `scripts/pipeline.py` - training / artifact generation pipeline
26
+ - `data/bankChurn.csv` - sample dataset
27
+ - `assets/Bank Churn.png` - background image
28
+
29
+ ## Expected workflow
30
+
31
+ 1. Open the Space
32
+ 2. Go to **Pipeline** and click **Run Pipeline**
33
+ 3. Wait for the 3-step pipeline to finish
34
+ 4. Use the **Single Prediction**, **CSV Batch**, and **Explainability** tabs
app.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import subprocess
6
+ from pathlib import Path
7
+ from typing import Generator
8
+
9
+ import gradio as gr
10
+ import joblib
11
+ import matplotlib.pyplot as plt
12
+ import pandas as pd
13
+ import shap
14
+
15
# --- Filesystem layout: every path is anchored at the directory of this file.
APP_DIR = Path(__file__).parent.resolve()
STYLE_FILE = APP_DIR / "style.css"
ASSETS_DIR = APP_DIR / "assets"
DATA_DIR = APP_DIR / "data"
MODELS_DIR = APP_DIR / "models"
OUT_DIR = APP_DIR / "outputs"
FIG_DIR = OUT_DIR / "figures"
TAB_DIR = OUT_DIR / "tables"

# --- Artifacts read by the app (written by scripts/pipeline.py).
MODEL_FILE = MODELS_DIR / "pipeline.joblib"
META_FILE = MODELS_DIR / "model_meta.json"
BG_FILE = MODELS_DIR / "background_sample.csv"
TEMPLATE_CSV = DATA_DIR / "batch_template.csv"

# Default form values; the key order also defines the model's feature order.
DEFAULTS = {
    "AGE": 42,
    "OPEN_ACC_DUR": 120,
    "GENDER_CD": "1",
    "HASNT_HOME_ADDRESS_INF": "N",
    "HASNT_MOBILE_TEL_NUM_INF": "N",
    "LOCAL_CUR_MON_AVG_BAL": 25000.0,
    "LOCAL_FIX_MON_AVG_BAL": 18000.0,
    "LOCAL_SAV_CUR_ALL_BAL": 28000.0,
    "POS_CONSUME_TX_AMT": 5000.0,
    "ATM_ALL_TX_NUM": 6,
    "COUNTER_ALL_TX_NUM": 2,
}
FEATURES = list(DEFAULTS)

# Module-level cache of the loaded model and its metadata; both stay None
# until refresh_model_state() finds the artifacts on disk.
PIPE = None
META = None
46
+
47
+
48
def ensure_template_csv() -> None:
    """Create the downloadable batch-template CSV if it is not already present.

    The template is a single row built from ``DEFAULTS``, so its header lists
    exactly the columns that batch prediction requires.
    """
    if TEMPLATE_CSV.exists():
        return
    # The data directory may be absent in a fresh checkout; without this the
    # write below raises and the UI cannot be built.
    TEMPLATE_CSV.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame([DEFAULTS]).to_csv(TEMPLATE_CSV, index=False)
51
+
52
+
53
def load_assets() -> tuple[object | None, dict | None]:
    """Load the trained pipeline and its metadata from disk.

    Returns:
        ``(pipe, meta)``; each element is ``None`` when its file is missing.
    """
    pipe = None
    if MODEL_FILE.exists():
        pipe = joblib.load(MODEL_FILE)

    meta = None
    if META_FILE.exists():
        meta = json.loads(META_FILE.read_text(encoding="utf-8"))

    return pipe, meta
57
+
58
+
59
def refresh_model_state() -> str:
    """Reload the model artifacts into the module globals and describe the result.

    Returns:
        A Markdown status line: "model loaded" when a pipeline was found,
        otherwise a hint to run the pipeline first.
    """
    global PIPE, META
    loaded_pipe, loaded_meta = load_assets()
    PIPE = loaded_pipe
    META = loaded_meta
    if PIPE is not None:
        return "✅ 模型已加载,可以进行单条预测、批量预测和 SHAP 解释。"
    return "⚠️ 当前为演示状态:请先在 Pipeline 标签页点击 **Run Pipeline** 生成模型。"
65
+
66
+
67
def gauge_html(prob: float) -> str:
    """Render a churn probability as an HTML progress-bar gauge.

    Args:
        prob: Probability on a 0-1 scale; the displayed percentage is clamped
            to the 0-100 range.

    Returns:
        An HTML fragment with a coloured bar and the percentage. The colour is
        green below 0.35, amber below 0.65 and red otherwise.
    """
    scaled = prob * 100.0
    pct = max(0.0, min(100.0, scaled))

    if prob < 0.35:
        color = "#16a34a"
    elif prob < 0.65:
        color = "#f59e0b"
    else:
        color = "#dc2626"

    return f"""
    <div style='background:rgba(255,255,255,0.88);padding:16px;border-radius:18px'>
    <div style='font-size:18px;font-weight:700;margin-bottom:8px'>Churn Probability Gauge</div>
    <div style='width:100%;height:20px;background:#e5e7eb;border-radius:999px;overflow:hidden'>
    <div style='width:{pct:.1f}%;height:20px;background:{color};border-radius:999px'></div>
    </div>
    <div style='margin-top:10px;font-size:28px;font-weight:800;color:{color}'>{pct:.1f}%</div>
    </div>
    """
79
+
80
+
81
def input_df(age, open_acc_dur, gender_cd, hasnt_home_address_inf, hasnt_mobile_tel_num_inf,
             local_cur_mon_avg_bal, local_fix_mon_avg_bal, local_sav_cur_all_bal,
             pos_consume_tx_amt, atm_all_tx_num, counter_all_tx_num) -> pd.DataFrame:
    """Pack one set of form values into a single-row DataFrame.

    Counts and durations are cast to ``int``, balances and amounts to
    ``float``, and the categorical codes to ``str``. Columns appear in the
    order listed below.
    """
    # (column name, caster, raw form value), in the order the columns appear.
    typed_values = (
        ("AGE", int, age),
        ("OPEN_ACC_DUR", int, open_acc_dur),
        ("GENDER_CD", str, gender_cd),
        ("HASNT_HOME_ADDRESS_INF", str, hasnt_home_address_inf),
        ("HASNT_MOBILE_TEL_NUM_INF", str, hasnt_mobile_tel_num_inf),
        ("LOCAL_CUR_MON_AVG_BAL", float, local_cur_mon_avg_bal),
        ("LOCAL_FIX_MON_AVG_BAL", float, local_fix_mon_avg_bal),
        ("LOCAL_SAV_CUR_ALL_BAL", float, local_sav_cur_all_bal),
        ("POS_CONSUME_TX_AMT", float, pos_consume_tx_amt),
        ("ATM_ALL_TX_NUM", int, atm_all_tx_num),
        ("COUNTER_ALL_TX_NUM", int, counter_all_tx_num),
    )
    record = {column: cast(raw) for column, cast, raw in typed_values}
    return pd.DataFrame([record])
97
+
98
+
99
def predict_single(age, open_acc_dur, gender_cd, hasnt_home_address_inf, hasnt_mobile_tel_num_inf,
                   local_cur_mon_avg_bal, local_fix_mon_avg_bal, local_sav_cur_all_bal,
                   pos_consume_tx_amt, atm_all_tx_num, counter_all_tx_num):
    """Score one customer from the form values.

    Returns:
        A 4-tuple ``(payload, summary_markdown, gauge_html, image_update)``.
        ``payload`` carries the probability, the 0/1 label (threshold 0.5) and
        the risk level; on a missing model it carries an ``error`` key instead.
    """
    # The click handler lists the feature-importance image as its fourth
    # output. Returning None there blanked the chart produced by the Pipeline
    # tab on every Predict click; a bare update leaves the image untouched.
    keep_image = gr.update()

    if PIPE is None:
        return {"error": "Run Pipeline first."}, "请先运行 Pipeline。", gauge_html(0.0), keep_image

    df = input_df(age, open_acc_dur, gender_cd, hasnt_home_address_inf, hasnt_mobile_tel_num_inf,
                  local_cur_mon_avg_bal, local_fix_mon_avg_bal, local_sav_cur_all_bal,
                  pos_consume_tx_amt, atm_all_tx_num, counter_all_tx_num)
    prob = float(PIPE.predict_proba(df)[0, 1])
    pred = int(prob >= 0.5)
    # Same cut-offs as the gauge colours: <0.35 low, <0.65 medium, else high.
    risk = "低风险" if prob < 0.35 else ("中风险" if prob < 0.65 else "高风险")
    payload = {
        "churn_probability": round(prob, 6),
        "predicted_label": pred,
        "risk_level": risk,
    }
    summary = f"**预测结果**:{'流失' if pred == 1 else '留存'} \n\n**概率**:{prob:.2%} \n**风险等级**:{risk}"
    return payload, summary, gauge_html(prob), keep_image
117
+
118
+
119
def predict_batch(file_obj):
    """Score every row of an uploaded CSV and save the result for download.

    Args:
        file_obj: The uploaded file as delivered by the Gradio File component;
            either an object exposing the path as ``.name`` or a plain path
            string.

    Returns:
        ``(preview_df, result_csv_path, message)``. The first two elements are
        ``None`` when prediction could not run; ``message`` then says why.
    """
    if PIPE is None:
        return None, None, "请先运行 Pipeline。"
    if file_obj is None:
        return None, None, "请先上传 CSV。"

    csv_path = getattr(file_obj, "name", file_obj)
    try:
        df = pd.read_csv(csv_path)
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Report a bad upload in the UI instead of surfacing a stack trace.
        return None, None, f"无法读取 CSV:{exc}"

    missing = [c for c in FEATURES if c not in df.columns]
    if missing:
        return None, None, f"CSV 缺少列:{missing}"

    x = df[FEATURES].copy()
    # Mirror the training-time dtypes. read_csv parses a code such as
    # GENDER_CD=1 as an integer, but the model was fitted on the string "1";
    # without the cast the one-hot encoder treats the value as unknown and
    # silently ignores it, skewing every prediction.
    categorical = ("GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF")
    for c in FEATURES:
        if c in categorical:
            x[c] = x[c].astype(str)
        else:
            x[c] = pd.to_numeric(x[c], errors="coerce")

    proba = PIPE.predict_proba(x)[:, 1]
    pred = (proba >= 0.5).astype(int)
    out = df.copy()
    out["churn_proba"] = proba
    out["churn_pred"] = pred

    # A model can be present without outputs/ (e.g. artifacts shipped with the
    # image), so make sure the target directory exists before writing.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = OUT_DIR / "batch_predictions.csv"
    out.to_csv(out_path, index=False)
    return out.head(50), str(out_path), "批量预测完成。"
137
+
138
+
139
def make_feature_importance_plot():
    """Draw the feature-importance bar chart from the pipeline's CSV output.

    Returns:
        The path of the saved PNG as a string, or ``None`` when the
        importance table has not been produced yet.
    """
    table_path = TAB_DIR / "feature_importance.csv"
    if not table_path.exists():
        return None

    importance = pd.read_csv(table_path)
    fig, ax = plt.subplots(figsize=(8, 4.5))
    # Reverse both columns so the first row of the table is drawn at the top.
    ax.barh(importance["feature"][::-1], importance["importance"][::-1])
    ax.set_title("Feature Importance")
    ax.set_xlabel("Importance")
    fig.tight_layout()

    fig_path = FIG_DIR / "feature_importance_runtime.png"
    fig.savefig(fig_path, dpi=160)
    plt.close(fig)
    return str(fig_path)
153
+
154
+
155
def explain_single(age, open_acc_dur, gender_cd, hasnt_home_address_inf, hasnt_mobile_tel_num_inf,
                   local_cur_mon_avg_bal, local_fix_mon_avg_bal, local_sav_cur_all_bal,
                   pos_consume_tx_amt, atm_all_tx_num, counter_all_tx_num):
    """Produce a SHAP waterfall plot for one customer described by the form values.

    Returns:
        ``(image_path, markdown_message)``; ``image_path`` is ``None`` when the
        model or the background sample is not available yet.
    """
    if PIPE is None or not BG_FILE.exists():
        return None, "请先运行 Pipeline。"

    sample = input_df(age, open_acc_dur, gender_cd, hasnt_home_address_inf, hasnt_mobile_tel_num_inf,
                      local_cur_mon_avg_bal, local_fix_mon_avg_bal, local_sav_cur_all_bal,
                      pos_consume_tx_amt, atm_all_tx_num, counter_all_tx_num)
    # Only the first 40 background rows are used as the SHAP reference set.
    reference = pd.read_csv(BG_FILE)
    reference = reference[FEATURES].head(40)

    text_cols = ["GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF"]
    number_cols = [name for name in FEATURES if name not in text_cols]

    def f(x):
        # SHAP passes raw arrays; rebuild a typed frame before scoring.
        frame = pd.DataFrame(x, columns=FEATURES)
        for name in text_cols:
            frame[name] = frame[name].astype(str)
        for name in number_cols:
            frame[name] = pd.to_numeric(frame[name], errors="coerce")
        return PIPE.predict_proba(frame)[:, 1]

    explainer = shap.Explainer(f, reference, feature_names=FEATURES)
    explanation = explainer(sample)

    plt.figure(figsize=(9, 4.8))
    shap.plots.waterfall(explanation[0], max_display=10, show=False)
    plt.tight_layout()
    image_path = FIG_DIR / "shap_waterfall.png"
    plt.savefig(image_path, dpi=160, bbox_inches="tight")
    plt.close()

    prob = float(PIPE.predict_proba(sample)[0, 1])
    message = f"SHAP 解释已生成。该客户流失概率约为 **{prob:.2%}**。"
    return str(image_path), message
187
+
188
+
189
def run_pipeline_stream() -> Generator[tuple[str, str, str], None, None]:
    """Run ``scripts/pipeline.py`` as a subprocess and stream its output to the UI.

    Yields:
        ``(log_text, pipeline_status, model_status)`` after the process starts,
        after every output line, and once more when the process has exited.
        ``log_text`` keeps only the most recent 400 lines.

    Raises:
        RuntimeError: If the child process was started without a stdout pipe.
    """
    # Function-scope imports keep this block self-contained.
    import sys
    from collections import deque

    running = "⏳ Pipeline 正在运行..."
    # Read the model state once up front. Reloading on every log line is
    # expensive and can hit pipeline.joblib while the child is still writing it.
    model_status = refresh_model_state()
    log_lines = deque(maxlen=400)

    # sys.executable guarantees the child uses this interpreter and therefore
    # the same installed packages; a bare "python" depends on PATH.
    cmd = [sys.executable, "-u", str(APP_DIR / "scripts" / "pipeline.py")]
    proc = subprocess.Popen(
        cmd,
        cwd=str(APP_DIR),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding="utf-8",
        errors="replace",
        bufsize=1,
    )
    if proc.stdout is None:
        # An explicit check survives `python -O`, unlike an assert.
        raise RuntimeError("pipeline subprocess was started without a stdout pipe")

    try:
        yield "", running, model_status
        for line in proc.stdout:
            log_lines.append(line.rstrip("\n"))
            yield "\n".join(log_lines), running, model_status
        rc = proc.wait()
    finally:
        # Also runs when the consumer closes the generator early (e.g. the
        # client disconnects), so the child is never left running.
        proc.stdout.close()
        if proc.poll() is None:
            proc.kill()
            proc.wait()

    status = "✅ Pipeline 运行完成。" if rc == 0 else f"❌ Pipeline 失败,退出码 {rc}。"
    # The artifacts are complete now, so this is the safe moment to reload.
    yield "\n".join(log_lines), status, refresh_model_state()
204
+
205
+
206
def build_ui():
    """Assemble the Gradio Blocks app: Pipeline, Single Prediction, CSV Batch, Explainability.

    Returns:
        The configured ``gr.Blocks`` instance, not yet launched.
    """
    ensure_template_csv()
    gr.set_static_paths(paths=[str(ASSETS_DIR)])
    css = ""
    if STYLE_FILE.exists():
        css = STYLE_FILE.read_text(encoding="utf-8")
    initial_model_status = refresh_model_state()

    with gr.Blocks() as demo:
        # The stylesheet is injected inline as a <style> element.
        gr.HTML(f"<style>{css}</style>")
        with gr.Column(elem_id="main_panel"):
            gr.Markdown("# 🏦 Bank Churn Pro Demo\n全屏背景 + Pipeline 日志 + 特征重要性 + 概率仪表盘 + CSV 批量预测 + SHAP 解释")
            model_state_md = gr.Markdown(initial_model_status)
            pipeline_status_md = gr.Markdown("尚未运行 Pipeline。")

            with gr.Tabs():
                with gr.Tab("Pipeline"):
                    gr.Markdown("点击按钮执行 3 步流水线:数据准备 → 模型训练与特征重要性 → 验证与 SHAP 背景缓存")
                    run_btn = gr.Button("▶ Run Pipeline", variant="primary")
                    log_box = gr.Textbox(label="Pipeline Step 1/2/3 日志", lines=22, interactive=False)
                    fi_image = gr.Image(label="Feature Importance 图", type="filepath")
                    # Stream the run first, then draw the importance chart.
                    pipeline_run = run_btn.click(
                        fn=run_pipeline_stream,
                        inputs=[],
                        outputs=[log_box, pipeline_status_md, model_state_md],
                    )
                    pipeline_run.then(fn=make_feature_importance_plot, inputs=[], outputs=fi_image)

                with gr.Tab("Single Prediction"):
                    with gr.Row():
                        with gr.Column():
                            age = gr.Slider(18, 100, value=DEFAULTS["AGE"], step=1, label="AGE")
                            open_acc_dur = gr.Slider(0, 400, value=DEFAULTS["OPEN_ACC_DUR"], step=1, label="OPEN_ACC_DUR")
                            gender_cd = gr.Dropdown(choices=["0", "1"], value=DEFAULTS["GENDER_CD"], label="GENDER_CD")
                            hasnt_home = gr.Dropdown(choices=["N", "Y"], value=DEFAULTS["HASNT_HOME_ADDRESS_INF"], label="HASNT_HOME_ADDRESS_INF")
                            hasnt_mobile = gr.Dropdown(choices=["N", "Y"], value=DEFAULTS["HASNT_MOBILE_TEL_NUM_INF"], label="HASNT_MOBILE_TEL_NUM_INF")
                            local_cur = gr.Number(value=DEFAULTS["LOCAL_CUR_MON_AVG_BAL"], label="LOCAL_CUR_MON_AVG_BAL")
                            local_fix = gr.Number(value=DEFAULTS["LOCAL_FIX_MON_AVG_BAL"], label="LOCAL_FIX_MON_AVG_BAL")
                            local_sav = gr.Number(value=DEFAULTS["LOCAL_SAV_CUR_ALL_BAL"], label="LOCAL_SAV_CUR_ALL_BAL")
                            pos_amt = gr.Number(value=DEFAULTS["POS_CONSUME_TX_AMT"], label="POS_CONSUME_TX_AMT")
                            atm_num = gr.Slider(0, 100, value=DEFAULTS["ATM_ALL_TX_NUM"], step=1, label="ATM_ALL_TX_NUM")
                            counter_num = gr.Slider(0, 100, value=DEFAULTS["COUNTER_ALL_TX_NUM"], step=1, label="COUNTER_ALL_TX_NUM")
                            pred_btn = gr.Button("Predict", variant="primary")
                        with gr.Column():
                            pred_json = gr.JSON(label="Prediction JSON")
                            pred_md = gr.Markdown()
                            gauge = gr.HTML(label="Gauge")

                    # The same eleven inputs feed both prediction and explanation.
                    form_inputs = [
                        age, open_acc_dur, gender_cd, hasnt_home, hasnt_mobile,
                        local_cur, local_fix, local_sav, pos_amt, atm_num, counter_num,
                    ]
                    pred_btn.click(
                        fn=predict_single,
                        inputs=form_inputs,
                        outputs=[pred_json, pred_md, gauge, fi_image],
                    )

                with gr.Tab("CSV Batch"):
                    gr.Markdown("上传包含以下列的 CSV:" + ", ".join(FEATURES))
                    with gr.Row():
                        batch_file = gr.File(label="Upload CSV", file_types=[".csv"])
                        template_file = gr.File(value=str(TEMPLATE_CSV), label="Template CSV")
                    batch_btn = gr.Button("Run Batch Prediction")
                    batch_df = gr.Dataframe(label="Preview (Top 50)")
                    batch_out_file = gr.File(label="Download Result CSV")
                    batch_msg = gr.Markdown()
                    batch_btn.click(
                        fn=predict_batch,
                        inputs=[batch_file],
                        outputs=[batch_df, batch_out_file, batch_msg],
                    )

                with gr.Tab("Explainability"):
                    gr.Markdown("使用当前表单中的同一组输入生成 SHAP waterfall 图。")
                    explain_btn = gr.Button("Generate SHAP Explainability")
                    shap_image = gr.Image(label="SHAP Explainability", type="filepath")
                    shap_md = gr.Markdown()
                    explain_btn.click(
                        fn=explain_single,
                        inputs=list(form_inputs),
                        outputs=[shap_image, shap_md],
                    )

            gr.Markdown("<div class='footer-note'>提示:首次进入请先运行 Pipeline,再使用预测、批量预测和解释功能。</div>")
    return demo
275
+
276
+
277
if __name__ == "__main__":
    demo = build_ui()
    demo.queue()
    # PORT keeps priority for backward compatibility. Otherwise honour the
    # GRADIO_SERVER_* variables the Dockerfile sets, which explicit launch()
    # arguments would override.
    port = int(os.environ.get("PORT") or os.environ.get("GRADIO_SERVER_PORT") or "7860")
    host = os.environ.get("GRADIO_SERVER_NAME") or "0.0.0.0"
    demo.launch(server_name=host, server_port=port)
assets/Bank Churn.png ADDED

Git LFS Details

  • SHA256: 1ce095ba6507023114cb93cb1cca0c5ef4a2153a17b14545d40f1bbd99ef16eb
  • Pointer size: 132 Bytes
  • Size of remote file: 4.54 MB
data/bankChurn.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/batch_template.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ AGE,OPEN_ACC_DUR,GENDER_CD,HASNT_HOME_ADDRESS_INF,HASNT_MOBILE_TEL_NUM_INF,LOCAL_CUR_MON_AVG_BAL,LOCAL_FIX_MON_AVG_BAL,LOCAL_SAV_CUR_ALL_BAL,POS_CONSUME_TX_AMT,ATM_ALL_TX_NUM,COUNTER_ALL_TX_NUM
2
+ 42,120,1,N,N,25000.0,18000.0,28000.0,5000.0,6,2
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ scikit-learn>=1.3.0
5
+ joblib>=1.3.0
6
+ matplotlib>=3.8.0
7
+ shap>=0.45.0
scripts/pipeline.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import joblib
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ from sklearn.compose import ColumnTransformer
11
+ from sklearn.impute import SimpleImputer
12
+ from sklearn.inspection import permutation_importance
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.metrics import roc_auc_score
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.pipeline import Pipeline
17
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
18
+
19
# This script lives in <app>/scripts/, so the app root is one level up.
APP_DIR = Path(__file__).resolve().parents[1]
DATA_PATH = APP_DIR / "data" / "bankChurn.csv"
MODELS_DIR = APP_DIR / "models"
OUT_DIR = APP_DIR / "outputs"
FIG_DIR = OUT_DIR / "figures"
TAB_DIR = OUT_DIR / "tables"

# Label column and the model inputs, in the order the model consumes them.
TARGET = "CHURN_CUST_IND"
FEATURES = [
    "AGE",
    "OPEN_ACC_DUR",
    "GENDER_CD",
    "HASNT_HOME_ADDRESS_INF",
    "HASNT_MOBILE_TEL_NUM_INF",
    "LOCAL_CUR_MON_AVG_BAL",
    "LOCAL_FIX_MON_AVG_BAL",
    "LOCAL_SAV_CUR_ALL_BAL",
    "POS_CONSUME_TX_AMT",
    "ATM_ALL_TX_NUM",
    "COUNTER_ALL_TX_NUM",
]
# Categorical columns are one-hot encoded; every other feature is numeric.
CAT_COLS = ["GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF"]
NUM_COLS = [name for name in FEATURES if name not in CAT_COLS]
42
+
43
+
44
def ensure_dirs() -> None:
    """Create the models, figures and tables directories if they are missing."""
    # Creating the two outputs/ subdirectories also creates outputs/ itself.
    for directory in (MODELS_DIR, FIG_DIR, TAB_DIR):
        directory.mkdir(parents=True, exist_ok=True)
48
+
49
+
50
def step1_prepare() -> pd.DataFrame:
    """Load the raw dataset, keep the modelling columns and normalise their dtypes.

    Returns:
        A frame holding ``FEATURES + [TARGET]`` with categorical columns as
        strings, all other columns numeric, and no rows with a missing target.

    Raises:
        ValueError: If any expected column is absent from the CSV.
    """
    print("=" * 58)
    print("STEP 1/3: Data Preparation")
    print("=" * 58)
    df = pd.read_csv(DATA_PATH)
    keep = FEATURES + [TARGET]
    missing = [c for c in keep if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected columns: {missing}")

    df = df[keep].copy()
    for c in CAT_COLS:
        # NOTE(review): astype(str) turns a missing value into the literal
        # category "nan", so the categorical imputer downstream never fires.
        # Left unchanged because the app applies the same cast at inference.
        df[c] = df[c].astype(str)
    for c in NUM_COLS + [TARGET]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # A target that is blank or non-numeric becomes NaN above. Training casts
    # the target to int, which raises on NaN, so drop unlabeled rows here.
    unlabeled = int(df[TARGET].isna().sum())
    if unlabeled:
        df = df.dropna(subset=[TARGET]).reset_index(drop=True)
        print(f"Dropped {unlabeled:,} rows with a missing or non-numeric {TARGET}")

    # Do not rely on the caller having created the output directory.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    processed_path = OUT_DIR / "processed_bank_churn.csv"
    df.to_csv(processed_path, index=False)
    print(f"Rows: {len(df):,} | Cols: {df.shape[1]}")
    print(f"Saved: {processed_path.relative_to(APP_DIR)}")
    return df
71
+
72
+
73
def build_pipeline() -> Pipeline:
    """Build the unfitted preprocessing + logistic-regression pipeline.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded, with unseen
    categories ignored at transform time.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
    preprocess = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), NUM_COLS),
            ("cat", Pipeline(steps=categorical_steps), CAT_COLS),
        ]
    )
    classifier = LogisticRegression(max_iter=1500, class_weight="balanced")
    return Pipeline(steps=[("preprocess", preprocess), ("model", classifier)])
94
+
95
+
96
def step2_train(df: pd.DataFrame) -> tuple[Pipeline, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """Fit the model on a stratified 80/20 split and write the training artifacts.

    Saves the fitted pipeline, the test-set predictions, and the permutation
    feature importance as both a CSV and a bar chart.

    Returns:
        ``(pipe, X_train, y_train, X_test, y_test)``.
    """
    print("\n" + "=" * 58)
    print("STEP 2/3: Train Model + Artifacts")
    print("=" * 58)
    features = df[FEATURES].copy()
    labels = df[TARGET].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )

    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    test_proba = pipe.predict_proba(X_test)[:, 1]
    test_pred = (test_proba >= 0.5).astype(int)
    auc = float(roc_auc_score(y_test, test_proba))

    model_path = MODELS_DIR / "pipeline.joblib"
    joblib.dump(pipe, model_path)
    print(f"Saved model: {model_path.relative_to(APP_DIR)}")
    print(f"ROC-AUC: {auc:.4f}")

    # Test features plus the true label and the model's probability and label.
    pred_df = X_test.assign(
        actual=y_test.to_numpy(),
        churn_proba=test_proba,
        churn_pred=test_pred,
    )
    test_pred_path = TAB_DIR / "test_predictions.csv"
    pred_df.to_csv(test_pred_path, index=False)
    print(f"Saved: {test_pred_path.relative_to(APP_DIR)}")

    # Permutation importance on the held-out set, scored by ROC-AUC.
    result = permutation_importance(
        pipe, X_test, y_test, n_repeats=5, random_state=42, scoring="roc_auc"
    )
    fi = pd.DataFrame({"feature": FEATURES, "importance": result.importances_mean})
    fi = fi.sort_values("importance", ascending=False)
    fi_path = TAB_DIR / "feature_importance.csv"
    fi.to_csv(fi_path, index=False)

    fig, ax = plt.subplots(figsize=(8, 4.5))
    # Reversed so the most important feature is drawn at the top.
    ax.barh(fi["feature"][::-1], fi["importance"][::-1])
    ax.set_title("Feature Importance (Permutation)")
    ax.set_xlabel("Importance")
    fig.tight_layout()
    fi_fig = FIG_DIR / "feature_importance.png"
    fig.savefig(fi_fig, dpi=160)
    plt.close(fig)
    print(f"Saved: {fi_path.relative_to(APP_DIR)}")
    print(f"Saved: {fi_fig.relative_to(APP_DIR)}")

    return pipe, X_train, y_train, X_test, y_test
144
+
145
+
146
def step3_finalize(pipe: Pipeline, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """Write the SHAP background sample and the model metadata JSON.

    The background sample is up to 80 training rows drawn with a fixed seed.
    The metadata records the feature lists, target, decision threshold and two
    test-set summary statistics. ``y_train`` is accepted but not used.
    """
    print("\n" + "=" * 58)
    print("STEP 3/3: Validation + SHAP Background Cache")
    print("=" * 58)
    sample_size = min(80, len(X_train))
    background = X_train.sample(sample_size, random_state=42)
    bg_path = MODELS_DIR / "background_sample.csv"
    background.to_csv(bg_path, index=False)

    test_proba = pipe.predict_proba(X_test)[:, 1]
    meta = {
        "features": FEATURES,
        "categorical_features": CAT_COLS,
        "numeric_features": NUM_COLS,
        "target": TARGET,
        "threshold": 0.5,
        "positive_rate_test": float(np.mean(y_test)),
        "mean_predicted_proba_test": float(np.mean(test_proba)),
    }
    meta_path = MODELS_DIR / "model_meta.json"
    serialized = json.dumps(meta, indent=2)
    meta_path.write_text(serialized, encoding="utf-8")
    print(f"Saved: {bg_path.relative_to(APP_DIR)}")
    print(f"Saved: {meta_path.relative_to(APP_DIR)}")
    print("Pipeline completed successfully.")
169
+
170
+
171
def main() -> int:
    """Run the three pipeline steps in order and return a process exit code of 0."""
    ensure_dirs()
    prepared = step1_prepare()
    # step2_train returns (pipe, X_train, y_train, X_test, y_test), which is
    # exactly the positional signature of step3_finalize.
    artifacts = step2_train(prepared)
    step3_finalize(*artifacts)
    print("DONE")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
style.css ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page-wide dark fallback colour shown while the background image loads. */
html, body, .gradio-container {
  min-height: 100vh !important;
  margin: 0 !important;
  background: #09111f !important;
}

/* Full-screen background image.
   The project pins gradio==4.44.1, which serves local files from "/file=<path>".
   The "/gradio_api/file=" prefix only exists from Gradio 5 onwards, so it
   returns 404 here and the image never appears. */
body {
  background-image: url('/file=assets/Bank%20Churn.png') !important;
  background-size: cover !important;
  background-repeat: no-repeat !important;
  background-position: center center !important;
}

/* Let the body background show through the Gradio root container. */
.gradio-container {
  background: transparent !important;
}

/* Translucent card that holds the whole UI (elem_id="main_panel"). */
#main_panel {
  background: rgba(255,255,255,0.90);
  border-radius: 22px;
  padding: 18px;
  box-shadow: 0 16px 40px rgba(0,0,0,0.30);
}

.soft-card {
  background: rgba(255,255,255,0.86);
  border-radius: 18px;
  padding: 12px;
}

.footer-note {
  opacity: 0.8;
  font-size: 0.92rem;
}