Spaces:

XRachel
/

bc5

Runtime error

App Files Files Community

bc5 / scripts /pipeline.py

XRachel

Upload 9 files

fde2bc0 verified 3 months ago

raw

history blame contribute delete

5.88 kB

	from __future__ import annotations

	import json
	from pathlib import Path

	import joblib
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	from sklearn.compose import ColumnTransformer
	from sklearn.impute import SimpleImputer
	from sklearn.inspection import permutation_importance
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import roc_auc_score
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import OneHotEncoder, StandardScaler

	# Project root: two directory levels above this script file.
	APP_DIR = Path(__file__).resolve().parents[1]

	# Input data and output locations, all anchored at the project root.
	DATA_PATH = APP_DIR.joinpath("data", "bankChurn.csv")
	MODELS_DIR = APP_DIR.joinpath("models")
	OUT_DIR = APP_DIR.joinpath("outputs")
	FIG_DIR = OUT_DIR.joinpath("figures")
	TAB_DIR = OUT_DIR.joinpath("tables")

	# Name of the label column in the CSV.
	TARGET = "CHURN_CUST_IND"

	# Model input columns; this order is also the column order fed to the model.
	FEATURES = [
	    "AGE",
	    "OPEN_ACC_DUR",
	    "GENDER_CD",
	    "HASNT_HOME_ADDRESS_INF",
	    "HASNT_MOBILE_TEL_NUM_INF",
	    "LOCAL_CUR_MON_AVG_BAL",
	    "LOCAL_FIX_MON_AVG_BAL",
	    "LOCAL_SAV_CUR_ALL_BAL",
	    "POS_CONSUME_TX_AMT",
	    "ATM_ALL_TX_NUM",
	    "COUNTER_ALL_TX_NUM",
	]

	# Columns treated as categorical; every remaining feature is numeric.
	CAT_COLS = ["GENDER_CD", "HASNT_HOME_ADDRESS_INF", "HASNT_MOBILE_TEL_NUM_INF"]
	NUM_COLS = [column for column in FEATURES if column not in CAT_COLS]


	def ensure_dirs() -> None:
	    """Create the model, figure and table directories if they do not exist.

	    Parent directories are created as needed, and already-existing
	    directories are left untouched.
	    """
	    for directory in (MODELS_DIR, FIG_DIR, TAB_DIR):
	        directory.mkdir(parents=True, exist_ok=True)


	def step1_prepare() -> pd.DataFrame:
	    """Load the raw CSV, keep the modelling columns, and normalise their dtypes.

	    Reads ``DATA_PATH``, restricts the frame to ``FEATURES`` plus ``TARGET``,
	    converts categorical columns to strings (missing values stay missing),
	    coerces numeric columns and the target to numbers (unparseable values
	    become NaN), and writes the result to
	    ``OUT_DIR / "processed_bank_churn.csv"``.

	    Returns:
	        The prepared DataFrame with columns ordered as ``FEATURES + [TARGET]``.

	    Raises:
	        ValueError: If any expected column is absent from the CSV.
	    """
	    print("=" * 58)
	    print("STEP 1/3: Data Preparation")
	    print("=" * 58)
	    df = pd.read_csv(DATA_PATH)
	    keep = FEATURES + [TARGET]
	    missing = [c for c in keep if c not in df.columns]
	    if missing:
	        raise ValueError(f"Missing expected columns: {missing}")

	    df = df[keep].copy()
	    for c in CAT_COLS:
	        # A blanket astype(str) would turn missing values into the literal
	        # string "nan", which the categorical imputer cannot recognise as
	        # missing and the one-hot encoder would treat as a real category.
	        # Stringify only the values that are present and keep NaN as NaN.
	        present = df[c].notna()
	        df[c] = df[c].astype(str).where(present, np.nan)
	    for c in NUM_COLS + [TARGET]:
	        df[c] = pd.to_numeric(df[c], errors="coerce")

	    # Make sure the output directory exists so this step can run on its own.
	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    processed_path = OUT_DIR / "processed_bank_churn.csv"
	    df.to_csv(processed_path, index=False)
	    # Plain "|" separator: "\|" is an invalid escape sequence in a string literal.
	    print(f"Rows: {len(df):,} | Cols: {df.shape[1]}")
	    print(f"Saved: {processed_path.relative_to(APP_DIR)}")
	    return df


	def build_pipeline() -> Pipeline:
	    """Assemble the unfitted preprocessing + logistic-regression pipeline.

	    Columns in ``NUM_COLS`` are median-imputed and then standardised.
	    Columns in ``CAT_COLS`` are imputed with the most frequent value and
	    then one-hot encoded with ``handle_unknown="ignore"``.

	    Returns:
	        A ``Pipeline`` with the steps ``"preprocess"`` and ``"model"``.
	    """
	    numeric_steps = [
	        ("imputer", SimpleImputer(strategy="median")),
	        ("scaler", StandardScaler()),
	    ]
	    categorical_steps = [
	        ("imputer", SimpleImputer(strategy="most_frequent")),
	        ("onehot", OneHotEncoder(handle_unknown="ignore")),
	    ]
	    column_transformer = ColumnTransformer(
	        transformers=[
	            ("num", Pipeline(steps=numeric_steps), NUM_COLS),
	            ("cat", Pipeline(steps=categorical_steps), CAT_COLS),
	        ]
	    )
	    classifier = LogisticRegression(max_iter=1500, class_weight="balanced")
	    return Pipeline(
	        steps=[
	            ("preprocess", column_transformer),
	            ("model", classifier),
	        ]
	    )


	def step2_train(df: pd.DataFrame) -> tuple[Pipeline, pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
	    """Fit the model on a stratified 80/20 split and save its artifacts.

	    Side effects, in order: the fitted pipeline is dumped to
	    ``MODELS_DIR / "pipeline.joblib"``; test-set predictions go to
	    ``TAB_DIR / "test_predictions.csv"``; permutation importances (ROC-AUC
	    scoring, 5 repeats) go to ``TAB_DIR / "feature_importance.csv"`` and are
	    plotted to ``FIG_DIR / "feature_importance.png"``.

	    Args:
	        df: Frame containing every column in ``FEATURES`` plus ``TARGET``.

	    Returns:
	        ``(pipe, X_train, y_train, X_test, y_test)``.
	    """
	    banner = "=" * 58
	    print("\n" + banner)
	    print("STEP 2/3: Train Model + Artifacts")
	    print(banner)

	    features = df[FEATURES].copy()
	    labels = df[TARGET].astype(int)
	    X_train, X_test, y_train, y_test = train_test_split(
	        features, labels, test_size=0.2, random_state=42, stratify=labels
	    )

	    pipe = build_pipeline()
	    pipe.fit(X_train, y_train)

	    # Probability of the positive class, thresholded at 0.5 for hard labels.
	    churn_proba = pipe.predict_proba(X_test)[:, 1]
	    churn_pred = (churn_proba >= 0.5).astype(int)
	    auc = float(roc_auc_score(y_test, churn_proba))

	    model_path = MODELS_DIR / "pipeline.joblib"
	    joblib.dump(pipe, model_path)
	    print(f"Saved model: {model_path.relative_to(APP_DIR)}")
	    print(f"ROC-AUC: {auc:.4f}")

	    # Test features alongside the true label and both prediction forms.
	    test_pred_path = TAB_DIR / "test_predictions.csv"
	    predictions = X_test.assign(
	        actual=y_test.to_numpy(),
	        churn_proba=churn_proba,
	        churn_pred=churn_pred,
	    )
	    predictions.to_csv(test_pred_path, index=False)
	    print(f"Saved: {test_pred_path.relative_to(APP_DIR)}")

	    perm = permutation_importance(
	        pipe, X_test, y_test, n_repeats=5, random_state=42, scoring="roc_auc"
	    )
	    importance_table = pd.DataFrame(
	        {"feature": FEATURES, "importance": perm.importances_mean}
	    )
	    importance_table = importance_table.sort_values("importance", ascending=False)
	    fi_path = TAB_DIR / "feature_importance.csv"
	    importance_table.to_csv(fi_path, index=False)

	    # Reverse the sorted rows so the most important feature is drawn on top.
	    fig, ax = plt.subplots(figsize=(8, 4.5))
	    ax.barh(importance_table["feature"][::-1], importance_table["importance"][::-1])
	    ax.set_title("Feature Importance (Permutation)")
	    ax.set_xlabel("Importance")
	    fig.tight_layout()
	    fi_fig = FIG_DIR / "feature_importance.png"
	    fig.savefig(fi_fig, dpi=160)
	    plt.close(fig)
	    print(f"Saved: {fi_path.relative_to(APP_DIR)}")
	    print(f"Saved: {fi_fig.relative_to(APP_DIR)}")

	    return pipe, X_train, y_train, X_test, y_test


	def step3_finalize(pipe: Pipeline, X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series) -> None:
	    """Write the background sample and the model metadata file.

	    Saves up to 80 rows sampled from ``X_train`` (seed 42) to
	    ``MODELS_DIR / "background_sample.csv"``, then writes
	    ``MODELS_DIR / "model_meta.json"`` with the feature lists, target name,
	    decision threshold, and two test-set summary statistics.

	    Args:
	        pipe: Fitted pipeline used to score ``X_test``.
	        X_train: Training features from which the background rows are drawn.
	        y_train: Training labels; not read by this function.
	        X_test: Held-out features.
	        y_test: Held-out labels.
	    """
	    banner = "=" * 58
	    print("\n" + banner)
	    print("STEP 3/3: Validation + SHAP Background Cache")
	    print(banner)

	    sample_size = min(80, len(X_train))
	    bg_path = MODELS_DIR / "background_sample.csv"
	    X_train.sample(sample_size, random_state=42).to_csv(bg_path, index=False)

	    test_proba = pipe.predict_proba(X_test)[:, 1]
	    positive_rate = float(np.mean(y_test))
	    mean_proba = float(np.mean(test_proba))

	    meta = {
	        "features": FEATURES,
	        "categorical_features": CAT_COLS,
	        "numeric_features": NUM_COLS,
	        "target": TARGET,
	        "threshold": 0.5,
	        "positive_rate_test": positive_rate,
	        "mean_predicted_proba_test": mean_proba,
	    }
	    meta_path = MODELS_DIR / "model_meta.json"
	    serialized = json.dumps(meta, indent=2)
	    meta_path.write_text(serialized, encoding="utf-8")

	    for saved in (bg_path, meta_path):
	        print(f"Saved: {saved.relative_to(APP_DIR)}")
	    print("Pipeline completed successfully.")


	def main() -> int:
	    """Run the three pipeline stages in order.

	    Returns:
	        ``0`` once every stage has finished.
	    """
	    ensure_dirs()
	    prepared = step1_prepare()
	    training_outputs = step2_train(prepared)
	    # step2_train returns its values in the order step3_finalize expects.
	    step3_finalize(*training_outputs)
	    print("DONE")
	    return 0


	# Script entry point: run the pipeline and use main()'s result as the exit status.
	if __name__ == "__main__":
	    exit_status = main()
	    raise SystemExit(exit_status)