bankchurn / scripts / train_and_export.py
XRachel's picture
Upload 14 files
7f9c8dd verified
"""Train a simple churn model and export sklearn pipeline to models/pipeline.joblib.
This is optional. The Space can run in demo mode without this file.
Run locally:
python scripts/train_and_export.py
It expects data/bankChurn.csv (a Kaggle-style churn dataset); to use other locations, edit the DATA_PATH / OUT_PATH constants below.
"""
from pathlib import Path
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Directory two levels above this file (the parent of the folder holding
# this script); input data and exported models are resolved relative to it.
APP_DIR = Path(__file__).resolve().parents[1]
DATA_PATH = APP_DIR.joinpath("data", "bankChurn.csv")
OUT_PATH = APP_DIR.joinpath("models", "pipeline.joblib")

# Label column names to look for, in priority order.
TARGET_CANDIDATES = [
    "Exited",
    "churn",
    "Churn",
    "target",
]
def main():
    """Train the churn classifier on DATA_PATH and export it to OUT_PATH.

    Steps:
      1. Read the CSV at DATA_PATH and use the first name in
         TARGET_CANDIDATES that is a column of the data as the label
         (cast to int).
      2. Verify that every expected feature column is present.
      3. Hold out 20% of the rows, stratified on the label
         (random_state=42).
      4. Fit a pipeline: numeric columns are median-imputed and
         standard-scaled; "Geography" and "Gender" are imputed with the
         most frequent value and one-hot encoded (unknown categories are
         ignored at predict time); the classifier is a logistic regression
         with max_iter=2000.
      5. Compute ROC-AUC on the held-out rows, dump the fitted pipeline to
         OUT_PATH with joblib (creating the parent directory if needed),
         and print the output path and the score.

    Raises:
        FileNotFoundError: if DATA_PATH does not exist.
        ValueError: if none of TARGET_CANDIDATES is a column of the data,
            or if any required feature column is missing.
    """
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"Missing dataset: {DATA_PATH}")
    frame = pd.read_csv(DATA_PATH)

    # The first candidate label name found in the file wins.
    for candidate in TARGET_CANDIDATES:
        if candidate in frame.columns:
            label_col = candidate
            break
    else:
        raise ValueError(f"Could not find a target column. Tried: {TARGET_CANDIDATES}")

    # Per the original note, this list is meant to align with the app's
    # FEATURE_SCHEMA (not visible in this script). The order here is the
    # column order the pipeline is fitted on.
    feature_cols = [
        "CreditScore",
        "Geography",
        "Gender",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "HasCrCard",
        "IsActiveMember",
        "EstimatedSalary",
    ]
    absent = [name for name in feature_cols if name not in frame.columns]
    if absent:
        raise ValueError(f"Dataset missing required feature columns: {absent}")

    features = frame[feature_cols]
    labels = frame[label_col].astype(int)
    split = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )
    train_X, holdout_X, train_y, holdout_y = split

    # Everything that is not explicitly categorical is treated as numeric.
    categorical_cols = ["Geography", "Gender"]
    numeric_cols = [name for name in feature_cols if name not in categorical_cols]

    median_fill = SimpleImputer(strategy="median")
    numeric_branch = Pipeline(
        steps=[("imputer", median_fill), ("scaler", StandardScaler())]
    )

    mode_fill = SimpleImputer(strategy="most_frequent")
    one_hot = OneHotEncoder(handle_unknown="ignore")
    categorical_branch = Pipeline(
        steps=[("imputer", mode_fill), ("onehot", one_hot)]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_branch, numeric_cols),
            ("cat", categorical_branch, categorical_cols),
        ]
    )
    model = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", LogisticRegression(max_iter=2000)),
        ]
    )
    model.fit(train_X, train_y)

    # Score with the probability of the second class (column index 1).
    # This runs before saving, so a scoring failure leaves no file behind.
    holdout_auc = roc_auc_score(holdout_y, model.predict_proba(holdout_X)[:, 1])

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, OUT_PATH)
    print(f"Saved pipeline to: {OUT_PATH}")
    print(f"Test ROC-AUC: {holdout_auc:.4f}")
# Run training/export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()