"""Train a simple churn model and export the sklearn pipeline to models/pipeline.joblib.

This is optional. The Space can run in demo mode without this file.
Run locally:
    python scripts/train_and_export.py

It expects data/bankChurn.csv (a Kaggle-style churn dataset); to use other
locations, edit DATA_PATH and OUT_PATH.
"""
|
|
| from pathlib import Path |
|
|
| import joblib |
| import pandas as pd |
| from sklearn.compose import ColumnTransformer |
| from sklearn.impute import SimpleImputer |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.metrics import roc_auc_score |
| from sklearn.model_selection import train_test_split |
| from sklearn.pipeline import Pipeline |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
|
# Project root: two levels up from this file (the parent of the directory
# that contains the script).
APP_DIR = Path(__file__).resolve().parents[1]
# Training CSV read by main().
DATA_PATH = APP_DIR / "data" / "bankChurn.csv"
# Destination of the serialized pipeline written by main().
OUT_PATH = APP_DIR / "models" / "pipeline.joblib"

# Candidate names for the label column, tried in order; the first one that
# appears in the CSV header is used as the target.
TARGET_CANDIDATES = ["Exited", "churn", "Churn", "target"]
|
|
|
|
# Input columns the model is trained on; every one must be present in the CSV.
_FEATURE_COLS = [
    "CreditScore",
    "Geography",
    "Gender",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
]

# Subset of _FEATURE_COLS that is one-hot encoded; all others go through the
# numeric (impute + scale) branch.
_CATEGORICAL_COLS = ["Geography", "Gender"]


def _load_features_and_target(csv_path: Path):
    """Read the churn CSV and return ``(X, y)``.

    ``X`` is the DataFrame restricted to ``_FEATURE_COLS``; ``y`` is the first
    column from ``TARGET_CANDIDATES`` found in the file, cast to ``int``.

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: no candidate target column is present, or one or more
            required feature columns are missing.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Missing dataset: {csv_path}")

    df = pd.read_csv(csv_path)

    # First candidate name that appears in the header wins.
    target = next((c for c in TARGET_CANDIDATES if c in df.columns), None)
    if target is None:
        raise ValueError(f"Could not find a target column. Tried: {TARGET_CANDIDATES}")

    missing = [c for c in _FEATURE_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"Dataset missing required feature columns: {missing}")

    return df[_FEATURE_COLS], df[target].astype(int)


def _build_pipeline() -> Pipeline:
    """Return the unfitted preprocessing + logistic-regression pipeline."""
    num_cols = [c for c in _FEATURE_COLS if c not in _CATEGORICAL_COLS]

    # Numeric branch: fill gaps with the column median, then standardize.
    numeric = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    # Categorical branch: fill gaps with the most frequent value, then one-hot
    # encode. handle_unknown="ignore" keeps prediction from failing on a
    # category that was not seen during fitting.
    categorical = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric, num_cols),
            ("cat", categorical, _CATEGORICAL_COLS),
        ]
    )

    clf = LogisticRegression(max_iter=2000)

    return Pipeline(steps=[("preprocess", pre), ("model", clf)])


def main() -> None:
    """Train the churn pipeline, report its test ROC-AUC, and save it.

    Reads ``DATA_PATH``, fits the pipeline on an 80% stratified split,
    scores it on the remaining 20%, and writes the fitted pipeline to
    ``OUT_PATH`` (creating the parent directory if needed).

    Raises:
        FileNotFoundError: the dataset file is missing.
        ValueError: the dataset lacks a target or a required feature column.
    """
    X, y = _load_features_and_target(DATA_PATH)

    # Fixed seed makes the split reproducible; stratify=y keeps the class
    # ratio the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = _build_pipeline()
    pipe.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in
    # pipe.classes_ (the positive label when the target is 0/1).
    proba = pipe.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, OUT_PATH)

    print(f"Saved pipeline to: {OUT_PATH}")
    print(f"Test ROC-AUC: {auc:.4f}")
|
|
|
|
# Run the training/export only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|