"""Train a simple churn model and export sklearn pipeline to models/pipeline.joblib. This is optional. The Space can run in demo mode without this file. Run locally: python scripts/train_and_export.py It expects data/bankChurn.csv (Kaggle-style churn dataset) OR you can edit paths. """ from pathlib import Path import joblib import pandas as pd from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_score from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler APP_DIR = Path(__file__).resolve().parents[1] DATA_PATH = APP_DIR / "data" / "bankChurn.csv" OUT_PATH = APP_DIR / "models" / "pipeline.joblib" # Target column in many churn datasets TARGET_CANDIDATES = ["Exited", "churn", "Churn", "target"] def main(): if not DATA_PATH.exists(): raise FileNotFoundError(f"Missing dataset: {DATA_PATH}") df = pd.read_csv(DATA_PATH) target = next((c for c in TARGET_CANDIDATES if c in df.columns), None) if target is None: raise ValueError(f"Could not find a target column. Tried: {TARGET_CANDIDATES}") # Align with app FEATURE_SCHEMA if available feature_cols = [ "CreditScore", "Geography", "Gender", "Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary", ] missing = [c for c in feature_cols if c not in df.columns] if missing: raise ValueError(f"Dataset missing required feature columns: {missing}") X = df[feature_cols] y = df[target].astype(int) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) cat_cols = ["Geography", "Gender"] num_cols = [c for c in feature_cols if c not in cat_cols] numeric = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()), ] ) categorical = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore")), ] ) pre = ColumnTransformer( transformers=[ ("num", numeric, num_cols), ("cat", categorical, cat_cols), ] ) clf = LogisticRegression(max_iter=2000) pipe = Pipeline(steps=[("preprocess", pre), ("model", clf)]) pipe.fit(X_train, y_train) proba = pipe.predict_proba(X_test)[:, 1] auc = roc_auc_score(y_test, proba) OUT_PATH.parent.mkdir(parents=True, exist_ok=True) joblib.dump(pipe, OUT_PATH) print(f"Saved pipeline to: {OUT_PATH}") print(f"Test ROC-AUC: {auc:.4f}") if __name__ == "__main__": main()