import pandas as pd from sklearn.model_selection import train_test_split from xgboost import XGBClassifier import optuna print("=== Phase 2: Modeling with XGBoost ===") df = pd.read_csv("data/processed/telco_churn_processed.csv") # target must be numeric 0/1 if df["Churn"].dtype == "object": df["Churn"] = df["Churn"].str.strip().map({"No": 0, "Yes": 1}) assert df["Churn"].isna().sum() == 0, "Churn has NaNs" assert set(df["Churn"].unique()) <= {0, 1}, "Churn not 0/1" X = df.drop(columns=["Churn"]) y = df["Churn"] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) THRESHOLD = 0.4 def objective(trial): params = { "n_estimators": trial.suggest_int("n_estimators", 300, 800), "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2), "max_depth": trial.suggest_int("max_depth", 3, 10), "subsample": trial.suggest_float("subsample", 0.5, 1.0), "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0), "min_child_weight": trial.suggest_int("min_child_weight", 1, 10), "gamma": trial.suggest_float("gamma", 0, 5), "reg_alpha": trial.suggest_float("reg_alpha", 0, 5), "reg_lambda": trial.suggest_float("reg_lambda", 0, 5), "random_state": 42, "n_jobs": -1, "scale_pos_weight": (y_train == 0).sum() / (y_train == 1).sum(), "eval_metric": "logloss", } model = XGBClassifier(**params) model.fit(X_train, y_train) proba = model.predict_proba(X_test)[:, 1] y_pred = (proba >= THRESHOLD).astype(int) from sklearn.metrics import recall_score return recall_score(y_test, y_pred, pos_label=1) study = optuna.create_study(direction="maximize") study.optimize(objective, n_trials=30) print("Best Params:", study.best_params) print("Best Recall:", study.best_value)