# Source: telco-churn-predictor/scripts/test_pipeline_phase2_modeling.py
# Author: logan-codes
# Commit 07e37a4: Add pipeline scripts, tests and deps
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna
print("=== Phase 2: Modeling with XGBoost ===")

# Load the processed dataset from its fixed location in the repository.
df = pd.read_csv("data/processed/telco_churn_processed.csv")

# The target must be numeric 0/1. Test for "not numeric" instead of comparing
# against the "object" dtype so that string-backed dtypes get label-encoded
# as well; an already-numeric (or boolean) target is left untouched.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].str.strip().map({"No": 0, "Yes": 1})

# Validate with explicit exceptions: `assert` statements are removed when
# Python runs with -O, which would let a malformed target through silently.
# Labels other than "No"/"Yes" map to NaN above and are caught here.
if df["Churn"].isna().sum() != 0:
    raise ValueError("Churn has NaNs")
if not set(df["Churn"].unique()) <= {0, 1}:
    raise ValueError("Churn not 0/1")

# Every column except the target is used as a feature.
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Stratified 80/20 split with a fixed seed so the class balance is kept in
# both parts and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Probability cut-off at or above which a sample is predicted as class 1.
THRESHOLD = 0.4
def objective(trial):
    """Return positive-class recall for one sampled XGBoost configuration.

    Hyperparameters are sampled from ``trial``. The model is fitted on a
    stratified 80% portion of the training data, and recall for label 1 at
    the ``THRESHOLD`` probability cut-off is measured on the remaining 20%.

    The score is computed on a validation split carved out of the training
    data rather than on ``X_test``/``y_test``. Tuning against the test set
    would leak it into model selection and make the reported recall
    optimistically biased; this way the test set stays untouched by the
    search.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Recall of the positive class on the validation split.
    """
    from sklearn.metrics import recall_score

    # Fixed seed: every trial is scored on the same validation rows, so trial
    # values are directly comparable. The split is cheap next to model fitting.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "n_jobs": -1,
        # Negative-to-positive ratio of the rows actually used for fitting,
        # so the positive class is up-weighted to offset class imbalance.
        "scale_pos_weight": (y_fit == 0).sum() / (y_fit == 1).sum(),
        "eval_metric": "logloss",
    }

    model = XGBClassifier(**params)
    model.fit(X_fit, y_fit)

    # Column 1 of predict_proba holds the probability of class 1.
    proba = model.predict_proba(X_val)[:, 1]
    y_pred = (proba >= THRESHOLD).astype(int)
    return recall_score(y_val, y_pred, pos_label=1)
# Maximize the objective's recall over 30 trials. The sampler is seeded so the
# sequence of sampled hyperparameters is repeatable, in line with the fixed
# random_state=42 used elsewhere in this script. TPESampler is Optuna's
# default sampler, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Report the best hyperparameters found and the objective value they reached.
print("Best Params:", study.best_params)
print("Best Recall:", study.best_value)