Spaces:

logan-codes
/

telco-churn-predictor

Sleeping

App Files Files Community

telco-churn-predictor / scripts /test_pipeline_phase2_modeling.py

logan-codes

Add pipeline scripts, tests and deps

07e37a4 about 2 months ago

raw

history blame contribute delete

1.88 kB

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from xgboost import XGBClassifier
	import optuna

	print("=== Phase 2: Modeling with XGBoost ===")

	# Load the processed dataset; the path is relative to the working directory.
	df = pd.read_csv("data/processed/telco_churn_processed.csv")

	# Target must be numeric 0/1. Encode whenever the column is non-numeric
	# rather than only when its dtype is exactly "object": pandas can load text
	# columns with a dedicated string dtype, which an `== "object"` check would
	# skip and leave the "No"/"Yes" labels unencoded.
	if not pd.api.types.is_numeric_dtype(df["Churn"]):
	    df["Churn"] = df["Churn"].str.strip().map({"No": 0, "Yes": 1})

	# Any label other than "No"/"Yes" maps to NaN above and is caught here.
	assert df["Churn"].isna().sum() == 0, "Churn has NaNs"
	assert set(df["Churn"].unique()) <= {0, 1}, "Churn not 0/1"

	# Features are every column except the target.
	X = df.drop(columns=["Churn"])
	y = df["Churn"]

	# Stratified 80/20 split with a fixed seed so both parts keep the same
	# churn ratio and the split is repeatable.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, stratify=y, random_state=42
	)

	# Probability cut-off at or above which a row is predicted as churn (1).
	THRESHOLD = 0.4

	def objective(trial):
	    """Score one Optuna trial by churn-class recall on a validation split.

	    Samples a set of XGBoost hyperparameters from ``trial``, fits a
	    classifier, and scores it at the module-level ``THRESHOLD``.

	    The score is computed on a validation split carved out of the training
	    data, not on ``X_test``/``y_test``: selecting hyperparameters against
	    the test set would leak it into the model choice and make the reported
	    best recall optimistically biased.

	    Args:
	        trial: The Optuna trial used to sample hyperparameters.

	    Returns:
	        Recall of the positive class (label 1) on the validation split.
	    """
	    from sklearn.metrics import recall_score

	    # Fixed seed -> every trial is fitted and scored on the same rows, so
	    # trial scores are comparable. 25% of the training data is held out.
	    X_fit, X_val, y_fit, y_val = train_test_split(
	        X_train, y_train, test_size=0.25, stratify=y_train, random_state=42
	    )

	    params = {
	        "n_estimators": trial.suggest_int("n_estimators", 300, 800),
	        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
	        "max_depth": trial.suggest_int("max_depth", 3, 10),
	        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
	        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
	        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
	        "gamma": trial.suggest_float("gamma", 0, 5),
	        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
	        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
	        "random_state": 42,
	        "n_jobs": -1,
	        # Negative/positive ratio of the rows actually used for fitting,
	        # to counter class imbalance.
	        "scale_pos_weight": (y_fit == 0).sum() / (y_fit == 1).sum(),
	        "eval_metric": "logloss",
	    }

	    model = XGBClassifier(**params)
	    model.fit(X_fit, y_fit)

	    # Column 1 of predict_proba is the probability of the positive class.
	    proba = model.predict_proba(X_val)[:, 1]
	    y_pred = (proba >= THRESHOLD).astype(int)
	    return recall_score(y_val, y_pred, pos_label=1)

	# Seed the sampler so the hyperparameter search is repeatable, in line with
	# the fixed random_state=42 used for the data split and the model. TPE is
	# Optuna's default sampler, so only the seeding changes.
	study = optuna.create_study(
	    direction="maximize",
	    sampler=optuna.samplers.TPESampler(seed=42),
	)
	study.optimize(objective, n_trials=30)
	print("Best Params:", study.best_params)
	print("Best Recall:", study.best_value)