Spaces:

VasTk
/

user-churn

Sleeping

user-churn / utils /modelling.py

VasithaTilakumara

Version 2.0 - added LFS tracking for lsapp.tsv and updated features

53b92fc 4 months ago

3.98 kB

	import pandas as pd
	import joblib
	from sklearn.preprocessing import StandardScaler
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
	from utils.data_prep import split_app

	def train_models(X_train, y_train):
	    """Fit the two baseline classifiers on the same training data.

	    Args:
	        X_train: Training features, passed unchanged to each estimator's
	            ``fit``.
	        y_train: Training labels, passed unchanged to each estimator's
	            ``fit``.

	    Returns:
	        dict: The fitted estimators keyed by ``"RandomForest"`` and
	        ``"LogisticRegression"``, in that order.
	    """
	    estimators = {
	        "RandomForest": RandomForestClassifier(
	            n_estimators=200, max_depth=10, random_state=42
	        ),
	        "LogisticRegression": LogisticRegression(
	            max_iter=1000, solver="liblinear"
	        ),
	    }

	    # Dicts keep insertion order, so the forest is fitted before the
	    # logistic regression.
	    for estimator in estimators.values():
	        estimator.fit(X_train, y_train)

	    return estimators

	def evaluate_model(model, X_test, y_test):
	    """Score a fitted classifier on held-out data.

	    Args:
	        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
	        X_test: Held-out features.
	        y_test: True labels for ``X_test``.

	    Returns:
	        dict: ``"accuracy"`` (accuracy score), ``"auc"`` (ROC AUC computed
	        from column 1 of ``predict_proba``), ``"cm"`` (confusion matrix)
	        and ``"report"`` (classification report as a dict).
	    """
	    predicted_labels = model.predict(X_test)
	    # Column 1 of predict_proba is used as the ranking score for ROC AUC;
	    # presumably this is the positive (churn) class -- confirm label order.
	    positive_scores = model.predict_proba(X_test)[:, 1]

	    return {
	        "accuracy": accuracy_score(y_test, predicted_labels),
	        "auc": roc_auc_score(y_test, positive_scores),
	        "cm": confusion_matrix(y_test, predicted_labels),
	        "report": classification_report(
	            y_test, predicted_labels, output_dict=True
	        ),
	    }

	# def add_churn_probability(app_df, model=None):
	# X = app_df[["session_count", "recency"]]
	# y = app_df["churn"]

	# X_train, X_test, y_train, y_test = train_test_split(
	# X, y, test_size=0.3, random_state=42, stratify=y
	# )
	# # X_train, X_test, y_train, y_test = split_app(app_df)

	# # Train simple model
	# # model = RandomForestClassifier(n_estimators=100, random_state=42)
	# # model.fit(X_train, y_train)
	# if model is None:
	# model = RandomForestClassifier(n_estimators=100, random_state=42)
	# model.fit(X_train, y_train)

	# # Predict churn probability on holdout
	# probs = model.predict_proba(X_test)[:, 1]

	# # Build new df for plotting
	# df_plot = X_test.copy()
	# # df_plot["userid"] = app_df.loc[X_test.index, "userid"].values
	# # df_plot["recency"] =
	# df_plot["ChurnProbability"] = probs
	# return df_plot

	def train_and_predict_with_features(
	    df,
	    feature_cols,
	    target_col="churn",
	    model_out="models/rf_app.pkl",
	    csv_out="results/appdata_rf.csv",
	):
	    """Train a RandomForest and return hold-out predictions with features.

	    The data is split 70/30 (stratified on the target, ``random_state=42``),
	    the features are standardized with a scaler fitted on the training
	    portion only, and a RandomForest is fitted on the scaled training data.

	    Args:
	        df: DataFrame containing ``feature_cols`` and ``target_col``. Its
	            index does not need to be unique; rows are matched by position.
	        feature_cols: List of feature column names to train on.
	        target_col: Name of the label column.
	        model_out: Currently unused -- model persistence is disabled in
	            this function. Kept so existing callers keep working.
	        csv_out: Currently unused -- CSV export is disabled in this
	            function. Kept so existing callers keep working.

	    Returns:
	        pandas.DataFrame: One row per hold-out sample with a fresh
	        ``RangeIndex`` and, in order: the original (unscaled) feature
	        columns, the ``<feature>_scaled`` columns, ``TrueLabel``,
	        ``PredictedChurn`` and ``ChurnProbability`` (column 1 of
	        ``predict_proba``).
	    """
	    X = df[feature_cols]
	    y = df[target_col]

	    # Stratified hold-out split.
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.3, random_state=42, stratify=y
	    )

	    # Fit the scaler on the training rows only, then apply it to both parts.
	    scaler = StandardScaler()
	    X_train_scaled = scaler.fit_transform(X_train)
	    X_test_scaled = scaler.transform(X_test)

	    rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
	    rf.fit(X_train_scaled, y_train)

	    # NOTE: persisting the model (joblib.dump(rf, model_out)) is disabled.

	    churn_pred = rf.predict(X_test_scaled)
	    churn_prob = rf.predict_proba(X_test_scaled)[:, 1]

	    # Assemble the result by position rather than by index label. X_test,
	    # y_test and the prediction arrays all share the hold-out row order, so
	    # positional assembly is exact, whereas label-based joins / df.loc
	    # duplicate or misalign rows when df.index contains repeated labels.
	    predictions = pd.DataFrame(
	        X_test_scaled,
	        columns=[f"{c}_scaled" for c in feature_cols],
	    )
	    predictions["TrueLabel"] = y_test.reset_index(drop=True)
	    predictions["PredictedChurn"] = churn_pred
	    predictions["ChurnProbability"] = churn_prob

	    # Put the original unscaled features first for interpretability.
	    predictions_with_features = pd.concat(
	        [X_test.reset_index(drop=True), predictions],
	        axis=1,
	    )

	    # NOTE: CSV export (to_csv(csv_out, index=False)) is disabled.

	    return predictions_with_features

	def save_model(model, path):
	    """Persist ``model`` to ``path`` using joblib.

	    Args:
	        model: Object to serialize (for example a fitted estimator).
	        path: Destination handed to ``joblib.dump`` as the filename.

	    Returns:
	        None. The value returned by ``joblib.dump`` is discarded.
	    """
	    joblib.dump(value=model, filename=path)

	def load_model(path):
	    """Load and return an object previously stored with joblib.

	    Args:
	        path: Location handed to ``joblib.load`` as the filename.

	    Returns:
	        Whatever object ``joblib.load`` reconstructs from ``path``.
	    """
	    # NOTE(review): joblib persistence is pickle-based, so only load files
	    # that come from a trusted source.
	    restored = joblib.load(filename=path)
	    return restored