"""Training, evaluation, and persistence utilities for churn-prediction models."""

from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE(review): currently unused here, but kept — other parts of the project
# (or an earlier, now-removed helper) may rely on this module importing it.
from utils.data_prep import split_app


def train_models(X_train, y_train):
    """Fit two baseline classifiers on the training split.

    Args:
        X_train: Training feature matrix.
        y_train: Training target labels.

    Returns:
        dict mapping a model name ("RandomForest", "LogisticRegression")
        to its fitted estimator.
    """
    rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    # liblinear handles small/binary problems well and supports L1/L2 penalties.
    lr = LogisticRegression(max_iter=1000, solver="liblinear")
    rf.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    return {"RandomForest": rf, "LogisticRegression": lr}


def evaluate_model(model, X_test, y_test):
    """Evaluate a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Held-out feature matrix.
        y_test: Held-out target labels.

    Returns:
        dict with keys ``accuracy``, ``auc``, ``cm`` (confusion matrix array),
        and ``report`` (classification report as a nested dict).
    """
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class,
    # which ROC-AUC expects.
    y_prob = model.predict_proba(X_test)[:, 1]
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "auc": roc_auc_score(y_test, y_prob),
        "cm": confusion_matrix(y_test, y_pred),
        "report": classification_report(y_test, y_pred, output_dict=True),
    }


def train_and_predict_with_features(
    df,
    feature_cols,
    target_col="churn",
    model_out="models/rf_app.pkl",
    csv_out="results/appdata_rf.csv",
    *,
    save=False,
):
    """Train a RandomForest on a 70/30 stratified split and return test-set
    predictions merged with both scaled and original (unscaled) features.

    Args:
        df: Source DataFrame containing features and the target column.
        feature_cols: List of feature column names to use.
        target_col: Name of the binary target column. Defaults to "churn".
        model_out: Path for the pickled model (written only when ``save=True``).
        csv_out: Path for the predictions CSV (written only when ``save=True``).
        save: Keyword-only flag; when True, persist the model and the
            predictions CSV (parent directories are created as needed).
            Defaults to False to keep the function side-effect free.

    Returns:
        DataFrame with one row per test sample: original feature values,
        scaled feature values (``<col>_scaled``), ``TrueLabel``,
        ``PredictedChurn``, and ``ChurnProbability``.
    """
    X = df[feature_cols]
    y = df[target_col]

    # Stratify to keep the churn class balance identical in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    rf.fit(X_train_scaled, y_train)

    if save:
        Path(model_out).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(rf, model_out)

    churn_pred = rf.predict(X_test_scaled)
    churn_prob = rf.predict_proba(X_test_scaled)[:, 1]

    # Rebuild a DataFrame from the scaled ndarray, keeping the original test
    # index so predictions can be aligned back to df rows.
    X_test_df = pd.DataFrame(
        X_test_scaled,
        columns=[f"{c}_scaled" for c in feature_cols],
        index=y_test.index,
    )
    churn_pred_series = pd.Series(
        churn_pred, index=X_test_df.index, name="PredictedChurn"
    )
    churn_prob_series = pd.Series(
        churn_prob, index=X_test_df.index, name="ChurnProbability"
    )

    df_preds = X_test_df.copy()
    df_preds["TrueLabel"] = y_test
    df_preds = df_preds.join(churn_pred_series).join(churn_prob_series)

    # Pull the original (unscaled) feature values for the same test rows so
    # the output is interpretable; concat side-by-side after index reset.
    original_features_test = df.loc[X_test_df.index, feature_cols]
    predictions_with_features = pd.concat(
        [
            original_features_test.reset_index(drop=True),
            df_preds.reset_index(drop=True),
        ],
        axis=1,
    )

    if save:
        Path(csv_out).parent.mkdir(parents=True, exist_ok=True)
        predictions_with_features.to_csv(csv_out, index=False)

    return predictions_with_features


def save_model(model, path):
    """Persist a fitted model to ``path`` via joblib."""
    joblib.dump(model, path)


def load_model(path):
    """Load and return a model previously saved with :func:`save_model`."""
    return joblib.load(path)