Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import joblib | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report | |
| from utils.data_prep import split_app | |
def train_models(X_train, y_train):
    """Fit the two baseline classifiers on the same training data.

    Args:
        X_train: Training feature matrix accepted by scikit-learn estimators.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        Dict mapping ``"RandomForest"`` and ``"LogisticRegression"`` to the
        corresponding fitted estimator.
    """
    estimators = {
        "RandomForest": RandomForestClassifier(
            n_estimators=200, max_depth=10, random_state=42
        ),
        "LogisticRegression": LogisticRegression(
            max_iter=1000, solver="liblinear"
        ),
    }
    # Dict order is insertion order, so the forest is fitted first, then the
    # logistic regression.
    for estimator in estimators.values():
        estimator.fit(X_train, y_train)
    return estimators
def evaluate_model(model, X_test, y_test):
    """Compute hold-out metrics for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Hold-out feature matrix.
        y_test: True labels aligned with ``X_test``.

    Returns:
        Dict with keys ``"accuracy"``, ``"auc"``, ``"cm"`` (confusion matrix)
        and ``"report"`` (classification report as a dict).
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC; this
    # presumes a binary target whose second class is the positive one.
    positive_scores = model.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predicted_labels),
        "auc": roc_auc_score(y_test, positive_scores),
        "cm": confusion_matrix(y_test, predicted_labels),
        "report": classification_report(
            y_test, predicted_labels, output_dict=True
        ),
    }
| # def add_churn_probability(app_df, model=None): | |
| # X = app_df[["session_count", "recency"]] | |
| # y = app_df["churn"] | |
| # X_train, X_test, y_train, y_test = train_test_split( | |
| # X, y, test_size=0.3, random_state=42, stratify=y | |
| # ) | |
| # # X_train, X_test, y_train, y_test = split_app(app_df) | |
| # # Train simple model | |
| # # model = RandomForestClassifier(n_estimators=100, random_state=42) | |
| # # model.fit(X_train, y_train) | |
| # if model is None: | |
| # model = RandomForestClassifier(n_estimators=100, random_state=42) | |
| # model.fit(X_train, y_train) | |
| # # Predict churn probability on holdout | |
| # probs = model.predict_proba(X_test)[:, 1] | |
| # # Build new df for plotting | |
| # df_plot = X_test.copy() | |
| # # df_plot["userid"] = app_df.loc[X_test.index, "userid"].values | |
| # # df_plot["recency"] = | |
| # df_plot["ChurnProbability"] = probs | |
| # return df_plot | |
def train_and_predict_with_features(df, feature_cols, target_col="churn", model_out="models/rf_app.pkl", csv_out="results/appdata_rf.csv"):
    """
    Train a RandomForest on a stratified 70/30 split and return the hold-out
    predictions side by side with the features they were made from.

    The features are standardised with a ``StandardScaler`` fitted on the
    training split only, and the forest is trained and evaluated on the
    scaled values.

    Args:
        df: DataFrame containing ``feature_cols`` and ``target_col``.
        feature_cols: List of column names in ``df`` used as model inputs.
        target_col: Name of the label column in ``df``.
        model_out: Accepted for backward compatibility; currently unused,
            this function does not write the model to disk.
        csv_out: Accepted for backward compatibility; currently unused,
            this function does not write the predictions to disk.

    Returns:
        DataFrame with one row per hold-out sample and a fresh 0..n-1 index.
        Columns, in order: the original (unscaled) ``feature_cols``, one
        ``"<col>_scaled"`` column per feature, ``"TrueLabel"``,
        ``"PredictedChurn"`` and ``"ChurnProbability"`` (column 1 of
        ``predict_proba``, presumably the churn class for a binary target).
    """
    X = df[feature_cols]
    y = df[target_col]

    # Stratify so both splits keep the class balance of the full data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only to avoid leaking hold-out
    # statistics into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    rf.fit(X_train_scaled, y_train)

    churn_pred = rf.predict(X_test_scaled)
    churn_prob = rf.predict_proba(X_test_scaled)[:, 1]

    # Assemble the result positionally. X_test, y_test and the prediction
    # arrays are already row-aligned, so no index-based join or df.loc lookup
    # is needed; those would duplicate or misalign rows whenever df carries a
    # non-unique index.
    original_features_test = X_test.reset_index(drop=True)
    scaled_features_test = pd.DataFrame(
        X_test_scaled,
        columns=[f"{c}_scaled" for c in feature_cols],
    )
    outcome_columns = pd.DataFrame(
        {
            "TrueLabel": y_test.reset_index(drop=True),
            "PredictedChurn": churn_pred,
            "ChurnProbability": churn_prob,
        }
    )

    predictions_with_features = pd.concat(
        [original_features_test, scaled_features_test, outcome_columns],
        axis=1,
    )
    return predictions_with_features
def save_model(model, path):
    """Serialise ``model`` to ``path`` with joblib.

    Args:
        model: Object to persist (typically a fitted estimator).
        path: Destination file path handed to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=path)
def load_model(path):
    """Load and return an object previously written with joblib.

    Args:
        path: File path handed to ``joblib.load``.

    Returns:
        The deserialised object stored at ``path``.
    """
    # NOTE(review): joblib files are pickle-based; only load files from
    # sources you trust.
    restored = joblib.load(filename=path)
    return restored