File size: 3,977 Bytes
53b92fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from utils.data_prep import split_app

def train_models(X_train, y_train):
    """
    Fit the two baseline churn classifiers on the given training data.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like of shape (n_samples,)
        Binary target labels.

    Returns
    -------
    dict
        Mapping of model name ("RandomForest", "LogisticRegression")
        to its fitted estimator.
    """
    fitted = {
        "RandomForest": RandomForestClassifier(
            n_estimators=200, max_depth=10, random_state=42
        ),
        "LogisticRegression": LogisticRegression(max_iter=1000, solver="liblinear"),
    }
    for estimator in fitted.values():
        estimator.fit(X_train, y_train)
    return fitted

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted binary classifier on a holdout set.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict`` and ``predict_proba``.
    X_test : array-like of shape (n_samples, n_features)
        Holdout feature matrix.
    y_test : array-like of shape (n_samples,)
        True binary labels.

    Returns
    -------
    dict
        Keys: "accuracy", "auc" (ROC-AUC from positive-class
        probabilities), "cm" (confusion matrix), and "report"
        (classification report as a dict).
    """
    predictions = model.predict(X_test)
    # Column 1 of predict_proba is the positive-class probability.
    positive_probs = model.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "auc": roc_auc_score(y_test, positive_probs),
        "cm": confusion_matrix(y_test, predictions),
        "report": classification_report(y_test, predictions, output_dict=True),
    }

# def add_churn_probability(app_df, model=None):
#     X = app_df[["session_count", "recency"]]
#     y = app_df["churn"]

#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, random_state=42, stratify=y
#     )
#     # X_train, X_test, y_train, y_test = split_app(app_df)

#     # Train simple model
#     # model = RandomForestClassifier(n_estimators=100, random_state=42)
#     # model.fit(X_train, y_train)
#     if model is None:
#         model = RandomForestClassifier(n_estimators=100, random_state=42)
#         model.fit(X_train, y_train)

#     # Predict churn probability on holdout
#     probs = model.predict_proba(X_test)[:, 1]

#     # Build new df for plotting
#     df_plot = X_test.copy()
#     # df_plot["userid"] = app_df.loc[X_test.index, "userid"].values
#     # df_plot["recency"] = 
#     df_plot["ChurnProbability"] = probs
#     return df_plot

def train_and_predict_with_features(df, feature_cols, target_col="churn", model_out="models/rf_app.pkl", csv_out="results/appdata_rf.csv"):
    """
    Fit a RandomForest churn classifier on a stratified 70/30 split and
    return the holdout rows with scaled features, true labels, predicted
    labels, and churn probabilities, side by side with the original
    (unscaled) feature values for interpretability.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing ``feature_cols`` and ``target_col``.
    feature_cols : list of str
        Names of the feature columns to train on.
    target_col : str, default "churn"
        Name of the binary target column.
    model_out, csv_out : str
        Destination paths for the model pickle / results CSV.
        NOTE(review): both are currently unused — the persistence
        calls below are commented out.

    Returns
    -------
    pandas.DataFrame
        One row per holdout sample: original features, scaled features
        (suffixed ``_scaled``), "TrueLabel", "PredictedChurn", and
        "ChurnProbability". Index is reset (0..n-1).
    """
    features = df[feature_cols]
    target = df[target_col]

    # Stratified 70/30 holdout split so class balance is preserved.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=0.3, random_state=42, stratify=target
    )

    # Standardize using statistics fit on the training split only,
    # to avoid leaking holdout information.
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_te_scaled = scaler.transform(X_te)

    model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    model.fit(X_tr_scaled, y_tr)

    # Model persistence intentionally disabled:
    # joblib.dump(model, model_out)

    predicted_labels = model.predict(X_te_scaled)
    predicted_probs = model.predict_proba(X_te_scaled)[:, 1]

    # Rebuild a DataFrame over the holdout index so predictions align
    # with the original rows by label.
    scaled_names = [f"{c}_scaled" for c in feature_cols]
    holdout = pd.DataFrame(X_te_scaled, columns=scaled_names, index=y_te.index)
    holdout["TrueLabel"] = y_te
    holdout["PredictedChurn"] = pd.Series(predicted_labels, index=holdout.index)
    holdout["ChurnProbability"] = pd.Series(predicted_probs, index=holdout.index)

    # Put the raw (unscaled) feature values next to the predictions;
    # row order is kept, original indices are dropped.
    raw_holdout_features = df.loc[holdout.index, feature_cols]
    merged = pd.concat(
        [raw_holdout_features.reset_index(drop=True), holdout.reset_index(drop=True)],
        axis=1,
    )

    # CSV persistence intentionally disabled:
    # merged.to_csv(csv_out, index=False)

    return merged

def save_model(model, path):
    """Serialize *model* to *path* with joblib (overwrites any existing file)."""
    joblib.dump(model, path)

def load_model(path):
    """Load and return a joblib-serialized model from *path*.

    NOTE(review): joblib deserialization executes arbitrary code —
    only load files from trusted sources.
    """
    return joblib.load(path)