"""Churn-model training, evaluation, and persistence utilities.

Trains RandomForest / LogisticRegression classifiers on app-usage features
and produces a prediction DataFrame merging churn probabilities with the
original (unscaled) features.
"""
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from utils.data_prep import split_app
def train_models(X_train, y_train):
    """Fit two baseline classifiers on the training split.

    Returns a dict mapping a human-readable model name to its fitted
    estimator: a RandomForest (200 trees, depth 10) and a liblinear
    LogisticRegression. Both use fixed seeds/settings for reproducibility.
    """
    models = {
        "RandomForest": RandomForestClassifier(
            n_estimators=200, max_depth=10, random_state=42
        ),
        "LogisticRegression": LogisticRegression(max_iter=1000, solver="liblinear"),
    }
    for estimator in models.values():
        estimator.fit(X_train, y_train)
    return models
def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on the holdout split.

    Uses column 1 of ``predict_proba`` as the positive-class probability
    for ROC-AUC. Returns a dict with keys ``accuracy``, ``auc``, ``cm``
    (confusion matrix) and ``report`` (classification report as a dict).
    """
    predictions = model.predict(X_test)
    positive_probs = model.predict_proba(X_test)[:, 1]
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "auc": roc_auc_score(y_test, positive_probs),
        "cm": confusion_matrix(y_test, predictions),
        "report": classification_report(y_test, predictions, output_dict=True),
    }
# def add_churn_probability(app_df, model=None):
# X = app_df[["session_count", "recency"]]
# y = app_df["churn"]
# X_train, X_test, y_train, y_test = train_test_split(
# X, y, test_size=0.3, random_state=42, stratify=y
# )
# # X_train, X_test, y_train, y_test = split_app(app_df)
# # Train simple model
# # model = RandomForestClassifier(n_estimators=100, random_state=42)
# # model.fit(X_train, y_train)
# if model is None:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)
# # Predict churn probability on holdout
# probs = model.predict_proba(X_test)[:, 1]
# # Build new df for plotting
# df_plot = X_test.copy()
# # df_plot["userid"] = app_df.loc[X_test.index, "userid"].values
# # df_plot["recency"] =
# df_plot["ChurnProbability"] = probs
# return df_plot
def train_and_predict_with_features(df, feature_cols, target_col="churn", model_out="models/rf_app.pkl", csv_out="results/appdata_rf.csv"):
    """
    Trains a RandomForest model, predicts churn and probabilities,
    and returns a dataframe that merges predictions with original features.

    The returned frame (test rows only, 0-based index) contains the
    unscaled feature columns, the scaled copies (``<col>_scaled``),
    ``TrueLabel``, ``PredictedChurn`` and ``ChurnProbability``.
    NOTE(review): ``model_out`` / ``csv_out`` are currently unused —
    the persistence calls below are intentionally commented out.
    """
    features = df[feature_cols]
    target = df[target_col]

    # Stratified 70/30 holdout with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42, stratify=target
    )

    # Standardize: fit on train only, then apply the same transform to test.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    model.fit(train_scaled, y_train)
    # joblib.dump(model, model_out)  # persistence disabled

    # Re-wrap the scaled test matrix as a DataFrame keyed by the test index
    # so predictions and labels stay aligned by row.
    scaled_test_df = pd.DataFrame(
        test_scaled,
        columns=[f"{c}_scaled" for c in feature_cols],
        index=y_test.index,
    )

    df_preds = scaled_test_df.copy()
    df_preds["TrueLabel"] = y_test
    df_preds = df_preds.join(
        pd.Series(model.predict(test_scaled), index=scaled_test_df.index, name="PredictedChurn")
    ).join(
        pd.Series(model.predict_proba(test_scaled)[:, 1], index=scaled_test_df.index, name="ChurnProbability")
    )

    # Attach the original unscaled features for interpretability; both sides
    # are in the same row order, so a positional concat is safe.
    original_features_test = df.loc[scaled_test_df.index, feature_cols]
    predictions_with_features = pd.concat(
        [original_features_test.reset_index(drop=True), df_preds.reset_index(drop=True)],
        axis=1,
    )
    # predictions_with_features.to_csv(csv_out, index=False)  # persistence disabled
    return predictions_with_features
def save_model(model, path):
    """Persist *model* to *path* via joblib.

    Creates the parent directory first: the default output paths used in
    this module (e.g. ``models/rf_app.pkl``) point into subdirectories
    that may not exist yet, and ``joblib.dump`` would otherwise raise
    ``FileNotFoundError``.
    """
    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    joblib.dump(model, path)
def load_model(path):
    """Deserialize and return a model previously saved with joblib."""
    return joblib.load(path)