import marimo

__generated_with = "0.11.20"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    mo.md(r"""# Customer Churn Analysis""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl


@app.cell
def _(pl):
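    # Load the customer churn dataset from the Hugging Face Hub.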
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)


@app.cell
def _(df):
    df.head()
    return


@app.cell
def _(df, pl):
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split
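
    # Numeric features feed the scaler; the two engineered categorical
    # columns (built below) feed the one-hot encoder.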
    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33
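
    # Merge each pair of one-hot indicator columns into a single string
    # column (e.g. "true_false"), so one categorical feature captures all
    # three contract / internet-service levels.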
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )
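
    # Hold out 32% of the rows for evaluation.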
    X, y = df2.select(num_features + cat_features), df2.select(["churn"])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )
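
    # Shared preprocessing: one-hot encode the categoricals, min-max scale
    # the numerics.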
    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )
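
    # Estimators that are reused across several pipelines below.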
    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )
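
    # One preprocessing + model pipeline per candidate. Note that lda_pipe
    # wraps QuadraticDiscriminantAnalysis despite its name. The voting
    # ensemble combines the two strongest candidates (QDA and the decision
    # tree) with soft voting, weighted 5:2.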
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)
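
    # Fit every pipeline on the training split and predict the held-out rows.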
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )


@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )
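
    # Note: the ROC-AUC and log-loss figures below are computed from hard
    # class predictions rather than predicted probabilities.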
    mo.md(f"""
    # Model Metrics

    ## Logistic Regression

    - Accuracy: {accuracy_score(y_test, log_pred)}
    - Precision: {precision_score(y_test, log_pred)}
    - Recall: {recall_score(y_test, log_pred)}
    - F1: {f1_score(y_test, log_pred)}
    - ROC-AUC: {roc_auc_score(y_test, log_pred)}
    - Log Loss: {log_loss(y_test, log_pred)}

    ## Ridge Classifier

    - Accuracy: {accuracy_score(y_test, ridge_pred)}
    - Precision: {precision_score(y_test, ridge_pred)}
    - Recall: {recall_score(y_test, ridge_pred)}
    - F1: {f1_score(y_test, ridge_pred)}
    - ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
    - Log Loss: {log_loss(y_test, ridge_pred)}

    ## SGD Classifier

    - Accuracy: {accuracy_score(y_test, sgd_pred)}
    - Precision: {precision_score(y_test, sgd_pred)}
    - Recall: {recall_score(y_test, sgd_pred)}
    - F1: {f1_score(y_test, sgd_pred)}
    - ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
    - Log Loss: {log_loss(y_test, sgd_pred)}

    ## Bayesian Ridge Regression

    - Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
    - Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
    - Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
    - R^2: {r2_score(y_test, bridge_pred)}
    - Explained Variance: {explained_variance_score(y_test, bridge_pred)}

    ## Quadratic Discriminant Analysis

    - Accuracy: {accuracy_score(y_test, lda_pred)}
    - Precision: {precision_score(y_test, lda_pred)}
    - Recall: {recall_score(y_test, lda_pred)}
    - F1: {f1_score(y_test, lda_pred)}
    - ROC-AUC: {roc_auc_score(y_test, lda_pred)}
    - Log Loss: {log_loss(y_test, lda_pred)}

    ## Bernoulli Naive Bayes

    - Accuracy: {accuracy_score(y_test, bnb_pred)}
    - Precision: {precision_score(y_test, bnb_pred)}
    - Recall: {recall_score(y_test, bnb_pred)}
    - F1: {f1_score(y_test, bnb_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
    - Log Loss: {log_loss(y_test, bnb_pred)}

    ## C-Support Vector Classifier

    - Accuracy: {accuracy_score(y_test, svc_pred)}
    - Precision: {precision_score(y_test, svc_pred)}
    - Recall: {recall_score(y_test, svc_pred)}
    - F1: {f1_score(y_test, svc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, svc_pred)}
    - Log Loss: {log_loss(y_test, svc_pred)}

    ## Decision Tree Classifier

    - Accuracy: {accuracy_score(y_test, dtree_pred)}
    - Precision: {precision_score(y_test, dtree_pred)}
    - Recall: {recall_score(y_test, dtree_pred)}
    - F1: {f1_score(y_test, dtree_pred)}
    - ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
    - Log Loss: {log_loss(y_test, dtree_pred)}

    ## Random Forest Classifier

    - Accuracy: {accuracy_score(y_test, rfc_pred)}
    - Precision: {precision_score(y_test, rfc_pred)}
    - Recall: {recall_score(y_test, rfc_pred)}
    - F1: {f1_score(y_test, rfc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
    - Log Loss: {log_loss(y_test, rfc_pred)}

    ## K Neighbors Classifier

    - Accuracy: {accuracy_score(y_test, knc_pred)}
    - Precision: {precision_score(y_test, knc_pred)}
    - Recall: {recall_score(y_test, knc_pred)}
    - F1: {f1_score(y_test, knc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, knc_pred)}
    - Log Loss: {log_loss(y_test, knc_pred)}

    ## Gradient Boosting Classifier

    - Accuracy: {accuracy_score(y_test, gbc_pred)}
    - Precision: {precision_score(y_test, gbc_pred)}
    - Recall: {recall_score(y_test, gbc_pred)}
    - F1: {f1_score(y_test, gbc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
    - Log Loss: {log_loss(y_test, gbc_pred)}

    ## Voting Classifier

    - Accuracy: {accuracy_score(y_test, vot_pred)}
    - Precision: {precision_score(y_test, vot_pred)}
    - Recall: {recall_score(y_test, vot_pred)}
    - F1: {f1_score(y_test, vot_pred)}
    - ROC-AUC: {roc_auc_score(y_test, vot_pred)}
    - Log Loss: {log_loss(y_test, vot_pred)}

    ## Bagging Classifier

    - Accuracy: {accuracy_score(y_test, bag_pred)}
    - Precision: {precision_score(y_test, bag_pred)}
    - Recall: {recall_score(y_test, bag_pred)}
    - F1: {f1_score(y_test, bag_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bag_pred)}
    - Log Loss: {log_loss(y_test, bag_pred)}

    {
        mo.callout(
            "From the metrics, the Quadratic Discriminant Analysis and the Decision Tree Classifier perform best, so they were chosen for the Voting Classifier.",
            kind="info",
        )
    }
    """)
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )


@app.cell
def _(mo):
    # Interactive inputs for a what-if churn prediction.
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )

    mo.vstack(user_inputs.values())
    return (user_inputs,)


@app.cell
def _(mo, pl, user_inputs, vot_pipe):
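    # Map the dropdown selections back onto the engineered string encodings
    # ("<One year>_<Two year>" and "<Fiber optic>_<No>") used during training.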
    contract = None
    service = None

    match user_inputs["contract"].value:
        case "None":
            contract = "false_false"
        case "One":
            contract = "true_false"
        case "Two":
            contract = "false_true"
        case _:
            pass

    match user_inputs["service"].value:
        case "None":
            service = "false_false"
        case "Basic":
            service = "true_false"
        case "Fiber Optic":
            service = "false_true"
        case _:
            pass

    # Single-row frame matching the training schema.
    preds = pl.DataFrame({
        "tenure": user_inputs["tenure"].value,
        "monthly_charges": user_inputs["monthly_charges"].value,
        "total_charges": user_inputs["total_charges"].value,
        "contract_One Two year": contract,
        "internet_service_Fiber No": service,
    })

    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    mo.md(
        f"Prediction: {'Yes' if prediction[0][0] else 'No'}, with about "
        f"{(prediction[1][0][1] if prediction[0][0] else prediction[1][0][0]) * 100:.2f}% "
        "probability."
    )
    return contract, prediction, preds, service


if __name__ == "__main__":
    app.run()