# Source: computerscience-person — "Add predictor portion." (commit 6df7a65)
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.11.20"
app = marimo.App(width="medium")
@app.cell
def _(mo):
    # Notebook title, rendered as markdown.
    mo.md(r"""# Customer Churn Analysis""")
    return
@app.cell
def _():
    # Core notebook dependencies: marimo for UI/markdown, polars for data
    # frames, altair for plotting (exported for use in other cells).
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl
@app.cell
def _(pl):
    # Load the churn dataset directly from the Hugging Face Hub (requires
    # network access).  The cell displays the summary statistics table.
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)
@app.cell
def _(df):
    # Preview the first rows of the raw dataset.
    df.head()
    return
@app.cell
def _(df, pl):
    # Feature engineering, model training, and test-set prediction.
    # Fuses the paired one-hot contract / internet-service columns into
    # single categorical features, splits the data, builds one shared
    # preprocessing transformer, then fits a collection of classifiers
    # (plus one Bayesian ridge regressor) and collects each model's
    # predictions on the held-out test set.
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33  # fixed seed so the split and models are reproducible

    # Concatenate each pair of indicator columns into one string category
    # (e.g. "true_false") so the pair is encoded as a single feature.
    # NOTE(review): assumes the indicator columns hold strings — polars
    # "+" here is string concatenation; confirm against the CSV schema.
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )
    X, y = df2.select(num_features + cat_features), df2.select(["churn"])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )

    # Shared preprocessing: one-hot encode the fused categoricals and
    # scale the numeric columns to [0, 1].
    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    # Bare estimators reused by the pipelines below.
    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )

    # One preprocessing+estimator pipeline per model.
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    # NOTE(review): named "lda" but this is Quadratic Discriminant Analysis.
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    # Soft-voting ensemble of the two best standalone models (QDA and the
    # decision tree), weighted 5:2 in favour of QDA.
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)

    # Fit every pipeline on the training split, predict the test split.
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    # BUG FIX: this previously fitted/predicted with dtree_pipe, so the
    # reported "Random Forest" metrics were really the decision tree's.
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    # Render one metrics section per fitted model.  Classification models
    # get accuracy/precision/recall/F1/ROC-AUC/log-loss; the Bayesian
    # ridge regressor gets regression metrics instead.
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )
    # NOTE(review): roc_auc_score and log_loss are computed on the hard
    # 0/1 predictions rather than probabilities/decision scores, so these
    # two figures are coarse — confirm whether probability-based scoring
    # was intended.
    mo.md(f"""
# Model Metrics
## Logistic Regression
- Accuracy: {accuracy_score(y_test, log_pred)}
- Precision: {precision_score(y_test, log_pred)}
- Recall: {recall_score(y_test, log_pred)}
- F1: {f1_score(y_test, log_pred)}
- ROC-AUC: {roc_auc_score(y_test, log_pred)}
- Log Loss: {log_loss(y_test, log_pred)}
## Ridge Classifier
- Accuracy: {accuracy_score(y_test, ridge_pred)}
- Precision: {precision_score(y_test, ridge_pred)}
- Recall: {recall_score(y_test, ridge_pred)}
- F1: {f1_score(y_test, ridge_pred)}
- ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
- Log Loss: {log_loss(y_test, ridge_pred)}
## SGD Classifier
- Accuracy: {accuracy_score(y_test, sgd_pred)}
- Precision: {precision_score(y_test, sgd_pred)}
- Recall: {recall_score(y_test, sgd_pred)}
- F1: {f1_score(y_test, sgd_pred)}
- ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
- Log Loss: {log_loss(y_test, sgd_pred)}
## Bayesian Ridge Regression
- Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
- Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
- Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
- R^2: {r2_score(y_test, bridge_pred)}
- Explained Variance: {explained_variance_score(y_test, bridge_pred)}
## Quadratic Discriminant Analysis
- Accuracy: {accuracy_score(y_test, lda_pred)}
- Precision: {precision_score(y_test, lda_pred)}
- Recall: {recall_score(y_test, lda_pred)}
- F1: {f1_score(y_test, lda_pred)}
- ROC-AUC: {roc_auc_score(y_test, lda_pred)}
- Log Loss: {log_loss(y_test, lda_pred)}
## Bernoulli Naive Bayes
- Accuracy: {accuracy_score(y_test, bnb_pred)}
- Precision: {precision_score(y_test, bnb_pred)}
- Recall: {recall_score(y_test, bnb_pred)}
- F1: {f1_score(y_test, bnb_pred)}
- ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
- Log Loss: {log_loss(y_test, bnb_pred)}
## C-Support Vector Classifier
- Accuracy: {accuracy_score(y_test, svc_pred)}
- Precision: {precision_score(y_test, svc_pred)}
- Recall: {recall_score(y_test, svc_pred)}
- F1: {f1_score(y_test, svc_pred)}
- ROC-AUC: {roc_auc_score(y_test, svc_pred)}
- Log Loss: {log_loss(y_test, svc_pred)}
## Decision Tree Classifier
- Accuracy: {accuracy_score(y_test, dtree_pred)}
- Precision: {precision_score(y_test, dtree_pred)}
- Recall: {recall_score(y_test, dtree_pred)}
- F1: {f1_score(y_test, dtree_pred)}
- ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
- Log Loss: {log_loss(y_test, dtree_pred)}
## Random Forest Classifier
- Accuracy: {accuracy_score(y_test, rfc_pred)}
- Precision: {precision_score(y_test, rfc_pred)}
- Recall: {recall_score(y_test, rfc_pred)}
- F1: {f1_score(y_test, rfc_pred)}
- ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
- Log Loss: {log_loss(y_test, rfc_pred)}
## K Neighbors Classifier
- Accuracy: {accuracy_score(y_test, knc_pred)}
- Precision: {precision_score(y_test, knc_pred)}
- Recall: {recall_score(y_test, knc_pred)}
- F1: {f1_score(y_test, knc_pred)}
- ROC-AUC: {roc_auc_score(y_test, knc_pred)}
- Log Loss: {log_loss(y_test, knc_pred)}
## Gradient Boosting Classifier
- Accuracy: {accuracy_score(y_test, gbc_pred)}
- Precision: {precision_score(y_test, gbc_pred)}
- Recall: {recall_score(y_test, gbc_pred)}
- F1: {f1_score(y_test, gbc_pred)}
- ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
- Log Loss: {log_loss(y_test, gbc_pred)}
## Voting Classifier
- Accuracy: {accuracy_score(y_test, vot_pred)}
- Precision: {precision_score(y_test, vot_pred)}
- Recall: {recall_score(y_test, vot_pred)}
- F1: {f1_score(y_test, vot_pred)}
- ROC-AUC: {roc_auc_score(y_test, vot_pred)}
- Log Loss: {log_loss(y_test, vot_pred)}
## Bagging Classifier
- Accuracy: {accuracy_score(y_test, bag_pred)}
- Precision: {precision_score(y_test, bag_pred)}
- Recall: {recall_score(y_test, bag_pred)}
- F1: {f1_score(y_test, bag_pred)}
- ROC-AUC: {roc_auc_score(y_test, bag_pred)}
- Log Loss: {log_loss(y_test, bag_pred)}
{
mo.callout(
"From the metrics, the Quadratic Discriminant Analysis and the Decision Tree Classifier perform the best, thus, they were chosen for the Voting Classifier",
kind="info",
)
}
""")
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
@app.cell
def _(mo):
    # Interactive widgets for the churn predictor.  The numeric ranges are
    # presumably chosen to match the training data's spans — TODO confirm
    # against df.describe().  The dropdown options are mapped back onto
    # the fused categorical features in the prediction cell.
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )
    # Render the widgets stacked vertically as this cell's output.
    mo.vstack(user_inputs.values())
    return (user_inputs,)
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    # Turn the widget values into a single-row frame matching the training
    # feature columns, then run the voting-classifier pipeline on it.
    #
    # The dropdown labels are mapped back onto the fused "true_false"-style
    # categories produced during feature engineering.  An unselected
    # dropdown yields None (same fall-through as the original match/case).
    _contract_map = {
        "None": "false_false",
        "One": "true_false",
        "Two": "false_true",
    }
    _service_map = {
        "None": "false_false",
        "Basic": "true_false",
        "Fiber Optic": "false_true",
    }
    contract = _contract_map.get(user_inputs["contract"].value)
    service = _service_map.get(user_inputs["service"].value)
    # NOTE(review): if either dropdown is unselected, contract/service stay
    # None and the encoder in vot_pipe will raise — consider mo.stop here.
    preds = pl.DataFrame({
        "tenure": user_inputs["tenure"].value,
        "monthly_charges": user_inputs["monthly_charges"].value,
        "total_charges": user_inputs["total_charges"].value,
        "contract_One Two year": contract,
        "internet_service_Fiber No": service,
    })
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))
    # BUG FIX: the original f-string nested same-quote string literals
    # ({"Yes" if ...}), which is a SyntaxError on Python < 3.12 (PEP 701).
    # Build the pieces with named intermediates instead.
    _label = "Yes" if prediction[0][0] else "No"
    # Probability of the predicted class: column 1 for churn, 0 otherwise.
    _proba = (
        prediction[1][0][1] if prediction[0][0] else prediction[1][0][0]
    ) * 100
    mo.md(f"Prediction: {_label}, with about {_proba:.2f}% probability.")
    return contract, prediction, preds, service
if __name__ == "__main__":
    # Run the notebook as a standalone marimo app.
    app.run()