import marimo

__generated_with = "0.11.20"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    mo.md(r"""# Customer Churn Analysis""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl


@app.cell
def _(pl):
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)


@app.cell
def _(df):
    df.head()
    return


@app.cell
def _(df, pl):
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33

    # Join each pair of indicator columns with "_" so that contract and
    # internet service each become a single categorical feature.
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )

    X, y = df2.select(num_features + cat_features), df2.select(["churn"])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )

    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )

    # One pipeline per model: the shared preprocessor followed by the estimator.
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)

    # Fit every pipeline on the training split and predict the held-out test split.
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
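# A single train/test split can be noisy when comparing this many models. The
# cell below is an optional, hedged sketch of how a couple of the pipelines
# defined above could instead be compared with k-fold cross-validation. It
# reuses `X`, `y`, `log_pipe`, and `vot_pipe` from the cell above; the
# cv=5 and scoring="accuracy" choices are assumptions, not part of the
# original analysis.
@app.cell
def _(X, log_pipe, vot_pipe, y):
    from sklearn.model_selection import cross_val_score

    # Flatten the single-column target so scikit-learn sees a 1-d label array.
    y_flat = y["churn"].to_numpy()

    cv_scores = {
        "LogisticRegression": cross_val_score(
            log_pipe, X, y_flat, cv=5, scoring="accuracy"
        ),
        "VotingClassifier": cross_val_score(
            vot_pipe, X, y_flat, cv=5, scoring="accuracy"
        ),
    }
    {name: scores.mean() for name, scores in cv_scores.items()}
    return (cv_scores,)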
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )

    # Note: all metrics below are computed from hard class predictions;
    # ROC-AUC and log loss are usually computed from predicted probabilities.
    mo.md(f"""
    # Model Metrics

    ## Logistic Regression
    - Accuracy: {accuracy_score(y_test, log_pred)}
    - Precision: {precision_score(y_test, log_pred)}
    - Recall: {recall_score(y_test, log_pred)}
    - F1: {f1_score(y_test, log_pred)}
    - ROC-AUC: {roc_auc_score(y_test, log_pred)}
    - Log Loss: {log_loss(y_test, log_pred)}

    ## Ridge Classifier
    - Accuracy: {accuracy_score(y_test, ridge_pred)}
    - Precision: {precision_score(y_test, ridge_pred)}
    - Recall: {recall_score(y_test, ridge_pred)}
    - F1: {f1_score(y_test, ridge_pred)}
    - ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
    - Log Loss: {log_loss(y_test, ridge_pred)}

    ## SGD Classifier
    - Accuracy: {accuracy_score(y_test, sgd_pred)}
    - Precision: {precision_score(y_test, sgd_pred)}
    - Recall: {recall_score(y_test, sgd_pred)}
    - F1: {f1_score(y_test, sgd_pred)}
    - ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
    - Log Loss: {log_loss(y_test, sgd_pred)}

    ## Bayesian Ridge Regression
    - Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
    - Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
    - Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
    - R^2: {r2_score(y_test, bridge_pred)}
    - Explained Variance: {explained_variance_score(y_test, bridge_pred)}

    ## Quadratic Discriminant Analysis
    - Accuracy: {accuracy_score(y_test, lda_pred)}
    - Precision: {precision_score(y_test, lda_pred)}
    - Recall: {recall_score(y_test, lda_pred)}
    - F1: {f1_score(y_test, lda_pred)}
    - ROC-AUC: {roc_auc_score(y_test, lda_pred)}
    - Log Loss: {log_loss(y_test, lda_pred)}

    ## Bernoulli Naive Bayes
    - Accuracy: {accuracy_score(y_test, bnb_pred)}
    - Precision: {precision_score(y_test, bnb_pred)}
    - Recall: {recall_score(y_test, bnb_pred)}
    - F1: {f1_score(y_test, bnb_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
    - Log Loss: {log_loss(y_test, bnb_pred)}

    ## C-Support Vector Classifier
    - Accuracy: {accuracy_score(y_test, svc_pred)}
    - Precision: {precision_score(y_test, svc_pred)}
    - Recall: {recall_score(y_test, svc_pred)}
    - F1: {f1_score(y_test, svc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, svc_pred)}
    - Log Loss: {log_loss(y_test, svc_pred)}

    ## Decision Tree Classifier
    - Accuracy: {accuracy_score(y_test, dtree_pred)}
    - Precision: {precision_score(y_test, dtree_pred)}
    - Recall: {recall_score(y_test, dtree_pred)}
    - F1: {f1_score(y_test, dtree_pred)}
    - ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
    - Log Loss: {log_loss(y_test, dtree_pred)}

    ## Random Forest Classifier
    - Accuracy: {accuracy_score(y_test, rfc_pred)}
    - Precision: {precision_score(y_test, rfc_pred)}
    - Recall: {recall_score(y_test, rfc_pred)}
    - F1: {f1_score(y_test, rfc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
    - Log Loss: {log_loss(y_test, rfc_pred)}

    ## K Neighbors Classifier
    - Accuracy: {accuracy_score(y_test, knc_pred)}
    - Precision: {precision_score(y_test, knc_pred)}
    - Recall: {recall_score(y_test, knc_pred)}
    - F1: {f1_score(y_test, knc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, knc_pred)}
    - Log Loss: {log_loss(y_test, knc_pred)}

    ## Gradient Boosting Classifier
    - Accuracy: {accuracy_score(y_test, gbc_pred)}
    - Precision: {precision_score(y_test, gbc_pred)}
    - Recall: {recall_score(y_test, gbc_pred)}
    - F1: {f1_score(y_test, gbc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
    - Log Loss: {log_loss(y_test, gbc_pred)}

    ## Voting Classifier
    - Accuracy: {accuracy_score(y_test, vot_pred)}
    - Precision: {precision_score(y_test, vot_pred)}
    - Recall: {recall_score(y_test, vot_pred)}
    - F1: {f1_score(y_test, vot_pred)}
    - ROC-AUC: {roc_auc_score(y_test, vot_pred)}
    - Log Loss: {log_loss(y_test, vot_pred)}

    ## Bagging Classifier
    - Accuracy: {accuracy_score(y_test, bag_pred)}
    - Precision: {precision_score(y_test, bag_pred)}
    - Recall: {recall_score(y_test, bag_pred)}
    - F1: {f1_score(y_test, bag_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bag_pred)}
    - Log Loss: {log_loss(y_test, bag_pred)}

    {
        mo.callout(
            "From these metrics, the Quadratic Discriminant Analysis and the "
            "Decision Tree Classifier perform best, so they were chosen as the "
            "estimators for the Voting Classifier.",
            kind="info",
        )
    }
    """)
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
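# `altair` is imported at the top of the notebook but never used. The cell
# below is a hedged sketch that turns the accuracy numbers above into a simple
# bar chart so the classifiers can be compared at a glance; the display labels
# and the choice to chart only accuracy are assumptions made for illustration.
@app.cell
def _(
    accuracy_score,
    alt,
    bag_pred,
    bnb_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    pl,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    # One accuracy per classifier; the Bayesian Ridge regressor is left out
    # because it does not produce class labels.
    accuracy_by_model = pl.DataFrame(
        {
            "model": [
                "Logistic Regression",
                "Ridge",
                "SGD",
                "QDA",
                "Bernoulli NB",
                "SVC",
                "Decision Tree",
                "Random Forest",
                "KNN",
                "Gradient Boosting",
                "Voting",
                "Bagging",
            ],
            "accuracy": [
                accuracy_score(y_test, pred)
                for pred in [
                    log_pred,
                    ridge_pred,
                    sgd_pred,
                    lda_pred,
                    bnb_pred,
                    svc_pred,
                    dtree_pred,
                    rfc_pred,
                    knc_pred,
                    gbc_pred,
                    vot_pred,
                    bag_pred,
                ]
            ],
        }
    )
    alt.Chart(accuracy_by_model).mark_bar().encode(
        x=alt.X("model", sort="-y"),
        y="accuracy",
    )
    return (accuracy_by_model,)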
@app.cell
def _(mo):
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )
    mo.vstack(user_inputs.values())
    return (user_inputs,)
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    contract = None
    service = None

    # Map the dropdown choices back onto the "<value>_<value>" category strings
    # that were engineered for the training data.
    match user_inputs["contract"].value:
        case "None":
            contract = "false_false"
        case "One":
            contract = "true_false"
        case "Two":
            contract = "false_true"
        case _:
            pass

    match user_inputs["service"].value:
        case "None":
            service = "false_false"
        case "Basic":
            service = "true_false"
        case "Fiber Optic":
            service = "false_true"
        case _:
            pass

    preds = pl.DataFrame(
        {
            "tenure": user_inputs["tenure"].value,
            "monthly_charges": user_inputs["monthly_charges"].value,
            "total_charges": user_inputs["total_charges"].value,
            "contract_One Two year": contract,
            "internet_service_Fiber No": service,
        }
    )
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    # Report the probability of the predicted class.
    churned = bool(prediction[0][0])
    predicted_probability = prediction[1][0][1] if churned else prediction[1][0][0]
    mo.md(
        f"Prediction: {'Yes' if churned else 'No'}, "
        f"with about {predicted_probability * 100:.2f}% probability."
    )
    return contract, prediction, preds, service


if __name__ == "__main__":
    app.run()