import marimo

__generated_with = "0.11.20"
app = marimo.App(width="medium")


@app.cell
def _(mo):
    mo.md(r"""# Customer Churn Analysis""")
    return


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    return alt, mo, pl


@app.cell
def _(pl):
    df = pl.read_csv(
        "hf://datasets/louiecerv/customer_churn/customer_churn_data.csv"
    )
    df.describe()
    return (df,)


@app.cell
def _(df):
    df.head()
    return


@app.cell
def _(df, pl):
    from sklearn.preprocessing import (
        RobustScaler,
        OneHotEncoder,
        MinMaxScaler,
        OrdinalEncoder,
    )
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import make_column_transformer
    from sklearn.linear_model import (
        LogisticRegression,
        BayesianRidge,
        RidgeClassifier,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import (
        VotingClassifier,
        BaggingClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
    from sklearn.model_selection import train_test_split

    num_features = ["tenure", "monthly_charges", "total_charges"]
    cat_features = ["contract_One Two year", "internet_service_Fiber No"]
    random_state = 33

    # Join each pair of indicator columns with "_" so that contract and
    # internet service each become a single categorical feature.
    df2 = df.with_columns(
        (pl.col("contract_One year") + "_" + pl.col("contract_Two year")).alias(
            "contract_One Two year"
        ),
        (
            pl.col("internet_service_Fiber optic")
            + "_"
            + pl.col("internet_service_No")
        ).alias("internet_service_Fiber No"),
    )

    X, y = df2.select(num_features + cat_features), df2.select(["churn"])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.32, random_state=random_state
    )

    preprocessor = make_column_transformer(
        (OneHotEncoder(), cat_features),
        (MinMaxScaler(), num_features),
    )

    knc = KNeighborsClassifier(algorithm="ball_tree")
    dtree = DecisionTreeClassifier(criterion="entropy", random_state=random_state)
    rfc = RandomForestClassifier(
        criterion="entropy", max_features=0.3, random_state=random_state
    )
    gbc = GradientBoostingClassifier(random_state=random_state)
    bag = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.8,
        max_features=0.8,
        random_state=random_state,
    )

    # One pipeline per model: the shared preprocessor followed by the estimator.
    log_pipe = make_pipeline(
        preprocessor, LogisticRegression(max_iter=10000, random_state=random_state)
    )
    bridge_pipe = make_pipeline(preprocessor, BayesianRidge(max_iter=10000))
    ridge_pipe = make_pipeline(
        preprocessor, RidgeClassifier(max_iter=10000, random_state=random_state)
    )
    sgd_pipe = make_pipeline(
        preprocessor,
        SGDClassifier(
            loss="hinge", penalty="l2", max_iter=10000, random_state=random_state
        ),
    )
    lda_pipe = make_pipeline(preprocessor, QuadraticDiscriminantAnalysis())
    bnb_pipe = make_pipeline(preprocessor, BernoulliNB())
    svc_pipe = make_pipeline(
        preprocessor, SVC(kernel="rbf", max_iter=10000, random_state=random_state)
    )
    dtree_pipe = make_pipeline(preprocessor, dtree)
    rfc_pipe = make_pipeline(preprocessor, rfc)
    knc_pipe = make_pipeline(preprocessor, knc)
    gbc_pipe = make_pipeline(preprocessor, gbc)
    vot_pipe = make_pipeline(
        preprocessor,
        VotingClassifier(
            estimators=[
                ("qda", QuadraticDiscriminantAnalysis()),
                ("dtree", dtree),
            ],
            voting="soft",
            weights=[5, 2],
        ),
    )
    bag_pipe = make_pipeline(preprocessor, bag)

    # Fit every pipeline on the training split and predict the held-out test split.
    log_pred = log_pipe.fit(X_train, y_train).predict(X_test)
    bridge_pred = bridge_pipe.fit(X_train, y_train).predict(X_test)
    ridge_pred = ridge_pipe.fit(X_train, y_train).predict(X_test)
    sgd_pred = sgd_pipe.fit(X_train, y_train).predict(X_test)
    lda_pred = lda_pipe.fit(X_train, y_train).predict(X_test)
    bnb_pred = bnb_pipe.fit(X_train, y_train).predict(X_test)
    svc_pred = svc_pipe.fit(X_train, y_train).predict(X_test)
    dtree_pred = dtree_pipe.fit(X_train, y_train).predict(X_test)
    rfc_pred = rfc_pipe.fit(X_train, y_train).predict(X_test)
    knc_pred = knc_pipe.fit(X_train, y_train).predict(X_test)
    gbc_pred = gbc_pipe.fit(X_train, y_train).predict(X_test)
    vot_pred = vot_pipe.fit(X_train, y_train).predict(X_test)
    bag_pred = bag_pipe.fit(X_train, y_train).predict(X_test)
    return (
        BaggingClassifier,
        BayesianRidge,
        BernoulliNB,
        DecisionTreeClassifier,
        GradientBoostingClassifier,
        KNeighborsClassifier,
        LogisticRegression,
        MinMaxScaler,
        OneHotEncoder,
        OrdinalEncoder,
        QuadraticDiscriminantAnalysis,
        RFE,
        RFECV,
        RandomForestClassifier,
        RidgeClassifier,
        RobustScaler,
        SGDClassifier,
        SVC,
        SequentialFeatureSelector,
        VotingClassifier,
        X,
        X_test,
        X_train,
        bag,
        bag_pipe,
        bag_pred,
        bnb_pipe,
        bnb_pred,
        bridge_pipe,
        bridge_pred,
        cat_features,
        df2,
        dtree,
        dtree_pipe,
        dtree_pred,
        gbc,
        gbc_pipe,
        gbc_pred,
        knc,
        knc_pipe,
        knc_pred,
        lda_pipe,
        lda_pred,
        log_pipe,
        log_pred,
        make_column_transformer,
        make_pipeline,
        num_features,
        preprocessor,
        random_state,
        rfc,
        rfc_pipe,
        rfc_pred,
        ridge_pipe,
        ridge_pred,
        sgd_pipe,
        sgd_pred,
        svc_pipe,
        svc_pred,
        train_test_split,
        vot_pipe,
        vot_pred,
        y,
        y_test,
        y_train,
    )
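# A single train/test split can be noisy when comparing this many models. The
# cell below is an optional, hedged sketch of how a couple of the pipelines
# defined above could instead be compared with k-fold cross-validation. It
# reuses `X`, `y`, `log_pipe`, and `vot_pipe` from the cell above; the
# cv=5 and scoring="accuracy" choices are assumptions, not part of the
# original analysis.
@app.cell
def _(X, log_pipe, vot_pipe, y):
    from sklearn.model_selection import cross_val_score

    # Flatten the single-column target so scikit-learn sees a 1-d label array.
    y_flat = y["churn"].to_numpy()

    cv_scores = {
        "LogisticRegression": cross_val_score(
            log_pipe, X, y_flat, cv=5, scoring="accuracy"
        ),
        "VotingClassifier": cross_val_score(
            vot_pipe, X, y_flat, cv=5, scoring="accuracy"
        ),
    }
    {name: scores.mean() for name, scores in cv_scores.items()}
    return (cv_scores,)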
@app.cell
def _(
    bag_pred,
    bnb_pred,
    bridge_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    mo,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        f1_score,
        recall_score,
        roc_auc_score,
        log_loss,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        r2_score,
        explained_variance_score,
    )

    # Note: all metrics below are computed from hard class predictions;
    # ROC-AUC and log loss are usually computed from predicted probabilities.
    mo.md(f"""
    # Model Metrics

    ## Logistic Regression
    - Accuracy: {accuracy_score(y_test, log_pred)}
    - Precision: {precision_score(y_test, log_pred)}
    - Recall: {recall_score(y_test, log_pred)}
    - F1: {f1_score(y_test, log_pred)}
    - ROC-AUC: {roc_auc_score(y_test, log_pred)}
    - Log Loss: {log_loss(y_test, log_pred)}

    ## Ridge Classifier
    - Accuracy: {accuracy_score(y_test, ridge_pred)}
    - Precision: {precision_score(y_test, ridge_pred)}
    - Recall: {recall_score(y_test, ridge_pred)}
    - F1: {f1_score(y_test, ridge_pred)}
    - ROC-AUC: {roc_auc_score(y_test, ridge_pred)}
    - Log Loss: {log_loss(y_test, ridge_pred)}

    ## SGD Classifier
    - Accuracy: {accuracy_score(y_test, sgd_pred)}
    - Precision: {precision_score(y_test, sgd_pred)}
    - Recall: {recall_score(y_test, sgd_pred)}
    - F1: {f1_score(y_test, sgd_pred)}
    - ROC-AUC: {roc_auc_score(y_test, sgd_pred)}
    - Log Loss: {log_loss(y_test, sgd_pred)}

    ## Bayesian Ridge Regression
    - Mean Squared Error: {mean_squared_error(y_test, bridge_pred)}
    - Root Mean Squared Error: {root_mean_squared_error(y_test, bridge_pred)}
    - Mean Absolute Error: {mean_absolute_error(y_test, bridge_pred)}
    - R^2: {r2_score(y_test, bridge_pred)}
    - Explained Variance: {explained_variance_score(y_test, bridge_pred)}

    ## Quadratic Discriminant Analysis
    - Accuracy: {accuracy_score(y_test, lda_pred)}
    - Precision: {precision_score(y_test, lda_pred)}
    - Recall: {recall_score(y_test, lda_pred)}
    - F1: {f1_score(y_test, lda_pred)}
    - ROC-AUC: {roc_auc_score(y_test, lda_pred)}
    - Log Loss: {log_loss(y_test, lda_pred)}

    ## Bernoulli Naive Bayes
    - Accuracy: {accuracy_score(y_test, bnb_pred)}
    - Precision: {precision_score(y_test, bnb_pred)}
    - Recall: {recall_score(y_test, bnb_pred)}
    - F1: {f1_score(y_test, bnb_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bnb_pred)}
    - Log Loss: {log_loss(y_test, bnb_pred)}

    ## C-Support Vector Classifier
    - Accuracy: {accuracy_score(y_test, svc_pred)}
    - Precision: {precision_score(y_test, svc_pred)}
    - Recall: {recall_score(y_test, svc_pred)}
    - F1: {f1_score(y_test, svc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, svc_pred)}
    - Log Loss: {log_loss(y_test, svc_pred)}

    ## Decision Tree Classifier
    - Accuracy: {accuracy_score(y_test, dtree_pred)}
    - Precision: {precision_score(y_test, dtree_pred)}
    - Recall: {recall_score(y_test, dtree_pred)}
    - F1: {f1_score(y_test, dtree_pred)}
    - ROC-AUC: {roc_auc_score(y_test, dtree_pred)}
    - Log Loss: {log_loss(y_test, dtree_pred)}

    ## Random Forest Classifier
    - Accuracy: {accuracy_score(y_test, rfc_pred)}
    - Precision: {precision_score(y_test, rfc_pred)}
    - Recall: {recall_score(y_test, rfc_pred)}
    - F1: {f1_score(y_test, rfc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, rfc_pred)}
    - Log Loss: {log_loss(y_test, rfc_pred)}

    ## K Neighbors Classifier
    - Accuracy: {accuracy_score(y_test, knc_pred)}
    - Precision: {precision_score(y_test, knc_pred)}
    - Recall: {recall_score(y_test, knc_pred)}
    - F1: {f1_score(y_test, knc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, knc_pred)}
    - Log Loss: {log_loss(y_test, knc_pred)}

    ## Gradient Boosting Classifier
    - Accuracy: {accuracy_score(y_test, gbc_pred)}
    - Precision: {precision_score(y_test, gbc_pred)}
    - Recall: {recall_score(y_test, gbc_pred)}
    - F1: {f1_score(y_test, gbc_pred)}
    - ROC-AUC: {roc_auc_score(y_test, gbc_pred)}
    - Log Loss: {log_loss(y_test, gbc_pred)}

    ## Voting Classifier
    - Accuracy: {accuracy_score(y_test, vot_pred)}
    - Precision: {precision_score(y_test, vot_pred)}
    - Recall: {recall_score(y_test, vot_pred)}
    - F1: {f1_score(y_test, vot_pred)}
    - ROC-AUC: {roc_auc_score(y_test, vot_pred)}
    - Log Loss: {log_loss(y_test, vot_pred)}

    ## Bagging Classifier
    - Accuracy: {accuracy_score(y_test, bag_pred)}
    - Precision: {precision_score(y_test, bag_pred)}
    - Recall: {recall_score(y_test, bag_pred)}
    - F1: {f1_score(y_test, bag_pred)}
    - ROC-AUC: {roc_auc_score(y_test, bag_pred)}
    - Log Loss: {log_loss(y_test, bag_pred)}

    {
        mo.callout(
            "From these metrics, the Quadratic Discriminant Analysis and the "
            "Decision Tree Classifier perform best, so they were chosen as the "
            "estimators for the Voting Classifier.",
            kind="info",
        )
    }
    """)
    return (
        accuracy_score,
        explained_variance_score,
        f1_score,
        log_loss,
        mean_absolute_error,
        mean_squared_error,
        precision_score,
        r2_score,
        recall_score,
        roc_auc_score,
        root_mean_squared_error,
    )
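# `altair` is imported at the top of the notebook but never used. The cell
# below is a hedged sketch that turns the accuracy numbers above into a simple
# bar chart so the classifiers can be compared at a glance; the display labels
# and the choice to chart only accuracy are assumptions made for illustration.
@app.cell
def _(
    accuracy_score,
    alt,
    bag_pred,
    bnb_pred,
    dtree_pred,
    gbc_pred,
    knc_pred,
    lda_pred,
    log_pred,
    pl,
    rfc_pred,
    ridge_pred,
    sgd_pred,
    svc_pred,
    vot_pred,
    y_test,
):
    # One accuracy per classifier; the Bayesian Ridge regressor is left out
    # because it does not produce class labels.
    accuracy_by_model = pl.DataFrame(
        {
            "model": [
                "Logistic Regression",
                "Ridge",
                "SGD",
                "QDA",
                "Bernoulli NB",
                "SVC",
                "Decision Tree",
                "Random Forest",
                "KNN",
                "Gradient Boosting",
                "Voting",
                "Bagging",
            ],
            "accuracy": [
                accuracy_score(y_test, pred)
                for pred in [
                    log_pred,
                    ridge_pred,
                    sgd_pred,
                    lda_pred,
                    bnb_pred,
                    svc_pred,
                    dtree_pred,
                    rfc_pred,
                    knc_pred,
                    gbc_pred,
                    vot_pred,
                    bag_pred,
                ]
            ],
        }
    )
    alt.Chart(accuracy_by_model).mark_bar().encode(
        x=alt.X("model", sort="-y"),
        y="accuracy",
    )
    return (accuracy_by_model,)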
@app.cell
def _(mo):
    user_inputs = mo.ui.dictionary(
        {
            "tenure": mo.ui.number(label="Tenure", start=1, stop=72, step=1),
            "monthly_charges": mo.ui.number(
                label="Monthly Charges", start=20, stop=120, step=1
            ),
            "total_charges": mo.ui.number(
                label="Total Charges", start=20, stop=8000, step=1
            ),
            "contract": mo.ui.dropdown(
                label="Contract (Year)", options=["None", "One", "Two"]
            ),
            "service": mo.ui.dropdown(
                label="Service", options=["None", "Basic", "Fiber Optic"]
            ),
        }
    )
    mo.vstack(user_inputs.values())
    return (user_inputs,)
@app.cell
def _(mo, pl, user_inputs, vot_pipe):
    contract = None
    service = None

    # Map the dropdown choices back onto the "<value>_<value>" category strings
    # that were engineered for the training data.
    match user_inputs["contract"].value:
        case "None":
            contract = "false_false"
        case "One":
            contract = "true_false"
        case "Two":
            contract = "false_true"
        case _:
            pass

    match user_inputs["service"].value:
        case "None":
            service = "false_false"
        case "Basic":
            service = "true_false"
        case "Fiber Optic":
            service = "false_true"
        case _:
            pass

    preds = pl.DataFrame(
        {
            "tenure": user_inputs["tenure"].value,
            "monthly_charges": user_inputs["monthly_charges"].value,
            "total_charges": user_inputs["total_charges"].value,
            "contract_One Two year": contract,
            "internet_service_Fiber No": service,
        }
    )
    prediction = (vot_pipe.predict(preds), vot_pipe.predict_proba(preds))

    # Report the probability of the predicted class.
    churned = bool(prediction[0][0])
    predicted_probability = prediction[1][0][1] if churned else prediction[1][0][0]
    mo.md(
        f"Prediction: {'Yes' if churned else 'No'}, "
        f"with about {predicted_probability * 100:.2f}% probability."
    )
    return contract, prediction, preds, service


if __name__ == "__main__":
    app.run()