Spaces:

SandyTheAdventurer
/

PredictCustomerChurn

Sleeping

File size: 9,058 Bytes

import gradio as gr
import main
import numpy as np
import pandas as pd
from main import clf_rf, clf_log, accuracy_score_rf, accuracy_score_lr, brier_score_rf, brier_score_lr, roc_rf, roc_lr, logistic, rf_clf, encoder, scaler

def eda(Graphs):
    """Return the exploratory-data-analysis plot for the selected label.

    Args:
        Graphs: Label chosen in the "EDA Graphs" radio group.

    Returns:
        A ``gr.Image`` built from the matching PNG path, or ``None`` when the
        label is not one of the known graph names.
    """
    # Label -> image path. Kept as an explicit table because the file names
    # are not derivable from the labels ("Customer Churn" -> Churn.png,
    # "Tenure" -> tenure.png).
    image_paths = {
        "Customer Churn": "graphs/EDAGraphs/Churn.png",
        "Contract": "graphs/EDAGraphs/Contract.png",
        "Dependents": "graphs/EDAGraphs/Dependents.png",
        "Device Protection": "graphs/EDAGraphs/DeviceProtection.png",
        "Heatmap": "graphs/EDAGraphs/Heatmap.png",
        "Monthly Charges": "graphs/EDAGraphs/MonthlyCharges.png",
        "Online Backup": "graphs/EDAGraphs/OnlineBackup.png",
        "Online Security": "graphs/EDAGraphs/OnlineSecurity.png",
        "Paperless Billing": "graphs/EDAGraphs/PaperlessBilling.png",
        "Partner": "graphs/EDAGraphs/Partner.png",
        "Payment Method": "graphs/EDAGraphs/PaymentMethod.png",
        "Senior Citizen": "graphs/EDAGraphs/SeniorCitizen.png",
        "Tech Support": "graphs/EDAGraphs/TechSupport.png",
        "Tenure": "graphs/EDAGraphs/tenure.png",
    }

    image_path = image_paths.get(Graphs)
    if image_path is None:
        return None
    return gr.Image(image_path)

def result(Graphs):
    """Return the output plot and its written inferences for a selection.

    Args:
        Graphs: Label chosen in the "Output Graphs" radio group.

    Returns:
        A two-item list ``[gr.Image, str]`` holding the plot and the
        inference text for a known label, or ``None`` when the label is not
        recognised.
    """
    # The inference texts are shown to the user verbatim, so their exact
    # layout (left-aligned header, indented bullets, blank lines) is kept.
    if Graphs == "Correlation":
        image_path = "graphs/EDAGraphs/Correlation.png"
        notes = """
Inferences:

    - Churn is perfectly correlated with itself (1.0).

    - Tenure and Contract have strong negative correlation with Churn.

    - MonthlyCharges and PaperlessBilling have moderate positive correlation with Churn.

    - Gender, PhoneService, and MultipleLines have near-zero correlation.

    - OnlineSecurity, TechSupport, and DeviceProtection have moderate negative correlation.
"""
    elif Graphs == "Confusion Matrix for Random Forest":
        image_path = "graphs/OutputGraphs/ConfusionMatrixRandomForest.png"
        notes = """
Inferences:

    - True Positives (181) are lower than Logistic Regression’s (215), indicating worse recall for churn.

    - False Negatives (192) are higher – more churners are missed.

    - False Positives (104) are slightly higher than logistic regression’s — marginally more false alarms.

    - True Negatives (932) are comparable – performs similarly for non-churn cases.

    - Overall: Slightly poorer at identifying churn compared to Logistic Regression in this case.

"""
    elif Graphs == "Confusion Matrix for Logistic regression":
        image_path = "graphs/OutputGraphs/ConfusionMatrixLogistic.png"
        notes = """
Inferences:

    - True Negatives (936) and True Positives (215) indicate good performance in correctly identifying both classes.

    - False Negatives (158): A moderate number of actual churns are missed, which could be critical in business decisions.

    - False Positives (100): Some non-churning customers are predicted to churn — might lead to unnecessary retention efforts.

    - Model Bias: Slight bias towards predicting the majority class (non-churn).

    - Overall: Reasonable balance, but recall for churn could be improved.

"""
    elif Graphs == "SHAP analysis for Random Forest":
        image_path = "graphs/OutputGraphs/SHAP_RandomForest_Summary.png"
        notes = """
Inferences:

    - Bimodal Distribution: Indicates two clear groups — likely senior citizens and non-seniors with distinct behavior patterns.

    - Near-Zero SHAP values: SeniorCitizen has limited standalone predictive power.

    - Interaction: The variable may influence predictions when combined with other features (e.g., internet service or contract).

    - Red vs Blue Dots: Represents SHAP values across classes — they’re symmetric, confirming weak influence.

    - Overall: SeniorCitizen is not a key predictor by itself in this model.
"""
    elif Graphs == "SHAP analysis for Logistic Regression":
        image_path = "graphs/OutputGraphs/SHAP_Logistic_Summary.png"
        notes = """
Inferences:

    - Top Feature: tenure is the most influential in predicting churn — lower tenure likely increases churn risk.

    - MonthlyCharges & Contract also have strong effects — customers on monthly or expensive plans may churn more.

    - Security-related services (e.g., OnlineSecurity, TechSupport) have moderate influence — presence may reduce churn.
    
    - PaperlessBilling and OnlineBackup show notable contributions, possibly associated with digital-savvy customers.

    - Features like DeviceProtection, Partner, and PaymentMethod have minimal impact.
"""
    else:
        # Unknown selection: nothing to show.
        return None

    return [gr.Image(image_path), notes]

def metrics(Algorithms):
    """Build the performance tables for the selected algorithm.

    Args:
        Algorithms: "Random Forest" or "Logistic Regression".

    Returns:
        A tuple ``(report_table, score_table)`` of ``gr.DataFrame``
        components: the first holds the per-class rows of the report
        imported from ``main`` (``clf_rf`` / ``clf_log``), the second a
        single row with the accuracy, Brier and ROC scores. Returns ``None``
        for any other selection.
    """
    # Pick the inputs for the chosen model; both models are rendered the
    # same way below.
    if Algorithms == "Random Forest":
        report = clf_rf
        scores = [accuracy_score_rf, brier_score_rf, roc_rf]
    elif Algorithms == "Logistic Regression":
        report = clf_log
        scores = [accuracy_score_lr, brier_score_lr, roc_lr]
    else:
        return None

    # Drop the aggregate columns, then transpose so each remaining class
    # becomes a row, with its label exposed as a regular "class" column.
    per_class = pd.DataFrame(report)
    per_class = per_class.drop(columns=['accuracy', 'macro avg', 'weighted avg'])
    per_class = per_class.T.reset_index().rename(columns={'index': 'class'})

    report_table = gr.DataFrame(value=per_class)
    score_table = gr.DataFrame(
        headers=['Accuracy Score', 'Brier Score', 'ROC Score'],
        value=[scores],
    )
    return report_table, score_table

def predictChurn(model, filename):
    """Predict churn for every customer in an uploaded CSV file.

    Args:
        model: Name of the estimator to use, "Logistic Regression" or
            "Random Forest".
        filename: The uploaded CSV, passed straight to ``pd.read_csv``. It
            must contain a ``customerID`` column and the columns dropped
            below; the remaining columns are fed to the estimator.

    Returns:
        A ``gr.DataFrame`` with one row per customer and the columns
        "Customer ID" and "Output" ("Churn" or "No Churn").

    Raises:
        gr.Error: If no model is selected or no file has been uploaded, so
            the user sees a readable message instead of a stack trace.
        KeyError: If an expected column is missing from the CSV, or the
            estimator predicts a label other than 0 or 1.
    """
    estimators = {"Logistic Regression": logistic, "Random Forest": rf_clf}
    if model not in estimators:
        raise gr.Error("Please select a model before predicting.")
    if filename is None:
        raise gr.Error("Please upload a dataset (CSV file) before predicting.")
    estimator = estimators[model]

    dataset = pd.read_csv(filename)

    customers = dataset["customerID"]
    dataset = dataset.drop(columns=['customerID'])

    # NOTE(review): the scaler and encoder are re-fitted on the uploaded data
    # for every column, so the transformation depends on the upload rather
    # than on whatever the models were trained with. Kept as-is because the
    # training-time preprocessing in `main` is not visible here -- confirm.
    for column in dataset.select_dtypes(include=['int64', 'float64']).columns:
        dataset[column] = scaler.fit_transform(dataset[column].values.reshape(-1, 1))
    for column in dataset.select_dtypes(include=['object']).columns:
        dataset[column] = encoder.fit_transform(dataset[column])

    dataset = dataset.drop(columns=['gender', 'PhoneService', 'MultipleLines', 'InternetService', 'StreamingTV', 'StreamingMovies', 'TotalCharges'])

    y_predictions = estimator.predict(dataset)

    # Build the label column up front instead of overwriting an integer
    # column cell by cell through chained indexing, which pandas warns about
    # and which does nothing under Copy-on-Write.
    labels = {0: "No Churn", 1: "Churn"}
    df = pd.DataFrame(
        {
            "Customer ID": list(customers),
            "Output": [labels[prediction] for prediction in y_predictions],
        }
    )

    return gr.DataFrame(value=df)

# Build the UI: one tab per feature. Components are created in the order
# they should appear on the page, then wired to the handlers defined above.
with gr.Blocks() as Output:
    gr.Markdown("View Exploratory data Analysis and Output")
    # Tab 1: pick an EDA graph with the radio group; `eda` supplies the image.
    with gr.Tab("EDA Graphs"):
        eda_input = gr.Radio(["Customer Churn", "Contract", "Dependents", "Device Protection", "Heatmap", "Monthly Charges", "Online Backup", "Online Security", "Paperless Billing", "Partner", "Payment Method", "Senior Citizen", "Tech Support", "Tenure"], show_label= False)
        eda_output = gr.Image()

        eda_input.change(fn = eda, inputs= eda_input, outputs= eda_output)

    # Tab 2: model-output graphs; `result` returns an image plus its
    # inference text, matching the two output components below.
    with gr.Tab("Output Graphs"):
        result_input = gr.Radio(["Correlation", "Confusion Matrix for Random Forest", "Confusion Matrix for Logistic regression", "SHAP analysis for Random Forest", "SHAP analysis for Logistic Regression"], show_label = False)

        result_output = [gr.Image(), gr.Markdown()]

        result_input.change(fn = result, inputs=result_input, outputs = result_output)

    # Tab 3: `metrics` returns two tables (per-class report, summary scores).
    with gr.Tab("Performance Metrics"):
        algorithm = gr.Radio(["Random Forest", "Logistic Regression"], show_label= False)

        metrics_output = [gr.DataFrame(), gr.DataFrame()]

        algorithm.change(fn = metrics, inputs = algorithm, outputs = metrics_output)

    # Tab 4: upload a CSV, choose a model and run `predictChurn` on it.
    with gr.Tab("Predict Live"):
        gr.Markdown("# Predict Churn")
        model = gr.Radio(["Logistic Regression", "Random Forest"], show_label = False)
        file = gr.File()
        dataset = gr.UploadButton("Upload Dataset(as CSV file)", file_count = "single")
        predict = gr.Button("Predict")

        op_df = gr.DataFrame(headers = ["Customer ID", "Output"])
        # NOTE(review): this heading is created after the table it names, so
        # it presumably renders below the table -- confirm that is intended.
        op_md = gr.Markdown("# Predicted Churns")

        clear = gr.ClearButton(components = [model, file, dataset, op_df])

        # Echo the uploaded file from the upload button into the `file` component.
        dataset.upload(lambda file: file, dataset, file)
        # The prediction reads the upload button's value, not the `file` component.
        predict.click(fn = predictChurn, inputs = [model, dataset], outputs = op_df)
        # NOTE(review): called with no handler; presumably redundant because
        # the ClearButton was already given its components above -- confirm.
        clear.click()

# Runs on import as well as when executed directly (no __main__ guard).
Output.launch()