# Spaces: hhhar / ChurnPredUpdated (Sleeping) - File size: 15,881 Bytes

import streamlit as st
import joblib
import numpy as np
import os
import pandas as pd
import openpyxl
import zipfile
import io

# Preprocessing object loaded from disk; its .transform() is applied to raw
# feature frames before they are given to models that do not bundle their own
# preprocessing step.
preprocessor_path = 'modelExports/preprocessor.pkl'
preprocessor = joblib.load(filename=preprocessor_path)

def find_header_row(df, required_columns, max_rows_to_check=10):
    """Locate the row of ``df`` that contains every required column name.

    Cell values and required names are compared after ``str()`` conversion,
    upper-casing and stripping of surrounding whitespace.

    Args:
        df: DataFrame read without a header (rows are accessed via ``iloc``).
        required_columns: Names that must all appear in the header row.
        max_rows_to_check: Upper bound on the number of leading rows scanned.

    Returns:
        The positional index of the first matching row, or ``-1`` when no
        row among those scanned contains all required names.
    """
    wanted = {name.upper().strip() for name in required_columns}
    rows_to_scan = min(max_rows_to_check, len(df))
    for position in range(rows_to_scan):
        present = {str(cell).upper().strip() for cell in df.iloc[position].values}
        if wanted <= present:
            return position  # every required name occurs in this row
    return -1  # no candidate row matched

def _read_table(uploaded_file, file_extension, **read_kwargs):
    """Read ``uploaded_file`` with the pandas reader matching ``file_extension``."""
    if file_extension == 'csv':
        return pd.read_csv(uploaded_file, **read_kwargs)
    # openpyxl only reads the .xlsx format; for legacy .xls files let pandas
    # pick the engine itself instead of forcing openpyxl (which always fails).
    engine = 'openpyxl' if file_extension == 'xlsx' else None
    return pd.read_excel(uploaded_file, engine=engine, **read_kwargs)


def process_uploaded_file(uploaded_file, required_columns):
    """Load an uploaded CSV/Excel file into a DataFrame with normalized headers.

    The first 10 rows are read without a header and scanned with
    ``find_header_row``; the file is then re-read using the detected row as
    the header. Column labels are converted to upper-case, stripped strings.
    All problems are reported to the user through ``st.error``.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``seek``
            (here, the value returned by ``st.file_uploader``).
        required_columns: Column names that must be present; matching ignores
            case and surrounding whitespace.

    Returns:
        The loaded DataFrame, or ``None`` when the format is unsupported, the
        header row or a required column cannot be found, or reading fails.
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension not in ('csv', 'xlsx', 'xls'):
            st.error("Unsupported file format. Please upload a CSV or Excel file.")
            return None

        # Start from the beginning in case the same upload object was already
        # consumed earlier (e.g. during a previous script run).
        uploaded_file.seek(0)

        # Read the first few rows to check for headers
        preview = _read_table(uploaded_file, file_extension, nrows=10, header=None)
        header_row = find_header_row(preview, required_columns)

        if header_row == -1:
            st.error(f"Required columns not found in the first {len(preview)} rows.")
            st.write("Expected columns:", required_columns)
            st.write("Found data rows:", preview.head().values.tolist())
            return None

        # Re-read the file with the correct header row
        uploaded_file.seek(0)  # Reset file pointer
        df = _read_table(uploaded_file, file_extension, header=header_row)

        # Standardize column names to uppercase and strip spaces. Labels are
        # cast to str first so numeric header cells do not break the .str
        # accessor or turn into NaN.
        df.columns = df.columns.astype(str).str.upper().str.strip()

        # Standardize required columns to uppercase and strip spaces
        required_columns_upper = [col.upper().strip() for col in required_columns]

        # Check if all required columns are present
        missing_columns = [col for col in required_columns_upper if col not in df.columns]
        if missing_columns:
            st.error(f"The following required columns are missing: {missing_columns}")
            return None

        st.write(f"Uploaded data has {df.shape[0]} rows and {df.shape[1]} columns.")

        return df
    except Exception as e:
        # UI boundary: surface any read/parse failure to the user instead of
        # crashing the app.
        st.error(f"Error reading the file: {e}")
        return None

def predict_with_model(model, data, includes_preprocessor):
    """Return ``model.predict`` output for ``data``.

    Args:
        model: Object exposing ``predict``.
        data: Raw input features.
        includes_preprocessor: When truthy, ``data`` is passed to the model
            unchanged; otherwise it is first run through the module-level
            ``preprocessor.transform``.

    Returns:
        Whatever ``model.predict`` returns.
    """
    features = data if includes_preprocessor else preprocessor.transform(data)
    return model.predict(features)

def create_model_output(df, predictions, model_name):
    """Build the per-model result table for a batch of predictions.

    Args:
        df: Source DataFrame; only its ``PID`` column is copied.
        predictions: Predicted labels, one per row of ``df``.
        model_name: Accepted for the caller's convenience; not used here.

    Returns:
        A new DataFrame with columns ``PID``, ``Prediction``,
        ``Churn Probability`` and ``Churn`` ('Yes' when the prediction
        equals 1, otherwise 'No').

    NOTE(review): ``Churn Probability`` is filled with the same values as
    ``Prediction``; no probability is computed in this function.
    """
    def _as_yes_no(value):
        return 'Yes' if value == 1 else 'No'

    result = df.loc[:, ['PID']].copy()  # copy so the source frame is untouched
    for column in ('Prediction', 'Churn Probability'):
        result[column] = predictions
    result['Churn'] = result['Prediction'].apply(_as_yes_no)
    return result

# Load models and record whether they include the preprocessor.
# NOTE: joblib/pickle files can execute arbitrary code while being loaded, so
# the model folder must only ever contain trusted artifacts.
model_folder = 'modelExports'
models = {}
models_with_preprocessor = {}
# os.listdir returns entries in arbitrary order; sort so the model list (and
# therefore the sidebar selection) is the same on every run.
for file_name in sorted(os.listdir(model_folder)):
    if not file_name.endswith('.pkl') or file_name == 'preprocessor.pkl':
        continue

    # Drop only the trailing extension; a plain str.replace would also remove
    # '.pkl' occurring in the middle of a file name.
    model_name = file_name[:-len('.pkl')].replace('_', ' ').upper()
    model = joblib.load(os.path.join(model_folder, file_name))
    models[model_name] = model

    # A model counts as self-preprocessing when it exposes `named_steps`
    # containing a step called 'preprocessor'.
    includes_preprocessor = hasattr(
        model, 'named_steps') and 'preprocessor' in model.named_steps
    models_with_preprocessor[model_name] = includes_preprocessor

# Model accuracies (percent), keyed by display name in the form
# "<FAMILY> WITH SMOTE MODEL" / "<FAMILY> WITHOUT SMOTE MODEL".
# Each table row is (family, accuracy with SMOTE, accuracy without SMOTE).
_ACCURACY_TABLE = (
    ("GAUSSIAN NAIVE BAYES", 86, 85),
    ("GRADIENT BOOSTING", 95, 94),
    ("LINEAR DISCRIMINANT ANALYSIS", 88, 87),
    ("LOGISTIC REGRESSION", 90, 89),
    ("RANDOM FOREST", 95, 93),
    ("SUPPORT VECTOR MACHINE", 91, 90),
    ("ADABOOST", 92, 90),
    ("DECISION TREE", 88, 86),
)
model_accuracies = {
    f"{family} {variant} MODEL": accuracy
    for family, with_smote, without_smote in _ACCURACY_TABLE
    for variant, accuracy in (
        ("WITH SMOTE", with_smote),
        ("WITHOUT SMOTE", without_smote),
    )
}

# Page title of the Streamlit app
st.title('Customer Churn Prediction')

# Sidebar: switch between the single-record form and the batch upload page
st.sidebar.header('Interface Selection')
interface = st.sidebar.radio(
    "Choose an interface",
    ("Single Prediction", "Batch Prediction"),
)

# Sidebar: pick which loaded models take part; every model is pre-selected
st.sidebar.header('Model Selection')
selected_models = st.sidebar.multiselect(
    'Select models for prediction',
    options=list(models),
    default=list(models),
)

# Choices offered for the categorical inputs of the single-prediction form.
# NOTE(review): 'Sliver' looks like a misspelling of 'Silver' but may be a
# literal category value in the data - confirm before changing it.
crm_pid_value_segment_options = [
    'Bronze',
    'Iron',
    'Gold',
    'Silver',
    'Lead',
    'Platinum',
    'SME',
    'SE',
    'Sliver',
    'Unknown',
]
effective_segment_options = [
    'SOHO',
    'VSE',
    'Other',
    'SME',
    'LE',
    'SE',
]
ka_name_options = [
    'Vladimir Manahilov',
    'Desislava Ivanova',
    'Martin Tilev',
    'Anna Dimitrova',
    'Rumiana Jordanova',
    'Anna Dimova',
    'Vania Uzunova',
    'Varta Torosian',
    'Daniela Stefanova',
    'Ginka Vachkova',
    'Tatiana Trifonova',
    'Jenia Gogova',
    'Unknown',
]

if interface == "Single Prediction":
    # Single-record form: collect one customer's features, run every selected
    # model on them and combine the results with an accuracy-weighted vote.
    st.header('Enter New Customer Data')

    # Collected inputs; keys are the feature column names.
    input_data = {}

    # Categorical inputs
    input_data['CRM_PID_VALUE_SEGMENT'] = st.selectbox(
        'CRM_PID_VALUE_SEGMENT', crm_pid_value_segment_options)
    input_data['EFFECTIVESEGMENT'] = st.selectbox(
        'EFFECTIVESEGMENT', effective_segment_options)
    input_data['KA_NAME'] = st.selectbox('KA_NAME', ka_name_options)

    # Numerical inputs: whole-number fields first, then two-decimal fields.
    # The widget/label order is the same as the feature order above.
    for field in ('BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
                  'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS'):
        input_data[field] = st.number_input(field, min_value=0, format="%d")
    for field in ('AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'):
        input_data[field] = st.number_input(
            field, min_value=0.0, format="%.2f")

    # Predict churn
    if st.button('Predict Churn'):
        # Convert input data to a one-row DataFrame
        input_df = pd.DataFrame([input_data])

        # Standardize column names to uppercase
        input_df.columns = input_df.columns.str.upper().str.strip()

        st.write("### Model Predictions")

        predictions = {}
        weighted_votes = {'Churn': 0, 'No Churn': 0}

        # Transformed features are computed lazily, inside the per-model
        # try-block, so they are only built when a selected model needs them
        # and a preprocessing failure is reported instead of crashing the page.
        input_data_transformed = None

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]

            try:
                if includes_preprocessor:
                    # Model includes preprocessor; use raw data
                    prediction = model.predict(input_df)
                else:
                    # Model does not include preprocessor; use preprocessed data
                    if input_data_transformed is None:
                        input_data_transformed = preprocessor.transform(input_df)
                    prediction = model.predict(input_data_transformed)
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")
                continue

            churn_prediction = 'Churn' if prediction[0] == 1 else 'No Churn'
            predictions[model_name] = churn_prediction

            # Add weighted vote. Models without a recorded accuracy still vote
            # with weight 1, but are not displayed as "1%" accurate.
            accuracy = model_accuracies.get(model_name)
            weight = 1 if accuracy is None else accuracy
            accuracy_label = "n/a" if accuracy is None else f"{accuracy}%"
            weighted_votes[churn_prediction] += weight

            # Display individual model predictions
            st.write(
                f"**{model_name}:** {churn_prediction} (Accuracy: {accuracy_label})")

        # Calculate and display the overall prediction
        total_weight = sum(weighted_votes.values())
        if total_weight == 0:
            # No model was selected, or every selected model failed.
            st.error(
                "No valid predictions were made. Cannot compute churn probability.")
        else:
            # Share of the total vote weight that went to 'Churn'; an exact
            # 50/50 split resolves to 'No Churn'.
            churn_probability = weighted_votes['Churn'] / total_weight
            overall_prediction = 'Churn' if churn_probability > 0.5 else 'No Churn'

            st.write("### Overall Prediction")
            st.write(f"**Final Prediction:** {overall_prediction}")
            st.write(f"**Churn Probability:** {churn_probability:.2%}")
            st.write(f"**No Churn Probability:** {1 - churn_probability:.2%}")

            # Visualize the predictions
            st.write("### Prediction Visualization")
            chart_data = pd.DataFrame(
                {
                    'Prediction': ['Churn', 'No Churn'],
                    'Weighted Vote': [
                        weighted_votes['Churn'],
                        weighted_votes['No Churn'],
                    ],
                }
            )
            st.bar_chart(chart_data.set_index('Prediction'))

elif interface == "Batch Prediction":
    # Batch Prediction Interface
    st.header('Batch Prediction')
    st.write('Upload a CSV or Excel file containing customer data.')

    uploaded_file = st.file_uploader(
        "Choose a CSV or Excel file", type=["csv", "xlsx", "xls"])

    if uploaded_file is not None:
        # Check if models are selected
        if not selected_models:
            st.error(
                "No models selected for prediction. Please select at least one model in the sidebar.")
            st.stop()

        required_columns = [
            'PID', 'CRM_PID_VALUE_SEGMENT', 'EFFECTIVESEGMENT', 'BILLING_ZIP', 'KA_NAME',
            'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS', 'SUSPENDED_SUBSCRIBERS',
            'TOTAL_SUBS', 'AVGMOBILEREVENUE', 'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]

        df = process_uploaded_file(uploaded_file, required_columns)
        if df is None:
            st.stop()

        # Standardize required columns to uppercase and strip spaces
        required_columns_upper = [col.upper().strip() for col in required_columns]

        # Convert numerical columns to numeric data types
        numerical_columns = [
            'BILLING_ZIP', 'ACTIVE_SUBSCRIBERS', 'NOT_ACTIVE_SUBSCRIBERS',
            'SUSPENDED_SUBSCRIBERS', 'TOTAL_SUBS', 'AVGMOBILEREVENUE',
            'AVGFIXREVENUE', 'TOTALREVENUE', 'ARPU'
        ]

        for col in numerical_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Fill missing values if any
        df.fillna({
            'CRM_PID_VALUE_SEGMENT': 'Unknown',
            'EFFECTIVESEGMENT': 'Unknown',
            'KA_NAME': 'Unknown',
            'BILLING_ZIP': 0,
            'ACTIVE_SUBSCRIBERS': 0,
            'NOT_ACTIVE_SUBSCRIBERS': 0,
            'SUSPENDED_SUBSCRIBERS': 0,
            'TOTAL_SUBS': 0,
            'AVGMOBILEREVENUE': 0.0,
            'AVGFIXREVENUE': 0.0,
            'TOTALREVENUE': 0.0,
            'ARPU': 0.0
        }, inplace=True)

        st.write("### Processing Batch Predictions...")

        model_outputs = {}

        for model_name in selected_models:
            model = models[model_name]
            includes_preprocessor = models_with_preprocessor[model_name]

            try:
                # Prepare data for prediction
                data_for_prediction = df[required_columns_upper[1:]]  # Exclude 'PID'

                if not includes_preprocessor:
                    data_for_prediction = preprocessor.transform(data_for_prediction)

                predictions = model.predict(data_for_prediction)
                model_outputs[model_name] = create_model_output(
                    df, predictions, model_name)
                st.success(f"Predictions completed for {model_name}")
            except Exception as e:
                st.error(f"Error predicting with model {model_name}: {e}")

        st.success('Batch predictions completed for all selected models.')

        # Download options
        st.header('Download Predictions')
        download_option = st.radio(
            "Choose how to download your predictions:",
            ("All Models in Separate Files", "Churn and Non-Churn in Separate Files", "Download All at Once")
        )

        if download_option == "All Models in Separate Files":
            # Allow user to download the results for each model
            for model_name, output_df in model_outputs.items():
                csv = output_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label=f"Download {model_name} Predictions as CSV",
                    data=csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_predictions.csv',
                    mime='text/csv',
                )
        elif download_option == "Churn and Non-Churn in Separate Files":
            # Consolidate results for all models and split into churn and non-churn files
            for model_name, output_df in model_outputs.items():
                churn_df = output_df[output_df['Churn'] == 'Yes']
                non_churn_df = output_df[output_df['Churn'] == 'No']

                churn_csv = churn_df.to_csv(index=False).encode('utf-8')
                non_churn_csv = non_churn_df.to_csv(index=False).encode('utf-8')

                st.download_button(
                    label=f"Download {model_name} Churn Predictions as CSV",
                    data=churn_csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_churn_predictions.csv',
                    mime='text/csv',
                )
                st.download_button(
                    label=f"Download {model_name} Non-Churn Predictions as CSV",
                    data=non_churn_csv,
                    file_name=f'{model_name.lower().replace(" ", "_")}_non_churn_predictions.csv',
                    mime='text/csv',
                )
        elif download_option == "Download All at Once":
            # Create a zip file containing all outputs
            zip_buffer = io.BytesIO()
            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
                for model_name, output_df in model_outputs.items():
                    csv_data = output_df.to_csv(index=False).encode('utf-8')
                    zip_file.writestr(f'{model_name.lower().replace(" ", "_")}_predictions.csv', csv_data)
            zip_buffer.seek(0)

            st.download_button(
                label="Download All Predictions as ZIP",
                data=zip_buffer,
                file_name='all_model_predictions.zip',
                mime='application/zip'
            )
    else:
        st.info('Awaiting CSV or Excel file to be uploaded.')

# Sidebar information: model counts followed by the recorded accuracy table
st.sidebar.write("### Model Information")
st.sidebar.write(f"Total models available: {len(models)}")
st.sidebar.write(f"Models selected for prediction: {len(selected_models)}")
st.sidebar.write("### Model Accuracies")
# Dedicated loop names so the module-level `model` variable (an estimator
# object elsewhere in this script) is not rebound to a plain string here.
for listed_model_name, listed_accuracy in model_accuracies.items():
    st.sidebar.write(f"{listed_model_name}: {listed_accuracy}%")