Spaces:

gilangw
/

ChurnDefender

Sleeping

File size: 15,049 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import json

def run():
    # Load Model Classification
    with open('adaboost_logreg_best.pkl', 'rb') as file_1:
    # with open('adaboost_logreg_10_features.pkl', 'rb') as file_1:
        classification_model = pickle.load(file_1)
    
    # Load Model Clustering
    with open('kp.pkl','rb') as file_2:
        clustering_model = pickle.load(file_2)

    # Load Clustering Scaler
    with open('scaler.pkl','rb') as file_3:
        scaler = pickle.load(file_3)
    
    # Load Clustering Numerical
    with open('num_col.txt','rb') as file_4:
        num_col = pickle.load(file_4)

    # Load Clustering Categorical
    with open('cat_col.txt','rb') as file_5:
        cat_col = pickle.load(file_5)

    # Choice of input: Upload or Manual Input
    inputType = st.selectbox("How would you like to input data ?", ["Upload Excel or CSV File", "Manual Input"])
    st.markdown('---')

    # Create Function for Prediction
    def predictData(df):
        totalCustomer = len(df)

        if totalCustomer < 1:
            st.write('## There is no Customer on this data, please check again.')
        else:
            # Classification prediction
            y_pred_uploaded = classification_model.predict(df)
            df['churn'] = y_pred_uploaded
            # st.dataframe(df)

            # Filter the DataFrame for Predicted Churn (1) 
            df_churn = df[df['churn'] == 1]
            
            churnCustomer = len(df_churn)

            if churnCustomer == 0:
                st.write('## There is no Customer predicted as Churn from this Data!')
            else:
                # Clustering prediction for Predicted Churn (1)
                ## Split Numerical and Categorical for K-Prototype
                data_cluster_num = df_churn[num_col]
                data_cluster_cat = df_churn[cat_col]

                ## Scale Numerical column
                num_scaled = scaler.transform(data_cluster_num)

                ## Merge Scaled Numerical + Categorical
                data_cluster_final = np.concatenate([num_scaled, data_cluster_cat], axis=1)
                data_cluster_final = pd.DataFrame(data_cluster_final, columns=['tenure', 'monthly_charges'] + cat_col)
                data_cluster_final = data_cluster_final.infer_objects()

                ## Mark Categorical Column
                index_cat_columns = [data_cluster_final.columns.get_loc(col) for col in cat_col] 

                ## Predict Cluster
                y_cluster = clustering_model.predict(data_cluster_final, categorical=index_cat_columns)
                # y_cluster = []
                #for rd in range(0, len(df_churn)): y_cluster.append(random.randint(0, 2)) # Random Generator for testing
                df_churn['cluster'] = y_cluster

                temp_cols = df_churn.columns.tolist()
                new_cols = temp_cols[0:1] + temp_cols[-2:] + temp_cols[1:-2]
                df_churn = df_churn[new_cols]
                df_churn = df_churn.sort_values(by=['cluster'], ascending=True)

                # Saving Result to Excel
                df_churn.to_excel('model_result.xlsx', index=False)
                
                # Split Data into 3 Cluster DataFrames
                df_cluster_0 = df_churn[df_churn['cluster'] == 0]
                df_cluster_1 = df_churn[df_churn['cluster'] == 1]
                df_cluster_2 = df_churn[df_churn['cluster'] == 2]

                st.write(f'## Result : `{churnCustomer} customer` from total {totalCustomer} customer ({int((churnCustomer/totalCustomer)*100)}%) are predicted as churn!')
                st.write('##### Here are some suggestion to minimalize churn potential for each customer depend on their cluster')
                c0, c1, c2 = '', '', ''
                for x in df_cluster_0['name']: c0 += str(x) + ', '
                for y in df_cluster_1['name']: c1 += str(y) + ', '
                for z in df_cluster_2['name']: c2 += str(z) + ', '
                
                cluster_0 = '''
                    - Most of them are senior citizen
                    - Having partner and dependents
                    - High monthly charges
                '''

                # suggestion_0 = '''
                #     - Offers packages with additional speed for 3 months for those who have subscribed for more than 3 years
                #     - Open all TV channels during big holiday events such as Eid, Christmas and others
                #     - Provide special offers to increase internet speed to them
                # '''
                suggestion_0 = '''
                    - Offers long term packages
                    - Give limited time offer
                    - Maintain good communication with this customer
                '''

                cluster_1 = '''
                    - Mix of senior citizan and youngster
                    - Having partner and dependents
                    - Low monthly charges
                '''

                suggestion_1 = '''
                    - Provides offers with many benefits if they subscribe for the long term
                    - Offers annual DSL internet packages at affordable prices
                    - New customer onboarding and orientation
                '''

                cluster_2 = '''
                    - Most of them are young people
                    - Most of them have no partner and dependents
                    - Moderate monthly charges
                '''

                # suggestion_2 = '''
                #     Providing special packages with the following criteria:
                #     - High speed internet but lower bandwidth at a cheaper price than normal packages
                #     - Low speed internet but large bandwidth so the connection is much more stable at a cheaper price than normal packages
                # '''
                suggestion_2 = '''
                    - Make an affordable internet package prices for this cluster
                    - Provides variation in Payment Method
                '''

                if c0 != '':
                    st.write(f'##### Cluster 1 - Elder Group - {len(df_cluster_0)} customer ({((len(df_cluster_0)/churnCustomer)*100):.1f}%)')
                    st.write(cluster_0)
                    st.write('Suggestion for `', c0[0:-2], '` is')
                    st.write(suggestion_0)
                    st.markdown('---')
                
                if c1 != '':
                    st.write(f'##### Cluster 2 - Mixage - {len(df_cluster_1)} customer ({((len(df_cluster_1)/churnCustomer)*100):.1f}%)')
                    st.write(cluster_1)
                    st.write('Suggestion for `', c1[0:-2], '` is')
                    st.write(suggestion_1)
                    st.markdown('---')
                
                if c2 != '':
                    st.write(f'##### Cluster 3 - Young Blood - {len(df_cluster_2)} customer ({((len(df_cluster_2)/churnCustomer)*100):.1f}%)')
                    st.write(cluster_2)
                    st.write('Suggestion for `', c2[0:-2], '` is')
                    st.write(suggestion_2)
                    st.markdown('---')
                
                # Create Bar Plot for Analyze Cluster
                num_agg_df = df_churn.groupby(['cluster']).agg({'tenure': 'mean', 'monthly_charges': 'mean'})
                num_agg_df = np.round(num_agg_df, decimals=2)
                fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 5))

                # Loop through each subplot to populate it
                for ax, column in zip(axes, num_agg_df.columns):
                    sns.barplot(ax=ax, data=num_agg_df, x=num_agg_df.index, y=column, orient='v')
                    ax.set_title(f'Average {column} by Cluster')
                    ax.set_xlabel('Cluster')
                    ax.set_ylabel(f'Average {column}')
                    ax.bar_label(ax.containers[0])
                
                # plt.style.use('dark_background')
                plt.tight_layout()
                st.pyplot(fig)
                # plt.style.use('default')

                with open('model_result.xlsx', 'rb') as file:
                    st.download_button(
                        label='💾 Download Prediction Result',
                        data=file,
                        file_name='model_result.xlsx',
                        mime='application/vnd.ms-excel'
                )
    
    def tenureMonthToYear():
        year = st.session_state.tenurem % 12
        if year == 0:
            st.session_state.tenurey = int((st.session_state.tenurem / 12))
        else:
            st.session_state.tenurey = int((st.session_state.tenurem / 12) + 1)
    
    def calculateChargesAndCategory():
        st.session_state.tcharges = int((st.session_state.mcharges * st.session_state.tenurem))
        if st.session_state.mcharges <= 30:
            st.session_state.catcharges = 'Low Expense'
        elif st.session_state.mcharges <= 60:
            st.session_state.catcharges = 'Medium Expense'
        elif st.session_state.mcharges <= 90:
            st.session_state.catcharges = 'Medium High Expense'
        else:
            st.session_state.catcharges = 'High Expense'

    # A. For CSV
    if inputType == "Upload Excel or CSV File":
        dl_1, dl_2, dl_3 = st.columns([3, 3, 3])
        with open('telco_data_test.xlsx', 'rb') as file:
            dl_1.download_button(
                label='💾 Download Data Example',
                data=file,
                file_name='telco_example.xlsx',
                mime='application/vnd.ms-excel'
            )
        
        with open('telco_data_template.xlsx', 'rb') as file:
            dl_2.download_button(
                label='💾 Download Template Excel',
                data=file,
                file_name='telco_template.xlsx',
                mime='application/vnd.ms-excel'
            )

        uploaded_file = st.file_uploader("Choose Excel or CSV file", type=["csv", "xlsx"], accept_multiple_files=False)
        if uploaded_file is not None:
            split_file_name = os.path.splitext(uploaded_file.name)
            # file_name = split_file_name[0]
            file_extension = split_file_name[1]

            if file_extension == '.csv':
                df = pd.read_csv(uploaded_file)
            else:    
                df = pd.read_excel(uploaded_file)
            # st.dataframe(df.head())
            predictData(df)
    # B. For Manual        
    else:
        # Create Form
        # with st.form(key='Form Parameters'):
        name = st.text_input('Name', value='', help='Customer Name')

        col_left, col_mid, col_right = st.columns([3, 2, 2])
        gender =  col_left.selectbox('Gender', ('Male', 'Female'), index=0)
        with col_mid:
            tenure =  st.number_input('Tenure (Month)', min_value=1, max_value=999, step=1, help='Month', key='tenurem', on_change=tenureMonthToYear)
        with col_right:
            tenure_year = st.number_input('Tenure (Year)', min_value=1, max_value=999, step=1, disabled=True, key='tenurey')
        
        col1, col2, col3 = st.columns([1, 1, 1])
        senior_citizen = col1.radio(label='Senior Citizen?', options=['Yes', 'No'], help='Choose \'Yes\' for 61 years old above')
        partner = col2.radio(label='Having a partner?', options=['Yes', 'No'])
        dependents = col3.radio(label='Having a dependents?', options=['Yes', 'No'], help='For example : children')
        
        # col4, col5 = st.columns([1, 1])
        # internet_service =  col4.selectbox('Internet Service', ('DSL', 'Fiber optic', 'No'), index=0)

        col4, col5, col6 = st.columns([1, 1, 1])
        internet_service =  col4.radio(label='Subs for Internet service?', options=['DSL', 'Fiber optic', 'No'])
        phone_service = col5.radio(label='Subs for Phone service?', options=['Yes', 'No'])
        multiple_lines = col6.radio(label='Subs for Multiple Lines?', options=['Yes', 'No', 'No Phone Services'])

        col7, col8, col9 = st.columns([1, 1, 1])
        online_security = col7.radio(label='Subs for Online Security?', options=['Yes', 'No', 'No Internet Services'])
        online_backup = col8.radio(label='Subs for Online Backup?', options=['Yes', 'No', 'No Internet Services'])
        device_protection = col9.radio(label='Having Device Protections?', options=['Yes', 'No', 'No Internet Services'])
        tech_support = col7.radio(label='Having Tech Support service?', options=['Yes', 'No', 'No Internet Services'])
        streaming_tv = col8.radio(label='Subs for TV Streaming?', options=['Yes', 'No', 'No Internet Services'])
        streaming_movies = col9.radio(label='Subs for Movie Streaming?', options=['Yes', 'No', 'No Internet Services'])

        col_pm1, col_pm2, col_pm3 = st.columns([3, 3, 2])
        contract = col_pm1.selectbox('Contract', ('Month-to-month', 'One year', 'Two year'), index=0)
        payment_method = col_pm2.selectbox('Payment Method', ('Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'), index=0)
        paperless_billing = col_pm3.selectbox('Paperless billing?', ('Yes', 'No'), index=0)

        col_charges1, col_charges2, col_charges3 = st.columns([1, 1, 2])
        monthly_charges =  col_charges1.number_input('Monthly Charges', min_value=1, max_value=999, step=1, help='Amount to paid per month', key='mcharges', on_change=calculateChargesAndCategory)
        total_charges = col_charges2.number_input('Total Charges', min_value=1, max_value=999999, step=1, disabled=True, key='tcharges')
        charges_cat = col_charges3.text_input('Charges Category', disabled=True, key='catcharges')

        # st.button('Predict', on_click=predict)
        data_inf = {
            'name': name, 
            'gender': gender, 
            'senior_citizen': senior_citizen, 
            'partner': partner, 
            'dependents': dependents, 
            'tenure': tenure, 
            'phone_service': phone_service, 
            'multiple_lines': multiple_lines, 
            'internet_service': internet_service, 
            'online_security': online_security, 
            'online_backup': online_backup, 
            'device_protection': device_protection, 
            'tech_support': tech_support, 
            'streaming_tv': streaming_tv, 
            'streaming_movies': streaming_movies, 
            'contract': contract, 
            'paperless_billing': paperless_billing, 
            'payment_method': payment_method, 
            'monthly_charges': monthly_charges, 
            'total_charges': total_charges, 
            'monthly_charges_cat': charges_cat, 
            'tenure_year': tenure_year
            }

        if st.button('Predict'):
            data_inf = pd.DataFrame([data_inf])
            # st.dataframe(data_inf.head())
            predictData(data_inf)

if __name__ == '__main__':
    run()