Spaces:

Heizsenberg
/

analytic-data-efficiency

Sleeping

File size: 11,955 Bytes

import streamlit as st 
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# from ucimlrepo import fetch_ucirepo 

def eda_bin_chart(df, columnAgainst):
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    df_copy = df.copy()
    bin = list(range(0, 1_000_001, 50_000))
    label = [
        f"{bin[i]//1000}k–{bin[i+1]//1000}k"
        for i in range(len(bin) - 1)
    ]


    df_copy['bin'] = pd.cut(df_copy[columnAgainst], bins=bin, labels=label)
    labels = [
        "0–100k", "100–200k", "200–300k",
        "300–500k", "500–750k", "750k+"
    ]
    
    bin_groups = (
        df_copy
        .groupby(['bin', 'default payment next month'])
        .size().to_frame('bin_count')
    )
    
    bin_group_unstack = bin_groups.unstack(level=-1)
    bin_group_unstack

    bin_group_unstack.columns = [f"{col[0]}_{col[1]}" for col in bin_group_unstack.columns]
    bin_group_unstack = bin_group_unstack.reset_index()
    
    bin_0 = bin_group_unstack[['bin', 'bin_count_0']]
    bin_1 = bin_group_unstack[['bin', 'bin_count_1']]

    bin_0.rename(columns={'bin_count_0': 'count'}, inplace=True)
    bin_0['label'] = '0'

    bin_1.rename(columns={'bin_count_1': 'count'}, inplace=True)
    bin_1['label'] = '1'

    bin_combined = pd.concat([bin_0, bin_1])
    bin_combined['bin_str'] = bin_combined['bin'].astype(str)   
    
    # show populace
    populance = sns.barplot(data=bin_combined, x='bin_str', y='count',hue='label', ax=axes[0])
    for item in populance.get_xticklabels():
        item.set_rotation(90)
    axes[0].set_title("Populance")
    # Count Proportion
    
    bin_group_unstack['default_rate'] = (bin_group_unstack['bin_count_1'] / (bin_group_unstack['bin_count_1'] + bin_group_unstack['bin_count_0'])) * 100
    bin_group_unstack['bin_str'] = bin_group_unstack['bin'].astype(str)
    
    bin_group_unstack.plot(kind='bar', x='bin_str', y='default_rate', ax=axes[1])
    axes[0].set_title("Proportion in Percentage")
    
    st.pyplot(fig)
    
    # return populace, proportions

def customer_history_eda(df, columnAgainst):
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    pay_groupby = (
        df
        .groupby([columnAgainst, 'default payment next month'])
        .size().to_frame('default_count')
    )
    
    pay_groupby_unstacked = pay_groupby.unstack(level=1)

    pay_groupby_unstacked.columns = [f"{col[0]}_{col[1]}" for col in pay_groupby_unstacked.columns]

    pay_groupby_unstacked['default_propotions'] = (pay_groupby_unstacked['default_count_1'] / (pay_groupby_unstacked['default_count_1'] + pay_groupby_unstacked['default_count_0']) * 100)
    
    pay_groupby_unstacked = pay_groupby_unstacked.reset_index()
    
    # count per category chart
    pay_group_0 = pay_groupby_unstacked[[columnAgainst, 'default_count_0']]
    pay_group_1 = pay_groupby_unstacked[[columnAgainst, 'default_count_1']]

    pay_group_0.rename(columns={"default_count_0": "count"}, inplace=True)
    pay_group_1.rename(columns={"default_count_1": "count"}, inplace=True)

    pay_group_0['label'] = '0'
    pay_group_1['label'] = '1'
    
    pay_group_0['count'].fillna(0)
    pay_group_1['count'].fillna(0)

    pay_group_combined = pd.concat([pay_group_0,pay_group_1])
    
    pay_group_combined.replace(np.nan, 0, inplace=True)

    sns.barplot(data=pay_group_combined, x=columnAgainst, y='count',hue='label', ax=axes[0])
    axes[0].set_title("Populance")
    pay_groupby_unstacked.plot(kind='bar', x=columnAgainst, y='default_propotions', ax=axes[1])
    axes[0].set_title("Proportion in Percentage")
    
    st.pyplot(fig)

def customer_behavior_exploratory(default_credit_card_df):
    Pay_data = default_credit_card_df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
    
    Pay_data_melt = pd.melt(Pay_data,  var_name='period')
    
    pay_data_group_by = (
        Pay_data_melt
        .groupby(['period','value'])
        .size()
    )
    
    grouped_df = pay_data_group_by.reset_index(name='count')
    grouped_df.sort_values('period', ascending=False, inplace=True)
    
    Grouped_totalSum = (
        grouped_df
        .groupby('period')
        .sum()
    )

    Grouped_totalSum = Grouped_totalSum.drop('value', axis=1)
    
    result = pd.merge(grouped_df, Grouped_totalSum, on='period')
    result['proportion'] = (result['count_x'] / result['count_y']) * 100
    
    result = result.drop(['count_x', 'count_y'], axis=1)
    
    grouped_matrixes = result.pivot(index='value', columns='period')
    grouped_matrixes.fillna(0)
    
    return grouped_matrixes

def show_grouped_matrix(grouped_matrixes):
    fig, ax = plt.subplots(figsize=(8, 6))
    
    sns.heatmap(
        grouped_matrixes,
        cmap="Blues",
        annot=True,
        fmt=".2f",
        ax=ax
    )
    
    ax.set_title("Grouped Matrices Heatmap")
    st.pyplot(fig)


def run():
    st.title('Customer Credit Default Prediction App')
    st.subheader("this page contains the EDA about customer payment behavior over time")
    
    # image = Image.open("./src/credit_card.jpg")
    # st.image(image, caption="Credit Card")
    
    # write
    st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")
    
    # fetch dataset 
    data = pd.read_excel('https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls')
    data.columns = data.iloc[0]
    data = data.drop(data.index[0])
    
    st.write("we will explore each category columns in this dataset to see if there's any anomali or not")
    
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    
    st.write("### official informations from the dataset website as follows")
    st.write("""
                X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
                X2: Gender (1 = male; 2 = female). \n
                X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
                X4: Marital status (1 = married; 2 = single; 3 = others). \n
                X5: Age (year). \n
                X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
                X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.  \n
                X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
             """)
    st.write("======================================================================================================================================= \n")
    
    st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")
    
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']
    
    for column in categorical_columns:
        st.write("======================================================================================================================================= \n")
        st.write(f"""
                 column: {column} \n
                 unique values = {data[column].unique()}
                 """)
        
    st.write("### Customer Behavior Exploratory")
    
    grouped_matrixes = customer_behavior_exploratory(data)
 
    show_grouped_matrix(grouped_matrixes)
        
    st.write("### Insight")
    st.write("""
                - on-time states remain dominant across all periods, with moderate fluctuations across months. \n
                - the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
                - PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
                - High severity late payments remain consistently rare across all periods
             """)
    st.write("during the exploratory, we found several unknown category in several fields  that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")
    
    st.write("## LIMIT_BAL vs DEFAULT")
    eda_bin_chart(data, 'LIMIT_BAL')
    st. write("""
              ### Insight

            generaly, the default rate decreases as the LIMIT_BAL increase. this support the idea that lower `LIMIT_BAL` customer showed higher default risk rate. \n
            although on higher `LIMIT_BAL` profile showed higher default rates, the estimates is less stable due to limited data on that range
              """)
    
    st.write("## PAY_0 vs Default Rate")
    customer_history_eda(data, 'PAY_0')
    st.write("""
             #### Insights

            generally, the default rate trend in recent month (PAY_0) increases as the customer's payment behavior category delayed. this indicates that recent payment behavior is highly predictive
             """)
    
    st.write("## PAY_4 vs Default Rate")
    customer_history_eda(data, 'PAY_4')
    
    st.write("""
             ### Insight

            from the population chart, most customers fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no delay four months prior. As repayment delay categories increase (PAY_4 ≥ 2), the number of customers drops sharply, showing that severe payment delays are relatively rare in this period.

            although the default rate chart shows a clear upward trend: customers with higher payment delay categories exhibit substantially higher default rates.

            it is also important to note that default rate estimates at extreme delay categories are less stable due to the small number of observations. 

            nonetheless, the consistent monotonic increase indicates that historical repayment behavior—even four months prior—remains a strong predictor of future default risk.
             """)
    
    st.write("## PAY_6 vs Default Rate")
    customer_history_eda(data, 'PAY_6')
    
    st.write("""
             ### Insight

            From the population chart, most customers six months prior fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no payment delay in that period. As the delay category increases (PAY_6 ≥ 2), the number of customers decreases sharply, showing that severe payment delays six months prior are relatively rare.

            Despite the smaller population at higher delay categories, the default rate chart shows a clear increasing trend: customers with greater historical payment delays exhibit substantially higher default rates.

            However, similar to PAY_4, default rate estimates at extreme delay categories are less stable due to the limited number of observations. 

            Even so, the consistent upward pattern indicates that repayment behavior as far as six months prior still provides meaningful predictive signal for future default risk, although its strength is weaker compared to more recent repayment behavior.
             """)
    
    
if __name__ == '__main__':
    run()