File size: 11,955 Bytes
b33b5ec
 
8728ef3
b33b5ec
 
 
 
 
 
8728ef3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b33b5ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42c76a5
 
b33b5ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc5c2b4
b33b5ec
 
 
 
 
 
7cabfa7
b33b5ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8728ef3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b33b5ec
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import streamlit as st 
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# from ucimlrepo import fetch_ucirepo 

def eda_bin_chart(df, columnAgainst):
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    df_copy = df.copy()
    bin = list(range(0, 1_000_001, 50_000))
    label = [
        f"{bin[i]//1000}k–{bin[i+1]//1000}k"
        for i in range(len(bin) - 1)
    ]


    df_copy['bin'] = pd.cut(df_copy[columnAgainst], bins=bin, labels=label)
    labels = [
        "0–100k", "100–200k", "200–300k",
        "300–500k", "500–750k", "750k+"
    ]
    
    bin_groups = (
        df_copy
        .groupby(['bin', 'default payment next month'])
        .size().to_frame('bin_count')
    )
    
    bin_group_unstack = bin_groups.unstack(level=-1)
    bin_group_unstack

    bin_group_unstack.columns = [f"{col[0]}_{col[1]}" for col in bin_group_unstack.columns]
    bin_group_unstack = bin_group_unstack.reset_index()
    
    bin_0 = bin_group_unstack[['bin', 'bin_count_0']]
    bin_1 = bin_group_unstack[['bin', 'bin_count_1']]

    bin_0.rename(columns={'bin_count_0': 'count'}, inplace=True)
    bin_0['label'] = '0'

    bin_1.rename(columns={'bin_count_1': 'count'}, inplace=True)
    bin_1['label'] = '1'

    bin_combined = pd.concat([bin_0, bin_1])
    bin_combined['bin_str'] = bin_combined['bin'].astype(str)   
    
    # show populace
    populance = sns.barplot(data=bin_combined, x='bin_str', y='count',hue='label', ax=axes[0])
    for item in populance.get_xticklabels():
        item.set_rotation(90)
    axes[0].set_title("Populance")
    # Count Proportion
    
    bin_group_unstack['default_rate'] = (bin_group_unstack['bin_count_1'] / (bin_group_unstack['bin_count_1'] + bin_group_unstack['bin_count_0'])) * 100
    bin_group_unstack['bin_str'] = bin_group_unstack['bin'].astype(str)
    
    bin_group_unstack.plot(kind='bar', x='bin_str', y='default_rate', ax=axes[1])
    axes[0].set_title("Proportion in Percentage")
    
    st.pyplot(fig)
    
    # return populace, proportions

def customer_history_eda(df, columnAgainst):
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    pay_groupby = (
        df
        .groupby([columnAgainst, 'default payment next month'])
        .size().to_frame('default_count')
    )
    
    pay_groupby_unstacked = pay_groupby.unstack(level=1)

    pay_groupby_unstacked.columns = [f"{col[0]}_{col[1]}" for col in pay_groupby_unstacked.columns]

    pay_groupby_unstacked['default_propotions'] = (pay_groupby_unstacked['default_count_1'] / (pay_groupby_unstacked['default_count_1'] + pay_groupby_unstacked['default_count_0']) * 100)
    
    pay_groupby_unstacked = pay_groupby_unstacked.reset_index()
    
    # count per category chart
    pay_group_0 = pay_groupby_unstacked[[columnAgainst, 'default_count_0']]
    pay_group_1 = pay_groupby_unstacked[[columnAgainst, 'default_count_1']]

    pay_group_0.rename(columns={"default_count_0": "count"}, inplace=True)
    pay_group_1.rename(columns={"default_count_1": "count"}, inplace=True)

    pay_group_0['label'] = '0'
    pay_group_1['label'] = '1'
    
    pay_group_0['count'].fillna(0)
    pay_group_1['count'].fillna(0)

    pay_group_combined = pd.concat([pay_group_0,pay_group_1])
    
    pay_group_combined.replace(np.nan, 0, inplace=True)

    sns.barplot(data=pay_group_combined, x=columnAgainst, y='count',hue='label', ax=axes[0])
    axes[0].set_title("Populance")
    pay_groupby_unstacked.plot(kind='bar', x=columnAgainst, y='default_propotions', ax=axes[1])
    axes[0].set_title("Proportion in Percentage")
    
    st.pyplot(fig)

def customer_behavior_exploratory(default_credit_card_df):
    Pay_data = default_credit_card_df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
    
    Pay_data_melt = pd.melt(Pay_data,  var_name='period')
    
    pay_data_group_by = (
        Pay_data_melt
        .groupby(['period','value'])
        .size()
    )
    
    grouped_df = pay_data_group_by.reset_index(name='count')
    grouped_df.sort_values('period', ascending=False, inplace=True)
    
    Grouped_totalSum = (
        grouped_df
        .groupby('period')
        .sum()
    )

    Grouped_totalSum = Grouped_totalSum.drop('value', axis=1)
    
    result = pd.merge(grouped_df, Grouped_totalSum, on='period')
    result['proportion'] = (result['count_x'] / result['count_y']) * 100
    
    result = result.drop(['count_x', 'count_y'], axis=1)
    
    grouped_matrixes = result.pivot(index='value', columns='period')
    grouped_matrixes.fillna(0)
    
    return grouped_matrixes

def show_grouped_matrix(grouped_matrixes):
    fig, ax = plt.subplots(figsize=(8, 6))
    
    sns.heatmap(
        grouped_matrixes,
        cmap="Blues",
        annot=True,
        fmt=".2f",
        ax=ax
    )
    
    ax.set_title("Grouped Matrices Heatmap")
    st.pyplot(fig)


def run():
    st.title('Customer Credit Default Prediction App')
    st.subheader("this page contains the EDA about customer payment behavior over time")
    
    # image = Image.open("./src/credit_card.jpg")
    # st.image(image, caption="Credit Card")
    
    # write
    st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")
    
    # fetch dataset 
    data = pd.read_excel('https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls')
    data.columns = data.iloc[0]
    data = data.drop(data.index[0])
    
    st.write("we will explore each category columns in this dataset to see if there's any anomali or not")
    
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    
    st.write("### official informations from the dataset website as follows")
    st.write("""
                X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
                X2: Gender (1 = male; 2 = female). \n
                X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
                X4: Marital status (1 = married; 2 = single; 3 = others). \n
                X5: Age (year). \n
                X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
                X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.  \n
                X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
             """)
    st.write("======================================================================================================================================= \n")
    
    st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")
    
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']
    
    for column in categorical_columns:
        st.write("======================================================================================================================================= \n")
        st.write(f"""
                 column: {column} \n
                 unique values = {data[column].unique()}
                 """)
        
    st.write("### Customer Behavior Exploratory")
    
    grouped_matrixes = customer_behavior_exploratory(data)
 
    show_grouped_matrix(grouped_matrixes)
        
    st.write("### Insight")
    st.write("""
                - on-time states remain dominant across all periods, with moderate fluctuations across months. \n
                - the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
                - PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
                - High severity late payments remain consistently rare across all periods
             """)
    st.write("during the exploratory, we found several unknown category in several fields  that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")
    
    st.write("## LIMIT_BAL vs DEFAULT")
    eda_bin_chart(data, 'LIMIT_BAL')
    st. write("""
              ### Insight

            generaly, the default rate decreases as the LIMIT_BAL increase. this support the idea that lower `LIMIT_BAL` customer showed higher default risk rate. \n
            although on higher `LIMIT_BAL` profile showed higher default rates, the estimates is less stable due to limited data on that range
              """)
    
    st.write("## PAY_0 vs Default Rate")
    customer_history_eda(data, 'PAY_0')
    st.write("""
             #### Insights

            generally, the default rate trend in recent month (PAY_0) increases as the customer's payment behavior category delayed. this indicates that recent payment behavior is highly predictive
             """)
    
    st.write("## PAY_4 vs Default Rate")
    customer_history_eda(data, 'PAY_4')
    
    st.write("""
             ### Insight

            from the population chart, most customers fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no delay four months prior. As repayment delay categories increase (PAY_4 ≥ 2), the number of customers drops sharply, showing that severe payment delays are relatively rare in this period.

            although the default rate chart shows a clear upward trend: customers with higher payment delay categories exhibit substantially higher default rates.

            it is also important to note that default rate estimates at extreme delay categories are less stable due to the small number of observations. 

            nonetheless, the consistent monotonic increase indicates that historical repayment behavior—even four months prior—remains a strong predictor of future default risk.
             """)
    
    st.write("## PAY_6 vs Default Rate")
    customer_history_eda(data, 'PAY_6')
    
    st.write("""
             ### Insight

            From the population chart, most customers six months prior fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no payment delay in that period. As the delay category increases (PAY_6 ≥ 2), the number of customers decreases sharply, showing that severe payment delays six months prior are relatively rare.

            Despite the smaller population at higher delay categories, the default rate chart shows a clear increasing trend: customers with greater historical payment delays exhibit substantially higher default rates.

            However, similar to PAY_4, default rate estimates at extreme delay categories are less stable due to the limited number of observations. 

            Even so, the consistent upward pattern indicates that repayment behavior as far as six months prior still provides meaningful predictive signal for future default risk, although its strength is weaker compared to more recent repayment behavior.
             """)
    
    
if __name__ == '__main__':
    run()