File size: 11,955 Bytes
b33b5ec 8728ef3 b33b5ec 8728ef3 b33b5ec 42c76a5 b33b5ec fc5c2b4 b33b5ec 7cabfa7 b33b5ec 8728ef3 b33b5ec | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
# from ucimlrepo import fetch_ucirepo
def eda_bin_chart(df, columnAgainst):
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
df_copy = df.copy()
bin = list(range(0, 1_000_001, 50_000))
label = [
f"{bin[i]//1000}k–{bin[i+1]//1000}k"
for i in range(len(bin) - 1)
]
df_copy['bin'] = pd.cut(df_copy[columnAgainst], bins=bin, labels=label)
labels = [
"0–100k", "100–200k", "200–300k",
"300–500k", "500–750k", "750k+"
]
bin_groups = (
df_copy
.groupby(['bin', 'default payment next month'])
.size().to_frame('bin_count')
)
bin_group_unstack = bin_groups.unstack(level=-1)
bin_group_unstack
bin_group_unstack.columns = [f"{col[0]}_{col[1]}" for col in bin_group_unstack.columns]
bin_group_unstack = bin_group_unstack.reset_index()
bin_0 = bin_group_unstack[['bin', 'bin_count_0']]
bin_1 = bin_group_unstack[['bin', 'bin_count_1']]
bin_0.rename(columns={'bin_count_0': 'count'}, inplace=True)
bin_0['label'] = '0'
bin_1.rename(columns={'bin_count_1': 'count'}, inplace=True)
bin_1['label'] = '1'
bin_combined = pd.concat([bin_0, bin_1])
bin_combined['bin_str'] = bin_combined['bin'].astype(str)
# show populace
populance = sns.barplot(data=bin_combined, x='bin_str', y='count',hue='label', ax=axes[0])
for item in populance.get_xticklabels():
item.set_rotation(90)
axes[0].set_title("Populance")
# Count Proportion
bin_group_unstack['default_rate'] = (bin_group_unstack['bin_count_1'] / (bin_group_unstack['bin_count_1'] + bin_group_unstack['bin_count_0'])) * 100
bin_group_unstack['bin_str'] = bin_group_unstack['bin'].astype(str)
bin_group_unstack.plot(kind='bar', x='bin_str', y='default_rate', ax=axes[1])
axes[0].set_title("Proportion in Percentage")
st.pyplot(fig)
# return populace, proportions
def customer_history_eda(df, columnAgainst):
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
pay_groupby = (
df
.groupby([columnAgainst, 'default payment next month'])
.size().to_frame('default_count')
)
pay_groupby_unstacked = pay_groupby.unstack(level=1)
pay_groupby_unstacked.columns = [f"{col[0]}_{col[1]}" for col in pay_groupby_unstacked.columns]
pay_groupby_unstacked['default_propotions'] = (pay_groupby_unstacked['default_count_1'] / (pay_groupby_unstacked['default_count_1'] + pay_groupby_unstacked['default_count_0']) * 100)
pay_groupby_unstacked = pay_groupby_unstacked.reset_index()
# count per category chart
pay_group_0 = pay_groupby_unstacked[[columnAgainst, 'default_count_0']]
pay_group_1 = pay_groupby_unstacked[[columnAgainst, 'default_count_1']]
pay_group_0.rename(columns={"default_count_0": "count"}, inplace=True)
pay_group_1.rename(columns={"default_count_1": "count"}, inplace=True)
pay_group_0['label'] = '0'
pay_group_1['label'] = '1'
pay_group_0['count'].fillna(0)
pay_group_1['count'].fillna(0)
pay_group_combined = pd.concat([pay_group_0,pay_group_1])
pay_group_combined.replace(np.nan, 0, inplace=True)
sns.barplot(data=pay_group_combined, x=columnAgainst, y='count',hue='label', ax=axes[0])
axes[0].set_title("Populance")
pay_groupby_unstacked.plot(kind='bar', x=columnAgainst, y='default_propotions', ax=axes[1])
axes[0].set_title("Proportion in Percentage")
st.pyplot(fig)
def customer_behavior_exploratory(default_credit_card_df):
Pay_data = default_credit_card_df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
Pay_data_melt = pd.melt(Pay_data, var_name='period')
pay_data_group_by = (
Pay_data_melt
.groupby(['period','value'])
.size()
)
grouped_df = pay_data_group_by.reset_index(name='count')
grouped_df.sort_values('period', ascending=False, inplace=True)
Grouped_totalSum = (
grouped_df
.groupby('period')
.sum()
)
Grouped_totalSum = Grouped_totalSum.drop('value', axis=1)
result = pd.merge(grouped_df, Grouped_totalSum, on='period')
result['proportion'] = (result['count_x'] / result['count_y']) * 100
result = result.drop(['count_x', 'count_y'], axis=1)
grouped_matrixes = result.pivot(index='value', columns='period')
grouped_matrixes.fillna(0)
return grouped_matrixes
def show_grouped_matrix(grouped_matrixes):
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
grouped_matrixes,
cmap="Blues",
annot=True,
fmt=".2f",
ax=ax
)
ax.set_title("Grouped Matrices Heatmap")
st.pyplot(fig)
def run():
st.title('Customer Credit Default Prediction App')
st.subheader("this page contains the EDA about customer payment behavior over time")
# image = Image.open("./src/credit_card.jpg")
# st.image(image, caption="Credit Card")
# write
st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")
# fetch dataset
data = pd.read_excel('https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls')
data.columns = data.iloc[0]
data = data.drop(data.index[0])
st.write("we will explore each category columns in this dataset to see if there's any anomali or not")
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
st.write("### official informations from the dataset website as follows")
st.write("""
X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
X2: Gender (1 = male; 2 = female). \n
X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
X4: Marital status (1 = married; 2 = single; 3 = others). \n
X5: Age (year). \n
X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. \n
X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
""")
st.write("======================================================================================================================================= \n")
st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']
for column in categorical_columns:
st.write("======================================================================================================================================= \n")
st.write(f"""
column: {column} \n
unique values = {data[column].unique()}
""")
st.write("### Customer Behavior Exploratory")
grouped_matrixes = customer_behavior_exploratory(data)
show_grouped_matrix(grouped_matrixes)
st.write("### Insight")
st.write("""
- on-time states remain dominant across all periods, with moderate fluctuations across months. \n
- the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
- PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
- High severity late payments remain consistently rare across all periods
""")
st.write("during the exploratory, we found several unknown category in several fields that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")
st.write("## LIMIT_BAL vs DEFAULT")
eda_bin_chart(data, 'LIMIT_BAL')
st. write("""
### Insight
generaly, the default rate decreases as the LIMIT_BAL increase. this support the idea that lower `LIMIT_BAL` customer showed higher default risk rate. \n
although on higher `LIMIT_BAL` profile showed higher default rates, the estimates is less stable due to limited data on that range
""")
st.write("## PAY_0 vs Default Rate")
customer_history_eda(data, 'PAY_0')
st.write("""
#### Insights
generally, the default rate trend in recent month (PAY_0) increases as the customer's payment behavior category delayed. this indicates that recent payment behavior is highly predictive
""")
st.write("## PAY_4 vs Default Rate")
customer_history_eda(data, 'PAY_4')
st.write("""
### Insight
from the population chart, most customers fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no delay four months prior. As repayment delay categories increase (PAY_4 ≥ 2), the number of customers drops sharply, showing that severe payment delays are relatively rare in this period.
although the default rate chart shows a clear upward trend: customers with higher payment delay categories exhibit substantially higher default rates.
it is also important to note that default rate estimates at extreme delay categories are less stable due to the small number of observations.
nonetheless, the consistent monotonic increase indicates that historical repayment behavior—even four months prior—remains a strong predictor of future default risk.
""")
st.write("## PAY_6 vs Default Rate")
customer_history_eda(data, 'PAY_6')
st.write("""
### Insight
From the population chart, most customers six months prior fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no payment delay in that period. As the delay category increases (PAY_6 ≥ 2), the number of customers decreases sharply, showing that severe payment delays six months prior are relatively rare.
Despite the smaller population at higher delay categories, the default rate chart shows a clear increasing trend: customers with greater historical payment delays exhibit substantially higher default rates.
However, similar to PAY_4, default rate estimates at extreme delay categories are less stable due to the limited number of observations.
Even so, the consistent upward pattern indicates that repayment behavior as far as six months prior still provides meaningful predictive signal for future default risk, although its strength is weaker compared to more recent repayment behavior.
""")
if __name__ == '__main__':
run() |