Spaces:

Heizsenberg
/

analytic-data-efficiency

Sleeping

App Files Files Community

analytic-data-efficiency / src /eda.py

Heizsenberg

updated EDA

8728ef3 about 1 month ago

raw

history blame contribute delete

12 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import plotly.express as px
	from PIL import Image
	# from ucimlrepo import fetch_ucirepo

	def eda_bin_chart(df, columnAgainst):
	fig, axes = plt.subplots(1, 2, figsize=(12, 4))

	df_copy = df.copy()
	bin = list(range(0, 1_000_001, 50_000))
	label = [
	f"{bin[i]//1000}k–{bin[i+1]//1000}k"
	for i in range(len(bin) - 1)
	]


	df_copy['bin'] = pd.cut(df_copy[columnAgainst], bins=bin, labels=label)
	labels = [
	"0–100k", "100–200k", "200–300k",
	"300–500k", "500–750k", "750k+"
	]

	bin_groups = (
	df_copy
	.groupby(['bin', 'default payment next month'])
	.size().to_frame('bin_count')
	)

	bin_group_unstack = bin_groups.unstack(level=-1)
	bin_group_unstack

	bin_group_unstack.columns = [f"{col[0]}_{col[1]}" for col in bin_group_unstack.columns]
	bin_group_unstack = bin_group_unstack.reset_index()

	bin_0 = bin_group_unstack[['bin', 'bin_count_0']]
	bin_1 = bin_group_unstack[['bin', 'bin_count_1']]

	bin_0.rename(columns={'bin_count_0': 'count'}, inplace=True)
	bin_0['label'] = '0'

	bin_1.rename(columns={'bin_count_1': 'count'}, inplace=True)
	bin_1['label'] = '1'

	bin_combined = pd.concat([bin_0, bin_1])
	bin_combined['bin_str'] = bin_combined['bin'].astype(str)

	# show populace
	populance = sns.barplot(data=bin_combined, x='bin_str', y='count',hue='label', ax=axes[0])
	for item in populance.get_xticklabels():
	item.set_rotation(90)
	axes[0].set_title("Populance")
	# Count Proportion

	bin_group_unstack['default_rate'] = (bin_group_unstack['bin_count_1'] / (bin_group_unstack['bin_count_1'] + bin_group_unstack['bin_count_0'])) * 100
	bin_group_unstack['bin_str'] = bin_group_unstack['bin'].astype(str)

	bin_group_unstack.plot(kind='bar', x='bin_str', y='default_rate', ax=axes[1])
	axes[0].set_title("Proportion in Percentage")

	st.pyplot(fig)

	# return populace, proportions

	def customer_history_eda(df, columnAgainst):

	fig, axes = plt.subplots(1, 2, figsize=(12, 4))

	pay_groupby = (
	df
	.groupby([columnAgainst, 'default payment next month'])
	.size().to_frame('default_count')
	)

	pay_groupby_unstacked = pay_groupby.unstack(level=1)

	pay_groupby_unstacked.columns = [f"{col[0]}_{col[1]}" for col in pay_groupby_unstacked.columns]

	pay_groupby_unstacked['default_propotions'] = (pay_groupby_unstacked['default_count_1'] / (pay_groupby_unstacked['default_count_1'] + pay_groupby_unstacked['default_count_0']) * 100)

	pay_groupby_unstacked = pay_groupby_unstacked.reset_index()

	# count per category chart
	pay_group_0 = pay_groupby_unstacked[[columnAgainst, 'default_count_0']]
	pay_group_1 = pay_groupby_unstacked[[columnAgainst, 'default_count_1']]

	pay_group_0.rename(columns={"default_count_0": "count"}, inplace=True)
	pay_group_1.rename(columns={"default_count_1": "count"}, inplace=True)

	pay_group_0['label'] = '0'
	pay_group_1['label'] = '1'

	pay_group_0['count'].fillna(0)
	pay_group_1['count'].fillna(0)

	pay_group_combined = pd.concat([pay_group_0,pay_group_1])

	pay_group_combined.replace(np.nan, 0, inplace=True)

	sns.barplot(data=pay_group_combined, x=columnAgainst, y='count',hue='label', ax=axes[0])
	axes[0].set_title("Populance")
	pay_groupby_unstacked.plot(kind='bar', x=columnAgainst, y='default_propotions', ax=axes[1])
	axes[0].set_title("Proportion in Percentage")

	st.pyplot(fig)

	def customer_behavior_exploratory(default_credit_card_df):
	Pay_data = default_credit_card_df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]

	Pay_data_melt = pd.melt(Pay_data, var_name='period')

	pay_data_group_by = (
	Pay_data_melt
	.groupby(['period','value'])
	.size()
	)

	grouped_df = pay_data_group_by.reset_index(name='count')
	grouped_df.sort_values('period', ascending=False, inplace=True)

	Grouped_totalSum = (
	grouped_df
	.groupby('period')
	.sum()
	)

	Grouped_totalSum = Grouped_totalSum.drop('value', axis=1)

	result = pd.merge(grouped_df, Grouped_totalSum, on='period')
	result['proportion'] = (result['count_x'] / result['count_y']) * 100

	result = result.drop(['count_x', 'count_y'], axis=1)

	grouped_matrixes = result.pivot(index='value', columns='period')
	grouped_matrixes.fillna(0)

	return grouped_matrixes

	def show_grouped_matrix(grouped_matrixes):
	fig, ax = plt.subplots(figsize=(8, 6))

	sns.heatmap(
	grouped_matrixes,
	cmap="Blues",
	annot=True,
	fmt=".2f",
	ax=ax
	)

	ax.set_title("Grouped Matrices Heatmap")
	st.pyplot(fig)


	def run():
	st.title('Customer Credit Default Prediction App')
	st.subheader("this page contains the EDA about customer payment behavior over time")

	# image = Image.open("./src/credit_card.jpg")
	# st.image(image, caption="Credit Card")

	# write
	st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")

	# fetch dataset
	data = pd.read_excel('https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls')
	data.columns = data.iloc[0]
	data = data.drop(data.index[0])

	st.write("we will explore each category columns in this dataset to see if there's any anomali or not")

	categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

	st.write("### official informations from the dataset website as follows")
	st.write("""
	X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
	X2: Gender (1 = male; 2 = female). \n
	X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
	X4: Marital status (1 = married; 2 = single; 3 = others). \n
	X5: Age (year). \n
	X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
	X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. \n
	X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
	""")
	st.write("======================================================================================================================================= \n")

	st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")

	categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']

	for column in categorical_columns:
	st.write("======================================================================================================================================= \n")
	st.write(f"""
	column: {column} \n
	unique values = {data[column].unique()}
	""")

	st.write("### Customer Behavior Exploratory")

	grouped_matrixes = customer_behavior_exploratory(data)

	show_grouped_matrix(grouped_matrixes)

	st.write("### Insight")
	st.write("""
	- on-time states remain dominant across all periods, with moderate fluctuations across months. \n
	- the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
	- PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
	- High severity late payments remain consistently rare across all periods
	""")
	st.write("during the exploratory, we found several unknown category in several fields that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")

	st.write("## LIMIT_BAL vs DEFAULT")
	eda_bin_chart(data, 'LIMIT_BAL')
	st. write("""
	### Insight

	generaly, the default rate decreases as the LIMIT_BAL increase. this support the idea that lower `LIMIT_BAL` customer showed higher default risk rate. \n
	although on higher `LIMIT_BAL` profile showed higher default rates, the estimates is less stable due to limited data on that range
	""")

	st.write("## PAY_0 vs Default Rate")
	customer_history_eda(data, 'PAY_0')
	st.write("""
	#### Insights

	generally, the default rate trend in recent month (PAY_0) increases as the customer's payment behavior category delayed. this indicates that recent payment behavior is highly predictive
	""")

	st.write("## PAY_4 vs Default Rate")
	customer_history_eda(data, 'PAY_4')

	st.write("""
	### Insight

	from the population chart, most customers fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no delay four months prior. As repayment delay categories increase (PAY_4 ≥ 2), the number of customers drops sharply, showing that severe payment delays are relatively rare in this period.

	although the default rate chart shows a clear upward trend: customers with higher payment delay categories exhibit substantially higher default rates.

	it is also important to note that default rate estimates at extreme delay categories are less stable due to the small number of observations.

	nonetheless, the consistent monotonic increase indicates that historical repayment behavior—even four months prior—remains a strong predictor of future default risk.
	""")

	st.write("## PAY_6 vs Default Rate")
	customer_history_eda(data, 'PAY_6')

	st.write("""
	### Insight

	From the population chart, most customers six months prior fall into repayment statuses -2, -1, and 0, indicating that the majority were paying duly or had no payment delay in that period. As the delay category increases (PAY_6 ≥ 2), the number of customers decreases sharply, showing that severe payment delays six months prior are relatively rare.

	Despite the smaller population at higher delay categories, the default rate chart shows a clear increasing trend: customers with greater historical payment delays exhibit substantially higher default rates.

	However, similar to PAY_4, default rate estimates at extreme delay categories are less stable due to the limited number of observations.

	Even so, the consistent upward pattern indicates that repayment behavior as far as six months prior still provides meaningful predictive signal for future default risk, although its strength is weaker compared to more recent repayment behavior.
	""")


	if __name__ == '__main__':
	run()