Heizsenberg committed on
Commit
b33b5ec
·
1 Parent(s): c0e95b4
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.xls
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ streamlit
2
  pandas
3
+ numpy
4
+ scikit-learn
5
+ joblib
6
+ matplotlib
7
+ seaborn
8
+ plotly
9
+ pillow
src/__pycache__/eda.cpython-39.pyc ADDED
Binary file (4.98 kB). View file
 
src/__pycache__/prediction.cpython-39.pyc ADDED
Binary file (3.55 kB). View file
 
src/eda.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ import plotly.express as px
6
+ from PIL import Image
7
+ # from ucimlrepo import fetch_ucirepo
8
+
9
def customer_behavior_exploratory(default_credit_card_df):
    """Build a repayment-status-by-period percentage matrix.

    For each of the columns PAY_0, PAY_2, PAY_3, PAY_4, PAY_5 and PAY_6,
    compute what percentage of rows carries each distinct value.

    Args:
        default_credit_card_df: DataFrame containing the six PAY_* columns
            listed above.

    Returns:
        A DataFrame indexed by ``value`` whose columns are a two-level index
        ``('proportion', <period>)``. Each cell holds the percentage (0-100)
        of rows in that period with that value; combinations that never
        occur are 0 rather than NaN.

    Raises:
        KeyError: if any of the PAY_* columns is missing from the input.
    """
    pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

    # Long format: one row per observation, with columns 'period' and 'value'.
    pay_data_melt = pd.melt(default_credit_card_df[pay_columns], var_name='period')

    # Number of rows for every (period, value) pair that actually occurs.
    grouped_df = (
        pay_data_melt
        .groupby(['period', 'value'])
        .size()
        .reset_index(name='count')
    )

    # Total rows per period, aligned row-by-row with grouped_df so the share
    # can be computed without a merge.
    period_totals = grouped_df.groupby('period')['count'].transform('sum')
    grouped_df['proportion'] = (grouped_df['count'] / period_totals) * 100

    grouped_matrixes = (
        grouped_df
        .drop('count', axis=1)
        .pivot(index='value', columns='period')
    )

    # fillna returns a new frame; the result must be kept, otherwise values
    # absent from a period stay NaN in the returned matrix.
    return grouped_matrixes.fillna(0)
40
+
41
def show_grouped_matrix(grouped_matrixes):
    """Draw ``grouped_matrixes`` as an annotated heatmap on the Streamlit page.

    Args:
        grouped_matrixes: 2-D numeric table to plot; cell values are printed
            with two decimals.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(
            grouped_matrixes,
            cmap="Blues",
            annot=True,
            fmt=".2f",
            ax=ax,
        )
        ax.set_title("Grouped Matrices Heatmap")
        st.pyplot(fig)
    finally:
        # Figures created through pyplot stay registered until closed, so
        # without this every rerun of the page would keep another one alive.
        plt.close(fig)
54
+
55
+
56
_DATASET_URL = 'https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls'


@st.cache_data
def _load_dataset(url):
    """Download the credit-default workbook and promote its first row to header.

    Cached so the remote file is not fetched again on every rerun of the page.
    NOTE(review): pandas needs the optional ``xlrd`` package to read ``.xls``
    files - confirm it is installed in the deployment environment.
    """
    data = pd.read_excel(url)
    # The first data row holds the real column names (SEX, PAY_0, ...).
    data.columns = data.iloc[0]
    return data.drop(data.index[0])


def run():
    """Render the EDA page: intro, column glossary, unique values and heatmap."""
    st.title('Customer Credit Default Prediction App')
    st.subheader("this page contains the EDA about customer payment behavior over time")

    # Context manager so the image file handle is released after rendering.
    with Image.open("./src/credit_card.jpg") as image:
        st.image(image, caption="Credit Card")

    st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")

    data = _load_dataset(_DATASET_URL)

    st.write("we will explore each category columns in this dataset to see if there's any anomali or not")

    st.write("### official informations from the dataset website as follows")
    st.write("""
    X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
    X2: Gender (1 = male; 2 = female). \n
    X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
    X4: Marital status (1 = married; 2 = single; 3 = others). \n
    X5: Age (year). \n
    X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
    X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. \n
    X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
    """)

    st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")

    # Only PAY_0 is listed out of the PAY_* family (see the message above).
    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']

    for column in categorical_columns:
        st.write(f"""
        column: {column} \n
        unique values = {data[column].unique()}
        """)

    st.write("### Customer Behavior Exploratory")

    grouped_matrixes = customer_behavior_exploratory(data)

    show_grouped_matrix(grouped_matrixes)

    st.write("### Insight")
    st.write("""
    - on-time states remain dominant across all periods, with moderate fluctuations across months. \n
    - the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
    - PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
    - High severity late payments remain consistently rare across all periods
    """)
    st.write("during the exploratory, we found several unknown category in several fields that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")


if __name__ == '__main__':
    run()
src/logistic_regression_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e86262d15a5aa962d40789d492c8643f423c10ee9e1db862766f14a5798175
3
+ size 5857
src/prediction.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import streamlit as st
5
+
6
def predict(new_customer_obj, model_pipeline):
    """Score a single customer with the fitted pipeline and echo the input.

    Args:
        new_customer_obj: mapping of feature name to value for one customer.
        model_pipeline: fitted estimator exposing ``predict_proba`` and
            ``predict``.

    Returns:
        A tuple ``(probabilities, label)`` where ``probabilities`` is the
        second column of ``predict_proba`` for the one-row input and
        ``label`` is the first element of ``predict``.
    """
    import io  # local import: only needed to capture the info() text below

    new_cust_df = pd.DataFrame([new_customer_obj])
    st.dataframe(new_cust_df)

    # DataFrame.info() writes its summary to a buffer and returns None, so the
    # text has to be captured explicitly; passing the return value to the page
    # would only ever show "None".
    info_buffer = io.StringIO()
    new_cust_df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    user1_prediction = model_pipeline.predict_proba(new_cust_df)[:, 1]
    label = model_pipeline.predict(new_cust_df)[0]
    return user1_prediction, label
14
+
15
@st.cache_resource
def _load_pipeline(path):
    """Load the serialized model pipeline, cached across reruns.

    joblib.load unpickles the file and can therefore execute arbitrary code;
    only point this at the trusted artifact shipped with the app.
    """
    return joblib.load(path)


def run():
    """Render the prediction form and, on submit, show the model's verdict."""
    model_pipeline = _load_pipeline("./src/logistic_regression_pipeline.pkl")

    with st.form(key="prediction_form"):
        limit_balance = st.number_input(label='limit balance', value=0, step=1, min_value=0)

        # Display label -> encoded value sent to the model.
        sex_options = {
            "Male": "1",
            "Female": "2",
        }
        sex_label = st.selectbox("Sex", options=list(sex_options), index=None)

        education_options = {
            "Graduate School": "1",
            "University": "2",
            "High School": "3",
            "Others": "4",
        }
        education_label = st.selectbox("Education", options=list(education_options), index=None)

        marital_options = {
            "Married": "1",
            "Single": "2",
            "Others": "3",
        }
        # Label previously read "Martial Status", which disagreed with the
        # validation message shown for this very field.
        maritial_label = st.selectbox("Marital Status", options=list(marital_options), index=None)

        age = st.number_input("Age", value=0, step=1)

        st.write(" === Customer Credit Payment History === ")

        pay_options = {
            "none": "0",
            "pay duly": "-1",
            "payment delay for 1 month": "1",
            "payment delay for 2 month": "2",
            "payment delay for 3 month": "3",
            "payment delay for 4 month": "4",
            "payment delay for 5 month": "5",
            "payment delay for 6 month": "6",
            "payment delay for 7 month": "7",
            "payment delay for 8 month": "8",
            "payment delay for 9 month and above": "9",
        }
        pay_labels = list(pay_options)

        # Widgets are declared oldest month first; that is the on-screen order.
        pay_4_label = st.selectbox("June 2005", options=pay_labels)
        pay_3_label = st.selectbox("July 2005", options=pay_labels)
        pay_2_label = st.selectbox("August 2005", options=pay_labels)
        pay_0_label = st.selectbox("September 2005", options=pay_labels)

        st.write(" === Amount of Bill Statement ===")

        bill_amt_4 = st.number_input("June 2005", value=0, step=1, help="can be minus. ex: -123456")
        bill_amt_3 = st.number_input("July 2005", value=0, step=1, help="can be minus. ex: -123456")
        bill_amt_2 = st.number_input("August 2005", value=0, step=1, help="can be minus. ex: -123456")
        bill_amt_1 = st.number_input("September 2005", value=0, step=1, help="can be minus. ex: -123456")

        st.write(" === Amount of Previous Statement ===")

        pay_amt_4 = st.number_input("June 2005", step=1, min_value=0, help="minimum 0")
        pay_amt_3 = st.number_input("July 2005", step=1, min_value=0, help="minimum 0")
        pay_amt_2 = st.number_input("August 2005", step=1, min_value=0, help="minimum 0")
        pay_amt_1 = st.number_input("September 2005", step=1, min_value=0, help="minimum 0")

        submitted = st.form_submit_button('Predict')

    if not submitted:
        return

    # These selectboxes start with no selection (index=None), so each one has
    # to be checked; the first empty field stops the script run.
    required_fields = (
        ('Sex', sex_label),
        ('Education', education_label),
        ('Marital Status', maritial_label),
    )
    for field_name, selected in required_fields:
        if selected is None:
            st.error(f'{field_name} cannot be empty')
            st.stop()

    new_customer = {
        "LIMIT_BAL": limit_balance,
        "SEX": sex_options[sex_label],  # 1 = male, 2 = female
        "EDUCATION": education_options[education_label],  # 1 = graduate, 2 = university, 3 = high school
        "MARRIAGE": marital_options[maritial_label],  # 1 = married, 2 = single
        "AGE": age,
        "PAY_0": pay_options[pay_0_label],
        "PAY_2": pay_options[pay_2_label],
        "PAY_3": pay_options[pay_3_label],
        "PAY_4": pay_options[pay_4_label],
        "BILL_AMT1": bill_amt_1,
        "BILL_AMT2": bill_amt_2,
        "BILL_AMT3": bill_amt_3,
        "BILL_AMT4": bill_amt_4,
        "PAY_AMT1": pay_amt_1,
        "PAY_AMT2": pay_amt_2,
        "PAY_AMT3": pay_amt_3,
        "PAY_AMT4": pay_amt_4,
    }

    proba_default, predict_label = predict(new_customer, model_pipeline)

    proba_label = "Default" if predict_label == 1 else "Not Default"

    st.write(f"""
    proba_default = {proba_default}
    prediction of default payment next month \n
    prediction probability ={(proba_default * 100)[0]:.3f}% \n
    prediction label = {proba_label}
    """)


if __name__ == '__main__':
    run()
src/streamlit_app.py CHANGED
@@ -1,40 +1,16 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
 
 
 
8
 
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
 
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
  import streamlit as st
2
+ import eda
3
+ import prediction
4
 
5
# Page-wide settings: browser tab title, wide layout, sidebar open on load.
st.set_page_config(
    page_title="Customer Credit Default Prediction",
    layout='wide',
    initial_sidebar_state='expanded',
)

# Sidebar navigation between the two pages of the app.
page = st.sidebar.selectbox('Pilih Page: ', ('EDA', 'Prediction'))

# Any selection other than 'EDA' renders the prediction page.
render_page = eda.run if page == 'EDA' else prediction.run
render_page()