Spaces:

Heizsenberg
/

analytic-data-efficiency

Runtime error

App Files Files Community

Heizsenberg committed on Feb 3

Commit

b33b5ec

1 Parent(s): c0e95b4

final

Browse files

Files changed (8) hide show

.gitattributes +1 -0
requirements.txt +8 -2
src/__pycache__/eda.cpython-39.pyc +0 -0
src/__pycache__/prediction.cpython-39.pyc +0 -0
src/eda.py +113 -0
src/logistic_regression_pipeline.pkl +3 -0
src/prediction.py +173 -0
src/streamlit_app.py +12 -36

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.xls

requirements.txt CHANGED Viewed

@@ -1,3 +1,9 @@
-altair
 pandas
-streamlit

+streamlit
 pandas
+numpy
+scikit-learn
+joblib
+matplotlib
+seaborn
+plotly
+pillow

src/__pycache__/eda.cpython-39.pyc ADDED Viewed

Binary file (4.98 kB). View file

src/__pycache__/prediction.cpython-39.pyc ADDED Viewed

Binary file (3.55 kB). View file

src/eda.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import plotly.express as px
+from PIL import Image
+# from ucimlrepo import fetch_ucirepo
+def customer_behavior_exploratory(default_credit_card_df):
+    """Build a per-period repayment-status share matrix from the PAY_* columns.
+
+    The six repayment-status columns are melted into long form, every
+    (period, value) pair is counted, and each count is converted into a
+    percentage of that period's total row count.
+
+    Args:
+        default_credit_card_df: DataFrame that contains the columns PAY_0,
+            PAY_2, PAY_3, PAY_4, PAY_5 and PAY_6.
+
+    Returns:
+        DataFrame indexed by status value. Its columns are a two-level index
+        of ('proportion', period) and each cell is the percentage of that
+        period's rows carrying that status; combinations that never occur
+        are 0 rather than NaN.
+    """
+    pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
+    pay_data_melt = pd.melt(default_credit_card_df[pay_columns], var_name='period')
+    grouped_df = (
+        pay_data_melt
+        .groupby(['period', 'value'])
+        .size()
+        .reset_index(name='count')
+    )
+    grouped_df = grouped_df.sort_values('period', ascending=False)
+    # Sum only the counts: summing the status codes in 'value' is meaningless
+    # and was previously computed just to be dropped again.
+    period_totals = grouped_df.groupby('period')[['count']].sum()
+    # Both sides have a 'count' column, so the merge yields count_x (per
+    # status) and count_y (per-period total).
+    result = pd.merge(grouped_df, period_totals, on='period')
+    result['proportion'] = (result['count_x'] / result['count_y']) * 100
+    result = result.drop(['count_x', 'count_y'], axis=1)
+    grouped_matrixes = result.pivot(index='value', columns='period')
+    # fillna returns a new frame; its result used to be discarded, which left
+    # NaN cells for statuses absent from a period.
+    return grouped_matrixes.fillna(0)
+def show_grouped_matrix(grouped_matrixes):
+    """Render the given matrix as an annotated heatmap in the Streamlit page.
+
+    Args:
+        grouped_matrixes: 2-D numeric table accepted by ``seaborn.heatmap``;
+            each cell is annotated with its value to two decimals.
+    """
+    fig, ax = plt.subplots(figsize=(8, 6))
+    sns.heatmap(
+        grouped_matrixes,
+        cmap="Blues",
+        annot=True,
+        fmt=".2f",
+        ax=ax
+    )
+    ax.set_title("Grouped Matrices Heatmap")
+    st.pyplot(fig)
+    # Figures created through pyplot stay registered until closed; without
+    # this, every Streamlit rerun would keep another figure in memory.
+    plt.close(fig)
+def run():
+    """Render the EDA page: intro, dataset description, category values and heatmap.
+
+    Downloads the credit-default workbook, lists the unique values of a few
+    categorical columns, then shows the repayment-status share heatmap
+    followed by the written insights.
+    """
+    st.title('Customer Credit Default Prediction App')
+    st.subheader("this page contains the EDA about customer payment behavior over time")
+    # Open the banner inside a context manager so the file handle is released
+    # once Streamlit has rendered the image.
+    with Image.open("./src/credit_card.jpg") as image:
+        st.image(image, caption="Credit Card")
+    # write
+    st.write("the EDA will explore and analyse the customer credit default based on customer's payment behavior")
+    # fetch dataset
+    # NOTE: pandas reads legacy .xls workbooks through the optional xlrd
+    # package, so xlrd must be installed in the runtime environment.
+    data = pd.read_excel('https://raw.githubusercontent.com/KevinH2810/csv-customer-credit-default/main/default_of_credit_card_clients.xls')
+    # Promote the first data row to column labels, then drop that row.
+    data.columns = data.iloc[0]
+    data = data.drop(data.index[0])
+    st.write("we will explore each category columns in this dataset to see if there's any anomali or not")
+    st.write("### official informations from the dataset website as follows")
+    st.write("""
+                X1(LIMIT_BAL): Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n
+                X2: Gender (1 = male; 2 = female). \n
+                X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n
+                X4: Marital status (1 = married; 2 = single; 3 = others). \n
+                X5: Age (year). \n
+                X6 - X11(PAY_0 - PAY_6): History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above. \n
+                X12-X17(BILL_AMT1 - BILL_AMT6): Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.  \n
+                X18-X23(PAY_AMT1-PAY_AMT6): Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005 \n
+             """)
+    st.write("due to the same type and content from column PAY_0 to PAY_6, we will be using PAY_0 only as the sample for exploratory")
+    # Only PAY_0 is listed as the representative of the PAY_* family; the
+    # earlier, wider list was overwritten before use and has been removed.
+    categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE','PAY_0']
+    for column in categorical_columns:
+        st.write(f"""
+                 column: {column} \n
+                 unique values = {data[column].unique()}
+                 """)
+    st.write("### Customer Behavior Exploratory")
+    grouped_matrixes = customer_behavior_exploratory(data)
+    show_grouped_matrix(grouped_matrixes)
+    st.write("### Insight")
+    st.write("""
+                - on-time states remain dominant across all periods, with moderate fluctuations across months. \n
+                - the proportion of one-month late payment is higher in PAY_0 compared to earlier periods.\n
+                - PAY_0 period shows a different composition compared to earlier periods, with relatively higher proportions of mild late payment states\n
+                - High severity late payments remain consistently rare across all periods
+             """)
+    st.write("during the exploratory, we found several unknown category in several fields  that dont have any official description. these categories is treated as `Unknown` categories so as to not be removed to avoid unecessary data losses")
+if __name__ == '__main__':
+    run()

src/logistic_regression_pipeline.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4e86262d15a5aa962d40789d492c8643f423c10ee9e1db862766f14a5798175
+size 5857

src/prediction.py ADDED Viewed

	@@ -0,0 +1,173 @@

+import pandas as pd
+import numpy as np
+import joblib
+import streamlit as st
+def predict(new_customer_obj, model_pipeline):
+    """Score a single customer record and echo the input in the page.
+
+    Args:
+        new_customer_obj: Mapping of feature name to value for one customer.
+        model_pipeline: Fitted estimator exposing ``predict_proba`` and
+            ``predict`` that accepts a one-row DataFrame.
+
+    Returns:
+        Tuple ``(probabilities, label)``: column 1 of ``predict_proba`` for
+        the row (a length-1 array) and the first element of ``predict``.
+    """
+    import io
+
+    new_cust_df = pd.DataFrame([new_customer_obj])
+    st.dataframe(new_cust_df)
+    # DataFrame.info() writes its summary to a buffer and returns None, so
+    # capture the text instead of handing None to st.write.
+    info_buffer = io.StringIO()
+    new_cust_df.info(buf=info_buffer)
+    st.text(info_buffer.getvalue())
+    user1_prediction = model_pipeline.predict_proba(new_cust_df)[:, 1]
+    label = model_pipeline.predict(new_cust_df)[0]
+    return user1_prediction, label
+def run():
+    """Render the prediction form and show the model's verdict on submit.
+
+    Loads the serialized pipeline, collects the customer attributes through a
+    Streamlit form, validates that the three required select boxes were
+    filled, then displays the default probability and label from ``predict``.
+    """
+    # load pipeline
+    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
+    # only ever point this at the model artifact bundled with the app.
+    model_pipeline = joblib.load("./src/logistic_regression_pipeline.pkl")
+    # form creation
+    with st.form(key="prediction_form"):
+        limit_balance = st.number_input(label='limit balance', value=0, step=1, min_value=0)
+        # SEX
+        sex_options = {
+            "Male": "1",
+            "Female": "2"
+        }
+        sex_label = st.selectbox(
+            "Sex",
+            options=list(sex_options.keys()),
+            index=None
+        )
+        # EDUCATION
+        education_options = {
+            "Graduate School": "1",
+            "University": "2",
+            "High School": "3",
+            "Others": "4"
+        }
+        education_label = st.selectbox(
+            "Education",
+            options=list(education_options.keys()),
+            index=None
+        )
+        # MARITAL STATUS
+        marital_options = {
+            "Married": "1",
+            "Single": "2",
+            "Others": "3"
+        }
+        # Label corrected from "Martial Status" so it matches the validation
+        # message shown for this field.
+        maritial_label = st.selectbox(
+            "Marital Status",
+            options=list(marital_options.keys()),
+            index=None
+        )
+        age = st.number_input("Age", value=0, step=1)
+        st.write(" === Customer Credit Payment History === ")
+        pay_options = {
+            "none": "0",
+            "pay duly": "-1",
+            "payment delay for 1 month": "1",
+            "payment delay for 2 month": "2",
+            "payment delay for 3 month": "3",
+            "payment delay for 4 month": "4",
+            "payment delay for 5 month": "5",
+            "payment delay for 6 month": "6",
+            "payment delay for 7 month": "7",
+            "payment delay for 8 month": "8",
+            "payment delay for 9 month and above": "9"
+        }
+        pay_4_label = st.selectbox(
+            "June 2005",
+            options=list(pay_options.keys()),
+        )
+        pay_3_label = st.selectbox(
+            "July 2005",
+            options=list(pay_options.keys()),
+        )
+        pay_2_label = st.selectbox(
+            "August 2005",
+            options=list(pay_options.keys()),
+        )
+        pay_0_label = st.selectbox(
+            "September 2005",
+            options=list(pay_options.keys()),
+        )
+        st.write(" === Amount of Bill Statement ===")
+        bill_amt_4 = st.number_input("June 2005", value=0, step=1, help="can be minus. ex: -123456")
+        bill_amt_3 = st.number_input("July 2005", value=0, step=1, help="can be minus. ex: -123456")
+        bill_amt_2 = st.number_input("August 2005", value=0, step=1, help="can be minus. ex: -123456")
+        bill_amt_1 = st.number_input("September 2005", value=0, step=1, help="can be minus. ex: -123456")
+        st.write(" === Amount of Previous Statement ===")
+        pay_amt_4 = st.number_input("June 2005", step=1, min_value=0, help="minimum 0")
+        pay_amt_3 = st.number_input("July 2005", step=1, min_value=0, help="minimum 0")
+        pay_amt_2 = st.number_input("August 2005", step=1, min_value=0, help="minimum 0")
+        pay_amt_1 = st.number_input("September 2005", step=1, min_value=0, help="minimum 0")
+        submitted = st.form_submit_button('Predict')
+        if submitted:
+            # error handling: the three select boxes start empty (index=None),
+            # so report the first one left unfilled and halt this run.
+            required_fields = (
+                (sex_label, 'Sex cannot be empty'),
+                (education_label, 'Education cannot be empty'),
+                (maritial_label, 'Marital Status cannot be empty'),
+            )
+            for chosen, message in required_fields:
+                if chosen is None:
+                    st.error(message)
+                    st.stop()
+            # Translate the human-readable choices back to dataset codes.
+            # NOTE(review): the codes are passed as strings -- confirm the
+            # pipeline was trained on string-typed categories.
+            sex = sex_options[sex_label]
+            education = education_options[education_label]
+            marital_status = marital_options[maritial_label]
+            pay_0 = pay_options[pay_0_label]
+            pay_2 = pay_options[pay_2_label]
+            pay_3 = pay_options[pay_3_label]
+            pay_4 = pay_options[pay_4_label]
+            new_customer = {
+                "LIMIT_BAL": limit_balance,
+                "SEX": sex,            # 1 = male, 2 = female
+                "EDUCATION": education,      # 1 = graduate, 2 = university, 3 = high school
+                "MARRIAGE": marital_status,       # 1 = married, 2 = single
+                "AGE": age,
+                "PAY_0": pay_0,
+                "PAY_2": pay_2,
+                "PAY_3": pay_3,
+                "PAY_4": pay_4,
+                "BILL_AMT1": bill_amt_1,
+                "BILL_AMT2": bill_amt_2,
+                "BILL_AMT3": bill_amt_3,
+                "BILL_AMT4": bill_amt_4,
+                "PAY_AMT1": pay_amt_1,
+                "PAY_AMT2": pay_amt_2,
+                "PAY_AMT3": pay_amt_3,
+                "PAY_AMT4": pay_amt_4
+            }
+            proba_default, predict_label = predict(new_customer, model_pipeline)
+            proba_label = "Default" if predict_label == 1 else "Not Default"
+            st.write(f"""
+                     proba_default = {proba_default}
+                     prediction of default payment next month \n
+                     prediction probability ={(proba_default * 100)[0]:.3f}% \n
+                     prediction label = {proba_label}
+                     """)
+if __name__ == '__main__':
+    run()

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,16 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import eda
+import prediction
+# Page-level settings: tab title, wide layout, sidebar opened by default.
+st.set_page_config(
+    page_title="Customer Credit Default Prediction",
+    layout='wide',
+    initial_sidebar_state='expanded',
+)
+# The sidebar selector decides which page module renders; any choice other
+# than 'EDA' routes to the prediction page.
+page = st.sidebar.selectbox('Pilih Page: ', ('EDA', 'Prediction'))
+render_page = eda.run if page == 'EDA' else prediction.run
+render_page()