iBrokeTheCode committed on
Commit
0df7f5d
·
1 Parent(s): 19ad66f

chore: Complete EDA

Browse files
Files changed (2) hide show
  1. app.py +103 -39
  2. src/plots.py +176 -0
app.py CHANGED
@@ -25,19 +25,30 @@ def _(mo):
25
  @app.cell
26
  def _():
27
  import matplotlib.pyplot as plt
28
- import seaborn as sns
29
  import pandas as pd
 
30
 
31
- from src.utils import get_dataset, get_features_target, get_train_test_sets
 
 
 
 
 
 
 
32
  from src.theme import custom_palette
 
33
  return (
34
- custom_palette,
35
  get_dataset,
36
  get_features_target,
37
  get_train_test_sets,
38
  pd,
39
- plt,
40
- sns,
 
 
 
 
41
  )
42
 
43
 
@@ -69,9 +80,11 @@ def _(mo):
69
 
70
  @app.cell
71
  def _(X_test, X_train, df):
72
- print("Train dataset samples: {}".format(X_train.shape[0]))
73
- print("Test dataset samples: {}".format(X_test.shape[0]))
74
- print("Number of columns: {}".format(df.shape[1]))
 
 
75
  return
76
 
77
 
@@ -108,40 +121,21 @@ def _(mo):
108
 
109
 
110
  @app.cell
111
- def _(custom_palette, df, plt, sns):
112
- # Get value counts and percentages
113
- target_counts = df["TARGET"].value_counts()
114
- target_percent = (target_counts / target_counts.sum() * 100).round(2)
115
-
116
- # Combine into a DataFrame for clarity
117
- target_df = target_counts.to_frame(name="Count")
118
- target_df["Percentage"] = target_percent
119
 
120
- # Plot
121
- plt.figure(figsize=(8, 5))
122
- ax = sns.barplot(
123
- data=target_df,
124
- x="TARGET",
125
- y="Count",
126
- hue="TARGET",
127
- palette=custom_palette[:2],
128
- )
129
-
130
- # Titles and formatting
131
- plt.title("Distribution of TARGET variable")
132
- plt.xlabel("Payment Difficulties (1 = Yes, 0 = No)", fontsize=12)
133
- plt.ylabel("Count", fontsize=12)
134
- plt.grid(axis="y", linestyle="--", alpha=0.4)
135
- plt.tight_layout()
136
- plt.show()
137
 
138
- target_df
 
 
139
  return
140
 
141
 
142
  @app.cell
143
  def _(mo):
144
- mo.md("**e. Number of columns of each data type**")
145
  return
146
 
147
 
@@ -162,7 +156,7 @@ def _(X):
162
 
163
  @app.cell
164
  def _(mo):
165
- mo.md("**f. Missing data**")
166
  return
167
 
168
 
@@ -180,18 +174,88 @@ def _(X, pd):
180
 
181
  @app.cell
182
  def _(mo):
183
- mo.md("### Distribution of Variables")
184
  return
185
 
186
 
187
  @app.cell
188
  def _(mo):
189
- mo.callout(kind="info", value="Continues at point 1.9")
190
  return
191
 
192
 
193
  @app.cell
194
- def _():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  return
196
 
197
 
 
25
  @app.cell
26
  def _():
27
  import matplotlib.pyplot as plt
 
28
  import pandas as pd
29
+ import seaborn as sns
30
 
31
+ from src.plots import (
32
+ plot_target_distribution,
33
+ plot_credit_amounts,
34
+ plot_education_levels,
35
+ plot_occupation,
36
+ plot_family_status,
37
+ plot_income_type,
38
+ )
39
  from src.theme import custom_palette
40
+ from src.utils import get_dataset, get_features_target, get_train_test_sets
41
  return (
 
42
  get_dataset,
43
  get_features_target,
44
  get_train_test_sets,
45
  pd,
46
+ plot_credit_amounts,
47
+ plot_education_levels,
48
+ plot_family_status,
49
+ plot_income_type,
50
+ plot_occupation,
51
+ plot_target_distribution,
52
  )
53
 
54
 
 
80
 
81
  @app.cell
82
  def _(X_test, X_train, df):
83
+ train_samples = "Train dataset samples: {}".format(X_train.shape[0])
84
+ test_samples = "Test dataset samples: {}".format(X_test.shape[0])
85
+ columns_number = "Number of columns: {}".format(df.shape[1])
86
+
87
+ train_samples, test_samples, columns_number
88
  return
89
 
90
 
 
121
 
122
 
123
  @app.cell
124
+ def _(df, plot_target_distribution):
125
+ target_table, target_plot = plot_target_distribution(df=df)
126
+ target_table
127
+ return (target_plot,)
 
 
 
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ @app.cell
131
+ def _(target_plot):
132
+ target_plot
133
  return
134
 
135
 
136
  @app.cell
137
  def _(mo):
138
+ mo.md("""**e. Number of columns of each data type**""")
139
  return
140
 
141
 
 
156
 
157
  @app.cell
158
  def _(mo):
159
+ mo.md("""**f. Missing data**""")
160
  return
161
 
162
 
 
174
 
175
  @app.cell
176
  def _(mo):
177
+ mo.md("""### Distribution of Variables""")
178
  return
179
 
180
 
181
  @app.cell
182
  def _(mo):
183
+ mo.md("""**a. Credit Amounts**""")
184
  return
185
 
186
 
187
  @app.cell
188
+ def _(X, plot_credit_amounts):
189
+ plot_credit_amounts(df=X)
190
+ return
191
+
192
+
193
+ @app.cell
194
+ def _(mo):
195
+ mo.md("""**b. Education Level of Credit Applicants**""")
196
+ return
197
+
198
+
199
+ @app.cell
200
+ def _(X, plot_education_levels):
201
+ education_table, education_plot = plot_education_levels(df=X)
202
+ education_table
203
+ return (education_plot,)
204
+
205
+
206
+ @app.cell
207
+ def _(education_plot):
208
+ education_plot
209
+ return
210
+
211
+
212
@app.cell
def _(mo):
    # Markdown heading for the occupation distribution section.
    mo.md("""**c. Occupation of Credit Applicants**""")
    return
216
+
217
+
218
+ @app.cell
219
+ def _(X, plot_occupation):
220
+ occupation_table, occupation_plot = plot_occupation(df=X)
221
+ occupation_table
222
+ return (occupation_plot,)
223
+
224
+
225
+ @app.cell
226
+ def _(occupation_plot):
227
+ occupation_plot
228
+ return
229
+
230
+
231
+ @app.cell
232
+ def _(mo):
233
+ mo.md("""**d. Family Status of Applicants**""")
234
+ return
235
+
236
+
237
+ @app.cell
238
+ def _(X, plot_family_status):
239
+ family_status_table, family_status_plot = plot_family_status(df=X)
240
+ family_status_table
241
+ return (family_status_plot,)
242
+
243
+
244
+ @app.cell
245
+ def _(family_status_plot):
246
+ family_status_plot
247
+ return
248
+
249
+
250
+ @app.cell
251
+ def _(mo):
252
+ mo.md("""**e. Income Type of Applicants by Target Variable**""")
253
+ return
254
+
255
+
256
+ @app.cell
257
+ def _(df, plot_income_type):
258
+ plot_income_type(df=df)
259
  return
260
 
261
 
src/plots.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ from matplotlib.figure import Figure
4
+ from pandas import DataFrame, Series
5
+
6
+ from src.theme import custom_palette
7
+
8
+
9
def plot_target_distribution(df: DataFrame) -> tuple[DataFrame, Figure]:
    """
    Plot the distribution of the 'TARGET' column in a DataFrame.

    Args:
        df (DataFrame): The input DataFrame containing the 'TARGET' column.

    Returns:
        DataFrame: A DataFrame indexed by 'TARGET' value with a 'Count' column
            and a 'Percentage' column (share of the counted rows, rounded to
            2 decimals).
        Figure: The matplotlib Figure object containing the bar plot.
    """
    # Name the index explicitly: the plot below looks "TARGET" up by index
    # name, and (as far as the pandas changelog indicates) releases before
    # 2.0 leave the value_counts index unnamed.
    target_counts = df["TARGET"].value_counts().rename_axis("TARGET")
    target_percent = (target_counts / target_counts.sum() * 100).round(2)

    # Combine into a DataFrame for clarity
    target_df = target_counts.to_frame(name="Count")
    target_df["Percentage"] = target_percent

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(
        data=target_df,
        x="TARGET",
        y="Count",
        hue="TARGET",
        palette=custom_palette[:2],
        # Draw on the axes created above instead of relying on pyplot's
        # global "current axes", so the returned figure is always the one
        # that holds the bars.
        ax=ax,
    )

    # Titles and formatting
    ax.set_xlabel("Payment Difficulties (1 = Yes, 0 = No)", fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    fig.tight_layout()

    return target_df, fig
43
+
44
+
45
def plot_credit_amounts(df: DataFrame) -> Figure:
    """
    Plot a histogram of credit amounts.

    Args:
        df (DataFrame): The DataFrame containing the 'AMT_CREDIT' column.

    Returns:
        Figure: The matplotlib figure object containing the histogram
            (100 bins, with a KDE curve overlaid).
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data=df,
        x="AMT_CREDIT",
        bins=100,
        kde=True,
        color=custom_palette[0],
        # Target the axes created above explicitly rather than pyplot's
        # implicit current axes.
        ax=ax,
    )
    ax.grid(axis="y", linestyle="--", alpha=0.5)
    fig.tight_layout()

    return fig
61
+
62
+
63
def plot_education_levels(df: DataFrame) -> tuple[DataFrame, Figure]:
    """
    Plot a bar chart of education levels.

    Args:
        df (DataFrame): The DataFrame containing the 'NAME_EDUCATION_TYPE'
            column.

    Returns:
        DataFrame: Education levels sorted by descending count, with a 'Count'
            column and a 'Percentage' column (share of all rows in ``df``,
            rounded to 2 decimals).
        Figure: The matplotlib figure object containing the plot.
    """
    education_count = (
        df["NAME_EDUCATION_TYPE"].value_counts().sort_values(ascending=False)
    )
    # The denominator is the total number of rows, not the number of counted
    # (non-missing) values.
    education_percentage = (education_count / df.shape[0] * 100).round(2)

    education_df = education_count.to_frame(name="Count")
    education_df["Percentage"] = education_percentage

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(
        data=df,
        y="NAME_EDUCATION_TYPE",
        hue="NAME_EDUCATION_TYPE",
        # Show the bars in the same descending-count order as the returned
        # table instead of the order in which levels first appear in ``df``.
        order=list(education_count.index),
        palette=custom_palette[:5],
        # Draw on the axes created above rather than pyplot's implicit
        # current axes.
        ax=ax,
    )
    ax.set_xlabel("Count")
    ax.set_ylabel("Education Level")
    ax.grid(axis="x", linestyle="--", alpha=0.5)
    fig.tight_layout()

    return education_df, fig
95
+
96
+
97
def plot_occupation(df: DataFrame) -> tuple[Series, Figure]:
    """
    Plot the distribution of occupations in the dataset.

    Args:
        df (DataFrame): The DataFrame containing the 'OCCUPATION_TYPE' column.

    Returns:
        Series: The count of each occupation in descending order; missing
            values are counted as their own entry.
        Figure: A Matplotlib Figure object containing the plot.
    """
    occupation_counts = df["OCCUPATION_TYPE"].value_counts(
        dropna=False, ascending=False
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    # NOTE(review): the counts keep a NaN entry (dropna=False); seaborn
    # presumably omits that category from the chart, so the table may list
    # one more row than the plot shows — confirm and label it if desired.
    sns.barplot(
        x=occupation_counts.values,
        y=occupation_counts.index,
        hue=occupation_counts.index,
        legend=False,
        # Draw on the axes created above rather than pyplot's implicit
        # current axes.
        ax=ax,
    )
    ax.set_xlabel("Number of Applicants")
    ax.set_ylabel("Occupation")
    ax.grid(axis="x", linestyle="--", alpha=0.5)
    fig.tight_layout()

    return occupation_counts, fig
123
+
124
+
125
def plot_family_status(df: DataFrame) -> tuple[Series, Figure]:
    """
    Plot the distribution of family statuses in the dataset.

    Args:
        df (DataFrame): The DataFrame containing the 'NAME_FAMILY_STATUS'
            column.

    Returns:
        Series: The count of each family status in descending order; missing
            values are counted as their own entry.
        Figure: A Matplotlib Figure object containing the plot.
    """
    family_status_counts = df["NAME_FAMILY_STATUS"].value_counts(
        dropna=False, ascending=False
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x=family_status_counts.values,
        y=family_status_counts.index,
        hue=family_status_counts.index,
        palette=custom_palette[:6],
        legend=False,
        # Draw on the axes created above rather than pyplot's implicit
        # current axes.
        ax=ax,
    )
    ax.set_xlabel("Number of Applicants")
    ax.set_ylabel("Family Status")
    ax.grid(axis="x", linestyle="--", alpha=0.5)
    fig.tight_layout()

    return family_status_counts, fig
154
+
155
+
156
def plot_income_type(df: DataFrame) -> Figure:
    """
    Plot the count of income types for each target group.

    Args:
        df (DataFrame): The DataFrame containing the 'NAME_INCOME_TYPE' and
            'TARGET' columns.

    Returns:
        Figure: A Matplotlib Figure object containing the plot.
    """
    fig, ax1 = plt.subplots(figsize=(10, 6))
    sns.countplot(
        data=df,
        y="NAME_INCOME_TYPE",
        hue="TARGET",
        palette=custom_palette[:2],
        # Draw on the axes created above rather than pyplot's implicit
        # current axes, so the legend/labels below apply to the same axes.
        ax=ax1,
    )
    ax1.legend(loc="lower right", title="Target")
    ax1.set_xlabel("Number of Applicants")
    ax1.set_ylabel("Income Type")
    ax1.grid(axis="x", linestyle="--", alpha=0.5)
    fig.tight_layout()

    return fig