iBrokeTheCode committed on
Commit
bd9910a
Β·
1 Parent(s): 7c1a09b

chore: Complete preprocessing stage

Browse files
Files changed (2) hide show
  1. app.py +62 -12
  2. src/preprocessing.py +87 -0
app.py CHANGED
@@ -16,12 +16,6 @@ def _(mo):
16
  return
17
 
18
 
19
- @app.cell
20
- def _(mo):
21
- mo.md("""## Importing Libraries""")
22
- return
23
-
24
-
25
  @app.cell
26
  def _():
27
  import matplotlib.pyplot as plt
@@ -38,6 +32,7 @@ def _():
38
  )
39
  from src.theme import custom_palette
40
  from src.utils import get_dataset, get_features_target, get_train_test_sets
 
41
  return (
42
  get_dataset,
43
  get_features_target,
@@ -49,26 +44,26 @@ def _():
49
  plot_income_type,
50
  plot_occupation,
51
  plot_target_distribution,
 
52
  )
53
 
54
 
55
  @app.cell
56
- def _(get_dataset, get_features_target, get_train_test_sets):
57
  df = get_dataset()
58
  X, y = get_features_target(df)
59
- X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
60
- return X, X_test, X_train, df
61
 
62
 
63
  @app.cell
64
  def _(mo):
65
- mo.md("""## Exploratory Data Analysis""")
66
  return
67
 
68
 
69
  @app.cell
70
  def _(mo):
71
- mo.md("""### Dataset Information""")
72
  return
73
 
74
 
@@ -174,7 +169,7 @@ def _(X, pd):
174
 
175
  @app.cell
176
  def _(mo):
177
- mo.md("""### Distribution of Variables""")
178
  return
179
 
180
 
@@ -259,5 +254,60 @@ def _(df, plot_income_type):
259
  return
260
 
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  if __name__ == "__main__":
263
  app.run()
 
16
  return
17
 
18
 
 
 
 
 
 
 
19
  @app.cell
20
  def _():
21
  import matplotlib.pyplot as plt
 
32
  )
33
  from src.theme import custom_palette
34
  from src.utils import get_dataset, get_features_target, get_train_test_sets
35
+ from src.preprocessing import preprocess_data
36
  return (
37
  get_dataset,
38
  get_features_target,
 
44
  plot_income_type,
45
  plot_occupation,
46
  plot_target_distribution,
47
+ preprocess_data,
48
  )
49
 
50
 
51
  @app.cell
52
+ def _(get_dataset, get_features_target):
53
  df = get_dataset()
54
  X, y = get_features_target(df)
55
+ return X, df, y
 
56
 
57
 
58
  @app.cell
59
  def _(mo):
60
+ mo.md("""## 1. Exploratory Data Analysis""")
61
  return
62
 
63
 
64
  @app.cell
65
  def _(mo):
66
+ mo.md("""### 1.1 Dataset Information""")
67
  return
68
 
69
 
 
169
 
170
  @app.cell
171
  def _(mo):
172
+ mo.md("""### 1.2 Distribution of Variables""")
173
  return
174
 
175
 
 
254
  return
255
 
256
 
257
+ @app.cell
258
+ def _(mo):
259
+ mo.md("""## 2. Preprocessing""")
260
+ return
261
+
262
+
263
+ @app.cell
264
+ def _(mo):
265
+ mo.md("""**a. Separate Train and Test Datasets**""")
266
+ return
267
+
268
+
269
+ @app.cell
270
+ def _(X, get_train_test_sets, y):
271
+ X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
272
+ X_train.shape, y_train.shape, X_test.shape, y_test.shape
273
+ return X_test, X_train
274
+
275
+
276
+ @app.cell
277
+ def _(mo):
278
+ mo.md("""**b. Preprocess Data**""")
279
+ return
280
+
281
+
282
+ @app.cell
283
+ def _(mo):
284
+ mo.md(
285
+ r"""
286
+ This preprocessing performs:
287
+
288
+ - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
289
+ - Encode string categorical features (`dtype object`).
290
+ - If the feature has 2 categories, Binary Encoding is applied.
291
+ - One Hot Encoding for more than 2 categories.
292
+ - Impute values for all columns with missing data (using median as imputing value).
293
+ - Feature scaling with Min-Max scaler
294
+ """
295
+ )
296
+ return
297
+
298
+
299
+ @app.cell
300
+ def _(X_test, X_train, preprocess_data):
301
+ train_data, test_data = preprocess_data(train_df=X_train, test_df=X_test)
302
+ train_data.shape, test_data.shape
303
+ return
304
+
305
+
306
+ @app.cell
307
+ def _(mo):
308
+ mo.md("## 3. Training Models")
309
+ return
310
+
311
+
312
  if __name__ == "__main__":
313
  app.run()
src/preprocessing.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from numpy import nan, ndarray
2
+ from pandas import DataFrame, concat
3
+ from sklearn.impute import SimpleImputer
4
+ from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
5
+
6
+
7
def preprocess_data(train_df: DataFrame, test_df: DataFrame) -> tuple[ndarray, ndarray]:
    """
    Preprocess data for modeling.

    Receives train and test dataframes, cleans them up, and returns ndarrays
    with feature engineering already performed. Every transformer (encoders,
    imputer, scaler) is fitted on the training data only and then applied to
    both sets, so no information leaks from the test set into training.

    Args:
        train_df (DataFrame): The training dataframe.
        test_df (DataFrame): The test dataframe. Must have the same columns
            as ``train_df``.

    Returns:
        tuple[ndarray, ndarray]: A tuple with the preprocessed train and
        test data as ndarrays.
    """
    aux_train_df = train_df.copy()
    aux_test_df = test_df.copy()

    # πŸ“Œ [1] Correct outliers/anomalous values in numerical columns.
    # 365243 is this dataset's sentinel for "no employment record"; convert it
    # to NaN so the imputer below fills it instead of it skewing the scaler.
    if "DAYS_EMPLOYED" in aux_train_df.columns:
        aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
        aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})

    # πŸ“Œ [2] Encode string categorical features.
    # Split object-dtype columns by cardinality (nunique computed once per
    # column): 2 categories -> single 0/1 code; >2 -> one-hot, which avoids
    # imposing a spurious ordering on the categories.
    n_categories = aux_train_df.select_dtypes(include="object").nunique()
    binary_cols = list(n_categories[n_categories == 2].index)
    multi_cols = list(n_categories[n_categories > 2].index)

    # [2.1] Encode Binary Categorical Features.
    # Guarded: sklearn encoders raise ValueError when fitted on 0 features.
    if binary_cols:
        ordinal_encoder = OrdinalEncoder()
        ordinal_encoder.fit(aux_train_df[binary_cols])
        aux_train_df[binary_cols] = ordinal_encoder.transform(aux_train_df[binary_cols])
        aux_test_df[binary_cols] = ordinal_encoder.transform(aux_test_df[binary_cols])

    # [2.2] Encode Multi Categorical Features
    if multi_cols:
        one_hot_encoder = OneHotEncoder(
            handle_unknown="ignore",  # Prevents errors when the test set contains categories that didn't appear in the train dataframe
            sparse_output=False,  # Returns a dense array instead of a sparse matrix
        )

        one_hot_encoder.fit(aux_train_df[multi_cols])
        ohe_train = one_hot_encoder.transform(aux_train_df[multi_cols])
        ohe_test = one_hot_encoder.transform(aux_test_df[multi_cols])

        # Get column names
        ohe_cols = one_hot_encoder.get_feature_names_out(input_features=multi_cols)

        # Convert arrays to DataFrames (reuse the original index so concat aligns rows)
        ohe_train_df = DataFrame(data=ohe_train, columns=ohe_cols, index=aux_train_df.index)  # type: ignore
        ohe_test_df = DataFrame(data=ohe_test, columns=ohe_cols, index=aux_test_df.index)  # type: ignore

        # Drop original multi-category columns and append the encoded ones
        aux_train_df.drop(columns=multi_cols, inplace=True)
        aux_test_df.drop(columns=multi_cols, inplace=True)
        aux_train_df = concat([aux_train_df, ohe_train_df], axis=1)
        aux_test_df = concat([aux_test_df, ohe_test_df], axis=1)

    # πŸ“Œ [3] Impute values for columns with missing data.
    # Median is robust to the remaining outliers; fitted on train only.
    imputer = SimpleImputer(strategy="median")
    imputer.fit(aux_train_df)

    imputer_train = imputer.transform(aux_train_df)
    imputer_test = imputer.transform(aux_test_df)

    # Rebuild DataFrames so the scaler step keeps column alignment explicit.
    aux_train_df = DataFrame(
        data=imputer_train,  # type: ignore
        columns=aux_train_df.columns,
        index=aux_train_df.index,
    )
    aux_test_df = DataFrame(
        data=imputer_test,  # type: ignore
        columns=aux_test_df.columns,
        index=aux_test_df.index,
    )

    # πŸ“Œ [4] Feature Scaling with Min-Max Scaler (range learned from train only;
    # test values may fall outside [0, 1] if they exceed the train range).
    scaler = MinMaxScaler()
    scaler.fit(aux_train_df)

    scaler_train = scaler.transform(aux_train_df)
    scaler_test = scaler.transform(aux_test_df)

    return scaler_train, scaler_test