Spaces:

iBrokeTheCode
/

Home_Credit_Default_Risk_Prediction

Sleeping

App Files Files Community

iBrokeTheCode committed on Aug 7, 2025

Commit

bd9910a

1 Parent(s): 7c1a09b

chore: Complete preprocessing stage

Browse files

Files changed (2) hide show

app.py +62 -12
src/preprocessing.py +87 -0

app.py CHANGED Viewed

@@ -16,12 +16,6 @@ def _(mo):
     return
-@app.cell
-def _(mo):
-    mo.md("""## Importing Libraries""")
-    return
 @app.cell
 def _():
     import matplotlib.pyplot as plt
@@ -38,6 +32,7 @@ def _():
     )
     from src.theme import custom_palette
     from src.utils import get_dataset, get_features_target, get_train_test_sets
     return (
         get_dataset,
         get_features_target,
@@ -49,26 +44,26 @@ def _():
         plot_income_type,
         plot_occupation,
         plot_target_distribution,
     )
 @app.cell
-def _(get_dataset, get_features_target, get_train_test_sets):
     df = get_dataset()
     X, y = get_features_target(df)
-    X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
-    return X, X_test, X_train, df
 @app.cell
 def _(mo):
-    mo.md("""## Exploratory Data Analysis""")
     return
 @app.cell
 def _(mo):
-    mo.md("""### Dataset Information""")
     return
@@ -174,7 +169,7 @@ def _(X, pd):
 @app.cell
 def _(mo):
-    mo.md("""### Distribution of Variables""")
     return
@@ -259,5 +254,60 @@ def _(df, plot_income_type):
     return
 if __name__ == "__main__":
     app.run()

     return
 @app.cell
 def _():
     import matplotlib.pyplot as plt
     )
     from src.theme import custom_palette
     from src.utils import get_dataset, get_features_target, get_train_test_sets
+    from src.preprocessing import preprocess_data
     return (
         get_dataset,
         get_features_target,
         plot_income_type,
         plot_occupation,
         plot_target_distribution,
+        preprocess_data,
     )
 @app.cell
+def _(get_dataset, get_features_target):
     df = get_dataset()
     X, y = get_features_target(df)
+    return X, df, y
 @app.cell
 def _(mo):
+    mo.md("""## 1. Exploratory Data Analysis""")
     return
 @app.cell
 def _(mo):
+    mo.md("""### 1.1 Dataset Information""")
     return
 @app.cell
 def _(mo):
+    mo.md("""### 1.2 Distribution of Variables""")
     return
     return
+@app.cell
+def _(mo):
+    mo.md("""## 2. Preprocessing""")
+    return
+@app.cell
+def _(mo):
+    mo.md("""**a. Separate Train and Test Datasets**""")
+    return
+@app.cell
+def _(X, get_train_test_sets, y):
+    X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
+    X_train.shape, y_train.shape, X_test.shape, y_test.shape
+    return X_test, X_train
+@app.cell
+def _(mo):
+    mo.md("""**b. Preprocess Data**""")
+    return
+@app.cell
+def _(mo):
+    mo.md(
+        r"""
+    This preprocessing perform:
+    - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
+    - Encode string categorical features (`dtype object`).
+        - If the feature has 2 categories, Binary Encoding is applied.
+        - One Hot Encoding for more than 2 categories.
+    - Impute values for all columns with missing data (using median as imputing value).
+    - Feature scaling with Min-Max scaler
+    """
+    )
+    return
+@app.cell
+def _(X_test, X_train, preprocess_data):
+    train_data, test_data = preprocess_data(train_df=X_train, test_df=X_test)
+    train_data.shape, test_data.shape
+    return
+@app.cell
+def _(mo):
+    mo.md("## 3. Training Models")
+    return
 if __name__ == "__main__":
     app.run()

src/preprocessing.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from numpy import nan, ndarray
+from pandas import DataFrame, concat
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
+def preprocess_data(train_df: DataFrame, test_df: DataFrame) -> tuple[ndarray, ndarray]:
+    """
+    Pre process data for modeling. Receives train and test dataframes, cleans them up, and returns ndarrays with feature engineering already performed.
+    Args:
+        train_df (DataFrame): The training dataframe.
+        test_df (DataFrame): The test dataframe.
+    Returns:
+        tuple[ndarray, ndarray]: A tuple with the preprocessed train and test data as ndarrays
+    """
+    aux_train_df = train_df.copy()
+    aux_test_df = test_df.copy()
+    # 📌 [1] Correct outliers/anomalous values in numerical columns
+    aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
+    aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})
+    # 📌 [2] Encode string categorical features
+    categorical_cols = aux_train_df.select_dtypes(include="object").columns
+    binary_cols = [col for col in categorical_cols if aux_train_df[col].nunique() == 2]
+    multi_cols = [col for col in categorical_cols if aux_train_df[col].nunique() > 2]
+    # [2.1] Encode Binary Categorical Features
+    ordinal_encoder = OrdinalEncoder()
+    ordinal_encoder.fit(aux_train_df[binary_cols])
+    aux_train_df[binary_cols] = ordinal_encoder.transform(aux_train_df[binary_cols])
+    aux_test_df[binary_cols] = ordinal_encoder.transform(aux_test_df[binary_cols])
+    # [2.2] Encode Multi Categorical Features
+    one_hot_encoder = OneHotEncoder(
+        handle_unknown="ignore",  # Prevents errors when test set contain categories that didn't appear in train dataframe
+        sparse_output=False,  # Returns a dense array instead of a sparse matrix
+    )
+    one_hot_encoder.fit(aux_train_df[multi_cols])
+    ohe_train = one_hot_encoder.transform(aux_train_df[multi_cols])
+    ohe_test = one_hot_encoder.transform(aux_test_df[multi_cols])
+    # Get columns names
+    ohe_cols = one_hot_encoder.get_feature_names_out(input_features=multi_cols)
+    # Convert arrays to DataFrames
+    ohe_train_df = DataFrame(data=ohe_train, columns=ohe_cols, index=aux_train_df.index)  # type: ignore
+    ohe_test_df = DataFrame(data=ohe_test, columns=ohe_cols, index=aux_test_df.index)  # type: ignore
+    # Drop original multi category columns
+    aux_train_df.drop(columns=multi_cols, inplace=True)
+    aux_test_df.drop(columns=multi_cols, inplace=True)
+    # Concatenate encoded dataframe
+    aux_train_df = concat([aux_train_df, ohe_train_df], axis=1)
+    aux_test_df = concat([aux_test_df, ohe_test_df], axis=1)
+    # 📌 [3] Impute values for columns with missing data
+    imputer = SimpleImputer(strategy="median")
+    imputer.fit(aux_train_df)
+    imputer_train = imputer.transform(aux_train_df)
+    imputer_test = imputer.transform(aux_test_df)
+    aux_train_df = DataFrame(
+        data=imputer_train,  # type: ignore
+        columns=aux_train_df.columns,
+        index=aux_train_df.index,
+    )
+    aux_test_df = DataFrame(
+        data=imputer_test,  # type: ignore
+        columns=aux_test_df.columns,
+        index=aux_test_df.index,
+    )
+    # 📌 [4]  Feature Scaling with Min-Max Scaler
+    scaler = MinMaxScaler()
+    scaler.fit(aux_train_df)
+    scaler_train = scaler.transform(aux_train_df)
+    scaler_test = scaler.transform(aux_test_df)
+    return scaler_train, scaler_test