Commit ·
c742ac4
1
Parent(s): d77c733
chore: Save LightGBM model
Browse files- .gitattributes +1 -0
- app.py +1 -1
- app_bk.py +728 -0
- model/lgbm_model.joblib +3 -0
- tutorial_app.ipynb +619 -11
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
home_credit_dataset.csv filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
home_credit_dataset.csv filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
lgbm_model.joblib filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -689,7 +689,7 @@ def _(mo):
|
|
| 689 |
|
| 690 |
@app.cell
|
| 691 |
def _(mo):
|
| 692 |
-
mo.md(r"""## 5.
|
| 693 |
return
|
| 694 |
|
| 695 |
|
|
|
|
| 689 |
|
| 690 |
@app.cell
|
| 691 |
def _(mo):
|
| 692 |
+
mo.md(r"""## 5. Model Selection""")
|
| 693 |
return
|
| 694 |
|
| 695 |
|
app_bk.py
ADDED
|
@@ -0,0 +1,728 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import marimo
|
| 2 |
+
|
| 3 |
+
__generated_with = "0.14.16"
|
| 4 |
+
app = marimo.App()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@app.cell
|
| 8 |
+
def _():
|
| 9 |
+
import marimo as mo
|
| 10 |
+
return (mo,)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@app.cell
|
| 14 |
+
def _(mo):
|
| 15 |
+
mo.center(mo.md("# Home Credit Default Risk Prediction"))
|
| 16 |
+
return
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@app.cell
|
| 20 |
+
def _():
|
| 21 |
+
import pandas as pd
|
| 22 |
+
|
| 23 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 24 |
+
from sklearn.linear_model import LogisticRegression
|
| 25 |
+
from sklearn.metrics import roc_auc_score
|
| 26 |
+
from sklearn.model_selection import RandomizedSearchCV
|
| 27 |
+
|
| 28 |
+
from sklearn.pipeline import Pipeline
|
| 29 |
+
from sklearn.compose import ColumnTransformer
|
| 30 |
+
from sklearn.impute import SimpleImputer
|
| 31 |
+
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
|
| 32 |
+
|
| 33 |
+
from lightgbm import LGBMClassifier
|
| 34 |
+
|
| 35 |
+
from src.plots import (
|
| 36 |
+
plot_target_distribution,
|
| 37 |
+
plot_credit_amounts,
|
| 38 |
+
plot_education_levels,
|
| 39 |
+
plot_occupation,
|
| 40 |
+
plot_family_status,
|
| 41 |
+
plot_income_type,
|
| 42 |
+
)
|
| 43 |
+
from src.utils import get_dataset, get_features_target, get_train_test_sets
|
| 44 |
+
from src.preprocessing import preprocess_data_pipeline
|
| 45 |
+
return (
|
| 46 |
+
get_dataset,
|
| 47 |
+
get_features_target,
|
| 48 |
+
get_train_test_sets,
|
| 49 |
+
pd,
|
| 50 |
+
plot_credit_amounts,
|
| 51 |
+
plot_education_levels,
|
| 52 |
+
plot_family_status,
|
| 53 |
+
plot_income_type,
|
| 54 |
+
plot_occupation,
|
| 55 |
+
plot_target_distribution,
|
| 56 |
+
preprocess_data_pipeline,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@app.cell
|
| 61 |
+
def _(get_dataset, get_features_target):
|
| 62 |
+
df = get_dataset()
|
| 63 |
+
X, y = get_features_target(df)
|
| 64 |
+
return X, df, y
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.cell
|
| 68 |
+
def _(mo):
|
| 69 |
+
mo.md("""## 1. Exploratory Data Analysis""")
|
| 70 |
+
return
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@app.cell
|
| 74 |
+
def _(mo):
|
| 75 |
+
mo.callout(
|
| 76 |
+
kind="info",
|
| 77 |
+
value=mo.md(
|
| 78 |
+
"""💡 **Want a step-by-step walkthrough instead?**
|
| 79 |
+
Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
|
| 80 |
+
),
|
| 81 |
+
)
|
| 82 |
+
return
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@app.cell
|
| 86 |
+
def _(mo):
|
| 87 |
+
mo.md("""### 1.1 Dataset Information""")
|
| 88 |
+
return
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@app.cell
|
| 92 |
+
def _(mo):
|
| 93 |
+
mo.md("""**a. Shape of the train and test datasets**""")
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.cell
|
| 98 |
+
def _(X_test, X_train, df):
|
| 99 |
+
train_samples = "Train dataset samples: {}".format(X_train.shape[0])
|
| 100 |
+
test_samples = "Test dataset samples: {}".format(X_test.shape[0])
|
| 101 |
+
columns_number = "Number of columns: {}".format(df.shape[1])
|
| 102 |
+
|
| 103 |
+
train_samples, test_samples, columns_number
|
| 104 |
+
return
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@app.cell
|
| 108 |
+
def _(mo):
|
| 109 |
+
mo.md("""**b. Dataset features**""")
|
| 110 |
+
return
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@app.cell
|
| 114 |
+
def _(X):
|
| 115 |
+
X.columns
|
| 116 |
+
return
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@app.cell
|
| 120 |
+
def _(mo):
|
| 121 |
+
mo.md("""**c. Sample from dataset**""")
|
| 122 |
+
return
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@app.cell
|
| 126 |
+
def _(X):
|
| 127 |
+
sample = X.head(5).T
|
| 128 |
+
sample.columns = [
|
| 129 |
+
str(col) for col in sample.columns
|
| 130 |
+
] # fix integer name warning
|
| 131 |
+
sample = sample.astype(str) # avoid numeric conversion issues in viewer
|
| 132 |
+
sample
|
| 133 |
+
return
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@app.cell
|
| 137 |
+
def _(mo):
|
| 138 |
+
mo.md("""**d. Target variable Distribution**""")
|
| 139 |
+
return
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@app.cell
|
| 143 |
+
def _(df, plot_target_distribution):
|
| 144 |
+
target_table, target_plot = plot_target_distribution(df=df)
|
| 145 |
+
target_table
|
| 146 |
+
return (target_plot,)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@app.cell
|
| 150 |
+
def _(target_plot):
|
| 151 |
+
target_plot
|
| 152 |
+
return
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@app.cell
|
| 156 |
+
def _(mo):
|
| 157 |
+
mo.md("""**e. Number of columns of each data type**""")
|
| 158 |
+
return
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
@app.cell
|
| 162 |
+
def _(X):
|
| 163 |
+
X.dtypes.value_counts().sort_values(ascending=False)
|
| 164 |
+
return
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
@app.cell
|
| 168 |
+
def _(X):
|
| 169 |
+
categorical_cols = (
|
| 170 |
+
X.select_dtypes(include=["object"]).nunique().sort_values(ascending=False)
|
| 171 |
+
)
|
| 172 |
+
categorical_cols
|
| 173 |
+
return
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@app.cell
|
| 177 |
+
def _(mo):
|
| 178 |
+
mo.md("""**f. Missing data**""")
|
| 179 |
+
return
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@app.cell
|
| 183 |
+
def _(X, pd):
|
| 184 |
+
missing_count = X.isna().sum().sort_values(ascending=False)
|
| 185 |
+
missing_percentage = (missing_count / X.shape[0] * 100).round(2)
|
| 186 |
+
|
| 187 |
+
missing_data = pd.DataFrame(
|
| 188 |
+
data={"Count": missing_count, "percentage": missing_percentage}
|
| 189 |
+
)
|
| 190 |
+
missing_data
|
| 191 |
+
return
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
@app.cell
|
| 195 |
+
def _(mo):
|
| 196 |
+
mo.md("""### 1.2 Distribution of Variables""")
|
| 197 |
+
return
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
@app.cell
|
| 201 |
+
def _(mo):
|
| 202 |
+
mo.md(
|
| 203 |
+
r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."""
|
| 204 |
+
)
|
| 205 |
+
return
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
@app.cell
|
| 209 |
+
def _(mo):
|
| 210 |
+
mo.md("""**a. Credit Amounts**""")
|
| 211 |
+
return
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
@app.cell
|
| 215 |
+
def _(X, plot_credit_amounts):
|
| 216 |
+
plot_credit_amounts(df=X)
|
| 217 |
+
return
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
@app.cell
|
| 221 |
+
def _(mo):
|
| 222 |
+
mo.md("""**b. Education Level of Credit Applicants**""")
|
| 223 |
+
return
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
@app.cell
|
| 227 |
+
def _(X, plot_education_levels):
|
| 228 |
+
education_table, education_plot = plot_education_levels(df=X)
|
| 229 |
+
education_table
|
| 230 |
+
return (education_plot,)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@app.cell
|
| 234 |
+
def _(education_plot):
|
| 235 |
+
education_plot
|
| 236 |
+
return
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
@app.cell
|
| 240 |
+
def _(mo):
|
| 241 |
+
mo.md("""**c. Occupation of Credit Applicants**""")
|
| 242 |
+
return
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
@app.cell
|
| 246 |
+
def _(X, plot_occupation):
|
| 247 |
+
occupation_table, occupation_plot = plot_occupation(df=X)
|
| 248 |
+
occupation_table
|
| 249 |
+
return (occupation_plot,)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
@app.cell
|
| 253 |
+
def _(occupation_plot):
|
| 254 |
+
occupation_plot
|
| 255 |
+
return
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
@app.cell
|
| 259 |
+
def _(mo):
|
| 260 |
+
mo.md("""**d. Family Status of Applicants**""")
|
| 261 |
+
return
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
@app.cell
|
| 265 |
+
def _(X, plot_family_status):
|
| 266 |
+
family_status_table, family_status_plot = plot_family_status(df=X)
|
| 267 |
+
family_status_table
|
| 268 |
+
return (family_status_plot,)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
@app.cell
|
| 272 |
+
def _(family_status_plot):
|
| 273 |
+
family_status_plot
|
| 274 |
+
return
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
@app.cell
|
| 278 |
+
def _(mo):
|
| 279 |
+
mo.md("""**e. Income Type of Applicants by Target Variable**""")
|
| 280 |
+
return
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
@app.cell
|
| 284 |
+
def _(df, plot_income_type):
|
| 285 |
+
plot_income_type(df=df)
|
| 286 |
+
return
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
@app.cell
|
| 290 |
+
def _(mo):
|
| 291 |
+
mo.md("""## 2. Preprocessing""")
|
| 292 |
+
return
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
@app.cell
|
| 296 |
+
def _(mo):
|
| 297 |
+
mo.md("""**a. Separate Train and Test Datasets**""")
|
| 298 |
+
return
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
@app.cell
|
| 302 |
+
def _(X, get_train_test_sets, y):
|
| 303 |
+
X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
|
| 304 |
+
X_train.shape, y_train.shape, X_test.shape, y_test.shape
|
| 305 |
+
return X_test, X_train
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
@app.cell
|
| 309 |
+
def _(mo):
|
| 310 |
+
mo.md("""**b. Preprocess Data**""")
|
| 311 |
+
return
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
@app.cell
|
| 315 |
+
def _(mo):
|
| 316 |
+
mo.md(
|
| 317 |
+
r"""
|
| 318 |
+
This preprocessing performs the following steps:
|
| 319 |
+
|
| 320 |
+
- Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
|
| 321 |
+
- Encode string categorical features (`dtype object`).
|
| 322 |
+
- If the feature has 2 categories, Binary Encoding is applied.
|
| 323 |
+
- One Hot Encoding for more than 2 categories.
|
| 324 |
+
- Impute values for all columns with missing data (using median as imputing value).
|
| 325 |
+
- Feature scaling with Min-Max scaler
|
| 326 |
+
|
| 327 |
+
Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
|
| 328 |
+
"""
|
| 329 |
+
)
|
| 330 |
+
return
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
@app.cell
|
| 334 |
+
def _(X_test, X_train, preprocess_data_pipeline):
|
| 335 |
+
train_data, test_data = preprocess_data_pipeline(
|
| 336 |
+
train_df=X_train, test_df=X_test
|
| 337 |
+
)
|
| 338 |
+
train_data.shape, test_data.shape
|
| 339 |
+
return
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
@app.cell
|
| 343 |
+
def _(mo):
|
| 344 |
+
mo.md("""## 3. Training Models""")
|
| 345 |
+
return
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
@app.cell
|
| 349 |
+
def _(mo):
|
| 350 |
+
mo.md(
|
| 351 |
+
r"""At this point, we will work with `train_data` and `test_data` as the feature sets, and `y_train` and `y_test` as the target sets."""
|
| 352 |
+
)
|
| 353 |
+
return
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
@app.cell
|
| 357 |
+
def _(mo):
|
| 358 |
+
mo.md(r"""### 3.1 Logistic Regression""")
|
| 359 |
+
return
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
@app.cell
|
| 363 |
+
def _(mo):
|
| 364 |
+
mo.callout(
|
| 365 |
+
mo.md("""
|
| 366 |
+
In Logistic Regression, C is the inverse of regularization strength:
|
| 367 |
+
|
| 368 |
+
- **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
|
| 369 |
+
- **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
|
| 370 |
+
"""),
|
| 371 |
+
kind="info",
|
| 372 |
+
)
|
| 373 |
+
return
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
@app.cell
|
| 377 |
+
def _(mo):
|
| 378 |
+
mo.md(
|
| 379 |
+
r"""
|
| 380 |
+
We trained our Logistic Regression model using the following code:
|
| 381 |
+
|
| 382 |
+
```py
|
| 383 |
+
# 📌 Logistic Regression
|
| 384 |
+
log_reg = LogisticRegression(C=0.0001)
|
| 385 |
+
log_reg.fit(train_data, y_train)
|
| 386 |
+
|
| 387 |
+
# Train data prediction (class 1)
|
| 388 |
+
lr_train_pred = log_reg.predict_proba(train_data)[:, 1]
|
| 389 |
+
|
| 390 |
+
# Test data prediction (class 1)
|
| 391 |
+
lr_test_pred = log_reg.predict_proba(test_data)[:, 1]
|
| 392 |
+
|
| 393 |
+
# Get the ROC AUC Score on train and test datasets
|
| 394 |
+
log_reg_scores = {
|
| 395 |
+
"train_score": roc_auc_score(y_train, lr_train_pred),
|
| 396 |
+
"test_score": roc_auc_score(y_test, lr_test_pred),
|
| 397 |
+
}
|
| 398 |
+
log_reg_scores
|
| 399 |
+
```
|
| 400 |
+
|
| 401 |
+
📈 The ROC AUC scores obtained:
|
| 402 |
+
"""
|
| 403 |
+
)
|
| 404 |
+
return
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
@app.cell
|
| 408 |
+
def _():
|
| 409 |
+
lr_scores = {
|
| 410 |
+
"train_score": 0.6868418961663535,
|
| 411 |
+
"test_score": 0.6854973003347028,
|
| 412 |
+
}
|
| 413 |
+
lr_scores
|
| 414 |
+
return
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
@app.cell
|
| 418 |
+
def _(mo):
|
| 419 |
+
mo.md(r"""### 3.2 Random Forest Classifier""")
|
| 420 |
+
return
|
| 421 |
+
|
| 422 |
+
|
| 423 |
+
@app.cell
|
| 424 |
+
def _(mo):
|
| 425 |
+
mo.md(
|
| 426 |
+
r"""
|
| 427 |
+
We trained our Random Forest Classifier model using the following code:
|
| 428 |
+
|
| 429 |
+
```py
|
| 430 |
+
# 📌 Random Forest Classifier
|
| 431 |
+
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 432 |
+
rf.fit(train_data, y_train)
|
| 433 |
+
|
| 434 |
+
rf_train_pred = rf.predict_proba(train_data)[:, 1]
|
| 435 |
+
rf_test_pred = rf.predict_proba(test_data)[:, 1]
|
| 436 |
+
|
| 437 |
+
rf_scores = {
|
| 438 |
+
"train_score": roc_auc_score(y_train, rf_train_pred),
|
| 439 |
+
"test_score": roc_auc_score(y_test, rf_test_pred),
|
| 440 |
+
}
|
| 441 |
+
rf_scores
|
| 442 |
+
```
|
| 443 |
+
|
| 444 |
+
📈 The ROC AUC scores obtained:
|
| 445 |
+
"""
|
| 446 |
+
)
|
| 447 |
+
return
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
@app.cell
|
| 451 |
+
def _():
|
| 452 |
+
rf_scores = {"train_score": 1.0, "test_score": 0.7066811557903828}
|
| 453 |
+
rf_scores
|
| 454 |
+
return
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
@app.cell
|
| 458 |
+
def _(mo):
|
| 459 |
+
mo.md(r"""### 3.3 Randomized Search with Cross-Validation""")
|
| 460 |
+
return
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
@app.cell
|
| 464 |
+
def _(mo):
|
| 465 |
+
mo.md(
|
| 466 |
+
r"""
|
| 467 |
+
We trained the Randomized Search CV using the following code:
|
| 468 |
+
|
| 469 |
+
```py
|
| 470 |
+
# 📌 RandomizedSearchCV
|
| 471 |
+
param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
|
| 472 |
+
|
| 473 |
+
rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
|
| 474 |
+
rscv = RandomizedSearchCV(
|
| 475 |
+
estimator=rf_optimized,
|
| 476 |
+
param_distributions=param_dist,
|
| 477 |
+
n_iter=5,
|
| 478 |
+
scoring="roc_auc",
|
| 479 |
+
cv=3,
|
| 480 |
+
random_state=42,
|
| 481 |
+
n_jobs=-1,
|
| 482 |
+
)
|
| 483 |
+
|
| 484 |
+
rscv.fit(train_data, y_train)
|
| 485 |
+
|
| 486 |
+
rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
|
| 487 |
+
rfo_test_pred = rscv.predict_proba(test_data)[:, 1]
|
| 488 |
+
|
| 489 |
+
rfo_scores = {
|
| 490 |
+
"train_score": roc_auc_score(y_train, rfo_train_pred),
|
| 491 |
+
"test_score": roc_auc_score(y_test, rfo_test_pred),
|
| 492 |
+
}
|
| 493 |
+
rfo_scores
|
| 494 |
+
```
|
| 495 |
+
|
| 496 |
+
📈 The ROC AUC scores obtained:
|
| 497 |
+
"""
|
| 498 |
+
)
|
| 499 |
+
return
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@app.cell
|
| 503 |
+
def _():
|
| 504 |
+
rfo_scores = {
|
| 505 |
+
"train_score": 0.8196620915431655,
|
| 506 |
+
"test_score": 0.7308385425476998,
|
| 507 |
+
}
|
| 508 |
+
rfo_scores
|
| 509 |
+
return
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
@app.cell
|
| 513 |
+
def _(mo):
|
| 514 |
+
mo.md(r"""🥇The best results:""")
|
| 515 |
+
return
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
@app.cell
|
| 519 |
+
def _():
|
| 520 |
+
optimized_results = {
|
| 521 |
+
"best_params_": {"n_estimators": 100, "max_depth": 10},
|
| 522 |
+
"best_score_": 0.7296259755147781,
|
| 523 |
+
"best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
|
| 524 |
+
}
|
| 525 |
+
optimized_results
|
| 526 |
+
return
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
@app.cell
|
| 530 |
+
def _(mo):
|
| 531 |
+
mo.md(r"""### 3.4 LightGBM""")
|
| 532 |
+
return
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
@app.cell
|
| 536 |
+
def _(mo):
|
| 537 |
+
mo.md(
|
| 538 |
+
r"""
|
| 539 |
+
We trained our LightGBM Classifier model using the following code:
|
| 540 |
+
|
| 541 |
+
```py
|
| 542 |
+
# 📌 LightGBM
|
| 543 |
+
import warnings
|
| 544 |
+
|
| 545 |
+
warnings.filterwarnings(
|
| 546 |
+
"ignore", message="X does not have valid feature names"
|
| 547 |
+
)
|
| 548 |
+
|
| 549 |
+
# 📌 Get numerical and categorical variables (binary and multiple)
|
| 550 |
+
num_cols = X_train.select_dtypes(include="number").columns.to_list()
|
| 551 |
+
cat_cols = X_train.select_dtypes(include="object").columns.to_list()
|
| 552 |
+
|
| 553 |
+
binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
|
| 554 |
+
multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
|
| 555 |
+
|
| 556 |
+
# 📌 [1] Create the pipelines for different data types
|
| 557 |
+
numerical_pipeline = Pipeline(
|
| 558 |
+
steps=[
|
| 559 |
+
("imputer", SimpleImputer(strategy="median")),
|
| 560 |
+
("scaler", MinMaxScaler()),
|
| 561 |
+
]
|
| 562 |
+
)
|
| 563 |
+
|
| 564 |
+
binary_pipeline = Pipeline(
|
| 565 |
+
steps=[
|
| 566 |
+
("imputer", SimpleImputer(strategy="most_frequent")),
|
| 567 |
+
("ordinal", OrdinalEncoder()),
|
| 568 |
+
("scaler", MinMaxScaler()),
|
| 569 |
+
]
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
multi_pipeline = Pipeline(
|
| 573 |
+
steps=[
|
| 574 |
+
("imputer", SimpleImputer(strategy="most_frequent")),
|
| 575 |
+
(
|
| 576 |
+
"onehot",
|
| 577 |
+
OneHotEncoder(handle_unknown="ignore", sparse_output=False),
|
| 578 |
+
),
|
| 579 |
+
("scaler", MinMaxScaler()),
|
| 580 |
+
]
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# 📌 [2] Create the preprocessor using ColumnTransformer
|
| 584 |
+
preprocessor = ColumnTransformer(
|
| 585 |
+
transformers=[
|
| 586 |
+
("binary", binary_pipeline, binary_cols),
|
| 587 |
+
("multi", multi_pipeline, multi_cols),
|
| 588 |
+
("numerical", numerical_pipeline, num_cols),
|
| 589 |
+
],
|
| 590 |
+
remainder="passthrough",
|
| 591 |
+
)
|
| 592 |
+
|
| 593 |
+
# 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
|
| 594 |
+
lgbm = LGBMClassifier(
|
| 595 |
+
n_estimators=500,
|
| 596 |
+
learning_rate=0.05,
|
| 597 |
+
max_depth=-1,
|
| 598 |
+
random_state=42,
|
| 599 |
+
class_weight="balanced",
|
| 600 |
+
n_jobs=-1,
|
| 601 |
+
)
|
| 602 |
+
|
| 603 |
+
lgbm_pipeline = Pipeline(
|
| 604 |
+
steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
|
| 605 |
+
)
|
| 606 |
+
|
| 607 |
+
# 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
|
| 608 |
+
# The pipeline takes care of all the preprocessing internally.
|
| 609 |
+
lgbm_pipeline.fit(X_train, y_train)
|
| 610 |
+
|
| 611 |
+
lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
|
| 612 |
+
lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
|
| 613 |
+
|
| 614 |
+
lgbm_scores = {
|
| 615 |
+
"train_score": roc_auc_score(y_train, lgbm_train_pred),
|
| 616 |
+
"test_score": roc_auc_score(y_test, lgbm_test_pred),
|
| 617 |
+
}
|
| 618 |
+
lgbm_scores
|
| 619 |
+
```
|
| 620 |
+
|
| 621 |
+
📈 The ROC AUC scores obtained:
|
| 622 |
+
"""
|
| 623 |
+
)
|
| 624 |
+
return
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
@app.cell
|
| 628 |
+
def _():
|
| 629 |
+
lgbm_scores = {
|
| 630 |
+
"train_score": 0.8523466410959462,
|
| 631 |
+
"test_score": 0.7514895868142193,
|
| 632 |
+
}
|
| 633 |
+
lgbm_scores
|
| 634 |
+
return
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
@app.cell
|
| 638 |
+
def _(mo):
|
| 639 |
+
mo.md(r"""## 4. Model Performance Analysis""")
|
| 640 |
+
return
|
| 641 |
+
|
| 642 |
+
|
| 643 |
+
@app.cell
|
| 644 |
+
def _(mo):
|
| 645 |
+
lg_stat = mo.stat(
|
| 646 |
+
label="Logistic Regression",
|
| 647 |
+
bordered=True,
|
| 648 |
+
value="🏋️ 0.687 🔎 0.685",
|
| 649 |
+
caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
|
| 650 |
+
direction="decrease",
|
| 651 |
+
)
|
| 652 |
+
|
| 653 |
+
rfc_stat = mo.stat(
|
| 654 |
+
label="Random Forest Classifier",
|
| 655 |
+
bordered=True,
|
| 656 |
+
value="🏋️ 1.0 🔎 0.707",
|
| 657 |
+
caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
|
| 658 |
+
direction="decrease",
|
| 659 |
+
)
|
| 660 |
+
|
| 661 |
+
rfo_stat = mo.stat(
|
| 662 |
+
label="Random Forest with Randomized Search",
|
| 663 |
+
bordered=True,
|
| 664 |
+
value="🏋️ 0.820 🔎 0.731",
|
| 665 |
+
caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and a strong performance.",
|
| 666 |
+
direction="increase",
|
| 667 |
+
)
|
| 668 |
+
|
| 669 |
+
lgbm_stat = mo.stat(
|
| 670 |
+
label="LightGBM",
|
| 671 |
+
bordered=True,
|
| 672 |
+
value="🏋️ 0.852 🔎 0.751",
|
| 673 |
+
caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
|
| 674 |
+
direction="increase",
|
| 675 |
+
)
|
| 676 |
+
|
| 677 |
+
mo.vstack(
|
| 678 |
+
items=[
|
| 679 |
+
mo.hstack(items=[lg_stat, rfc_stat], widths="equal", gap=1),
|
| 680 |
+
mo.hstack(items=[rfo_stat, lgbm_stat], widths="equal", gap=1),
|
| 681 |
+
],
|
| 682 |
+
gap=1,
|
| 683 |
+
heights="equal",
|
| 684 |
+
align="center",
|
| 685 |
+
justify="center",
|
| 686 |
+
)
|
| 687 |
+
return
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
@app.cell
|
| 691 |
+
def _(mo):
|
| 692 |
+
mo.md(r"""## 5. Model Selection""")
|
| 693 |
+
return
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
@app.cell
|
| 697 |
+
def _(mo):
|
| 698 |
+
mo.md(
|
| 699 |
+
r"""
|
| 700 |
+
Based on a comparison of all the models, the final model selection is clear.
|
| 701 |
+
|
| 702 |
+
| Model | Train Score (AUC ROC) | Test Score (AUC ROC) |
|
| 703 |
+
| :--- | :---: | :---: |
|
| 704 |
+
| Logistic Regression | 0.687 | 0.685 |
|
| 705 |
+
| Random Forest Classifier | 1.000 | 0.707 |
|
| 706 |
+
| Randomized Search (Tuned RF) | 0.820 | 0.731 |
|
| 707 |
+
| **LightGBM** | 0.852 | **0.751** |
|
| 708 |
+
|
| 709 |
+
* The **Logistic Regression** model performed poorly due to underfitting.
|
| 710 |
+
* The base **Random Forest** model, while better, suffered from severe overfitting.
|
| 711 |
+
* The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
|
| 712 |
+
* However, the **LightGBM** model ultimately demonstrated the best performance, achieving the highest **ROC AUC test score of 0.751**. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data.
|
| 713 |
+
"""
|
| 714 |
+
)
|
| 715 |
+
return
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
@app.cell
|
| 719 |
+
def _(mo):
|
| 720 |
+
mo.callout(
|
| 721 |
+
kind="success",
|
| 722 |
+
value="🥇 Therefore, we will select the LightGBM model as our final choice for deployment.",
|
| 723 |
+
)
|
| 724 |
+
return
|
| 725 |
+
|
| 726 |
+
|
| 727 |
+
if __name__ == "__main__":
|
| 728 |
+
app.run()
|
model/lgbm_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1931e98f1ec892e21f92b9be570f02bac96347dad2d8ae57732d582cdf2951f3
|
| 3 |
+
size 1813887
|
tutorial_app.ipynb
CHANGED
|
@@ -10,12 +10,14 @@
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
-
"execution_count":
|
| 14 |
"id": "vblA",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
| 17 |
"source": [
|
| 18 |
"import pandas as pd\n",
|
|
|
|
|
|
|
| 19 |
"\n",
|
| 20 |
"from sklearn.ensemble import RandomForestClassifier\n",
|
| 21 |
"from sklearn.linear_model import LogisticRegression\n",
|
|
@@ -43,7 +45,7 @@
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"cell_type": "code",
|
| 46 |
-
"execution_count":
|
| 47 |
"id": "bkHC",
|
| 48 |
"metadata": {},
|
| 49 |
"outputs": [],
|
|
@@ -828,7 +830,7 @@
|
|
| 828 |
},
|
| 829 |
{
|
| 830 |
"cell_type": "code",
|
| 831 |
-
"execution_count":
|
| 832 |
"id": "kqZH",
|
| 833 |
"metadata": {},
|
| 834 |
"outputs": [
|
|
@@ -838,7 +840,7 @@
|
|
| 838 |
"((196806, 121), (196806,), (49202, 121), (49202,))"
|
| 839 |
]
|
| 840 |
},
|
| 841 |
-
"execution_count":
|
| 842 |
"metadata": {},
|
| 843 |
"output_type": "execute_result"
|
| 844 |
}
|
|
@@ -1178,7 +1180,7 @@
|
|
| 1178 |
},
|
| 1179 |
{
|
| 1180 |
"cell_type": "code",
|
| 1181 |
-
"execution_count":
|
| 1182 |
"id": "cEAS",
|
| 1183 |
"metadata": {},
|
| 1184 |
"outputs": [
|
|
@@ -1187,7 +1189,7 @@
|
|
| 1187 |
"output_type": "stream",
|
| 1188 |
"text": [
|
| 1189 |
"[LightGBM] [Info] Number of positive: 15784, number of negative: 181022\n",
|
| 1190 |
-
"[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.
|
| 1191 |
"You can set `force_col_wise=true` to remove the overhead.\n",
|
| 1192 |
"[LightGBM] [Info] Total Bins 11594\n",
|
| 1193 |
"[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 229\n",
|
|
@@ -1314,25 +1316,25 @@
|
|
| 1314 |
"source": [
|
| 1315 |
"### 4.1 Logistic Regression\n",
|
| 1316 |
"\n",
|
| 1317 |
-
"
|
| 1318 |
"\n",
|
| 1319 |
"**Interpretation:** This model's performance is consistent across the training and testing sets, as indicated by the very small gap between the scores. This means the model is not overfitting. However, the overall scores are relatively low for a binary classification task, suggesting that the model is likely **underfitting**. It's too simple to capture the underlying patterns in the data effectively.\n",
|
| 1320 |
"\n",
|
| 1321 |
"### 4.2 Random Forest Classifier\n",
|
| 1322 |
"\n",
|
| 1323 |
-
"
|
| 1324 |
"\n",
|
| 1325 |
"**Interpretation:** The perfect `train_score` of 1.0 is a clear and severe sign of **overfitting**. The model has essentially memorized the training data, and this does not generalize well to unseen data, as shown by the much lower `test_score`. While the test score is better than the Logistic Regression, the model is too complex and needs to be regularized or tuned to perform better.\n",
|
| 1326 |
"\n",
|
| 1327 |
"### 4.3 Randomized Search with Cross Validations (Random Forest)\n",
|
| 1328 |
"\n",
|
| 1329 |
-
"
|
| 1330 |
"\n",
|
| 1331 |
"**Interpretation:** This is a much better result than the base Random Forest. The gap between the `train_score` and `test_score` is significantly smaller, indicating that the hyperparameter tuning successfully **reduced overfitting**. The `test_score` of 0.731 is also a notable improvement, showing that the model now generalizes better to unseen data. This is a well-performing and well-tuned model.\n",
|
| 1332 |
"\n",
|
| 1333 |
"### 4.4 LightGBM\n",
|
| 1334 |
"\n",
|
| 1335 |
-
"
|
| 1336 |
"\n",
|
| 1337 |
"**Interpretation:** The LightGBM model shows the best overall performance with the highest `test_score` of 0.751. There is a small gap between the training and testing scores, which is normal for a powerful boosting model, suggesting a good balance between capturing complex patterns and generalizing well. The model is performing exceptionally and is neither severely overfitting nor underfitting.\n"
|
| 1338 |
]
|
|
@@ -1342,7 +1344,7 @@
|
|
| 1342 |
"id": "5d48c191",
|
| 1343 |
"metadata": {},
|
| 1344 |
"source": [
|
| 1345 |
-
"## 5.
|
| 1346 |
]
|
| 1347 |
},
|
| 1348 |
{
|
|
@@ -1372,6 +1374,612 @@
|
|
| 1372 |
"source": [
|
| 1373 |
"> 🥇 Therefore, we will select the **LightGBM** model as our final choice for deployment.\n"
|
| 1374 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1375 |
}
|
| 1376 |
],
|
| 1377 |
"metadata": {
|
|
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
+
"execution_count": 12,
|
| 14 |
"id": "vblA",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
| 17 |
"source": [
|
| 18 |
"import pandas as pd\n",
|
| 19 |
+
"import joblib\n",
|
| 20 |
+
"import numpy as np\n",
|
| 21 |
"\n",
|
| 22 |
"from sklearn.ensemble import RandomForestClassifier\n",
|
| 23 |
"from sklearn.linear_model import LogisticRegression\n",
|
|
|
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"cell_type": "code",
|
| 48 |
+
"execution_count": 3,
|
| 49 |
"id": "bkHC",
|
| 50 |
"metadata": {},
|
| 51 |
"outputs": [],
|
|
|
|
| 830 |
},
|
| 831 |
{
|
| 832 |
"cell_type": "code",
|
| 833 |
+
"execution_count": 4,
|
| 834 |
"id": "kqZH",
|
| 835 |
"metadata": {},
|
| 836 |
"outputs": [
|
|
|
|
| 840 |
"((196806, 121), (196806,), (49202, 121), (49202,))"
|
| 841 |
]
|
| 842 |
},
|
| 843 |
+
"execution_count": 4,
|
| 844 |
"metadata": {},
|
| 845 |
"output_type": "execute_result"
|
| 846 |
}
|
|
|
|
| 1180 |
},
|
| 1181 |
{
|
| 1182 |
"cell_type": "code",
|
| 1183 |
+
"execution_count": 5,
|
| 1184 |
"id": "cEAS",
|
| 1185 |
"metadata": {},
|
| 1186 |
"outputs": [
|
|
|
|
| 1189 |
"output_type": "stream",
|
| 1190 |
"text": [
|
| 1191 |
"[LightGBM] [Info] Number of positive: 15784, number of negative: 181022\n",
|
| 1192 |
+
"[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060479 seconds.\n",
|
| 1193 |
"You can set `force_col_wise=true` to remove the overhead.\n",
|
| 1194 |
"[LightGBM] [Info] Total Bins 11594\n",
|
| 1195 |
"[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 229\n",
|
|
|
|
| 1316 |
"source": [
|
| 1317 |
"### 4.1 Logistic Regression\n",
|
| 1318 |
"\n",
|
| 1319 |
+
"_The Logistic Regression model shows a `train_score` of 0.687 and a `test_score` of 0.685._\n",
|
| 1320 |
"\n",
|
| 1321 |
"**Interpretation:** This model's performance is consistent across the training and testing sets, as indicated by the very small gap between the scores. This means the model is not overfitting. However, the overall scores are relatively low for a binary classification task, suggesting that the model is likely **underfitting**. It's too simple to capture the underlying patterns in the data effectively.\n",
|
| 1322 |
"\n",
|
| 1323 |
"### 4.2 Random Forest Classifier\n",
|
| 1324 |
"\n",
|
| 1325 |
+
"_The base Random Forest model produced a `train_score` of 1.0 and a `test_score` of 0.707._\n",
|
| 1326 |
"\n",
|
| 1327 |
"**Interpretation:** The perfect `train_score` of 1.0 is a clear and severe sign of **overfitting**. The model has essentially memorized the training data, and this does not generalize well to unseen data, as shown by the much lower `test_score`. While the test score is better than the Logistic Regression, the model is too complex and needs to be regularized or tuned to perform better.\n",
|
| 1328 |
"\n",
|
| 1329 |
"### 4.3 Randomized Search with Cross-Validation (Random Forest)\n",
|
| 1330 |
"\n",
|
| 1331 |
+
"_The hyperparameter-tuned Random Forest model achieved a `train_score` of 0.820 and a `test_score` of 0.731._\n",
|
| 1332 |
"\n",
|
| 1333 |
"**Interpretation:** This is a much better result than the base Random Forest. The gap between the `train_score` and `test_score` is significantly smaller, indicating that the hyperparameter tuning successfully **reduced overfitting**. The `test_score` of 0.731 is also a notable improvement, showing that the model now generalizes better to unseen data. This is a well-performing and well-tuned model.\n",
|
| 1334 |
"\n",
|
| 1335 |
"### 4.4 LightGBM\n",
|
| 1336 |
"\n",
|
| 1337 |
+
"_The LightGBM model produced a `train_score` of 0.852 and a `test_score` of 0.751._\n",
|
| 1338 |
"\n",
|
| 1339 |
"**Interpretation:** The LightGBM model shows the best overall performance with the highest `test_score` of 0.751. There is a moderate gap (about 0.10) between the training and testing scores, which is normal for a powerful boosting model, suggesting a reasonable balance between capturing complex patterns and generalizing well. The model performs well and is neither severely overfitting nor underfitting.\n"
|
| 1340 |
]
|
|
|
|
| 1344 |
"id": "5d48c191",
|
| 1345 |
"metadata": {},
|
| 1346 |
"source": [
|
| 1347 |
+
"## 5. Model Selection\n"
|
| 1348 |
]
|
| 1349 |
},
|
| 1350 |
{
|
|
|
|
| 1374 |
"source": [
|
| 1375 |
"> 🥇 Therefore, we will select the **LightGBM** model as our final choice for deployment.\n"
|
| 1376 |
]
|
| 1377 |
+
},
|
| 1378 |
+
{
|
| 1379 |
+
"cell_type": "markdown",
|
| 1380 |
+
"id": "4aa60dcb",
|
| 1381 |
+
"metadata": {},
|
| 1382 |
+
"source": [
|
| 1383 |
+
"## 6. Saving the Model\n"
|
| 1384 |
+
]
|
| 1385 |
+
},
|
| 1386 |
+
{
|
| 1387 |
+
"cell_type": "markdown",
|
| 1388 |
+
"id": "bca2853b",
|
| 1389 |
+
"metadata": {},
|
| 1390 |
+
"source": [
|
| 1391 |
+
"### 6.1 Saving the Model\n"
|
| 1392 |
+
]
|
| 1393 |
+
},
|
| 1394 |
+
{
|
| 1395 |
+
"cell_type": "code",
|
| 1396 |
+
"execution_count": 10,
|
| 1397 |
+
"id": "3246c249",
|
| 1398 |
+
"metadata": {},
|
| 1399 |
+
"outputs": [
|
| 1400 |
+
{
|
| 1401 |
+
"name": "stdout",
|
| 1402 |
+
"output_type": "stream",
|
| 1403 |
+
"text": [
|
| 1404 |
+
"Model saved successfully as lgbm_model.joblib\n"
|
| 1405 |
+
]
|
| 1406 |
+
}
|
| 1407 |
+
],
|
| 1408 |
+
"source": [
|
| 1409 |
+
"joblib.dump(lgbm_pipeline, \"model/lgbm_model.joblib\")\n",
|
| 1410 |
+
"print(\"Model saved successfully as lgbm_model.joblib\")"
|
| 1411 |
+
]
|
| 1412 |
+
},
|
| 1413 |
+
{
|
| 1414 |
+
"cell_type": "markdown",
|
| 1415 |
+
"id": "a69129f2",
|
| 1416 |
+
"metadata": {},
|
| 1417 |
+
"source": [
|
| 1418 |
+
"### 6.2 Feature Importances\n"
|
| 1419 |
+
]
|
| 1420 |
+
},
|
| 1421 |
+
{
|
| 1422 |
+
"cell_type": "markdown",
|
| 1423 |
+
"id": "5db0729b",
|
| 1424 |
+
"metadata": {},
|
| 1425 |
+
"source": [
|
| 1426 |
+
"We will select the top 10 features based on their importances, in order to use them in the prediction interface.\n"
|
| 1427 |
+
]
|
| 1428 |
+
},
|
| 1429 |
+
{
|
| 1430 |
+
"cell_type": "code",
|
| 1431 |
+
"execution_count": 15,
|
| 1432 |
+
"id": "12e917d7",
|
| 1433 |
+
"metadata": {},
|
| 1434 |
+
"outputs": [
|
| 1435 |
+
{
|
| 1436 |
+
"data": {
|
| 1437 |
+
"text/plain": [
|
| 1438 |
+
"array([ 86, 65, 21, 0, 72, 50, 0, 9, 23, 0, 6, 1, 10,\n",
|
| 1439 |
+
" 34, 0, 13, 0, 9, 28, 0, 0, 46, 27, 70, 10, 12,\n",
|
| 1440 |
+
" 47, 23, 58, 20, 27, 0, 21, 3, 30, 12, 12, 8, 11,\n",
|
| 1441 |
+
" 33, 9, 7, 43, 28, 3, 15, 0, 28, 17, 10, 15, 2,\n",
|
| 1442 |
+
" 2, 15, 2, 6, 2, 22, 35, 20, 25, 22, 21, 21, 3,\n",
|
| 1443 |
+
" 4, 10, 7, 8, 31, 0, 32, 1, 7, 1, 12, 4, 1,\n",
|
| 1444 |
+
" 2, 0, 2, 17, 0, 0, 10, 2, 8, 0, 8, 0, 20,\n",
|
| 1445 |
+
" 0, 6, 9, 12, 23, 0, 6, 9, 3, 23, 0, 6, 23,\n",
|
| 1446 |
+
" 3, 18, 28, 10, 0, 0, 4, 12, 0, 0, 4, 4, 3,\n",
|
| 1447 |
+
" 6, 23, 10, 5, 0, 13, 14, 11, 7, 2, 5, 1, 10,\n",
|
| 1448 |
+
" 2, 2, 5, 15, 19, 0, 536, 47, 410, 642, 660, 489, 375,\n",
|
| 1449 |
+
" 752, 630, 556, 628, 196, 0, 2, 50, 0, 45, 13, 75, 24,\n",
|
| 1450 |
+
" 54, 230, 0, 23, 10, 36, 24, 16, 732, 792, 833, 130, 85,\n",
|
| 1451 |
+
" 100, 58, 77, 42, 66, 57, 36, 116, 75, 90, 54, 106, 106,\n",
|
| 1452 |
+
" 129, 112, 69, 111, 26, 39, 10, 22, 104, 69, 131, 37, 99,\n",
|
| 1453 |
+
" 47, 67, 93, 29, 40, 7, 23, 18, 16, 67, 34, 43, 19,\n",
|
| 1454 |
+
" 76, 176, 121, 68, 38, 57, 536, 0, 75, 0, 8, 10, 0,\n",
|
| 1455 |
+
" 10, 5, 0, 16, 0, 21, 14, 14, 33, 0, 31, 5, 0,\n",
|
| 1456 |
+
" 1, 2, 10, 29, 69, 82, 141], dtype=int32)"
|
| 1457 |
+
]
|
| 1458 |
+
},
|
| 1459 |
+
"execution_count": 15,
|
| 1460 |
+
"metadata": {},
|
| 1461 |
+
"output_type": "execute_result"
|
| 1462 |
+
}
|
| 1463 |
+
],
|
| 1464 |
+
"source": [
|
| 1465 |
+
"loaded_pipeline = joblib.load(\"model/lgbm_model.joblib\")\n",
|
| 1466 |
+
"feature_importances = loaded_pipeline.named_steps[\"classifier\"].feature_importances_\n",
|
| 1467 |
+
"feature_importances"
|
| 1468 |
+
]
|
| 1469 |
+
},
|
| 1470 |
+
{
|
| 1471 |
+
"cell_type": "code",
|
| 1472 |
+
"execution_count": 17,
|
| 1473 |
+
"id": "ad40c56f",
|
| 1474 |
+
"metadata": {},
|
| 1475 |
+
"outputs": [
|
| 1476 |
+
{
|
| 1477 |
+
"data": {
|
| 1478 |
+
"text/plain": [
|
| 1479 |
+
"array(['binary__NAME_CONTRACT_TYPE', 'binary__FLAG_OWN_CAR',\n",
|
| 1480 |
+
" 'binary__FLAG_OWN_REALTY', 'binary__EMERGENCYSTATE_MODE',\n",
|
| 1481 |
+
" 'multi__CODE_GENDER_F', 'multi__CODE_GENDER_M',\n",
|
| 1482 |
+
" 'multi__CODE_GENDER_XNA', 'multi__NAME_TYPE_SUITE_Children',\n",
|
| 1483 |
+
" 'multi__NAME_TYPE_SUITE_Family',\n",
|
| 1484 |
+
" 'multi__NAME_TYPE_SUITE_Group of people',\n",
|
| 1485 |
+
" 'multi__NAME_TYPE_SUITE_Other_A', 'multi__NAME_TYPE_SUITE_Other_B',\n",
|
| 1486 |
+
" 'multi__NAME_TYPE_SUITE_Spouse, partner',\n",
|
| 1487 |
+
" 'multi__NAME_TYPE_SUITE_Unaccompanied',\n",
|
| 1488 |
+
" 'multi__NAME_INCOME_TYPE_Businessman',\n",
|
| 1489 |
+
" 'multi__NAME_INCOME_TYPE_Commercial associate',\n",
|
| 1490 |
+
" 'multi__NAME_INCOME_TYPE_Maternity leave',\n",
|
| 1491 |
+
" 'multi__NAME_INCOME_TYPE_Pensioner',\n",
|
| 1492 |
+
" 'multi__NAME_INCOME_TYPE_State servant',\n",
|
| 1493 |
+
" 'multi__NAME_INCOME_TYPE_Student',\n",
|
| 1494 |
+
" 'multi__NAME_INCOME_TYPE_Unemployed',\n",
|
| 1495 |
+
" 'multi__NAME_INCOME_TYPE_Working',\n",
|
| 1496 |
+
" 'multi__NAME_EDUCATION_TYPE_Academic degree',\n",
|
| 1497 |
+
" 'multi__NAME_EDUCATION_TYPE_Higher education',\n",
|
| 1498 |
+
" 'multi__NAME_EDUCATION_TYPE_Incomplete higher',\n",
|
| 1499 |
+
" 'multi__NAME_EDUCATION_TYPE_Lower secondary',\n",
|
| 1500 |
+
" 'multi__NAME_EDUCATION_TYPE_Secondary / secondary special',\n",
|
| 1501 |
+
" 'multi__NAME_FAMILY_STATUS_Civil marriage',\n",
|
| 1502 |
+
" 'multi__NAME_FAMILY_STATUS_Married',\n",
|
| 1503 |
+
" 'multi__NAME_FAMILY_STATUS_Separated',\n",
|
| 1504 |
+
" 'multi__NAME_FAMILY_STATUS_Single / not married',\n",
|
| 1505 |
+
" 'multi__NAME_FAMILY_STATUS_Unknown',\n",
|
| 1506 |
+
" 'multi__NAME_FAMILY_STATUS_Widow',\n",
|
| 1507 |
+
" 'multi__NAME_HOUSING_TYPE_Co-op apartment',\n",
|
| 1508 |
+
" 'multi__NAME_HOUSING_TYPE_House / apartment',\n",
|
| 1509 |
+
" 'multi__NAME_HOUSING_TYPE_Municipal apartment',\n",
|
| 1510 |
+
" 'multi__NAME_HOUSING_TYPE_Office apartment',\n",
|
| 1511 |
+
" 'multi__NAME_HOUSING_TYPE_Rented apartment',\n",
|
| 1512 |
+
" 'multi__NAME_HOUSING_TYPE_With parents',\n",
|
| 1513 |
+
" 'multi__OCCUPATION_TYPE_Accountants',\n",
|
| 1514 |
+
" 'multi__OCCUPATION_TYPE_Cleaning staff',\n",
|
| 1515 |
+
" 'multi__OCCUPATION_TYPE_Cooking staff',\n",
|
| 1516 |
+
" 'multi__OCCUPATION_TYPE_Core staff',\n",
|
| 1517 |
+
" 'multi__OCCUPATION_TYPE_Drivers',\n",
|
| 1518 |
+
" 'multi__OCCUPATION_TYPE_HR staff',\n",
|
| 1519 |
+
" 'multi__OCCUPATION_TYPE_High skill tech staff',\n",
|
| 1520 |
+
" 'multi__OCCUPATION_TYPE_IT staff',\n",
|
| 1521 |
+
" 'multi__OCCUPATION_TYPE_Laborers',\n",
|
| 1522 |
+
" 'multi__OCCUPATION_TYPE_Low-skill Laborers',\n",
|
| 1523 |
+
" 'multi__OCCUPATION_TYPE_Managers',\n",
|
| 1524 |
+
" 'multi__OCCUPATION_TYPE_Medicine staff',\n",
|
| 1525 |
+
" 'multi__OCCUPATION_TYPE_Private service staff',\n",
|
| 1526 |
+
" 'multi__OCCUPATION_TYPE_Realty agents',\n",
|
| 1527 |
+
" 'multi__OCCUPATION_TYPE_Sales staff',\n",
|
| 1528 |
+
" 'multi__OCCUPATION_TYPE_Secretaries',\n",
|
| 1529 |
+
" 'multi__OCCUPATION_TYPE_Security staff',\n",
|
| 1530 |
+
" 'multi__OCCUPATION_TYPE_Waiters/barmen staff',\n",
|
| 1531 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_FRIDAY',\n",
|
| 1532 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_MONDAY',\n",
|
| 1533 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_SATURDAY',\n",
|
| 1534 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_SUNDAY',\n",
|
| 1535 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_THURSDAY',\n",
|
| 1536 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_TUESDAY',\n",
|
| 1537 |
+
" 'multi__WEEKDAY_APPR_PROCESS_START_WEDNESDAY',\n",
|
| 1538 |
+
" 'multi__ORGANIZATION_TYPE_Advertising',\n",
|
| 1539 |
+
" 'multi__ORGANIZATION_TYPE_Agriculture',\n",
|
| 1540 |
+
" 'multi__ORGANIZATION_TYPE_Bank',\n",
|
| 1541 |
+
" 'multi__ORGANIZATION_TYPE_Business Entity Type 1',\n",
|
| 1542 |
+
" 'multi__ORGANIZATION_TYPE_Business Entity Type 2',\n",
|
| 1543 |
+
" 'multi__ORGANIZATION_TYPE_Business Entity Type 3',\n",
|
| 1544 |
+
" 'multi__ORGANIZATION_TYPE_Cleaning',\n",
|
| 1545 |
+
" 'multi__ORGANIZATION_TYPE_Construction',\n",
|
| 1546 |
+
" 'multi__ORGANIZATION_TYPE_Culture',\n",
|
| 1547 |
+
" 'multi__ORGANIZATION_TYPE_Electricity',\n",
|
| 1548 |
+
" 'multi__ORGANIZATION_TYPE_Emergency',\n",
|
| 1549 |
+
" 'multi__ORGANIZATION_TYPE_Government',\n",
|
| 1550 |
+
" 'multi__ORGANIZATION_TYPE_Hotel',\n",
|
| 1551 |
+
" 'multi__ORGANIZATION_TYPE_Housing',\n",
|
| 1552 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 1',\n",
|
| 1553 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 10',\n",
|
| 1554 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 11',\n",
|
| 1555 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 12',\n",
|
| 1556 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 13',\n",
|
| 1557 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 2',\n",
|
| 1558 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 3',\n",
|
| 1559 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 4',\n",
|
| 1560 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 5',\n",
|
| 1561 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 6',\n",
|
| 1562 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 7',\n",
|
| 1563 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 8',\n",
|
| 1564 |
+
" 'multi__ORGANIZATION_TYPE_Industry: type 9',\n",
|
| 1565 |
+
" 'multi__ORGANIZATION_TYPE_Insurance',\n",
|
| 1566 |
+
" 'multi__ORGANIZATION_TYPE_Kindergarten',\n",
|
| 1567 |
+
" 'multi__ORGANIZATION_TYPE_Legal Services',\n",
|
| 1568 |
+
" 'multi__ORGANIZATION_TYPE_Medicine',\n",
|
| 1569 |
+
" 'multi__ORGANIZATION_TYPE_Military',\n",
|
| 1570 |
+
" 'multi__ORGANIZATION_TYPE_Mobile',\n",
|
| 1571 |
+
" 'multi__ORGANIZATION_TYPE_Other',\n",
|
| 1572 |
+
" 'multi__ORGANIZATION_TYPE_Police',\n",
|
| 1573 |
+
" 'multi__ORGANIZATION_TYPE_Postal',\n",
|
| 1574 |
+
" 'multi__ORGANIZATION_TYPE_Realtor',\n",
|
| 1575 |
+
" 'multi__ORGANIZATION_TYPE_Religion',\n",
|
| 1576 |
+
" 'multi__ORGANIZATION_TYPE_Restaurant',\n",
|
| 1577 |
+
" 'multi__ORGANIZATION_TYPE_School',\n",
|
| 1578 |
+
" 'multi__ORGANIZATION_TYPE_Security',\n",
|
| 1579 |
+
" 'multi__ORGANIZATION_TYPE_Security Ministries',\n",
|
| 1580 |
+
" 'multi__ORGANIZATION_TYPE_Self-employed',\n",
|
| 1581 |
+
" 'multi__ORGANIZATION_TYPE_Services',\n",
|
| 1582 |
+
" 'multi__ORGANIZATION_TYPE_Telecom',\n",
|
| 1583 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 1',\n",
|
| 1584 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 2',\n",
|
| 1585 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 3',\n",
|
| 1586 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 4',\n",
|
| 1587 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 5',\n",
|
| 1588 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 6',\n",
|
| 1589 |
+
" 'multi__ORGANIZATION_TYPE_Trade: type 7',\n",
|
| 1590 |
+
" 'multi__ORGANIZATION_TYPE_Transport: type 1',\n",
|
| 1591 |
+
" 'multi__ORGANIZATION_TYPE_Transport: type 2',\n",
|
| 1592 |
+
" 'multi__ORGANIZATION_TYPE_Transport: type 3',\n",
|
| 1593 |
+
" 'multi__ORGANIZATION_TYPE_Transport: type 4',\n",
|
| 1594 |
+
" 'multi__ORGANIZATION_TYPE_University',\n",
|
| 1595 |
+
" 'multi__ORGANIZATION_TYPE_XNA',\n",
|
| 1596 |
+
" 'multi__FONDKAPREMONT_MODE_not specified',\n",
|
| 1597 |
+
" 'multi__FONDKAPREMONT_MODE_org spec account',\n",
|
| 1598 |
+
" 'multi__FONDKAPREMONT_MODE_reg oper account',\n",
|
| 1599 |
+
" 'multi__FONDKAPREMONT_MODE_reg oper spec account',\n",
|
| 1600 |
+
" 'multi__HOUSETYPE_MODE_block of flats',\n",
|
| 1601 |
+
" 'multi__HOUSETYPE_MODE_specific housing',\n",
|
| 1602 |
+
" 'multi__HOUSETYPE_MODE_terraced house',\n",
|
| 1603 |
+
" 'multi__WALLSMATERIAL_MODE_Block',\n",
|
| 1604 |
+
" 'multi__WALLSMATERIAL_MODE_Mixed',\n",
|
| 1605 |
+
" 'multi__WALLSMATERIAL_MODE_Monolithic',\n",
|
| 1606 |
+
" 'multi__WALLSMATERIAL_MODE_Others',\n",
|
| 1607 |
+
" 'multi__WALLSMATERIAL_MODE_Panel',\n",
|
| 1608 |
+
" 'multi__WALLSMATERIAL_MODE_Stone, brick',\n",
|
| 1609 |
+
" 'multi__WALLSMATERIAL_MODE_Wooden', 'numerical__SK_ID_CURR',\n",
|
| 1610 |
+
" 'numerical__CNT_CHILDREN', 'numerical__AMT_INCOME_TOTAL',\n",
|
| 1611 |
+
" 'numerical__AMT_CREDIT', 'numerical__AMT_ANNUITY',\n",
|
| 1612 |
+
" 'numerical__AMT_GOODS_PRICE',\n",
|
| 1613 |
+
" 'numerical__REGION_POPULATION_RELATIVE', 'numerical__DAYS_BIRTH',\n",
|
| 1614 |
+
" 'numerical__DAYS_EMPLOYED', 'numerical__DAYS_REGISTRATION',\n",
|
| 1615 |
+
" 'numerical__DAYS_ID_PUBLISH', 'numerical__OWN_CAR_AGE',\n",
|
| 1616 |
+
" 'numerical__FLAG_MOBIL', 'numerical__FLAG_EMP_PHONE',\n",
|
| 1617 |
+
" 'numerical__FLAG_WORK_PHONE', 'numerical__FLAG_CONT_MOBILE',\n",
|
| 1618 |
+
" 'numerical__FLAG_PHONE', 'numerical__FLAG_EMAIL',\n",
|
| 1619 |
+
" 'numerical__CNT_FAM_MEMBERS', 'numerical__REGION_RATING_CLIENT',\n",
|
| 1620 |
+
" 'numerical__REGION_RATING_CLIENT_W_CITY',\n",
|
| 1621 |
+
" 'numerical__HOUR_APPR_PROCESS_START',\n",
|
| 1622 |
+
" 'numerical__REG_REGION_NOT_LIVE_REGION',\n",
|
| 1623 |
+
" 'numerical__REG_REGION_NOT_WORK_REGION',\n",
|
| 1624 |
+
" 'numerical__LIVE_REGION_NOT_WORK_REGION',\n",
|
| 1625 |
+
" 'numerical__REG_CITY_NOT_LIVE_CITY',\n",
|
| 1626 |
+
" 'numerical__REG_CITY_NOT_WORK_CITY',\n",
|
| 1627 |
+
" 'numerical__LIVE_CITY_NOT_WORK_CITY', 'numerical__EXT_SOURCE_1',\n",
|
| 1628 |
+
" 'numerical__EXT_SOURCE_2', 'numerical__EXT_SOURCE_3',\n",
|
| 1629 |
+
" 'numerical__APARTMENTS_AVG', 'numerical__BASEMENTAREA_AVG',\n",
|
| 1630 |
+
" 'numerical__YEARS_BEGINEXPLUATATION_AVG',\n",
|
| 1631 |
+
" 'numerical__YEARS_BUILD_AVG', 'numerical__COMMONAREA_AVG',\n",
|
| 1632 |
+
" 'numerical__ELEVATORS_AVG', 'numerical__ENTRANCES_AVG',\n",
|
| 1633 |
+
" 'numerical__FLOORSMAX_AVG', 'numerical__FLOORSMIN_AVG',\n",
|
| 1634 |
+
" 'numerical__LANDAREA_AVG', 'numerical__LIVINGAPARTMENTS_AVG',\n",
|
| 1635 |
+
" 'numerical__LIVINGAREA_AVG', 'numerical__NONLIVINGAPARTMENTS_AVG',\n",
|
| 1636 |
+
" 'numerical__NONLIVINGAREA_AVG', 'numerical__APARTMENTS_MODE',\n",
|
| 1637 |
+
" 'numerical__BASEMENTAREA_MODE',\n",
|
| 1638 |
+
" 'numerical__YEARS_BEGINEXPLUATATION_MODE',\n",
|
| 1639 |
+
" 'numerical__YEARS_BUILD_MODE', 'numerical__COMMONAREA_MODE',\n",
|
| 1640 |
+
" 'numerical__ELEVATORS_MODE', 'numerical__ENTRANCES_MODE',\n",
|
| 1641 |
+
" 'numerical__FLOORSMAX_MODE', 'numerical__FLOORSMIN_MODE',\n",
|
| 1642 |
+
" 'numerical__LANDAREA_MODE', 'numerical__LIVINGAPARTMENTS_MODE',\n",
|
| 1643 |
+
" 'numerical__LIVINGAREA_MODE',\n",
|
| 1644 |
+
" 'numerical__NONLIVINGAPARTMENTS_MODE',\n",
|
| 1645 |
+
" 'numerical__NONLIVINGAREA_MODE', 'numerical__APARTMENTS_MEDI',\n",
|
| 1646 |
+
" 'numerical__BASEMENTAREA_MEDI',\n",
|
| 1647 |
+
" 'numerical__YEARS_BEGINEXPLUATATION_MEDI',\n",
|
| 1648 |
+
" 'numerical__YEARS_BUILD_MEDI', 'numerical__COMMONAREA_MEDI',\n",
|
| 1649 |
+
" 'numerical__ELEVATORS_MEDI', 'numerical__ENTRANCES_MEDI',\n",
|
| 1650 |
+
" 'numerical__FLOORSMAX_MEDI', 'numerical__FLOORSMIN_MEDI',\n",
|
| 1651 |
+
" 'numerical__LANDAREA_MEDI', 'numerical__LIVINGAPARTMENTS_MEDI',\n",
|
| 1652 |
+
" 'numerical__LIVINGAREA_MEDI',\n",
|
| 1653 |
+
" 'numerical__NONLIVINGAPARTMENTS_MEDI',\n",
|
| 1654 |
+
" 'numerical__NONLIVINGAREA_MEDI', 'numerical__TOTALAREA_MODE',\n",
|
| 1655 |
+
" 'numerical__OBS_30_CNT_SOCIAL_CIRCLE',\n",
|
| 1656 |
+
" 'numerical__DEF_30_CNT_SOCIAL_CIRCLE',\n",
|
| 1657 |
+
" 'numerical__OBS_60_CNT_SOCIAL_CIRCLE',\n",
|
| 1658 |
+
" 'numerical__DEF_60_CNT_SOCIAL_CIRCLE',\n",
|
| 1659 |
+
" 'numerical__DAYS_LAST_PHONE_CHANGE', 'numerical__FLAG_DOCUMENT_2',\n",
|
| 1660 |
+
" 'numerical__FLAG_DOCUMENT_3', 'numerical__FLAG_DOCUMENT_4',\n",
|
| 1661 |
+
" 'numerical__FLAG_DOCUMENT_5', 'numerical__FLAG_DOCUMENT_6',\n",
|
| 1662 |
+
" 'numerical__FLAG_DOCUMENT_7', 'numerical__FLAG_DOCUMENT_8',\n",
|
| 1663 |
+
" 'numerical__FLAG_DOCUMENT_9', 'numerical__FLAG_DOCUMENT_10',\n",
|
| 1664 |
+
" 'numerical__FLAG_DOCUMENT_11', 'numerical__FLAG_DOCUMENT_12',\n",
|
| 1665 |
+
" 'numerical__FLAG_DOCUMENT_13', 'numerical__FLAG_DOCUMENT_14',\n",
|
| 1666 |
+
" 'numerical__FLAG_DOCUMENT_15', 'numerical__FLAG_DOCUMENT_16',\n",
|
| 1667 |
+
" 'numerical__FLAG_DOCUMENT_17', 'numerical__FLAG_DOCUMENT_18',\n",
|
| 1668 |
+
" 'numerical__FLAG_DOCUMENT_19', 'numerical__FLAG_DOCUMENT_20',\n",
|
| 1669 |
+
" 'numerical__FLAG_DOCUMENT_21',\n",
|
| 1670 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_HOUR',\n",
|
| 1671 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_DAY',\n",
|
| 1672 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_WEEK',\n",
|
| 1673 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_MON',\n",
|
| 1674 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_QRT',\n",
|
| 1675 |
+
" 'numerical__AMT_REQ_CREDIT_BUREAU_YEAR'], dtype=object)"
|
| 1676 |
+
]
|
| 1677 |
+
},
|
| 1678 |
+
"execution_count": 17,
|
| 1679 |
+
"metadata": {},
|
| 1680 |
+
"output_type": "execute_result"
|
| 1681 |
+
}
|
| 1682 |
+
],
|
| 1683 |
+
"source": [
|
| 1684 |
+
"# Get the names of the final features after preprocessing\n",
|
| 1685 |
+
"preprocessor = loaded_pipeline.named_steps[\"preprocessor\"]\n",
|
| 1686 |
+
"final_features_names = preprocessor.get_feature_names_out()\n",
|
| 1687 |
+
"final_features_names"
|
| 1688 |
+
]
|
| 1689 |
+
},
|
| 1690 |
+
{
|
| 1691 |
+
"cell_type": "code",
|
| 1692 |
+
"execution_count": null,
|
| 1693 |
+
"id": "336a580f",
|
| 1694 |
+
"metadata": {},
|
| 1695 |
+
"outputs": [],
|
| 1696 |
+
"source": [
|
| 1697 |
+
"# Create a DataFrame to store the feature names and their corresponding importances\n",
|
| 1698 |
+
"feature_importances_df = pd.DataFrame(\n",
|
| 1699 |
+
" {\"feature\": final_features_names, \"importance\": feature_importances}\n",
|
| 1700 |
+
")\n",
|
| 1701 |
+
"\n",
|
| 1702 |
+
"sorted_feature_importance = feature_importances_df.sort_values(\n",
|
| 1703 |
+
" by=\"importance\", ascending=False\n",
|
| 1704 |
+
").reset_index(drop=True)"
|
| 1705 |
+
]
|
| 1706 |
+
},
|
| 1707 |
+
{
|
| 1708 |
+
"cell_type": "markdown",
|
| 1709 |
+
"id": "e86a01f7",
|
| 1710 |
+
"metadata": {},
|
| 1711 |
+
"source": [
|
| 1712 |
+
"**Top 10 most important features**\n"
|
| 1713 |
+
]
|
| 1714 |
+
},
|
| 1715 |
+
{
|
| 1716 |
+
"cell_type": "code",
|
| 1717 |
+
"execution_count": 21,
|
| 1718 |
+
"id": "c7bc9e30",
|
| 1719 |
+
"metadata": {},
|
| 1720 |
+
"outputs": [
|
| 1721 |
+
{
|
| 1722 |
+
"data": {
|
| 1723 |
+
"text/html": [
|
| 1724 |
+
"<div>\n",
|
| 1725 |
+
"<style scoped>\n",
|
| 1726 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 1727 |
+
" vertical-align: middle;\n",
|
| 1728 |
+
" }\n",
|
| 1729 |
+
"\n",
|
| 1730 |
+
" .dataframe tbody tr th {\n",
|
| 1731 |
+
" vertical-align: top;\n",
|
| 1732 |
+
" }\n",
|
| 1733 |
+
"\n",
|
| 1734 |
+
" .dataframe thead th {\n",
|
| 1735 |
+
" text-align: right;\n",
|
| 1736 |
+
" }\n",
|
| 1737 |
+
"</style>\n",
|
| 1738 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 1739 |
+
" <thead>\n",
|
| 1740 |
+
" <tr style=\"text-align: right;\">\n",
|
| 1741 |
+
" <th></th>\n",
|
| 1742 |
+
" <th>feature</th>\n",
|
| 1743 |
+
" <th>importance</th>\n",
|
| 1744 |
+
" </tr>\n",
|
| 1745 |
+
" </thead>\n",
|
| 1746 |
+
" <tbody>\n",
|
| 1747 |
+
" <tr>\n",
|
| 1748 |
+
" <th>0</th>\n",
|
| 1749 |
+
" <td>numerical__EXT_SOURCE_3</td>\n",
|
| 1750 |
+
" <td>833</td>\n",
|
| 1751 |
+
" </tr>\n",
|
| 1752 |
+
" <tr>\n",
|
| 1753 |
+
" <th>1</th>\n",
|
| 1754 |
+
" <td>numerical__EXT_SOURCE_2</td>\n",
|
| 1755 |
+
" <td>792</td>\n",
|
| 1756 |
+
" </tr>\n",
|
| 1757 |
+
" <tr>\n",
|
| 1758 |
+
" <th>2</th>\n",
|
| 1759 |
+
" <td>numerical__DAYS_BIRTH</td>\n",
|
| 1760 |
+
" <td>752</td>\n",
|
| 1761 |
+
" </tr>\n",
|
| 1762 |
+
" <tr>\n",
|
| 1763 |
+
" <th>3</th>\n",
|
| 1764 |
+
" <td>numerical__EXT_SOURCE_1</td>\n",
|
| 1765 |
+
" <td>732</td>\n",
|
| 1766 |
+
" </tr>\n",
|
| 1767 |
+
" <tr>\n",
|
| 1768 |
+
" <th>4</th>\n",
|
| 1769 |
+
" <td>numerical__AMT_ANNUITY</td>\n",
|
| 1770 |
+
" <td>660</td>\n",
|
| 1771 |
+
" </tr>\n",
|
| 1772 |
+
" <tr>\n",
|
| 1773 |
+
" <th>5</th>\n",
|
| 1774 |
+
" <td>numerical__AMT_CREDIT</td>\n",
|
| 1775 |
+
" <td>642</td>\n",
|
| 1776 |
+
" </tr>\n",
|
| 1777 |
+
" <tr>\n",
|
| 1778 |
+
" <th>6</th>\n",
|
| 1779 |
+
" <td>numerical__DAYS_EMPLOYED</td>\n",
|
| 1780 |
+
" <td>630</td>\n",
|
| 1781 |
+
" </tr>\n",
|
| 1782 |
+
" <tr>\n",
|
| 1783 |
+
" <th>7</th>\n",
|
| 1784 |
+
" <td>numerical__DAYS_ID_PUBLISH</td>\n",
|
| 1785 |
+
" <td>628</td>\n",
|
| 1786 |
+
" </tr>\n",
|
| 1787 |
+
" <tr>\n",
|
| 1788 |
+
" <th>8</th>\n",
|
| 1789 |
+
" <td>numerical__DAYS_REGISTRATION</td>\n",
|
| 1790 |
+
" <td>556</td>\n",
|
| 1791 |
+
" </tr>\n",
|
| 1792 |
+
" <tr>\n",
|
| 1793 |
+
" <th>9</th>\n",
|
| 1794 |
+
" <td>numerical__SK_ID_CURR</td>\n",
|
| 1795 |
+
" <td>536</td>\n",
|
| 1796 |
+
" </tr>\n",
|
| 1797 |
+
" </tbody>\n",
|
| 1798 |
+
"</table>\n",
|
| 1799 |
+
"</div>"
|
| 1800 |
+
],
|
| 1801 |
+
"text/plain": [
|
| 1802 |
+
" feature importance\n",
|
| 1803 |
+
"0 numerical__EXT_SOURCE_3 833\n",
|
| 1804 |
+
"1 numerical__EXT_SOURCE_2 792\n",
|
| 1805 |
+
"2 numerical__DAYS_BIRTH 752\n",
|
| 1806 |
+
"3 numerical__EXT_SOURCE_1 732\n",
|
| 1807 |
+
"4 numerical__AMT_ANNUITY 660\n",
|
| 1808 |
+
"5 numerical__AMT_CREDIT 642\n",
|
| 1809 |
+
"6 numerical__DAYS_EMPLOYED 630\n",
|
| 1810 |
+
"7 numerical__DAYS_ID_PUBLISH 628\n",
|
| 1811 |
+
"8 numerical__DAYS_REGISTRATION 556\n",
|
| 1812 |
+
"9 numerical__SK_ID_CURR 536"
|
| 1813 |
+
]
|
| 1814 |
+
},
|
| 1815 |
+
"execution_count": 21,
|
| 1816 |
+
"metadata": {},
|
| 1817 |
+
"output_type": "execute_result"
|
| 1818 |
+
}
|
| 1819 |
+
],
|
| 1820 |
+
"source": [
|
| 1821 |
+
"sorted_feature_importance.head(10)"
|
| 1822 |
+
]
|
| 1823 |
+
},
|
| 1824 |
+
{
|
| 1825 |
+
"cell_type": "markdown",
|
| 1826 |
+
"id": "37d77ecb",
|
| 1827 |
+
"metadata": {},
|
| 1828 |
+
"source": [
|
| 1829 |
+
"**Calculate default values for remaining features**\n"
|
| 1830 |
+
]
|
| 1831 |
+
},
|
| 1832 |
+
{
|
| 1833 |
+
"cell_type": "code",
|
| 1834 |
+
"execution_count": 24,
|
| 1835 |
+
"id": "0c5f45cb",
|
| 1836 |
+
"metadata": {},
|
| 1837 |
+
"outputs": [
|
| 1838 |
+
{
|
| 1839 |
+
"data": {
|
| 1840 |
+
"text/plain": [
|
| 1841 |
+
"{'SK_ID_CURR': 277659.5,\n",
|
| 1842 |
+
" 'CNT_CHILDREN': 0.0,\n",
|
| 1843 |
+
" 'AMT_INCOME_TOTAL': 147150.0,\n",
|
| 1844 |
+
" 'AMT_CREDIT': 512997.75,\n",
|
| 1845 |
+
" 'AMT_ANNUITY': 24885.0,\n",
|
| 1846 |
+
" 'AMT_GOODS_PRICE': 450000.0,\n",
|
| 1847 |
+
" 'REGION_POPULATION_RELATIVE': 0.01885,\n",
|
| 1848 |
+
" 'DAYS_BIRTH': -15743.5,\n",
|
| 1849 |
+
" 'DAYS_EMPLOYED': -1219.0,\n",
|
| 1850 |
+
" 'DAYS_REGISTRATION': -4492.0,\n",
|
| 1851 |
+
" 'DAYS_ID_PUBLISH': -3254.0,\n",
|
| 1852 |
+
" 'OWN_CAR_AGE': 9.0,\n",
|
| 1853 |
+
" 'FLAG_MOBIL': 1.0,\n",
|
| 1854 |
+
" 'FLAG_EMP_PHONE': 1.0,\n",
|
| 1855 |
+
" 'FLAG_WORK_PHONE': 0.0,\n",
|
| 1856 |
+
" 'FLAG_CONT_MOBILE': 1.0,\n",
|
| 1857 |
+
" 'FLAG_PHONE': 0.0,\n",
|
| 1858 |
+
" 'FLAG_EMAIL': 0.0,\n",
|
| 1859 |
+
" 'CNT_FAM_MEMBERS': 2.0,\n",
|
| 1860 |
+
" 'REGION_RATING_CLIENT': 2.0,\n",
|
| 1861 |
+
" 'REGION_RATING_CLIENT_W_CITY': 2.0,\n",
|
| 1862 |
+
" 'HOUR_APPR_PROCESS_START': 12.0,\n",
|
| 1863 |
+
" 'REG_REGION_NOT_LIVE_REGION': 0.0,\n",
|
| 1864 |
+
" 'REG_REGION_NOT_WORK_REGION': 0.0,\n",
|
| 1865 |
+
" 'LIVE_REGION_NOT_WORK_REGION': 0.0,\n",
|
| 1866 |
+
" 'REG_CITY_NOT_LIVE_CITY': 0.0,\n",
|
| 1867 |
+
" 'REG_CITY_NOT_WORK_CITY': 0.0,\n",
|
| 1868 |
+
" 'LIVE_CITY_NOT_WORK_CITY': 0.0,\n",
|
| 1869 |
+
" 'EXT_SOURCE_1': 0.5068839442599388,\n",
|
| 1870 |
+
" 'EXT_SOURCE_2': 0.5662837032261614,\n",
|
| 1871 |
+
" 'EXT_SOURCE_3': 0.5370699579791587,\n",
|
| 1872 |
+
" 'APARTMENTS_AVG': 0.0876,\n",
|
| 1873 |
+
" 'BASEMENTAREA_AVG': 0.0764,\n",
|
| 1874 |
+
" 'YEARS_BEGINEXPLUATATION_AVG': 0.9816,\n",
|
| 1875 |
+
" 'YEARS_BUILD_AVG': 0.7552,\n",
|
| 1876 |
+
" 'COMMONAREA_AVG': 0.0211,\n",
|
| 1877 |
+
" 'ELEVATORS_AVG': 0.0,\n",
|
| 1878 |
+
" 'ENTRANCES_AVG': 0.1379,\n",
|
| 1879 |
+
" 'FLOORSMAX_AVG': 0.1667,\n",
|
| 1880 |
+
" 'FLOORSMIN_AVG': 0.2083,\n",
|
| 1881 |
+
" 'LANDAREA_AVG': 0.0483,\n",
|
| 1882 |
+
" 'LIVINGAPARTMENTS_AVG': 0.0756,\n",
|
| 1883 |
+
" 'LIVINGAREA_AVG': 0.0746,\n",
|
| 1884 |
+
" 'NONLIVINGAPARTMENTS_AVG': 0.0,\n",
|
| 1885 |
+
" 'NONLIVINGAREA_AVG': 0.0035,\n",
|
| 1886 |
+
" 'APARTMENTS_MODE': 0.084,\n",
|
| 1887 |
+
" 'BASEMENTAREA_MODE': 0.0748,\n",
|
| 1888 |
+
" 'YEARS_BEGINEXPLUATATION_MODE': 0.9816,\n",
|
| 1889 |
+
" 'YEARS_BUILD_MODE': 0.7648,\n",
|
| 1890 |
+
" 'COMMONAREA_MODE': 0.0191,\n",
|
| 1891 |
+
" 'ELEVATORS_MODE': 0.0,\n",
|
| 1892 |
+
" 'ENTRANCES_MODE': 0.1379,\n",
|
| 1893 |
+
" 'FLOORSMAX_MODE': 0.1667,\n",
|
| 1894 |
+
" 'FLOORSMIN_MODE': 0.2083,\n",
|
| 1895 |
+
" 'LANDAREA_MODE': 0.0459,\n",
|
| 1896 |
+
" 'LIVINGAPARTMENTS_MODE': 0.0771,\n",
|
| 1897 |
+
" 'LIVINGAREA_MODE': 0.0731,\n",
|
| 1898 |
+
" 'NONLIVINGAPARTMENTS_MODE': 0.0,\n",
|
| 1899 |
+
" 'NONLIVINGAREA_MODE': 0.0011,\n",
|
| 1900 |
+
" 'APARTMENTS_MEDI': 0.0864,\n",
|
| 1901 |
+
" 'BASEMENTAREA_MEDI': 0.0761,\n",
|
| 1902 |
+
" 'YEARS_BEGINEXPLUATATION_MEDI': 0.9816,\n",
|
| 1903 |
+
" 'YEARS_BUILD_MEDI': 0.7585,\n",
|
| 1904 |
+
" 'COMMONAREA_MEDI': 0.0209,\n",
|
| 1905 |
+
" 'ELEVATORS_MEDI': 0.0,\n",
|
| 1906 |
+
" 'ENTRANCES_MEDI': 0.1379,\n",
|
| 1907 |
+
" 'FLOORSMAX_MEDI': 0.1667,\n",
|
| 1908 |
+
" 'FLOORSMIN_MEDI': 0.2083,\n",
|
| 1909 |
+
" 'LANDAREA_MEDI': 0.0488,\n",
|
| 1910 |
+
" 'LIVINGAPARTMENTS_MEDI': 0.0765,\n",
|
| 1911 |
+
" 'LIVINGAREA_MEDI': 0.0749,\n",
|
| 1912 |
+
" 'NONLIVINGAPARTMENTS_MEDI': 0.0,\n",
|
| 1913 |
+
" 'NONLIVINGAREA_MEDI': 0.003,\n",
|
| 1914 |
+
" 'TOTALAREA_MODE': 0.0687,\n",
|
| 1915 |
+
" 'OBS_30_CNT_SOCIAL_CIRCLE': 0.0,\n",
|
| 1916 |
+
" 'DEF_30_CNT_SOCIAL_CIRCLE': 0.0,\n",
|
| 1917 |
+
" 'OBS_60_CNT_SOCIAL_CIRCLE': 0.0,\n",
|
| 1918 |
+
" 'DEF_60_CNT_SOCIAL_CIRCLE': 0.0,\n",
|
| 1919 |
+
" 'DAYS_LAST_PHONE_CHANGE': -755.0,\n",
|
| 1920 |
+
" 'FLAG_DOCUMENT_2': 0.0,\n",
|
| 1921 |
+
" 'FLAG_DOCUMENT_3': 1.0,\n",
|
| 1922 |
+
" 'FLAG_DOCUMENT_4': 0.0,\n",
|
| 1923 |
+
" 'FLAG_DOCUMENT_5': 0.0,\n",
|
| 1924 |
+
" 'FLAG_DOCUMENT_6': 0.0,\n",
|
| 1925 |
+
" 'FLAG_DOCUMENT_7': 0.0,\n",
|
| 1926 |
+
" 'FLAG_DOCUMENT_8': 0.0,\n",
|
| 1927 |
+
" 'FLAG_DOCUMENT_9': 0.0,\n",
|
| 1928 |
+
" 'FLAG_DOCUMENT_10': 0.0,\n",
|
| 1929 |
+
" 'FLAG_DOCUMENT_11': 0.0,\n",
|
| 1930 |
+
" 'FLAG_DOCUMENT_12': 0.0,\n",
|
| 1931 |
+
" 'FLAG_DOCUMENT_13': 0.0,\n",
|
| 1932 |
+
" 'FLAG_DOCUMENT_14': 0.0,\n",
|
| 1933 |
+
" 'FLAG_DOCUMENT_15': 0.0,\n",
|
| 1934 |
+
" 'FLAG_DOCUMENT_16': 0.0,\n",
|
| 1935 |
+
" 'FLAG_DOCUMENT_17': 0.0,\n",
|
| 1936 |
+
" 'FLAG_DOCUMENT_18': 0.0,\n",
|
| 1937 |
+
" 'FLAG_DOCUMENT_19': 0.0,\n",
|
| 1938 |
+
" 'FLAG_DOCUMENT_20': 0.0,\n",
|
| 1939 |
+
" 'FLAG_DOCUMENT_21': 0.0,\n",
|
| 1940 |
+
" 'AMT_REQ_CREDIT_BUREAU_HOUR': 0.0,\n",
|
| 1941 |
+
" 'AMT_REQ_CREDIT_BUREAU_DAY': 0.0,\n",
|
| 1942 |
+
" 'AMT_REQ_CREDIT_BUREAU_WEEK': 0.0,\n",
|
| 1943 |
+
" 'AMT_REQ_CREDIT_BUREAU_MON': 0.0,\n",
|
| 1944 |
+
" 'AMT_REQ_CREDIT_BUREAU_QRT': 0.0,\n",
|
| 1945 |
+
" 'AMT_REQ_CREDIT_BUREAU_YEAR': 1.0,\n",
|
| 1946 |
+
" 'NAME_CONTRACT_TYPE': 'Cash loans',\n",
|
| 1947 |
+
" 'CODE_GENDER': 'F',\n",
|
| 1948 |
+
" 'FLAG_OWN_CAR': 'N',\n",
|
| 1949 |
+
" 'FLAG_OWN_REALTY': 'Y',\n",
|
| 1950 |
+
" 'NAME_TYPE_SUITE': 'Unaccompanied',\n",
|
| 1951 |
+
" 'NAME_INCOME_TYPE': 'Working',\n",
|
| 1952 |
+
" 'NAME_EDUCATION_TYPE': 'Secondary / secondary special',\n",
|
| 1953 |
+
" 'NAME_FAMILY_STATUS': 'Married',\n",
|
| 1954 |
+
" 'NAME_HOUSING_TYPE': 'House / apartment',\n",
|
| 1955 |
+
" 'OCCUPATION_TYPE': 'Laborers',\n",
|
| 1956 |
+
" 'WEEKDAY_APPR_PROCESS_START': 'TUESDAY',\n",
|
| 1957 |
+
" 'ORGANIZATION_TYPE': 'Business Entity Type 3',\n",
|
| 1958 |
+
" 'FONDKAPREMONT_MODE': 'reg oper account',\n",
|
| 1959 |
+
" 'HOUSETYPE_MODE': 'block of flats',\n",
|
| 1960 |
+
" 'WALLSMATERIAL_MODE': 'Panel',\n",
|
| 1961 |
+
" 'EMERGENCYSTATE_MODE': 'No'}"
|
| 1962 |
+
]
|
| 1963 |
+
},
|
| 1964 |
+
"execution_count": 24,
|
| 1965 |
+
"metadata": {},
|
| 1966 |
+
"output_type": "execute_result"
|
| 1967 |
+
}
|
| 1968 |
+
],
|
| 1969 |
+
"source": [
|
| 1970 |
+
"all_features = X_train.columns.to_list()\n",
|
| 1971 |
+
"ui_features = sorted_feature_importance[\"feature\"].head(10).tolist()\n",
|
| 1972 |
+
"\n",
|
| 1973 |
+
"default_values = {}\n",
|
| 1974 |
+
"\n",
|
| 1975 |
+
"num_default = X_train.select_dtypes(include=[\"number\"]).median().to_dict()\n",
|
| 1976 |
+
"default_values.update(num_default)\n",
|
| 1977 |
+
"\n",
|
| 1978 |
+
"cat_defaults = X_train.select_dtypes(include=[\"object\"]).mode().iloc[0].to_dict()\n",
|
| 1979 |
+
"default_values.update(cat_defaults)\n",
|
| 1980 |
+
"\n",
|
| 1981 |
+
"default_values\n"
|
| 1982 |
+
]
|
| 1983 |
}
|
| 1984 |
],
|
| 1985 |
"metadata": {
|