iBrokeTheCode commited on
Commit
c742ac4
·
1 Parent(s): d77c733

chore: Save LightGBM model

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +1 -1
  3. app_bk.py +728 -0
  4. model/lgbm_model.joblib +3 -0
  5. tutorial_app.ipynb +619 -11
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  home_credit_dataset.csv filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  home_credit_dataset.csv filter=lfs diff=lfs merge=lfs -text
37
+ lgbm_model.joblib filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -689,7 +689,7 @@ def _(mo):
689
 
690
  @app.cell
691
  def _(mo):
692
- mo.md(r"""## 5. Conclusion""")
693
  return
694
 
695
 
 
689
 
690
  @app.cell
691
  def _(mo):
692
+ mo.md(r"""## 5. Model Selection""")
693
  return
694
 
695
 
app_bk.py ADDED
@@ -0,0 +1,728 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import marimo
2
+
3
+ __generated_with = "0.14.16"
4
+ app = marimo.App()
5
+
6
+
7
+ @app.cell
8
+ def _():
9
+ import marimo as mo
10
+ return (mo,)
11
+
12
+
13
+ @app.cell
14
+ def _(mo):
15
+ mo.center(mo.md("# Home Credit Default Risk Prediction"))
16
+ return
17
+
18
+
19
+ @app.cell
20
+ def _():
21
+ import pandas as pd
22
+
23
+ from sklearn.ensemble import RandomForestClassifier
24
+ from sklearn.linear_model import LogisticRegression
25
+ from sklearn.metrics import roc_auc_score
26
+ from sklearn.model_selection import RandomizedSearchCV
27
+
28
+ from sklearn.pipeline import Pipeline
29
+ from sklearn.compose import ColumnTransformer
30
+ from sklearn.impute import SimpleImputer
31
+ from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
32
+
33
+ from lightgbm import LGBMClassifier
34
+
35
+ from src.plots import (
36
+ plot_target_distribution,
37
+ plot_credit_amounts,
38
+ plot_education_levels,
39
+ plot_occupation,
40
+ plot_family_status,
41
+ plot_income_type,
42
+ )
43
+ from src.utils import get_dataset, get_features_target, get_train_test_sets
44
+ from src.preprocessing import preprocess_data_pipeline
45
+ return (
46
+ get_dataset,
47
+ get_features_target,
48
+ get_train_test_sets,
49
+ pd,
50
+ plot_credit_amounts,
51
+ plot_education_levels,
52
+ plot_family_status,
53
+ plot_income_type,
54
+ plot_occupation,
55
+ plot_target_distribution,
56
+ preprocess_data_pipeline,
57
+ )
58
+
59
+
60
+ @app.cell
61
+ def _(get_dataset, get_features_target):
62
+ df = get_dataset()
63
+ X, y = get_features_target(df)
64
+ return X, df, y
65
+
66
+
67
+ @app.cell
68
+ def _(mo):
69
+ mo.md("""## 1. Exploratory Data Analysis""")
70
+ return
71
+
72
+
73
+ @app.cell
74
+ def _(mo):
75
+ mo.callout(
76
+ kind="info",
77
+ value=mo.md(
78
+ """💡 **Want a step-by-step walkthrough instead?**
79
+ Check the Jupyter notebook version here: 👉 [Jupyter notebook](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/tutorial_app.ipynb)""",
80
+ ),
81
+ )
82
+ return
83
+
84
+
85
+ @app.cell
86
+ def _(mo):
87
+ mo.md("""### 1.1 Dataset Information""")
88
+ return
89
+
90
+
91
+ @app.cell
92
+ def _(mo):
93
+ mo.md("""**a. Shape of the train and test datasets**""")
94
+ return
95
+
96
+
97
+ @app.cell
98
+ def _(X_test, X_train, df):
99
+ train_samples = "Train dataset samples: {}".format(X_train.shape[0])
100
+ test_samples = "Test dataset samples: {}".format(X_test.shape[0])
101
+ columns_number = "Number of columns: {}".format(df.shape[1])
102
+
103
+ train_samples, test_samples, columns_number
104
+ return
105
+
106
+
107
+ @app.cell
108
+ def _(mo):
109
+ mo.md("""**b. Dataset features**""")
110
+ return
111
+
112
+
113
+ @app.cell
114
+ def _(X):
115
+ X.columns
116
+ return
117
+
118
+
119
+ @app.cell
120
+ def _(mo):
121
+ mo.md("""**c. Sample from dataset**""")
122
+ return
123
+
124
+
125
+ @app.cell
126
+ def _(X):
127
+ sample = X.head(5).T
128
+ sample.columns = [
129
+ str(col) for col in sample.columns
130
+ ] # fix integer name warning
131
+ sample = sample.astype(str) # avoid numeric conversion issues in viewer
132
+ sample
133
+ return
134
+
135
+
136
+ @app.cell
137
+ def _(mo):
138
+ mo.md("""**d. Target variable Distribution**""")
139
+ return
140
+
141
+
142
+ @app.cell
143
+ def _(df, plot_target_distribution):
144
+ target_table, target_plot = plot_target_distribution(df=df)
145
+ target_table
146
+ return (target_plot,)
147
+
148
+
149
+ @app.cell
150
+ def _(target_plot):
151
+ target_plot
152
+ return
153
+
154
+
155
+ @app.cell
156
+ def _(mo):
157
+ mo.md("""**e. Number of columns of each data type**""")
158
+ return
159
+
160
+
161
+ @app.cell
162
+ def _(X):
163
+ X.dtypes.value_counts().sort_values(ascending=False)
164
+ return
165
+
166
+
167
+ @app.cell
168
+ def _(X):
169
+ categorical_cols = (
170
+ X.select_dtypes(include=["object"]).nunique().sort_values(ascending=False)
171
+ )
172
+ categorical_cols
173
+ return
174
+
175
+
176
+ @app.cell
177
+ def _(mo):
178
+ mo.md("""**f. Missing data**""")
179
+ return
180
+
181
+
182
+ @app.cell
183
+ def _(X, pd):
184
+ missing_count = X.isna().sum().sort_values(ascending=False)
185
+ missing_percentage = (missing_count / X.shape[0] * 100).round(2)
186
+
187
+ missing_data = pd.DataFrame(
188
+ data={"Count": missing_count, "percentage": missing_percentage}
189
+ )
190
+ missing_data
191
+ return
192
+
193
+
194
+ @app.cell
195
+ def _(mo):
196
+ mo.md("""### 1.2 Distribution of Variables""")
197
+ return
198
+
199
+
200
+ @app.cell
201
+ def _(mo):
202
+ mo.md(
203
+ r"""Want to see how these plots were created? You can find the source code for the visualizations in [plots.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/plots.py)."""
204
+ )
205
+ return
206
+
207
+
208
+ @app.cell
209
+ def _(mo):
210
+ mo.md("""**a. Credit Amounts**""")
211
+ return
212
+
213
+
214
+ @app.cell
215
+ def _(X, plot_credit_amounts):
216
+ plot_credit_amounts(df=X)
217
+ return
218
+
219
+
220
+ @app.cell
221
+ def _(mo):
222
+ mo.md("""**b. Education Level of Credit Applicants**""")
223
+ return
224
+
225
+
226
+ @app.cell
227
+ def _(X, plot_education_levels):
228
+ education_table, education_plot = plot_education_levels(df=X)
229
+ education_table
230
+ return (education_plot,)
231
+
232
+
233
+ @app.cell
234
+ def _(education_plot):
235
+ education_plot
236
+ return
237
+
238
+
239
+ @app.cell
240
+ def _(mo):
241
+ mo.md("""**c. Ocupation of Credit Applicants**""")
242
+ return
243
+
244
+
245
+ @app.cell
246
+ def _(X, plot_occupation):
247
+ occupation_table, occupation_plot = plot_occupation(df=X)
248
+ occupation_table
249
+ return (occupation_plot,)
250
+
251
+
252
+ @app.cell
253
+ def _(occupation_plot):
254
+ occupation_plot
255
+ return
256
+
257
+
258
+ @app.cell
259
+ def _(mo):
260
+ mo.md("""**d. Family Status of Applicants**""")
261
+ return
262
+
263
+
264
+ @app.cell
265
+ def _(X, plot_family_status):
266
+ family_status_table, family_status_plot = plot_family_status(df=X)
267
+ family_status_table
268
+ return (family_status_plot,)
269
+
270
+
271
+ @app.cell
272
+ def _(family_status_plot):
273
+ family_status_plot
274
+ return
275
+
276
+
277
+ @app.cell
278
+ def _(mo):
279
+ mo.md("""**e. Income Type of Applicants by Target Variable**""")
280
+ return
281
+
282
+
283
+ @app.cell
284
+ def _(df, plot_income_type):
285
+ plot_income_type(df=df)
286
+ return
287
+
288
+
289
+ @app.cell
290
+ def _(mo):
291
+ mo.md("""## 2. Preprocessing""")
292
+ return
293
+
294
+
295
+ @app.cell
296
+ def _(mo):
297
+ mo.md("""**a. Separate Train and Test Datasets**""")
298
+ return
299
+
300
+
301
+ @app.cell
302
+ def _(X, get_train_test_sets, y):
303
+ X_train, y_train, X_test, y_test = get_train_test_sets(X, y)
304
+ X_train.shape, y_train.shape, X_test.shape, y_test.shape
305
+ return X_test, X_train
306
+
307
+
308
+ @app.cell
309
+ def _(mo):
310
+ mo.md("""**b. Preprocess Data**""")
311
+ return
312
+
313
+
314
+ @app.cell
315
+ def _(mo):
316
+ mo.md(
317
+ r"""
318
+ This preprocessing perform:
319
+
320
+ - Correct outliers/anomalous values in numerical columns (`DAYS_EMPLOYED` column).
321
+ - Encode string categorical features (`dtype object`).
322
+ - If the feature has 2 categories, Binary Encoding is applied.
323
+ - One Hot Encoding for more than 2 categories.
324
+ - Impute values for all columns with missing data (using median as imputing value).
325
+ - Feature scaling with Min-Max scaler
326
+
327
+ Want to see how the dataset was processed? You can find the code for the preprocessing steps in [preprocessing.py](https://huggingface.co/spaces/iBrokeTheCode/Home_Credit_Default_Risk_Prediction/blob/main/src/preprocessing.py).
328
+ """
329
+ )
330
+ return
331
+
332
+
333
+ @app.cell
334
+ def _(X_test, X_train, preprocess_data_pipeline):
335
+ train_data, test_data = preprocess_data_pipeline(
336
+ train_df=X_train, test_df=X_test
337
+ )
338
+ train_data.shape, test_data.shape
339
+ return
340
+
341
+
342
+ @app.cell
343
+ def _(mo):
344
+ mo.md("""## 3. Training Models""")
345
+ return
346
+
347
+
348
+ @app.cell
349
+ def _(mo):
350
+ mo.md(
351
+ r"""At this points, we will work with `train_data` and `test_data` as features sets; also `y_train` and `y_test` as target sets."""
352
+ )
353
+ return
354
+
355
+
356
+ @app.cell
357
+ def _(mo):
358
+ mo.md(r"""### 3.1 Logistic Regression""")
359
+ return
360
+
361
+
362
+ @app.cell
363
+ def _(mo):
364
+ mo.callout(
365
+ mo.md("""
366
+ In Logistic Regression, C is the inverse of regularization strength:
367
+
368
+ - **Small C** → Stronger regularization → Simpler model, less overfitting risk, but may underfit.
369
+ - **Large C** → Weaker regularization → Model fits training data more closely, but may overfit.
370
+ """),
371
+ kind="info",
372
+ )
373
+ return
374
+
375
+
376
+ @app.cell
377
+ def _(mo):
378
+ mo.md(
379
+ r"""
380
+ We trained our Logistic Regression model using the following code:
381
+
382
+ ```py
383
+ # 📌 Logistic Regression
384
+ log_reg = LogisticRegression(C=0.0001)
385
+ log_reg.fit(train_data, y_train)
386
+
387
+ # Train data predicton (class 1)
388
+ lr_train_pred = log_reg.predict_proba(train_data)[:, 1]
389
+
390
+ # Test data prediction (class 1)
391
+ lr_test_pred = log_reg.predict_proba(test_data)[:, 1]
392
+
393
+ # Get the ROC AUC Score on train and test datasets
394
+ log_reg_scores = {
395
+ "train_score": roc_auc_score(y_train, lr_train_pred),
396
+ "test_score": roc_auc_score(y_test, lr_test_pred),
397
+ }
398
+ log_reg_scores
399
+ ```
400
+
401
+ 📈 The ROC AUC scores obtained:
402
+ """
403
+ )
404
+ return
405
+
406
+
407
+ @app.cell
408
+ def _():
409
+ lr_scores = {
410
+ "train_score": 0.6868418961663535,
411
+ "test_score": 0.6854973003347028,
412
+ }
413
+ lr_scores
414
+ return
415
+
416
+
417
+ @app.cell
418
+ def _(mo):
419
+ mo.md(r"""### 3.2 Random Forest Classifier""")
420
+ return
421
+
422
+
423
+ @app.cell
424
+ def _(mo):
425
+ mo.md(
426
+ r"""
427
+ We trained our Random Forest Classifier model using the following code:
428
+
429
+ ```py
430
+ # 📌 Random Forest Classifier
431
+ rf = RandomForestClassifier(random_state=42, n_jobs=-1)
432
+ rf.fit(train_data, y_train)
433
+
434
+ rf_train_pred = rf.predict_proba(train_data)[:, 1]
435
+ rf_test_pred = rf.predict_proba(test_data)[:, 1]
436
+
437
+ rf_scores = {
438
+ "train_score": roc_auc_score(y_train, rf_train_pred),
439
+ "test_score": roc_auc_score(y_test, rf_test_pred),
440
+ }
441
+ rf_scores
442
+ ```
443
+
444
+ 📈 The ROC AUC scores obtained:
445
+ """
446
+ )
447
+ return
448
+
449
+
450
+ @app.cell
451
+ def _():
452
+ rf_scores = {"train_score": 1.0, "test_score": 0.7066811557903828}
453
+ rf_scores
454
+ return
455
+
456
+
457
+ @app.cell
458
+ def _(mo):
459
+ mo.md(r"""### 3.3. Randomized Search with Cross Validations""")
460
+ return
461
+
462
+
463
+ @app.cell
464
+ def _(mo):
465
+ mo.md(
466
+ r"""
467
+ We trained the Randomized Search CV using the following code:
468
+
469
+ ```py
470
+ # 📌 RandomizedSearchCV
471
+ param_dist = {"n_estimators": [50, 100, 150], "max_depth": [10, 20, 30]}
472
+
473
+ rf_optimized = RandomForestClassifier(random_state=42, n_jobs=-1)
474
+ rscv = RandomizedSearchCV(
475
+ estimator=rf_optimized,
476
+ param_distributions=param_dist,
477
+ n_iter=5,
478
+ scoring="roc_auc",
479
+ cv=3,
480
+ random_state=42,
481
+ n_jobs=-1,
482
+ )
483
+
484
+ rscv.fit(train_data, y_train)
485
+
486
+ rfo_train_pred = rscv.predict_proba(train_data)[:, 1]
487
+ rfo_test_pred = rscv.predict_proba(test_data)[:, 1]
488
+
489
+ rfo_scores = {
490
+ "train_score": roc_auc_score(y_train, rfo_train_pred),
491
+ "test_score": roc_auc_score(y_test, rfo_test_pred),
492
+ }
493
+ rfo_scores
494
+ ```
495
+
496
+ 📈 The ROC AUC scores obtained:
497
+ """
498
+ )
499
+ return
500
+
501
+
502
+ @app.cell
503
+ def _():
504
+ rfo_scores = {
505
+ "train_score": 0.8196620915431655,
506
+ "test_score": 0.7308385425476998,
507
+ }
508
+ rfo_scores
509
+ return
510
+
511
+
512
+ @app.cell
513
+ def _(mo):
514
+ mo.md(r"""🥇The best results:""")
515
+ return
516
+
517
+
518
+ @app.cell
519
+ def _():
520
+ optimized_results = {
521
+ "best_params_": {"n_estimators": 100, "max_depth": 10},
522
+ "best_score_": 0.7296259755147781,
523
+ "best_estimator_": "RandomForestClassifier(max_depth=10, n_jobs=-1, random_state=42)",
524
+ }
525
+ optimized_results
526
+ return
527
+
528
+
529
+ @app.cell
530
+ def _(mo):
531
+ mo.md(r"""### 3.4 LightGBM""")
532
+ return
533
+
534
+
535
+ @app.cell
536
+ def _(mo):
537
+ mo.md(
538
+ r"""
539
+ We trained our LightGBM Classifier model using the following code:
540
+
541
+ ```py
542
+ # 📌 LightGBM
543
+ import warnings
544
+
545
+ warnings.filterwarnings(
546
+ "ignore", message="X does not have valid feature names"
547
+ )
548
+
549
+ # 📌 Get numerical and categorical variables (binary and mutiple)
550
+ num_cols = X_train.select_dtypes(include="number").columns.to_list()
551
+ cat_cols = X_train.select_dtypes(include="object").columns.to_list()
552
+
553
+ binary_cols = [col for col in cat_cols if X_train[col].nunique() == 2]
554
+ multi_cols = [col for col in cat_cols if X_train[col].nunique() > 2]
555
+
556
+ # 📌 [1] Create the pipelines for different data types
557
+ numerical_pipeline = Pipeline(
558
+ steps=[
559
+ ("imputer", SimpleImputer(strategy="median")),
560
+ ("scaler", MinMaxScaler()),
561
+ ]
562
+ )
563
+
564
+ binary_pipeline = Pipeline(
565
+ steps=[
566
+ ("imputer", SimpleImputer(strategy="most_frequent")),
567
+ ("ordinal", OrdinalEncoder()),
568
+ ("scaler", MinMaxScaler()),
569
+ ]
570
+ )
571
+
572
+ multi_pipeline = Pipeline(
573
+ steps=[
574
+ ("imputer", SimpleImputer(strategy="most_frequent")),
575
+ (
576
+ "onehot",
577
+ OneHotEncoder(handle_unknown="ignore", sparse_output=False),
578
+ ),
579
+ ("scaler", MinMaxScaler()),
580
+ ]
581
+ )
582
+
583
+ # 📌 [2] Create the preprocessor using ColumnTransformer
584
+ preprocessor = ColumnTransformer(
585
+ transformers=[
586
+ ("binary", binary_pipeline, binary_cols),
587
+ ("multi", multi_pipeline, multi_cols),
588
+ ("numerical", numerical_pipeline, num_cols),
589
+ ],
590
+ remainder="passthrough",
591
+ )
592
+
593
+ # 📌 [3] Create the Final Pipeline that combines the preprocessor and the model
594
+ lgbm = LGBMClassifier(
595
+ n_estimators=500,
596
+ learning_rate=0.05,
597
+ max_depth=-1,
598
+ random_state=42,
599
+ class_weight="balanced",
600
+ n_jobs=-1,
601
+ )
602
+
603
+ lgbm_pipeline = Pipeline(
604
+ steps=[("preprocessor", preprocessor), ("classifier", lgbm)]
605
+ )
606
+
607
+ # 📌 [4] Fit the Final Pipeline on the ORIGINAL, unprocessed data
608
+ # The pipeline takes care of all the preprocessing internally.
609
+ lgbm_pipeline.fit(X_train, y_train)
610
+
611
+ lgbm_train_pred = lgbm_pipeline.predict_proba(X_train)[:, 1]
612
+ lgbm_test_pred = lgbm_pipeline.predict_proba(X_test)[:, 1]
613
+
614
+ lgbm_scores = {
615
+ "train_score": roc_auc_score(y_train, lgbm_train_pred),
616
+ "test_score": roc_auc_score(y_test, lgbm_test_pred),
617
+ }
618
+ lgbm_scores
619
+ ```
620
+
621
+ 📈 The ROC AUC scores obtained:
622
+ """
623
+ )
624
+ return
625
+
626
+
627
+ @app.cell
628
+ def _():
629
+ lgbm_scores = {
630
+ "train_score": 0.8523466410959462,
631
+ "test_score": 0.7514895868142193,
632
+ }
633
+ lgbm_scores
634
+ return
635
+
636
+
637
+ @app.cell
638
+ def _(mo):
639
+ mo.md(r"""## 4. Model Performance Analysis""")
640
+ return
641
+
642
+
643
+ @app.cell
644
+ def _(mo):
645
+ lg_stat = mo.stat(
646
+ label="Logistic Regression",
647
+ bordered=True,
648
+ value="🏋️ 0.687 🔎 0.685",
649
+ caption="Scores are consistent across train and test, indicating no overfitting. However, the overall AUC is low, suggesting underfitting — the model is too simple to capture complex patterns.",
650
+ direction="decrease",
651
+ )
652
+
653
+ rfc_stat = mo.stat(
654
+ label="Random Forest Classifier",
655
+ bordered=True,
656
+ value="🏋️ 1.0 🔎 0.707",
657
+ caption="Perfect training AUC indicates severe overfitting — the model memorized the training set. While the test score is better than Logistic Regression, the gap is too large for good generalization.",
658
+ direction="decrease",
659
+ )
660
+
661
+ rfo_stat = mo.stat(
662
+ label="Random Forest with Randomized Search",
663
+ bordered=True,
664
+ value="🏋️ 0.820 🔎 0.731",
665
+ caption="Hyperparameter tuning greatly reduced overfitting. The smaller train–test gap and improved test AUC show better generalization and a strong performance.",
666
+ direction="increase",
667
+ )
668
+
669
+ lgbm_stat = mo.stat(
670
+ label="LightGBM",
671
+ bordered=True,
672
+ value="🏋️ 0.852 🔎 0.751",
673
+ caption="Best overall performance. Small train–test gap and highest test AUC indicate a well-balanced model with strong generalization.",
674
+ direction="increase",
675
+ )
676
+
677
+ mo.vstack(
678
+ items=[
679
+ mo.hstack(items=[lg_stat, rfc_stat], widths="equal", gap=1),
680
+ mo.hstack(items=[rfo_stat, lgbm_stat], widths="equal", gap=1),
681
+ ],
682
+ gap=1,
683
+ heights="equal",
684
+ align="center",
685
+ justify="center",
686
+ )
687
+ return
688
+
689
+
690
+ @app.cell
691
+ def _(mo):
692
+ mo.md(r"""## 5. Model Selection""")
693
+ return
694
+
695
+
696
+ @app.cell
697
+ def _(mo):
698
+ mo.md(
699
+ r"""
700
+ Based on a comparison of all the models, the final model selection is clear.
701
+
702
+ | Model | Train Score (AUC ROC) | Test Score (AUC ROC) |
703
+ | :--- | :---: | :---: |
704
+ | Logistic Regression | 0.687 | 0.685 |
705
+ | Random Forest Classifier | 1.000 | 0.707 |
706
+ | Randomized Search (Tuned RF) | 0.820 | 0.731 |
707
+ | **LightGBM** | 0.852 | **0.751** |
708
+
709
+ * The **Logistic Regression** model performed poorly due to underfitting.
710
+ * The base **Random Forest** model, while better, suffered from severe overfitting.
711
+ * The tuned **Random Forest** model was a significant improvement and a strong contender, achieving a solid `test_score`.
712
+ * However, the **LightGBM** model ultimately demonstrated the best performance, achieving the highest **ROC AUC test score of 0.751**. This indicates that it is the most robust and accurate model for predicting loan repayment risk on unseen data.
713
+ """
714
+ )
715
+ return
716
+
717
+
718
+ @app.cell
719
+ def _(mo):
720
+ mo.callout(
721
+ kind="success",
722
+ value="🥇 Therefore, we will select the LightGBM model as our final choice for deployment.",
723
+ )
724
+ return
725
+
726
+
727
+ if __name__ == "__main__":
728
+ app.run()
model/lgbm_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1931e98f1ec892e21f92b9be570f02bac96347dad2d8ae57732d582cdf2951f3
3
+ size 1813887
tutorial_app.ipynb CHANGED
@@ -10,12 +10,14 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 4,
14
  "id": "vblA",
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
  "import pandas as pd\n",
 
 
19
  "\n",
20
  "from sklearn.ensemble import RandomForestClassifier\n",
21
  "from sklearn.linear_model import LogisticRegression\n",
@@ -43,7 +45,7 @@
43
  },
44
  {
45
  "cell_type": "code",
46
- "execution_count": 5,
47
  "id": "bkHC",
48
  "metadata": {},
49
  "outputs": [],
@@ -828,7 +830,7 @@
828
  },
829
  {
830
  "cell_type": "code",
831
- "execution_count": 26,
832
  "id": "kqZH",
833
  "metadata": {},
834
  "outputs": [
@@ -838,7 +840,7 @@
838
  "((196806, 121), (196806,), (49202, 121), (49202,))"
839
  ]
840
  },
841
- "execution_count": 26,
842
  "metadata": {},
843
  "output_type": "execute_result"
844
  }
@@ -1178,7 +1180,7 @@
1178
  },
1179
  {
1180
  "cell_type": "code",
1181
- "execution_count": 38,
1182
  "id": "cEAS",
1183
  "metadata": {},
1184
  "outputs": [
@@ -1187,7 +1189,7 @@
1187
  "output_type": "stream",
1188
  "text": [
1189
  "[LightGBM] [Info] Number of positive: 15784, number of negative: 181022\n",
1190
- "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056678 seconds.\n",
1191
  "You can set `force_col_wise=true` to remove the overhead.\n",
1192
  "[LightGBM] [Info] Total Bins 11594\n",
1193
  "[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 229\n",
@@ -1314,25 +1316,25 @@
1314
  "source": [
1315
  "### 4.1 Logistic Regression\n",
1316
  "\n",
1317
- "The Logistic Regression model shows a `train_score` of 0.687 and a `test_score` of 0.685.\n",
1318
  "\n",
1319
  "**Interpretation:** This model's performance is consistent across the training and testing sets, as indicated by the very small gap between the scores. This means the model is not overfitting. However, the overall scores are relatively low for a binary classification task, suggesting that the model is likely **underfitting**. It's too simple to capture the underlying patterns in the data effectively.\n",
1320
  "\n",
1321
  "### 4.2 Random Forest Classifier\n",
1322
  "\n",
1323
- "The base Random Forest model produced a `train_score` of 1.0 and a `test_score` of 0.707.\n",
1324
  "\n",
1325
  "**Interpretation:** The perfect `train_score` of 1.0 is a clear and severe sign of **overfitting**. The model has essentially memorized the training data, and this does not generalize well to unseen data, as shown by the much lower `test_score`. While the test score is better than the Logistic Regression, the model is too complex and needs to be regularized or tuned to perform better.\n",
1326
  "\n",
1327
  "### 4.3 Randomized Search with Cross Validations (Random Forest)\n",
1328
  "\n",
1329
- "The hyperparameter-tuned Random Forest model achieved a `train_score` of 0.820 and a `test_score` of 0.731.\n",
1330
  "\n",
1331
  "**Interpretation:** This is a much better result than the base Random Forest. The gap between the `train_score` and `test_score` is significantly smaller, indicating that the hyperparameter tuning successfully **reduced overfitting**. The `test_score` of 0.731 is also a notable improvement, showing that the model now generalizes better to unseen data. This is a well-performing and well-tuned model.\n",
1332
  "\n",
1333
  "### 4.4 LightGBM\n",
1334
  "\n",
1335
- "The LightGBM model produced a `train_score` of 0.852 and a `test_score` of 0.751.\n",
1336
  "\n",
1337
  "**Interpretation:** The LightGBM model shows the best overall performance with the highest `test_score` of 0.751. There is a small gap between the training and testing scores, which is normal for a powerful boosting model, suggesting a good balance between capturing complex patterns and generalizing well. The model is performing exceptionally and is neither severely overfitting nor underfitting.\n"
1338
  ]
@@ -1342,7 +1344,7 @@
1342
  "id": "5d48c191",
1343
  "metadata": {},
1344
  "source": [
1345
- "## 5. Conclusion\n"
1346
  ]
1347
  },
1348
  {
@@ -1372,6 +1374,612 @@
1372
  "source": [
1373
  "> 🥇 Therefore, we will select the **LightGBM** model as our final choice for deployment.\n"
1374
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1375
  }
1376
  ],
1377
  "metadata": {
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 12,
14
  "id": "vblA",
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
  "import pandas as pd\n",
19
+ "import joblib\n",
20
+ "import numpy as np\n",
21
  "\n",
22
  "from sklearn.ensemble import RandomForestClassifier\n",
23
  "from sklearn.linear_model import LogisticRegression\n",
 
45
  },
46
  {
47
  "cell_type": "code",
48
+ "execution_count": 3,
49
  "id": "bkHC",
50
  "metadata": {},
51
  "outputs": [],
 
830
  },
831
  {
832
  "cell_type": "code",
833
+ "execution_count": 4,
834
  "id": "kqZH",
835
  "metadata": {},
836
  "outputs": [
 
840
  "((196806, 121), (196806,), (49202, 121), (49202,))"
841
  ]
842
  },
843
+ "execution_count": 4,
844
  "metadata": {},
845
  "output_type": "execute_result"
846
  }
 
1180
  },
1181
  {
1182
  "cell_type": "code",
1183
+ "execution_count": 5,
1184
  "id": "cEAS",
1185
  "metadata": {},
1186
  "outputs": [
 
1189
  "output_type": "stream",
1190
  "text": [
1191
  "[LightGBM] [Info] Number of positive: 15784, number of negative: 181022\n",
1192
+ "[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060479 seconds.\n",
1193
  "You can set `force_col_wise=true` to remove the overhead.\n",
1194
  "[LightGBM] [Info] Total Bins 11594\n",
1195
  "[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 229\n",
 
1316
  "source": [
1317
  "### 4.1 Logistic Regression\n",
1318
  "\n",
1319
+ "_The Logistic Regression model shows a `train_score` of 0.687 and a `test_score` of 0.685._\n",
1320
  "\n",
1321
  "**Interpretation:** This model's performance is consistent across the training and testing sets, as indicated by the very small gap between the scores. This means the model is not overfitting. However, the overall scores are relatively low for a binary classification task, suggesting that the model is likely **underfitting**. It's too simple to capture the underlying patterns in the data effectively.\n",
1322
  "\n",
1323
  "### 4.2 Random Forest Classifier\n",
1324
  "\n",
1325
+ "_The base Random Forest model produced a `train_score` of 1.0 and a `test_score` of 0.707._\n",
1326
  "\n",
1327
  "**Interpretation:** The perfect `train_score` of 1.0 is a clear and severe sign of **overfitting**. The model has essentially memorized the training data, and this does not generalize well to unseen data, as shown by the much lower `test_score`. While the test score is better than the Logistic Regression, the model is too complex and needs to be regularized or tuned to perform better.\n",
1328
  "\n",
1329
  "### 4.3 Randomized Search with Cross Validations (Random Forest)\n",
1330
  "\n",
1331
+ "_The hyperparameter-tuned Random Forest model achieved a `train_score` of 0.820 and a `test_score` of 0.731._\n",
1332
  "\n",
1333
  "**Interpretation:** This is a much better result than the base Random Forest. The gap between the `train_score` and `test_score` is significantly smaller, indicating that the hyperparameter tuning successfully **reduced overfitting**. The `test_score` of 0.731 is also a notable improvement, showing that the model now generalizes better to unseen data. This is a well-performing and well-tuned model.\n",
1334
  "\n",
1335
  "### 4.4 LightGBM\n",
1336
  "\n",
1337
+ "_The LightGBM model produced a `train_score` of 0.852 and a `test_score` of 0.751._\n",
1338
  "\n",
1339
  "**Interpretation:** The LightGBM model shows the best overall performance with the highest `test_score` of 0.751. There is a small gap between the training and testing scores, which is normal for a powerful boosting model, suggesting a good balance between capturing complex patterns and generalizing well. The model is performing exceptionally and is neither severely overfitting nor underfitting.\n"
1340
  ]
 
1344
  "id": "5d48c191",
1345
  "metadata": {},
1346
  "source": [
1347
+ "## 5. Model Selection\n"
1348
  ]
1349
  },
1350
  {
 
1374
  "source": [
1375
  "> 🥇 Therefore, we will select the **LightGBM** model as our final choice for deployment.\n"
1376
  ]
1377
+ },
1378
+ {
1379
+ "cell_type": "markdown",
1380
+ "id": "4aa60dcb",
1381
+ "metadata": {},
1382
+ "source": [
1383
+ "## 6. Saving the Model\n"
1384
+ ]
1385
+ },
1386
+ {
1387
+ "cell_type": "markdown",
1388
+ "id": "bca2853b",
1389
+ "metadata": {},
1390
+ "source": [
1391
+ "### 6.1 Saving the Model\n"
1392
+ ]
1393
+ },
1394
+ {
1395
+ "cell_type": "code",
1396
+ "execution_count": 10,
1397
+ "id": "3246c249",
1398
+ "metadata": {},
1399
+ "outputs": [
1400
+ {
1401
+ "name": "stdout",
1402
+ "output_type": "stream",
1403
+ "text": [
1404
+ "Model saved successfully as lgbm_model.joblib\n"
1405
+ ]
1406
+ }
1407
+ ],
1408
+ "source": [
1409
+ "joblib.dump(lgbm_pipeline, \"model/lgbm_model.joblib\")\n",
1410
+ "print(\"Model saved successfully as lgbm_model.joblib\")"
1411
+ ]
1412
+ },
1413
+ {
1414
+ "cell_type": "markdown",
1415
+ "id": "a69129f2",
1416
+ "metadata": {},
1417
+ "source": [
1418
+ "### 6.2 Feature Importances\n"
1419
+ ]
1420
+ },
1421
+ {
1422
+ "cell_type": "markdown",
1423
+ "id": "5db0729b",
1424
+ "metadata": {},
1425
+ "source": [
1426
+ "We will select the top 10 features based on their importance scores and use them in the prediction interface. Note that `SK_ID_CURR` appears among the top features: it is a row identifier, so its apparent importance is likely leakage and it should probably be excluded from the interface.\n"
1427
+ ]
1428
+ },
1429
+ {
1430
+ "cell_type": "code",
1431
+ "execution_count": 15,
1432
+ "id": "12e917d7",
1433
+ "metadata": {},
1434
+ "outputs": [
1435
+ {
1436
+ "data": {
1437
+ "text/plain": [
1438
+ "array([ 86, 65, 21, 0, 72, 50, 0, 9, 23, 0, 6, 1, 10,\n",
1439
+ " 34, 0, 13, 0, 9, 28, 0, 0, 46, 27, 70, 10, 12,\n",
1440
+ " 47, 23, 58, 20, 27, 0, 21, 3, 30, 12, 12, 8, 11,\n",
1441
+ " 33, 9, 7, 43, 28, 3, 15, 0, 28, 17, 10, 15, 2,\n",
1442
+ " 2, 15, 2, 6, 2, 22, 35, 20, 25, 22, 21, 21, 3,\n",
1443
+ " 4, 10, 7, 8, 31, 0, 32, 1, 7, 1, 12, 4, 1,\n",
1444
+ " 2, 0, 2, 17, 0, 0, 10, 2, 8, 0, 8, 0, 20,\n",
1445
+ " 0, 6, 9, 12, 23, 0, 6, 9, 3, 23, 0, 6, 23,\n",
1446
+ " 3, 18, 28, 10, 0, 0, 4, 12, 0, 0, 4, 4, 3,\n",
1447
+ " 6, 23, 10, 5, 0, 13, 14, 11, 7, 2, 5, 1, 10,\n",
1448
+ " 2, 2, 5, 15, 19, 0, 536, 47, 410, 642, 660, 489, 375,\n",
1449
+ " 752, 630, 556, 628, 196, 0, 2, 50, 0, 45, 13, 75, 24,\n",
1450
+ " 54, 230, 0, 23, 10, 36, 24, 16, 732, 792, 833, 130, 85,\n",
1451
+ " 100, 58, 77, 42, 66, 57, 36, 116, 75, 90, 54, 106, 106,\n",
1452
+ " 129, 112, 69, 111, 26, 39, 10, 22, 104, 69, 131, 37, 99,\n",
1453
+ " 47, 67, 93, 29, 40, 7, 23, 18, 16, 67, 34, 43, 19,\n",
1454
+ " 76, 176, 121, 68, 38, 57, 536, 0, 75, 0, 8, 10, 0,\n",
1455
+ " 10, 5, 0, 16, 0, 21, 14, 14, 33, 0, 31, 5, 0,\n",
1456
+ " 1, 2, 10, 29, 69, 82, 141], dtype=int32)"
1457
+ ]
1458
+ },
1459
+ "execution_count": 15,
1460
+ "metadata": {},
1461
+ "output_type": "execute_result"
1462
+ }
1463
+ ],
1464
+ "source": [
1465
+ "loaded_pipeline = joblib.load(\"model/lgbm_model.joblib\")\n",
1466
+ "feature_importances = loaded_pipeline.named_steps[\"classifier\"].feature_importances_\n",
1467
+ "feature_importances"
1468
+ ]
1469
+ },
1470
+ {
1471
+ "cell_type": "code",
1472
+ "execution_count": 17,
1473
+ "id": "ad40c56f",
1474
+ "metadata": {},
1475
+ "outputs": [
1476
+ {
1477
+ "data": {
1478
+ "text/plain": [
1479
+ "array(['binary__NAME_CONTRACT_TYPE', 'binary__FLAG_OWN_CAR',\n",
1480
+ " 'binary__FLAG_OWN_REALTY', 'binary__EMERGENCYSTATE_MODE',\n",
1481
+ " 'multi__CODE_GENDER_F', 'multi__CODE_GENDER_M',\n",
1482
+ " 'multi__CODE_GENDER_XNA', 'multi__NAME_TYPE_SUITE_Children',\n",
1483
+ " 'multi__NAME_TYPE_SUITE_Family',\n",
1484
+ " 'multi__NAME_TYPE_SUITE_Group of people',\n",
1485
+ " 'multi__NAME_TYPE_SUITE_Other_A', 'multi__NAME_TYPE_SUITE_Other_B',\n",
1486
+ " 'multi__NAME_TYPE_SUITE_Spouse, partner',\n",
1487
+ " 'multi__NAME_TYPE_SUITE_Unaccompanied',\n",
1488
+ " 'multi__NAME_INCOME_TYPE_Businessman',\n",
1489
+ " 'multi__NAME_INCOME_TYPE_Commercial associate',\n",
1490
+ " 'multi__NAME_INCOME_TYPE_Maternity leave',\n",
1491
+ " 'multi__NAME_INCOME_TYPE_Pensioner',\n",
1492
+ " 'multi__NAME_INCOME_TYPE_State servant',\n",
1493
+ " 'multi__NAME_INCOME_TYPE_Student',\n",
1494
+ " 'multi__NAME_INCOME_TYPE_Unemployed',\n",
1495
+ " 'multi__NAME_INCOME_TYPE_Working',\n",
1496
+ " 'multi__NAME_EDUCATION_TYPE_Academic degree',\n",
1497
+ " 'multi__NAME_EDUCATION_TYPE_Higher education',\n",
1498
+ " 'multi__NAME_EDUCATION_TYPE_Incomplete higher',\n",
1499
+ " 'multi__NAME_EDUCATION_TYPE_Lower secondary',\n",
1500
+ " 'multi__NAME_EDUCATION_TYPE_Secondary / secondary special',\n",
1501
+ " 'multi__NAME_FAMILY_STATUS_Civil marriage',\n",
1502
+ " 'multi__NAME_FAMILY_STATUS_Married',\n",
1503
+ " 'multi__NAME_FAMILY_STATUS_Separated',\n",
1504
+ " 'multi__NAME_FAMILY_STATUS_Single / not married',\n",
1505
+ " 'multi__NAME_FAMILY_STATUS_Unknown',\n",
1506
+ " 'multi__NAME_FAMILY_STATUS_Widow',\n",
1507
+ " 'multi__NAME_HOUSING_TYPE_Co-op apartment',\n",
1508
+ " 'multi__NAME_HOUSING_TYPE_House / apartment',\n",
1509
+ " 'multi__NAME_HOUSING_TYPE_Municipal apartment',\n",
1510
+ " 'multi__NAME_HOUSING_TYPE_Office apartment',\n",
1511
+ " 'multi__NAME_HOUSING_TYPE_Rented apartment',\n",
1512
+ " 'multi__NAME_HOUSING_TYPE_With parents',\n",
1513
+ " 'multi__OCCUPATION_TYPE_Accountants',\n",
1514
+ " 'multi__OCCUPATION_TYPE_Cleaning staff',\n",
1515
+ " 'multi__OCCUPATION_TYPE_Cooking staff',\n",
1516
+ " 'multi__OCCUPATION_TYPE_Core staff',\n",
1517
+ " 'multi__OCCUPATION_TYPE_Drivers',\n",
1518
+ " 'multi__OCCUPATION_TYPE_HR staff',\n",
1519
+ " 'multi__OCCUPATION_TYPE_High skill tech staff',\n",
1520
+ " 'multi__OCCUPATION_TYPE_IT staff',\n",
1521
+ " 'multi__OCCUPATION_TYPE_Laborers',\n",
1522
+ " 'multi__OCCUPATION_TYPE_Low-skill Laborers',\n",
1523
+ " 'multi__OCCUPATION_TYPE_Managers',\n",
1524
+ " 'multi__OCCUPATION_TYPE_Medicine staff',\n",
1525
+ " 'multi__OCCUPATION_TYPE_Private service staff',\n",
1526
+ " 'multi__OCCUPATION_TYPE_Realty agents',\n",
1527
+ " 'multi__OCCUPATION_TYPE_Sales staff',\n",
1528
+ " 'multi__OCCUPATION_TYPE_Secretaries',\n",
1529
+ " 'multi__OCCUPATION_TYPE_Security staff',\n",
1530
+ " 'multi__OCCUPATION_TYPE_Waiters/barmen staff',\n",
1531
+ " 'multi__WEEKDAY_APPR_PROCESS_START_FRIDAY',\n",
1532
+ " 'multi__WEEKDAY_APPR_PROCESS_START_MONDAY',\n",
1533
+ " 'multi__WEEKDAY_APPR_PROCESS_START_SATURDAY',\n",
1534
+ " 'multi__WEEKDAY_APPR_PROCESS_START_SUNDAY',\n",
1535
+ " 'multi__WEEKDAY_APPR_PROCESS_START_THURSDAY',\n",
1536
+ " 'multi__WEEKDAY_APPR_PROCESS_START_TUESDAY',\n",
1537
+ " 'multi__WEEKDAY_APPR_PROCESS_START_WEDNESDAY',\n",
1538
+ " 'multi__ORGANIZATION_TYPE_Advertising',\n",
1539
+ " 'multi__ORGANIZATION_TYPE_Agriculture',\n",
1540
+ " 'multi__ORGANIZATION_TYPE_Bank',\n",
1541
+ " 'multi__ORGANIZATION_TYPE_Business Entity Type 1',\n",
1542
+ " 'multi__ORGANIZATION_TYPE_Business Entity Type 2',\n",
1543
+ " 'multi__ORGANIZATION_TYPE_Business Entity Type 3',\n",
1544
+ " 'multi__ORGANIZATION_TYPE_Cleaning',\n",
1545
+ " 'multi__ORGANIZATION_TYPE_Construction',\n",
1546
+ " 'multi__ORGANIZATION_TYPE_Culture',\n",
1547
+ " 'multi__ORGANIZATION_TYPE_Electricity',\n",
1548
+ " 'multi__ORGANIZATION_TYPE_Emergency',\n",
1549
+ " 'multi__ORGANIZATION_TYPE_Government',\n",
1550
+ " 'multi__ORGANIZATION_TYPE_Hotel',\n",
1551
+ " 'multi__ORGANIZATION_TYPE_Housing',\n",
1552
+ " 'multi__ORGANIZATION_TYPE_Industry: type 1',\n",
1553
+ " 'multi__ORGANIZATION_TYPE_Industry: type 10',\n",
1554
+ " 'multi__ORGANIZATION_TYPE_Industry: type 11',\n",
1555
+ " 'multi__ORGANIZATION_TYPE_Industry: type 12',\n",
1556
+ " 'multi__ORGANIZATION_TYPE_Industry: type 13',\n",
1557
+ " 'multi__ORGANIZATION_TYPE_Industry: type 2',\n",
1558
+ " 'multi__ORGANIZATION_TYPE_Industry: type 3',\n",
1559
+ " 'multi__ORGANIZATION_TYPE_Industry: type 4',\n",
1560
+ " 'multi__ORGANIZATION_TYPE_Industry: type 5',\n",
1561
+ " 'multi__ORGANIZATION_TYPE_Industry: type 6',\n",
1562
+ " 'multi__ORGANIZATION_TYPE_Industry: type 7',\n",
1563
+ " 'multi__ORGANIZATION_TYPE_Industry: type 8',\n",
1564
+ " 'multi__ORGANIZATION_TYPE_Industry: type 9',\n",
1565
+ " 'multi__ORGANIZATION_TYPE_Insurance',\n",
1566
+ " 'multi__ORGANIZATION_TYPE_Kindergarten',\n",
1567
+ " 'multi__ORGANIZATION_TYPE_Legal Services',\n",
1568
+ " 'multi__ORGANIZATION_TYPE_Medicine',\n",
1569
+ " 'multi__ORGANIZATION_TYPE_Military',\n",
1570
+ " 'multi__ORGANIZATION_TYPE_Mobile',\n",
1571
+ " 'multi__ORGANIZATION_TYPE_Other',\n",
1572
+ " 'multi__ORGANIZATION_TYPE_Police',\n",
1573
+ " 'multi__ORGANIZATION_TYPE_Postal',\n",
1574
+ " 'multi__ORGANIZATION_TYPE_Realtor',\n",
1575
+ " 'multi__ORGANIZATION_TYPE_Religion',\n",
1576
+ " 'multi__ORGANIZATION_TYPE_Restaurant',\n",
1577
+ " 'multi__ORGANIZATION_TYPE_School',\n",
1578
+ " 'multi__ORGANIZATION_TYPE_Security',\n",
1579
+ " 'multi__ORGANIZATION_TYPE_Security Ministries',\n",
1580
+ " 'multi__ORGANIZATION_TYPE_Self-employed',\n",
1581
+ " 'multi__ORGANIZATION_TYPE_Services',\n",
1582
+ " 'multi__ORGANIZATION_TYPE_Telecom',\n",
1583
+ " 'multi__ORGANIZATION_TYPE_Trade: type 1',\n",
1584
+ " 'multi__ORGANIZATION_TYPE_Trade: type 2',\n",
1585
+ " 'multi__ORGANIZATION_TYPE_Trade: type 3',\n",
1586
+ " 'multi__ORGANIZATION_TYPE_Trade: type 4',\n",
1587
+ " 'multi__ORGANIZATION_TYPE_Trade: type 5',\n",
1588
+ " 'multi__ORGANIZATION_TYPE_Trade: type 6',\n",
1589
+ " 'multi__ORGANIZATION_TYPE_Trade: type 7',\n",
1590
+ " 'multi__ORGANIZATION_TYPE_Transport: type 1',\n",
1591
+ " 'multi__ORGANIZATION_TYPE_Transport: type 2',\n",
1592
+ " 'multi__ORGANIZATION_TYPE_Transport: type 3',\n",
1593
+ " 'multi__ORGANIZATION_TYPE_Transport: type 4',\n",
1594
+ " 'multi__ORGANIZATION_TYPE_University',\n",
1595
+ " 'multi__ORGANIZATION_TYPE_XNA',\n",
1596
+ " 'multi__FONDKAPREMONT_MODE_not specified',\n",
1597
+ " 'multi__FONDKAPREMONT_MODE_org spec account',\n",
1598
+ " 'multi__FONDKAPREMONT_MODE_reg oper account',\n",
1599
+ " 'multi__FONDKAPREMONT_MODE_reg oper spec account',\n",
1600
+ " 'multi__HOUSETYPE_MODE_block of flats',\n",
1601
+ " 'multi__HOUSETYPE_MODE_specific housing',\n",
1602
+ " 'multi__HOUSETYPE_MODE_terraced house',\n",
1603
+ " 'multi__WALLSMATERIAL_MODE_Block',\n",
1604
+ " 'multi__WALLSMATERIAL_MODE_Mixed',\n",
1605
+ " 'multi__WALLSMATERIAL_MODE_Monolithic',\n",
1606
+ " 'multi__WALLSMATERIAL_MODE_Others',\n",
1607
+ " 'multi__WALLSMATERIAL_MODE_Panel',\n",
1608
+ " 'multi__WALLSMATERIAL_MODE_Stone, brick',\n",
1609
+ " 'multi__WALLSMATERIAL_MODE_Wooden', 'numerical__SK_ID_CURR',\n",
1610
+ " 'numerical__CNT_CHILDREN', 'numerical__AMT_INCOME_TOTAL',\n",
1611
+ " 'numerical__AMT_CREDIT', 'numerical__AMT_ANNUITY',\n",
1612
+ " 'numerical__AMT_GOODS_PRICE',\n",
1613
+ " 'numerical__REGION_POPULATION_RELATIVE', 'numerical__DAYS_BIRTH',\n",
1614
+ " 'numerical__DAYS_EMPLOYED', 'numerical__DAYS_REGISTRATION',\n",
1615
+ " 'numerical__DAYS_ID_PUBLISH', 'numerical__OWN_CAR_AGE',\n",
1616
+ " 'numerical__FLAG_MOBIL', 'numerical__FLAG_EMP_PHONE',\n",
1617
+ " 'numerical__FLAG_WORK_PHONE', 'numerical__FLAG_CONT_MOBILE',\n",
1618
+ " 'numerical__FLAG_PHONE', 'numerical__FLAG_EMAIL',\n",
1619
+ " 'numerical__CNT_FAM_MEMBERS', 'numerical__REGION_RATING_CLIENT',\n",
1620
+ " 'numerical__REGION_RATING_CLIENT_W_CITY',\n",
1621
+ " 'numerical__HOUR_APPR_PROCESS_START',\n",
1622
+ " 'numerical__REG_REGION_NOT_LIVE_REGION',\n",
1623
+ " 'numerical__REG_REGION_NOT_WORK_REGION',\n",
1624
+ " 'numerical__LIVE_REGION_NOT_WORK_REGION',\n",
1625
+ " 'numerical__REG_CITY_NOT_LIVE_CITY',\n",
1626
+ " 'numerical__REG_CITY_NOT_WORK_CITY',\n",
1627
+ " 'numerical__LIVE_CITY_NOT_WORK_CITY', 'numerical__EXT_SOURCE_1',\n",
1628
+ " 'numerical__EXT_SOURCE_2', 'numerical__EXT_SOURCE_3',\n",
1629
+ " 'numerical__APARTMENTS_AVG', 'numerical__BASEMENTAREA_AVG',\n",
1630
+ " 'numerical__YEARS_BEGINEXPLUATATION_AVG',\n",
1631
+ " 'numerical__YEARS_BUILD_AVG', 'numerical__COMMONAREA_AVG',\n",
1632
+ " 'numerical__ELEVATORS_AVG', 'numerical__ENTRANCES_AVG',\n",
1633
+ " 'numerical__FLOORSMAX_AVG', 'numerical__FLOORSMIN_AVG',\n",
1634
+ " 'numerical__LANDAREA_AVG', 'numerical__LIVINGAPARTMENTS_AVG',\n",
1635
+ " 'numerical__LIVINGAREA_AVG', 'numerical__NONLIVINGAPARTMENTS_AVG',\n",
1636
+ " 'numerical__NONLIVINGAREA_AVG', 'numerical__APARTMENTS_MODE',\n",
1637
+ " 'numerical__BASEMENTAREA_MODE',\n",
1638
+ " 'numerical__YEARS_BEGINEXPLUATATION_MODE',\n",
1639
+ " 'numerical__YEARS_BUILD_MODE', 'numerical__COMMONAREA_MODE',\n",
1640
+ " 'numerical__ELEVATORS_MODE', 'numerical__ENTRANCES_MODE',\n",
1641
+ " 'numerical__FLOORSMAX_MODE', 'numerical__FLOORSMIN_MODE',\n",
1642
+ " 'numerical__LANDAREA_MODE', 'numerical__LIVINGAPARTMENTS_MODE',\n",
1643
+ " 'numerical__LIVINGAREA_MODE',\n",
1644
+ " 'numerical__NONLIVINGAPARTMENTS_MODE',\n",
1645
+ " 'numerical__NONLIVINGAREA_MODE', 'numerical__APARTMENTS_MEDI',\n",
1646
+ " 'numerical__BASEMENTAREA_MEDI',\n",
1647
+ " 'numerical__YEARS_BEGINEXPLUATATION_MEDI',\n",
1648
+ " 'numerical__YEARS_BUILD_MEDI', 'numerical__COMMONAREA_MEDI',\n",
1649
+ " 'numerical__ELEVATORS_MEDI', 'numerical__ENTRANCES_MEDI',\n",
1650
+ " 'numerical__FLOORSMAX_MEDI', 'numerical__FLOORSMIN_MEDI',\n",
1651
+ " 'numerical__LANDAREA_MEDI', 'numerical__LIVINGAPARTMENTS_MEDI',\n",
1652
+ " 'numerical__LIVINGAREA_MEDI',\n",
1653
+ " 'numerical__NONLIVINGAPARTMENTS_MEDI',\n",
1654
+ " 'numerical__NONLIVINGAREA_MEDI', 'numerical__TOTALAREA_MODE',\n",
1655
+ " 'numerical__OBS_30_CNT_SOCIAL_CIRCLE',\n",
1656
+ " 'numerical__DEF_30_CNT_SOCIAL_CIRCLE',\n",
1657
+ " 'numerical__OBS_60_CNT_SOCIAL_CIRCLE',\n",
1658
+ " 'numerical__DEF_60_CNT_SOCIAL_CIRCLE',\n",
1659
+ " 'numerical__DAYS_LAST_PHONE_CHANGE', 'numerical__FLAG_DOCUMENT_2',\n",
1660
+ " 'numerical__FLAG_DOCUMENT_3', 'numerical__FLAG_DOCUMENT_4',\n",
1661
+ " 'numerical__FLAG_DOCUMENT_5', 'numerical__FLAG_DOCUMENT_6',\n",
1662
+ " 'numerical__FLAG_DOCUMENT_7', 'numerical__FLAG_DOCUMENT_8',\n",
1663
+ " 'numerical__FLAG_DOCUMENT_9', 'numerical__FLAG_DOCUMENT_10',\n",
1664
+ " 'numerical__FLAG_DOCUMENT_11', 'numerical__FLAG_DOCUMENT_12',\n",
1665
+ " 'numerical__FLAG_DOCUMENT_13', 'numerical__FLAG_DOCUMENT_14',\n",
1666
+ " 'numerical__FLAG_DOCUMENT_15', 'numerical__FLAG_DOCUMENT_16',\n",
1667
+ " 'numerical__FLAG_DOCUMENT_17', 'numerical__FLAG_DOCUMENT_18',\n",
1668
+ " 'numerical__FLAG_DOCUMENT_19', 'numerical__FLAG_DOCUMENT_20',\n",
1669
+ " 'numerical__FLAG_DOCUMENT_21',\n",
1670
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_HOUR',\n",
1671
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_DAY',\n",
1672
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_WEEK',\n",
1673
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_MON',\n",
1674
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_QRT',\n",
1675
+ " 'numerical__AMT_REQ_CREDIT_BUREAU_YEAR'], dtype=object)"
1676
+ ]
1677
+ },
1678
+ "execution_count": 17,
1679
+ "metadata": {},
1680
+ "output_type": "execute_result"
1681
+ }
1682
+ ],
1683
+ "source": [
1684
+ "# Get the names of the final features after preprocessing\n",
1685
+ "preprocessor = loaded_pipeline.named_steps[\"preprocessor\"]\n",
1686
+ "final_features_names = preprocessor.get_feature_names_out()\n",
1687
+ "final_features_names"
1688
+ ]
1689
+ },
1690
+ {
1691
+ "cell_type": "code",
1692
+ "execution_count": null,
1693
+ "id": "336a580f",
1694
+ "metadata": {},
1695
+ "outputs": [],
1696
+ "source": [
1697
+ "# Create a DataFrame to store the feature names and their corresponding importances\n",
1698
+ "feature_importances_df = pd.DataFrame(\n",
1699
+ " {\"feature\": final_features_names, \"importance\": feature_importances}\n",
1700
+ ")\n",
1701
+ "\n",
1702
+ "sorted_feature_importance = feature_importances_df.sort_values(\n",
1703
+ " by=\"importance\", ascending=False\n",
1704
+ ").reset_index(drop=True)"
1705
+ ]
1706
+ },
1707
+ {
1708
+ "cell_type": "markdown",
1709
+ "id": "e86a01f7",
1710
+ "metadata": {},
1711
+ "source": [
1712
+ "**Top 10 most important features**\n"
1713
+ ]
1714
+ },
1715
+ {
1716
+ "cell_type": "code",
1717
+ "execution_count": 21,
1718
+ "id": "c7bc9e30",
1719
+ "metadata": {},
1720
+ "outputs": [
1721
+ {
1722
+ "data": {
1723
+ "text/html": [
1724
+ "<div>\n",
1725
+ "<style scoped>\n",
1726
+ " .dataframe tbody tr th:only-of-type {\n",
1727
+ " vertical-align: middle;\n",
1728
+ " }\n",
1729
+ "\n",
1730
+ " .dataframe tbody tr th {\n",
1731
+ " vertical-align: top;\n",
1732
+ " }\n",
1733
+ "\n",
1734
+ " .dataframe thead th {\n",
1735
+ " text-align: right;\n",
1736
+ " }\n",
1737
+ "</style>\n",
1738
+ "<table border=\"1\" class=\"dataframe\">\n",
1739
+ " <thead>\n",
1740
+ " <tr style=\"text-align: right;\">\n",
1741
+ " <th></th>\n",
1742
+ " <th>feature</th>\n",
1743
+ " <th>importance</th>\n",
1744
+ " </tr>\n",
1745
+ " </thead>\n",
1746
+ " <tbody>\n",
1747
+ " <tr>\n",
1748
+ " <th>0</th>\n",
1749
+ " <td>numerical__EXT_SOURCE_3</td>\n",
1750
+ " <td>833</td>\n",
1751
+ " </tr>\n",
1752
+ " <tr>\n",
1753
+ " <th>1</th>\n",
1754
+ " <td>numerical__EXT_SOURCE_2</td>\n",
1755
+ " <td>792</td>\n",
1756
+ " </tr>\n",
1757
+ " <tr>\n",
1758
+ " <th>2</th>\n",
1759
+ " <td>numerical__DAYS_BIRTH</td>\n",
1760
+ " <td>752</td>\n",
1761
+ " </tr>\n",
1762
+ " <tr>\n",
1763
+ " <th>3</th>\n",
1764
+ " <td>numerical__EXT_SOURCE_1</td>\n",
1765
+ " <td>732</td>\n",
1766
+ " </tr>\n",
1767
+ " <tr>\n",
1768
+ " <th>4</th>\n",
1769
+ " <td>numerical__AMT_ANNUITY</td>\n",
1770
+ " <td>660</td>\n",
1771
+ " </tr>\n",
1772
+ " <tr>\n",
1773
+ " <th>5</th>\n",
1774
+ " <td>numerical__AMT_CREDIT</td>\n",
1775
+ " <td>642</td>\n",
1776
+ " </tr>\n",
1777
+ " <tr>\n",
1778
+ " <th>6</th>\n",
1779
+ " <td>numerical__DAYS_EMPLOYED</td>\n",
1780
+ " <td>630</td>\n",
1781
+ " </tr>\n",
1782
+ " <tr>\n",
1783
+ " <th>7</th>\n",
1784
+ " <td>numerical__DAYS_ID_PUBLISH</td>\n",
1785
+ " <td>628</td>\n",
1786
+ " </tr>\n",
1787
+ " <tr>\n",
1788
+ " <th>8</th>\n",
1789
+ " <td>numerical__DAYS_REGISTRATION</td>\n",
1790
+ " <td>556</td>\n",
1791
+ " </tr>\n",
1792
+ " <tr>\n",
1793
+ " <th>9</th>\n",
1794
+ " <td>numerical__SK_ID_CURR</td>\n",
1795
+ " <td>536</td>\n",
1796
+ " </tr>\n",
1797
+ " </tbody>\n",
1798
+ "</table>\n",
1799
+ "</div>"
1800
+ ],
1801
+ "text/plain": [
1802
+ " feature importance\n",
1803
+ "0 numerical__EXT_SOURCE_3 833\n",
1804
+ "1 numerical__EXT_SOURCE_2 792\n",
1805
+ "2 numerical__DAYS_BIRTH 752\n",
1806
+ "3 numerical__EXT_SOURCE_1 732\n",
1807
+ "4 numerical__AMT_ANNUITY 660\n",
1808
+ "5 numerical__AMT_CREDIT 642\n",
1809
+ "6 numerical__DAYS_EMPLOYED 630\n",
1810
+ "7 numerical__DAYS_ID_PUBLISH 628\n",
1811
+ "8 numerical__DAYS_REGISTRATION 556\n",
1812
+ "9 numerical__SK_ID_CURR 536"
1813
+ ]
1814
+ },
1815
+ "execution_count": 21,
1816
+ "metadata": {},
1817
+ "output_type": "execute_result"
1818
+ }
1819
+ ],
1820
+ "source": [
1821
+ "sorted_feature_importance.head(10)"
1822
+ ]
1823
+ },
1824
+ {
1825
+ "cell_type": "markdown",
1826
+ "id": "37d77ecb",
1827
+ "metadata": {},
1828
+ "source": [
1829
+ "**Calculate default values for remaining features**\n"
1830
+ ]
1831
+ },
1832
+ {
1833
+ "cell_type": "code",
1834
+ "execution_count": 24,
1835
+ "id": "0c5f45cb",
1836
+ "metadata": {},
1837
+ "outputs": [
1838
+ {
1839
+ "data": {
1840
+ "text/plain": [
1841
+ "{'SK_ID_CURR': 277659.5,\n",
1842
+ " 'CNT_CHILDREN': 0.0,\n",
1843
+ " 'AMT_INCOME_TOTAL': 147150.0,\n",
1844
+ " 'AMT_CREDIT': 512997.75,\n",
1845
+ " 'AMT_ANNUITY': 24885.0,\n",
1846
+ " 'AMT_GOODS_PRICE': 450000.0,\n",
1847
+ " 'REGION_POPULATION_RELATIVE': 0.01885,\n",
1848
+ " 'DAYS_BIRTH': -15743.5,\n",
1849
+ " 'DAYS_EMPLOYED': -1219.0,\n",
1850
+ " 'DAYS_REGISTRATION': -4492.0,\n",
1851
+ " 'DAYS_ID_PUBLISH': -3254.0,\n",
1852
+ " 'OWN_CAR_AGE': 9.0,\n",
1853
+ " 'FLAG_MOBIL': 1.0,\n",
1854
+ " 'FLAG_EMP_PHONE': 1.0,\n",
1855
+ " 'FLAG_WORK_PHONE': 0.0,\n",
1856
+ " 'FLAG_CONT_MOBILE': 1.0,\n",
1857
+ " 'FLAG_PHONE': 0.0,\n",
1858
+ " 'FLAG_EMAIL': 0.0,\n",
1859
+ " 'CNT_FAM_MEMBERS': 2.0,\n",
1860
+ " 'REGION_RATING_CLIENT': 2.0,\n",
1861
+ " 'REGION_RATING_CLIENT_W_CITY': 2.0,\n",
1862
+ " 'HOUR_APPR_PROCESS_START': 12.0,\n",
1863
+ " 'REG_REGION_NOT_LIVE_REGION': 0.0,\n",
1864
+ " 'REG_REGION_NOT_WORK_REGION': 0.0,\n",
1865
+ " 'LIVE_REGION_NOT_WORK_REGION': 0.0,\n",
1866
+ " 'REG_CITY_NOT_LIVE_CITY': 0.0,\n",
1867
+ " 'REG_CITY_NOT_WORK_CITY': 0.0,\n",
1868
+ " 'LIVE_CITY_NOT_WORK_CITY': 0.0,\n",
1869
+ " 'EXT_SOURCE_1': 0.5068839442599388,\n",
1870
+ " 'EXT_SOURCE_2': 0.5662837032261614,\n",
1871
+ " 'EXT_SOURCE_3': 0.5370699579791587,\n",
1872
+ " 'APARTMENTS_AVG': 0.0876,\n",
1873
+ " 'BASEMENTAREA_AVG': 0.0764,\n",
1874
+ " 'YEARS_BEGINEXPLUATATION_AVG': 0.9816,\n",
1875
+ " 'YEARS_BUILD_AVG': 0.7552,\n",
1876
+ " 'COMMONAREA_AVG': 0.0211,\n",
1877
+ " 'ELEVATORS_AVG': 0.0,\n",
1878
+ " 'ENTRANCES_AVG': 0.1379,\n",
1879
+ " 'FLOORSMAX_AVG': 0.1667,\n",
1880
+ " 'FLOORSMIN_AVG': 0.2083,\n",
1881
+ " 'LANDAREA_AVG': 0.0483,\n",
1882
+ " 'LIVINGAPARTMENTS_AVG': 0.0756,\n",
1883
+ " 'LIVINGAREA_AVG': 0.0746,\n",
1884
+ " 'NONLIVINGAPARTMENTS_AVG': 0.0,\n",
1885
+ " 'NONLIVINGAREA_AVG': 0.0035,\n",
1886
+ " 'APARTMENTS_MODE': 0.084,\n",
1887
+ " 'BASEMENTAREA_MODE': 0.0748,\n",
1888
+ " 'YEARS_BEGINEXPLUATATION_MODE': 0.9816,\n",
1889
+ " 'YEARS_BUILD_MODE': 0.7648,\n",
1890
+ " 'COMMONAREA_MODE': 0.0191,\n",
1891
+ " 'ELEVATORS_MODE': 0.0,\n",
1892
+ " 'ENTRANCES_MODE': 0.1379,\n",
1893
+ " 'FLOORSMAX_MODE': 0.1667,\n",
1894
+ " 'FLOORSMIN_MODE': 0.2083,\n",
1895
+ " 'LANDAREA_MODE': 0.0459,\n",
1896
+ " 'LIVINGAPARTMENTS_MODE': 0.0771,\n",
1897
+ " 'LIVINGAREA_MODE': 0.0731,\n",
1898
+ " 'NONLIVINGAPARTMENTS_MODE': 0.0,\n",
1899
+ " 'NONLIVINGAREA_MODE': 0.0011,\n",
1900
+ " 'APARTMENTS_MEDI': 0.0864,\n",
1901
+ " 'BASEMENTAREA_MEDI': 0.0761,\n",
1902
+ " 'YEARS_BEGINEXPLUATATION_MEDI': 0.9816,\n",
1903
+ " 'YEARS_BUILD_MEDI': 0.7585,\n",
1904
+ " 'COMMONAREA_MEDI': 0.0209,\n",
1905
+ " 'ELEVATORS_MEDI': 0.0,\n",
1906
+ " 'ENTRANCES_MEDI': 0.1379,\n",
1907
+ " 'FLOORSMAX_MEDI': 0.1667,\n",
1908
+ " 'FLOORSMIN_MEDI': 0.2083,\n",
1909
+ " 'LANDAREA_MEDI': 0.0488,\n",
1910
+ " 'LIVINGAPARTMENTS_MEDI': 0.0765,\n",
1911
+ " 'LIVINGAREA_MEDI': 0.0749,\n",
1912
+ " 'NONLIVINGAPARTMENTS_MEDI': 0.0,\n",
1913
+ " 'NONLIVINGAREA_MEDI': 0.003,\n",
1914
+ " 'TOTALAREA_MODE': 0.0687,\n",
1915
+ " 'OBS_30_CNT_SOCIAL_CIRCLE': 0.0,\n",
1916
+ " 'DEF_30_CNT_SOCIAL_CIRCLE': 0.0,\n",
1917
+ " 'OBS_60_CNT_SOCIAL_CIRCLE': 0.0,\n",
1918
+ " 'DEF_60_CNT_SOCIAL_CIRCLE': 0.0,\n",
1919
+ " 'DAYS_LAST_PHONE_CHANGE': -755.0,\n",
1920
+ " 'FLAG_DOCUMENT_2': 0.0,\n",
1921
+ " 'FLAG_DOCUMENT_3': 1.0,\n",
1922
+ " 'FLAG_DOCUMENT_4': 0.0,\n",
1923
+ " 'FLAG_DOCUMENT_5': 0.0,\n",
1924
+ " 'FLAG_DOCUMENT_6': 0.0,\n",
1925
+ " 'FLAG_DOCUMENT_7': 0.0,\n",
1926
+ " 'FLAG_DOCUMENT_8': 0.0,\n",
1927
+ " 'FLAG_DOCUMENT_9': 0.0,\n",
1928
+ " 'FLAG_DOCUMENT_10': 0.0,\n",
1929
+ " 'FLAG_DOCUMENT_11': 0.0,\n",
1930
+ " 'FLAG_DOCUMENT_12': 0.0,\n",
1931
+ " 'FLAG_DOCUMENT_13': 0.0,\n",
1932
+ " 'FLAG_DOCUMENT_14': 0.0,\n",
1933
+ " 'FLAG_DOCUMENT_15': 0.0,\n",
1934
+ " 'FLAG_DOCUMENT_16': 0.0,\n",
1935
+ " 'FLAG_DOCUMENT_17': 0.0,\n",
1936
+ " 'FLAG_DOCUMENT_18': 0.0,\n",
1937
+ " 'FLAG_DOCUMENT_19': 0.0,\n",
1938
+ " 'FLAG_DOCUMENT_20': 0.0,\n",
1939
+ " 'FLAG_DOCUMENT_21': 0.0,\n",
1940
+ " 'AMT_REQ_CREDIT_BUREAU_HOUR': 0.0,\n",
1941
+ " 'AMT_REQ_CREDIT_BUREAU_DAY': 0.0,\n",
1942
+ " 'AMT_REQ_CREDIT_BUREAU_WEEK': 0.0,\n",
1943
+ " 'AMT_REQ_CREDIT_BUREAU_MON': 0.0,\n",
1944
+ " 'AMT_REQ_CREDIT_BUREAU_QRT': 0.0,\n",
1945
+ " 'AMT_REQ_CREDIT_BUREAU_YEAR': 1.0,\n",
1946
+ " 'NAME_CONTRACT_TYPE': 'Cash loans',\n",
1947
+ " 'CODE_GENDER': 'F',\n",
1948
+ " 'FLAG_OWN_CAR': 'N',\n",
1949
+ " 'FLAG_OWN_REALTY': 'Y',\n",
1950
+ " 'NAME_TYPE_SUITE': 'Unaccompanied',\n",
1951
+ " 'NAME_INCOME_TYPE': 'Working',\n",
1952
+ " 'NAME_EDUCATION_TYPE': 'Secondary / secondary special',\n",
1953
+ " 'NAME_FAMILY_STATUS': 'Married',\n",
1954
+ " 'NAME_HOUSING_TYPE': 'House / apartment',\n",
1955
+ " 'OCCUPATION_TYPE': 'Laborers',\n",
1956
+ " 'WEEKDAY_APPR_PROCESS_START': 'TUESDAY',\n",
1957
+ " 'ORGANIZATION_TYPE': 'Business Entity Type 3',\n",
1958
+ " 'FONDKAPREMONT_MODE': 'reg oper account',\n",
1959
+ " 'HOUSETYPE_MODE': 'block of flats',\n",
1960
+ " 'WALLSMATERIAL_MODE': 'Panel',\n",
1961
+ " 'EMERGENCYSTATE_MODE': 'No'}"
1962
+ ]
1963
+ },
1964
+ "execution_count": 24,
1965
+ "metadata": {},
1966
+ "output_type": "execute_result"
1967
+ }
1968
+ ],
1969
+ "source": [
1970
+ "all_features = X_train.columns.to_list()\n",
1971
+ "ui_features = sorted_feature_importance[\"feature\"].head(10).tolist()\n",
1972
+ "\n",
1973
+ "default_values = {}\n",
1974
+ "\n",
1975
+ "num_default = X_train.select_dtypes(include=[\"number\"]).median().to_dict()\n",
1976
+ "default_values.update(num_default)\n",
1977
+ "\n",
1978
+ "cat_defaults = X_train.select_dtypes(include=[\"object\"]).mode().iloc[0].to_dict()\n",
1979
+ "default_values.update(cat_defaults)\n",
1980
+ "\n",
1981
+ "default_values\n"
1982
+ ]
1983
  }
1984
  ],
1985
  "metadata": {