EdwinLH committed on
Commit
299c10a
·
verified ·
1 Parent(s): 3569ec3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +451 -0
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import tempfile
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
7
+ from sklearn.base import BaseEstimator, TransformerMixin
8
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
9
+ from sklearn.compose import ColumnTransformer
10
+
11
+
12
# FeatureEngineer Class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Learn imputation statistics on the training set and replay them at
    prediction time.

    Numeric gaps are filled with (loan_type, term) group means where
    available (global means otherwise), income is filled by age group,
    categoricals by their most frequent value, and missing LTV is derived
    as (loan_amount / property_value) * 100.
    """

    def __init__(self):
        # Save the learned values during training to be used to populate
        # the missing data in any test set.

        # Numeric group means (LTV excluded as it is a logical computation
        # and will be performed on the actual test set)
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None

        # Most frequent categorical values
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]

        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        """Learn imputation parameters from the training data only."""
        X = X.copy()

        # Group means keyed by (loan_type, term) tuples (pandas Series
        # indexed by those tuples).
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()

        # Global means used when a (loan_type, term) combination is unseen.
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()

        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()

        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)

        # Remaining numerical features use the plain global mean.
        self.loan_amount_mean = X['loan_amount'].mean()
        self.credit_score_mean = X['Credit_Score'].mean()

        # Most frequent category per categorical column.
        for col in self.categorical_features:
            if col in X.columns:
                modes = X[col].mode(dropna=True)
                # Guard against an all-NaN column: Series.mode is then empty
                # and indexing [0] would raise (robustness fix).
                if not modes.empty:
                    self.most_frequent_cats[col] = modes[0]

        return self

    def transform(self, X):
        """Apply the statistics learned in fit() to impute missing values."""
        X = X.copy()

        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            # Look up the group key, e.g. ('type1', 360) for
            # ['loan_type', 'term']; fall back to the overall mean when the
            # combination was not seen during fit.
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)
                if key in group_means:
                    return group_means[key]
                else:
                    return overall_mean
            else:
                return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)

        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)

        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income by age group, overall mean as fallback.
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                else:
                    return self.overall_income_mean
            else:
                return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        X['income'] = X['income'].fillna(self.overall_income_mean)
        X['income'] = X['income'].round(-2)

        # Impute term, property_value, dtir1, loan_amount, Credit_Score
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)

        # LTV calculation: LTV = (loan_amount / property_value) * 100
        # (runs after loan_amount/property_value are imputed above).
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)

        # Impute categoricals with the most frequent training value.
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)

        # NOTE: an unused `numeric_cols = X.select_dtypes(...)` local was
        # removed here (dead code).
        return X
150
+
151
+
152
+
153
# Custom Ordinal Mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    """Map ordered category labels to integer codes via a fixed dictionary.

    Labels absent from the mapping (including NaN) are encoded as -1.
    """

    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        # Stateless: the mapping is supplied up front, nothing to learn.
        return self

    def transform(self, X):
        frame = X.copy()
        for target in self.columns:
            # Unknown or missing labels become NaN after map(); the fillna
            # turns them into the -1 sentinel.
            codes = frame[target].map(self.mapping)
            frame[target] = codes.fillna(-1)
        return frame
167
+
168
# Define the feature lists
ordinal_cols = ['age']

binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type',
]

multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application',
]

numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1',
]

# Ordinal mapping for 'age': position in this ordered list is the code.
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = dict(zip(condition_order, range(len(condition_order))))

# Define the transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # binary categories -> 0/1
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Column transformer (pipeline-style fan-out over column groups). Numeric
# features pass through unscaled here: scaling is deliberately deferred
# until after SMOTE (see ScaleLastColumns below).
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols),
])
207
+
208
+
209
+
210
# Scale only the trailing `n_numeric` columns. After the preprocessor, the
# numeric features occupy the last n_numeric columns of the matrix (10 for
# this dataset), and this step is applied post-SMOTE.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        # One StandardScaler holds the per-column mean/std learned in fit()
        # so the identical scaling is replayed on the test set.
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array at this point (SMOTE output).
        tail = X[:, -self.n_numeric:]
        self.scaler.fit(tail)
        return self

    def transform(self, X):
        scaled = X.copy()
        tail = scaled[:, -self.n_numeric:]
        scaled[:, -self.n_numeric:] = self.scaler.transform(tail)
        return scaled
227
+
228
+
229
+
230
# Load trained pipeline
# Each .pkl is a fitted preprocessing+model pipeline serialized with joblib.
# NOTE(review): unpickling presumably requires the custom transformer
# classes defined above (FeatureEngineer, OrdinalMapper, ScaleLastColumns)
# to be resolvable in this module — verify against how the pickles were saved.
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline = joblib.load("best_nb_pipeline.pkl")

# Custom threshold
# Per-model decision cutoff; None means the callers fall back to 0.5.
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}

# Map model name to pipeline
# Keys must match the dropdown labels used in the UI below.
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}
251
+
252
+
253
+
254
# ------------------- Batch Prediction (Excel) -------------------
def predict_from_excel(file, model_name):
    """Score an uploaded Excel test set with the selected model.

    Expects the sheet to contain the raw feature columns plus 'ID', 'year',
    'Gender', 'Region' (dropped before prediction) and the ground-truth
    'Status' label. Returns a tuple of
    (metrics dict, predictions DataFrame, path to a downloadable .xlsx).
    """
    # Load Excel file
    test_df = pd.read_excel(file.name)

    # Split into features and target (KeyError if a required column is absent)
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']

    # Get pipeline
    pipeline = pipelines[model_name]

    # Predicted probability of the positive class
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Apply the model-specific threshold, falling back to the 0.5 default
    thresh = thresholds.get(model_name)
    cutoff = 0.5 if thresh is None else thresh
    y_pred = (y_proba >= cutoff).astype(int)

    # Compute metrics against the provided labels
    # (an unused classification_report call was removed here — dead code)
    metrics = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred), 4),
        "Recall": round(recall_score(y_test, y_pred), 4),
        "F1 Score": round(f1_score(y_test, y_pred), 4),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
    }

    # Add predictions to dataframe for inspection / download
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba

    # Save temporary Excel file. Close the handle before writing so the
    # descriptor is not leaked and to_excel can reopen the path on every
    # platform (writing to an open NamedTemporaryFile's name fails on
    # Windows).
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    temp_file.close()
    results_df.to_excel(temp_file.name, index=False)

    return metrics, results_df, temp_file.name
303
+
304
+
305
+
306
# ------------------- Manual Prediction -------------------
def predict_single(
    model_name,
    loan_limit, Gender, approv_in_adv, loan_type, loan_purpose, Credit_Worthiness,
    open_credit, business_or_commercial, loan_amount, rate_of_interest,
    Interest_rate_spread, Upfront_charges, term, Neg_ammortization,
    interest_only, lump_sum_payment, property_value, construction_type,
    occupancy_type, Secured_by, total_units, income, credit_type,
    Credit_Score, co_applicant_credit_type, age, submission_of_application,
    Region, Security_Type, dtir1
):
    """Predict the loan-default class and probability for one manual entry.

    Gender is accepted (to mirror the form's field order) but intentionally
    unused, matching the batch path which drops it before prediction.
    Returns {"Predicted Class": 0/1, "Probability": float}.
    """
    # --- Helper for numeric fields ---
    def safe_float(x):
        # Coerce a form value to float; blank/invalid input becomes NaN so
        # the pipeline's FeatureEngineer can impute it.
        try:
            if x is None or x == "" or (isinstance(x, float) and np.isnan(x)):
                return np.nan
            return float(x)
        except (TypeError, ValueError):  # narrowed from bare except
            return np.nan

    # --- Compute derived feature LTV ---
    la = safe_float(loan_amount)
    pv = safe_float(property_value)
    # BUG FIX: LTV is a percentage — (loan_amount / property_value) * 100,
    # matching FeatureEngineer.transform. The previous la / pv fed the model
    # a fraction on a scale ~100x smaller than the training data.
    # (safe_float never returns None, so only 0 / NaN need guarding.)
    ltv = np.nan if (pv == 0 or np.isnan(pv)) else (la / pv) * 100

    input_dict = {
        "loan_limit": [loan_limit],
        "approv_in_adv": [approv_in_adv],
        "loan_type": [loan_type],
        "loan_purpose": [loan_purpose],
        "Credit_Worthiness": [Credit_Worthiness],
        "open_credit": [open_credit],
        "business_or_commercial": [business_or_commercial],
        "loan_amount": [la],
        "rate_of_interest": [safe_float(rate_of_interest)],
        "Interest_rate_spread": [safe_float(Interest_rate_spread)],
        "Upfront_charges": [safe_float(Upfront_charges)],
        "term": [safe_float(term)],
        "Neg_ammortization": [Neg_ammortization],
        "interest_only": [interest_only],
        "lump_sum_payment": [lump_sum_payment],
        "property_value": [pv],
        "construction_type": [construction_type],
        "occupancy_type": [occupancy_type],
        "Secured_by": [Secured_by],
        # BUG FIX: total_units is categorical ('1U'..'4U'); safe_float()
        # always produced NaN here, silently discarding the user's choice.
        "total_units": [total_units],
        "income": [safe_float(income)],
        "credit_type": [credit_type],
        "Credit_Score": [safe_float(Credit_Score)],
        "co-applicant_credit_type": [co_applicant_credit_type],
        "age": [age],
        "submission_of_application": [submission_of_application],
        "LTV": [ltv],
        "Region": [Region],
        "Security_Type": [Security_Type],
        "dtir1": [safe_float(dtir1)]
    }

    X_input = pd.DataFrame(input_dict)

    pipeline = pipelines[model_name]
    y_proba = pipeline.predict_proba(X_input)[:, 1]

    # Apply the model-specific threshold, falling back to the 0.5 default
    thresh = thresholds.get(model_name)
    cutoff = 0.5 if thresh is None else thresh
    y_pred = (y_proba >= cutoff).astype(int)

    result = {
        "Predicted Class": int(y_pred[0]),
        "Probability": round(float(y_proba[0]), 4)
    }
    return result
380
+
381
+
382
# ------------------- UI Components -------------------
# Single source of truth for the model-selector options (must match the
# keys of `pipelines`).
MODEL_CHOICES = ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"]

# Batch tab: spreadsheet in, metrics plus annotated predictions out.
batch_tab = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(MODEL_CHOICES, label="Select Model"),
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions"),
    ],
    title="Batch Loan Default Prediction",
)

# Manual tab: one widget per raw feature, in the exact positional order of
# predict_single's parameters. Option strings mirror the raw dataset's
# category spellings (including "Indriect") and must not be "corrected".
manual_inputs = [
    gr.Dropdown(MODEL_CHOICES, label="Select Model"),
    gr.Dropdown(["cf", "ncf"], label="loan_limit"),
    gr.Dropdown(["Male", "Female", "Joint"], label="Gender"),
    gr.Dropdown(["pre", "nopre"], label="approv_in_adv"),
    gr.Dropdown(["type1", "type2", "type3"], label="loan_type"),
    gr.Dropdown(["p1", "p2", "p3", "p4"], label="loan_purpose"),
    gr.Dropdown(["l1", "l2"], label="Credit_Worthiness"),
    gr.Dropdown(["opc", "nopc"], label="open_credit"),
    gr.Dropdown(["b/c", "nob/c"], label="business_or_commercial"),
    gr.Number(label="loan_amount"),
    gr.Number(label="rate_of_interest"),
    gr.Number(label="Interest_rate_spread"),
    gr.Number(label="Upfront_charges"),
    gr.Number(label="term"),
    gr.Dropdown(["neg_amm", "not_neg"], label="Neg_ammortization"),
    gr.Dropdown(["int_only", "not_int"], label="interest_only"),
    gr.Dropdown(["lpsm", "not_lpsm"], label="lump_sum_payment"),
    gr.Number(label="property_value"),
    gr.Dropdown(["mh", "sb"], label="construction_type"),
    gr.Dropdown(["ir", "pr", "sr"], label="occupancy_type"),
    gr.Dropdown(["home", "land"], label="Secured_by"),
    gr.Dropdown(["1U", "2U", "3U", "4U"], label="total_units"),
    gr.Number(label="income"),
    gr.Dropdown(["CIB", "CRIF", "EQUI", "EXP"], label="credit_type"),
    gr.Number(label="Credit_Score"),
    gr.Dropdown(["CIB", "EXP"], label="co-applicant_credit_type"),
    gr.Dropdown(["<25", "25-34", "35-44", "45-54", "55-64", "65-74", ">74"], label="age"),
    gr.Dropdown(["to_inst", "not_inst"], label="submission_of_application"),
    gr.Dropdown(["central", "North", "North-East", "south"], label="Region"),
    gr.Dropdown(["direct", "Indriect"], label="Security_Type"),
    gr.Number(label="dtir1"),
]

manual_tab = gr.Interface(
    fn=predict_single,
    inputs=manual_inputs,
    outputs=gr.JSON(label="Prediction Result"),
    title="Manual Loan Default Prediction",
)

# Combine the two tabs into one app.
demo = gr.TabbedInterface([batch_tab, manual_tab], ["Batch Prediction", "Manual Prediction"])

if __name__ == "__main__":
    demo.launch(share=False)