EdwinLH commited on
Commit
db90404
·
verified ·
1 Parent(s): e7b0f29

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +322 -0
app.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import tempfile
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
7
+ from sklearn.base import BaseEstimator, TransformerMixin
8
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
9
+ from sklearn.compose import ColumnTransformer
10
+
11
+
12
# FeatureEngineer Class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Impute missing values using statistics learned from the training set.

    ``fit`` memorises group/global means for the numeric features and the most
    frequent value for each categorical feature; ``transform`` applies those
    statistics to any dataset (train or test).  ``LTV`` is never mean-imputed:
    it is recomputed as loan_amount / property_value * 100.
    """

    def __init__(self):
        # Numeric group means (LTV excluded: it is a logical computation and
        # is performed directly on the set being transformed).
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None

        # Categorical columns imputed with their most frequent training value.
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]
        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        """Learn imputation statistics from the training data only."""
        X = X.copy()

        # Per-(loan_type, term) means: pandas Series indexed by key tuples.
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()

        # Global fallbacks for combinations unseen during training.
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()

        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()

        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)

        # Remaining numeric features use the plain global mean.
        self.loan_amount_mean = X['loan_amount'].mean()
        self.credit_score_mean = X['Credit_Score'].mean()

        # Most frequent value per categorical column.  Guard against an
        # all-NaN column, whose mode is an empty Series (indexing [0] on it
        # would raise KeyError in the original code).
        for col in self.categorical_features:
            if col in X.columns:
                modes = X[col].mode(dropna=True)
                if not modes.empty:
                    self.most_frequent_cats[col] = modes.iloc[0]

        return self

    def transform(self, X):
        """Apply the learned statistics and return an imputed copy of ``X``."""
        X = X.copy()

        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            # Look up the group key, e.g. ('Type1', 360) for
            # ['loan_type', 'term']; fall back to the global mean when the
            # combination was not seen at fit time.
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)
                if key in group_means:
                    return group_means[key]
                return overall_mean
            return row[feature_name]

        # Group-mean imputation for the three rate/charge features, each
        # rounded to the precision used in the original data.
        grouped_specs = [
            ('rate_of_interest', self.rate_of_interest_means,
             self.overall_rate_of_interest_mean, 3),
            ('Interest_rate_spread', self.interest_rate_spread_means,
             self.overall_interest_rate_spread_mean, 4),
            ('Upfront_charges', self.upfront_charges_means,
             self.overall_upfront_charges_mean, 2),
        ]
        for feature, means, overall, digits in grouped_specs:
            X[feature] = X.apply(
                lambda row: impute_feature(row, feature, means, overall,
                                           ['loan_type', 'term']),
                axis=1
            ).round(digits)

        # Income: mean income of the applicant's age bracket, else global mean.
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                return self.overall_income_mean
            return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        # Safety net in case a group mean itself was NaN.
        X['income'] = X['income'].fillna(self.overall_income_mean)
        X['income'] = X['income'].round(-2)

        # Global-mean imputation for the remaining numeric features.
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)

        # LTV is deterministic: LTV = (loan_amount / property_value) * 100.
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)

        # Categorical imputation with the most frequent training value.
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)

        return X
150
+
151
+
152
+
153
# Custom ordinal mapper: applies an explicit category -> integer mapping.
class OrdinalMapper(BaseEstimator, TransformerMixin):
    """Map the values of selected columns through a fixed ordinal mapping.

    Categories absent from the mapping (including missing values) are
    encoded as the sentinel -1.
    """

    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        # Stateless: the mapping is supplied up front, nothing to learn.
        return self

    def transform(self, X):
        out = X.copy()
        for column in self.columns:
            # Unknown or missing categories fall back to -1.
            out[column] = out[column].map(self.mapping).fillna(-1)
        return out
167
+
168
# Feature groups that drive the preprocessing ColumnTransformer.
ordinal_cols = ['age']

# Two-category features, encoded as a single 0/1 column each.
binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type'
]

# Multi-category features, one-hot encoded with the first level dropped.
multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application'
]

numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1'
]

# 'age' brackets in ascending order, mapped to 0..6.
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = dict(zip(condition_order, range(len(condition_order))))

# One transformer per feature group.
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # binary categories -> 0/1
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Column transformer assembling all groups; numeric columns pass through
# untouched here so SMOTE can run on unscaled values (scaling happens later).
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols),
])
207
+
208
+
209
+
210
# Post-SMOTE scaler.  In this dataset the numeric features are placed last
# by the preprocessor, so only the trailing `n_numeric` columns need scaling.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    """Standard-scale only the last ``n_numeric`` columns of a NumPy array."""

    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        # One scaler holds the per-feature mean/std learned in fit() and is
        # reused unchanged when transforming the test set.
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array (the output of SMOTE).
        self.scaler.fit(X[:, -self.n_numeric:])
        return self

    def transform(self, X):
        scaled = X.copy()
        tail = slice(-self.n_numeric, None)
        scaled[:, tail] = self.scaler.transform(scaled[:, tail])
        return scaled
227
+
228
+
229
+
230
# Load the fitted model pipelines (pickled with joblib at training time).
log_best_pipeline, xgb_best_pipeline, rf_best_pipeline, nb_best_pipeline = (
    joblib.load(path)
    for path in (
        "best_logreg_pipeline.pkl",
        "best_xgb_pipeline.pkl",
        "best_rf_pipeline.pkl",
        "best_nb_pipeline.pkl",
    )
)

# Decision thresholds tuned per model; None means the 0.5 default applies.
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}

# Lookup from the UI model name to its fitted pipeline.
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}
251
+
252
+
253
def predict_from_excel(file, model_name):
    """Score an uploaded Excel file with the selected model pipeline.

    Parameters
    ----------
    file : uploaded-file wrapper exposing a ``.name`` path (Gradio File).
    model_name : str
        Key into the module-level ``pipelines`` / ``thresholds`` dicts.

    Returns
    -------
    tuple
        ``(metrics, results_df, path)`` — a dict of rounded evaluation
        scores, the input DataFrame augmented with predictions and
        probabilities, and the path to a downloadable .xlsx copy.
    """
    # Load the uploaded Excel file.
    test_df = pd.read_excel(file.name)

    # Split into features and the 'Status' target; drop identifier and
    # demographic columns the pipelines were not trained on.
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']

    pipeline = pipelines[model_name]

    # Predicted probability of the positive (default) class.
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Use the model's tuned threshold when one is defined, else 0.5.
    thresh = thresholds.get(model_name)
    if thresh is None:
        thresh = 0.5
    y_pred = (y_proba >= thresh).astype(int)

    # Evaluation metrics (ROC AUC is threshold-free, so it uses probabilities).
    metrics = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred), 4),
        "Recall": round(recall_score(y_test, y_pred), 4),
        "F1 Score": round(f1_score(y_test, y_pred), 4),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
    }

    # Attach predictions to a copy of the input for inspection/download.
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba

    # Write the downloadable copy.  Close the handle before writing so the
    # file is not locked on platforms (e.g. Windows) that forbid reopening
    # an open NamedTemporaryFile.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        output_path = tmp.name
    results_df.to_excel(output_path, index=False)

    return metrics, results_df, output_path
301
+
302
# Gradio UI
file_input = gr.File(label="Upload Excel")
model_input = gr.Dropdown(
    ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
    label="Select Model"
)

# Three outputs: metrics JSON, an inspectable table, and a download link.
demo = gr.Interface(
    fn=predict_from_excel,
    inputs=[file_input, model_input],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions")
    ],
    title="Loan Default Prediction",
    description="Upload an Excel file with loan applications to predict loan default risk."
)

if __name__ == "__main__":
    demo.launch(share=False)