import gradio as gr
import pandas as pd
import numpy as np
import joblib
import tempfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# FeatureEngineer class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Store the values learned during training so they can be used to
        # fill missing data in any test set.
        # Numeric group means (LTV is excluded: it is a logical computation
        # and is derived directly on the actual test set)
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None
        # Most frequent categorical values
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]
        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        # Learn imputation parameters from the training data only.
        X = X.copy()
        # Group means for numeric imputation: pandas Series indexed by
        # (loan_type, term) tuples
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()
        # Global means as fallbacks when a (loan_type, term) combination is unseen
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()
        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()
        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)
        # Remaining numeric features fall back to their global means
        self.loan_amount_mean = X['loan_amount'].mean()
        self.credit_score_mean = X['Credit_Score'].mean()
        # Most frequent value per categorical feature
        for col in self.categorical_features:
            if col in X.columns:
                self.most_frequent_cats[col] = X[col].mode(dropna=True)[0]
        return self

    def transform(self, X):
        # Apply the values learned in fit() to impute the incoming data.
        X = X.copy()

        # Look up the group mean for a row's key, falling back to the global mean
        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)  # e.g. ('type1', 360) for ['loan_type', 'term']
                if key in group_means:
                    return group_means[key]
                else:
                    return overall_mean
            else:
                return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)
        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)
        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income by age group, falling back to the global mean
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                else:
                    return self.overall_income_mean
            else:
                return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        X['income'] = X['income'].fillna(self.overall_income_mean)  # safety net
        X['income'] = X['income'].round(-2)
        # Impute term, property_value, dtir1, loan_amount, Credit_Score
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)
        # Derive missing LTV: LTV = (loan_amount / property_value) * 100
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)
        # Impute categorical features with their most frequent values
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)
        return X
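
# A minimal sketch of the group-mean fallback logic above (hypothetical toy
# values, illustrative only; this function is not called by the app):
def _group_mean_imputation_demo():
    toy = pd.DataFrame({
        'loan_type': ['type1', 'type1', 'type2'],
        'term': [360.0, 360.0, 180.0],
        'rate_of_interest': [4.0, np.nan, 3.5],
    })
    group_means = toy.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
    overall_mean = toy['rate_of_interest'].mean()
    key = ('type1', 360.0)  # (loan_type, term) of the row being imputed
    # Unseen keys fall back to the overall mean, exactly as in transform()
    return group_means[key] if key in group_means else overall_mean  # -> 4.0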
# Custom ordinal mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col in self.columns:
            X_[col] = X_[col].map(self.mapping).fillna(-1)  # unexpected or missing values map to -1
        return X_
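
# A minimal OrdinalMapper usage sketch (toy mapping, illustrative only and not
# called by the app; the real 'age' mapping is built below as `ordinal_map`):
def _ordinal_mapper_demo():
    df = pd.DataFrame({'age': ['<25', '35-44', 'unseen']})
    mapper = OrdinalMapper(columns=['age'], mapping={'<25': 0, '35-44': 2})
    return mapper.fit_transform(df)  # 'unseen' becomes -1 via fillna(-1)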
# Define the feature lists
ordinal_cols = ['age']
binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type'
]
multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application'
]
numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1'
]
# Ordinal mapping for 'age'
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = {code: idx for idx, code in enumerate(condition_order)}
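# Resulting mapping: {'<25': 0, '25-34': 1, '35-44': 2, '45-54': 3,
#                     '55-64': 4, '65-74': 5, '>74': 6}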
# Define the transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int) # maps binary categories to 0/1
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
# numeric_transformer = StandardScaler()
# Build the column transformer: each feature group gets its own encoder
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols)  # leave numeric untouched until after SMOTE
])
# Transformer to scale the last `n_numeric` columns after SMOTE.
# The preprocessor above places the numeric features last (10 for this
# dataset), so scaling the trailing columns scales exactly the numeric block.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        self.scaler = StandardScaler()  # stores one mean/std pair per numeric feature for reuse on the test set

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array after SMOTE
        self.scaler.fit(X[:, -self.n_numeric:])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[:, -self.n_numeric:] = self.scaler.transform(X_[:, -self.n_numeric:])
        return X_
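
# A small ScaleLastColumns check (toy array, illustrative only; not called by
# the app):
def _scale_last_columns_demo():
    arr = np.array([[1.0, 10.0, 100.0],
                    [2.0, 20.0, 300.0]])
    # Scales only the last two columns; the first passes through unchanged
    return ScaleLastColumns(n_numeric=2).fit_transform(arr)

# For context, a sketch of how these pieces presumably compose in the saved
# training pipelines (an assumption based on the SMOTE comments above; the
# imblearn imports and `estimator` below are hypothetical, and this app only
# loads the already-fitted pickles):
#
#   from imblearn.pipeline import Pipeline as ImbPipeline
#   from imblearn.over_sampling import SMOTE
#   train_pipe = ImbPipeline([
#       ('engineer', FeatureEngineer()),
#       ('preprocess', preprocessor),
#       ('smote', SMOTE(random_state=42)),
#       ('scale', ScaleLastColumns(n_numeric=len(numeric_cols))),
#       ('model', estimator),
#   ])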
# Load the trained pipelines (the custom classes above must be defined in this
# module so joblib can unpickle them)
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline = joblib.load("best_nb_pipeline.pkl")
# Custom decision thresholds; None falls back to the 0.5 default
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}
# Map model name to pipeline
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}
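
# Thresholding sketch (illustrative toy probabilities; not called by the app):
def _threshold_demo():
    proba = np.array([0.10, 0.30, 0.60])
    thresh = thresholds["Logistic Regression"]  # 0.2680
    return (proba >= (thresh if thresh is not None else 0.5)).astype(int)  # -> [0, 1, 1]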
def predict_from_excel(file, model_name):
    # Load the Excel file
    test_df = pd.read_excel(file.name)
    # Split into features and target
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']
    # Get the selected pipeline
    pipeline = pipelines[model_name]
    # Predict probabilities for the positive class
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    # Apply the custom threshold if one is defined, otherwise default to 0.5
    thresh = thresholds.get(model_name)
    if thresh is not None:
        y_pred = (y_proba >= thresh).astype(int)
    else:
        y_pred = (y_proba >= 0.5).astype(int)
    # Compute evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    metrics = {
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
        "ROC AUC": round(auc, 4),
    }
    # Attach predictions to the dataframe for inspection
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba
    # Save a temporary Excel file for download
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    results_df.to_excel(temp_file.name, index=False)
    return metrics, results_df, temp_file.name
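
# Local smoke test (commented out; "test_set.xlsx" is a hypothetical path and
# the pickled pipelines must sit alongside this script):
#
#   with open("test_set.xlsx", "rb") as f:
#       metrics, results, download_path = predict_from_excel(f, "XGBoost")
#   print(metrics)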
# Gradio UI
demo = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(
            ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
            label="Select Model"
        )
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions")
    ],
    title="Loan Default Prediction",
    description="Upload an Excel test file of loan applications (including the true Status column) to predict default risk and evaluate the selected model."
)
if __name__ == "__main__":
    demo.launch(share=False)