import gradio as gr
import pandas as pd
import joblib
import tempfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


# FeatureEngineer class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Values learned during training, reused to fill missing data in any
        # test set.
        # Numeric group means (LTV is excluded because it is a logical
        # computation, performed directly on the actual test set).
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None

        # Categorical features imputed with their most frequent value
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose',
            'Credit_Worthiness', 'open_credit', 'business_or_commercial',
            'Neg_ammortization', 'interest_only', 'lump_sum_payment',
            'construction_type', 'occupancy_type', 'Secured_by',
            'total_units', 'credit_type', 'co-applicant_credit_type',
            'age', 'submission_of_application', 'Security_Type'
        ]
        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        # Learn parameters from the training data only; called once during training
        X = X.copy()

        # Numeric group means for imputation: pandas Series indexed by
        # (loan_type, term) tuples
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()

        # Global means, used as fallbacks when a combination is not found
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()

        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()

        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)
        self.loan_amount_mean = X['loan_amount'].mean()

        # Remaining numerical feature, imputed with its global mean
        self.credit_score_mean = X['Credit_Score'].mean()

        # Most frequent value of each categorical feature
        for col in self.categorical_features:
            if col in X.columns:
                self.most_frequent_cats[col] = X[col].mode(dropna=True)[0]

        return self
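    # Illustration only (hypothetical numbers, not real training output):
    # after fit(), the learned group means behave like
    #
    #   self.rate_of_interest_means[('type1', 360.0)]    # -> e.g. 3.99
    #   ('type9', 999.0) in self.rate_of_interest_means  # -> False, so
    #                                                    # transform() falls
    #                                                    # back to the global mean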
    def transform(self, X):
        # Apply the values learned in fit() to any dataset (train or test)
        X = X.copy()

        # Look up the group mean for a row; fall back to the global mean when
        # the key, e.g. ('Type1', 360) for ['loan_type', 'term'], was never
        # seen during training
        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)
                if key in group_means:
                    return group_means[key]
                return overall_mean
            return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)

        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)

        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income by age group
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                return self.overall_income_mean
            return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        # Safety net in case an age-group mean was itself NaN
        X['income'] = X['income'].fillna(self.overall_income_mean)
        X['income'] = X['income'].round(-2)

        # Impute term, property_value, dtir1, loan_amount, Credit_Score
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)

        # LTV is a logical computation: LTV = (loan_amount / property_value) * 100
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)

        # Impute categoricals with their most frequent value
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)

        return X


# Custom ordinal mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col in self.columns:
            # Unexpected or missing values map to -1
            X_[col] = X_[col].map(self.mapping).fillna(-1)
        return X_


# Feature lists
ordinal_cols = ['age']

binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type'
]

multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application'
]

numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1'
]

# Ordinal mapping for 'age'
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = {code: idx for idx, code in enumerate(condition_order)}

# Transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # maps binary categories to 0/1
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
# numeric_transformer = StandardScaler()

# Column transformer, which works much like a pipeline over column subsets
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols)  # leave numerics untouched before SMOTE
])


# Transformer to scale the last `n_numeric` columns after SMOTE. Given this
# dataset's numerical features and the preprocessing order above, the last
# 10 columns are the numerical ones.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        # Stores one mean/std pair per numerical feature during fit and
        # applies them when scaling the test set
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array after SMOTE
        self.scaler.fit(X[:, -self.n_numeric:])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[:, -self.n_numeric:] = self.scaler.transform(X_[:, -self.n_numeric:])
        return X_
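# For reference, a minimal sketch of how a pipeline such as
# best_logreg_pipeline.pkl could have been assembled, so that SMOTE resamples
# the unscaled numerics and ScaleLastColumns standardizes them afterwards.
# The sampler and model settings below are assumptions, not the actual
# training code:
#
#   from imblearn.pipeline import Pipeline as ImbPipeline
#   from imblearn.over_sampling import SMOTE
#   from sklearn.linear_model import LogisticRegression
#
#   sketch = ImbPipeline(steps=[
#       ('engineer', FeatureEngineer()),                           # impute
#       ('preprocess', preprocessor),                              # encode
#       ('smote', SMOTE(random_state=42)),                         # assumed sampler
#       ('scale', ScaleLastColumns(n_numeric=len(numeric_cols))),  # scale numerics
#       ('model', LogisticRegression(max_iter=1000)),              # assumed params
#   ])
#   sketch.fit(X_train, y_train)  # X_train / y_train are hypothetical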
# Load the trained pipelines
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline = joblib.load("best_nb_pipeline.pkl")

# Custom decision thresholds (None falls back to the default 0.5)
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}

# Map model name to pipeline
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}


def predict_from_excel(file, model_name):
    # Load the Excel file
    test_df = pd.read_excel(file.name)

    # Split into features and target
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']

    # Get the selected pipeline
    pipeline = pipelines[model_name]

    # Predict probabilities
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Apply the custom threshold if one is defined
    thresh = thresholds.get(model_name)
    if thresh is not None:
        y_pred = (y_proba >= thresh).astype(int)
    else:
        y_pred = (y_proba >= 0.5).astype(int)

    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    metrics = {
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
        "ROC AUC": round(auc, 4),
    }

    # Add predictions to the dataframe for inspection
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba

    # Save a temporary Excel file for download
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    results_df.to_excel(temp_file.name, index=False)

    return metrics, results_df, temp_file.name


# Gradio UI
demo = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(
            ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
            label="Select Model"
        )
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions")
    ],
    title="Loan Default Prediction",
    description="Upload an Excel file with loan applications to predict loan default risk."
)

if __name__ == "__main__":
    demo.launch(share=False)
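# Quick smoke test without the UI. "holdout.xlsx" is a hypothetical file; it
# must contain the feature columns plus ID, year, Gender, Region and the
# Status target, mirroring what predict_from_excel() drops:
#
#   class _UploadStub:  # minimal stand-in for Gradio's file wrapper
#       name = "holdout.xlsx"
#
#   metrics, results, path = predict_from_excel(_UploadStub(), "XGBoost")
#   print(metrics)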