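# Loan Default Prediction Gradio app: loads four pre-trained pipelines
# (Logistic Regression, XGBoost, Random Forest, Naive Bayes), evaluates them on
# an uploaded Excel test set, and returns metrics plus a downloadable
# predictions file. The custom transformer classes defined below must be
# importable when the pickled pipelines are loaded, since joblib stores only
# references to them.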
import gradio as gr
import pandas as pd
import numpy as np
import joblib
import tempfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


# FeatureEngineer Class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):  # Store values learned during fit() so they can impute missing data in any test set

        # Numeric group means (LTV is excluded: it is derived from loan_amount and property_value, so it is recomputed rather than imputed)
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None


        # Most frequent categorical values
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]

        self.most_frequent_cats = {}

    def fit(self, X, y=None):  # Learn imputation parameters from the training data only
        X = X.copy()

        # Calculate the numeric means for imputation
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean() # pandas series indexed by (loan_type and term) tuples
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()

        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean() # global fallback means, used when a (loan_type, term) combination is unseen
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()

        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()

        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)

        self.loan_amount_mean = X['loan_amount'].mean() # Remaining numerical features with global mean
        self.credit_score_mean = X['Credit_Score'].mean()

        # Impute the categorical with the most frequent
        for col in self.categorical_features:
            if col in X.columns:
                self.most_frequent_cats[col] = X[col].mode(dropna=True)[0]

        return self

    def transform(self, X):  # Apply the learned parameters; called on both training and test data
        X = X.copy()

        # Search for an available combination group for numeric imputations
        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys) # look up the group_keys such as ('Type1', 360) for ['loan_type', 'term']
                if key in group_means:
                    return group_means[key]
                else:
                    return overall_mean
            else:
                return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)

        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)

        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income by age
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                else:
                    return self.overall_income_mean
            else:
                return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        X['income'] = X['income'].fillna(self.overall_income_mean)  # safety net; impute_income already falls back to the overall mean
        X['income'] = X['income'].round(-2)

        # Impute term, property_value, dtir1, loan_amount, Credit_Score
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)


        # LTV calculation: LTV = (loan_amount / property_value) * 100
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)

        # Impute categorical with the most frequent
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)

        return X



# Custom Ordinal Mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        for col in self.columns:
            X_[col] = X_[col].map(self.mapping).fillna(-1)  # Handle unexpected or missing values
        return X_

# Define the feature lists
ordinal_cols = ['age']

binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type'
]

multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application'
]

numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1'
]

# Ordinal mapping for 'age'
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = {code: idx for idx, code in enumerate(condition_order)}
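# e.g. ordinal_map == {'<25': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55-64': 4, '65-74': 5, '>74': 6}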

# Define the transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # maps each binary category to 0/1; raises on categories unseen during fit unless handle_unknown='use_encoded_value' is configured
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')  # note: drop='first' together with handle_unknown='ignore' requires scikit-learn >= 1.0
# numeric_transformer = StandardScaler()  # intentionally unused: numeric features are scaled after SMOTE (see ScaleLastColumns)

# Build the column transformer: each encoder runs on its own column group and the outputs are concatenated side by side
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols)  # leave numeric untouched before passing to SMOTE

])
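# The transformed output concatenates the blocks in transformer order:
# [age ordinal | binary 0/1 columns | one-hot dummies | 10 raw numeric columns],
# so the numeric features always occupy the last 10 columns of the array.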



# Transformer to scale the last `n_numeric` columns after SMOTE.
# With this dataset and the preprocessing order above, those are the 10 numeric features.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        self.scaler = StandardScaler()  # learns one mean/std pair per numeric feature in fit() and reuses them at transform time

    def fit(self, X, y=None):
        # Assume X is NumPy array after SMOTE
        self.scaler.fit(X[:, -self.n_numeric:])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[:, -self.n_numeric:] = self.scaler.transform(X_[:, -self.n_numeric:])
        return X_
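
# A minimal usage sketch (hedged: assumes the saved pipelines were built with
# imbalanced-learn, so that SMOTE resamples only during fit):
#
#   from imblearn.pipeline import Pipeline
#   from imblearn.over_sampling import SMOTE
#   pipe = Pipeline([
#       ('prep', preprocessor),
#       ('smote', SMOTE(random_state=42)),                         # resamples training data only
#       ('scale', ScaleLastColumns(n_numeric=len(numeric_cols))),
#       ('clf', some_classifier),                                  # hypothetical estimator
#   ])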



# Load the trained pipelines
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline  = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline  = joblib.load("best_nb_pipeline.pkl")

# Custom decision thresholds (None falls back to the default 0.5 cutoff)
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}
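# e.g. with the 0.2680 Logistic Regression threshold, a predicted probability
# of 0.30 is classified as default (1); the default 0.5 cutoff would yield 0.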

# Map model name to pipeline
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}


def predict_from_excel(file, model_name):
    # Load the uploaded Excel file. Depending on the Gradio version, gr.File
    # passes either a filepath string or a tempfile-like object with a .name
    file_path = file if isinstance(file, str) else file.name
    test_df = pd.read_excel(file_path)

    # Split into features and target, dropping the ID and the columns the pipelines were not trained on
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']

    # Get pipeline
    pipeline = pipelines[model_name]
    
    # Predict probabilities
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Apply custom threshold if defined
    thresh = thresholds.get(model_name)
    if thresh is not None:
        y_pred = (y_proba >= thresh).astype(int)
    else:
        y_pred = (y_proba >= 0.5).astype(int)

    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Return metrics + results table
    metrics = {
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1 Score": round(f1, 4),
        "ROC AUC": round(auc, 4),
    }

    # Add predictions to dataframe for inspection
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba

    # Save a temporary Excel file for download; close the handle first so the
    # path can be re-opened for writing on all platforms (notably Windows)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    temp_file.close()
    results_df.to_excel(temp_file.name, index=False)
 
    return metrics, results_df, temp_file.name

# Gradio UI
demo = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(
            ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
            label="Select Model"
        )
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions")
    ],
    title="Loan Default Prediction",
    description="Upload an Excel file with loan applications to predict loan default risk."
)

if __name__ == "__main__":
    demo.launch(share=False)