File size: 6,158 Bytes
b00d985
 
 
 
73ec2e0
 
 
 
 
 
50b44af
73ec2e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76afcf0
73ec2e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0adbd62
80f2d4a
 
 
0adbd62
76afcf0
73ec2e0
 
 
0adbd62
 
 
 
b00d985
 
73ec2e0
b00d985
73ec2e0
b00d985
 
 
73ec2e0
b00d985
 
 
 
 
 
 
835afa7
73ec2e0
0adbd62
b00d985
 
835afa7
0adbd62
b00d985
73ec2e0
b00d985
 
 
 
 
 
 
 
 
73ec2e0
 
 
b00d985
0adbd62
835afa7
0adbd62
835afa7
 
b00d985
835afa7
0adbd62
b00d985
 
 
 
0adbd62
73ec2e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import gradio as gr
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# ----------------------------
# Custom Numeric Imputer
# ----------------------------
class CustomImputer(TransformerMixin):
    """Impute missing numeric loan features from statistics learned in fit().

    fit() stores group/global means computed on the training data; transform()
    reuses those fitted statistics. (The previous implementation recomputed
    group means on the transform batch, so a single-row inference batch — the
    app's actual use case — produced an all-NaN group mean and the primary
    imputation silently did nothing, falling through to the coarser
    loan-level fallback. It was also train/serve inconsistent.)
    """

    # Columns imputed from (loan_type, term) means, then loan_type-level means.
    _GROUP_COLS = ('rate_of_interest', 'Interest_rate_spread', 'Upfront_charges')

    def fit(self, X, y=None):
        """Precompute the means used by transform().

        Parameters
        ----------
        X : pandas.DataFrame with at least the columns referenced below.
        y : ignored, present for pipeline compatibility.

        Returns self.
        """
        # Keep the exact key layout of the original implementation so that
        # instances already pickled with joblib keep working unchanged.
        self.group_means = {
            'rate_of_interest': X.groupby(['loan_type', 'term'])['rate_of_interest'].mean(),
            'Interest_rate_spread': X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean(),
            'Upfront_charges': X.groupby(['loan_type', 'term'])['Upfront_charges'].mean(),
            'rate_of_interest_loan': X.groupby(['loan_type'])['rate_of_interest'].mean(),
            'Interest_rate_spread_loan': X.groupby(['loan_type'])['Interest_rate_spread'].mean(),
            'Upfront_charges_loan': X.groupby(['loan_type'])['Upfront_charges'].mean(),
            'income_by_age': X.groupby(['age'])['income'].mean(),
            'property_value_mean': X['property_value'].mean(),
            'dtir1_mean': X['dtir1'].mean(),
            'income_mean': X['income'].mean(),
        }
        return self

    def transform(self, X):
        """Return a copy of X with missing numeric values filled.

        Assumes 'term' is non-null on every row (the app filters for this
        before calling the pipeline) — TODO confirm for other callers.
        """
        X = X.copy()

        # Look up the fitted (loan_type, term) mean for every row at once.
        pair_index = pd.MultiIndex.from_arrays([X['loan_type'], X['term']])
        for col in self._GROUP_COLS:
            decimals = 3 if col == 'rate_of_interest' else 4
            group_fill = pd.Series(
                self.group_means[col].reindex(pair_index).to_numpy(), index=X.index
            )
            X[col] = X[col].fillna(group_fill)
            # Fallback for (loan_type, term) pairs unseen during fit:
            # the coarser fitted loan_type-level mean.
            X[col] = X[col].fillna(X['loan_type'].map(self.group_means[col + '_loan']))
            X[col] = X[col].round(decimals)

        # Global-mean imputations, rounded as in training.
        X['property_value'] = X['property_value'].fillna(self.group_means['property_value_mean'])
        X['property_value'] = np.round(X['property_value'], -3)  # nearest thousand

        X['dtir1'] = X['dtir1'].fillna(self.group_means['dtir1_mean']).round(0)

        # Income: fitted per-age-band mean, then fitted global mean.
        X['income'] = X['income'].fillna(X['age'].map(self.group_means['income_by_age']))
        X['income'] = X['income'].fillna(self.group_means['income_mean'])
        X['income'] = np.round(X['income'], -2)  # nearest hundred

        # LTV derived from the (already imputed) property value.
        X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)

        return X

# ----------------------------
# Custom Categorical Cleaner
# ----------------------------
class CustomCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaning step for the categorical loan columns.

    Drops configured columns, repairs the known 'Indriect' data-entry typo
    in Security_Type, and normalizes placeholder strings to NaN in the
    configured categorical columns.
    """

    def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
        # Columns to remove entirely (None means drop nothing).
        self.drop_cols = drop_cols
        # Strings that should be treated as missing values.
        if missing_placeholders is None:
            missing_placeholders = ['', 'NA', 'nan', 'NaN']
        self.missing_placeholders = missing_placeholders
        # Categorical columns on which placeholder normalization is applied.
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        # Nothing to learn; present only for pipeline compatibility.
        return self

    def transform(self, X):
        cleaned = X.copy()

        if self.drop_cols:
            cleaned = cleaned.drop(self.drop_cols, axis=1)

        # Fix a known typo in the source data before any encoding happens.
        if 'Security_Type' in cleaned.columns:
            cleaned['Security_Type'] = cleaned['Security_Type'].replace({'Indriect': 'Indirect'})

        if self.cat_cols:
            for column in (c for c in self.cat_cols if c in cleaned.columns):
                cleaned[column] = cleaned[column].replace(self.missing_placeholders, np.nan)

        return cleaned

# ----------------------------
# Load models and preprocessing pipelines
# ----------------------------
# NOTE(review): joblib.load runs at import time, so a missing .pkl file
# crashes the app on startup. The custom transformer classes above must be
# defined under the same module/name used when these artifacts were pickled,
# or unpickling will fail.
gb_loaded = joblib.load('gradient_boosting_model.pkl')
rf_loaded = joblib.load("random_forest_model.pkl")
num_pipeline = joblib.load('num_pipeline.pkl')       # numeric imputer pipeline
custom_cleaner = joblib.load('custom_cleaner.pkl')   # custom cleaning transformer
cat_preprocessing = joblib.load('cat_preprocessing.pkl')  # categorical preprocessing

# ----------------------------
# Predefined CSV file options
# ----------------------------
# Maps the dropdown label shown in the UI to the CSV path read at predict time.
csv_files = {
    "Default 1": "Default_1.csv",
    "Default 2": "Default_2.csv",
    "Non Default": "Non_default.csv"
}

# ----------------------------
# Prediction function
# ----------------------------
def predict_csv_from_dropdown(file_choice, model_choice):
    """Score every row of the selected CSV with the selected model.

    Parameters
    ----------
    file_choice : str
        A key of ``csv_files`` (set by the UI dropdown).
    model_choice : str
        "Random Forest" or "Gradient Boosting".

    Returns
    -------
    pandas.DataFrame with 'Prediction' ('Default'/'Non-default') and
    'Confidence' (probability of the predicted class) columns.
    """
    frame = pd.read_csv(csv_files[file_choice])

    # Rows without a loan term cannot be imputed downstream; drop them.
    frame = frame[frame['term'].notnull()].copy()

    # The label column must not reach the preprocessing pipeline.
    if 'target' in frame.columns:
        frame = frame.drop(columns=['target'])

    # Apply the fitted preprocessing stages in training order:
    # numeric imputation -> categorical cleaning -> categorical encoding.
    features = cat_preprocessing.transform(
        custom_cleaner.transform(num_pipeline.transform(frame))
    )

    if model_choice == "Random Forest":
        model = rf_loaded
    else:
        model = gb_loaded

    predictions = model.predict(features)
    # Confidence = probability the model assigns to its own prediction.
    confidences = model.predict_proba(features).max(axis=1)

    return pd.DataFrame({
        'Prediction': ['Non-default' if p == 0 else 'Default' for p in predictions],
        'Confidence': confidences
    })

# ----------------------------
# Gradio Interface
# ----------------------------
# Two dropdowns (CSV choice, model choice) feed predict_csv_from_dropdown;
# the output table shows one row per scored loan application.
iface = gr.Interface(
    fn=predict_csv_from_dropdown,
    inputs=[
        gr.Dropdown(choices=list(csv_files.keys()), label="Select CSV File"),
        gr.Dropdown(choices=["Random Forest", "Gradient Boosting"], label="Select Model")
    ],
    outputs=gr.Dataframe(headers=["Prediction", "Confidence"]),
    title="Loan Default Prediction",
    description="Select a CSV file and model to predict whether the applicant will Default (1) or Non-default (0) the loan."
)

if __name__ == "__main__":
    # Launch the local Gradio server only when run as a script.
    iface.launch()