# ITI105_Project / app.py
# Author: lkchew — last change "Update app.py" (commit 73ec2e0, verified)
import gradio as gr
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# ----------------------------
# Custom Numeric Imputer
# ----------------------------
class CustomImputer(BaseEstimator, TransformerMixin):
    """Group-aware imputer for the numeric loan columns.

    ``fit`` memorises per-group means from the training frame and
    ``transform`` fills missing values from those *fitted* statistics,
    so the imputer behaves the same for batch scoring and for
    single-row inference.

    Fixes vs. the previous version:
    * ``transform`` used to recompute group means on the transform batch
      (``X.groupby(...).transform(...)``), which leaks statistics and
      leaves NaNs untouched when a group in the batch is all-NaN
      (e.g. a single-row request).
    * ``income_by_age`` was computed in ``fit`` but never used; it is now
      the primary fill for ``income``.
    * Inherits ``BaseEstimator`` for consistency with ``CustomCleaner``.
    """

    def fit(self, X, y=None):
        """Learn the group / global means used later by ``transform``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain loan_type, term, age and the numeric columns
            referenced below.
        y : ignored
            Present for sklearn API compatibility.
        """
        self.group_means = {
            # Primary fill: mean per (loan_type, term) pair.
            'rate_of_interest': X.groupby(['loan_type', 'term'])['rate_of_interest'].mean(),
            'Interest_rate_spread': X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean(),
            'Upfront_charges': X.groupby(['loan_type', 'term'])['Upfront_charges'].mean(),
            # Fallback fill: mean per loan_type only (for unseen pairs).
            'rate_of_interest_loan': X.groupby(['loan_type'])['rate_of_interest'].mean(),
            'Interest_rate_spread_loan': X.groupby(['loan_type'])['Interest_rate_spread'].mean(),
            'Upfront_charges_loan': X.groupby(['loan_type'])['Upfront_charges'].mean(),
            'income_by_age': X.groupby(['age'])['income'].mean(),
            'property_value_mean': X['property_value'].mean(),
            'dtir1_mean': X['dtir1'].mean(),
            'income_mean': X['income'].mean(),
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with numeric NaNs imputed from fitted means."""
        X = X.copy()
        # Interest-rate style columns: fitted (loan_type, term) mean first,
        # then the fitted loan_type-only mean for unseen pairs.
        for col in ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges']:
            decimals = 3 if col == 'rate_of_interest' else 4
            pair_idx = pd.MultiIndex.from_arrays([X['loan_type'], X['term']])
            pair_fill = pd.Series(
                self.group_means[col].reindex(pair_idx).to_numpy(),
                index=X.index,
            )
            loan_fill = X['loan_type'].map(self.group_means[col + '_loan'])
            X[col] = X[col].fillna(pair_fill).fillna(loan_fill).round(decimals)
        # property_value / dtir1: fitted global training means.
        X['property_value'] = X['property_value'].fillna(self.group_means['property_value_mean'])
        X['property_value'] = np.round(X['property_value'], -3)  # nearest thousand
        X['dtir1'] = X['dtir1'].fillna(self.group_means['dtir1_mean']).round(0)
        # income: fitted per-age-band mean, then the fitted global mean.
        X['income'] = X['income'].fillna(X['age'].map(self.group_means['income_by_age']))
        X['income'] = X['income'].fillna(self.group_means['income_mean'])
        X['income'] = np.round(X['income'], -2)  # nearest hundred
        # LTV: derive from loan_amount / property_value where missing.
        X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)
        return X
# ----------------------------
# Custom Categorical Cleaner
# ----------------------------
class CustomCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaning step for the categorical side of the data.

    Drops configured columns, repairs the known 'Indriect' typo in
    ``Security_Type``, and converts placeholder strings (``''``, ``'NA'``,
    ``'nan'``, ``'NaN'`` by default) in the categorical columns to NaN.
    """

    def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
        self.drop_cols = drop_cols
        if missing_placeholders is None:
            missing_placeholders = ['', 'NA', 'nan', 'NaN']
        self.missing_placeholders = missing_placeholders
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        """No statistics to learn; returns self for sklearn compatibility."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not mutated."""
        cleaned = X.copy()
        if self.drop_cols:
            cleaned = cleaned.drop(self.drop_cols, axis=1)
        # Known data-entry typo in the raw dataset.
        if 'Security_Type' in cleaned.columns:
            cleaned['Security_Type'] = cleaned['Security_Type'].replace({'Indriect': 'Indirect'})
        # Normalise placeholder strings to real NaN in the categorical columns.
        for col in (self.cat_cols or []):
            if col in cleaned.columns:
                cleaned[col] = cleaned[col].replace(self.missing_placeholders, np.nan)
        return cleaned
# ----------------------------
# Load models and preprocessing pipelines
# ----------------------------
# NOTE(review): unpickling these artifacts requires CustomImputer /
# CustomCleaner to be defined in this module and to match the class
# versions used when the .pkl files were written — keep them in sync.
gb_loaded = joblib.load('gradient_boosting_model.pkl')
rf_loaded = joblib.load("random_forest_model.pkl")
num_pipeline = joblib.load('num_pipeline.pkl')  # numeric imputer pipeline
custom_cleaner = joblib.load('custom_cleaner.pkl')  # custom cleaning transformer
cat_preprocessing = joblib.load('cat_preprocessing.pkl')  # categorical preprocessing
# ----------------------------
# Predefined CSV file options
# ----------------------------
# UI label -> bundled sample CSV path; keys populate the file dropdown below.
csv_files = {
    "Default 1": "Default_1.csv",
    "Default 2": "Default_2.csv",
    "Non Default": "Non_default.csv"
}
# ----------------------------
# Prediction function
# ----------------------------
def predict_csv_from_dropdown(file_choice, model_choice):
    """Score one of the bundled sample CSVs with the selected model.

    Parameters
    ----------
    file_choice : str
        Key into ``csv_files`` (chosen via the first dropdown).
    model_choice : str
        "Random Forest" selects ``rf_loaded``; anything else falls back to
        the gradient-boosting model.

    Returns
    -------
    pandas.DataFrame
        One row per scored input row, with 'Prediction' ('Default' /
        'Non-default') and 'Confidence' (max class probability).
    """
    frame = pd.read_csv(csv_files[file_choice])
    # Rows without a loan term cannot be grouped by (loan_type, term)
    # during imputation, so they are excluded from scoring.
    frame = frame.loc[frame['term'].notnull()].copy()
    # The sample files may still carry the label column; strip it.
    if 'target' in frame.columns:
        frame = frame.drop(columns=['target'])
    # Numeric imputation -> categorical cleaning -> categorical encoding.
    features = cat_preprocessing.transform(
        custom_cleaner.transform(num_pipeline.transform(frame))
    )
    model = rf_loaded if model_choice == "Random Forest" else gb_loaded
    predictions = model.predict(features)
    confidences = model.predict_proba(features).max(axis=1)
    labels = ['Non-default' if pred == 0 else 'Default' for pred in predictions]
    return pd.DataFrame({
        'Prediction': labels,
        'Confidence': confidences
    })
# ----------------------------
# Gradio Interface
# ----------------------------
# Two dropdowns (sample CSV + model) feed predict_csv_from_dropdown;
# the returned DataFrame is rendered as a two-column table.
iface = gr.Interface(
    fn=predict_csv_from_dropdown,
    inputs=[
        gr.Dropdown(choices=list(csv_files.keys()), label="Select CSV File"),
        gr.Dropdown(choices=["Random Forest", "Gradient Boosting"], label="Select Model")
    ],
    outputs=gr.Dataframe(headers=["Prediction", "Confidence"]),
    title="Loan Default Prediction",
    description="Select a CSV file and model to predict whether the applicant will Default (1) or Non-default (0) the loan."
)

# Launch the app only when run as a script (not when imported, e.g. by the
# Hugging Face Spaces runtime, which calls launch itself).
if __name__ == "__main__":
    iface.launch()