EdwinLH committed on
Commit
299c10a
·
verified ·
1 Parent(s): 3569ec3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +451 -0
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import tempfile
6
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
7
+ from sklearn.base import BaseEstimator, TransformerMixin
8
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
9
+ from sklearn.compose import ColumnTransformer
10
+
11
+
12
# FeatureEngineer Class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Learn imputation statistics on the training set and replay them at
    prediction time.

    Numeric gaps are filled with (loan_type, term) group means where
    available (global means otherwise), income is filled by age group,
    categoricals by their most frequent value, and missing LTV is derived
    as (loan_amount / property_value) * 100.
    """

    def __init__(self):
        # Save the learned values during training to be used to populate
        # the missing data in any test set.

        # Numeric group means (LTV excluded as it is a logical computation
        # and will be performed on the actual test set)
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None

        # Most frequent categorical values
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]

        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        """Learn imputation parameters from the training data only."""
        X = X.copy()

        # Group means keyed by (loan_type, term) tuples (pandas Series
        # indexed by those tuples).
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()

        # Global means used when a (loan_type, term) combination is unseen.
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()

        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()

        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)

        # Remaining numerical features use the plain global mean.
        self.loan_amount_mean = X['loan_amount'].mean()
        self.credit_score_mean = X['Credit_Score'].mean()

        # Most frequent category per categorical column.
        for col in self.categorical_features:
            if col in X.columns:
                modes = X[col].mode(dropna=True)
                # Guard against an all-NaN column: Series.mode is then empty
                # and indexing [0] would raise (robustness fix).
                if not modes.empty:
                    self.most_frequent_cats[col] = modes[0]

        return self

    def transform(self, X):
        """Apply the statistics learned in fit() to impute missing values."""
        X = X.copy()

        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            # Look up the group key, e.g. ('type1', 360) for
            # ['loan_type', 'term']; fall back to the overall mean when the
            # combination was not seen during fit.
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)
                if key in group_means:
                    return group_means[key]
                else:
                    return overall_mean
            else:
                return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)

        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)

        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income by age group, overall mean as fallback.
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                else:
                    return self.overall_income_mean
            else:
                return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        X['income'] = X['income'].fillna(self.overall_income_mean)
        X['income'] = X['income'].round(-2)

        # Impute term, property_value, dtir1, loan_amount, Credit_Score
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)

        # LTV calculation: LTV = (loan_amount / property_value) * 100
        # (runs after loan_amount/property_value are imputed above).
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)

        # Impute categoricals with the most frequent training value.
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)

        # NOTE: an unused `numeric_cols = X.select_dtypes(...)` local was
        # removed here (dead code).
        return X
150
+
151
+
152
+
153
# Custom Ordinal Mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    """Map ordered category labels to integer codes via a fixed dictionary.

    Labels absent from the mapping (including NaN) are encoded as -1.
    """

    def __init__(self, columns=None, mapping=None):
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        # Stateless: the mapping is supplied up front, nothing to learn.
        return self

    def transform(self, X):
        frame = X.copy()
        for target in self.columns:
            # Unknown or missing labels become NaN after map(); the fillna
            # turns them into the -1 sentinel.
            codes = frame[target].map(self.mapping)
            frame[target] = codes.fillna(-1)
        return frame
167
+
168
# Define the feature lists
ordinal_cols = ['age']

binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type',
]

multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application',
]

numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1',
]

# Ordinal mapping for 'age': position in this ordered list is the code.
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = dict(zip(condition_order, range(len(condition_order))))

# Define the transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # binary categories -> 0/1
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Column transformer (pipeline-style fan-out over column groups). Numeric
# features pass through unscaled here: scaling is deliberately deferred
# until after SMOTE (see ScaleLastColumns below).
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols),
])
207
+
208
+
209
+
210
# Scale only the trailing `n_numeric` columns. After the preprocessor, the
# numeric features occupy the last n_numeric columns of the matrix (10 for
# this dataset), and this step is applied post-SMOTE.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        # One StandardScaler holds the per-column mean/std learned in fit()
        # so the identical scaling is replayed on the test set.
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array at this point (SMOTE output).
        tail = X[:, -self.n_numeric:]
        self.scaler.fit(tail)
        return self

    def transform(self, X):
        scaled = X.copy()
        tail = scaled[:, -self.n_numeric:]
        scaled[:, -self.n_numeric:] = self.scaler.transform(tail)
        return scaled
227
+
228
+
229
+
230
# Load trained pipeline
# Each .pkl is a fitted preprocessing+model pipeline serialized with joblib.
# NOTE(review): unpickling presumably requires the custom transformer
# classes defined above (FeatureEngineer, OrdinalMapper, ScaleLastColumns)
# to be resolvable in this module — verify against how the pickles were saved.
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline = joblib.load("best_nb_pipeline.pkl")

# Custom threshold
# Per-model decision cutoff; None means the callers fall back to 0.5.
thresholds = {
    "Logistic Regression": 0.2680,
    "Random Forest": 0.4850,
    "XGBoost": None,
    "Naive Bayes": None
}

# Map model name to pipeline
# Keys must match the dropdown labels used in the UI below.
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}
251
+
252
+
253
+
254
# ------------------- Batch Prediction (Excel) -------------------
def predict_from_excel(file, model_name):
    """Score an uploaded Excel test set with the selected model.

    Expects the sheet to contain the raw feature columns plus 'ID', 'year',
    'Gender', 'Region' (dropped before prediction) and the ground-truth
    'Status' label. Returns a tuple of
    (metrics dict, predictions DataFrame, path to a downloadable .xlsx).
    """
    # Load Excel file
    test_df = pd.read_excel(file.name)

    # Split into features and target (KeyError if a required column is absent)
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']

    # Get pipeline
    pipeline = pipelines[model_name]

    # Predicted probability of the positive class
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Apply the model-specific threshold, falling back to the 0.5 default
    thresh = thresholds.get(model_name)
    cutoff = 0.5 if thresh is None else thresh
    y_pred = (y_proba >= cutoff).astype(int)

    # Compute metrics against the provided labels
    # (an unused classification_report call was removed here — dead code)
    metrics = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred), 4),
        "Recall": round(recall_score(y_test, y_pred), 4),
        "F1 Score": round(f1_score(y_test, y_pred), 4),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
    }

    # Add predictions to dataframe for inspection / download
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba

    # Save temporary Excel file. Close the handle before writing so the
    # descriptor is not leaked and to_excel can reopen the path on every
    # platform (writing to an open NamedTemporaryFile's name fails on
    # Windows).
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    temp_file.close()
    results_df.to_excel(temp_file.name, index=False)

    return metrics, results_df, temp_file.name
303
+
304
+
305
+
306
# ------------------- Manual Prediction -------------------
def predict_single(
    model_name,
    loan_limit, Gender, approv_in_adv, loan_type, loan_purpose, Credit_Worthiness,
    open_credit, business_or_commercial, loan_amount, rate_of_interest,
    Interest_rate_spread, Upfront_charges, term, Neg_ammortization,
    interest_only, lump_sum_payment, property_value, construction_type,
    occupancy_type, Secured_by, total_units, income, credit_type,
    Credit_Score, co_applicant_credit_type, age, submission_of_application,
    Region, Security_Type, dtir1
):
    """Predict the loan-default class and probability for one manual entry.

    Gender is accepted (to mirror the form's field order) but intentionally
    unused, matching the batch path which drops it before prediction.
    Returns {"Predicted Class": 0/1, "Probability": float}.
    """
    # --- Helper for numeric fields ---
    def safe_float(x):
        # Coerce a form value to float; blank/invalid input becomes NaN so
        # the pipeline's FeatureEngineer can impute it.
        try:
            if x is None or x == "" or (isinstance(x, float) and np.isnan(x)):
                return np.nan
            return float(x)
        except (TypeError, ValueError):  # narrowed from bare except
            return np.nan

    # --- Compute derived feature LTV ---
    la = safe_float(loan_amount)
    pv = safe_float(property_value)
    # BUG FIX: LTV is a percentage — (loan_amount / property_value) * 100,
    # matching FeatureEngineer.transform. The previous la / pv fed the model
    # a fraction on a scale ~100x smaller than the training data.
    # (safe_float never returns None, so only 0 / NaN need guarding.)
    ltv = np.nan if (pv == 0 or np.isnan(pv)) else (la / pv) * 100

    input_dict = {
        "loan_limit": [loan_limit],
        "approv_in_adv": [approv_in_adv],
        "loan_type": [loan_type],
        "loan_purpose": [loan_purpose],
        "Credit_Worthiness": [Credit_Worthiness],
        "open_credit": [open_credit],
        "business_or_commercial": [business_or_commercial],
        "loan_amount": [la],
        "rate_of_interest": [safe_float(rate_of_interest)],
        "Interest_rate_spread": [safe_float(Interest_rate_spread)],
        "Upfront_charges": [safe_float(Upfront_charges)],
        "term": [safe_float(term)],
        "Neg_ammortization": [Neg_ammortization],
        "interest_only": [interest_only],
        "lump_sum_payment": [lump_sum_payment],
        "property_value": [pv],
        "construction_type": [construction_type],
        "occupancy_type": [occupancy_type],
        "Secured_by": [Secured_by],
        # BUG FIX: total_units is categorical ('1U'..'4U'); safe_float()
        # always produced NaN here, silently discarding the user's choice.
        "total_units": [total_units],
        "income": [safe_float(income)],
        "credit_type": [credit_type],
        "Credit_Score": [safe_float(Credit_Score)],
        "co-applicant_credit_type": [co_applicant_credit_type],
        "age": [age],
        "submission_of_application": [submission_of_application],
        "LTV": [ltv],
        "Region": [Region],
        "Security_Type": [Security_Type],
        "dtir1": [safe_float(dtir1)]
    }

    X_input = pd.DataFrame(input_dict)

    pipeline = pipelines[model_name]
    y_proba = pipeline.predict_proba(X_input)[:, 1]

    # Apply the model-specific threshold, falling back to the 0.5 default
    thresh = thresholds.get(model_name)
    cutoff = 0.5 if thresh is None else thresh
    y_pred = (y_proba >= cutoff).astype(int)

    result = {
        "Predicted Class": int(y_pred[0]),
        "Probability": round(float(y_proba[0]), 4)
    }
    return result
380
+
381
+
382
# ------------------- UI Components -------------------
# Single source of truth for the model-selector options (must match the
# keys of `pipelines`).
MODEL_CHOICES = ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"]

# Batch tab: spreadsheet in, metrics plus annotated predictions out.
batch_tab = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(MODEL_CHOICES, label="Select Model"),
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions"),
    ],
    title="Batch Loan Default Prediction",
)

# Manual tab: one widget per raw feature, in the exact positional order of
# predict_single's parameters. Option strings mirror the raw dataset's
# category spellings (including "Indriect") and must not be "corrected".
manual_inputs = [
    gr.Dropdown(MODEL_CHOICES, label="Select Model"),
    gr.Dropdown(["cf", "ncf"], label="loan_limit"),
    gr.Dropdown(["Male", "Female", "Joint"], label="Gender"),
    gr.Dropdown(["pre", "nopre"], label="approv_in_adv"),
    gr.Dropdown(["type1", "type2", "type3"], label="loan_type"),
    gr.Dropdown(["p1", "p2", "p3", "p4"], label="loan_purpose"),
    gr.Dropdown(["l1", "l2"], label="Credit_Worthiness"),
    gr.Dropdown(["opc", "nopc"], label="open_credit"),
    gr.Dropdown(["b/c", "nob/c"], label="business_or_commercial"),
    gr.Number(label="loan_amount"),
    gr.Number(label="rate_of_interest"),
    gr.Number(label="Interest_rate_spread"),
    gr.Number(label="Upfront_charges"),
    gr.Number(label="term"),
    gr.Dropdown(["neg_amm", "not_neg"], label="Neg_ammortization"),
    gr.Dropdown(["int_only", "not_int"], label="interest_only"),
    gr.Dropdown(["lpsm", "not_lpsm"], label="lump_sum_payment"),
    gr.Number(label="property_value"),
    gr.Dropdown(["mh", "sb"], label="construction_type"),
    gr.Dropdown(["ir", "pr", "sr"], label="occupancy_type"),
    gr.Dropdown(["home", "land"], label="Secured_by"),
    gr.Dropdown(["1U", "2U", "3U", "4U"], label="total_units"),
    gr.Number(label="income"),
    gr.Dropdown(["CIB", "CRIF", "EQUI", "EXP"], label="credit_type"),
    gr.Number(label="Credit_Score"),
    gr.Dropdown(["CIB", "EXP"], label="co-applicant_credit_type"),
    gr.Dropdown(["<25", "25-34", "35-44", "45-54", "55-64", "65-74", ">74"], label="age"),
    gr.Dropdown(["to_inst", "not_inst"], label="submission_of_application"),
    gr.Dropdown(["central", "North", "North-East", "south"], label="Region"),
    gr.Dropdown(["direct", "Indriect"], label="Security_Type"),
    gr.Number(label="dtir1"),
]

manual_tab = gr.Interface(
    fn=predict_single,
    inputs=manual_inputs,
    outputs=gr.JSON(label="Prediction Result"),
    title="Manual Loan Default Prediction",
)

# Combine the two tabs into one app.
demo = gr.TabbedInterface([batch_tab, manual_tab], ["Batch Prediction", "Manual Prediction"])

if __name__ == "__main__":
    demo.launch(share=False)