Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import tempfile | |
| from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler | |
| from sklearn.compose import ColumnTransformer | |
# FeatureEngineer Class
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Impute missing values using statistics learned during fit().

    Rate/spread/charges are filled with (loan_type, term) group means,
    income with the mean of its age bracket (each falling back to the
    global mean for unseen groups), the remaining numerics with global
    means, categoricals with their most frequent training value, and LTV
    is recomputed from its definition (loan_amount / property_value * 100)
    rather than imputed statistically.
    """

    def __init__(self):
        # Save the learned values during training to be used to populate the
        # missing data in any test set.
        # Numeric group means (LTV excluded as it is a logical computation and
        # will be performed on the actual test set).
        self.rate_of_interest_means = None
        self.interest_rate_spread_means = None
        self.upfront_charges_means = None
        self.overall_rate_of_interest_mean = None
        self.overall_interest_rate_spread_mean = None
        self.overall_upfront_charges_mean = None
        self.income_means_by_age = None
        self.overall_income_mean = None
        self.term_mean = None
        self.property_value_mean = None
        self.dtir1_mean = None
        self.loan_amount_mean = None
        self.credit_score_mean = None
        # Categorical columns whose most frequent value is learned in fit().
        self.categorical_features = [
            'loan_limit', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness',
            'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only',
            'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
            'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Security_Type'
        ]
        self.most_frequent_cats = {}

    def fit(self, X, y=None):
        """Learn imputation statistics from the training data only."""
        X = X.copy()
        # Group means keyed by (loan_type, term) tuples — pandas Series with
        # a MultiIndex, looked up by tuple in transform().
        self.rate_of_interest_means = X.groupby(['loan_type', 'term'])['rate_of_interest'].mean()
        self.interest_rate_spread_means = X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean()
        self.upfront_charges_means = X.groupby(['loan_type', 'term'])['Upfront_charges'].mean()
        # Global fallbacks for when a (loan_type, term) combination is unseen.
        self.overall_rate_of_interest_mean = X['rate_of_interest'].mean()
        self.overall_interest_rate_spread_mean = X['Interest_rate_spread'].mean()
        self.overall_upfront_charges_mean = X['Upfront_charges'].mean()
        self.income_means_by_age = X.groupby('age')['income'].mean()
        self.overall_income_mean = X['income'].mean()
        self.term_mean = X['term'].mean().round(0)
        self.property_value_mean = round(X['property_value'].mean(), -3)
        self.dtir1_mean = X['dtir1'].mean().round(0)
        self.loan_amount_mean = X['loan_amount'].mean()  # remaining numerics use the global mean
        self.credit_score_mean = X['Credit_Score'].mean()
        # Most frequent value per categorical column for imputation.
        for col in self.categorical_features:
            if col in X.columns:
                mode = X[col].mode(dropna=True)
                # Guard: an all-NaN column has an empty mode; skip it instead
                # of raising IndexError on mode[0].
                if not mode.empty:
                    self.most_frequent_cats[col] = mode[0]
        return self

    def transform(self, X):
        """Apply the statistics learned in fit() to impute missing values."""
        X = X.copy()

        def impute_feature(row, feature_name, group_means, overall_mean, group_keys):
            # Fill a missing value from its group mean — e.g. the key
            # ('type1', 360) for group_keys ['loan_type', 'term'] — falling
            # back to the overall mean for unseen combinations.
            if pd.isna(row[feature_name]):
                key = tuple(row[k] for k in group_keys)
                if key in group_means:
                    return group_means[key]
                else:
                    return overall_mean
            else:
                return row[feature_name]

        # Impute rate_of_interest
        X['rate_of_interest'] = X.apply(
            lambda row: impute_feature(row, 'rate_of_interest',
                                       self.rate_of_interest_means,
                                       self.overall_rate_of_interest_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(3)
        # Impute Interest_rate_spread
        X['Interest_rate_spread'] = X.apply(
            lambda row: impute_feature(row, 'Interest_rate_spread',
                                       self.interest_rate_spread_means,
                                       self.overall_interest_rate_spread_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(4)
        # Impute Upfront_charges
        X['Upfront_charges'] = X.apply(
            lambda row: impute_feature(row, 'Upfront_charges',
                                       self.upfront_charges_means,
                                       self.overall_upfront_charges_mean,
                                       ['loan_type', 'term']),
            axis=1
        ).round(2)

        # Impute income from the mean income of the same age bracket.
        def impute_income(row):
            if pd.isna(row['income']):
                age = row['age']
                if age in self.income_means_by_age:
                    return self.income_means_by_age[age]
                else:
                    return self.overall_income_mean
            else:
                return row['income']

        X['income'] = X.apply(impute_income, axis=1)
        X['income'] = X['income'].fillna(self.overall_income_mean)
        X['income'] = X['income'].round(-2)
        # Impute term, property_value, dtir1, loan_amount, Credit_Score with
        # the global training means.
        X['term'] = X['term'].fillna(self.term_mean).round(0)
        X['property_value'] = X['property_value'].fillna(self.property_value_mean).round(-3)
        X['dtir1'] = X['dtir1'].fillna(self.dtir1_mean).round(0)
        X['loan_amount'] = X['loan_amount'].fillna(self.loan_amount_mean)
        X['Credit_Score'] = X['Credit_Score'].fillna(self.credit_score_mean)
        # LTV is derived, not imputed: LTV = (loan_amount / property_value) * 100.
        # loan_amount and property_value are already filled at this point.
        missing_ltv_mask = X['LTV'].isna()
        X.loc[missing_ltv_mask, 'LTV'] = (
            (X.loc[missing_ltv_mask, 'loan_amount'] /
             X.loc[missing_ltv_mask, 'property_value']) * 100
        ).round(8)
        # Impute categoricals with the most frequent training value.
        for col, most_freq in self.most_frequent_cats.items():
            if col in X.columns:
                X[col] = X[col].fillna(most_freq)
        return X
# Custom Ordinal Mapper
class OrdinalMapper(BaseEstimator, TransformerMixin):
    """Map ordered categorical columns to integer codes via a fixed mapping.

    Values absent from the mapping (unexpected categories or NaN) are coded
    as -1 so downstream estimators never see missing values.
    """

    def __init__(self, columns=None, mapping=None):
        # Stored verbatim per the sklearn estimator contract (clonable).
        self.columns = columns
        self.mapping = mapping

    def fit(self, X, y=None):
        # Stateless: the mapping is supplied at construction time.
        return self

    def transform(self, X):
        X_ = X.copy()
        # Tolerate columns=None (treated as no columns) and skip columns
        # missing from X instead of raising TypeError/KeyError.
        for col in (self.columns or []):
            if col in X_.columns:
                X_[col] = X_[col].map(self.mapping).fillna(-1)  # handle unexpected or missing values
        return X_
# Define the feature lists
# 'age' is the only ordered categorical; it gets an explicit ordinal mapping.
ordinal_cols = ['age']
# Two-category nominals: encoded to 0/1 by OrdinalEncoder.
binary_nominal_cols = [
    'loan_limit', 'approv_in_adv', 'Credit_Worthiness', 'open_credit',
    'business_or_commercial', 'Neg_ammortization', 'interest_only',
    'lump_sum_payment', 'construction_type', 'Secured_by',
    'co-applicant_credit_type', 'Security_Type'
]
# Multi-category nominals: one-hot encoded.
multi_nominal_cols = [
    'loan_type', 'loan_purpose', 'occupancy_type', 'total_units',
    'credit_type', 'submission_of_application'
]
# Numeric features: placed last in the ColumnTransformer so they occupy the
# trailing columns (relied on by ScaleLastColumns below).
numeric_cols = [
    'loan_amount', 'rate_of_interest', 'Interest_rate_spread',
    'Upfront_charges', 'term', 'property_value', 'income',
    'Credit_Score', 'LTV', 'dtir1'
]
# Ordinal mapping for 'age': bracket -> rank, 0 for '<25' through 6 for '>74'
condition_order = ['<25', '25-34', '35-44', '45-54', '55-64', '65-74', '>74']
ordinal_map = {code: idx for idx, code in enumerate(condition_order)}
# Define the transformers
ordinal_transformer = OrdinalMapper(columns=ordinal_cols, mapping=ordinal_map)
binary_transformer = OrdinalEncoder(dtype=int)  # maps binary categories to 0/1
# One-hot with the first level dropped; unknown categories at predict time
# are ignored (all-zero row) rather than raising.
onehot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
# numeric_transformer = StandardScaler()
# Building the column transformer, similar to how a pipeline works.
# Output column order follows the transformer order below, so the numeric
# passthrough columns end up last — ScaleLastColumns depends on this.
preprocessor = ColumnTransformer(transformers=[
    ('ord', ordinal_transformer, ordinal_cols),
    ('bin', binary_transformer, binary_nominal_cols),
    ('ohe', onehot_transformer, multi_nominal_cols),
    ('num', 'passthrough', numeric_cols)  # leave numeric untouched before passing to SMOTE
])
# Transformer to scale the last 10 columns after SMOTE.
# Last 10 columns are numerical based on the number of numerical features of
# this dataset and the order used in preprocessing.
class ScaleLastColumns(BaseEstimator, TransformerMixin):
    """Standard-scale only the trailing ``n_numeric`` columns of an array."""

    def __init__(self, n_numeric):
        self.n_numeric = n_numeric
        # A single StandardScaler keeps one mean/std pair per numeric feature,
        # learned on training data and reused verbatim on test data.
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # X is assumed to be a NumPy array (e.g. the output of SMOTE).
        tail = X[:, -self.n_numeric:]
        self.scaler.fit(tail)
        return self

    def transform(self, X):
        scaled = X.copy()
        tail = scaled[:, -self.n_numeric:]
        scaled[:, -self.n_numeric:] = self.scaler.transform(tail)
        return scaled
# Load trained pipelines from disk. These were fitted with the custom
# classes defined above, which must be importable for unpickling to work.
log_best_pipeline = joblib.load("best_logreg_pipeline.pkl")
xgb_best_pipeline = joblib.load("best_xgb_pipeline.pkl")
rf_best_pipeline = joblib.load("best_rf_pipeline.pkl")
nb_best_pipeline = joblib.load("best_nb_pipeline.pkl")
# Custom decision threshold per model; None means use the 0.5 default.
thresholds = {
    "Logistic Regression": 0.2721,
    "Random Forest": None,
    "XGBoost": None,
    "Naive Bayes": None
}
# Map model name (as shown in the UI dropdowns) to its fitted pipeline.
pipelines = {
    "Logistic Regression": log_best_pipeline,
    "XGBoost": xgb_best_pipeline,
    "Random Forest": rf_best_pipeline,
    "Naive Bayes": nb_best_pipeline
}
# ------------------- Batch Prediction (Excel) -------------------
def predict_from_excel(file, model_name):
    """Run batch predictions on an uploaded Excel test set.

    Args:
        file: uploaded file object exposing a ``.name`` path (gr.File).
        model_name: key into the module-level ``pipelines``/``thresholds``.

    Returns:
        (metrics dict, DataFrame with predictions appended, path to a
        downloadable Excel copy of that DataFrame).
    """
    # Load Excel file
    test_df = pd.read_excel(file.name)
    # Split into features and target; ID/year/Gender/Region were excluded
    # from training, and 'Status' is the ground-truth label.
    X_test = test_df.drop(columns=['ID', 'year', 'Gender', 'Region', 'Status'])
    y_test = test_df['Status']
    # Get pipeline and predict default probabilities (positive class).
    pipeline = pipelines[model_name]
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    # Apply the model-specific decision threshold, defaulting to 0.5.
    thresh = thresholds.get(model_name)
    if thresh is None:
        thresh = 0.5
    y_pred = (y_proba >= thresh).astype(int)
    # Compute evaluation metrics against the provided labels.
    metrics = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred), 4),
        "Recall": round(recall_score(y_test, y_pred), 4),
        "F1 Score": round(f1_score(y_test, y_pred), 4),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 4),
    }
    # Add predictions to the original dataframe for inspection.
    results_df = test_df.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba
    # Save a downloadable copy. Close the handle before pandas writes to the
    # path so this also works where open files can't be re-opened (Windows).
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    temp_file.close()
    results_df.to_excel(temp_file.name, index=False)
    return metrics, results_df, temp_file.name
# ------------------- Manual Prediction -------------------
def predict_single(
    model_name,
    loan_limit, Gender, approv_in_adv, loan_type, loan_purpose, Credit_Worthiness,
    open_credit, business_or_commercial, loan_amount, rate_of_interest,
    Interest_rate_spread, Upfront_charges, term, Neg_ammortization,
    interest_only, lump_sum_payment, property_value, construction_type,
    occupancy_type, Secured_by, total_units, income, credit_type,
    Credit_Score, co_applicant_credit_type, age, submission_of_application,
    Region, Security_Type, dtir1
):
    """Predict loan default for a single manually-entered applicant.

    Parameters mirror the UI widgets in order. Gender is collected by the
    UI but deliberately not included in the model input (it was dropped
    from training features, as in predict_from_excel).

    Returns a JSON-friendly list of one record: the model inputs plus
    "Predicted" (class) and "Probability".
    """

    # --- Helper for numeric fields: blank / None / NaN -> np.nan ---
    def safe_float(x):
        try:
            if x is None or x == "" or (isinstance(x, float) and np.isnan(x)):
                return np.nan
            return float(x)
        except (TypeError, ValueError):
            # Non-numeric text or unconvertible input: treat as missing so
            # the pipeline's imputation handles it.
            return np.nan

    # --- Compute derived feature LTV = (loan_amount / property_value) * 100 ---
    la = safe_float(loan_amount)
    pv = safe_float(property_value)
    ltv = 0 if (pv is None or pv == 0 or np.isnan(pv)) else (la / pv) * 100

    # Single-row frame with the exact column names the pipeline was fit on.
    input_dict = {
        "loan_limit": [loan_limit],
        "approv_in_adv": [approv_in_adv],
        "loan_type": [loan_type],
        "loan_purpose": [loan_purpose],
        "Credit_Worthiness": [Credit_Worthiness],
        "open_credit": [open_credit],
        "business_or_commercial": [business_or_commercial],
        "loan_amount": [la],
        "rate_of_interest": [safe_float(rate_of_interest)],
        "Interest_rate_spread": [safe_float(Interest_rate_spread)],
        "Upfront_charges": [safe_float(Upfront_charges)],
        "term": [safe_float(term)],
        "Neg_ammortization": [Neg_ammortization],
        "interest_only": [interest_only],
        "lump_sum_payment": [lump_sum_payment],
        "property_value": [pv],
        "construction_type": [construction_type],
        "occupancy_type": [occupancy_type],
        "Secured_by": [Secured_by],
        "total_units": [total_units],
        "income": [safe_float(income)],
        "credit_type": [credit_type],
        "Credit_Score": [safe_float(Credit_Score)],
        "co-applicant_credit_type": [co_applicant_credit_type],
        "age": [age],
        "submission_of_application": [submission_of_application],
        "LTV": [ltv],
        "Region": [Region],
        "Security_Type": [Security_Type],
        "dtir1": [safe_float(dtir1)]
    }
    X_input = pd.DataFrame(input_dict)

    pipeline = pipelines[model_name]
    y_proba = pipeline.predict_proba(X_input)[:, 1]
    # Apply the model-specific decision threshold, defaulting to 0.5.
    thresh = thresholds.get(model_name)
    if thresh is None:
        thresh = 0.5
    y_pred = (y_proba >= thresh).astype(int)

    # Attach prediction and probability to the input row.
    results_df = X_input.copy()
    results_df["Predicted"] = y_pred
    results_df["Probability"] = y_proba.round(4)
    return results_df.to_dict(orient="records")  # return as JSON-friendly format
# ------------------- UI Components -------------------
# Batch Tab: upload an Excel test set, pick a model, get evaluation metrics,
# a preview table, and a downloadable prediction file.
batch_tab = gr.Interface(
    fn=predict_from_excel,
    inputs=[
        gr.File(label="Upload Excel"),
        gr.Dropdown(
            ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
            label="Select Model"
        )
    ],
    outputs=[
        gr.JSON(label="Evaluation Metrics"),
        gr.Dataframe(label="Predictions with Probabilities"),
        gr.File(label="Download Predictions")
    ],
    title="Batch Loan Default Prediction"
)
# Manual Tab
# Input widgets in the exact positional order of predict_single's parameters.
# Dropdown choices mirror the category codes used in the training data.
manual_inputs = [
    gr.Dropdown(
        ["Logistic Regression", "XGBoost", "Random Forest", "Naive Bayes"],
        label="Select Model"
    ),
    gr.Dropdown(["cf", "ncf"], label="loan_limit"),
    gr.Dropdown(["Male", "Female", "Joint"], label="Gender"),  # collected but not fed to the model
    gr.Dropdown(["pre", "nopre"], label="approv_in_adv"),
    gr.Dropdown(["type1", "type2", "type3"], label="loan_type"),
    gr.Dropdown(["p1", "p2", "p3", "p4"], label="loan_purpose"),
    gr.Dropdown(["l1", "l2"], label="Credit_Worthiness"),
    gr.Dropdown(["opc", "nopc"], label="open_credit"),
    gr.Dropdown(["b/c", "nob/c"], label="business_or_commercial"),
    gr.Number(label="loan_amount"),
    gr.Number(label="rate_of_interest"),
    gr.Number(label="Interest_rate_spread"),
    gr.Number(label="Upfront_charges"),
    gr.Number(label="term"),
    gr.Dropdown(["neg_amm", "not_neg"], label="Neg_ammortization"),
    gr.Dropdown(["int_only", "not_int"], label="interest_only"),
    gr.Dropdown(["lpsm", "not_lpsm"], label="lump_sum_payment"),
    gr.Number(label="property_value"),
    gr.Dropdown(["mh", "sb"], label="construction_type"),
    gr.Dropdown(["ir", "pr", "sr"], label="occupancy_type"),
    gr.Dropdown(["home", "land"], label="Secured_by"),
    gr.Dropdown(["1U", "2U", "3U", "4U"], label="total_units"),
    gr.Number(label="income"),
    gr.Dropdown(["CIB", "CRIF", "EQUI", "EXP"], label="credit_type"),
    gr.Number(label="Credit_Score"),
    gr.Dropdown(["CIB", "EXP"], label="co-applicant_credit_type"),
    gr.Dropdown(["<25", "25-34", "35-44", "45-54", "55-64", "65-74", ">74"], label="age"),
    gr.Dropdown(["to_inst", "not_inst"], label="submission_of_application"),
    # gr.Number(label="LTV"),
    gr.Dropdown(["central", "North", "North-East", "south"], label="Region"),
    # NOTE(review): "Indriect" presumably matches the dataset's own spelling
    # of this category — confirm against training data before "fixing" it.
    gr.Dropdown(["direct", "Indriect"], label="Security_Type"),
    gr.Number(label="dtir1")
]
# Manual Tab interface: one widget per predict_single parameter, JSON output.
manual_tab = gr.Interface(
    fn=predict_single,
    inputs=manual_inputs,
    outputs=gr.JSON(label="Prediction Result"),
    title="Manual Loan Default Prediction"
)
# Combine Tabs
demo = gr.TabbedInterface([batch_tab, manual_tab], ["Batch Prediction", "Manual Prediction"])
if __name__ == "__main__":
    # Local-only launch; no public Gradio share link.
    demo.launch(share=False)