lkchew committed on
Commit
73ec2e0
·
verified ·
1 Parent(s): 80f2d4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -10
app.py CHANGED
@@ -2,30 +2,126 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import joblib
 
 
 
 
 
 
5
 
6
- # ... keep your existing import statements, CustomImputer, CustomCleaner, and model loading ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- # Define a mapping of dropdown options to file paths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  csv_files = {
10
  "Default 1": "Default_1.csv",
11
  "Default 2": "Default_2.csv",
12
  "Non Default": "Non_default.csv"
13
  }
14
 
 
 
 
15
  def predict_csv_from_dropdown(file_choice, model_choice):
16
  # Read CSV based on dropdown choice
17
  file_path = csv_files[file_choice]
18
  df = pd.read_csv(file_path)
19
 
20
  # Filter rows with 'term' not null
21
- mask = df['term'].notnull()
22
- df_cleaned = df[mask].copy()
23
 
24
- # If target column exists in CSV, drop it
25
  if 'target' in df_cleaned.columns:
26
  df_cleaned = df_cleaned.drop(columns=['target'])
27
 
28
- # Apply numeric imputation
29
  X_num = num_pipeline.transform(df_cleaned)
30
 
31
  # Custom cleaning
@@ -34,17 +130,16 @@ def predict_csv_from_dropdown(file_choice, model_choice):
34
  # Categorical preprocessing
35
  X_processed = cat_preprocessing.transform(X_cleaned)
36
 
37
- # Choose model based on dropdown
38
  model = rf_loaded if model_choice == "Random Forest" else gb_loaded
39
 
40
  # Predict
41
  preds = model.predict(X_processed)
42
  probs = model.predict_proba(X_processed).max(axis=1)
43
 
44
- # Convert classes to readable labels
45
  labels = ['Non-default' if c == 0 else 'Default' for c in preds]
46
 
47
- # Combine results
48
  results = pd.DataFrame({
49
  'Prediction': labels,
50
  'Confidence': probs
@@ -52,7 +147,9 @@ def predict_csv_from_dropdown(file_choice, model_choice):
52
 
53
  return results
54
 
55
- # Gradio interface
 
 
56
  iface = gr.Interface(
57
  fn=predict_csv_from_dropdown,
58
  inputs=[
@@ -67,3 +164,4 @@ iface = gr.Interface(
67
  if __name__ == "__main__":
68
  iface.launch()
69
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import joblib
5
+ from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
6
+ from sklearn.impute import SimpleImputer
7
+ from sklearn.preprocessing import OneHotEncoder
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.pipeline import Pipeline
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
 
12
+ # ----------------------------
13
+ # Custom Numeric Imputer
14
+ # ----------------------------
15
+ class CustomImputer(TransformerMixin):
16
+ def fit(self, X, y=None):
17
+ # Precompute group means for imputations
18
+ self.group_means = {
19
+ 'rate_of_interest': X.groupby(['loan_type', 'term'])['rate_of_interest'].mean(),
20
+ 'Interest_rate_spread': X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean(),
21
+ 'Upfront_charges': X.groupby(['loan_type', 'term'])['Upfront_charges'].mean(),
22
+ 'rate_of_interest_loan': X.groupby(['loan_type'])['rate_of_interest'].mean(),
23
+ 'Interest_rate_spread_loan': X.groupby(['loan_type'])['Interest_rate_spread'].mean(),
24
+ 'Upfront_charges_loan': X.groupby(['loan_type'])['Upfront_charges'].mean(),
25
+ 'income_by_age': X.groupby(['age'])['income'].mean(),
26
+ 'property_value_mean': X['property_value'].mean(),
27
+ 'dtir1_mean': X['dtir1'].mean(),
28
+ 'income_mean': X['income'].mean(),
29
+ }
30
+ return self
31
 
32
+ def transform(self, X):
33
+ X = X.copy()
34
+
35
+ # Impute numerical features using group-based means
36
+ for col in ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges']:
37
+ X[col] = X.groupby(['loan_type', 'term'])[col].transform(lambda x: x.fillna(x.mean())).round(3 if col == 'rate_of_interest' else 4)
38
+
39
+ for col in ['rate_of_interest', 'Interest_rate_spread', 'Upfront_charges']:
40
+ loan_mean = self.group_means[col + '_loan']
41
+ X[col] = X.apply(
42
+ lambda row: row[col] if pd.notnull(row[col]) else loan_mean.get(row['loan_type'], np.nan),
43
+ axis=1
44
+ )
45
+ X[col] = X[col].round(3 if col == 'rate_of_interest' else 4)
46
+
47
+ # Impute property_value and dtir1
48
+ X['property_value'] = X['property_value'].fillna(self.group_means['property_value_mean'])
49
+ X['property_value'] = np.round(X['property_value'], -3)
50
+
51
+ X['dtir1'] = X['dtir1'].fillna(self.group_means['dtir1_mean']).round(0)
52
+
53
+ # Income
54
+ X['income'] = X.groupby(['age'])['income'].transform(lambda x: x.fillna(x.mean()))
55
+ X['income'] = X['income'].fillna(self.group_means['income_mean'])
56
+ X['income'] = np.round(X['income'], -2)
57
+
58
+ # LTV
59
+ X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)
60
+
61
+ return X
62
+
63
+ # ----------------------------
64
+ # Custom Categorical Cleaner
65
+ # ----------------------------
66
+ class CustomCleaner(BaseEstimator, TransformerMixin):
67
+ def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
68
+ self.drop_cols = drop_cols
69
+ self.missing_placeholders = missing_placeholders if missing_placeholders is not None else ['', 'NA', 'nan', 'NaN']
70
+ self.cat_cols = cat_cols
71
+
72
+ def fit(self, X, y=None):
73
+ return self
74
+
75
+ def transform(self, X):
76
+ X = X.copy()
77
+
78
+ if self.drop_cols:
79
+ X = X.drop(self.drop_cols, axis=1)
80
+
81
+ if 'Security_Type' in X.columns:
82
+ X['Security_Type'] = X['Security_Type'].replace({'Indriect': 'Indirect'})
83
+
84
+ if self.cat_cols:
85
+ for col in self.cat_cols:
86
+ if col in X.columns:
87
+ X[col] = X[col].replace(self.missing_placeholders, np.nan)
88
+
89
+ return X
90
+
91
+ # ----------------------------
92
+ # Load models and preprocessing pipelines
93
+ # ----------------------------
94
+ gb_loaded = joblib.load('gradient_boosting_model.pkl')
95
+ rf_loaded = joblib.load("random_forest_model.pkl")
96
+ num_pipeline = joblib.load('num_pipeline.pkl') # numeric imputer pipeline
97
+ custom_cleaner = joblib.load('custom_cleaner.pkl') # custom cleaning transformer
98
+ cat_preprocessing = joblib.load('cat_preprocessing.pkl') # categorical preprocessing
99
+
100
+ # ----------------------------
101
+ # Predefined CSV file options
102
+ # ----------------------------
103
  csv_files = {
104
  "Default 1": "Default_1.csv",
105
  "Default 2": "Default_2.csv",
106
  "Non Default": "Non_default.csv"
107
  }
108
 
109
+ # ----------------------------
110
+ # Prediction function
111
+ # ----------------------------
112
  def predict_csv_from_dropdown(file_choice, model_choice):
113
  # Read CSV based on dropdown choice
114
  file_path = csv_files[file_choice]
115
  df = pd.read_csv(file_path)
116
 
117
  # Filter rows with 'term' not null
118
+ df_cleaned = df[df['term'].notnull()].copy()
 
119
 
120
+ # Drop target if exists
121
  if 'target' in df_cleaned.columns:
122
  df_cleaned = df_cleaned.drop(columns=['target'])
123
 
124
+ # Numeric preprocessing
125
  X_num = num_pipeline.transform(df_cleaned)
126
 
127
  # Custom cleaning
 
130
  # Categorical preprocessing
131
  X_processed = cat_preprocessing.transform(X_cleaned)
132
 
133
+ # Select model
134
  model = rf_loaded if model_choice == "Random Forest" else gb_loaded
135
 
136
  # Predict
137
  preds = model.predict(X_processed)
138
  probs = model.predict_proba(X_processed).max(axis=1)
139
 
140
+ # Convert to readable labels
141
  labels = ['Non-default' if c == 0 else 'Default' for c in preds]
142
 
 
143
  results = pd.DataFrame({
144
  'Prediction': labels,
145
  'Confidence': probs
 
147
 
148
  return results
149
 
150
+ # ----------------------------
151
+ # Gradio Interface
152
+ # ----------------------------
153
  iface = gr.Interface(
154
  fn=predict_csv_from_dropdown,
155
  inputs=[
 
164
  if __name__ == "__main__":
165
  iface.launch()
166
 
167
+