lkchew committed on
Commit
76afcf0
·
verified ·
1 Parent(s): 2d65244

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -1
app.py CHANGED
@@ -9,6 +9,87 @@ from sklearn.compose import ColumnTransformer
9
  from sklearn.pipeline import Pipeline
10
  from sklearn.base import BaseEstimator, TransformerMixin
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Load trained model and preprocessing objects
14
  gb_loaded = joblib.load('gradient_boosting_model.pkl')
@@ -24,7 +105,6 @@ def predict_csv(file):
24
  mask = df['term'].notnull()
25
  df_cleaned = df[mask].copy()
26
 
27
- # Separate features if needed
28
  # If target column exists in CSV, drop it
29
  if 'target' in df_cleaned.columns:
30
  df_cleaned = df_cleaned.drop(columns=['target'])
 
9
  from sklearn.pipeline import Pipeline
10
  from sklearn.base import BaseEstimator, TransformerMixin
11
 
12
+ # Numeric features imputer
13
class CustomImputer(BaseEstimator, TransformerMixin):
    """Impute missing numeric loan features from statistics learned in ``fit``.

    ``fit`` stores group means and global means in ``self.group_means``;
    ``transform`` fills missing values from those stored statistics, so a
    prediction batch (even a single row) is imputed with the same values the
    fitted data produced, rather than with means of the batch itself.

    Inherits ``BaseEstimator`` (as ``CustomCleaner`` does) so the transformer
    supports ``get_params`` / ``clone`` inside scikit-learn pipelines.

    Columns read: ``loan_type``, ``term``, ``age``, ``rate_of_interest``,
    ``Interest_rate_spread``, ``Upfront_charges``, ``property_value``,
    ``dtir1``, ``income``, ``loan_amount`` and ``LTV``.
    """

    # Columns filled by (loan_type, term) mean first, then by loan_type mean.
    _RATE_COLS = ('rate_of_interest', 'Interest_rate_spread', 'Upfront_charges')

    def fit(self, X, y=None):
        """Learn the imputation statistics from DataFrame ``X``.

        Parameters:
            X: pandas DataFrame containing the columns listed on the class.
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        by_loan_term = X.groupby(['loan_type', 'term'])
        by_loan = X.groupby(['loan_type'])

        # Precompute group means for imputations
        self.group_means = {
            'rate_of_interest': by_loan_term['rate_of_interest'].mean(),
            'Interest_rate_spread': by_loan_term['Interest_rate_spread'].mean(),
            'Upfront_charges': by_loan_term['Upfront_charges'].mean(),
            'rate_of_interest_loan': by_loan['rate_of_interest'].mean(),
            'Interest_rate_spread_loan': by_loan['Interest_rate_spread'].mean(),
            'Upfront_charges_loan': by_loan['Upfront_charges'].mean(),
            'income_by_age': X.groupby(['age'])['income'].mean(),
            'property_value_mean': X['property_value'].mean(),
            'dtir1_mean': X['dtir1'].mean(),
            'income_mean': X['income'].mean(),
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing numeric values imputed.

        Parameters:
            X: pandas DataFrame containing the columns listed on the class.

        Returns:
            A new DataFrame; the input is not modified.
        """
        X = X.copy()
        means = self.group_means
        group_keys = ['loan_type', 'term']

        for col in self._RATE_COLS:
            decimals = 3 if col == 'rate_of_interest' else 4

            # Look up the fitted (loan_type, term) mean for every row; rows
            # whose key pair was not seen during fit get NaN here.
            group_fill = X[group_keys].join(
                means[col].rename('_fill'), on=group_keys
            )['_fill']
            # Fallback: fitted mean of the row's loan_type alone.
            loan_fill = X['loan_type'].map(means[col + '_loan'])

            X[col] = X[col].fillna(group_fill).fillna(loan_fill).round(decimals)

        # Impute property_value and dtir1 with the fitted global means
        X['property_value'] = X['property_value'].fillna(means['property_value_mean'])
        X['property_value'] = np.round(X['property_value'], -3)

        X['dtir1'] = X['dtir1'].fillna(means['dtir1_mean']).round(0)

        # Income: fitted mean of the row's age group, then the global mean
        age_fill = X['age'].map(means['income_by_age'])
        X['income'] = X['income'].fillna(age_fill).fillna(means['income_mean'])
        X['income'] = np.round(X['income'], -2)

        # LTV: derive from the (already imputed) property value when missing
        X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)

        return X
61
+
62
+ # Categorical features imputer
63
+ # CustomCleaner - fixes the misspelled 'Security_Type' value, drops unwanted features and converts missing-value placeholders to np.nan
64
class CustomCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaning step applied to a DataFrame.

    Drops the configured columns, corrects the misspelled ``'Indriect'``
    value in ``Security_Type`` and, for the configured categorical columns,
    replaces missing-value placeholders with ``np.nan``.

    Parameters:
        drop_cols: column names to remove; columns absent from the input are
            skipped instead of raising.
        missing_placeholders: values treated as missing; defaults to
            ``['', 'NA', 'nan', 'NaN']``.
        cat_cols: categorical columns in which placeholders are replaced;
            when ``None`` or empty no replacement is performed.
    """

    def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
        self.drop_cols = drop_cols
        self.missing_placeholders = missing_placeholders if missing_placeholders is not None else ['', 'NA', 'nan', 'NaN']
        self.cat_cols = cat_cols  # list of categorical columns to fix missing values

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input is not modified."""
        X = X.copy()

        # Drop unwanted columns. errors='ignore' keeps this step consistent
        # with the guarded steps below: a column missing from the input is
        # skipped rather than aborting with KeyError.
        if self.drop_cols:
            X = X.drop(columns=self.drop_cols, errors='ignore')

        # Fix wrong spelling in 'Security_Type'
        if 'Security_Type' in X.columns:
            X['Security_Type'] = X['Security_Type'].replace({'Indriect': 'Indirect'})

        # Replace placeholders with np.nan in categorical columns
        for col in self.cat_cols or ():
            if col in X.columns:
                X[col] = X[col].replace(self.missing_placeholders, np.nan)

        return X
91
+
92
# Module-level CustomCleaner instance that drops the listed columns. No
# cat_cols is passed, so this instance performs no placeholder-to-NaN
# replacement.
+ custom_cleaner = CustomCleaner(drop_cols=['ID', 'year', 'Gender', 'property_value', 'loan_amount', 'Interest_rate_spread'])
93
 
94
  # Load trained model and preprocessing objects
95
  gb_loaded = joblib.load('gradient_boosting_model.pkl')
 
105
  mask = df['term'].notnull()
106
  df_cleaned = df[mask].copy()
107
 
 
108
  # If target column exists in CSV, drop it
109
  if 'target' in df_cleaned.columns:
110
  df_cleaned = df_cleaned.drop(columns=['target'])