Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,87 @@ from sklearn.compose import ColumnTransformer
|
|
| 9 |
from sklearn.pipeline import Pipeline
|
| 10 |
from sklearn.base import BaseEstimator, TransformerMixin
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Load trained model and preprocessing objects
|
| 14 |
gb_loaded = joblib.load('gradient_boosting_model.pkl')
|
|
@@ -24,7 +105,6 @@ def predict_csv(file):
|
|
| 24 |
mask = df['term'].notnull()
|
| 25 |
df_cleaned = df[mask].copy()
|
| 26 |
|
| 27 |
-
# Separate features if needed
|
| 28 |
# If target column exists in CSV, drop it
|
| 29 |
if 'target' in df_cleaned.columns:
|
| 30 |
df_cleaned = df_cleaned.drop(columns=['target'])
|
|
|
|
| 9 |
from sklearn.pipeline import Pipeline
|
| 10 |
from sklearn.base import BaseEstimator, TransformerMixin
|
| 11 |
|
| 12 |
+
# Numeric feature imputer
|
| 13 |
+
class CustomImputer(TransformerMixin, BaseEstimator):
    """Impute missing numeric loan features from statistics learned in ``fit``.

    ``fit`` records per-group and global means; ``transform`` fills gaps with
    those stored values, so the result for a row does not depend on which
    other rows happen to be in the frame being transformed.

    Columns read: ``loan_type``, ``term``, ``age``, ``rate_of_interest``,
    ``Interest_rate_spread``, ``Upfront_charges``, ``property_value``,
    ``dtir1``, ``income``, ``loan_amount`` and ``LTV``.
    """

    # Columns imputed by (loan_type, term) mean, then by loan_type mean.
    _GROUPED_COLS = ('rate_of_interest', 'Interest_rate_spread', 'Upfront_charges')

    def fit(self, X, y=None):
        """Learn the group and global means used for imputation.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame containing the columns listed on the class.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        CustomImputer
            ``self``, with ``group_means`` populated.
        """
        by_type_term = X.groupby(['loan_type', 'term'])
        by_type = X.groupby(['loan_type'])

        # Keys are kept exactly as before so previously pickled instances
        # remain usable with this class definition.
        self.group_means = {
            'rate_of_interest': by_type_term['rate_of_interest'].mean(),
            'Interest_rate_spread': by_type_term['Interest_rate_spread'].mean(),
            'Upfront_charges': by_type_term['Upfront_charges'].mean(),
            'rate_of_interest_loan': by_type['rate_of_interest'].mean(),
            'Interest_rate_spread_loan': by_type['Interest_rate_spread'].mean(),
            'Upfront_charges_loan': by_type['Upfront_charges'].mean(),
            'income_by_age': X.groupby(['age'])['income'].mean(),
            'property_value_mean': X['property_value'].mean(),
            'dtir1_mean': X['dtir1'].mean(),
            'income_mean': X['income'].mean(),
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing numeric values filled in.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Imputed copy; the input frame is not modified.
        """
        X = X.copy()
        means = self.group_means

        for col in self._GROUPED_COLS:
            decimals = 3 if col == 'rate_of_interest' else 4

            # First choice: fitted mean of the row's (loan_type, term) group.
            # Unknown or NaN key pairs yield NaN and fall through below.
            pair_lookup = means[col].to_dict()
            pair_fill = pd.Series(
                [
                    pair_lookup.get(key, np.nan)
                    for key in zip(X['loan_type'], X['term'])
                ],
                index=X.index,
                dtype=float,
            )

            # Fallback: fitted mean of the row's loan_type alone.
            type_fill = X['loan_type'].map(means[col + '_loan'])

            X[col] = X[col].fillna(pair_fill).fillna(type_fill).round(decimals)

        # property_value: global mean, rounded to the nearest thousand.
        X['property_value'] = X['property_value'].fillna(means['property_value_mean'])
        X['property_value'] = np.round(X['property_value'], -3)

        # dtir1: global mean, rounded to a whole number.
        X['dtir1'] = X['dtir1'].fillna(means['dtir1_mean']).round(0)

        # income: fitted mean for the row's age group, then the global mean,
        # rounded to the nearest hundred.
        age_fill = X['age'].map(means['income_by_age'])
        X['income'] = X['income'].fillna(age_fill).fillna(means['income_mean'])
        X['income'] = np.round(X['income'], -2)

        # LTV: derived from the (already imputed) property value when missing.
        X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)

        return X
|
| 61 |
+
|
| 62 |
+
# Categorical features imputer
|
| 63 |
+
# CustomCleaner - fixes misspellings, drops unwanted features, and converts missing-value placeholders to np.nan
|
| 64 |
+
class CustomCleaner(BaseEstimator, TransformerMixin):
    """Stateless clean-up step: drop columns, fix a typo, normalise missing markers.

    Parameters
    ----------
    drop_cols : list of str, optional
        Columns removed from the frame in ``transform``.
    missing_placeholders : list of str, optional
        Values treated as missing in the categorical columns. ``None`` means
        ``['', 'NA', 'nan', 'NaN']``.
    cat_cols : list of str, optional
        Categorical columns in which placeholders are replaced with ``np.nan``.
    """

    # Used when ``missing_placeholders`` is left as None.
    _DEFAULT_MISSING_PLACEHOLDERS = ('', 'NA', 'nan', 'NaN')

    def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
        # Constructor arguments are stored untouched: scikit-learn's
        # get_params/clone require __init__ not to modify its parameters,
        # so the default placeholder list is resolved in transform().
        self.drop_cols = drop_cols
        self.missing_placeholders = missing_placeholders
        self.cat_cols = cat_cols  # list of categorical columns to fix missing values

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to clean; it is not modified in place.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``drop_cols`` removed, the 'Indriect' typo in
            ``Security_Type`` corrected, and placeholders in ``cat_cols``
            replaced by ``np.nan``.
        """
        X = X.copy()

        # Drop unwanted columns
        if self.drop_cols:
            X = X.drop(self.drop_cols, axis=1)

        # Fix wrong spelling in 'Security_Type'
        if 'Security_Type' in X.columns:
            X['Security_Type'] = X['Security_Type'].replace({'Indriect': 'Indirect'})

        # Replace placeholders with np.nan in categorical columns
        if self.cat_cols:
            if self.missing_placeholders is None:
                placeholders = list(self._DEFAULT_MISSING_PLACEHOLDERS)
            else:
                placeholders = self.missing_placeholders

            for col in self.cat_cols:
                if col in X.columns:
                    X[col] = X[col].replace(placeholders, np.nan)

        return X
|
| 91 |
+
|
| 92 |
+
# Module-level cleaner configured with the columns to remove from the frame.
custom_cleaner = CustomCleaner(
    drop_cols=[
        'ID',
        'year',
        'Gender',
        'property_value',
        'loan_amount',
        'Interest_rate_spread',
    ]
)
|
| 93 |
|
| 94 |
# Load trained model and preprocessing objects
|
| 95 |
# Deserialize the persisted model from 'gradient_boosting_model.pkl'.
# NOTE(review): joblib.load unpickles the file, so it must only point at a
# trusted artifact. Presumably the custom transformer classes defined in this
# module need to exist before this call for unpickling to succeed - confirm.
gb_loaded = joblib.load('gradient_boosting_model.pkl')
|
|
|
|
| 105 |
mask = df['term'].notnull()
|
| 106 |
df_cleaned = df[mask].copy()
|
| 107 |
|
|
|
|
| 108 |
# If target column exists in CSV, drop it
|
| 109 |
if 'target' in df_cleaned.columns:
|
| 110 |
df_cleaned = df_cleaned.drop(columns=['target'])
|