Spaces:

lkchew
/

ITI105_Project

Sleeping

App Files Files Community

lkchew committed on Aug 14, 2025

Commit

76afcf0

verified ·

1 Parent(s): 2d65244

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -1

app.py CHANGED Viewed

@@ -9,6 +9,87 @@ from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator, TransformerMixin
 # Load trained model and preprocessing objects
 gb_loaded = joblib.load('gradient_boosting_model.pkl')
@@ -24,7 +105,6 @@ def predict_csv(file):
     mask = df['term'].notnull()
     df_cleaned = df[mask].copy()
-    # Separate features if needed
     # If target column exists in CSV, drop it
     if 'target' in df_cleaned.columns:
         df_cleaned = df_cleaned.drop(columns=['target'])

 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator, TransformerMixin
# Numeric features imputer
class CustomImputer(TransformerMixin, BaseEstimator):
    """Fill missing numeric loan features with statistics learned in ``fit``.

    ``fit`` records group means and overall means of the training frame in
    ``self.group_means``.  ``transform`` fills missing cells from those stored
    values only, so the result does not depend on which other rows happen to
    be in the batch being transformed (a one-row frame is imputed the same way
    as a large one).

    Expected columns (all read by ``fit`` and/or ``transform``):
    ``loan_type``, ``term``, ``age``, ``rate_of_interest``,
    ``Interest_rate_spread``, ``Upfront_charges``, ``property_value``,
    ``dtir1``, ``income``, ``loan_amount`` and ``LTV``.
    """

    # Columns filled from the (loan_type, term) mean first, then from the
    # loan_type-only mean when that group was not seen during fit.
    _RATE_COLS = ('rate_of_interest', 'Interest_rate_spread', 'Upfront_charges')

    def fit(self, X, y=None):
        """Compute and store the means used for imputation.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame containing the columns listed in the class docstring.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Precompute group means for imputations
        self.group_means = {
            'rate_of_interest': X.groupby(['loan_type', 'term'])['rate_of_interest'].mean(),
            'Interest_rate_spread': X.groupby(['loan_type', 'term'])['Interest_rate_spread'].mean(),
            'Upfront_charges': X.groupby(['loan_type', 'term'])['Upfront_charges'].mean(),
            'rate_of_interest_loan': X.groupby(['loan_type'])['rate_of_interest'].mean(),
            'Interest_rate_spread_loan': X.groupby(['loan_type'])['Interest_rate_spread'].mean(),
            'Upfront_charges_loan': X.groupby(['loan_type'])['Upfront_charges'].mean(),
            'income_by_age': X.groupby(['age'])['income'].mean(),
            'property_value_mean': X['property_value'].mean(),
            'dtir1_mean': X['dtir1'].mean(),
            'income_mean': X['income'].mean(),
        }
        return self

    def _fitted_means(self, X, keys, stat_name):
        """Return one fitted mean per row of ``X`` for the group named by ``keys``.

        Rows whose key combination was not seen during ``fit`` (or whose key
        is missing) yield NaN.  The result is a NumPy array in row order, so
        it can be combined positionally even if ``X`` has a non-unique index.
        """
        means = self.group_means[stat_name].rename('_fitted_mean')
        return X[keys].join(means, on=keys)['_fitted_mean'].to_numpy()

    def _fill_from_group(self, X, col, keys, stat_name):
        """Fill only the missing cells of ``X[col]`` with the fitted group mean."""
        fitted = self._fitted_means(X, keys, stat_name)
        return X[col].where(X[col].notna(), fitted)

    def transform(self, X):
        """Return a copy of ``X`` with missing numeric values imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns that were available in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Imputed copy; the input frame is not modified.
        """
        X = X.copy()

        # Rate-like columns: most specific fitted mean first, then the coarser one.
        for col in self._RATE_COLS:
            X[col] = self._fill_from_group(X, col, ['loan_type', 'term'], col)
            X[col] = self._fill_from_group(X, col, ['loan_type'], col + '_loan')
            X[col] = X[col].round(3 if col == 'rate_of_interest' else 4)

        # Impute property_value and dtir1 with the overall fitted means
        X['property_value'] = X['property_value'].fillna(self.group_means['property_value_mean'])
        X['property_value'] = np.round(X['property_value'], -3)  # nearest thousand
        X['dtir1'] = X['dtir1'].fillna(self.group_means['dtir1_mean']).round(0)

        # Income: fitted per-age mean, then the overall fitted mean as a fallback
        X['income'] = self._fill_from_group(X, 'income', ['age'], 'income_by_age')
        X['income'] = X['income'].fillna(self.group_means['income_mean'])
        X['income'] = np.round(X['income'], -2)  # nearest hundred

        # LTV: derive from the (already imputed) property value when missing
        X['LTV'] = X['LTV'].fillna(X['loan_amount'] / X['property_value'] * 100).round(8)
        return X
# Categorical features cleaner
class CustomCleaner(BaseEstimator, TransformerMixin):
    """Drop unwanted columns, fix a known misspelling and normalise missing markers.

    Parameters
    ----------
    drop_cols : list of str, optional
        Columns to remove.  Columns that are not present in the frame are
        skipped rather than raising.
    missing_placeholders : list, optional
        Values to treat as missing in ``cat_cols``.  Defaults to
        ``['', 'NA', 'nan', 'NaN']``.
    cat_cols : list of str, optional
        Categorical columns in which placeholders are replaced by ``np.nan``.
        When omitted, no placeholder replacement is performed.
    """

    def __init__(self, drop_cols=None, missing_placeholders=None, cat_cols=None):
        self.drop_cols = drop_cols
        self.missing_placeholders = missing_placeholders if missing_placeholders is not None else ['', 'NA', 'nan', 'NaN']
        self.cat_cols = cat_cols  # list of categorical columns to fix missing values

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified."""
        X = X.copy()

        # Drop unwanted columns. errors='ignore' keeps this step tolerant of
        # inputs that lack some of them, matching the presence checks below.
        if self.drop_cols:
            X = X.drop(columns=self.drop_cols, errors='ignore')

        # Fix wrong spelling in 'Security_Type'
        if 'Security_Type' in X.columns:
            X['Security_Type'] = X['Security_Type'].replace({'Indriect': 'Indirect'})

        # Replace placeholders with np.nan in categorical columns
        if self.cat_cols:
            for col in self.cat_cols:
                if col in X.columns:
                    X[col] = X[col].replace(self.missing_placeholders, np.nan)

        return X


# NOTE(review): cat_cols is not passed here, so this instance performs no
# placeholder replacement -- confirm that is intended.
custom_cleaner = CustomCleaner(drop_cols=['ID', 'year', 'Gender', 'property_value', 'loan_amount', 'Interest_rate_spread'])
 # Load trained model and preprocessing objects
 gb_loaded = joblib.load('gradient_boosting_model.pkl')
     mask = df['term'].notnull()
     df_cleaned = df[mask].copy()
     # If target column exists in CSV, drop it
     if 'target' in df_cleaned.columns:
         df_cleaned = df_cleaned.drop(columns=['target'])