| import pandas as pd |
| import numpy as np |
| import joblib |
| from sklearn.base import BaseEstimator, TransformerMixin |
|
|
|
|
| class CreditDataPreprocessor(BaseEstimator, TransformerMixin): |
|
|
| |
|
|
| def __init__(self, |
| NumberOfDependents_fill_value=0, |
| NumberOfDependents_up_threshold=10, |
| MonthlyIncome_fill_value=0, |
| RevolvingUtilizationOfUnsecuredLines_drop_threshold=2, |
| age_low_drop_threshold=18, |
| age_up_drop_threshold=80, |
| DebtRatio_up_threshold=5, |
| PastDueRiskScore_weights=[1.0, 1.2, 1.3], |
| NumberRealEstateLoansOrLines_drop_threshold=20, |
| drop_special_codes=False): |
| self.NumberOfDependents_fill_value = NumberOfDependents_fill_value |
| self.NumberOfDependents_up_threshold = NumberOfDependents_up_threshold |
|
|
| self.MonthlyIncome_fill_value = MonthlyIncome_fill_value |
|
|
| self.RevolvingUtilizationOfUnsecuredLines_drop_threshold = RevolvingUtilizationOfUnsecuredLines_drop_threshold |
|
|
| self.age_low_drop_threshold = age_low_drop_threshold |
| self.age_up_drop_threshold = age_up_drop_threshold |
|
|
| self.DebtRatio_up_threshold = DebtRatio_up_threshold |
|
|
| self.PastDueRiskScore_weights = PastDueRiskScore_weights |
|
|
| self.NumberRealEstateLoansOrLines_drop_threshold = NumberRealEstateLoansOrLines_drop_threshold |
|
|
| self.drop_special_codes = drop_special_codes |
|
|
| def fit(self, X, y=None): |
| return self |
|
|
| def transform(self, X): |
| X_copy = X.copy() |
|
|
| X_copy['NumberOfDependents'] = X_copy['NumberOfDependents'].fillna(value=self.NumberOfDependents_fill_value) |
| X_copy['NumberOfDependents'] = X_copy['NumberOfDependents'].clip(0, self.NumberOfDependents_up_threshold).copy() |
|
|
| X_copy['MonthlyIncomeIsMissing'] = 0 |
| X_copy.loc[X_copy['MonthlyIncome'].isna(), 'MonthlyIncomeIsMissing'] = 1 |
| X_copy['MonthlyIncome'] = X['MonthlyIncome'].fillna(value=self.MonthlyIncome_fill_value) |
|
|
| X_copy['RevolvingUtilizationOverOne'] = 0.0 |
| X_copy.loc[X_copy['RevolvingUtilizationOfUnsecuredLines'] > 1, 'RevolvingUtilizationOverOne'] = 1.0 |
| X_copy['RevolvingUtilizationOfUnsecuredLines'] = X_copy['RevolvingUtilizationOfUnsecuredLines'].clip(0, |
| 1).copy() |
|
|
| X_copy['DebtPayments'] = 0.0 |
| X_copy.loc[X_copy['MonthlyIncome'] == 0, 'DebtPayments'] = X_copy.loc[X_copy['MonthlyIncome'] == 0, 'DebtRatio'] |
| X_copy.loc[X_copy['MonthlyIncome'] != 0, 'DebtPayments'] = X_copy.loc[ |
| X_copy['MonthlyIncome'] != 0, 'DebtRatio'] * \ |
| X_copy.loc[ |
| X_copy['MonthlyIncome'] != 0, 'MonthlyIncome'] |
| X_copy['DebtRatio'] = X_copy['DebtRatio'].clip(0, self.DebtRatio_up_threshold).copy() |
|
|
| X_copy['DebtPayments_over_10k'] = 0.0 |
| X_copy.loc[X_copy['DebtPayments'] > 10000, 'DebtPayments_over_10k'] = 1.0 |
| X_copy['DebtPayments'] = X_copy['DebtPayments'].clip(0, 10000).copy() |
|
|
| X_copy['MonthlyIncome_over_20k'] = 0.0 |
| X_copy.loc[X_copy['MonthlyIncome'] >= 20000, 'MonthlyIncome_over_20k'] = 1.0 |
| X_copy['MonthlyIncome'] = X_copy['MonthlyIncome'].clip(0, 20000) |
|
|
| X_copy['Code96'] = 0.0 |
| X_copy['Code98'] = 0.0 |
| X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 96, 'Code96'] = 1.0 |
| X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 98, 'Code98'] = 1.0 |
|
|
| X_copy['PastDueRiskScore'] = ( |
| self.PastDueRiskScore_weights[0] * X_copy['NumberOfTime30-59DaysPastDueNotWorse'] + |
| self.PastDueRiskScore_weights[1] * X_copy['NumberOfTime60-89DaysPastDueNotWorse'] + |
| self.PastDueRiskScore_weights[2] * X_copy['NumberOfTimes90DaysLate']) |
| X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 96, 'PastDueRiskScore'] = 96 |
| X_copy.loc[X_copy['NumberOfTime30-59DaysPastDueNotWorse'] == 98, 'PastDueRiskScore'] = 98 |
| X_copy = X_copy.drop(columns=['NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse', |
| 'NumberOfTimes90DaysLate']) |
|
|
| X_copy['NumberOfOpenCreditLinesAndLoans_over_30'] = 0.0 |
| X_copy.loc[X_copy['NumberOfOpenCreditLinesAndLoans'] > 30, 'NumberOfOpenCreditLinesAndLoans_over_30'] = 1.0 |
| X_copy['NumberOfOpenCreditLinesAndLoans'] = X_copy['NumberOfOpenCreditLinesAndLoans'].clip(0, 30).copy() |
|
|
| X_copy['NumberRealEstateLoansOrLines_over_5'] = 0.0 |
| X_copy.loc[X_copy['NumberRealEstateLoansOrLines'] > 5, 'NumberRealEstateLoansOrLines_over_5'] = 1.0 |
| X_copy['NumberRealEstateLoansOrLines'] = X_copy['NumberRealEstateLoansOrLines'].clip(0, 5).copy() |
|
|
| X_copy['ConsumerCredit_Group'] = pd.cut(X_copy['NumberOfOpenCreditLinesAndLoans'], |
| bins=[0, 1, 2, 6, 15, 31], |
| labels=[ |
| '0_loans', |
| '1_loans', |
| '2-5_loans', |
| '6-14_loans', |
| '16-30_loans' |
| ]) |
| consumer_dummy = pd.get_dummies(X_copy['ConsumerCredit_Group'], prefix='Consumer', drop_first=False).astype( |
| 'float') |
|
|
| X_copy['RealEstateLoans_Group'] = pd.cut(X_copy['NumberRealEstateLoansOrLines'], |
| bins=[-1, 0, 3, 100], |
| labels=[ |
| '0_loans', |
| '1-3_loans', |
| '4+_loans', |
| ]) |
| estate_dummy = pd.get_dummies(X_copy['RealEstateLoans_Group'], prefix='RealEstateLoans', |
| drop_first=False).astype('float') |
|
|
| X_copy = pd.concat([X_copy, consumer_dummy, estate_dummy], axis=1).copy() |
| X_copy = X_copy.drop(columns=['ConsumerCredit_Group', |
| 'RealEstateLoans_Group']).copy() |
|
|
| X_copy = X_copy.drop(columns=['Consumer_6-14_loans', |
| 'RealEstateLoans_0_loans']).copy() |
|
|
| X_copy = X_copy.drop(columns=['NumberOfOpenCreditLinesAndLoans', |
| 'NumberRealEstateLoansOrLines', |
| 'MonthlyIncomeIsMissing', |
| 'MonthlyIncome_over_20k', |
| 'Consumer_0_loans', |
| 'NumberOfOpenCreditLinesAndLoans_over_30']).copy() |
|
|
| if self.drop_special_codes: |
| X_copy = X_copy.drop(columns=['Code96', 'Code98']) |
|
|
| return X_copy |
|
|
|
|
| def fit_transform(self, X, y=None): |
| return self.fit(X, y).transform(X) |
|
|
| def clean_train(self, X, y=None): |
| mask = ( |
| (X[ |
| 'RevolvingUtilizationOfUnsecuredLines'] <= self.RevolvingUtilizationOfUnsecuredLines_drop_threshold) & |
| (X['age'] >= self.age_low_drop_threshold) & |
| (X['age'] <= self.age_up_drop_threshold) & |
| (X['NumberRealEstateLoansOrLines'] <= self.NumberRealEstateLoansOrLines_drop_threshold) |
| ) |
|
|
| X_clean = X[mask].copy() |
|
|
| if y is not None: |
| y_clean = y[mask].copy() |
| return X_clean, y_clean |
|
|
| return X_clean |
|
|
| from sklearn.base import BaseEstimator, TransformerMixin |
| from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler |
|
|
| class CreditScaler(BaseEstimator, TransformerMixin): |
| """ |
| Масштабирует только не-булевые колонки. |
| Можно задать различные способы масштабирования |
| """ |
|
|
| def __init__(self, scaler_type='standard'): |
| """ |
| Параметр scaler_type - тип scaler'а. |
| |
| Доступные типы: |
| - 'standard': StandardScaler (среднее=0, дисперсия=1) |
| - 'robust': RobustScaler (устойчив к выбросам) |
| - 'minmax': MinMaxScaler (приводит к [0, 1]) |
| - 'maxabs': MaxAbsScaler (приводит к [-1, 1]) |
| """ |
|
|
| self.boolean_columns = [ |
| 'RevolvingUtilizationOverOne', |
| 'DebtPayments_over_10k', |
| 'Code96', |
| 'Code98', |
| 'NumberRealEstateLoansOrLines_over_5', |
| 'Consumer_1_loans', |
| 'Consumer_2-5_loans', |
| 'Consumer_16-30_loans', |
| 'RealEstateLoans_1-3_loans', |
| 'RealEstateLoans_4+_loans' |
| ] |
|
|
| self.scaler_type = scaler_type |
| self._create_scaler() |
|
|
| |
| self.columns_to_scale_ = None |
| self.n_features_in_ = None |
| self.feature_names_in_ = None |
|
|
| def _create_scaler(self): |
| """Создает scaler по типу""" |
| if self.scaler_type == 'standard': |
| self.scaler = StandardScaler() |
| elif self.scaler_type == 'robust': |
| self.scaler = RobustScaler() |
| elif self.scaler_type == 'minmax': |
| self.scaler = MinMaxScaler() |
| elif self.scaler_type == 'maxabs': |
| self.scaler = MaxAbsScaler() |
| else: |
| raise ValueError( |
| f"Unknown scaler_type: {self.scaler_type}. " |
| f"Available: standard, robust, minmax, maxabs" |
| ) |
|
|
| def fit(self, X, y=None): |
| """ |
| Определяет колонки для масштабирования (все, кроме булевых) |
| и обучает scaler. |
| """ |
|
|
| self.feature_names_in_ = X.columns.tolist() |
| self.n_features_in_ = len(self.feature_names_in_) |
|
|
| self.columns_to_scale_ = [ |
| col for col in self.feature_names_in_ |
| if col not in self.boolean_columns |
| ] |
|
|
| self.scaler.fit(X[self.columns_to_scale_]) |
| return self |
|
|
| def transform(self, X, y=None): |
| """ |
| Масштабирует только не-булевы колонки. |
| """ |
| X_copy = X.copy() |
|
|
| X_copy[self.columns_to_scale_] = self.scaler.transform(X_copy[self.columns_to_scale_]) |
|
|
| return X_copy |
|
|
| def fit_transform(self, X, y=None): |
| return self.fit(X, y).transform(X, y) |
|
|
| def get_feature_names_out(self, input_features=None): |
| """Для совместимости с sklearn""" |
| if input_features is not None: |
| return input_features |
| return self.feature_names_in_ if self.feature_names_in_ is not None else [] |
|
|
| def set_params(self, **params): |
| """Для совместимости с GridSearchCV""" |
| if 'scaler_type' in params and params['scaler_type'] != self.scaler_type: |
| self.scaler_type = params['scaler_type'] |
| self._create_scaler() |
| return super().set_params(**params) |
|
|
|
|
| def check_business_rules(age, monthly_income, monthly_debt, debt_ratio, |
| late_90, late_60_89, late_30_59, credit_lines, |
| real_estate, utilization, dependents): |
|
|
| |
| if age < 18: |
| return { |
| 'needs_manual': False, |
| 'message': 'Возраст менее 18 лет - кредит не выдаётся', |
| 'decision': 1 |
| } |
|
|
| |
| if (late_90 == 98) or (late_60_89 == 98) or (late_30_59 == 98): |
| return { |
| 'needs_manual': True, |
| 'message': 'Код 98: Списание долга как безнадежного', |
| 'decision': None |
| } |
|
|
| if (late_90 == 96) or (late_60_89 == 96) or (late_30_59 == 96): |
| return { |
| 'needs_manual': True, |
| 'message': 'Код 96: Изъятие залога или реализация имущества', |
| 'decision': None |
| } |
|
|
| |
| if age > 80: |
| return { |
| 'needs_manual': True, |
| 'message': 'Возраст > 80 лет - требуется ручной разбор (индивидуальные условия)', |
| 'decision': None |
| } |
|
|
| if monthly_income > 1000000: |
| return { |
| 'needs_manual': True, |
| 'message': 'Доход свыше 1,000,000 $ - требуется ручной разбор', |
| 'decision': None |
| } |
|
|
| if monthly_debt > 1000000: |
| return { |
| 'needs_manual': True, |
| 'message': 'Платежи свыше 1,000,000 $ - требуется ручной разбор', |
| 'decision': None |
| } |
|
|
| if utilization > 2: |
| return { |
| 'needs_manual': True, |
| 'message': 'Использование кредитных средств превышает 200%', |
| 'decision': None |
| } |
|
|
| if real_estate > 20: |
| return { |
| 'needs_manual': True, |
| 'message': 'Количество кредитов под залог недвижимости слишком велико - ручной разбор', |
| 'decision': None |
| } |
|
|
| |
| return { |
| 'needs_manual': False, |
| 'decision': None, |
| } |
|
|
|
|
|
|
|
|