Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import Any | |
| import numpy as np | |
| import pandas as pd | |
| from app.main import ( | |
| DAYS_EMPLOYED_SENTINEL, | |
| ENGINEERED_SOURCES, | |
| IGNORE_FEATURES, | |
| MISSING_INDICATOR_MIN_RATE, | |
| OUTLIER_COLUMNS, | |
| OUTLIER_LOWER_Q, | |
| OUTLIER_UPPER_Q, | |
| _apply_correlated_imputation, | |
| _validate_numeric_inputs, | |
| add_missingness_indicators, | |
| apply_outlier_clipping, | |
| compute_outlier_bounds, | |
| new_features_creation, | |
| select_missing_indicator_columns, | |
| ) | |
def preprocess_for_training(df_raw: pd.DataFrame, artifacts: Any) -> pd.DataFrame:
    """Turn a raw input frame into the scaled feature matrix described by *artifacts*.

    The input is copied, so ``df_raw`` is never mutated. The steps run in this
    order: add missing required columns, validate numeric inputs, engineer
    features, align to ``artifacts.columns_keep``, add missingness indicators,
    clip outliers, impute, one-hot encode, align to
    ``artifacts.features_to_scaled`` and finally apply ``artifacts.scaler``.

    Args:
        df_raw: Raw input rows. Columns listed in
            ``artifacts.required_input_columns`` that are absent are created
            and filled with NaN.
        artifacts: Object exposing ``required_input_columns``,
            ``numeric_required_columns``, ``columns_keep``,
            ``numeric_medians`` (mapping column -> fill value),
            ``categorical_columns``, ``features_to_scaled`` and ``scaler``
            (anything with a ``transform`` method). The attributes
            ``missing_indicator_columns`` and ``outlier_bounds`` are optional.

    Returns:
        A DataFrame holding the scaler output, with columns
        ``artifacts.features_to_scaled`` and the same index as the input.
    """
    df = df_raw.copy()

    # Guarantee every required input column exists before validation.
    for col in artifacts.required_input_columns:
        if col not in df.columns:
            df[col] = np.nan

    # Return value is ignored; presumably this raises on invalid input - confirm.
    _validate_numeric_inputs(df, artifacts.numeric_required_columns)

    # NOTE(review): rows are flagged as test data (is_train=0, is_test=1) even
    # though this function is named for training - confirm that this is what
    # new_features_creation expects.
    df['is_train'] = 0
    df['is_test'] = 1
    if 'TARGET' not in df.columns:
        # Placeholder target; presumably required by new_features_creation.
        df['TARGET'] = 0

    df = new_features_creation(
        df,
        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
        engineered_sources=ENGINEERED_SOURCES,
    )

    # Infinite values are treated as missing from here on.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)

    # Prefer the stored indicator columns. A missing or empty value falls back
    # to selecting them from the current frame.
    # NOTE(review): an intentionally empty stored list also triggers the
    # fallback - confirm that is desired.
    indicator_cols = getattr(artifacts, 'missing_indicator_columns', None) or select_missing_indicator_columns(
        df,
        exclude_cols=set(IGNORE_FEATURES),
        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
    )
    df = add_missingness_indicators(df, indicator_cols)

    # Same fallback rule for the clipping bounds: missing or empty stored
    # bounds are recomputed from the current frame.
    outlier_bounds = getattr(artifacts, 'outlier_bounds', {}) or compute_outlier_bounds(
        df,
        OUTLIER_COLUMNS,
        lower_q=OUTLIER_LOWER_Q,
        upper_q=OUTLIER_UPPER_Q,
    )
    df = apply_outlier_clipping(df, outlier_bounds)

    # Return value is ignored; presumably this mutates df in place - confirm.
    _apply_correlated_imputation(df, artifacts)

    # Coerce to numeric first so unparseable values become NaN and are filled too.
    for col, median in artifacts.numeric_medians.items():
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(median)

    for col in artifacts.categorical_columns:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')

    # Encode only the categorical columns that are actually present:
    # get_dummies raises KeyError for absent ones, while the fill loop above
    # deliberately tolerates them. An explicit (possibly empty) list is passed
    # so pandas never falls back to dtype-based auto-detection.
    present_categoricals = [
        col for col in artifacts.categorical_columns if col in df.columns
    ]
    df_hot = pd.get_dummies(df, columns=present_categoricals)

    # Align to the expected feature layout; dummy columns not produced by this
    # frame are added as 0 and unexpected columns are dropped.
    df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)

    scaled = artifacts.scaler.transform(df_hot)
    return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)