Spaces:
Sleeping
Sleeping
File size: 2,378 Bytes
114e6ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
from __future__ import annotations
from typing import Any
import numpy as np
import pandas as pd
from app.main import (
DAYS_EMPLOYED_SENTINEL,
ENGINEERED_SOURCES,
IGNORE_FEATURES,
MISSING_INDICATOR_MIN_RATE,
OUTLIER_COLUMNS,
OUTLIER_LOWER_Q,
OUTLIER_UPPER_Q,
_apply_correlated_imputation,
_validate_numeric_inputs,
add_missingness_indicators,
apply_outlier_clipping,
compute_outlier_bounds,
new_features_creation,
select_missing_indicator_columns,
)
def preprocess_for_training(df_raw: pd.DataFrame, artifacts: Any) -> pd.DataFrame:
    """Turn a raw frame into the scaled feature matrix described by ``artifacts``.

    The input is copied, so ``df_raw`` itself is never modified. Processing
    order:

    1. Add every column of ``artifacts.required_input_columns`` that is
       missing, filled with NaN, then run ``_validate_numeric_inputs`` on
       ``artifacts.numeric_required_columns``.
    2. Set the ``is_train`` / ``is_test`` flags and a placeholder ``TARGET``
       (only when absent), then call ``new_features_creation``.
    3. Replace +/-inf with NaN and reindex to ``artifacts.columns_keep``.
    4. Add missingness indicators and clip outliers, using the values stored
       on ``artifacts`` when present and non-empty, otherwise values
       computed from this frame.
    5. Impute: ``_apply_correlated_imputation``, then per-column medians from
       ``artifacts.numeric_medians``, then ``'Unknown'`` for categoricals.
    6. One-hot encode the categorical columns, align to
       ``artifacts.features_to_scaled`` (absent columns become 0) and apply
       ``artifacts.scaler.transform``.

    Args:
        df_raw: Raw input rows.
        artifacts: Object exposing the attributes read above
            (``required_input_columns``, ``numeric_required_columns``,
            ``columns_keep``, ``numeric_medians``, ``categorical_columns``,
            ``features_to_scaled``, ``scaler`` and, optionally,
            ``missing_indicator_columns`` and ``outlier_bounds``).

    Returns:
        A DataFrame holding the scaler output, with columns
        ``artifacts.features_to_scaled`` and the index of the processed frame.
    """
    df = df_raw.copy()

    # Guarantee every expected input column exists before validation.
    for col in artifacts.required_input_columns:
        if col not in df.columns:
            df[col] = np.nan
    _validate_numeric_inputs(df, artifacts.numeric_required_columns)

    # NOTE(review): the flags mark rows as test (is_train=0, is_test=1) even
    # though this function is named "for_training" - confirm this is what
    # new_features_creation expects.
    df['is_train'] = 0
    df['is_test'] = 1
    if 'TARGET' not in df.columns:
        df['TARGET'] = 0

    df = new_features_creation(
        df,
        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
        engineered_sources=ENGINEERED_SOURCES,
    )
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)

    # Prefer the indicator columns stored on the artifacts; fall back to
    # selecting them from this frame when the attribute is missing or empty.
    indicator_cols = getattr(artifacts, 'missing_indicator_columns', None) or select_missing_indicator_columns(
        df,
        exclude_cols=set(IGNORE_FEATURES),
        min_missing_rate=MISSING_INDICATOR_MIN_RATE,
    )
    df = add_missingness_indicators(df, indicator_cols)

    # Same fallback rule for the clipping bounds.
    outlier_bounds = getattr(artifacts, 'outlier_bounds', {}) or compute_outlier_bounds(
        df,
        OUTLIER_COLUMNS,
        lower_q=OUTLIER_LOWER_Q,
        upper_q=OUTLIER_UPPER_Q,
    )
    df = apply_outlier_clipping(df, outlier_bounds)

    # The return value is not used; presumably this helper mutates ``df`` in
    # place - TODO confirm.
    _apply_correlated_imputation(df, artifacts)

    for col, median in artifacts.numeric_medians.items():
        if col in df.columns:
            # Coerce first so unparseable values become NaN and receive the median.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(median)

    # Only categorical columns that survived the reindex can be filled and
    # encoded. pd.get_dummies raises KeyError for column names that are not in
    # the frame, so the same filtered list is used for both steps.
    present_categoricals = [
        col for col in artifacts.categorical_columns if col in df.columns
    ]
    for col in present_categoricals:
        df[col] = df[col].fillna('Unknown')

    df_hot = pd.get_dummies(df, columns=present_categoricals)
    # Dummy columns that did not materialise in this frame are added as 0;
    # columns outside features_to_scaled are dropped.
    df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)

    scaled = artifacts.scaler.transform(df_hot)
    return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)
|