File size: 2,378 Bytes
114e6ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from __future__ import annotations

from typing import Any

import numpy as np
import pandas as pd

from app.main import (
    DAYS_EMPLOYED_SENTINEL,
    ENGINEERED_SOURCES,
    IGNORE_FEATURES,
    MISSING_INDICATOR_MIN_RATE,
    OUTLIER_COLUMNS,
    OUTLIER_LOWER_Q,
    OUTLIER_UPPER_Q,
    _apply_correlated_imputation,
    _validate_numeric_inputs,
    add_missingness_indicators,
    apply_outlier_clipping,
    compute_outlier_bounds,
    new_features_creation,
    select_missing_indicator_columns,
)


def preprocess_for_training(df_raw: pd.DataFrame, artifacts: Any) -> pd.DataFrame:
    """Run the persisted feature pipeline over raw rows and return a scaled frame.

    Steps, in order: ensure required columns exist, validate numerics, add
    split/target flags, engineer features, align to the training column set,
    add missingness indicators, clip outliers, impute, one-hot encode, and
    scale with the fitted scaler.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Raw input rows; missing expected columns are created as NaN.
    artifacts : Any
        Fitted pipeline state (required/keep column lists, medians,
        categorical columns, scaler, feature order, and optionally
        `missing_indicator_columns` / `outlier_bounds`).

    Returns
    -------
    pd.DataFrame
        Scaled features with columns `artifacts.features_to_scaled`,
        indexed like the input.
    """
    df = df_raw.copy()

    # Guarantee every expected raw column exists so later steps never KeyError.
    for col in artifacts.required_input_columns:
        if col not in df.columns:
            df[col] = np.nan

    _validate_numeric_inputs(df, artifacts.numeric_required_columns)

    # NOTE(review): despite the function name, rows are flagged as the *test*
    # split (is_train=0, is_test=1) — confirm this is intentional for this
    # entry point.
    df['is_train'] = 0
    df['is_test'] = 1
    if 'TARGET' not in df.columns:
        df['TARGET'] = 0

    df = new_features_creation(
        df,
        days_employed_sentinel=DAYS_EMPLOYED_SENTINEL,
        engineered_sources=ENGINEERED_SOURCES,
    )
    # Engineered ratios can divide by zero; normalize infinities to NaN so the
    # imputation steps below handle them.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    df = df.reindex(columns=artifacts.columns_keep, fill_value=np.nan)

    # Use the persisted artifact whenever it is present — even if empty.
    # The previous `artifact or recompute(...)` pattern treated an empty
    # list/dict as "missing" and silently recomputed statistics from the
    # incoming batch, producing train/serve skew.
    indicator_cols = getattr(artifacts, 'missing_indicator_columns', None)
    if indicator_cols is None:
        indicator_cols = select_missing_indicator_columns(
            df,
            exclude_cols=set(IGNORE_FEATURES),
            min_missing_rate=MISSING_INDICATOR_MIN_RATE,
        )
    df = add_missingness_indicators(df, indicator_cols)

    outlier_bounds = getattr(artifacts, 'outlier_bounds', None)
    if outlier_bounds is None:
        outlier_bounds = compute_outlier_bounds(
            df,
            OUTLIER_COLUMNS,
            lower_q=OUTLIER_LOWER_Q,
            upper_q=OUTLIER_UPPER_Q,
        )
    df = apply_outlier_clipping(df, outlier_bounds)

    # Mutates df in place (return value intentionally ignored).
    _apply_correlated_imputation(df, artifacts)

    # Coerce to numeric (bad values -> NaN) and fill with training medians.
    for col, median in artifacts.numeric_medians.items():
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(median)

    for col in artifacts.categorical_columns:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')

    df_hot = pd.get_dummies(df, columns=artifacts.categorical_columns)
    # Align to the exact training feature order; dummy columns for categories
    # unseen here are created as 0, columns unseen at training time are dropped.
    df_hot = df_hot.reindex(columns=artifacts.features_to_scaled, fill_value=0)
    scaled = artifacts.scaler.transform(df_hot)
    return pd.DataFrame(scaled, columns=artifacts.features_to_scaled, index=df.index)