File size: 1,995 Bytes
75b9644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Feature preparation utilities for training and app inference."""

from __future__ import annotations

import pandas as pd

from credit_risk.config import FEATURE_GROUPS, SELECTED_FEATURES, TARGET_COLUMN


def _validate_raw_columns(df: pd.DataFrame) -> None:
    """
    Check that the raw frame carries every column this module reads.

    The required set is TARGET_COLUMN plus the ``source_column`` of each
    entry in FEATURE_GROUPS.

    Raises:
        ValueError: if one or more required columns are absent. The message
            lists the missing names in sorted order.
    """
    expected = {group.source_column for group in FEATURE_GROUPS}
    expected.add(TARGET_COLUMN)

    absent = sorted(expected - set(df.columns))
    if not absent:
        return
    raise ValueError(f"Missing required columns in raw dataset: {absent}")


def build_training_frame(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """
    Turn the raw dataset into a ``(features, target)`` pair for model fitting.

    Notes:
    - Only the source columns named by FEATURE_GROUPS are one-hot encoded.
    - The encoded frame is reindexed to SELECTED_FEATURES, so the returned
      columns and their order are fixed by that list.

    Returns:
        A tuple of the int64 feature frame and the TARGET_COLUMN series cast
        to int64.

    Raises:
        ValueError: if a required raw column is missing from ``df``.
    """
    _validate_raw_columns(df)

    source_columns = [group.source_column for group in FEATURE_GROUPS]
    dummies = pd.get_dummies(
        df[source_columns],
        columns=source_columns,
        dtype="int64",
    )

    # Selected columns whose category never occurs in this dataset are added
    # as all-zero; dummy columns outside SELECTED_FEATURES are dropped.
    aligned = dummies.reindex(columns=SELECTED_FEATURES, fill_value=0)
    features = aligned.astype("int64")
    labels = df[TARGET_COLUMN].astype("int64")
    return features, labels


def build_inference_frame(selection_by_group: dict[str, str | None]) -> pd.DataFrame:
    """
    Build a single-row int64 DataFrame with the SELECTED_FEATURES columns.

    ``selection_by_group`` is keyed by group name and maps to the chosen
    label (or None), for example:
      {"Account Balance": "No account", "Purpose": None, ...}

    Every column starts at 0. For each entry in FEATURE_GROUPS the selected
    label is passed to ``group.column_from_label``; when that returns a
    column name, the column is set to 1. Groups absent from the dict are
    looked up as None.
    """
    row = dict.fromkeys(SELECTED_FEATURES, 0)

    for group in FEATURE_GROUPS:
        label = selection_by_group.get(group.name)
        column = group.column_from_label(label)
        if column is None:
            # NOTE(review): presumably means "no selection / unknown label";
            # confirm against column_from_label's definition.
            continue
        row[column] = 1

    return pd.DataFrame([row], columns=SELECTED_FEATURES, dtype="int64")