File size: 1,995 Bytes
"""Feature preparation utilities for training and app inference."""
from __future__ import annotations
import pandas as pd
from credit_risk.config import FEATURE_GROUPS, SELECTED_FEATURES, TARGET_COLUMN
def _validate_raw_columns(df: pd.DataFrame) -> None:
    """Raise early if the raw dataset lacks a column the pipeline needs.

    The required set is TARGET_COLUMN plus the ``source_column`` of every
    entry in FEATURE_GROUPS.

    Raises:
        ValueError: if any required column is absent from ``df``; the
            message lists the absent columns in sorted order.
    """
    needed = {group.source_column for group in FEATURE_GROUPS}
    needed.add(TARGET_COLUMN)

    absent = sorted(needed.difference(df.columns))
    if not absent:
        return
    raise ValueError(f"Missing required columns in raw dataset: {absent}")
def build_training_frame(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """
    Turn the raw dataset into a (features, target) pair for model training.

    Notes:
    - Only the source columns listed in FEATURE_GROUPS are one-hot encoded.
    - The encoded frame is reindexed to SELECTED_FEATURES, so the feature
      columns always come out in that fixed set and order.

    Raises:
        ValueError: via _validate_raw_columns when a required raw column is
            missing from ``df``.
    """
    _validate_raw_columns(df)

    source_columns = [group.source_column for group in FEATURE_GROUPS]
    raw_categoricals = df[source_columns]
    dummies = pd.get_dummies(raw_categoricals, columns=source_columns, dtype="int64")

    # A category that never occurs in this dataset still needs its model
    # column, so the reindex adds it filled with zeros.
    aligned = dummies.reindex(columns=SELECTED_FEATURES, fill_value=0)
    features = aligned.astype("int64")

    labels = df[TARGET_COLUMN].astype("int64")
    return features, labels
def build_inference_frame(selection_by_group: dict[str, str | None]) -> pd.DataFrame:
    """
    Build a one-row DataFrame in the model's column layout from app selections.

    ``selection_by_group`` maps a feature-group name to the chosen label, e.g.:
    {"Account Balance": "No account", "Purpose": None, ...}

    A group whose name is not a key of the dict is looked up as ``None``.
    Every column starts at 0; for each group, the column returned by
    ``group.column_from_label`` is set to 1 unless that result is ``None``.
    """
    row = dict.fromkeys(SELECTED_FEATURES, 0)

    for group in FEATURE_GROUPS:
        label = selection_by_group.get(group.name)
        column = group.column_from_label(label)
        if column is None:
            continue
        row[column] = 1

    return pd.DataFrame([row], columns=SELECTED_FEATURES, dtype="int64")
|