""" Feature engineering and preprocessing utilities for the stunting risk ML pipeline. Shared by the Jupyter notebook, training script, and RiskScorer inference class. """ import pandas as pd import numpy as np from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline # --------------------------------------------------------------------------- # Ordinal encoding maps (higher value = higher risk) # --------------------------------------------------------------------------- WATER_MAP = { 'piped_into_dwelling': 0, 'public_tap': 1, 'protected_well': 2, 'unprotected_well': 3, 'surface_water': 4, } SANITATION_MAP = { 'improved': 0, 'basic': 1, 'unimproved': 2, 'none': 3, } INCOME_MAP = { 'high': 0, 'medium': 1, 'low': 2, } # Ordered feature list used by the model FEATURE_NAMES = [ 'avg_meal_count', 'water_source_enc', 'sanitation_tier_enc', 'income_band_enc', 'children_under5', 'meal_x_water', 'deprivation_index', ] def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame: """ Apply ordinal encoding to water_source, sanitation_tier, income_band. Raises ValueError for any unrecognised categorical value. Returns a copy of df with three new *_enc columns added. """ df = df.copy() for col, mapping in [ ('water_source', WATER_MAP), ('sanitation_tier', SANITATION_MAP), ('income_band', INCOME_MAP), ]: unknown = set(df[col].dropna().unique()) - set(mapping.keys()) if unknown: raise ValueError( f"Unrecognised value(s) in '{col}': {unknown}. " f"Expected one of {set(mapping.keys())}" ) enc_col = col.replace('water_source', 'water_source_enc') \ .replace('sanitation_tier', 'sanitation_tier_enc') \ .replace('income_band', 'income_band_enc') # Build enc col name properly enc_col = col + '_enc' df[enc_col] = df[col].map(mapping) return df def engineer_features(df: pd.DataFrame) -> pd.DataFrame: """ Add interaction and composite features. Expects water_source_enc, sanitation_tier_enc, income_band_enc to already exist. Returns a copy with meal_x_water and deprivation_index added. """ df = df.copy() df['meal_x_water'] = df['avg_meal_count'] * df['water_source_enc'] df['deprivation_index'] = ( df['water_source_enc'] + df['sanitation_tier_enc'] + df['income_band_enc'] ) return df def build_preprocessor() -> StandardScaler: """ Returns a StandardScaler that will be fit on the feature matrix. The feature matrix is already numerically encoded before this step. """ return StandardScaler() def prepare_feature_matrix(df: pd.DataFrame) -> pd.DataFrame: """ Full preprocessing pipeline: 1. encode_categoricals 2. engineer_features 3. select FEATURE_NAMES columns Returns a DataFrame with exactly FEATURE_NAMES columns. """ df = encode_categoricals(df) df = engineer_features(df) return df[FEATURE_NAMES]