# stunting-risk-model / feature_engineering.py
# Uploaded by: getachewgetu
# Upload message: "Upload feature_engineering.py with huggingface_hub"
# Revision: 721d7f2 (verified)
"""
Feature engineering and preprocessing utilities for the stunting risk ML pipeline.
Shared by the Jupyter notebook, training script, and RiskScorer inference class.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# ---------------------------------------------------------------------------
# Ordinal encoding maps (higher value = higher risk)
#
# Every tuple below lists its categories from lowest to highest risk; the
# code assigned to a category is its zero-based position in that tuple.
# ---------------------------------------------------------------------------
WATER_MAP = {
    category: code
    for code, category in enumerate((
        'piped_into_dwelling',
        'public_tap',
        'protected_well',
        'unprotected_well',
        'surface_water',
    ))
}

SANITATION_MAP = {
    category: code
    for code, category in enumerate((
        'improved',
        'basic',
        'unimproved',
        'none',
    ))
}

INCOME_MAP = {
    category: code
    for code, category in enumerate((
        'high',
        'medium',
        'low',
    ))
}
# Ordered feature list used by the model.
# prepare_feature_matrix() selects exactly these columns, in this order.
FEATURE_NAMES = [
    'avg_meal_count',       # raw input column; also feeds meal_x_water
    'water_source_enc',     # ordinal code added by encode_categoricals()
    'sanitation_tier_enc',  # ordinal code added by encode_categoricals()
    'income_band_enc',      # ordinal code added by encode_categoricals()
    'children_under5',      # raw input column
    'meal_x_water',         # avg_meal_count * water_source_enc
    'deprivation_index',    # water + sanitation + income ordinal codes
]
def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply ordinal encoding to water_source, sanitation_tier, income_band.

    Each source column is left in place and a sibling ``<column>_enc`` column
    holding the integer code from the matching map is added.

    Missing values (NaN/None) are not treated as errors: they are skipped
    during validation and remain NaN in the encoded column.

    Raises KeyError if any of the three source columns is absent.
    Raises ValueError for any unrecognised categorical value.
    Returns a copy of df with three new *_enc columns added.
    """
    encodings = (
        ('water_source', WATER_MAP),
        ('sanitation_tier', SANITATION_MAP),
        ('income_band', INCOME_MAP),
    )

    # Fail fast, before copying, with one message that names every absent
    # column rather than surfacing them one at a time from inside the loop.
    missing = [col for col, _ in encodings if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    df = df.copy()
    for col, mapping in encodings:
        # NaN is dropped here so that missing data does not count as an
        # "unrecognised" category; only real, unmapped values are rejected.
        unknown = set(df[col].dropna().unique()) - set(mapping.keys())
        if unknown:
            raise ValueError(
                f"Unrecognised value(s) in '{col}': {unknown}. "
                f"Expected one of {set(mapping.keys())}"
            )
        df[col + '_enc'] = df[col].map(mapping)
    return df
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive the interaction and composite features from the encoded columns.

    Requires avg_meal_count plus the three ordinal columns
    (water_source_enc, sanitation_tier_enc, income_band_enc) to be present.

    Returns a new DataFrame; the input is not modified.  Two columns are added:
    - meal_x_water:      avg_meal_count multiplied by water_source_enc
    - deprivation_index: sum of the three ordinal codes
    """
    meals = df['avg_meal_count']
    water = df['water_source_enc']
    sanitation = df['sanitation_tier_enc']
    income = df['income_band_enc']

    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(
        meal_x_water=meals * water,
        deprivation_index=water + sanitation + income,
    )
def build_preprocessor() -> StandardScaler:
    """
    Create the scaler used as the preprocessing step.

    The returned StandardScaler is constructed with no arguments and is not
    yet fitted; it is meant to be fit on the feature matrix, which is already
    numerically encoded by the time it reaches this step.
    """
    scaler = StandardScaler()
    return scaler
def prepare_feature_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn a raw frame into the model-ready feature matrix.

    Stages, applied in order:
    1. encode_categoricals - adds the *_enc ordinal columns
    2. engineer_features   - adds meal_x_water and deprivation_index
    3. column selection    - keeps only FEATURE_NAMES, in that order

    Returns a DataFrame with exactly FEATURE_NAMES columns.
    """
    enriched = df.pipe(encode_categoricals).pipe(engineer_features)
    return enriched[FEATURE_NAMES]