stunting-risk-model / feature_engineering.py

Upload feature_engineering.py with huggingface_hub

721d7f2 verified about 1 month ago

3.28 kB

	"""
	Feature engineering and preprocessing utilities for the stunting risk ML pipeline.
	Shared by the Jupyter notebook, training script, and RiskScorer inference class.
	"""

	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler
	from sklearn.pipeline import Pipeline

	# ---------------------------------------------------------------------------
	# Ordinal encoding maps (higher value = higher risk).
	# Each category is encoded as its position in the tuple it is listed in,
	# so the tuples are ordered from lowest-risk (0) to highest-risk.
	# ---------------------------------------------------------------------------
	WATER_MAP = {
	    category: rank
	    for rank, category in enumerate((
	        'piped_into_dwelling',
	        'public_tap',
	        'protected_well',
	        'unprotected_well',
	        'surface_water',
	    ))
	}

	SANITATION_MAP = {
	    category: rank
	    for rank, category in enumerate((
	        'improved',
	        'basic',
	        'unimproved',
	        'none',
	    ))
	}

	INCOME_MAP = {
	    category: rank
	    for rank, category in enumerate((
	        'high',
	        'medium',
	        'low',
	    ))
	}

	# Columns fed to the model, in the order the model uses them.
	FEATURE_NAMES = [
	    # Numeric input column.
	    "avg_meal_count",
	    # Ordinal encodings added by encode_categoricals.
	    "water_source_enc",
	    "sanitation_tier_enc",
	    "income_band_enc",
	    # Numeric input column.
	    "children_under5",
	    # Interaction / composite columns added by engineer_features.
	    "meal_x_water",
	    "deprivation_index",
	]


	def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Apply ordinal encoding to water_source, sanitation_tier, income_band.

	    Each column is looked up in its module-level map (WATER_MAP,
	    SANITATION_MAP, INCOME_MAP) and the result is written to a new
	    '<column>_enc' column. The input frame is not modified.

	    Missing values (NaN/None) are excluded from validation and come out
	    as NaN in the encoded column.

	    Args:
	        df: Frame containing the water_source, sanitation_tier and
	            income_band columns.

	    Returns:
	        A copy of df with water_source_enc, sanitation_tier_enc and
	        income_band_enc added.

	    Raises:
	        ValueError: If any of the three columns holds a non-null value
	            that is not a key of its map.
	    """
	    df = df.copy()

	    column_maps = (
	        ('water_source', WATER_MAP),
	        ('sanitation_tier', SANITATION_MAP),
	        ('income_band', INCOME_MAP),
	    )
	    for col, mapping in column_maps:
	        # Validate before mapping: Series.map would otherwise turn an
	        # unknown category into NaN without any error.
	        unknown = set(df[col].dropna().unique()) - set(mapping)
	        if unknown:
	            raise ValueError(
	                f"Unrecognised value(s) in '{col}': {unknown}. "
	                f"Expected one of {set(mapping.keys())}"
	            )
	        df[col + '_enc'] = df[col].map(mapping)

	    return df


	def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Add interaction and composite features.

	    Adds two columns:
	      * meal_x_water      = avg_meal_count * water_source_enc
	      * deprivation_index = water_source_enc + sanitation_tier_enc
	                            + income_band_enc

	    Args:
	        df: Frame that already contains avg_meal_count, water_source_enc,
	            sanitation_tier_enc and income_band_enc.

	    Returns:
	        A copy of df with meal_x_water and deprivation_index added; the
	        input frame is not modified.

	    Raises:
	        KeyError: If any required column is absent. The message lists
	            every missing column, not just the first one encountered.
	    """
	    required = (
	        'avg_meal_count',
	        'water_source_enc',
	        'sanitation_tier_enc',
	        'income_band_enc',
	    )
	    # Check up front (before copying) so the caller sees all missing
	    # prerequisites at once instead of a bare KeyError for the first one.
	    missing = [name for name in required if name not in df.columns]
	    if missing:
	        raise KeyError(
	            f"engineer_features is missing required column(s): {missing}"
	        )

	    df = df.copy()
	    df['meal_x_water'] = df['avg_meal_count'] * df['water_source_enc']
	    df['deprivation_index'] = (
	        df['water_source_enc']
	        + df['sanitation_tier_enc']
	        + df['income_band_enc']
	    )
	    return df


	def build_preprocessor() -> StandardScaler:
	    """
	    Create the scaler applied to the feature matrix.

	    Returns a new StandardScaler built with default arguments. It is
	    returned unfitted; it is meant to be fit on the feature matrix, which
	    is already numerically encoded before this step.
	    """
	    scaler = StandardScaler()
	    return scaler


	def prepare_feature_matrix(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Run the full preprocessing pipeline on a raw frame.

	    Steps, in order:
	      1. encode_categoricals  (adds the *_enc columns)
	      2. engineer_features    (adds meal_x_water and deprivation_index)
	      3. column selection down to FEATURE_NAMES

	    Returns a DataFrame with exactly the FEATURE_NAMES columns, in
	    FEATURE_NAMES order.
	    """
	    encoded = encode_categoricals(df)
	    enriched = engineer_features(encoded)
	    return enriched[FEATURE_NAMES]