| import numpy as np |
| import pandas as pd |
| from typing import List, Dict, Optional |
| from sklearn.base import BaseEstimator, TransformerMixin |
| from pandas.api.types import is_numeric_dtype, is_object_dtype |
|
|
| from utils.constants import DEFAULT_COL_MAP |
|
|
|
|
class FeatureCreation(BaseEstimator, TransformerMixin):
    """
    Clinically grounded feature creation for diabetes outcome modeling.

    This transformer performs:
    --------------------------------------------------------------
    1. Missing-value handling (modeling phase only)
       Supported strategies:
           - "median"         -> numeric only
           - "mean"           -> numeric only
           - "most_frequent"  -> categorical only
           - "unknown"        -> fill categorical with "Unknown"
           - "none"           -> fill categorical with "None"
           - None             -> skip imputation
       Fill values are learned in fit() and reused by transform(), so new
       data is imputed with the training statistics.

    2. Clinical binning (categorical transformations)
           - Glucose: low / normal / prediabetic / diabetic
           - Pregnancies: nulliparous / uniparous / multiparous / grand /
             great-grand multiparous
           - Blood pressure: normal/elevated / stage1 / stage2
           - BMI: underweight / normal / overweight / obese
           - Age: young / middle-aged / older adult
           - Skin thickness: cohort-relative (25th-75th percentiles)
           - Insulin: cohort-relative (25th-75th percentiles)

    3. Interaction terms (numeric x numeric)
           - glucose x bmi
           - insulin x skin_thickness
           - age x diabetes_pedigree_function
           - pregnancies x glucose
           - pregnancies x bmi
           - pregnancies x age

    4. Schema locking
           - After fit(), the transformer remembers the full output schema.
           - transform() always returns columns in the same order.
           - Missing columns are added as NaN to maintain consistency.

    This transformer is intended for the **modeling phase**, not EDA.

    Attributes (set by fit)
    -----------------------
    col_map_ : dict
        The column -> strategy map actually used (constructor value, or
        DEFAULT_COL_MAP when the constructor value is None).
    fill_values_ : dict
        Learned fill value per imputed column.
    skin_p25_, skin_p75_, insulin_p25_, insulin_p75_ : float or None
        Cohort quartiles; None when the column is absent or has no
        numeric data.
    feature_names_out_ : list of str
        Locked output column order.
    """

    def __init__(
        self,
        col_map: Optional[Dict[str, str]] = None,
        drop_cols: Optional[List[str]] = None,
        create_bins: bool = True,
        create_interactions: bool = True,
    ):
        """
        Parameters
        ----------
        col_map : dict
            Mapping of column -> missing-value strategy.
            If None, DEFAULT_COL_MAP is used.

        drop_cols : list
            Raw columns to drop after features are created.

        create_bins : bool
            Whether to generate clinical binning features.

        create_interactions : bool
            Whether to generate interaction features.
        """
        self.col_map = col_map
        self.drop_cols = drop_cols
        self.create_bins = create_bins
        self.create_interactions = create_interactions

    # ------------------------------------------------------------------
    # Missing-value handling
    # ------------------------------------------------------------------
    @staticmethod
    def _column_kind(series):
        """Return ``(is_numeric, is_categorical)`` for a column."""
        is_numeric = is_numeric_dtype(series)
        is_categorical = (
            is_object_dtype(series) or series.dtype.name == "category"
        )
        return is_numeric, is_categorical

    @staticmethod
    def _check_strategy(col, strategy, is_numeric, is_categorical):
        """Raise ValueError when ``strategy`` is unknown or does not fit the column kind."""
        if strategy in ("unknown", "none"):
            if not is_categorical:
                raise ValueError(
                    f"Column '{col}' is numeric but strategy '{strategy}' "
                    "is for categorical variables only."
                )
        elif strategy == "most_frequent":
            if not is_categorical:
                raise ValueError(
                    f"'most_frequent' strategy is only valid for categorical columns. "
                    f"Column '{col}' is numeric."
                )
        elif strategy in ("median", "mean"):
            if not is_numeric:
                raise ValueError(
                    f"'{strategy}' strategy is only valid for numeric columns. "
                    f"Column '{col}' is categorical."
                )
        else:
            raise ValueError(
                f"Invalid missing strategy '{strategy}' for column '{col}'."
            )

    @staticmethod
    def _compute_fill_value(series, strategy):
        """Compute the fill value for an already validated strategy.

        Returns None when no value can be derived (e.g. the mode of an
        all-NaN column).
        """
        if strategy == "unknown":
            return "Unknown"
        if strategy == "none":
            return "None"
        if strategy == "most_frequent":
            modes = series.mode()
            # mode() is empty for an all-NaN column; .iloc[0] would raise.
            return modes.iloc[0] if len(modes) else None
        if strategy == "median":
            return series.median()
        return series.mean()

    def _learn_fill_values(self, df: pd.DataFrame, col_map) -> dict:
        """Validate every strategy in ``col_map`` against ``df`` and learn its fill value."""
        learned = {}
        for col, strategy in col_map.items():
            if col not in df.columns or strategy is None:
                continue
            is_numeric, is_categorical = self._column_kind(df[col])
            self._check_strategy(col, strategy, is_numeric, is_categorical)
            learned[col] = self._compute_fill_value(df[col], strategy)
        return learned

    def _handle_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply missing-value strategies defined in the column map.

        For each mapped column 'col' present in ``df``:
            - A boolean indicator ``col + "_missing"`` is added *before*
              imputation. It is True for NaN and, for numeric columns,
              also for zeros.
            - NaN values are then filled according to the strategy.
              Zeros are flagged but not replaced.

        Rules:
            - "median" and "mean"   -> numeric only
            - "most_frequent"       -> categorical only
            - "unknown" / "none"    -> categorical only
            - None                  -> skip

        Fill values learned by fit() are used when available; otherwise
        they are computed from ``df`` itself (pre-fit usage, or a column
        that was not present during fit).

        Returns
        -------
        pd.DataFrame
            A copy of ``df``; the input is never mutated.

        Raises
        ------
        ValueError
            If a strategy is unknown or incompatible with the column dtype.
        """
        df = df.copy()

        # Prefer the map resolved in fit(); fall back to the raw constructor
        # argument so this helper keeps working before fit().
        col_map = getattr(self, "col_map_", None)
        if col_map is None:
            col_map = self.col_map
        if col_map is None:
            return df

        learned = getattr(self, "fill_values_", None) or {}

        for col, strategy in col_map.items():
            if col not in df.columns or strategy is None:
                continue

            is_numeric, is_categorical = self._column_kind(df[col])

            # Indicator first, so it reflects the raw (pre-imputation) data.
            missing_mask = df[col].isna()
            if is_numeric:
                missing_mask = missing_mask | (df[col] == 0)
            df[f"{col}_missing"] = missing_mask

            self._check_strategy(col, strategy, is_numeric, is_categorical)

            if col in learned:
                fill_value = learned[col]
            else:
                fill_value = self._compute_fill_value(df[col], strategy)

            # Nothing usable to fill with (e.g. all-NaN training column).
            if fill_value is None or pd.isna(fill_value):
                continue

            # A 'category' column rejects values outside its categories,
            # so register the fill value before calling fillna().
            if (
                df[col].dtype.name == "category"
                and fill_value not in df[col].cat.categories
            ):
                df[col] = df[col].cat.add_categories([fill_value])

            df[col] = df[col].fillna(fill_value)

        return df

    # ------------------------------------------------------------------
    # Binning helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _cut_fixed(s, edges, labels, right):
        """Bin ``s`` (coerced to numeric) on fixed ``edges``."""
        return pd.cut(
            pd.to_numeric(s, errors="coerce"),
            bins=edges,
            labels=labels,
            right=right,
        )

    @staticmethod
    def _cut_cohort(s, lower, upper, labels):
        """Bin ``s`` into three right-closed bands split at ``lower`` / ``upper``.

        pd.cut requires strictly increasing, unique edges, so the cases
        where the quartiles coincide or are unavailable are handled by hand:
            - lower == upper: values <= threshold get the first label and
              values above it the last; the middle band is empty.
            - missing / NaN thresholds: every value is left unbinned (NaN).
        """
        values = pd.to_numeric(s, errors="coerce")
        usable = (
            lower is not None
            and upper is not None
            and not (np.isnan(lower) or np.isnan(upper))
        )
        if usable and lower < upper:
            return pd.cut(
                values, bins=[-np.inf, lower, upper, np.inf], labels=labels
            )

        values = pd.Series(values)
        if usable:
            codes = np.where(
                values.isna(), -1, np.where(values > upper, 2, 0)
            )
        else:
            codes = np.full(len(values), -1)
        binned = pd.Categorical.from_codes(
            codes, categories=labels, ordered=True
        )
        return pd.Series(binned, index=values.index)

    def _bin_glucose(self, s):
        """ADA 2025 diagnostic cutoffs (left-closed bands)."""
        return self._cut_fixed(
            s,
            [-np.inf, 70, 100, 126, np.inf],
            ["low", "normal", "prediabetic", "diabetic"],
            right=False,
        )

    def _bin_pregnancies(self, s):
        """Parity categories from obstetric epidemiology."""
        return self._cut_fixed(
            s,
            [-np.inf, 0.5, 1.5, 5.5, 9.5, np.inf],
            [
                "nulliparous",
                "uniparous",
                "multiparous",
                "grand_multiparous",
                "great_grand_multiparous",
            ],
            right=True,
        )

    def _bin_blood_pressure(self, s):
        """ACC/AHA diastolic BP categories (left-closed bands)."""
        return self._cut_fixed(
            s,
            [-np.inf, 80, 90, np.inf],
            ["normal_or_elevated", "stage1_htn", "stage2_htn"],
            right=False,
        )

    def _bin_bmi(self, s):
        """WHO BMI categories (left-closed bands)."""
        return self._cut_fixed(
            s,
            [-np.inf, 18.5, 25.0, 30.0, np.inf],
            ["underweight", "normal", "overweight", "obese"],
            right=False,
        )

    def _bin_age(self, s):
        """Epidemiologic age bands (left-closed bands)."""
        return self._cut_fixed(
            s,
            [-np.inf, 40, 60, np.inf],
            ["young_adult", "middle_aged", "older_adult"],
            right=False,
        )

    def _bin_skin(self, s):
        """Cohort-relative adiposity bins split at the learned quartiles."""
        return self._cut_cohort(
            s,
            self.skin_p25_,
            self.skin_p75_,
            ["low_adiposity", "typical_adiposity", "high_adiposity"],
        )

    def _bin_insulin(self, s):
        """Cohort-relative insulin bins split at the learned quartiles."""
        return self._cut_cohort(
            s,
            self.insulin_p25_,
            self.insulin_p75_,
            ["low_insulin", "typical_insulin", "high_insulin"],
        )

    def _add_binning(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add ``<col>_bin`` features for every known column present in ``df``.

        Assumes missing handling and percentile learning have already
        occurred. Cohort-relative bins are skipped when their quartiles
        were not learned (None).
        """
        df = df.copy()

        # Order matters: it defines the order of the new output columns.
        fixed_binners = (
            ("glucose", self._bin_glucose),
            ("pregnancies", self._bin_pregnancies),
            ("blood_pressure", self._bin_blood_pressure),
            ("bmi", self._bin_bmi),
            ("age", self._bin_age),
        )
        for col, binner in fixed_binners:
            if col in df:
                df[f"{col}_bin"] = binner(df[col])

        if "skin_thickness" in df and self.skin_p25_ is not None:
            df["skin_thickness_bin"] = self._bin_skin(df["skin_thickness"])

        if "insulin" in df and self.insulin_p25_ is not None:
            df["insulin_bin"] = self._bin_insulin(df["insulin"])

        return df

    # ------------------------------------------------------------------
    # Interactions
    # ------------------------------------------------------------------
    def _add_interactions(self, df):
        """Create clinically meaningful numeric x numeric interactions.

        A product column is added only when both source columns exist;
        sources are coerced to numeric (unparseable values become NaN).
        """
        df = df.copy()

        # (left column, right column, output name) - in output order.
        pairs = (
            ("glucose", "bmi", "glucose_x_bmi"),
            ("insulin", "skin_thickness", "insulin_x_skin_thickness"),
            ("age", "diabetes_pedigree_function", "age_x_dpf"),
            ("pregnancies", "glucose", "pregnancies_x_glucose"),
            ("pregnancies", "bmi", "pregnancies_x_bmi"),
            ("pregnancies", "age", "pregnancies_x_age"),
        )
        for left, right, out_name in pairs:
            if left in df and right in df:
                df[out_name] = pd.to_numeric(
                    df[left], errors="coerce"
                ) * pd.to_numeric(df[right], errors="coerce")

        return df

    # ------------------------------------------------------------------
    # Estimator API
    # ------------------------------------------------------------------
    @staticmethod
    def _learn_quartiles(df, col):
        """Return the (25th, 75th) percentiles of ``df[col]``, or (None, None).

        (None, None) is returned when the column is absent or has no
        numeric data, so that the corresponding bin feature is skipped.
        """
        if col not in df:
            return None, None
        numeric = pd.to_numeric(df[col], errors="coerce")
        lower = float(numeric.quantile(0.25))
        upper = float(numeric.quantile(0.75))
        if np.isnan(lower) or np.isnan(upper):
            return None, None
        return lower, upper

    def fit(self, X, y=None):
        """
        Learn imputation fill values and cohort percentiles for
        skin_thickness and insulin, then lock the final output schema.

        Parameters
        ----------
        X : pd.DataFrame
            Training data.
        y : ignored

        Returns
        -------
        self
        """
        # Reset fitted state; an empty schema disables enforcement while
        # transform() is used below to discover the output columns.
        self.feature_names_out_ = []

        # Constructor parameters stay untouched; the resolved map is
        # kept as a fitted attribute instead.
        self.col_map_ = dict(
            DEFAULT_COL_MAP if self.col_map is None else self.col_map
        )
        self.fill_values_ = self._learn_fill_values(X, self.col_map_)

        # Percentiles are learned on the imputed training data.
        imputed = self._handle_missing(X)
        self.skin_p25_, self.skin_p75_ = self._learn_quartiles(
            imputed, "skin_thickness"
        )
        self.insulin_p25_, self.insulin_p75_ = self._learn_quartiles(
            imputed, "insulin"
        )

        # Run the full pipeline on the raw input so that features derived
        # from columns listed in drop_cols are part of the locked schema
        # (drop_cols are removed only after feature creation).
        self.feature_names_out_ = list(self.transform(X).columns)

        return self

    def transform(self, X):
        """
        Apply missing handling, binning, and interactions, drop
        ``drop_cols``, and enforce the schema locked by fit().

        Parameters
        ----------
        X : pd.DataFrame

        Returns
        -------
        pd.DataFrame
            A new frame; columns missing from the locked schema are added
            as NaN and extra columns are removed.
        """
        df = self._handle_missing(X)  # works on a copy of X

        if self.create_bins:
            df = self._add_binning(df)

        if self.create_interactions:
            df = self._add_interactions(df)

        to_drop = [c for c in (self.drop_cols or []) if c in df]
        df = df.drop(columns=to_drop)

        if self.feature_names_out_:
            # reindex adds absent columns as NaN and fixes the order.
            df = df.reindex(columns=self.feature_names_out_)

        return df

    def get_feature_names_out(self, input_features=None):
        """Return the locked output schema as an object array of names."""
        return np.asarray(self.feature_names_out_, dtype=object)

    def set_output(self, *, transform: Optional[str] = None):
        """No-op: ``transform`` is ignored and ``self`` is returned unchanged."""
        return self
|
|
|
|
class FeatureCreationEda(BaseEstimator, TransformerMixin):
    """
    EDA-only feature creation for the Pima Indians Diabetes dataset.

    Characteristics:
        - No imputation (zeros preserved except for missingness flags)
        - No interactions
        - No schema locking
        - Only:
            * Missingness exposure
            * Clinical binning
            * Cohort-relative binning for insulin & skinfold thickness

    Cohort-relative bins are produced only after fit() has learned the
    quartiles; before that (or when a column has no numeric data) they
    are skipped.
    """

    # Columns that receive a ``<col>_missing`` flag (NaN or zero).
    _FLAGGED_COLUMNS = (
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
    )

    def __init__(self):
        # Thresholds start as None so transform() can run before fit()
        # and simply omit the cohort-relative bins.
        self.skin_p25_ = None
        self.skin_p75_ = None
        self.insulin_p25_ = None
        self.insulin_p75_ = None

    # ------------------------------------------------------------------
    # Fitting
    # ------------------------------------------------------------------
    @staticmethod
    def _learn_quartiles(df, col):
        """Return the (25th, 75th) percentiles of ``df[col]``, or (None, None).

        (None, None) is returned when the column is absent or has no
        numeric data, so that the corresponding bin feature is skipped.
        """
        if col not in df:
            return None, None
        numeric = pd.to_numeric(df[col], errors="coerce")
        lower = float(numeric.quantile(0.25))
        upper = float(numeric.quantile(0.75))
        if np.isnan(lower) or np.isnan(upper):
            return None, None
        return lower, upper

    def fit(self, X: pd.DataFrame, y=None):
        """Learn cohort quartiles for skin_thickness and insulin.

        Thresholds are reset on every call, so refitting on a frame that
        lacks a column clears the previously learned values.
        """
        self.skin_p25_, self.skin_p75_ = self._learn_quartiles(
            X, "skin_thickness"
        )
        self.insulin_p25_, self.insulin_p75_ = self._learn_quartiles(
            X, "insulin"
        )
        return self

    # ------------------------------------------------------------------
    # Binning helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _cut_fixed(s, edges, labels, right):
        """Bin ``s`` (coerced to numeric) on fixed ``edges``."""
        return pd.cut(
            pd.to_numeric(s, errors="coerce"),
            bins=edges,
            labels=labels,
            right=right,
        )

    @staticmethod
    def _cut_cohort(s, lower, upper, labels):
        """Bin ``s`` into three right-closed bands split at ``lower`` / ``upper``.

        pd.cut requires strictly increasing, unique edges, so the cases
        where the quartiles coincide or are unavailable are handled by hand:
            - lower == upper: values <= threshold get the first label and
              values above it the last; the middle band is empty.
            - missing / NaN thresholds: every value is left unbinned (NaN).
        """
        values = pd.to_numeric(s, errors="coerce")
        usable = (
            lower is not None
            and upper is not None
            and not (np.isnan(lower) or np.isnan(upper))
        )
        if usable and lower < upper:
            return pd.cut(
                values, bins=[-np.inf, lower, upper, np.inf], labels=labels
            )

        values = pd.Series(values)
        if usable:
            codes = np.where(
                values.isna(), -1, np.where(values > upper, 2, 0)
            )
        else:
            codes = np.full(len(values), -1)
        binned = pd.Categorical.from_codes(
            codes, categories=labels, ordered=True
        )
        return pd.Series(binned, index=values.index)

    def _bin_glucose(self, s):
        """Glucose bands: low / normal / prediabetic / diabetic (left-closed)."""
        return self._cut_fixed(
            s,
            [-np.inf, 70, 100, 126, np.inf],
            ["low", "normal", "prediabetic", "diabetic"],
            right=False,
        )

    def _bin_pregnancies(self, s):
        """Parity bands from nulliparous to great-grand multiparous."""
        return self._cut_fixed(
            s,
            [-np.inf, 0.5, 1.5, 5.5, 9.5, np.inf],
            [
                "nulliparous",
                "uniparous",
                "multiparous",
                "grand_multiparous",
                "great_grand_multiparous",
            ],
            right=True,
        )

    def _bin_blood_pressure(self, s):
        """Blood-pressure bands split at 80 and 90 (left-closed)."""
        return self._cut_fixed(
            s,
            [-np.inf, 80, 90, np.inf],
            ["normal_or_elevated", "stage1_htn", "stage2_htn"],
            right=False,
        )

    def _bin_bmi(self, s):
        """BMI bands split at 18.5, 25 and 30 (left-closed)."""
        return self._cut_fixed(
            s,
            [-np.inf, 18.5, 25.0, 30.0, np.inf],
            ["underweight", "normal", "overweight", "obese"],
            right=False,
        )

    def _bin_age(self, s):
        """Age bands split at 40 and 60 (left-closed)."""
        return self._cut_fixed(
            s,
            [-np.inf, 40, 60, np.inf],
            ["young_adult", "middle_aged", "older_adult"],
            right=False,
        )

    def _bin_skin(self, s):
        """Cohort-relative adiposity bins split at the learned quartiles."""
        return self._cut_cohort(
            s,
            self.skin_p25_,
            self.skin_p75_,
            ["low_adiposity", "typical_adiposity", "high_adiposity"],
        )

    def _bin_insulin(self, s):
        """Cohort-relative insulin bins split at the learned quartiles."""
        return self._cut_cohort(
            s,
            self.insulin_p25_,
            self.insulin_p75_,
            ["low_insulin", "typical_insulin", "high_insulin"],
        )

    # ------------------------------------------------------------------
    # Transform
    # ------------------------------------------------------------------
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with missingness flags and bin features added.

        Raw values are left untouched. Flags come first, then the bins,
        each only for columns that are present in ``X``.
        """
        df = X.copy()

        # Missingness exposure: NaN or zero counts as missing.
        for col in self._FLAGGED_COLUMNS:
            if col in df:
                df[f"{col}_missing"] = df[col].isna() | (df[col] == 0)

        # Order matters: it defines the order of the new output columns.
        fixed_binners = (
            ("glucose", self._bin_glucose),
            ("pregnancies", self._bin_pregnancies),
            ("blood_pressure", self._bin_blood_pressure),
            ("bmi", self._bin_bmi),
            ("age", self._bin_age),
        )
        for col, binner in fixed_binners:
            if col in df:
                df[f"{col}_bin"] = binner(df[col])

        if "skin_thickness" in df and self.skin_p25_ is not None:
            df["skin_thickness_bin"] = self._bin_skin(df["skin_thickness"])

        if "insulin" in df and self.insulin_p25_ is not None:
            df["insulin_bin"] = self._bin_insulin(df["insulin"])

        return df
|
|