"""
OmniDiag — Base Feature Engineer
=================================
Abstract base class for disease-specific feature engineering modules.

All disease feature engineers must inherit from this class and implement
the engineer_heuristic() and engineer_clinical() methods.

This ensures a consistent interface across all diseases, making the system
predictable, testable, and extensible.
"""

from abc import ABC, abstractmethod
import pandas as pd
from typing import Dict


class BaseFeatureEngineer(ABC):
    """
    Abstract base class for disease-specific feature engineering.

    Each disease defines two feature engineering paths:
      1. Heuristic (Statistical): Mathematical interactions derived from
         raw features.
      2. Clinical (Medical): Risk scores based on domain-specific medical
         formulas.

    Usage (the subclass module is expected to ``import numpy as np``):
        class MyDiseaseFeatures(BaseFeatureEngineer):
            def engineer_heuristic(self, df):
                df['my_feature'] = df['A'] * df['B']
                return df

            def engineer_clinical(self, df):
                df['risk_score'] = np.exp(df['A'] * 0.1)
                return df
    """

    def __init__(self, config: dict):
        """
        Initialize with the disease-specific configuration dictionary.

        Args:
            config: The parsed YAML config dict for this disease.
                    Expected to have keys: disease, data, features,
                    model, etc.

        Attributes set:
            config: The config dict, stored as given.
            disease_name: ``config["disease"]["name"]``, or "unknown"
                when the section or key is missing.
            target_col: ``config["disease"]["target_column"]``, or
                "target" when the section or key is missing.
        """
        self.config = config
        # An empty ``disease:`` section in YAML parses to None, so the key
        # is present and a ``.get("disease", {})`` default would not kick
        # in. ``or {}`` makes that case fall back to the defaults instead
        # of raising AttributeError on None.
        disease_cfg = config.get("disease") or {}
        self.disease_name = disease_cfg.get("name", "unknown")
        self.target_col = disease_cfg.get("target_column", "target")

    @abstractmethod
    def engineer_heuristic(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate statistical/heuristic features from raw data.

        These are mathematical interactions (multiplication, ratios, etc.)
        that let the model discover non-linear patterns on its own.

        Args:
            df: Input DataFrame with base features.

        Returns:
            DataFrame with heuristic features added.
        """

    @abstractmethod
    def engineer_clinical(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Generate clinically-derived features from raw data.

        These are risk scores or indices based on established medical
        formulas with fixed coefficients.

        Args:
            df: Input DataFrame with base features.

        Returns:
            DataFrame with clinical features added.
        """

    def run_all(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """
        Run both feature engineering paths and return results as a dict.

        This is the main entry point for generating both datasets for
        A/B testing (heuristic vs clinical).

        Args:
            df: Input DataFrame with base features. It is not modified;
                each path receives its own copy.

        Returns:
            Dictionary with keys 'heuristic' and 'clinical', each
            containing the respective DataFrame.
        """
        # Each path gets an independent copy so that in-place column
        # additions by one path cannot leak into the other path or into
        # the caller's frame.
        return {
            "heuristic": self.engineer_heuristic(df.copy()),
            "clinical": self.engineer_clinical(df.copy()),
        }