""" Block 2: ML Health Classifier — USDA nutrition lookup + Logistic Regression scoring. """ from __future__ import annotations import logging from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, Optional import numpy as np logger = logging.getLogger(__name__) _PROJECT_ROOT = Path(__file__).parent.parent _DEFAULT_MODEL_PATH = _PROJECT_ROOT / "models" / "health_classifier.pkl" @dataclass class NutritionResult: """Nutrition result dataclass — kept for backward compatibility with tests. Attributes mirror the USDA-based nutritional values per 100g. """ food_label: str energy_kcal: float fat_g: float saturated_fat_g: float sugars_g: float fiber_g: float proteins_g: float salt_g: float health_score: float health_label: str nutriscore: Optional[str] = None class MLModel: """USDA nutrition lookup + Logistic Regression health classifier. The model bundle (``health_classifier.pkl``) contains: - ``model``: fitted LogisticRegression - ``scaler``: fitted StandardScaler - ``label_encoder``: fitted LabelEncoder (healthy / medium / unhealthy) - ``feature_cols``: list of 16 feature column names - ``usda_nutrition``: dict of curated nutrition data per food class Args: model_path: Override path to the ``.pkl`` bundle. """ def __init__(self, model_path: Optional[str] = None) -> None: self.model_path = Path(model_path) if model_path else _DEFAULT_MODEL_PATH self._classifier: Optional[Dict[str, Any]] = None def _load(self) -> None: """Lazy-load the model bundle from disk using joblib.""" try: import joblib except ImportError as exc: raise ImportError( "joblib is required. Run: pip install joblib" ) from exc if not self.model_path.exists(): raise FileNotFoundError( f"Model bundle not found: {self.model_path}. " "Run notebook 03_ml_health_classifier.ipynb first." ) logger.info("Loading ML model from %s ...", self.model_path) self._classifier = joblib.load(self.model_path) logger.info( "ML model loaded (type: %s, accuracy: %.4f)", self._classifier.get("model_type"), self._classifier.get("test_accuracy", 0), ) def _build_features(self, nutrition: Dict[str, float]) -> np.ndarray: """Compute 16 model features from 8 base USDA nutrients. Feature engineering mirrors Notebook 03 (cell 8 / cell 17). """ kcal = nutrition["kcal"] fat = nutrition["fat"] sat_fat = nutrition["sat_fat"] carbs = nutrition["carbs"] sugar = nutrition["sugar"] fiber = nutrition["fiber"] protein = nutrition["protein"] salt = nutrition["salt"] feature_map: Dict[str, float] = { "kcal": kcal, "fat": fat, "sat_fat": sat_fat, "carbs": carbs, "sugar": sugar, "fiber": fiber, "protein": protein, "salt": salt, "sugar_to_carb_ratio": sugar / (carbs + 1e-6), "sat_fat_pct_of_fat": sat_fat / (fat + 1e-6), "calorie_density": kcal / 100, "protein_to_kcal": protein * 4 / (kcal + 1e-6), "fiber_to_carb_ratio": fiber / (carbs + 1e-6), "high_sugar": float(sugar > 15), "high_salt": float(salt > 1.5), "high_sat_fat": float(sat_fat > 5), } cols = self._classifier["feature_cols"] return np.array([[feature_map[c] for c in cols]]) def predict(self, food_class: str) -> Dict[str, Any]: """Look up USDA nutrition and predict health label. Args: food_class: Food class name as returned by CVModel (e.g. ``"pizza"``). Returns: { "food_class": str, "nutrition": {"kcal": float, "fat": float, "sat_fat": float, "carbs": float, "sugar": float, "fiber": float, "protein": float, "salt": float}, "health_label": str, # "healthy" | "medium" | "unhealthy" "probabilities": {"healthy": float, "medium": float, "unhealthy": float} } Raises: ValueError: If ``food_class`` is not in the USDA nutrition table. FileNotFoundError: If the model bundle is missing. """ if self._classifier is None: self._load() bundle = self._classifier usda: Dict[str, Dict[str, float]] = bundle["usda_nutrition"] normalized = food_class.lower().replace(" ", "_").replace("-", "_") if normalized not in usda: raise ValueError( f"Unknown food class: '{food_class}'. " f"Supported: {sorted(usda.keys())}" ) nutrition = usda[normalized] X = self._build_features(nutrition) X_scaled = bundle["scaler"].transform(X) pred_idx = bundle["model"].predict(X_scaled)[0] proba = bundle["model"].predict_proba(X_scaled)[0] le = bundle["label_encoder"] health_label = str(le.inverse_transform([pred_idx])[0]) probabilities = { str(cls): round(float(p), 4) for cls, p in zip(le.classes_, proba) } return { "food_class": normalized, "nutrition": nutrition, "health_label": health_label, "probabilities": probabilities, }