Spaces:
Running
Running
| """ | |
| Block 2: ML Health Classifier — USDA nutrition lookup + Logistic Regression scoring. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Any, Dict, Optional | |
| import numpy as np | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory two levels above this file; the default model bundle lives in its
# ``models`` sub-directory.
_PROJECT_ROOT = Path(__file__).parent.parent
_DEFAULT_MODEL_PATH = _PROJECT_ROOT.joinpath("models", "health_classifier.pkl")
@dataclass
class NutritionResult:
    """Nutrition result dataclass — kept for backward compatibility with tests.

    Attributes mirror the USDA-based nutritional values per 100g.

    Attributes:
        food_label: Name of the food the values belong to.
        energy_kcal: Energy in kcal.
        fat_g: Total fat in grams.
        saturated_fat_g: Saturated fat in grams.
        sugars_g: Sugars in grams.
        fiber_g: Fiber in grams.
        proteins_g: Protein in grams.
        salt_g: Salt in grams.
        health_score: Numeric health score (scale is not defined in this
            module — confirm with the producer of this record).
        health_label: Textual health label for the food.
        nutriscore: Optional Nutri-Score grade; ``None`` when unavailable.
    """

    food_label: str
    energy_kcal: float
    fat_g: float
    saturated_fat_g: float
    sugars_g: float
    fiber_g: float
    proteins_g: float
    salt_g: float
    health_score: float
    health_label: str
    nutriscore: Optional[str] = None
class MLModel:
    """USDA nutrition lookup + Logistic Regression health classifier.

    The model bundle (``health_classifier.pkl``) contains:
        - ``model``: fitted LogisticRegression
        - ``scaler``: fitted StandardScaler
        - ``label_encoder``: fitted LabelEncoder (healthy / medium / unhealthy)
        - ``feature_cols``: list of 16 feature column names
        - ``usda_nutrition``: dict of curated nutrition data per food class

    The bundle is read from disk lazily, on first use, and then cached on the
    instance.

    Args:
        model_path: Override path to the ``.pkl`` bundle.
    """

    def __init__(self, model_path: Optional[str] = None) -> None:
        self.model_path = Path(model_path) if model_path else _DEFAULT_MODEL_PATH
        # Filled in by ``_load`` the first time the bundle is needed.
        self._classifier: Optional[Dict[str, Any]] = None

    def _load(self) -> None:
        """Lazy-load the model bundle from disk using joblib.

        Raises:
            ImportError: If joblib is not installed.
            FileNotFoundError: If ``self.model_path`` does not exist.
        """
        try:
            import joblib
        except ImportError as exc:
            raise ImportError(
                "joblib is required. Run: pip install joblib"
            ) from exc

        if not self.model_path.exists():
            raise FileNotFoundError(
                f"Model bundle not found: {self.model_path}. "
                "Run notebook 03_ml_health_classifier.ipynb first."
            )

        logger.info("Loading ML model from %s ...", self.model_path)
        # SECURITY NOTE: joblib.load unpickles the file and can therefore
        # execute arbitrary code. Only point ``model_path`` at trusted bundles.
        self._classifier = joblib.load(self.model_path)
        logger.info(
            "ML model loaded (type: %s, accuracy: %.4f)",
            self._classifier.get("model_type"),
            self._classifier.get("test_accuracy", 0),
        )

    def _build_features(self, nutrition: Dict[str, float]) -> np.ndarray:
        """Compute 16 model features from 8 base USDA nutrients.

        Feature engineering mirrors Notebook 03 (cell 8 / cell 17).

        Args:
            nutrition: Mapping with the keys ``kcal``, ``fat``, ``sat_fat``,
                ``carbs``, ``sugar``, ``fiber``, ``protein`` and ``salt``.

        Returns:
            A ``(1, n_features)`` array whose columns follow the bundle's
            ``feature_cols`` order.

        Raises:
            KeyError: If a base nutrient is missing from ``nutrition`` or the
                bundle lists a feature column this method does not compute.
        """
        # Same lazy-load guard as ``predict`` so the helper also works when it
        # is called before any prediction has been made.
        if self._classifier is None:
            self._load()

        kcal = nutrition["kcal"]
        fat = nutrition["fat"]
        sat_fat = nutrition["sat_fat"]
        carbs = nutrition["carbs"]
        sugar = nutrition["sugar"]
        fiber = nutrition["fiber"]
        protein = nutrition["protein"]
        salt = nutrition["salt"]

        # The 1e-6 terms keep the ratios finite when a denominator is zero.
        feature_map: Dict[str, float] = {
            "kcal": kcal,
            "fat": fat,
            "sat_fat": sat_fat,
            "carbs": carbs,
            "sugar": sugar,
            "fiber": fiber,
            "protein": protein,
            "salt": salt,
            "sugar_to_carb_ratio": sugar / (carbs + 1e-6),
            "sat_fat_pct_of_fat": sat_fat / (fat + 1e-6),
            "calorie_density": kcal / 100,
            "protein_to_kcal": protein * 4 / (kcal + 1e-6),
            "fiber_to_carb_ratio": fiber / (carbs + 1e-6),
            "high_sugar": float(sugar > 15),
            "high_salt": float(salt > 1.5),
            "high_sat_fat": float(sat_fat > 5),
        }

        cols = self._classifier["feature_cols"]
        return np.array([[feature_map[c] for c in cols]])

    def predict(self, food_class: str) -> Dict[str, Any]:
        """Look up USDA nutrition and predict health label.

        Args:
            food_class: Food class name as returned by CVModel (e.g. ``"pizza"``).
                Matching ignores case and surrounding whitespace and treats
                spaces and hyphens as underscores.

        Returns:
            {
                "food_class": str,
                "nutrition": {"kcal": float, "fat": float, "sat_fat": float,
                              "carbs": float, "sugar": float, "fiber": float,
                              "protein": float, "salt": float},
                "health_label": str,  # "healthy" | "medium" | "unhealthy"
                "probabilities": {"healthy": float, "medium": float, "unhealthy": float}
            }
            ``nutrition`` is a copy, so callers may modify it freely.

        Raises:
            ValueError: If ``food_class`` is not in the USDA nutrition table.
            FileNotFoundError: If the model bundle is missing.
        """
        if self._classifier is None:
            self._load()
        bundle = self._classifier

        usda: Dict[str, Dict[str, float]] = bundle["usda_nutrition"]
        normalized = (
            food_class.strip().lower().replace(" ", "_").replace("-", "_")
        )
        if normalized not in usda:
            raise ValueError(
                f"Unknown food class: '{food_class}'. "
                f"Supported: {sorted(usda.keys())}"
            )
        nutrition = usda[normalized]

        model = bundle["model"]
        le = bundle["label_encoder"]

        X = self._build_features(nutrition)
        X_scaled = bundle["scaler"].transform(X)
        pred_idx = model.predict(X_scaled)[0]
        proba = model.predict_proba(X_scaled)[0]

        health_label = str(le.inverse_transform([pred_idx])[0])
        # predict_proba columns are ordered like ``model.classes_`` (encoded
        # label ids); decode exactly those ids so each probability is paired
        # with the right name even if a class was absent during training.
        class_names = le.inverse_transform(model.classes_)
        probabilities = {
            str(cls): round(float(p), 4)
            for cls, p in zip(class_names, proba)
        }

        return {
            "food_class": normalized,
            # Copy so callers cannot mutate the cached USDA table.
            "nutrition": dict(nutrition),
            "health_label": health_label,
            "probabilities": probabilities,
        }