smartplate / src /ml_model.py
Gianone's picture
feat: deploy SmartPlate full pipeline (CV + ML + NLP)
c173dc3
Raw
History Blame Contribute Delete
5.66 kB
"""
Block 2: ML Health Classifier — USDA nutrition lookup + Logistic Regression scoring.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
import numpy as np
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Parent of the directory that contains this file.
_PROJECT_ROOT = Path(__file__).parent.parent
# Bundle location used by MLModel when no explicit model_path is supplied.
_DEFAULT_MODEL_PATH = _PROJECT_ROOT / "models" / "health_classifier.pkl"
@dataclass
class NutritionResult:
    """Nutrition values and health verdict for a single food label.

    Retained for backward compatibility with tests. The nutrient attributes
    mirror the USDA-based nutritional values per 100g.
    """

    # Identity of the food this record describes.
    food_label: str

    # Nutrient values (USDA-based, per 100g).
    energy_kcal: float
    fat_g: float
    saturated_fat_g: float
    sugars_g: float
    fiber_g: float
    proteins_g: float
    salt_g: float

    # Health assessment attached to the record.
    health_score: float
    health_label: str

    # Optional grade; defaults to None when not supplied.
    nutriscore: Optional[str] = None
class MLModel:
    """USDA nutrition lookup + Logistic Regression health classifier.

    The model bundle (``health_classifier.pkl``) contains:

    - ``model``: fitted LogisticRegression
    - ``scaler``: fitted StandardScaler
    - ``label_encoder``: fitted LabelEncoder (healthy / medium / unhealthy)
    - ``feature_cols``: list of 16 feature column names
    - ``usda_nutrition``: dict of curated nutrition data per food class

    The bundle is loaded lazily on first use and cached on the instance.

    Args:
        model_path: Override path to the ``.pkl`` bundle.
    """

    # Bundle entries read by ``predict`` / ``_build_features``; checked at load
    # time so a malformed bundle fails immediately instead of mid-prediction.
    _REQUIRED_KEYS = (
        "model",
        "scaler",
        "label_encoder",
        "feature_cols",
        "usda_nutrition",
    )

    def __init__(self, model_path: Optional[str] = None) -> None:
        self.model_path = Path(model_path) if model_path else _DEFAULT_MODEL_PATH
        # Populated by ``_load`` on first use.
        self._classifier: Optional[Dict[str, Any]] = None

    def _load(self) -> None:
        """Load the model bundle from disk using joblib and cache it.

        The bundle is validated before it is stored on the instance, so a
        failed load leaves ``self._classifier`` untouched and a later call
        can retry.

        Raises:
            ImportError: If joblib is not installed.
            FileNotFoundError: If ``self.model_path`` does not exist.
            KeyError: If the bundle lacks one of the required entries.
        """
        try:
            import joblib
        except ImportError as exc:
            raise ImportError(
                "joblib is required. Run: pip install joblib"
            ) from exc

        if not self.model_path.exists():
            raise FileNotFoundError(
                f"Model bundle not found: {self.model_path}. "
                "Run notebook 03_ml_health_classifier.ipynb first."
            )

        logger.info("Loading ML model from %s ...", self.model_path)
        # NOTE(security): joblib.load is pickle-based and can execute arbitrary
        # code while deserialising. Only point ``model_path`` at trusted files.
        bundle = joblib.load(self.model_path)

        missing = [key for key in self._REQUIRED_KEYS if key not in bundle]
        if missing:
            raise KeyError(
                f"Model bundle {self.model_path} is missing required keys: "
                f"{missing}"
            )

        # Cache only after validation succeeded.
        self._classifier = bundle
        logger.info(
            "ML model loaded (type: %s, accuracy: %.4f)",
            bundle.get("model_type"),
            bundle.get("test_accuracy", 0),
        )

    def _build_features(self, nutrition: Dict[str, float]) -> np.ndarray:
        """Compute 16 model features from 8 base USDA nutrients.

        Feature engineering mirrors Notebook 03 (cell 8 / cell 17).

        Args:
            nutrition: Mapping with the keys ``kcal``, ``fat``, ``sat_fat``,
                ``carbs``, ``sugar``, ``fiber``, ``protein`` and ``salt``.

        Returns:
            Array of shape ``(1, len(feature_cols))`` whose columns follow the
            order of the bundle's ``feature_cols`` list.

        Raises:
            KeyError: If a base nutrient is missing from ``nutrition`` or a
                name in ``feature_cols`` is not one of the computed features.
        """
        # The column order lives in the bundle, so make sure it is loaded.
        if self._classifier is None:
            self._load()

        kcal = nutrition["kcal"]
        fat = nutrition["fat"]
        sat_fat = nutrition["sat_fat"]
        carbs = nutrition["carbs"]
        sugar = nutrition["sugar"]
        fiber = nutrition["fiber"]
        protein = nutrition["protein"]
        salt = nutrition["salt"]

        # The 1e-6 terms keep the ratios finite when a denominator is zero.
        feature_map: Dict[str, float] = {
            "kcal": kcal,
            "fat": fat,
            "sat_fat": sat_fat,
            "carbs": carbs,
            "sugar": sugar,
            "fiber": fiber,
            "protein": protein,
            "salt": salt,
            "sugar_to_carb_ratio": sugar / (carbs + 1e-6),
            "sat_fat_pct_of_fat": sat_fat / (fat + 1e-6),
            "calorie_density": kcal / 100,
            # NOTE(review): the factor 4 is presumably kcal per gram of
            # protein; confirm against the notebook.
            "protein_to_kcal": protein * 4 / (kcal + 1e-6),
            "fiber_to_carb_ratio": fiber / (carbs + 1e-6),
            "high_sugar": float(sugar > 15),
            "high_salt": float(salt > 1.5),
            "high_sat_fat": float(sat_fat > 5),
        }

        cols = self._classifier["feature_cols"]
        return np.array([[feature_map[c] for c in cols]])

    def predict(self, food_class: str) -> Dict[str, Any]:
        """Look up USDA nutrition and predict health label.

        Args:
            food_class: Food class name as returned by CVModel (e.g. ``"pizza"``).
                It is lower-cased and spaces / hyphens are replaced with
                underscores before the lookup.

        Returns:
            {
                "food_class": str,   # normalized class name
                "nutrition": {"kcal": float, "fat": float, "sat_fat": float,
                              "carbs": float, "sugar": float, "fiber": float,
                              "protein": float, "salt": float},
                "health_label": str,  # "healthy" | "medium" | "unhealthy"
                "probabilities": {"healthy": float, "medium": float, "unhealthy": float}
            }

            ``nutrition`` is a copy of the bundle's entry, so callers may
            modify it without affecting later predictions.

        Raises:
            ValueError: If ``food_class`` is not in the USDA nutrition table.
            FileNotFoundError: If the model bundle is missing.
        """
        if self._classifier is None:
            self._load()
        bundle = self._classifier

        usda: Dict[str, Dict[str, float]] = bundle["usda_nutrition"]
        normalized = food_class.lower().replace(" ", "_").replace("-", "_")
        if normalized not in usda:
            raise ValueError(
                f"Unknown food class: '{food_class}'. "
                f"Supported: {sorted(usda)}"
            )

        # Shallow copy: the result is handed to the caller, and the original
        # dict is shared state cached inside the bundle.
        nutrition = dict(usda[normalized])

        X = self._build_features(nutrition)
        X_scaled = bundle["scaler"].transform(X)

        model = bundle["model"]
        pred_idx = model.predict(X_scaled)[0]
        proba = model.predict_proba(X_scaled)[0]

        le = bundle["label_encoder"]
        health_label = str(le.inverse_transform([pred_idx])[0])
        # Pairs each probability with the encoder class at the same position.
        probabilities = {
            str(cls): round(float(p), 4)
            for cls, p in zip(le.classes_, proba)
        }

        return {
            "food_class": normalized,
            "nutrition": nutrition,
            "health_label": health_label,
            "probabilities": probabilities,
        }