"""SmartPlate data loading utilities.

This module provides stub functions for loading and preparing data for each
of the three AI blocks. Actual implementation happens in the training
notebooks; these stubs define the interface used by the pipeline.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Optional


def download_food101_subset(
    classes: list[str],
    output_dir: str,
    split: str = "train",
    max_samples_per_class: Optional[int] = None,
) -> dict[str, list[str]]:
    """Download a subset of the Food-101 dataset from HuggingFace Datasets.

    Pulls only the specified classes from Food-101, optionally capping the
    number of samples per class. Images are saved to ``output_dir`` under
    class-named subdirectories (e.g. ``output_dir/pizza/image001.jpg``).

    This is currently a stub: it always raises ``NotImplementedError``. The
    contract below describes the intended behaviour once implemented.

    Args:
        classes: List of Food-101 class names to include. Must be valid
            Food-101 labels (lowercase, underscores for spaces), e.g.
            ``["pizza", "sushi", "caesar_salad"]``.
        output_dir: Root directory where images will be saved. Will be
            created if it does not exist.
        split: Dataset split to download — ``"train"``, ``"validation"``,
            or ``"test"``. Defaults to ``"train"``.
            NOTE(review): the Hugging Face ``food101`` dataset appears to
            publish only ``"train"`` and ``"validation"``; confirm that
            ``"test"`` is available before relying on it.
        max_samples_per_class: If set, cap the number of images per class.
            Useful for quick experiments. ``None`` means no cap (all
            images).

    Returns:
        A dict mapping each class name to a list of absolute file paths of
        the downloaded images, e.g.::

            {
                "pizza": ["/data/raw/pizza/img_001.jpg", ...],
                "sushi": ["/data/raw/sushi/img_001.jpg", ...],
            }

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        ValueError: If any class name is not a valid Food-101 label
            (intended behaviour).
        OSError: If ``output_dir`` cannot be created (intended behaviour).

    Example:
        >>> paths = download_food101_subset(  # doctest: +SKIP
        ...     classes=["pizza", "sushi", "caesar_salad"],
        ...     output_dir="data/raw/food101",
        ...     split="train",
        ...     max_samples_per_class=100,
        ... )
        >>> len(paths["pizza"])  # doctest: +SKIP
        100
    """
    # TODO: Implement in notebook 01_eda_food101.ipynb, then port here.
    # Suggested implementation:
    #   from datasets import load_dataset
    #   ds = load_dataset("food101", split=split)
    #   label_names = ds.features["label"].names
    #   # list.index raises ValueError for an unknown label, which matches
    #   # the documented contract.
    #   class_indices = {label_names.index(name) for name in classes}
    #   ds_subset = ds.filter(lambda x: x["label"] in class_indices)
    #
    # The message names the same notebook as the TODO above so that the two
    # pointers do not disagree.
    raise NotImplementedError("Implement in notebook 01_eda_food101.ipynb first.")


def load_openfoodfacts_sample(
    food_name: str,
    max_results: int = 20,
    country: str = "world",
) -> list[dict]:
    """Query the Open Food Facts API for nutritional data about a food item.

    Searches Open Food Facts by product name and returns a list of matching
    products with their nutritional information per 100g.

    This is currently a stub: it always raises ``NotImplementedError``. The
    contract below describes the intended behaviour once implemented.

    Args:
        food_name: The dish or food item to search for, e.g.
            ``"pizza margherita"``. Accepts free-form text; the API handles
            partial matches.
        max_results: Maximum number of products to return. Defaults to 20.
        country: Country-specific endpoint to query. Defaults to
            ``"world"`` (global database). Use ``"ch"`` for Swiss products.

    Returns:
        A list of product dicts. Each dict contains at minimum::

            {
                "product_name": str,
                "energy_100g": float,  # kcal per 100g
                "fat_100g": float,
                "saturated_fat_100g": float,
                "sugars_100g": float,
                "fiber_100g": float,
                "proteins_100g": float,
                "salt_100g": float,
                "nutriscore_grade": str,  # "a" through "e", or None
            }

        Returns an empty list if no products are found.

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        requests.HTTPError: If the API request fails (intended behaviour).

    Example:
        >>> products = load_openfoodfacts_sample(  # doctest: +SKIP
        ...     "margherita pizza", max_results=5
        ... )
        >>> products[0]["energy_100g"]  # doctest: +SKIP
        266.0
    """
    # TODO: Implement in notebook 03_ml_health_classifier.ipynb, then port here.
    # Suggested implementation:
    #   import requests
    #   url = f"https://{country}.openfoodfacts.org/cgi/search.pl"
    #   params = {"search_terms": food_name, "json": 1, "page_size": max_results}
    #   response = requests.get(url, params=params, timeout=10)
    #   response.raise_for_status()  # surfaces the documented requests.HTTPError
    #   return [_extract_nutrients(p) for p in response.json()["products"]]
    #
    # NOTE(review): the raw API field ``energy_100g`` is presumably in kJ
    # (kcal lives in ``energy-kcal_100g``) and raw keys use hyphens such as
    # ``saturated-fat_100g``; _extract_nutrients must normalise both to the
    # schema documented above — confirm against the API docs.
    raise NotImplementedError("Implement in notebook 03_ml_health_classifier.ipynb first.")


def load_knowledge_base_documents(folder: str) -> list[dict]:
    """Load and parse all documents in the RAG knowledge base folder.

    Reads all ``.pdf``, ``.txt``, and ``.md`` files from the specified
    folder, extracts their text content, and returns a list of document
    dicts ready for chunking and embedding into ChromaDB.

    This is currently a stub: after validating ``folder`` it always raises
    ``NotImplementedError``. The contract below describes the intended
    behaviour once implemented.

    Args:
        folder: Path to the folder containing knowledge base documents.
            Typically ``data/knowledge_base/``. Subdirectories are not
            traversed — only top-level files are read.

    Returns:
        A list of document dicts, one per file::

            [
                {
                    "source": "who_healthy_diet.pdf",
                    "content": "A healthy diet helps protect against ...",
                    "metadata": {
                        "file_type": "pdf",
                        "num_pages": 4,
                        "file_size_kb": 112,
                    },
                },
                ...
            ]

        Returns an empty list if the folder contains no supported files.

    Raises:
        FileNotFoundError: If ``folder`` does not exist or is not a
            directory.
        NotImplementedError: Always, for a valid folder, until the
            implementation is ported.
        ValueError: If a file cannot be parsed, e.g. an encrypted PDF
            (intended behaviour).

    Example:
        >>> docs = load_knowledge_base_documents(  # doctest: +SKIP
        ...     "data/knowledge_base/"
        ... )
        >>> len(docs)  # doctest: +SKIP
        3
        >>> docs[0]["source"]  # doctest: +SKIP
        'who_healthy_diet.pdf'
    """
    # TODO: Implement in notebook 04_rag_setup.ipynb, then port here.
    # Suggested implementation:
    #   from pypdf import PdfReader
    #   for path in Path(folder).glob("*.pdf"):
    #       reader = PdfReader(path)
    #       text = "\n".join(page.extract_text() for page in reader.pages)
    #       docs.append({"source": path.name, "content": text, ...})
    folder_path = Path(folder)
    # is_dir() rather than exists(): a regular file at this path is not a
    # usable knowledge base folder and must be rejected up front.
    if not folder_path.is_dir():
        raise FileNotFoundError(f"Knowledge base folder not found: {folder}")
    raise NotImplementedError("Implement in notebook 04_rag_setup.ipynb first.")