Spaces:
Running
Running
| """ | |
| SmartPlate data loading utilities. | |
| This module provides stub functions for loading and preparing data for each | |
| of the three AI blocks. Actual implementation happens in the training notebooks; | |
| these stubs define the interface used by the pipeline. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from pathlib import Path | |
| from typing import Optional | |
def download_food101_subset(
    classes: list[str],
    output_dir: str,
    split: str = "train",
    max_samples_per_class: Optional[int] = None,
) -> dict[str, list[str]]:
    """Download a subset of the Food-101 dataset from HuggingFace Datasets.

    NOTE: this is currently a stub. It only fixes the interface used by the
    pipeline and always raises ``NotImplementedError``. Everything below
    describes the intended contract once the notebook code is ported here.

    Pulls only the specified classes from Food-101, optionally capping the
    number of samples per class. Images are saved to ``output_dir`` under
    class-named subdirectories (e.g. ``output_dir/pizza/image001.jpg``).

    Args:
        classes: List of Food-101 class names to include.
            Must be valid Food-101 labels (lowercase, underscores for spaces),
            e.g. ``["pizza", "sushi", "caesar_salad"]``.
        output_dir: Root directory where images will be saved.
            Will be created if it does not exist.
        split: Dataset split to download — ``"train"``, ``"validation"``, or
            ``"test"``. Defaults to ``"train"``.
            NOTE(review): the HuggingFace ``food101`` dataset appears to ship
            only ``train`` and ``validation`` splits — confirm that ``"test"``
            is really available before relying on it.
        max_samples_per_class: If set, cap the number of images per class.
            Useful for quick experiments. ``None`` means no cap (all images).

    Returns:
        A dict mapping each class name to a list of absolute file paths of
        the downloaded images, e.g.::

            {
                "pizza": ["/data/raw/pizza/img_001.jpg", ...],
                "sushi": ["/data/raw/sushi/img_001.jpg", ...],
            }

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        ValueError: (planned) If any class name is not a valid Food-101 label.
        OSError: (planned) If ``output_dir`` cannot be created.

    Example:
        >>> paths = download_food101_subset(  # doctest: +SKIP
        ...     classes=["pizza", "sushi", "caesar_salad"],
        ...     output_dir="data/raw/food101",
        ...     split="train",
        ...     max_samples_per_class=100,
        ... )
        >>> len(paths["pizza"])  # doctest: +SKIP
        100
    """
    # TODO: Implement in notebook 01_eda_food101.ipynb, then port here.
    # Suggested implementation:
    #   from datasets import load_dataset
    #   ds = load_dataset("food101", split=split)
    #   label_feature = ds.features["label"]
    #   class_indices = {label_feature.str2int(name) for name in classes}
    #   ds_subset = ds.filter(lambda x: x["label"] in class_indices)
    #
    # The message below names the same notebook as the TODO above; the two
    # previously disagreed (01_eda_food101 vs 02_train_vit_cv).
    raise NotImplementedError("Implement in notebook 01_eda_food101.ipynb first.")
def load_openfoodfacts_sample(
    food_name: str,
    max_results: int = 20,
    country: str = "world",
) -> list[dict]:
    """Query the Open Food Facts API for nutritional data about a food item.

    NOTE: this is currently a stub. It only fixes the interface used by the
    pipeline and always raises ``NotImplementedError``. Everything below
    describes the intended contract once the notebook code is ported here.

    Searches Open Food Facts by product name and returns the matching
    products together with their nutritional values per 100g.

    Args:
        food_name: Dish or food item to search for, e.g.
            ``"pizza margherita"``. Free-form text; partial matching is left
            to the API.
        max_results: Upper bound on the number of products returned.
            Defaults to 20.
        country: Country-specific endpoint to query. Defaults to ``"world"``
            (global database). Use ``"ch"`` for Swiss products.

    Returns:
        A list of product dicts. Each dict contains at minimum::

            {
                "product_name": str,
                "energy_100g": float,         # kcal per 100g
                "fat_100g": float,
                "saturated_fat_100g": float,
                "sugars_100g": float,
                "fiber_100g": float,
                "proteins_100g": float,
                "salt_100g": float,
                "nutriscore_grade": str,      # "a" through "e", or None
            }

        An empty list is returned when no products are found.

        NOTE(review): the raw Open Food Facts field ``energy_100g`` is
        presumably expressed in kJ (``energy-kcal_100g`` being the kcal
        value) — confirm, and make the extraction step convert accordingly.

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        requests.HTTPError: (planned) If the API request fails.

    Example:
        >>> products = load_openfoodfacts_sample(  # doctest: +SKIP
        ...     "margherita pizza", max_results=5
        ... )
        >>> products[0]["energy_100g"]  # doctest: +SKIP
        266.0
    """
    # TODO: Implement in notebook 03_ml_health_classifier.ipynb, then port here.
    # Suggested implementation:
    #   import requests
    #   url = f"https://{country}.openfoodfacts.org/cgi/search.pl"
    #   params = {"search_terms": food_name, "json": 1, "page_size": max_results}
    #   response = requests.get(url, params=params, timeout=10)
    #   response.raise_for_status()  # needed for the documented HTTPError
    #   return [_extract_nutrients(p) for p in response.json()["products"]]
    # (_extract_nutrients is a helper that still has to be written.)
    pending_message = "Implement in notebook 03_ml_health_classifier.ipynb first."
    raise NotImplementedError(pending_message)
def load_knowledge_base_documents(folder: str) -> list[dict]:
    """Load and parse all documents in the RAG knowledge base folder.

    NOTE: parsing is not implemented yet. The function currently validates
    ``folder`` and then raises ``NotImplementedError``. Everything below
    describes the intended contract once the notebook code is ported here.

    Reads all ``.pdf``, ``.txt``, and ``.md`` files from the specified folder,
    extracts their text content, and returns a list of document dicts ready
    for chunking and embedding into ChromaDB.

    Args:
        folder: Path to the folder containing knowledge base documents.
            Typically ``data/knowledge_base/``. Subdirectories are not
            traversed — only top-level files are read.

    Returns:
        A list of document dicts, one per file::

            [
                {
                    "source": "who_healthy_diet.pdf",
                    "content": "A healthy diet helps protect against ...",
                    "metadata": {
                        "file_type": "pdf",
                        "num_pages": 4,
                        "file_size_kb": 112,
                    },
                },
                ...
            ]

        Returns an empty list if the folder contains no supported files.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        NotADirectoryError: If ``folder`` exists but is not a directory.
        NotImplementedError: For any valid folder, until parsing is ported.
        ValueError: (planned) If a file cannot be parsed (e.g., encrypted PDF).

    Example:
        >>> docs = load_knowledge_base_documents(  # doctest: +SKIP
        ...     "data/knowledge_base/"
        ... )
        >>> len(docs)  # doctest: +SKIP
        3
        >>> docs[0]["source"]  # doctest: +SKIP
        'who_healthy_diet.pdf'
    """
    # TODO: Implement in notebook 04_rag_setup.ipynb, then port here.
    # Suggested implementation:
    #   from pypdf import PdfReader
    #   for path in Path(folder).glob("*.pdf"):
    #       reader = PdfReader(path)
    #       text = "\n".join(page.extract_text() for page in reader.pages)
    #       docs.append({"source": path.name, "content": text, ...})
    #   (.txt and .md files still need an equivalent branch, e.g. via
    #   path.read_text(encoding="utf-8").)
    folder_path = Path(folder)
    if not folder_path.exists():
        raise FileNotFoundError(f"Knowledge base folder not found: {folder}")
    # A regular file passes exists(); reject it explicitly so the planned
    # glob over the folder cannot silently yield "no documents".
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Knowledge base path is not a folder: {folder}")
    raise NotImplementedError("Implement in notebook 04_rag_setup.ipynb first.")