# smartplate/src/data_loader.py
# Last commit: c173dc3 - feat: deploy SmartPlate full pipeline (CV + ML + NLP)
"""
SmartPlate data loading utilities.
This module provides stub functions for loading and preparing data for each
of the three AI blocks. Actual implementation happens in the training notebooks;
these stubs define the interface used by the pipeline.
"""
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
def download_food101_subset(
    classes: list[str],
    output_dir: str,
    split: str = "train",
    max_samples_per_class: Optional[int] = None,
) -> dict[str, list[str]]:
    """Download a subset of the Food-101 dataset from HuggingFace Datasets.

    This is an interface stub: the body is not implemented yet and every call
    currently raises ``NotImplementedError``. The contract below describes the
    intended behaviour once the notebook implementation has been ported here.

    Pulls only the specified classes from Food-101, optionally capping the
    number of samples per class. Images are saved to ``output_dir`` under
    class-named subdirectories (e.g. ``output_dir/pizza/image001.jpg``).

    Args:
        classes: List of Food-101 class names to include.
            Must be valid Food-101 labels (lowercase, underscores for spaces),
            e.g. ``["pizza", "sushi", "caesar_salad"]``.
        output_dir: Root directory where images will be saved.
            Will be created if it does not exist.
        split: Dataset split to download - ``"train"``, ``"validation"``, or
            ``"test"``. Defaults to ``"train"``.
            NOTE(review): the Hugging Face ``food101`` dataset appears to ship
            only ``train`` and ``validation`` splits - confirm that ``"test"``
            is actually available before relying on it.
        max_samples_per_class: If set, cap the number of images per class.
            Useful for quick experiments. ``None`` means no cap (all images).

    Returns:
        A dict mapping each class name to a list of absolute file paths of
        the downloaded images, e.g.::

            {
                "pizza": ["/data/raw/pizza/img_001.jpg", ...],
                "sushi": ["/data/raw/sushi/img_001.jpg", ...],
            }

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        ValueError: (planned) If any class name is not a valid Food-101 label.
        OSError: (planned) If ``output_dir`` cannot be created.

    Example:
        Expected behaviour once implemented (skipped by doctest for now)::

            >>> paths = download_food101_subset(  # doctest: +SKIP
            ...     classes=["pizza", "sushi", "caesar_salad"],
            ...     output_dir="data/raw/food101",
            ...     split="train",
            ...     max_samples_per_class=100,
            ... )
            >>> len(paths["pizza"])  # doctest: +SKIP
            100
    """
    # TODO: Implement in notebook 01_eda_food101.ipynb, then port here.
    # Suggested implementation (sketch, untested):
    #   from datasets import load_dataset
    #   ds = load_dataset("food101", split=split)
    #   label_names = ds.features["label"].names
    #   # .index() raises ValueError for an unknown label, matching the contract.
    #   class_indices = {label_names.index(name) for name in classes}
    #   ds_subset = ds.filter(lambda x: x["label"] in class_indices)
    #
    # The message names the same notebook as the TODO above; it previously
    # pointed at 02_train_vit_cv.ipynb, which contradicted the TODO.
    raise NotImplementedError("Implement in notebook 01_eda_food101.ipynb first.")
def load_openfoodfacts_sample(
    food_name: str,
    max_results: int = 20,
    country: str = "world",
) -> list[dict]:
    """Look up nutritional data for a food item via the Open Food Facts API.

    Interface stub: the function currently always raises
    ``NotImplementedError``. The description below is the intended contract.

    The search is by product name; each match is returned together with its
    nutritional values per 100g.

    Args:
        food_name: Dish or food item to search for, e.g.
            ``"pizza margherita"``. Free-form text is accepted; partial
            matching is left to the API.
        max_results: Upper bound on the number of products returned.
            Defaults to 20.
        country: Country-specific endpoint to query. ``"world"`` (the
            default) targets the global database; use ``"ch"`` for Swiss
            products.

    Returns:
        A list of product dicts, empty when nothing matches. Each dict holds
        at least::

            {
                "product_name": str,
                "energy_100g": float,         # kcal per 100g
                "fat_100g": float,
                "saturated_fat_100g": float,
                "sugars_100g": float,
                "fiber_100g": float,
                "proteins_100g": float,
                "salt_100g": float,
                "nutriscore_grade": str,      # "a" through "e", or None
            }

        NOTE(review): the raw Open Food Facts ``energy_100g`` field is
        presumably in kJ (kcal is exposed as ``energy-kcal_100g``) - confirm
        the unit, or convert, when porting the implementation.

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        requests.HTTPError: (planned) If the API request fails.

    Example:
        Expected behaviour once implemented (skipped by doctest for now)::

            >>> products = load_openfoodfacts_sample(  # doctest: +SKIP
            ...     "margherita pizza", max_results=5
            ... )
            >>> products[0]["energy_100g"]  # doctest: +SKIP
            266.0
    """
    # TODO: Implement in notebook 03_ml_health_classifier.ipynb, then port here.
    # Suggested implementation:
    #   import requests
    #   url = f"https://{country}.openfoodfacts.org/cgi/search.pl"
    #   params = {"search_terms": food_name, "json": 1, "page_size": max_results}
    #   response = requests.get(url, params=params)
    #   return [_extract_nutrients(p) for p in response.json()["products"]]
    not_ported_message = "Implement in notebook 03_ml_health_classifier.ipynb first."
    raise NotImplementedError(not_ported_message)
def load_knowledge_base_documents(folder: str) -> list[dict]:
    """Load and parse all documents in the RAG knowledge base folder.

    Interface stub: only the folder precondition checks run today; after they
    pass the function raises ``NotImplementedError``. The description below is
    the intended contract once the notebook implementation is ported here.

    Reads all ``.pdf``, ``.txt``, and ``.md`` files from the specified folder,
    extracts their text content, and returns a list of document dicts ready
    for chunking and embedding into ChromaDB.

    Args:
        folder: Path to the folder containing knowledge base documents.
            Typically ``data/knowledge_base/``. Subdirectories are not
            traversed - only top-level files are read.

    Returns:
        A list of document dicts, one per file::

            [
                {
                    "source": "who_healthy_diet.pdf",
                    "content": "A healthy diet helps protect against ...",
                    "metadata": {
                        "file_type": "pdf",
                        "num_pages": 4,
                        "file_size_kb": 112,
                    },
                },
                ...
            ]

        Returns an empty list if the folder contains no supported files.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        NotADirectoryError: If ``folder`` exists but is not a directory.
        NotImplementedError: Whenever ``folder`` is a valid directory, until
            the implementation is ported.
        ValueError: (planned) If a file cannot be parsed (e.g., encrypted PDF).

    Example:
        Expected behaviour once implemented (skipped by doctest for now)::

            >>> docs = load_knowledge_base_documents(  # doctest: +SKIP
            ...     "data/knowledge_base/"
            ... )
            >>> len(docs)  # doctest: +SKIP
            3
            >>> docs[0]["source"]  # doctest: +SKIP
            'who_healthy_diet.pdf'
    """
    # TODO: Implement in notebook 04_rag_setup.ipynb, then port here.
    # Suggested implementation (sketch, untested):
    #   from pypdf import PdfReader
    #   docs = []
    #   for path in Path(folder).glob("*.pdf"):
    #       reader = PdfReader(path)
    #       text = "\n".join(page.extract_text() for page in reader.pages)
    #       docs.append({"source": path.name, "content": text, ...})
    kb_dir = Path(folder)
    if not kb_dir.exists():
        raise FileNotFoundError(f"Knowledge base folder not found: {folder}")
    # exists() is also true for a regular file; reject that explicitly so a
    # mistyped file path is not later treated as an empty knowledge base.
    if not kb_dir.is_dir():
        raise NotADirectoryError(f"Knowledge base path is not a folder: {folder}")
    raise NotImplementedError("Implement in notebook 04_rag_setup.ipynb first.")