"""
SmartPlate data loading utilities.
This module provides stub functions for loading and preparing data for each
of the three AI blocks. Actual implementation happens in the training notebooks;
these stubs define the interface used by the pipeline.
"""
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
def download_food101_subset(
    classes: list[str],
    output_dir: str,
    split: str = "train",
    max_samples_per_class: Optional[int] = None,
) -> dict[str, list[str]]:
    """Download a subset of the Food-101 dataset from HuggingFace Datasets.

    This is currently an interface stub: it performs no work and always
    raises ``NotImplementedError``. The contract below describes the
    intended behaviour once the notebook implementation is ported here.

    Pulls only the specified classes from Food-101, optionally capping the
    number of samples per class. Images are saved to ``output_dir`` under
    class-named subdirectories (e.g. ``output_dir/pizza/image001.jpg``).

    Args:
        classes: List of Food-101 class names to include.
            Must be valid Food-101 labels (lowercase, underscores for
            spaces), e.g. ``["pizza", "sushi", "caesar_salad"]``.
        output_dir: Root directory where images will be saved.
            Will be created if it does not exist.
        split: Dataset split to download — ``"train"``, ``"validation"``, or
            ``"test"``. Defaults to ``"train"``.
        max_samples_per_class: If set, cap the number of images per class.
            Useful for quick experiments. ``None`` means no cap (all images).

    Returns:
        A dict mapping each class name to a list of absolute file paths of
        the downloaded images, e.g.::

            {
                "pizza": ["/data/raw/pizza/img_001.jpg", ...],
                "sushi": ["/data/raw/sushi/img_001.jpg", ...],
            }

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        ValueError: (Planned) If any class name is not a valid Food-101 label.
        OSError: (Planned) If ``output_dir`` cannot be created.

    Example:
        >>> paths = download_food101_subset(  # doctest: +SKIP
        ...     classes=["pizza", "sushi", "caesar_salad"],
        ...     output_dir="data/raw/food101",
        ...     split="train",
        ...     max_samples_per_class=100,
        ... )
        >>> len(paths["pizza"])  # doctest: +SKIP
        100
    """
    # TODO: Implement in notebook 01_eda_food101.ipynb, then port here.
    # Suggested implementation:
    #   from datasets import load_dataset
    #   ds = load_dataset("food101", split=split)
    #   label_names = ds.features["label"].names
    #   unknown = sorted(set(classes) - set(label_names))
    #   if unknown:
    #       raise ValueError(f"Not Food-101 labels: {unknown}")
    #   class_indices = {label_names.index(name) for name in classes}
    #   ds_subset = ds.filter(lambda x: x["label"] in class_indices)
    # NOTE(review): confirm which split names the hub dataset exposes; the
    # docstring lists "test", but that split may not exist for "food101".
    #
    # The message names the same notebook as the TODO above so that readers
    # are not sent to two different places.
    raise NotImplementedError("Implement in notebook 01_eda_food101.ipynb first.")
def load_openfoodfacts_sample(
    food_name: str,
    max_results: int = 20,
    country: str = "world",
) -> list[dict]:
    """Query the Open Food Facts API for nutritional data about a food item.

    This is currently an interface stub: it performs no request and always
    raises ``NotImplementedError``. The contract below describes the
    intended behaviour once the notebook implementation is ported here.

    Searches Open Food Facts by product name and returns a list of matching
    products with their nutritional information per 100g.

    Args:
        food_name: The dish or food item to search for, e.g.
            ``"pizza margherita"``. Accepts free-form text; the API handles
            partial matches.
        max_results: Maximum number of products to return. Defaults to 20.
        country: Country-specific endpoint to query. Defaults to ``"world"``
            (global database). Use ``"ch"`` for Swiss products.

    Returns:
        A list of product dicts. Each dict contains at minimum::

            {
                "product_name": str,
                "energy_100g": float,  # kcal per 100g
                "fat_100g": float,
                "saturated_fat_100g": float,
                "sugars_100g": float,
                "fiber_100g": float,
                "proteins_100g": float,
                "salt_100g": float,
                "nutriscore_grade": Optional[str],  # "a" through "e", or None
            }

        Returns an empty list if no products are found.

    Raises:
        NotImplementedError: Always, until the implementation is ported.
        requests.HTTPError: (Planned) If the API request fails.

    Example:
        >>> products = load_openfoodfacts_sample(  # doctest: +SKIP
        ...     "margherita pizza", max_results=5
        ... )
        >>> products[0]["energy_100g"]  # doctest: +SKIP
        266.0
    """
    # TODO: Implement in notebook 03_ml_health_classifier.ipynb, then port here.
    # Suggested implementation:
    #   import requests
    #   url = f"https://{country}.openfoodfacts.org/cgi/search.pl"
    #   params = {"search_terms": food_name, "json": 1, "page_size": max_results}
    #   response = requests.get(url, params=params, timeout=10)
    #   response.raise_for_status()  # needed for the documented HTTPError
    #   return [_extract_nutrients(p) for p in response.json()["products"]]
    # ``_extract_nutrients`` is a helper that still has to be written.
    # NOTE(review): the raw API's ``energy_100g`` field appears to be in kJ,
    # with kcal under ``energy-kcal_100g`` — confirm before mapping it to the
    # kcal value promised in the docstring.
    raise NotImplementedError("Implement in notebook 03_ml_health_classifier.ipynb first.")
def load_knowledge_base_documents(folder: str) -> list[dict]:
    """Load and parse all documents in the RAG knowledge base folder.

    Only the path validation is implemented so far; for a valid folder the
    function raises ``NotImplementedError``. The contract below describes
    the intended behaviour once the notebook implementation is ported here.

    Reads all ``.pdf``, ``.txt``, and ``.md`` files from the specified folder,
    extracts their text content, and returns a list of document dicts ready
    for chunking and embedding into ChromaDB.

    Args:
        folder: Path to the folder containing knowledge base documents.
            Typically ``data/knowledge_base/``. Subdirectories are not
            traversed — only top-level files are read.

    Returns:
        A list of document dicts, one per file::

            [
                {
                    "source": "who_healthy_diet.pdf",
                    "content": "A healthy diet helps protect against ...",
                    "metadata": {
                        "file_type": "pdf",
                        "num_pages": 4,
                        "file_size_kb": 112,
                    },
                },
                ...
            ]

        Returns an empty list if the folder contains no supported files.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        NotADirectoryError: If ``folder`` exists but is not a directory.
        NotImplementedError: For any existing directory, until the
            implementation is ported.
        ValueError: (Planned) If a file cannot be parsed (e.g., encrypted
            PDF).

    Example:
        >>> docs = load_knowledge_base_documents(  # doctest: +SKIP
        ...     "data/knowledge_base/"
        ... )
        >>> len(docs)  # doctest: +SKIP
        3
        >>> docs[0]["source"]  # doctest: +SKIP
        'who_healthy_diet.pdf'
    """
    # TODO: Implement in notebook 04_rag_setup.ipynb, then port here.
    # Suggested implementation:
    #   from pypdf import PdfReader
    #   docs = []
    #   for path in sorted(Path(folder).iterdir()):
    #       suffix = path.suffix.lower()
    #       if suffix == ".pdf":
    #           reader = PdfReader(path)
    #           text = "\n".join(page.extract_text() for page in reader.pages)
    #       elif suffix in {".txt", ".md"}:
    #           text = path.read_text(encoding="utf-8")
    #       else:
    #           continue
    #       docs.append({"source": path.name, "content": text, ...})
    folder_path = Path(folder)
    if not folder_path.exists():
        raise FileNotFoundError(f"Knowledge base folder not found: {folder}")
    # ``exists()`` is also true for regular files; reject those explicitly so
    # a mistyped file path is not treated as a valid knowledge-base folder.
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Knowledge base path is not a folder: {folder}")
    raise NotImplementedError("Implement in notebook 04_rag_setup.ipynb first.")
|