File size: 6,517 Bytes
c173dc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
SmartPlate data loading utilities.

This module provides stub functions for loading and preparing data for each
of the three AI blocks. Actual implementation happens in the training notebooks;
these stubs define the interface used by the pipeline.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Optional


def download_food101_subset(
    classes: list[str],
    output_dir: str,
    split: str = "train",
    max_samples_per_class: Optional[int] = None,
) -> dict[str, list[str]]:
    """Fetch a class-filtered slice of Food-101 via HuggingFace Datasets.

    Interface stub: the body is not implemented yet, so every call currently
    raises ``NotImplementedError`` regardless of the arguments. Everything
    below describes the contract the finished function is meant to honour.

    Only the requested classes are pulled, optionally truncated to a fixed
    number of images each. Every image is written below ``output_dir`` in a
    subdirectory named after its class
    (e.g. ``output_dir/pizza/image001.jpg``).

    Args:
        classes: Food-101 class names to keep. Each must be a valid Food-101
            label (lowercase, underscores instead of spaces), e.g.
            ``["pizza", "sushi", "caesar_salad"]``.
        output_dir: Root directory that receives the images. It is created
            when missing.
        split: Which dataset split to pull — ``"train"``, ``"validation"``,
            or ``"test"``. Defaults to ``"train"``.
        max_samples_per_class: Upper bound on images kept per class, handy
            for quick experiments. ``None`` keeps every image.

    Returns:
        A dict keyed by class name whose values are lists of absolute paths
        to the saved images, e.g.::

            {
                "pizza": ["/data/raw/pizza/img_001.jpg", ...],
                "sushi": ["/data/raw/sushi/img_001.jpg", ...],
            }

    Raises:
        NotImplementedError: Always, until the implementation is ported from
            the notebook.
        ValueError: (planned) If any class name is not a valid Food-101 label.
        OSError: (planned) If ``output_dir`` cannot be created.

    Example:
        Intended usage once implemented:

        >>> paths = download_food101_subset(
        ...     classes=["pizza", "sushi", "caesar_salad"],
        ...     output_dir="data/raw/food101",
        ...     split="train",
        ...     max_samples_per_class=100,
        ... )
        >>> len(paths["pizza"])
        100
    """
    # TODO: Implement in notebook 01_eda_food101.ipynb, then port here.
    # NOTE(review): the TODO above names notebook 01 while the runtime message
    # below names notebook 02 — confirm which one is intended. The message is
    # left untouched here because it is text emitted at runtime.
    # NOTE(review): the Hugging Face food101 dataset card appears to publish
    # only "train" and "validation" splits — confirm "test" is reachable.
    #
    # Suggested implementation sketch (class_indices = label ids of `classes`):
    #   from datasets import load_dataset
    #   ds = load_dataset("food101", split=split)
    #   ds_subset = ds.filter(lambda x: x["label"] in class_indices)
    pending_notice = "Implement in notebook 02_train_vit_cv.ipynb first."
    raise NotImplementedError(pending_notice)


def load_openfoodfacts_sample(
    food_name: str,
    max_results: int = 20,
    country: str = "world",
) -> list[dict]:
    """Look up per-100g nutrition facts for a food on Open Food Facts.

    Interface stub: the body is not implemented yet, so every call currently
    raises ``NotImplementedError`` regardless of the arguments. Everything
    below describes the contract the finished function is meant to honour.

    The Open Food Facts search API is queried by product name and the
    matching products are returned together with their nutritional values
    per 100g.

    Args:
        food_name: Dish or food item to search for, e.g.
            ``"pizza margherita"``. Free-form text is accepted; partial
            matching is left to the API.
        max_results: Largest number of products to return. Defaults to 20.
        country: Country-specific endpoint to hit. Defaults to ``"world"``
            (the global database); pass ``"ch"`` for Swiss products.

    Returns:
        A list of product dicts, each holding at least::

            {
                "product_name": str,
                "energy_100g": float,        # kcal per 100g
                "fat_100g": float,
                "saturated_fat_100g": float,
                "sugars_100g": float,
                "fiber_100g": float,
                "proteins_100g": float,
                "salt_100g": float,
                "nutriscore_grade": str,     # "a" through "e", or None
            }

        An empty list is returned when nothing matches.

    Raises:
        NotImplementedError: Always, until the implementation is ported from
            the notebook.
        requests.HTTPError: (planned) If the API request fails.

    Example:
        Intended usage once implemented:

        >>> products = load_openfoodfacts_sample("margherita pizza", max_results=5)
        >>> products[0]["energy_100g"]
        266.0
    """
    # TODO: Implement in notebook 03_ml_health_classifier.ipynb, then port here.
    # NOTE(review): Open Food Facts reportedly stores "energy_100g" in kJ and
    # spells some keys with hyphens (e.g. "saturated-fat_100g") — confirm that
    # _extract_nutrients converts/renames to match the schema documented above.
    #
    # Suggested implementation sketch:
    #   import requests
    #   url = f"https://{country}.openfoodfacts.org/cgi/search.pl"
    #   params = {"search_terms": food_name, "json": 1, "page_size": max_results}
    #   response = requests.get(url, params=params, timeout=10)
    #   response.raise_for_status()  # needed for the documented HTTPError
    #   return [_extract_nutrients(p) for p in response.json()["products"]]
    pending_notice = "Implement in notebook 03_ml_health_classifier.ipynb first."
    raise NotImplementedError(pending_notice)


def load_knowledge_base_documents(folder: str) -> list[dict]:
    """Load and parse all documents in the RAG knowledge base folder.

    Interface stub: only the validation of ``folder`` is implemented. A call
    with a valid directory currently raises ``NotImplementedError``; the rest
    of this docstring describes the contract the finished function is meant
    to honour.

    Reads all ``.pdf``, ``.txt``, and ``.md`` files from the specified folder,
    extracts their text content, and returns a list of document dicts ready
    for chunking and embedding into ChromaDB.

    Args:
        folder: Path to the folder containing knowledge base documents.
            Typically ``data/knowledge_base/``. Subdirectories are not
            traversed — only top-level files are read.

    Returns:
        A list of document dicts, one per file::

            [
                {
                    "source": "who_healthy_diet.pdf",
                    "content": "A healthy diet helps protect against ...",
                    "metadata": {
                        "file_type": "pdf",
                        "num_pages": 4,
                        "file_size_kb": 112,
                    },
                },
                ...
            ]

        Returns an empty list if the folder contains no supported files.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        NotADirectoryError: If ``folder`` exists but is not a directory.
        NotImplementedError: For any valid directory, until the
            implementation is ported from the notebook.
        ValueError: (planned) If a file cannot be parsed (e.g., encrypted PDF).

    Example:
        Intended usage once implemented:

        >>> docs = load_knowledge_base_documents("data/knowledge_base/")
        >>> len(docs)
        3
        >>> docs[0]["source"]
        'who_healthy_diet.pdf'
    """
    # TODO: Implement in notebook 04_rag_setup.ipynb, then port here.
    # Suggested implementation:
    #   from pypdf import PdfReader
    #   for path in Path(folder).glob("*.pdf"):
    #       reader = PdfReader(path)
    #       text = "\n".join(page.extract_text() for page in reader.pages)
    #       docs.append({"source": path.name, "content": text, ...})
    folder_path = Path(folder)
    if not folder_path.exists():
        raise FileNotFoundError(f"Knowledge base folder not found: {folder}")
    # A regular file passes exists(); reject it explicitly so that a later
    # glob over the "folder" cannot silently produce an empty document list.
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Knowledge base path is not a folder: {folder}")
    raise NotImplementedError("Implement in notebook 04_rag_setup.ipynb first.")