anaygupta committed on
Commit
9373226
·
verified ·
1 Parent(s): 62fcf17

Upload 12 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the BiteWise FastAPI service (served by uvicorn on 7860).
FROM python:3.11-slim

# PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED keep the image small and the logs
# unbuffered; PIP_NO_CACHE_DIR avoids baking pip's cache into a layer;
# BITEWISE_ENABLE_SEMANTIC_DOWNLOAD=1 lets the app fetch word vectors at runtime.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    BITEWISE_ENABLE_SEMANTIC_DOWNLOAD=1

WORKDIR /app

RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt changes.
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

# Run as an unprivileged user with a writable home directory: the app downloads
# models at runtime, and those caches live under $HOME.
RUN useradd --create-home --uid 1000 user \
    && chown -R user:user /app
COPY --chown=user:user . /app
USER user
ENV HOME=/home/user

EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Literal
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import FileResponse, JSONResponse
9
+ from pydantic import BaseModel, Field
10
+
11
+ from services.config import settings
12
+ from services.dataset import SubstitutionDatabase
13
+ from services.recipe_service import RecipeAdapterService
14
+ from services.semantic import WordVectorFallback
15
+
16
# ASGI application object; title/version are passed straight to FastAPI.
app = FastAPI(title="BiteWise API", version="2.0.0")

# Open CORS policy: any origin, method and header may call the API.
# Credentials are deliberately NOT allowed: a wildcard origin combined with
# allow_credentials=True would let any site make credentialed cross-origin
# requests, and FastAPI's CORS documentation requires explicit origins whenever
# credentials are enabled.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
25
+
26
+
27
class AdaptRequest(BaseModel):
    # Request body for POST /api/adapt.
    # recipe_text: required free-form recipe text, at least 5 characters long.
    # diet: which adaptation to apply; defaults to "vegan".
    recipe_text: str = Field(..., min_length=5)
    diet: Literal["vegan", "keto", "both"] = "vegan"
30
+
31
+
32
# Process-wide singletons, created on the first call to get_service().
_db = None
_semantic = None
_service = None


def get_service() -> RecipeAdapterService:
    """Return the shared RecipeAdapterService, building it on first use.

    The first call constructs the substitution database and the word-vector
    fallback from ``settings``; later calls return the cached service.
    Any exception raised by those constructors propagates to the caller and
    leaves ``_service`` unset, so the next call tries again.
    """
    global _db, _semantic, _service
    if _service is None:
        _db = SubstitutionDatabase(settings.dataset_path)
        _semantic = WordVectorFallback(
            model_name=settings.semantic_model_name,
            model_path=settings.semantic_model_path,
            enable_download=settings.enable_semantic_download,
        )
        _service = RecipeAdapterService(db=_db, semantic=_semantic)
    return _service
50
+
51
+
52
@app.get("/")
def root():
    # Serve the bundled front-end when static/index.html exists (path is
    # relative to the working directory); otherwise answer with a small JSON
    # banner describing the API.
    index_page = Path("static/index.html")
    if not index_page.exists():
        banner = {
            "name": "BiteWise API",
            "status": "running",
            "hint": "POST /api/adapt with {recipe_text, diet}",
        }
        return JSONResponse(banner)
    return FileResponse(index_page)
64
+
65
+
66
@app.get("/health")
def health():
    # Liveness probe: always reports ok without touching models or the dataset.
    return dict(ok=True)
69
+
70
+
71
@app.get("/api/meta")
def meta():
    # Report the configured model names plus the state of the semantic
    # fallback. Calling get_service() here builds the service on first use.
    semantic = get_service().semantic
    info = {
        "ner_model": settings.ner_model_name,
        "qa_model": settings.qa_model_name,
        "semantic_model": settings.semantic_model_name,
    }
    info["semantic_available"] = semantic.available
    info["semantic_mode"] = semantic._kind
    info["dataset_path"] = str(settings.dataset_path)
    return info
82
+
83
+
84
@app.post("/api/adapt")
def adapt(req: AdaptRequest) -> Dict[str, Any]:
    # Adapt the submitted recipe for the requested diet.
    #
    # Error mapping (unchanged for clients):
    #   FileNotFoundError -> 500, detail is the exception message
    #   ValueError        -> 400, detail is the exception message
    #   anything else     -> 500, detail is "Unexpected error: <message>"
    #
    # Server-side failures are logged with their traceback and every
    # HTTPException is chained to its cause, so the original error is no
    # longer lost at this boundary.
    import logging

    log = logging.getLogger(__name__)
    try:
        return get_service().adapt(req.recipe_text, req.diet)
    except FileNotFoundError as e:
        log.exception("Required file is missing while adapting recipe")
        raise HTTPException(status_code=500, detail=str(e)) from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        log.exception("Unexpected error while adapting recipe")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.110
2
+ uvicorn[standard]>=0.27
3
+ pydantic>=2.6
4
+ pandas>=2.1
5
+ numpy>=1.26
6
+ transformers>=4.41
7
+ torch>=2.2
8
+ gensim>=4.3
9
+ python-multipart>=0.0.9
services/__init__.py ADDED
File without changes
services/classify.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from typing import Literal
5
+
6
+ import torch
7
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
8
+
9
+ from .config import settings
10
+ from .text_utils import normalize_text
11
+
12
+ RecipeType = Literal["baked", "cooked"]
13
+
14
+ BAKE_KEYWORDS = [
15
+ "bake", "baking", "oven", "preheat", "flour", "dough", "batter",
16
+ "cake", "cookie", "muffin", "bread", "pastry", "brownie", "tart",
17
+ "pie", "scone", "loaf", "whisk", "fold in", "sift", "knead",
18
+ "leavening", "baking soda", "baking powder", "yeast",
19
+ ]
20
+ COOK_KEYWORDS = [
21
+ "saute", "sauté", "fry", "boil", "simmer", "stir", "grill",
22
+ "roast", "steam", "poach", "braise", "sear", "stove", "skillet",
23
+ "pan", "wok", "sauce", "soup", "stew", "marinate",
24
+ ]
25
+
26
+
27
@lru_cache(maxsize=1)
def get_qa_pipeline():
    """Build the question-answering pipeline once and reuse it.

    The tokenizer and model come from ``settings.qa_model_name``. The pipeline
    is placed on CUDA device 0 when torch reports CUDA is available, otherwise
    on the CPU (device ``-1``). ``lru_cache`` keeps the single built instance;
    a failed build raises and is not cached.
    """
    model_name = settings.qa_model_name
    qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    target_device = -1
    if torch.cuda.is_available():
        target_device = 0
    return pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=qa_tokenizer,
        device=target_device,
    )
38
+
39
+
40
def classify_recipe(recipe_text: str) -> RecipeType:
    """Classify a recipe as ``"baked"`` or ``"cooked"``.

    Two signals are used, in priority order:

    1. The answer span returned by the QA pipeline for the question
       "Is this recipe for baking or cooking?". If it contains a baking or
       cooking signal fragment, that decides the result.
    2. Otherwise, the number of BAKE_KEYWORDS versus COOK_KEYWORDS found as
       substrings of the normalized recipe text. Ties resolve to ``"cooked"``.

    The QA step is best-effort: if the pipeline cannot be built or run, the
    failure is logged and the keyword scores alone decide.
    """
    import logging

    text = normalize_text(recipe_text)
    bake_score = sum(1 for kw in BAKE_KEYWORDS if kw in text)
    cook_score = sum(1 for kw in COOK_KEYWORDS if kw in text)

    answer = ""
    try:
        qa = get_qa_pipeline()
        result = qa(question="Is this recipe for baking or cooking?", context=recipe_text)
        answer = normalize_text(str(result.get("answer", "")))
    except Exception:
        # Deliberately non-fatal, but no longer silent: record why the model
        # signal is missing so a broken deployment is visible in the logs.
        logging.getLogger(__name__).warning(
            "QA recipe classification unavailable; falling back to keyword scores",
            exc_info=True,
        )

    if any(sig in answer for sig in ("bak", "oven", "pastry", "dough")):
        return "baked"
    if any(sig in answer for sig in ("cook", "fry", "boil", "saut", "grill", "stir")):
        return "cooked"

    # Keyword vote; equal scores default to "cooked".
    return "baked" if bake_score > cook_score else "cooked"
services/config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ import os
6
+
7
+
8
def _env_flag(name: str, default: str = "1") -> bool:
    """Read environment variable *name* as an on/off switch.

    "1", "true", "yes" and "on" (any case, surrounding whitespace ignored)
    mean enabled; every other value means disabled.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "on"}


@dataclass(frozen=True)
class Settings:
    """Immutable application configuration read from ``BITEWISE_*`` variables.

    The defaults are evaluated once, when this module is imported, so changing
    the environment afterwards does not affect ``Settings()``. A non-numeric
    ``BITEWISE_MAX_INGREDIENTS`` or ``BITEWISE_SIM_THRESHOLD`` raises
    ``ValueError`` at import time.
    """

    # CSV file holding the substitution database.
    dataset_path: Path = Path(os.getenv("BITEWISE_DATASET_PATH", "data/united_master_database_corrected.csv"))
    # Model identifiers for ingredient NER and baked/cooked question answering.
    ner_model_name: str = os.getenv("BITEWISE_NER_MODEL", "Dizex/InstaFoodRoBERTa-NER")
    qa_model_name: str = os.getenv(
        "BITEWISE_QA_MODEL",
        "bert-large-uncased-whole-word-masking-finetuned-squad",
    )
    # Word-vector model name, and an optional local path tried before download.
    semantic_model_name: str = os.getenv("BITEWISE_SEMANTIC_MODEL", "glove-wiki-gigaword-50")
    semantic_model_path: str = os.getenv("BITEWISE_SEMANTIC_PATH", "")
    # Whether the word-vector model may be downloaded when no local copy loads.
    enable_semantic_download: bool = _env_flag("BITEWISE_ENABLE_SEMANTIC_DOWNLOAD")
    # Upper bound on extracted ingredients per recipe.
    max_ingredients: int = int(os.getenv("BITEWISE_MAX_INGREDIENTS", "48"))
    # Minimum cosine similarity for a semantic match to be accepted.
    similarity_threshold: float = float(os.getenv("BITEWISE_SIM_THRESHOLD", "0.52"))


# Shared settings instance imported by the rest of the package.
settings = Settings()
services/dataset.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Dict, List, Optional
6
+ import re
7
+
8
+ import pandas as pd
9
+
10
+ from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text
11
+
12
+ REQUIRED_COLUMNS = [
13
+ "Ingredient",
14
+ "Context",
15
+ "Aliases",
16
+ "Category",
17
+ "Is_Keto_Friendly",
18
+ "Is_Vegan_Friendly",
19
+ "Keto_Substitution",
20
+ "Keto_Instruction",
21
+ "Vegan_Substitution",
22
+ "Vegan_Instruction",
23
+ "Vegan_Keto_Substitution",
24
+ "Vegan_Keto_Instruction",
25
+ ]
26
+
27
+ BAKE_CONTEXTS = {
28
+ "Baking & Desserts",
29
+ "Baking (Binder)",
30
+ "Baking (Leavening)",
31
+ "Pastries",
32
+ "Bagels",
33
+ "Puddings",
34
+ "Tiramisu",
35
+ }
36
+
37
+ COOK_CONTEXTS = {
38
+ "Main Course & Heavy Cooking",
39
+ "Sauces, Dips & Dressings",
40
+ "Soups & Savory Liquids",
41
+ "Cold Prep & Light Meals",
42
+ "Pasta",
43
+ "Lasagna",
44
+ "Roast",
45
+ "Stir-Fry",
46
+ "Appetizer",
47
+ "Indian",
48
+ "Beverages",
49
+ "Cheese Making",
50
+ "Processed",
51
+ "Technical & Additives",
52
+ }
53
+
54
+
55
@dataclass(frozen=True)
class IngredientRow:
    """One substitution-dataset row, with the CSV columns as typed fields."""

    # Canonical ingredient name and the recipe context the row applies to.
    ingredient: str
    context: str
    # Alternative names for the ingredient.
    aliases: List[str]
    category: str
    # Whether the ingredient is already acceptable for each diet.
    is_keto_friendly: bool
    is_vegan_friendly: bool
    # Replacement and how-to text per diet; None when the CSV cell is empty.
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @property
    def lookup_terms(self) -> List[str]:
        """Return the de-duplicated terms under which this row can be found.

        The list always holds the ingredient name and its aliases. Rows whose
        name contains certain whole words (egg, pancetta/bacon, chicken, milk,
        cheese) also get a fixed set of common variants.

        Words are compared as whole whitespace-separated tokens, so a name
        that merely contains the letters (e.g. "eggplant", "veggie",
        "buttermilk") does not pick up variants of another ingredient.
        """
        terms = [self.ingredient, *self.aliases]
        words = set(normalize_text(self.ingredient).split())

        def mentions(*names: str) -> bool:
            # True when any of *names* appears as a whole word in the name.
            return not words.isdisjoint(names)

        if mentions("egg", "eggs"):
            terms.extend([
                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                "egg yolk", "egg yolks",
            ])
        if mentions("pancetta", "bacon"):
            terms.extend(["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"])
        if mentions("chicken", "chickens"):
            terms.extend(["chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"])
        if mentions("milk", "milks"):
            terms.extend(["milk", "whole milk", "dairy milk"])
        if mentions("cheese", "cheeses"):
            terms.extend(["cheese", "hard cheese", "soft cheese"])

        return dedupe_preserve_order(terms)
90
+
91
+
92
class SubstitutionDatabase:
    """In-memory lookup over the ingredient substitution CSV.

    The CSV is read once at construction and turned into ``IngredientRow``
    objects. Lookups prefer rows whose context matches the recipe type
    ("baked" or anything else, treated as cooked) and fall back to all rows.
    """

    # Textual spellings accepted as "true" in the friendliness columns.
    _TRUTHY_FLAGS = frozenset({"1", "1.0", "true", "t", "yes", "y"})

    def __init__(self, csv_path: str | Path):
        """Load the dataset at *csv_path*.

        Raises:
            FileNotFoundError: the file does not exist.
            ValueError: a required column is missing.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        # Per-recipe-type caches, filled lazily.
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, adding normalized helper columns."""
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )

        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")

        df = df.copy()
        # Rows without a name or context cannot be matched; drop them.
        df = df.dropna(subset=["Ingredient", "Context"])
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df.reset_index(drop=True)

    @staticmethod
    def _coerce_flag(value) -> bool:
        """Convert a CSV cell to a boolean.

        Missing cells (None/NaN) are False. Strings are True only when they
        spell a truthy value ("true", "yes", "1", ...), so "False" and "No"
        are False. Plain ``bool()`` would treat NaN and every non-empty string
        as True. Other values use normal truthiness.
        """
        if value is None:
            return False
        if isinstance(value, str):
            return value.strip().lower() in SubstitutionDatabase._TRUTHY_FLAGS
        try:
            if pd.isna(value):
                return False
        except (TypeError, ValueError):
            pass
        return bool(value)

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Build an ``IngredientRow`` from one DataFrame row."""

        def text_or_none(column: str) -> Optional[str]:
            # Empty cells become None instead of the string "nan".
            value = row.get(column)
            return None if pd.isna(value) else str(value)

        category = row.get("Category", "")
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category="" if pd.isna(category) else str(category),
            is_keto_friendly=self._coerce_flag(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=self._coerce_flag(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=text_or_none("Keto_Substitution"),
            keto_instruction=text_or_none("Keto_Instruction"),
            vegan_substitution=text_or_none("Vegan_Substitution"),
            vegan_instruction=text_or_none("Vegan_Instruction"),
            vegan_keto_substitution=text_or_none("Vegan_Keto_Substitution"),
            vegan_keto_instruction=text_or_none("Vegan_Keto_Instruction"),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return the context names preferred for *recipe_type*."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return (and cache) the rows whose context suits *recipe_type*."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [row for row in self.rows if row.context in contexts]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's non-empty lookup terms in normalized form."""
        return [normalize_text(term) for term in row.lookup_terms if normalize_text(term)]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose name or any lookup term equals *query*."""
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows where *query* occurs as a whole word or phrase.

        Queries shorter than four characters never match partially.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # Lookarounds require the query to be delimited by non-word characters.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Sort *rows* best-first for *query*.

        Priority: exact name match, exact alias match, rows that have aliases,
        then longer ingredient names.
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find rows for *query*, best match first; empty list if none.

        Passes run in this order, and the first variant that matches wins:
        exact match in preferred rows, exact match in all rows, partial match
        in preferred rows, partial match in all rows.
        """
        query = normalize_text(query)
        if not query:
            return []

        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)

        for candidate in variants:
            exact_preferred = self._match_exact(preferred, candidate)
            if exact_preferred:
                return self._rank_rows(exact_preferred, candidate)

        for candidate in variants:
            exact_all = self._match_exact(self.rows, candidate)
            if exact_all:
                return self._rank_rows(exact_all, candidate)

        for candidate in variants:
            partial_preferred = self._match_partial(preferred, candidate)
            if partial_preferred:
                return self._rank_rows(partial_preferred, candidate)

        for candidate in variants:
            partial_all = self._match_partial(self.rows, candidate)
            if partial_all:
                return self._rank_rows(partial_all, candidate)

        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return (and cache) candidate terms for semantic matching.

        Terms come from the preferred rows for *recipe_type*, or from all rows
        when there are no preferred rows.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for *row* under *diet*.

        Returns ``(substitute, instruction, compliant)``:

        * compatible ingredient: ``(ingredient, "Already compatible ...", True)``
        * not compatible, substitute listed: ``(substitute, instruction, False)``
        * not compatible, nothing listed: ``(ingredient, manual-review message,
          False)``. This case used to be reported as compliant, which told
          users a non-compatible ingredient was fine.

        Any *diet* other than "vegan" or "keto" is treated as "both".
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly

        if compatible:
            return row.ingredient, "Already compatible — no substitution needed.", True

        if not sub or str(sub).strip().lower() in {"nan", "none"}:
            return row.ingredient, "No reliable substitution found — please review manually.", False

        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False
services/ner.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from typing import List
5
+
6
+ import torch
7
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
8
+
9
+ from .config import settings
10
+ from .text_utils import dedupe_preserve_order, strip_amounts_and_preps, tokenize_recipe_segments, normalize_text
11
+
12
+
13
@lru_cache(maxsize=1)
def get_ner_pipeline():
    """Build the token-classification (NER) pipeline once and reuse it.

    The tokenizer and model come from ``settings.ner_model_name``; entities are
    merged with the "simple" aggregation strategy. The pipeline is placed on
    CUDA device 0 when torch reports CUDA is available, otherwise on the CPU.
    """
    model_name = settings.ner_model_name
    ner_tokenizer = AutoTokenizer.from_pretrained(model_name)
    ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
    target_device = -1
    if torch.cuda.is_available():
        target_device = 0
    return pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
        device=target_device,
    )
25
+
26
+
27
def _best_span_from_segment(segment: str) -> str:
    """Return the most likely ingredient phrase inside one recipe segment.

    The NER pipeline proposes entity spans; each is cut from the segment by
    its start/end offsets and normalized. Spans shorter than two characters
    and bare connectives are ignored. The longest remaining span is returned
    (the first one on ties). If the pipeline fails or yields nothing usable,
    the segment with amounts and preparation words stripped is returned.
    """
    cleaned = (segment or "").strip()
    if not cleaned:
        return ""

    try:
        entities = get_ner_pipeline()(cleaned)
    except Exception:
        # Best-effort: an unavailable model falls through to the stripped text.
        entities = []

    connectives = {"and", "or", "with", "of"}
    candidates: List[str] = []
    for entity in entities:
        span = normalize_text(cleaned[entity["start"] : entity["end"]].strip())
        if len(span) < 2 or span in connectives:
            continue
        candidates.append(span)

    if not candidates:
        return strip_amounts_and_preps(cleaned)

    # Prefer the longest span: ingredient models sometimes emit smaller
    # fragments when the input chunk is short.
    return max(candidates, key=len)
55
+
56
+
57
def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
    """Extract up to *max_items* unique, normalized ingredient names.

    The recipe is split into segments; each segment yields at most one
    candidate, which is then stripped of amounts and preparation words.
    Candidates shorter than two characters and bare connectives are dropped.

    Duplicates are removed while collecting, so they no longer count against
    *max_items*. Previously the cap was applied before de-duplication, so a
    recipe with repeated ingredients could return fewer unique items than
    requested. A non-positive *max_items* returns an empty list.
    """
    if max_items <= 0:
        return []

    segments = tokenize_recipe_segments(recipe_text)
    if not segments:
        return []

    connectives = {"and", "or", "with", "of"}
    seen = set()
    out: List[str] = []
    for segment in segments:
        candidate = strip_amounts_and_preps(_best_span_from_segment(segment))
        if not candidate or len(candidate) < 2:
            continue
        if candidate in connectives:
            continue
        # Same normalization dedupe_preserve_order applied to the final list.
        key = normalize_text(candidate)
        if not key or key in seen:
            continue
        seen.add(key)
        out.append(key)
        if len(out) >= max_items:
            break

    return out
services/recipe_service.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Literal, Optional
5
+
6
+ from .classify import classify_recipe
7
+ from .config import settings
8
+ from .dataset import SubstitutionDatabase
9
+ from .ner import extract_ingredients
10
+ from .semantic import WordVectorFallback
11
+ from .text_utils import normalize_text, singularize
12
+
13
+ Diet = Literal["vegan", "keto", "both"]
14
+
15
+
16
+ NEUTRAL_OK = {
17
+ "salt", "pepper", "black pepper", "white pepper", "water", "olive oil", "vegetable oil", "oil",
18
+ "garlic", "onion", "lemon", "lime", "vinegar", "basil", "oregano", "thyme", "rosemary",
19
+ "cumin", "paprika", "turmeric", "ginger", "chili", "chilli", "coriander", "parsley",
20
+ "bay leaf", "bay leaves", "nutmeg", "cinnamon", "cardamom", "cloves", "allspice",
21
+ "saffron", "vanilla", "cocoa powder", "baking powder", "baking soda", "yeast",
22
+ }
23
+
24
+ VEGAN_ANIMAL_HINTS = {
25
+ "pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage",
26
+ "chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy", "gelatin", "lard",
27
+ }
28
+
29
+ VEGAN_DAIRY_HINTS = {
30
+ "milk", "cream", "butter", "cheese", "yogurt", "whey", "casein", "ghee",
31
+ }
32
+
33
+ VEGAN_EGG_HINTS = {
34
+ "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites", "egg yolk", "egg yolks",
35
+ }
36
+
37
+ KETO_CARB_HINTS = {
38
+ "pasta", "spaghetti", "noodle", "noodles", "bread", "flour", "sugar", "rice",
39
+ "potato", "potatoes", "corn", "oats", "beans", "bean", "honey", "syrup", "maple", "couscous",
40
+ }
41
+
42
+
43
@dataclass
class IngredientResult:
    """Outcome of adapting one ingredient for the requested diet."""

    # Ingredient text as extracted, and its normalized form.
    original: str
    normalized: str
    # True when the ingredient needs no change for the diet.
    compliant: bool
    # Suggested replacement and how to use it.
    substitute: str
    instructions: str
    # Which stage produced the result (set by the caller).
    source: str
    # Dataset ingredient the result was based on, if any.
    matched_ingredient: Optional[str] = None
    confidence: float = 1.0
    notes: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict of the result.

        ``confidence`` is rounded to three decimals; ``notes`` is included only
        when it is non-empty.
        """
        payload: Dict[str, Any] = dict(
            original=self.original,
            normalized=self.normalized,
            compliant=self.compliant,
            substitute=self.substitute,
            instructions=self.instructions,
            source=self.source,
            matched_ingredient=self.matched_ingredient,
            confidence=round(float(self.confidence), 3),
        )
        if self.notes:
            payload["notes"] = self.notes
        return payload
69
+
70
+
71
class RecipeAdapterService:
    """Adapt a recipe's ingredients to a vegan, keto, or vegan+keto diet.

    Each extracted ingredient is resolved by the first stage that applies:

    1. a row in the substitution database;
    2. the known-neutral list (``NEUTRAL_OK``);
    3. hard-coded diet rules keyed on hint words;
    4. nearest database term by word-vector similarity;
    5. a "please review manually" placeholder.
    """

    def __init__(self, db: SubstitutionDatabase, semantic: WordVectorFallback):
        self.db = db
        self.semantic = semantic

    def _row_to_result(self, ingredient: str, row, diet: Diet) -> IngredientResult:
        """Turn a matched database *row* into a result for *ingredient*."""
        substitute, instructions, compliant = self.db.pick_substitution(row, diet)
        normalized = normalize_text(ingredient)

        if compliant:
            return IngredientResult(
                original=ingredient,
                normalized=normalized,
                compliant=True,
                substitute=ingredient,
                instructions="Already compatible — no substitution needed.",
                source="database match",
                matched_ingredient=row.ingredient,
                confidence=1.0,
            )

        return IngredientResult(
            original=ingredient,
            normalized=normalized,
            compliant=False,
            substitute=substitute,
            instructions=instructions or "",
            source="database match",
            matched_ingredient=row.ingredient,
            confidence=0.96,
        )

    def _rule_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Apply the built-in diet rules; return None when no rule fires.

        Hints are matched as substrings of the normalized ingredient text
        (with its singular form appended when different). Vegan rules are
        tried before keto rules, which matters for the "both" diet.
        """
        text = normalize_text(ingredient)
        singular = singularize(text)
        if singular != text:
            # Search both forms so plural input still hits singular hints.
            text = f"{text} {singular}"

        def result(sub: str, instr: str, source: str = "diet rule fallback", conf: float = 0.94) -> IngredientResult:
            return IngredientResult(
                original=ingredient,
                normalized=normalize_text(ingredient),
                compliant=False,
                substitute=sub,
                instructions=instr,
                source=source,
                matched_ingredient=None,
                confidence=conf,
            )

        if diet in {"vegan", "both"}:
            if any(hint in text for hint in VEGAN_EGG_HINTS):
                if recipe_type == "baked":
                    return result("Flax Egg", "1 tbsp ground flax + 3 tbsp water per egg. Rest 5 min before using.")
                if diet == "both":
                    return result("Silken Tofu", "Blend until smooth and use as a creamy egg-free binder.")
                return result("Silken Tofu or Tofu Scramble", "Blend silken tofu for sauces or use crumbled tofu for savory dishes.")

            if any(hint in text for hint in VEGAN_ANIMAL_HINTS):
                if any(hint in text for hint in {"pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage"}):
                    if diet == "both":
                        return result("Extra Firm Tofu", "Press tofu 30 min, cube and pan-fry until golden.")
                    return result("Smoked Tofu or Tempeh Bacon", "Dice and pan-fry until crispy.")
                if any(hint in text for hint in {"chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy"}):
                    if diet == "both":
                        return result("Extra Firm Tofu", "Use as a 1:1 savory protein substitute.")
                    return result("Soy Curls or Extra Firm Tofu", "Use as a 1:1 meat substitute.")
                return result("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

            if any(hint in text for hint in VEGAN_DAIRY_HINTS):
                if "butter" in text:
                    return result("Vegan Butter", "Use 1:1 in place of butter.")
                if "milk" in text:
                    return result("Unsweetened Plant Milk", "Use 1:1 in place of dairy milk.")
                if "cream" in text:
                    return result("Cashew Cream", "Use as a rich dairy-free cream substitute.")
                if "cheese" in text:
                    return result("Vegan Cheese", "Use a meltable vegan cheese or nutritional yeast blend.")
                return result("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

        if diet in {"keto", "both"}:
            if any(hint in text for hint in KETO_CARB_HINTS):
                if "spaghetti" in text or "pasta" in text or "noodle" in text:
                    return result("Zucchini Noodles or Shirataki Noodles", "Use in a 1:1 swap for pasta-style dishes.")
                if "rice" in text:
                    return result("Cauliflower Rice", "Use as a low-carb rice substitute.")
                if "flour" in text:
                    return result("Almond Flour", "Use a keto baking flour blend.")
                if "sugar" in text or "honey" in text or "syrup" in text:
                    return result("Erythritol or Allulose", "Use a keto-friendly sweetener to taste.")
                if "potato" in text:
                    return result("Cauliflower", "Use roasted cauliflower or cauliflower mash.")
                return result("Low-carb alternative", "Choose a keto-friendly substitute that matches the recipe context.")

        return None

    def _semantic_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Resolve *ingredient* through its nearest database term, if any.

        The five closest candidate terms are checked best-first. The first one
        scoring at least ``settings.similarity_threshold`` that maps to a row
        wins. Returns None when the vectors are unavailable or nothing passes.
        """
        if not self.semantic.available:
            return None

        candidates = self.db.semantic_terms(recipe_type)
        if not candidates:
            return None

        hits = self.semantic.nearest(ingredient, candidates, top_k=5)
        if not hits:
            return None

        for hit in hits:
            if hit.score < settings.similarity_threshold:
                continue
            matched_rows = self.db.find_rows(hit.term, recipe_type)
            if not matched_rows:
                continue
            row = matched_rows[0]
            result = self._row_to_result(ingredient, row, diet)
            result.source = f"glove semantic match ({hit.term}, score={hit.score:.2f})"
            # Clamp so a semantic guess never outranks a database match.
            result.confidence = max(0.5, min(0.95, hit.score))
            return result

        return None

    def _manual_review_result(self, ingredient: str) -> IngredientResult:
        """Return the last-resort result for *ingredient*.

        Ingredients in ``NEUTRAL_OK`` are reported as compliant; anything else
        is flagged for manual review with low confidence.
        """
        text = normalize_text(ingredient)
        if text in NEUTRAL_OK:
            return IngredientResult(
                original=ingredient,
                normalized=text,
                compliant=True,
                substitute=ingredient,
                instructions="Already compatible — no substitution needed.",
                source="known compatible",
                matched_ingredient=None,
                confidence=0.9,
            )

        return IngredientResult(
            original=ingredient,
            normalized=text,
            compliant=False,
            substitute=ingredient,
            instructions="No reliable substitution found — please review manually.",
            source="not in database",
            matched_ingredient=None,
            confidence=0.35,
            notes="No reliable database or semantic match found.",
        )

    def adapt(self, recipe_text: str, diet: Diet) -> Dict[str, Any]:
        """Adapt *recipe_text* for *diet* and return the response payload.

        Raises:
            ValueError: *diet* is not vegan/keto/both, or the stripped recipe
                text is shorter than five characters.
        """
        if diet not in {"vegan", "keto", "both"}:
            raise ValueError("diet must be one of: vegan, keto, both")

        recipe_text = (recipe_text or "").strip()
        if len(recipe_text) < 5:
            raise ValueError("recipe_text is too short")

        recipe_type = classify_recipe(recipe_text)
        ingredients = extract_ingredients(recipe_text, max_items=settings.max_ingredients)

        results: List[IngredientResult] = []
        substitutions = 0

        for ingredient in ingredients:
            matches = self.db.find_rows(ingredient, recipe_type)
            if matches:
                result = self._row_to_result(ingredient, matches[0], diet)
            elif normalize_text(ingredient) in NEUTRAL_OK:
                # Known-neutral items (salt, water, spices, ...) are settled
                # here. Letting them reach the semantic fallback could pair
                # them with an unrelated row and invent a substitution.
                result = self._manual_review_result(ingredient)
            else:
                result = self._rule_fallback(ingredient, diet, recipe_type)
                if result is None:
                    result = self._semantic_fallback(ingredient, diet, recipe_type)
                if result is None:
                    result = self._manual_review_result(ingredient)

            if not result.compliant:
                substitutions += 1
            results.append(result)

        return {
            "diet": diet,
            "recipe_type": recipe_type,
            "ingredients": [r.as_dict() for r in results],
            "ingredients_found": len(results),
            "substitution_count": substitutions,
            "model_metadata": {
                "ner_model": settings.ner_model_name,
                "qa_model": settings.qa_model_name,
                "semantic_model": settings.semantic_model_name,
                "semantic_available": self.semantic.available,
                "semantic_mode": self.semantic._kind,
                # The word2vec_* keys duplicate the semantic_* values.
                "word2vec_available": self.semantic.available,
                "word2vec_mode": self.semantic._kind,
                "dataset_path": str(self.db.csv_path),
            },
            "disclaimer": (
                "This is an assistive recipe tool, not nutritional or allergen medical advice. "
                "Verify substitutions before cooking."
            ),
        }
services/semantic.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Iterable, List, Optional
5
+
6
+ import numpy as np
7
+
8
+ from .text_utils import normalize_text, singularize
9
+
10
+
11
@dataclass
class SemanticHit:
    """A candidate term paired with its similarity score to the query."""

    # Candidate term exactly as supplied by the caller.
    term: str
    # Cosine similarity between the query vector and the candidate vector.
    score: float
15
+
16
+
17
class WordVectorFallback:
    """Optional word-vector similarity helper (GloVe by default).

    Loading never raises: if gensim is missing, the local file cannot be read,
    or the download fails, the instance stays usable with ``available`` False.
    ``_kind`` records which source supplied the vectors or why none loaded.
    """

    def __init__(self, model_name: str = "glove-wiki-gigaword-50", model_path: str = "", enable_download: bool = True):
        self.model = None
        self._kind = "disabled"
        self._model_name = model_name
        self._load(model_name=model_name, model_path=model_path, enable_download=enable_download)

    def _load(self, model_name: str, model_path: str, enable_download: bool) -> None:
        """Try the local path first, then (if allowed) the gensim downloader."""
        try:
            from gensim.models import KeyedVectors
            import gensim.downloader as api
        except Exception:
            self.model, self._kind = None, "unavailable"
            return

        if model_path:
            # Native gensim format first, then word2vec text/binary format.
            local_loaders = (
                ("local", lambda: KeyedVectors.load(model_path, mmap="r")),
                (
                    "local-vec",
                    lambda: KeyedVectors.load_word2vec_format(model_path, binary=model_path.endswith(".bin")),
                ),
            )
            for label, loader in local_loaders:
                try:
                    self.model = loader()
                except Exception:
                    self.model = None
                    continue
                self._kind = f"{label}:{model_path}"
                return

        if not enable_download:
            self.model, self._kind = None, "disabled"
            return

        try:
            self.model = api.load(model_name)
        except Exception:
            self.model, self._kind = None, "download-failed"
        else:
            self._kind = model_name

    @property
    def available(self) -> bool:
        """True when a vector model is loaded."""
        return self.model is not None

    def vector_for(self, phrase: str) -> Optional[np.ndarray]:
        """Return a vector for *phrase*, or None when nothing is known.

        The mean of the vectors of the singularized tokens found in the model
        is used when any are found. Otherwise the whole phrase is looked up,
        first with underscores in place of spaces, then as-is.
        """
        if not self.available:
            return None

        normalized = normalize_text(phrase)
        token_vectors = [
            self.model[token]
            for token in map(singularize, normalized.split())
            if token in self.model
        ]
        if token_vectors:
            return np.mean(np.stack(token_vectors), axis=0)

        for key in (normalized.replace(" ", "_"), normalized):
            if key in self.model:
                return self.model[key]
        return None

    def nearest(self, query: str, candidates: Iterable[str], top_k: int = 3) -> List[SemanticHit]:
        """Return the *top_k* candidates most similar to *query*, best first.

        Candidates with no vector are skipped. An empty list is returned when
        the model is unavailable or *query* has no vector.
        """
        if not self.available:
            return []

        query_vec = self.vector_for(query)
        if query_vec is None:
            return []

        # The small epsilon guards against division by zero for null vectors.
        query_norm = np.linalg.norm(query_vec) + 1e-8
        hits: List[SemanticHit] = []
        for candidate in candidates:
            cand_vec = self.vector_for(candidate)
            if cand_vec is None:
                continue
            cosine = float(np.dot(query_vec, cand_vec) / (query_norm * (np.linalg.norm(cand_vec) + 1e-8)))
            hits.append(SemanticHit(term=candidate, score=cosine))

        return sorted(hits, key=lambda hit: hit.score, reverse=True)[:top_k]
services/text_utils.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from functools import lru_cache
5
+ from typing import Iterable, List
6
+
7
# Common filler, preparation and temperature words.
# NOTE(review): not referenced by the functions visible in this module;
# presumably consumed by other services - confirm before changing.
STOPWORDS = {
    "and", "or", "the", "a", "an", "some", "fresh", "dried", "chopped", "minced",
    "diced", "sliced", "grated", "ground", "cooked", "raw", "cold", "hot", "warm",
    "to", "taste", "optional", "plus", "more",
}

# Leading quantity. Alternatives are ordered most-specific first because the
# regex engine stops at the first alternative that lets the overall match
# succeed: with the plain-integer form first, "1/2 cup" would only lose its
# "1" and leave "/2 cup" behind.
#   1. mixed number   "1 1/2"
#   2. simple fraction "1/2"
#   3. integer/decimal "2", "2.5"
#   4. a single vulgar-fraction character
_AMOUNT_RE = re.compile(
    r"^(?:\d+\s+\d+\/\d+|\d+\/\d+|\d+(?:\.\d+)?|[¼½¾⅓⅔⅛⅜⅝⅞])\s*"
)

# Leading unit or size word followed by whitespace. Plural spellings of the
# listed units are spelled out so that e.g. "tablespoons olive oil" is
# stripped just like "tablespoon olive oil".
_MEASURE_RE = re.compile(
    r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tbsps|tablespoon|tablespoons"
    r"|tsp|tsps|teaspoon|teaspoons|clove|cloves|slice|slices|piece|pieces"
    r"|can|cans|bunch|bunches|handful|handfuls|pinch|pinches"
    r"|large|small|medium|whole)\s+"
)
17
+
18
+
19
def normalize_text(text: str) -> str:
    """Return a lowercase, ASCII-only, single-spaced form of *text*.

    Steps, in order:
      1. ``None``/empty input becomes ``""``; the text is lowercased and
         stripped.
      2. Accented letters are folded to their base letter ("crème" ->
         "creme"). Without this step the character filter below would turn
         every accented letter into a space and split the word in two.
      3. Curly quotes become a plain apostrophe.
      4. Anything that is not ``a-z``, ``0-9``, whitespace, ``-`` or ``'``
         is replaced by a space.
      5. Runs of whitespace collapse to one space.

    Args:
        text: Arbitrary input text; falsy values are treated as empty.

    Returns:
        The normalized string (possibly empty).
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    text = (text or "").lower().strip()

    # Canonical decomposition (NFD) separates base letters from combining
    # marks; dropping the marks folds diacritics. NFD, unlike NFKD, leaves
    # characters such as "½" intact, so they are still removed in step 4.
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = re.sub(r"[‘’“”]", "'", text)
    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
25
+
26
+
27
# Plurals in "-ves" whose singular really ends in "-f" or "-fe". Every other
# "-ves" word (olives, cloves, chives, ...) simply drops its final "s";
# rewriting those to "-f" produced non-words such as "olif" and "clof".
_F_STEM_PLURALS = {
    "calves": "calf",
    "elves": "elf",
    "halves": "half",
    "hooves": "hoof",
    "knives": "knife",
    "leaves": "leaf",
    "lives": "life",
    "loaves": "loaf",
    "scarves": "scarf",
    "selves": "self",
    "shelves": "shelf",
    "thieves": "thief",
    "wives": "wife",
    "wolves": "wolf",
}


@lru_cache(maxsize=4096)
def singularize(word: str) -> str:
    """Return a best-effort singular form of *word*.

    The input is normalized first. Strings of three characters or fewer are
    returned unchanged. The rules are applied to the last token only (text
    after the final space or hyphen), so "bay leaves" becomes "bay leaf"
    while "olives" is not mistaken for the irregular plural "lives".

    Rules, first match wins:
      * known "-ves" plurals map to their "-f"/"-fe" singular;
      * "-ies" -> "-y" (token longer than 4 characters);
      * "-oes" -> "-o" (token longer than 5 characters, e.g. "tomatoes");
      * "-ses", "-xes", "-zes", "-ches", "-shes" drop the "es";
      * a trailing "s" is dropped unless the token ends in "ss" or "us"
        ("hummus", "couscous" and "asparagus" are already singular).

    Args:
        word: A word or short phrase.

    Returns:
        The normalized, singularized text.
    """
    word = normalize_text(word)
    if len(word) <= 3:
        return word

    # Split off the last token; `cut` is 0 when there is no separator.
    cut = max(word.rfind(" "), word.rfind("-")) + 1
    prefix, last = word[:cut], word[cut:]

    if last in _F_STEM_PLURALS:
        return prefix + _F_STEM_PLURALS[last]
    if last.endswith("ies") and len(last) > 4:
        return prefix + last[:-3] + "y"
    # The length guard keeps short "-oe" nouns (shoes, toes, roes) on the
    # plain drop-the-"s" path below.
    if last.endswith("oes") and len(last) > 5:
        return prefix + last[:-2]
    if last.endswith(("ses", "xes", "zes", "ches", "shes")):
        return prefix + last[:-2]
    if last.endswith("s") and not last.endswith(("ss", "us")):
        return prefix + last[:-1]
    return word
41
+
42
+
43
def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Normalize *items* and drop empties and repeats, keeping first-seen order.

    Args:
        items: Strings to normalize and de-duplicate.

    Returns:
        The distinct, non-empty normalized strings in order of first
        appearance.
    """
    # A dict keeps insertion order, so its keys double as the ordered result.
    first_seen = {}
    for raw in items:
        key = normalize_text(raw)
        if key:
            first_seen.setdefault(key, None)
    return list(first_seen)
52
+
53
+
54
# Leading quantity as written in raw recipe text: mixed number ("1 1/2"),
# fraction ("1/2"), integer/decimal optionally followed by a vulgar fraction
# ("2", "1.5", "1½"), or a lone vulgar fraction. Most-specific form first,
# because the first alternative that matches wins.
_QUANTITY_PREFIX_RE = re.compile(
    r"^\s*(?:\d+\s+\d+\s*/\s*\d+"
    r"|\d+\s*/\s*\d+"
    r"|\d+(?:\.\d+)?\s*[¼½¾⅓⅔⅛⅜⅝⅞]?"
    r"|[¼½¾⅓⅔⅛⅜⅝⅞])\s*"
)


def strip_amounts_and_preps(text: str) -> str:
    """Reduce one recipe segment to its ingredient phrase.

    Removes, in order: parenthesised notes, a leading quantity, a leading
    unit/size word (via ``_MEASURE_RE``) and a leading "of".

    Parentheses and quantities are handled on the raw lowercase text,
    *before* ``normalize_text`` runs. Normalization replaces "/", "." and
    parentheses with spaces, so doing it first turned "1.5 cups flour" into
    "5 cups flour", "1/2 cup sugar" into "2 cup sugar", and made the
    parenthetical removal a no-op.

    Args:
        text: A single raw segment such as "1/2 cup sugar (sifted)";
            falsy values are treated as empty.

    Returns:
        The cleaned, normalized ingredient phrase (possibly empty).
    """
    raw = (text or "").lower()
    raw = re.sub(r"\(.*?\)", " ", raw)
    raw = _QUANTITY_PREFIX_RE.sub("", raw)

    cleaned = normalize_text(raw)
    cleaned = _MEASURE_RE.sub("", cleaned)
    cleaned = re.sub(r"^of\s+", "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
62
+
63
+
64
def tokenize_recipe_segments(text: str) -> List[str]:
    """Split recipe text into cleaned, de-duplicated ingredient segments.

    The text is split on commas, newlines, semicolons and a
    whitespace-delimited "and" (case-insensitive). Each piece goes through
    ``strip_amounts_and_preps``; results of one character or less are
    dropped.

    Args:
        text: Raw recipe text; falsy values are treated as empty.

    Returns:
        Distinct segments in order of first appearance.
    """
    pieces = re.split(r",|\n|;|\s+and\s+", text or "", flags=re.IGNORECASE)
    stripped = (strip_amounts_and_preps(piece) for piece in pieces)
    kept = [segment for segment in stripped if len(segment) > 1]
    return dedupe_preserve_order(kept)
73
+
74
+
75
def ingredient_variants(ingredient: str) -> List[str]:
    """List alternative spellings under which *ingredient* may be looked up.

    Candidates, in order: the normalized name; its singular form; for names
    ending in a known category word (" cheese", " oil", ...) the part before
    that word and its singular; and, for multi-word names, the first word,
    the last word, the first two words and everything after the first word.

    Args:
        ingredient: Ingredient name in any casing or spacing.

    Returns:
        The distinct, non-empty candidates in the order described above.
    """
    name = normalize_text(ingredient)
    candidates = [name]

    singular_name = singularize(name)
    if singular_name != name:
        candidates.append(singular_name)

    category_tails = (" cheese", " oil", " milk", " cream", " butter", " powder", " sauce", " paste", " extract")
    for tail in category_tails:
        # Skip names that lack the tail or would leave at most one
        # character once the tail is removed.
        if not name.endswith(tail) or len(name) <= len(tail) + 1:
            continue
        stem = name[:-len(tail)].strip()
        candidates.append(stem)
        singular_stem = singularize(stem)
        if singular_stem != stem:
            candidates.append(singular_stem)

    tokens = name.split()
    if len(tokens) > 1:
        candidates += [tokens[0], tokens[-1], " ".join(tokens[:2]), " ".join(tokens[1:])]

    return dedupe_preserve_order(candidates)
97
+
98
+
99
def as_aliases(aliases: str | float | None) -> List[str]:
    """Parse a ``|``-separated alias field into a clean list.

    Args:
        aliases: The raw field. Anything that is not a string (including
            ``None`` and floats) yields an empty list.

    Returns:
        The distinct, non-empty normalized aliases in their original order.
    """
    if isinstance(aliases, str):
        return dedupe_preserve_order(part.strip() for part in aliases.split("|"))
    return []
static/index.html ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>BiteWise API</title>
7
+ <style>
8
+ body { font-family: Arial, sans-serif; background: #f5f4f0; color: #1a1a1a; margin: 0; padding: 40px; }
9
+ .card { max-width: 720px; background: #fff; border: 1px solid #e8e6e0; border-radius: 16px; padding: 24px; }
10
+ code { background: #f7f7f7; padding: 2px 6px; border-radius: 6px; }
11
+ </style>
12
+ </head>
13
+ <body>
14
+ <div class="card">
15
+ <h1>BiteWise is running</h1>
16
+ <p>Use <code>POST /api/adapt</code> to adapt a recipe for vegan, keto, or both.</p>
17
+ <p>Open <code>/docs</code> to test the API.</p>
18
+ </div>
19
+ </body>
20
+ </html>