Spaces:
Sleeping
Sleeping
Upload 12 files
Browse files- Dockerfile +15 -0
- main.py +94 -0
- requirements.txt +9 -0
- services/__init__.py +0 -0
- services/classify.py +64 -0
- services/config.py +23 -0
- services/dataset.py +239 -0
- services/ner.py +74 -0
- services/recipe_service.py +268 -0
- services/semantic.py +107 -0
- services/text_utils.py +102 -0
- static/index.html +20 -0
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Python/pip runtime switches plus the app flag that enables downloading the
# semantic word-vector model (read by services/config.py).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    BITEWISE_ENABLE_SEMANTIC_DOWNLOAD=1

RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs Docker containers as UID 1000. Create that user and
# give it a real HOME so libraries that cache downloads under ~/ (model
# weights, word vectors) have a writable location at runtime.
RUN useradd --create-home --uid 1000 user
ENV HOME=/home/user

WORKDIR /app

# Install dependencies first so this layer is reused when only code changes.
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

COPY --chown=user:user . /app

USER user

EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any, Dict, Literal
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, HTTPException
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
from services.config import settings
|
| 12 |
+
from services.dataset import SubstitutionDatabase
|
| 13 |
+
from services.recipe_service import RecipeAdapterService
|
| 14 |
+
from services.semantic import WordVectorFallback
|
| 15 |
+
|
| 16 |
+
# ASGI application object; the Dockerfile starts it with ``uvicorn main:app``.
app = FastAPI(title="BiteWise API", version="2.0.0")

# CORS policy: the API is open to any origin. Credentials are disabled because
# a wildcard origin must not be combined with credentialed requests; nothing
# in this module reads cookies or Authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class AdaptRequest(BaseModel):
    """JSON body accepted by ``POST /api/adapt``."""

    # Raw recipe text; required, at least five characters long.
    recipe_text: str = Field(..., min_length=5)
    # Target diet for the adaptation; defaults to "vegan".
    diet: Literal["vegan", "keto", "both"] = "vegan"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Lazily created singletons; populated by the first get_service() call.
_db = _semantic = _service = None


def get_service() -> RecipeAdapterService:
    """Return the shared adapter service, constructing it on first use.

    The first call loads the substitution dataset and the word-vector
    fallback from ``settings`` and wires them into a ``RecipeAdapterService``;
    later calls return the same instance.
    """
    global _db, _semantic, _service

    if _service is None:
        _db = SubstitutionDatabase(settings.dataset_path)
        _semantic = WordVectorFallback(
            model_name=settings.semantic_model_name,
            model_path=settings.semantic_model_path,
            enable_download=settings.enable_semantic_download,
        )
        _service = RecipeAdapterService(db=_db, semantic=_semantic)

    return _service
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.get("/")
def root():
    """Serve the bundled front-end page, or a JSON status payload if missing."""
    # Anchor the lookup to this module's directory so the page is found
    # regardless of the working directory the server was started from.
    index = Path(__file__).resolve().parent / "static" / "index.html"
    # is_file() (rather than exists()) avoids handing a directory to FileResponse.
    if index.is_file():
        return FileResponse(index)
    return JSONResponse(
        {
            "name": "BiteWise API",
            "status": "running",
            "hint": "POST /api/adapt with {recipe_text, diet}",
        }
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@app.get("/health")
def health():
    """Liveness probe; always answers with ``{"ok": True}``."""
    status = {"ok": True}
    return status
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@app.get("/api/meta")
def meta():
    """Report the configured model names, semantic fallback state and dataset path."""
    semantic = get_service().semantic
    info = {
        "ner_model": settings.ner_model_name,
        "qa_model": settings.qa_model_name,
        "semantic_model": settings.semantic_model_name,
        "semantic_available": semantic.available,
        # Reads a private attribute of the fallback object.
        "semantic_mode": semantic._kind,
        "dataset_path": str(settings.dataset_path),
    }
    return info
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@app.post("/api/adapt")
def adapt(req: AdaptRequest) -> Dict[str, Any]:
    """Adapt the submitted recipe to the requested diet.

    Returns whatever ``RecipeAdapterService.adapt`` produces. Failures are
    mapped to HTTP errors: a missing dataset file becomes 500, a
    ``ValueError`` becomes 400, and anything else becomes a generic 500.
    The original exception is chained so it stays visible in server logs.
    """
    try:
        service = get_service()
        return service.adapt(req.recipe_text, req.diet)
    except HTTPException:
        # Already an HTTP error; do not rewrap it as "Unexpected error".
        raise
    except FileNotFoundError as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.110
|
| 2 |
+
uvicorn[standard]>=0.27
|
| 3 |
+
pydantic>=2.6
|
| 4 |
+
pandas>=2.1
|
| 5 |
+
numpy>=1.26
|
| 6 |
+
transformers>=4.41
|
| 7 |
+
torch>=2.2
|
| 8 |
+
gensim>=4.3
|
| 9 |
+
python-multipart>=0.0.9
|
services/__init__.py
ADDED
|
File without changes
|
services/classify.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
|
| 8 |
+
|
| 9 |
+
from .config import settings
|
| 10 |
+
from .text_utils import normalize_text
|
| 11 |
+
|
| 12 |
+
# The two labels classify_recipe() can return.
RecipeType = Literal["baked", "cooked"]

# Substrings whose presence counts towards the "baked" score.
BAKE_KEYWORDS = [
    "bake", "baking", "oven", "preheat",
    "flour", "dough", "batter",
    "cake", "cookie", "muffin", "bread", "pastry", "brownie", "tart",
    "pie", "scone", "loaf",
    "whisk", "fold in", "sift", "knead",
    "leavening", "baking soda", "baking powder", "yeast",
]

# Substrings whose presence counts towards the "cooked" score.
COOK_KEYWORDS = [
    "saute", "sauté", "fry", "boil", "simmer", "stir", "grill",
    "roast", "steam", "poach", "braise", "sear",
    "stove", "skillet", "pan", "wok",
    "sauce", "soup", "stew", "marinate",
]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@lru_cache(maxsize=1)
def get_qa_pipeline():
    """Build the question-answering pipeline once and reuse it afterwards.

    The checkpoint name comes from ``settings.qa_model_name``; the pipeline is
    placed on CUDA device 0 when available, otherwise on CPU (-1).
    """
    checkpoint = settings.qa_model_name
    qa_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=qa_tokenizer,
        device=device_index,
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def classify_recipe(recipe_text: str) -> RecipeType:
    """Label a recipe as ``"baked"`` or ``"cooked"``.

    The QA model is asked first; if its answer contains a baking or cooking
    signal, that decides the label. Otherwise keyword hits from
    ``BAKE_KEYWORDS`` / ``COOK_KEYWORDS`` are compared, and ties (including
    no hits at all) resolve to ``"cooked"``.
    """
    import logging

    answer = ""
    try:
        qa = get_qa_pipeline()
        result = qa(question="Is this recipe for baking or cooking?", context=recipe_text)
        answer = normalize_text(str(result.get("answer", "")))
    except Exception:
        # Best-effort: the model is optional for classification, so fall back
        # to keyword scoring, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "QA-based recipe classification failed; using keyword scoring.",
            exc_info=True,
        )

    if any(sig in answer for sig in ("bak", "oven", "pastry", "dough")):
        return "baked"
    if any(sig in answer for sig in ("cook", "fry", "boil", "saut", "grill", "stir")):
        return "cooked"

    # Keyword fallback, only computed when the QA answer was inconclusive.
    # NOTE(review): these are plain substring tests, so short keywords such as
    # "pan" or "pie" also hit inside longer words — confirm this is intended.
    text = normalize_text(recipe_text)
    bake_score = sum(1 for kw in BAKE_KEYWORDS if kw in text)
    cook_score = sum(1 for kw in COOK_KEYWORDS if kw in text)

    return "baked" if bake_score > cook_score else "cooked"
|
services/config.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean environment variable.

    Returns ``default`` when the variable is unset; otherwise the value is
    true for ``1``, ``true``, ``yes`` or ``on`` (case-insensitive, surrounding
    whitespace ignored) and false for anything else.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


@dataclass(frozen=True)
class Settings:
    """Immutable application settings.

    Every default is read from a ``BITEWISE_*`` environment variable once,
    when this module is imported.
    """

    # Location of the substitution CSV.
    dataset_path: Path = Path(os.getenv("BITEWISE_DATASET_PATH", "data/united_master_database_corrected.csv"))
    # Checkpoint names passed to the transformers loaders.
    ner_model_name: str = os.getenv("BITEWISE_NER_MODEL", "Dizex/InstaFoodRoBERTa-NER")
    qa_model_name: str = os.getenv(
        "BITEWISE_QA_MODEL",
        "bert-large-uncased-whole-word-masking-finetuned-squad",
    )
    # Word-vector fallback: model name, optional local path, download switch.
    semantic_model_name: str = os.getenv("BITEWISE_SEMANTIC_MODEL", "glove-wiki-gigaword-50")
    semantic_model_path: str = os.getenv("BITEWISE_SEMANTIC_PATH", "")
    enable_semantic_download: bool = _env_flag("BITEWISE_ENABLE_SEMANTIC_DOWNLOAD", True)
    # Numeric tuning knobs; a non-numeric environment value raises at import.
    max_ingredients: int = int(os.getenv("BITEWISE_MAX_INGREDIENTS", "48"))
    similarity_threshold: float = float(os.getenv("BITEWISE_SIM_THRESHOLD", "0.52"))


# Shared settings instance imported by the rest of the package.
settings = Settings()
|
services/dataset.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, List, Optional
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text
|
| 11 |
+
|
| 12 |
+
# Columns the CSV must provide; loading fails if any is missing.
REQUIRED_COLUMNS = [
    # identity
    "Ingredient", "Context", "Aliases", "Category",
    # diet compatibility flags
    "Is_Keto_Friendly", "Is_Vegan_Friendly",
    # substitution / instruction pairs, one per diet
    "Keto_Substitution", "Keto_Instruction",
    "Vegan_Substitution", "Vegan_Instruction",
    "Vegan_Keto_Substitution", "Vegan_Keto_Instruction",
]

# "Context" values preferred when the recipe is classified as baked.
BAKE_CONTEXTS = {
    "Baking & Desserts", "Baking (Binder)", "Baking (Leavening)",
    "Pastries", "Bagels", "Puddings", "Tiramisu",
}

# "Context" values preferred for every other recipe type.
COOK_CONTEXTS = {
    "Main Course & Heavy Cooking", "Sauces, Dips & Dressings",
    "Soups & Savory Liquids", "Cold Prep & Light Meals",
    "Pasta", "Lasagna", "Roast", "Stir-Fry", "Appetizer", "Indian",
    "Beverages", "Cheese Making", "Processed", "Technical & Additives",
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@dataclass(frozen=True)
class IngredientRow:
    """Immutable record for one line of the substitution dataset."""

    ingredient: str
    context: str
    aliases: List[str]
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    # Substitution / instruction pairs; None when the CSV cell is empty.
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @property
    def lookup_terms(self) -> List[str]:
        """Return the ingredient, its aliases and family synonyms, de-duplicated.

        When the normalized ingredient name contains one of the trigger
        substrings below, the matching synonym group is appended.
        """
        name = normalize_text(self.ingredient)

        # (trigger substrings, synonyms added when any trigger is present)
        expansions = (
            (
                ("egg",),
                [
                    "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                    "egg yolk", "egg yolks",
                ],
            ),
            (
                ("pancetta", "bacon"),
                ["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"],
            ),
            (
                ("chicken",),
                ["chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"],
            ),
            (("milk",), ["milk", "whole milk", "dairy milk"]),
            (("cheese",), ["cheese", "hard cheese", "soft cheese"]),
        )

        collected = [self.ingredient]
        collected += self.aliases
        for triggers, synonyms in expansions:
            if any(trigger in name for trigger in triggers):
                collected.extend(synonyms)

        return dedupe_preserve_order(collected)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class SubstitutionDatabase:
    """Load the substitution CSV and answer ingredient lookups against it.

    Rows are materialised once as ``IngredientRow`` objects. The per-recipe-type
    row subsets and semantic term lists are computed lazily and cached.
    """

    # String spellings treated as true in the diet-compatibility columns.
    _TRUE_STRINGS = frozenset({"true", "1", "yes", "y", "t"})

    def __init__(self, csv_path: str | Path):
        """Read ``csv_path`` and build the in-memory rows.

        Raises:
            FileNotFoundError: the CSV does not exist.
            ValueError: the CSV lacks one of ``REQUIRED_COLUMNS``.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, adding normalized helper columns."""
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )

        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")

        df = df.copy()
        # Rows without an ingredient name or context cannot be matched.
        df = df.dropna(subset=["Ingredient", "Context"])
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df.reset_index(drop=True)

    @staticmethod
    def _coerce_flag(value: object) -> bool:
        """Convert a CSV flag cell to ``bool``.

        Plain ``bool(value)`` is wrong for CSV data: a missing cell (NaN) and
        strings such as ``"False"`` or ``"No"`` are all truthy. Missing values
        are treated as false and strings are compared against known true
        spellings.
        """
        if isinstance(value, str):
            return value.strip().lower() in SubstitutionDatabase._TRUE_STRINGS
        if value is None or pd.isna(value):
            return False
        return bool(value)

    @staticmethod
    def _optional_text(value: object) -> Optional[str]:
        """Return ``str(value)``, or ``None`` for a missing (NaN/None) cell."""
        return None if pd.isna(value) else str(value)

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Convert one DataFrame row into an ``IngredientRow``."""
        text = self._optional_text
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=str(row.get("Category", "")),
            is_keto_friendly=self._coerce_flag(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=self._coerce_flag(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=text(row.get("Keto_Substitution")),
            keto_instruction=text(row.get("Keto_Instruction")),
            vegan_substitution=text(row.get("Vegan_Substitution")),
            vegan_instruction=text(row.get("Vegan_Instruction")),
            vegan_keto_substitution=text(row.get("Vegan_Keto_Substitution")),
            vegan_keto_instruction=text(row.get("Vegan_Keto_Instruction")),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return ``BAKE_CONTEXTS`` for ``"baked"``, otherwise ``COOK_CONTEXTS``."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return (and cache) the rows whose context suits ``recipe_type``."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [row for row in self.rows if row.context in contexts]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's lookup terms normalized, dropping empty results."""
        normalized = (normalize_text(term) for term in row.lookup_terms)
        return [term for term in normalized if term]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose ingredient or any lookup term equals ``query``."""
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows where ``query`` occurs as a whole word inside a term.

        Queries shorter than four characters are rejected to avoid noisy hits.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # Lookarounds instead of \b so the boundaries hold for any query text.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Sort rows best-first for ``query``.

        The sort key, compared in descending order, is: exact ingredient
        match, exact alias match, whether the row has aliases, and the length
        of the normalized ingredient name.
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find dataset rows for ``query``, ranked best-first.

        Four passes are tried in order and the first one with hits wins:
        exact match in preferred rows, exact match in all rows, partial match
        in preferred rows, partial match in all rows. Each pass tries every
        variant from ``ingredient_variants`` in order.
        """
        query = normalize_text(query)
        if not query:
            return []

        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)

        passes = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in passes:
            for candidate in variants:
                hits = matcher(pool, candidate)
                if hits:
                    return self._rank_rows(hits, candidate)

        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return (and cache) the de-duplicated lookup terms for ``recipe_type``.

        Uses the preferred rows, or every row when no preferred row exists.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for ``row`` under ``diet``.

        Returns ``(substitute, instruction, compliant)``. ``diet`` is
        ``"vegan"`` or ``"keto"``; any other value selects the combined
        vegan-and-keto columns.
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly

        # NOTE(review): a row with no usable substitution is reported as
        # compliant even when its flag says otherwise — confirm this is intended.
        if compatible or not sub or str(sub).strip().lower() in {"nan", "none"}:
            return row.ingredient, "Already compatible — no substitution needed.", True

        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False
|
services/ner.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
|
| 8 |
+
|
| 9 |
+
from .config import settings
|
| 10 |
+
from .text_utils import dedupe_preserve_order, strip_amounts_and_preps, tokenize_recipe_segments, normalize_text
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@lru_cache(maxsize=1)
def get_ner_pipeline():
    """Build the token-classification pipeline once and reuse it afterwards.

    The checkpoint name comes from ``settings.ner_model_name``; the pipeline is
    placed on CUDA device 0 when available, otherwise on CPU (-1).
    """
    checkpoint = settings.ner_model_name
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)

    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
        device=device_index,
    )
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _best_span_from_segment(segment: str) -> str:
    """Pick the most likely ingredient phrase from one recipe segment.

    Runs the NER pipeline on the segment and returns the longest normalized
    entity span. If the pipeline fails or yields no usable span, the segment
    with amounts and preparation words stripped is returned instead. An empty
    segment yields an empty string.
    """
    cleaned = (segment or "").strip()
    if not cleaned:
        return ""

    try:
        entities = get_ner_pipeline()(cleaned)
    except Exception:
        # Model unavailable or inference failed: use the rule-based fallback.
        entities = []

    connectives = {"and", "or", "with", "of"}
    candidates: List[str] = []
    for entity in entities:
        piece = normalize_text(cleaned[entity["start"] : entity["end"]].strip())
        if not piece or len(piece) < 2:
            continue
        if piece in connectives:
            continue
        candidates.append(piece)

    if not candidates:
        return strip_amounts_and_preps(cleaned)

    # The longest span wins because the model sometimes emits smaller
    # fragments for short inputs; on ties the earliest span is kept.
    return max(candidates, key=len)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
    """Extract up to ``max_items`` distinct ingredient names from a recipe.

    The text is split into segments; each segment contributes its best span
    with amounts and preparation words stripped. Candidates that are empty,
    shorter than two characters or bare connectives are skipped.

    Duplicates are dropped while collecting, so they no longer count against
    the ``max_items`` limit. A non-positive ``max_items`` yields an empty list.
    """
    segments = tokenize_recipe_segments(recipe_text)
    if not segments or max_items <= 0:
        return []

    connectives = {"and", "or", "with", "of"}
    seen = set()
    out: List[str] = []
    for segment in segments:
        candidate = strip_amounts_and_preps(_best_span_from_segment(segment))
        if not candidate or len(candidate) < 2:
            continue
        if candidate in connectives or candidate in seen:
            continue
        seen.add(candidate)
        out.append(candidate)
        if len(out) >= max_items:
            break

    # Kept for parity with the rest of the package, in case the shared helper
    # de-duplicates on something other than exact string equality.
    return dedupe_preserve_order(out)
|
services/recipe_service.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 5 |
+
|
| 6 |
+
from .classify import classify_recipe
|
| 7 |
+
from .config import settings
|
| 8 |
+
from .dataset import SubstitutionDatabase
|
| 9 |
+
from .ner import extract_ingredients
|
| 10 |
+
from .semantic import WordVectorFallback
|
| 11 |
+
from .text_utils import normalize_text, singularize
|
| 12 |
+
|
| 13 |
+
# Diets a recipe can be adapted to; "both" means vegan and keto together.
Diet = Literal["vegan", "keto", "both"]


# Seasonings, aromatics and baking basics listed as neutral.
NEUTRAL_OK = {
    # basics and oils
    "salt", "pepper", "black pepper", "white pepper", "water",
    "olive oil", "vegetable oil", "oil",
    # aromatics, acids and herbs
    "garlic", "onion", "lemon", "lime", "vinegar",
    "basil", "oregano", "thyme", "rosemary",
    # spices
    "cumin", "paprika", "turmeric", "ginger", "chili", "chilli",
    "coriander", "parsley",
    "bay leaf", "bay leaves", "nutmeg", "cinnamon", "cardamom",
    "cloves", "allspice",
    # flavourings and leaveners
    "saffron", "vanilla", "cocoa powder", "baking powder", "baking soda", "yeast",
}

# Substrings that mark an ingredient as meat, fish or another animal product.
VEGAN_ANIMAL_HINTS = {
    "pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage",
    "chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy",
    "gelatin", "lard",
}

# Substrings that mark an ingredient as dairy.
VEGAN_DAIRY_HINTS = {
    "milk", "cream", "butter", "cheese",
    "yogurt", "whey", "casein", "ghee",
}

# Substrings that mark an ingredient as egg.
VEGAN_EGG_HINTS = {
    "egg", "eggs",
    "whole egg", "whole eggs",
    "egg white", "egg whites",
    "egg yolk", "egg yolks",
}

# Substrings that mark an ingredient as carbohydrate-heavy.
KETO_CARB_HINTS = {
    "pasta", "spaghetti", "noodle", "noodles", "bread", "flour", "sugar", "rice",
    "potato", "potatoes", "corn", "oats", "beans", "bean",
    "honey", "syrup", "maple", "couscous",
}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@dataclass
class IngredientResult:
    """Outcome of adapting a single ingredient."""

    original: str
    normalized: str
    compliant: bool
    substitute: str
    instructions: str
    source: str
    matched_ingredient: Optional[str] = None
    confidence: float = 1.0
    notes: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the result.

        ``confidence`` is rounded to three decimals and ``notes`` is included
        only when it is non-empty.
        """
        payload: Dict[str, Any] = dict(
            original=self.original,
            normalized=self.normalized,
            compliant=self.compliant,
            substitute=self.substitute,
            instructions=self.instructions,
            source=self.source,
            matched_ingredient=self.matched_ingredient,
            confidence=round(float(self.confidence), 3),
        )
        if self.notes:
            payload["notes"] = self.notes
        return payload
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class RecipeAdapterService:
    """Resolve each extracted recipe ingredient to a diet-appropriate result.

    ``adapt`` tries, in order: a substitution-database lookup, keyword
    rules, word-vector similarity, and finally a manual-review placeholder.
    """

    def __init__(self, db: SubstitutionDatabase, semantic: WordVectorFallback):
        self.db = db
        self.semantic = semantic

    def _row_to_result(self, ingredient: str, row, diet: Diet) -> IngredientResult:
        """Wrap a database ``row`` as an ``IngredientResult`` for ``ingredient``."""
        substitute, instructions, compliant = self.db.pick_substitution(row, diet)
        common = {
            "original": ingredient,
            "normalized": normalize_text(ingredient),
            "source": "database match",
            "matched_ingredient": row.ingredient,
        }

        if not compliant:
            return IngredientResult(
                compliant=False,
                substitute=substitute,
                instructions=instructions or "",
                confidence=0.96,
                **common,
            )

        # The row already satisfies the diet: echo the ingredient back.
        return IngredientResult(
            compliant=True,
            substitute=ingredient,
            instructions="Already compatible — no substitution needed.",
            confidence=1.0,
            **common,
        )

    def _rule_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Pick a substitute from hard-coded keyword rules.

        Matching is plain substring search over the normalized ingredient
        (plus its singular form when that differs). Vegan rules are checked
        before keto rules, and the first matching rule wins. Returns
        ``None`` when no rule applies.
        """
        haystack = normalize_text(ingredient)
        singular_form = singularize(haystack)
        if singular_form != haystack:
            # Search the surface form and the singular form together.
            haystack = f"{haystack} {singular_form}"

        def mentions(keywords) -> bool:
            return any(keyword in haystack for keyword in keywords)

        def swap(substitute: str, instructions: str) -> IngredientResult:
            return IngredientResult(
                original=ingredient,
                normalized=normalize_text(ingredient),
                compliant=False,
                substitute=substitute,
                instructions=instructions,
                source="diet rule fallback",
                matched_ingredient=None,
                confidence=0.94,
            )

        combined = diet == "both"

        if diet in {"vegan", "both"}:
            if mentions(VEGAN_EGG_HINTS):
                if recipe_type == "baked":
                    return swap("Flax Egg", "1 tbsp ground flax + 3 tbsp water per egg. Rest 5 min before using.")
                if combined:
                    return swap("Silken Tofu", "Blend until smooth and use as a creamy egg-free binder.")
                return swap(
                    "Silken Tofu or Tofu Scramble",
                    "Blend silken tofu for sauces or use crumbled tofu for savory dishes.",
                )

            if mentions(VEGAN_ANIMAL_HINTS):
                cured_pork = ("pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage")
                other_meat = ("chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy")
                if mentions(cured_pork):
                    if combined:
                        return swap("Extra Firm Tofu", "Press tofu 30 min, cube and pan-fry until golden.")
                    return swap("Smoked Tofu or Tempeh Bacon", "Dice and pan-fry until crispy.")
                if mentions(other_meat):
                    if combined:
                        return swap("Extra Firm Tofu", "Use as a 1:1 savory protein substitute.")
                    return swap("Soy Curls or Extra Firm Tofu", "Use as a 1:1 meat substitute.")
                return swap("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

            if mentions(VEGAN_DAIRY_HINTS):
                # Order matters: the first keyword found decides the swap.
                dairy_rules = (
                    ("butter", "Vegan Butter", "Use 1:1 in place of butter."),
                    ("milk", "Unsweetened Plant Milk", "Use 1:1 in place of dairy milk."),
                    ("cream", "Cashew Cream", "Use as a rich dairy-free cream substitute."),
                    ("cheese", "Vegan Cheese", "Use a meltable vegan cheese or nutritional yeast blend."),
                )
                for keyword, substitute, instructions in dairy_rules:
                    if keyword in haystack:
                        return swap(substitute, instructions)
                return swap("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

        if diet in {"keto", "both"}:
            if mentions(KETO_CARB_HINTS):
                # Order matters: the first keyword group found decides the swap.
                carb_rules = (
                    (
                        ("spaghetti", "pasta", "noodle"),
                        "Zucchini Noodles or Shirataki Noodles",
                        "Use in a 1:1 swap for pasta-style dishes.",
                    ),
                    (("rice",), "Cauliflower Rice", "Use as a low-carb rice substitute."),
                    (("flour",), "Almond Flour", "Use a keto baking flour blend."),
                    (
                        ("sugar", "honey", "syrup"),
                        "Erythritol or Allulose",
                        "Use a keto-friendly sweetener to taste.",
                    ),
                    (("potato",), "Cauliflower", "Use roasted cauliflower or cauliflower mash."),
                )
                for keywords, substitute, instructions in carb_rules:
                    if mentions(keywords):
                        return swap(substitute, instructions)
                return swap(
                    "Low-carb alternative",
                    "Choose a keto-friendly substitute that matches the recipe context.",
                )

        return None

    def _semantic_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Match ``ingredient`` to a database term by word-vector similarity.

        Considers the five nearest database terms and uses the first one
        that meets ``settings.similarity_threshold`` and resolves to at
        least one database row. Returns ``None`` if the vector model is
        unavailable or nothing qualifies.
        """
        if not self.semantic.available:
            return None

        candidates = self.db.semantic_terms(recipe_type)
        if not candidates:
            return None

        hits = self.semantic.nearest(ingredient, candidates, top_k=5)
        strong_hits = (
            hit for hit in (hits or ())
            if not hit.score < settings.similarity_threshold
        )
        for hit in strong_hits:
            rows = self.db.find_rows(hit.term, recipe_type)
            if rows:
                result = self._row_to_result(ingredient, rows[0], diet)
                result.source = f"glove semantic match ({hit.term}, score={hit.score:.2f})"
                # Keep the reported confidence within [0.5, 0.95].
                result.confidence = max(0.5, min(0.95, hit.score))
                return result

        return None

    def _manual_review_result(self, ingredient: str) -> IngredientResult:
        """Build the last-resort result when every lookup has failed.

        Ingredients whose normalized text is in ``NEUTRAL_OK`` are reported
        as compliant; anything else is flagged for manual review.
        """
        text = normalize_text(ingredient)
        if text not in NEUTRAL_OK:
            return IngredientResult(
                original=ingredient,
                normalized=text,
                compliant=False,
                substitute=ingredient,
                instructions="No reliable substitution found — please review manually.",
                source="not in database",
                matched_ingredient=None,
                confidence=0.35,
                notes="No reliable database or semantic match found.",
            )

        return IngredientResult(
            original=ingredient,
            normalized=text,
            compliant=True,
            substitute=ingredient,
            instructions="Already compatible — no substitution needed.",
            source="known compatible",
            matched_ingredient=None,
            confidence=0.9,
        )

    def adapt(self, recipe_text: str, diet: Diet) -> Dict[str, Any]:
        """Adapt every ingredient found in ``recipe_text`` to ``diet``.

        Args:
            recipe_text: Free-form recipe text; surrounding whitespace is
                stripped before processing.
            diet: One of ``"vegan"``, ``"keto"`` or ``"both"``.

        Returns:
            A dict holding the diet, the classified recipe type, one
            serialized result per ingredient, summary counts, model
            metadata and a disclaimer.

        Raises:
            ValueError: If ``diet`` is not recognized or the stripped
                ``recipe_text`` is shorter than five characters.
        """
        if diet not in {"vegan", "keto", "both"}:
            raise ValueError("diet must be one of: vegan, keto, both")

        recipe_text = (recipe_text or "").strip()
        if len(recipe_text) < 5:
            raise ValueError("recipe_text is too short")

        recipe_type = classify_recipe(recipe_text)
        ingredients = extract_ingredients(recipe_text, max_items=settings.max_ingredients)

        results = []
        for ingredient in ingredients:
            rows = self.db.find_rows(ingredient, recipe_type)
            if rows:
                outcome = self._row_to_result(ingredient, rows[0], diet)
            else:
                outcome = None
                # Fallbacks are tried in order; the first non-None answer wins.
                for fallback in (self._rule_fallback, self._semantic_fallback):
                    outcome = fallback(ingredient, diet, recipe_type)
                    if outcome is not None:
                        break
                if outcome is None:
                    outcome = self._manual_review_result(ingredient)
            results.append(outcome)

        substitutions = sum(1 for outcome in results if not outcome.compliant)
        semantic_ready = self.semantic.available
        semantic_kind = self.semantic._kind

        return {
            "diet": diet,
            "recipe_type": recipe_type,
            "ingredients": [outcome.as_dict() for outcome in results],
            "ingredients_found": len(results),
            "substitution_count": substitutions,
            "model_metadata": {
                "ner_model": settings.ner_model_name,
                "qa_model": settings.qa_model_name,
                "semantic_model": settings.semantic_model_name,
                "semantic_available": semantic_ready,
                "semantic_mode": semantic_kind,
                # The word2vec_* keys repeat the semantic_* values.
                "word2vec_available": semantic_ready,
                "word2vec_mode": semantic_kind,
                "dataset_path": str(self.db.csv_path),
            },
            "disclaimer": (
                "This is an assistive recipe tool, not nutritional or allergen medical advice. "
                "Verify substitutions before cooking."
            ),
        }
|
services/semantic.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Iterable, List, Optional
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from .text_utils import normalize_text, singularize
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class SemanticHit:
    """One scored candidate returned by ``WordVectorFallback.nearest``."""

    # Candidate string exactly as it was supplied by the caller.
    term: str
    # Cosine similarity between the query vector and the candidate vector.
    score: float
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class WordVectorFallback:
    """Small word-vector (GloVe by default) semantic fallback.

    The model is optional so the app can still boot if the host blocks
    downloads or gensim is not installed. In that case ``available`` is
    ``False``, ``vector_for`` returns ``None`` and ``nearest`` returns an
    empty list.
    """

    def __init__(self, model_name: str = "glove-wiki-gigaword-50", model_path: str = "", enable_download: bool = True):
        """Try to load vectors from ``model_path``, then by download.

        Args:
            model_name: Name passed to ``gensim.downloader.load``.
            model_path: Optional local vectors file; tried before download.
            enable_download: Whether downloading ``model_name`` is allowed.
        """
        self.model = None
        # Describes where the vectors came from, or why there are none.
        self._kind = "disabled"
        self._model_name = model_name
        self._load(model_name=model_name, model_path=model_path, enable_download=enable_download)

    def _load(self, model_name: str, model_path: str, enable_download: bool) -> None:
        """Populate ``self.model`` and ``self._kind``; never raises.

        Loading is best-effort by design, so failures are logged rather
        than propagated.
        """
        import logging

        log = logging.getLogger(__name__)

        try:
            from gensim.models import KeyedVectors
            import gensim.downloader as api
        except Exception as exc:
            log.warning("gensim is not importable; semantic fallback unavailable: %s", exc)
            self.model = None
            self._kind = "unavailable"
            return

        if model_path:
            try:
                self.model = KeyedVectors.load(model_path, mmap="r")
                self._kind = f"local:{model_path}"
                return
            except Exception as native_exc:
                log.info("Native load of %s failed (%s); trying word2vec format", model_path, native_exc)
                try:
                    self.model = KeyedVectors.load_word2vec_format(model_path, binary=model_path.endswith(".bin"))
                    self._kind = f"local-vec:{model_path}"
                    return
                except Exception as vec_exc:
                    log.warning("Could not load word vectors from %s: %s", model_path, vec_exc)
                    self.model = None

        if enable_download:
            try:
                self.model = api.load(model_name)
                self._kind = model_name
            except Exception as exc:
                log.warning("Download of word vectors %s failed: %s", model_name, exc)
                self.model = None
                self._kind = "download-failed"
        else:
            self.model = None
            self._kind = "disabled"

    @property
    def available(self) -> bool:
        """Whether a vector model was loaded."""
        return self.model is not None

    def vector_for(self, phrase: str) -> Optional[np.ndarray]:
        """Return a vector for ``phrase``, or ``None`` if nothing matches.

        The phrase is normalized and split on whitespace, and the vectors
        of all in-vocabulary tokens are averaged. Each token is looked up
        by its singular form first and by its surface form second, so a
        token is not lost just because the heuristic singular is not a
        vocabulary word. If no token matches, the whole phrase is tried
        with underscores and then verbatim.
        """
        if not self.available:
            return None

        normalized = normalize_text(phrase)
        vectors = []
        for token in normalized.split():
            for key in (singularize(token), token):
                if key in self.model:
                    vectors.append(self.model[key])
                    break

        if vectors:
            return np.mean(np.stack(vectors), axis=0)

        phrase_key = normalized.replace(" ", "_")
        if phrase_key in self.model:
            return self.model[phrase_key]

        if normalized in self.model:
            return self.model[normalized]

        return None

    def nearest(self, query: str, candidates: Iterable[str], top_k: int = 3) -> List[SemanticHit]:
        """Rank ``candidates`` by cosine similarity to ``query``.

        Args:
            query: Phrase to compare against.
            candidates: Phrases to score; those with no vector are skipped.
            top_k: Maximum number of hits to return. Values of zero or
                below yield an empty list.

        Returns:
            Up to ``top_k`` hits, best score first. Empty when the model is
            unavailable or ``query`` has no vector.
        """
        if not self.available or top_k <= 0:
            return []

        qv = self.vector_for(query)
        if qv is None:
            return []

        scored: List[SemanticHit] = []
        # The small epsilon guards against division by zero for null vectors.
        qnorm = np.linalg.norm(qv) + 1e-8
        for candidate in candidates:
            cv = self.vector_for(candidate)
            if cv is None:
                continue
            score = float(np.dot(qv, cv) / (qnorm * (np.linalg.norm(cv) + 1e-8)))
            scored.append(SemanticHit(term=candidate, score=score))

        scored.sort(key=lambda hit: hit.score, reverse=True)
        return scored[:top_k]
|
services/text_utils.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
from typing import Iterable, List
|
| 6 |
+
|
| 7 |
+
# Filler and preparation words that carry no ingredient identity.
# NOTE(review): not referenced by the functions visible in this module;
# presumably consumed by the ingredient extractor — confirm.
STOPWORDS = {
    "and", "or", "the", "a", "an", "some", "fresh", "dried", "chopped", "minced",
    "diced", "sliced", "grated", "ground", "cooked", "raw", "cold", "hot", "warm",
    "to", "taste", "optional", "plus", "more",
}

# Leading quantity. The alternatives run from most to least specific so a
# bare integer cannot shadow a fraction:
#   "1 1/2" (mixed number), "1/2" (fraction), "2", "2.5", "2½", "½".
_AMOUNT_RE = re.compile(
    r"^(?:\d+\s+\d+/\d+|\d+/\d+|\d+(?:\.\d+)?\s*[¼½¾⅓⅔⅛⅜⅝⅞]?|[¼½¾⅓⅔⅛⅜⅝⅞])\s*"
)

# Leading unit or size word, which must be followed by whitespace so that a
# bare ingredient such as "cloves" is left alone. Plural spellings are
# listed explicitly because the pattern does no stemming.
_MEASURE_RE = re.compile(
    r"^(?:g|kg|mg|ml|l|oz|ounce|ounces|lb|lbs|cup|cups|tbsp|tbsps|tablespoon|tablespoons"
    r"|tsp|tsps|teaspoon|teaspoons|clove|cloves|slice|slices|piece|pieces|can|cans"
    r"|bunch|bunches|handful|handfuls|pinch|pinches|large|small|medium|whole)\s+"
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def normalize_text(text: str) -> str:
    """Lower-case ``text`` and reduce it to a compact ASCII form.

    Accented letters are folded to their base letter ("crème" becomes
    "creme") instead of being replaced by a space, which used to split such
    words in two. Curly quotes become ``'``. Every run of characters other
    than ``a-z``, ``0-9``, whitespace, ``-`` and ``'`` becomes a single
    space, and whitespace is collapsed and trimmed.

    Args:
        text: Any string; ``None`` or empty input yields ``""``.

    Returns:
        The normalized string.
    """
    # Local import keeps this function self-contained.
    import unicodedata

    text = (text or "").lower().strip()
    # Canonical decomposition splits "é" into "e" plus a combining accent,
    # which is then dropped. NFD is used rather than NFKD so that fraction
    # glyphs such as "½" are not expanded into digits.
    text = "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if not unicodedata.combining(ch)
    )
    text = re.sub(r"[‘’“”]", "'", text)
    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
    return re.sub(r"\s+", " ", text).strip()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Plurals whose singular cannot be derived by the suffix rules below.
_IRREGULAR_SINGULARS = {
    "leaves": "leaf", "loaves": "loaf", "halves": "half",
    "calves": "calf", "knives": "knife",
    "cookies": "cookie", "brownies": "brownie", "blondies": "blondie",
    "smoothies": "smoothie", "veggies": "veggie",
    "quiches": "quiche", "brioches": "brioche", "ganaches": "ganache",
}

# Words that end in "s" but are already singular.
_UNINFLECTED = frozenset({"molasses", "schnapps"})


def _singularize_token(token: str) -> str:
    """Return a heuristic singular for one whitespace-free token."""
    if len(token) <= 3 or token in _UNINFLECTED:
        return token
    if token in _IRREGULAR_SINGULARS:
        return _IRREGULAR_SINGULARS[token]
    # "glass", "hummus", "asparagus", "couscous" are not plurals.
    if token.endswith(("ss", "us")):
        return token
    if token.endswith("ies") and len(token) > 4:
        return token[:-3] + "y"
    # "tomatoes" -> "tomato"; short words such as "shoes" take the plain
    # "-s" rule further down instead.
    if token.endswith("oes") and len(token) > 5:
        return token[:-2]
    if token.endswith(("sses", "xes", "zzes", "tzes", "ches", "shes")):
        return token[:-2]
    # Everything else, including "olives", "cheeses" and "glazes", simply
    # drops the final "s".
    if token.endswith("s"):
        return token[:-1]
    return token


@lru_cache(maxsize=4096)
def singularize(word: str) -> str:
    """Return a heuristic singular form of ``word``.

    The input is normalized first. For a multi-word phrase only the last
    word is singularized. Compared with blind suffix stripping this avoids
    producing non-words: "olives" gives "olive", "leaves" gives "leaf",
    "tomatoes" gives "tomato", "cheeses" gives "cheese", and "hummus" is
    left unchanged.

    Args:
        word: A word or phrase.

    Returns:
        The normalized text with its final word singularized.
    """
    word = normalize_text(word)
    head, space, last = word.rpartition(" ")
    return f"{head}{space}{_singularize_token(last)}"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Normalize ``items`` and drop blanks and repeats, keeping first-seen order.

    Items are compared after ``normalize_text``, and the returned strings
    are the normalized forms.
    """
    # A dict keeps insertion order, so its keys act as an ordered set.
    unique = {}
    for raw in items:
        key = normalize_text(raw)
        if key:
            unique.setdefault(key, None)
    return list(unique)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def strip_amounts_and_preps(text: str) -> str:
    """Remove a leading quantity, unit and parenthetical notes from ``text``.

    Stripping runs in two passes. The first works on the lower-cased raw
    text, while ``.``, ``/``, fraction glyphs and parentheses still exist,
    so "1.5 cups flour" and "salt (optional)" are handled. The second runs
    after ``normalize_text`` has removed leading bullets and punctuation,
    which is where a plain "2 cups flour" behind a bullet gets stripped.

    Args:
        text: One ingredient segment; ``None`` is treated as empty.

    Returns:
        The normalized ingredient text without its amount or unit.
    """
    raw = (text or "").lower()
    # Drop parentheticals before normalization turns the brackets into spaces.
    raw = re.sub(r"\(.*?\)", " ", raw).strip()
    raw = _AMOUNT_RE.sub("", raw)

    cleaned = normalize_text(raw)
    # Second pass: picks up an amount that was hidden behind a bullet, or
    # the denominator left over from a fraction.
    cleaned = _AMOUNT_RE.sub("", cleaned)
    cleaned = _MEASURE_RE.sub("", cleaned)
    cleaned = re.sub(r"^of\s+", "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def tokenize_recipe_segments(text: str) -> List[str]:
    """Split recipe text into cleaned, de-duplicated ingredient segments.

    Segments are separated by commas, newlines, semicolons or a
    whitespace-delimited "and" (case-insensitive). Each segment goes
    through ``strip_amounts_and_preps``; results of one character or less
    are discarded.
    """
    pieces = re.split(r",|\n|;|\s+and\s+", text or "", flags=re.IGNORECASE)
    stripped = (strip_amounts_and_preps(piece) for piece in pieces)
    kept = [segment for segment in stripped if segment and len(segment) > 1]
    return dedupe_preserve_order(kept)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def ingredient_variants(ingredient: str) -> List[str]:
    """List lookup variants of ``ingredient``, most specific first.

    The variants are the normalized name, its singular form, the name with
    a trailing category word (such as " cheese" or " oil") removed, and for
    multi-word names the first word, last word, first two words and
    everything after the first word. Duplicates and blanks are removed.
    """
    name = normalize_text(ingredient)
    candidates = [name]

    singular = singularize(name)
    if singular != name:
        candidates.append(singular)

    category_suffixes = (
        " cheese", " oil", " milk", " cream", " butter",
        " powder", " sauce", " paste", " extract",
    )
    for suffix in category_suffixes:
        # Require more than one character in front of the suffix.
        if not name.endswith(suffix) or len(name) <= len(suffix) + 1:
            continue
        stem = name[:-len(suffix)].strip()
        candidates.append(stem)
        stem_singular = singularize(stem)
        if stem_singular != stem:
            candidates.append(stem_singular)

    tokens = name.split()
    if len(tokens) > 1:
        candidates += [
            tokens[0],
            tokens[-1],
            " ".join(tokens[:2]),
            " ".join(tokens[1:]),
        ]

    return dedupe_preserve_order(candidates)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def as_aliases(aliases: str | float | None) -> List[str]:
    """Split a ``|``-separated alias string into normalized, unique aliases.

    Any non-string input, including ``None`` and floats, yields an empty
    list.
    """
    if not isinstance(aliases, str):
        return []
    return dedupe_preserve_order(map(str.strip, aliases.split("|")))
|
static/index.html
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>BiteWise API</title>
    <style>
      /* Page background and base typography. */
      body {
        font-family: Arial, sans-serif;
        background: #f5f4f0;
        color: #1a1a1a;
        margin: 0;
        padding: 40px;
      }
      /* White panel that holds the status message. */
      .card {
        max-width: 720px;
        background: #fff;
        border: 1px solid #e8e6e0;
        border-radius: 16px;
        padding: 24px;
      }
      /* Inline endpoint names. */
      code {
        background: #f7f7f7;
        padding: 2px 6px;
        border-radius: 6px;
      }
    </style>
  </head>
  <body>
    <!-- Static landing page: confirms the service is up and points to the API. -->
    <div class="card">
      <h1>BiteWise is running</h1>
      <p>Use <code>POST /api/adapt</code> to adapt a recipe for vegan, keto, or both.</p>
      <p>Open <code>/docs</code> to test the API.</p>
    </div>
  </body>
</html>
|