BiteWiseFinal

Sleeping

App Files Files Community

anaygupta committed on May 10

Commit

9373226

verified ·

1 Parent(s): 62fcf17

Upload 12 files

Browse files

Files changed (12) hide show

Dockerfile +15 -0
main.py +94 -0
requirements.txt +9 -0
services/__init__.py +0 -0
services/classify.py +64 -0
services/config.py +23 -0
services/dataset.py +239 -0
services/ner.py +74 -0
services/recipe_service.py +268 -0
services/semantic.py +107 -0
services/text_utils.py +102 -0
static/index.html +20 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,15 @@

# Container image for the BiteWise FastAPI service.
FROM python:3.11-slim

# Skip .pyc files, keep stdout/stderr unbuffered, disable pip's cache, and
# set the flag that services/config.py reads to allow the semantic model download.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    BITEWISE_ENABLE_SEMANTIC_DOWNLOAD=1

WORKDIR /app

# Install git, then drop the apt package lists to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends git \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency list first so this layer is cached across code changes.
COPY requirements.txt /app/requirements.txt
RUN pip install -r requirements.txt

COPY . /app

EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from __future__ import annotations
+from pathlib import Path
+from typing import Any, Dict, Literal
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, JSONResponse
+from pydantic import BaseModel, Field
+from services.config import settings
+from services.dataset import SubstitutionDatabase
+from services.recipe_service import RecipeAdapterService
+from services.semantic import WordVectorFallback
# ASGI application object served by uvicorn ("main:app").
app = FastAPI(title="BiteWise API", version="2.0.0")

# CORS: the API is open to every origin, method and header.
# Credentialed requests are deliberately disabled: a wildcard origin must not
# be combined with allow_credentials=True (the browser CORS rules and the
# FastAPI documentation both require explicit origins in that case).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
class AdaptRequest(BaseModel):
    """Request body accepted by ``POST /api/adapt``."""

    # Raw recipe text; required, at least five characters long.
    recipe_text: str = Field(..., min_length=5)
    # Target diet; defaults to "vegan" when omitted.
    diet: Literal["vegan", "keto", "both"] = "vegan"
# Lazily created singletons; populated on the first call to get_service().
_db = None
_semantic = None
_service = None


def get_service() -> RecipeAdapterService:
    """Return the shared ``RecipeAdapterService``, creating it on first use.

    The first call builds the substitution database and the word-vector
    fallback from ``settings`` and stores them in module globals; later
    calls return the cached service unchanged.
    """
    global _db, _semantic, _service
    if _service is None:
        _db = SubstitutionDatabase(settings.dataset_path)
        _semantic = WordVectorFallback(
            model_name=settings.semantic_model_name,
            model_path=settings.semantic_model_path,
            enable_download=settings.enable_semantic_download,
        )
        _service = RecipeAdapterService(db=_db, semantic=_semantic)
    return _service
@app.get("/")
def root():
    """Serve the bundled landing page, or a JSON status document without one.

    The page is looked up next to this module rather than relative to the
    current working directory, so the route behaves the same no matter where
    the server process was started from.
    """
    index = Path(__file__).resolve().parent / "static" / "index.html"
    # is_file() (not exists()) so a directory of that name is never served.
    if index.is_file():
        return FileResponse(index)
    return JSONResponse(
        {
            "name": "BiteWise API",
            "status": "running",
            "hint": "POST /api/adapt with {recipe_text, diet}",
        }
    )
@app.get("/health")
def health():
    """Liveness probe; always answers ``{"ok": True}``."""
    status = {"ok": True}
    return status
@app.get("/api/meta")
def meta():
    """Report the configured model names and the semantic fallback's state.

    Calling this endpoint builds the service if it has not been built yet.
    """
    semantic = get_service().semantic
    return dict(
        ner_model=settings.ner_model_name,
        qa_model=settings.qa_model_name,
        semantic_model=settings.semantic_model_name,
        semantic_available=semantic.available,
        semantic_mode=semantic._kind,
        dataset_path=str(settings.dataset_path),
    )
@app.post("/api/adapt")
def adapt(req: AdaptRequest) -> Dict[str, Any]:
    """Adapt a recipe to the requested diet.

    Args:
        req: Validated request carrying ``recipe_text`` and ``diet``.

    Returns:
        The dictionary produced by ``RecipeAdapterService.adapt``.

    Raises:
        HTTPException: 400 when the service rejects the input with
            ``ValueError``; 500 when the dataset file is missing or any other
            error occurs. The original exception is chained as ``__cause__``
            so server-side tracebacks keep the root cause.
    """
    try:
        service = get_service()
        return service.adapt(req.recipe_text, req.diet)
    except FileNotFoundError as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi>=0.110
+uvicorn[standard]>=0.27
+pydantic>=2.6
+pandas>=2.1
+numpy>=1.26
+transformers>=4.41
+torch>=2.2
+gensim>=4.3
+python-multipart>=0.0.9

services/__init__.py ADDED Viewed

File without changes

services/classify.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from __future__ import annotations
+from functools import lru_cache
+from typing import Literal
+import torch
+from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
+from .config import settings
+from .text_utils import normalize_text
# The two recipe classes the classifier can return.
RecipeType = Literal["baked", "cooked"]

# Substrings whose presence in the normalized recipe text counts toward "baked".
BAKE_KEYWORDS = [
    # techniques and equipment
    "bake", "baking", "oven", "preheat",
    # doughs and batters
    "flour", "dough", "batter",
    # typical baked goods
    "cake", "cookie", "muffin", "bread", "pastry", "brownie", "tart",
    "pie", "scone", "loaf",
    # preparation verbs
    "whisk", "fold in", "sift", "knead",
    # leavening agents
    "leavening", "baking soda", "baking powder", "yeast",
]

# Substrings whose presence in the normalized recipe text counts toward "cooked".
COOK_KEYWORDS = [
    # stove-top and grill techniques (both spellings of saute are listed)
    "saute", "sauté", "fry", "boil", "simmer", "stir", "grill",
    "roast", "steam", "poach", "braise", "sear",
    # equipment
    "stove", "skillet", "pan", "wok",
    # typical dishes and preparation
    "sauce", "soup", "stew", "marinate",
]
@lru_cache(maxsize=1)
def get_qa_pipeline():
    """Build the question-answering pipeline once and cache it.

    The tokenizer and model are loaded from ``settings.qa_model_name``; the
    pipeline runs on CUDA device 0 when available, otherwise on CPU (-1).
    """
    name = settings.qa_model_name
    qa_tokenizer = AutoTokenizer.from_pretrained(name)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(name)
    return pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=qa_tokenizer,
        device=0 if torch.cuda.is_available() else -1,
    )
def classify_recipe(recipe_text: str) -> RecipeType:
    """Classify a recipe as ``"baked"`` or ``"cooked"``.

    The QA model's answer is consulted first; when it carries no recognised
    signal, keyword counts over the normalized text decide. Ties and empty
    evidence resolve to ``"cooked"``.

    Note that keywords are matched as plain substrings, so e.g. "pan" also
    counts inside "pancetta".
    """
    text = normalize_text(recipe_text)
    bake_score = sum(kw in text for kw in BAKE_KEYWORDS)
    cook_score = sum(kw in text for kw in COOK_KEYWORDS)

    # Best effort: any failure of the QA model just leaves the answer empty.
    try:
        result = get_qa_pipeline()(
            question="Is this recipe for baking or cooking?",
            context=recipe_text,
        )
        answer = normalize_text(str(result.get("answer", "")))
    except Exception:
        answer = ""

    bake_signals = ("bak", "oven", "pastry", "dough")
    cook_signals = ("cook", "fry", "boil", "saut", "grill", "stir")
    for label, signals in (("baked", bake_signals), ("cooked", cook_signals)):
        if any(sig in answer for sig in signals):
            return label

    # Keyword vote; "cooked" wins both ties and cook-majorities.
    return "baked" if bake_score > cook_score else "cooked"

services/config.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import os
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration.

    Every default is read from a ``BITEWISE_*`` environment variable once,
    when this class body is executed.
    """

    # Location of the substitution CSV.
    dataset_path: Path = Path(
        os.environ.get("BITEWISE_DATASET_PATH", "data/united_master_database_corrected.csv")
    )
    # Hugging Face model ids for ingredient NER and recipe-type QA.
    ner_model_name: str = os.environ.get("BITEWISE_NER_MODEL", "Dizex/InstaFoodRoBERTa-NER")
    qa_model_name: str = os.environ.get(
        "BITEWISE_QA_MODEL",
        "bert-large-uncased-whole-word-masking-finetuned-squad",
    )
    # Word-vector model used by the semantic fallback, plus an optional local path.
    semantic_model_name: str = os.environ.get("BITEWISE_SEMANTIC_MODEL", "glove-wiki-gigaword-50")
    semantic_model_path: str = os.environ.get("BITEWISE_SEMANTIC_PATH", "")
    # Only the exact value "1" enables the download.
    enable_semantic_download: bool = (
        os.environ.get("BITEWISE_ENABLE_SEMANTIC_DOWNLOAD", "1") == "1"
    )
    # Cap on extracted ingredients and minimum similarity for semantic hits.
    max_ingredients: int = int(os.environ.get("BITEWISE_MAX_INGREDIENTS", "48"))
    similarity_threshold: float = float(os.environ.get("BITEWISE_SIM_THRESHOLD", "0.52"))


# Shared configuration instance imported by the rest of the package.
settings = Settings()

services/dataset.py ADDED Viewed

	@@ -0,0 +1,239 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional
+import re
+import pandas as pd
+from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text
# Columns the CSV must provide; loading fails when any is missing.
REQUIRED_COLUMNS = [
    # identity
    "Ingredient", "Context", "Aliases", "Category",
    # diet flags
    "Is_Keto_Friendly", "Is_Vegan_Friendly",
    # per-diet substitution and instruction pairs
    "Keto_Substitution", "Keto_Instruction",
    "Vegan_Substitution", "Vegan_Instruction",
    "Vegan_Keto_Substitution", "Vegan_Keto_Instruction",
]

# "Context" values preferred when the recipe was classified as baked.
BAKE_CONTEXTS = {
    "Baking & Desserts", "Baking (Binder)", "Baking (Leavening)",
    "Pastries", "Bagels", "Puddings", "Tiramisu",
}

# "Context" values preferred for every other recipe type.
COOK_CONTEXTS = {
    "Main Course & Heavy Cooking", "Sauces, Dips & Dressings",
    "Soups & Savory Liquids", "Cold Prep & Light Meals",
    "Pasta", "Lasagna", "Roast", "Stir-Fry", "Appetizer", "Indian",
    "Beverages", "Cheese Making", "Processed", "Technical & Additives",
}
@dataclass(frozen=True)
class IngredientRow:
    """One record of the substitution dataset."""

    ingredient: str
    context: str
    aliases: List[str]
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @staticmethod
    def _mentions(text: str, *words: str) -> bool:
        """Return True when *text* contains one of *words* as a whole word.

        A trailing ``s``/``es`` is tolerated so plurals still match, but a
        word embedded in a longer one does not ("egg" is not found in
        "eggplant", "milk" is not found in "buttermilk").
        """
        alternatives = "|".join(re.escape(word) for word in words)
        pattern = rf"(?<!\w)(?:{alternatives})(?:e?s)?(?!\w)"
        return re.search(pattern, text) is not None

    @property
    def lookup_terms(self) -> List[str]:
        """Return every term this row should answer to, without duplicates.

        Starts from the ingredient name and its aliases, then adds generic
        synonyms for a few ingredient families when the normalized ingredient
        name mentions that family as a whole word.
        """
        terms = [self.ingredient, *self.aliases]
        ing = normalize_text(self.ingredient)
        mentions = self._mentions
        if mentions(ing, "egg"):
            terms.extend([
                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                "egg yolk", "egg yolks",
            ])
        if mentions(ing, "pancetta", "bacon"):
            terms.extend(["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"])
        if mentions(ing, "chicken"):
            terms.extend(["chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"])
        if mentions(ing, "milk"):
            terms.extend(["milk", "whole milk", "dairy milk"])
        if mentions(ing, "cheese"):
            terms.extend(["cheese", "hard cheese", "soft cheese"])
        return dedupe_preserve_order(terms)
class SubstitutionDatabase:
    """In-memory view of the substitution CSV with lookup helpers."""

    def __init__(self, csv_path: str | Path):
        """Load the CSV at *csv_path* and materialise one row object per record.

        Raises:
            FileNotFoundError: The CSV does not exist.
            ValueError: A required column is missing.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        # Both caches are keyed by recipe type ("baked" / "cooked").
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, returning a cleaned data frame."""
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )
        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")
        df = df.copy()
        # Rows without an ingredient name or context cannot be looked up.
        df = df.dropna(subset=["Ingredient", "Context"])
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df.reset_index(drop=True)

    @staticmethod
    def _as_bool(value) -> bool:
        """Interpret a CSV flag cell as a boolean.

        Missing values (``None``/NaN) are False. Strings are True only for
        "1", "true", "t", "yes", "y" (case-insensitive). Anything else uses
        its normal truthiness. Plain ``bool()`` is not used directly because
        ``bool(float("nan"))`` and ``bool("No")`` are both True.
        """
        if value is None:
            return False
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "t", "yes", "y"}
        try:
            if pd.isna(value):
                return False
        except (TypeError, ValueError):
            # Non-scalar input: fall through to ordinary truthiness.
            pass
        return bool(value)

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Convert one data-frame row into an ``IngredientRow``."""

        def optional_text(column: str) -> Optional[str]:
            # Missing cells become None; everything else is stringified.
            value = row.get(column)
            return None if pd.isna(value) else str(value)

        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=str(row.get("Category", "")),
            is_keto_friendly=self._as_bool(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=self._as_bool(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=optional_text("Keto_Substitution"),
            keto_instruction=optional_text("Keto_Instruction"),
            vegan_substitution=optional_text("Vegan_Substitution"),
            vegan_instruction=optional_text("Vegan_Instruction"),
            vegan_keto_substitution=optional_text("Vegan_Keto_Substitution"),
            vegan_keto_instruction=optional_text("Vegan_Keto_Instruction"),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return the context names preferred for *recipe_type*."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return (and cache) the rows whose context suits *recipe_type*."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [row for row in self.rows if row.context in contexts]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's lookup terms, normalized, with empty results dropped."""
        normalized = (normalize_text(term) for term in row.lookup_terms)
        return [term for term in normalized if term]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose ingredient name or any lookup term equals *query*."""
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows where *query* appears as a whole word inside a term.

        Queries shorter than four characters are ignored.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Order *rows* best-first for *query*.

        Priority: exact ingredient match, exact alias match, rows that have
        aliases at all, then longer ingredient names.
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find rows for *query*, ranked best-first.

        Four passes run in order and the first one that yields anything
        wins: exact match in preferred rows, exact match in all rows, partial
        match in preferred rows, partial match in all rows. Each pass tries
        every variant of the query in turn.
        """
        query = normalize_text(query)
        if not query:
            return []
        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)
        passes = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in passes:
            for candidate in variants:
                found = matcher(pool, candidate)
                if found:
                    return self._rank_rows(found, candidate)
        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return (and cache) the de-duplicated lookup terms for *recipe_type*.

        Falls back to every row when no row matches the preferred contexts.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for *row* under *diet*.

        Returns:
            ``(substitute, instructions, compliant)``. When the row is
            already compatible, or has no usable substitution, the ingredient
            itself is returned with ``compliant=True``.
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly
        # NOTE(review): an incompatible row without a substitution is also
        # reported as compliant here — confirm that this is intended.
        if compatible or not sub or str(sub).strip().lower() in {"nan", "none"}:
            return row.ingredient, "Already compatible — no substitution needed.", True
        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False

services/ner.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from __future__ import annotations
+from functools import lru_cache
+from typing import List
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+from .config import settings
+from .text_utils import dedupe_preserve_order, strip_amounts_and_preps, tokenize_recipe_segments, normalize_text
@lru_cache(maxsize=1)
def get_ner_pipeline():
    """Build the token-classification (NER) pipeline once and cache it.

    The tokenizer and model come from ``settings.ner_model_name``; entity
    pieces are merged with the "simple" aggregation strategy. CUDA device 0
    is used when available, otherwise the CPU (-1).
    """
    name = settings.ner_model_name
    ner_tokenizer = AutoTokenizer.from_pretrained(name)
    ner_model = AutoModelForTokenClassification.from_pretrained(name)
    return pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
        device=0 if torch.cuda.is_available() else -1,
    )
+def _best_span_from_segment(segment: str) -> str:
+    segment = (segment or "").strip()
+    if not segment:
+        return ""
+    try:
+        pipe = get_ner_pipeline()
+        ents = pipe(segment)
+    except Exception:
+        ents = []
+    spans: List[str] = []
+    for ent in ents:
+        text = segment[ent["start"] : ent["end"]].strip()
+        text = normalize_text(text)
+        if not text or len(text) < 2:
+            continue
+        if text in {"and", "or", "with", "of"}:
+            continue
+        spans.append(text)
+    if spans:
+        # Prefer the longest span because ingredient models sometimes emit
+        # smaller fragments when the input chunk is short.
+        spans.sort(key=len, reverse=True)
+        return spans[0]
+    return strip_amounts_and_preps(segment)
def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
    """Extract a de-duplicated list of ingredient names from *recipe_text*.

    The text is split into segments and one candidate is taken per segment.
    Collection stops once *max_items* candidates have been gathered, which
    happens before duplicates are removed.
    """
    segments = tokenize_recipe_segments(recipe_text)
    if not segments:
        return []

    connectors = {"and", "or", "with", "of"}
    collected: List[str] = []
    for segment in segments:
        candidate = strip_amounts_and_preps(_best_span_from_segment(segment))
        usable = bool(candidate) and len(candidate) >= 2 and candidate not in connectors
        if not usable:
            continue
        collected.append(candidate)
        if len(collected) >= max_items:
            break
    return dedupe_preserve_order(collected)

services/recipe_service.py ADDED Viewed

	@@ -0,0 +1,268 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Optional
+from .classify import classify_recipe
+from .config import settings
+from .dataset import SubstitutionDatabase
+from .ner import extract_ingredients
+from .semantic import WordVectorFallback
+from .text_utils import normalize_text, singularize
# Diets the adapter understands.
Diet = Literal["vegan", "keto", "both"]

# Ingredients reported as compatible when no other match is found.
NEUTRAL_OK = {
    # basics and fats
    "salt", "pepper", "black pepper", "white pepper", "water",
    "olive oil", "vegetable oil", "oil",
    # aromatics and acids
    "garlic", "onion", "lemon", "lime", "vinegar",
    # herbs
    "basil", "oregano", "thyme", "rosemary", "coriander", "parsley",
    "bay leaf", "bay leaves",
    # spices
    "cumin", "paprika", "turmeric", "ginger", "chili", "chilli",
    "nutmeg", "cinnamon", "cardamom", "cloves", "allspice", "saffron",
    # baking staples
    "vanilla", "cocoa powder", "baking powder", "baking soda", "yeast",
}

# Hints marking an ingredient as meat, fish or another animal product.
VEGAN_ANIMAL_HINTS = {
    "pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage",
    "chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy",
    "gelatin", "lard",
}

# Hints marking an ingredient as dairy.
VEGAN_DAIRY_HINTS = {
    "milk", "cream", "butter", "cheese", "yogurt", "whey", "casein", "ghee",
}

# Hints marking an ingredient as egg-based.
VEGAN_EGG_HINTS = {
    "egg", "eggs", "whole egg", "whole eggs",
    "egg white", "egg whites", "egg yolk", "egg yolks",
}

# Hints marking an ingredient as high-carb for keto purposes.
KETO_CARB_HINTS = {
    "pasta", "spaghetti", "noodle", "noodles", "bread", "flour", "sugar", "rice",
    "potato", "potatoes", "corn", "oats", "beans", "bean",
    "honey", "syrup", "maple", "couscous",
}
@dataclass
class IngredientResult:
    """Outcome of adapting a single ingredient."""

    original: str       # ingredient text as extracted
    normalized: str     # normalized form of the ingredient
    compliant: bool     # True when no substitution is needed
    substitute: str     # suggested replacement (or the ingredient itself)
    instructions: str   # how to apply the substitution
    source: str         # which strategy produced this result
    matched_ingredient: Optional[str] = None  # dataset row name, if any
    confidence: float = 1.0
    notes: Optional[str] = None

    def as_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the result.

        ``confidence`` is rounded to three decimals; ``notes`` is included
        only when it is non-empty.
        """
        copied_fields = (
            "original", "normalized", "compliant", "substitute",
            "instructions", "source", "matched_ingredient",
        )
        payload: Dict[str, Any] = {name: getattr(self, name) for name in copied_fields}
        payload["confidence"] = round(float(self.confidence), 3)
        if self.notes:
            payload["notes"] = self.notes
        return payload
class RecipeAdapterService:
    """Adapt recipes to a diet using the dataset, rules and a semantic fallback."""

    def __init__(self, db: SubstitutionDatabase, semantic: WordVectorFallback):
        self.db = db
        self.semantic = semantic

    @staticmethod
    def _matched_hints(text: str, hints) -> set:
        """Return the hints that occur in *text* as whole words.

        A trailing ``s``/``es`` is tolerated so plurals match. Hints embedded
        in longer words do not: "egg" is not found in "eggplant", "ham" not
        in "graham", "rice" not in "licorice".
        """
        import re  # local import: this module has no file-level ``re`` import

        return {
            hint
            for hint in hints
            if re.search(rf"(?<!\w){re.escape(hint)}(?:e?s)?(?!\w)", text)
        }

    def _row_to_result(self, ingredient: str, row, diet: Diet) -> IngredientResult:
        """Turn a matched dataset row into a result for *ingredient*."""
        substitute, instructions, compliant = self.db.pick_substitution(row, diet)
        normalized = normalize_text(ingredient)
        if compliant:
            return IngredientResult(
                original=ingredient,
                normalized=normalized,
                compliant=True,
                substitute=ingredient,
                instructions="Already compatible — no substitution needed.",
                source="database match",
                matched_ingredient=row.ingredient,
                confidence=1.0,
            )
        return IngredientResult(
            original=ingredient,
            normalized=normalized,
            compliant=False,
            substitute=substitute,
            instructions=instructions or "",
            source="database match",
            matched_ingredient=row.ingredient,
            confidence=0.96,
        )

    def _rule_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Suggest a substitution from hard-coded diet rules.

        Returns ``None`` when no rule applies. Hints are matched as whole
        words (see ``_matched_hints``) so unrelated ingredients that merely
        contain a hint as a substring are not rewritten.
        """
        text = normalize_text(ingredient)
        singular = singularize(text)
        # Search both the original and the singular form.
        if singular != text:
            text = f"{text} {singular}"

        def result(sub: str, instr: str, source: str = "diet rule fallback", conf: float = 0.94) -> IngredientResult:
            return IngredientResult(
                original=ingredient,
                normalized=normalize_text(ingredient),
                compliant=False,
                substitute=sub,
                instructions=instr,
                source=source,
                matched_ingredient=None,
                confidence=conf,
            )

        if diet in {"vegan", "both"}:
            if self._matched_hints(text, VEGAN_EGG_HINTS):
                if recipe_type == "baked":
                    return result("Flax Egg", "1 tbsp ground flax + 3 tbsp water per egg. Rest 5 min before using.")
                if diet == "both":
                    return result("Silken Tofu", "Blend until smooth and use as a creamy egg-free binder.")
                return result("Silken Tofu or Tofu Scramble", "Blend silken tofu for sauces or use crumbled tofu for savory dishes.")

            animal = self._matched_hints(text, VEGAN_ANIMAL_HINTS)
            if animal:
                if animal & {"pancetta", "bacon", "guanciale", "pork", "ham", "prosciutto", "sausage"}:
                    if diet == "both":
                        return result("Extra Firm Tofu", "Press tofu 30 min, cube and pan-fry until golden.")
                    return result("Smoked Tofu or Tempeh Bacon", "Dice and pan-fry until crispy.")
                if animal & {"chicken", "beef", "turkey", "lamb", "fish", "shrimp", "anchovy"}:
                    if diet == "both":
                        return result("Extra Firm Tofu", "Use as a 1:1 savory protein substitute.")
                    return result("Soy Curls or Extra Firm Tofu", "Use as a 1:1 meat substitute.")
                return result("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

            dairy = self._matched_hints(text, VEGAN_DAIRY_HINTS)
            if dairy:
                if "butter" in dairy:
                    return result("Vegan Butter", "Use 1:1 in place of butter.")
                if "milk" in dairy:
                    return result("Unsweetened Plant Milk", "Use 1:1 in place of dairy milk.")
                if "cream" in dairy:
                    return result("Cashew Cream", "Use as a rich dairy-free cream substitute.")
                if "cheese" in dairy:
                    return result("Vegan Cheese", "Use a meltable vegan cheese or nutritional yeast blend.")
                return result("Plant-based alternative", "Choose a vegan substitute that matches the recipe context.")

        if diet in {"keto", "both"}:
            carbs = self._matched_hints(text, KETO_CARB_HINTS)
            if carbs:
                if carbs & {"spaghetti", "pasta", "noodle", "noodles"}:
                    return result("Zucchini Noodles or Shirataki Noodles", "Use in a 1:1 swap for pasta-style dishes.")
                if "rice" in carbs:
                    return result("Cauliflower Rice", "Use as a low-carb rice substitute.")
                if "flour" in carbs:
                    return result("Almond Flour", "Use a keto baking flour blend.")
                if carbs & {"sugar", "honey", "syrup"}:
                    return result("Erythritol or Allulose", "Use a keto-friendly sweetener to taste.")
                if carbs & {"potato", "potatoes"}:
                    return result("Cauliflower", "Use roasted cauliflower or cauliflower mash.")
                return result("Low-carb alternative", "Choose a keto-friendly substitute that matches the recipe context.")
        return None

    def _semantic_fallback(self, ingredient: str, diet: Diet, recipe_type: str) -> Optional[IngredientResult]:
        """Match *ingredient* to the nearest dataset term by word vectors.

        Returns ``None`` when the model is unavailable or no hit reaches
        ``settings.similarity_threshold`` with a resolvable dataset row.
        """
        if not self.semantic.available:
            return None
        candidates = self.db.semantic_terms(recipe_type)
        if not candidates:
            return None
        hits = self.semantic.nearest(ingredient, candidates, top_k=5)
        if not hits:
            return None
        for hit in hits:
            if hit.score < settings.similarity_threshold:
                continue
            matched_rows = self.db.find_rows(hit.term, recipe_type)
            if not matched_rows:
                continue
            row = matched_rows[0]
            result = self._row_to_result(ingredient, row, diet)
            result.source = f"glove semantic match ({hit.term}, score={hit.score:.2f})"
            # Clamp so a semantic guess never outranks a database match.
            result.confidence = max(0.5, min(0.95, hit.score))
            return result
        return None

    def _manual_review_result(self, ingredient: str) -> IngredientResult:
        """Build the last-resort result for an unmatched ingredient.

        Known neutral ingredients are reported as compatible; everything else
        is flagged for manual review.
        """
        text = normalize_text(ingredient)
        if text in NEUTRAL_OK:
            return IngredientResult(
                original=ingredient,
                normalized=text,
                compliant=True,
                substitute=ingredient,
                instructions="Already compatible — no substitution needed.",
                source="known compatible",
                matched_ingredient=None,
                confidence=0.9,
            )
        return IngredientResult(
            original=ingredient,
            normalized=text,
            compliant=False,
            substitute=ingredient,
            instructions="No reliable substitution found — please review manually.",
            source="not in database",
            matched_ingredient=None,
            confidence=0.35,
            notes="No reliable database or semantic match found.",
        )

    def adapt(self, recipe_text: str, diet: Diet) -> Dict[str, Any]:
        """Adapt *recipe_text* to *diet* and return a JSON-ready report.

        Each extracted ingredient is resolved by, in order: dataset lookup,
        diet rules, semantic fallback, manual-review placeholder.

        Raises:
            ValueError: *diet* is not supported or the text is shorter than
                five characters after stripping.
        """
        if diet not in {"vegan", "keto", "both"}:
            raise ValueError("diet must be one of: vegan, keto, both")
        recipe_text = (recipe_text or "").strip()
        if len(recipe_text) < 5:
            raise ValueError("recipe_text is too short")
        recipe_type = classify_recipe(recipe_text)
        ingredients = extract_ingredients(recipe_text, max_items=settings.max_ingredients)
        results: List[IngredientResult] = []
        substitutions = 0
        for ingredient in ingredients:
            matches = self.db.find_rows(ingredient, recipe_type)
            if matches:
                result = self._row_to_result(ingredient, matches[0], diet)
            else:
                result = self._rule_fallback(ingredient, diet, recipe_type)
                if result is None:
                    result = self._semantic_fallback(ingredient, diet, recipe_type)
                if result is None:
                    result = self._manual_review_result(ingredient)
            # Every non-compliant result counts, including manual-review ones.
            if not result.compliant:
                substitutions += 1
            results.append(result)
        return {
            "diet": diet,
            "recipe_type": recipe_type,
            "ingredients": [r.as_dict() for r in results],
            "ingredients_found": len(results),
            "substitution_count": substitutions,
            "model_metadata": {
                "ner_model": settings.ner_model_name,
                "qa_model": settings.qa_model_name,
                "semantic_model": settings.semantic_model_name,
                "semantic_available": self.semantic.available,
                "semantic_mode": self.semantic._kind,
                # The word2vec_* keys repeat the semantic_* values.
                "word2vec_available": self.semantic.available,
                "word2vec_mode": self.semantic._kind,
                "dataset_path": str(self.db.csv_path),
            },
            "disclaimer": (
                "This is an assistive recipe tool, not nutritional or allergen medical advice. "
                "Verify substitutions before cooking."
            ),
        }

services/semantic.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Iterable, List, Optional
+import numpy as np
+from .text_utils import normalize_text, singularize
@dataclass
class SemanticHit:
    """A candidate term paired with its similarity to the query."""

    term: str     # the candidate exactly as it was passed in
    score: float  # cosine similarity against the query vector
class WordVectorFallback:
    """Optional word-vector similarity lookup (GloVe by default).

    Loading never raises: when gensim or the model is missing, the instance
    simply reports ``available == False`` so the application can still start.
    """

    def __init__(self, model_name: str = "glove-wiki-gigaword-50", model_path: str = "", enable_download: bool = True):
        self.model = None
        self._kind = "disabled"
        self._model_name = model_name
        self._load(model_name=model_name, model_path=model_path, enable_download=enable_download)

    def _load(self, model_name: str, model_path: str, enable_download: bool) -> None:
        """Populate ``self.model`` and record in ``self._kind`` how it was obtained.

        Order of attempts: a local file (native gensim format, then word2vec
        format), then a download through ``gensim.downloader`` if permitted.
        """
        try:
            from gensim.models import KeyedVectors
            import gensim.downloader as api
        except Exception:
            self.model = None
            self._kind = "unavailable"
            return

        if model_path:
            local_loaders = (
                ("local", lambda: KeyedVectors.load(model_path, mmap="r")),
                (
                    "local-vec",
                    lambda: KeyedVectors.load_word2vec_format(
                        model_path, binary=model_path.endswith(".bin")
                    ),
                ),
            )
            for label, loader in local_loaders:
                try:
                    self.model = loader()
                except Exception:
                    self.model = None
                    continue
                self._kind = f"{label}:{model_path}"
                return

        if not enable_download:
            self.model = None
            self._kind = "disabled"
            return

        try:
            self.model = api.load(model_name)
        except Exception:
            self.model = None
            self._kind = "download-failed"
        else:
            self._kind = model_name

    @property
    def available(self) -> bool:
        """True when a vector model is loaded."""
        return self.model is not None

    def vector_for(self, phrase: str) -> Optional[np.ndarray]:
        """Return a vector for *phrase*, or ``None`` when nothing is known.

        The mean of the singularized tokens' vectors is preferred; otherwise
        the whole phrase is tried as a key (underscore-joined, then as-is).
        """
        if not self.available:
            return None
        normalized = normalize_text(phrase)
        tokens = [singularize(tok) for tok in normalized.split()]
        known = [self.model[tok] for tok in tokens if tok in self.model]
        if known:
            return np.mean(np.stack(known), axis=0)
        for key in (normalized.replace(" ", "_"), normalized):
            if key in self.model:
                return self.model[key]
        return None

    def nearest(self, query: str, candidates: Iterable[str], top_k: int = 3) -> List[SemanticHit]:
        """Return up to *top_k* candidates ranked by cosine similarity to *query*.

        Candidates without a vector are skipped. An empty list is returned
        when the model is unavailable or the query has no vector.
        """
        if not self.available:
            return []
        query_vec = self.vector_for(query)
        if query_vec is None:
            return []

        # The small epsilon guards against division by zero.
        query_norm = np.linalg.norm(query_vec) + 1e-8
        hits: List[SemanticHit] = []
        for candidate in candidates:
            cand_vec = self.vector_for(candidate)
            if cand_vec is not None:
                cand_norm = np.linalg.norm(cand_vec) + 1e-8
                cosine = float(np.dot(query_vec, cand_vec) / (query_norm * cand_norm))
                hits.append(SemanticHit(term=candidate, score=cosine))
        return sorted(hits, key=lambda hit: hit.score, reverse=True)[:top_k]

services/text_utils.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from __future__ import annotations
+import re
+from functools import lru_cache
+from typing import Iterable, List
+STOPWORDS = {
+    "and", "or", "the", "a", "an", "some", "fresh", "dried", "chopped", "minced",
+    "diced", "sliced", "grated", "ground", "cooked", "raw", "cold", "hot", "warm",
+    "to", "taste", "optional", "plus", "more",
+}
+_AMOUNT_RE = re.compile(r"^(?:\d+(?:\.\d+)?|\d+\/\d+|[¼½¾⅓⅔⅛⅜⅝⅞])\s*")
+_MEASURE_RE = re.compile(
+    r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tablespoon|tsp|teaspoon|clove|cloves|slice|slices|piece|pieces|can|cans|bunch|handful|pinch|large|small|medium|whole)\s+"
+)
+def normalize_text(text: str) -> str:
+    text = (text or "").lower().strip()
+    text = re.sub(r"[‘’“”]", "'", text)
+    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+@lru_cache(maxsize=4096)
+def singularize(word: str) -> str:
+    word = normalize_text(word)
+    if len(word) <= 3:
+        return word
+    if word.endswith("ies") and len(word) > 4:
+        return word[:-3] + "y"
+    if word.endswith("ves") and len(word) > 4:
+        return word[:-3] + "f"
+    if word.endswith("ses") or word.endswith("xes") or word.endswith("zes") or word.endswith("ches") or word.endswith("shes"):
+        return word[:-2]
+    if word.endswith("s") and not word.endswith("ss"):
+        return word[:-1]
+    return word
+def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
+    seen = set()
+    out = []
+    for item in items:
+        item = normalize_text(item)
+        if item and item not in seen:
+            seen.add(item)
+            out.append(item)
+    return out
+def strip_amounts_and_preps(text: str) -> str:
+    text = normalize_text(text)
+    text = _AMOUNT_RE.sub("", text)
+    text = _MEASURE_RE.sub("", text)
+    text = re.sub(r"^of\s+", "", text)
+    text = re.sub(r"\(.*?\)", "", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def tokenize_recipe_segments(text: str) -> List[str]:
+    raw = text or ""
+    parts = re.split(r",|\n|;|\s+and\s+", raw, flags=re.IGNORECASE)
+    cleaned = []
+    for part in parts:
+        item = strip_amounts_and_preps(part)
+        if item and len(item) > 1:
+            cleaned.append(item)
+    return dedupe_preserve_order(cleaned)
+def ingredient_variants(ingredient: str) -> List[str]:
+    ing = normalize_text(ingredient)
+    variants = [ing]
+    singular = singularize(ing)
+    if singular != ing:
+        variants.append(singular)
+    suffixes = [" cheese", " oil", " milk", " cream", " butter", " powder", " sauce", " paste", " extract"]
+    for suffix in suffixes:
+        if ing.endswith(suffix) and len(ing) > len(suffix) + 1:
+            base = ing[:-len(suffix)].strip()
+            variants.append(base)
+            base_singular = singularize(base)
+            if base_singular != base:
+                variants.append(base_singular)
+    words = ing.split()
+    if len(words) > 1:
+        variants.extend([words[0], words[-1], " ".join(words[:2]), " ".join(words[1:])])
+    return dedupe_preserve_order(variants)
+def as_aliases(aliases: str | float | None) -> List[str]:
+    if aliases is None or not isinstance(aliases, str):
+        return []
+    return dedupe_preserve_order(alias.strip() for alias in aliases.split("|"))

static/index.html ADDED Viewed

	@@ -0,0 +1,20 @@

+<!DOCTYPE html>
+<!-- Static status page: tells the visitor the API is running and points to POST /api/adapt and /docs. -->
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>BiteWise API</title>
+  <style>
+    body { font-family: Arial, sans-serif; background: #f5f4f0; color: #1a1a1a; margin: 0; padding: 40px; }
+    .card { max-width: 720px; background: #fff; border: 1px solid #e8e6e0; border-radius: 16px; padding: 24px; }
+    code { background: #f7f7f7; padding: 2px 6px; border-radius: 6px; }
+  </style>
+</head>
+<body>
+  <div class="card">
+    <h1>BiteWise is running</h1>
+    <p>Use <code>POST /api/adapt</code> to adapt a recipe for vegan, keto, or both.</p>
+    <p>Open <code>/docs</code> to test the API.</p>
+  </div>
+</body>
+</html>