BiteWiseFinal

Sleeping

File size: 9,230 Bytes
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import re

import pandas as pd

from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text

# Column headers the substitution CSV must contain; SubstitutionDatabase._load
# raises ValueError listing any of these that are absent from the file.
REQUIRED_COLUMNS = [
    "Ingredient",
    "Context",
    "Aliases",
    "Category",
    "Is_Keto_Friendly",
    "Is_Vegan_Friendly",
    "Keto_Substitution",
    "Keto_Instruction",
    "Vegan_Substitution",
    "Vegan_Instruction",
    "Vegan_Keto_Substitution",
    "Vegan_Keto_Instruction",
]

# "Context" column values preferred when the recipe type is exactly "baked"
# (see SubstitutionDatabase.contexts_for_recipe_type).
BAKE_CONTEXTS = {
    "Baking & Desserts",
    "Baking (Binder)",
    "Baking (Leavening)",
    "Pastries",
    "Bagels",
    "Puddings",
    "Tiramisu",
}

# "Context" column values preferred for every recipe type other than "baked"
# (see SubstitutionDatabase.contexts_for_recipe_type).
COOK_CONTEXTS = {
    "Main Course & Heavy Cooking",
    "Sauces, Dips & Dressings",
    "Soups & Savory Liquids",
    "Cold Prep & Light Meals",
    "Pasta",
    "Lasagna",
    "Roast",
    "Stir-Fry",
    "Appetizer",
    "Indian",
    "Beverages",
    "Cheese Making",
    "Processed",
    "Technical & Additives",
}


@dataclass(frozen=True)
class IngredientRow:
    """One record of the substitution dataset.

    Each field mirrors a CSV column (``Ingredient``, ``Context``, ``Aliases``,
    ``Category``, the two friendliness flags, and a substitution/instruction
    pair for keto, vegan and vegan-keto).  Substitution and instruction fields
    are ``None`` when the dataset has no value for them.
    """

    ingredient: str
    context: str
    aliases: List[str]
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @staticmethod
    def _has_whole_word(text: str, *words: str) -> bool:
        """Return True when any of *words* occurs in *text* as a whole word.

        A match must not be directly preceded or followed by a word
        character, so ``"egg"`` is found in ``"egg yolk"`` but not in
        ``"eggplant"``.
        """
        alternatives = "|".join(re.escape(word) for word in words)
        return re.search(rf"(?<!\w)(?:{alternatives})(?!\w)", text) is not None

    @property
    def lookup_terms(self) -> List[str]:
        """Return every name this row should be findable under.

        The list starts with the ingredient name and its aliases, then adds a
        fixed set of common synonyms for a few ingredient families, and is
        finally de-duplicated with first occurrences kept in order.
        """
        terms = [self.ingredient, *self.aliases]
        ing = normalize_text(self.ingredient)

        # Whole-word test: a plain substring check would also hand the egg
        # synonyms to unrelated ingredients such as "eggplant".
        if self._has_whole_word(ing, "egg", "eggs"):
            terms.extend([
                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                "egg yolk", "egg yolks",
            ])
        # NOTE(review): the checks below are still substring tests, so compound
        # names containing these words also receive the generic synonyms
        # -- confirm that is intended for the dataset in use.
        if "pancetta" in ing or "bacon" in ing:
            terms.extend(["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"])
        if "chicken" in ing:
            terms.extend(["chicken", "chicken thighs", "chicken breast", "chicken pieces", "chicken drumsticks"])
        if "milk" in ing:
            terms.extend(["milk", "whole milk", "dairy milk"])
        if "cheese" in ing:
            terms.extend(["cheese", "hard cheese", "soft cheese"])

        return dedupe_preserve_order(terms)


class SubstitutionDatabase:
    """In-memory view of the ingredient substitution CSV with lookup helpers.

    The CSV is read once at construction time and every row becomes an
    ``IngredientRow``.  Rows can be searched by ingredient name or alias with
    ``find_rows`` and turned into a diet-specific swap with
    ``pick_substitution``.
    """

    # Text spellings accepted as "true" for the friendliness flag columns.
    _TRUE_STRINGS = frozenset({"true", "t", "yes", "y", "1", "1.0"})
    # Placeholder spellings (after strip + lower) that mean "no value".
    _EMPTY_MARKERS = frozenset({"", "nan", "none"})

    def __init__(self, csv_path: str | Path):
        """Load the dataset at *csv_path* and build the row list.

        Raises:
            FileNotFoundError: if *csv_path* does not exist.
            ValueError: if the CSV lacks any of ``REQUIRED_COLUMNS``.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        # Both caches are keyed by the recipe_type string passed by callers.
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, returning a cleaned DataFrame.

        Rows without an ``Ingredient`` or ``Context`` value are dropped, the
        index is renumbered, and two helper columns (``_ingredient_norm`` and
        ``_aliases_norm``) are added.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )

        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")

        # Filter and renumber first so the helper columns are added to the
        # final frame rather than to an intermediate one.
        df = df.dropna(subset=["Ingredient", "Context"]).reset_index(drop=True)
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df

    @classmethod
    def _coerce_bool(cls, value: object) -> bool:
        """Interpret a flag cell as a boolean.

        Missing values (``None``/NaN) are False rather than truthy, and text
        is compared against ``_TRUE_STRINGS`` instead of being treated as True
        merely for being non-empty.
        """
        if isinstance(value, str):
            return value.strip().lower() in cls._TRUE_STRINGS
        if value is None or pd.isna(value):
            return False
        return bool(value)

    @staticmethod
    def _optional_str(value: object) -> Optional[str]:
        """Return ``str(value)``, or None when the cell is missing (None/NaN)."""
        return None if pd.isna(value) else str(value)

    @classmethod
    def _is_blank(cls, value: object) -> bool:
        """Return True when *value* is None or only a placeholder such as "nan"."""
        return value is None or str(value).strip().lower() in cls._EMPTY_MARKERS

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Convert one DataFrame row into an ``IngredientRow``."""
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=str(row.get("Category", "")),
            is_keto_friendly=self._coerce_bool(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=self._coerce_bool(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=self._optional_str(row.get("Keto_Substitution")),
            keto_instruction=self._optional_str(row.get("Keto_Instruction")),
            vegan_substitution=self._optional_str(row.get("Vegan_Substitution")),
            vegan_instruction=self._optional_str(row.get("Vegan_Instruction")),
            vegan_keto_substitution=self._optional_str(row.get("Vegan_Keto_Substitution")),
            vegan_keto_instruction=self._optional_str(row.get("Vegan_Keto_Instruction")),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return ``BAKE_CONTEXTS`` for ``"baked"``, otherwise ``COOK_CONTEXTS``."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return the rows whose context suits *recipe_type* (cached per type)."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [row for row in self.rows if row.context in contexts]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's lookup terms normalized, dropping empty results."""
        # Normalize each term once instead of once for the filter and again
        # for the value.
        normalized = (normalize_text(term) for term in row.lookup_terms)
        return [term for term in normalized if term]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose ingredient or any lookup term equals *query*.

        Comparison is done on normalized text; rows with no usable lookup
        terms are skipped.
        """
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows where *query* appears as a whole word inside a term.

        Queries shorter than four characters (after normalization) never
        match partially.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # The lookarounds require the query not to touch a word character on
        # either side, so it cannot match in the middle of a longer word.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return *rows* ordered from best to worst match for *query*.

        Ordering key, highest first: exact ingredient match, exact alias
        match, row has at least one alias, length of the normalized
        ingredient name.
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Return ranked rows matching *query*, or an empty list.

        Every variant of the query is tried in four passes, stopping at the
        first variant that produces a match: exact match in the preferred
        rows, exact match in all rows, partial match in the preferred rows,
        partial match in all rows.
        """
        query = normalize_text(query)
        if not query:
            return []

        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)

        # Passes are listed from most to least trustworthy; order matters.
        passes = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in passes:
            for candidate in variants:
                matches = matcher(pool, candidate)
                if matches:
                    return self._rank_rows(matches, candidate)

        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return the de-duplicated lookup terms for *recipe_type* (cached).

        Terms come from the preferred rows, or from all rows when no row is
        preferred for that recipe type.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for *row* under *diet*.

        ``"vegan"`` and ``"keto"`` select their own columns; any other value
        selects the combined vegan-keto columns.

        Returns:
            ``(name, instruction, already_compatible)``.  When the row is
            compatible, or has no usable substitution, the original ingredient
            is returned with the "already compatible" message and True.
            Otherwise the substitution, its instruction (``""`` when missing
            or a placeholder) and False.
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly

        # NOTE(review): an incompatible row without a substitution is also
        # reported as "already compatible"; kept as-is because callers may
        # rely on it -- confirm whether a distinct result is wanted.
        if compatible or self._is_blank(sub):
            return row.ingredient, "Already compatible — no substitution needed.", True

        # Instructions get the same placeholder filtering as substitutions.
        return str(sub), ("" if self._is_blank(instr) else str(instr)), False