"""Ingredient substitution lookup backed by a CSV dataset.

The module loads a table of ingredients (one row per ingredient/context
pair) and offers exact and partial lookup plus selection of the diet-specific
substitution for a matched row.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text

# Columns that must be present in the CSV; checked in SubstitutionDatabase._load.
REQUIRED_COLUMNS = [
    "Ingredient",
    "Context",
    "Aliases",
    "Category",
    "Is_Keto_Friendly",
    "Is_Vegan_Friendly",
    "Keto_Substitution",
    "Keto_Instruction",
    "Vegan_Substitution",
    "Vegan_Instruction",
    "Vegan_Keto_Substitution",
    "Vegan_Keto_Instruction",
]

# "Context" values that are preferred when the recipe type is "baked".
BAKE_CONTEXTS = {
    "Baking & Desserts",
    "Baking (Binder)",
    "Baking (Leavening)",
    "Pastries",
    "Bagels",
    "Puddings",
    "Tiramisu",
}

# "Context" values that are preferred for every other recipe type.
COOK_CONTEXTS = {
    "Main Course & Heavy Cooking",
    "Sauces, Dips & Dressings",
    "Soups & Savory Liquids",
    "Cold Prep & Light Meals",
    "Pasta",
    "Lasagna",
    "Roast",
    "Stir-Fry",
    "Appetizer",
    "Indian",
    "Beverages",
    "Cheese Making",
    "Processed",
    "Technical & Additives",
}

# String spellings of a flag cell that must be read as False.
_FALSY_FLAG_STRINGS = frozenset({"", "false", "f", "no", "n", "0", "nan", "none"})


def _coerce_flag(value: object) -> bool:
    """Convert a raw flag cell into a bool.

    Plain ``bool(value)`` is wrong for tabular data: ``bool(float("nan"))``
    and ``bool("No")`` are both ``True``. Missing values and the spellings
    in ``_FALSY_FLAG_STRINGS`` are treated as ``False``; any other non-empty
    string stays truthy, and non-string values fall back to ``bool()``.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return value.strip().lower() not in _FALSY_FLAG_STRINGS
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def _optional_text(value: object) -> Optional[str]:
    """Return ``str(value)``, or ``None`` when the cell is missing."""
    return None if pd.isna(value) else str(value)


@dataclass(frozen=True)
class IngredientRow:
    """One dataset row: an ingredient in a given context with its substitutions."""

    ingredient: str
    context: str
    aliases: List[str]
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @property
    def lookup_terms(self) -> List[str]:
        """Return every term this row should be findable under.

        Starts from the ingredient name and its aliases, then appends a fixed
        list of extra terms for each keyword found in the normalized name.
        The result is passed through ``dedupe_preserve_order``.
        """
        terms = [self.ingredient, *self.aliases]
        ing = normalize_text(self.ingredient)
        if "egg" in ing:
            terms.extend([
                "egg",
                "eggs",
                "whole egg",
                "whole eggs",
                "egg white",
                "egg whites",
                "egg yolk",
                "egg yolks",
            ])
        if "pancetta" in ing or "bacon" in ing:
            terms.extend(["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"])
        if "chicken" in ing:
            terms.extend([
                "chicken",
                "chicken thighs",
                "chicken breast",
                "chicken pieces",
                "chicken drumsticks",
            ])
        if "milk" in ing:
            terms.extend(["milk", "whole milk", "dairy milk"])
        if "cheese" in ing:
            terms.extend(["cheese", "hard cheese", "soft cheese"])
        return dedupe_preserve_order(terms)


class SubstitutionDatabase:
    """In-memory view of the substitution CSV with lookup helpers."""

    def __init__(self, csv_path: str | Path):
        """Load the CSV at ``csv_path`` and materialize its rows.

        Raises:
            FileNotFoundError: if ``csv_path`` does not exist.
            ValueError: if the CSV lacks any column in ``REQUIRED_COLUMNS``.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        # Per-recipe-type memoization; both are filled lazily.
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, returning a cleaned DataFrame.

        Rows without an ``Ingredient`` or ``Context`` value are dropped, and
        two helper columns (``_ingredient_norm``, ``_aliases_norm``) are added.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )
        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")
        df = df.copy()
        df = df.dropna(subset=["Ingredient", "Context"])
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df.reset_index(drop=True)

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Build an ``IngredientRow`` from one DataFrame row.

        Missing substitution/instruction cells become ``None``. The two diet
        flags go through ``_coerce_flag`` so that NaN and strings such as
        "No"/"False" are not mistaken for ``True``.
        """
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=str(row.get("Category", "")),
            is_keto_friendly=_coerce_flag(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=_coerce_flag(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=_optional_text(row.get("Keto_Substitution")),
            keto_instruction=_optional_text(row.get("Keto_Instruction")),
            vegan_substitution=_optional_text(row.get("Vegan_Substitution")),
            vegan_instruction=_optional_text(row.get("Vegan_Instruction")),
            vegan_keto_substitution=_optional_text(row.get("Vegan_Keto_Substitution")),
            vegan_keto_instruction=_optional_text(row.get("Vegan_Keto_Instruction")),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return ``BAKE_CONTEXTS`` for ``"baked"``, else ``COOK_CONTEXTS``."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return the rows whose context suits ``recipe_type`` (memoized)."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [
                row for row in self.rows if row.context in contexts
            ]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's lookup terms normalized, dropping empty results."""
        normalized = []
        for term in row.lookup_terms:
            # Normalize once per term instead of once to test and once to keep.
            norm = normalize_text(term)
            if norm:
                normalized.append(norm)
        return normalized

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose ingredient name or any lookup term equals ``query``.

        Comparison is done on normalized text. Rows with no usable lookup
        terms are skipped.
        """
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows with a lookup term that contains ``query`` as a whole word.

        Queries shorter than four characters (after normalization) never
        match partially.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # NOTE(review): in the reviewed source, everything from the regex
        # lookbehind up to the return annotation of `_rank_rows` was missing
        # (apparently stripped as markup). The pattern and the loop below are
        # a reconstruction - confirm against version control.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if any(pattern.search(term) for term in terms):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Sort ``rows`` best-first for ``query``.

        Sort key, in priority order: exact ingredient-name match, exact alias
        match, whether the row has any aliases, then the length of the
        normalized ingredient name (longer first).
        """
        # NOTE(review): this signature was rebuilt from the call sites in
        # `find_rows` and the names used in the body; see the note in
        # `_match_partial`.
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find dataset rows for ``query``, ranked best-first.

        Every variant from ``ingredient_variants(query)`` is tried in four
        passes, and the first pass/variant that yields any rows wins:
        exact match in the preferred rows, exact match in all rows, partial
        match in the preferred rows, partial match in all rows.

        Returns an empty list when the normalized query is empty or nothing
        matches.
        """
        query = normalize_text(query)
        if not query:
            return []
        variants = ingredient_variants(query)
        preferred = self.preferred_rows(recipe_type)
        passes = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in passes:
            for candidate in variants:
                matches = matcher(pool, candidate)
                if matches:
                    return self._rank_rows(matches, candidate)
        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return the de-duplicated lookup terms for ``recipe_type`` (memoized).

        Uses the preferred rows, falling back to all rows when there are none.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for ``row`` under ``diet``.

        ``diet`` of ``"vegan"`` or ``"keto"`` selects the matching columns;
        any other value selects the combined vegan-keto columns.

        Returns:
            ``(name, instruction, unchanged)``. When the row is already
            compatible, or has no usable substitution text, the original
            ingredient is returned with ``unchanged=True``. Note that this
            includes rows that are not flagged compatible but simply lack a
            substitution. Otherwise the substitution and its instruction
            (``""`` if absent) are returned with ``unchanged=False``.
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly
        if compatible or not sub or str(sub).strip().lower() in {"nan", "none"}:
            return row.ingredient, "Already compatible — no substitution needed.", True
        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False