Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import re | |
| import pandas as pd | |
| from .text_utils import as_aliases, dedupe_preserve_order, ingredient_variants, normalize_text | |
# Column headers the substitution CSV must contain. SubstitutionDatabase._load()
# raises ValueError listing whichever of these are absent.
REQUIRED_COLUMNS = [
    # Identity / classification
    "Ingredient", "Context", "Aliases", "Category",
    # Diet compatibility flags
    "Is_Keto_Friendly", "Is_Vegan_Friendly",
    # Substitution + instruction pairs, one pair per supported diet
    "Keto_Substitution", "Keto_Instruction",
    "Vegan_Substitution", "Vegan_Instruction",
    "Vegan_Keto_Substitution", "Vegan_Keto_Instruction",
]
# "Context" values that SubstitutionDatabase.contexts_for_recipe_type() returns
# when the recipe type is "baked".
BAKE_CONTEXTS = {
    "Baking & Desserts", "Baking (Binder)", "Baking (Leavening)",
    "Pastries", "Bagels", "Puddings", "Tiramisu",
}
# "Context" values that SubstitutionDatabase.contexts_for_recipe_type() returns
# for every recipe type other than "baked".
COOK_CONTEXTS = {
    "Main Course & Heavy Cooking", "Sauces, Dips & Dressings",
    "Soups & Savory Liquids", "Cold Prep & Light Meals",
    "Pasta", "Lasagna", "Roast", "Stir-Fry", "Appetizer", "Indian",
    "Beverages", "Cheese Making", "Processed", "Technical & Additives",
}
@dataclass
class IngredientRow:
    """One record of the substitution dataset.

    Declared as a dataclass so instances can be built with keyword
    arguments, one per field below.
    """

    ingredient: str
    context: str
    aliases: List[str]
    category: str
    is_keto_friendly: bool
    is_vegan_friendly: bool
    keto_substitution: Optional[str]
    keto_instruction: Optional[str]
    vegan_substitution: Optional[str]
    vegan_instruction: Optional[str]
    vegan_keto_substitution: Optional[str]
    vegan_keto_instruction: Optional[str]

    @property
    def lookup_terms(self) -> List[str]:
        """Return every term this row should be findable under.

        Starts with the ingredient name and its aliases, then appends a fixed
        list of extra terms for each trigger word found in the normalized
        ingredient name. Duplicates are removed with
        ``dedupe_preserve_order``.

        Exposed as a property because the rest of the module reads it as an
        attribute (``row.lookup_terms``) rather than calling it.
        """
        terms = [self.ingredient, *self.aliases]
        ing = normalize_text(self.ingredient)

        # NOTE(review): the trigger tests below are plain substring checks, so
        # e.g. "egg" also fires for a name such as "eggplant" and "milk" for
        # "coconut milk" — confirm whether that is intended.
        if "egg" in ing:
            terms.extend([
                "egg", "eggs", "whole egg", "whole eggs", "egg white", "egg whites",
                "egg yolk", "egg yolks",
            ])
        if "pancetta" in ing or "bacon" in ing:
            terms.extend(["pancetta", "bacon", "guanciale", "prosciutto", "cured pork"])
        if "chicken" in ing:
            terms.extend([
                "chicken", "chicken thighs", "chicken breast", "chicken pieces",
                "chicken drumsticks",
            ])
        if "milk" in ing:
            terms.extend(["milk", "whole milk", "dairy milk"])
        if "cheese" in ing:
            terms.extend(["cheese", "hard cheese", "soft cheese"])

        return dedupe_preserve_order(terms)
class SubstitutionDatabase:
    """Ingredient substitution table loaded from a CSV, with lookup helpers.

    The CSV is read once at construction time into ``self.df`` and converted
    to a list of ``IngredientRow`` objects in ``self.rows``. Lookups prefer
    rows whose context matches the recipe type and fall back to all rows.
    """

    def __init__(self, csv_path: str | Path):
        """Load the dataset at *csv_path*.

        Raises:
            FileNotFoundError: if *csv_path* does not exist.
            ValueError: if the CSV lacks any column in ``REQUIRED_COLUMNS``.
        """
        self.csv_path = Path(csv_path)
        self.df = self._load()
        self.rows = [self._row_from_series(row) for _, row in self.df.iterrows()]
        # Per-recipe-type memo tables, filled lazily by preferred_rows() and
        # semantic_terms().
        self._preferred_rows_cache: Dict[str, List[IngredientRow]] = {}
        self._semantic_terms_cache: Dict[str, List[str]] = {}

    def _load(self) -> pd.DataFrame:
        """Read and validate the CSV, returning a cleaned, re-indexed frame.

        Rows without an ``Ingredient`` or ``Context`` value are dropped, and
        two helper columns (``_ingredient_norm``, ``_aliases_norm``) are added.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(
                f"Could not find substitution database at {self.csv_path}. "
                "Place united_master_database_corrected.csv in data/ or set BITEWISE_DATASET_PATH."
            )
        df = pd.read_csv(self.csv_path)
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"Dataset missing required columns: {missing}")
        df = df.copy()
        df = df.dropna(subset=["Ingredient", "Context"])
        df["_ingredient_norm"] = df["Ingredient"].astype(str).map(normalize_text)
        df["_aliases_norm"] = df["Aliases"].apply(as_aliases)
        return df.reset_index(drop=True)

    @staticmethod
    def _coerce_flag(value: object) -> bool:
        """Interpret a CSV cell as a boolean flag.

        Plain ``bool(value)`` is wrong for CSV data: a missing cell (NaN) and
        any non-empty string such as ``"False"`` or ``"No"`` are all truthy.
        Strings are therefore matched against an explicit list of affirmative
        spellings, missing values count as ``False``, and everything else
        (real booleans, numbers) falls through to ``bool``.
        """
        if isinstance(value, str):
            return value.strip().lower() in {"true", "t", "yes", "y", "1"}
        if pd.isna(value):
            return False
        return bool(value)

    @staticmethod
    def _optional_text(value: object) -> Optional[str]:
        """Return ``None`` for a missing (NaN/None) cell, else the cell as ``str``."""
        return None if pd.isna(value) else str(value)

    def _row_from_series(self, row: pd.Series) -> IngredientRow:
        """Convert one DataFrame row into an ``IngredientRow``."""
        flag = self._coerce_flag
        text = self._optional_text
        return IngredientRow(
            ingredient=str(row["Ingredient"]),
            context=str(row["Context"]),
            aliases=as_aliases(row.get("Aliases")),
            category=str(row.get("Category", "")),
            is_keto_friendly=flag(row.get("Is_Keto_Friendly", False)),
            is_vegan_friendly=flag(row.get("Is_Vegan_Friendly", False)),
            keto_substitution=text(row.get("Keto_Substitution")),
            keto_instruction=text(row.get("Keto_Instruction")),
            vegan_substitution=text(row.get("Vegan_Substitution")),
            vegan_instruction=text(row.get("Vegan_Instruction")),
            vegan_keto_substitution=text(row.get("Vegan_Keto_Substitution")),
            vegan_keto_instruction=text(row.get("Vegan_Keto_Instruction")),
        )

    def contexts_for_recipe_type(self, recipe_type: str) -> set[str]:
        """Return ``BAKE_CONTEXTS`` for ``"baked"``, otherwise ``COOK_CONTEXTS``."""
        return BAKE_CONTEXTS if recipe_type == "baked" else COOK_CONTEXTS

    def preferred_rows(self, recipe_type: str) -> List[IngredientRow]:
        """Return the rows whose context belongs to *recipe_type* (memoized)."""
        if recipe_type not in self._preferred_rows_cache:
            contexts = self.contexts_for_recipe_type(recipe_type)
            self._preferred_rows_cache[recipe_type] = [
                row for row in self.rows if row.context in contexts
            ]
        return self._preferred_rows_cache[recipe_type]

    def _normalize_terms(self, row: IngredientRow) -> List[str]:
        """Return the row's lookup terms after normalization, dropping empties."""
        normalized = (normalize_text(term) for term in row.lookup_terms)
        return [term for term in normalized if term]

    def _match_exact(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows whose ingredient or any lookup term equals *query*.

        Both sides are compared after ``normalize_text``. Rows with no usable
        lookup terms are skipped.
        """
        query = normalize_text(query)
        exact_rows = []
        for row in rows:
            terms = self._normalize_terms(row)
            if not terms:
                continue
            if normalize_text(row.ingredient) == query or query in terms:
                exact_rows.append(row)
        return exact_rows

    def _match_partial(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Return rows where *query* appears as a whole word in any term.

        Queries shorter than four characters (after normalization) never
        match, which keeps very short strings from matching broadly.
        """
        query = normalize_text(query)
        if len(query) < 4:
            return []
        # Lookarounds instead of \b so the query is bounded by non-word
        # characters (or string edges) on both sides.
        pattern = re.compile(rf"(?<!\w){re.escape(query)}(?!\w)")
        partial_rows = []
        for row in rows:
            candidates = [normalize_text(row.ingredient), *self._normalize_terms(row)]
            if any(pattern.search(candidate) for candidate in candidates):
                partial_rows.append(row)
        return partial_rows

    def _rank_rows(self, rows: List[IngredientRow], query: str) -> List[IngredientRow]:
        """Sort *rows* best-first for *query*.

        The sort key, compared in descending order, is: exact ingredient
        match, exact alias match, whether the row has any aliases, and the
        length of the normalized ingredient name.
        """
        q = normalize_text(query)

        def score(row: IngredientRow) -> tuple[int, int, int, int]:
            ingredient_norm = normalize_text(row.ingredient)
            alias_norms = [normalize_text(a) for a in row.aliases]
            exact_ingredient = int(ingredient_norm == q)
            exact_alias = int(q in alias_norms)
            alias_specificity = int(len(row.aliases) > 0)
            length = len(ingredient_norm)
            return (exact_ingredient, exact_alias, alias_specificity, length)

        return sorted(rows, key=score, reverse=True)

    def find_rows(self, query: str, recipe_type: str) -> List[IngredientRow]:
        """Find dataset rows for *query*, ranked best-first.

        Search proceeds in four stages and stops at the first stage and
        variant that yields any row: exact match in the preferred rows, exact
        match in all rows, partial match in the preferred rows, partial match
        in all rows. Returns an empty list when nothing matches or the
        normalized query is empty.
        """
        query = normalize_text(query)
        if not query:
            return []
        # Materialize once so the variants can be re-iterated for every stage.
        variants = list(ingredient_variants(query))
        preferred = self.preferred_rows(recipe_type)
        stages = (
            (self._match_exact, preferred),
            (self._match_exact, self.rows),
            (self._match_partial, preferred),
            (self._match_partial, self.rows),
        )
        for matcher, pool in stages:
            for candidate in variants:
                hits = matcher(pool, candidate)
                if hits:
                    return self._rank_rows(hits, candidate)
        return []

    def semantic_terms(self, recipe_type: str) -> List[str]:
        """Return the de-duplicated lookup terms for *recipe_type* (memoized).

        Uses the preferred rows for the recipe type, or every row when there
        are no preferred rows.
        """
        if recipe_type not in self._semantic_terms_cache:
            rows = self.preferred_rows(recipe_type) or self.rows
            terms = []
            for row in rows:
                terms.extend(row.lookup_terms)
            self._semantic_terms_cache[recipe_type] = dedupe_preserve_order(terms)
        return self._semantic_terms_cache[recipe_type]

    def pick_substitution(self, row: IngredientRow, diet: str) -> tuple[str, str, bool]:
        """Choose the substitution for *row* under *diet*.

        *diet* is ``"vegan"``, ``"keto"``, or anything else, which selects the
        combined vegan-keto columns.

        Returns:
            ``(name, instruction, unchanged)``. When ``unchanged`` is true the
            original ingredient name is returned with a fixed message;
            otherwise the substitute and its instruction (``""`` if none).
        """
        if diet == "vegan":
            sub = row.vegan_substitution
            instr = row.vegan_instruction
            compatible = row.is_vegan_friendly
        elif diet == "keto":
            sub = row.keto_substitution
            instr = row.keto_instruction
            compatible = row.is_keto_friendly
        else:
            sub = row.vegan_keto_substitution
            instr = row.vegan_keto_instruction
            compatible = row.is_vegan_friendly and row.is_keto_friendly

        # NOTE(review): a row that is NOT compatible but has no substitute
        # listed also takes this branch and is reported as "already
        # compatible" — confirm that callers expect this.
        if compatible or not sub or str(sub).strip().lower() in {"nan", "none"}:
            return row.ingredient, "Already compatible — no substitution needed.", True
        return str(sub), (str(instr) if instr and str(instr).lower() != "nan" else ""), False