from __future__ import annotations import re from functools import lru_cache from typing import Iterable, List import inflect _p = inflect.engine() def normalize_text(text: str) -> str: text = (text or "").lower().strip() text = re.sub(r"[\u2018\u2019\u201c\u201d]", "'", text) text = re.sub(r"[^a-z0-9\s\-']+", " ", text) text = re.sub(r"\s+", " ", text).strip() return text @lru_cache(maxsize=4096) def singularize(word: str) -> str: word = normalize_text(word) singular = _p.singular_noun(word) return singular if singular else word def dedupe_preserve_order(items: Iterable[str]) -> List[str]: seen = set() out = [] for item in items: item = normalize_text(item) if item and item not in seen: seen.add(item) out.append(item) return out _AMOUNT_RE = re.compile( r"^(?:\d+(?:\.\d+)?|\d+/\d+|[\u00bc\u00bd\u00be\u2153\u2154\u215b\u215c\u215d\u215e])\s*" ) _MEASURE_RE = re.compile( r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tablespoon|tsp|teaspoon|clove|cloves|slice|slices|piece|pieces|can|cans|bunch|handful|pinch|large|small|medium|whole)\s+" ) def strip_amounts_and_preps(text: str) -> str: """Remove leading quantities and prep words from a recipe fragment.""" text = normalize_text(text) text = _AMOUNT_RE.sub("", text) text = _MEASURE_RE.sub("", text) text = re.sub(r"^of\s+", "", text) text = re.sub(r"\(.*?\)", "", text) text = re.sub(r"\s+", " ", text).strip() return text def tokenize_recipe_segments(text: str) -> List[str]: """Split a recipe into ingredient-like chunks. The MVP demo worked best with comma-separated ingredients, so we keep that behaviour first and only use a small fallback split when the recipe has no commas. """ raw = text or "" parts = re.split(r",|\n|;", raw) if len(parts) == 1: parts = re.split(r"\s+and\s+", raw, flags=re.IGNORECASE) cleaned = [] for part in parts: item = strip_amounts_and_preps(part) if item and len(item) > 1: cleaned.append(item) return dedupe_preserve_order(cleaned) def ingredient_variants(ingredient: str) -> List[str]: ing = normalize_text(ingredient) variants = [ing] singular = singularize(ing) if singular != ing: variants.append(singular) suffixes = [" cheese", " oil", " milk", " cream", " butter", " powder", " sauce", " paste", " extract"] for suffix in suffixes: if ing.endswith(suffix) and len(ing) > len(suffix) + 1: base = ing[:-len(suffix)].strip() variants.append(base) base_singular = singularize(base) if base_singular != base: variants.append(base_singular) words = ing.split() if len(words) > 1: variants.extend([ words[0], words[-1], " ".join(words[:2]), " ".join(words[1:]), ]) # very short fragments create false matches, so keep them only as a last resort return dedupe_preserve_order(variants) def as_aliases(aliases: str | float | None) -> List[str]: if aliases is None: return [] if not isinstance(aliases, str): return [] return dedupe_preserve_order(alias.strip() for alias in aliases.split("|"))