"""Text normalization helpers for matching free-form ingredient strings.

The functions here lower-case and clean ingredient text, strip a leading
quantity/unit prefix, and generate progressively looser lookup variants.
"""
from __future__ import annotations

import re
from typing import Iterable, List

try:
    import inflect

    _INFLECT = inflect.engine()
except Exception:  # pragma: no cover
    # Best-effort optional dependency: without a usable inflect engine,
    # singularize() degrades to returning the normalized text unchanged.
    _INFLECT = None

# Leading quantity (digits, fractions, whitespace) plus an optional unit.
# The unit alternation is followed by ``\b`` so that a unit only matches as a
# whole word: without it "g" would swallow the first letter of "garlic" and
# "cup" would win over "cups", leaving a stray "s" behind.
AMOUNT_PREFIX_RE = re.compile(
    r"""^\s*
    [\d\s\/\.½¼¾⅓⅔]+
    \s*
    (?:
        (?:g|kg|mg|ml|l|oz|lb|pound|pounds|cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|
        clove|cloves|can|cans|slice|slices|piece|pieces|pinch|dash|handful|sprig|sprigs|bunch|bunches|
        package|packages|jar|jars|stalk|stalks|stick|sticks|packet|packets|quart|quarts|pint|pints)
        \b
    )?
    \s*
    """,
    re.IGNORECASE | re.VERBOSE,
)
# A parenthesised or bracketed aside, together with surrounding whitespace.
PAREN_RE = re.compile(r"\s*[\(\[].*?[\)\]]\s*")
MULTISPACE_RE = re.compile(r"\s+")

# Leading words that ingredient_variants() may drop to form a looser variant.
DESCRIPTOR_PREFIXES = {
    "fresh",
    "dried",
    "ground",
    "minced",
    "chopped",
    "sliced",
    "grated",
    "large",
    "small",
    "medium",
    "extra",
    "extra-virgin",
    "unsalted",
    "salted",
    "boneless",
    "skinless",
    "whole",
    "low-fat",
    "reduced-fat",
    "fat-free",
    "light",
    "dark",
    "white",
    "black",
    "red",
    "green",
    "ripe",
    "plain",
    "organic",
}

# Characters treated as list bullets at the start of an ingredient line.
_BULLET_CHARS = "-•*"


def normalize_text(text: str) -> str:
    """Return *text* lower-cased, de-punctuated and whitespace-collapsed.

    Falsy input (``None`` or ``""``) yields ``""``. Parenthesised or
    bracketed asides are removed; every character that is not a word
    character, whitespace, ``-`` or ``/`` is replaced by a space.
    """
    text = (text or "").lower().strip()
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
    text = text.replace("\n", " ")
    text = PAREN_RE.sub(" ", text)
    text = re.sub(r"[^\w\s\-/]", " ", text)
    return MULTISPACE_RE.sub(" ", text).strip()


def strip_amounts_and_preps(text: str) -> str:
    """Normalize *text* and remove a leading quantity/unit prefix.

    Leading list bullets are stripped *before* the amount pattern is applied,
    because that pattern is anchored at the start of the string and would
    otherwise never match a line such as ``"- 2 cups sugar"``.
    """
    text = normalize_text(text)
    text = text.lstrip(_BULLET_CHARS).lstrip()
    text = AMOUNT_PREFIX_RE.sub("", text)
    # A dash can also be left over after the amount (e.g. "1-inch ...").
    text = text.lstrip(_BULLET_CHARS).strip()
    return MULTISPACE_RE.sub(" ", text).strip()


def singularize(text: str) -> str:
    """Return the singular form of the normalized *text* when available.

    Falls back to the normalized text when the optional ``inflect`` engine
    is unavailable or when it returns a falsy result for the input.
    """
    text = normalize_text(text)
    if not text:
        return ""
    if _INFLECT is None:
        return text
    singular = _INFLECT.singular_noun(text)
    return singular if singular else text


def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Return stripped, non-empty *items* with duplicates removed.

    The first occurrence of each value wins and input order is preserved.
    """
    seen = set()
    out: List[str] = []
    for item in items:
        if not item:
            continue
        key = item.strip()
        if key and key not in seen:
            seen.add(key)
            out.append(key)
    return out


def as_aliases(value: str | None) -> List[str]:
    """Split a ``|``-separated alias string into normalized, unique aliases.

    Falsy input yields an empty list; parts that normalize to an empty
    string are dropped.
    """
    if not value:
        return []
    normalized = (normalize_text(part) for part in str(value).split("|"))
    return dedupe_preserve_order(part for part in normalized if part)


def ingredient_variants(text: str) -> List[str]:
    """Return lookup candidates for *text*, most specific first.

    The first entry is the cleaned ingredient itself. For multi-word
    ingredients it is followed by the text without a leading descriptor
    (when the first word is in ``DESCRIPTOR_PREFIXES``), the last word, all
    but the last word, the first two words and the last two words.
    Duplicates are removed while keeping order. Returns ``[]`` when nothing
    is left after cleaning.
    """
    base = normalize_ingredient_for_lookup(text)
    if not base:
        return []

    words = base.split()
    variants = [base]
    if len(words) >= 2:
        if words[0] in DESCRIPTOR_PREFIXES:
            variants.append(" ".join(words[1:]))
        variants.append(words[-1])
        variants.append(" ".join(words[:-1]))
        variants.append(" ".join(words[:2]))
        variants.append(" ".join(words[-2:]))
    return dedupe_preserve_order(variants)


def normalize_ingredient_for_lookup(text: str) -> str:
    """Return *text* with amounts stripped, normalized for use as a key."""
    return normalize_text(strip_amounts_and_preps(text))


def ingredient_lookup_variants(text: str) -> List[str]:
    """Alias of :func:`ingredient_variants`."""
    return ingredient_variants(text)