from __future__ import annotations

import re
import unicodedata
from functools import lru_cache
from typing import Iterable, List

import inflect
|
|
# Module-wide inflect engine, built once at import time and reused by
# singularize() for every lookup.
_p = inflect.engine()
|
|
|
|
def normalize_text(text: str) -> str:
    """Return a lowercase, ASCII-only, single-spaced form of *text*.

    Accented letters are folded to their base letter (an "n" with a tilde
    becomes ``n``) instead of being dropped, curly apostrophes become ``'``,
    and every other character outside ``a-z``, ``0-9``, whitespace, ``-`` and
    ``'`` is replaced by a space. Runs of whitespace collapse to one space and
    the result is stripped. Falsy input (``None``, ``""``) yields ``""``.
    """
    text = (text or "").lower()
    # Canonical decomposition splits an accented letter into base letter +
    # combining mark; dropping the marks keeps the word in one piece. NFD (not
    # NFKD) is used so characters such as vulgar fractions are left untouched.
    text = "".join(
        ch
        for ch in unicodedata.normalize("NFD", text)
        if not unicodedata.combining(ch)
    )
    # Only single curly quotes act as apostrophes. Curly double quotes are left
    # for the next step so they are erased exactly like straight double quotes.
    text = re.sub(r"[\u2018\u2019]", "'", text)
    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
@lru_cache(maxsize=4096)
def singularize(word: str) -> str:
    """Return the normalized singular form of *word*.

    The word is first passed through ``normalize_text``. When inflect's
    ``singular_noun`` returns a falsy value, the normalized word is returned
    unchanged. Results are memoized with an LRU cache of 4096 entries.
    """
    word = normalize_text(word)
    if not word:
        # Nothing to inflect. Some inflect releases appear to reject empty
        # words outright (TODO confirm against the pinned version), so never
        # hand one over.
        return word
    singular = _p.singular_noun(word)
    return singular or word
|
|
|
|
def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Normalize *items* and return the unique non-empty results in order.

    Each item goes through ``normalize_text``. Values that normalize to an
    empty string are dropped, and only the first occurrence of each
    normalized value is kept.
    """
    normalized = (normalize_text(entry) for entry in items)
    # dict keys are unique and keep insertion order, i.e. first-seen order.
    return list(dict.fromkeys(value for value in normalized if value))
|
|
|
|
# Leading quantity, optionally preceded by whitespace or symbols (bullets
# etc.) that normalization would erase anyway. Fraction forms are listed
# before the plain-number form so "1/2" is consumed whole rather than
# matching only its leading "1". Covers "1½", "½", "1 1/2", "1/2", "1.5", "2".
_AMOUNT_RE = re.compile(
    r"^[^\w'\-]*"
    r"(?:(?:\d+\s*)?[\u00bc\u00bd\u00be\u2153\u2154\u215b\u215c\u215d\u215e]"
    r"|(?:\d+\s+)?\d+/\d+"
    r"|\d+(?:\.\d+)?)\s*"
)
# One leading unit or size word, singular or plural, followed by whitespace.
_MEASURE_RE = re.compile(
    r"^(?:g|kg|mg|ml|l|oz|lbs?|cups?|tbsp|tablespoons?|tsp|teaspoons?|"
    r"cloves?|slices?|pieces?|cans?|bunch(?:es)?|handfuls?|pinch(?:es)?|"
    r"large|small|medium|whole)\s+"
)
# Parenthetical note such as "(sifted)".
_PAREN_RE = re.compile(r"\(.*?\)")


def strip_amounts_and_preps(text: str) -> str:
    """Strip the leading quantity, unit/size word and notes from a fragment.

    Removes, in this order: parenthetical notes, one leading amount, one
    leading unit or size word, and a leading "of". The remainder is returned
    normalized (see ``normalize_text``). Falsy input yields ``""``.
    """
    raw = text or ""
    # normalize_text turns parentheses, "/", "." and vulgar-fraction characters
    # into spaces, so notes and amounts have to be removed before it runs;
    # afterwards "1.5 cups" would already read "1 5 cups".
    raw = _PAREN_RE.sub(" ", raw)
    raw = _AMOUNT_RE.sub("", raw)
    cleaned = normalize_text(raw)
    cleaned = _MEASURE_RE.sub("", cleaned)
    cleaned = re.sub(r"^of\s+", "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
|
|
|
|
def tokenize_recipe_segments(text: str) -> List[str]:
    """Break recipe text into cleaned, de-duplicated ingredient fragments.

    The text is split on commas, newlines and semicolons. Only when that
    yields a single piece is it split on the word "and" (case-insensitive)
    instead. Every piece is run through ``strip_amounts_and_preps``; results
    shorter than two characters are discarded before de-duplication.
    """
    source = text or ""
    chunks = re.split(r",|\n|;", source)
    if len(chunks) == 1:
        # No primary separator present: fall back to splitting on "and".
        chunks = re.split(r"\s+and\s+", source, flags=re.IGNORECASE)

    stripped = (strip_amounts_and_preps(chunk) for chunk in chunks)
    return dedupe_preserve_order([frag for frag in stripped if len(frag) > 1])
|
|
|
|
def ingredient_variants(ingredient: str) -> List[str]:
    """Return lookup variants for *ingredient*, most specific first.

    The list contains, in order and without duplicates:

    * the normalized ingredient and its singular form;
    * for names ending in a known product suffix (" cheese", " oil", ...),
      the part before the suffix and its singular form;
    * for multi-word names, the first word, the last word, the first two
      words, and everything after the first word.

    An ingredient that normalizes to nothing yields an empty list.
    """
    ing = normalize_text(ingredient)
    if not ing:
        # Empty, None or punctuation-only input: there is nothing to vary, and
        # an empty word must not be sent on to singularize().
        return []

    # Duplicates (e.g. a word that is already singular) are removed by the
    # final dedupe, which keeps the first occurrence and therefore the order.
    variants = [ing, singularize(ing)]

    suffixes = (
        " cheese",
        " oil",
        " milk",
        " cream",
        " butter",
        " powder",
        " sauce",
        " paste",
        " extract",
    )
    for suffix in suffixes:
        # Require at least two characters in front of the suffix.
        if ing.endswith(suffix) and len(ing) > len(suffix) + 1:
            base = ing[: -len(suffix)].strip()
            variants.extend((base, singularize(base)))

    words = ing.split()
    if len(words) > 1:
        variants.extend(
            (
                words[0],
                words[-1],
                " ".join(words[:2]),
                " ".join(words[1:]),
            )
        )

    return dedupe_preserve_order(variants)
|
|
|
|
def as_aliases(aliases: str | float | None) -> List[str]:
    """Parse a ``|``-separated alias string into a de-duplicated list.

    Any value that is not a string (``None``, a float, ...) yields an empty
    list. String input is split on ``|`` and passed through
    ``dedupe_preserve_order``.
    """
    if isinstance(aliases, str):
        return dedupe_preserve_order(piece.strip() for piece in aliases.split("|"))
    # Covers None as well as every other non-string value.
    return []
|
|