bitewise / services /text_utils.py
anaygupta's picture
Upload 22 files
df8f88e verified
from __future__ import annotations

import re
import unicodedata
from functools import lru_cache
from typing import Iterable, List

import inflect
# Module-level inflect engine, created once at import time; singularize() uses it.
_p = inflect.engine()
def normalize_text(text: str) -> str:
    """Return *text* in a canonical, comparison-friendly form.

    The result is lower-case and contains only ``a-z``, ``0-9``, single
    spaces, hyphens and straight apostrophes.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized string, with no leading or trailing whitespace.
    """
    text = (text or "").lower().strip()

    # Fold accented letters onto their base letter ("jalapeño" -> "jalapeno").
    # Without this the accent-bearing letter is treated as punctuation below
    # and the word is cut in two ("jalape o"). NFD (canonical) is used rather
    # than NFKD so compatibility characters such as "½" are not expanded.
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Curly apostrophes become a straight apostrophe so "baker’s" and
    # "baker's" compare equal. Curly *double* quotes are deliberately not
    # mapped: like straight double quotes they are dropped by the next step.
    text = re.sub(r"[\u2018\u2019]", "'", text)

    # Everything outside the allowed alphabet becomes a space.
    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
    return re.sub(r"\s+", " ", text).strip()
@lru_cache(maxsize=4096)
def singularize(word: str) -> str:
    """Return the singular form of *word* in normalized form.

    The input is passed through ``normalize_text`` first. If the inflect
    engine offers no singular form, the normalized word is returned as is.
    Results are memoized (up to 4096 distinct inputs).

    Args:
        word: A word or short phrase.

    Returns:
        The singular form, or the normalized input when none is available.
    """
    word = normalize_text(word)
    if not word:
        # Nothing to singularize; also avoids handing an empty word to inflect.
        return word
    # A falsy result means no singular form was produced.
    singular = _p.singular_noun(word)
    return singular if singular else word
def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Normalize every item and drop blanks and repeats.

    Items are compared after ``normalize_text``; the first occurrence of each
    normalized value wins and the original ordering is kept.

    Args:
        items: Any iterable of strings.

    Returns:
        The unique, non-empty normalized strings in first-seen order.
    """
    # A dict keeps insertion order, so it serves as an ordered set here.
    ordered = {}
    for raw in items:
        key = normalize_text(raw)
        if key:
            ordered.setdefault(key, None)
    return list(ordered)
# The single-glyph fractions ¼ ½ ¾ ⅓ ⅔ ⅛ ⅜ ⅝ ⅞.
_VULGAR_FRACTIONS = "\u00bc\u00bd\u00be\u2153\u2154\u215b\u215c\u215d\u215e"

# Leading quantity. Branches are ordered most-specific first because regex
# alternation takes the first branch that matches: a bare-integer branch
# placed first would consume only the "1" of "1/2".
_AMOUNT_RE = re.compile(
    r"^(?:"
    r"(?:\d+\s*)?[" + _VULGAR_FRACTIONS + r"]"  # ½, 1½, 1 ½
    r"|(?:\d+\s+)?\d+\s*/\s*\d+"                # 1/2, 1 1/2
    r"|\d+(?:\.\d+)?"                           # 2, 1.5
    r")\s*"
)

# Leading unit or size word, matched against already-normalized text.
_MEASURE_RE = re.compile(
    r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tablespoon|tablespoons|tsp|"
    r"teaspoon|teaspoons|clove|cloves|slice|slices|piece|pieces|can|cans|"
    r"bunch|bunches|handful|handfuls|pinch|pinches|large|small|medium|whole)\s+"
)

# Parenthetical notes such as "(sifted)".
_PAREN_RE = re.compile(r"\(.*?\)")

# Leading characters that normalize_text would discard anyway (whitespace,
# bullets, ...). Fraction glyphs are kept because _AMOUNT_RE needs them.
_LEADING_JUNK_RE = re.compile(r"^[^a-z0-9\-'" + _VULGAR_FRACTIONS + r"]+")


def strip_amounts_and_preps(text: str) -> str:
    """Remove the leading quantity, unit and notes from a recipe fragment.

    For example ``"1 1/2 cups Flour (sifted)"`` becomes ``"flour"``.

    The quantity and parenthetical notes are removed *before* the text is
    normalized: ``normalize_text`` discards ``.``, ``/``, parentheses and
    fraction glyphs, so running it first would turn ``"1/2 cup"`` into
    ``"1 2 cup"`` and leave a stray number behind.

    Args:
        text: One ingredient fragment; ``None`` or empty yields ``""``.

    Returns:
        The normalized ingredient text without a leading amount, a single
        leading unit/size word, a leading "of", or parenthetical notes.
    """
    text = (text or "").lower()
    text = _PAREN_RE.sub(" ", text)
    text = _LEADING_JUNK_RE.sub("", text)
    text = _AMOUNT_RE.sub("", text)

    # From here on punctuation is irrelevant, so work on normalized text
    # (this also lets "tbsp." match the unit list).
    text = normalize_text(text)
    text = _MEASURE_RE.sub("", text)
    text = re.sub(r"^of\s+", "", text)
    return re.sub(r"\s+", " ", text).strip()
def tokenize_recipe_segments(text: str) -> List[str]:
    """Break recipe text into cleaned, de-duplicated ingredient chunks.

    The text is split on commas, newlines and semicolons. Only when that
    produces a single chunk is the fallback used: splitting on the word
    "and" (case-insensitive). Each chunk is passed through
    ``strip_amounts_and_preps``; results of one character or less are dropped.

    Args:
        text: Raw recipe text; ``None`` is treated as an empty string.

    Returns:
        Normalized ingredient chunks in first-seen order, without repeats.
    """
    source = text or ""
    chunks = re.split(r",|\n|;", source)
    if len(chunks) == 1:
        # No primary separator present: fall back to splitting on "and".
        chunks = re.split(r"\s+and\s+", source, flags=re.IGNORECASE)

    stripped = (strip_amounts_and_preps(chunk) for chunk in chunks)
    return dedupe_preserve_order([entry for entry in stripped if len(entry) > 1])
def ingredient_variants(ingredient: str) -> List[str]:
    """Return alternative spellings of *ingredient*, most specific first.

    The list contains, in order:

    1. the normalized ingredient and its singular form;
    2. for names ending in a generic category word ("olive oil"), the part
       before that word and its singular form;
    3. for multi-word names, the first word, the last word, the first two
       words and everything after the first word.

    Args:
        ingredient: Ingredient name in any casing or punctuation.

    Returns:
        Normalized variants without repeats; empty when the ingredient
        normalizes to nothing.
    """
    ing = normalize_text(ingredient)
    if not ing:
        # Nothing to expand; also avoids passing an empty word to singularize().
        return []

    variants = [ing]
    singular = singularize(ing)
    if singular != ing:
        variants.append(singular)

    suffixes = (
        " cheese", " oil", " milk", " cream", " butter",
        " powder", " sauce", " paste", " extract",
    )
    for suffix in suffixes:
        # Require at least two characters in front of the suffix.
        if ing.endswith(suffix) and len(ing) > len(suffix) + 1:
            base = ing[: -len(suffix)].strip()
            variants.append(base)
            base_singular = singularize(base)
            if base_singular != base:
                variants.append(base_singular)

    words = ing.split()
    if len(words) > 1:
        # Very short fragments create false matches, so they are appended
        # after the more specific variants above.
        variants.extend([
            words[0],
            words[-1],
            " ".join(words[:2]),
            " ".join(words[1:]),
        ])
    return dedupe_preserve_order(variants)
def as_aliases(aliases: str | float | None) -> List[str]:
    """Turn a ``|``-separated alias string into a clean list.

    Args:
        aliases: Pipe-separated aliases. Any non-string value, including
            ``None``, is treated as "no aliases".

    Returns:
        Normalized aliases in first-seen order without blanks or repeats;
        an empty list for non-string input.
    """
    if isinstance(aliases, str):
        pieces = [piece.strip() for piece in aliases.split("|")]
        return dedupe_preserve_order(pieces)
    return []