Spaces:

anaygupta
/

bitewise

Configuration error

App Files Files Community

bitewise / services /text_utils.py

anaygupta

Upload 22 files

df8f88e verified 16 days ago

raw

history blame contribute delete

3.33 kB

	from __future__ import annotations

	import re
	from functools import lru_cache
	from typing import Iterable, List

	import inflect

	# Module-level inflect engine, created once at import time and reused by
	# singularize() for its singular_noun() lookups.
	_p = inflect.engine()


	def normalize_text(text: str) -> str:
	    """Return a lowercase, whitespace-collapsed, ASCII-only form of *text*.

	    Falsy input (``None`` or ``""``) yields ``""``. Curly single and double
	    quotes become a straight apostrophe; every run of characters other than
	    ``a-z``, ``0-9``, whitespace, ``-`` and ``'`` is replaced by one space;
	    finally whitespace runs are collapsed and the ends are trimmed.
	    """
	    cleaned = (text or "").lower().strip()
	    # Applied strictly in this order: quotes must be unified before the
	    # character filter runs, otherwise they would be turned into spaces.
	    substitutions = (
	        (r"[\u2018\u2019\u201c\u201d]", "'"),
	        (r"[^a-z0-9\s\-']+", " "),
	        (r"\s+", " "),
	    )
	    for pattern, replacement in substitutions:
	        cleaned = re.sub(pattern, replacement, cleaned)
	    return cleaned.strip()


	@lru_cache(maxsize=4096)
	def singularize(word: str) -> str:
	    """Return the singular form of *word*, or the normalized word itself.

	    The input is passed through ``normalize_text`` first, so the result is
	    always lowercase and whitespace-collapsed. Results are memoized (up to
	    4096 distinct inputs).

	    Args:
	        word: A word or short phrase; falsy values are treated as ``""``.

	    Returns:
	        The singular form reported by inflect, or the normalized input when
	        inflect reports no singular form (or the input is empty).
	    """
	    word = normalize_text(word)
	    if not word:
	        # Guard: some inflect releases validate that the word is non-empty
	        # and raise on "", so never hand an empty string to the engine.
	        return word
	    # singular_noun() yields a falsy value when it has no singular form to
	    # offer; fall back to the normalized word in that case.
	    return _p.singular_noun(word) or word


	def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
	    """Normalize each item and return the distinct non-empty results.

	    Every item goes through ``normalize_text``; items that normalize to the
	    empty string are dropped, and only the first occurrence of each
	    normalized value is kept, in the order it was first seen.
	    """
	    normalized = (normalize_text(entry) for entry in items)
	    # dict keys are unique and keep insertion order, which gives an
	    # order-preserving de-duplication in a single pass.
	    return list(dict.fromkeys(value for value in normalized if value))


	_AMOUNT_RE = re.compile(
	r"^(?:\d+(?:\.\d+)?\|\d+/\d+\|[\u00bc\u00bd\u00be\u2153\u2154\u215b\u215c\u215d\u215e])\s*"
	)
	_MEASURE_RE = re.compile(
	r"^(?:g\|kg\|mg\|ml\|l\|oz\|lb\|lbs\|cup\|cups\|tbsp\|tablespoon\|tsp\|teaspoon\|clove\|cloves\|slice\|slices\|piece\|pieces\|can\|cans\|bunch\|handful\|pinch\|large\|small\|medium\|whole)\s+"
	)


	def strip_amounts_and_preps(text: str) -> str:
	    """Remove a leading quantity, unit/size word and parentheticals.

	    Steps, in order: drop ``(...)`` asides, strip a leading amount
	    (``_AMOUNT_RE``), normalize the text, strip a leading unit or size word
	    (``_MEASURE_RE``), strip a leading ``of``, and collapse whitespace.

	    Args:
	        text: One recipe fragment, e.g. ``"2 cups of flour (sifted)"``.
	            Falsy values are treated as ``""``.

	    Returns:
	        The normalized remainder of the fragment, possibly ``""``.
	    """
	    # Parentheticals and amounts are handled on the lowercased raw text,
	    # BEFORE normalize_text(): normalization replaces "/", ".", parentheses
	    # and vulgar-fraction characters with spaces, which would leave
	    # fragments like "1 2 cup" that the patterns can no longer recognize.
	    fragment = (text or "").lower()
	    fragment = re.sub(r"\(.*?\)", " ", fragment)
	    fragment = _AMOUNT_RE.sub("", fragment.strip())

	    fragment = normalize_text(fragment)
	    fragment = _MEASURE_RE.sub("", fragment)
	    fragment = re.sub(r"^of\s+", "", fragment)
	    return re.sub(r"\s+", " ", fragment).strip()


	def tokenize_recipe_segments(text: str) -> List[str]:
	    """Split a recipe into ingredient-like chunks.

	    The MVP demo worked best with comma-separated ingredients, so that
	    behaviour comes first: the text is split on commas, newlines and
	    semicolons. Only when none of those separators is present is a small
	    fallback split on the word "and" (case-insensitive) used.

	    Each chunk is cleaned with ``strip_amounts_and_preps``; chunks shorter
	    than two characters are discarded and the rest are de-duplicated in
	    first-seen order.

	    Args:
	        text: Free-form recipe text; falsy values are treated as ``""``.

	    Returns:
	        The cleaned, distinct chunks in order of first appearance.
	    """
	    raw = text or ""
	    # A character class splits on any ONE of the separators; an escaped
	    # "\|" would only match a literal pipe and never split real input.
	    parts = re.split(r"[,\n;]", raw)
	    if len(parts) == 1:
	        parts = re.split(r"\s+and\s+", raw, flags=re.IGNORECASE)

	    stripped = (strip_amounts_and_preps(part) for part in parts)
	    # Single characters are noise, not ingredients.
	    return dedupe_preserve_order(item for item in stripped if len(item) > 1)


	def ingredient_variants(ingredient: str) -> List[str]:
	    """List alternative spellings of *ingredient*, most specific first.

	    The list starts with the normalized name and its singular form, then
	    adds the name with a trailing category word removed (" cheese", " oil",
	    ...) together with that stem's singular, and finally, for multi-word
	    names, the first word, the last word, the first two words and
	    everything after the first word. Duplicates and empty entries are
	    removed while keeping first-seen order.
	    """
	    name = normalize_text(ingredient)
	    candidates = [name]

	    name_singular = singularize(name)
	    if name_singular != name:
	        candidates.append(name_singular)

	    category_tails = (
	        " cheese", " oil", " milk", " cream", " butter",
	        " powder", " sauce", " paste", " extract",
	    )
	    for tail in category_tails:
	        # Skip names that lack this tail or have at most one character
	        # in front of it.
	        if not name.endswith(tail) or len(name) <= len(tail) + 1:
	            continue
	        stem = name[: -len(tail)].strip()
	        candidates.append(stem)
	        stem_singular = singularize(stem)
	        if stem_singular != stem:
	            candidates.append(stem_singular)

	    tokens = name.split()
	    if len(tokens) > 1:
	        # Short fragments create false matches, so they are appended after
	        # every longer variant and act only as a last resort.
	        candidates += [
	            tokens[0],
	            tokens[-1],
	            " ".join(tokens[:2]),
	            " ".join(tokens[1:]),
	        ]

	    return dedupe_preserve_order(candidates)


	def as_aliases(aliases: str | float | None) -> List[str]:
	    """Parse a pipe-separated alias string into a clean list.

	    Args:
	        aliases: Aliases joined with ``|``, e.g. ``"scallion|green onion"``.
	            Any non-string value (``None``, a float, ...) means "no
	            aliases". Presumably floats are NaN from a missing table cell;
	            verify against the caller.

	    Returns:
	        The normalized, distinct, non-empty aliases in first-seen order, or
	        ``[]`` when *aliases* is not a string.
	    """
	    # One isinstance check also covers None.
	    if not isinstance(aliases, str):
	        return []
	    # The separator is a bare pipe; "\|" would look for a backslash too.
	    return dedupe_preserve_order(alias.strip() for alias in aliases.split("|"))