BiteWiseFinal / services /text_utils.py
anaygupta's picture
Update services/text_utils.py
f43f12b verified
from __future__ import annotations
import re
from typing import Iterable, List
# `inflect` is an optional dependency. When it cannot be imported (or its
# engine cannot be constructed) `_INFLECT` is left as None and `singularize`
# falls back to returning its normalized input unchanged.
try:
    import inflect
    _INFLECT = inflect.engine()
except Exception: # pragma: no cover
    # Deliberately broad: any failure here only disables singularization.
    _INFLECT = None
# Matches a leading quantity at the start of an ingredient line: a run of
# digits, whitespace, "/", "." and common vulgar fractions, optionally followed
# by a unit of measure.
#
# The unit must end on a word boundary (\b). Without it the alternation stops
# at the first alternative that fits and eats into the following word:
# "2 cups flour" -> "s flour" (matched "cup"), "2 lb beef" -> "b beef"
# (matched "l"), "2 garlic cloves" -> "arlic cloves" (matched "g").
AMOUNT_PREFIX_RE = re.compile(
    r"""^\s*
    [\d\s\/\.½¼¾⅓⅔]+
    \s*
    (?:
        (?:g|kg|mg|ml|l|oz|lb|pound|pounds|cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|
        clove|cloves|can|cans|slice|slices|piece|pieces|pinch|dash|handful|sprig|sprigs|bunch|bunches|
        package|packages|jar|jars|stalk|stalks|stick|sticks|packet|packets|quart|quarts|pint|pints)
        \b
    )?
    \s*
    """,
    re.IGNORECASE | re.VERBOSE,
)
# A parenthesized or square-bracketed segment (shortest match), together with
# any whitespace immediately around it.
PAREN_RE = re.compile(r"\s*[\(\[].*?[\)\]]\s*")
# One or more consecutive whitespace characters; used to collapse runs to a
# single space.
MULTISPACE_RE = re.compile(r"\s+")
# Leading words that `ingredient_variants` treats as descriptors: when an
# ingredient phrase starts with one of these, a variant without that first
# word is also produced.
DESCRIPTOR_PREFIXES = {
    # state / preparation
    "fresh",
    "dried",
    "ground",
    "minced",
    "chopped",
    "sliced",
    "grated",
    "ripe",
    "plain",
    "organic",
    "whole",
    # size
    "large",
    "small",
    "medium",
    "extra",
    # fat / salt / cut qualifiers
    "extra-virgin",
    "unsalted",
    "salted",
    "boneless",
    "skinless",
    "low-fat",
    "reduced-fat",
    "fat-free",
    # shade / colour
    "light",
    "dark",
    "white",
    "black",
    "red",
    "green",
}
def normalize_text(text: str) -> str:
    """Return a lowercased, punctuation-free, single-spaced form of *text*.

    A falsy *text* (``None`` or ``""``) yields ``""``. Parenthesized and
    square-bracketed segments are dropped, every character that is not a word
    character, whitespace, ``-`` or ``/`` becomes a space, and runs of
    whitespace collapse to one space.
    """
    cleaned = (text or "").lower().strip()

    # Curly quotes -> straight quotes, newlines -> spaces. The newline swap
    # has to happen before PAREN_RE runs because "." in that pattern does not
    # cross line breaks.
    for source_char, target_char in (("’", "'"), ("“", '"'), ("”", '"'), ("\n", " ")):
        cleaned = cleaned.replace(source_char, target_char)

    without_brackets = PAREN_RE.sub(" ", cleaned)
    # Keep only word characters, whitespace, hyphens and slashes.
    without_punct = re.sub(r"[^\w\s\-/]", " ", without_brackets)
    return MULTISPACE_RE.sub(" ", without_punct).strip()
def strip_amounts_and_preps(text: str) -> str:
    """Normalize *text* and remove a leading bullet, quantity and unit.

    The text is passed through ``normalize_text``, leading list-bullet
    characters (``-``, ``•``, ``*``) are removed, and the prefix matched by
    ``AMOUNT_PREFIX_RE`` is dropped. Despite the name, this function itself
    removes no preparation words; it only strips bullets and amounts.

    Returns the remaining text, single-spaced and trimmed (``""`` for empty
    input).
    """
    cleaned = normalize_text(text)
    # Bullets must go first: AMOUNT_PREFIX_RE is anchored at the start of the
    # string, so a leading "-" (which normalize_text keeps) would otherwise
    # hide the amount and "- 2 eggs" would come back as "2 eggs".
    cleaned = cleaned.lstrip("-•*").strip()
    cleaned = AMOUNT_PREFIX_RE.sub("", cleaned)
    # Second pass: drop a dash left over directly after the amount,
    # e.g. "2 - flour".
    cleaned = cleaned.lstrip("-•*").strip()
    return MULTISPACE_RE.sub(" ", cleaned).strip()
def singularize(text: str) -> str:
    """Return the normalized *text*, singularized when ``inflect`` is available.

    The input is first passed through ``normalize_text``. If the result is
    empty, or the optional ``inflect`` engine could not be set up, the
    normalized text is returned as-is. A falsy result from
    ``singular_noun`` is treated as "no change".
    """
    cleaned = normalize_text(text)
    if not cleaned or _INFLECT is None:
        return cleaned
    return _INFLECT.singular_noun(cleaned) or cleaned
def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
    """Return the distinct, stripped, non-empty entries of *items* in order.

    Falsy entries (``None``, ``""``) are skipped, the rest are stripped of
    surrounding whitespace, and entries that are empty after stripping are
    dropped. The first occurrence of each value determines its position.
    """
    stripped = (entry.strip() for entry in items if entry)
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in one pass.
    return list(dict.fromkeys(value for value in stripped if value))
def as_aliases(value: str | None) -> List[str]:
    """Split a ``|``-separated alias string into normalized, unique aliases.

    A falsy *value* yields an empty list. Otherwise *value* is converted with
    ``str()``, split on ``|``, each piece is run through ``normalize_text``,
    and pieces that normalize to an empty string are discarded. Duplicates
    are removed while keeping first-seen order.
    """
    if not value:
        return []
    normalized = (normalize_text(piece) for piece in str(value).split("|"))
    return dedupe_preserve_order([alias for alias in normalized if alias])
def ingredient_variants(text: str) -> List[str]:
    """Return candidate lookup strings for an ingredient line, most specific first.

    The line is reduced with ``strip_amounts_and_preps`` and ``normalize_text``;
    an empty result yields ``[]``. The full phrase always comes first. For
    phrases of two or more words the following are appended, in this order:

    1. the phrase without its first word, if that word is in
       ``DESCRIPTOR_PREFIXES``;
    2. the last word;
    3. everything except the last word;
    4. the first two words;
    5. the last two words.

    Duplicates are removed while preserving order, e.g.
    ``"fresh basil leaves"`` ->
    ``["fresh basil leaves", "basil leaves", "leaves", "fresh basil"]``.
    """
    base = normalize_text(strip_amounts_and_preps(text))
    if not base:
        return []

    tokens = base.split()
    candidates = [base]
    if len(tokens) > 1:
        if tokens[0] in DESCRIPTOR_PREFIXES:
            candidates.append(" ".join(tokens[1:]))
        candidates += [
            tokens[-1],
            " ".join(tokens[:-1]),
            " ".join(tokens[:2]),
            " ".join(tokens[-2:]),
        ]
    return dedupe_preserve_order(candidates)
def normalize_ingredient_for_lookup(text: str) -> str:
    """Return *text* with its leading amount removed, in normalized form.

    Applies ``strip_amounts_and_preps`` and then ``normalize_text`` to the
    result.
    """
    without_amount = strip_amounts_and_preps(text)
    return normalize_text(without_amount)
def ingredient_lookup_variants(text: str) -> List[str]:
    """Return the lookup variants for *text*; delegates to ``ingredient_variants``."""
    variants = ingredient_variants(text)
    return variants