BiteWiseFinal

Sleeping

App Files Community

anaygupta committed 15 days ago

Commit

094cf40

verified ·

1 Parent(s): 2e9071b

Update services/text_utils.py

Browse files

Files changed (1) hide show

services/text_utils.py +38 -89

services/text_utils.py CHANGED Viewed

@@ -1,102 +1,51 @@
-from __future__ import annotations
 import re
-from functools import lru_cache
-from typing import Iterable, List
-STOPWORDS = {
-    "and", "or", "the", "a", "an", "some", "fresh", "dried", "chopped", "minced",
-    "diced", "sliced", "grated", "ground", "cooked", "raw", "cold", "hot", "warm",
-    "to", "taste", "optional", "plus", "more",
 }
-_AMOUNT_RE = re.compile(r"^(?:\d+(?:\.\d+)?|\d+\/\d+|[¼½¾⅓⅔⅛⅜⅝⅞])\s*")
-_MEASURE_RE = re.compile(
-    r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tablespoon|tsp|teaspoon|clove|cloves|slice|slices|piece|pieces|can|cans|bunch|handful|pinch|large|small|medium|whole)\s+"
-)
-def normalize_text(text: str) -> str:
     text = (text or "").lower().strip()
-    text = re.sub(r"[‘’“”]", "'", text)
-    text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
     text = re.sub(r"\s+", " ", text).strip()
-    return text
-@lru_cache(maxsize=4096)
-def singularize(word: str) -> str:
-    word = normalize_text(word)
-    if len(word) <= 3:
-        return word
-    if word.endswith("ies") and len(word) > 4:
-        return word[:-3] + "y"
-    if word.endswith("ves") and len(word) > 4:
-        return word[:-3] + "f"
-    if word.endswith("ses") or word.endswith("xes") or word.endswith("zes") or word.endswith("ches") or word.endswith("shes"):
-        return word[:-2]
-    if word.endswith("s") and not word.endswith("ss"):
-        return word[:-1]
-    return word
-def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
-    seen = set()
-    out = []
-    for item in items:
-        item = normalize_text(item)
-        if item and item not in seen:
-            seen.add(item)
-            out.append(item)
-    return out
-def strip_amounts_and_preps(text: str) -> str:
-    text = normalize_text(text)
-    text = _AMOUNT_RE.sub("", text)
-    text = _MEASURE_RE.sub("", text)
-    text = re.sub(r"^of\s+", "", text)
-    text = re.sub(r"\(.*?\)", "", text)
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-def tokenize_recipe_segments(text: str) -> List[str]:
-    raw = text or ""
-    parts = re.split(r",|\n|;|\s+and\s+", raw, flags=re.IGNORECASE)
-    cleaned = []
-    for part in parts:
-        item = strip_amounts_and_preps(part)
-        if item and len(item) > 1:
-            cleaned.append(item)
-    return dedupe_preserve_order(cleaned)
-def ingredient_variants(ingredient: str) -> List[str]:
-    ing = normalize_text(ingredient)
-    variants = [ing]
-    singular = singularize(ing)
-    if singular != ing:
-        variants.append(singular)
-    suffixes = [" cheese", " oil", " milk", " cream", " butter", " powder", " sauce", " paste", " extract"]
-    for suffix in suffixes:
-        if ing.endswith(suffix) and len(ing) > len(suffix) + 1:
-            base = ing[:-len(suffix)].strip()
-            variants.append(base)
-            base_singular = singularize(base)
-            if base_singular != base:
-                variants.append(base_singular)
-    words = ing.split()
-    if len(words) > 1:
-        variants.extend([words[0], words[-1], " ".join(words[:2]), " ".join(words[1:])])
-    return dedupe_preserve_order(variants)
-def as_aliases(aliases: str | float | None) -> List[str]:
-    if aliases is None or not isinstance(aliases, str):
-        return []
-    return dedupe_preserve_order(alias.strip() for alias in aliases.split("|"))

 import re
+DESCRIPTOR_PREFIXES = {  # words that ingredient_lookup_variants compares against the FIRST word of a name
+    "fresh", "dried", "ground", "minced", "chopped", "sliced", "grated",
+    "large", "small", "medium", "extra", "extra-virgin", "unsalted", "salted",
+    "boneless", "skinless", "whole", "low-fat", "reduced-fat", "fat-free",
+    "light", "dark", "white", "black", "red", "green", "ripe"
 }  # on a match, the name without that leading word is offered as an extra lookup variant
# Brackets, semicolons and colons become spaces. Commas are deliberately kept:
# this function normalizes ONE ingredient and must not split lists.
_LOOKUP_PUNCT_RE = re.compile(r"[()\[\]{};:]")
_LOOKUP_SPACE_RE = re.compile(r"\s+")
# Leading quantity (digits, slashes, dots, vulgar fractions, spaces) optionally
# followed by a unit. The unit must end on a word boundary; without it the
# one-letter units swallow the start of real words ("2 large" -> "arge") and
# "cup" wins over "cups" ("2 cups flour" -> "s flour").
_LOOKUP_AMOUNT_RE = re.compile(
    r"^[\d\s/.½¼¾⅓⅔⅛⅜⅝⅞]+"
    r"(?:(?:g|kg|ml|l|oz|lbs?|cups?|tbsp|tsp|teaspoons?|tablespoons?"
    r"|cloves?|cans?|slices?|pieces?|pinch(?:es)?|dash(?:es)?|handfuls?)"
    r"\b\.?\s*)?"
)


def normalize_ingredient_for_lookup(text: str) -> str:
    """Normalize a single ingredient string for dictionary-style lookup.

    The text is lower-cased, bracket/semicolon/colon characters are turned
    into spaces, whitespace is collapsed, and a leading amount with an
    optional unit ("2 cups", "500g", "1/2 tbsp.") is removed.  Commas are
    left untouched.

    Args:
        text: Raw ingredient text.  ``None``, an empty string, or any
            non-string value yields ``""``.

    Returns:
        The normalized ingredient text, possibly empty.
    """
    # Non-string values (e.g. a missing cell read as a float) cannot be
    # normalized; treat them like missing text instead of raising.
    if not isinstance(text, str):
        return ""
    cleaned = _LOOKUP_PUNCT_RE.sub(" ", text.lower().strip())
    cleaned = _LOOKUP_SPACE_RE.sub(" ", cleaned).strip()
    return _LOOKUP_AMOUNT_RE.sub("", cleaned).strip()
def ingredient_lookup_variants(text: str) -> list[str]:
    """Build the ordered list of lookup candidates for one ingredient.

    The normalized ingredient always comes first.  For names of two or more
    words the following are appended, in this order: the name without its
    first word when that word is in ``DESCRIPTOR_PREFIXES``, the last word
    alone, and the name without its last word.  Duplicates and empty strings
    are dropped while the first-seen order is kept.

    Args:
        text: Raw ingredient text.

    Returns:
        Candidate strings, or an empty list when normalization yields nothing.
    """
    normalized = normalize_ingredient_for_lookup(text)
    if not normalized:
        return []

    tokens = normalized.split()
    candidates = [normalized]
    if len(tokens) >= 2:
        # e.g. "fresh spinach" -> "spinach"
        if tokens[0] in DESCRIPTOR_PREFIXES:
            candidates.append(" ".join(tokens[1:]))
        # tail word first, then everything but the tail
        candidates += [tokens[-1], " ".join(tokens[:-1])]

    # dict keys keep insertion order, which gives order-preserving de-duplication
    trimmed = (candidate.strip() for candidate in candidates)
    return list(dict.fromkeys(candidate for candidate in trimmed if candidate))