anaygupta committed on
Commit
094cf40
·
verified ·
1 Parent(s): 2e9071b

Update services/text_utils.py

Browse files
Files changed (1) hide show
  1. services/text_utils.py +38 -89
services/text_utils.py CHANGED
@@ -1,102 +1,51 @@
1
- from __future__ import annotations
2
-
3
  import re
4
- from functools import lru_cache
5
- from typing import Iterable, List
6
 
7
- STOPWORDS = {
8
- "and", "or", "the", "a", "an", "some", "fresh", "dried", "chopped", "minced",
9
- "diced", "sliced", "grated", "ground", "cooked", "raw", "cold", "hot", "warm",
10
- "to", "taste", "optional", "plus", "more",
 
11
  }
12
 
13
- _AMOUNT_RE = re.compile(r"^(?:\d+(?:\.\d+)?|\d+\/\d+|[¼½¾⅓⅔⅛⅜⅝⅞])\s*")
14
- _MEASURE_RE = re.compile(
15
- r"^(?:g|kg|mg|ml|l|oz|lb|lbs|cup|cups|tbsp|tablespoon|tsp|teaspoon|clove|cloves|slice|slices|piece|pieces|can|cans|bunch|handful|pinch|large|small|medium|whole)\s+"
16
- )
17
-
18
-
19
- def normalize_text(text: str) -> str:
20
  text = (text or "").lower().strip()
21
- text = re.sub(r"[‘’“”]", "'", text)
22
- text = re.sub(r"[^a-z0-9\s\-']+", " ", text)
23
  text = re.sub(r"\s+", " ", text).strip()
24
- return text
25
-
26
-
27
- @lru_cache(maxsize=4096)
28
- def singularize(word: str) -> str:
29
- word = normalize_text(word)
30
- if len(word) <= 3:
31
- return word
32
- if word.endswith("ies") and len(word) > 4:
33
- return word[:-3] + "y"
34
- if word.endswith("ves") and len(word) > 4:
35
- return word[:-3] + "f"
36
- if word.endswith("ses") or word.endswith("xes") or word.endswith("zes") or word.endswith("ches") or word.endswith("shes"):
37
- return word[:-2]
38
- if word.endswith("s") and not word.endswith("ss"):
39
- return word[:-1]
40
- return word
41
 
 
 
 
 
 
 
 
 
42
 
43
- def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
44
- seen = set()
45
- out = []
46
- for item in items:
47
- item = normalize_text(item)
48
- if item and item not in seen:
49
- seen.add(item)
50
- out.append(item)
51
- return out
52
-
53
-
54
- def strip_amounts_and_preps(text: str) -> str:
55
- text = normalize_text(text)
56
- text = _AMOUNT_RE.sub("", text)
57
- text = _MEASURE_RE.sub("", text)
58
- text = re.sub(r"^of\s+", "", text)
59
- text = re.sub(r"\(.*?\)", "", text)
60
- text = re.sub(r"\s+", " ", text).strip()
61
- return text
62
 
 
 
 
 
63
 
64
- def tokenize_recipe_segments(text: str) -> List[str]:
65
- raw = text or ""
66
- parts = re.split(r",|\n|;|\s+and\s+", raw, flags=re.IGNORECASE)
67
- cleaned = []
68
- for part in parts:
69
- item = strip_amounts_and_preps(part)
70
- if item and len(item) > 1:
71
- cleaned.append(item)
72
- return dedupe_preserve_order(cleaned)
73
-
74
-
75
- def ingredient_variants(ingredient: str) -> List[str]:
76
- ing = normalize_text(ingredient)
77
- variants = [ing]
78
-
79
- singular = singularize(ing)
80
- if singular != ing:
81
- variants.append(singular)
82
-
83
- suffixes = [" cheese", " oil", " milk", " cream", " butter", " powder", " sauce", " paste", " extract"]
84
- for suffix in suffixes:
85
- if ing.endswith(suffix) and len(ing) > len(suffix) + 1:
86
- base = ing[:-len(suffix)].strip()
87
- variants.append(base)
88
- base_singular = singularize(base)
89
- if base_singular != base:
90
- variants.append(base_singular)
91
-
92
- words = ing.split()
93
- if len(words) > 1:
94
- variants.extend([words[0], words[-1], " ".join(words[:2]), " ".join(words[1:])])
95
 
96
- return dedupe_preserve_order(variants)
 
 
97
 
 
 
 
 
98
 
99
- def as_aliases(aliases: str | float | None) -> List[str]:
100
- if aliases is None or not isinstance(aliases, str):
101
- return []
102
- return dedupe_preserve_order(alias.strip() for alias in aliases.split("|"))
 
 
 
 
 
 
 
 
1
  import re
 
 
2
 
3
# Single-word qualifiers that can precede an ingredient name without being
# part of it. ingredient_lookup_variants uses this set to recognise a leading
# descriptor ("fresh spinach" -> "spinach").
DESCRIPTOR_PREFIXES = {
    # preparation / state
    "fresh",
    "dried",
    "ground",
    "minced",
    "chopped",
    "sliced",
    "grated",
    # size / quantity qualifiers
    "large",
    "small",
    "medium",
    "extra",
    "extra-virgin",
    # salt, cut and fat-content qualifiers
    "unsalted",
    "salted",
    "boneless",
    "skinless",
    "whole",
    "low-fat",
    "reduced-fat",
    "fat-free",
    # colour / ripeness
    "light",
    "dark",
    "white",
    "black",
    "red",
    "green",
    "ripe",
}
9
 
10
# Leading amount (digits, fractions, unicode fraction glyphs) optionally
# followed by a measurement unit. The unit must end on a word boundary so that
# short units such as "g" or "l" never eat the first letters of the ingredient
# itself ("2 garlic cloves", "1 lemon", "3 large eggs").
_LEADING_QUANTITY_RE = re.compile(
    r"^[\d\s/.½¼¾⅓⅔⅛⅜⅝⅞]+"
    r"(?:(?:g|kg|ml|l|oz|lbs?|cups?|tbsps?|tsps?|teaspoons?|tablespoons?"
    r"|cloves?|cans?|slices?|pieces?|pinch(?:es)?|dash(?:es)?|handfuls?)"
    r"\b\.?\s*)?"
)


def normalize_ingredient_for_lookup(text: str) -> str:
    """Return a lowercase, whitespace-collapsed ingredient string for lookups.

    Steps, in order:
      1. Treat ``None``/empty as ``""``; lowercase and trim.
      2. Replace brackets, braces, parentheses, ``;`` and ``:`` with spaces.
      3. Collapse runs of whitespace.
      4. Strip a leading amount and, if one follows it, a measurement unit.

    Commas are deliberately left in place; this function never splits a line
    into several ingredients.

    Args:
        text: Raw ingredient text, e.g. ``"2 cups Flour (sifted)"``.

    Returns:
        The normalized text, e.g. ``"flour sifted"``; ``""`` if nothing is left.
    """
    text = (text or "").lower().strip()
    text = re.sub(r"[\(\)\[\]\{\};:]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Remove a leading amount/unit prefix, but do NOT split on commas here.
    # A unit is only removed when an amount precedes it, and only as a whole
    # word (optionally abbreviated with a trailing period, e.g. "tbsp.").
    text = _LEADING_QUANTITY_RE.sub("", text)

    return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
def ingredient_lookup_variants(text: str) -> list[str]:
    """Return candidate lookup keys for an ingredient, most specific first.

    The text is first passed through ``normalize_ingredient_for_lookup``. The
    result contains, in order and without duplicates:
      1. the normalized text itself;
      2. the text with leading descriptors (words found in
         ``DESCRIPTOR_PREFIXES``) peeled off one at a time, e.g.
         "fresh ground black pepper" -> "ground black pepper",
         "black pepper", "pepper";
      3. for multi-word text, the last word and then everything but the
         last word.

    Args:
        text: Raw ingredient text.

    Returns:
        Ordered list of unique, non-empty variants; ``[]`` when the text
        normalizes to an empty string.
    """
    base = normalize_ingredient_for_lookup(text)
    if not base:
        return []

    words = base.split()
    variants = [base]

    # Peel every consecutive leading descriptor, not just the first, but
    # always keep at least one word so a variant is never empty.
    remaining = words
    while len(remaining) >= 2 and remaining[0] in DESCRIPTOR_PREFIXES:
        remaining = remaining[1:]
        variants.append(" ".join(remaining))

    # Also try head/tail reductions for multi-word ingredients.
    if len(words) >= 2:
        variants.append(words[-1])
        variants.append(" ".join(words[:-1]))

    # Every variant is a join of non-empty words, so none needs stripping;
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(variants))