anaygupta committed on
Commit
8387c0e
·
verified ·
1 Parent(s): d33a661

Update services/ner.py

Browse files
Files changed (1) hide show
  1. services/ner.py +40 -32
services/ner.py CHANGED
@@ -1,27 +1,26 @@
1
  from __future__ import annotations
2
 
3
- import re
4
  from functools import lru_cache
5
  from typing import List
6
 
7
- from .config import settings
8
  from .text_utils import dedupe_preserve_order, normalize_text, strip_amounts_and_preps
9
 
10
  try:
11
  from ingredient_parser import parse_ingredient
12
- except Exception: # pragma: no cover - graceful fallback on deploy
13
  parse_ingredient = None
14
 
15
 
16
- _AND_PREFIX_RE = re.compile(r"^(?:and|or|&)\s+", re.IGNORECASE)
 
 
 
 
 
17
 
18
 
19
  def _parsed_name(parsed) -> str:
20
- """Return the ingredient name from ingredient_parser output.
21
-
22
- The MVP notebook used parsed.name[0].text, so we mirror that behavior here
23
- and fall back to the first available name-like text if the structure differs.
24
- """
25
  if parsed is None:
26
  return ""
27
 
@@ -29,18 +28,27 @@ def _parsed_name(parsed) -> str:
29
  if not name:
30
  return ""
31
 
32
- if isinstance(name, list) and name:
33
  first = name[0]
34
- return normalize_text(getattr(first, "text", "") or "")
 
 
 
35
 
36
- return normalize_text(getattr(name, "text", "") or "")
 
37
 
38
 
39
- def _parse_one_ingredient(fragment: str) -> str:
40
- fragment = (fragment or "").strip()
41
- fragment = _AND_PREFIX_RE.sub("", fragment)
42
- fragment = strip_amounts_and_preps(fragment)
43
 
 
 
 
 
 
 
44
  if not fragment:
45
  return ""
46
 
@@ -51,31 +59,29 @@ def _parse_one_ingredient(fragment: str) -> str:
51
  if name:
52
  return name
53
  except Exception:
 
54
  pass
55
 
56
  return normalize_text(fragment)
57
 
58
 
59
- def extract_ingredients(text: str, max_items: int = 48) -> List[str]:
60
- """Extract ingredients using the notebook-style comma-separated workflow.
61
 
62
- This intentionally avoids chunking and does not run NER over sub-spans.
63
- Each comma-separated fragment is parsed independently, which is much more
64
- stable for the kinds of recipe inputs BiteWise expects.
65
  """
66
- raw = normalize_text(text or "")
67
- if not raw:
68
  return []
69
 
70
- parts = [part.strip() for part in raw.split(",")]
71
- if len(parts) == 1:
72
- parts = [raw]
73
 
74
- extracted: List[str] = []
75
  for part in parts:
76
- candidate = _parse_one_ingredient(part)
77
- candidate = _AND_PREFIX_RE.sub("", candidate).strip()
78
- candidate = re.sub(r"\s+", " ", candidate).strip()
79
  candidate = normalize_text(candidate)
80
 
81
  if not candidate:
@@ -85,6 +91,8 @@ def extract_ingredients(text: str, max_items: int = 48) -> List[str]:
85
  if candidate in {"and", "or", "the", "a", "an"}:
86
  continue
87
 
88
- extracted.append(candidate)
 
 
89
 
90
- return dedupe_preserve_order(extracted)[:max_items]
 
1
  from __future__ import annotations
2
 
 
3
  from functools import lru_cache
4
  from typing import List
5
 
 
6
  from .text_utils import dedupe_preserve_order, normalize_text, strip_amounts_and_preps
7
 
8
  try:
9
  from ingredient_parser import parse_ingredient
10
+ except Exception: # pragma: no cover - deploy-time fallback
11
  parse_ingredient = None
12
 
13
 
14
def _clean_fragment(fragment: str) -> str:
    """Strip list bullets and leading conjunctions from one ingredient fragment.

    Steps, in order:
      1. Treat a falsy input (``None`` or "") as "" and trim whitespace.
      2. Drop leading bullet characters (``-``, ``•``, ``*``).
      3. Drop leading "and" / "or" / "&" tokens, case-insensitively and
         regardless of how much whitespace follows them.
      4. Pass the remainder through ``strip_amounts_and_preps``.

    Args:
        fragment: One raw piece of recipe text (may be ``None`` or empty).

    Returns:
        The result of ``strip_amounts_and_preps`` on the cleaned fragment.
    """
    fragment = (fragment or "").strip()
    fragment = fragment.lstrip("-•*").strip()

    # Peel off leading conjunction tokens one at a time. Working on
    # whitespace-delimited tokens (instead of the literal prefixes "and " /
    # "or ") handles "And salt", "& salt", and "and\tsalt", and does not
    # depend on str.removeprefix, which only exists on Python 3.9+.
    # A fragment that is *only* a conjunction (e.g. "and") is left untouched,
    # because there is no following token to keep.
    conjunctions = {"and", "or", "&"}
    while True:
        tokens = fragment.split(None, 1)
        if len(tokens) < 2 or tokens[0].lower() not in conjunctions:
            break
        fragment = tokens[1]

    fragment = strip_amounts_and_preps(fragment)
    return fragment
20
 
21
 
22
  def _parsed_name(parsed) -> str:
23
+ """Mirror the notebook-style parsed.name[0].text behavior."""
 
 
 
 
24
  if parsed is None:
25
  return ""
26
 
 
28
  if not name:
29
  return ""
30
 
31
+ try:
32
  first = name[0]
33
+ text = getattr(first, "text", "") or ""
34
+ return normalize_text(text)
35
+ except Exception:
36
+ pass
37
 
38
+ text = getattr(name, "text", "") or ""
39
+ return normalize_text(text)
40
 
41
 
42
+ @lru_cache(maxsize=4096)
43
+ def parse_single_ingredient(fragment: str) -> str:
44
+ """Parse one comma-separated ingredient fragment.
 
45
 
46
+ This intentionally follows the original notebook approach:
47
+ split the recipe on commas first, then parse each fragment on its own.
48
+ That avoids the partial-span truncation caused by running NER over
49
+ smaller chunks and slicing model offsets.
50
+ """
51
+ fragment = _clean_fragment(fragment)
52
  if not fragment:
53
  return ""
54
 
 
59
  if name:
60
  return name
61
  except Exception:
62
+ # Fall back to a cleaned fragment if the parser fails on a token.
63
  pass
64
 
65
  return normalize_text(fragment)
66
 
67
 
68
+ def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
69
+ """Extract ingredients from comma-separated recipe text.
70
 
71
+ The MVP notebook assumed ingredients are comma separated and parsed each
72
+ item independently. We keep that behavior here because it is more stable
73
+ than chunking or span-based extraction for recipe text.
74
  """
75
+ text = (recipe_text or "").strip()
76
+ if not text:
77
  return []
78
 
79
+ # Split ONLY on commas, then parse each ingredient independently.
80
+ parts = [part.strip() for part in text.split(",")]
 
81
 
82
+ out: List[str] = []
83
  for part in parts:
84
+ candidate = parse_single_ingredient(part)
 
 
85
  candidate = normalize_text(candidate)
86
 
87
  if not candidate:
 
91
  if candidate in {"and", "or", "the", "a", "an"}:
92
  continue
93
 
94
+ out.append(candidate)
95
+ if len(out) >= max_items:
96
+ break
97
 
98
+ return dedupe_preserve_order(out)