BiteWiseFinal

Sleeping

App Files Files Community

anaygupta committed 23 days ago

Commit

8387c0e

verified ·

1 Parent(s): d33a661

Update services/ner.py

Browse files

Files changed (1) hide show

services/ner.py +40 -32

services/ner.py CHANGED Viewed

@@ -1,27 +1,26 @@
 from __future__ import annotations
-import re
 from functools import lru_cache
 from typing import List
-from .config import settings
 from .text_utils import dedupe_preserve_order, normalize_text, strip_amounts_and_preps
 try:
     from ingredient_parser import parse_ingredient
-except Exception:  # pragma: no cover - graceful fallback on deploy
     parse_ingredient = None
-_AND_PREFIX_RE = re.compile(r"^(?:and|or|&)\s+", re.IGNORECASE)
 def _parsed_name(parsed) -> str:
-    """Return the ingredient name from ingredient_parser output.
-    The MVP notebook used parsed.name[0].text, so we mirror that behavior here
-    and fall back to the first available name-like text if the structure differs.
-    """
     if parsed is None:
         return ""
@@ -29,18 +28,27 @@ def _parsed_name(parsed) -> str:
     if not name:
         return ""
-    if isinstance(name, list) and name:
         first = name[0]
-        return normalize_text(getattr(first, "text", "") or "")
-    return normalize_text(getattr(name, "text", "") or "")
-def _parse_one_ingredient(fragment: str) -> str:
-    fragment = (fragment or "").strip()
-    fragment = _AND_PREFIX_RE.sub("", fragment)
-    fragment = strip_amounts_and_preps(fragment)
     if not fragment:
         return ""
@@ -51,31 +59,29 @@ def _parse_one_ingredient(fragment: str) -> str:
             if name:
                 return name
         except Exception:
             pass
     return normalize_text(fragment)
-def extract_ingredients(text: str, max_items: int = 48) -> List[str]:
-    """Extract ingredients using the notebook-style comma-separated workflow.
-    This intentionally avoids chunking and does not run NER over sub-spans.
-    Each comma-separated fragment is parsed independently, which is much more
-    stable for the kinds of recipe inputs BiteWise expects.
     """
-    raw = normalize_text(text or "")
-    if not raw:
         return []
-    parts = [part.strip() for part in raw.split(",")]
-    if len(parts) == 1:
-        parts = [raw]
-    extracted: List[str] = []
     for part in parts:
-        candidate = _parse_one_ingredient(part)
-        candidate = _AND_PREFIX_RE.sub("", candidate).strip()
-        candidate = re.sub(r"\s+", " ", candidate).strip()
         candidate = normalize_text(candidate)
         if not candidate:
@@ -85,6 +91,8 @@ def extract_ingredients(text: str, max_items: int = 48) -> List[str]:
         if candidate in {"and", "or", "the", "a", "an"}:
             continue
-        extracted.append(candidate)
-    return dedupe_preserve_order(extracted)[:max_items]

 from __future__ import annotations
 from functools import lru_cache
 from typing import List
 from .text_utils import dedupe_preserve_order, normalize_text, strip_amounts_and_preps
 try:
     from ingredient_parser import parse_ingredient
+except Exception:  # pragma: no cover - deploy-time fallback
     parse_ingredient = None
+def _clean_fragment(fragment: str) -> str:
+    fragment = (fragment or "").strip()
+    fragment = fragment.lstrip("-•*").strip()
+    fragment = fragment.removeprefix("and ").removeprefix("or ").strip()
+    fragment = strip_amounts_and_preps(fragment)
+    return fragment
 def _parsed_name(parsed) -> str:
+    """Mirror the notebook-style parsed.name[0].text behavior."""
     if parsed is None:
         return ""
     if not name:
         return ""
+    try:
         first = name[0]
+        text = getattr(first, "text", "") or ""
+        return normalize_text(text)
+    except Exception:
+        pass
+    text = getattr(name, "text", "") or ""
+    return normalize_text(text)
+@lru_cache(maxsize=4096)
+def parse_single_ingredient(fragment: str) -> str:
+    """Parse one comma-separated ingredient fragment.
+    This intentionally follows the original notebook approach:
+    split the recipe on commas first, then parse each fragment on its own.
+    That avoids the partial-span truncation caused by running NER over
+    smaller chunks and slicing model offsets.
+    """
+    fragment = _clean_fragment(fragment)
     if not fragment:
         return ""
             if name:
                 return name
         except Exception:
+            # Fall back to a cleaned fragment if the parser fails on a token.
             pass
     return normalize_text(fragment)
+def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
+    """Extract ingredients from comma-separated recipe text.
+    The MVP notebook assumed ingredients are comma separated and parsed each
+    item independently. We keep that behavior here because it is more stable
+    than chunking or span-based extraction for recipe text.
     """
+    text = (recipe_text or "").strip()
+    if not text:
         return []
+    # Split ONLY on commas, then parse each ingredient independently.
+    parts = [part.strip() for part in text.split(",")]
+    out: List[str] = []
     for part in parts:
+        candidate = parse_single_ingredient(part)
         candidate = normalize_text(candidate)
         if not candidate:
         if candidate in {"and", "or", "the", "a", "an"}:
             continue
+        out.append(candidate)
+        if len(out) >= max_items:
+            break
+    return dedupe_preserve_order(out)