anaygupta's picture
Update services/ner.py
c5e7cb1 verified
from __future__ import annotations
from functools import lru_cache
from typing import List
from .text_utils import dedupe_preserve_order, normalize_text, strip_amounts_and_preps
try:
from ingredient_parser import parse_ingredient
except Exception: # pragma: no cover
parse_ingredient = None
def _clean_fragment(fragment: str) -> str:
    """Tidy one raw ingredient fragment before it is parsed.

    Falsy input is treated as an empty string. Leading list-bullet
    characters (``-``, ``•``, ``*``) are removed, then a leading ``"and "``
    and a leading ``"or "`` (lowercase only, in that order), and the
    remainder is handed to ``strip_amounts_and_preps``.

    Returns:
        Whatever ``strip_amounts_and_preps`` returns for the tidied text.
    """
    text = fragment.strip() if fragment else ""

    # Drop list-bullet markers, then any whitespace that followed them.
    text = text.lstrip("-•*").strip()

    # Leading conjunctions are list glue, not part of the ingredient.
    for conjunction in ("and ", "or "):
        text = text.removeprefix(conjunction)
    text = text.strip()

    return strip_amounts_and_preps(text)
def _parsed_name(parsed) -> str:
    """Pull a normalized ingredient name out of a parser result.

    Args:
        parsed: Object returned by ``parse_ingredient`` (or ``None``). Its
            ``name`` attribute is read if present.

    Returns:
        ``normalize_text`` applied to the ``text`` of the first element of
        ``name`` when ``name`` is indexable, otherwise to ``name.text``.
        An empty string when ``parsed`` or its ``name`` is missing or falsy.
    """
    name_field = getattr(parsed, "name", None) if parsed is not None else None
    if not name_field:
        return ""

    try:
        # Sequence-shaped name: use the text of its first entry.
        head_text = getattr(name_field[0], "text", "") or ""
        return normalize_text(head_text)
    except Exception:
        # Not indexable (or the lookup failed); fall through and treat
        # ``name`` as a single object carrying ``text``.
        # NOTE(review): presumably covers differing ingredient_parser
        # versions - confirm.
        pass

    return normalize_text(getattr(name_field, "text", "") or "")
@lru_cache(maxsize=4096)
def parse_single_ingredient(fragment: str) -> str:
    """Resolve one ingredient fragment to a normalized ingredient name.

    The fragment is cleaned with ``_clean_fragment``. If the optional
    ``parse_ingredient`` backend was imported, its parsed name is preferred;
    any exception it raises, or an empty parsed name, falls back to
    ``normalize_text`` of the cleaned fragment. Results are memoized per
    raw ``fragment`` string (up to 4096 entries).

    Returns:
        The resolved name, or ``""`` when the fragment cleans down to nothing.
    """
    cleaned = _clean_fragment(fragment)
    if not cleaned:
        return ""

    parsed_name = ""
    if parse_ingredient is not None:
        try:
            parsed_name = _parsed_name(parse_ingredient(cleaned))
        except Exception:
            # Best effort: a parser failure just means the fallback is used.
            parsed_name = ""

    return parsed_name if parsed_name else normalize_text(cleaned)
def extract_ingredients(recipe_text: str, max_items: int = 48) -> List[str]:
    """Extract up to ``max_items`` distinct ingredient names from text.

    The text is split on commas; each piece is resolved with
    ``parse_single_ingredient`` and passed through ``normalize_text``.
    Empty results, single-character results and the filler words
    ``and``/``or``/``the``/``a``/``an`` are discarded.

    Args:
        recipe_text: Comma-separated ingredient text. ``None`` or blank
            text yields an empty list.
        max_items: Upper bound on the number of names returned. Values
            less than or equal to zero yield an empty list.

    Returns:
        Ingredient names in first-seen order, without duplicates.
    """
    text = (recipe_text or "").strip()
    if not text or max_items <= 0:
        return []

    filler_words = {"and", "or", "the", "a", "an"}
    seen = set()
    out: List[str] = []
    for part in text.split(","):
        candidate = normalize_text(parse_single_ingredient(part.strip()))
        if not candidate or len(candidate) < 2 or candidate in filler_words:
            continue
        # Skip repeats *before* counting towards max_items, so duplicates
        # cannot use up the budget and crowd out later distinct names.
        if candidate in seen:
            continue
        seen.add(candidate)
        out.append(candidate)
        if len(out) >= max_items:
            break

    # Kept so any additional de-duplication rules of the shared helper
    # still apply to the final list.
    return dedupe_preserve_order(out)