from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple

from .text_utils import normalize_text, singularize, ingredient_lookup_variants

try:
    import gensim.downloader as api
except Exception:  # pragma: no cover
    # gensim is optional; without it the fallback reports itself unavailable.
    api = None

DEFAULT_MODEL = "glove-wiki-gigaword-50"


@dataclass
class SemanticCandidate:
    """A candidate term paired with its similarity score."""

    term: str
    score: float


class WordVectorFallback:
    """Best-effort nearest-neighbour lookup backed by a gensim word-vector model.

    Construction never raises because of a missing or broken model: any
    failure leaves ``available`` set to ``False`` and ``most_similar``
    returning an empty list.

    After construction ``_kind`` is one of:

    * ``"unavailable"`` -- ``gensim.downloader`` could not be imported;
    * ``"glove"`` -- a model was loaded;
    * ``"disabled"`` -- loading was skipped or failed.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        model_path: str | None = None,
        enable_download: bool = True,
    ):
        """Store the configuration and immediately attempt to load the model.

        Args:
            model_name: Name passed to ``gensim.downloader.load``; a falsy
                value falls back to ``DEFAULT_MODEL``.
            model_path: Optional filesystem path. It is only checked for
                existence (see ``_load``); nothing is read from it.
            enable_download: Whether the model may be loaded when
                ``model_path`` is unset or does not exist.
        """
        self.model_name = model_name or DEFAULT_MODEL
        self.model_path = model_path
        self.enable_download = enable_download
        self.available = False
        self._kind = "disabled"
        self._model = None
        self._load()

    def _load(self) -> None:
        """Try to load the model, recording the outcome on the instance.

        Sets ``_model``, ``available`` and ``_kind``. Every exception raised
        while loading is swallowed on purpose so that this optional feature
        can never break its caller.
        """
        if api is None:
            self.available = False
            self._kind = "unavailable"
            return

        try:
            # NOTE(review): local path loading is not implemented. An existing
            # ``model_path`` merely permits loading the *named* model through
            # the gensim downloader, even when ``enable_download`` is False.
            # Kept permissive as in the original; confirm the on-disk format
            # before loading from the path directly.
            has_local_path = bool(self.model_path) and Path(self.model_path).exists()
            if has_local_path or self.enable_download:
                self._model = api.load(self.model_name)
            else:
                self._model = None
            self.available = self._model is not None
            self._kind = "glove" if self.available else "disabled"
        except Exception:
            # Deliberate best-effort: fall back to the disabled state.
            self._model = None
            self.available = False
            self._kind = "disabled"

    def _normalize_candidate(self, term: str) -> str:
        """Return ``term`` passed through ``normalize_text`` then ``singularize``."""
        return singularize(normalize_text(term))

    def most_similar(self, ingredient: str, topn: int = 10) -> List[Tuple[str, float]]:
        """Return up to ``topn`` normalized neighbours of ``ingredient``.

        Args:
            ingredient: Free-text term; it is normalized before the lookup.
            topn: Maximum number of results. Values ``<= 0`` yield ``[]``.

        Returns:
            ``(term, score)`` pairs in the order the model returned them,
            de-duplicated after normalization. The list is empty when the
            model is not loaded, the normalized query is empty, or the model
            lookup raises (for example for an out-of-vocabulary query).
        """
        # Without this guard the limit check below, which runs only after an
        # append, would let one result through for topn <= 0.
        if topn <= 0 or not self.available or self._model is None:
            return []

        query = self._normalize_candidate(ingredient)
        if not query:
            return []

        try:
            # Request at least 10 neighbours so that filtering still leaves
            # candidates when the caller asks for only a few.
            raw = self._model.most_similar(query, topn=max(topn, 10))
        except Exception:
            return []

        query_parts = set(query.split())  # loop-invariant, built once
        out: List[Tuple[str, float]] = []
        seen = set()
        for raw_term, score in raw:
            term = self._normalize_candidate(raw_term.replace("_", " "))
            if not term or term in seen:
                continue
            seen.add(term)

            # Keep only candidates with at least some lexical overlap with
            # the query. A candidate sharing no word is kept only when
            # ingredient_lookup_variants() returns something for it
            # (presumably meaning it looks like a known ingredient; that
            # helper lives in text_utils -- confirm there).
            term_parts = set(term.split())
            if query_parts and term_parts and query_parts.isdisjoint(term_parts):
                if not ingredient_lookup_variants(term):
                    continue

            out.append((term, float(score)))
            if len(out) >= topn:
                break
        return out