Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| from .text_utils import normalize_text, singularize, ingredient_lookup_variants | |
| try: | |
| import gensim.downloader as api | |
| except Exception: # pragma: no cover | |
| api = None | |
| # Model name handed to gensim's downloader (api.load) when the caller does not supply one. | |
| DEFAULT_MODEL = "glove-wiki-gigaword-50" | |
@dataclass
class SemanticCandidate:
    """A candidate term paired with its similarity score.

    Declared as a dataclass so that instances can be built positionally or by
    keyword (``SemanticCandidate("basil", 0.73)``) and compare by value.
    """

    # Candidate term text.
    term: str
    # Score associated with the term.
    score: float
class WordVectorFallback:
    """Best-effort wrapper around a gensim word-vector model.

    The model is loaded once, at construction time. Every failure mode (gensim
    not importable, loading disabled, any exception raised while loading)
    leaves the instance in a disabled state in which ``most_similar`` returns
    an empty list instead of raising.

    Attributes:
        model_name: Name handed to ``api.load``.
        model_path: Optional filesystem path; see ``_load`` for how it is used.
        enable_download: Whether ``api.load`` may be called when no existing
            ``model_path`` is present.
        available: ``True`` only when a model object was loaded.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL, model_path: str | None = None, enable_download: bool = True):
        """Store the configuration and immediately attempt to load the model.

        Args:
            model_name: Model name for ``api.load``; a falsy value falls back
                to ``DEFAULT_MODEL``.
            model_path: Optional path whose existence permits loading even
                when ``enable_download`` is false.
            enable_download: Permit ``api.load`` when no existing
                ``model_path`` is available.
        """
        self.model_name = model_name or DEFAULT_MODEL
        self.model_path = model_path
        self.enable_download = enable_download
        self.available = False
        self._kind = "disabled"
        self._model = None
        self._load()

    def _load(self) -> None:
        """Populate ``_model``, ``available`` and ``_kind``; never raises."""
        if api is None:
            # gensim could not be imported when this module was loaded.
            self.available = False
            self._kind = "unavailable"
            return
        try:
            # NOTE(review): an existing model_path only gates the call below;
            # the path itself is never handed to a loader, so the model is
            # always resolved by name through api.load. Confirm whether
            # loading from the path was intended.
            has_local_path = bool(self.model_path) and Path(self.model_path).exists()
            if has_local_path or self.enable_download:
                self._model = api.load(self.model_name)
            else:
                self._model = None
            self.available = self._model is not None
            self._kind = "glove" if self.available else "disabled"
        except Exception:
            # Deliberate best-effort: a failed load disables the fallback
            # rather than breaking the caller.
            self._model = None
            self.available = False
            self._kind = "disabled"

    def _normalize_candidate(self, term: str) -> str:
        """Return ``term`` passed through ``normalize_text`` then ``singularize``."""
        return singularize(normalize_text(term))

    def most_similar(self, ingredient: str, topn: int = 10) -> List[Tuple[str, float]]:
        """Return up to ``topn`` normalized neighbours of ``ingredient``.

        Args:
            ingredient: Raw ingredient text; normalized before the lookup.
            topn: Maximum number of results. Values ``<= 0`` yield ``[]``.

        Returns:
            ``(term, score)`` pairs in the order produced by the model, with
            duplicates (after normalization) removed. Empty when the model is
            unavailable, the normalized query is empty, or the model lookup
            raises.
        """
        # Without this guard topn <= 0 would still return one candidate,
        # because the length check below only runs after the first append.
        if topn <= 0 or not self.available or self._model is None:
            return []
        query = self._normalize_candidate(ingredient)
        if not query:
            return []
        try:
            # Always request at least 10 neighbours; presumably this leaves
            # headroom for the de-duplication and filtering below.
            raw = self._model.most_similar(query, topn=max(topn, 10))
        except Exception:
            # Any lookup failure is treated as "no suggestions".
            return []

        # Loop-invariant: the words of the query, used for the overlap check.
        query_parts = set(query.split())
        out: List[Tuple[str, float]] = []
        seen = set()
        for raw_term, score in raw:
            term = self._normalize_candidate(raw_term.replace("_", " "))
            if not term or term in seen:
                continue
            seen.add(term)
            term_parts = set(term.split())
            # A candidate sharing no word with the query is kept only when
            # ingredient_lookup_variants returns something truthy for it.
            if query_parts and term_parts and query_parts.isdisjoint(term_parts):
                if not ingredient_lookup_variants(term):
                    continue
            out.append((term, float(score)))
            if len(out) >= topn:
                break
        return out