Spaces:
Sleeping
Sleeping
File size: 3,219 Bytes
9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a 9373226 244053a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple
from .text_utils import normalize_text, singularize, ingredient_lookup_variants
# gensim is an optional dependency. If importing its downloader fails for any
# reason, ``api`` is set to None and WordVectorFallback marks itself
# "unavailable" instead of raising at import time.
try:
    import gensim.downloader as api
except Exception:  # pragma: no cover
    api = None

# Model name handed to ``api.load`` when the caller does not supply one.
DEFAULT_MODEL = "glove-wiki-gigaword-50"
@dataclass
class SemanticCandidate:
    """A candidate term paired with its similarity score."""

    # Candidate term text.
    term: str
    # Similarity score attached to the term.
    score: float
class WordVectorFallback:
    """Best-effort nearest-neighbour lookup over gensim word vectors.

    Construction never raises. If gensim is missing or the model cannot be
    loaded, ``available`` is False and ``most_similar`` returns ``[]``.

    Attributes:
        model_name: Name passed to ``api.load``; an empty value falls back to
            ``DEFAULT_MODEL``.
        model_path: Optional path to a local vector file.
        enable_download: Whether ``api.load`` may be used when no local file
            exists.
        available: True once a model has been loaded.
        _kind: ``"glove"`` when a model is loaded, ``"unavailable"`` when the
            gensim downloader could not be imported, otherwise ``"disabled"``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        model_path: str | None = None,
        enable_download: bool = True,
    ):
        self.model_name = model_name or DEFAULT_MODEL
        self.model_path = model_path
        self.enable_download = enable_download
        self.available = False
        self._kind = "disabled"
        self._model = None
        self._load()

    @staticmethod
    def _load_local(path: Path):
        """Return vectors read from *path*, or None if the file is unusable.

        Files named ``*.bin`` / ``*.bin.gz`` are read as binary word2vec
        format, ``*.txt`` / ``*.vec`` (optionally ``.gz``) as text word2vec
        format, and anything else as gensim's native save format.
        """
        try:
            from gensim.models import KeyedVectors

            name = path.name.lower()
            if name.endswith((".bin", ".bin.gz")):
                return KeyedVectors.load_word2vec_format(str(path), binary=True)
            if name.endswith((".txt", ".vec", ".txt.gz", ".vec.gz")):
                return KeyedVectors.load_word2vec_format(str(path), binary=False)
            # NOTE: gensim's native format is pickle-based, so model_path
            # must only ever point at trusted files.
            loaded = KeyedVectors.load(str(path))
            # A saved full model keeps its vectors on ``.wv``.
            vectors = getattr(loaded, "wv", loaded)
            if callable(getattr(vectors, "most_similar", None)):
                return vectors
            return None
        except Exception:
            # Unreadable local file: the caller falls back to api.load.
            return None

    def _load(self) -> None:
        """Populate ``_model``, ``available`` and ``_kind`` without raising."""
        if api is None:
            self.available = False
            self._kind = "unavailable"
            return
        try:
            local_path = Path(self.model_path) if self.model_path else None
            has_local = local_path is not None and local_path.exists()

            # Prefer the local file when one is present.
            model = self._load_local(local_path) if has_local else None

            # Fall back to loading by name. An existing model_path has always
            # permitted this even with downloads disabled, so that is kept.
            if model is None and (has_local or self.enable_download):
                model = api.load(self.model_name)

            self._model = model
            self.available = model is not None
            self._kind = "glove" if self.available else "disabled"
        except Exception:
            # Deliberately best-effort: any load failure disables the fallback.
            self._model = None
            self.available = False
            self._kind = "disabled"

    def _normalize_candidate(self, term: str) -> str:
        """Apply ``normalize_text`` and then ``singularize`` to *term*."""
        return singularize(normalize_text(term))

    def most_similar(self, ingredient: str, topn: int = 10) -> List[Tuple[str, float]]:
        """Return up to *topn* ``(term, score)`` neighbours of *ingredient*.

        Returns an empty list when *topn* is not positive, no model is
        loaded, the normalized query is empty, or the model lookup raises
        (for example an out-of-vocabulary query).
        """
        if topn <= 0 or not self.available or self._model is None:
            return []
        query = self._normalize_candidate(ingredient)
        if not query:
            return []
        try:
            # Over-fetch, since normalization and filtering drop candidates.
            raw = self._model.most_similar(query, topn=max(topn, 10))
        except Exception:
            return []

        query_parts = set(query.split())
        out: List[Tuple[str, float]] = []
        seen = set()
        for raw_term, score in raw:
            term = self._normalize_candidate(raw_term.replace("_", " "))
            if not term or term in seen:
                continue
            seen.add(term)
            term_parts = set(term.split())
            # Candidates sharing no word with the query are kept only when
            # ingredient_lookup_variants returns something for them.
            # NOTE(review): presumably that means a recognizable food term;
            # confirm against text_utils.
            if query_parts and term_parts and query_parts.isdisjoint(term_parts):
                if not ingredient_lookup_variants(term):
                    continue
            out.append((term, float(score)))
            if len(out) >= topn:
                break
        return out
|