File size: 3,219 Bytes
9373226
 
 
244053a
 
9373226
244053a
9373226
244053a
 
 
 
 
 
 
9373226
 
 
244053a
9373226
 
 
 
 
244053a
 
 
 
 
9373226
244053a
9373226
244053a
 
 
 
 
9373226
 
 
244053a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9373226
 
244053a
 
 
 
9373226
244053a
 
 
9373226
244053a
 
9373226
 
244053a
 
 
9373226
 
244053a
 
 
 
 
9373226
244053a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple

from .text_utils import normalize_text, singularize, ingredient_lookup_variants

try:
    import gensim.downloader as api
except Exception:  # pragma: no cover
    api = None


# Model name used when the caller does not supply one; it is the value handed
# to gensim's downloader (``api.load``) by ``WordVectorFallback``.
DEFAULT_MODEL = "glove-wiki-gigaword-50"


@dataclass
class SemanticCandidate:
    """A candidate term paired with a numeric score.

    NOTE(review): nothing in the visible part of this module constructs this
    class; presumably ``score`` is a similarity value like the ones returned
    by ``WordVectorFallback.most_similar`` -- confirm against callers.
    """
    term: str  # candidate text
    score: float  # numeric score attached to ``term``


class WordVectorFallback:
    """Best-effort nearest-neighbour lookup backed by gensim word vectors.

    The model is loaded once at construction time. Loading never raises: when
    gensim is missing or the model cannot be loaded, the instance is simply
    marked as not ``available`` and ``most_similar`` returns an empty list.

    Attributes:
        model_name: Name passed to ``gensim.downloader.load``.
        model_path: Optional filesystem path to a local vector file.
        enable_download: Whether the gensim downloader may be used when no
            existing ``model_path`` is given.
        available: True once a model has been loaded successfully.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL, model_path: str | None = None, enable_download: bool = True):
        """Store the configuration and immediately try to load the model.

        Args:
            model_name: gensim-data model name; a falsy value selects
                ``DEFAULT_MODEL``.
            model_path: Optional path to a local vector file. If it exists and
                is a file, it is loaded in preference to the downloader.
            enable_download: Allow ``gensim.downloader`` to supply the model
                when ``model_path`` is unset or does not exist.
        """
        self.model_name = model_name or DEFAULT_MODEL
        self.model_path = model_path
        self.enable_download = enable_download
        self.available = False
        self._kind = "disabled"
        self._model = None

        self._load()

    def _load(self) -> None:
        """Populate ``_model``, ``available`` and ``_kind``; never raises."""
        if api is None:
            # The guarded gensim import at module top failed.
            self.available = False
            self._kind = "unavailable"
            return

        try:
            self._model = self._resolve_model()
        except Exception:
            # Deliberately best-effort: a missing model only disables the
            # fallback. Log it so the failure is diagnosable.
            import logging

            logging.getLogger(__name__).warning(
                "Could not load word-vector model %r; semantic fallback disabled",
                self.model_name,
                exc_info=True,
            )
            self._model = None

        self.available = self._model is not None
        self._kind = "glove" if self.available else "disabled"

    def _resolve_model(self):
        """Return a loaded model, or None when loading is not permitted."""
        path = Path(self.model_path) if self.model_path else None
        path_exists = path is not None and path.exists()

        if path_exists and path.is_file():
            model = self._load_local(path)
            if model is not None:
                return model

        # An existing path that could not be read as a vector file (for
        # example a directory) keeps the original permissive behaviour of
        # going through the downloader.
        # NOTE(review): in that case the downloader is used even when
        # enable_download is False, and it may fetch the model if it is not
        # already in gensim's cache -- retained from the original; confirm
        # whether that is intended.
        if path_exists or self.enable_download:
            return api.load(self.model_name)
        return None

    @staticmethod
    def _load_local(path: Path):
        """Try to read word vectors from ``path``; return None on failure."""
        try:
            from gensim.models import KeyedVectors
        except Exception:
            return None

        fname = str(path)
        binary = ".bin" in path.suffixes

        # NOTE: KeyedVectors.load unpickles the file, so model_path must only
        # ever point at trusted files.
        attempts = [
            # Native gensim save() format; unwrap a full model to its vectors.
            lambda: (lambda loaded: getattr(loaded, "wv", loaded))(KeyedVectors.load(fname)),
            # word2vec text/binary format (with header line).
            lambda: KeyedVectors.load_word2vec_format(fname, binary=binary),
        ]
        if not binary:
            # Raw GloVe text files carry no header line (gensim >= 4.0).
            attempts.append(
                lambda: KeyedVectors.load_word2vec_format(fname, binary=False, no_header=True)
            )

        for attempt in attempts:
            try:
                return attempt()
            except Exception:
                continue
        return None

    def _normalize_candidate(self, term: str) -> str:
        """Normalize ``term`` with ``normalize_text`` and then ``singularize``."""
        return singularize(normalize_text(term))

    def most_similar(self, ingredient: str, topn: int = 10) -> List[Tuple[str, float]]:
        """Return up to ``topn`` normalized neighbours of ``ingredient``.

        Args:
            ingredient: Free-text ingredient name; normalized before lookup.
            topn: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            ``(term, score)`` pairs in the order the model returned them,
            de-duplicated after normalization. Empty when no model is loaded,
            the query normalizes to nothing, or the model lookup fails (for
            example an out-of-vocabulary query).
        """
        if not self.available or self._model is None:
            return []

        query = self._normalize_candidate(ingredient)
        if not query:
            return []

        try:
            # Over-fetch (at least 10) because filtering below drops entries.
            raw = self._model.most_similar(query, topn=max(topn, 10))
        except Exception:
            # Best-effort lookup: any model error means "no candidates".
            return []

        query_parts = set(query.split())  # loop-invariant
        out: List[Tuple[str, float]] = []
        seen = set()
        for raw_term, score in raw:
            # Check the limit before appending so topn <= 0 returns nothing.
            if len(out) >= topn:
                break

            term = self._normalize_candidate(raw_term.replace("_", " "))
            if not term or term in seen:
                continue
            seen.add(term)

            # Keep candidates sharing a word with the query; otherwise keep
            # them only when ingredient_lookup_variants yields something.
            term_parts = set(term.split())
            if query_parts and term_parts and query_parts.isdisjoint(term_parts):
                if not ingredient_lookup_variants(term):
                    continue

            out.append((term, float(score)))

        return out