BiteWiseFinal

Sleeping

App Files Files Community

anaygupta committed 30 days ago

Commit

244053a

verified ·

1 Parent(s): f43f12b

Update services/semantic.py

Browse files

Files changed (1) hide show

services/semantic.py +76 -80

services/semantic.py CHANGED Viewed

@@ -1,107 +1,103 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Iterable, List, Optional
-import numpy as np
-from .text_utils import normalize_text, singularize
 @dataclass
-class SemanticHit:
     term: str
     score: float
 class WordVectorFallback:
-    """Small Glove-based semantic fallback.
-    The model is optional so the app can still boot if the host blocks downloads.
-    """
-    def __init__(self, model_name: str = "glove-wiki-gigaword-50", model_path: str = "", enable_download: bool = True):
-        self.model = None
         self._kind = "disabled"
-        self._model_name = model_name
-        self._load(model_name=model_name, model_path=model_path, enable_download=enable_download)
-    def _load(self, model_name: str, model_path: str, enable_download: bool) -> None:
-        try:
-            from gensim.models import KeyedVectors
-            import gensim.downloader as api
-        except Exception:
-            self.model = None
             self._kind = "unavailable"
             return
-        if model_path:
-            try:
-                self.model = KeyedVectors.load(model_path, mmap="r")
-                self._kind = f"local:{model_path}"
-                return
-            except Exception:
-                try:
-                    self.model = KeyedVectors.load_word2vec_format(model_path, binary=model_path.endswith(".bin"))
-                    self._kind = f"local-vec:{model_path}"
-                    return
-                except Exception:
-                    self.model = None
-        if enable_download:
-            try:
-                self.model = api.load(model_name)
-                self._kind = model_name
-            except Exception:
-                self.model = None
-                self._kind = "download-failed"
-        else:
-            self.model = None
             self._kind = "disabled"
-    @property
-    def available(self) -> bool:
-        return self.model is not None
-    def vector_for(self, phrase: str) -> Optional[np.ndarray]:
-        if not self.available:
-            return None
-        normalized = normalize_text(phrase)
-        tokens = [singularize(t) for t in normalized.split()]
-        vectors = []
-        for token in tokens:
-            if token in self.model:
-                vectors.append(self.model[token])
-        if vectors:
-            return np.mean(np.stack(vectors), axis=0)
-        phrase_key = normalized.replace(" ", "_")
-        if phrase_key in self.model:
-            return self.model[phrase_key]
-        if normalized in self.model:
-            return self.model[normalized]
-        return None
-    def nearest(self, query: str, candidates: Iterable[str], top_k: int = 3) -> List[SemanticHit]:
-        if not self.available:
             return []
-        qv = self.vector_for(query)
-        if qv is None:
             return []
-        scored: List[SemanticHit] = []
-        qnorm = np.linalg.norm(qv) + 1e-8
-        for candidate in candidates:
-            cv = self.vector_for(candidate)
-            if cv is None:
                 continue
-            score = float(np.dot(qv, cv) / (qnorm * (np.linalg.norm(cv) + 1e-8)))
-            scored.append(SemanticHit(term=candidate, score=score))
-        scored.sort(key=lambda x: x.score, reverse=True)
-        return scored[:top_k]

 from __future__ import annotations
 from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Tuple
+from .text_utils import normalize_text, singularize, ingredient_lookup_variants
+try:
+    import gensim.downloader as api
+except Exception:  # pragma: no cover
+    api = None
+DEFAULT_MODEL = "glove-wiki-gigaword-50"
 @dataclass
+class SemanticCandidate:
     term: str
     score: float
 class WordVectorFallback:
+    def __init__(self, model_name: str = DEFAULT_MODEL, model_path: str | None = None, enable_download: bool = True):
+        self.model_name = model_name or DEFAULT_MODEL
+        self.model_path = model_path
+        self.enable_download = enable_download
+        self.available = False
         self._kind = "disabled"
+        self._model = None
+        self._load()
+    def _load(self) -> None:
+        if api is None:
+            self.available = False
             self._kind = "unavailable"
             return
+        try:
+            if self.model_path:
+                path = Path(self.model_path)
+                if path.exists():
+                    # Keep this permissive; local path loading is optional.
+                    self._model = api.load(self.model_name)
+                elif self.enable_download:
+                    self._model = api.load(self.model_name)
+                else:
+                    self._model = None
+            else:
+                if self.enable_download:
+                    self._model = api.load(self.model_name)
+                else:
+                    self._model = None
+            self.available = self._model is not None
+            self._kind = "glove" if self.available else "disabled"
+        except Exception:
+            self._model = None
+            self.available = False
             self._kind = "disabled"
+    def _normalize_candidate(self, term: str) -> str:
+        term = normalize_text(term)
+        term = singularize(term)
+        return term
+    def most_similar(self, ingredient: str, topn: int = 10) -> List[Tuple[str, float]]:
+        if not self.available or self._model is None:
+            return []
+        query = self._normalize_candidate(ingredient)
+        if not query:
             return []
+        try:
+            raw = self._model.most_similar(query, topn=max(topn, 10))
+        except Exception:
             return []
+        out: List[Tuple[str, float]] = []
+        seen = set()
+        for term, score in raw:
+            term = self._normalize_candidate(term.replace("_", " "))
+            if not term or term in seen:
                 continue
+            seen.add(term)
+            # Keep only candidates with at least some lexical overlap or clear phrase family.
+            query_parts = set(query.split())
+            term_parts = set(term.split())
+            if query_parts and term_parts and not (query_parts & term_parts):
+                # Allow the last fallback word variant to pass through only if it looks like a food term.
+                variants = ingredient_lookup_variants(term)
+                if not variants:
+                    continue
+            out.append((term, float(score)))
+            if len(out) >= topn:
+                break
+        return out