Spaces:
Running
Running
| """Multilingual embeddings via fastembed (ONNX-based, no torch dependency). | |
| We use `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`: | |
| • 120MB ONNX model — small enough for the free HF Space | |
| • 384-dim output, well-supported in fastembed | |
| • Covers ~50 languages including French/Spanish/Arabic → close enough | |
| to Mauritian Kreol for retrieval to work (Creole shares heavy | |
| French-derived vocabulary) | |
| • Comparable retrieval quality to e5-small at similar size | |
| To see fastembed's full supported-model list: | |
| from fastembed import TextEmbedding | |
| TextEmbedding.list_supported_models() | |
| """ | |
| from __future__ import annotations | |
| from functools import lru_cache | |
| from typing import Iterable | |
| import numpy as np | |
# fastembed model identifier used when callers do not pass ``model_name``.
DEFAULT_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Embedding width of DEFAULT_MODEL (384-dim per the module docstring); used to
# shape the empty result in ``embed_texts``.
EMBED_DIM = 384
@lru_cache(maxsize=2)
def _model(model_name: str):
    """Return the fastembed ``TextEmbedding`` for *model_name*, loading it once.

    The result is memoised per model name, so repeated embedding calls reuse
    the already-loaded model instead of re-initialising it on every call.

    Args:
        model_name: Model identifier passed to ``TextEmbedding``.

    Returns:
        The ``TextEmbedding`` instance for ``model_name``.
    """
    # Imported inside the function, so fastembed is only imported when a model
    # is first requested rather than when this module is imported.
    from fastembed import TextEmbedding

    # With the cache in place this message is printed once per distinct model.
    print(f"[knowledge] Loading embedder: {model_name}")
    # The cache is kept small on purpose: every cached entry keeps a loaded
    # model alive in memory.
    return TextEmbedding(model_name=model_name)
def embed_texts(
    texts: Iterable[str], model_name: str = DEFAULT_MODEL
) -> np.ndarray:
    """Embed strings and return L2-normalised float32 vectors.

    Args:
        texts: Strings to embed. A bare ``str`` is treated as one text rather
            than being split into individual characters.
        model_name: Model identifier handed to ``_model``.

    Returns:
        A float32 array with one row per input text, each row scaled to unit
        L2 norm (all-zero rows are returned unchanged). Empty input yields an
        array of shape ``(0, EMBED_DIM)``.
    """
    # ``list("abc")`` would yield ["a", "b", "c"]; wrap a lone string so it is
    # embedded as a single text.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        return np.zeros((0, EMBED_DIM), dtype=np.float32)

    model = _model(model_name)
    arr = np.array(list(model.embed(texts)), dtype=np.float32)

    # Row-wise L2 normalisation; zero norms are replaced by 1 to avoid a
    # division by zero (those rows stay all-zero).
    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return arr / norms
def embed_passages(texts: Iterable[str]) -> np.ndarray:
    """Embed text chunks destined for storage.

    No passage-specific prefix is applied (MiniLM does not require one), so
    this simply forwards to ``embed_texts`` with the default model.
    """
    passage_vectors = embed_texts(texts)
    return passage_vectors
def embed_query(text: str) -> np.ndarray:
    """Embed a single search query and return its vector.

    Queries are encoded the same way as passages (MiniLM uses one encoding
    for both): the text is embedded as a one-item batch via ``embed_texts``
    and the only row is returned.
    """
    single_item_batch = embed_texts([text])
    return single_item_batch[0]