# github-actions: Sync from GitHub @ de45553 (f070f64)
"""Multilingual embeddings via fastembed (ONNX-based, no torch dependency).
We use `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`:
• 120MB ONNX model — small enough for the free HF Space
• 384-dim output, well-supported in fastembed
• Covers ~50 languages including French/Spanish/Arabic → close enough
to Mauritian Kreol for retrieval to work (Creole shares heavy
French-derived vocabulary)
• Comparable retrieval quality to e5-small at similar size
To see fastembed's full supported-model list:
from fastembed import TextEmbedding
TextEmbedding.list_supported_models()
"""
from __future__ import annotations
from functools import lru_cache
from typing import Iterable
import numpy as np
# Model identifier used by embed_texts when the caller does not pass
# ``model_name``.
DEFAULT_MODEL: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Column count of the empty-input result returned by embed_texts. 384 is the
# output width the module docstring states for DEFAULT_MODEL; a different
# model may produce another width — TODO confirm before swapping models.
EMBED_DIM: int = 384
@lru_cache(maxsize=2)
def _model(model_name: str):
    """Build the fastembed ``TextEmbedding`` for *model_name*.

    The result is memoised by ``lru_cache`` (at most two distinct model
    names are retained), so repeated calls with the same name reuse the
    already-constructed embedder instead of building it again.
    """
    # Imported lazily so fastembed is only required once an embedder is
    # actually requested, not when this module is imported.
    from fastembed import TextEmbedding

    print(f"[knowledge] Loading embedder: {model_name}")
    embedder = TextEmbedding(model_name=model_name)
    return embedder
def embed_texts(
    texts: Iterable[str], model_name: str = DEFAULT_MODEL
) -> np.ndarray:
    """Embed a batch of strings into L2-normalised float32 vectors.

    Args:
        texts: The strings to embed. A single bare ``str`` is treated as
            one text (a one-item batch) rather than being iterated
            character by character.
        model_name: Model identifier handed to the cached ``_model`` loader.

    Returns:
        A float32 array with one row per input text. Each row is divided
        by its Euclidean norm; rows whose norm is exactly zero are left
        as zeros. Empty input returns an array of shape ``(0, EMBED_DIM)``
        without loading the model.
    """
    # A str is itself an Iterable[str]; without this guard list("abc")
    # would silently embed "a", "b" and "c" as three separate texts.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return np.zeros((0, EMBED_DIM), dtype=np.float32)

    model = _model(model_name)
    # model.embed is consumed fully and stacked into a single 2-D array.
    arr = np.array(list(model.embed(texts)), dtype=np.float32)

    norms = np.linalg.norm(arr, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for all-zero vectors: dividing by 1 keeps them zero.
    norms[norms == 0] = 1.0
    return arr / norms
def embed_passages(texts: Iterable[str]) -> np.ndarray:
    """Embed text chunks destined for storage.

    MiniLM needs no passage prefix, so this simply forwards ``texts`` to
    ``embed_texts`` with the default model and returns its result.
    """
    passage_vectors = embed_texts(texts)
    return passage_vectors
def embed_query(text: str) -> np.ndarray:
    """Embed one search query and return its vector.

    MiniLM encodes queries the same way as passages, so the query is
    embedded as a one-item batch via ``embed_texts`` and the first (only)
    row of that batch is returned.
    """
    single_item_batch = embed_texts([text])
    return single_item_batch[0]