mediastorm / tests /test_embedder.py
remdms's picture
feat: replace sentence-transformers with ONNX Runtime embedder (bge-small 384d)
0de90e8
import pytest
from mediastorm.vectorize.embedder import Embedder
@pytest.fixture(scope="module")
def embedder():
    """Build a single Embedder and share it across this module's tests.

    Module scope is used because model loading is slow, so the instance
    is constructed once rather than once per test.
    """
    shared_instance = Embedder()
    return shared_instance
def test_embed_returns_384_dimensions(embedder):
    """One input text yields exactly one vector with 384 components."""
    result = embedder.embed_texts(["Hello world"])

    # Check the batch size first so the index below is known to be valid.
    assert len(result) == 1

    only_vector = result[0]
    assert len(only_vector) == 384
def test_embed_batch_consistency(embedder):
    """Embedding the same text twice gives the same vector (within 1e-5)."""
    sentence = "The cat sat on the mat"

    first_run = embedder.embed_texts([sentence])
    second_run = embedder.embed_texts([sentence])

    # Compare element-wise with an absolute tolerance rather than exact
    # float equality.
    assert first_run[0] == pytest.approx(second_run[0], abs=1e-5)
def test_embed_multiple_texts(embedder):
    """A batch of three texts yields three vectors, each of length 384."""
    sentences = ["First sentence.", "Second sentence.", "Third sentence."]

    result = embedder.embed_texts(sentences)

    assert len(result) == 3
    # Every returned vector must have the expected dimensionality.
    for vector in result:
        assert len(vector) == 384
def test_embed_vectors_are_normalized(embedder):
    """The returned embedding has an L2 norm of 1.0 (within 1e-4)."""
    result = embedder.embed_texts(["Test normalization"])

    # Accumulate the sum of squared components, then take the square root
    # to obtain the Euclidean length of the first (only) vector.
    squared_total = 0
    for component in result[0]:
        squared_total = squared_total + component ** 2
    length = squared_total ** 0.5

    assert length == pytest.approx(1.0, abs=1e-4)