|
|
from __future__ import annotations |
|
|
|
|
|
import numpy as np |
|
|
import pytest |
|
|
import torch |
|
|
|
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize("normalize_embeddings", (False, True))
@pytest.mark.parametrize("prompt_name", (None, "retrieval"))
def test_encode_multi_process(
    stsb_bert_tiny_model: SentenceTransformer, normalize_embeddings: bool, prompt_name: str | None
) -> None:
    """Multi-process encoding matches single-process encoding for every
    combination of normalization and prompt usage."""
    model = stsb_bert_tiny_model
    model.prompts = {"retrieval": "Represent this sentence for searching relevant passages: "}
    sentences = [f"This is sentence {i}" for i in range(40)]

    pool = model.start_multi_process_pool(["cpu", "cpu"])
    try:
        emb = model.encode(
            sentences, normalize_embeddings=normalize_embeddings, prompt_name=prompt_name, pool=pool, chunk_size=10
        )
    finally:
        # Always release the worker processes, even if encode raises.
        model.stop_multi_process_pool(pool)
    assert emb.shape == (len(sentences), model.get_sentence_embedding_dimension())

    # Embeddings should not be degenerate (all zeros).
    assert emb.sum() != 0.0

    # The multi-process result must match the standard single-process path.
    emb_normal = model.encode(sentences, normalize_embeddings=normalize_embeddings, prompt_name=prompt_name)
    diff = np.max(np.abs(emb - emb_normal))
    assert diff < 0.001

    # Normalized embeddings have near-zero per-sample means; unnormalized ones do not.
    assert np.all(np.abs(emb.mean(1)) < 0.01) == normalize_embeddings
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_encode_same_as_standard_encode(stsb_bert_tiny_model: SentenceTransformer):
    """Passing a list of devices to encode() yields the same embeddings as a plain encode() call."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence.", "Third sentence."] * 5

    single_process = model.encode(sentences)
    multi_process = model.encode(sentences, device=["cpu", "cpu"])

    assert np.allclose(single_process, multi_process, atol=1e-6)
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_pool(stsb_bert_tiny_model: SentenceTransformer):
    """An explicitly managed worker pool produces the same embeddings as standard encoding."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence.", "Third sentence."] * 5

    reference = model.encode(sentences)

    pool = model.start_multi_process_pool(["cpu", "cpu"])
    try:
        pooled = model.encode(sentences, pool=pool)
    finally:
        model.stop_multi_process_pool(pool)

    assert isinstance(pooled, np.ndarray)
    assert pooled.shape == (len(sentences), model.get_sentence_embedding_dimension())
    assert np.allclose(reference, pooled, atol=1e-6)
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_with_args(stsb_bert_tiny_model: SentenceTransformer):
    """encode() keyword arguments (normalization, tensor conversion, precision) are honored by the pool."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence."]

    pool = model.start_multi_process_pool(["cpu", "cpu"])
    try:
        # Normalized tensor output: every row should have unit L2 norm.
        tensor_embeddings = model.encode(sentences, pool=pool, normalize_embeddings=True, convert_to_tensor=True)
        assert isinstance(tensor_embeddings, torch.Tensor)
        row_norms = torch.norm(tensor_embeddings, p=2, dim=1)
        assert torch.allclose(row_norms, torch.ones_like(row_norms), atol=1e-6)

        # Quantized precision must propagate through the pool as well.
        quantized = model.encode(sentences, pool=pool, precision="int8")
        assert quantized.dtype == np.int8
    finally:
        model.stop_multi_process_pool(pool)
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_output_values(stsb_bert_tiny_model: SentenceTransformer):
    """output_value=None returns a list of per-sentence feature dicts, identical across both encode paths."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence."]

    standard = model.encode(sentences, output_value=None)
    multi = model.encode(sentences, device=["cpu", "cpu"], output_value=None)

    for result in (standard, multi):
        assert isinstance(result, list)
        assert isinstance(result[0], dict)
        assert "sentence_embedding" in result[0]

    # The sentence embeddings themselves must agree elementwise.
    for std_features, multi_features in zip(standard, multi):
        assert torch.allclose(
            std_features["sentence_embedding"].cpu(),
            multi_features["sentence_embedding"],
            atol=1e-6,
        )
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_chunk_size(stsb_bert_tiny_model: SentenceTransformer):
    """A small explicit chunk_size still yields a complete, correctly shaped result."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence.", "Third sentence."] * 10

    embeddings = model.encode(sentences, device=["cpu", "cpu"], chunk_size=5)

    assert isinstance(embeddings, np.ndarray)
    assert embeddings.shape == (len(sentences), model.get_sentence_embedding_dimension())
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_with_prompt(stsb_bert_tiny_model: SentenceTransformer):
    """Prompts and normalization applied through a pool match the single-process result."""
    model = stsb_bert_tiny_model
    model.prompts = {"retrieval": "Represent this sentence for searching relevant passages: "}
    texts = ["First sentence.", "Second sentence."] * 5

    standard_embeddings = model.encode(texts, prompt_name="retrieval", normalize_embeddings=True)

    pool = model.start_multi_process_pool(["cpu"] * 2)
    try:
        multi_embeddings = model.encode(texts, pool=pool, prompt_name="retrieval", normalize_embeddings=True)
    finally:
        model.stop_multi_process_pool(pool)

    assert isinstance(multi_embeddings, np.ndarray)
    # Query the model for its dimensionality instead of hard-coding it,
    # consistent with the other tests in this file.
    assert multi_embeddings.shape == (len(texts), model.get_sentence_embedding_dimension())

    # normalize_embeddings=True must produce unit-length rows.
    norm = np.linalg.norm(multi_embeddings, axis=1)
    assert np.allclose(norm, 1.0, atol=1e-6)

    # Both encode paths must agree elementwise.
    assert np.allclose(standard_embeddings, multi_embeddings, atol=1e-6)
|
|
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize("convert_to_tensor", [True, False])
@pytest.mark.parametrize("convert_to_numpy", [True, False])
@pytest.mark.parametrize("output_value", [None, "sentence_embedding", "token_embeddings"])
def test_multi_process_with_empty_texts(
    stsb_bert_tiny_model: SentenceTransformer,
    convert_to_tensor: bool,
    convert_to_numpy: bool,
    output_value: str | None,
):
    """Encoding an empty input list returns an empty result of the same type on both paths."""
    model = stsb_bert_tiny_model
    no_texts: list[str] = []

    encode_kwargs = dict(
        convert_to_tensor=convert_to_tensor,
        convert_to_numpy=convert_to_numpy,
        output_value=output_value,
    )
    single = model.encode(no_texts, **encode_kwargs)
    multi = model.encode(no_texts, device=["cpu", "cpu"], **encode_kwargs)

    assert type(single) is type(multi)
    assert len(single) == 0
    assert len(multi) == 0
|
|
|
|
|
|
|
|
@pytest.mark.slow
@pytest.mark.parametrize("convert_to_tensor", [True, False])
@pytest.mark.parametrize("convert_to_numpy", [True, False])
@pytest.mark.parametrize("output_value", [None, "sentence_embedding", "token_embeddings"])
def test_multi_process_with_one_single_string(
    stsb_bert_tiny_model: SentenceTransformer,
    convert_to_tensor: bool,
    convert_to_numpy: bool,
    output_value: str | None,
):
    """A single input string (not a list) round-trips identically through both encode paths."""
    model = stsb_bert_tiny_model
    text = "This is a single sentence."

    single = model.encode(
        text, convert_to_tensor=convert_to_tensor, convert_to_numpy=convert_to_numpy, output_value=output_value
    )
    multi = model.encode(
        text,
        device=["cpu"] * 2,
        convert_to_tensor=convert_to_tensor,
        convert_to_numpy=convert_to_numpy,
        output_value=output_value,
    )

    def assert_values_match(expected, actual) -> None:
        # Compare one feature value, dispatching on its type.
        if isinstance(expected, torch.Tensor):
            assert torch.allclose(expected.cpu(), actual, atol=1e-5)
        elif isinstance(expected, np.ndarray):
            assert np.allclose(expected, actual, atol=1e-5)
        else:
            assert expected == actual

    assert type(single) is type(multi)
    if isinstance(single, (np.ndarray, torch.Tensor)):
        assert single.shape == multi.shape
    else:
        assert len(single) == len(multi)

    if isinstance(single, dict):
        # A single string with output_value=None yields one feature dict.
        assert single.keys() == multi.keys()
        for key in single:
            assert_values_match(single[key], multi[key])
    elif isinstance(single, list) and len(single) > 0:
        # A list of feature dicts: compare item by item, key by key.
        for single_item, multi_item in zip(single, multi):
            assert set(single_item.keys()) == set(multi_item.keys())
            for key in single_item:
                assert_values_match(single_item[key], multi_item[key])
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_more_workers_than_texts(stsb_bert_tiny_model: SentenceTransformer):
    """Encoding still works when the pool has more workers than there are inputs."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence."]

    embeddings = model.encode(sentences, device=["cpu", "cpu", "cpu"])

    assert isinstance(embeddings, np.ndarray)
    assert embeddings.shape == (len(sentences), model.get_sentence_embedding_dimension())
|
|
|
|
|
|
|
|
@pytest.mark.slow
def test_multi_process_with_large_chunk_size(stsb_bert_tiny_model: SentenceTransformer):
    """A chunk_size exceeding the number of texts still produces a complete result."""
    model = stsb_bert_tiny_model
    sentences = ["First sentence.", "Second sentence."] * 10

    embeddings = model.encode(sentences, device=["cpu", "cpu"], chunk_size=30)

    assert isinstance(embeddings, np.ndarray)
    assert embeddings.shape == (len(sentences), model.get_sentence_embedding_dimension())
|
|
|