qwenillustrious / sentence-transformers /tests /test_multi_process.py

Add files using upload-large-folder tool

bd33eac verified 4 months ago

11.7 kB

	from __future__ import annotations

	import numpy as np
	import pytest
	import torch

	from sentence_transformers import SentenceTransformer

	# These tests fail if optimum.intel.openvino is imported, because openvinotoolkit/nncf
	# patches torch._C._nn.gelu in a way that breaks pickling. As a result, we may have issues
	# when running both backend tests and multi-process tests in the same session.


	@pytest.mark.slow
	@pytest.mark.parametrize("normalize_embeddings", (False, True))
	@pytest.mark.parametrize("prompt_name", (None, "retrieval"))
	def test_encode_multi_process(
	    stsb_bert_tiny_model: SentenceTransformer, normalize_embeddings: bool, prompt_name: str | None
	) -> None:
	    """Check that encoding through an explicit worker pool matches a plain ``encode`` call.

	    Args:
	        stsb_bert_tiny_model: Model under test, supplied as a test-function argument.
	        normalize_embeddings: Forwarded to ``encode``; also decides which way the
	            final per-row mean assertion must go.
	        prompt_name: Forwarded to ``encode``; ``"retrieval"`` refers to the prompt
	            registered on the model at the start of this test.
	    """
	    model = stsb_bert_tiny_model
	    model.prompts = {"retrieval": "Represent this sentence for searching relevant passages: "}
	    sentences = [f"This is sentence {i}" for i in range(40)]

	    # Start the multi-process pool on two CPU devices & compute the embeddings using the pool
	    pool = model.start_multi_process_pool(["cpu", "cpu"])
	    try:
	        emb = model.encode(
	            sentences, normalize_embeddings=normalize_embeddings, prompt_name=prompt_name, pool=pool, chunk_size=10
	        )
	    finally:
	        # Stop the workers even when encode() raises, so a failure in this test
	        # does not leave the pool running for the rest of the session.
	        model.stop_multi_process_pool(pool)
	    assert emb.shape == (len(sentences), 128)

	    # Make sure the embeddings aren't just all 0
	    assert emb.sum() != 0.0

	    # Compare against normal embeddings
	    emb_normal = model.encode(sentences, normalize_embeddings=normalize_embeddings, prompt_name=prompt_name)
	    diff = np.max(np.abs(emb - emb_normal))
	    assert diff < 0.001

	    # Ensure that after normalizing, the means are all almost 0, and otherwise not
	    assert np.all(np.abs(emb.mean(1)) < 0.01) == normalize_embeddings


	@pytest.mark.slow
	def test_multi_process_encode_same_as_standard_encode(stsb_bert_tiny_model: SentenceTransformer):
	    """Encoding with a list of two CPU devices must reproduce the plain ``encode`` output."""
	    encoder = stsb_bert_tiny_model
	    base_sentences = ["First sentence.", "Second sentence.", "Third sentence."]
	    texts = base_sentences * 5

	    # Reference: an ordinary encode call with default arguments.
	    reference = encoder.encode(texts)

	    # Same inputs, this time with two "cpu" entries passed as ``device``.
	    multi_device = encoder.encode(texts, device=["cpu", "cpu"])

	    # Both results have to agree within an absolute tolerance of 1e-6.
	    outputs_agree = np.allclose(reference, multi_device, atol=1e-6)
	    assert outputs_agree


	@pytest.mark.slow
	def test_multi_process_pool(stsb_bert_tiny_model: SentenceTransformer):
	    """Exercise ``start_multi_process_pool`` / ``stop_multi_process_pool`` around one ``encode`` call."""
	    encoder = stsb_bert_tiny_model
	    texts = 5 * ["First sentence.", "Second sentence.", "Third sentence."]

	    # Reference output from a plain encode call.
	    reference = encoder.encode(texts)

	    worker_pool = encoder.start_multi_process_pool(["cpu", "cpu"])
	    try:
	        pooled = encoder.encode(texts, pool=worker_pool)
	    finally:
	        # The pool is stopped whether or not encoding succeeded.
	        encoder.stop_multi_process_pool(worker_pool)

	    # Expect a NumPy array, one row per text, equal to the reference output.
	    assert isinstance(pooled, np.ndarray)
	    expected_shape = (len(texts), encoder.get_sentence_embedding_dimension())
	    assert pooled.shape == expected_shape
	    assert np.allclose(reference, pooled, atol=1e-6)


	@pytest.mark.slow
	def test_multi_process_with_args(stsb_bert_tiny_model: SentenceTransformer):
	    """Pass extra ``encode`` keyword arguments through a two-entry CPU pool."""
	    encoder = stsb_bert_tiny_model
	    texts = ["First sentence.", "Second sentence."]

	    worker_pool = encoder.start_multi_process_pool(["cpu", "cpu"])
	    try:
	        # normalize_embeddings + convert_to_tensor: expect a torch tensor whose
	        # rows all have an L2 norm of 1.
	        tensor_out = encoder.encode(texts, pool=worker_pool, normalize_embeddings=True, convert_to_tensor=True)
	        assert isinstance(tensor_out, torch.Tensor)
	        row_norms = torch.norm(tensor_out, p=2, dim=1)
	        unit_norms = torch.ones_like(row_norms)
	        assert torch.allclose(row_norms, unit_norms, atol=1e-6)

	        # precision="int8": the result must come back with an int8 dtype.
	        quantized = encoder.encode(texts, pool=worker_pool, precision="int8")
	        assert quantized.dtype == np.int8
	    finally:
	        encoder.stop_multi_process_pool(worker_pool)


	@pytest.mark.slow
	def test_multi_process_output_values(stsb_bert_tiny_model: SentenceTransformer):
	    """Check that ``output_value=None`` gives matching results with and without multi-process encoding.

	    Both calls are expected to return a list with one dictionary per input text,
	    each containing a ``"sentence_embedding"`` entry; those entries are compared
	    pairwise.
	    """
	    model = stsb_bert_tiny_model
	    texts = ["First sentence.", "Second sentence."]

	    # Regular encoding with output_value=None
	    embeddings_standard = model.encode(texts, output_value=None)

	    # Multi-process encoding with output_value=None
	    embeddings_multi = model.encode(texts, device=["cpu"] * 2, output_value=None)

	    # Both should return a list with exactly one entry per input text. The lengths
	    # are asserted explicitly because zip() below would otherwise silently stop
	    # at the shorter list.
	    assert isinstance(embeddings_standard, list)
	    assert isinstance(embeddings_multi, list)
	    assert len(embeddings_standard) == len(texts)
	    assert len(embeddings_multi) == len(texts)

	    for standard_item, multi_item in zip(embeddings_standard, embeddings_multi):
	        # Every entry (not just the first) must be a dictionary with a sentence embedding
	        assert isinstance(standard_item, dict)
	        assert isinstance(multi_item, dict)
	        assert "sentence_embedding" in standard_item
	        assert "sentence_embedding" in multi_item

	        # Make sure the sentence embeddings match; the standard result is moved
	        # to CPU before comparing.
	        assert torch.allclose(
	            standard_item["sentence_embedding"].cpu(),
	            multi_item["sentence_embedding"],
	            atol=1e-6,
	        )


	@pytest.mark.slow
	def test_multi_process_chunk_size(stsb_bert_tiny_model: SentenceTransformer):
	    """Encode on two CPU devices with an explicit ``chunk_size`` and check the output type and shape."""
	    encoder = stsb_bert_tiny_model
	    texts = 10 * ["First sentence.", "Second sentence.", "Third sentence."]

	    # 30 texts, handed over with chunk_size=5.
	    result = encoder.encode(texts, device=["cpu", "cpu"], chunk_size=5)

	    # Only the container type and the shape are verified here.
	    assert isinstance(result, np.ndarray)
	    expected_shape = (len(texts), encoder.get_sentence_embedding_dimension())
	    assert result.shape == expected_shape


	@pytest.mark.slow
	def test_multi_process_with_prompt(stsb_bert_tiny_model: SentenceTransformer):
	    """Encode with a named prompt through a pool and compare against a plain ``encode`` call."""
	    encoder = stsb_bert_tiny_model
	    encoder.prompts = {"retrieval": "Represent this sentence for searching relevant passages: "}
	    texts = 5 * ["First sentence.", "Second sentence."]

	    # Keyword arguments shared by the reference call and the pooled call.
	    encode_kwargs = {"prompt_name": "retrieval", "normalize_embeddings": True}

	    reference = encoder.encode(texts, **encode_kwargs)

	    worker_pool = encoder.start_multi_process_pool(["cpu", "cpu"])
	    try:
	        pooled = encoder.encode(texts, pool=worker_pool, **encode_kwargs)
	    finally:
	        encoder.stop_multi_process_pool(worker_pool)

	    # Container type and shape: one 128-wide row per text.
	    assert isinstance(pooled, np.ndarray)
	    assert pooled.shape == (len(texts), 128)

	    # normalize_embeddings=True was requested, so every row norm should be 1.
	    row_norms = np.linalg.norm(pooled, axis=1)
	    assert np.allclose(row_norms, 1.0, atol=1e-6)

	    # The pooled result has to agree with the reference result.
	    assert np.allclose(reference, pooled, atol=1e-6)


	@pytest.mark.slow
	@pytest.mark.parametrize("convert_to_tensor", [True, False])
	@pytest.mark.parametrize("convert_to_numpy", [True, False])
	@pytest.mark.parametrize("output_value", [None, "sentence_embedding", "token_embeddings"])
	def test_multi_process_with_empty_texts(
	    stsb_bert_tiny_model: SentenceTransformer,
	    convert_to_tensor: bool,
	    convert_to_numpy: bool,
	    output_value: str | None,
	):
	    """Encode an empty list with and without a device list and compare the result types.

	    The three parametrized flags are forwarded unchanged to both ``encode`` calls.
	    """
	    encoder = stsb_bert_tiny_model
	    texts = []

	    # Conversion/output options shared by both calls.
	    shared_kwargs = {
	        "convert_to_tensor": convert_to_tensor,
	        "convert_to_numpy": convert_to_numpy,
	        "output_value": output_value,
	    }

	    single_result = encoder.encode(texts, **shared_kwargs)
	    multi_result = encoder.encode(texts, device=["cpu", "cpu"], **shared_kwargs)

	    # Same container type either way, and nothing in it.
	    assert type(single_result) is type(multi_result)
	    assert len(single_result) == 0
	    assert len(multi_result) == 0


	def _assert_feature_dicts_match(expected, actual, atol=1e-5):
	    """Assert that two output dictionaries have the same keys and matching values.

	    Tensor and NumPy values are compared with ``allclose`` at ``atol``; every other
	    value must compare equal. Tensors in ``expected`` are moved to CPU before the
	    comparison.
	    """
	    assert expected.keys() == actual.keys()
	    for key, expected_value in expected.items():
	        actual_value = actual[key]
	        if isinstance(expected_value, torch.Tensor):
	            assert torch.allclose(expected_value.cpu(), actual_value, atol=atol)
	        elif isinstance(expected_value, np.ndarray):
	            assert np.allclose(expected_value, actual_value, atol=atol)
	        else:
	            assert expected_value == actual_value


	@pytest.mark.slow
	@pytest.mark.parametrize("convert_to_tensor", [True, False])
	@pytest.mark.parametrize("convert_to_numpy", [True, False])
	@pytest.mark.parametrize("output_value", [None, "sentence_embedding", "token_embeddings"])
	def test_multi_process_with_one_single_string(
	    stsb_bert_tiny_model: SentenceTransformer,
	    convert_to_tensor: bool,
	    convert_to_numpy: bool,
	    output_value: str | None,
	):
	    """Encode one plain string (not a list) with and without a device list and compare the results.

	    The three parametrized flags are forwarded unchanged to both ``encode`` calls.
	    Array and tensor results are compared by type and shape only; dictionary
	    results (or lists of dictionaries) are compared key by key.
	    """
	    model = stsb_bert_tiny_model
	    texts = "This is a single sentence."

	    # Encode with single text
	    standard_embeddings = model.encode(
	        texts, convert_to_tensor=convert_to_tensor, convert_to_numpy=convert_to_numpy, output_value=output_value
	    )
	    multi_embeddings = model.encode(
	        texts,
	        device=["cpu"] * 2,
	        convert_to_tensor=convert_to_tensor,
	        convert_to_numpy=convert_to_numpy,
	        output_value=output_value,
	    )

	    # Assert that the embeddings are the same type and shape
	    assert type(standard_embeddings) is type(multi_embeddings)
	    if isinstance(standard_embeddings, (np.ndarray, torch.Tensor)):
	        assert standard_embeddings.shape == multi_embeddings.shape
	        return

	    assert len(standard_embeddings) == len(multi_embeddings)
	    # Check that dictionary items are the same
	    if isinstance(standard_embeddings, dict):
	        _assert_feature_dicts_match(standard_embeddings, multi_embeddings)
	    elif isinstance(standard_embeddings, list):
	        # Lengths were asserted equal above, so zip() pairs every item.
	        for std_item, multi_item in zip(standard_embeddings, multi_embeddings):
	            _assert_feature_dicts_match(std_item, multi_item)


	@pytest.mark.slow
	def test_multi_process_more_workers_than_texts(stsb_bert_tiny_model: SentenceTransformer):
	    """Encode two texts with three CPU device entries and check the output type and shape."""
	    encoder = stsb_bert_tiny_model
	    texts = ["First sentence.", "Second sentence."]

	    # Three device entries for only two inputs.
	    result = encoder.encode(texts, device=["cpu", "cpu", "cpu"])

	    # Expect a NumPy array with one row per text.
	    assert isinstance(result, np.ndarray)
	    expected_shape = (len(texts), encoder.get_sentence_embedding_dimension())
	    assert result.shape == expected_shape


	@pytest.mark.slow
	def test_multi_process_with_large_chunk_size(stsb_bert_tiny_model: SentenceTransformer):
	    """Encode with a ``chunk_size`` larger than the number of texts and check the output type and shape."""
	    encoder = stsb_bert_tiny_model
	    texts = 10 * ["First sentence.", "Second sentence."]  # 20 sentences

	    # chunk_size=30 exceeds the 20 inputs.
	    result = encoder.encode(texts, device=["cpu", "cpu"], chunk_size=30)

	    # Only the container type and the shape are verified here.
	    assert isinstance(result, np.ndarray)
	    expected_shape = (len(texts), encoder.get_sentence_embedding_dimension())
	    assert result.shape == expected_shape