Sentence-Transformers ve E5-Large model aktarımı.

9bbba62 verified about 2 months ago

15.2 kB

	from __future__ import annotations

	from pathlib import Path

	import pytest

	from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments
	from sentence_transformers.base.model_card import generate_model_card
	from sentence_transformers.sparse_encoder import SparseEncoder
	from sentence_transformers.sparse_encoder.losses import SparseMultipleNegativesRankingLoss, SpladeLoss
	from sentence_transformers.sparse_encoder.model_card import SparseEncoderModelCardData
	from sentence_transformers.util import is_datasets_available, is_training_available

	if is_datasets_available():
	from datasets import Dataset, DatasetDict

	if not is_training_available():
	pytest.skip(
	reason='Sentence Transformers was not installed with the `["train"]` extra.',
	allow_module_level=True,
	)


	@pytest.fixture(scope="session")
	def dummy_dataset():
	"""
	Dummy dataset for testing purposes. The dataset looks as follows:
	{
	"anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
	"positive": ["positive 1", "positive 2", ..., "positive 10"],
	"negative": ["negative 1", "negative 2", ..., "negative 10"],
	}
	"""
	return Dataset.from_dict(
	{
	"anchor": [f"anchor {i}" for i in range(1, 11)],
	"positive": [f"positive {i}" for i in range(1, 11)],
	"negative": [f"negative {i}" for i in range(1, 11)],
	}
	)


	@pytest.mark.parametrize(
	("model_fixture_name", "num_datasets", "expected_substrings"),
	[
	# 0 actually refers to just a single dataset
	(
	"splade_bert_tiny_model",
	0,
	[
	"- sentence-transformers",
	"- sparse-encoder",
	"- sparse",
	"- splade",
	"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq)",
	"Maximum Sequence Length: 512 tokens",
	"Output Dimensionality: 30522 dimensions",
	"Similarity Function: Dot Product",
	"#### Unnamed Dataset",
	"\| details \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \|",
	" \| <code>anchor 1</code> \| <code>positive 1</code> \| <code>negative 1</code> \|",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	(
	"splade_bert_tiny_model",
	1,
	[
	"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
	"#### train_0",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	(
	"splade_bert_tiny_model",
	2,
	[
	"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	"#### train_0",
	"#### train_1",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	(
	"splade_bert_tiny_model",
	10,
	[
	"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	"<details><summary>train_0</summary>", # We start using <details><summary> if we have more than 3 datasets
	"#### train_0",
	"</details>\n<details><summary>train_9</summary>",
	"#### train_9",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	# We start using "50 datasets" when the ", "-joined dataset name exceed 200 characters
	(
	"splade_bert_tiny_model",
	50,
	[
	"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	"<details><summary>train_0</summary>",
	"#### train_0",
	"</details>\n<details><summary>train_49</summary>",
	"#### train_49",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	(
	"csr_bert_tiny_model",
	0,
	[
	"- sentence-transformers",
	"- sparse-encoder",
	"- sparse",
	"- csr",
	"This is a [CSR Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) using the [sentence-transformers](https://www.SBERT.net) library.",
	"Maximum Sequence Length: 512 tokens",
	"Output Dimensionality: 512 dimensions",
	"Similarity Function: Dot Product",
	"#### Unnamed Dataset",
	"\| details \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \|",
	" \| <code>anchor 1</code> \| <code>positive 1</code> \| <code>negative 1</code> \|",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	(
	"inference_free_splade_bert_tiny_model",
	0,
	[
	"- sentence-transformers",
	"- sparse-encoder",
	"- sparse",
	"- asymmetric",
	"- inference-free",
	"- splade",
	"This is a [Asymmetric Inference-free SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/inference-free-splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/inference-free-splade-bert-tiny-nq) using the [sentence-transformers](https://www.SBERT.net) library.",
	"Maximum Sequence Length: 512 tokens",
	"Output Dimensionality: 30522 dimensions",
	"Similarity Function: Dot Product",
	"#### Unnamed Dataset",
	"\| details \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \| <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> \|",
	" \| <code>anchor 1</code> \| <code>positive 1</code> \| <code>negative 1</code> \|",
	"* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
	' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
	],
	),
	],
	)
	def test_model_card_base(
	model_fixture_name: str,
	dummy_dataset: Dataset,
	num_datasets: int,
	expected_substrings: list[str],
	request: pytest.FixtureRequest,
	tmp_path: Path,
	) -> None:
	model = request.getfixturevalue(model_fixture_name)

	# Let's avoid requesting the Hub for e.g. checking if a base model exists there
	model.model_card_data.local_files_only = True

	train_dataset = dummy_dataset
	if num_datasets:
	train_dataset = DatasetDict({f"train_{i}": train_dataset for i in range(num_datasets)})

	loss = SpladeLoss(
	model=model,
	loss=SparseMultipleNegativesRankingLoss(model=model),
	query_regularizer_weight=5e-5, # Weight for query loss
	document_regularizer_weight=3e-5, # Weight for document loss
	)

	args = SparseEncoderTrainingArguments(
	output_dir=tmp_path,
	router_mapping={"test": "query"} if "inference_free" in model_fixture_name else None,
	)

	# This adds data to model.model_card_data
	SparseEncoderTrainer(
	model,
	args=args,
	train_dataset=train_dataset,
	loss=loss,
	)

	model_card = generate_model_card(model)

	# For debugging purposes, we can save the model card to a file
	# with open(f"test_model_card_{model_fixture_name}_{num_datasets}d.md", "w", encoding="utf8") as f:
	# f.write(model_card)

	for substring in expected_substrings:
	assert substring in model_card

	# We don't want to have two consecutive empty lines anywhere
	assert "\n\n\n" not in model_card


	def test_model_card_set_transform(
	splade_bert_tiny_model: SparseEncoder, dummy_dataset: Dataset, tmp_path: Path
	) -> None:
	model = splade_bert_tiny_model

	# Let's avoid requesting the Hub for e.g. checking if a base model exists there
	model.model_card_data.local_files_only = True

	def dummy_transform(batch):
	return {
	"new_anchor": [text.upper() for text in batch["anchor"]],
	"new_positive": [text.upper() for text in batch["positive"]],
	"new_negative": [text.upper() for text in batch["negative"]],
	}

	# Use a copy to avoid mutating the session-scoped fixture
	dataset = dummy_dataset.select(range(len(dummy_dataset)))
	dataset.set_transform(dummy_transform)

	loss = SpladeLoss(
	model=model,
	loss=SparseMultipleNegativesRankingLoss(model=model),
	query_regularizer_weight=5e-5,
	document_regularizer_weight=3e-5,
	)
	SparseEncoderTrainer(
	model,
	args=SparseEncoderTrainingArguments(output_dir=tmp_path),
	train_dataset=dataset,
	loss=loss,
	)
	model_card = generate_model_card(model)

	# Post-transform column names should appear as column headers
	for substring in ["<code>new_anchor</code>", "<code>new_positive</code>", "<code>new_negative</code>"]:
	assert substring in model_card

	# Pre-transform column names should not appear as column headers
	for substring in ["<code>anchor</code>", "<code>positive</code>", "<code>negative</code>"]:
	assert substring not in model_card


	class TestGenerateUsageSnippetSparseEncoder:
	"""Verify SparseEncoder uses its own class name in usage snippets."""

	def test_sparse_encoder_class_name(self) -> None:
	data = SparseEncoderModelCardData()
	data.usage_examples = ["A", "B"]
	data.similarities = None
	data.model = None
	snippet = data.generate_usage_snippet()

	assert "from sentence_transformers import SparseEncoder" in snippet
	assert 'SparseEncoder("sparse_encoder_model_id")' in snippet
	assert "SentenceTransformer" not in snippet

	def test_sparse_encoder_custom_model_id(self) -> None:
	data = SparseEncoderModelCardData(model_id="my-org/my-sparse-model")
	data.usage_examples = ["test"]
	data.similarities = None
	data.model = None
	snippet = data.generate_usage_snippet()

	assert 'SparseEncoder("my-org/my-sparse-model")' in snippet