Add files using upload-large-folder tool

bd33eac verified 4 months ago

11.5 kB

	from __future__ import annotations

	from pathlib import Path

	import pytest

	from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments
	from sentence_transformers.model_card import generate_model_card
	from sentence_transformers.sparse_encoder import losses
	from sentence_transformers.util import is_datasets_available, is_training_available

	if is_datasets_available():
	from datasets import Dataset, DatasetDict

	# Skip this whole module at collection time when is_training_available()
	# reports that the training dependencies are missing.
	if not is_training_available():
	    pytest.skip(
	        allow_module_level=True,
	        reason='Sentence Transformers was not installed with the `["train"]` extra.',
	    )


	@pytest.fixture(scope="session")
	def dummy_dataset():
	    """
	    Session-scoped toy triplet dataset with ten rows and three text columns:
	    {
	        "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
	        "positive": ["positive 1", "positive 2", ..., "positive 10"],
	        "negative": ["negative 1", "negative 2", ..., "negative 10"],
	    }
	    """
	    column_names = ("anchor", "positive", "negative")
	    # Each cell is "<column name> <1-based row number>"; column order is kept as listed.
	    columns = {name: [f"{name} {row}" for row in range(1, 11)] for name in column_names}
	    return Dataset.from_dict(columns)


	# Expected model card fragments that are shared by several parametrized cases.
	# The markdown table rows use plain "|" separators: writing them as "\|" is an
	# invalid escape sequence in a normal string literal and would keep a literal
	# backslash that cannot match the table rows.
	# NOTE(review): every fragment must match the generated card character for
	# character, including runs of spaces -- confirm the exact whitespace against a
	# freshly generated card (see the debugging snippet in the test body).
	_TOKEN_STATS_ROW = "| details | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |"
	_FIRST_SAMPLE_ROW = " | <code>anchor 1</code> | <code>positive 1</code> | <code>negative 1</code> |"
	_SPLADE_LOSS_HEADER = "* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:"
	_SPLADE_LOSS_PARAMS = ' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\')",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```'


	@pytest.mark.parametrize(
	    ("model_fixture_name", "num_datasets", "expected_substrings"),
	    [
	        # 0 actually refers to just a single dataset
	        (
	            "splade_bert_tiny_model",
	            0,
	            [
	                "- sentence-transformers",
	                "- sparse-encoder",
	                "- sparse",
	                "- splade",
	                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq)",
	                "Maximum Sequence Length: 512 tokens",
	                "Output Dimensionality: 30522 dimensions",
	                "Similarity Function: Dot Product",
	                "#### Unnamed Dataset",
	                _TOKEN_STATS_ROW,
	                _FIRST_SAMPLE_ROW,
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        (
	            "splade_bert_tiny_model",
	            1,
	            [
	                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
	                "#### train_0",
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        (
	            "splade_bert_tiny_model",
	            2,
	            [
	                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	                "#### train_0",
	                "#### train_1",
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        (
	            "splade_bert_tiny_model",
	            10,
	            [
	                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	                "<details><summary>train_0</summary>",  # We start using <details><summary> if we have more than 3 datasets
	                "#### train_0",
	                "</details>\n<details><summary>train_9</summary>",
	                "#### train_9",
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        # We start using "50 datasets" when the ", "-joined dataset names exceed 200 characters
	        (
	            "splade_bert_tiny_model",
	            50,
	            [
	                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
	                "<details><summary>train_0</summary>",
	                "#### train_0",
	                "</details>\n<details><summary>train_49</summary>",
	                "#### train_49",
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        (
	            "csr_bert_tiny_model",
	            0,
	            [
	                "- sentence-transformers",
	                "- sparse-encoder",
	                "- sparse",
	                "- csr",
	                "This is a [CSR Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) using the [sentence-transformers](https://www.SBERT.net) library.",
	                "Maximum Sequence Length: 512 tokens",
	                "Output Dimensionality: 512 dimensions",
	                "Similarity Function: Dot Product",
	                "#### Unnamed Dataset",
	                _TOKEN_STATS_ROW,
	                _FIRST_SAMPLE_ROW,
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	        (
	            "inference_free_splade_bert_tiny_model",
	            0,
	            [
	                "- sentence-transformers",
	                "- sparse-encoder",
	                "- sparse",
	                "- asymmetric",
	                "- inference-free",
	                "- splade",
	                "This is a [Asymmetric Inference-free SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/inference-free-splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/inference-free-splade-bert-tiny-nq) using the [sentence-transformers](https://www.SBERT.net) library.",
	                "Maximum Sequence Length: 512 tokens",
	                "Output Dimensionality: 30522 dimensions",
	                "Similarity Function: Dot Product",
	                "#### Unnamed Dataset",
	                _TOKEN_STATS_ROW,
	                _FIRST_SAMPLE_ROW,
	                _SPLADE_LOSS_HEADER,
	                _SPLADE_LOSS_PARAMS,
	            ],
	        ),
	    ],
	)
	def test_model_card_base(
	    model_fixture_name: str,
	    dummy_dataset: Dataset,
	    num_datasets: int,
	    expected_substrings: list[str],
	    request: pytest.FixtureRequest,
	    tmp_path: Path,
	) -> None:
	    """
	    Check that the generated model card contains every expected fragment.

	    Args:
	        model_fixture_name: Name of the pytest fixture that provides the model;
	            resolved at runtime via ``request.getfixturevalue``.
	        dummy_dataset: The triplet dataset used as training data.
	        num_datasets: ``0`` trains on ``dummy_dataset`` directly; any other value
	            wraps that many copies in a ``DatasetDict`` keyed ``train_0``,
	            ``train_1``, ...
	        expected_substrings: Fragments that must each occur in the model card.
	        request: Pytest request object used to look up the model fixture.
	        tmp_path: Temporary directory used as the trainer's output directory.
	    """
	    model = request.getfixturevalue(model_fixture_name)

	    train_dataset = dummy_dataset
	    if num_datasets:
	        train_dataset = DatasetDict({f"train_{i}": dummy_dataset for i in range(num_datasets)})

	    loss = losses.SpladeLoss(
	        model=model,
	        loss=losses.SparseMultipleNegativesRankingLoss(model=model),
	        query_regularizer_weight=5e-5,  # Weight for query loss
	        document_regularizer_weight=3e-5,  # Weight for document loss
	    )

	    # Only the inference-free model fixture is given a router mapping.
	    args = SparseEncoderTrainingArguments(
	        output_dir=tmp_path,
	        router_mapping={"test": "query"} if "inference_free" in model_fixture_name else None,
	    )

	    # This adds data to model.model_card_data; the trainer instance itself is not needed
	    SparseEncoderTrainer(
	        model,
	        args=args,
	        train_dataset=train_dataset,
	        loss=loss,
	    )

	    model_card = generate_model_card(model)

	    # For debugging purposes, we can save the model card to a file
	    # with open(f"test_model_card_{model_fixture_name}_{num_datasets}d.md", "w", encoding="utf8") as f:
	    #     f.write(model_card)

	    for substring in expected_substrings:
	        assert substring in model_card, f"Expected substring not found in model card: {substring!r}"

	    # We don't want to have two consecutive empty lines anywhere
	    assert "\n\n\n" not in model_card