| from __future__ import annotations |
|
|
| from pathlib import Path |
|
|
| import pytest |
|
|
| from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments |
| from sentence_transformers.base.model_card import generate_model_card |
| from sentence_transformers.sparse_encoder import SparseEncoder |
| from sentence_transformers.sparse_encoder.losses import SparseMultipleNegativesRankingLoss, SpladeLoss |
| from sentence_transformers.sparse_encoder.model_card import SparseEncoderModelCardData |
| from sentence_transformers.util import is_datasets_available, is_training_available |
|
|
| if is_datasets_available(): |
| from datasets import Dataset, DatasetDict |
|
|
# Every test in this module builds a trainer, so skip the whole module (not
# just individual tests) when the training dependencies are unavailable.
if not is_training_available():
    pytest.skip(
        allow_module_level=True,
        reason='Sentence Transformers was not installed with the `["train"]` extra.',
    )
|
|
|
|
@pytest.fixture(scope="session")
def dummy_dataset():
    """
    Build a tiny 10-row triplet dataset shared by the tests in this session.

    Each column holds its own name followed by a 1-based row number:
    {
        "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
        "positive": ["positive 1", "positive 2", ..., "positive 10"],
        "negative": ["negative 1", "negative 2", ..., "negative 10"],
    }
    """
    column_names = ("anchor", "positive", "negative")
    rows = {name: [f"{name} {number}" for number in range(1, 11)] for name in column_names}
    return Dataset.from_dict(rows)
|
|
|
|
# Expected card fragments that are repeated verbatim across the parametrized
# cases below; hoisted so each case only spells out what is specific to it.
_LOSS_SECTION = (
    "* Loss: [<code>SpladeLoss</code>](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
    ' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
)

# Dataset section expected when training on a single, unnamed ``Dataset``
# (the ``num_datasets == 0`` cases), followed by the shared loss section.
_UNNAMED_DATASET_SECTION = (
    "#### Unnamed Dataset",
    "| details | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |",
    " | <code>anchor 1</code> | <code>positive 1</code> | <code>negative 1</code> |",
    *_LOSS_SECTION,
)


@pytest.mark.parametrize(
    ("model_fixture_name", "num_datasets", "expected_substrings"),
    [
        # num_datasets == 0 means "one plain Dataset" rather than a DatasetDict,
        # which is why the card is expected to call it "Unnamed Dataset".
        (
            "splade_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- splade",
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq)",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 30522 dimensions",
                "**Similarity Function:** Dot Product",
                *_UNNAMED_DATASET_SECTION,
            ],
        ),
        (
            "splade_bert_tiny_model",
            1,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
                "#### train_0",
                *_LOSS_SECTION,
            ],
        ),
        (
            "splade_bert_tiny_model",
            2,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "#### train_0",
                "#### train_1",
                *_LOSS_SECTION,
            ],
        ),
        (
            "splade_bert_tiny_model",
            10,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "<details><summary>train_0</summary>",
                "#### train_0",
                "</details>\n<details><summary>train_9</summary>",
                "#### train_9",
                *_LOSS_SECTION,
            ],
        ),
        # With this many datasets the summary sentence is expected to give the
        # count ("50 datasets") instead of listing every dataset name.
        (
            "splade_bert_tiny_model",
            50,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "<details><summary>train_0</summary>",
                "#### train_0",
                "</details>\n<details><summary>train_49</summary>",
                "#### train_49",
                *_LOSS_SECTION,
            ],
        ),
        (
            "csr_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- csr",
                "This is a [CSR Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) using the [sentence-transformers](https://www.SBERT.net) library.",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 512 dimensions",
                "**Similarity Function:** Dot Product",
                *_UNNAMED_DATASET_SECTION,
            ],
        ),
        (
            "inference_free_splade_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- asymmetric",
                "- inference-free",
                "- splade",
                "This is a [Asymmetric Inference-free SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/inference-free-splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/inference-free-splade-bert-tiny-nq) using the [sentence-transformers](https://www.SBERT.net) library.",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 30522 dimensions",
                "**Similarity Function:** Dot Product",
                *_UNNAMED_DATASET_SECTION,
            ],
        ),
    ],
)
def test_model_card_base(
    model_fixture_name: str,
    dummy_dataset: Dataset,
    num_datasets: int,
    expected_substrings: list[str],
    request: pytest.FixtureRequest,
    tmp_path: Path,
) -> None:
    """
    Generate a model card after wiring up a trainer and check its contents.

    Args:
        model_fixture_name: Name of the fixture that provides the model; it is
            resolved dynamically through ``request``.
        dummy_dataset: The shared triplet dataset used as training data.
        num_datasets: ``0`` to train on ``dummy_dataset`` directly, otherwise
            the number of ``train_<i>`` entries to put in a ``DatasetDict``.
        expected_substrings: Fragments that must all appear in the card.
        request: pytest request object used to look up the model fixture.
        tmp_path: Temporary directory used as the trainer's output directory.
    """
    model = request.getfixturevalue(model_fixture_name)

    # NOTE(review): presumably this keeps card generation from contacting the
    # Hugging Face Hub (e.g. to look up the base model) - confirm.
    model.model_card_data.local_files_only = True

    if num_datasets:
        # Every entry points at the same underlying dataset; only the names differ.
        train_dataset = DatasetDict({f"train_{index}": dummy_dataset for index in range(num_datasets)})
    else:
        train_dataset = dummy_dataset

    splade_loss = SpladeLoss(
        model=model,
        loss=SparseMultipleNegativesRankingLoss(model=model),
        query_regularizer_weight=5e-5,
        document_regularizer_weight=3e-5,
    )

    # Only the inference-free model is given a router mapping.
    router_mapping = {"test": "query"} if "inference_free" in model_fixture_name else None
    training_args = SparseEncoderTrainingArguments(output_dir=tmp_path, router_mapping=router_mapping)

    # The trainer is built purely for its construction side effect: this is the
    # only place the dataset and loss are handed over, and the card generated
    # below is expected to describe both. No training is run.
    SparseEncoderTrainer(model, args=training_args, train_dataset=train_dataset, loss=splade_loss)

    card_text = generate_model_card(model)

    for expected in expected_substrings:
        assert expected in card_text

    # The card must never contain two consecutive empty lines.
    assert "\n\n\n" not in card_text
|
|
|
|
def test_model_card_set_transform(
    splade_bert_tiny_model: SparseEncoder, dummy_dataset: Dataset, tmp_path: Path
) -> None:
    """
    Check that the card reports the columns produced by a dataset transform.

    The training dataset gets an on-the-fly transform that renames every column
    to ``new_<name>`` and upper-cases its texts; the generated card must list
    the renamed columns and none of the original ones.
    """
    original_columns = ("anchor", "positive", "negative")
    transformed_columns = ("new_anchor", "new_positive", "new_negative")

    model = splade_bert_tiny_model

    # NOTE(review): presumably this keeps card generation from contacting the
    # Hugging Face Hub - confirm.
    model.model_card_data.local_files_only = True

    def rename_and_uppercase(batch):
        return {f"new_{column}": [text.upper() for text in batch[column]] for column in original_columns}

    # ``dummy_dataset`` is session-scoped and ``set_transform`` is applied to the
    # object it is called on, so the transform goes on a full-range selection
    # rather than on the shared fixture itself.
    dataset = dummy_dataset.select(range(len(dummy_dataset)))
    dataset.set_transform(rename_and_uppercase)

    splade_loss = SpladeLoss(
        model=model,
        loss=SparseMultipleNegativesRankingLoss(model=model),
        query_regularizer_weight=5e-5,
        document_regularizer_weight=3e-5,
    )
    training_args = SparseEncoderTrainingArguments(output_dir=tmp_path)

    # Constructed only so the dataset and loss get attached to the model before
    # the card is generated; no training is run.
    SparseEncoderTrainer(model, args=training_args, train_dataset=dataset, loss=splade_loss)
    card_text = generate_model_card(model)

    # The transformed column names must be present ...
    for column in transformed_columns:
        assert f"<code>{column}</code>" in card_text

    # ... and the untransformed ones must not leak into the card.
    for column in original_columns:
        assert f"<code>{column}</code>" not in card_text
|
|
|
|
class TestGenerateUsageSnippetSparseEncoder:
    """Check that usage snippets built from sparse-encoder card data name ``SparseEncoder``."""

    @staticmethod
    def _render_snippet(card_data, usage_examples):
        """Set the given examples on ``card_data``, clear ``similarities`` and ``model``, and render the snippet."""
        card_data.usage_examples = usage_examples
        card_data.similarities = None
        card_data.model = None
        return card_data.generate_usage_snippet()

    def test_sparse_encoder_class_name(self) -> None:
        """With default card data, the snippet imports and instantiates ``SparseEncoder`` only."""
        snippet = self._render_snippet(SparseEncoderModelCardData(), ["A", "B"])

        assert "from sentence_transformers import SparseEncoder" in snippet
        assert 'SparseEncoder("sparse_encoder_model_id")' in snippet
        assert "SentenceTransformer" not in snippet

    def test_sparse_encoder_custom_model_id(self) -> None:
        """An explicit ``model_id`` is the identifier used in the snippet."""
        card_data = SparseEncoderModelCardData(model_id="my-org/my-sparse-model")
        snippet = self._render_snippet(card_data, ["test"])

        assert 'SparseEncoder("my-org/my-sparse-model")' in snippet
|
|