from __future__ import annotations from pathlib import Path import pytest from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments from sentence_transformers.base.model_card import generate_model_card from sentence_transformers.sparse_encoder import SparseEncoder from sentence_transformers.sparse_encoder.losses import SparseMultipleNegativesRankingLoss, SpladeLoss from sentence_transformers.sparse_encoder.model_card import SparseEncoderModelCardData from sentence_transformers.util import is_datasets_available, is_training_available if is_datasets_available(): from datasets import Dataset, DatasetDict if not is_training_available(): pytest.skip( reason='Sentence Transformers was not installed with the `["train"]` extra.', allow_module_level=True, ) @pytest.fixture(scope="session") def dummy_dataset(): """ Dummy dataset for testing purposes. The dataset looks as follows: { "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"], "positive": ["positive 1", "positive 2", ..., "positive 10"], "negative": ["negative 1", "negative 2", ..., "negative 10"], } """ return Dataset.from_dict( { "anchor": [f"anchor {i}" for i in range(1, 11)], "positive": [f"positive {i}" for i in range(1, 11)], "negative": [f"negative {i}" for i in range(1, 11)], } ) @pytest.mark.parametrize( ("model_fixture_name", "num_datasets", "expected_substrings"), [ # 0 actually refers to just a single dataset ( "splade_bert_tiny_model", 0, [ "- sentence-transformers", "- sparse-encoder", "- sparse", "- splade", "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq)", "**Maximum Sequence Length:** 512 tokens", "**Output Dimensionality:** 30522 dimensions", "**Similarity Function:** Dot Product", "#### Unnamed Dataset", "| details |
anchor 1 | positive 1 | negative 1 |",
"* Loss: [SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
(
"splade_bert_tiny_model",
1,
[
"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
"#### train_0",
"* Loss: [SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
(
"splade_bert_tiny_model",
2,
[
"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
"#### train_0",
"#### train_1",
"* Loss: [SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
(
"splade_bert_tiny_model",
10,
[
"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
"SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
# We start using "50 datasets" when the ", "-joined dataset name exceed 200 characters
(
"splade_bert_tiny_model",
50,
[
"This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
"SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
(
"csr_bert_tiny_model",
0,
[
"- sentence-transformers",
"- sparse-encoder",
"- sparse",
"- csr",
"This is a [CSR Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) using the [sentence-transformers](https://www.SBERT.net) library.",
"**Maximum Sequence Length:** 512 tokens",
"**Output Dimensionality:** 512 dimensions",
"**Similarity Function:** Dot Product",
"#### Unnamed Dataset",
"| details | anchor 1 | positive 1 | negative 1 |",
"* Loss: [SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
(
"inference_free_splade_bert_tiny_model",
0,
[
"- sentence-transformers",
"- sparse-encoder",
"- sparse",
"- asymmetric",
"- inference-free",
"- splade",
"This is a [Asymmetric Inference-free SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/inference-free-splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/inference-free-splade-bert-tiny-nq) using the [sentence-transformers](https://www.SBERT.net) library.",
"**Maximum Sequence Length:** 512 tokens",
"**Output Dimensionality:** 30522 dimensions",
"**Similarity Function:** Dot Product",
"#### Unnamed Dataset",
"| details | anchor 1 | positive 1 | negative 1 |",
"* Loss: [SpladeLoss](https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss) with these parameters:",
' ```json\n {\n "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\', gather_across_devices=False, directions=(\'query_to_doc\',), partition_mode=\'joint\', hardness_mode=None, hardness_strength=0.0)",\n "document_regularizer_weight": 3e-05,\n "query_regularizer_weight": 5e-05\n }\n ```',
],
),
],
)
def test_model_card_base(
model_fixture_name: str,
dummy_dataset: Dataset,
num_datasets: int,
expected_substrings: list[str],
request: pytest.FixtureRequest,
tmp_path: Path,
) -> None:
model = request.getfixturevalue(model_fixture_name)
# Let's avoid requesting the Hub for e.g. checking if a base model exists there
model.model_card_data.local_files_only = True
train_dataset = dummy_dataset
if num_datasets:
train_dataset = DatasetDict({f"train_{i}": train_dataset for i in range(num_datasets)})
loss = SpladeLoss(
model=model,
loss=SparseMultipleNegativesRankingLoss(model=model),
query_regularizer_weight=5e-5, # Weight for query loss
document_regularizer_weight=3e-5, # Weight for document loss
)
args = SparseEncoderTrainingArguments(
output_dir=tmp_path,
router_mapping={"test": "query"} if "inference_free" in model_fixture_name else None,
)
# This adds data to model.model_card_data
SparseEncoderTrainer(
model,
args=args,
train_dataset=train_dataset,
loss=loss,
)
model_card = generate_model_card(model)
# For debugging purposes, we can save the model card to a file
# with open(f"test_model_card_{model_fixture_name}_{num_datasets}d.md", "w", encoding="utf8") as f:
# f.write(model_card)
for substring in expected_substrings:
assert substring in model_card
# We don't want to have two consecutive empty lines anywhere
assert "\n\n\n" not in model_card
def test_model_card_set_transform(
splade_bert_tiny_model: SparseEncoder, dummy_dataset: Dataset, tmp_path: Path
) -> None:
model = splade_bert_tiny_model
# Let's avoid requesting the Hub for e.g. checking if a base model exists there
model.model_card_data.local_files_only = True
def dummy_transform(batch):
return {
"new_anchor": [text.upper() for text in batch["anchor"]],
"new_positive": [text.upper() for text in batch["positive"]],
"new_negative": [text.upper() for text in batch["negative"]],
}
# Use a copy to avoid mutating the session-scoped fixture
dataset = dummy_dataset.select(range(len(dummy_dataset)))
dataset.set_transform(dummy_transform)
loss = SpladeLoss(
model=model,
loss=SparseMultipleNegativesRankingLoss(model=model),
query_regularizer_weight=5e-5,
document_regularizer_weight=3e-5,
)
SparseEncoderTrainer(
model,
args=SparseEncoderTrainingArguments(output_dir=tmp_path),
train_dataset=dataset,
loss=loss,
)
model_card = generate_model_card(model)
# Post-transform column names should appear as column headers
for substring in ["new_anchor", "new_positive", "new_negative"]:
assert substring in model_card
# Pre-transform column names should not appear as column headers
for substring in ["anchor", "positive", "negative"]:
assert substring not in model_card
class TestGenerateUsageSnippetSparseEncoder:
"""Verify SparseEncoder uses its own class name in usage snippets."""
def test_sparse_encoder_class_name(self) -> None:
data = SparseEncoderModelCardData()
data.usage_examples = ["A", "B"]
data.similarities = None
data.model = None
snippet = data.generate_usage_snippet()
assert "from sentence_transformers import SparseEncoder" in snippet
assert 'SparseEncoder("sparse_encoder_model_id")' in snippet
assert "SentenceTransformer" not in snippet
def test_sparse_encoder_custom_model_id(self) -> None:
data = SparseEncoderModelCardData(model_id="my-org/my-sparse-model")
data.usage_examples = ["test"]
data.similarities = None
data.model = None
snippet = data.generate_usage_snippet()
assert 'SparseEncoder("my-org/my-sparse-model")' in snippet