|
|
from __future__ import annotations |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
import pytest |
|
|
|
|
|
from sentence_transformers import SparseEncoderTrainer, SparseEncoderTrainingArguments |
|
|
from sentence_transformers.model_card import generate_model_card |
|
|
from sentence_transformers.sparse_encoder import losses |
|
|
from sentence_transformers.util import is_datasets_available, is_training_available |
|
|
|
|
|
# `datasets` is an optional dependency, so only import it when it is installed.
if is_datasets_available():
    from datasets import Dataset, DatasetDict

# Without the training extra there is nothing in this module that can run,
# so skip the module as a whole.
if not is_training_available():
    pytest.skip(
        allow_module_level=True,
        reason='Sentence Transformers was not installed with the `["train"]` extra.',
    )
|
|
|
|
|
|
|
|
@pytest.fixture(scope="session")
def dummy_dataset():
    """
    Build the small triplet dataset used by the tests (created once per session).

    The dataset has ten rows and three text columns; every value is the column
    name followed by the 1-based row number:
    {
        "anchor": ["anchor 1", "anchor 2", ..., "anchor 10"],
        "positive": ["positive 1", "positive 2", ..., "positive 10"],
        "negative": ["negative 1", "negative 2", ..., "negative 10"],
    }
    """
    row_numbers = range(1, 11)
    columns = {
        "anchor": [f"anchor {number}" for number in row_numbers],
        "positive": [f"positive {number}" for number in row_numbers],
        "negative": [f"negative {number}" for number in row_numbers],
    }
    return Dataset.from_dict(columns)
|
|
|
|
|
|
|
|
# Expected model card fragments that are identical across several parametrized
# cases. Every case trains with the same SpladeLoss configuration, and every
# single-dataset case uses the same dummy dataset.
_LOSS_HEADER = (
    "* Loss: [<code>SpladeLoss</code>]"
    "(https://sbert.net/docs/package_reference/sparse_encoder/losses.html#spladeloss)"
    " with these parameters:"
)
_LOSS_PARAMETERS = (
    " ```json\n"
    " {\n"
    ' "loss": "SparseMultipleNegativesRankingLoss(scale=1.0, similarity_fct=\'dot_score\')",\n'
    ' "document_regularizer_weight": 3e-05,\n'
    ' "query_regularizer_weight": 5e-05\n'
    " }\n"
    " ```"
)
_TOKEN_STATS_ROW = (
    "| details |"
    " <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |"
    " <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |"
    " <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |"
)
_FIRST_SAMPLE_ROW = " | <code>anchor 1</code> | <code>positive 1</code> | <code>negative 1</code> |"


@pytest.mark.parametrize(
    ("model_fixture_name", "num_datasets", "expected_substrings"),
    [
        # SPLADE model, a single unnamed dataset.
        (
            "splade_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- splade",
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq)",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 30522 dimensions",
                "**Similarity Function:** Dot Product",
                "#### Unnamed Dataset",
                _TOKEN_STATS_ROW,
                _FIRST_SAMPLE_ROW,
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # SPLADE model, one named dataset.
        (
            "splade_bert_tiny_model",
            1,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 dataset using the [sentence-transformers](https://www.SBERT.net) library.",
                "#### train_0",
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # SPLADE model, two named datasets.
        (
            "splade_bert_tiny_model",
            2,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0 and train_1 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "#### train_0",
                "#### train_1",
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # SPLADE model, ten named datasets: each dataset section is expected in a <details> element.
        (
            "splade_bert_tiny_model",
            10,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on the train_0, train_1, train_2, train_3, train_4, train_5, train_6, train_7, train_8 and train_9 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "<details><summary>train_0</summary>",
                "#### train_0",
                "</details>\n<details><summary>train_9</summary>",
                "#### train_9",
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # SPLADE model, fifty named datasets: the intro is expected to give only the dataset count.
        (
            "splade_bert_tiny_model",
            50,
            [
                "This is a [SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/splade-bert-tiny-nq) on 50 datasets using the [sentence-transformers](https://www.SBERT.net) library.",
                "<details><summary>train_0</summary>",
                "#### train_0",
                "</details>\n<details><summary>train_49</summary>",
                "#### train_49",
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # CSR model, a single unnamed dataset.
        (
            "csr_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- csr",
                "This is a [CSR Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sentence-transformers-testing/stsb-bert-tiny-safetensors](https://huggingface.co/sentence-transformers-testing/stsb-bert-tiny-safetensors) using the [sentence-transformers](https://www.SBERT.net) library.",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 512 dimensions",
                "**Similarity Function:** Dot Product",
                "#### Unnamed Dataset",
                _TOKEN_STATS_ROW,
                _FIRST_SAMPLE_ROW,
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
        # Inference-free SPLADE model, a single unnamed dataset.
        (
            "inference_free_splade_bert_tiny_model",
            0,
            [
                "- sentence-transformers",
                "- sparse-encoder",
                "- sparse",
                "- asymmetric",
                "- inference-free",
                "- splade",
                "This is a [Asymmetric Inference-free SPLADE Sparse Encoder](https://www.sbert.net/docs/sparse_encoder/usage/usage.html) model finetuned from [sparse-encoder-testing/inference-free-splade-bert-tiny-nq](https://huggingface.co/sparse-encoder-testing/inference-free-splade-bert-tiny-nq) using the [sentence-transformers](https://www.SBERT.net) library.",
                "**Maximum Sequence Length:** 512 tokens",
                "**Output Dimensionality:** 30522 dimensions",
                "**Similarity Function:** Dot Product",
                "#### Unnamed Dataset",
                _TOKEN_STATS_ROW,
                _FIRST_SAMPLE_ROW,
                _LOSS_HEADER,
                _LOSS_PARAMETERS,
            ],
        ),
    ],
)
def test_model_card_base(
    model_fixture_name: str,
    dummy_dataset: Dataset,
    num_datasets: int,
    expected_substrings: list[str],
    request: pytest.FixtureRequest,
    tmp_path: Path,
) -> None:
    """
    Check that the generated model card contains the expected fragments.

    Args:
        model_fixture_name: Name of the fixture that provides the model under test.
        dummy_dataset: The shared triplet dataset used as training data.
        num_datasets: 0 to train on the dataset directly; otherwise the number of
            copies to place in a ``DatasetDict`` under the keys ``train_0``, ``train_1``, ...
        expected_substrings: Fragments that must all occur in the generated card.
        request: Used to resolve ``model_fixture_name`` into the actual model.
        tmp_path: Temporary directory used as the training output directory.
    """
    model = request.getfixturevalue(model_fixture_name)

    # A falsy count means "one unnamed dataset"; otherwise reuse the same dataset under N names.
    train_dataset = (
        DatasetDict({f"train_{index}": dummy_dataset for index in range(num_datasets)})
        if num_datasets
        else dummy_dataset
    )

    ranking_loss = losses.SparseMultipleNegativesRankingLoss(model=model)
    loss = losses.SpladeLoss(
        model=model,
        loss=ranking_loss,
        query_regularizer_weight=5e-5,
        document_regularizer_weight=3e-5,
    )

    # Only the inference-free fixture is given a router mapping.
    router_mapping = {"test": "query"} if "inference_free" in model_fixture_name else None
    args = SparseEncoderTrainingArguments(output_dir=tmp_path, router_mapping=router_mapping)

    # The trainer instance is discarded: it is built only for its side effects.
    # NOTE(review): presumably construction records the dataset/loss details on the
    # model that the card is rendered from - confirm against SparseEncoderTrainer.
    SparseEncoderTrainer(model, args=args, train_dataset=train_dataset, loss=loss)

    model_card = generate_model_card(model)

    for expected in expected_substrings:
        assert expected in model_card

    # The card must never contain two consecutive empty lines.
    assert "\n\n\n" not in model_card
|
|
|