import logging from datasets import load_dataset from sentence_transformers import SparseEncoder from sentence_transformers.sparse_encoder.evaluation import SparseRerankingEvaluator logging.basicConfig(format="%(message)s", level=logging.INFO) # Load a model model = SparseEncoder("naver/splade-cocondenser-ensembledistil") # Load a dataset with queries, positives, and negatives eval_dataset = load_dataset("microsoft/ms_marco", "v1.1", split="validation").select(range(1000)) samples = [ { "query": sample["query"], "positive": [ text for is_selected, text in zip(sample["passages"]["is_selected"], sample["passages"]["passage_text"]) if is_selected ], "negative": [ text for is_selected, text in zip(sample["passages"]["is_selected"], sample["passages"]["passage_text"]) if not is_selected ], } for sample in eval_dataset ] # Now evaluate using only the documents from the 1000 samples reranking_evaluator = SparseRerankingEvaluator( samples=samples, name="ms_marco_dev_small", show_progress_bar=True, batch_size=32, ) results = reranking_evaluator(model) """ RerankingEvaluator: Evaluating the model on the ms_marco_dev_small dataset: Queries: 967 Positives: Min 1.0, Mean 1.1, Max 3.0 Negatives: Min 1.0, Mean 7.1, Max 9.0 MAP: 53.41 MRR@10: 54.14 NDCG@10: 65.06 Model Query Sparsity: Active Dimensions: 42.2, Sparsity Ratio: 0.9986 Model Corpus Sparsity: Active Dimensions: 126.5, Sparsity Ratio: 0.9959 """ # Print the results print(f"Primary metric: {reranking_evaluator.primary_metric}") # => Primary metric: ms_marco_dev_small_ndcg@10 print(f"Primary metric value: {results[reranking_evaluator.primary_metric]:.4f}") # => Primary metric value: 0.6506