--- language: - en tags: - ColBERT - PyLate - sentence-transformers - sentence-similarity - feature-extraction - generated_from_trainer - dataset_size:640000 - loss:Distillation datasets: - lightonai/ms-marco-en-bge-gemma pipeline_tag: sentence-similarity library_name: PyLate metrics: - MaxSim_accuracy@1 - MaxSim_accuracy@3 - MaxSim_accuracy@5 - MaxSim_accuracy@10 - MaxSim_precision@1 - MaxSim_precision@3 - MaxSim_precision@5 - MaxSim_precision@10 - MaxSim_recall@1 - MaxSim_recall@3 - MaxSim_recall@5 - MaxSim_recall@10 - MaxSim_ndcg@10 - MaxSim_mrr@10 - MaxSim_map@100 model-index: - name: PyLate results: - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoClimateFEVER type: NanoClimateFEVER metrics: - type: MaxSim_accuracy@1 value: 0.24 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.42 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.56 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.76 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.24 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.14666666666666667 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.132 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.1 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.11499999999999998 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.205 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.2733333333333333 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.3906666666666666 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.2950902457523894 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.36876984126984125 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.22445703016815177 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoDBPedia type: NanoDBPedia metrics: - type: MaxSim_accuracy@1 value: 0.76 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 
0.92 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.92 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.94 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.76 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.7199999999999999 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.64 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.5359999999999999 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.103349775455209 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.2069476173044798 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.26630033614450777 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.3798346720417632 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.6745044425577195 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.8420000000000001 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.5354371280529658 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoFEVER type: NanoFEVER metrics: - type: MaxSim_accuracy@1 value: 0.9 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.96 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 1.0 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 1.0 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.9 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.3399999999999999 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.21599999999999994 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.10999999999999999 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.8366666666666667 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.9233333333333333 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.97 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.98 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.9294789232192022 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.9366666666666665 
name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.9025750915750915 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoFiQA2018 type: NanoFiQA2018 metrics: - type: MaxSim_accuracy@1 value: 0.58 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.68 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.72 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.78 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.58 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.31999999999999995 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.244 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.13799999999999998 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.36607936507936506 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.48507142857142854 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.5518412698412698 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.6031746031746031 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.5639041299556308 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.6375793650793651 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.5136714023190043 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoHotpotQA type: NanoHotpotQA metrics: - type: MaxSim_accuracy@1 value: 0.92 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 1.0 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 1.0 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 1.0 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.92 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.5533333333333332 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.352 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.18199999999999997 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.46 name: Maxsim Recall@1 - 
type: MaxSim_recall@3 value: 0.83 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.88 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.91 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.8735671033500391 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.9533333333333333 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.819732728608772 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoMSMARCO type: NanoMSMARCO metrics: - type: MaxSim_accuracy@1 value: 0.52 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.72 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.78 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.92 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.52 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.24 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.15600000000000003 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.092 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.52 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.72 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.78 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.92 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.7115365744941191 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.6468571428571428 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.6512663906142167 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoNFCorpus type: NanoNFCorpus metrics: - type: MaxSim_accuracy@1 value: 0.48 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.62 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.68 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.74 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.48 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.42666666666666664 name: Maxsim 
Precision@3 - type: MaxSim_precision@5 value: 0.37200000000000005 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.28800000000000003 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.04445987936677032 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.08334318466845993 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.12387064834298472 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.15623137130300419 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.3662101077105874 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.5659126984126984 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.1629293985515298 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoNQ type: NanoNQ metrics: - type: MaxSim_accuracy@1 value: 0.52 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.84 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.86 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.88 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.52 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.2866666666666667 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.17999999999999997 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.09399999999999999 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.49 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.79 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.82 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.84 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.706413633867191 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.6778571428571428 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.6569910589410588 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoQuoraRetrieval type: NanoQuoraRetrieval metrics: - type: MaxSim_accuracy@1 value: 0.9 name: Maxsim Accuracy@1 
- type: MaxSim_accuracy@3 value: 0.96 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.98 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 1.0 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.9 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.38666666666666655 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.25199999999999995 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.13799999999999998 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.7873333333333333 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.9146666666666667 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.956 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.9966666666666666 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.9423484210846561 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.9383333333333332 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.9161729437229437 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoSCIDOCS type: NanoSCIDOCS metrics: - type: MaxSim_accuracy@1 value: 0.48 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.68 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.7 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.84 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.48 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.32666666666666666 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.256 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.18599999999999997 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.10166666666666668 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.20266666666666666 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.26266666666666666 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.3796666666666667 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.37448789415335676 name: 
Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.6007222222222223 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.28182998781809016 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoArguAna type: NanoArguAna metrics: - type: MaxSim_accuracy@1 value: 0.26 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.56 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.66 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.8 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.26 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.18666666666666668 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.132 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.08 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.26 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.56 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.66 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.8 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.5176675835157897 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.4284920634920634 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.43500479781656254 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoSciFact type: NanoSciFact metrics: - type: MaxSim_accuracy@1 value: 0.74 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.82 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.88 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 0.88 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.74 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.29333333333333333 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.19599999999999998 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.09799999999999999 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.715 name: Maxsim Recall@1 
- type: MaxSim_recall@3 value: 0.805 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.87 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.87 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.8103600696147834 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.7906666666666666 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.792673895287103 name: Maxsim Map@100 - task: type: py-late-information-retrieval name: Py Late Information Retrieval dataset: name: NanoTouche2020 type: NanoTouche2020 metrics: - type: MaxSim_accuracy@1 value: 0.7755102040816326 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.9591836734693877 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.9795918367346939 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 1.0 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.7755102040816326 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.7210884353741496 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.6326530612244898 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.5306122448979592 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.05246741937655717 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.1459745060885227 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.20856404158297343 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.3416638417494836 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.6056555459991261 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.8646258503401361 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.4446312449677973 name: Maxsim Map@100 - task: type: nano-beir name: Nano BEIR dataset: name: NanoBEIR mean type: NanoBEIR_mean metrics: - type: MaxSim_accuracy@1 value: 0.6211930926216641 name: Maxsim Accuracy@1 - type: MaxSim_accuracy@3 value: 0.7799372056514914 name: Maxsim Accuracy@3 - type: MaxSim_accuracy@5 value: 0.8245839874411303 name: Maxsim Accuracy@5 - type: MaxSim_accuracy@10 value: 
0.8876923076923078 name: Maxsim Accuracy@10 - type: MaxSim_precision@1 value: 0.6211930926216641 name: Maxsim Precision@1 - type: MaxSim_precision@3 value: 0.38059654631083195 name: Maxsim Precision@3 - type: MaxSim_precision@5 value: 0.28928100470957613 name: Maxsim Precision@5 - type: MaxSim_precision@10 value: 0.19789324960753532 name: Maxsim Precision@10 - type: MaxSim_recall@1 value: 0.37323254661112065 name: Maxsim Recall@1 - type: MaxSim_recall@3 value: 0.5286156464076582 name: Maxsim Recall@3 - type: MaxSim_recall@5 value: 0.5863520227624412 name: Maxsim Recall@5 - type: MaxSim_recall@10 value: 0.6590695760206811 name: Maxsim Recall@10 - type: MaxSim_ndcg@10 value: 0.6439403596365069 name: Maxsim Ndcg@10 - type: MaxSim_mrr@10 value: 0.7116781789638933 name: Maxsim Mrr@10 - type: MaxSim_map@100 value: 0.5644133152648683 name: Maxsim Map@100 --- # PyLate This is a [PyLate](https://github.com/lightonai/pylate) model trained on the [train](https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma) dataset. It maps sentences & paragraphs to sequences of 128-dimensional dense vectors and can be used for semantic textual similarity using the MaxSim operator. 
## Model Details ### Model Description - **Model Type:** PyLate model - **Document Length:** 512 tokens - **Query Length:** 32 tokens - **Output Dimensionality:** 128 dimensions - **Similarity Function:** MaxSim - **Training Dataset:** - [train](https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma) - **Language:** en ### Model Sources - **Documentation:** [PyLate Documentation](https://lightonai.github.io/pylate/) - **Repository:** [PyLate on GitHub](https://github.com/lightonai/pylate) - **Hugging Face:** [PyLate models on Hugging Face](https://huggingface.co/models?library=PyLate) ### Full Model Architecture ``` ColBERT( (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'ModernBertModel'}) (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False}) ) ``` ## Usage First install the PyLate library: ```bash pip install -U pylate ``` ### Retrieval Use this model with PyLate to index and retrieve documents. The index uses [FastPLAID](https://github.com/lightonai/fast-plaid) for efficient similarity search. 
#### Indexing documents Load the ColBERT model and initialize the PLAID index, then encode and index your documents: ```python from pylate import indexes, models, retrieve # Step 1: Load the ColBERT model model = models.ColBERT( model_name_or_path="pylate_model_id", ) # Step 2: Initialize the PLAID index index = indexes.PLAID( index_folder="pylate-index", index_name="index", override=True, # This overwrites the existing index if any ) # Step 3: Encode the documents documents_ids = ["1", "2", "3"] documents = ["document 1 text", "document 2 text", "document 3 text"] documents_embeddings = model.encode( documents, batch_size=32, is_query=False, # Ensure that it is set to False to indicate that these are documents, not queries show_progress_bar=True, ) # Step 4: Add document embeddings to the index by providing embeddings and corresponding ids index.add_documents( documents_ids=documents_ids, documents_embeddings=documents_embeddings, ) ``` Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it: ```python # To load an index, simply instantiate it with the correct folder/name and without overriding it index = indexes.PLAID( index_folder="pylate-index", index_name="index", ) ``` #### Retrieving top-k documents for queries Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. 
To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the ids and relevance scores of the top matches: ```python # Step 1: Initialize the ColBERT retriever retriever = retrieve.ColBERT(index=index) # Step 2: Encode the queries queries_embeddings = model.encode( ["query for document 3", "query for document 1"], batch_size=32, is_query=True, # Ensure that it is set to True to indicate that these are queries show_progress_bar=True, ) # Step 3: Retrieve top-k documents scores = retriever.retrieve( queries_embeddings=queries_embeddings, k=10, # Retrieve the top 10 matches for each query ) ``` ### Reranking If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the rank function and pass the queries and documents to rerank: ```python from pylate import rank, models queries = [ "query A", "query B", ] documents = [ ["document A", "document B"], ["document 1", "document C", "document B"], ] documents_ids = [ [1, 2], [1, 3, 2], ] model = models.ColBERT( model_name_or_path="pylate_model_id", ) queries_embeddings = model.encode( queries, is_query=True, ) documents_embeddings = model.encode( documents, is_query=False, ) reranked_documents = rank.rerank( documents_ids=documents_ids, queries_embeddings=queries_embeddings, documents_embeddings=documents_embeddings, ) ``` ## Evaluation ### Metrics #### Py Late Information Retrieval * Dataset: `['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']` * Evaluated with pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator | Metric | NanoClimateFEVER | NanoDBPedia | NanoFEVER | NanoFiQA2018 | NanoHotpotQA | NanoMSMARCO | NanoNFCorpus | NanoNQ | NanoQuoraRetrieval | 
NanoSCIDOCS | NanoArguAna | NanoSciFact | NanoTouche2020 | |:--------------------|:-----------------|:------------|:-----------|:-------------|:-------------|:------------|:-------------|:-----------|:-------------------|:------------|:------------|:------------|:---------------| | MaxSim_accuracy@1 | 0.24 | 0.76 | 0.9 | 0.58 | 0.92 | 0.52 | 0.48 | 0.52 | 0.9 | 0.48 | 0.26 | 0.74 | 0.7755 | | MaxSim_accuracy@3 | 0.42 | 0.92 | 0.96 | 0.68 | 1.0 | 0.72 | 0.62 | 0.84 | 0.96 | 0.68 | 0.56 | 0.82 | 0.9592 | | MaxSim_accuracy@5 | 0.56 | 0.92 | 1.0 | 0.72 | 1.0 | 0.78 | 0.68 | 0.86 | 0.98 | 0.7 | 0.66 | 0.88 | 0.9796 | | MaxSim_accuracy@10 | 0.76 | 0.94 | 1.0 | 0.78 | 1.0 | 0.92 | 0.74 | 0.88 | 1.0 | 0.84 | 0.8 | 0.88 | 1.0 | | MaxSim_precision@1 | 0.24 | 0.76 | 0.9 | 0.58 | 0.92 | 0.52 | 0.48 | 0.52 | 0.9 | 0.48 | 0.26 | 0.74 | 0.7755 | | MaxSim_precision@3 | 0.1467 | 0.72 | 0.34 | 0.32 | 0.5533 | 0.24 | 0.4267 | 0.2867 | 0.3867 | 0.3267 | 0.1867 | 0.2933 | 0.7211 | | MaxSim_precision@5 | 0.132 | 0.64 | 0.216 | 0.244 | 0.352 | 0.156 | 0.372 | 0.18 | 0.252 | 0.256 | 0.132 | 0.196 | 0.6327 | | MaxSim_precision@10 | 0.1 | 0.536 | 0.11 | 0.138 | 0.182 | 0.092 | 0.288 | 0.094 | 0.138 | 0.186 | 0.08 | 0.098 | 0.5306 | | MaxSim_recall@1 | 0.115 | 0.1033 | 0.8367 | 0.3661 | 0.46 | 0.52 | 0.0445 | 0.49 | 0.7873 | 0.1017 | 0.26 | 0.715 | 0.0525 | | MaxSim_recall@3 | 0.205 | 0.2069 | 0.9233 | 0.4851 | 0.83 | 0.72 | 0.0833 | 0.79 | 0.9147 | 0.2027 | 0.56 | 0.805 | 0.146 | | MaxSim_recall@5 | 0.2733 | 0.2663 | 0.97 | 0.5518 | 0.88 | 0.78 | 0.1239 | 0.82 | 0.956 | 0.2627 | 0.66 | 0.87 | 0.2086 | | MaxSim_recall@10 | 0.3907 | 0.3798 | 0.98 | 0.6032 | 0.91 | 0.92 | 0.1562 | 0.84 | 0.9967 | 0.3797 | 0.8 | 0.87 | 0.3417 | | **MaxSim_ndcg@10** | **0.2951** | **0.6745** | **0.9295** | **0.5639** | **0.8736** | **0.7115** | **0.3662** | **0.7064** | **0.9423** | **0.3745** | **0.5177** | **0.8104** | **0.6057** | | MaxSim_mrr@10 | 0.3688 | 0.842 | 0.9367 | 0.6376 | 0.9533 | 0.6469 | 0.5659 | 
0.6779 | 0.9383 | 0.6007 | 0.4285 | 0.7907 | 0.8646 | | MaxSim_map@100 | 0.2245 | 0.5354 | 0.9026 | 0.5137 | 0.8197 | 0.6513 | 0.1629 | 0.657 | 0.9162 | 0.2818 | 0.435 | 0.7927 | 0.4446 | #### Nano BEIR * Dataset: `NanoBEIR_mean` * Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator | Metric | Value | |:--------------------|:-----------| | MaxSim_accuracy@1 | 0.6212 | | MaxSim_accuracy@3 | 0.7799 | | MaxSim_accuracy@5 | 0.8246 | | MaxSim_accuracy@10 | 0.8877 | | MaxSim_precision@1 | 0.6212 | | MaxSim_precision@3 | 0.3806 | | MaxSim_precision@5 | 0.2893 | | MaxSim_precision@10 | 0.1979 | | MaxSim_recall@1 | 0.3732 | | MaxSim_recall@3 | 0.5286 | | MaxSim_recall@5 | 0.5864 | | MaxSim_recall@10 | 0.6591 | | **MaxSim_ndcg@10** | **0.6439** | | MaxSim_mrr@10 | 0.7117 | | MaxSim_map@100 | 0.5644 | ## Training Details ### Training Dataset #### train * Dataset: [train](https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma) at [1a1ffe7](https://huggingface.co/datasets/lightonai/ms-marco-en-bge-gemma/tree/1a1ffe7cde403016be12ae532b249965b2293114) * Size: 640,000 training samples * Columns: query_id, document_ids, and scores * Approximate statistics based on the first 1000 samples: | | query_id | document_ids | scores | 
|:--------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------|:------------------------------------| | type | int | list | list | | details | | | | * Samples: | query_id | document_ids | scores | |:--------------------|:----------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------| | 685613 | [7546874, 1176459, 197677, 2306318, 8541504, ...] | [0.9999999992804947, 0.24845418756716053, 0.7594154013647826, 0.26644182105618575, 0.390668914839766, ...] | | 237784 | [6366584, 4034101, 2325374, 6914618, 6042146, ...] | [0.9999999991784339, 0.42233632827946693, 0.5956354295491569, 0.12644415907455164, 0.6636713730105909, ...] | | 904294 | [448408, 8743975, 49600, 7339401, 2714261, ...] | [0.9999999991841937, 0.877629062381539, 0.8330146583389045, 0.3116634796692611, 0.4633524534142185, ...] 
|

* Loss: pylate.losses.distillation.Distillation

### Training Hyperparameters

#### Non-Default Hyperparameters

- `eval_strategy`: steps
- `per_device_train_batch_size`: 16
- `learning_rate`: 4e-06
- `max_steps`: 20000
- `fp16`: True
- `dataloader_drop_last`: True
- `dataloader_num_workers`: 8
- `ddp_find_unused_parameters`: False
- `torch_compile`: True
- `torch_compile_backend`: inductor
- `eval_on_start`: True

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: steps
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 8
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 4e-06
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 3.0
- `max_steps`: 20000
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: True
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: True
- `dataloader_num_workers`: 8
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `parallelism_config`: None
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: False
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `hub_revision`: None
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`:
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: True
- `torch_compile_backend`: inductor
- `torch_compile_mode`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: True
- `use_liger_kernel`: False
- `liger_kernel_config`: None
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: proportional
- `router_mapping`: {}
- `learning_rate_mapping`: {}

</details>