tags:
- ColBERT
- PyLate
- sentence-transformers
- sentence-similarity
- embeddings
- retrieval
- feature-extraction
- generated_from_trainer
- dataset_size:640000
- loss:Distillation
pipeline_tag: sentence-similarity
library_name: PyLate
license: apache-2.0
language:
- en
metrics:
- MaxSim_accuracy@1
- MaxSim_accuracy@3
- MaxSim_accuracy@5
- MaxSim_accuracy@10
- MaxSim_precision@1
- MaxSim_precision@3
- MaxSim_precision@5
- MaxSim_precision@10
- MaxSim_recall@1
- MaxSim_recall@3
- MaxSim_recall@5
- MaxSim_recall@10
- MaxSim_ndcg@10
- MaxSim_mrr@10
- MaxSim_map@100
model-index:
- name: PyLate
results:
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoClimateFEVER
type: NanoClimateFEVER
metrics:
- type: MaxSim_accuracy@1
value: 0.32
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.6
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.68
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.84
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.32
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.24
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.18
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.12799999999999997
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.16666666666666669
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.30166666666666664
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.36999999999999994
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.48966666666666664
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.39788607688317723
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.4867380952380952
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.32122237005984133
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoDBPedia
type: NanoDBPedia
metrics:
- type: MaxSim_accuracy@1
value: 0.86
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.92
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.96
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.98
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.86
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.7066666666666667
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.6839999999999999
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.58
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.11298996781634019
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.2093514022345805
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.2979866359871688
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.41705883152883244
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.7272920241863232
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.9022222222222223
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.5828675339983777
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoFEVER
type: NanoFEVER
metrics:
- type: MaxSim_accuracy@1
value: 0.94
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.94
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 1
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 1
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.94
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.34666666666666657
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.21999999999999997
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.10999999999999999
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.8766666666666667
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.92
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.98
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.98
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.9471553127609496
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.955
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.9296929824561403
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoFiQA2018
type: NanoFiQA2018
metrics:
- type: MaxSim_accuracy@1
value: 0.56
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.68
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.76
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.8
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.56
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.33333333333333326
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.252
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.146
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.3225793650793651
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.48090476190476195
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.5861746031746032
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.6236984126984126
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.564408819366597
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.6322222222222221
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.5081777792392109
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoHotpotQA
type: NanoHotpotQA
metrics:
- type: MaxSim_accuracy@1
value: 0.94
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 1
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 1
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 1
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.94
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.5933333333333333
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.3679999999999999
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.18799999999999997
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.47
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.89
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.92
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.94
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.9106223443736624
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.9633333333333333
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.8715001126887537
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoMSMARCO
type: NanoMSMARCO
metrics:
- type: MaxSim_accuracy@1
value: 0.56
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.7
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.78
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.86
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.56
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.23333333333333336
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.15600000000000003
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.08599999999999998
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.56
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.7
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.78
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.86
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.7016952795427963
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.6515476190476189
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.6612570762570762
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoNFCorpus
type: NanoNFCorpus
metrics:
- type: MaxSim_accuracy@1
value: 0.6
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.68
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.7
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.76
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.6
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.4333333333333333
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.36
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.30199999999999994
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.06695123074603171
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.10230921078558003
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.11977716363807966
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.1560290535749611
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.39635586329364647
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.648222222222222
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.18734700252488395
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoNQ
type: NanoNQ
metrics:
- type: MaxSim_accuracy@1
value: 0.66
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.84
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.88
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.92
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.66
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.28
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.176
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.09799999999999998
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.63
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.77
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.81
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.88
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.7640243523560828
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.7507460317460317
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.7186210251123722
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoQuoraRetrieval
type: NanoQuoraRetrieval
metrics:
- type: MaxSim_accuracy@1
value: 0.98
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 1
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 1
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 1
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.98
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.40666666666666657
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.25999999999999995
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.13399999999999998
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.8573333333333334
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.9586666666666668
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.9793333333333334
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.9893333333333334
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.9803361966637445
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.99
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.9729292929292929
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoSCIDOCS
type: NanoSCIDOCS
metrics:
- type: MaxSim_accuracy@1
value: 0.46
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.76
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.78
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.92
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.46
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.4133333333333333
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.308
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.20399999999999996
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.09766666666666665
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.2546666666666666
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.31566666666666665
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.41666666666666663
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.41263681960415605
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.6238888888888888
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.32305678261351617
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoArguAna
type: NanoArguAna
metrics:
- type: MaxSim_accuracy@1
value: 0.22
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.64
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.7
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.88
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.22
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.21333333333333335
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.14
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.088
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.22
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.64
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.7
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.88
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.5451561462647055
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.438579365079365
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.4404661078361693
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoSciFact
type: NanoSciFact
metrics:
- type: MaxSim_accuracy@1
value: 0.74
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.86
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.9
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.92
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.74
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.3066666666666667
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.19999999999999996
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.10199999999999998
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.705
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.835
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.89
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.91
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.8244122815839126
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.8028571428571429
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.7945920069148553
name: Maxsim Map@100
- task:
type: py-late-information-retrieval
name: Py Late Information Retrieval
dataset:
name: NanoTouche2020
type: NanoTouche2020
metrics:
- type: MaxSim_accuracy@1
value: 0.7755102040816326
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.9795918367346939
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.9795918367346939
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 1
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.7755102040816326
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.7414965986394557
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.6979591836734694
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.5551020408163266
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.05388501860819581
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.147652689306387
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.22721850419336123
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.35299487730359475
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.6326177160290112
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.8736637512147717
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.45506840839628654
name: Maxsim Map@100
- task:
type: nano-beir
name: Nano BEIR
dataset:
name: NanoBEIR mean
type: NanoBEIR_mean
metrics:
- type: MaxSim_accuracy@1
value: 0.6627315541601255
name: Maxsim Accuracy@1
- type: MaxSim_accuracy@3
value: 0.815353218210361
name: Maxsim Accuracy@3
- type: MaxSim_accuracy@5
value: 0.855353218210361
name: Maxsim Accuracy@5
- type: MaxSim_accuracy@10
value: 0.9138461538461539
name: Maxsim Accuracy@10
- type: MaxSim_precision@1
value: 0.6627315541601255
name: Maxsim Precision@1
- type: MaxSim_precision@3
value: 0.4037048665620094
name: Maxsim Precision@3
- type: MaxSim_precision@5
value: 0.3078430141287284
name: Maxsim Precision@5
- type: MaxSim_precision@10
value: 0.20931554160125584
name: Maxsim Precision@10
- type: MaxSim_recall@1
value: 0.3953645319679436
name: Maxsim Recall@1
- type: MaxSim_recall@3
value: 0.5546321587870238
name: Maxsim Recall@3
- type: MaxSim_recall@5
value: 0.6135505313071702
name: Maxsim Recall@5
- type: MaxSim_recall@10
value: 0.684265218597882
name: Maxsim Recall@10
- type: MaxSim_ndcg@10
value: 0.6772768640699051
name: Maxsim Ndcg@10
- type: MaxSim_mrr@10
value: 0.7476169918516855
name: Maxsim Mrr@10
- type: MaxSim_map@100
value: 0.5974460370020597
name: Maxsim Map@100
ColBERT-Zero
🎯 TL;DR: First large-scale fully pre-trained ColBERT model using only public data. Achieves 55.43 nDCG@10 on BEIR benchmark, outperforming GTE-ModernColBERT and GTE-ModernBERT trained on closed and stronger data. New SOTA on BEIR for models <150M parameters.
Why ColBERT-Zero?
Late interaction (ColBERT / multi-vector) models have clear advantages in out-of-domain generalization, long-context handling, and reasoning-intensive retrieval. Yet they remain undertrained: current state-of-the-art ColBERT models (e.g., GTE-ModernColBERT and ColBERT-small) are simply built by bolting a small knowledge distillation step onto a strong dense (single-vector) model. Even recent efforts like mxbai-edge-colbert-v0 perform all early training stages in a single-vector setting, only switching to the multi-vector objective at the very end.
This leaves a lot of performance on the table. ColBERT-Zero demonstrates that performing contrastive pre-training directly in the multi-vector setting, rather than treating it as an afterthought, unlocks a significantly higher performance ceiling. Trained exclusively on public data (Nomic-embed dataset mixture), ColBERT-Zero overcomes a 2.4-point data quality disadvantage to outperform models trained on proprietary, closed-source data. For detailed results, please have a look at our blogpost and the paper. All the models (including intermediate checkpoints) as well as the training code are released under an Apache 2.0 license.
Controlled Comparison Design
We deliberately trained on the public Nomic-embed data mixture for a strategic reason: Nomic has already trained a dense ModernBERT model (ModernBERT-embed) on this exact data. This lets us compare dense vs. multi-vector training with the same data, same base model (ModernBERT), and same pipeline. The only variable is whether the contrastive phases are performed in the dense or multi-vector setting.
This design reveals a striking result: the dense baseline trained on Nomic data scores 52.89, while the one trained on GTE's proprietary data scores 55.33: a 2.4-point data quality gap. Despite this disadvantage, ColBERT-Zero's full multi-vector pre-training pipeline closes and surpasses this gap, reaching 55.43 nDCG@10.
The Three-Phase Training Pipeline
The development followed a three-phase pipeline, each providing a different type of learning signal:
Phase 1 - Unsupervised Contrastive Pre-training
We began with the nomic-embed-unsupervised-data dataset. Using PyLate's GradCache implementation to scale per-GPU batch size without VRAM constraints, combined with cross-GPU gathering of representations, we reached effective batch sizes of ~16k, required for unsupervised training to produce plausible in-batch hard negatives. Unlike dense training, the multi-vector objective allows the encoder to learn fine-grained token importance from the very first phase.
Phase 2 - Supervised Contrastive Fine-tuning
We refined the model using the nomic-embed-supervised-data. This stage introduced mined hard negatives: documents that are superficially similar to the query but not actually relevant. This allows teaching the model to handle nuance by prioritizing specific keywords and contextual tokens most indicative of a true match.
Phase 3 - Knowledge Distillation (KD)
The final stage used the ms-marco-en-bge dataset. We leveraged a powerful Gemma-based model as a teacher, allowing our student models to learn to replicate complex reasoning scores via the efficient MaxSim operator.
Key Findings
1. The Standard Recipe Leaves Performance on the Table
The KD-only approach (the current industry standard) scores 54.09, lagging behind full pre-training by 1.3 points. A simple distillation step is insufficient for optimal multi-vector performance.
2. Supervised + KD Is the Efficiency Sweet Spot
By running a supervised contrastive step in the multi-vector setting before distillation, we reach 55.12 nDCG@10, closing most of the gap with the fully pre-trained model (55.43). This costs ~40 GH200-hours instead of ~408: roughly 10× cheaper for 99.4% of the performance.
3. Prompt Alignment Is Non-Negotiable
Nomic's base models are pre-trained with asymmetric prompts (search_query: and search_document:). While ColBERT has its own asymmetric mechanism via [Q] and [D] markers, we found:
- Stripping pre-training prompts during fine-tuning causes significant performance degradation.
- Adding prompts to a model not pre-trained with them also hurts performance.
- Even with perfect alignment, prompts provide an intrinsic benefit: full ColBERT pre-training with prompts (55.43) vs. without prompts (54.61), no mismatch in either case, shows a meaningful 0.82-point gap.
Why do prompts help? Our leading hypothesis is that prompt tokens act as implicit query expansion: extra slots that don't carry specific meaning but let the model store global information about the sequence. The original ColBERT used [PAD] tokens for this purpose, but modern Flash Attention implementations broke this trick (masked tokens no longer produce usable embeddings). Explicit prompt tokens may be quietly re-enabling it.
Practical takeaway: Always align your prompts with the base model's pre-training setup. Misalignment is one of the easiest ways to silently lose performance. Note that this sensitivity decreases with stronger downstream fine-tuning: with enough training, the model can adapt to an initial mismatch.
Model Lineup
The Main Models (ColBERT-Zero)
ColBERT-Zero utilizes the full 3-phase pipeline with strict prompt alignment, achieving 55.43 nDCG@10 on BEIR, setting a new SOTA for models <150M parameters. We also provide ColBERT-Zero-noprompts, the same pipeline without asymmetric prompts, to study the impact of query expansion on multi-vector performance.
The cheap-to-train ones (ModernColBERT-embed-base)
These models represent the practical sweet spot. By skipping the expensive unsupervised phase, ModernColBERT-embed-base (Supervised + KD) achieves ~99% of the flagship's performance (55.12 vs. 55.43 nDCG@10) at only ~10% of the compute cost. For reference, ModernColBERT-embed-base-kd-only performs only the distillation step on a supervised dense base.
Intermediate Checkpoints
For researchers studying the incremental impact of each phase and prompt alignment, we release several ablation variants: ColBERT-Zero-supervised, ColBERT-Zero-unsupervised (and their -noprompts versions), and ModernColBERT-embed-base-supervised.
Full Performance on BEIR
| Model | Avg | FiQA | NFCorpus | TREC-COVID | Touche | ArguAna | Quora | SCIDOCS | SciFact | NQ | ClimateFEVER | HotpotQA | DBPedia | CQADupstack | FEVER | MSMARCO |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Baselines | ||||||||||||||||
| ModernBERT-embed-unsupervised | 47.05 | 42.53 | 35.33 | 68.44 | 18.58 | 48.82 | 88.63 | 19.83 | 72.30 | 46.32 | 22.97 | 60.00 | 37.97 | 42.40 | 67.39 | 34.23 |
| ModernBERT-embed-supervised | 52.89 | 40.59 | 33.40 | 84.15 | 31.91 | 48.96 | 88.85 | 18.59 | 69.63 | 62.15 | 35.67 | 67.11 | 41.50 | 42.08 | 87.35 | 41.47 |
| GTE-ModernColBERT | 54.67 | 45.28 | 37.93 | 83.59 | 31.23 | 48.51 | 86.61 | 19.06 | 76.34 | 61.80 | 30.62 | 77.32 | 48.03 | 41.00 | 87.44 | 45.32 |
| gte-modernbert-base | 55.33 | 48.81 | 36.44 | 81.95 | 21.68 | 72.68 | 88.55 | 21.29 | 77.40 | 57.62 | 37.74 | 69.47 | 41.79 | 42.63 | 91.03 | 40.90 |
| KD from dense supervised | ||||||||||||||||
| ModernColBERT-embed-base-kd-only | 54.09 | 42.51 | 37.01 | 79.52 | 34.58 | 51.75 | 87.67 | 18.15 | 75.04 | 61.45 | 28.31 | 76.70 | 47.54 | 40.68 | 84.82 | 45.57 |
| Supervised + KD from dense unsupervised | ||||||||||||||||
| ModernColBERT-embed-base-supervised | 50.72 | 40.09 | 35.56 | 71.12 | 25.53 | 44.27 | 86.96 | 18.19 | 73.78 | 58.89 | 32.95 | 71.49 | 43.23 | 42.55 | 70.51 | 45.72 |
| ModernColBERT-embed-base | 55.12 | 41.50 | 36.51 | 77.46 | 33.77 | 52.45 | 86.26 | 18.66 | 74.90 | 62.24 | 37.27 | 80.07 | 48.27 | 41.60 | 89.71 | 46.17 |
| ColBERT-Zero | ||||||||||||||||
| Unsupervised | 51.44 | 45.38 | 36.88 | 67.82 | 22.59 | 51.53 | 87.78 | 22.30 | 76.76 | 58.80 | 24.24 | 68.29 | 43.16 | 45.76 | 81.58 | 38.78 |
| Supervised | 51.81 | 42.45 | 35.60 | 74.72 | 23.83 | 41.81 | 87.19 | 19.85 | 73.71 | 61.95 | 35.01 | 71.37 | 46.20 | 45.16 | 72.61 | 45.68 |
| Distilled | 55.43 | 42.62 | 37.28 | 78.69 | 36.13 | 53.07 | 85.24 | 19.88 | 76.50 | 61.66 | 35.72 | 79.41 | 47.48 | 41.34 | 90.59 | 45.80 |
| ColBERT-Zero-noprompts | ||||||||||||||||
| Unsupervised | 51.70 | 45.31 | 34.72 | 73.55 | 23.26 | 52.56 | 88.15 | 22.63 | 76.10 | 59.18 | 24.24 | 66.66 | 42.61 | 45.56 | 81.88 | 39.15 |
| Supervised | 52.39 | 43.36 | 36.01 | 72.42 | 23.79 | 47.42 | 87.79 | 21.30 | 73.85 | 62.25 | 31.61 | 70.32 | 44.07 | 44.03 | 85.54 | 42.11 |
| Distilled | 54.61 | 43.14 | 36.60 | 78.60 | 36.36 | 49.49 | 88.05 | 19.13 | 76.42 | 61.73 | 32.70 | 76.99 | 47.69 | 40.21 | 85.97 | 46.01 |
Limitations & Discussion
- Data-specific findings. We deliberately used the Nomic Embed data mixture for controlled comparison. Some observations (particularly around prompt sensitivity) may not generalize to different or stronger training configurations.
- Scale vs. objective. The gains from multi-vector pre-training likely reflect more training time in the multi-vector setting, rather than the contrastive objective itself. Performing KD alone at a larger scale might yield similar or superior results due to the higher quality of the distillation signal. Our study uses the conventional setup where training scale is inversely proportional to signal quality, reflecting the higher cost of generating high-quality labels.
- Prompt sensitivity decreases with stronger fine-tuning. When experimenting with stronger fine-tuning data (e.g., NV-Retriever), adding prompts on top of a model pre-trained without them did not degrade results the way it did with ColBERT-Zero. With enough downstream training, the model can adapt to an initial mismatch.
Serving at Scale
For production deployment of ColBERT-Zero and other multi-vector models, check out NextPlaid and FastPlaid, our production-grade engines for multi-vector retrieval.
Resources
- 📦 All checkpoints: HF Collection - every phase, with and without prompts
- 💻 Code: Training boilerplates
- 📄 Paper: ArXiv
Model Details
Model Description
- Model Type: PyLate model
- Document Length: 512 tokens
- Query Length: 32 tokens
- Output Dimensionality: 128 dimensions (per token embedding)
- Similarity Function: MaxSim
- Training Dataset:
- train
Model Sources
- Documentation: PyLate Documentation
- Repository: PyLate on GitHub
- Hugging Face: PyLate models on Hugging Face
Full Model Architecture
ColBERT(
(0): Transformer({'max_seq_length': 511, 'do_lower_case': False}) with Transformer model: ModernBertModel
(1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
)
Usage
First install the PyLate library:
pip install -U pylate
Prompt alignment is critical for ColBERT-Zero models. You must use prompt_name="query" when encoding queries and prompt_name="document" when encoding documents. ColBERT-Zero was pre-trained with asymmetric prompts (search_query: / search_document:), and stripping them causes significant performance degradation.
Retrieval
Use this model with PyLate to index and retrieve documents. The index uses FastPLAID for efficient similarity search.
Indexing documents
Load the ColBERT model and initialize the PLAID index, then encode and index your documents:
from pylate import indexes, models, retrieve
# Step 1: Load the ColBERT model
model = models.ColBERT(
model_name_or_path="pylate_model_id",
)
# Step 2: Initialize the PLAID index
index = indexes.PLAID(
index_folder="pylate-index",
index_name="index",
override=True, # This overwrites the existing index if any
)
# Step 3: Encode the documents
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]
documents_embeddings = model.encode(
documents,
batch_size=32,
is_query=False, # Ensure that it is set to False to indicate that these are documents, not queries
prompt_name="document", # ⚠️ Required for ColBERT-Zero! Do not omit.
show_progress_bar=True,
)
# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
documents_ids=documents_ids,
documents_embeddings=documents_embeddings,
)
Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:
# To load an index, simply instantiate it with the correct folder/name and without overriding it
index = indexes.PLAID(
index_folder="pylate-index",
index_name="index",
)
Retrieving top-k documents for queries
Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores:
[!WARNING] Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.
# Step 1: Initialize the ColBERT retriever
retriever = retrieve.ColBERT(index=index)
# Step 2: Encode the queries
queries_embeddings = model.encode(
["query for document 3", "query for document 1"],
batch_size=32,
is_query=True, # Ensure that it is set to True to indicate that these are queries
prompt_name="query", # ⚠️ Required for ColBERT-Zero! Do not omit.
show_progress_bar=True,
)
# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
queries_embeddings=queries_embeddings,
k=10, # Retrieve the top 10 matches for each query
)
Reranking
Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.
If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the rank.rerank function and pass the queries and documents to rerank:
from pylate import rank, models
queries = [
"query A",
"query B",
]
documents = [
["document A", "document B"],
["document 1", "document C", "document B"],
]
documents_ids = [
[1, 2],
[1, 3, 2],
]
model = models.ColBERT(
model_name_or_path="pylate_model_id",
)
queries_embeddings = model.encode(
queries,
is_query=True,
prompt_name="query" # ⚠️ Required for ColBERT-Zero! Do not omit.
)
documents_embeddings = model.encode(
documents,
is_query=False,
prompt_name="document" # ⚠️ Required for ColBERT-Zero! Do not omit.
)
reranked_documents = rank.rerank(
documents_ids=documents_ids,
queries_embeddings=queries_embeddings,
documents_embeddings=documents_embeddings,
)
Evaluation
Metrics
Py Late Information Retrieval
- Datasets: ['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']
- Evaluated with pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator
| Metric | NanoClimateFEVER | NanoDBPedia | NanoFEVER | NanoFiQA2018 | NanoHotpotQA | NanoMSMARCO | NanoNFCorpus | NanoNQ | NanoQuoraRetrieval | NanoSCIDOCS | NanoArguAna | NanoSciFact | NanoTouche2020 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MaxSim_accuracy@1 | 0.32 | 0.86 | 0.94 | 0.56 | 0.94 | 0.56 | 0.6 | 0.66 | 0.98 | 0.46 | 0.22 | 0.74 | 0.7755 |
| MaxSim_accuracy@3 | 0.6 | 0.92 | 0.94 | 0.68 | 1.0 | 0.7 | 0.68 | 0.84 | 1.0 | 0.76 | 0.64 | 0.86 | 0.9796 |
| MaxSim_accuracy@5 | 0.68 | 0.96 | 1.0 | 0.76 | 1.0 | 0.78 | 0.7 | 0.88 | 1.0 | 0.78 | 0.7 | 0.9 | 0.9796 |
| MaxSim_accuracy@10 | 0.84 | 0.98 | 1.0 | 0.8 | 1.0 | 0.86 | 0.76 | 0.92 | 1.0 | 0.92 | 0.88 | 0.92 | 1.0 |
| MaxSim_precision@1 | 0.32 | 0.86 | 0.94 | 0.56 | 0.94 | 0.56 | 0.6 | 0.66 | 0.98 | 0.46 | 0.22 | 0.74 | 0.7755 |
| MaxSim_precision@3 | 0.24 | 0.7067 | 0.3467 | 0.3333 | 0.5933 | 0.2333 | 0.4333 | 0.28 | 0.4067 | 0.4133 | 0.2133 | 0.3067 | 0.7415 |
| MaxSim_precision@5 | 0.18 | 0.684 | 0.22 | 0.252 | 0.368 | 0.156 | 0.36 | 0.176 | 0.26 | 0.308 | 0.14 | 0.2 | 0.698 |
| MaxSim_precision@10 | 0.128 | 0.58 | 0.11 | 0.146 | 0.188 | 0.086 | 0.302 | 0.098 | 0.134 | 0.204 | 0.088 | 0.102 | 0.5551 |
| MaxSim_recall@1 | 0.1667 | 0.113 | 0.8767 | 0.3226 | 0.47 | 0.56 | 0.067 | 0.63 | 0.8573 | 0.0977 | 0.22 | 0.705 | 0.0539 |
| MaxSim_recall@3 | 0.3017 | 0.2094 | 0.92 | 0.4809 | 0.89 | 0.7 | 0.1023 | 0.77 | 0.9587 | 0.2547 | 0.64 | 0.835 | 0.1477 |
| MaxSim_recall@5 | 0.37 | 0.298 | 0.98 | 0.5862 | 0.92 | 0.78 | 0.1198 | 0.81 | 0.9793 | 0.3157 | 0.7 | 0.89 | 0.2272 |
| MaxSim_recall@10 | 0.4897 | 0.4171 | 0.98 | 0.6237 | 0.94 | 0.86 | 0.156 | 0.88 | 0.9893 | 0.4167 | 0.88 | 0.91 | 0.353 |
| MaxSim_ndcg@10 | 0.3979 | 0.7273 | 0.9472 | 0.5644 | 0.9106 | 0.7017 | 0.3964 | 0.764 | 0.9803 | 0.4126 | 0.5452 | 0.8244 | 0.6326 |
| MaxSim_mrr@10 | 0.4867 | 0.9022 | 0.955 | 0.6322 | 0.9633 | 0.6515 | 0.6482 | 0.7507 | 0.99 | 0.6239 | 0.4386 | 0.8029 | 0.8737 |
| MaxSim_map@100 | 0.3212 | 0.5829 | 0.9297 | 0.5082 | 0.8715 | 0.6613 | 0.1873 | 0.7186 | 0.9729 | 0.3231 | 0.4405 | 0.7946 | 0.4551 |
Nano BEIR
- Dataset: NanoBEIR_mean
- Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator
| Metric | Value |
|---|---|
| MaxSim_accuracy@1 | 0.6627 |
| MaxSim_accuracy@3 | 0.8154 |
| MaxSim_accuracy@5 | 0.8554 |
| MaxSim_accuracy@10 | 0.9138 |
| MaxSim_precision@1 | 0.6627 |
| MaxSim_precision@3 | 0.4037 |
| MaxSim_precision@5 | 0.3078 |
| MaxSim_precision@10 | 0.2093 |
| MaxSim_recall@1 | 0.3954 |
| MaxSim_recall@3 | 0.5546 |
| MaxSim_recall@5 | 0.6136 |
| MaxSim_recall@10 | 0.6843 |
| MaxSim_ndcg@10 | 0.6773 |
| MaxSim_mrr@10 | 0.7476 |
| MaxSim_map@100 | 0.5974 |
Training Details
Training Dataset
train
- Dataset: train
- Size: 640,000 training samples
- Columns: query_id, document_ids, and scores
- Approximate statistics based on the first 1000 samples:
| | query_id | document_ids | scores |
|---|---|---|---|
| type | int | list | list |
| details | 836: ~0.10%; 3582: ~0.10%; 4599: ~0.10%; ... | size: 32 elements | size: 32 elements |
- Samples:
| query_id | document_ids | scores |
|---|---|---|
| 685613 | [7546874, 1176459, 197677, 2306318, 8541504, ...] | [0.9999999992804947, 0.24845418756716053, 0.7594154013647826, 0.26644182105618575, 0.390668914839766, ...] |
| 237784 | [6366584, 4034101, 2325374, 6914618, 6042146, ...] | [0.9999999991784339, 0.42233632827946693, 0.5956354295491569, 0.12644415907455164, 0.6636713730105909, ...] |
| 904294 | [448408, 8743975, 49600, 7339401, 2714261, ...] | [0.9999999991841937, 0.877629062381539, 0.8330146583389045, 0.3116634796692611, 0.4633524534142185, ...] |
- Loss: pylate.losses.distillation.Distillation
Training Hyperparameters
Non-Default Hyperparameters
- eval_strategy: steps
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 4
- gradient_accumulation_steps: 2
- learning_rate: 6e-05
- num_train_epochs: 1.0
- bf16: True
- dataloader_num_workers: 4
- ddp_find_unused_parameters: False
All Hyperparameters
Click to expand
- overwrite_output_dir: False
- do_predict: False
- eval_strategy: steps
- prediction_loss_only: True
- per_device_train_batch_size: 4
- per_device_eval_batch_size: 4
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 2
- eval_accumulation_steps: None
- torch_empty_cache_steps: None
- learning_rate: 6e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 1.0
- max_steps: -1
- lr_scheduler_type: linear
- lr_scheduler_kwargs: {}
- warmup_ratio: 0.0
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: True
- fp16: False
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: True
- dataloader_num_workers: 4
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: False
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- ddp_find_unused_parameters: False
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: None
- hub_always_push: False
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- include_for_metrics: []
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- dispatch_batches: None
- split_batches: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: False
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- use_liger_kernel: False
- eval_use_gather_object: False
- average_tokens_across_devices: False
- prompts: None
- batch_sampler: batch_sampler
- multi_dataset_batch_sampler: proportional
Training Logs
Click to expand
| Epoch | Step | Training Loss | NanoClimateFEVER_MaxSim_ndcg@10 | NanoDBPedia_MaxSim_ndcg@10 | NanoFEVER_MaxSim_ndcg@10 | NanoFiQA2018_MaxSim_ndcg@10 | NanoHotpotQA_MaxSim_ndcg@10 | NanoMSMARCO_MaxSim_ndcg@10 | NanoNFCorpus_MaxSim_ndcg@10 | NanoNQ_MaxSim_ndcg@10 | NanoQuoraRetrieval_MaxSim_ndcg@10 | NanoSCIDOCS_MaxSim_ndcg@10 | NanoArguAna_MaxSim_ndcg@10 | NanoSciFact_MaxSim_ndcg@10 | NanoTouche2020_MaxSim_ndcg@10 | NanoBEIR_mean_MaxSim_ndcg@10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0025 | 50 | 0.0197 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.0275 | 550 | 0.0155 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.0525 | 1050 | 0.0142 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.075 | 1500 | 0.0132 | 0.3833 | 0.7161 | 0.9638 | 0.5617 | 0.9106 | 0.7037 | 0.3859 | 0.7424 | 0.9442 | 0.4208 | 0.5224 | 0.8290 | 0.6369 | 0.6708 |
| 0.0775 | 1550 | 0.0131 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.1025 | 2050 | 0.013 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.1275 | 2550 | 0.0126 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.15 | 3000 | 0.0122 | 0.3926 | 0.7088 | 0.9550 | 0.5684 | 0.9056 | 0.7031 | 0.3949 | 0.7584 | 0.9725 | 0.4101 | 0.5512 | 0.8149 | 0.6375 | 0.6748 |
| 0.1525 | 3050 | 0.0121 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.1775 | 3550 | 0.0119 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.2025 | 4050 | 0.0118 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.225 | 4500 | 0.0115 | 0.3936 | 0.7099 | 0.9434 | 0.5604 | 0.9147 | 0.7147 | 0.3924 | 0.7384 | 0.9742 | 0.4174 | 0.5445 | 0.8399 | 0.6451 | 0.6760 |
| 0.2275 | 4550 | 0.0112 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.2525 | 5050 | 0.0114 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.2775 | 5550 | 0.0112 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.3 | 6000 | 0.011 | 0.4211 | 0.7254 | 0.9552 | 0.5701 | 0.9173 | 0.7036 | 0.3913 | 0.7371 | 0.9705 | 0.4195 | 0.5487 | 0.8246 | 0.6362 | 0.6785 |
| 0.3025 | 6050 | 0.0112 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.3275 | 6550 | 0.0108 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.3525 | 7050 | 0.0106 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.375 | 7500 | 0.0109 | 0.3974 | 0.7208 | 0.9429 | 0.5659 | 0.9099 | 0.7157 | 0.3959 | 0.7550 | 0.9766 | 0.4162 | 0.5544 | 0.8384 | 0.6301 | 0.6784 |
| 0.3775 | 7550 | 0.0104 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.4025 | 8050 | 0.0101 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.4275 | 8550 | 0.0103 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.45 | 9000 | 0.0099 | 0.3905 | 0.7166 | 0.9512 | 0.5749 | 0.9093 | 0.7217 | 0.3990 | 0.7464 | 0.9749 | 0.4184 | 0.5371 | 0.8260 | 0.6291 | 0.6765 |
| 0.4525 | 9050 | 0.0104 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.4775 | 9550 | 0.0102 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.5025 | 10050 | 0.0096 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.525 | 10500 | 0.0098 | 0.3914 | 0.7332 | 0.9477 | 0.5763 | 0.9102 | 0.7044 | 0.3947 | 0.7521 | 0.9732 | 0.4065 | 0.5503 | 0.8283 | 0.6329 | 0.6770 |
| 0.5275 | 10550 | 0.0099 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.5525 | 11050 | 0.0097 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.5775 | 11550 | 0.0095 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.6 | 12000 | 0.0096 | 0.3954 | 0.7215 | 0.9403 | 0.5717 | 0.9087 | 0.6982 | 0.3965 | 0.7466 | 0.9728 | 0.4129 | 0.5516 | 0.8335 | 0.6330 | 0.6756 |
| 0.6025 | 12050 | 0.0097 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.6275 | 12550 | 0.0094 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.6525 | 13050 | 0.0096 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.675 | 13500 | 0.0092 | 0.4007 | 0.7236 | 0.9438 | 0.5687 | 0.9105 | 0.7198 | 0.3928 | 0.7635 | 0.9803 | 0.4146 | 0.5377 | 0.8270 | 0.6360 | 0.6784 |
| 0.6775 | 13550 | 0.0094 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.7025 | 14050 | 0.0093 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.7275 | 14550 | 0.0093 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.75 | 15000 | 0.0093 | 0.3948 | 0.7287 | 0.9525 | 0.5616 | 0.9140 | 0.6991 | 0.3922 | 0.7638 | 0.9877 | 0.4080 | 0.5488 | 0.8337 | 0.6354 | 0.6785 |
| 0.7525 | 15050 | 0.0091 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.7775 | 15550 | 0.009 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.8025 | 16050 | 0.0086 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.825 | 16500 | 0.0093 | 0.4052 | 0.7325 | 0.9472 | 0.5714 | 0.9116 | 0.7019 | 0.3959 | 0.7665 | 0.9876 | 0.4102 | 0.5428 | 0.8262 | 0.6357 | 0.6796 |
| 0.8275 | 16550 | 0.0086 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.8525 | 17050 | 0.0088 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.8775 | 17550 | 0.0088 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.9 | 18000 | 0.0088 | 0.4066 | 0.7304 | 0.9512 | 0.5572 | 0.9091 | 0.7095 | 0.3957 | 0.7672 | 0.9810 | 0.4158 | 0.5481 | 0.8302 | 0.6279 | 0.6792 |
| 0.9025 | 18050 | 0.0089 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.9275 | 18550 | 0.0087 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.9525 | 19050 | 0.0086 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.975 | 19500 | 0.0087 | 0.3979 | 0.7273 | 0.9472 | 0.5644 | 0.9106 | 0.7017 | 0.3964 | 0.7640 | 0.9803 | 0.4126 | 0.5452 | 0.8244 | 0.6326 | 0.6773 |
| 0.9775 | 19550 | 0.0089 | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
Framework Versions
- Python: 3.13.0
- Sentence Transformers: 4.0.2
- PyLate: 1.3.2
- Transformers: 4.48.3
- PyTorch: 2.6.0
- Accelerate: 1.8.1
- Datasets: 4.0.0
- Tokenizers: 0.21.0
Citation
BibTeX
ColBERT-Zero
@misc{chaffin2026colbertzeropretrainpretraincolbert,
title = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models},
author = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
year = {2026},
eprint = {2602.16609},
archivePrefix = {arXiv},
primaryClass = {cs.CL},
url = {https://arxiv.org/abs/2602.16609},
}
Sentence Transformers
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084"
}
PyLate
@inproceedings{DBLP:conf/cikm/ChaffinS25,
author = {Antoine Chaffin and
Rapha{\"{e}}l Sourty},
editor = {Meeyoung Cha and
Chanyoung Park and
Noseong Park and
Carl Yang and
Senjuti Basu Roy and
Jessie Li and
Jaap Kamps and
Kijung Shin and
Bryan Hooi and
Lifang He},
title = {PyLate: Flexible Training and Retrieval for Late Interaction Models},
booktitle = {Proceedings of the 34th {ACM} International Conference on Information
and Knowledge Management, {CIKM} 2025, Seoul, Republic of Korea, November
10-14, 2025},
pages = {6334--6339},
publisher = {{ACM}},
year = {2025},
url = {https://github.com/lightonai/pylate},
doi = {10.1145/3746252.3761608},
}
Nomic Embed
@article{DBLP:journals/tmlr/NussbaumMMD25,
author = {Zach Nussbaum and
John Xavier Morris and
Andriy Mulyar and
Brandon Duderstadt},
title = {Nomic Embed: Training a Reproducible Long Context Text Embedder},
journal = {Trans. Mach. Learn. Res.},
volume = {2025},
year = {2025},
url = {https://openreview.net/forum?id=IPmzyQSiQE},
timestamp = {Fri, 20 Jun 2025 14:19:48 +0200},
biburl = {https://dblp.org/rec/journals/tmlr/NussbaumMMD25.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}