ModernColBERT-embed-base-kd-only / README.md

NohTow

Update README.md

d6c8db5 verified about 15 hours ago

preview code

raw

history blame contribute delete

152 kB

metadata

tags:
  - ColBERT
  - PyLate
  - sentence-transformers
  - sentence-similarity
  - embeddings
  - retrieval
  - feature-extraction
  - generated_from_trainer
  - dataset_size:640000
  - loss:Distillation
pipeline_tag: sentence-similarity
library_name: PyLate
license: apache-2.0
language:
  - en
metrics:
  - MaxSim_accuracy@1
  - MaxSim_accuracy@3
  - MaxSim_accuracy@5
  - MaxSim_accuracy@10
  - MaxSim_precision@1
  - MaxSim_precision@3
  - MaxSim_precision@5
  - MaxSim_precision@10
  - MaxSim_recall@1
  - MaxSim_recall@3
  - MaxSim_recall@5
  - MaxSim_recall@10
  - MaxSim_ndcg@10
  - MaxSim_mrr@10
  - MaxSim_map@100
model-index:
  - name: PyLate
    results:
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoClimateFEVER
          type: NanoClimateFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.34
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.6
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.7
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.84
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.34
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.24666666666666667
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.19199999999999995
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.12799999999999997
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.18333333333333332
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.30333333333333334
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.3899999999999999
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.4933333333333333
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.4063363730066463
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.4916031746031746
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.3303819327927656
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoDBPedia
          type: NanoDBPedia
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.86
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.94
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.94
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.96
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.86
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.7199999999999999
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.66
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.5720000000000001
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.12659835318654536
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.21845761987893375
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.2938340415477099
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.4105335585789726
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.7283036112199561
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8991666666666666
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5925340100852293
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFEVER
          type: NanoFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.94
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 1
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.94
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3666666666666666
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.21999999999999997
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.10999999999999999
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.8766666666666667
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.98
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.98
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.98
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.953933314347975
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.9633333333333333
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.9375757575757575
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFiQA2018
          type: NanoFiQA2018
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.5
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.72
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.74
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.76
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.5
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3466666666666666
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.24799999999999997
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.13599999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.2725793650793651
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.520904761904762
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.5646507936507936
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.5870079365079365
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5309299781460816
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6011904761904762
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.47808334745931363
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoHotpotQA
          type: NanoHotpotQA
        metrics:
          - type: MaxSim_accuracy@1
            value: 1
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 1
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 1
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.6
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.3679999999999999
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.18599999999999994
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.5
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.9
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.92
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.93
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.9222921452583728
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 1
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.8846838161838161
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoMSMARCO
          type: NanoMSMARCO
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.54
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.68
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.76
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.86
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.54
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.22666666666666666
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.15200000000000002
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.08599999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.54
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.68
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.76
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.86
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6888194232849568
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6348809523809523
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.6440971195471196
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNFCorpus
          type: NanoNFCorpus
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.54
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.64
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.72
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.74
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.54
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.42
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.38400000000000006
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.28600000000000003
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.04566162692796489
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.08179125516090964
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.12712273647364136
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.15300844432718616
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.37177379221071954
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6048333333333332
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.16751658822280646
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNQ
          type: NanoNQ
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.68
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.82
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.86
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.68
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.28
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.176
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.09799999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.64
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.77
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.8
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.87
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.7688812490759633
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7574126984126984
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.7299065569552858
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoQuoraRetrieval
          type: NanoQuoraRetrieval
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.98
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 1
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.98
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3999999999999999
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.256
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.13799999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.8706666666666666
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.9520000000000001
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.9726666666666667
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9966666666666666
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.981385502951296
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.99
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.9665185185185184
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSCIDOCS
          type: NanoSCIDOCS
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.46
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.68
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.76
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.88
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.46
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.35333333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.292
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.19599999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.09766666666666665
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.21766666666666665
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.2986666666666667
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.4006666666666666
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.3925816517049085
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5987380952380952
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.30497643441660005
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoArguAna
          type: NanoArguAna
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.24
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.58
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.7
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.88
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.24
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.19333333333333336
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.14
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.088
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.24
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.58
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.7
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.88
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.558015137444458
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.45589682539682536
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4586809163059163
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSciFact
          type: NanoSciFact
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.74
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.84
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.88
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.92
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.74
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.29333333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.19599999999999998
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.10199999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.715
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.81
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.87
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.91
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.8249697859180465
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.7959520905923344
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoTouche2020
          type: NanoTouche2020
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.7959183673469388
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.9387755102040817
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.9591836734693877
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.7959183673469388
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.7142857142857143
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.6653061224489795
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.5142857142857142
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.052193001619842895
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.14293338708352385
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.21678776156605578
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.3275908393694154
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5977067950547461
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8719630709426628
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.42650930096472894
            name: Maxsim Map@100
      - task:
          type: nano-beir
          name: Nano BEIR
        dataset:
          name: NanoBEIR mean
          type: NanoBEIR_mean
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.6627629513343799
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.8029827315541601
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.8476295133437991
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9030769230769231
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.6627629513343799
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3969963369963369
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.3037927786499215
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.20309890109890105
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.39695120616515783
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.5505451556944715
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.6072098974285796
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.6768313419577059
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6712252892018559
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.743770663576786
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5936474145861687
            name: Maxsim Map@100

📄 Paper | 📝 Blog | 📚 Collection

ColBERT-Zero

🎯 TL;DR: First large-scale fully pre-trained ColBERT model using only public data. Achieves 55.43 nDCG@10 on BEIR benchmark, outperforming GTE-ModernColBERT and GTE-ModernBERT trained on closed and stronger data. New SOTA on BEIR for models <150M parameters.

Why ColBERT-Zero?

Late interaction (ColBERT / multi-vector) models have clear advantages in out-of-domain generalization, long-context handling, and reasoning-intensive retrieval. Yet they remain undertrained: current state-of-the-art ColBERT models (e.g., GTE-ModernColBERT and ColBERT-small) are simply built by bolting a small knowledge distillation step onto a strong dense (single-vector) model. Even recent efforts like mxbai-edge-colbert-v0 perform all early training stages in a single-vector setting, only switching to the multi-vector objective at the very end.

This leaves a lot of performance on the table. ColBERT-Zero demonstrates that performing contrastive pre-training directly in the multi-vector setting, rather than treating it as an afterthought, unlocks a significantly higher performance ceiling. Trained exclusively on public data (Nomic-embed dataset mixture), ColBERT-Zero overcomes a 2.4-point data quality disadvantage to outperform models trained on proprietary, closed-source data. For detailed results, please have a look at our blogpost and the paper. All the models (including intermediate checkpoints) as well as the training code are released under an Apache 2.0 license.

Controlled Comparison Design

We deliberately trained on the public Nomic-embed data mixture for a strategic reason: Nomic has already trained a dense ModernBERT model (ModernBERT-embed) on this exact data. This lets us compare dense vs. multi-vector training with the same data, same base model (ModernBERT), and same pipeline. The only variable is whether the contrastive phases are performed in the dense or multi-vector setting.

This design reveals a striking result: the dense baseline trained on Nomic data scores 52.89, while the one trained on GTE's proprietary data scores 55.33: a 2.4-point data quality gap. Despite this disadvantage, ColBERT-Zero's full multi-vector pre-training pipeline closes and surpasses this gap, reaching 55.43 nDCG@10.

The Three-Phase Training Pipeline

The development followed a three-phase pipeline, each providing a different type of learning signal:

Phase 1 - Unsupervised Contrastive Pre-training

We began with the nomic-embed-unsupervised-data dataset. Using PyLate's GradCache implementation to scale per-GPU batch size without VRAM constraints, combined with cross-GPU gathering of representations, we reached effective batch sizes of ~16k, required for unsupervised training to produce plausible in-batch hard negatives. Unlike dense training, the multi-vector objective allows the encoder to learn fine-grained token importance from the very first phase.

Phase 2 - Supervised Contrastive Fine-tuning

We refined the model using the nomic-embed-supervised-data. This stage introduced mined hard negatives: documents that are superficially similar to the query but not actually relevant. This allows teaching the model to handle nuance by prioritizing specific keywords and contextual tokens most indicative of a true match.

Phase 3 - Knowledge Distillation (KD)

The final stage used the ms-marco-en-bge dataset. We leveraged a powerful Gemma-based model as a teacher, allowing our student models to learn to replicate complex reasoning scores via the efficient MaxSim operator.

Key Findings

1. The Standard Recipe Leaves Performance on the Table

The KD-only approach (the current industry standard) scores 54.09, lagging behind full pre-training by 1.3 points. A simple distillation step is insufficient for optimal multi-vector performance.

2. Supervised + KD Is the Efficiency Sweet Spot

By running a supervised contrastive step in the multi-vector setting before distillation, we reach 55.12 nDCG@10, closing most of the gap with the fully pre-trained model (55.43). This costs ~40 GH200-hours instead of ~408: roughly 10× cheaper for 99.4% of the performance.

3. Prompt Alignment Is Non-Negotiable

Nomic's base models are pre-trained with asymmetric prompts (search_query: and search_document:). While ColBERT has its own asymmetric mechanism via [Q] and [D] markers, we found:

Stripping pre-training prompts during fine-tuning causes significant performance degradation.
Adding prompts to a model not pre-trained with them also hurts performance.
Even with perfect alignment, prompts provide an intrinsic benefit: full ColBERT pre-training with prompts (55.43) vs. without prompts (54.61), no mismatch in either case, shows a meaningful 0.82-point gap.

Why do prompts help? Our leading hypothesis is that prompt tokens act as implicit query expansion: extra slots that don't carry specific meaning but let the model store global information about the sequence. The original ColBERT used [PAD] tokens for this purpose, but modern Flash Attention implementations broke this trick (masked tokens no longer produce usable embeddings). Explicit prompt tokens may be quietly re-enabling it.

Practical takeaway: Always align your prompts with the base model's pre-training setup. Misalignment is one of the easiest ways to silently lose performance. Note that this sensitivity decreases with stronger downstream fine-tuning: with enough training, the model can adapt to an initial mismatch.

Model Lineup

The Main Models (ColBERT-Zero)

ColBERT-Zero utilizes the full 3-phase pipeline with strict prompt alignment, achieving 55.43 nDCG@10 on BEIR, setting a new SOTA for models <150M parameters. We also provide ColBERT-Zero-noprompts, the same pipeline without asymmetric prompts, to study the impact of query expansion on multi-vector performance.

The cheap-to-train ones (ModernColBERT-embed-base)

These models represent the practical sweet spot. By skipping the expensive unsupervised phase, ModernColBERT-embed-base (Supervised + KD) achieves ~99% of the flagship's performance (55.12 vs. 55.43 nDCG@10) at only ~10% of the compute cost. For reference, ModernColBERT-embed-base-kd-only performs only the distillation step on a supervised dense base.

Intermediate Checkpoints

For researchers studying the incremental impact of each phase and prompt alignment, we release several ablation variants: ColBERT-Zero-supervised, ColBERT-Zero-unsupervised (and their -noprompts versions), and ModernColBERT-embed-base-supervised.

Full Performance on BEIR

Model	Avg	FiQA	NFCorpus	TREC-COVID	Touche	ArguAna	Quora	SCIDOCS	SciFact	NQ	ClimateFEVER	HotpotQA	DBPedia	CQADupstack	FEVER	MSMARCO
Baselines
ModernBERT-embed-unsupervised	47.05	42.53	35.33	68.44	18.58	48.82	88.63	19.83	72.30	46.32	22.97	60.00	37.97	42.40	67.39	34.23
ModernBERT-embed-supervised	52.89	40.59	33.40	84.15	31.91	48.96	88.85	18.59	69.63	62.15	35.67	67.11	41.50	42.08	87.35	41.47
GTE-ModernColBERT	54.67	45.28	37.93	83.59	31.23	48.51	86.61	19.06	76.34	61.80	30.62	77.32	48.03	41.00	87.44	45.32
gte-modernbert-base	55.33	48.81	36.44	81.95	21.68	72.68	88.55	21.29	77.40	57.62	37.74	69.47	41.79	42.63	91.03	40.90
KD from dense supervised
ModernColBERT-embed-base-kd-only	54.09	42.51	37.01	79.52	34.58	51.75	87.67	18.15	75.04	61.45	28.31	76.70	47.54	40.68	84.82	45.57
Supervised + KD from dense unsupervised
ModernColBERT-embed-base-supervised	50.72	40.09	35.56	71.12	25.53	44.27	86.96	18.19	73.78	58.89	32.95	71.49	43.23	42.55	70.51	45.72
ModernColBERT-embed-base	55.12	41.50	36.51	77.46	33.77	52.45	86.26	18.66	74.90	62.24	37.27	80.07	48.27	41.60	89.71	46.17
ColBERT-Zero
Unsupervised	51.44	45.38	36.88	67.82	22.59	51.53	87.78	22.30	76.76	58.80	24.24	68.29	43.16	45.76	81.58	38.78
Supervised	51.81	42.45	35.60	74.72	23.83	41.81	87.19	19.85	73.71	61.95	35.01	71.37	46.20	45.16	72.61	45.68
Distilled	55.43	42.62	37.28	78.69	36.13	53.07	85.24	19.88	76.50	61.66	35.72	79.41	47.48	41.34	90.59	45.80
ColBERT-Zero-noprompts
Unsupervised	51.70	45.31	34.72	73.55	23.26	52.56	88.15	22.63	76.10	59.18	24.24	66.66	42.61	45.56	81.88	39.15
Supervised	52.39	43.36	36.01	72.42	23.79	47.42	87.79	21.30	73.85	62.25	31.61	70.32	44.07	44.03	85.54	42.11
Distilled	54.61	43.14	36.60	78.60	36.36	49.49	88.05	19.13	76.42	61.73	32.70	76.99	47.69	40.21	85.97	46.01

Limitations & Discussion

Data-specific findings. We deliberately used the Nomic Embed data mixture for controlled comparison. Some observations (particularly around prompt sensitivity) may not generalize to different or stronger training configurations.
Scale vs. objective. The gains from multi-vector pre-training likely reflect more training time in the multi-vector setting, rather than the contrastive objective itself. Performing KD alone at a larger scale might yield similar or superior results due to the higher quality of the distillation signal. Our study uses the conventional setup where training scale is inversely proportional to signal quality, reflecting the higher cost of generating high-quality labels.
Prompt sensitivity decreases with stronger fine-tuning. When experimenting with stronger fine-tuning data (e.g., NV-Retriever), adding prompts on top of a model pre-trained without them did not degrade results the way it did with ColBERT-Zero. With enough downstream training, the model can adapt to an initial mismatch.

Serving at Scale

For production deployment of ColBERT-Zero and other multi-vector models, check out NextPlaid and FastPlaid, our production-grade engines for multi-vector retrieval.

Resources

📦 All checkpoints: HF Collection - every phase, with and without prompts
💻 Code: Training boilerplates
📄 Paper: ArXiv

Model Details

Model Description

Model Type: PyLate model
Document Length: 519 tokens
Query Length: 39 tokens
Output Dimensionality: 128 dimensions
Similarity Function: MaxSim
Training Dataset:
- train

Model Sources

Documentation: PyLate Documentation
Repository: PyLate on GitHub
Hugging Face: PyLate models on Hugging Face

Full Model Architecture

ColBERT(
  (0): Transformer({'max_seq_length': 518, 'do_lower_case': False, 'architecture': 'ModernBertModel'})
  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
)

Usage

First install the PyLate library:

pip install -U pylate

Prompt alignment is critical for ColBERT-Zero models. You must use prompt_name="query" when encoding queries and prompt_name="document" when encoding documents. ColBERT-Zero was pre-trained with asymmetric prompts (search_query: / search_document:), and stripping them causes significant performance degradation.

Retrieval

Use this model with PyLate to index and retrieve documents. The index uses FastPLAID for efficient similarity search.

Indexing documents

Load the ColBERT model and initialize the PLAID index, then encode and index your documents:

# `retrieve` is not used in this snippet; it is needed by the retrieval example further down.
from pylate import indexes, models, retrieve

# Step 1: Load the ColBERT model
# NOTE: "pylate_model_id" looks like a placeholder — replace it with the id or path of the model to load.
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

# Step 2: Initialize the PLAID index
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
    override=True,  # This overwrites the existing index if any
)

# Step 3: Encode the documents
# The ids and texts are presumably matched by position (Step 4 passes the ids alongside the embeddings).
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]

documents_embeddings = model.encode(
    documents,
    batch_size=32,
    is_query=False,  # Ensure that it is set to False to indicate that these are documents, not queries
    prompt_name="document", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
    documents_ids=documents_ids,
    documents_embeddings=documents_embeddings,
)

Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:

# To load an index, simply instantiate it with the correct folder/name and without overriding it
# (i.e. leave out the override=True argument that the indexing example passes to overwrite an existing index).
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
)

Retrieving top-k documents for queries

Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores:

[!WARNING] Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

# Step 1: Initialize the ColBERT retriever
# `index` and `model` are the objects created in the indexing example above.
retriever = retrieve.ColBERT(index=index)

# Step 2: Encode the queries
queries_embeddings = model.encode(
    ["query for document 3", "query for document 1"],
    batch_size=32,
    is_query=True,  # Ensure that it is set to True to indicate that these are queries, not documents
    prompt_name="query", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
    queries_embeddings=queries_embeddings,
    k=10,  # Retrieve the top 10 matches for each query
)

Reranking

Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the rank.rerank function and pass the queries and documents to rerank:

# Reranking example: no index is built; queries and candidate documents are encoded and passed to rank.rerank.
from pylate import rank, models

queries = [
    "query A",
    "query B",
]

# One inner list of candidate texts per query (two queries above, two inner lists here).
# Presumably the i-th inner list holds the candidates for the i-th query — confirm against the PyLate docs.
documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

# Ids mirroring the nesting of `documents`: each inner list has one id per candidate text.
documents_ids = [
    [1, 2],
    [1, 3, 2],
]

# NOTE: "pylate_model_id" looks like a placeholder — replace it with the id or path of the model to load.
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

queries_embeddings = model.encode(
    queries,
    is_query=True,
    prompt_name="query" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

documents_embeddings = model.encode(
    documents,
    is_query=False,
    prompt_name="document" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

# Rerank the candidates from their ids and the query/document embeddings computed above.
reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)

Evaluation

Metrics

Py Late Information Retrieval

Dataset: ['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']
Evaluated with pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator

Metric	NanoClimateFEVER	NanoDBPedia	NanoFEVER	NanoFiQA2018	NanoHotpotQA	NanoMSMARCO	NanoNFCorpus	NanoNQ	NanoQuoraRetrieval	NanoSCIDOCS	NanoArguAna	NanoSciFact	NanoTouche2020
MaxSim_accuracy@1	0.34	0.86	0.94	0.5	1.0	0.54	0.54	0.68	0.98	0.46	0.24	0.74	0.7959
MaxSim_accuracy@3	0.6	0.94	1.0	0.72	1.0	0.68	0.64	0.82	1.0	0.68	0.58	0.84	0.9388
MaxSim_accuracy@5	0.7	0.94	1.0	0.74	1.0	0.76	0.72	0.86	1.0	0.76	0.7	0.88	0.9592
MaxSim_accuracy@10	0.84	0.96	1.0	0.76	1.0	0.86	0.74	0.9	1.0	0.88	0.88	0.92	1.0
MaxSim_precision@1	0.34	0.86	0.94	0.5	1.0	0.54	0.54	0.68	0.98	0.46	0.24	0.74	0.7959
MaxSim_precision@3	0.2467	0.72	0.3667	0.3467	0.6	0.2267	0.42	0.28	0.4	0.3533	0.1933	0.2933	0.7143
MaxSim_precision@5	0.192	0.66	0.22	0.248	0.368	0.152	0.384	0.176	0.256	0.292	0.14	0.196	0.6653
MaxSim_precision@10	0.128	0.572	0.11	0.136	0.186	0.086	0.286	0.098	0.138	0.196	0.088	0.102	0.5143
MaxSim_recall@1	0.1833	0.1266	0.8767	0.2726	0.5	0.54	0.0457	0.64	0.8707	0.0977	0.24	0.715	0.0522
MaxSim_recall@3	0.3033	0.2185	0.98	0.5209	0.9	0.68	0.0818	0.77	0.952	0.2177	0.58	0.81	0.1429
MaxSim_recall@5	0.39	0.2938	0.98	0.5647	0.92	0.76	0.1271	0.8	0.9727	0.2987	0.7	0.87	0.2168
MaxSim_recall@10	0.4933	0.4105	0.98	0.587	0.93	0.86	0.153	0.87	0.9967	0.4007	0.88	0.91	0.3276
MaxSim_ndcg@10	0.4063	0.7283	0.9539	0.5309	0.9223	0.6888	0.3718	0.7689	0.9814	0.3926	0.558	0.825	0.5977
MaxSim_mrr@10	0.4916	0.8992	0.9633	0.6012	1.0	0.6349	0.6048	0.7574	0.99	0.5987	0.4559	0.8	0.872
MaxSim_map@100	0.3304	0.5925	0.9376	0.4781	0.8847	0.6441	0.1675	0.7299	0.9665	0.305	0.4587	0.796	0.4265

Nano BEIR

Dataset: NanoBEIR_mean
Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator

Metric	Value
MaxSim_accuracy@1	0.6628
MaxSim_accuracy@3	0.803
MaxSim_accuracy@5	0.8476
MaxSim_accuracy@10	0.9031
MaxSim_precision@1	0.6628
MaxSim_precision@3	0.397
MaxSim_precision@5	0.3038
MaxSim_precision@10	0.2031
MaxSim_recall@1	0.397
MaxSim_recall@3	0.5505
MaxSim_recall@5	0.6072
MaxSim_recall@10	0.6768
MaxSim_ndcg@10	0.6712
MaxSim_mrr@10	0.7438
MaxSim_map@100	0.5936

Training Details

Training Dataset

train

Dataset: train
Size: 640,000 training samples
Columns: query_id, document_ids, and scores
Approximate statistics based on the first 1000 samples:
query_id document_ids scores
type int list list
details
836: ~0.10%
3582: ~0.10%
4599: ~0.10%
...

size: 32 elements

	query_id	document_ids	scores
type	int	list	list
details	836: ~0.10% 3582: ~0.10% 4599: ~0.10% ...	size: 32 elements

Samples:

query_id	document_ids	scores
`685613`	`[7546874, 1176459, 197677, 2306318, 8541504, ...]`	`[0.9999999992804947, 0.24845418756716053, 0.7594154013647826, 0.26644182105618575, 0.390668914839766, ...]`
`237784`	`[6366584, 4034101, 2325374, 6914618, 6042146, ...]`	`[0.9999999991784339, 0.42233632827946693, 0.5956354295491569, 0.12644415907455164, 0.6636713730105909, ...]`
`904294`	`[448408, 8743975, 49600, 7339401, 2714261, ...]`	`[0.9999999991841937, 0.877629062381539, 0.8330146583389045, 0.3116634796692611, 0.4633524534142185, ...]`

Loss: pylate.losses.distillation.Distillation

Training Hyperparameters

Non-Default Hyperparameters

eval_strategy: steps
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
gradient_accumulation_steps: 2
learning_rate: 8e-05
num_train_epochs: 1.0
bf16: True
dataloader_num_workers: 4
ddp_find_unused_parameters: False

All Hyperparameters

Click to expand

overwrite_output_dir: False
do_predict: False
eval_strategy: steps
prediction_loss_only: True
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 2
eval_accumulation_steps: None
torch_empty_cache_steps: None
learning_rate: 8e-05
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 1.0
max_steps: -1
lr_scheduler_type: linear
lr_scheduler_kwargs: {}
warmup_ratio: 0.0
warmup_steps: 0
log_level: passive
log_level_replica: warning
log_on_each_node: True
logging_nan_inf_filter: True
save_safetensors: True
save_on_each_node: False
save_only_model: False
restore_callback_states_from_checkpoint: False
no_cuda: False
use_cpu: False
use_mps_device: False
seed: 42
data_seed: None
jit_mode_eval: False
use_ipex: False
bf16: True
fp16: False
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: False
fp16_full_eval: False
tf32: None
local_rank: 1
ddp_backend: None
tpu_num_cores: None
tpu_metrics_debug: False
debug: []
dataloader_drop_last: True
dataloader_num_workers: 4
dataloader_prefetch_factor: None
past_index: -1
disable_tqdm: False
remove_unused_columns: True
label_names: None
load_best_model_at_end: False
ignore_data_skip: False
fsdp: []
fsdp_min_num_params: 0
fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
fsdp_transformer_layer_cls_to_wrap: None
accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
deepspeed: None
label_smoothing_factor: 0.0
optim: adamw_torch
optim_args: None
adafactor: False
group_by_length: False
length_column_name: length
ddp_find_unused_parameters: False
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: False
dataloader_pin_memory: True
dataloader_persistent_workers: False
skip_memory_metrics: True
use_legacy_prediction_loop: False
push_to_hub: False
resume_from_checkpoint: None
hub_model_id: None
hub_strategy: every_save
hub_private_repo: None
hub_always_push: False
gradient_checkpointing: False
gradient_checkpointing_kwargs: None
include_inputs_for_metrics: False
include_for_metrics: []
eval_do_concat_batches: True
fp16_backend: auto
push_to_hub_model_id: None
push_to_hub_organization: None
mp_parameters:
auto_find_batch_size: False
full_determinism: False
torchdynamo: None
ray_scope: last
ddp_timeout: 1800
torch_compile: False
torch_compile_backend: None
torch_compile_mode: None
dispatch_batches: None
split_batches: None
include_tokens_per_second: False
include_num_input_tokens_seen: False
neftune_noise_alpha: None
optim_target_modules: None
batch_eval_metrics: False
eval_on_start: False
use_liger_kernel: False
eval_use_gather_object: False
average_tokens_across_devices: False
prompts: None
batch_sampler: batch_sampler
multi_dataset_batch_sampler: proportional
router_mapping: {}
learning_rate_mapping: {}

Training Logs

Click to expand

Epoch	Step	Training Loss	NanoClimateFEVER_MaxSim_ndcg@10	NanoDBPedia_MaxSim_ndcg@10	NanoFEVER_MaxSim_ndcg@10	NanoFiQA2018_MaxSim_ndcg@10	NanoHotpotQA_MaxSim_ndcg@10	NanoMSMARCO_MaxSim_ndcg@10	NanoNFCorpus_MaxSim_ndcg@10	NanoNQ_MaxSim_ndcg@10	NanoQuoraRetrieval_MaxSim_ndcg@10	NanoSCIDOCS_MaxSim_ndcg@10	NanoArguAna_MaxSim_ndcg@10	NanoSciFact_MaxSim_ndcg@10	NanoTouche2020_MaxSim_ndcg@10	NanoBEIR_mean_MaxSim_ndcg@10
0.0025	50	0.0259	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.0275	550	0.019	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.0525	1050	0.0168	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.075	1500	0.0152	0.3530	0.6921	0.9345	0.5514	0.9121	0.6905	0.3714	0.7376	0.9617	0.3922	0.5317	0.7828	0.6200	0.6562
0.0775	1550	0.0147	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1025	2050	0.0147	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1275	2550	0.0141	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.15	3000	0.0135	0.3818	0.7111	0.9468	0.5737	0.8930	0.6751	0.3876	0.7329	0.9854	0.4004	0.5359	0.8012	0.6286	0.6656
0.1525	3050	0.0134	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1775	3550	0.0132	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2025	4050	0.0128	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.225	4500	0.0124	0.3586	0.7030	0.9472	0.5690	0.9114	0.6772	0.3946	0.7497	0.9750	0.3953	0.5223	0.8098	0.6158	0.6638
0.2275	4550	0.0122	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2525	5050	0.0123	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2775	5550	0.0118	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3	6000	0.0119	0.3992	0.7115	0.9573	0.5612	0.9038	0.6984	0.3952	0.7582	0.9719	0.4023	0.5235	0.7987	0.6036	0.6681
0.3025	6050	0.0119	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3275	6550	0.0116	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3525	7050	0.0111	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.375	7500	0.0116	0.3920	0.7276	0.9523	0.5745	0.8960	0.6956	0.3928	0.7349	0.9779	0.3998	0.5397	0.8058	0.6309	0.6707
0.3775	7550	0.011	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4025	8050	0.0107	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4275	8550	0.0109	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.45	9000	0.0103	0.4048	0.7211	0.9469	0.5466	0.9155	0.6889	0.3713	0.7401	0.9806	0.4074	0.5507	0.8158	0.6125	0.6694
0.4525	9050	0.0107	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4775	9550	0.0105	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5025	10050	0.01	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.525	10500	0.0103	0.3875	0.7301	0.9445	0.5466	0.9113	0.6969	0.3752	0.7625	0.9795	0.4017	0.5424	0.8207	0.6067	0.6697
0.5275	10550	0.0103	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5525	11050	0.0099	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5775	11550	0.0096	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6	12000	0.0098	0.4020	0.7211	0.9432	0.5410	0.9181	0.6831	0.3709	0.7479	0.9812	0.4049	0.5593	0.8293	0.5912	0.6687
0.6025	12050	0.0098	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6275	12550	0.0095	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6525	13050	0.0097	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.675	13500	0.0093	0.4119	0.7163	0.9522	0.5415	0.9182	0.7049	0.3714	0.7810	0.9827	0.3945	0.5462	0.8200	0.6176	0.6737
0.6775	13550	0.0095	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7025	14050	0.0094	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7275	14550	0.0092	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.75	15000	0.0092	0.4103	0.7132	0.9539	0.5326	0.9156	0.6947	0.3594	0.7590	0.9807	0.4009	0.5490	0.8321	0.6047	0.6697
0.7525	15050	0.0091	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7775	15550	0.009	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8025	16050	0.0084	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.825	16500	0.009	0.4041	0.7143	0.9555	0.5575	0.9165	0.6968	0.3698	0.7769	0.9812	0.3994	0.5557	0.8195	0.6004	0.6729
0.8275	16550	0.0086	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8525	17050	0.0085	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8775	17550	0.0086	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9	18000	0.0086	0.4059	0.7210	0.9539	0.5391	0.9160	0.6962	0.3722	0.7770	0.9831	0.3985	0.5489	0.8330	0.6031	0.6729
0.9025	18050	0.0088	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9275	18550	0.0085	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9525	19050	0.0083	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.975	19500	0.0086	0.4063	0.7283	0.9539	0.5309	0.9223	0.6888	0.3718	0.7689	0.9814	0.3926	0.5580	0.8250	0.5977	0.6712
0.9775	19550	0.0087	-	-	-	-	-	-	-	-	-	-	-	-	-	-

Framework Versions

Python: 3.13.0
Sentence Transformers: 5.1.1
PyLate: 1.3.4
Transformers: 4.48.3
PyTorch: 2.6.0
Accelerate: 1.12.0
Datasets: 4.4.1
Tokenizers: 0.21.0

Citation

BibTeX

ColBERT-Zero

@misc{chaffin2026colbertzeropretrainpretraincolbert,
  title         = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models}, 
  author        = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
  year          = {2026},
  eprint        = {2602.16609},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2602.16609}, 
}

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}

PyLate

@inproceedings{DBLP:conf/cikm/ChaffinS25,
  author       = {Antoine Chaffin and
                  Rapha{\"{e}}l Sourty},
  editor       = {Meeyoung Cha and
                  Chanyoung Park and
                  Noseong Park and
                  Carl Yang and
                  Senjuti Basu Roy and
                  Jessie Li and
                  Jaap Kamps and
                  Kijung Shin and
                  Bryan Hooi and
                  Lifang He},
  title        = {PyLate: Flexible Training and Retrieval for Late Interaction Models},
  booktitle    = {Proceedings of the 34th {ACM} International Conference on Information
                  and Knowledge Management, {CIKM} 2025, Seoul, Republic of Korea, November
                  10-14, 2025},
  pages        = {6334--6339},
  publisher    = {{ACM}},
  year         = {2025},
  url          = {https://github.com/lightonai/pylate},
  doi          = {10.1145/3746252.3761608},
}

Nomic Embed

@article{DBLP:journals/tmlr/NussbaumMMD25,
  author       = {Zach Nussbaum and
                  John Xavier Morris and
                  Andriy Mulyar and
                  Brandon Duderstadt},
  title        = {Nomic Embed: Training a Reproducible Long Context Text Embedder},
  journal      = {Trans. Mach. Learn. Res.},
  volume       = {2025},
  year         = {2025},
  url          = {https://openreview.net/forum?id=IPmzyQSiQE},
  timestamp    = {Fri, 20 Jun 2025 14:19:48 +0200},
  biburl       = {https://dblp.org/rec/journals/tmlr/NussbaumMMD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}