ColBERT-Zero-unsupervised / README.md

NohTow

Update README.md

8b55b6e verified about 19 hours ago

preview code

raw

history blame contribute delete

243 kB

metadata

tags:
  - ColBERT
  - PyLate
  - sentence-transformers
  - sentence-similarity
  - embeddings
  - retrieval
  - feature-extraction
  - generated_from_trainer
  - dataset_size:238998494
  - loss:CachedContrastive
pipeline_tag: sentence-similarity
library_name: PyLate
license: apache-2.0
language:
  - en
metrics:
  - MaxSim_accuracy@1
  - MaxSim_accuracy@3
  - MaxSim_accuracy@5
  - MaxSim_accuracy@10
  - MaxSim_precision@1
  - MaxSim_precision@3
  - MaxSim_precision@5
  - MaxSim_precision@10
  - MaxSim_recall@1
  - MaxSim_recall@3
  - MaxSim_recall@5
  - MaxSim_recall@10
  - MaxSim_ndcg@10
  - MaxSim_mrr@10
  - MaxSim_map@100
model-index:
  - name: PyLate
    results:
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoClimateFEVER
          type: NanoClimateFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.42
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.62
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.64
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.76
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.42
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.22666666666666666
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.14400000000000002
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.092
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.20566666666666664
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.28400000000000003
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.29733333333333334
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.374
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.3518000478336987
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5177460317460317
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.2944493241561189
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoDBPedia
          type: NanoDBPedia
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.8
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.96
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.98
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.8
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.6933333333333335
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.604
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.518
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.10547467061354297
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.21001141632312567
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.2682250276346291
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.3822739230347477
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6607934403680658
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.8741666666666668
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5341191871132479
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFEVER
          type: NanoFEVER
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.9
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.96
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.9
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3399999999999999
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.21599999999999994
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.10799999999999997
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.8366666666666667
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.9233333333333333
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.9733333333333333
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9733333333333333
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.9268221917930667
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.9356666666666666
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.9016938568070643
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoFiQA2018
          type: NanoFiQA2018
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.48
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.7
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.74
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.8
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.48
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.33333333333333326
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.23999999999999996
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.142
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.27124603174603173
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.46518253968253975
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.5227619047619048
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.6102857142857143
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5354658437477728
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6044444444444445
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4731164376512747
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoHotpotQA
          type: NanoHotpotQA
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.88
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.98
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.98
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.88
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.56
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.344
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.17799999999999996
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.44
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.84
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.86
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.89
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.8601880205101629
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.9325
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.8158855692500271
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoMSMARCO
          type: NanoMSMARCO
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.44
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.7
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.74
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.44
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2333333333333333
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.14800000000000002
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.08999999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.44
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.7
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.74
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6667909811661161
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5937222222222223
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5994919639747226
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNFCorpus
          type: NanoNFCorpus
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.4
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.64
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.7
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.74
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.4
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.39333333333333337
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.36
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.294
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.023087598529427374
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.06761742851719367
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.10857051887512778
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.14233415018080223
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.34210971556529485
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.5169999999999999
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.15131881345299422
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoNQ
          type: NanoNQ
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.58
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.78
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.86
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.92
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.58
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.26
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.17999999999999997
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.09799999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.54
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.72
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.81
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.88
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.7232616852802778
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.6988333333333333
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.6638409439247397
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoQuoraRetrieval
          type: NanoQuoraRetrieval
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.96
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 1
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 1
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.96
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.41999999999999993
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.26399999999999996
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.13599999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.8373333333333334
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.972
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.9833333333333333
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9933333333333334
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.9747100090686657
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.98
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.9621719576719576
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSCIDOCS
          type: NanoSCIDOCS
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.54
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.7
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.84
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.92
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.54
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3666666666666666
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.32799999999999996
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.214
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.11366666666666667
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.22666666666666666
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.33666666666666656
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.43866666666666665
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.4295804160884494
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.656079365079365
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.3329808800019895
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoArguAna
          type: NanoArguAna
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.26
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.64
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.82
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.26
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.21333333333333335
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.16399999999999998
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.09
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.26
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.64
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.82
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.9
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5817359990817483
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.4781904761904761
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.4820919913419914
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoSciFact
          type: NanoSciFact
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.68
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.82
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.86
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.92
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.68
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.2866666666666667
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.18799999999999997
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.10399999999999998
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.655
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.79
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.845
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.92
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.800311389775704
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7665793650793651
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.7584845013477088
            name: Maxsim Map@100
      - task:
          type: py-late-information-retrieval
          name: Py Late Information Retrieval
        dataset:
          name: NanoTouche2020
          type: NanoTouche2020
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.5714285714285714
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.8775510204081632
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.9183673469387755
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 1
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.5714285714285714
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.5918367346938774
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.5877551020408164
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.4836734693877551
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.03907914418061841
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.1204002709275123
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.19544619521998122
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.31191053266167984
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.5345004700502356
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7458292840945903
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.3808192643084636
            name: Maxsim Map@100
      - task:
          type: nano-beir
          name: Nano BEIR
        dataset:
          name: NanoBEIR mean
          type: NanoBEIR_mean
        metrics:
          - type: MaxSim_accuracy@1
            value: 0.6085714285714284
            name: Maxsim Accuracy@1
          - type: MaxSim_accuracy@3
            value: 0.7982731554160125
            name: Maxsim Accuracy@3
          - type: MaxSim_accuracy@5
            value: 0.8521821036106751
            name: Maxsim Accuracy@5
          - type: MaxSim_accuracy@10
            value: 0.9123076923076924
            name: Maxsim Accuracy@10
          - type: MaxSim_precision@1
            value: 0.6085714285714284
            name: Maxsim Precision@1
          - type: MaxSim_precision@3
            value: 0.3783464154892726
            name: Maxsim Precision@3
          - type: MaxSim_precision@5
            value: 0.28982731554160124
            name: Maxsim Precision@5
          - type: MaxSim_precision@10
            value: 0.1959748822605965
            name: Maxsim Precision@10
          - type: MaxSim_recall@1
            value: 0.36670929064638114
            name: Maxsim Recall@1
          - type: MaxSim_recall@3
            value: 0.5353239734961824
            name: Maxsim Recall@3
          - type: MaxSim_recall@5
            value: 0.5969746394737162
            name: Maxsim Recall@5
          - type: MaxSim_recall@10
            value: 0.6704721271920213
            name: Maxsim Recall@10
          - type: MaxSim_ndcg@10
            value: 0.6452361700253275
            name: Maxsim Ndcg@10
          - type: MaxSim_mrr@10
            value: 0.7154429119633202
            name: Maxsim Mrr@10
          - type: MaxSim_map@100
            value: 0.5654203608463307
            name: Maxsim Map@100

📄 Paper | 📝 Blog | 📚 Collection

ColBERT-Zero

🎯 TL;DR: First large-scale fully pre-trained ColBERT model using only public data. Achieves 55.43 nDCG@10 on the BEIR benchmark, outperforming GTE-ModernColBERT and GTE-ModernBERT trained on closed and stronger data. New SOTA on BEIR for models <150M parameters.

Why ColBERT-Zero?

Late interaction (ColBERT / multi-vector) models have clear advantages in out-of-domain generalization, long-context handling, and reasoning-intensive retrieval. Yet they remain undertrained: current state-of-the-art ColBERT models (e.g., GTE-ModernColBERT and ColBERT-small) are simply built by bolting a small knowledge distillation step onto a strong dense (single-vector) model. Even recent efforts like mxbai-edge-colbert-v0 perform all early training stages in a single-vector setting, only switching to the multi-vector objective at the very end.

This leaves a lot of performance on the table. ColBERT-Zero demonstrates that performing contrastive pre-training directly in the multi-vector setting, rather than treating it as an afterthought, unlocks a significantly higher performance ceiling. Trained exclusively on public data (Nomic-embed dataset mixture), ColBERT-Zero overcomes a 2.4-point data quality disadvantage to outperform models trained on proprietary, closed-source data. For detailed results, please have a look at our blogpost and the paper. All the models (including intermediate checkpoints) as well as the training code are released under an Apache 2.0 license.

Controlled Comparison Design

We deliberately trained on the public Nomic-embed data mixture for a strategic reason: Nomic has already trained a dense ModernBERT model (ModernBERT-embed) on this exact data. This lets us compare dense vs. multi-vector training with the same data, same base model (ModernBERT), and same pipeline. The only variable is whether the contrastive phases are performed in the dense or multi-vector setting.

This design reveals a striking result: the dense baseline trained on Nomic data scores 52.89, while the one trained on GTE's proprietary data scores 55.33: a 2.4-point data quality gap. Despite this disadvantage, ColBERT-Zero's full multi-vector pre-training pipeline closes and surpasses this gap, reaching 55.43 nDCG@10.

The Three-Phase Training Pipeline

The development followed a three-phase pipeline, each providing a different type of learning signal:

Phase 1 - Unsupervised Contrastive Pre-training

We began with the nomic-embed-unsupervised-data dataset. Using PyLate's GradCache implementation to scale per-GPU batch size without VRAM constraints, combined with cross-GPU gathering of representations, we reached effective batch sizes of ~16k. Batches of this size are required for unsupervised training to produce plausible in-batch hard negatives. Unlike dense training, the multi-vector objective allows the encoder to learn fine-grained token importance from the very first phase.

Phase 2 - Supervised Contrastive Fine-tuning

We refined the model using the nomic-embed-supervised-data dataset. This stage introduced mined hard negatives: documents that are superficially similar to the query but not actually relevant. This teaches the model to handle nuance by prioritizing the specific keywords and contextual tokens most indicative of a true match.

Phase 3 - Knowledge Distillation (KD)

The final stage used the ms-marco-en-bge dataset. We leveraged a powerful Gemma-based model as a teacher, allowing our student models to learn to replicate complex reasoning scores via the efficient MaxSim operator.

Key Findings

1. The Standard Recipe Leaves Performance on the Table

The KD-only approach (the current industry standard) scores 54.09, lagging behind full pre-training by 1.3 points. A simple distillation step is insufficient for optimal multi-vector performance.

2. Supervised + KD Is the Efficiency Sweet Spot

By running a supervised contrastive step in the multi-vector setting before distillation, we reach 55.12 nDCG@10, closing most of the gap with the fully pre-trained model (55.43). This costs ~40 GH200-hours instead of ~408: roughly 10× cheaper for 99.4% of the performance.

3. Prompt Alignment Is Non-Negotiable

Nomic's base models are pre-trained with asymmetric prompts (search_query: and search_document:). While ColBERT has its own asymmetric mechanism via [Q] and [D] markers, we found:

- Stripping pre-training prompts during fine-tuning causes significant performance degradation.
- Adding prompts to a model not pre-trained with them also hurts performance.
- Even with perfect alignment, prompts provide an intrinsic benefit: full ColBERT pre-training with prompts (55.43) vs. without prompts (54.61), no mismatch in either case, shows a meaningful 0.82-point gap.

Why do prompts help? Our leading hypothesis is that prompt tokens act as implicit query expansion: extra slots that don't carry specific meaning but let the model store global information about the sequence. The original ColBERT used [PAD] tokens for this purpose, but modern Flash Attention implementations broke this trick (masked tokens no longer produce usable embeddings). Explicit prompt tokens may be quietly re-enabling it.

Practical takeaway: Always align your prompts with the base model's pre-training setup. Misalignment is one of the easiest ways to silently lose performance. Note that this sensitivity decreases with stronger downstream fine-tuning: with enough training, the model can adapt to an initial mismatch.

Model Lineup

The Main Models (ColBERT-Zero)

ColBERT-Zero utilizes the full 3-phase pipeline with strict prompt alignment, achieving 55.43 nDCG@10 on BEIR, setting a new SOTA for models <150M parameters. We also provide ColBERT-Zero-noprompts, the same pipeline without asymmetric prompts, to study the impact of query expansion on multi-vector performance.

The cheap-to-train ones (ModernColBERT-embed-base)

These models represent the practical sweet spot. By skipping the expensive unsupervised phase, ModernColBERT-embed-base (Supervised + KD) achieves ~99% of the flagship's performance (55.12 vs. 55.43 nDCG@10) at only ~10% of the compute cost. For reference, ModernColBERT-embed-base-kd performs only the distillation step on a supervised dense base.

Intermediate Checkpoints

For researchers studying the incremental impact of each phase and prompt alignment, we release several ablation variants: ColBERT-Zero-supervised, ColBERT-Zero-unsupervised (and their -noprompts versions), and ModernColBERT-embed-base-supervised.

Full Performance on BEIR

Model	Avg	FiQA	NFCorpus	TREC-COVID	Touche	ArguAna	Quora	SCIDOCS	SciFact	NQ	ClimateFEVER	HotpotQA	DBPedia	CQADupstack	FEVER	MSMARCO
Baselines
ModernBERT-embed-unsupervised	47.05	42.53	35.33	68.44	18.58	48.82	88.63	19.83	72.30	46.32	22.97	60.00	37.97	42.40	67.39	34.23
ModernBERT-embed-supervised	52.89	40.59	33.40	84.15	31.91	48.96	88.85	18.59	69.63	62.15	35.67	67.11	41.50	42.08	87.35	41.47
GTE-ModernColBERT	54.67	45.28	37.93	83.59	31.23	48.51	86.61	19.06	76.34	61.80	30.62	77.32	48.03	41.00	87.44	45.32
gte-modernbert-base	55.33	48.81	36.44	81.95	21.68	72.68	88.55	21.29	77.40	57.62	37.74	69.47	41.79	42.63	91.03	40.90
KD from dense supervised
ModernColBERT-embed-base-kd-only	54.09	42.51	37.01	79.52	34.58	51.75	87.67	18.15	75.04	61.45	28.31	76.70	47.54	40.68	84.82	45.57
Supervised + KD from dense unsupervised
ModernColBERT-embed-base-supervised	50.72	40.09	35.56	71.12	25.53	44.27	86.96	18.19	73.78	58.89	32.95	71.49	43.23	42.55	70.51	45.72
ModernColBERT-embed-base	55.12	41.50	36.51	77.46	33.77	52.45	86.26	18.66	74.90	62.24	37.27	80.07	48.27	41.60	89.71	46.17
ColBERT-Zero
Unsupervised	51.44	45.38	36.88	67.82	22.59	51.53	87.78	22.30	76.76	58.80	24.24	68.29	43.16	45.76	81.58	38.78
Supervised	51.81	42.45	35.60	74.72	23.83	41.81	87.19	19.85	73.71	61.95	35.01	71.37	46.20	45.16	72.61	45.68
Distilled	55.43	42.62	37.28	78.69	36.13	53.07	85.24	19.88	76.50	61.66	35.72	79.41	47.48	41.34	90.59	45.80
ColBERT-Zero-noprompts
Unsupervised	51.70	45.31	34.72	73.55	23.26	52.56	88.15	22.63	76.10	59.18	24.24	66.66	42.61	45.56	81.88	39.15
Supervised	52.39	43.36	36.01	72.42	23.79	47.42	87.79	21.30	73.85	62.25	31.61	70.32	44.07	44.03	85.54	42.11
Distilled	54.61	43.14	36.60	78.60	36.36	49.49	88.05	19.13	76.42	61.73	32.70	76.99	47.69	40.21	85.97	46.01

Limitations & Discussion

Data-specific findings. We deliberately used the Nomic Embed data mixture for controlled comparison. Some observations (particularly around prompt sensitivity) may not generalize to different or stronger training configurations.
Scale vs. objective. The gains from multi-vector pre-training likely reflect more training time in the multi-vector setting, rather than the contrastive objective itself. Performing KD alone at a larger scale might yield similar or superior results due to the higher quality of the distillation signal. Our study uses the conventional setup where training scale is inversely proportional to signal quality, reflecting the higher cost of generating high-quality labels.
Prompt sensitivity decreases with stronger fine-tuning. When experimenting with stronger fine-tuning data (e.g., NV-Retriever), adding prompts on top of a model pre-trained without them did not degrade results the way it did with ColBERT-Zero. With enough downstream training, the model can adapt to an initial mismatch.

Serving at Scale

For production deployment of ColBERT-Zero and other multi-vector models, check out NextPlaid and FastPlaid, our production-grade engines for multi-vector retrieval.

Resources

📦 All checkpoints: HF Collection - every phase, with and without prompts
💻 Code: Training boilerplates
📄 Paper: ArXiv

Model Details

Model Description

Model Type: PyLate model
Document Length: 187 tokens
Query Length: 39 tokens
Output Dimensionality: 128 dimensions
Similarity Function: MaxSim
Training Datasets:
- reddit_title_body
- amazon_reviews
- paq
- s2orc_citation_titles
- s2orc_title_abstract
- s2orc_abstract_citation
- s2orc_abstract_body
- wikianswers
- wikipedia
- gooaq
- codesearch
- yahoo_title_answer
- agnews
- amazonqa
- yahoo_qa
- yahoo_title_question
- ccnews
- npr
- eli5
- cnn
- stackexchange_duplicate_questions
- stackexchange_title_body
- stackexchange_body_body
- sentence_compression
- wikihow
- altlex
- quora
- simplewiki
- squad

Model Sources

Documentation: PyLate Documentation
Repository: PyLate on GitHub
Hugging Face: PyLate models on Hugging Face

Full Model Architecture

ColBERT(
  (0): Transformer({'max_seq_length': 186, 'do_lower_case': False, 'architecture': 'ModernBertModel'})
  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
)

Usage

First install the PyLate library:

pip install -U pylate

Prompt alignment is critical for ColBERT-Zero models. You must use prompt_name="query" when encoding queries and prompt_name="document" when encoding documents. ColBERT-Zero was pre-trained with asymmetric prompts (search_query: / search_document:), and stripping them causes significant performance degradation.

Retrieval

Use this model with PyLate to index and retrieve documents. The index uses FastPLAID for efficient similarity search.

Indexing documents

Load the ColBERT model and initialize the PLAID index, then encode and index your documents:

from pylate import indexes, models, retrieve

# Step 1: Load the ColBERT model
model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

# Step 2: Initialize the PLAID index
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
    override=True,  # This overwrites the existing index if any
)

# Step 3: Encode the documents
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]

documents_embeddings = model.encode(
    documents,
    batch_size=32,
    is_query=False,  # Ensure that it is set to False to indicate that these are documents, not queries
    prompt_name="document", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
    documents_ids=documents_ids,
    documents_embeddings=documents_embeddings,
)

Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:

# To load an index, simply instantiate it with the correct folder/name and without overriding it
index = indexes.PLAID(
    index_folder="pylate-index",
    index_name="index",
)

Retrieving top-k documents for queries

Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores:

> [!WARNING]
> Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

# Step 1: Initialize the ColBERT retriever
retriever = retrieve.ColBERT(index=index)

# Step 2: Encode the queries
queries_embeddings = model.encode(
    ["query for document 3", "query for document 1"],
    batch_size=32,
    is_query=True,  # Ensure that it is set to True to indicate that these are queries
    prompt_name="query", # ⚠️ Required for ColBERT-Zero! Do not omit.
    show_progress_bar=True,
)

# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
    queries_embeddings=queries_embeddings,
    k=10,  # Retrieve the top 10 matches for each query
)

Reranking

Always pass prompt_name="query" for queries and prompt_name="document" for documents. Omitting these prompts will silently degrade retrieval quality.

If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the rank.rerank function and pass the queries and documents to rerank:

from pylate import rank, models

queries = [
    "query A",
    "query B",
]

documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

documents_ids = [
    [1, 2],
    [1, 3, 2],
]

model = models.ColBERT(
    model_name_or_path="pylate_model_id",
)

queries_embeddings = model.encode(
    queries,
    is_query=True,
    prompt_name="query" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

documents_embeddings = model.encode(
    documents,
    is_query=False,
    prompt_name="document" # ⚠️ Required for ColBERT-Zero! Do not omit.
)

reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)

Evaluation

Metrics

PyLate Information Retrieval

Dataset: ['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']
Evaluated with pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator

Metric	NanoClimateFEVER	NanoDBPedia	NanoFEVER	NanoFiQA2018	NanoHotpotQA	NanoMSMARCO	NanoNFCorpus	NanoNQ	NanoQuoraRetrieval	NanoSCIDOCS	NanoArguAna	NanoSciFact	NanoTouche2020
MaxSim_accuracy@1	0.42	0.8	0.9	0.48	0.88	0.44	0.4	0.58	0.96	0.54	0.26	0.68	0.5714
MaxSim_accuracy@3	0.62	0.96	0.96	0.7	0.98	0.7	0.64	0.78	1.0	0.7	0.64	0.82	0.8776
MaxSim_accuracy@5	0.64	0.98	1.0	0.74	0.98	0.74	0.7	0.86	1.0	0.84	0.82	0.86	0.9184
MaxSim_accuracy@10	0.76	1.0	1.0	0.8	1.0	0.9	0.74	0.92	1.0	0.92	0.9	0.92	1.0
MaxSim_precision@1	0.42	0.8	0.9	0.48	0.88	0.44	0.4	0.58	0.96	0.54	0.26	0.68	0.5714
MaxSim_precision@3	0.2267	0.6933	0.34	0.3333	0.56	0.2333	0.3933	0.26	0.42	0.3667	0.2133	0.2867	0.5918
MaxSim_precision@5	0.144	0.604	0.216	0.24	0.344	0.148	0.36	0.18	0.264	0.328	0.164	0.188	0.5878
MaxSim_precision@10	0.092	0.518	0.108	0.142	0.178	0.09	0.294	0.098	0.136	0.214	0.09	0.104	0.4837
MaxSim_recall@1	0.2057	0.1055	0.8367	0.2712	0.44	0.44	0.0231	0.54	0.8373	0.1137	0.26	0.655	0.0391
MaxSim_recall@3	0.284	0.21	0.9233	0.4652	0.84	0.7	0.0676	0.72	0.972	0.2267	0.64	0.79	0.1204
MaxSim_recall@5	0.2973	0.2682	0.9733	0.5228	0.86	0.74	0.1086	0.81	0.9833	0.3367	0.82	0.845	0.1954
MaxSim_recall@10	0.374	0.3823	0.9733	0.6103	0.89	0.9	0.1423	0.88	0.9933	0.4387	0.9	0.92	0.3119
MaxSim_ndcg@10	0.3518	0.6608	0.9268	0.5355	0.8602	0.6668	0.3421	0.7233	0.9747	0.4296	0.5817	0.8003	0.5345
MaxSim_mrr@10	0.5177	0.8742	0.9357	0.6044	0.9325	0.5937	0.517	0.6988	0.98	0.6561	0.4782	0.7666	0.7458
MaxSim_map@100	0.2944	0.5341	0.9017	0.4731	0.8159	0.5995	0.1513	0.6638	0.9622	0.333	0.4821	0.7585	0.3808

Nano BEIR

Dataset: NanoBEIR_mean
Evaluated with pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator

Metric	Value
MaxSim_accuracy@1	0.6086
MaxSim_accuracy@3	0.7983
MaxSim_accuracy@5	0.8522
MaxSim_accuracy@10	0.9123
MaxSim_precision@1	0.6086
MaxSim_precision@3	0.3783
MaxSim_precision@5	0.2898
MaxSim_precision@10	0.196
MaxSim_recall@1	0.3667
MaxSim_recall@3	0.5353
MaxSim_recall@5	0.597
MaxSim_recall@10	0.6705
MaxSim_ndcg@10	0.6452
MaxSim_mrr@10	0.7154
MaxSim_map@100	0.5654

Training Details

Training Datasets

reddit_title_body

Dataset: reddit_title_body
Size: 66,204,599 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 7 tokens mean: 18.38 tokens max: 39 tokens	min: 20 tokens mean: 38.42 tokens max: 39 tokens

Samples:

query	document
`Prospective UNCW transfer?`	`Hey Reddit, I am transferring from Florida State to hopefully UNCW this spring. What can you guys tell me about the school that would be helpful? Some background info: I am transferring due to the fact that the only thing to do at FSU is workout and drink (not much of a drinker). I am majoring in biology and have a 3.7 GPA. Anything that you feel is useful to know about the school is appreciated. Thanks guys.`
`Calling for another Meet-up! The force is strong.`	`The time has come. The pull to meet-up with other Jax Redditors is strong, my son. We must use the force and decide where to meet-up. Jax Jedi's do not succumb to the dark side of average places, go with your exceptional suggestions. Yoda say "Must is beer, I say. Welcome are all other suggestions, mmmmmm."`
`I see your Best Customer E-Mail Ever, and raise you my e-mail from an appreciative customer.`	A little background, I'm a software support tech for a medium-large software company, and usually provide support on our Live Chat feature. You know the one. After chatting with one pleasant customer, several times per day over several weeks, I nominated him/their company for Customer of the Month. When they recieve their "Thanks for being awesome" customer box, I get this email: Subject: Epic customer appreciation box Body: Guy and the dog with "Oh you!" face. I laughed, and laughed. Unsuspecting tech support is floored by internet humor from relatively normal customer.

Loss: pylate.losses.cached_contrastive.CachedContrastive

amazon_reviews

Dataset: amazon_reviews
Size: 39,357,860 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 6 tokens mean: 14.19 tokens max: 35 tokens	min: 11 tokens mean: 35.54 tokens max: 39 tokens

Samples:

query	document
`It works well but the headphone apparatus falls too deep ...`	`It works well but the headphone apparatus falls too deep for any of my headphones to work :( I wish there was an adaptor included to solve this problem`
`Very nice frame! Snaps open at the front like a ...`	`Very nice frame! Snaps open at the front like a real movie poster frame which I think is cool. It worked perfectly for a document I had that was this size, looks great with the green color it is.`
`The shoes look very good. Size wise`	`The shoes look very good. Size wise, they fit well. The ultimate test will be how they last and time will tell.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

paq

Dataset: paq
Size: 53,874,545 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 9 tokens mean: 14.67 tokens max: 23 tokens	min: 39 tokens mean: 39.0 tokens max: 39 tokens

Samples:

query	document
`how long does it take to complete the orbit of 70 ophiuchi`	70 Ophiuchi sequence dwarf of spectral type K0, while the secondary is an orange dwarf of spectral type K4. The two stars orbit each other at an average distance of 23.2 AUs. But since the orbit is highly elliptical (at e=0.499), the separation between the two varies from 11.4 to 34.8 AUs, with one orbit taking 83.38 years to complete. In 1855, William Stephen Jacob of the Madras Observatory claimed that the orbit of the binary showed an anomaly, and it was "highly probable" that there was a "planetary body in connection with this system". This is the first attempt to use
`who is the author of the switchman`	The Switchman The Switchman (Original title: El Guardagujas) is an existentialist short story by Mexican writer Juan José Arreola. The short story was originally published as a "confabulario", a word created in Spanish by Arreola, in 1952, in the collection "Confabulario and Other Inventions". It was republished ten years later along with other published works by Arreola at that time in the collection "El Confabulario total". The story revolves around a "stranger" who wishes to travel to the town of T. by train, but is quickly met by a "switchman" who tells him more and more fantastical stories about the
`what name is given to a narrow vertical aperture in a fortification through which an ar`	Arrowslit An arrowslit (often also referred to as an arrow loop, loophole or loop hole, and sometimes a balistarium) is a narrow vertical aperture in a fortification through which an archer can launch arrows. The interior walls behind an arrow loop are often cut away at an oblique angle so that the archer has a wide field of view and field of fire. Arrow slits come in a remarkable variety. A common and recognizable form is the cross, accommodating the use of both the longbow and the crossbow. The narrow vertical aperture permits the archer large degrees of freedom to

Loss: pylate.losses.cached_contrastive.CachedContrastive

s2orc_citation_titles

Dataset: s2orc_citation_titles
Size: 7,722,225 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 8 tokens mean: 22.77 tokens max: 39 tokens	min: 8 tokens mean: 22.55 tokens max: 39 tokens

Samples:

query	document
`Purulent pericarditis. Clinical considerations with reference to 26 cases.`	`Purulent Pericarditis: Report of 2 Cases and Review of the Literature`
`High-Resolution Controller Data Performance Measures for Optimizing Divergent Diamond Interchanges and Outcome Assessment for Drone Video`	`An Advanced Signal Phasing Scheme for Diverging Diamond Interchanges`
`Silurian subaqueous slide conglomerate, Addison, Maine`	`Bimodal Silurian and Lower Devonian volcanic rock assemblages in the Machias-Eastport area, Maine`

Loss: pylate.losses.cached_contrastive.CachedContrastive

s2orc_title_abstract

Dataset: s2orc_title_abstract
Size: 36,051,582 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 8 tokens mean: 20.86 tokens max: 39 tokens	min: 20 tokens mean: 38.72 tokens max: 39 tokens

Samples:

query	document
`2′–5′-Oligoadenylates (2–5A) As Mediators of Interferon Action. Synthesis and Biological Activity of New 2–5A Analogues`	Double-stranded RNA (dsRNA) is a potent inhibitor of protein synthesis in extracts of interferon-treated cells. One of the mechanisms that has been proposed to explain this inhibition of protein synthesis is by the 2–5A pathway (1). Interferon induces the synthesis of an enzyme, 2–5A synthetase, which upon activation by dsRNA generates 2–5A from ATP. This 2–5A activates a pre-existing endonuclease for cleavage of single-stranded RNA. The biological activity of 2–5A is rapidly lost due to cleavage of the 2′–5′ internucleotide bond by a specific 2′–5′ phosphodiesterase starting at the 3′-end. This rapid cleavage and the poor uptake of 2–5A in intact cells, the latter because of its ionic character, limit the potential of 2–5A as a useful approach to the treatment of virus infections or cancer.
`p-adic L-functions and Bernoulli Numbers`	`In this chapter we shall construct p-adic analogues of Dirichlet L-functions. Since the usual series for these functions do not converge p-adically, we must resort to another procedure. The values of ( L\left( {s,\chi } \right)) at negative integers are algebraic, hence may be regarded as lying in an extension of ( {\mathbb{Q}_p}). We therefore look for a p-adic function which agrees with ( L\left( {s,\chi } \right)) at the negative integers. With a few minor modifications, this is possible.`
`Wood Pile Structure of Three-Dimensional Photonic Crystal Band Gap Characteristics`	Based on the plane wave expansion method,wood pile structure three-dimensional photonic crystal band gap characteristics was studied.Silicon material for wood structure photonic crystals,the change in the structure of strip width and length,is obtained when the wood pile structure width of 5μm,7μm height is formed when the band gap structure of wide band gap width,in 0.2899—0.3804Hz,0.0905Hz.Change form wood pile structure in three-dimensional photonic crystal materials,get the germanium material wood structure shape three-dimensional photonic band gap structure in 0.2585—0.3500Hz,the band gap width of 0.0915Hz,band gap compared to silicon and silicon carbide material is wide.Conclusion for the preparation of three-dimensional photonic crystals provide reference.

Loss: pylate.losses.cached_contrastive.CachedContrastive

s2orc_abstract_citation

Dataset: s2orc_abstract_citation
Size: 7,639,890 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 28 tokens mean: 38.97 tokens max: 39 tokens	min: 24 tokens mean: 38.96 tokens max: 39 tokens

Samples:

query	document
Abstract Temperature modulated differential scanning calorimetry (TMd.s.c.) was applied in a study of syndiotactic polypropylene. The crystallites melt from their lateral surfaces only and the kinetics shows up in the imaginary part c ″ of the complex specific heat. Theoretical analysis predicts and experiments confirm that c ″ increases linearly with the underlying mean heating rate and the modulation period. Furthermore, it can be shown that c ″ is inversely related to the superheating effective during melting. Use of the relation yields for syndiotactic polypropylene values in agreement with direct measurements employing conventional d.s.c.	Crystal melting behavior of indium and isotactic polypropylene has been examined by differential scanning calorimetry of heat flux type in terms of the heating rate, (\beta ), dependence. The melting shows the dependence characterized by a power, (z), of the shift in peak temperature in proportion to (\beta ^{\text{z}}). The power, (z), differentiates the melting with and without superheating. For polymer crystal melting, intrinsic nature of the broad melting region with a fractional power, (z,\le,1/2), due to superheating of melting kinetics has been reconfirmed experimentally. On the other hand, the crystal melting of indium, which is supposed to proceed with negligible superheating, showed the shift in peak temperature with the power in the range of (1/2,\le,z \le,1), depending on sample mass, which is due to instrumental thermal lag predicted by the Mraw’s model consisting of lumped elements. The (\beta ) dependence is influenced by the thermal lag determined by ...
This article examines anti-racist strategies employed in Finnish children’s literature. The examples from four stories illustrate that certain physical characteristics and cultural markers can become strong signifiers of nationality, that is Finnishness. The characters in these stories have to cope with experiences of exclusion and loneliness before the people around them learn that difference and diversity do not change the fact that all humans are worth the same. However, the paper argues that the intended positive outcome of books with a strong anti-racist agenda threatens to be lost as heavily accentuated moral lessons often become counterproductive. The paper demonstrates some of the changes that have taken place in Finnish children’s literature during the past two decades and addresses significant cultural and societal issues that affect children’s everyday lives.	Abstract: In this article, representations of multiculturalism in Swedish and Finnish picturebooks are examined through the Forskolan Ravlyan and Tatu and Patu series. In the article, multiculturalism is understood and studied with an intersectional approach. This means considering sociocultural categorizations such as ethnicity, gender, nationality and disability to be meaningful to the existing social, political and economic structures of societies. These categorizations are seen to have the power to reproduce and circulate dominant discourses that effect the social inclusion and exclusion of certain groups of people. Thus, the social categories are examined as performative textual discourses, meaning that texts are acknowledged to be not only reflecting, but also creating social reality. Both series present diversity as an integrated part of the story by means of non-explicit multiculturalism. The analysis reveals that both series of books contain representations of diversity that c...
Rosai–Dorfman disease (RDD) is usually characterized by painless bilateral cervical lymphadenopathy associated with fever and leukocytosis. Although the disease may occur outside lymphnodes, manifestation of skeletal system occurs in less than 8% of cases. In addition, presentation of this disease in a purely skeletal form without lymph nodes involvement is extremely uncommon. This case report describes a 17-year-old female with a pure skeletal presentation of RDD in the fibula. Trocar biopsy was performed, and immunohistochemical staining using S100 and CD68 was done to confirm the diagnosis.	`We report a case of extranodal Rosai-Dorfman disease (RDD) (sinus histiocytosis with massive lymphadenopathy) presenting with a solitary active lesion of the femur.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

s2orc_abstract_body

Dataset: s2orc_abstract_body
Size: 6,550,431 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 9 tokens mean: 38.92 tokens max: 39 tokens	min: 39 tokens mean: 39.0 tokens max: 39 tokens

Samples:

query	document
One of the goals of the 5G Communication Automotive Research and innovation (5GCAR) project has been to evaluate and propose system architecture enhancements aiming at supporting the strict requirements of vehicle-to-everything (V2X) use cases. In this paper, we provide an overview of 3GPP 5G system architecture, which is used as a baseline architecture in the project, and we present the main architectural enhancements introduced by 5GCAR. The work of the project focused on the following categories: (i) end-to-end security, also including aspects of privacy; (ii) network orchestration and management; (iii) network procedures; (iv) edge computing enhancements; and (v) multi-connectivity cooperation. The enhancements introduced by 5GCAR to above-listed categories are discussed in this paper, while a more detailed analysis of some selected features is presented. Figure 2. Reference point representation of the 5G system architecture [6].The network functions repository function (NRF) is us...	Introduction The automotive sector is considered to be one of the most prominent verticals that will benefit from the capabilities of the upcoming 5G cellular networks [1,2]. Vehicular applications cover a wide range of use cases and thus a large set of associated requirements. Examples include very high data rates and timely service delivery, while also considering ultra-low communication latencies, just to mention a few. Complex scenarios where vehicles communicate among themselves and also with nearby road infrastructure, road users, clouds, etc.-also known as vehicle-to-everything (V2X) communications-will not only leverage 5G network but will play a key role in its design. The H2020 5G PPP Phase 2 project 5G Communication Automotive Research and innovation (5GCAR) [3] worked towards the definition of enhancements in terms of system architecture, security, and privacy, specifically targeting automotive applications. 
In particular, 5GCAR considered five different classes of use c...
The Queer History Walking Tour is an annually recurring event during Dublin's official Pride festivities. Created and led by the 'Godfather of Gay,' Tonie Walsh, the walks seek to extend stories from the Irish Queer Archive (IQA) into the everyday fabric of the city, contributing to a processual queering of Irish heteronormative histories. As an activist form of public pedagogy, the walking tour encourages a relational understanding of queer cultural heritage through mobile, embodied, and emotional interactions. This paper argues that the walking tour works as an anarchive that contributes to a growing, intersectional understanding of LGBTQ+ experiences and queer futures, facilitated by peripatetic practices. In response to pervasive cis-male homonormativity at Pride, Dr Mary McAuliffe, a queer feminist woman, is the latest tour guide who includes historical stories of lesbian women, trans people, and gay men. Through engaging with this diversity of historical experiences, guides signa...	Introduction Dublin Pride does not consist of one parade, but two. Every year, Dublin Pride includes a 'mini parade:' a free Queer History Walking Tour created and led by Tonie Walsh. As a founder of the Irish Queer Archive (IQA), co-founder of the Gay Community News (GCN) and long-time gay rights activist, Walsh is well known within the LGBTQ+ community in Ireland as the 'Godfather of Gay' (Mullally, 2018). The tour is highly popular and can draw up to 150 attendees, and sometimes includes collaborations with other historians with their own stories to tell. i The tour includes pausing alongside places of key significance in queer Irish history, be it a historical place that no longer materially exists, or one that has remained unchanged. This paper will draw on Walsh's walking tour to illustrate how walking tours generate a relational understanding of queer cultural heritage through mobile, embodied and emotional interactions with places and other queer people. 
I argue that, despit...
A definitive diagnosis of salivary gland tumors is extremely difficult to make without evaluating the entire tumor and conducting immunohistochemical examinations. In this study, we aimed to examine and compare the expression patterns of the tumor protein TP D52 family, including TPD52, TPD53, and TPD54, in salivary gland tumor cells by using immunohistochemical staining. Among over 30 benign and malignant salivary gland tumors with extensive and diverse morphological features and overlapping histological similarities, we selected Warthin s tumor and pleomorphic adenoma to represent benign salivary gland tumors and mucoepidermoid carcinoma to represent malignant ones. Tumor samples were fixed in 10 buffered formalin and embedded in paraffin. Then, immunohistochemical staining was performed using antibodies against TPD52, TPD53, and TPD54. Neither the benign salivary gland tumors nor mucoepidermoid carcinoma stained for TPD52. However, the intensity of TPD53 and TPD54 staining was found...	Introduction The salivary glands are exocrine organs that produce saliva and are complex tissues composed of ductal, acinar, myoepithelial, and basal cells 1 . Collectively called as luminal cells, ductal and acinar cells are present on the luminal side of the salivary duct system. Myoepithelial and basal cells are located on the basement membrane around the luminal cells and are thus called abluminal cells 2 . In general, 3 types of acini namely serous, mucinous, and mixed and ducts i.e., intercalated, striated, and excretory are found in the salivary glands. The acini and intercalated ducts are surrounded by myoepithelial cells, whereas the striated and excretory ducts are surrounded by basal cells 3 . Tumors of the salivary glands comprise less than 1 of all neoplasms in the body 4 ; however, there are more than 30 benign and malignant salivary gland tumors with extensive and diverse morphologies yet overlapping histological similarities 4 . 
Hence, it is extremely difficult to d...

Loss: pylate.losses.cached_contrastive.CachedContrastive

wikianswers

Dataset: wikianswers
Size: 10,087,503 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 7 tokens mean: 14.24 tokens max: 39 tokens	min: 7 tokens mean: 14.03 tokens max: 39 tokens

Samples:

query	document
`What is the average weight for a 4'11 14 year old girl?`	`What is the average weight for a 4' 9 14 year old girl?`
`The Fahrenheit temperature reading is 98 degrees on a hot summer day Wh is this reading on the Kelvin scale?`	`Fahrenheit temp 98 on hot summer day what is this reading on the kelvin scale?`
`What is the word for you in Japanese?`	`What word in japanese i loveyou?`

Loss: pylate.losses.cached_contrastive.CachedContrastive

wikipedia

Dataset: wikipedia
Size: 6,198,049 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
	query	document
type	string	string
details	min: 4 tokens mean: 8.86 tokens max: 28 tokens	min: 23 tokens mean: 38.92 tokens max: 39 tokens

Samples:

query	document
`Pristimantis lichenoides`	Pristimantis lichenoides (rana camuflada in Spanish) is a species of frogs in the family Craugastoridae. It is endemic to Colombia and is only known from the vicinity of its type locality near Samaná in the Caldas Department, on the eastern slope of the Cordillera Central (Colombian Andes). The specific name lichenoides refers to its lichen-like dorsal coloration as well as its habit of being plastered to rock surfaces, resembling lichens growing on rocks. Description Adult males measure and adult females in snout–vent length. The head is as wide as the body and wider than it is long. The snout is rounded in dorsal view but subtruncate in lateral view. The tympanum is small but visible, with its upper edge hidden by the thick supratympanic fold. The fingers have lateral keels and round terminal discs. The lateral keels of the toes coalesce as basal webbing; the toe discs are slightly smaller than those on the fingers. Dorsal skin bears granules. Dorsal coloration is dark green to pa...
`Askim station`	`Askim Station () is located at Askim, Norway on the Eastern Østfold Line. The railway station is served by the Oslo Commuter Rail line L22 from Oslo Central Station. The station was opened with the eastern line of Østfold Line in 1882. Railway stations in Askim Railway stations on the Østfold Line Railway stations opened in 1882 1882 establishments in Norway`
`Mildred Alango`	Mildred Akinyi "Milka" Alango (born 10 March 1989 in Mombasa) is a Kenyan taekwondo practitioner. Alango qualified for the women's 49 kg class at the 2008 Summer Olympics in Beijing, after winning the championship title from the African Qualification Tournament in Tripoli, Libya. She lost the preliminary match to China's Wu Jingyu, who was able to score seven points at the end of the game. Because her opponent advanced further into the final match, Alango took advantage of the repechage round by defeating Sweden's Hanna Zajc on the superiority rule, after the pair had tied 2–2. She progressed to the bronze medal match, but narrowly lost the medal to Venezuela's Dalia Contreras, with a sudden death score of 0–1. References External links NBC 2008 Olympics profile 1989 births Living people Kenyan female taekwondo practitioners Olympic taekwondo practitioners of Kenya Taekwondo practitioners at the 2008 Summer Olympics Sportspeople from Mombasa

Loss: pylate.losses.cached_contrastive.CachedContrastive

gooaq

Dataset: gooaq
Size: 1,281,138 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 9 tokens<br>mean: 12.49 tokens<br>max: 22 tokens | min: 14 tokens<br>mean: 38.0 tokens<br>max: 39 tokens |

Samples:

query	document
`what is psma pet ct scan?`	`A PSMA study, also called a ProstaScint® scan, is an imaging test to locate and determine the extent of prostate cancer. ... The study involves a special molecule called a monoclonal antibody developed in a laboratory and designed to bind to the prostate-specific membrane antigen on cancer cells.`
`how many calories do you burn walking up mount snowdon?`	`You will burn through around 2,000 calories climbing Snowdon.`
`ankara is the capital city of?`	`Ankara, formerly known as Angora, city, capital of Turkey, situated in the northwestern part of the country.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

codesearch

Dataset: codesearch
Size: 864,023 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 6 tokens<br>mean: 28.34 tokens<br>max: 39 tokens | min: 28 tokens<br>mean: 38.9 tokens<br>max: 39 tokens |

Samples:

query	document
`Similar to {@link #getOrCreateLocalTransaction(Transaction, boolean)} but with a custom global transaction factory.`	public LocalTransaction getOrCreateLocalTransaction(Transaction transaction, boolean implicitTransaction, Supplier gtxFactory) { LocalTransaction current = localTransactions.get(transaction); if (current == null) { if (!running) { // Assume that we wouldn't get this far if the cache was already stopped throw log.cacheIsStopping(cacheName); } GlobalTransaction tx = gtxFactory.get(); current = txFactory.newLocalTransaction(transaction, tx, implicitTransaction, currentTopologyId); if (trace) log.tracef("Created a new local transaction: %s", current); localTransactions.put(transaction, current); globalToLocalTransactions.put(current.getGlobalTransaction(), current); if (notifier.hasListener(TransactionRegistered.class)) { // TODO: this should be allowed to be async at some point CompletionStages.join(notifier.notifyTransactionRegistered(tx, ...
`// formatArgs converts the given args to pretty-printed, colorized strings.`	`func formatArgs(args ...interface{}) []string { formatted := make([]string, 0, len(args)) for _, a := range args { s := colorize(pretty.Sprint(a), cyan) formatted = append(formatted, s) } return formatted }`
`log request in history @access private @param $message string @return void @since 3.0 @package Gcs\Framework\Core\Engine`	`private function _setHistory($message) { $this->addError('URL : http://' . $this->request->env('HTTP_HOST') . $this->request->env('REQUEST_URI') . ' (' . $this->response->status() . ') / SRC "' . $this->request->src . '" / CONTROLLER "' . $this->request->controller . '" / ACTION "' . $this->request->action . '" / CACHE "' . $this->request->cache . '" / ORIGIN : ' . $this->request->env('HTTP_REFERER') . ' / IP : ' . $this->request->env('REMOTE_ADDR') . ' / ' . $message, 0, 0, 0, LOG_HISTORY); }`

Loss: pylate.losses.cached_contrastive.CachedContrastive

yahoo_title_answer

Dataset: yahoo_title_answer
Size: 276,726 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 7 tokens<br>mean: 18.44 tokens<br>max: 39 tokens | min: 6 tokens<br>mean: 35.58 tokens<br>max: 39 tokens |

Samples:

query	document
`Who to contact in the philippines to install Supersports SA cable Channel?`	`go to this web site www.dishtv.sa.com\n or ask ur cable operator\n\nor contact this number 0091234537835\n\n\n\ni hope this helps`
`What does "you're preaching to the choir" mean?`	`"preaching to the choir" means trying to make a point to someone who already agrees with your position. The analogy meaning that those in the choir are already familiar with the preaching... it's the others that likely need it.`
`Does anyone know a good site where i can find a detailed but simply explained explanation on why henry VIII?`	`Henry VIII and the break with Rome\nClick on the * words in the site to show:\n\nPower - "Henry had hoped to resolve the issue of who was to succeed him"\n\nMoney - "As well as his desire for the divorce, there was a strong financial incentive for Henry to deny the authority of the Pope"\n\nFaith - "Although Henry's reformation broke with the papacy, his own religious beliefs were orthodox"\n\nLove - "Henry was in love with Anne Boleyn"`

Loss: pylate.losses.cached_contrastive.CachedContrastive

agnews

Dataset: agnews
Size: 420,288 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 8 tokens<br>mean: 14.72 tokens<br>max: 39 tokens | min: 12 tokens<br>mean: 35.44 tokens<br>max: 39 tokens |

Samples:

query	document
`Italy coming out of Washington's shadow`	`Long considered something of a junior partner among Europe's elite nations, Italy is carving out a hefty role in world affairs. Rome is contributing the largest contingent to the U.N. peacekeeping force in Lebanon, has claimed a role in negotiations with Iran and is rallying European governments around the idea that Italy can form a counterweight to American might.`
`Iran, Europe Fail to Agree on Uranium Enrichment, IRNA Reports`	`Iran and Europe failed to reach an accord on Tehran's uranium enrichment program, the state-owned Iranian news agency said, increasing the chances the US may call for United Nations sanctions against the Islamic nation.`
`Omicidio DesirÃ©e, la Cassazione "La pena per Erra va inasprita"`	`La sentenza farÃ da apripista per la futura giurisprudenzaCon il nuovo processo a Milano, l'imputato rischia l'ergastolo Omicidio DesirÃ©e, la Cassazione "La pena per Erra va inasprita" Il nuovo processo si celebrerÃ all'Assise d'appello di Milano`

Loss: pylate.losses.cached_contrastive.CachedContrastive

amazonqa

Dataset: amazonqa
Size: 226,137 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 7 tokens<br>mean: 23.35 tokens<br>max: 39 tokens | min: 18 tokens<br>mean: 35.18 tokens<br>max: 39 tokens |

Samples:

query	document
`Wondering how people get the wrinkles out from the packaging? Iron or wash and hang damp,maybe?`	`I sprayed with water (misted it) then ironed it. Most wrinkles came out and what did not, eventually came out from the steam of the shower. Good- luck`
`Why is it that most of the Janome users previously owned Singer or Kenmore? Anything has to be better than either of those--so what's the real benefit of a Janome HD3000 vs a Pfaff?`	I can't tell you anything about Pfaff because I have never owned a Pfaff. When I bought the Janome HD3000 I was looking for a heavy duty sewing machine that would sew through layered heavy fabrics, such as denim, etc. I had a Singer at the time, and had always owned Singers, and had noticed that with each new Singer I bought, the quality was less than the previous Singer. I don't know what happened to Singer, but in my opinion they have put out a less and less quality product over the past 10 to 15 years. The question I asked in my search engine was something like "what is a good heavy duty sewing machine". That led me to a demonstration video where I watched someone using the Janome to sew through the depth of fabric layers that I needed. And when I bought the machine, it worked just like in the video. It sails through layers of fabric that used to invariably tangle up and stop the Singer.
`I would like to use this for storing thread. I need the drawers to be a least 4" high . Also, do the tops come off the container.`	`The drawers are only 2 inches high. They slide out . Each has a picture of a big Lego head on top the top does snap off.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

yahoo_qa

Dataset: yahoo_qa
Size: 143,477 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 8 tokens<br>mean: 34.24 tokens<br>max: 39 tokens | min: 9 tokens<br>mean: 37.25 tokens<br>max: 39 tokens |

Samples:

query	document
`I have to meet up with someone whose last name is Kasprazck tomorrow and I dont wanna offend her by sayin her name wrong. Can u please write out how its pronounced if you know or how you think? Thanks xoxoxoxooxo`	`People with surname like that are usually aware that people may not know how to pronounce it properly. It will not be a big issue (and I am sure it won't offend her at all) if you were to ask how to pronounce it. Just make sure you listen carefully THEN repeat it so you will likely remember it.`
All I want to know why is this allowed when there is so much of a danger to children that are online. I am in charge of her as of 3/20/2006 and she is nolonger with her Mother who got her started with this problum and I was under the imprestion that this account was canceled out but I went to use my computer and I found out that she had been online without my knowledge of it til today I can only give you the email address I dont have her password for the my space I do have the home address that may have been givin and the phone number and her true date of birth. My sister is the one who told the lie.	Your question is essentially "All I want to know why is this allowed...?"\n\nThe answer is, nobody allowed it but you. You made a computer accessible to someone who you do not wish it to be used by.\n\nIf you meant to ask "Why are minors allowed to set up email accounts?" then the answer is, "Because there is no way to ensure that the person on the client's end isn't minor."\n\nIf you want to have her MySpace account removed, there are protocols you can follow on the MySpace FAQ (frequently asked questions). However, it will be pointless to go through the trouble if she has access to the internet; your home, school, friends, the mall, Kinko's, etc.
`I am not asking you alter anything you already have in place,\nbut why not combine Biology, and Chemistry into Biochemistry, yes, that is what I am searching for.`	Biology has traditionally consisted of botany, zoology and microbiology. Chemistry has consisted mainly of physical, organic, analytical and biochemistry. Until relatively recently those divisions have held up reasonably well. Now there is a whole new world of chemical and physical biology and biochemistry and biophysics. To throw all of biology and chemistry into biochemistry would be a misnomer for many of the parts of both. To add biochemistry or chemical biology into one pot might be a good idea to catch the questions in the area betwen

Loss: pylate.losses.cached_contrastive.CachedContrastive

yahoo_title_question

Dataset: yahoo_title_question
Size: 213,320 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 7 tokens<br>mean: 17.99 tokens<br>max: 39 tokens | min: 7 tokens<br>mean: 33.44 tokens<br>max: 39 tokens |

Samples:

query	document
`1:03 s for 100 meter freestyle race 10 year old female category. does this time rank high in the USA Swimming?`	`My daugter just went 1:03 in 100meter fresstyle how does this compare to the best 10 year olds in the world or USA?`
`Why doesnt people believe that a mental illness or condition is a real medical probllem?`	`It seems that unless people can see a "broken arm", a "bleeding wound", a "cancer diagnosis", "asthma" , "arthiritis" (and many more lables out there) a mental condition is less inmportant as the above. There are so many people that do not understand that it is real...it is a struggle everyday to just get to the end of the day. You are ridiculed for you behavior as irresponsible or inconsiderate. You get the picture. IT IS AS REAL AS CANCER OR AIDS OR ANY OTHER UNCUREABLE ILLNESS!!`
`Why do you think people attand college or university?`	`people attand college or university for many different reasons,e.x.new experiences, career preparation, increased knowledge...`

Loss: pylate.losses.cached_contrastive.CachedContrastive

ccnews

Dataset: ccnews
Size: 353,670 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 7 tokens<br>mean: 17.8 tokens<br>max: 39 tokens | min: 22 tokens<br>mean: 38.95 tokens<br>max: 39 tokens |

Samples:

query	document
`California Senate Approves Raising Age to Buy Long Guns`	SACRAMENTO (AP) — California would raise the age for buying rifles and shotguns from 18 to 21 and bar people from buying more than one long gun each month under a bill advancing in the Legislature. It’s been a frequently debated topic nationwide after a Florida high school shooting that killed 17 people. The Senate on Tuesday approved the measure by Democratic Sen. Anthony Portantino of La Canada Flintridge, sending it to the Assembly on a 23-10 vote. It extends age and purchase limits that currently apply only to handguns. Republican Sen. Jim Nielsen of Gerber says California should instead target criminal gangs and those with mental disabilities whom he said will obtain the guns no matter the legal limits. Walmart and Dick’s Sporting Goods previously announced age limits on gun sales.
`Mississippi officer fired after video of suspect being hit`	JACKSON, Miss. (AP) – A Mississippi police officer has been fired after cellphone video showed him hitting a handcuffed suspect. A Jackson Police Department news release says officer Justin Roberts was fired Monday by Chief Lee Vance. The release says the suspect was hit Saturday; Vance started an internal affairs investigation after the video surfaced Sunday. The identity of the handcuffed man was not released. It was not immediately clear whether Roberts can appeal his firing. The Associated Press tried to leave a message for Roberts at the Jackson Police Department, but department spokesman Commander Tyree Jones says he does not have a way to reach the fired officer. Jones says both Roberts and the handcuffed suspect are African-American. Share this: Facebook LinkedIn Twitter Google Like this: Like Loading...
`Sir Cameron Mackintosh Discusses Newest Incarnation of MISS SAIGON`	Sir Cameron Mackintosh, the man responsible for a nearly unrivaled number of influential theatrical productions, has mounted a new incarnation of Miss Saigon at the Birmingham Hippodrome. He recently spoke with Express and Star about the upcoming production. "This version is by far the best we have ever done," he says. "The world has sadly got worse, not better and we are indeed in gritty times and I think that is what has made the show feel even more contemporary than when it first came out nearly 30 years ago." The show features a new collection of designers and takes a grittier approach to the already hard-hitting content. Mackintosh says that even at the beginning the subject matter posed a monumental challenge in terms of transfer to the stage. When first speaking with Claude-Michel Schönberg and Alain Boublil he says, " the phrase I used then was 'doing this musical is like dancing on a razor blade'; you have to be utterly truthful and it has to deliver the power that only musica...

Loss: pylate.losses.cached_contrastive.CachedContrastive

npr

Dataset: npr
Size: 365,075 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 7 tokens<br>mean: 15.98 tokens<br>max: 30 tokens | min: 16 tokens<br>mean: 38.45 tokens<br>max: 39 tokens |

Samples:

query	document
`Chicago Sells City Relics in Online Auction`	Pieces of Chicago's history and cultural experiences go up for bidding in a two-week auction beginning Thursday. The sale is an attempt to raise money for city arts and cultural programs, while also raising its profile. The "Great Chicago Fire Sale" is the first charitable eBay auction to be held by a municipality, and is being run by Chicago's Department of Cultural Affairs. Offerings include a dinner party prepared by Oprah Winfrey's chef, a chance to dye the river green on St. Patrick's Day, a cow statue from the city's 1999 Cows on Parade display and an authentic Playboy Bunny costume from the 1960s. NPR's David Schaper reports.
`Hear Code Orange's Darkly Catchy 'Bleeding In The Blur'`	Code Orange could never be accused of going soft. Show up to any of the Pittsburgh band's shows and behold the cyclonic mosaic of moshing bodies moved by its nightmarishly chaotic hardcore. But there's always been an experimental underpinning to Code Orange that toys with noise and melody (and some '90s grunge). Forever, the band's upcoming third album, is among its most bruising works, with surprises throughout. But none are quite like this. "Bleeding In The Blur" certainly sets itself up to swarm, but the squealing feedback and Jami Morgan's thunderous drums quickly turn the reins over to guitarist Reba Meyers. No pinch harmonics, no slamming breakdowns, (mostly) no throaty screams — this is a darkly catchy pop song that sounds as if it's been carved from obsidian. "Bleeding In The Blur" has the gloomy heft of Thrice and the unconventional hooks of Jawbox, with Meyers' dominating vocals out front. If you've ever wanted a heavier song by Adventures (the emo band featuring three-quarte...
`Ireland Is The Focus Of Investor Anxieties`	`Over the past two weeks, investors have dumped Irish government bonds over concerns about the country's economy and its banks. Irish officials have been reluctant to accept a bailout. But over the weekend, they held talks about the debt crisis with other members of the European Union.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

eli5

Dataset: eli5
Size: 106,781 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 6 tokens<br>mean: 21.72 tokens<br>max: 39 tokens | min: 14 tokens<br>mean: 38.36 tokens<br>max: 39 tokens |

Samples:

query	document
`How far did Genghis Khan influence spread? and did it help america?`	It would be rather difficult for Ghenghis Khan to influence America very much, given that the United States didn't exist until about 500 years after his death. It would be over 200 years before Columbus made his first voyage in search of Asia. At the time the only contact between America and the rest of the world would have been the Norse expedition to Vinland, and that didn't exactly end well. It's possible someone who's more knowledgeable about the subject could point to some cultural shifts that would affect America but with half a millennium of separation, Ghenghis Khan's influence on the USA would be pretty minimal.
`How did Stephen Hawking talked even though he can't move a muscle? How did the computer knew what he wanted to say?`	`He used very subtle muscle movements to control the computer. The computer would go over a list of letters/words and Hawking would move his muscle whenever he wanted to choose the current letter or word. Towards the end of his life it would take him up to a minute per word. Any interview you see of him is either heavily edited to remove these long pauses, or his entire talk was pre-recorded (that's how he gave lectures).`
`How would William the Conqueror's name have been said/written in Old Norman?`	`William the Conqueror by David Bates p. 33 (ISBN 978-0752429601) and Hanks and Hodges, Oxford Dictionary of First Names, Oxford University Press, 2nd edition ( ISBN 978-0-19-861060-1), p.276 list it as Williame (french spelling Guillaume), all the other sources I found were too unreliable.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

cnn

Dataset: cnn
Size: 293,521 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 15 tokens<br>mean: 38.58 tokens<br>max: 39 tokens | min: 39 tokens<br>mean: 39.0 tokens<br>max: 39 tokens |

Samples:

query	document
`Chan, is famous in the United States for such action movies as 'Rush Hour' and 'Rumble in the Bronx' He lashed out at the United States and blamed the country for the financial crisis that is sweeping the globe .`	He may enjoy a Hollywood payday now and then, but that doesn't stop Jackie Chan from criticizing America. The martial arts star called the U..S the 'most corrupt' country in the world during a recent interview on a Hong Kong television show. 'If you talk about corruption, the entire world, the United States has no corruption?' Chan asked the host. Scroll down for video . Controversial: Chan, who's made millions in American films, called the country the most corrupt nation on the planet . Chan then referred to America as 'the most corrupt in the world.' 'Where does this Great . Breakdown (financial crisis) come from? It started exactly from the . world, the United States,' Chan told the interviewer. 'When I was interviewed in the U.S., people . asked me, I said the same thing. 'I said now that China has become . strong, everyone is making an issue of China,' continued the Rush Hour star. 'If our own countrymen . don't support our country, who will support our country? We know our . coun...
`A bus was carrying members of King family after 'Dream' speech ceremony . The bus and a car collided near Washington's Tidal Basin just off the National Mall . Reality star Omarosa Manigault said she was on the bus: 'We were very afraid' Mall Police say a person in the car taken to hospital; no report yet on bus passengers .`	Washington (CNN) -- Family members of the Rev. Martin Luther King Jr. were involved in a bus accident Wednesday after the high-profile ceremony marking the 50th anniversary of King's "I Have a Dream" speech, police said. The bus and a car collided near Washington's Tidal Basin just off the National Mall where the ceremony was held, according to Park Police, who have jurisdiction over the Mall. They said a person in the car was injured and taken to a hospital but did not provide information on injuries to bus passengers. Several members of the King family were aboard the bus and had laid a wreath at the memorial to the civil rights leader, according to Omarosa Manigault, a reality television star who was aboard the bus. "We were very afraid," she told CNN. "There were children on the bus, seniors and everything. Everybody was thrown out of their seats." She said she hit her head in the accident. Obama: Because they marched, America changed . 9 things about MLK's speech and the March on ...
`Ofcom's chief executive said there had been a big change in tolerance levels . 35% of viewers think there is too much violence, down from 55% in 2008 . But there is less tolerance of language deemed as 'discriminatory' or unjust . Critics say British public has become 'desensitised' due to lax Ofcom laws .`	Television viewers have become more tolerant of violence and swearing, the head of Ofcom has claimed. But the sexist or racist language of the 1970s is far less acceptable than it once was, research by the broadcasting regulator reveals. Ofcom’s chief executive Ed Richards, who is about to stand down after 11 years in the job, told MPs there has been a big change in tolerance levels in the past few decades. Ofcom chief says the British public has grown more tolerant - but still does not like discriminatory language on TV shows. Till Death Us Do Part, which frequently had lead character Alf Garnett making racist remarks . But critics argued the British public has simply become ‘desensitised’ to swearing after years of lax regulation by Ofcom. According to the regulator’s latest research, published in July, only 35 per cent of viewers think there is too much violence on TV, down from 55 per cent in 2008. Just 35 per cent think there is too much swearing, down from 53 per cent six years a...

Loss: pylate.losses.cached_contrastive.CachedContrastive

stackexchange_duplicate_questions

Dataset: stackexchange_duplicate_questions
Size: 73,210 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 6 tokens<br>mean: 15.86 tokens<br>max: 39 tokens | min: 6 tokens<br>mean: 15.56 tokens<br>max: 39 tokens |

Samples:

query	document
`Clone() vs Copy constructor- which is recommended in java`	`clone() vs copy constructor vs factory method?`
`AES-128/192 safer than AES-256 in practice?`	`Is AES-256 weaker than 192 and 128 bit versions?`
`How does this Java code which determines whether a String contains all unique characters work?`	`Explain the use of a bit vector for determining if all characters are unique`

Loss: pylate.losses.cached_contrastive.CachedContrastive

stackexchange_title_body

Dataset: stackexchange_title_body
Size: 80,695 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 15 tokens<br>mean: 38.83 tokens<br>max: 39 tokens | min: 26 tokens<br>mean: 38.78 tokens<br>max: 39 tokens |

Samples:

query	document
Allow linking to named anchors This is similar to , but I don't think it's a dupe. Markdown should support links that are just named anchors, like foo. I occasionally reference existing answers in comments or my own answers if I'm expanding on them, and currently I need to include the full URL to get Markdown to link it, which seems unnecessary. Just copying the answer's link is annoying because it's a different URL, so when users click it it loads a new page, even though it's actually the exact same page. To get around it I tend to take the current URL and splice in the #id of the answer I'm linking to, but Markdown should assume that if I just include the #id part	Support anchor names in posts I admit this feature request is probably somewhat limited in useful scope, but I'm throwing it out there anyway. Inspired by , and because I want to use it on , I'm requesting that name be supported on a tags in posts. On very long answers, such as the closing/migration guidance answer, this would allow direct linking to the specific closure reason. This would then allow us, when someone , to link directly to the appropriate reason and description thereof. I recognize the limited scope of this, however, I have seen other long answers that could stand to have that kind of "deep" linking ability as well. (The original incarnation of this post had either name or id, but preferenced name. Per Koper's answer, which I agree with, I took out the idea of supporting id, because Koper's right -- too dangerous.)
Unable to reload same gif image, if used twice in a page I am using same gif image twice in a page. Both the images will be hidden initially. Based on certain criteria I am showing those gif images (when clicked on particular target one gif image will be shown at a time). I am unable to reload the gif image. See the attached plunker 1) <script> var img1 = document.getElementById("img1"); var img2 = document.getElementById("img2"); function toggle1() { if (document.getElementById('gif-1').style.display == "none") { document.getElementById('gif-1').src = ''; document.getElementById('gif-1').src = 'http://insightgraphicdesign.net/wp-content/uploads/2014/07/coke-responsive-logo.gif'; document.getElementById('gif-1').style.display = "block"; } else document.getElementById('gif-1').style.display = "none"; } function toggle2() { if (document.getElementById('gif-2').style.display == "...	how to clear or replace a cached image I know there are many ways to prevent image caching (such as via META tags), as well as a few nice tricks to ensure that the current version of an image is shown with every page load (such as image.jpg?x=timestamp), but is there any way to actually clear or replace an image in the browsers cache so that neither of the methods above are necessary? As an example, lets say there are 100 images on a page and that these images are named "01.jpg", "02.jpg", "03.jpg", etc. If image "42.jpg" is replaced, is there any way to replace it in the cache so that "42.jpg" will automatically display the new image on successive page loads? I can't use the META tag method, because I need everuthing that ISN"T replaced to remain cached, and I can't use the timestamp method, because I don't want ALL of the images to be reloaded every time the page loads. I've racked my brain and scoured the Internet for a way to do this (preferrably via javascript), but no luck. Any...
`Is it possible that there are more than 6 quark flavors/more than 3 generations? I thought that things like the top quark don't exist in nature because they're super unstable and we can only observe them after high-energy collisions (e.g. LHC) Is it possible to make even more massive quarks? Or is there a reason the limit is six?`	Why do we think there are only three generations of fundamental particles? In the of particle physics, there are three generations of quarks (up/down, strange/charm, and top/bottom), along with three generations of leptons (electron, muon, and tau). All of these particles have been observed experimentally, and we don't seem to have seen anything new along these lines. A priori, this doesn't eliminate the possibility of a fourth generation, but the physicists I've spoken to do not think additional generations are likely. Question: What sort of theoretical or experimental reasons do we have for this limitation? One reason I heard from my officemate is that we haven't seen new neutrinos. Neutrinos seem to be light enough that if another generation's neutrino is too heavy to be detected, then the corresponding quarks would be massive enough that new physics might interfere with their existence. This suggests the question: is there a general rule relating neutrino masses to quark...

Loss: pylate.losses.cached_contrastive.CachedContrastive

stackexchange_body_body

Dataset: stackexchange_body_body
Size: 65,689 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
|         | query | document |
|:--------|:------|:---------|
| type    | string | string |
| details | min: 16 tokens<br>mean: 38.22 tokens<br>max: 39 tokens | min: 9 tokens<br>mean: 38.09 tokens<br>max: 39 tokens |

Samples:

query	document
`When I type "sudo apt-get update" I see HTTP protocol is used to fetch the updates . Why not HTTPS is used for more secure communication ?`	`Does apt-get use https or any kind of encryption? Is there a way to configure it to use it?`
If I have 4 identical* LEDs wired in parallel to a single resistor so that the overall current available is 30 mA, do I still run the risk of premature burnout? The LEDs peak forward current is 30 mA. *I know that LEDs from the same package may still have slight differences I thoroughly read through the answers here - - but it seems like the assumption would be that one would arrange a circuit so that the available current equals the total draw of the 4 LEDs, in my case 80 mA. The problem then would be that some would draw more than the peak. But, if I'm limiting the avaialble current to 30mA, is there still an issue? That would mean that ideally 7.5mA would be supplied to each LED. Obviously, based on the answer in the aforementioned link, it would not likely be even, but it shouldn't get to "dangerous levels". Follow-up: Based on the volt/amperage curve, it looks like I'd be seeing a ~0.1V drop. Will this significantly affect the brightness? Still pretty new to all this so m...	I'm trying to wire up 6 RGB LEDs in parallel, all controlled from a single source (well, three sources, one for each colour). The LEDs came supplied with resistors to limit the current of 270 Ohm for a 5v supply. The problem is, 6 LEDs x 3 colours = 18 resistors, which is a lot, and means I need a much bigger board and a lot more soldering. So, can I instead wire the LEDs in parallel with each other, with a single resistor protecting all six? (3 resistors in total, one for each colour). How do I calculate the value of that resistor? More details: The LEDs are being driven from a to supply a bit of current, which is in turn controlled by a Netduino providing a PWM signal on the three channels. . If I've correctly understood the data sheet they want 20mA of current, and forward voltages of 2, 3, 3 volts (for R,G and B respectively?). The supplied resistors were all 270 Ohm, so the channels may not be balanced quite right. For extra credit: I'm only using 3 of the transistors in my...
I want to make a figure in Mathematica, export it as a PDF, edit/label it in Photoshop, and then add it as a figure in a TeX document. I would really like to have the font in the figure closely match the math mode stuff in the document. In the past I've made a PDF with TeX with just the labels I want and then pasted them all into the figures, but this is incredibly tedious. I found this post - - which says that the font in math mode is "Latin Modern Symbol" but this is not an option in Photoshop. Is there another font that looks close enough to math mode which is in photoshop? Thanks for any help!	`I draw figures in Inkscape. When I label elements within the figures with variable names that I have used in the underlying TeX document, I would like them to look exactly the same as in the document. (e.g. l does not look the same as $l$) What is the name of the math mode font so I can select it correctly in the Inkscape font list? If the exact font should not be available, what is a similar looking font that is present on most systems?`

Loss: pylate.losses.cached_contrastive.CachedContrastive

sentence_compression

Dataset: sentence_compression
Size: 173,604 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 11 tokens
mean: 32.05 tokens
max: 39 tokens

min: 7 tokens
mean: 12.69 tokens
max: 31 tokens

	query	document
type	string	string
details	min: 11 tokens mean: 32.05 tokens max: 39 tokens	min: 7 tokens mean: 12.69 tokens max: 31 tokens

Samples:

query	document
`Sedgebrook, a continuing care retirement community located in Lincolnshire, will host a free support group for caregivers who support aging loved ones.`	`Sedgebrook retirement community to host support group for caregivers`
`Junction City Police said in a news release Saturday that several shots were fired at the narcotics detective around midnight as he conducted surveillance in an unmarked vehicle.`	`Shots fired at narcotics detective`
`A SWAT team surrounded a home on Miller Avenue in South San Francisco Monday afternoon, according to authorities and neighbors.`	`SWAT team surrounds home in South San Francisco`

Loss: pylate.losses.cached_contrastive.CachedContrastive

wikihow

Dataset: wikihow
Size: 96,029 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 5 tokens
mean: 10.12 tokens
max: 24 tokens

min: 9 tokens
mean: 37.1 tokens
max: 39 tokens

Samples:

query	document
`Dry and Propagate Comfrey`	`This article will tell you how to dry and propagate comfrey.`
`Add a Playlist Shortcut on Android`	`Adding a playlist shortcut to your home screen is a surefire way to add convenience in using your Android device. For daily commutes or morning jogs, this is a useful feature to be able to start playing your music in the quickest way possible.`
`Add an Android App to Google Drive`	`Google drive is a social service that can be used to share with friends. You can use Google Drive on your Android to share Android apps.`

Loss: pylate.losses.cached_contrastive.CachedContrastive

altlex

Dataset: altlex
Size: 110,708 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 5 tokens
mean: 29.71 tokens
max: 39 tokens

min: 6 tokens
mean: 27.34 tokens
max: 39 tokens

Samples:

query	document
`Avery County is a county located in the U.S. state of North Carolina .`	`Avery County is a county in the U.S. state of North Carolina .`
`There he studied piano with Mieczyslaw Horszowski and composition with Constant Vauclain , and switched majors from piano to composition .`	`He studied piano at the Curtis Institute of Music , with Mieczyslaw Horszowski and composition with Constant Vauclain .`
`The ReachOut website includes testimonials from a school nurse in Tucson , Arizona and an elementary school principal of the Deer Valley Unified School District in Greater Phoenix .`	`The ReachOut website has leters from a school nurse in Tucson , Arizona and an elementary school principal of the Deer Valley Unified School District in Greater Phoenix which say good things about ReachOut .`

Loss: pylate.losses.cached_contrastive.CachedContrastive

quora

Dataset: quora
Size: 44,885 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 7 tokens
mean: 14.76 tokens
max: 39 tokens

min: 7 tokens
mean: 14.64 tokens
max: 39 tokens

Samples:

query	document
`Website traffic analytics will show statistics for "direct navigation" which includes both typed in URL's (domain +.com) in the URL bar, as well as those using bookmarks to get to a site. What is an estimate for the breakdown of each?`	`Website traffic analytics will show statistics for "direct navigation" which includes both typed in URL's (domain +.com) in the URL bar, as well as those using bookmarks to get to a site. Are there any statistics that show an estimated percentage of each rather than lumping them together?`
`What are the most recognized flags in the world?`	`Which 10 flags are the most recognisable in the world?`
`Can I deposit 500 & 1000 INR notes in my savings account multiple times on each banking day till 30/12/2016?`	`Can I deposit 500 & 1000 INR notes in my current account multiple times on each banking day till 30/12/2016?`

Loss: pylate.losses.cached_contrastive.CachedContrastive

simplewiki

Dataset: simplewiki
Size: 97,717 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 9 tokens
mean: 28.33 tokens
max: 39 tokens

min: 9 tokens
mean: 30.86 tokens
max: 39 tokens

Samples:

query	document
`Some of those rescued by the Nordnorge were taken to the Chilean Eduardo Frei Montalva Station on King George Island . Later they were flown by C-130 Hercules transport aircraft of the Chilean Air Force to Punta Arenas , Chile , in two separate flights on Saturday , November 24th , and Sunday , November 25th .`	`All of those rescued by Nordnorge were taken to the Chilean Frei Montalva Station on King George Island where they were subsequently airlifted by C-130 Hercules transport aircraft of the Chilean Air Force to Punta Arenas , Chile in two separate flights , one on Saturday , November 24 , and the other on Sunday , November 25 .`
`The name of that province is Friesland . Leeuwarden is called Ljouwert in Frisian .`	`Leeuwarden ( , Stadsfries : Liwwadden , Frisian : Ljouwert , ) is the capital city of the Dutch province of Friesland .`
`France has invested a lot in nuclear power . This made France the smallest producer of carbon dioxide among the seven most industrialised countries in the world .`	`France is the smallest emitter of carbon dioxide among the seven most industrialized countries in the world , due to its heavy investment in nuclear power .`

Loss: pylate.losses.cached_contrastive.CachedContrastive

squad

Dataset: squad
Size: 25,117 training samples
Columns: query and document
Approximate statistics based on the first 1000 samples:
query document
type string string
details
min: 7 tokens
mean: 15.82 tokens
max: 39 tokens

min: 32 tokens
mean: 38.97 tokens
max: 39 tokens

Samples:

query	document
`What percentage of Italians spoke standard Italian when Italy was first unified?`	During the Risorgimento, proponents of Italian republicanism and Italian nationalism, such as Alessandro Manzoni, stressed the importance of establishing a uniform national language in order to better create an Italian national identity. With the unification of Italy in the 1860s, standard Italian became the official national language of the new Italian state, while the various unofficial regional languages of Italy gradually became regarded as subordinate "dialects" to Italian, increasingly associated negatively with lack of education or provincialism. However, at the time of the Italian Unification, standard Italian still existed mainly as a literary language, and only 2.5% of Italy's population could speak standard Italian.
`What type of process is used to produce most paper used in paperback books?`	`Mechanical pulping yields almost a tonne of pulp per tonne of dry wood used, which is why mechanical pulps are sometimes referred to as "high yield" pulps. With almost twice the yield as chemical pulping, mechanical pulps is often cheaper. Mass-market paperback books and newspapers tend to use mechanical papers. Book publishers tend to use acid-free paper, made from fully bleached chemical pulps for hardback and trade paperback books.`
`What do orthodox Jews express ambivalence towards?`	Politically, Orthodox Jews, given their variety of movements and affiliations, tend not to conform easily to the standard left-right political spectrum, with one of the key differences between the movements stemming from the groups' attitudes to Zionism. Generally speaking, of the three key strands of Orthodox Judaism, Haredi Orthodox and Hasidic Orthodox Jews are at best ambivalent towards the ideology of Zionism and the creation of the State of Israel, and there are many groups and organisations who are outspokenly anti-Zionistic, seeing the ideology of Zionism as diametrically opposed to the teaching of the Torah, and the Zionist administration of the State of Israel, with its emphasis on militarism and nationalism, as destructive of the Judaic way of life.

Loss: pylate.losses.cached_contrastive.CachedContrastive

Training Hyperparameters

Non-Default Hyperparameters

eval_strategy: steps
per_device_train_batch_size: 16384
per_device_eval_batch_size: 16384
learning_rate: 0.0003
num_train_epochs: 1
seed: 2203
bf16: True
dataloader_num_workers: 4
accelerator_config: {'split_batches': True, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
ddp_find_unused_parameters: False

All Hyperparameters

Click to expand

overwrite_output_dir: False
do_predict: False
eval_strategy: steps
prediction_loss_only: True
per_device_train_batch_size: 16384
per_device_eval_batch_size: 16384
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 1
eval_accumulation_steps: None
torch_empty_cache_steps: None
learning_rate: 0.0003
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 1
max_steps: -1
lr_scheduler_type: linear
lr_scheduler_kwargs: {}
warmup_ratio: 0.0
warmup_steps: 0
log_level: passive
log_level_replica: warning
log_on_each_node: True
logging_nan_inf_filter: True
save_safetensors: True
save_on_each_node: False
save_only_model: False
restore_callback_states_from_checkpoint: False
no_cuda: False
use_cpu: False
use_mps_device: False
seed: 2203
data_seed: None
jit_mode_eval: False
use_ipex: False
bf16: True
fp16: False
fp16_opt_level: O1
half_precision_backend: auto
bf16_full_eval: False
fp16_full_eval: False
tf32: None
local_rank: 3
ddp_backend: None
tpu_num_cores: None
tpu_metrics_debug: False
debug: []
dataloader_drop_last: True
dataloader_num_workers: 4
dataloader_prefetch_factor: None
past_index: -1
disable_tqdm: False
remove_unused_columns: True
label_names: None
load_best_model_at_end: False
ignore_data_skip: False
fsdp: []
fsdp_min_num_params: 0
fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
fsdp_transformer_layer_cls_to_wrap: None
accelerator_config: {'split_batches': True, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
deepspeed: None
label_smoothing_factor: 0.0
optim: adamw_torch
optim_args: None
adafactor: False
group_by_length: False
length_column_name: length
ddp_find_unused_parameters: False
ddp_bucket_cap_mb: None
ddp_broadcast_buffers: False
dataloader_pin_memory: True
dataloader_persistent_workers: False
skip_memory_metrics: True
use_legacy_prediction_loop: False
push_to_hub: False
resume_from_checkpoint: None
hub_model_id: None
hub_strategy: every_save
hub_private_repo: None
hub_always_push: False
gradient_checkpointing: False
gradient_checkpointing_kwargs: None
include_inputs_for_metrics: False
include_for_metrics: []
eval_do_concat_batches: True
fp16_backend: auto
push_to_hub_model_id: None
push_to_hub_organization: None
mp_parameters:
auto_find_batch_size: False
full_determinism: False
torchdynamo: None
ray_scope: last
ddp_timeout: 1800
torch_compile: False
torch_compile_backend: None
torch_compile_mode: None
dispatch_batches: None
split_batches: None
include_tokens_per_second: False
include_num_input_tokens_seen: False
neftune_noise_alpha: None
optim_target_modules: None
batch_eval_metrics: False
eval_on_start: False
use_liger_kernel: False
eval_use_gather_object: False
average_tokens_across_devices: False
prompts: None
batch_sampler: batch_sampler
multi_dataset_batch_sampler: proportional
router_mapping: {}
learning_rate_mapping: {}

Training Logs

Click to expand

Epoch	Step	Training Loss	NanoClimateFEVER_MaxSim_ndcg@10	NanoDBPedia_MaxSim_ndcg@10	NanoFEVER_MaxSim_ndcg@10	NanoFiQA2018_MaxSim_ndcg@10	NanoHotpotQA_MaxSim_ndcg@10	NanoMSMARCO_MaxSim_ndcg@10	NanoNFCorpus_MaxSim_ndcg@10	NanoNQ_MaxSim_ndcg@10	NanoQuoraRetrieval_MaxSim_ndcg@10	NanoSCIDOCS_MaxSim_ndcg@10	NanoArguAna_MaxSim_ndcg@10	NanoSciFact_MaxSim_ndcg@10	NanoTouche2020_MaxSim_ndcg@10	NanoBEIR_mean_MaxSim_ndcg@10
0.0034	50	61.2999	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.0343	500	5.0461	0.3091	0.6323	0.8872	0.5003	0.8744	0.5921	0.3513	0.6589	0.9681	0.3880	0.5267	0.7637	0.5567	0.6161
0.0377	550	5.5948	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.0686	1000	4.5264	0.2929	0.6032	0.8808	0.4734	0.8602	0.5893	0.3661	0.6827	0.9840	0.4127	0.5617	0.7662	0.5425	0.6166
0.0721	1050	4.2289	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1029	1500	3.3687	0.3020	0.6234	0.8844	0.4924	0.8616	0.6463	0.3382	0.6535	0.9737	0.4126	0.5931	0.7712	0.5332	0.6220
0.1064	1550	3.5525	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1373	2000	3.4038	0.3187	0.6411	0.9097	0.4932	0.8662	0.5951	0.3396	0.6608	0.9654	0.4146	0.5593	0.7731	0.5297	0.6205
0.1407	2050	2.6228	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.1716	2500	3.3323	0.3495	0.6658	0.9354	0.5105	0.8702	0.5510	0.3455	0.6634	0.9708	0.4018	0.5564	0.7860	0.5386	0.6265
0.1750	2550	4.0719	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2059	3000	4.6922	0.3614	0.6465	0.8694	0.5131	0.8670	0.6584	0.3381	0.6651	0.9772	0.4106	0.5640	0.7646	0.5562	0.6301
0.2093	3050	4.588	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2402	3500	2.6753	0.3553	0.6433	0.9020	0.5309	0.8610	0.6263	0.3574	0.6896	0.9664	0.4094	0.5529	0.7897	0.5457	0.6331
0.2436	3550	1.9266	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.2745	4000	3.7069	0.3320	0.6326	0.9042	0.5067	0.8537	0.6766	0.3617	0.6794	0.9734	0.4097	0.5667	0.7830	0.5461	0.6327
0.2779	4050	2.0447	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3088	4500	5.9963	0.3331	0.6102	0.8848	0.5332	0.8662	0.6105	0.3684	0.6865	0.9617	0.4200	0.5531	0.7741	0.5498	0.6270
0.3123	4550	1.9683	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3431	5000	3.4992	0.3363	0.6468	0.8804	0.5186	0.8409	0.6062	0.3638	0.6793	0.9572	0.4050	0.6033	0.7820	0.5277	0.6267
0.3466	5050	3.8568	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.3775	5500	2.4815	0.3515	0.6371	0.9249	0.5024	0.8714	0.5951	0.3559	0.6958	0.9738	0.4188	0.5704	0.8005	0.5287	0.6328
0.3809	5550	0.8767	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4118	6000	2.8857	0.3370	0.6572	0.9093	0.5114	0.8845	0.6167	0.3607	0.6978	0.9675	0.4366	0.5420	0.8066	0.5399	0.6359
0.4152	6050	4.0425	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4461	6500	2.1285	0.3643	0.6483	0.9097	0.5183	0.8664	0.6320	0.3609	0.7104	0.9673	0.4278	0.5570	0.8011	0.5425	0.6389
0.4495	6550	3.4573	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.4804	7000	4.2792	0.3313	0.6685	0.9194	0.5153	0.8760	0.6245	0.3654	0.7160	0.9611	0.4190	0.5751	0.7887	0.5562	0.6397
0.4838	7050	1.9176	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5147	7500	3.0862	0.3277	0.6472	0.9035	0.5352	0.8700	0.6237	0.3486	0.7176	0.9654	0.4276	0.5619	0.7786	0.5489	0.6351
0.5182	7550	1.942	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5490	8000	2.1192	0.3355	0.6564	0.9309	0.5218	0.8688	0.6458	0.3540	0.7075	0.9691	0.4101	0.5733	0.7808	0.5434	0.6383
0.5525	8050	2.9456	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.5834	8500	3.8136	0.3470	0.6382	0.9309	0.5277	0.8581	0.6432	0.3533	0.6766	0.9686	0.4272	0.5573	0.8036	0.5352	0.6359
0.5868	8550	2.2182	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6177	9000	3.4301	0.3418	0.6299	0.9196	0.5272	0.8482	0.6435	0.3487	0.7118	0.9686	0.4260	0.5727	0.8009	0.5402	0.6369
0.6211	9050	1.8215	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6520	9500	1.728	0.3491	0.6376	0.8956	0.5294	0.8523	0.6730	0.3468	0.6982	0.9728	0.4139	0.5757	0.7896	0.5438	0.6367
0.6554	9550	1.6046	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.6863	10000	3.2371	0.3511	0.6495	0.8984	0.5236	0.8408	0.6371	0.3571	0.7229	0.9675	0.4286	0.6078	0.7906	0.5409	0.6397
0.6897	10050	0.7697	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7206	10500	1.8522	0.3443	0.6459	0.9096	0.5073	0.8310	0.6554	0.3558	0.7143	0.9651	0.4333	0.5888	0.7959	0.5351	0.6371
0.7240	10550	2.0346	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7549	11000	3.3423	0.3527	0.6543	0.9204	0.5239	0.8794	0.6599	0.3477	0.7359	0.9709	0.4315	0.5812	0.8018	0.5267	0.6451
0.7584	11050	1.9674	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.7892	11500	2.6639	0.3671	0.6534	0.9254	0.5337	0.8470	0.6734	0.3436	0.7249	0.9600	0.4282	0.5881	0.8062	0.5336	0.6450
0.7927	11550	2.5904	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8236	12000	3.1084	0.3631	0.6534	0.9043	0.5061	0.8514	0.6698	0.3383	0.7349	0.9647	0.4329	0.5784	0.7941	0.5416	0.6410
0.8270	12050	1.1884	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8579	12500	1.0438	0.3353	0.6592	0.9085	0.4985	0.8414	0.6357	0.3449	0.7182	0.9792	0.4176	0.5767	0.7912	0.5307	0.6336
0.8613	12550	1.422	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.8922	13000	1.8963	0.3529	0.6538	0.9161	0.5349	0.8396	0.6563	0.3467	0.7182	0.9796	0.4286	0.5852	0.8058	0.5374	0.6427
0.8956	13050	3.0346	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9265	13500	1.6398	0.3477	0.6604	0.9121	0.5267	0.8496	0.6493	0.3423	0.7234	0.9730	0.4321	0.5767	0.7996	0.5368	0.6408
0.9299	13550	1.3234	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9608	14000	2.0053	0.3489	0.6655	0.9194	0.5436	0.8521	0.6605	0.3414	0.7221	0.9753	0.4324	0.5817	0.7986	0.5366	0.6445
0.9642	14050	1.2547	-	-	-	-	-	-	-	-	-	-	-	-	-	-
0.9951	14500	1.4897	0.3518	0.6608	0.9268	0.5355	0.8602	0.6668	0.3421	0.7233	0.9747	0.4296	0.5817	0.8003	0.5345	0.6452
0.9986	14550	3.0093	-	-	-	-	-	-	-	-	-	-	-	-	-	-

Framework Versions

Python: 3.13.0
Sentence Transformers: 5.1.1
PyLate: 1.3.4
Transformers: 4.48.3
PyTorch: 2.6.0
Accelerate: 1.12.0
Datasets: 4.4.1
Tokenizers: 0.21.0

Citation

BibTeX

ColBERT-Zero

@misc{chaffin2026colbertzeropretrainpretraincolbert,
  title         = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models}, 
  author        = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala},
  year          = {2026},
  eprint        = {2602.16609},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2602.16609}, 
}

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}

PyLate

@inproceedings{DBLP:conf/cikm/ChaffinS25,
  author       = {Antoine Chaffin and
                  Rapha{\"{e}}l Sourty},
  editor       = {Meeyoung Cha and
                  Chanyoung Park and
                  Noseong Park and
                  Carl Yang and
                  Senjuti Basu Roy and
                  Jessie Li and
                  Jaap Kamps and
                  Kijung Shin and
                  Bryan Hooi and
                  Lifang He},
  title        = {PyLate: Flexible Training and Retrieval for Late Interaction Models},
  booktitle    = {Proceedings of the 34th {ACM} International Conference on Information
                  and Knowledge Management, {CIKM} 2025, Seoul, Republic of Korea, November
                  10-14, 2025},
  pages        = {6334--6339},
  publisher    = {{ACM}},
  year         = {2025},
  url          = {https://github.com/lightonai/pylate},
  doi          = {10.1145/3746252.3761608},
}

Nomic Embed

@article{DBLP:journals/tmlr/NussbaumMMD25,
  author       = {Zach Nussbaum and
                  John Xavier Morris and
                  Andriy Mulyar and
                  Brandon Duderstadt},
  title        = {Nomic Embed: Training a Reproducible Long Context Text Embedder},
  journal      = {Trans. Mach. Learn. Res.},
  volume       = {2025},
  year         = {2025},
  url          = {https://openreview.net/forum?id=IPmzyQSiQE},
  timestamp    = {Fri, 20 Jun 2025 14:19:48 +0200},
  biburl       = {https://dblp.org/rec/journals/tmlr/NussbaumMMD25.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

CachedContrastive

@misc{gao2021scaling,
    title         = {Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
    author        = {Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
    year          = {2021},
    eprint        = {2101.06983},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG}
}