|
|
--- |
|
|
tags: |
|
|
- ColBERT |
|
|
- PyLate |
|
|
- sentence-transformers |
|
|
- sentence-similarity |
|
|
- embeddings |
|
|
- retrieval |
|
|
- feature-extraction |
|
|
- generated_from_trainer |
|
|
- dataset_size:238998494 |
|
|
- loss:CachedContrastive |
|
|
pipeline_tag: sentence-similarity |
|
|
library_name: PyLate |
|
|
license: apache-2.0 |
|
|
language: |
|
|
- en |
|
|
metrics: |
|
|
- MaxSim_accuracy@1 |
|
|
- MaxSim_accuracy@3 |
|
|
- MaxSim_accuracy@5 |
|
|
- MaxSim_accuracy@10 |
|
|
- MaxSim_precision@1 |
|
|
- MaxSim_precision@3 |
|
|
- MaxSim_precision@5 |
|
|
- MaxSim_precision@10 |
|
|
- MaxSim_recall@1 |
|
|
- MaxSim_recall@3 |
|
|
- MaxSim_recall@5 |
|
|
- MaxSim_recall@10 |
|
|
- MaxSim_ndcg@10 |
|
|
- MaxSim_mrr@10 |
|
|
- MaxSim_map@100 |
|
|
model-index: |
|
|
- name: PyLate |
|
|
results: |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoClimateFEVER |
|
|
type: NanoClimateFEVER |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.42 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.62 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.64 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.76 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.42 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.22666666666666666 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.14400000000000002 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.092 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.20566666666666664 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.28400000000000003 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.29733333333333334 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.374 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.3518000478336987 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.5177460317460317 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.2944493241561189 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoDBPedia |
|
|
type: NanoDBPedia |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.8 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.96 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.98 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.8 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.6933333333333335 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.604 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.518 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.10547467061354297 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.21001141632312567 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.2682250276346291 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.3822739230347477 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.6607934403680658 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.8741666666666668 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.5341191871132479 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoFEVER |
|
|
type: NanoFEVER |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.9 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.96 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.9 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.3399999999999999 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.21599999999999994 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.10799999999999997 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.8366666666666667 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.9233333333333333 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.9733333333333333 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.9733333333333333 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.9268221917930667 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.9356666666666666 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.9016938568070643 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoFiQA2018 |
|
|
type: NanoFiQA2018 |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.48 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.7 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.74 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.8 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.48 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.33333333333333326 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.23999999999999996 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.142 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.27124603174603173 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.46518253968253975 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.5227619047619048 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.6102857142857143 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.5354658437477728 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.6044444444444445 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.4731164376512747 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoHotpotQA |
|
|
type: NanoHotpotQA |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.88 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.98 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.98 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.88 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.56 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.344 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.17799999999999996 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.44 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.84 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.86 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.89 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.8601880205101629 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.9325 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.8158855692500271 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoMSMARCO |
|
|
type: NanoMSMARCO |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.44 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.7 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.74 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.9 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.44 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.2333333333333333 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.14800000000000002 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.08999999999999998 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.44 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.7 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.74 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.9 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.6667909811661161 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.5937222222222223 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.5994919639747226 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoNFCorpus |
|
|
type: NanoNFCorpus |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.4 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.64 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.7 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.74 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.4 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.39333333333333337 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.36 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.294 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.023087598529427374 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.06761742851719367 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.10857051887512778 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.14233415018080223 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.34210971556529485 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.5169999999999999 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.15131881345299422 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoNQ |
|
|
type: NanoNQ |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.58 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.78 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.86 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.92 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.58 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.26 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.17999999999999997 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.09799999999999998 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.54 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.72 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.81 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.88 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.7232616852802778 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.6988333333333333 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.6638409439247397 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoQuoraRetrieval |
|
|
type: NanoQuoraRetrieval |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.96 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.96 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.41999999999999993 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.26399999999999996 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.13599999999999998 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.8373333333333334 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.972 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.9833333333333333 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.9933333333333334 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.9747100090686657 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.98 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.9621719576719576 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoSCIDOCS |
|
|
type: NanoSCIDOCS |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.54 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.7 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.84 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.92 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.54 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.3666666666666666 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.32799999999999996 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.214 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.11366666666666667 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.22666666666666666 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.33666666666666656 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.43866666666666665 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.4295804160884494 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.656079365079365 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.3329808800019895 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoArguAna |
|
|
type: NanoArguAna |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.26 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.64 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.82 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.9 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.26 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.21333333333333335 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.16399999999999998 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.09 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.26 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.64 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.82 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.9 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.5817359990817483 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.4781904761904761 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.4820919913419914 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoSciFact |
|
|
type: NanoSciFact |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.68 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.82 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.86 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.92 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.68 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.2866666666666667 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.18799999999999997 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.10399999999999998 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.655 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.79 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.845 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.92 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.800311389775704 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.7665793650793651 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.7584845013477088 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: py-late-information-retrieval |
|
|
name: Py Late Information Retrieval |
|
|
dataset: |
|
|
name: NanoTouche2020 |
|
|
type: NanoTouche2020 |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.5714285714285714 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.8775510204081632 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.9183673469387755 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.5714285714285714 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.5918367346938774 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.5877551020408164 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.4836734693877551 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.03907914418061841 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.1204002709275123 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.19544619521998122 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.31191053266167984 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.5345004700502356 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.7458292840945903 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.3808192643084636 |
|
|
name: Maxsim Map@100 |
|
|
- task: |
|
|
type: nano-beir |
|
|
name: Nano BEIR |
|
|
dataset: |
|
|
name: NanoBEIR mean |
|
|
type: NanoBEIR_mean |
|
|
metrics: |
|
|
- type: MaxSim_accuracy@1 |
|
|
value: 0.6085714285714284 |
|
|
name: Maxsim Accuracy@1 |
|
|
- type: MaxSim_accuracy@3 |
|
|
value: 0.7982731554160125 |
|
|
name: Maxsim Accuracy@3 |
|
|
- type: MaxSim_accuracy@5 |
|
|
value: 0.8521821036106751 |
|
|
name: Maxsim Accuracy@5 |
|
|
- type: MaxSim_accuracy@10 |
|
|
value: 0.9123076923076924 |
|
|
name: Maxsim Accuracy@10 |
|
|
- type: MaxSim_precision@1 |
|
|
value: 0.6085714285714284 |
|
|
name: Maxsim Precision@1 |
|
|
- type: MaxSim_precision@3 |
|
|
value: 0.3783464154892726 |
|
|
name: Maxsim Precision@3 |
|
|
- type: MaxSim_precision@5 |
|
|
value: 0.28982731554160124 |
|
|
name: Maxsim Precision@5 |
|
|
- type: MaxSim_precision@10 |
|
|
value: 0.1959748822605965 |
|
|
name: Maxsim Precision@10 |
|
|
- type: MaxSim_recall@1 |
|
|
value: 0.36670929064638114 |
|
|
name: Maxsim Recall@1 |
|
|
- type: MaxSim_recall@3 |
|
|
value: 0.5353239734961824 |
|
|
name: Maxsim Recall@3 |
|
|
- type: MaxSim_recall@5 |
|
|
value: 0.5969746394737162 |
|
|
name: Maxsim Recall@5 |
|
|
- type: MaxSim_recall@10 |
|
|
value: 0.6704721271920213 |
|
|
name: Maxsim Recall@10 |
|
|
- type: MaxSim_ndcg@10 |
|
|
value: 0.6452361700253275 |
|
|
name: Maxsim Ndcg@10 |
|
|
- type: MaxSim_mrr@10 |
|
|
value: 0.7154429119633202 |
|
|
name: Maxsim Mrr@10 |
|
|
- type: MaxSim_map@100 |
|
|
value: 0.5654203608463307 |
|
|
name: Maxsim Map@100 |
|
|
--- |
|
|
|
|
|
<div align="center"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/609bbe2f4932693ca2009d6a/xn21ll7YRj0ZftBli3-T5.jpeg" width="600" height="auto"> |
|
|
|
|
|
|
|
|
[](https://lighton.ai) |
|
|
[](https://www.linkedin.com/company/lighton/) |
|
|
[](https://x.com/LightOnIO) |
|
|
|
|
|
📄 [Paper](https://arxiv.org/abs/2602.16609) | 📝 [Blog](https://huggingface.co/blog/lightonai/colbert-zero) | 📚 [Collection](https://huggingface.co/collections/lightonai/colbert-zero) |
|
|
|
|
|
</div> |
|
|
|
|
|
|
|
|
# ColBERT-Zero |
|
|
|
|
|
> 🎯 **TL;DR**: First large-scale fully pre-trained ColBERT model using only public data. Achieves **55.43 nDCG@10** on the BEIR benchmark, outperforming GTE-ModernColBERT and gte-modernbert-base, which were trained on closed, stronger data. **New SOTA on BEIR for models <150M parameters**.
|
|
|
|
|
|
|
|
## Why ColBERT-Zero? |
|
|
|
|
|
Late interaction (ColBERT / multi-vector) models have clear advantages in out-of-domain generalization, long-context handling, and reasoning-intensive retrieval. Yet they remain undertrained: current state-of-the-art ColBERT models (e.g., [GTE-ModernColBERT](https://huggingface.co/lightonai/GTE-ModernColBERT-v1) and [ColBERT-small](https://huggingface.co/answerdotai/answerai-colbert-small-v1)) are simply built by bolting a small knowledge distillation step onto a strong dense (single-vector) model. Even recent efforts like [mxbai-edge-colbert-v0](https://huggingface.co/collections/mixedbread-ai/mxbai-edge-colbert-v0-series) perform all early training stages in a single-vector setting, only switching to the multi-vector objective at the very end.
|
|
|
|
|
**This leaves a lot of performance on the table.** ColBERT-Zero demonstrates that performing contrastive pre-training directly in the multi-vector setting, rather than treating it as an afterthought, unlocks a significantly higher performance ceiling. Trained exclusively on public data ([Nomic-embed](https://arxiv.org/abs/2402.01613) dataset mixture), [ColBERT-Zero](https://huggingface.co/lightonai/ColBERT-Zero) overcomes a 2.4-point data quality disadvantage to outperform models trained on proprietary, closed-source data. For detailed results, please have a look at our [blogpost](https://huggingface.co/blog/lightonai/colbert-zero/) and the [paper](https://arxiv.org/abs/2602.16609). All the [models](https://huggingface.co/collections/lightonai/colbert-zero) (including intermediate checkpoints) as well as the [training code](https://github.com/lightonai/pylate/tree/main/examples/train/ColBERT-zero) are released under an Apache 2.0 license.
|
|
|
|
|
## Controlled Comparison Design |
|
|
|
|
|
We deliberately trained on the public [Nomic-embed](https://arxiv.org/abs/2402.01613) data mixture for a strategic reason: Nomic has already trained a dense ModernBERT model ([ModernBERT-embed](https://huggingface.co/nomic-ai/modernbert-embed-base)) on this exact data. This lets us compare dense vs. multi-vector training with the **same data, same base model ([ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base)), and same pipeline**. The only variable is whether the contrastive phases are performed in the dense or multi-vector setting. |
|
|
|
|
|
This design reveals a striking result: the dense baseline trained on Nomic data scores 52.89, while the one trained on GTE's proprietary data scores 55.33: a 2.4-point data quality gap. Despite this disadvantage, ColBERT-Zero's full multi-vector pre-training pipeline closes and surpasses this gap, reaching **55.43 nDCG@10**. |
|
|
|
|
|
## The Three-Phase Training Pipeline |
|
|
|
|
|
The development followed a three-phase pipeline, each providing a different type of learning signal: |
|
|
|
|
|
### Phase 1 - Unsupervised Contrastive Pre-training |
|
|
We began with the [nomic-embed-unsupervised-data](https://huggingface.co/datasets/nomic-ai/nomic-embed-unsupervised-data) dataset. Using [PyLate](https://lightonai.github.io/pylate/)'s **GradCache** implementation to scale per-GPU batch size without VRAM constraints, combined with **cross-GPU gathering** of representations, we reached effective batch sizes of **~16k**, required for unsupervised training to produce plausible in-batch hard negatives. Unlike dense training, the multi-vector objective allows the encoder to learn fine-grained token importance from the very first phase. |
|
|
|
|
|
### Phase 2 - Supervised Contrastive Fine-tuning |
|
|
We refined the model using the [nomic-embed-supervised-data](https://huggingface.co/datasets/nomic-ai/nomic-embed-supervised-data) dataset. This stage introduced mined hard negatives: documents that are superficially similar to the query but not actually relevant. This teaches the model to handle nuance by prioritizing the specific keywords and contextual tokens most indicative of a true match.
|
|
|
|
|
### Phase 3 - Knowledge Distillation (KD) |
|
|
The final stage used the [ms-marco-en-bge](https://huggingface.co/datasets/lightonai/ms-marco-en-bge) dataset. We leveraged a powerful Gemma-based model as a teacher, allowing our student models to learn to replicate complex reasoning scores via the efficient MaxSim operator. |
|
|
|
|
|
## Key Findings |
|
|
|
|
|
### 1. The Standard Recipe Leaves Performance on the Table |
|
|
The KD-only approach (the current industry standard) scores 54.09, lagging behind full pre-training by **1.3 points**. A simple distillation step is insufficient for optimal multi-vector performance. |
|
|
|
|
|
### 2. Supervised + KD Is the Efficiency Sweet Spot |
|
|
By running a supervised contrastive step in the multi-vector setting before distillation, we reach **55.12 nDCG@10**, closing most of the gap with the fully pre-trained model (55.43). This costs **~40 GH200-hours instead of ~408**: roughly **10× cheaper for 99.4% of the performance**. |
|
|
<div align="center"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/609bbe2f4932693ca2009d6a/V1_hTZ0VnJHldfd3Ip-Jm.png" width="600" height="auto"> |
|
|
</div> |
|
|
|
|
|
### 3. Prompt Alignment Is Non-Negotiable |
|
|
Nomic's base models are pre-trained with asymmetric prompts (`search_query:` and `search_document:`). While ColBERT has its own asymmetric mechanism via `[Q]` and `[D]` markers, we found: |
|
|
- **Stripping pre-training prompts during fine-tuning** causes significant performance degradation. |
|
|
- **Adding prompts to a model not pre-trained with them** also hurts performance. |
|
|
- **Even with perfect alignment**, prompts provide an intrinsic benefit: full ColBERT pre-training with prompts (55.43) vs. without prompts (54.61), no mismatch in either case, shows a meaningful 0.82-point gap. |
|
|
|
|
|
<div align="center"> |
|
|
<img src="https://cdn-uploads.huggingface.co/production/uploads/609bbe2f4932693ca2009d6a/uZoRA7SwisR-svi4lPDTi.png" width="600" height="auto"> |
|
|
</div> |
|
|
|
|
|
**Why do prompts help?** Our leading hypothesis is that prompt tokens act as **implicit query expansion**: extra slots that don't carry specific meaning but let the model store global information about the sequence. The original ColBERT used `[PAD]` tokens for this purpose, but modern Flash Attention implementations broke this trick (masked tokens no longer produce usable embeddings). Explicit prompt tokens may be quietly re-enabling it. |
|
|
|
|
|
**Practical takeaway:** Always align your prompts with the base model's pre-training setup. Misalignment is one of the easiest ways to silently lose performance. Note that this sensitivity decreases with stronger downstream fine-tuning: with enough training, the model can adapt to an initial mismatch. |
|
|
|
|
|
## Model Lineup |
|
|
|
|
|
### The Main Models (ColBERT-Zero) |
|
|
`ColBERT-Zero` utilizes the full 3-phase pipeline with strict prompt alignment, **achieving 55.43 nDCG@10 on BEIR**, setting a new SOTA for models <150M parameters. We also provide `ColBERT-Zero-noprompts`, the same pipeline without asymmetric prompts, to study the impact of query expansion on multi-vector performance. |
|
|
|
|
|
### The cheap-to-train ones (ModernColBERT-embed-base) |
|
|
These models represent the practical sweet spot. By skipping the expensive unsupervised phase, `ModernColBERT-embed-base` (Supervised + KD) achieves ~99% of the flagship's performance at only ~10% of the compute cost. For reference, `ModernColBERT-embed-base-kd-only` performs only the distillation step on a supervised dense base.
|
|
|
|
|
### Intermediate Checkpoints |
|
|
For researchers studying the incremental impact of each phase and prompt alignment, we release several ablation variants: `ColBERT-Zero-supervised`, `ColBERT-Zero-unsupervised` (and their `-noprompts` versions), and `ModernColBERT-embed-base-supervised`. |
|
|
|
|
|
|
|
|
#### Full Performance on BEIR |
|
|
|
|
|
<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<style> |
|
|
.beir-wrap { overflow-x: auto; font-family: system-ui, sans-serif; width: 100%; display: block; -webkit-overflow-scrolling: touch; } |
|
|
.beir-wrap table { border-collapse: collapse; font-size: 0.70rem; white-space: nowrap; background: #fff; box-shadow: 0 1px 4px rgba(0,0,0,.1); border-radius: 8px; min-width: max-content; } |
|
|
.beir-wrap th, .beir-wrap td { padding: 7px 10px; text-align: center; border-bottom: 1px solid #e9ecef; } |
|
|
.beir-wrap td:first-child, .beir-wrap th:first-child { text-align: left; min-width: 260px; } |
|
|
.beir-wrap th { background: #1e293b; color: #fff; font-weight: 600; } |
|
|
.beir-wrap th.avg-col { background: #f59e0b; color: #1e293b; font-weight: 700; } |
|
|
.beir-wrap td.avg-col { font-weight: 700; font-size: 0.78rem; color: #1e293b; background: #fef3c7; border-left: 2px solid #f59e0b; border-right: 2px solid #f59e0b; } |
|
|
.beir-wrap tr:last-child td.avg-col { border-bottom: 2px solid #f59e0b; } |
|
|
.beir-wrap .section-row td { background: #334155; color: #94a3b8; font-weight: 600; font-size: 0.72rem; letter-spacing: .05em; text-transform: uppercase; padding: 5px 10px; } |
|
|
.beir-wrap strong { color: #0f172a; } |
|
|
.beir-wrap tbody tr:not(.section-row):hover td { background: #f1f5f9; } |
|
|
.beir-wrap tbody tr:not(.section-row):hover td.avg-col { background: #fde68a; } |
|
|
.beir-wrap a { color: #3b82f6; text-decoration: none; } |
|
|
.beir-wrap a:hover { text-decoration: underline; } |
|
|
</style> |
|
|
</head> |
|
|
<body> |
|
|
<div class="beir-wrap"> |
|
|
<table> |
|
|
<thead> |
|
|
<tr> |
|
|
<th>Model</th> |
|
|
<th class="avg-col">Avg</th> |
|
|
<th>FiQA</th><th>NFCorpus</th><th>TREC-COVID</th><th>Touche</th><th>ArguAna</th><th>Quora</th><th>SCIDOCS</th><th>SciFact</th><th>NQ</th><th>ClimateFEVER</th><th>HotpotQA</th><th>DBPedia</th><th>CQADupstack</th><th>FEVER</th><th>MSMARCO</th> |
|
|
</tr> |
|
|
</thead> |
|
|
<tbody> |
|
|
<tr class="section-row"><td colspan="17">Baselines</td></tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/nomic-ai/modernbert-embed-base-unsupervised">ModernBERT-embed-unsupervised</a></td> |
|
|
<td class="avg-col">47.05</td> |
|
|
<td>42.53</td><td>35.33</td><td>68.44</td><td>18.58</td><td>48.82</td><td>88.63</td><td>19.83</td><td>72.30</td><td>46.32</td><td>22.97</td><td>60.00</td><td>37.97</td><td>42.40</td><td>67.39</td><td>34.23</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/nomic-ai/modernbert-embed-base">ModernBERT-embed-supervised</a></td> |
|
|
<td class="avg-col">52.89</td> |
|
|
<td>40.59</td><td>33.40</td><td><strong>84.15</strong></td><td>31.91</td><td>48.96</td><td><strong>88.85</strong></td><td>18.59</td><td>69.63</td><td>62.15</td><td>35.67</td><td>67.11</td><td>41.50</td><td>42.08</td><td>87.35</td><td>41.47</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/GTE-ModernColBERT-v1">GTE-ModernColBERT</a></td> |
|
|
<td class="avg-col">54.67</td> |
|
|
<td>45.28</td><td><strong>37.93</strong></td><td>83.59</td><td>31.23</td><td>48.51</td><td>86.61</td><td>19.06</td><td>76.34</td><td>61.80</td><td>30.62</td><td>77.32</td><td>48.03</td><td>41.00</td><td>87.44</td><td>45.32</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/Alibaba-NLP/gte-modernbert-base">gte-modernbert-base</a></td> |
|
|
<td class="avg-col">55.33</td> |
|
|
<td><strong>48.81</strong></td><td>36.44</td><td>81.95</td><td>21.68</td><td><strong>72.68</strong></td><td>88.55</td><td>21.29</td><td><strong>77.40</strong></td><td>57.62</td><td><strong>37.74</strong></td><td>69.47</td><td>41.79</td><td>42.63</td><td><strong>91.03</strong></td><td>40.90</td> |
|
|
</tr> |
|
|
|
|
|
<tr class="section-row"><td colspan="17">KD from dense supervised</td></tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ModernColBERT-embed-base-kd-only">ModernColBERT-embed-base-kd-only</a></td> |
|
|
<td class="avg-col">54.09</td> |
|
|
<td>42.51</td><td>37.01</td><td>79.52</td><td>34.58</td><td>51.75</td><td>87.67</td><td>18.15</td><td>75.04</td><td>61.45</td><td>28.31</td><td>76.70</td><td>47.54</td><td>40.68</td><td>84.82</td><td>45.57</td> |
|
|
</tr> |
|
|
|
|
|
<tr class="section-row"><td colspan="17">Supervised + KD from dense unsupervised</td></tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ModernColBERT-embed-base-supervised">ModernColBERT-embed-base-supervised</a></td> |
|
|
<td class="avg-col">50.72</td> |
|
|
<td>40.09</td><td>35.56</td><td>71.12</td><td>25.53</td><td>44.27</td><td>86.96</td><td>18.19</td><td>73.78</td><td>58.89</td><td>32.95</td><td>71.49</td><td>43.23</td><td>42.55</td><td>70.51</td><td>45.72</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ModernColBERT-embed-base">ModernColBERT-embed-base</a></td> |
|
|
<td class="avg-col">55.12</td> |
|
|
<td>41.50</td><td>36.51</td><td>77.46</td><td>33.77</td><td>52.45</td><td>86.26</td><td>18.66</td><td>74.90</td><td>62.24</td><td>37.27</td><td><strong>80.07</strong></td><td><strong>48.27</strong></td><td>41.60</td><td>89.71</td><td><strong>46.17</strong></td> |
|
|
</tr> |
|
|
|
|
|
<tr class="section-row"><td colspan="17">ColBERT-Zero</td></tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero-unsupervised">Unsupervised</a></td> |
|
|
<td class="avg-col">51.44</td> |
|
|
<td>45.38</td><td>36.88</td><td>67.82</td><td>22.59</td><td>51.53</td><td>87.78</td><td>22.30</td><td>76.76</td><td>58.80</td><td>24.24</td><td>68.29</td><td>43.16</td><td><strong>45.76</strong></td><td>81.58</td><td>38.78</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero-supervised">Supervised</a></td> |
|
|
<td class="avg-col">51.81</td> |
|
|
<td>42.45</td><td>35.60</td><td>74.72</td><td>23.83</td><td>41.81</td><td>87.19</td><td>19.85</td><td>73.71</td><td>61.95</td><td>35.01</td><td>71.37</td><td>46.20</td><td>45.16</td><td>72.61</td><td>45.68</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero">Distilled</a></td> |
|
|
<td class="avg-col"><strong>55.43</strong></td> |
|
|
<td>42.62</td><td>37.28</td><td>78.69</td><td>36.13</td><td>53.07</td><td>85.24</td><td>19.88</td><td>76.50</td><td>61.66</td><td>35.72</td><td>79.41</td><td>47.48</td><td>41.34</td><td>90.59</td><td>45.80</td> |
|
|
</tr> |
|
|
|
|
|
<tr class="section-row"><td colspan="17">ColBERT-Zero-noprompts</td></tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero-unsupervised-noprompts">Unsupervised</a></td> |
|
|
<td class="avg-col">51.70</td> |
|
|
<td>45.31</td><td>34.72</td><td>73.55</td><td>23.26</td><td>52.56</td><td>88.15</td><td><strong>22.63</strong></td><td>76.10</td><td>59.18</td><td>24.24</td><td>66.66</td><td>42.61</td><td>45.56</td><td>81.88</td><td>39.15</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero-supervised-noprompts">Supervised</a></td> |
|
|
<td class="avg-col">52.39</td> |
|
|
<td>43.36</td><td>36.01</td><td>72.42</td><td>23.79</td><td>47.42</td><td>87.79</td><td>21.30</td><td>73.85</td><td><strong>62.25</strong></td><td>31.61</td><td>70.32</td><td>44.07</td><td>44.03</td><td>85.54</td><td>42.11</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td><a href="https://huggingface.co/lightonai/ColBERT-Zero-noprompts">Distilled</a></td> |
|
|
<td class="avg-col">54.61</td> |
|
|
<td>43.14</td><td>36.60</td><td>78.60</td><td><strong>36.36</strong></td><td>49.49</td><td>88.05</td><td>19.13</td><td>76.42</td><td>61.73</td><td>32.70</td><td>76.99</td><td>47.69</td><td>40.21</td><td>85.97</td><td>46.01</td> |
|
|
</tr> |
|
|
</tbody> |
|
|
</table> |
|
|
</div> |
|
|
</body> |
|
|
</html> |
|
|
|
|
|
|
|
|
## Limitations & Discussion |
|
|
|
|
|
- **Data-specific findings.** We deliberately used the Nomic Embed data mixture for controlled comparison. Some observations (particularly around prompt sensitivity) may not generalize to different or stronger training configurations. |
|
|
- **Scale vs. objective.** The gains from multi-vector pre-training likely reflect *more training time* in the multi-vector setting, rather than the contrastive objective itself. Performing KD alone at a larger scale might yield similar or superior results due to the higher quality of the distillation signal. Our study uses the conventional setup where training scale is inversely proportional to signal quality, reflecting the higher cost of generating high-quality labels. |
|
|
- **Prompt sensitivity decreases with stronger fine-tuning.** When experimenting with stronger fine-tuning data (e.g., NV-Retriever), adding prompts on top of a model pre-trained without them did not degrade results the way it did with ColBERT-Zero. With enough downstream training, the model can adapt to an initial mismatch. |
|
|
|
|
|
## Serving at Scale |
|
|
|
|
|
For production deployment of ColBERT-Zero and other multi-vector models, check out [NextPlaid](https://github.com/lightonai/nextplaid) and [FastPlaid](https://github.com/lightonai/fastplaid), our production-grade engines for multi-vector retrieval. |
|
|
|
|
|
## Resources |
|
|
|
|
|
- 📦 **All checkpoints:** [HF Collection](https://huggingface.co/collections/lightonai/colbert-zero) - every phase, with and without prompts |
|
|
- 💻 **Code:** [Training boilerplates](https://github.com/lightonai/pylate/tree/main/examples/train/ColBERT-zero) |
|
|
- 📄 **Paper:** [ArXiv](https://arxiv.org/abs/2602.16609) |
|
|
|
|
|
|
|
|
## Model Details |
|
|
|
|
|
### Model Description |
|
|
- **Model Type:** PyLate model |
|
|
<!-- - **Base model:** [Unknown](https://huggingface.co/unknown) --> |
|
|
- **Document Length:** 187 tokens |
|
|
- **Query Length:** 39 tokens |
|
|
- **Output Dimensionality:** 128 tokens |
|
|
- **Similarity Function:** MaxSim |
|
|
- **Training Datasets:** |
|
|
- reddit_title_body |
|
|
- amazon_reviews |
|
|
- paq |
|
|
- s2orc_citation_titles |
|
|
- s2orc_title_abstract |
|
|
- s2orc_abstract_citation |
|
|
- s2orc_abstract_body |
|
|
- wikianswers |
|
|
- wikipedia |
|
|
- gooaq |
|
|
- codesearch |
|
|
- yahoo_title_answer |
|
|
- agnews |
|
|
- amazonqa |
|
|
- yahoo_qa |
|
|
- yahoo_title_question |
|
|
- ccnews |
|
|
- npr |
|
|
- eli5 |
|
|
- cnn |
|
|
- stackexchange_duplicate_questions |
|
|
- stackexchange_title_body |
|
|
- stackexchange_body_body |
|
|
- sentence_compression |
|
|
- wikihow |
|
|
- altlex |
|
|
- quora |
|
|
- simplewiki |
|
|
- squad |
|
|
<!-- - **Language:** Unknown --> |
|
|
<!-- - **License:** Unknown --> |
|
|
|
|
|
### Model Sources |
|
|
|
|
|
- **Documentation:** [PyLate Documentation](https://lightonai.github.io/pylate/) |
|
|
- **Repository:** [PyLate on GitHub](https://github.com/lightonai/pylate) |
|
|
- **Hugging Face:** [PyLate models on Hugging Face](https://huggingface.co/models?library=PyLate) |
|
|
|
|
|
### Full Model Architecture |
|
|
|
|
|
``` |
|
|
ColBERT( |
|
|
(0): Transformer({'max_seq_length': 186, 'do_lower_case': False, 'architecture': 'ModernBertModel'}) |
|
|
(1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False}) |
|
|
) |
|
|
``` |
|
|
|
|
|
## Usage |
|
|
First install the PyLate library: |
|
|
|
|
|
```bash |
|
|
pip install -U pylate |
|
|
``` |
|
|
|
|
|
> [!WARNING] |
|
|
> **Prompt alignment is critical for ColBERT-Zero models.** You **must** use `prompt_name="query"` when encoding queries and `prompt_name="document"` when encoding documents. ColBERT-Zero was pre-trained with asymmetric prompts (`search_query:` / `search_document:`), and stripping them causes significant performance degradation.
|
|
|
|
|
### Retrieval |
|
|
|
|
|
Use this model with PyLate to index and retrieve documents. The index uses [FastPLAID](https://github.com/lightonai/fast-plaid) for efficient similarity search. |
|
|
|
|
|
#### Indexing documents |
|
|
|
|
|
Load the ColBERT model and initialize the PLAID index, then encode and index your documents: |
|
|
|
|
|
```python |
|
|
from pylate import indexes, models, retrieve |
|
|
|
|
|
# Step 1: Load the ColBERT model |
|
|
model = models.ColBERT( |
|
|
model_name_or_path="pylate_model_id", |
|
|
) |
|
|
|
|
|
# Step 2: Initialize the PLAID index |
|
|
index = indexes.PLAID( |
|
|
index_folder="pylate-index", |
|
|
index_name="index", |
|
|
override=True, # This overwrites the existing index if any |
|
|
) |
|
|
|
|
|
# Step 3: Encode the documents |
|
|
documents_ids = ["1", "2", "3"] |
|
|
documents = ["document 1 text", "document 2 text", "document 3 text"] |
|
|
|
|
|
documents_embeddings = model.encode( |
|
|
documents, |
|
|
batch_size=32, |
|
|
is_query=False, # Ensure that it is set to False to indicate that these are documents, not queries |
|
|
prompt_name="document", # ⚠️ Required for ColBERT-Zero! Do not omit. |
|
|
show_progress_bar=True, |
|
|
) |
|
|
|
|
|
# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids |
|
|
index.add_documents( |
|
|
documents_ids=documents_ids, |
|
|
documents_embeddings=documents_embeddings, |
|
|
) |
|
|
``` |
|
|
|
|
|
Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it: |
|
|
|
|
|
```python |
|
|
# To load an index, simply instantiate it with the correct folder/name and without overriding it |
|
|
index = indexes.PLAID( |
|
|
index_folder="pylate-index", |
|
|
index_name="index", |
|
|
) |
|
|
``` |
|
|
|
|
|
#### Retrieving top-k documents for queries |
|
|
|
|
|
Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries. |
|
|
To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores: |
|
|
|
|
|
> [!WARNING]


> Always pass `prompt_name="query"` for queries and `prompt_name="document"` for documents. Omitting these prompts will silently degrade retrieval quality.
|
|
|
|
|
```python |
|
|
# Step 1: Initialize the ColBERT retriever |
|
|
retriever = retrieve.ColBERT(index=index) |
|
|
|
|
|
# Step 2: Encode the queries |
|
|
queries_embeddings = model.encode( |
|
|
["query for document 3", "query for document 1"], |
|
|
batch_size=32, |
|
|
    is_query=True,  # Ensure that it is set to True to indicate that these are queries
|
|
prompt_name="query", # ⚠️ Required for ColBERT-Zero! Do not omit. |
|
|
show_progress_bar=True, |
|
|
) |
|
|
|
|
|
# Step 3: Retrieve top-k documents |
|
|
scores = retriever.retrieve( |
|
|
queries_embeddings=queries_embeddings, |
|
|
k=10, # Retrieve the top 10 matches for each query |
|
|
) |
|
|
``` |
|
|
|
|
|
### Reranking |
|
|
> [!WARNING] |
|
|
> Always pass `prompt_name="query"` for queries and `prompt_name="document"` for documents. Omitting these prompts will silently degrade retrieval quality. |
|
|
|
|
|
If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the `rank` function and pass the queries and documents to rerank:
|
|
|
|
|
|
|
|
```python |
|
|
from pylate import rank, models |
|
|
|
|
|
queries = [ |
|
|
"query A", |
|
|
"query B", |
|
|
] |
|
|
|
|
|
documents = [ |
|
|
["document A", "document B"], |
|
|
["document 1", "document C", "document B"], |
|
|
] |
|
|
|
|
|
documents_ids = [ |
|
|
[1, 2], |
|
|
[1, 3, 2], |
|
|
] |
|
|
|
|
|
model = models.ColBERT( |
|
|
model_name_or_path="pylate_model_id", |
|
|
) |
|
|
|
|
|
queries_embeddings = model.encode( |
|
|
queries, |
|
|
is_query=True, |
|
|
prompt_name="query" # ⚠️ Required for ColBERT-Zero! Do not omit. |
|
|
) |
|
|
|
|
|
documents_embeddings = model.encode( |
|
|
documents, |
|
|
is_query=False, |
|
|
prompt_name="document" # ⚠️ Required for ColBERT-Zero! Do not omit. |
|
|
) |
|
|
|
|
|
reranked_documents = rank.rerank( |
|
|
documents_ids=documents_ids, |
|
|
queries_embeddings=queries_embeddings, |
|
|
documents_embeddings=documents_embeddings, |
|
|
) |
|
|
``` |
|
|
|
|
|
<!-- |
|
|
### Direct Usage (Transformers) |
|
|
|
|
|
<details><summary>Click to see the direct usage in Transformers</summary> |
|
|
|
|
|
</details> |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Downstream Usage (Sentence Transformers) |
|
|
|
|
|
You can finetune this model on your own dataset. |
|
|
|
|
|
<details><summary>Click to expand</summary> |
|
|
|
|
|
</details> |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Out-of-Scope Use |
|
|
|
|
|
*List how the model may foreseeably be misused and address what users ought not to do with the model.* |
|
|
--> |
|
|
|
|
|
## Evaluation |
|
|
|
|
|
### Metrics |
|
|
|
|
|
#### Py Late Information Retrieval |
|
|
* Dataset: `['NanoClimateFEVER', 'NanoDBPedia', 'NanoFEVER', 'NanoFiQA2018', 'NanoHotpotQA', 'NanoMSMARCO', 'NanoNFCorpus', 'NanoNQ', 'NanoQuoraRetrieval', 'NanoSCIDOCS', 'NanoArguAna', 'NanoSciFact', 'NanoTouche2020']` |
|
|
* Evaluated with <code>pylate.evaluation.pylate_information_retrieval_evaluator.PyLateInformationRetrievalEvaluator</code> |
|
|
|
|
|
| Metric | NanoClimateFEVER | NanoDBPedia | NanoFEVER | NanoFiQA2018 | NanoHotpotQA | NanoMSMARCO | NanoNFCorpus | NanoNQ | NanoQuoraRetrieval | NanoSCIDOCS | NanoArguAna | NanoSciFact | NanoTouche2020 | |
|
|
|:--------------------|:-----------------|:------------|:-----------|:-------------|:-------------|:------------|:-------------|:-----------|:-------------------|:------------|:------------|:------------|:---------------| |
|
|
| MaxSim_accuracy@1 | 0.42 | 0.8 | 0.9 | 0.48 | 0.88 | 0.44 | 0.4 | 0.58 | 0.96 | 0.54 | 0.26 | 0.68 | 0.5714 | |
|
|
| MaxSim_accuracy@3 | 0.62 | 0.96 | 0.96 | 0.7 | 0.98 | 0.7 | 0.64 | 0.78 | 1.0 | 0.7 | 0.64 | 0.82 | 0.8776 | |
|
|
| MaxSim_accuracy@5 | 0.64 | 0.98 | 1.0 | 0.74 | 0.98 | 0.74 | 0.7 | 0.86 | 1.0 | 0.84 | 0.82 | 0.86 | 0.9184 | |
|
|
| MaxSim_accuracy@10 | 0.76 | 1.0 | 1.0 | 0.8 | 1.0 | 0.9 | 0.74 | 0.92 | 1.0 | 0.92 | 0.9 | 0.92 | 1.0 | |
|
|
| MaxSim_precision@1 | 0.42 | 0.8 | 0.9 | 0.48 | 0.88 | 0.44 | 0.4 | 0.58 | 0.96 | 0.54 | 0.26 | 0.68 | 0.5714 | |
|
|
| MaxSim_precision@3 | 0.2267 | 0.6933 | 0.34 | 0.3333 | 0.56 | 0.2333 | 0.3933 | 0.26 | 0.42 | 0.3667 | 0.2133 | 0.2867 | 0.5918 | |
|
|
| MaxSim_precision@5 | 0.144 | 0.604 | 0.216 | 0.24 | 0.344 | 0.148 | 0.36 | 0.18 | 0.264 | 0.328 | 0.164 | 0.188 | 0.5878 | |
|
|
| MaxSim_precision@10 | 0.092 | 0.518 | 0.108 | 0.142 | 0.178 | 0.09 | 0.294 | 0.098 | 0.136 | 0.214 | 0.09 | 0.104 | 0.4837 | |
|
|
| MaxSim_recall@1 | 0.2057 | 0.1055 | 0.8367 | 0.2712 | 0.44 | 0.44 | 0.0231 | 0.54 | 0.8373 | 0.1137 | 0.26 | 0.655 | 0.0391 | |
|
|
| MaxSim_recall@3 | 0.284 | 0.21 | 0.9233 | 0.4652 | 0.84 | 0.7 | 0.0676 | 0.72 | 0.972 | 0.2267 | 0.64 | 0.79 | 0.1204 | |
|
|
| MaxSim_recall@5 | 0.2973 | 0.2682 | 0.9733 | 0.5228 | 0.86 | 0.74 | 0.1086 | 0.81 | 0.9833 | 0.3367 | 0.82 | 0.845 | 0.1954 | |
|
|
| MaxSim_recall@10 | 0.374 | 0.3823 | 0.9733 | 0.6103 | 0.89 | 0.9 | 0.1423 | 0.88 | 0.9933 | 0.4387 | 0.9 | 0.92 | 0.3119 | |
|
|
| **MaxSim_ndcg@10** | **0.3518** | **0.6608** | **0.9268** | **0.5355** | **0.8602** | **0.6668** | **0.3421** | **0.7233** | **0.9747** | **0.4296** | **0.5817** | **0.8003** | **0.5345** | |
|
|
| MaxSim_mrr@10 | 0.5177 | 0.8742 | 0.9357 | 0.6044 | 0.9325 | 0.5937 | 0.517 | 0.6988 | 0.98 | 0.6561 | 0.4782 | 0.7666 | 0.7458 | |
|
|
| MaxSim_map@100 | 0.2944 | 0.5341 | 0.9017 | 0.4731 | 0.8159 | 0.5995 | 0.1513 | 0.6638 | 0.9622 | 0.333 | 0.4821 | 0.7585 | 0.3808 | |
|
|
|
|
|
#### Nano BEIR |
|
|
* Dataset: `NanoBEIR_mean` |
|
|
* Evaluated with <code>pylate.evaluation.nano_beir_evaluator.NanoBEIREvaluator</code> |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| MaxSim_accuracy@1 | 0.6086 | |
|
|
| MaxSim_accuracy@3 | 0.7983 | |
|
|
| MaxSim_accuracy@5 | 0.8522 | |
|
|
| MaxSim_accuracy@10 | 0.9123 | |
|
|
| MaxSim_precision@1 | 0.6086 | |
|
|
| MaxSim_precision@3 | 0.3783 | |
|
|
| MaxSim_precision@5 | 0.2898 | |
|
|
| MaxSim_precision@10 | 0.196 | |
|
|
| MaxSim_recall@1 | 0.3667 | |
|
|
| MaxSim_recall@3 | 0.5353 | |
|
|
| MaxSim_recall@5 | 0.597 | |
|
|
| MaxSim_recall@10 | 0.6705 | |
|
|
| **MaxSim_ndcg@10** | **0.6452** | |
|
|
| MaxSim_mrr@10 | 0.7154 | |
|
|
| MaxSim_map@100 | 0.5654 | |
|
|
|
|
|
<!-- |
|
|
## Bias, Risks and Limitations |
|
|
|
|
|
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Recommendations |
|
|
|
|
|
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.* |
|
|
--> |
|
|
|
|
|
## Training Details |
|
|
|
|
|
### Training Datasets |
|
|
|
|
|
#### reddit_title_body |
|
|
|
|
|
* Dataset: reddit_title_body |
|
|
* Size: 66,204,599 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 18.38 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 20 tokens</li><li>mean: 38.42 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:----------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Prospective UNCW transfer?</code> | <code>Hey Reddit, I am transferring from Florida State to hopefully UNCW this spring. What can you guys tell me about the school that would be helpful? Some background info: I am transferring due to the fact that the only thing to do at FSU is workout and drink (not much of a drinker). I am majoring in biology and have a 3.7 GPA. Anything that you feel is useful to know about the school is appreciated. Thanks guys.</code> | |
|
|
| <code>Calling for another Meet-up! The force is strong.</code> | <code>The time has come. The pull to meet-up with other Jax Redditors is strong, my son. We must use the force and decide where to meet-up. Jax Jedi's do not succumb to the dark side of average places, go with your exceptional suggestions. Yoda say "Must is beer, I say. Welcome are all other suggestions, mmmmmm."</code> | |
|
|
| <code>I see your Best Customer E-Mail Ever, and raise you my e-mail from an appreciative customer.</code> | <code>A little background, I'm a software support tech for a medium-large software company, and usually provide support on our Live Chat feature. You know the one.<br>After chatting with one pleasant customer, several times per day over several weeks, I nominated him/their company for Customer of the Month. When they recieve their "Thanks for being awesome" customer box, I get this email:<br><br>Subject: Epic customer appreciation box<br><br>Body: Guy and the dog with "Oh you!" face.<br><br>I laughed, and laughed. Unsuspecting tech support is floored by internet humor from relatively normal customer.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### amazon_reviews |
|
|
|
|
|
* Dataset: amazon_reviews |
|
|
* Size: 39,357,860 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 6 tokens</li><li>mean: 14.19 tokens</li><li>max: 35 tokens</li></ul> | <ul><li>min: 11 tokens</li><li>mean: 35.54 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>It works well but the headphone apparatus falls too deep ...</code> | <code>It works well but the headphone apparatus falls too deep for any of my headphones to work :( I wish there was an adaptor included to solve this problem</code> | |
|
|
| <code>Very nice frame! Snaps open at the front like a ...</code> | <code>Very nice frame! Snaps open at the front like a real movie poster frame which I think is cool. It worked perfectly for a document I had that was this size, looks great with the green color it is.</code> | |
|
|
| <code>The shoes look very good. Size wise</code> | <code>The shoes look very good. Size wise, they fit well. The ultimate test will be how they last and time will tell.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### paq |
|
|
|
|
|
* Dataset: paq |
|
|
* Size: 53,874,545 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 9 tokens</li><li>mean: 14.67 tokens</li><li>max: 23 tokens</li></ul> | <ul><li>min: 39 tokens</li><li>mean: 39.0 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-----------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>how long does it take to complete the orbit of 70 ophiuchi</code> | <code>70 Ophiuchi sequence dwarf of spectral type K0, while the secondary is an orange dwarf of spectral type K4. The two stars orbit each other at an average distance of 23.2 AUs. But since the orbit is highly elliptical (at e=0.499), the separation between the two varies from 11.4 to 34.8 AUs, with one orbit taking 83.38 years to complete. In 1855, William Stephen Jacob of the Madras Observatory claimed that the orbit of the binary showed an anomaly, and it was "highly probable" that there was a "planetary body in connection with this system". This is the first attempt to use</code> | |
|
|
| <code>who is the author of the switchman</code> | <code>The Switchman The Switchman (Original title: El Guardagujas) is an existentialist short story by Mexican writer Juan José Arreola. The short story was originally published as a "confabulario", a word created in Spanish by Arreola, in 1952, in the collection "Confabulario and Other Inventions". It was republished ten years later along with other published works by Arreola at that time in the collection "El Confabulario total". The story revolves around a "stranger" who wishes to travel to the town of T. by train, but is quickly met by a "switchman" who tells him more and more fantastical stories about the</code> | |
|
|
| <code>what name is given to a narrow vertical aperture in a fortification through which an ar</code> | <code>Arrowslit An arrowslit (often also referred to as an arrow loop, loophole or loop hole, and sometimes a balistarium) is a narrow vertical aperture in a fortification through which an archer can launch arrows. The interior walls behind an arrow loop are often cut away at an oblique angle so that the archer has a wide field of view and field of fire. Arrow slits come in a remarkable variety. A common and recognizable form is the cross, accommodating the use of both the longbow and the crossbow. The narrow vertical aperture permits the archer large degrees of freedom to</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### s2orc_citation_titles |
|
|
|
|
|
* Dataset: s2orc_citation_titles |
|
|
* Size: 7,722,225 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 8 tokens</li><li>mean: 22.77 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 8 tokens</li><li>mean: 22.55 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Purulent pericarditis. Clinical considerations with reference to 26 cases.</code> | <code>Purulent Pericarditis: Report of 2 Cases and Review of the Literature</code> | |
|
|
| <code>High-Resolution Controller Data Performance Measures for Optimizing Divergent Diamond Interchanges and Outcome Assessment for Drone Video</code> | <code>An Advanced Signal Phasing Scheme for Diverging Diamond Interchanges</code> | |
|
|
| <code>Silurian subaqueous slide conglomerate, Addison, Maine</code> | <code>Bimodal Silurian and Lower Devonian volcanic rock assemblages in the Machias-Eastport area, Maine</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### s2orc_title_abstract |
|
|
|
|
|
* Dataset: s2orc_title_abstract |
|
|
* Size: 36,051,582 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 8 tokens</li><li>mean: 20.86 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 20 tokens</li><li>mean: 38.72 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>2′–5′-Oligoadenylates (2–5A) As Mediators of Interferon Action. Synthesis and Biological Activity of New 2–5A Analogues</code> | <code>Double-stranded RNA (dsRNA) is a potent inhibitor of protein synthesis in extracts of interferon-treated cells. One of the mechanisms that has been proposed to explain this inhibition of protein synthesis is by the 2–5A pathway (1). Interferon induces the synthesis of an enzyme, 2–5A synthetase, which upon activation by dsRNA generates 2–5A from ATP. This 2–5A activates a pre-existing endonuclease for cleavage of single-stranded RNA. The biological activity of 2–5A is rapidly lost due to cleavage of the 2′–5′ internucleotide bond by a specific 2′–5′ phosphodiesterase starting at the 3′-end. This rapid cleavage and the poor uptake of 2–5A in intact cells, the latter because of its ionic character, limit the potential of 2–5A as a useful approach to the treatment of virus infections or cancer.</code> | |
|
|
| <code>p-adic L-functions and Bernoulli Numbers</code> | <code>In this chapter we shall construct p-adic analogues of Dirichlet L-functions. Since the usual series for these functions do not converge p-adically, we must resort to another procedure. The values of \( L\left( {s,\chi } \right)\) at negative integers are algebraic, hence may be regarded as lying in an extension of \( {\mathbb{Q}_p}\). We therefore look for a p-adic function which agrees with \( L\left( {s,\chi } \right)\) at the negative integers. With a few minor modifications, this is possible.</code> | |
|
|
| <code>Wood Pile Structure of Three-Dimensional Photonic Crystal Band Gap Characteristics</code> | <code>Based on the plane wave expansion method,wood pile structure three-dimensional photonic crystal band gap characteristics was studied.Silicon material for wood structure photonic crystals,the change in the structure of strip width and length,is obtained when the wood pile structure width of 5μm,7μm height is formed when the band gap structure of wide band gap width,in 0.2899—0.3804Hz,0.0905Hz.Change form wood pile structure in three-dimensional photonic crystal materials,get the germanium material wood structure shape three-dimensional photonic band gap structure in 0.2585—0.3500Hz,the band gap width of 0.0915Hz,band gap compared to silicon and silicon carbide material is wide.Conclusion for the preparation of three-dimensional photonic crystals provide reference.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### s2orc_abstract_citation |
|
|
|
|
|
* Dataset: s2orc_abstract_citation |
|
|
* Size: 7,639,890 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 28 tokens</li><li>mean: 38.97 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 24 tokens</li><li>mean: 38.96 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Abstract Temperature modulated differential scanning calorimetry (TMd.s.c.) was applied in a study of syndiotactic polypropylene. The crystallites melt from their lateral surfaces only and the kinetics shows up in the imaginary part c ″ of the complex specific heat. Theoretical analysis predicts and experiments confirm that c ″ increases linearly with the underlying mean heating rate and the modulation period. Furthermore, it can be shown that c ″ is inversely related to the superheating effective during melting. Use of the relation yields for syndiotactic polypropylene values in agreement with direct measurements employing conventional d.s.c.</code> | <code>Crystal melting behavior of indium and isotactic polypropylene has been examined by differential scanning calorimetry of heat flux type in terms of the heating rate, \(\beta \), dependence. The melting shows the dependence characterized by a power, \(z\), of the shift in peak temperature in proportion to \(\beta ^{\text{z}}\). The power, \(z\), differentiates the melting with and without superheating. For polymer crystal melting, intrinsic nature of the broad melting region with a fractional power, \(z\,\le\,1/2\), due to superheating of melting kinetics has been reconfirmed experimentally. On the other hand, the crystal melting of indium, which is supposed to proceed with negligible superheating, showed the shift in peak temperature with the power in the range of \(1/2\,\le\,z \le\,1\), depending on sample mass, which is due to instrumental thermal lag predicted by the Mraw’s model consisting of lumped elements. The \(\beta \) dependence is influenced by the thermal lag determined by ...</code> | |
|
|
| <code>This article examines anti-racist strategies employed in Finnish children’s literature. The examples from four stories illustrate that certain physical characteristics and cultural markers can become strong signifiers of nationality, that is Finnishness. The characters in these stories have to cope with experiences of exclusion and loneliness before the people around them learn that difference and diversity do not change the fact that all humans are worth the same. However, the paper argues that the intended positive outcome of books with a strong anti-racist agenda threatens to be lost as heavily accentuated moral lessons often become counterproductive. The paper demonstrates some of the changes that have taken place in Finnish children’s literature during the past two decades and addresses significant cultural and societal issues that affect children’s everyday lives.</code> | <code>Abstract: In this article, representations of multiculturalism in Swedish and Finnish picturebooks are examined through the Forskolan Ravlyan and Tatu and Patu series. In the article, multiculturalism is understood and studied with an intersectional approach. This means considering sociocultural categorizations such as ethnicity, gender, nationality and disability to be meaningful to the existing social, political and economic structures of societies. These categorizations are seen to have the power to reproduce and circulate dominant discourses that effect the social inclusion and exclusion of certain groups of people. Thus, the social categories are examined as performative textual discourses, meaning that texts are acknowledged to be not only reflecting, but also creating social reality. Both series present diversity as an integrated part of the story by means of non-explicit multiculturalism. The analysis reveals that both series of books contain representations of diversity that c...</code> | |
|
|
| <code>Rosai–Dorfman disease (RDD) is usually characterized by painless bilateral cervical lymphadenopathy associated with fever and leukocytosis. Although the disease may occur outside lymphnodes, manifestation of skeletal system occurs in less than 8% of cases. In addition, presentation of this disease in a purely skeletal form without lymph nodes involvement is extremely uncommon. This case report describes a 17-year-old female with a pure skeletal presentation of RDD in the fibula. Trocar biopsy was performed, and immunohistochemical staining using S100 and CD68 was done to confirm the diagnosis.</code> | <code>We report a case of extranodal Rosai-Dorfman disease (RDD) (sinus histiocytosis with massive lymphadenopathy) presenting with a solitary active lesion of the femur.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### s2orc_abstract_body |
|
|
|
|
|
* Dataset: s2orc_abstract_body |
|
|
* Size: 6,550,431 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 9 tokens</li><li>mean: 38.92 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 39 tokens</li><li>mean: 39.0 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>One of the goals of the 5G Communication Automotive Research and innovation (5GCAR) project has been to evaluate and propose system architecture enhancements aiming at supporting the strict requirements of vehicle-to-everything (V2X) use cases. In this paper, we provide an overview of 3GPP 5G system architecture, which is used as a baseline architecture in the project, and we present the main architectural enhancements introduced by 5GCAR. The work of the project focused on the following categories: (i) end-to-end security, also including aspects of privacy; (ii) network orchestration and management; (iii) network procedures; (iv) edge computing enhancements; and (v) multi-connectivity cooperation. The enhancements introduced by 5GCAR to above-listed categories are discussed in this paper, while a more detailed analysis of some selected features is presented. Figure 2. Reference point representation of the 5G system architecture [6].The network functions repository function (NRF) is us...</code> | <code><br><br>Introduction<br><br>The automotive sector is considered to be one of the most prominent verticals that will benefit from the capabilities of the upcoming 5G cellular networks [1,2]. Vehicular applications cover a wide range of use cases and thus a large set of associated requirements. Examples include very high data rates and timely service delivery, while also considering ultra-low communication latencies, just to mention a few. Complex scenarios where vehicles communicate among themselves and also with nearby road infrastructure, road users, clouds, etc.-also known as vehicle-to-everything (V2X) communications-will not only leverage 5G network but will play a key role in its design. The H2020 5G PPP Phase 2 project 5G Communication Automotive Research and innovation (5GCAR) [3] worked towards the definition of enhancements in terms of system architecture, security, and privacy, specifically targeting automotive applications. In particular, 5GCAR considered five different classes of use c...</code> | |
|
|
| <code>The Queer History Walking Tour is an annually recurring event during Dublin's official Pride festivities. Created and led by the 'Godfather of Gay,' Tonie Walsh, the walks seek to extend stories from the Irish Queer Archive (IQA) into the everyday fabric of the city, contributing to a processual queering of Irish heteronormative histories. As an activist form of public pedagogy, the walking tour encourages a relational understanding of queer cultural heritage through mobile, embodied, and emotional interactions. This paper argues that the walking tour works as an anarchive that contributes to a growing, intersectional understanding of LGBTQ+ experiences and queer futures, facilitated by peripatetic practices. In response to pervasive cis-male homonormativity at Pride, Dr Mary McAuliffe, a queer feminist woman, is the latest tour guide who includes historical stories of lesbian women, trans people, and gay men. Through engaging with this diversity of historical experiences, guides signa...</code> | <code><br><br>Introduction<br><br>Dublin Pride does not consist of one parade, but two. Every year, Dublin Pride includes a 'mini parade:' a free Queer History Walking Tour created and led by Tonie Walsh. As a founder of the Irish Queer Archive (IQA), co-founder of the Gay Community News (GCN) and long-time gay rights activist, Walsh is well known within the LGBTQ+ community in Ireland as the 'Godfather of Gay' (Mullally, 2018). The tour is highly popular and can draw up to 150 attendees, and sometimes includes collaborations with other historians with their own stories to tell. i The tour includes pausing alongside places of key significance in queer Irish history, be it a historical place that no longer materially exists, or one that has remained unchanged. This paper will draw on Walsh's walking tour to illustrate how walking tours generate a relational understanding of queer cultural heritage through mobile, embodied and emotional interactions with places and other queer people. I argue that, despit...</code> | |
|
|
| <code>A definitive diagnosis of salivary gland tumors is extremely difficult to make without evaluating the entire tumor and conducting immunohistochemical examinations. In this study, we aimed to examine and compare the expression patterns of the tumor protein TP D52 family, including TPD52, TPD53, and TPD54, in salivary gland tumor cells by using immunohistochemical staining. Among over 30 benign and malignant salivary gland tumors with extensive and diverse morphological features and overlapping histological similarities, we selected Warthin s tumor and pleomorphic adenoma to represent benign salivary gland tumors and mucoepidermoid carcinoma to represent malignant ones. Tumor samples were fixed in 10 buffered formalin and embedded in paraffin. Then, immunohistochemical staining was performed using antibodies against TPD52, TPD53, and TPD54. Neither the benign salivary gland tumors nor mucoepidermoid carcinoma stained for TPD52. However, the intensity of TPD53 and TPD54 staining was found...</code> | <code><br><br>Introduction<br><br>The salivary glands are exocrine organs that produce saliva and are complex tissues composed of ductal, acinar, myoepithelial, and basal cells 1 . Collectively called as luminal cells, ductal and acinar cells are present on the luminal side of the salivary duct system. Myoepithelial and basal cells are located on the basement membrane around the luminal cells and are thus called abluminal cells 2 . In general, 3 types of acini namely serous, mucinous, and mixed and ducts i.e., intercalated, striated, and excretory are found in the salivary glands. The acini and intercalated ducts are surrounded by myoepithelial cells, whereas the striated and excretory ducts are surrounded by basal cells 3 .<br><br>Tumors of the salivary glands comprise less than 1 of all neoplasms in the body 4 ; however, there are more than 30 benign and malignant salivary gland tumors with extensive and diverse morphologies yet overlapping histological similarities 4 . Hence, it is extremely difficult to d...</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### wikianswers |
|
|
|
|
|
* Dataset: wikianswers |
|
|
* Size: 10,087,503 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 14.24 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.03 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------| |
|
|
| <code>What is the average weight for a 4'11 14 year old girl?</code> | <code>What is the average weight for a 4' 9 14 year old girl?</code> | |
|
|
| <code>The Fahrenheit temperature reading is 98 degrees on a hot summer day Wh is this reading on the Kelvin scale?</code> | <code>Fahrenheit temp 98 on hot summer day what is this reading on the kelvin scale?</code> | |
|
|
| <code>What is the word for you in Japanese?</code> | <code>What word in japanese i loveyou?</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### wikipedia |
|
|
|
|
|
* Dataset: wikipedia |
|
|
* Size: 6,198,049 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 4 tokens</li><li>mean: 8.86 tokens</li><li>max: 28 tokens</li></ul> | <ul><li>min: 23 tokens</li><li>mean: 38.92 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Pristimantis lichenoides</code> | <code>Pristimantis lichenoides (rana camuflada in Spanish) is a species of frogs in the family Craugastoridae. It is endemic to Colombia and is only known from the vicinity of its type locality near Samaná in the Caldas Department, on the eastern slope of the Cordillera Central (Colombian Andes). The specific name lichenoides refers to its lichen-like dorsal coloration as well as its habit of being plastered to rock surfaces, resembling lichens growing on rocks.<br><br>Description<br>Adult males measure and adult females in snout–vent length. The head is as wide as the body and wider than it is long. The snout is rounded in dorsal view but subtruncate in lateral view. The tympanum is small but visible, with its upper edge hidden by the thick supratympanic fold. The fingers have lateral keels and round terminal discs. The lateral keels of the toes coalesce as basal webbing; the toe discs are slightly smaller than those on the fingers. Dorsal skin bears granules. Dorsal coloration is dark green to pa...</code> | |
|
|
| <code>Askim station</code> | <code>Askim Station () is located at Askim, Norway on the Eastern Østfold Line. The railway station is served by the Oslo Commuter Rail line L22 from Oslo Central Station. The station was opened with the eastern line of Østfold Line in 1882.<br><br>Railway stations in Askim<br>Railway stations on the Østfold Line<br>Railway stations opened in 1882<br>1882 establishments in Norway</code> | |
|
|
| <code>Mildred Alango</code> | <code>Mildred Akinyi "Milka" Alango (born 10 March 1989 in Mombasa) is a Kenyan taekwondo practitioner. Alango qualified for the women's 49 kg class at the 2008 Summer Olympics in Beijing, after winning the championship title from the African Qualification Tournament in Tripoli, Libya. She lost the preliminary match to China's Wu Jingyu, who was able to score seven points at the end of the game. Because her opponent advanced further into the final match, Alango took advantage of the repechage round by defeating Sweden's Hanna Zajc on the superiority rule, after the pair had tied 2–2. She progressed to the bronze medal match, but narrowly lost the medal to Venezuela's Dalia Contreras, with a sudden death score of 0–1.<br><br>References<br><br>External links<br><br>NBC 2008 Olympics profile<br><br>1989 births<br>Living people<br>Kenyan female taekwondo practitioners<br>Olympic taekwondo practitioners of Kenya<br>Taekwondo practitioners at the 2008 Summer Olympics<br>Sportspeople from Mombasa</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### gooaq |
|
|
|
|
|
* Dataset: gooaq |
|
|
* Size: 1,281,138 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 9 tokens</li><li>mean: 12.49 tokens</li><li>max: 22 tokens</li></ul> | <ul><li>min: 14 tokens</li><li>mean: 38.0 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:---------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>what is psma pet ct scan?</code> | <code>A PSMA study, also called a ProstaScint® scan, is an imaging test to locate and determine the extent of prostate cancer. ... The study involves a special molecule called a monoclonal antibody developed in a laboratory and designed to bind to the prostate-specific membrane antigen on cancer cells.</code> | |
|
|
| <code>how many calories do you burn walking up mount snowdon?</code> | <code>You will burn through around 2,000 calories climbing Snowdon.</code> | |
|
|
| <code>ankara is the capital city of?</code> | <code>Ankara, formerly known as Angora, city, capital of Turkey, situated in the northwestern part of the country.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### codesearch |
|
|
|
|
|
* Dataset: codesearch |
|
|
* Size: 864,023 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 6 tokens</li><li>mean: 28.34 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 28 tokens</li><li>mean: 38.9 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Similar to {@link #getOrCreateLocalTransaction(Transaction, boolean)} but with a custom global transaction factory.</code> | <code>public LocalTransaction getOrCreateLocalTransaction(Transaction transaction, boolean implicitTransaction, Supplier<GlobalTransaction> gtxFactory) {<br> LocalTransaction current = localTransactions.get(transaction);<br> if (current == null) {<br> if (!running) {<br> // Assume that we wouldn't get this far if the cache was already stopped<br> throw log.cacheIsStopping(cacheName);<br> }<br> GlobalTransaction tx = gtxFactory.get();<br> current = txFactory.newLocalTransaction(transaction, tx, implicitTransaction, currentTopologyId);<br> if (trace) log.tracef("Created a new local transaction: %s", current);<br> localTransactions.put(transaction, current);<br> globalToLocalTransactions.put(current.getGlobalTransaction(), current);<br> if (notifier.hasListener(TransactionRegistered.class)) {<br> // TODO: this should be allowed to be async at some point<br> CompletionStages.join(notifier.notifyTransactionRegistered(tx, ...</code> | |
|
|
| <code>// formatArgs converts the given args to pretty-printed, colorized strings.</code> | <code>func formatArgs(args ...interface{}) []string {<br> formatted := make([]string, 0, len(args))<br> for _, a := range args {<br> s := colorize(pretty.Sprint(a), cyan)<br> formatted = append(formatted, s)<br> }<br> return formatted<br>}</code> | |
|
|
| <code>log request in history<br>@access private<br>@param $message string<br>@return void<br>@since 3.0<br>@package Gcs\Framework\Core\Engine</code> | <code>private function _setHistory($message) {<br> $this->addError('URL : http://' . $this->request->env('HTTP_HOST') . $this->request->env('REQUEST_URI') . ' (' . $this->response->status() . ') / SRC "' . $this->request->src . '" / CONTROLLER "' . $this->request->controller . '" / ACTION "' . $this->request->action . '" / CACHE "' . $this->request->cache . '" / ORIGIN : ' . $this->request->env('HTTP_REFERER') . ' / IP : ' . $this->request->env('REMOTE_ADDR') . ' / ' . $message, 0, 0, 0, LOG_HISTORY);<br> }</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### yahoo_title_answer |
|
|
|
|
|
* Dataset: yahoo_title_answer |
|
|
* Size: 276,726 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 18.44 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 35.58 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Who to contact in the philippines to install Supersports SA cable Channel?</code> | <code>go to this web site www.dishtv.sa.com\n or ask ur cable operator\n\nor contact this number 0091234537835\n\n\n\ni hope this helps</code> | |
|
|
| <code>What does "you're preaching to the choir" mean?</code> | <code>"preaching to the choir" means trying to make a point to someone who already agrees with your position. The analogy meaning that those in the choir are already familiar with the preaching... it's the others that likely need it.</code> | |
|
|
| <code>Does anyone know a good site where i can find a detailed but simply explained explanation on why henry VIII?</code> | <code>Henry VIII and the break with Rome\nClick on the * words in the site to show:\n\nPower - "Henry had hoped to resolve the issue of who was to succeed him"\n\nMoney - "As well as his desire for the divorce, there was a strong financial incentive for Henry to deny the authority of the Pope"\n\nFaith - "Although Henry's reformation broke with the papacy, his own religious beliefs were orthodox"\n\nLove - "Henry was in love with Anne Boleyn"</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### agnews |
|
|
|
|
|
* Dataset: agnews |
|
|
* Size: 420,288 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 8 tokens</li><li>mean: 14.72 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 12 tokens</li><li>mean: 35.44 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:----------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Italy coming out of Washington's shadow</code> | <code>Long considered something of a junior partner among Europe's elite nations, Italy is carving out a hefty role in world affairs. Rome is contributing the largest contingent to the U.N. peacekeeping force in Lebanon, has claimed a role in negotiations with Iran and is rallying European governments around the idea that Italy can form a counterweight to American might.</code> | |
|
|
| <code>Iran, Europe Fail to Agree on Uranium Enrichment, IRNA Reports</code> | <code>Iran and Europe failed to reach an accord on Tehran's uranium enrichment program, the state-owned Iranian news agency said, increasing the chances the US may call for United Nations sanctions against the Islamic nation.</code> | |
|
|
| <code>Omicidio Desirée, la Cassazione "La pena per Erra va inasprita"</code> | <code>La sentenza farà da apripista per la futura giurisprudenzaCon il nuovo processo a Milano, l'imputato rischia l'ergastolo Omicidio Desirée, la Cassazione "La pena per Erra va inasprita" Il nuovo processo si celebrerà all'Assise d'appello di Milano</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### amazonqa |
|
|
|
|
|
* Dataset: amazonqa |
|
|
* Size: 226,137 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 23.35 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 18 tokens</li><li>mean: 35.18 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Wondering how people get the wrinkles out from the packaging? Iron or wash and hang damp,maybe?</code> | <code>I sprayed with water (misted it) then ironed it. Most wrinkles came out and what did not, eventually came out from the steam of the shower. Good- luck</code> | |
|
|
| <code>Why is it that most of the Janome users previously owned Singer or Kenmore? Anything has to be better than either of those--so what's the real benefit of a Janome HD3000 vs a Pfaff?</code> | <code>I can't tell you anything about Pfaff because I have never owned a Pfaff. When I bought the Janome HD3000 I was looking for a heavy duty sewing machine that would sew through layered heavy fabrics, such as denim, etc. I had a Singer at the time, and had always owned Singers, and had noticed that with each new Singer I bought, the quality was less than the previous Singer. I don't know what happened to Singer, but in my opinion they have put out a less and less quality product over the past 10 to 15 years. The question I asked in my search engine was something like "what is a good heavy duty sewing machine". That led me to a demonstration video where I watched someone using the Janome to sew through the depth of fabric layers that I needed. And when I bought the machine, it worked just like in the video. It sails through layers of fabric that used to invariably tangle up and stop the Singer.</code> | |
|
|
| <code>I would like to use this for storing thread. I need the drawers to be a least 4" high . Also, do the tops come off the container.</code> | <code>The drawers are only 2 inches high. They slide out . Each has a picture of a big Lego head on top the top does snap off.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### yahoo_qa |
|
|
|
|
|
* Dataset: yahoo_qa |
|
|
* Size: 143,477 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 8 tokens</li><li>mean: 34.24 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 9 tokens</li><li>mean: 37.25 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>I have to meet up with someone whose last name is Kasprazck tomorrow and I dont wanna offend her by sayin her name wrong. Can u please write out how its pronounced if you know or how you think? Thanks xoxoxoxooxo</code> | <code>People with surname like that are usually aware that people may not know how to pronounce it properly. It will not be a big issue (and I am sure it won't offend her at all) if you were to ask how to pronounce it. Just make sure you listen carefully THEN repeat it so you will likely remember it.</code> | |
|
|
| <code>All I want to know why is this allowed when there is so much of a danger to children that are online. I am in charge of her as of 3/20/2006 and she is nolonger with her Mother who got her started with this problum and I was under the imprestion that this account was canceled out but I went to use my computer and I found out that she had been online without my knowledge of it til today I can only give you the email address I dont have her password for the my space I do have the home address that may have been givin and the phone number and her true date of birth. My sister is the one who told the lie.</code> | <code>Your question is essentially "All I want to know why is this allowed...?"\n\nThe answer is, nobody allowed it but you. You made a computer accessible to someone who you do not wish it to be used by.\n\nIf you meant to ask "Why are minors allowed to set up email accounts?" then the answer is, "Because there is no way to ensure that the person on the client's end isn't minor."\n\nIf you want to have her MySpace account removed, there are protocols you can follow on the MySpace FAQ (frequently asked questions). However, it will be pointless to go through the trouble if she has access to the internet; your home, school, friends, the mall, Kinko's, etc.</code> | |
|
|
| <code>I am not asking you alter anything you already have in place,\nbut why not combine Biology, and Chemistry into Biochemistry, yes, that is what I am searching for.</code> | <code>Biology has traditionally consisted of botany, zoology and microbiology. Chemistry has consisted mainly of physical, organic, analytical and biochemistry. Until relatively recently those divisions have held up reasonably well. Now there is a whole new world of chemical and physical biology and biochemistry and biophysics. To throw all of biology and chemistry into biochemistry would be a misnomer for many of the parts of both. To add biochemistry or chemical biology into one pot might be a good idea to catch the questions in the area betwen</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### yahoo_title_question |
|
|
|
|
|
* Dataset: yahoo_title_question |
|
|
* Size: 213,320 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 17.99 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 33.44 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:----------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>1:03 s for 100 meter freestyle race 10 year old female category. does this time rank high in the USA Swimming?</code> | <code>My daugter just went 1:03 in 100meter fresstyle how does this compare to the best 10 year olds in the world or USA?</code> | |
|
|
| <code>Why doesnt people believe that a mental illness or condition is a real medical probllem?</code> | <code>It seems that unless people can see a "broken arm", a "bleeding wound", a "cancer diagnosis", "asthma" , "arthiritis" (and many more lables out there) a mental condition is less inmportant as the above. There are so many people that do not understand that it is real...it is a struggle everyday to just get to the end of the day. You are ridiculed for you behavior as irresponsible or inconsiderate. You get the picture. IT IS AS REAL AS CANCER OR AIDS OR ANY OTHER UNCUREABLE ILLNESS!!</code> | |
|
|
| <code>Why do you think people attand college or university?</code> | <code>people attand college or university for many different reasons,e.x.new experiences, career preparation, increased knowledge...</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### ccnews |
|
|
|
|
|
* Dataset: ccnews |
|
|
* Size: 353,670 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 17.8 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 22 tokens</li><li>mean: 38.95 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>California Senate Approves Raising Age to Buy Long Guns</code> | <code>SACRAMENTO (AP) — California would raise the age for buying rifles and shotguns from 18 to 21 and bar people from buying more than one long gun each month under a bill advancing in the Legislature.<br>It’s been a frequently debated topic nationwide after a Florida high school shooting that killed 17 people.<br>The Senate on Tuesday approved the measure by Democratic Sen. Anthony Portantino of La Canada Flintridge, sending it to the Assembly on a 23-10 vote.<br>It extends age and purchase limits that currently apply only to handguns.<br>Republican Sen. Jim Nielsen of Gerber says California should instead target criminal gangs and those with mental disabilities whom he said will obtain the guns no matter the legal limits.<br>Walmart and Dick’s Sporting Goods previously announced age limits on gun sales.</code> | |
|
|
| <code>Mississippi officer fired after video of suspect being hit</code> | <code>JACKSON, Miss. (AP) – A Mississippi police officer has been fired after cellphone video showed him hitting a handcuffed suspect.<br>A Jackson Police Department news release says officer Justin Roberts was fired Monday by Chief Lee Vance.<br>The release says the suspect was hit Saturday; Vance started an internal affairs investigation after the video surfaced Sunday.<br>The identity of the handcuffed man was not released.<br>It was not immediately clear whether Roberts can appeal his firing.<br>The Associated Press tried to leave a message for Roberts at the Jackson Police Department, but department spokesman Commander Tyree Jones says he does not have a way to reach the fired officer.<br>Jones says both Roberts and the handcuffed suspect are African-American.<br>Share this: Facebook<br>LinkedIn<br>Twitter<br>Google<br>Like this: Like Loading...</code> | |
|
|
| <code>Sir Cameron Mackintosh Discusses Newest Incarnation of MISS SAIGON</code> | <code>Sir Cameron Mackintosh, the man responsible for a nearly unrivaled number of influential theatrical productions, has mounted a new incarnation of Miss Saigon at the Birmingham Hippodrome. He recently spoke with Express and Star about the upcoming production.<br>"This version is by far the best we have ever done," he says. "The world has sadly got worse, not better and we are indeed in gritty times and I think that is what has made the show feel even more contemporary than when it first came out nearly 30 years ago."<br>The show features a new collection of designers and takes a grittier approach to the already hard-hitting content. Mackintosh says that even at the beginning the subject matter posed a monumental challenge in terms of transfer to the stage. When first speaking with Claude-Michel Schönberg and Alain Boublil he says, " the phrase I used then was 'doing this musical is like dancing on a razor blade'; you have to be utterly truthful and it has to deliver the power that only musica...</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### npr |
|
|
|
|
|
* Dataset: npr |
|
|
* Size: 365,075 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 15.98 tokens</li><li>max: 30 tokens</li></ul> | <ul><li>min: 16 tokens</li><li>mean: 38.45 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:---------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Chicago Sells City Relics in Online Auction</code> | <code>Pieces of Chicago's history and cultural experiences go up for bidding in a two-week auction beginning Thursday. The sale is an attempt to raise money for city arts and cultural programs, while also raising its profile. The "Great Chicago Fire Sale" is the first charitable eBay auction to be held by a municipality, and is being run by Chicago's Department of Cultural Affairs. Offerings include a dinner party prepared by Oprah Winfrey's chef, a chance to dye the river green on St. Patrick's Day, a cow statue from the city's 1999 Cows on Parade display and an authentic Playboy Bunny costume from the 1960s. NPR's David Schaper reports.</code> | |
|
|
| <code>Hear Code Orange's Darkly Catchy 'Bleeding In The Blur'</code> | <code>Code Orange could never be accused of going soft. Show up to any of the Pittsburgh band's shows and behold the cyclonic mosaic of moshing bodies moved by its nightmarishly chaotic hardcore. But there's always been an experimental underpinning to Code Orange that toys with noise and melody (and some '90s grunge). Forever, the band's upcoming third album, is among its most bruising works, with surprises throughout. But none are quite like this. "Bleeding In The Blur" certainly sets itself up to swarm, but the squealing feedback and Jami Morgan's thunderous drums quickly turn the reins over to guitarist Reba Meyers. No pinch harmonics, no slamming breakdowns, (mostly) no throaty screams — this is a darkly catchy pop song that sounds as if it's been carved from obsidian. "Bleeding In The Blur" has the gloomy heft of Thrice and the unconventional hooks of Jawbox, with Meyers' dominating vocals out front. If you've ever wanted a heavier song by Adventures (the emo band featuring three-quarte...</code> | |
|
|
| <code>Ireland Is The Focus Of Investor Anxieties</code> | <code>Over the past two weeks, investors have dumped Irish government bonds over concerns about the country's economy and its banks. Irish officials have been reluctant to accept a bailout. But over the weekend, they held talks about the debt crisis with other members of the European Union.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### eli5 |
|
|
|
|
|
* Dataset: eli5 |
|
|
* Size: 106,781 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 6 tokens</li><li>mean: 21.72 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 14 tokens</li><li>mean: 38.36 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:---------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>How far did Genghis Khan influence spread? and did it help america?</code> | <code>It would be rather difficult for Ghenghis Khan to influence America very much, given that the United States didn't exist until about 500 years after his death. It would be over 200 years before Columbus made his first voyage in search of Asia. At the time the only contact between America and the rest of the world would have been the Norse expedition to Vinland, and that didn't exactly end well. It's possible someone who's more knowledgeable about the subject could point to some cultural shifts that would affect America but with half a millennium of separation, Ghenghis Khan's influence on the USA would be pretty minimal.</code> | |
|
|
| <code>How did Stephen Hawking talked even though he can't move a muscle? How did the computer knew what he wanted to say?</code> | <code>He used very subtle muscle movements to control the computer. The computer would go over a list of letters/words and Hawking would move his muscle whenever he wanted to choose the current letter or word. Towards the end of his life it would take him up to a minute per word. Any interview you see of him is either heavily edited to remove these long pauses, or his entire talk was pre-recorded (that's how he gave lectures).</code> | |
|
|
| <code>How would William the Conqueror's name have been said/written in Old Norman?</code> | <code>William the Conqueror by David Bates p. 33 (ISBN 978-0752429601) and Hanks and Hodges, Oxford Dictionary of First Names, Oxford University Press, 2nd edition ( ISBN 978-0-19-861060-1), p.276 list it as Williame (french spelling Guillaume), all the other sources I found were too unreliable.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### cnn |
|
|
|
|
|
* Dataset: cnn |
|
|
* Size: 293,521 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 15 tokens</li><li>mean: 38.58 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 39 tokens</li><li>mean: 39.0 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Chan, is famous in the United States for such action movies as 'Rush Hour'<br>and 'Rumble in the Bronx'<br>He lashed out at the United States and blamed the country for the financial crisis that is sweeping the globe .</code> | <code>He may enjoy a Hollywood payday now and then, but that doesn't stop Jackie Chan from criticizing America. The martial arts star called the U..S the 'most corrupt' country in the world during a recent interview on a Hong Kong television show. 'If you talk about corruption, the entire world, the United States has no corruption?' Chan asked the host. Scroll down for video . Controversial: Chan, who's made millions in American films, called the country the most corrupt nation on the planet . Chan then referred to America as 'the most corrupt in the world.' 'Where does this Great . Breakdown (financial crisis) come from? It started exactly from the . world, the United States,' Chan told the interviewer. 'When I was interviewed in the U.S., people . asked me, I said the same thing. 'I said now that China has become . strong, everyone is making an issue of China,' continued the Rush Hour star. 'If our own countrymen . don't support our country, who will support our country? We know our . coun...</code> | |
|
|
| <code>A bus was carrying members of King family after 'Dream' speech ceremony .<br>The bus and a car collided near Washington's Tidal Basin just off the National Mall .<br>Reality star Omarosa Manigault said she was on the bus: 'We were very afraid'<br>Mall Police say a person in the car taken to hospital; no report yet on bus passengers .</code> | <code>Washington (CNN) -- Family members of the Rev. Martin Luther King Jr. were involved in a bus accident Wednesday after the high-profile ceremony marking the 50th anniversary of King's "I Have a Dream" speech, police said. The bus and a car collided near Washington's Tidal Basin just off the National Mall where the ceremony was held, according to Park Police, who have jurisdiction over the Mall. They said a person in the car was injured and taken to a hospital but did not provide information on injuries to bus passengers. Several members of the King family were aboard the bus and had laid a wreath at the memorial to the civil rights leader, according to Omarosa Manigault, a reality television star who was aboard the bus. "We were very afraid," she told CNN. "There were children on the bus, seniors and everything. Everybody was thrown out of their seats." She said she hit her head in the accident. Obama: Because they marched, America changed . 9 things about MLK's speech and the March on ...</code> | |
|
|
| <code>Ofcom's chief executive said there had been a big change in tolerance levels .<br>35% of viewers think there is too much violence, down from 55% in 2008 .<br>But there is less tolerance of language deemed as 'discriminatory' or unjust .<br>Critics say British public has become 'desensitised' due to lax Ofcom laws .</code> | <code>Television viewers have become more tolerant of violence and swearing, the head of Ofcom has claimed. But the sexist or racist language of the 1970s is far less acceptable than it once was, research by the broadcasting regulator reveals. Ofcom’s chief executive Ed Richards, who is about to stand down after 11 years in the job, told MPs there has been a big change in tolerance levels in the past few decades. Ofcom chief says the British public has grown more tolerant - but still does not like discriminatory language on TV shows. Till Death Us Do Part, which frequently had lead character Alf Garnett making racist remarks . But critics argued the British public has simply become ‘desensitised’ to swearing after years of lax regulation by Ofcom. According to the regulator’s latest research, published in July, only 35 per cent of viewers think there is too much violence on TV, down from 55 per cent in 2008. Just 35 per cent think there is too much swearing, down from 53 per cent six years a...</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### stackexchange_duplicate_questions |
|
|
|
|
|
* Dataset: stackexchange_duplicate_questions |
|
|
* Size: 73,210 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 6 tokens</li><li>mean: 15.86 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 15.56 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------| |
|
|
| <code>Clone() vs Copy constructor- which is recommended in java</code> | <code>clone() vs copy constructor vs factory method?</code> | |
|
|
| <code>AES-128/192 safer than AES-256 in practice?</code> | <code>Is AES-256 weaker than 192 and 128 bit versions?</code> | |
|
|
| <code>How does this Java code which determines whether a String contains all unique characters work?</code> | <code>Explain the use of a bit vector for determining if all characters are unique</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### stackexchange_title_body |
|
|
|
|
|
* Dataset: stackexchange_title_body |
|
|
* Size: 80,695 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 15 tokens</li><li>mean: 38.83 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 26 tokens</li><li>mean: 38.78 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Allow linking to named anchors This is similar to , but I don't think it's a dupe. Markdown should support links that are just named anchors, like [foo](#12345). I occasionally reference existing answers in comments or my own answers if I'm expanding on them, and currently I need to include the full URL to get Markdown to link it, which seems unnecessary. Just copying the answer's link is annoying because it's a different URL, so when users click it it loads a new page, even though it's actually the exact same page. To get around it I tend to take the current URL and splice in the #id of the answer I'm linking to, but Markdown should assume that if I just include the #id part</code> | <code>Support anchor names in posts I admit this feature request is probably somewhat limited in useful scope, but I'm throwing it out there anyway. Inspired by , and because I want to use it on , I'm requesting that name be supported on a tags in posts. On very long answers, such as the closing/migration guidance answer, this would allow direct linking to the specific closure reason. This would then allow us, when someone , to link directly to the appropriate reason and description thereof. I recognize the limited scope of this, however, I have seen other long answers that could stand to have that kind of "deep" linking ability as well. (The original incarnation of this post had either name or id, but preferenced name. Per Koper's answer, which I agree with, I took out the idea of supporting id, because Koper's right -- too dangerous.)</code> | |
|
|
| <code>Unable to reload same gif image, if used twice in a page I am using same gif image twice in a page. Both the images will be hidden initially. Based on certain criteria I am showing those gif images (when clicked on particular target one gif image will be shown at a time). I am unable to reload the gif image. See the attached plunker 1) <script> var img1 = document.getElementById("img1"); var img2 = document.getElementById("img2"); function toggle1() { if (document.getElementById('gif-1').style.display == "none") { document.getElementById('gif-1').src = ''; document.getElementById('gif-1').src = 'http://insightgraphicdesign.net/wp-content/uploads/2014/07/coke-responsive-logo.gif'; document.getElementById('gif-1').style.display = "block"; } else document.getElementById('gif-1').style.display = "none"; } function toggle2() { if (document.getElementById('gif-2').style.display == "...</code> | <code>how to clear or replace a cached image I know there are many ways to prevent image caching (such as via META tags), as well as a few nice tricks to ensure that the current version of an image is shown with every page load (such as image.jpg?x=timestamp), but is there any way to actually clear or replace an image in the browsers cache so that neither of the methods above are necessary? As an example, lets say there are 100 images on a page and that these images are named "01.jpg", "02.jpg", "03.jpg", etc. If image "42.jpg" is replaced, is there any way to replace it in the cache so that "42.jpg" will automatically display the new image on successive page loads? I can't use the META tag method, because I need everuthing that ISN"T replaced to remain cached, and I can't use the timestamp method, because I don't want ALL of the images to be reloaded every time the page loads. I've racked my brain and scoured the Internet for a way to do this (preferrably via javascript), but no luck. Any...</code> | |
|
|
| <code>Is it possible that there are more than 6 quark flavors/more than 3 generations? I thought that things like the top quark don't exist in nature because they're super unstable and we can only observe them after high-energy collisions (e.g. LHC) Is it possible to make even more massive quarks? Or is there a reason the limit is six?</code> | <code>Why do we think there are only three generations of fundamental particles? In the of particle physics, there are three generations of quarks (up/down, strange/charm, and top/bottom), along with three generations of leptons (electron, muon, and tau). All of these particles have been observed experimentally, and we don't seem to have seen anything new along these lines. A priori, this doesn't eliminate the possibility of a fourth generation, but the physicists I've spoken to do not think additional generations are likely. Question: What sort of theoretical or experimental reasons do we have for this limitation? One reason I heard from my officemate is that we haven't seen new neutrinos. Neutrinos seem to be light enough that if another generation's neutrino is too heavy to be detected, then the corresponding quarks would be massive enough that new physics might interfere with their existence. This suggests the question: is there a general rule relating neutrino masses to quark...</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### stackexchange_body_body |
|
|
|
|
|
* Dataset: stackexchange_body_body |
|
|
* Size: 65,689 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 16 tokens</li><li>mean: 38.22 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 9 tokens</li><li>mean: 38.09 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>When I type "sudo apt-get update" I see HTTP protocol is used to fetch the updates . Why not HTTPS is used for more secure communication ?</code> | <code>Does apt-get use https or any kind of encryption? Is there a way to configure it to use it?</code> | |
|
|
| <code>If I have 4 identical* LEDs wired in parallel to a single resistor so that the overall current available is 30 mA, do I still run the risk of premature burnout? The LEDs peak forward current is 30 mA. *I know that LEDs from the same package may still have slight differences I thoroughly read through the answers here - - but it seems like the assumption would be that one would arrange a circuit so that the available current equals the total draw of the 4 LEDs, in my case 80 mA. The problem then would be that some would draw more than the peak. But, if I'm limiting the avaialble current to 30mA, is there still an issue? That would mean that ideally 7.5mA would be supplied to each LED. Obviously, based on the answer in the aforementioned link, it would not likely be even, but it shouldn't get to "dangerous levels". Follow-up: Based on the volt/amperage curve, it looks like I'd be seeing a ~0.1V drop. Will this significantly affect the brightness? Still pretty new to all this so m...</code> | <code>I'm trying to wire up 6 RGB LEDs in parallel, all controlled from a single source (well, three sources, one for each colour). The LEDs came supplied with resistors to limit the current of 270 Ohm for a 5v supply. The problem is, 6 LEDs x 3 colours = 18 resistors, which is a lot, and means I need a much bigger board and a lot more soldering. So, can I instead wire the LEDs in parallel with each other, with a single resistor protecting all six? (3 resistors in total, one for each colour). How do I calculate the value of that resistor? More details: The LEDs are being driven from a to supply a bit of current, which is in turn controlled by a Netduino providing a PWM signal on the three channels. . If I've correctly understood the data sheet they want 20mA of current, and forward voltages of 2, 3, 3 volts (for R,G and B respectively?). The supplied resistors were all 270 Ohm, so the channels may not be balanced quite right. For extra credit: I'm only using 3 of the transistors in my...</code> | |
|
|
| <code>I want to make a figure in Mathematica, export it as a PDF, edit/label it in Photoshop, and then add it as a figure in a TeX document. I would really like to have the font in the figure closely match the math mode stuff in the document. In the past I've made a PDF with TeX with just the labels I want and then pasted them all into the figures, but this is incredibly tedious. I found this post - - which says that the font in math mode is "Latin Modern Symbol" but this is not an option in Photoshop. Is there another font that looks close enough to math mode which is in photoshop? Thanks for any help!</code> | <code>I draw figures in Inkscape. When I label elements within the figures with variable names that I have used in the underlying TeX document, I would like them to look exactly the same as in the document. (e.g. l does not look the same as $l$) What is the name of the math mode font so I can select it correctly in the Inkscape font list? If the exact font should not be available, what is a similar looking font that is present on most systems?</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### sentence_compression |
|
|
|
|
|
* Dataset: sentence_compression |
|
|
* Size: 173,604 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 11 tokens</li><li>mean: 32.05 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 12.69 tokens</li><li>max: 31 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| <code>Sedgebrook, a continuing care retirement community located in Lincolnshire, will host a free support group for caregivers who support aging loved ones.</code> | <code>Sedgebrook retirement community to host support group for caregivers</code> | |
|
|
| <code>Junction City Police said in a news release Saturday that several shots were fired at the narcotics detective around midnight as he conducted surveillance in an unmarked vehicle.</code> | <code>Shots fired at narcotics detective</code> | |
|
|
| <code>A SWAT team surrounded a home on Miller Avenue in South San Francisco Monday afternoon, according to authorities and neighbors.</code> | <code>SWAT team surrounds home in South San Francisco</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### wikihow |
|
|
|
|
|
* Dataset: wikihow |
|
|
* Size: 96,029 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:---------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 5 tokens</li><li>mean: 10.12 tokens</li><li>max: 24 tokens</li></ul> | <ul><li>min: 9 tokens</li><li>mean: 37.1 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Dry and Propagate Comfrey</code> | <code> This article will tell you how to dry and propagate comfrey. </code> | |
|
|
| <code>Add a Playlist Shortcut on Android</code> | <code>Adding a playlist shortcut to your home screen is a surefire way to add convenience in using your Android device. For daily commutes or morning jogs, this is a useful feature to be able to start playing your music in the quickest way possible.</code> | |
|
|
| <code>Add an Android App to Google Drive</code> | <code> Google drive is a social service that can be used to share with friends. You can use Google Drive on your Android to share Android apps. </code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### altlex |
|
|
|
|
|
* Dataset: altlex |
|
|
* Size: 110,708 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 5 tokens</li><li>mean: 29.71 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 27.34 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Avery County is a county located in the U.S. state of North Carolina .</code> | <code>Avery County is a county in the U.S. state of North Carolina .</code> | |
|
|
| <code>There he studied piano with Mieczyslaw Horszowski and composition with Constant Vauclain , and switched majors from piano to composition .</code> | <code>He studied piano at the Curtis Institute of Music , with Mieczyslaw Horszowski and composition with Constant Vauclain .</code> | |
|
|
| <code>The ReachOut website includes testimonials from a school nurse in Tucson , Arizona and an elementary school principal of the Deer Valley Unified School District in Greater Phoenix .</code> | <code>The ReachOut website has leters from a school nurse in Tucson , Arizona and an elementary school principal of the Deer Valley Unified School District in Greater Phoenix which say good things about ReachOut .</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### quora |
|
|
|
|
|
* Dataset: quora |
|
|
* Size: 44,885 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 14.76 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 7 tokens</li><li>mean: 14.64 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Website traffic analytics will show statistics for "direct navigation" which includes both typed in URL's (domain +.com) in the URL bar, as well as those using bookmarks to get to a site. What is an estimate for the breakdown of each?</code> | <code>Website traffic analytics will show statistics for "direct navigation" which includes both typed in URL's (domain +.com) in the URL bar, as well as those using bookmarks to get to a site. Are there any statistics that show an estimated percentage of each rather than lumping them together?</code> | |
|
|
| <code>What are the most recognized flags in the world?</code> | <code>Which 10 flags are the most recognisable in the world?</code> | |
|
|
| <code>Can I deposit 500 & 1000 INR notes in my savings account multiple times on each banking day till 30/12/2016?</code> | <code>Can I deposit 500 & 1000 INR notes in my current account multiple times on each banking day till 30/12/2016?</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### simplewiki |
|
|
|
|
|
* Dataset: simplewiki |
|
|
* Size: 97,717 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 9 tokens</li><li>mean: 28.33 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 9 tokens</li><li>mean: 30.86 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>Some of those rescued by the Nordnorge were taken to the Chilean Eduardo Frei Montalva Station on King George Island . Later they were flown by C-130 Hercules transport aircraft of the Chilean Air Force to Punta Arenas , Chile , in two separate flights on Saturday , November 24th , and Sunday , November 25th .</code> | <code>All of those rescued by Nordnorge were taken to the Chilean Frei Montalva Station on King George Island where they were subsequently airlifted by C-130 Hercules transport aircraft of the Chilean Air Force to Punta Arenas , Chile in two separate flights , one on Saturday , November 24 , and the other on Sunday , November 25 .</code> | |
|
|
| <code>The name of that province is Friesland . Leeuwarden is called Ljouwert in Frisian .</code> | <code>Leeuwarden ( , Stadsfries : Liwwadden , Frisian : Ljouwert , ) is the capital city of the Dutch province of Friesland .</code> | |
|
|
| <code>France has invested a lot in nuclear power . This made France the smallest producer of carbon dioxide among the seven most industrialised countries in the world .</code> | <code>France is the smallest emitter of carbon dioxide among the seven most industrialized countries in the world , due to its heavy investment in nuclear power .</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
#### squad |
|
|
|
|
|
* Dataset: squad |
|
|
* Size: 25,117 training samples |
|
|
* Columns: <code>query</code> and <code>document</code> |
|
|
* Approximate statistics based on the first 1000 samples: |
|
|
| | query | document | |
|
|
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 7 tokens</li><li>mean: 15.82 tokens</li><li>max: 39 tokens</li></ul> | <ul><li>min: 32 tokens</li><li>mean: 38.97 tokens</li><li>max: 39 tokens</li></ul> | |
|
|
* Samples: |
|
|
| query | document | |
|
|
|:----------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>What percentage of Italians spoke standard Italian when Italy was first unified?</code> | <code>During the Risorgimento, proponents of Italian republicanism and Italian nationalism, such as Alessandro Manzoni, stressed the importance of establishing a uniform national language in order to better create an Italian national identity. With the unification of Italy in the 1860s, standard Italian became the official national language of the new Italian state, while the various unofficial regional languages of Italy gradually became regarded as subordinate "dialects" to Italian, increasingly associated negatively with lack of education or provincialism. However, at the time of the Italian Unification, standard Italian still existed mainly as a literary language, and only 2.5% of Italy's population could speak standard Italian.</code> | |
|
|
| <code>What type of process is used to produce most paper used in paperback books?</code> | <code>Mechanical pulping yields almost a tonne of pulp per tonne of dry wood used, which is why mechanical pulps are sometimes referred to as "high yield" pulps. With almost twice the yield as chemical pulping, mechanical pulps is often cheaper. Mass-market paperback books and newspapers tend to use mechanical papers. Book publishers tend to use acid-free paper, made from fully bleached chemical pulps for hardback and trade paperback books.</code> | |
|
|
| <code>What do orthodox Jews express ambivalence towards?</code> | <code>Politically, Orthodox Jews, given their variety of movements and affiliations, tend not to conform easily to the standard left-right political spectrum, with one of the key differences between the movements stemming from the groups' attitudes to Zionism. Generally speaking, of the three key strands of Orthodox Judaism, Haredi Orthodox and Hasidic Orthodox Jews are at best ambivalent towards the ideology of Zionism and the creation of the State of Israel, and there are many groups and organisations who are outspokenly anti-Zionistic, seeing the ideology of Zionism as diametrically opposed to the teaching of the Torah, and the Zionist administration of the State of Israel, with its emphasis on militarism and nationalism, as destructive of the Judaic way of life.</code> | |
|
|
* Loss: <code>pylate.losses.cached_contrastive.CachedContrastive</code> |
|
|
|
|
|
### Training Hyperparameters |
|
|
#### Non-Default Hyperparameters |
|
|
|
|
|
- `eval_strategy`: steps |
|
|
- `per_device_train_batch_size`: 16384 |
|
|
- `per_device_eval_batch_size`: 16384 |
|
|
- `learning_rate`: 0.0003 |
|
|
- `num_train_epochs`: 1 |
|
|
- `seed`: 2203 |
|
|
- `bf16`: True |
|
|
- `dataloader_num_workers`: 4 |
|
|
- `accelerator_config`: {'split_batches': True, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None} |
|
|
- `ddp_find_unused_parameters`: False |
|
|
|
|
|
#### All Hyperparameters |
|
|
<details><summary>Click to expand</summary> |
|
|
|
|
|
- `overwrite_output_dir`: False |
|
|
- `do_predict`: False |
|
|
- `eval_strategy`: steps |
|
|
- `prediction_loss_only`: True |
|
|
- `per_device_train_batch_size`: 16384 |
|
|
- `per_device_eval_batch_size`: 16384 |
|
|
- `per_gpu_train_batch_size`: None |
|
|
- `per_gpu_eval_batch_size`: None |
|
|
- `gradient_accumulation_steps`: 1 |
|
|
- `eval_accumulation_steps`: None |
|
|
- `torch_empty_cache_steps`: None |
|
|
- `learning_rate`: 0.0003 |
|
|
- `weight_decay`: 0.0 |
|
|
- `adam_beta1`: 0.9 |
|
|
- `adam_beta2`: 0.999 |
|
|
- `adam_epsilon`: 1e-08 |
|
|
- `max_grad_norm`: 1.0 |
|
|
- `num_train_epochs`: 1 |
|
|
- `max_steps`: -1 |
|
|
- `lr_scheduler_type`: linear |
|
|
- `lr_scheduler_kwargs`: {} |
|
|
- `warmup_ratio`: 0.0 |
|
|
- `warmup_steps`: 0 |
|
|
- `log_level`: passive |
|
|
- `log_level_replica`: warning |
|
|
- `log_on_each_node`: True |
|
|
- `logging_nan_inf_filter`: True |
|
|
- `save_safetensors`: True |
|
|
- `save_on_each_node`: False |
|
|
- `save_only_model`: False |
|
|
- `restore_callback_states_from_checkpoint`: False |
|
|
- `no_cuda`: False |
|
|
- `use_cpu`: False |
|
|
- `use_mps_device`: False |
|
|
- `seed`: 2203 |
|
|
- `data_seed`: None |
|
|
- `jit_mode_eval`: False |
|
|
- `use_ipex`: False |
|
|
- `bf16`: True |
|
|
- `fp16`: False |
|
|
- `fp16_opt_level`: O1 |
|
|
- `half_precision_backend`: auto |
|
|
- `bf16_full_eval`: False |
|
|
- `fp16_full_eval`: False |
|
|
- `tf32`: None |
|
|
- `local_rank`: 3 |
|
|
- `ddp_backend`: None |
|
|
- `tpu_num_cores`: None |
|
|
- `tpu_metrics_debug`: False |
|
|
- `debug`: [] |
|
|
- `dataloader_drop_last`: True |
|
|
- `dataloader_num_workers`: 4 |
|
|
- `dataloader_prefetch_factor`: None |
|
|
- `past_index`: -1 |
|
|
- `disable_tqdm`: False |
|
|
- `remove_unused_columns`: True |
|
|
- `label_names`: None |
|
|
- `load_best_model_at_end`: False |
|
|
- `ignore_data_skip`: False |
|
|
- `fsdp`: [] |
|
|
- `fsdp_min_num_params`: 0 |
|
|
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} |
|
|
- `fsdp_transformer_layer_cls_to_wrap`: None |
|
|
- `accelerator_config`: {'split_batches': True, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None} |
|
|
- `deepspeed`: None |
|
|
- `label_smoothing_factor`: 0.0 |
|
|
- `optim`: adamw_torch |
|
|
- `optim_args`: None |
|
|
- `adafactor`: False |
|
|
- `group_by_length`: False |
|
|
- `length_column_name`: length |
|
|
- `ddp_find_unused_parameters`: False |
|
|
- `ddp_bucket_cap_mb`: None |
|
|
- `ddp_broadcast_buffers`: False |
|
|
- `dataloader_pin_memory`: True |
|
|
- `dataloader_persistent_workers`: False |
|
|
- `skip_memory_metrics`: True |
|
|
- `use_legacy_prediction_loop`: False |
|
|
- `push_to_hub`: False |
|
|
- `resume_from_checkpoint`: None |
|
|
- `hub_model_id`: None |
|
|
- `hub_strategy`: every_save |
|
|
- `hub_private_repo`: None |
|
|
- `hub_always_push`: False |
|
|
- `gradient_checkpointing`: False |
|
|
- `gradient_checkpointing_kwargs`: None |
|
|
- `include_inputs_for_metrics`: False |
|
|
- `include_for_metrics`: [] |
|
|
- `eval_do_concat_batches`: True |
|
|
- `fp16_backend`: auto |
|
|
- `push_to_hub_model_id`: None |
|
|
- `push_to_hub_organization`: None |
|
|
- `mp_parameters`: |
|
|
- `auto_find_batch_size`: False |
|
|
- `full_determinism`: False |
|
|
- `torchdynamo`: None |
|
|
- `ray_scope`: last |
|
|
- `ddp_timeout`: 1800 |
|
|
- `torch_compile`: False |
|
|
- `torch_compile_backend`: None |
|
|
- `torch_compile_mode`: None |
|
|
- `dispatch_batches`: None |
|
|
- `split_batches`: None |
|
|
- `include_tokens_per_second`: False |
|
|
- `include_num_input_tokens_seen`: False |
|
|
- `neftune_noise_alpha`: None |
|
|
- `optim_target_modules`: None |
|
|
- `batch_eval_metrics`: False |
|
|
- `eval_on_start`: False |
|
|
- `use_liger_kernel`: False |
|
|
- `eval_use_gather_object`: False |
|
|
- `average_tokens_across_devices`: False |
|
|
- `prompts`: None |
|
|
- `batch_sampler`: batch_sampler |
|
|
- `multi_dataset_batch_sampler`: proportional |
|
|
- `router_mapping`: {} |
|
|
- `learning_rate_mapping`: {} |
|
|
|
|
|
</details> |
|
|
|
|
|
### Training Logs |
|
|
<details><summary>Click to expand</summary> |
|
|
|
|
|
| Epoch | Step | Training Loss | NanoClimateFEVER_MaxSim_ndcg@10 | NanoDBPedia_MaxSim_ndcg@10 | NanoFEVER_MaxSim_ndcg@10 | NanoFiQA2018_MaxSim_ndcg@10 | NanoHotpotQA_MaxSim_ndcg@10 | NanoMSMARCO_MaxSim_ndcg@10 | NanoNFCorpus_MaxSim_ndcg@10 | NanoNQ_MaxSim_ndcg@10 | NanoQuoraRetrieval_MaxSim_ndcg@10 | NanoSCIDOCS_MaxSim_ndcg@10 | NanoArguAna_MaxSim_ndcg@10 | NanoSciFact_MaxSim_ndcg@10 | NanoTouche2020_MaxSim_ndcg@10 | NanoBEIR_mean_MaxSim_ndcg@10 | |
|
|
|:------:|:-----:|:-------------:|:-------------------------------:|:--------------------------:|:------------------------:|:---------------------------:|:---------------------------:|:--------------------------:|:---------------------------:|:---------------------:|:---------------------------------:|:--------------------------:|:--------------------------:|:--------------------------:|:-----------------------------:|:----------------------------:| |
|
|
| 0.0034 | 50 | 61.2999 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.0343 | 500 | 5.0461 | 0.3091 | 0.6323 | 0.8872 | 0.5003 | 0.8744 | 0.5921 | 0.3513 | 0.6589 | 0.9681 | 0.3880 | 0.5267 | 0.7637 | 0.5567 | 0.6161 | |
|
|
| 0.0377 | 550 | 5.5948 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.0686 | 1000 | 4.5264 | 0.2929 | 0.6032 | 0.8808 | 0.4734 | 0.8602 | 0.5893 | 0.3661 | 0.6827 | 0.9840 | 0.4127 | 0.5617 | 0.7662 | 0.5425 | 0.6166 | |
|
|
| 0.0721 | 1050 | 4.2289 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.1029 | 1500 | 3.3687 | 0.3020 | 0.6234 | 0.8844 | 0.4924 | 0.8616 | 0.6463 | 0.3382 | 0.6535 | 0.9737 | 0.4126 | 0.5931 | 0.7712 | 0.5332 | 0.6220 | |
|
|
| 0.1064 | 1550 | 3.5525 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.1373 | 2000 | 3.4038 | 0.3187 | 0.6411 | 0.9097 | 0.4932 | 0.8662 | 0.5951 | 0.3396 | 0.6608 | 0.9654 | 0.4146 | 0.5593 | 0.7731 | 0.5297 | 0.6205 | |
|
|
| 0.1407 | 2050 | 2.6228 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.1716 | 2500 | 3.3323 | 0.3495 | 0.6658 | 0.9354 | 0.5105 | 0.8702 | 0.5510 | 0.3455 | 0.6634 | 0.9708 | 0.4018 | 0.5564 | 0.7860 | 0.5386 | 0.6265 | |
|
|
| 0.1750 | 2550 | 4.0719 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.2059 | 3000 | 4.6922 | 0.3614 | 0.6465 | 0.8694 | 0.5131 | 0.8670 | 0.6584 | 0.3381 | 0.6651 | 0.9772 | 0.4106 | 0.5640 | 0.7646 | 0.5562 | 0.6301 | |
|
|
| 0.2093 | 3050 | 4.588 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.2402 | 3500 | 2.6753 | 0.3553 | 0.6433 | 0.9020 | 0.5309 | 0.8610 | 0.6263 | 0.3574 | 0.6896 | 0.9664 | 0.4094 | 0.5529 | 0.7897 | 0.5457 | 0.6331 | |
|
|
| 0.2436 | 3550 | 1.9266 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.2745 | 4000 | 3.7069 | 0.3320 | 0.6326 | 0.9042 | 0.5067 | 0.8537 | 0.6766 | 0.3617 | 0.6794 | 0.9734 | 0.4097 | 0.5667 | 0.7830 | 0.5461 | 0.6327 | |
|
|
| 0.2779 | 4050 | 2.0447 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.3088 | 4500 | 5.9963 | 0.3331 | 0.6102 | 0.8848 | 0.5332 | 0.8662 | 0.6105 | 0.3684 | 0.6865 | 0.9617 | 0.4200 | 0.5531 | 0.7741 | 0.5498 | 0.6270 | |
|
|
| 0.3123 | 4550 | 1.9683 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.3431 | 5000 | 3.4992 | 0.3363 | 0.6468 | 0.8804 | 0.5186 | 0.8409 | 0.6062 | 0.3638 | 0.6793 | 0.9572 | 0.4050 | 0.6033 | 0.7820 | 0.5277 | 0.6267 | |
|
|
| 0.3466 | 5050 | 3.8568 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.3775 | 5500 | 2.4815 | 0.3515 | 0.6371 | 0.9249 | 0.5024 | 0.8714 | 0.5951 | 0.3559 | 0.6958 | 0.9738 | 0.4188 | 0.5704 | 0.8005 | 0.5287 | 0.6328 | |
|
|
| 0.3809 | 5550 | 0.8767 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.4118 | 6000 | 2.8857 | 0.3370 | 0.6572 | 0.9093 | 0.5114 | 0.8845 | 0.6167 | 0.3607 | 0.6978 | 0.9675 | 0.4366 | 0.5420 | 0.8066 | 0.5399 | 0.6359 | |
|
|
| 0.4152 | 6050 | 4.0425 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.4461 | 6500 | 2.1285 | 0.3643 | 0.6483 | 0.9097 | 0.5183 | 0.8664 | 0.6320 | 0.3609 | 0.7104 | 0.9673 | 0.4278 | 0.5570 | 0.8011 | 0.5425 | 0.6389 | |
|
|
| 0.4495 | 6550 | 3.4573 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.4804 | 7000 | 4.2792 | 0.3313 | 0.6685 | 0.9194 | 0.5153 | 0.8760 | 0.6245 | 0.3654 | 0.7160 | 0.9611 | 0.4190 | 0.5751 | 0.7887 | 0.5562 | 0.6397 | |
|
|
| 0.4838 | 7050 | 1.9176 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.5147 | 7500 | 3.0862 | 0.3277 | 0.6472 | 0.9035 | 0.5352 | 0.8700 | 0.6237 | 0.3486 | 0.7176 | 0.9654 | 0.4276 | 0.5619 | 0.7786 | 0.5489 | 0.6351 | |
|
|
| 0.5182 | 7550 | 1.942 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.5490 | 8000 | 2.1192 | 0.3355 | 0.6564 | 0.9309 | 0.5218 | 0.8688 | 0.6458 | 0.3540 | 0.7075 | 0.9691 | 0.4101 | 0.5733 | 0.7808 | 0.5434 | 0.6383 | |
|
|
| 0.5525 | 8050 | 2.9456 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.5834 | 8500 | 3.8136 | 0.3470 | 0.6382 | 0.9309 | 0.5277 | 0.8581 | 0.6432 | 0.3533 | 0.6766 | 0.9686 | 0.4272 | 0.5573 | 0.8036 | 0.5352 | 0.6359 | |
|
|
| 0.5868 | 8550 | 2.2182 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.6177 | 9000 | 3.4301 | 0.3418 | 0.6299 | 0.9196 | 0.5272 | 0.8482 | 0.6435 | 0.3487 | 0.7118 | 0.9686 | 0.4260 | 0.5727 | 0.8009 | 0.5402 | 0.6369 | |
|
|
| 0.6211 | 9050 | 1.8215 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.6520 | 9500 | 1.728 | 0.3491 | 0.6376 | 0.8956 | 0.5294 | 0.8523 | 0.6730 | 0.3468 | 0.6982 | 0.9728 | 0.4139 | 0.5757 | 0.7896 | 0.5438 | 0.6367 | |
|
|
| 0.6554 | 9550 | 1.6046 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.6863 | 10000 | 3.2371 | 0.3511 | 0.6495 | 0.8984 | 0.5236 | 0.8408 | 0.6371 | 0.3571 | 0.7229 | 0.9675 | 0.4286 | 0.6078 | 0.7906 | 0.5409 | 0.6397 | |
|
|
| 0.6897 | 10050 | 0.7697 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.7206 | 10500 | 1.8522 | 0.3443 | 0.6459 | 0.9096 | 0.5073 | 0.8310 | 0.6554 | 0.3558 | 0.7143 | 0.9651 | 0.4333 | 0.5888 | 0.7959 | 0.5351 | 0.6371 | |
|
|
| 0.7240 | 10550 | 2.0346 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.7549 | 11000 | 3.3423 | 0.3527 | 0.6543 | 0.9204 | 0.5239 | 0.8794 | 0.6599 | 0.3477 | 0.7359 | 0.9709 | 0.4315 | 0.5812 | 0.8018 | 0.5267 | 0.6451 | |
|
|
| 0.7584 | 11050 | 1.9674 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.7892 | 11500 | 2.6639 | 0.3671 | 0.6534 | 0.9254 | 0.5337 | 0.8470 | 0.6734 | 0.3436 | 0.7249 | 0.9600 | 0.4282 | 0.5881 | 0.8062 | 0.5336 | 0.6450 | |
|
|
| 0.7927 | 11550 | 2.5904 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.8236 | 12000 | 3.1084 | 0.3631 | 0.6534 | 0.9043 | 0.5061 | 0.8514 | 0.6698 | 0.3383 | 0.7349 | 0.9647 | 0.4329 | 0.5784 | 0.7941 | 0.5416 | 0.6410 | |
|
|
| 0.8270 | 12050 | 1.1884 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.8579 | 12500 | 1.0438 | 0.3353 | 0.6592 | 0.9085 | 0.4985 | 0.8414 | 0.6357 | 0.3449 | 0.7182 | 0.9792 | 0.4176 | 0.5767 | 0.7912 | 0.5307 | 0.6336 | |
|
|
| 0.8613 | 12550 | 1.422 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.8922 | 13000 | 1.8963 | 0.3529 | 0.6538 | 0.9161 | 0.5349 | 0.8396 | 0.6563 | 0.3467 | 0.7182 | 0.9796 | 0.4286 | 0.5852 | 0.8058 | 0.5374 | 0.6427 | |
|
|
| 0.8956 | 13050 | 3.0346 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.9265 | 13500 | 1.6398 | 0.3477 | 0.6604 | 0.9121 | 0.5267 | 0.8496 | 0.6493 | 0.3423 | 0.7234 | 0.9730 | 0.4321 | 0.5767 | 0.7996 | 0.5368 | 0.6408 | |
|
|
| 0.9299 | 13550 | 1.3234 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.9608 | 14000 | 2.0053 | 0.3489 | 0.6655 | 0.9194 | 0.5436 | 0.8521 | 0.6605 | 0.3414 | 0.7221 | 0.9753 | 0.4324 | 0.5817 | 0.7986 | 0.5366 | 0.6445 | |
|
|
| 0.9642 | 14050 | 1.2547 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
| 0.9951 | 14500 | 1.4897 | 0.3518 | 0.6608 | 0.9268 | 0.5355 | 0.8602 | 0.6668 | 0.3421 | 0.7233 | 0.9747 | 0.4296 | 0.5817 | 0.8003 | 0.5345 | 0.6452 | |
|
|
| 0.9986 | 14550 | 3.0093 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | |
|
|
|
|
|
</details> |
|
|
|
|
|
### Framework Versions |
|
|
- Python: 3.13.0 |
|
|
- Sentence Transformers: 5.1.1 |
|
|
- PyLate: 1.3.4 |
|
|
- Transformers: 4.48.3 |
|
|
- PyTorch: 2.6.0 |
|
|
- Accelerate: 1.12.0 |
|
|
- Datasets: 4.4.1 |
|
|
- Tokenizers: 0.21.0 |
|
|
|
|
|
|
|
|
## Citation |
|
|
|
|
|
### BibTeX |
|
|
|
|
|
#### ColBERT-Zero |
|
|
```bibtex |
|
|
@misc{chaffin2026colbertzeropretrainpretraincolbert, |
|
|
title = {ColBERT-Zero: To Pre-train Or Not To Pre-train ColBERT models}, |
|
|
author = {Antoine Chaffin and Luca Arnaboldi and Amélie Chatelain and Florent Krzakala}, |
|
|
year = {2026}, |
|
|
eprint = {2602.16609}, |
|
|
archivePrefix = {arXiv}, |
|
|
primaryClass = {cs.CL}, |
|
|
url = {https://arxiv.org/abs/2602.16609}, |
|
|
} |
|
|
``` |
|
|
#### Sentence Transformers |
|
|
```bibtex |
|
|
@inproceedings{reimers-2019-sentence-bert, |
|
|
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", |
|
|
author = "Reimers, Nils and Gurevych, Iryna", |
|
|
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", |
|
|
month = "11", |
|
|
year = "2019", |
|
|
publisher = "Association for Computational Linguistics", |
|
|
url = "https://arxiv.org/abs/1908.10084" |
|
|
} |
|
|
``` |
|
|
#### PyLate |
|
|
```bibtex |
|
|
@inproceedings{DBLP:conf/cikm/ChaffinS25, |
|
|
author = {Antoine Chaffin and |
|
|
                  Rapha{\"{e}}l Sourty}, |
|
|
editor = {Meeyoung Cha and |
|
|
Chanyoung Park and |
|
|
Noseong Park and |
|
|
Carl Yang and |
|
|
Senjuti Basu Roy and |
|
|
Jessie Li and |
|
|
Jaap Kamps and |
|
|
Kijung Shin and |
|
|
Bryan Hooi and |
|
|
Lifang He}, |
|
|
title = {PyLate: Flexible Training and Retrieval for Late Interaction Models}, |
|
|
booktitle = {Proceedings of the 34th {ACM} International Conference on Information |
|
|
and Knowledge Management, {CIKM} 2025, Seoul, Republic of Korea, November |
|
|
10-14, 2025}, |
|
|
pages = {6334--6339}, |
|
|
publisher = {{ACM}}, |
|
|
year = {2025}, |
|
|
url = {https://github.com/lightonai/pylate}, |
|
|
doi = {10.1145/3746252.3761608}, |
|
|
} |
|
|
``` |
|
|
#### Nomic Embed |
|
|
```bibtex |
|
|
@article{DBLP:journals/tmlr/NussbaumMMD25, |
|
|
author = {Zach Nussbaum and |
|
|
John Xavier Morris and |
|
|
Andriy Mulyar and |
|
|
Brandon Duderstadt}, |
|
|
title = {Nomic Embed: Training a Reproducible Long Context Text Embedder}, |
|
|
journal = {Trans. Mach. Learn. Res.}, |
|
|
volume = {2025}, |
|
|
year = {2025}, |
|
|
url = {https://openreview.net/forum?id=IPmzyQSiQE}, |
|
|
timestamp = {Fri, 20 Jun 2025 14:19:48 +0200}, |
|
|
biburl = {https://dblp.org/rec/journals/tmlr/NussbaumMMD25.bib}, |
|
|
bibsource = {dblp computer science bibliography, https://dblp.org} |
|
|
} |
|
|
``` |
|
|
|
|
|
#### CachedContrastive |
|
|
```bibtex |
|
|
@misc{gao2021scaling, |
|
|
    title = {Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup}, |
|
|
author = {Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan}, |
|
|
year = {2021}, |
|
|
eprint = {2101.06983}, |
|
|
archivePrefix = {arXiv}, |
|
|
    primaryClass = {cs.LG} |
|
|
} |
|
|
``` |
|
|
|
|
|
<!-- |
|
|
## Glossary |
|
|
|
|
|
*Clearly define terms in order to be accessible across audiences.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
## Model Card Authors |
|
|
|
|
|
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
## Model Card Contact |
|
|
|
|
|
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.* |
|
|
--> |