# Top-level leaderboard settings: two repository ids and the display name
# (going by the key names; the consumer of these keys is not part of this file).
config:
  REPO_ID: "mteb/leaderboard"
  RESULTS_REPO: "mteb/results"
  LEADERBOARD_NAME: "MTEB Leaderboard"
# Per-task-type display metadata. Every entry carries the same four keys:
#   icon               emoji shown for the task type
#   metric             key of the main score
#   metric_description human-readable metric name (Markdown links allowed)
#   task_description   one-sentence explanation of the task
#
# NOTE(review): the icons were restored from mis-decoded UTF-8. The icons of
# BitextMining, Retrieval, STS, Summarization and InstructionRetrieval had
# kept too few bytes to be identified with certainty - confirm them against
# the upstream config.
tasks:
  BitextMining:
    icon: "🎌"
    metric: f1
    metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
    task_description: "Bitext mining is the task of finding parallel sentences in two languages."
  Classification:
    icon: "❤️"
    metric: accuracy
    metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
    task_description: "Classification is the task of assigning a label to a text."
  Clustering:
    icon: "✨"
    metric: v_measure
    metric_description: "Validity Measure (V-measure)"
    task_description: "Clustering is the task of grouping similar documents together."
  PairClassification:
    icon: "🎭"
    metric: max_ap
    metric_description: "Average Precision (AP) based on the model's similarity metric (usually cosine)"
    task_description: "Pair classification is the task of determining whether two texts are similar."
  Reranking:
    icon: "🥈"
    metric: map
    metric_description: "Mean Average Precision (MAP)"
    task_description: "Reranking is the task of reordering a list of documents to improve relevance."
  Retrieval:
    icon: "🔎"
    metric: ndcg_at_10
    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
    task_description: "Retrieval is the task of finding relevant documents for a query."
  STS:
    icon: "☘️"
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
  Summarization:
    icon: "📜"
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Summarization is the task of generating a summary of a text."
  MultilabelClassification:
    icon: "🏷️"
    metric: accuracy
    metric_description: "Accuracy"
    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
  InstructionRetrieval:
    icon: "🔎📋"
    metric: "p-MRR"
    metric_description: "paired mean reciprocal rank (p-MRR)"
    task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
# Leaderboard boards. Each board has display fields (title, language_long,
# has_overall, acronym, icon, special_icons, credits), optional overrides
# (metric, split, desc) and a `tasks` mapping from task type to the dataset
# names that belong to the board.
#
# NOTE(review): icons and accented names were restored from mis-decoded
# UTF-8. Flag icons and the zh/da/pl heart icons were recoverable from the
# surviving bytes. The fr/no/se/other-cls special_icons and the law,
# longembed, rar-b, bright and bright_long icons were not - confirm those
# against the upstream config.
boards:
  en:
    title: "English"
    language_long: "English"
    has_overall: true
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (en)
        - AmazonPolarityClassification
        - AmazonReviewsClassification (en)
        - Banking77Classification
        - EmotionClassification
        - ImdbClassification
        - MassiveIntentClassification (en)
        - MassiveScenarioClassification (en)
        - MTOPDomainClassification (en)
        - MTOPIntentClassification (en)
        - ToxicConversationsClassification
        - TweetSentimentExtractionClassification
      Clustering:
        - ArxivClusteringP2P
        - ArxivClusteringS2S
        - BiorxivClusteringP2P
        - BiorxivClusteringS2S
        - MedrxivClusteringP2P
        - MedrxivClusteringS2S
        - RedditClustering
        - RedditClusteringP2P
        - StackExchangeClustering
        - StackExchangeClusteringP2P
        - TwentyNewsgroupsClustering
      PairClassification:
        - SprintDuplicateQuestions
        - TwitterSemEval2015
        - TwitterURLCorpus
      Reranking:
        - AskUbuntuDupQuestions
        - MindSmallReranking
        - SciDocsRR
        - StackOverflowDupQuestions
      Retrieval:
        - ArguAna
        - ClimateFEVER
        - CQADupstackRetrieval
        - DBPedia
        - FEVER
        - FiQA2018
        - HotpotQA
        - MSMARCO
        - NFCorpus
        - NQ
        - QuoraRetrieval
        - SCIDOCS
        - SciFact
        - Touche2020
        - TRECCOVID
      STS:
        - BIOSSES
        - SICK-R
        - STS12
        - STS13
        - STS14
        - STS15
        - STS16
        - STS17 (en-en)
        - STS22 (en)
        - STSBenchmark
      Summarization:
        - SummEval

  en-x:
    title: "English-X"
    language_long: "117 (Pairs of: English & other language)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      BitextMining:
        - BUCC (de-en)
        - BUCC (fr-en)
        - BUCC (ru-en)
        - BUCC (zh-en)
        - Tatoeba (afr-eng)
        - Tatoeba (amh-eng)
        - Tatoeba (ang-eng)
        - Tatoeba (ara-eng)
        - Tatoeba (arq-eng)
        - Tatoeba (arz-eng)
        - Tatoeba (ast-eng)
        - Tatoeba (awa-eng)
        - Tatoeba (aze-eng)
        - Tatoeba (bel-eng)
        - Tatoeba (ben-eng)
        - Tatoeba (ber-eng)
        - Tatoeba (bos-eng)
        - Tatoeba (bre-eng)
        - Tatoeba (bul-eng)
        - Tatoeba (cat-eng)
        - Tatoeba (cbk-eng)
        - Tatoeba (ceb-eng)
        - Tatoeba (ces-eng)
        - Tatoeba (cha-eng)
        - Tatoeba (cmn-eng)
        - Tatoeba (cor-eng)
        - Tatoeba (csb-eng)
        - Tatoeba (cym-eng)
        - Tatoeba (dan-eng)
        - Tatoeba (deu-eng)
        - Tatoeba (dsb-eng)
        - Tatoeba (dtp-eng)
        - Tatoeba (ell-eng)
        - Tatoeba (epo-eng)
        - Tatoeba (est-eng)
        - Tatoeba (eus-eng)
        - Tatoeba (fao-eng)
        - Tatoeba (fin-eng)
        - Tatoeba (fra-eng)
        - Tatoeba (fry-eng)
        - Tatoeba (gla-eng)
        - Tatoeba (gle-eng)
        - Tatoeba (glg-eng)
        - Tatoeba (gsw-eng)
        - Tatoeba (heb-eng)
        - Tatoeba (hin-eng)
        - Tatoeba (hrv-eng)
        - Tatoeba (hsb-eng)
        - Tatoeba (hun-eng)
        - Tatoeba (hye-eng)
        - Tatoeba (ido-eng)
        - Tatoeba (ile-eng)
        - Tatoeba (ina-eng)
        - Tatoeba (ind-eng)
        - Tatoeba (isl-eng)
        - Tatoeba (ita-eng)
        - Tatoeba (jav-eng)
        - Tatoeba (jpn-eng)
        - Tatoeba (kab-eng)
        - Tatoeba (kat-eng)
        - Tatoeba (kaz-eng)
        - Tatoeba (khm-eng)
        - Tatoeba (kor-eng)
        - Tatoeba (kur-eng)
        - Tatoeba (kzj-eng)
        - Tatoeba (lat-eng)
        - Tatoeba (lfn-eng)
        - Tatoeba (lit-eng)
        - Tatoeba (lvs-eng)
        - Tatoeba (mal-eng)
        - Tatoeba (mar-eng)
        - Tatoeba (max-eng)
        - Tatoeba (mhr-eng)
        - Tatoeba (mkd-eng)
        - Tatoeba (mon-eng)
        - Tatoeba (nds-eng)
        - Tatoeba (nld-eng)
        - Tatoeba (nno-eng)
        - Tatoeba (nob-eng)
        - Tatoeba (nov-eng)
        - Tatoeba (oci-eng)
        - Tatoeba (orv-eng)
        - Tatoeba (pam-eng)
        - Tatoeba (pes-eng)
        - Tatoeba (pms-eng)
        - Tatoeba (pol-eng)
        - Tatoeba (por-eng)
        - Tatoeba (ron-eng)
        - Tatoeba (rus-eng)
        - Tatoeba (slk-eng)
        - Tatoeba (slv-eng)
        - Tatoeba (spa-eng)
        - Tatoeba (sqi-eng)
        - Tatoeba (srp-eng)
        - Tatoeba (swe-eng)
        - Tatoeba (swg-eng)
        - Tatoeba (swh-eng)
        - Tatoeba (tam-eng)
        - Tatoeba (tat-eng)
        - Tatoeba (tel-eng)
        - Tatoeba (tgl-eng)
        - Tatoeba (tha-eng)
        - Tatoeba (tuk-eng)
        - Tatoeba (tur-eng)
        - Tatoeba (tzl-eng)
        - Tatoeba (uig-eng)
        - Tatoeba (ukr-eng)
        - Tatoeba (urd-eng)
        - Tatoeba (uzb-eng)
        - Tatoeba (vie-eng)
        - Tatoeba (war-eng)
        - Tatoeba (wuu-eng)
        - Tatoeba (xho-eng)
        - Tatoeba (yid-eng)
        - Tatoeba (yue-eng)
        - Tatoeba (zsm-eng)

  zh:
    title: "Chinese"
    language_long: "Chinese"
    has_overall: true
    acronym: "C-MTEB"
    icon: "🇨🇳"
    special_icons:
      Classification: "🧡"
    credits: "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)"
    tasks:
      Classification:
        - AmazonReviewsClassification (zh)
        - IFlyTek
        - JDReview
        - MassiveIntentClassification (zh-CN)
        - MassiveScenarioClassification (zh-CN)
        - MultilingualSentiment
        - OnlineShopping
        - TNews
        - Waimai
      Clustering:
        - CLSClusteringP2P
        - CLSClusteringS2S
        - ThuNewsClusteringP2P
        - ThuNewsClusteringS2S
      PairClassification:
        - Cmnli
        - Ocnli
      Reranking:
        - CMedQAv1
        - CMedQAv2
        - MMarcoReranking
        - T2Reranking
      Retrieval:
        - CmedqaRetrieval
        - CovidRetrieval
        - DuRetrieval
        - EcomRetrieval
        - MedicalRetrieval
        - MMarcoRetrieval
        - T2Retrieval
        - VideoRetrieval
      STS:
        - AFQMC
        - ATEC
        - BQ
        - LCQMC
        - PAWSX
        - QBQTC
        - STS22 (zh)
        - STSB

  da:
    title: "Danish"
    language_long: "Danish"
    has_overall: false
    acronym: null
    icon: "🇩🇰"
    special_icons:
      Classification: "🤍"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      BitextMining:
        - BornholmBitextMining
      Classification:
        - AngryTweetsClassification
        - DanishPoliticalCommentsClassification
        - DKHateClassification
        - LccSentimentClassification
        - MassiveIntentClassification (da)
        - MassiveScenarioClassification (da)
        - NordicLangClassification
        - ScalaDaClassification

  fr:
    title: "French"
    language_long: "French"
    has_overall: true
    acronym: "F-MTEB"
    icon: "🇫🇷"
    special_icons:
      Classification: "💙"
    credits: "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)"
    tasks:
      Classification:
        - AmazonReviewsClassification (fr)
        - MasakhaNEWSClassification (fra)
        - MassiveIntentClassification (fr)
        - MassiveScenarioClassification (fr)
        - MTOPDomainClassification (fr)
        - MTOPIntentClassification (fr)
      Clustering:
        - AlloProfClusteringP2P
        - AlloProfClusteringS2S
        - HALClusteringS2S
        - MLSUMClusteringP2P (fr)
        - MLSUMClusteringS2S (fr)
        - MasakhaNEWSClusteringP2P (fra)
        - MasakhaNEWSClusteringS2S (fra)
      PairClassification:
        - OpusparcusPC (fr)
        - PawsXPairClassification (fr)
      Reranking:
        - AlloprofReranking
        - SyntecReranking
      Retrieval:
        - AlloprofRetrieval
        - BSARDRetrieval
        - MintakaRetrieval (fr)
        - SyntecRetrieval
        - XPQARetrieval (fr)
      STS:
        - STS22 (fr)
        - STSBenchmarkMultilingualSTS (fr)
        - SICKFr
      Summarization:
        - SummEvalFr

  # The key stays quoted: a bare `no` is read as boolean false by YAML 1.1
  # parsers.
  'no':
    title: "Norwegian"
    language_long: "Norwegian Bokmål"
    has_overall: false
    acronym: null
    icon: "🇳🇴"
    special_icons:
      Classification: "💙"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      Classification:
        - NoRecClassification
        - NordicLangClassification
        - NorwegianParliament
        - MassiveIntentClassification (nb)
        - MassiveScenarioClassification (nb)
        - ScalaNbClassification

  instructions:
    title: "English"
    language_long: "English"
    has_overall: false
    acronym: null
    icon: null
    # Spelled out so this board exposes the same keys as every other board.
    special_icons: null
    credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)"
    tasks:
      InstructionRetrieval:
        - Robust04InstructionRetrieval
        - News21InstructionRetrieval
        - Core17InstructionRetrieval

  de:
    title: "German"
    language_long: "German"
    has_overall: false
    acronym: null
    icon: "🇩🇪"
    special_icons: null
    credits: "[Silvan](https://github.com/slvnwhrl)"
    tasks:
      Clustering:
        - BlurbsClusteringP2P
        - BlurbsClusteringS2S
        - TenKGnadClusteringP2P
        - TenKGnadClusteringS2S

  pl:
    title: "Polish"
    language_long: "Polish"
    has_overall: true
    acronym: null
    icon: "🇵🇱"
    special_icons:
      Classification: "🤍"
    credits: "[Rafał Poświata](https://github.com/rafalposwiata)"
    tasks:
      Classification:
        - AllegroReviews
        - CBD
        - MassiveIntentClassification (pl)
        - MassiveScenarioClassification (pl)
        - PAC
        - PolEmo2.0-IN
        - PolEmo2.0-OUT
      Clustering:
        - 8TagsClustering
      PairClassification:
        - CDSC-E
        - PPC
        - PSC
        - SICK-E-PL
      Retrieval:
        - ArguAna-PL
        - DBPedia-PL
        - FiQA-PL
        - HotpotQA-PL
        - MSMARCO-PL
        - NFCorpus-PL
        - NQ-PL
        - Quora-PL
        - SCIDOCS-PL
        - SciFact-PL
        - TRECCOVID-PL
      STS:
        - CDSC-R
        - SICK-R-PL
        - STS22 (pl)

  ru:
    title: "Russian"
    language_long: "Russian"
    has_overall: true
    acronym: null
    icon: "🇷🇺"
    special_icons: null
    credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
    tasks:
      Classification:
        - GeoreviewClassification
        - HeadlineClassification
        - InappropriatenessClassification
        - KinopoiskClassification
        - RuReviewsClassification
        - RuSciBenchGRNTIClassification
        - RuSciBenchOECDClassification
        - MassiveIntentClassification (ru)
        - MassiveScenarioClassification (ru)
      Clustering:
        - GeoreviewClusteringP2P
        - RuSciBenchGRNTIClusteringP2P
        - RuSciBenchOECDClusteringP2P
      PairClassification:
        - TERRa
      Reranking:
        - RuBQReranking
        - MIRACLReranking (ru)
      Retrieval:
        - RiaNewsRetrieval
        - RuBQRetrieval
        - MIRACLRetrieval (ru)
      STS:
        - RUParaPhraserSTS
        - RuSTSBenchmarkSTS
        - STS22 (ru)
      MultilabelClassification:
        - CEDRClassification
        - SensitiveTopicsClassification

  se:
    title: "Swedish"
    language_long: "Swedish"
    has_overall: false
    acronym: null
    icon: "🇸🇪"
    special_icons:
      Classification: "💛"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      # Swedish datasets. The previous list was a verbatim copy of the
      # Norwegian ('no') board (NoRec, NorwegianParliament, "(nb)" subsets).
      # NOTE(review): confirm these names against the MTEB task list.
      Classification:
        - DalajClassification
        - MassiveIntentClassification (sv)
        - MassiveScenarioClassification (sv)
        - NordicLangClassification
        - ScalaSvClassification
        - SweRecClassification

  other-cls:
    title: "Other Languages"
    language_long: "47 (Only languages not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons:
      Classification: "💜💚💙"
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (de)
        - AmazonCounterfactualClassification (ja)
        - AmazonReviewsClassification (de)
        - AmazonReviewsClassification (es)
        - AmazonReviewsClassification (fr)
        - AmazonReviewsClassification (ja)
        - AmazonReviewsClassification (zh)
        - MTOPDomainClassification (de)
        - MTOPDomainClassification (es)
        - MTOPDomainClassification (fr)
        - MTOPDomainClassification (hi)
        - MTOPDomainClassification (th)
        - MTOPIntentClassification (de)
        - MTOPIntentClassification (es)
        - MTOPIntentClassification (fr)
        - MTOPIntentClassification (hi)
        - MTOPIntentClassification (th)
        - MassiveIntentClassification (af)
        - MassiveIntentClassification (am)
        - MassiveIntentClassification (ar)
        - MassiveIntentClassification (az)
        - MassiveIntentClassification (bn)
        - MassiveIntentClassification (cy)
        - MassiveIntentClassification (de)
        - MassiveIntentClassification (el)
        - MassiveIntentClassification (es)
        - MassiveIntentClassification (fa)
        - MassiveIntentClassification (fi)
        - MassiveIntentClassification (fr)
        - MassiveIntentClassification (he)
        - MassiveIntentClassification (hi)
        - MassiveIntentClassification (hu)
        - MassiveIntentClassification (hy)
        - MassiveIntentClassification (id)
        - MassiveIntentClassification (is)
        - MassiveIntentClassification (it)
        - MassiveIntentClassification (ja)
        - MassiveIntentClassification (jv)
        - MassiveIntentClassification (ka)
        - MassiveIntentClassification (km)
        - MassiveIntentClassification (kn)
        - MassiveIntentClassification (ko)
        - MassiveIntentClassification (lv)
        - MassiveIntentClassification (ml)
        - MassiveIntentClassification (mn)
        - MassiveIntentClassification (ms)
        - MassiveIntentClassification (my)
        - MassiveIntentClassification (nl)
        - MassiveIntentClassification (pt)
        - MassiveIntentClassification (ro)
        - MassiveIntentClassification (ru)
        - MassiveIntentClassification (sl)
        - MassiveIntentClassification (sq)
        - MassiveIntentClassification (sw)
        - MassiveIntentClassification (ta)
        - MassiveIntentClassification (te)
        - MassiveIntentClassification (th)
        - MassiveIntentClassification (tl)
        - MassiveIntentClassification (tr)
        - MassiveIntentClassification (ur)
        - MassiveIntentClassification (vi)
        - MassiveIntentClassification (zh-TW)
        - MassiveScenarioClassification (af)
        - MassiveScenarioClassification (am)
        - MassiveScenarioClassification (ar)
        - MassiveScenarioClassification (az)
        - MassiveScenarioClassification (bn)
        - MassiveScenarioClassification (cy)
        - MassiveScenarioClassification (de)
        - MassiveScenarioClassification (el)
        - MassiveScenarioClassification (es)
        - MassiveScenarioClassification (fa)
        - MassiveScenarioClassification (fi)
        - MassiveScenarioClassification (fr)
        - MassiveScenarioClassification (he)
        - MassiveScenarioClassification (hi)
        - MassiveScenarioClassification (hu)
        - MassiveScenarioClassification (hy)
        - MassiveScenarioClassification (id)
        - MassiveScenarioClassification (is)
        - MassiveScenarioClassification (it)
        - MassiveScenarioClassification (ja)
        - MassiveScenarioClassification (jv)
        - MassiveScenarioClassification (ka)
        - MassiveScenarioClassification (km)
        - MassiveScenarioClassification (kn)
        - MassiveScenarioClassification (ko)
        - MassiveScenarioClassification (lv)
        - MassiveScenarioClassification (ml)
        - MassiveScenarioClassification (mn)
        - MassiveScenarioClassification (ms)
        - MassiveScenarioClassification (my)
        - MassiveScenarioClassification (nl)
        - MassiveScenarioClassification (pt)
        - MassiveScenarioClassification (ro)
        - MassiveScenarioClassification (ru)
        - MassiveScenarioClassification (sl)
        - MassiveScenarioClassification (sq)
        - MassiveScenarioClassification (sw)
        - MassiveScenarioClassification (ta)
        - MassiveScenarioClassification (te)
        - MassiveScenarioClassification (th)
        - MassiveScenarioClassification (tl)
        - MassiveScenarioClassification (tr)
        - MassiveScenarioClassification (ur)
        - MassiveScenarioClassification (vi)
        - MassiveScenarioClassification (zh-TW)

  other-sts:
    title: "Other"
    language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      STS:
        - STS17 (ar-ar)
        - STS17 (en-ar)
        - STS17 (en-de)
        - STS17 (en-tr)
        - STS17 (es-en)
        - STS17 (es-es)
        - STS17 (fr-en)
        - STS17 (it-en)
        - STS17 (ko-ko)
        - STS17 (nl-en)
        - STS22 (ar)
        - STS22 (de)
        - STS22 (de-en)
        - STS22 (de-fr)
        - STS22 (de-pl)
        - STS22 (es)
        - STS22 (es-en)
        - STS22 (es-it)
        - STS22 (fr)
        - STS22 (fr-pl)
        - STS22 (it)
        - STS22 (pl)
        - STS22 (pl-en)
        - STS22 (ru)
        - STS22 (tr)
        - STS22 (zh-en)
        - STSBenchmark

  law:
    title: "Law"
    language_long: "English, German, Chinese"
    has_overall: false
    acronym: null
    icon: "⚖️"
    special_icons: null
    credits: "[Voyage AI](https://www.voyageai.com/)"
    tasks:
      Retrieval:
        - AILACasedocs
        - AILAStatutes
        - GerDaLIRSmall
        - LeCaRDv2
        - LegalBenchConsumerContractsQA
        - LegalBenchCorporateLobbying
        - LegalQuAD
        - LegalSummarization

  longembed:
    title: "LongEmbed"
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "📚"
    special_icons: null
    credits: "[LongEmbed (Dawei Zhu et al.)](https://arxiv.org/abs/2404.12096v2)"
    # Quoted because the text contains `&` and `@`.
    metric: "nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)"
    tasks:
      Retrieval:
        - LEMBNarrativeQARetrieval
        - LEMBNeedleRetrieval
        - LEMBPasskeyRetrieval
        - LEMBQMSumRetrieval
        - LEMBSummScreenFDRetrieval
        - LEMBWikimQARetrieval

  rar-b:
    title: "RAR-b"
    language_long: "English"
    has_overall: false
    acronym: null
    # NOTE(review): only the lead byte of this emoji survived - verify.
    icon: "📚"
    special_icons: null
    credits: "[RAR-b (Chenghao Xiao et al.)](https://arxiv.org/abs/2404.06347/)"
    metric: "nDCG@10"
    tasks:
      Retrieval:
        - ARCChallenge
        - AlphaNLI
        - HellaSwag
        - PIQA
        - Quail
        - RARbCode
        - RARbMath
        - SIQA
        - SpartQA
        - TempReasonL1
        - TempReasonL2Fact
        - TempReasonL2Pure
        - TempReasonL3Fact
        - TempReasonL3Pure
        - WinoGrande

  bright:
    title: "BRIGHT"
    language_long: "English"
    has_overall: false
    acronym: null
    # NOTE(review): only the lead byte of this emoji survived - verify.
    icon: "🌟"
    special_icons: null
    credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)"
    metric: "nDCG@10"
    split: "standard"
    desc: "This tab only allows submissions with the original queries; not results from LLM rewritten queries or using reranking."
    tasks:
      Retrieval:
        - BrightRetrieval (biology)
        - BrightRetrieval (earth_science)
        - BrightRetrieval (economics)
        - BrightRetrieval (psychology)
        - BrightRetrieval (robotics)
        - BrightRetrieval (stackoverflow)
        - BrightRetrieval (sustainable_living)
        - BrightRetrieval (pony)
        - BrightRetrieval (leetcode)
        - BrightRetrieval (aops)
        - BrightRetrieval (theoremqa_theorems)
        - BrightRetrieval (theoremqa_questions)

  bright_long:
    title: "BRIGHT Long"
    language_long: "English"
    has_overall: false
    acronym: null
    # NOTE(review): only the lead byte of this emoji survived - verify.
    icon: "🌟"
    special_icons: null
    credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)"
    metric: "Recall@1"
    split: "long"
    desc: "This tab is for the long document setting of BRIGHT."
    tasks:
      Retrieval:
        - BrightRetrieval (biology)
        - BrightRetrieval (earth_science)
        - BrightRetrieval (economics)
        - BrightRetrieval (psychology)
        - BrightRetrieval (robotics)
        - BrightRetrieval (stackoverflow)
        - BrightRetrieval (sustainable_living)
        - BrightRetrieval (pony)

  coir:
    title: "CoIR"
    language_long: "Code"
    has_overall: false
    acronym: null
    icon: "💻"
    special_icons: null
    credits: "[Samoed](https://github.com/Samoed) and [monikernemo](https://github.com/monikernemo) and [CoIR (Xiangyang Li, Kuicai Dong, Yi Quan Lee et al.)](https://arxiv.org/abs/2407.02883)"
    metric: "nDCG@10"
    tasks:
      Retrieval:
        - AppsRetrieval
        - CodeFeedbackMT
        - CodeFeedbackST
        - CodeSearchNetCCRetrieval (python)
        - CodeSearchNetCCRetrieval (javascript)
        - CodeSearchNetCCRetrieval (go)
        - CodeSearchNetCCRetrieval (ruby)
        - CodeSearchNetCCRetrieval (java)
        - CodeSearchNetCCRetrieval (php)
        - CodeSearchNetRetrieval (python)
        - CodeSearchNetRetrieval (javascript)
        - CodeSearchNetRetrieval (go)
        - CodeSearchNetRetrieval (ruby)
        - CodeSearchNetRetrieval (java)
        - CodeSearchNetRetrieval (php)
        - CodeTransOceanContest
        - CodeTransOceanDL
        - CosQA
        - StackOverflowQA
        - SyntheticText2SQL
|
|