| """All the official datasets in the benchmark.""" |
|
|
| from pydantic import BaseModel |
|
|
| from languages import * |
| from tasks import * |
|
|
|
|
class Dataset(BaseModel):
    """Description of one benchmark dataset: its name, language and task."""

    # Identifier of the dataset; also the sole input to ``__hash__``.
    name: str
    # Language the dataset is registered under.
    language: Language
    # Task the dataset is registered under.
    task: Task

    def __hash__(self) -> int:
        """Return a hash derived from the dataset name only."""
        return hash(self.name)
|
|
|
|
# Official Albanian datasets, listed as (dataset name, task) pairs.
ALBANIAN_DATASETS = [
    Dataset(name=dataset_name, language=ALBANIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("mms-sq", TEXT_CLASSIFICATION),
        ("scala-sq", GRAMMAR),
        ("wikiann-sq", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-sq", READING_COMPREHENSION),
        ("lr-sum-sq", SUMMARISATION),
        ("global-mmlu-lite-sq", KNOWLEDGE),
        ("winogrande-sq", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Belarusian datasets, listed as (dataset name, task) pairs.
BELARUSIAN_DATASETS = [
    Dataset(name=dataset_name, language=BELARUSIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("besls", TEXT_CLASSIFICATION),
        ("scala-be", GRAMMAR),
        ("wikiann-be", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-be", READING_COMPREHENSION),
        ("be-wsc", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Bosnian datasets, listed as (dataset name, task) pairs.
BOSNIAN_DATASETS = [
    Dataset(name=dataset_name, language=BOSNIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("mms-bs", TEXT_CLASSIFICATION),
        ("wikiann-bs", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-bs", READING_COMPREHENSION),
        ("lr-sum-bs", SUMMARISATION),
    )
]
|
|
# Official Bulgarian datasets, listed as (dataset name, task) pairs.
BULGARIAN_DATASETS = [
    Dataset(name=dataset_name, language=BULGARIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("bgt", TEXT_CLASSIFICATION),
        ("scala-bg", GRAMMAR),
        ("bner", INFORMATION_EXTRACTION),
        ("bgquad", READING_COMPREHENSION),
        ("cnn-dailymail-bg", SUMMARISATION),
        ("mmlu-bg", KNOWLEDGE),
        ("winogrande-bg", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Catalan datasets, listed as (dataset name, task) pairs.
CATALAN_DATASETS = [
    Dataset(name=dataset_name, language=CATALAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("guia-cat", TEXT_CLASSIFICATION),
        ("scala-ca", GRAMMAR),
        ("wikiann-ca", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-ca", READING_COMPREHENSION),
        ("dacsa-ca", SUMMARISATION),
        ("mmlu-ca", KNOWLEDGE),
        ("winogrande-ca", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Croatian datasets, listed as (dataset name, task) pairs.
CROATIAN_DATASETS = [
    Dataset(name=dataset_name, language=CROATIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("mms-hr", TEXT_CLASSIFICATION),
        ("scala-hr", GRAMMAR),
        ("wikiann-hr", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-hr", READING_COMPREHENSION),
        ("mmlu-hr", KNOWLEDGE),
        ("winogrande-hr", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Czech datasets, listed as (dataset name, task) pairs.
CZECH_DATASETS = [
    Dataset(name=dataset_name, language=CZECH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("csfd-sentiment", TEXT_CLASSIFICATION),
        ("cs-gec", GRAMMAR),
        ("poner", INFORMATION_EXTRACTION),
        ("sqad", READING_COMPREHENSION),
        ("czech-news", SUMMARISATION),
        ("umimeto-qa", KNOWLEDGE),
        ("hellaswag-cs", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Danish datasets, listed as (dataset name, task) pairs.
# Note that two datasets are registered under KNOWLEDGE.
DANISH_DATASETS = [
    Dataset(name=dataset_name, language=DANISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("angry-tweets", TEXT_CLASSIFICATION),
        ("scala-da", GRAMMAR),
        ("dansk", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-da", READING_COMPREHENSION),
        ("nordjylland-news", SUMMARISATION),
        ("danske-talemaader", KNOWLEDGE),
        ("danish-citizen-tests", KNOWLEDGE),
        ("hellaswag-da", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Dutch datasets, listed as (dataset name, task) pairs.
DUTCH_DATASETS = [
    Dataset(name=dataset_name, language=DUTCH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("dutch-social", TEXT_CLASSIFICATION),
        ("scala-nl", GRAMMAR),
        ("conll-nl", INFORMATION_EXTRACTION),
        ("squad-nl", READING_COMPREHENSION),
        ("wiki-lingua-nl", SUMMARISATION),
        ("mmlu-nl", KNOWLEDGE),
        ("hellaswag-nl", COMMON_SENSE_REASONING),
    )
]
|
|
# Official English datasets, listed as (dataset name, task) pairs.
ENGLISH_DATASETS = [
    Dataset(name=dataset_name, language=ENGLISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sst5", TEXT_CLASSIFICATION),
        ("scala-en", GRAMMAR),
        ("conll-en", INFORMATION_EXTRACTION),
        ("squad", READING_COMPREHENSION),
        ("cnn-dailymail", SUMMARISATION),
        ("life-in-the-uk", KNOWLEDGE),
        ("hellaswag", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Estonian datasets, listed as (dataset name, task) pairs.
ESTONIAN_DATASETS = [
    Dataset(name=dataset_name, language=ESTONIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("estonian-valence", TEXT_CLASSIFICATION),
        ("grammar-et", GRAMMAR),
        ("estner", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-et", READING_COMPREHENSION),
        ("err-news", SUMMARISATION),
        ("trivia-et", KNOWLEDGE),
        ("winogrande-et", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Faroese datasets, listed as (dataset name, task) pairs.
FAROESE_DATASETS = [
    Dataset(name=dataset_name, language=FAROESE, task=dataset_task)
    for dataset_name, dataset_task in (
        ("fosent", TEXT_CLASSIFICATION),
        ("scala-fo", GRAMMAR),
        ("fone", INFORMATION_EXTRACTION),
        ("foqa", READING_COMPREHENSION),
    )
]
|
|
# Official Finnish datasets, listed as (dataset name, task) pairs.
FINNISH_DATASETS = [
    Dataset(name=dataset_name, language=FINNISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("scandisent-fi", TEXT_CLASSIFICATION),
        ("scala-fi", GRAMMAR),
        ("turku-ner-fi", INFORMATION_EXTRACTION),
        ("tydiqa-fi", READING_COMPREHENSION),
        ("xlsum-fi", SUMMARISATION),
        ("hellaswag-fi", COMMON_SENSE_REASONING),
    )
]
|
|
# Official French datasets, listed as (dataset name, task) pairs.
FRENCH_DATASETS = [
    Dataset(name=dataset_name, language=FRENCH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("allocine", TEXT_CLASSIFICATION),
        ("scala-fr", GRAMMAR),
        ("eltec", INFORMATION_EXTRACTION),
        ("fquad", READING_COMPREHENSION),
        ("orange-sum", SUMMARISATION),
        ("mmlu-fr", KNOWLEDGE),
        ("hellaswag-fr", COMMON_SENSE_REASONING),
    )
]
|
|
# Official German datasets, listed as (dataset name, task) pairs.
GERMAN_DATASETS = [
    Dataset(name=dataset_name, language=GERMAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sb10k", TEXT_CLASSIFICATION),
        ("scala-de", GRAMMAR),
        ("germeval", INFORMATION_EXTRACTION),
        ("germanquad", READING_COMPREHENSION),
        ("mlsum", SUMMARISATION),
        ("mmlu-de", KNOWLEDGE),
        ("hellaswag-de", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Greek datasets, listed as (dataset name, task) pairs.
GREEK_DATASETS = [
    Dataset(name=dataset_name, language=GREEK, task=dataset_task)
    for dataset_name, dataset_task in (
        ("greek-sa", TEXT_CLASSIFICATION),
        ("scala-el", GRAMMAR),
        ("elner", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-el", READING_COMPREHENSION),
        ("greek-wikipedia", SUMMARISATION),
        ("global-mmlu-el", KNOWLEDGE),
        ("winogrande-el", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Hungarian datasets, listed as (dataset name, task) pairs.
HUNGARIAN_DATASETS = [
    Dataset(name=dataset_name, language=HUNGARIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("husst", TEXT_CLASSIFICATION),
        ("scala-hu", GRAMMAR),
        ("szeged-ner", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-hu", READING_COMPREHENSION),
        ("hunsum", SUMMARISATION),
        ("mmlu-hu", KNOWLEDGE),
        ("winogrande-hu", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Icelandic datasets, listed as (dataset name, task) pairs.
ICELANDIC_DATASETS = [
    Dataset(name=dataset_name, language=ICELANDIC, task=dataset_task)
    for dataset_name, dataset_task in (
        ("hotter-and-colder-sentiment", TEXT_CLASSIFICATION),
        ("scala-is", GRAMMAR),
        ("mim-gold-ner", INFORMATION_EXTRACTION),
        ("nqii", READING_COMPREHENSION),
        ("rrn", SUMMARISATION),
        ("icelandic-knowledge", KNOWLEDGE),
        ("winogrande-is", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Italian datasets, listed as (dataset name, task) pairs.
ITALIAN_DATASETS = [
    Dataset(name=dataset_name, language=ITALIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sentipolc16", TEXT_CLASSIFICATION),
        ("scala-it", GRAMMAR),
        ("multinerd-it", INFORMATION_EXTRACTION),
        ("squad-it", READING_COMPREHENSION),
        ("ilpost-sum", SUMMARISATION),
        ("mmlu-it", KNOWLEDGE),
        ("hellaswag-it", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Latvian datasets, listed as (dataset name, task) pairs.
LATVIAN_DATASETS = [
    Dataset(name=dataset_name, language=LATVIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("latvian-twitter-sentiment", TEXT_CLASSIFICATION),
        ("scala-lv", GRAMMAR),
        ("fullstack-ner-lv", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-lv", READING_COMPREHENSION),
        ("lsm", SUMMARISATION),
        ("mmlu-lv", KNOWLEDGE),
        ("copa-lv", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Lithuanian datasets, listed as (dataset name, task) pairs.
LITHUANIAN_DATASETS = [
    Dataset(name=dataset_name, language=LITHUANIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("atsiliepimai", TEXT_CLASSIFICATION),
        ("scala-lt", GRAMMAR),
        ("wikiann-lt", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-lt", READING_COMPREHENSION),
        ("lrytas", SUMMARISATION),
        ("lt-history", KNOWLEDGE),
        ("winogrande-lt", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Norwegian datasets, listed as (dataset name, task) pairs.
# GRAMMAR and INFORMATION_EXTRACTION each have a "-nb" and a "-nn" dataset.
NORWEGIAN_DATASETS = [
    Dataset(name=dataset_name, language=NORWEGIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("norec", TEXT_CLASSIFICATION),
        ("scala-nb", GRAMMAR),
        ("scala-nn", GRAMMAR),
        ("norne-nb", INFORMATION_EXTRACTION),
        ("norne-nn", INFORMATION_EXTRACTION),
        ("norquad", READING_COMPREHENSION),
        ("no-sammendrag", SUMMARISATION),
        ("nrk-quiz-qa", KNOWLEDGE),
        ("nor-common-sense-qa", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Polish datasets, listed as (dataset name, task) pairs.
POLISH_DATASETS = [
    Dataset(name=dataset_name, language=POLISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("polemo2", TEXT_CLASSIFICATION),
        ("scala-pl", GRAMMAR),
        ("kpwr-ner", INFORMATION_EXTRACTION),
        ("poquad", READING_COMPREHENSION),
        ("psc", SUMMARISATION),
        ("llmzszl", KNOWLEDGE),
        ("winogrande-pl", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Portuguese datasets, listed as (dataset name, task) pairs.
PORTUGUESE_DATASETS = [
    Dataset(name=dataset_name, language=PORTUGUESE, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sst2-pt", TEXT_CLASSIFICATION),
        ("scala-pt", GRAMMAR),
        ("harem", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-pt", READING_COMPREHENSION),
        ("publico", SUMMARISATION),
        ("mmlu-pt", KNOWLEDGE),
        ("goldenswag-pt", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Romanian datasets, listed as (dataset name, task) pairs.
ROMANIAN_DATASETS = [
    Dataset(name=dataset_name, language=ROMANIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("ro-sent", TEXT_CLASSIFICATION),
        ("scala-ro", GRAMMAR),
        ("ronec", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-ro", READING_COMPREHENSION),
        ("sumo-ro", SUMMARISATION),
        ("global-mmlu-ro", KNOWLEDGE),
        ("winogrande-ro", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Serbian datasets, listed as (dataset name, task) pairs.
SERBIAN_DATASETS = [
    Dataset(name=dataset_name, language=SERBIAN, task=dataset_task)
    for dataset_name, dataset_task in (
        ("mms-sr", TEXT_CLASSIFICATION),
        ("scala-sr", GRAMMAR),
        ("uner-sr", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-sr", READING_COMPREHENSION),
        ("lr-sum-sr", SUMMARISATION),
        ("mmlu-sr", KNOWLEDGE),
        ("winogrande-sr", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Slovak datasets, listed as (dataset name, task) pairs.
SLOVAK_DATASETS = [
    Dataset(name=dataset_name, language=SLOVAK, task=dataset_task)
    for dataset_name, dataset_task in (
        ("csfd-sentiment-sk", TEXT_CLASSIFICATION),
        ("scala-sk", GRAMMAR),
        ("uner-sk", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-sk", READING_COMPREHENSION),
        ("mmlu-sk", KNOWLEDGE),
        ("winogrande-sk", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Slovene datasets, listed as (dataset name, task) pairs.
SLOVENE_DATASETS = [
    Dataset(name=dataset_name, language=SLOVENE, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sentinews", TEXT_CLASSIFICATION),
        ("scala-sl", GRAMMAR),
        ("ssj500k-ner", INFORMATION_EXTRACTION),
        ("multi-wiki-qa-sl", READING_COMPREHENSION),
        ("mmlu-sl", KNOWLEDGE),
        ("winogrande-sl", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Spanish datasets, listed as (dataset name, task) pairs.
SPANISH_DATASETS = [
    Dataset(name=dataset_name, language=SPANISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("sentiment-headlines-es", TEXT_CLASSIFICATION),
        ("scala-es", GRAMMAR),
        ("conll-es", INFORMATION_EXTRACTION),
        ("mlqa-es", READING_COMPREHENSION),
        ("mlsum-es", SUMMARISATION),
        ("mmlu-es", KNOWLEDGE),
        ("hellaswag-es", COMMON_SENSE_REASONING),
    )
]
|
|
# Official Swedish datasets, listed as (dataset name, task) pairs.
SWEDISH_DATASETS = [
    Dataset(name=dataset_name, language=SWEDISH, task=dataset_task)
    for dataset_name, dataset_task in (
        ("swerec", TEXT_CLASSIFICATION),
        ("scala-sv", GRAMMAR),
        ("suc3", INFORMATION_EXTRACTION),
        ("scandiqa-sv", READING_COMPREHENSION),
        ("swedn", SUMMARISATION),
        ("mmlu-sv", KNOWLEDGE),
        ("hellaswag-sv", COMMON_SENSE_REASONING),
    )
]
|
|
# Every dataset of every language, flattened into one new list. The
# per-language lists are visited in the order given below, and each list's
# own ordering is kept.
DATASETS = [
    dataset
    for language_datasets in (
        ALBANIAN_DATASETS,
        BELARUSIAN_DATASETS,
        BOSNIAN_DATASETS,
        BULGARIAN_DATASETS,
        CATALAN_DATASETS,
        CROATIAN_DATASETS,
        CZECH_DATASETS,
        DANISH_DATASETS,
        DUTCH_DATASETS,
        ENGLISH_DATASETS,
        ESTONIAN_DATASETS,
        FAROESE_DATASETS,
        FINNISH_DATASETS,
        FRENCH_DATASETS,
        GERMAN_DATASETS,
        GREEK_DATASETS,
        HUNGARIAN_DATASETS,
        ICELANDIC_DATASETS,
        ITALIAN_DATASETS,
        LATVIAN_DATASETS,
        LITHUANIAN_DATASETS,
        NORWEGIAN_DATASETS,
        POLISH_DATASETS,
        PORTUGUESE_DATASETS,
        ROMANIAN_DATASETS,
        SERBIAN_DATASETS,
        SLOVAK_DATASETS,
        SLOVENE_DATASETS,
        SPANISH_DATASETS,
        SWEDISH_DATASETS,
    )
    for dataset in language_datasets
]
|
|