"""All the official datasets in the benchmark.""" from pydantic import BaseModel from languages import * from tasks import * class Dataset(BaseModel): """Class to hold dataset information.""" name: str language: Language task: Task def __hash__(self): return hash(self.name) ALBANIAN_DATASETS = [ Dataset(name="mms-sq", language=ALBANIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-sq", language=ALBANIAN, task=GRAMMAR), Dataset(name="wikiann-sq", language=ALBANIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-sq", language=ALBANIAN, task=READING_COMPREHENSION), Dataset(name="lr-sum-sq", language=ALBANIAN, task=SUMMARISATION), Dataset(name="global-mmlu-lite-sq", language=ALBANIAN, task=KNOWLEDGE), Dataset(name="winogrande-sq", language=ALBANIAN, task=COMMON_SENSE_REASONING), ] BELARUSIAN_DATASETS = [ Dataset(name="besls", language=BELARUSIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-be", language=BELARUSIAN, task=GRAMMAR), Dataset(name="wikiann-be", language=BELARUSIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-be", language=BELARUSIAN, task=READING_COMPREHENSION), Dataset(name="be-wsc", language=BELARUSIAN, task=COMMON_SENSE_REASONING), ] BOSNIAN_DATASETS = [ Dataset(name="mms-bs", language=BOSNIAN, task=TEXT_CLASSIFICATION), Dataset(name="wikiann-bs", language=BOSNIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-bs", language=BOSNIAN, task=READING_COMPREHENSION), Dataset(name="lr-sum-bs", language=BOSNIAN, task=SUMMARISATION), ] BULGARIAN_DATASETS = [ Dataset(name="bgt", language=BULGARIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-bg", language=BULGARIAN, task=GRAMMAR), Dataset(name="bner", language=BULGARIAN, task=INFORMATION_EXTRACTION), Dataset(name="bgquad", language=BULGARIAN, task=READING_COMPREHENSION), Dataset(name="cnn-dailymail-bg", language=BULGARIAN, task=SUMMARISATION), Dataset(name="mmlu-bg", language=BULGARIAN, task=KNOWLEDGE), Dataset(name="winogrande-bg", language=BULGARIAN, task=COMMON_SENSE_REASONING), ] CATALAN_DATASETS = [ Dataset(name="guia-cat", language=CATALAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-ca", language=CATALAN, task=GRAMMAR), Dataset(name="wikiann-ca", language=CATALAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-ca", language=CATALAN, task=READING_COMPREHENSION), Dataset(name="dacsa-ca", language=CATALAN, task=SUMMARISATION), Dataset(name="mmlu-ca", language=CATALAN, task=KNOWLEDGE), Dataset(name="winogrande-ca", language=CATALAN, task=COMMON_SENSE_REASONING), ] CROATIAN_DATASETS = [ Dataset(name="mms-hr", language=CROATIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-hr", language=CROATIAN, task=GRAMMAR), Dataset(name="wikiann-hr", language=CROATIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-hr", language=CROATIAN, task=READING_COMPREHENSION), Dataset(name="mmlu-hr", language=CROATIAN, task=KNOWLEDGE), Dataset(name="winogrande-hr", language=CROATIAN, task=COMMON_SENSE_REASONING), ] CZECH_DATASETS = [ Dataset(name="csfd-sentiment", language=CZECH, task=TEXT_CLASSIFICATION), Dataset(name="cs-gec", language=CZECH, task=GRAMMAR), Dataset(name="poner", language=CZECH, task=INFORMATION_EXTRACTION), Dataset(name="sqad", language=CZECH, task=READING_COMPREHENSION), Dataset(name="czech-news", language=CZECH, task=SUMMARISATION), Dataset(name="umimeto-qa", language=CZECH, task=KNOWLEDGE), Dataset(name="hellaswag-cs", language=CZECH, task=COMMON_SENSE_REASONING), ] DANISH_DATASETS = [ Dataset(name="angry-tweets", language=DANISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-da", language=DANISH, task=GRAMMAR), Dataset(name="dansk", language=DANISH, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-da", language=DANISH, task=READING_COMPREHENSION), Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION), Dataset(name="danske-talemaader", language=DANISH, task=KNOWLEDGE), Dataset(name="danish-citizen-tests", language=DANISH, task=KNOWLEDGE), Dataset(name="hellaswag-da", language=DANISH, task=COMMON_SENSE_REASONING), ] DUTCH_DATASETS = [ Dataset(name="dutch-social", language=DUTCH, task=TEXT_CLASSIFICATION), Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR), Dataset(name="conll-nl", language=DUTCH, task=INFORMATION_EXTRACTION), Dataset(name="squad-nl", language=DUTCH, task=READING_COMPREHENSION), Dataset(name="wiki-lingua-nl", language=DUTCH, task=SUMMARISATION), Dataset(name="mmlu-nl", language=DUTCH, task=KNOWLEDGE), Dataset(name="hellaswag-nl", language=DUTCH, task=COMMON_SENSE_REASONING), ] ENGLISH_DATASETS = [ Dataset(name="sst5", language=ENGLISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR), Dataset(name="conll-en", language=ENGLISH, task=INFORMATION_EXTRACTION), Dataset(name="squad", language=ENGLISH, task=READING_COMPREHENSION), Dataset(name="cnn-dailymail", language=ENGLISH, task=SUMMARISATION), Dataset(name="life-in-the-uk", language=ENGLISH, task=KNOWLEDGE), Dataset(name="hellaswag", language=ENGLISH, task=COMMON_SENSE_REASONING), ] ESTONIAN_DATASETS = [ Dataset(name="estonian-valence", language=ESTONIAN, task=TEXT_CLASSIFICATION), Dataset(name="grammar-et", language=ESTONIAN, task=GRAMMAR), Dataset(name="estner", language=ESTONIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-et", language=ESTONIAN, task=READING_COMPREHENSION), Dataset(name="err-news", language=ESTONIAN, task=SUMMARISATION), Dataset(name="trivia-et", language=ESTONIAN, task=KNOWLEDGE), Dataset(name="winogrande-et", language=ESTONIAN, task=COMMON_SENSE_REASONING), ] FAROESE_DATASETS = [ Dataset(name="fosent", language=FAROESE, task=TEXT_CLASSIFICATION), Dataset(name="scala-fo", language=FAROESE, task=GRAMMAR), Dataset(name="fone", language=FAROESE, task=INFORMATION_EXTRACTION), Dataset(name="foqa", language=FAROESE, task=READING_COMPREHENSION), ] FINNISH_DATASETS = [ Dataset(name="scandisent-fi", language=FINNISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-fi", language=FINNISH, task=GRAMMAR), Dataset(name="turku-ner-fi", language=FINNISH, task=INFORMATION_EXTRACTION), Dataset(name="tydiqa-fi", language=FINNISH, task=READING_COMPREHENSION), Dataset(name="xlsum-fi", language=FINNISH, task=SUMMARISATION), Dataset(name="hellaswag-fi", language=FINNISH, task=COMMON_SENSE_REASONING), ] FRENCH_DATASETS = [ Dataset(name="allocine", language=FRENCH, task=TEXT_CLASSIFICATION), Dataset(name="scala-fr", language=FRENCH, task=GRAMMAR), Dataset(name="eltec", language=FRENCH, task=INFORMATION_EXTRACTION), Dataset(name="fquad", language=FRENCH, task=READING_COMPREHENSION), Dataset(name="orange-sum", language=FRENCH, task=SUMMARISATION), Dataset(name="mmlu-fr", language=FRENCH, task=KNOWLEDGE), Dataset(name="hellaswag-fr", language=FRENCH, task=COMMON_SENSE_REASONING), ] GERMAN_DATASETS = [ Dataset(name="sb10k", language=GERMAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-de", language=GERMAN, task=GRAMMAR), Dataset(name="germeval", language=GERMAN, task=INFORMATION_EXTRACTION), Dataset(name="germanquad", language=GERMAN, task=READING_COMPREHENSION), Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION), Dataset(name="mmlu-de", language=GERMAN, task=KNOWLEDGE), Dataset(name="hellaswag-de", language=GERMAN, task=COMMON_SENSE_REASONING), ] GREEK_DATASETS = [ Dataset(name="greek-sa", language=GREEK, task=TEXT_CLASSIFICATION), Dataset(name="scala-el", language=GREEK, task=GRAMMAR), Dataset(name="elner", language=GREEK, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-el", language=GREEK, task=READING_COMPREHENSION), Dataset(name="greek-wikipedia", language=GREEK, task=SUMMARISATION), Dataset(name="global-mmlu-el", language=GREEK, task=KNOWLEDGE), Dataset(name="winogrande-el", language=GREEK, task=COMMON_SENSE_REASONING), ] HUNGARIAN_DATASETS = [ Dataset(name="husst", language=HUNGARIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-hu", language=HUNGARIAN, task=GRAMMAR), Dataset(name="szeged-ner", language=HUNGARIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-hu", language=HUNGARIAN, task=READING_COMPREHENSION), Dataset(name="hunsum", language=HUNGARIAN, task=SUMMARISATION), Dataset(name="mmlu-hu", language=HUNGARIAN, task=KNOWLEDGE), Dataset(name="winogrande-hu", language=HUNGARIAN, task=COMMON_SENSE_REASONING), ] ICELANDIC_DATASETS = [ Dataset( name="hotter-and-colder-sentiment", language=ICELANDIC, task=TEXT_CLASSIFICATION ), Dataset(name="scala-is", language=ICELANDIC, task=GRAMMAR), Dataset(name="mim-gold-ner", language=ICELANDIC, task=INFORMATION_EXTRACTION), Dataset(name="nqii", language=ICELANDIC, task=READING_COMPREHENSION), Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION), Dataset(name="icelandic-knowledge", language=ICELANDIC, task=KNOWLEDGE), Dataset(name="winogrande-is", language=ICELANDIC, task=COMMON_SENSE_REASONING), ] ITALIAN_DATASETS = [ Dataset(name="sentipolc16", language=ITALIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-it", language=ITALIAN, task=GRAMMAR), Dataset(name="multinerd-it", language=ITALIAN, task=INFORMATION_EXTRACTION), Dataset(name="squad-it", language=ITALIAN, task=READING_COMPREHENSION), Dataset(name="ilpost-sum", language=ITALIAN, task=SUMMARISATION), Dataset(name="mmlu-it", language=ITALIAN, task=KNOWLEDGE), Dataset(name="hellaswag-it", language=ITALIAN, task=COMMON_SENSE_REASONING), ] LATVIAN_DATASETS = [ Dataset( name="latvian-twitter-sentiment", language=LATVIAN, task=TEXT_CLASSIFICATION ), Dataset(name="scala-lv", language=LATVIAN, task=GRAMMAR), Dataset(name="fullstack-ner-lv", language=LATVIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-lv", language=LATVIAN, task=READING_COMPREHENSION), Dataset(name="lsm", language=LATVIAN, task=SUMMARISATION), Dataset(name="mmlu-lv", language=LATVIAN, task=KNOWLEDGE), Dataset(name="copa-lv", language=LATVIAN, task=COMMON_SENSE_REASONING), ] LITHUANIAN_DATASETS = [ Dataset(name="atsiliepimai", language=LITHUANIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-lt", language=LITHUANIAN, task=GRAMMAR), Dataset(name="wikiann-lt", language=LITHUANIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-lt", language=LITHUANIAN, task=READING_COMPREHENSION), Dataset(name="lrytas", language=LITHUANIAN, task=SUMMARISATION), Dataset(name="lt-history", language=LITHUANIAN, task=KNOWLEDGE), Dataset(name="winogrande-lt", language=LITHUANIAN, task=COMMON_SENSE_REASONING), ] NORWEGIAN_DATASETS = [ Dataset(name="norec", language=NORWEGIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-nb", language=NORWEGIAN, task=GRAMMAR), Dataset(name="scala-nn", language=NORWEGIAN, task=GRAMMAR), Dataset(name="norne-nb", language=NORWEGIAN, task=INFORMATION_EXTRACTION), Dataset(name="norne-nn", language=NORWEGIAN, task=INFORMATION_EXTRACTION), Dataset(name="norquad", language=NORWEGIAN, task=READING_COMPREHENSION), Dataset(name="no-sammendrag", language=NORWEGIAN, task=SUMMARISATION), Dataset(name="nrk-quiz-qa", language=NORWEGIAN, task=KNOWLEDGE), Dataset( name="nor-common-sense-qa", language=NORWEGIAN, task=COMMON_SENSE_REASONING ), ] POLISH_DATASETS = [ Dataset(name="polemo2", language=POLISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-pl", language=POLISH, task=GRAMMAR), Dataset(name="kpwr-ner", language=POLISH, task=INFORMATION_EXTRACTION), Dataset(name="poquad", language=POLISH, task=READING_COMPREHENSION), Dataset(name="psc", language=POLISH, task=SUMMARISATION), Dataset(name="llmzszl", language=POLISH, task=KNOWLEDGE), Dataset(name="winogrande-pl", language=POLISH, task=COMMON_SENSE_REASONING), ] PORTUGUESE_DATASETS = [ Dataset(name="sst2-pt", language=PORTUGUESE, task=TEXT_CLASSIFICATION), Dataset(name="scala-pt", language=PORTUGUESE, task=GRAMMAR), Dataset(name="harem", language=PORTUGUESE, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-pt", language=PORTUGUESE, task=READING_COMPREHENSION), Dataset(name="publico", language=PORTUGUESE, task=SUMMARISATION), Dataset(name="mmlu-pt", language=PORTUGUESE, task=KNOWLEDGE), Dataset(name="goldenswag-pt", language=PORTUGUESE, task=COMMON_SENSE_REASONING), ] ROMANIAN_DATASETS = [ Dataset(name="ro-sent", language=ROMANIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-ro", language=ROMANIAN, task=GRAMMAR), Dataset(name="ronec", language=ROMANIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-ro", language=ROMANIAN, task=READING_COMPREHENSION), Dataset(name="sumo-ro", language=ROMANIAN, task=SUMMARISATION), Dataset(name="global-mmlu-ro", language=ROMANIAN, task=KNOWLEDGE), Dataset(name="winogrande-ro", language=ROMANIAN, task=COMMON_SENSE_REASONING), ] SERBIAN_DATASETS = [ Dataset(name="mms-sr", language=SERBIAN, task=TEXT_CLASSIFICATION), Dataset(name="scala-sr", language=SERBIAN, task=GRAMMAR), Dataset(name="uner-sr", language=SERBIAN, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-sr", language=SERBIAN, task=READING_COMPREHENSION), Dataset(name="lr-sum-sr", language=SERBIAN, task=SUMMARISATION), Dataset(name="mmlu-sr", language=SERBIAN, task=KNOWLEDGE), Dataset(name="winogrande-sr", language=SERBIAN, task=COMMON_SENSE_REASONING), ] SLOVAK_DATASETS = [ Dataset(name="csfd-sentiment-sk", language=SLOVAK, task=TEXT_CLASSIFICATION), Dataset(name="scala-sk", language=SLOVAK, task=GRAMMAR), Dataset(name="uner-sk", language=SLOVAK, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-sk", language=SLOVAK, task=READING_COMPREHENSION), Dataset(name="mmlu-sk", language=SLOVAK, task=KNOWLEDGE), Dataset(name="winogrande-sk", language=SLOVAK, task=COMMON_SENSE_REASONING), ] SLOVENE_DATASETS = [ Dataset(name="sentinews", language=SLOVENE, task=TEXT_CLASSIFICATION), Dataset(name="scala-sl", language=SLOVENE, task=GRAMMAR), Dataset(name="ssj500k-ner", language=SLOVENE, task=INFORMATION_EXTRACTION), Dataset(name="multi-wiki-qa-sl", language=SLOVENE, task=READING_COMPREHENSION), Dataset(name="mmlu-sl", language=SLOVENE, task=KNOWLEDGE), Dataset(name="winogrande-sl", language=SLOVENE, task=COMMON_SENSE_REASONING), ] SPANISH_DATASETS = [ Dataset(name="sentiment-headlines-es", language=SPANISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-es", language=SPANISH, task=GRAMMAR), Dataset(name="conll-es", language=SPANISH, task=INFORMATION_EXTRACTION), Dataset(name="mlqa-es", language=SPANISH, task=READING_COMPREHENSION), Dataset(name="mlsum-es", language=SPANISH, task=SUMMARISATION), Dataset(name="mmlu-es", language=SPANISH, task=KNOWLEDGE), Dataset(name="hellaswag-es", language=SPANISH, task=COMMON_SENSE_REASONING), ] SWEDISH_DATASETS = [ Dataset(name="swerec", language=SWEDISH, task=TEXT_CLASSIFICATION), Dataset(name="scala-sv", language=SWEDISH, task=GRAMMAR), Dataset(name="suc3", language=SWEDISH, task=INFORMATION_EXTRACTION), Dataset(name="scandiqa-sv", language=SWEDISH, task=READING_COMPREHENSION), Dataset(name="swedn", language=SWEDISH, task=SUMMARISATION), Dataset(name="mmlu-sv", language=SWEDISH, task=KNOWLEDGE), Dataset(name="hellaswag-sv", language=SWEDISH, task=COMMON_SENSE_REASONING), ] DATASETS = ( ALBANIAN_DATASETS + BELARUSIAN_DATASETS + BOSNIAN_DATASETS + BULGARIAN_DATASETS + CATALAN_DATASETS + CROATIAN_DATASETS + CZECH_DATASETS + DANISH_DATASETS + DUTCH_DATASETS + ENGLISH_DATASETS + ESTONIAN_DATASETS + FAROESE_DATASETS + FINNISH_DATASETS + FRENCH_DATASETS + GERMAN_DATASETS + GREEK_DATASETS + HUNGARIAN_DATASETS + ICELANDIC_DATASETS + ITALIAN_DATASETS + LATVIAN_DATASETS + LITHUANIAN_DATASETS + NORWEGIAN_DATASETS + POLISH_DATASETS + PORTUGUESE_DATASETS + ROMANIAN_DATASETS + SERBIAN_DATASETS + SLOVAK_DATASETS + SLOVENE_DATASETS + SPANISH_DATASETS + SWEDISH_DATASETS )