| { |
| "datasets": { |
| "causal-lm": [ |
| { |
| "id": "wikitext", |
| "name": "WikiText", |
| "description": "Wikipedia text dataset for language modeling", |
| "configs": ["wikitext-2-raw-v1", "wikitext-103-raw-v1"], |
| "splits": ["train", "validation", "test"], |
| "text_column": "text", |
| "default_config": "wikitext-2-raw-v1", |
| "size_categories": ["100K-1M", "1M-10M"], |
| "recommended": true |
| }, |
| { |
| "id": "openwebtext", |
| "name": "OpenWebText", |
| "description": "Open source recreation of WebText dataset", |
| "configs": [], |
| "splits": ["train"], |
| "text_column": "text", |
| "size_categories": ["1M-10M"], |
| "recommended": true |
| }, |
| { |
| "id": "the_pile", |
| "name": "The Pile", |
| "description": "Large-scale text corpus for language modeling", |
| "configs": ["all", "enron_emails", "europarl", "hacker_news", "pubmed", "ubuntu_irc"], |
| "splits": ["train", "validation", "test"], |
| "text_column": "text", |
| "size_categories": [">10M"], |
| "recommended": false |
| }, |
| { |
| "id": "c4", |
| "name": "C4 (Colossal Clean Crawled Corpus)", |
| "description": "Huge cleaned web text dataset", |
| "configs": ["en", "realnewslike", "en.noblocklist", "en.noclean"], |
| "splits": ["train", "validation"], |
| "text_column": "text", |
| "size_categories": [">10M"], |
| "recommended": false |
| }, |
| { |
| "id": "tiny_shakespeare", |
| "name": "Tiny Shakespeare", |
| "description": "Small Shakespeare text for quick testing", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "text", |
| "size_categories": ["<10K"], |
| "recommended": true |
| } |
| ], |
| "seq2seq": [ |
| { |
| "id": "cnn_dailymail", |
| "name": "CNN/DailyMail", |
| "description": "News article summarization dataset", |
| "configs": ["1.0.0", "2.0.0", "3.0.0"], |
| "splits": ["train", "validation", "test"], |
| "text_column": "article", |
| "label_column": "highlights", |
| "default_config": "3.0.0", |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| }, |
| { |
| "id": "xsum", |
| "name": "XSum", |
| "description": "BBC article summarization", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "document", |
| "label_column": "summary", |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| }, |
| { |
| "id": "samsum", |
| "name": "SAMSum", |
| "description": "Dialogue summarization dataset", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "dialogue", |
| "label_column": "summary", |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| }, |
| { |
| "id": "wmt16", |
| "name": "WMT16 Translation", |
| "description": "Machine translation dataset", |
| "configs": ["de-en", "en-de", "ro-en", "en-ro", "cs-en", "en-cs"], |
| "splits": ["train", "validation", "test"], |
| "size_categories": ["1M-10M"], |
| "recommended": false |
| }, |
| { |
| "id": "billsum", |
| "name": "BillSum", |
| "description": "US Congressional bill summarization", |
| "configs": [], |
| "splits": ["train", "test"], |
| "text_column": "text", |
| "label_column": "summary", |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| } |
| ], |
| "token-classification": [ |
| { |
| "id": "conll2003", |
| "name": "CoNLL-2003", |
| "description": "Named entity recognition dataset", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "tokens", |
| "label_column": "ner_tags", |
| "labels": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| }, |
| { |
| "id": "wnut_17", |
| "name": "WNUT 17", |
| "description": "Emerging entity recognition from social media", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "tokens", |
| "label_column": "ner_tags", |
| "labels": ["O", "B-corporation", "I-corporation", "B-creative-work", "I-creative-work", "B-group", "I-group", "B-location", "I-location", "B-person", "I-person", "B-product", "I-product"], |
| "size_categories": ["<10K"], |
| "recommended": true |
| }, |
| { |
| "id": "ontonotes5", |
| "name": "OntoNotes 5.0", |
| "description": "Multi-genre NER and coreference", |
| "configs": ["english_v4", "english_v12", "chinese_v4", "arabic_v4"], |
| "splits": ["train", "validation", "test"], |
| "text_column": "document", |
| "label_column": "named_entities", |
| "size_categories": ["100K-1M"], |
| "recommended": false |
| } |
| ], |
| "text-classification": [ |
| { |
| "id": "imdb", |
| "name": "IMDB", |
| "description": "Movie review sentiment classification", |
| "configs": [], |
| "splits": ["train", "test", "unsupervised"], |
| "text_column": "text", |
| "label_column": "label", |
| "labels": ["negative", "positive"], |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| }, |
| { |
| "id": "yelp_polarity", |
| "name": "Yelp Polarity", |
| "description": "Yelp review sentiment classification", |
| "configs": [], |
| "splits": ["train", "test"], |
| "text_column": "text", |
| "label_column": "label", |
| "labels": ["negative", "positive"], |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| }, |
| { |
| "id": "ag_news", |
| "name": "AG News", |
| "description": "News article categorization", |
| "configs": [], |
| "splits": ["train", "test"], |
| "text_column": "text", |
| "label_column": "label", |
| "labels": ["World", "Sports", "Business", "Sci/Tech"], |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| }, |
| { |
| "id": "glue", |
| "name": "GLUE", |
| "description": "General Language Understanding Evaluation", |
| "configs": ["cola", "mnli", "mnli_matched", "mnli_mismatched", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"], |
| "splits": ["train", "validation", "test"], |
| "size_categories": ["varies"], |
| "recommended": true |
| }, |
| { |
| "id": "emotion", |
| "name": "Emotion", |
| "description": "Twitter emotion classification", |
| "configs": [], |
| "splits": ["train", "validation", "test"], |
| "text_column": "text", |
| "label_column": "label", |
| "labels": ["sadness", "joy", "love", "anger", "fear", "surprise"], |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| } |
| ], |
| "question-answering": [ |
| { |
| "id": "squad", |
| "name": "SQuAD", |
| "description": "Stanford Question Answering Dataset", |
| "configs": ["plain_text"], |
| "splits": ["train", "validation"], |
| "text_column": "context", |
| "question_column": "question", |
| "answer_column": "answers", |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| }, |
| { |
| "id": "squad_v2", |
| "name": "SQuAD 2.0", |
| "description": "SQuAD with unanswerable questions", |
| "configs": ["squad_v2"], |
| "splits": ["train", "validation"], |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| }, |
| { |
| "id": "natural_questions", |
| "name": "Natural Questions", |
| "description": "Real user questions with Wikipedia answers", |
| "configs": ["default"], |
| "splits": ["train", "validation"], |
| "size_categories": ["100K-1M"], |
| "recommended": false |
| }, |
| { |
| "id": "coqa", |
| "name": "CoQA", |
| "description": "Conversational Question Answering", |
| "configs": [], |
| "splits": ["train", "validation"], |
| "size_categories": ["100K-1M"], |
| "recommended": true |
| } |
| ], |
| "translation": [ |
| { |
| "id": "wmt14", |
| "name": "WMT14 Translation", |
| "description": "Large-scale machine translation", |
| "configs": ["de-en", "en-de", "fr-en", "en-fr"], |
| "splits": ["train", "validation", "test"], |
| "size_categories": [">10M"], |
| "recommended": false |
| }, |
| { |
| "id": "opus100", |
| "name": "OPUS-100", |
| "description": "Multi-lingual parallel corpus", |
| "configs": ["de-en", "en-fr", "en-es", "en-ru", "en-zh"], |
| "splits": ["train", "validation", "test"], |
| "size_categories": ["1M-10M"], |
| "recommended": true |
| } |
| ], |
| "image-classification": [ |
| { |
| "id": "cifar10", |
| "name": "CIFAR-10", |
| "description": "10-class image classification", |
| "configs": [], |
| "splits": ["train", "test"], |
| "image_column": "img", |
| "label_column": "label", |
| "labels": ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"], |
| "size_categories": ["10K-100K"], |
| "recommended": true |
| }, |
| { |
| "id": "imagenet-1k", |
| "name": "ImageNet-1k", |
| "description": "Large-scale image classification", |
| "configs": [], |
| "splits": ["train", "validation"], |
| "image_column": "image", |
| "label_column": "label", |
| "size_categories": ["1M-10M"], |
| "recommended": false |
| } |
| ] |
| }, |
| "models": { |
| "causal-lm": { |
| "small": [ |
| {"id": "gpt2", "params": "124M", "recommended": true}, |
| {"id": "distilgpt2", "params": "82M", "recommended": true}, |
| {"id": "EleutherAI/gpt-neo-125M", "params": "125M", "recommended": true}, |
| {"id": "bigscience/bloom-560m", "params": "560M", "recommended": true} |
| ], |
| "medium": [ |
| {"id": "gpt2-medium", "params": "355M", "recommended": true}, |
| {"id": "gpt2-large", "params": "774M", "recommended": true}, |
| {"id": "EleutherAI/gpt-neo-1.3B", "params": "1.3B", "recommended": true}, |
| {"id": "EleutherAI/gpt-j-6b", "params": "6B", "recommended": false}, |
| {"id": "bigscience/bloom-1b7", "params": "1.7B", "recommended": true}, |
| {"id": "meta-llama/Llama-2-7b-hf", "params": "7B", "recommended": true}, |
| {"id": "mistralai/Mistral-7B-v0.1", "params": "7B", "recommended": true} |
| ], |
| "large": [ |
| {"id": "EleutherAI/gpt-neox-20b", "params": "20B", "recommended": false}, |
| {"id": "bigscience/bloom", "params": "176B", "recommended": false}, |
| {"id": "meta-llama/Llama-2-13b-hf", "params": "13B", "recommended": false}, |
| {"id": "meta-llama/Llama-2-70b-hf", "params": "70B", "recommended": false} |
| ] |
| }, |
| "seq2seq": { |
| "small": [ |
| {"id": "google-t5/t5-small", "params": "60M", "recommended": true}, |
| {"id": "facebook/bart-base", "params": "140M", "recommended": true}, |
| {"id": "google/flan-t5-small", "params": "80M", "recommended": true} |
| ], |
| "medium": [ |
| {"id": "google-t5/t5-base", "params": "220M", "recommended": true}, |
| {"id": "facebook/bart-large", "params": "400M", "recommended": true}, |
| {"id": "google/flan-t5-base", "params": "250M", "recommended": true}, |
| {"id": "google/flan-t5-large", "params": "780M", "recommended": true}, |
| {"id": "google-t5/t5-large", "params": "770M", "recommended": true}, |
| {"id": "facebook/bart-large-cnn", "params": "400M", "recommended": true} |
| ], |
| "large": [ |
| {"id": "google-t5/t5-3b", "params": "3B", "recommended": false}, |
| {"id": "google/flan-t5-xl", "params": "3B", "recommended": false} |
| ] |
| }, |
| "token-classification": { |
| "small": [ |
| {"id": "dslim/bert-base-NER", "params": "110M", "recommended": true}, |
| {"id": "dslim/distilbert-NER", "params": "66M", "recommended": true}, |
| {"id": "elastic/distilbert-base-uncased-finetuned-conll03-english", "params": "66M", "recommended": true} |
| ], |
| "medium": [ |
| {"id": "dslim/bert-base-NER", "params": "110M", "recommended": true}, |
| {"id": "dbmdz/bert-large-cased-finetuned-conll03-english", "params": "340M", "recommended": true} |
| ] |
| }, |
| "text-classification": { |
| "small": [ |
| {"id": "distilbert/distilbert-base-uncased", "params": "66M", "recommended": true}, |
| {"id": "google-bert/bert-base-uncased", "params": "110M", "recommended": true}, |
| {"id": "roberta-base", "params": "125M", "recommended": true} |
| ], |
| "medium": [ |
| {"id": "google-bert/bert-large-uncased", "params": "340M", "recommended": true}, |
| {"id": "roberta-large", "params": "355M", "recommended": true}, |
| {"id": "microsoft/deberta-v3-base", "params": "184M", "recommended": true} |
| ] |
| }, |
| "question-answering": { |
| "small": [ |
| {"id": "distilbert/distilbert-base-uncased-distilled-squad", "params": "66M", "recommended": true}, |
| {"id": "deepset/minilm-uncased-squad2", "params": "33M", "recommended": true} |
| ], |
| "medium": [ |
| {"id": "deepset/roberta-base-squad2", "params": "125M", "recommended": true}, |
| {"id": "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", "params": "340M", "recommended": true} |
| ] |
| } |
| }, |
| "task_metadata": { |
| "causal-lm": { |
| "display_name": "Causal Language Modeling", |
| "description": "Generate text, autocomplete, story writing", |
| "icon": "text_fields", |
| "metrics": ["perplexity", "accuracy", "f1"], |
| "requires_decoder_only": true |
| }, |
| "seq2seq": { |
| "display_name": "Sequence-to-Sequence", |
| "description": "Summarization, translation, paraphrase", |
| "icon": "compare_arrows", |
| "metrics": ["rouge1", "rouge2", "rougeL", "bleu", "meteor"], |
| "requires_encoder_decoder": true |
| }, |
| "token-classification": { |
| "display_name": "Token Classification", |
| "description": "Named entity recognition, POS tagging", |
| "icon": "label", |
| "metrics": ["precision", "recall", "f1", "accuracy"], |
| "requires_encoder": true |
| }, |
| "text-classification": { |
| "display_name": "Text Classification", |
| "description": "Sentiment analysis, topic classification", |
| "icon": "category", |
| "metrics": ["accuracy", "f1", "precision", "recall"], |
| "requires_encoder": true |
| }, |
| "question-answering": { |
| "display_name": "Question Answering", |
| "description": "Extractive and generative QA", |
| "icon": "help", |
| "metrics": ["exact_match", "f1"], |
| "requires_encoder": true |
| }, |
| "translation": { |
| "display_name": "Translation", |
| "description": "Machine translation between languages", |
| "icon": "translate", |
| "metrics": ["bleu", "meteor", "chrf"], |
| "requires_encoder_decoder": true |
| } |
| } |
| } |