{
  "sources": {
    "ag_news": {
      "provider": "huggingface",
      "name": "default",
      "split": "train",
      "streaming": false,
      "remove_columns": "label",
      "concatenate_successive_entries": 0
    }
  },
  "name": "sanity-check-2",
  "normalizer": {
    "force_lowercase": true,
    "strip_accents": true,
    "force_english_keyboard": true,
    "whitespace_escape": false
  },
  "tokenizer": "BPE",
  "vocab_size": 32768,
  "seq_length": 128,
  "include_cls_token_in_corpus": false,
  "include_sep_token_in_corpus": false,
  "use_type_ids": false,
  "max_entries_in_raw_dataset": 10000000000.0,
  "max_seq_in_tokenized_dataset": 10000000000.0,
  "named_entity_simplification": false,
  "remove_whitespaces": false,
  "remove_trash": false,
  "trash_cutoff": 0.3,
  "deduplicate_entries": false,
  "deduplication_threshold": 100,
  "ordering": "randomized"
}