---
# Dataset presets.
#
# Every top-level key except `recommendations` is a preset name mapped to
# that preset's settings. Fields used by the presets in this file:
#   name            - dataset identifier (presumably a Hugging Face hub
#                     name; TODO confirm against the code that loads this)
#   subset          - subset/configuration name, or null when none is given
#   description     - human-readable summary, usually with approximate size
#   max_length      - integer length limit (presumably tokens per sample;
#                     TODO confirm)
#   streaming       - boolean flag (presumably stream instead of downloading
#                     the full dataset; TODO confirm)
#   recommended_for - human-readable usage hint

# Small corpora (~100MB to ~500MB), streaming disabled.
wikitext:
  name: "wikitext"
  subset: "wikitext-2-raw-v1"
  description: "Small Wikipedia dataset (~100MB)"
  max_length: 512
  streaming: false
  recommended_for: "development, testing, quick experiments"

wikitext_large:
  name: "wikitext"
  subset: "wikitext-103-raw-v1"
  description: "Larger Wikipedia dataset (~500MB)"
  max_length: 1024
  streaming: false
  recommended_for: "medium-scale training"

# Mid-size corpora (~5GB to ~40GB), streaming enabled.
openwebtext:
  name: "openwebtext"
  subset: null
  description: "Open source recreation of WebText (~40GB)"
  max_length: 1024
  streaming: true
  recommended_for: "serious training, diverse web content"

bookcorpus:
  name: "bookcorpus"
  subset: null
  description: "Collection of over 11,000 books (~5GB)"
  max_length: 1024
  streaming: true
  recommended_for: "literary and narrative training"

# Large corpora (~750GB to ~800GB), streaming enabled.
pile:
  name: "EleutherAI/pile"
  subset: null
  description: "Large-scale curated dataset (~800GB)"
  max_length: 2048
  streaming: true
  recommended_for: "large-scale production training"

c4:
  name: "c4"
  subset: "en"
  description: "Colossal Clean Crawled Corpus (~750GB)"
  max_length: 512
  streaming: true
  recommended_for: "large-scale web data training"

# Web crawl; this preset selects the English subset.
oscar:
  name: "oscar"
  subset: "unshuffled_deduplicated_en"
  description: "Multilingual web crawl (English subset)"
  max_length: 512
  streaming: true
  recommended_for: "web content, multilingual training"

# Template for a dataset read from a local file. `local_path` is a
# placeholder and must be replaced with a real location before use.
# This preset uses local_path/file_type/text_column instead of `subset`.
custom_example:
  name: "custom"
  local_path: "/path/to/your/dataset.json"
  file_type: "json"
  text_column: "text"
  max_length: 512
  streaming: false
  description: "Template for custom datasets"
  recommended_for: "domain-specific training"

# Example of combining presets. Keys under `datasets` are preset names
# defined above; values are their weights (these three sum to 1.0).
mixed_example:
  datasets:
    wikitext: 0.3
    openwebtext: 0.5
    bookcorpus: 0.2
  description: "Mix multiple datasets with weights"
  recommended_for: "balanced training across domains"

# Use case -> suggested preset. Every value is a preset key defined above.
recommendations:
  quick_test: "wikitext"
  development: "wikitext_large"
  research: "openwebtext"
  production: "pile"
  literary: "bookcorpus"
  web_content: "c4"
  multilingual: "oscar"