# Uploaded by Vedisasi via huggingface_hub (commit 54c5666, verified)
# Dataset configurations for ULTRATHINK training
# Users can easily switch between datasets by modifying these configs
# Small datasets (good for testing and development)
---
# wikitext-2: smallest config; fast to download, ideal for smoke tests
wikitext:
  name: "wikitext"
  subset: "wikitext-2-raw-v1"
  description: "Small Wikipedia dataset (~100MB)"
  max_length: 512
  streaming: false
  recommended_for: "development, testing, quick experiments"
# wikitext-103: same source as wikitext, larger split and longer contexts
wikitext_large:
  name: "wikitext"
  subset: "wikitext-103-raw-v1"
  description: "Larger Wikipedia dataset (~500MB)"
  max_length: 1024
  streaming: false
  recommended_for: "medium-scale training"
# Medium datasets (good for serious training)
# streaming: true — dataset is too large (~40GB) to fully download
openwebtext:
  name: "openwebtext"
  subset: null
  description: "Open source recreation of WebText (~40GB)"
  max_length: 1024
  streaming: true
  recommended_for: "serious training, diverse web content"
# long-form book text; streamed to avoid a ~5GB local download
bookcorpus:
  name: "bookcorpus"
  subset: null
  description: "Collection of over 11,000 books (~5GB)"
  max_length: 1024
  streaming: true
  recommended_for: "literary and narrative training"
# Large datasets (for production training)
# The Pile: largest config here; streaming is mandatory in practice (~800GB)
pile:
  name: "EleutherAI/pile"
  subset: null
  description: "Large-scale curated dataset (~800GB)"
  max_length: 2048
  streaming: true
  recommended_for: "large-scale production training"
# C4 English subset; streamed (~750GB total)
c4:
  name: "c4"
  subset: "en"
  description: "Colossal Clean Crawled Corpus (~750GB)"
  max_length: 512
  streaming: true
  recommended_for: "large-scale web data training"
# Specialized datasets
# OSCAR English subset; other language subsets can be selected via `subset`
oscar:
  name: "oscar"
  subset: "unshuffled_deduplicated_en"
  description: "Multilingual web crawl (English subset)"
  max_length: 512
  streaming: true
  recommended_for: "web content, multilingual training"
# Custom dataset template
# Copy this entry and edit `local_path` / `text_column` to point at your data.
custom_example:
  name: "custom"
  local_path: "/path/to/your/dataset.json"
  file_type: "json"  # json, txt, csv, parquet
  text_column: "text"  # column/field holding the raw text
  max_length: 512
  streaming: false
  description: "Template for custom datasets"
  recommended_for: "domain-specific training"
# Mixed dataset example
# Keys under `datasets` reference the config names above; values are
# sampling weights (presumably expected to sum to 1.0 — confirm in loader).
mixed_example:
  datasets:
    wikitext: 0.3
    openwebtext: 0.5
    bookcorpus: 0.2
  description: "Mix multiple datasets with weights"
  recommended_for: "balanced training across domains"
# Dataset recommendations by use case
# Lookup table: use-case label -> config name defined above
recommendations:
  quick_test: "wikitext"
  development: "wikitext_large"
  research: "openwebtext"
  production: "pile"
  literary: "bookcorpus"
  web_content: "c4"
  multilingual: "oscar"