# Uploaded by Vedisasi via huggingface_hub (commit 54c5666, verified)
# Dataset configurations for ULTRATHINK training
# Users can easily switch between datasets by modifying these configs
# Small datasets (good for testing and development)
---
# wikitext-2: smallest config; fast to download, ideal for smoke tests
wikitext:
  name: "wikitext"
  subset: "wikitext-2-raw-v1"
  description: "Small Wikipedia dataset (~100MB)"
  max_length: 512
  streaming: false
  recommended_for: "development, testing, quick experiments"
# wikitext-103: same source as wikitext, larger split and longer contexts
wikitext_large:
  name: "wikitext"
  subset: "wikitext-103-raw-v1"
  description: "Larger Wikipedia dataset (~500MB)"
  max_length: 1024
  streaming: false
  recommended_for: "medium-scale training"
# Medium datasets (good for serious training)
# streaming: true — dataset is too large (~40GB) to fully download
openwebtext:
  name: "openwebtext"
  subset: null
  description: "Open source recreation of WebText (~40GB)"
  max_length: 1024
  streaming: true
  recommended_for: "serious training, diverse web content"
# long-form book text; streamed to avoid a ~5GB local download
bookcorpus:
  name: "bookcorpus"
  subset: null
  description: "Collection of over 11,000 books (~5GB)"
  max_length: 1024
  streaming: true
  recommended_for: "literary and narrative training"
# Large datasets (for production training)
# The Pile: largest config here; streaming is mandatory in practice (~800GB)
pile:
  name: "EleutherAI/pile"
  subset: null
  description: "Large-scale curated dataset (~800GB)"
  max_length: 2048
  streaming: true
  recommended_for: "large-scale production training"
# C4 English subset; streamed (~750GB total)
c4:
  name: "c4"
  subset: "en"
  description: "Colossal Clean Crawled Corpus (~750GB)"
  max_length: 512
  streaming: true
  recommended_for: "large-scale web data training"
# Specialized datasets
# OSCAR English subset; other language subsets can be selected via `subset`
oscar:
  name: "oscar"
  subset: "unshuffled_deduplicated_en"
  description: "Multilingual web crawl (English subset)"
  max_length: 512
  streaming: true
  recommended_for: "web content, multilingual training"
# Custom dataset template
# Copy this entry and edit `local_path` / `text_column` to point at your data.
custom_example:
  name: "custom"
  local_path: "/path/to/your/dataset.json"
  file_type: "json"  # json, txt, csv, parquet
  text_column: "text"  # column/field holding the raw text
  max_length: 512
  streaming: false
  description: "Template for custom datasets"
  recommended_for: "domain-specific training"
# Mixed dataset example
# Keys under `datasets` reference the config names above; values are
# sampling weights (presumably expected to sum to 1.0 — confirm in loader).
mixed_example:
  datasets:
    wikitext: 0.3
    openwebtext: 0.5
    bookcorpus: 0.2
  description: "Mix multiple datasets with weights"
  recommended_for: "balanced training across domains"
# Dataset recommendations by use case
# Lookup table: use-case label -> config name defined above
recommendations:
  quick_test: "wikitext"
  development: "wikitext_large"
  research: "openwebtext"
  production: "pile"
  literary: "bookcorpus"
  web_content: "c4"
  multilingual: "oscar"