Supernova25million / configs /data_sources.example.yaml

Upload folder using huggingface_hub

8174855 verified 5 months ago

1.19 kB

	# Example broad data sources for Supernova training
	# Enable or adjust entries to suit your needs. Many of these datasets are huge: make sure you
	# have enough bandwidth and disk space, and review each dataset's license before use.

	sources:
	- name: c4_en
	hf_path: c4
	hf_name: en
	split: train
	text_field: text
	weight: 5
	streaming: true

	- name: wikipedia_en
	hf_path: wikipedia
	hf_name: 20220301.en
	split: train
	text_field: text
	weight: 3
	streaming: true

	- name: openwebtext
	hf_path: openwebtext
	hf_name: null
	split: train
	text_field: text
	weight: 3
	streaming: true

	- name: bookcorpusopen
	hf_path: bookcorpusopen
	hf_name: null
	split: train
	text_field: text
	weight: 2
	streaming: true

	- name: the_pile
	hf_path: the_pile
	hf_name: all
	split: train
	text_field: text
	weight: 6
	streaming: true

# You can add more sources here (news, legal, biomedical, code, arXiv, Common Crawl variants, etc.).
# Example template (uncomment and keep the indentation so the entry nests under `sources:`):
#   - name: your_source_name
#     hf_path: your_org/your_dataset
#     hf_name: optional_subset
#     split: train
#     text_field: text
#     weight: 1
#     streaming: true