# Supernova25million / configs / data_sources.example.yaml
# algorythmtechnologies's picture
# Upload folder using huggingface_hub
# 8174855 verified
# Example broad data sources for Supernova training
# Enable or adjust entries to suit your needs. Many of these datasets are huge:
# make sure you have enough bandwidth and disk space, and review each dataset's license before use.
# List of text corpora to draw training data from; one mapping per dataset.
#
# Entry schema (NOTE(review): meanings are inferred from the key names and the
# example template in this file; confirm against the loader that reads it):
#   name        label identifying this source
#   hf_path     dataset id on the Hugging Face Hub
#   hf_name     optional dataset subset/config; null when the dataset has none
#   split       dataset split to read
#   text_field  record field that holds the raw text
#   weight      presumably a relative sampling weight (values here sum to 19)
#   streaming   presumably whether to stream instead of downloading up front
#
# NOTE(review): the ids below are bare, namespace-less Hub names (c4, wikipedia,
# openwebtext, bookcorpusopen, the_pile). Verify that each one still resolves
# with your installed `datasets` version before starting a run.
sources:
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true

  - name: wikipedia_en
    hf_path: wikipedia
    # Quoted so the dump id is always read as a string, never as a number.
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true
# You can add more sources here (news, legal, biomedical, code, arXiv, Common Crawl variants, etc.).
# Example template: uncomment it and align the `- name:` line with the existing entries under `sources:`.
#   - name: your_source_name
#     hf_path: your_org/your_dataset
#     hf_name: optional_subset
#     split: train
#     text_field: text
#     weight: 1
#     streaming: true