---
# Example broad data sources for Supernova training
# Enable/adjust per your needs. Many are huge; ensure bandwidth/disk and review each dataset’s license.

# Schema (field meanings inferred from names — confirm against the loader that
# consumes this file):
#   name       - local identifier for the source (used in logs/weighting, presumably)
#   hf_path    - dataset path, presumably passed to datasets.load_dataset(path=...)
#   hf_name    - dataset config/subset name; null when the dataset has no subsets
#   split      - dataset split to read
#   text_field - name of the record field containing the raw text
#   weight     - relative sampling weight when mixing sources (higher = sampled more,
#                presumably; verify the mixing logic)
#   streaming  - stream records from the Hub instead of downloading the full dataset
sources:
  # Colossal Clean Crawled Corpus, English subset.
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true

  # English Wikipedia, 2022-03-01 dump.
  # NOTE(review): the standalone "wikipedia" dataset script on the HF Hub has been
  # deprecated in favor of "wikimedia/wikipedia" — verify this path still resolves.
  - name: wikipedia_en
    hf_path: wikipedia
    hf_name: 20220301.en
    split: train
    text_field: text
    weight: 3
    streaming: true

  # OpenWebText (open reproduction of WebText); no subset, hence hf_name: null.
  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  # The Pile, all components ("all" config). Highest weight of the mix.
  # NOTE(review): "the_pile" hosting on the HF Hub has changed over time
  # (e.g. EleutherAI/pile, monology/pile-uncopyrighted) — confirm this path
  # still loads before a long training run.
  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true

# You can add more sources here (news, legal, biomedical, code, arXiv, Common Crawl variants, etc.).
# Example template:
#  - name: your_source_name
#    hf_path: your_org/your_dataset
#    hf_name: optional_subset
#    split: train
#    text_field: text
#    weight: 1
#    streaming: true