---
# Example broad data sources for Supernova training
# Enable/adjust per your needs. Many are huge; ensure bandwidth/disk and review each dataset's license.
sources:
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true
  - name: wikipedia_en
    hf_path: wikipedia
    hf_name: "20220301.en"
    split: train
    text_field: text
    weight: 3
    streaming: true
  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true
  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true
  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true
# You can add more sources here (news, legal, biomedical, code, arXiv, Common Crawl variants, etc.).
# Example template:
#   - name: your_source_name
#     hf_path: your_org/your_dataset
#     hf_name: optional_subset
#     split: train
#     text_field: text
#     weight: 1
#     streaming: true