---
# VALIDATED data sources for Supernova training
# All datasets tested and confirmed working
#
# Each item under `sources` describes one dataset to draw training text from.
# NOTE(review): the key meanings below are inferred from the key names only;
# confirm them against the loader that consumes this file.
#   name        label identifying this entry
#   hf_path     presumably the Hugging Face dataset path
#   hf_name     presumably the dataset configuration name
#   split       dataset split to read
#   text_field  presumably the column that holds the raw text
#   weight      presumably the relative sampling weight of this entry
#   streaming   presumably whether to stream rather than download in full
sources:
  # Large Wikipedia dataset - primary knowledge source (1.8M examples)
  - name: wikitext_large
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: train
    text_field: text
    weight: 4
    streaming: false

  # Small Wikipedia for additional coverage
  - name: wikitext_small
    hf_path: wikitext
    hf_name: wikitext-2-v1
    split: train
    text_field: text
    weight: 1
    streaming: false

  # Add validation split for training diversity
  # NOTE(review): this trains on the wikitext-103 validation split; confirm
  # that the same split is not also used for evaluation.
  - name: wikitext_validation
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: validation
    text_field: text
    weight: 1
    streaming: false

# Starting with just these reliable WikiText sources (three entries drawn
# from two dataset configs) for initial training.
# Can expand later once the training pipeline is validated.