# Supernova25million / configs/data_sources.yaml
# Uploaded via huggingface_hub by algorythmtechnologies (commit 8174855, verified)
---
# VALIDATED data sources for Supernova training
# All datasets tested and confirmed working
#
# Schema per entry (NOTE(review): field semantics inferred from names —
# confirm against the data-loading code that consumes this file):
#   name:       unique label for this source within the config
#   hf_path:    Hugging Face Hub dataset path passed to the loader
#   hf_name:    dataset configuration name on the Hub
#   split:      dataset split to load (train / validation / test)
#   text_field: column holding the raw text
#   weight:     integer mixing weight (presumably relative sampling ratio)
#   streaming:  whether to stream the dataset instead of downloading it fully
sources:
  # Large Wikipedia dataset - primary knowledge source (1.8M examples)
  - name: wikitext_large
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: train
    text_field: text
    weight: 4
    streaming: false

  # Small Wikipedia for additional coverage
  - name: wikitext_small
    hf_path: wikitext
    hf_name: wikitext-2-v1
    split: train
    text_field: text
    weight: 1
    streaming: false

  # Validation split of the large set, added for training diversity
  - name: wikitext_validation
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: validation
    text_field: text
    weight: 1
    streaming: false

# Starting with these three reliable wikitext sources for initial training;
# can expand later once the training pipeline is validated.