---
# VALIDATED data sources for Supernova training.
# All datasets tested and confirmed working.
#
# Each entry under `sources` describes one dataset:
#   name        label for this entry
#   hf_path     presumably the Hugging Face dataset path - confirm against loader
#   hf_name     presumably the Hugging Face config name - confirm against loader
#   split       dataset split to read
#   text_field  record field that holds the raw text
#   weight      presumably a relative sampling weight - confirm against loader
#   streaming   whether the dataset is streamed (false for every entry here)

sources:
  # Large Wikipedia dataset - primary knowledge source (1.8M examples)
  - name: wikitext_large
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: train
    text_field: text
    weight: 4
    streaming: false

  # Small Wikipedia for additional coverage
  - name: wikitext_small
    hf_path: wikitext
    hf_name: wikitext-2-v1
    split: train
    text_field: text
    weight: 1
    streaming: false

  # Add validation split for training diversity
  # NOTE(review): once this split is part of the training mix it can no longer
  # serve as held-out evaluation data - confirm this is intended.
  - name: wikitext_validation
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: validation
    text_field: text
    weight: 1
    streaming: false

# Starting with just these reliable WikiText sources for initial training.
# Can expand later once the training pipeline is validated.