---
# VALIDATED data sources for Supernova training.
# All datasets below were tested and confirmed to load successfully.
#
# Schema per entry:
#   name       - local identifier for this source
#   hf_path    - Hugging Face dataset repository path
#   hf_name    - dataset configuration name within the repository
#   split      - which split to load (train / validation / test)
#   text_field - field in each example containing the raw text
#   weight     - relative sampling weight when mixing sources
#   streaming  - whether to stream instead of downloading fully
sources:
  # Large Wikipedia dataset - primary knowledge source (~1.8M examples)
  - name: wikitext_large
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: train
    text_field: text
    weight: 4
    streaming: false

  # Small Wikipedia dataset for additional coverage
  - name: wikitext_small
    hf_path: wikitext
    hf_name: wikitext-2-v1
    split: train
    text_field: text
    weight: 1
    streaming: false

  # Validation split of wikitext-103, mixed in for training diversity.
  # NOTE(review): this consumes the validation split as training data —
  # confirm it is not also used for held-out evaluation.
  - name: wikitext_validation
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: validation
    text_field: text
    weight: 1
    streaming: false

# Starting with these reliable wikitext sources for initial training;
# can expand later once the training pipeline is validated.