---
# Example broad data sources for Supernova training.
# Enable or adjust entries per your needs. Many of these datasets are huge:
# make sure you have enough bandwidth and disk space, and review each
# dataset's license before use.
#
# Per-entry fields (semantics are defined by the code that loads this file;
# the notes below are assumptions to confirm against that loader):
#   name       - label for this source.
#   hf_path    - dataset identifier (presumably a Hugging Face Hub path).
#   hf_name    - optional subset/config name; null when there is none.
#   split      - dataset split to read.
#   text_field - name of the field holding the raw text.
#   weight     - presumably a relative mixing weight between sources.
#   streaming  - presumably streams the dataset instead of downloading it.
#
# NOTE(review): several ids below are un-namespaced (c4, wikipedia,
# openwebtext, bookcorpusopen, the_pile). Verify that each still resolves with
# the dataset library version in use before starting a long run.
sources:
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true

  - name: wikipedia_en
    hf_path: wikipedia
    # Quoted so no parser can mistake the leading digits for a number.
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true

# You can add more sources here (news, legal, biomedical, code, arXiv,
# Common Crawl variants, etc.).
# Example template (uncomment and append to the `sources` list above):
#   - name: your_source_name
#     hf_path: your_org/your_dataset
#     hf_name: optional_subset
#     split: train
#     text_field: text
#     weight: 1
#     streaming: true