---
# Example broad data sources for Supernova training.
# Enable/adjust per your needs. Many are huge; ensure bandwidth/disk and
# review each dataset’s license.
#
# Fields used by every entry below:
#   name       - label for this source within this config
#   hf_path    - dataset path (presumably passed to the Hugging Face
#                `datasets` loader; confirm against the consuming code)
#   hf_name    - dataset config/subset name, or null when there is none
#   split      - dataset split to read
#   text_field - name of the field holding the raw text
#   weight     - presumably a relative sampling weight; confirm against
#                the consuming code
#   streaming  - true/false flag (presumably streams the dataset instead
#                of downloading it in full; confirm against the loader)
sources:
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true

  - name: wikipedia_en
    hf_path: wikipedia
    # Quoted so the dotted, digit-leading value is always read as a string.
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true

# You can add more sources here (news, legal, biomedical, code, arXiv,
# Common Crawl variants, etc.).
# Example template:
#   - name: your_source_name
#     hf_path: your_org/your_dataset
#     hf_name: optional_subset
#     split: train
#     text_field: text
#     weight: 1
#     streaming: true