---
# Comprehensive data sources for Supernova, covering all subjects and fields
# of knowledge. This configuration aims for broad coverage across every
# domain of human knowledge.
#
# Fields used by every entry under `sources`:
#   name        label for the source (unique within this file)
#   hf_path     dataset path; presumably the Hugging Face Hub id handed to
#               `datasets.load_dataset` (NOTE(review): confirm with the loader)
#   hf_name     dataset config/subset name, or null when none is selected
#   split       dataset split to read
#   text_field  record field that holds the training text
#   weight      relative weight; higher weights = more representation in
#               training
#   streaming   presumably toggles streaming instead of a full download
#               (NOTE(review): confirm with the loader)
sources:
  # Core Web Crawl Data (General Knowledge)
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 10
    streaming: true
  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 8
    streaming: true
  # NOTE(review): the original Pile hosting was reportedly withdrawn in 2023;
  # verify that this path still loads before relying on its weight.
  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 15
    streaming: true

  # Encyclopedia & Reference (Structured Knowledge)
  - name: wikipedia_en
    hf_path: wikipedia
    # Quoted so the dump id is always read as a string.
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 12
    streaming: true

  # Literature & Humanities
  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true
  - name: gutenberg_books
    hf_path: sedthh/gutenberg_english
    hf_name: null
    split: train
    text_field: text
    weight: 4
    streaming: true

  # Academic & Scientific Papers
  - name: arxiv_papers
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: arxiv
    split: train
    text_field: text
    weight: 8
    streaming: true
  # NOTE(review): `pubmed_abstracts` does not appear among the subsets listed
  # on the RedPajama-Data-1T dataset card; confirm this source loads, or point
  # it at a dataset that actually provides PubMed abstracts.
  - name: pubmed_abstracts
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: pubmed_abstracts
    split: train
    text_field: text
    weight: 6
    streaming: true

  # Code & Technical Documentation
  - name: github_code
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: github
    split: train
    text_field: text
    weight: 7
    streaming: true
  - name: stack_exchange
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: stackexchange
    split: train
    text_field: text
    weight: 5
    streaming: true

  # Mathematics & Science Specific
  - name: math_dataset
    hf_path: competition_math
    hf_name: null
    split: train
    # Only the problem statement is read from each record.
    text_field: problem
    weight: 3
    streaming: true
  - name: scientific_papers
    hf_path: allenai/s2orc
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true

  # News & Current Events (for general knowledge)
  # RedPajama-Data-1T publishes no `cc_news` subset, so this entry reads the
  # standalone CC-News dataset instead.
  - name: cc_news
    hf_path: vblagoje/cc_news
    hf_name: null
    split: train
    text_field: text
    weight: 4
    streaming: true

  # Educational Content
  - name: khan_academy
    hf_path: prasadsharaf/khan-academy-scrape
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  # Legal Documents (Law)
  - name: legal_pile
    hf_path: pile-of-law/pile-of-law
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  # Medical & Healthcare
  - name: medical_meadow
    hf_path: medalpaca/medical_meadow_medical_flashcards
    hf_name: null
    split: train
    # Only the answer side of each flashcard is read.
    text_field: output
    weight: 2
    streaming: true

  # Philosophy & Ethics
  - name: philosophy_dataset
    hf_path: AiresPucrs/stanford-encyclopedia-philosophy
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

# Notes:
#   - Some datasets might require authentication or have usage restrictions.
#   - Always review the license and terms of use for each dataset.
#   - Adjust weights based on your priorities and available compute resources.
#   - Higher weights = more representation in training.
#
# Coverage areas:
#   - General Web Knowledge (C4, OpenWebText, The Pile)
#   - Encyclopedic Knowledge (Wikipedia)
#   - Literature & Arts (Books, Gutenberg)
#   - Science & Research (ArXiv, PubMed, S2ORC)
#   - Technology & Programming (GitHub, Stack Exchange)
#   - Mathematics (Competition Math, Scientific Papers)
#   - Current Events (News)
#   - Education (Khan Academy)
#   - Law (Pile of Law)
#   - Medicine (Medical datasets)
#   - Philosophy & Ethics
#   - Engineering (through technical papers and code)
#   - History (through Wikipedia and books)
#   - Languages & Linguistics (through diverse text sources)
#   - Business & Economics (through news and web content)