---
# Comprehensive data sources for Supernova - covering all subjects and fields
# of knowledge.
# This configuration ensures broad coverage across every domain of human
# knowledge.
#
# Each entry under `sources` carries the same seven keys:
#   name        - identifier for the source within this file
#   hf_path     - presumably the Hugging Face dataset path (confirm in loader)
#   hf_name     - presumably the dataset config/subset name; null when unset
#   split       - dataset split to read
#   text_field  - name of the record field that holds the text
#   weight      - relative sampling weight (higher = more representation)
#   streaming   - boolean streaming flag

sources:
  # Core Web Crawl Data (General Knowledge)
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 10
    streaming: true

  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 8
    streaming: true

  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 15
    streaming: true

  # Encyclopedia & Reference (Structured Knowledge)
  - name: wikipedia_en
    hf_path: wikipedia
    # Quoted so the number-like config name is always read as a string.
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 12
    streaming: true

  # Literature & Humanities
  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true

  - name: gutenberg_books
    hf_path: sedthh/gutenberg_english
    hf_name: null
    split: train
    text_field: text
    weight: 4
    streaming: true

  # Academic & Scientific Papers
  - name: arxiv_papers
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: arxiv
    split: train
    text_field: text
    weight: 8
    streaming: true

  # NOTE(review): confirm that RedPajama-Data-1T actually publishes a
  # `pubmed_abstracts` config; it may need a different hf_path.
  - name: pubmed_abstracts
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: pubmed_abstracts
    split: train
    text_field: text
    weight: 6
    streaming: true

  # Code & Technical Documentation
  - name: github_code
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: github
    split: train
    text_field: text
    weight: 7
    streaming: true

  - name: stack_exchange
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: stackexchange
    split: train
    text_field: text
    weight: 5
    streaming: true

  # Mathematics & Science Specific
  - name: math_dataset
    hf_path: competition_math
    hf_name: null
    split: train
    # This source reads `problem` rather than `text`.
    text_field: problem
    weight: 3
    streaming: true

  - name: scientific_papers
    hf_path: allenai/s2orc
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true

  # News & Current Events (for general knowledge)
  # NOTE(review): confirm that RedPajama-Data-1T actually publishes a
  # `cc_news` config; it may need a different hf_path.
  - name: cc_news
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: cc_news
    split: train
    text_field: text
    weight: 4
    streaming: true

  # Educational Content
  - name: khan_academy
    hf_path: prasadsharaf/khan-academy-scrape
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  # Legal Documents (Law)
  - name: legal_pile
    hf_path: pile-of-law/pile-of-law
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  # Medical & Healthcare
  - name: medical_meadow
    hf_path: medalpaca/medical_meadow_medical_flashcards
    hf_name: null
    split: train
    # This source reads `output` rather than `text`.
    text_field: output
    weight: 2
    streaming: true

  # Philosophy & Ethics
  - name: philosophy_dataset
    hf_path: AiresPucrs/stanford-encyclopedia-philosophy
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

# Note: Some datasets might require authentication or have usage restrictions.
# Always review the license and terms of use for each dataset.
# Adjust weights based on your priorities and available compute resources.
# Higher weights = more representation in training.

# Coverage areas:
# ✓ General Web Knowledge (C4, OpenWebText, The Pile)
# ✓ Encyclopedic Knowledge (Wikipedia)
# ✓ Literature & Arts (Books, Gutenberg)
# ✓ Science & Research (ArXiv, PubMed, S2ORC)
# ✓ Technology & Programming (GitHub, Stack Exchange)
# ✓ Mathematics (Competition Math, Scientific Papers)
# ✓ Current Events (News)
# ✓ Education (Khan Academy)
# ✓ Law (Pile of Law)
# ✓ Medicine (Medical datasets)
# ✓ Philosophy & Ethics
# ✓ Engineering (through technical papers and code)
# ✓ History (through Wikipedia and books)
# ✓ Languages & Linguistics (through diverse text sources)
# ✓ Business & Economics (through news and web content)