# Supernova25million / configs/comprehensive_data_sources.yaml
# Uploaded by algorythmtechnologies using huggingface_hub
# (commit 8174855, verified)
# Comprehensive data sources for Supernova - covering all subjects and fields of knowledge
# This configuration ensures broad coverage across every domain of human knowledge
sources:
# Each list item describes one dataset to sample from. Every key after
# `- name:` must be indented under its item so the entry parses as a single
# mapping. `weight` is the relative share of the training mix (higher = more).
# NOTE(review): hf_path / hf_name / split / streaming are presumably forwarded
# to `datasets.load_dataset` and `text_field` selects the column to read;
# confirm against the loader code.

# Core Web Crawl Data (General Knowledge)
- name: c4_en
  hf_path: c4
  hf_name: en
  split: train
  text_field: text
  weight: 10
  streaming: true
- name: openwebtext
  hf_path: openwebtext
  hf_name: null
  split: train
  text_field: text
  weight: 8
  streaming: true
# NOTE(review): confirm that `the_pile` still resolves on the Hub before a
# long run; it carries the largest weight in this mix.
- name: the_pile
  hf_path: the_pile
  hf_name: all
  split: train
  text_field: text
  weight: 15
  streaming: true
# Encyclopedia & Reference (Structured Knowledge)
- name: wikipedia_en
  hf_path: wikipedia
  # Quoted so the dump identifier can never be re-typed as a number by a
  # YAML 1.1 parser; it must reach the loader as the literal string.
  hf_name: '20220301.en'
  split: train
  text_field: text
  weight: 12
  streaming: true
# Literature & Humanities
- name: bookcorpusopen
  hf_path: bookcorpusopen
  hf_name: null
  split: train
  text_field: text
  weight: 6
  streaming: true
- name: gutenberg_books
  hf_path: sedthh/gutenberg_english
  hf_name: null
  split: train
  # This dataset's columns are upper-case (TEXT, SOURCE, METADATA); a
  # lower-case `text` lookup would find no such column.
  text_field: TEXT
  weight: 4
  streaming: true
# Academic & Scientific Papers
- name: arxiv_papers
  hf_path: togethercomputer/RedPajama-Data-1T
  hf_name: arxiv
  split: train
  text_field: text
  weight: 8
  streaming: true
# RedPajama-Data-1T publishes no `pubmed_abstracts` config (its subsets are
# arxiv, book, c4, common_crawl, github, stackexchange and wikipedia), so the
# PubMed slice is taken from the `scientific_papers` dataset instead, reading
# its `abstract` column.
# NOTE(review): confirm this substitute matches the intended PubMed coverage.
- name: pubmed_abstracts
  hf_path: scientific_papers
  hf_name: pubmed
  split: train
  text_field: abstract
  weight: 6
  streaming: true
# Code & Technical Documentation
# Both entries read different configs of the same RedPajama-Data-1T repo.
- name: github_code
  hf_path: togethercomputer/RedPajama-Data-1T
  hf_name: github
  split: train
  text_field: text
  weight: 7
  streaming: true
- name: stack_exchange
  hf_path: togethercomputer/RedPajama-Data-1T
  hf_name: stackexchange
  split: train
  text_field: text
  weight: 5
  streaming: true
# Mathematics & Science Specific
- name: math_dataset
  hf_path: competition_math
  hf_name: null
  split: train
  # Only the `problem` column is read here; any other columns in the dataset
  # are not part of this source's text.
  text_field: problem
  weight: 3
  streaming: true
- name: scientific_papers
  hf_path: allenai/s2orc
  hf_name: null
  split: train
  text_field: text
  weight: 6
  streaming: true
# News & Current Events (for general knowledge)
# RedPajama-Data-1T has no `cc_news` config, so this entry points at the
# standalone CC-News dataset, which has a single default config.
- name: cc_news
  hf_path: cc_news
  hf_name: null
  split: train
  text_field: text
  weight: 4
  streaming: true
# Educational Content
# NOTE(review): community-uploaded dataset; confirm the repo id and that it
# exposes a `text` column before relying on it.
- name: khan_academy
  hf_path: prasadsharaf/khan-academy-scrape
  hf_name: null
  split: train
  text_field: text
  weight: 3
  streaming: true
# Legal Documents (Law)
- name: legal_pile
  hf_path: pile-of-law/pile-of-law
  # Pile of Law defines many named subsets and no default one, so a config
  # must be given explicitly; `all` selects every subset.
  hf_name: all
  split: train
  text_field: text
  weight: 2
  streaming: true
# Medical & Healthcare
- name: medical_meadow
  hf_path: medalpaca/medical_meadow_medical_flashcards
  hf_name: null
  split: train
  # Reads the `output` column rather than `text`.
  # NOTE(review): presumably the answer side of each flashcard; confirm that
  # dropping the question side is intended.
  text_field: output
  weight: 2
  streaming: true
# Philosophy & Ethics
- name: philosophy_dataset
  hf_path: AiresPucrs/stanford-encyclopedia-philosophy
  hf_name: null
  split: train
  text_field: text
  weight: 2
  streaming: true
# Note: Some datasets might require authentication or have usage restrictions
# Always review the license and terms of use for each dataset
# Adjust weights based on your priorities and available compute resources
# Higher weights = more representation in training
# Coverage areas:
# ✓ General Web Knowledge (C4, OpenWebText, The Pile)
# ✓ Encyclopedic Knowledge (Wikipedia)
# ✓ Literature & Arts (Books, Gutenberg)
# ✓ Science & Research (ArXiv, PubMed, S2ORC)
# ✓ Technology & Programming (GitHub, Stack Exchange)
# ✓ Mathematics (Competition Math, Scientific Papers)
# ✓ Current Events (News)
# ✓ Education (Khan Academy)
# ✓ Law (Pile of Law)
# ✓ Medicine (Medical datasets)
# ✓ Philosophy & Ethics
# ✓ Engineering (through technical papers and code)
# ✓ History (through Wikipedia and books)
# ✓ Languages & Linguistics (through diverse text sources)
# ✓ Business & Economics (through news and web content)