# Data sources for Supernova, selected to cover a wide range of subjects.
# The mix aims for broad coverage across domains of knowledge; it does not guarantee that every field is represented.

# Each entry under `sources` describes one corpus to draw training text from.
# Key meanings are inferred from the key names — NOTE(review): confirm against
# the loader that consumes this file:
#   name        label for the entry
#   hf_path     presumably the Hugging Face dataset path
#   hf_name     presumably the dataset config/subset name (null = none given)
#   split       dataset split to read
#   text_field  record field that holds the text
#   weight      mixing weight (higher = more representation in training)
#   streaming   presumably stream the data instead of downloading it — confirm
sources:
  # Core Web Crawl Data (General Knowledge)
  # C4 with the `en` config selected.
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 10
    streaming: true

  # OpenWebText; `hf_name: null` leaves the config/subset unspecified.
  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 8
    streaming: true

  # The Pile with the `all` config; carries the largest weight in this file (15).
  # NOTE(review): the Pile is itself an aggregate (arXiv, PubMed, GitHub,
  # StackExchange, Wikipedia, ... as far as I know), so it likely overlaps the
  # dedicated sources below — confirm that double-counting is intended.
  # NOTE(review): availability of `the_pile` on the Hub has changed over time —
  # confirm it still loads before relying on this entry.
  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 15
    streaming: true

  # Encyclopedia & Reference (Structured Knowledge)
  # English Wikipedia snapshot. The config name starts with digits, so it is
  # quoted to guarantee it is always read as the string 20220301.en and never
  # type-guessed by a YAML parser.
  - name: wikipedia_en
    hf_path: wikipedia
    hf_name: '20220301.en'
    split: train
    text_field: text
    weight: 12
    streaming: true

  # Literature & Humanities
  # BookCorpusOpen; `hf_name: null` leaves the config/subset unspecified.
  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true

  # Project Gutenberg English books.
  # This dataset names its columns in upper case (TEXT, SOURCE, METADATA), so
  # the text column must be referenced as `TEXT`; a lower-case `text` would
  # point at a column that does not exist.
  - name: gutenberg_books
    hf_path: sedthh/gutenberg_english
    hf_name: null
    split: train
    text_field: TEXT
    weight: 4
    streaming: true

  # Academic & Scientific Papers
  # RedPajama-Data-1T restricted to `hf_name: arxiv` (presumably its arXiv subset).
  - name: arxiv_papers
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: arxiv
    split: train
    text_field: text
    weight: 8
    streaming: true

  # RedPajama-Data-1T with `hf_name: pubmed_abstracts`.
  # NOTE(review): as far as I know RedPajama-Data-1T only offers the subsets
  # arxiv, book, c4, common_crawl, github, stackexchange and wikipedia, so
  # `pubmed_abstracts` may not resolve — confirm this entry loads, or point it
  # at a dataset that actually hosts PubMed abstracts.
  - name: pubmed_abstracts
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: pubmed_abstracts
    split: train
    text_field: text
    weight: 6
    streaming: true

  # Code & Technical Documentation
  # RedPajama-Data-1T restricted to `hf_name: github` (presumably its GitHub subset).
  - name: github_code
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: github
    split: train
    text_field: text
    weight: 7
    streaming: true

  # RedPajama-Data-1T restricted to `hf_name: stackexchange`
  # (presumably its Stack Exchange subset).
  - name: stack_exchange
    hf_path: togethercomputer/RedPajama-Data-1T
    hf_name: stackexchange
    split: train
    text_field: text
    weight: 5
    streaming: true

  # Mathematics & Science Specific
  # Competition math problems. Only the `problem` field is read as text here.
  # NOTE(review): if the dataset keeps worked solutions in a separate field,
  # they are not included by this entry — confirm that is intended.
  - name: math_dataset
    hf_path: competition_math
    hf_name: null
    split: train
    text_field: problem
    weight: 3
    streaming: true

  # S2ORC scientific papers; no config/subset named (`hf_name: null`).
  # NOTE(review): I am not sure `allenai/s2orc` is directly loadable from the
  # Hub (S2ORC access has historically been gated) — confirm it loads and that
  # it exposes a `text` column.
  - name: scientific_papers
    hf_path: allenai/s2orc
    hf_name: null
    split: train
    text_field: text
    weight: 6
    streaming: true

  # News & Current Events (for general knowledge)
  # CC-News articles, read from the standalone CC-News dataset.
  # RedPajama-Data-1T does not ship a `cc_news` subset (its subsets are arxiv,
  # book, c4, common_crawl, github, stackexchange and wikipedia), so this entry
  # points at the dedicated dataset, whose article body is in the `text` column.
  - name: cc_news
    hf_path: vblagoje/cc_news
    hf_name: null
    split: train
    text_field: text
    weight: 4
    streaming: true

  # Educational Content
  # Khan Academy scrape (per the repository name); no config/subset named.
  # NOTE(review): community-hosted upload — confirm it exposes a `text` column
  # and check its license before training on it.
  - name: khan_academy
    hf_path: prasadsharaf/khan-academy-scrape
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  # Legal Documents (Law)
  # Pile of Law; no config/subset named (`hf_name: null`).
  # NOTE(review): pile-of-law is split into many named subsets and, as far as I
  # know, has no default config, so a null `hf_name` may fail to load — confirm,
  # and set an explicit config (e.g. an "all"-style one) if required.
  - name: legal_pile
    hf_path: pile-of-law/pile-of-law
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  # Medical & Healthcare
  # Medical flashcards. Only the `output` field is read as text here —
  # presumably the answer side of each flashcard; confirm against the dataset.
  - name: medical_meadow
    hf_path: medalpaca/medical_meadow_medical_flashcards
    hf_name: null
    split: train
    text_field: output
    weight: 2
    streaming: true

  # Philosophy & Ethics
  # Stanford Encyclopedia of Philosophy text (per the repository name);
  # no config/subset named (`hf_name: null`).
  - name: philosophy_dataset
    hf_path: AiresPucrs/stanford-encyclopedia-philosophy
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

# Notes:
# - Some datasets may require authentication or carry usage restrictions.
# - Always review the license and terms of use for each dataset before training on it.
# - Adjust weights to match your priorities and available compute resources.
# - A higher weight means more representation in training.

# Coverage areas:
# ✓ General Web Knowledge (C4, OpenWebText, The Pile)
# ✓ Encyclopedic Knowledge (Wikipedia)
# ✓ Literature & Arts (BookCorpusOpen, Project Gutenberg)
# ✓ Science & Research (ArXiv, PubMed, S2ORC)
# ✓ Technology & Programming (GitHub, Stack Exchange)
# ✓ Mathematics (Competition Math, Scientific Papers)
# ✓ Current Events (News)
# ✓ Education (Khan Academy)
# ✓ Law (Pile of Law)
# ✓ Medicine (Medical datasets)
# ✓ Philosophy & Ethics
# ✓ Engineering (through technical papers and code)
# ✓ History (through Wikipedia and books)
# ✓ Languages & Linguistics (through diverse text sources)
# ✓ Business & Economics (through news and web content)