# Supernova25million / configs / comprehensive_data_sources.yaml
#
# Upload folder using huggingface_hub
#
# 8174855 verified 6 months ago
#
# 4.3 kB

# Comprehensive data sources for Supernova, covering a wide range of subjects
# and fields of knowledge. This configuration aims for broad coverage across
# the domains listed in the coverage checklist at the end of the file.

	sources:
	# Core Web Crawl Data (General Knowledge)
	- name: c4_en
	hf_path: c4
	hf_name: en
	split: train
	text_field: text
	weight: 10
	streaming: true

	- name: openwebtext
	hf_path: openwebtext
	hf_name: null
	split: train
	text_field: text
	weight: 8
	streaming: true

	- name: the_pile
	hf_path: the_pile
	hf_name: all
	split: train
	text_field: text
	weight: 15
	streaming: true

	# Encyclopedia & Reference (Structured Knowledge)
	- name: wikipedia_en
	hf_path: wikipedia
	hf_name: 20220301.en
	split: train
	text_field: text
	weight: 12
	streaming: true

	# Literature & Humanities
	- name: bookcorpusopen
	hf_path: bookcorpusopen
	hf_name: null
	split: train
	text_field: text
	weight: 6
	streaming: true

	- name: gutenberg_books
	hf_path: sedthh/gutenberg_english
	hf_name: null
	split: train
	text_field: text
	weight: 4
	streaming: true

	# Academic & Scientific Papers
	- name: arxiv_papers
	hf_path: togethercomputer/RedPajama-Data-1T
	hf_name: arxiv
	split: train
	text_field: text
	weight: 8
	streaming: true

	- name: pubmed_abstracts
	hf_path: togethercomputer/RedPajama-Data-1T
	hf_name: pubmed_abstracts
	split: train
	text_field: text
	weight: 6
	streaming: true

	# Code & Technical Documentation
	- name: github_code
	hf_path: togethercomputer/RedPajama-Data-1T
	hf_name: github
	split: train
	text_field: text
	weight: 7
	streaming: true

	- name: stack_exchange
	hf_path: togethercomputer/RedPajama-Data-1T
	hf_name: stackexchange
	split: train
	text_field: text
	weight: 5
	streaming: true

	# Mathematics & Science Specific
	- name: math_dataset
	hf_path: competition_math
	hf_name: null
	split: train
	text_field: problem
	weight: 3
	streaming: true

	- name: scientific_papers
	hf_path: allenai/s2orc
	hf_name: null
	split: train
	text_field: text
	weight: 6
	streaming: true

	# News & Current Events (for general knowledge)
	- name: cc_news
	hf_path: togethercomputer/RedPajama-Data-1T
	hf_name: cc_news
	split: train
	text_field: text
	weight: 4
	streaming: true

	# Educational Content
	- name: khan_academy
	hf_path: prasadsharaf/khan-academy-scrape
	hf_name: null
	split: train
	text_field: text
	weight: 3
	streaming: true

	# Legal Documents (Law)
	- name: legal_pile
	hf_path: pile-of-law/pile-of-law
	hf_name: null
	split: train
	text_field: text
	weight: 2
	streaming: true

	# Medical & Healthcare
	- name: medical_meadow
	hf_path: medalpaca/medical_meadow_medical_flashcards
	hf_name: null
	split: train
	text_field: output
	weight: 2
	streaming: true

	# Philosophy & Ethics
	- name: philosophy_dataset
	hf_path: AiresPucrs/stanford-encyclopedia-philosophy
	hf_name: null
	split: train
	text_field: text
	weight: 2
	streaming: true

# Note: some datasets may require authentication or have usage restrictions.
# Always review the license and terms of use for each dataset.
# Adjust weights based on your priorities and available compute resources.
# A higher weight means more representation in training.

# Coverage areas:
# ✓ General Web Knowledge (C4, OpenWebText, The Pile)
# ✓ Encyclopedic Knowledge (Wikipedia)
# ✓ Literature & Arts (BookCorpusOpen, Project Gutenberg)
# ✓ Science & Research (arXiv, PubMed, S2ORC)
# ✓ Technology & Programming (GitHub, Stack Exchange)
# ✓ Mathematics (Competition Math, scientific papers)
# ✓ Current Events (CC-News)
# ✓ Education (Khan Academy)
# ✓ Law (Pile of Law)
# ✓ Medicine (Medical Meadow flashcards, PubMed)
# ✓ Philosophy & Ethics (Stanford Encyclopedia of Philosophy)
# ✓ Engineering (through technical papers and code)
# ✓ History (through Wikipedia and books)
# ✓ Languages & Linguistics (through diverse text sources)
# ✓ Business & Economics (through news and web content)