File size: 880 Bytes
8174855
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# VALIDATED data sources for Supernova training.
# All datasets listed here were tested and confirmed to load.

# Training data sources. Every entry carries the same seven keys:
# name, hf_path, hf_name, split, text_field, weight, streaming.
# NOTE(review): hf_path / hf_name / split are presumably forwarded to
# Hugging Face `datasets.load_dataset` - confirm against the loader.
# NOTE(review): `weight` looks like a relative mixing weight between
# sources (4 : 1 : 1 here) - confirm how the loader interprets it.
sources:
  # WikiText-103 train split - the primary knowledge source
  # (about 1.8M examples) and the highest-weighted entry.
  - name: wikitext_large
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: train
    text_field: text
    weight: 4
    streaming: false

  # WikiText-2 train split - a smaller corpus for additional coverage.
  # NOTE(review): WikiText-2 is reported to be drawn from the same
  # articles as WikiText-103; confirm the overlap is intended.
  - name: wikitext_small
    hf_path: wikitext
    hf_name: wikitext-2-v1
    split: train
    text_field: text
    weight: 1
    streaming: false

  # WikiText-103 validation split, mixed into training for diversity.
  # Uses the same hf_path / hf_name as wikitext_large; only `split` differs.
  # NOTE(review): training on this split makes it unsuitable as a
  # held-out evaluation set - confirm evaluation uses different data.
  - name: wikitext_validation
    hf_path: wikitext
    hf_name: wikitext-103-v1
    split: validation
    text_field: text
    weight: 1
    streaming: false

# Starting with just these three entries, drawn from two reliable WikiText
# configurations (wikitext-103-v1 and wikitext-2-v1), for initial training.
# More sources can be added once the training pipeline is validated.