---
# Example broad data sources for Supernova training
# Enable/adjust per your needs. Many are huge; ensure bandwidth/disk and review each dataset’s license.

# Schema (field meanings inferred from names — confirm against the loader that
# consumes this file):
#   name       - local identifier for the source (used in logs/weighting, presumably)
#   hf_path    - dataset path, presumably passed to datasets.load_dataset(path=...)
#   hf_name    - dataset config/subset name; null when the dataset has no subsets
#   split      - dataset split to read
#   text_field - name of the record field containing the raw text
#   weight     - relative sampling weight when mixing sources (higher = sampled more,
#                presumably; verify the mixing logic)
#   streaming  - stream records from the Hub instead of downloading the full dataset
sources:
  # Colossal Clean Crawled Corpus, English subset.
  - name: c4_en
    hf_path: c4
    hf_name: en
    split: train
    text_field: text
    weight: 5
    streaming: true

  # English Wikipedia, 2022-03-01 dump.
  # NOTE(review): the standalone "wikipedia" dataset script on the HF Hub has been
  # deprecated in favor of "wikimedia/wikipedia" — verify this path still resolves.
  - name: wikipedia_en
    hf_path: wikipedia
    hf_name: 20220301.en
    split: train
    text_field: text
    weight: 3
    streaming: true

  # OpenWebText (open reproduction of WebText); no subset, hence hf_name: null.
  - name: openwebtext
    hf_path: openwebtext
    hf_name: null
    split: train
    text_field: text
    weight: 3
    streaming: true

  - name: bookcorpusopen
    hf_path: bookcorpusopen
    hf_name: null
    split: train
    text_field: text
    weight: 2
    streaming: true

  # The Pile, all components ("all" config). Highest weight of the mix.
  # NOTE(review): "the_pile" hosting on the HF Hub has changed over time
  # (e.g. EleutherAI/pile, monology/pile-uncopyrighted) — confirm this path
  # still loads before a long training run.
  - name: the_pile
    hf_path: the_pile
    hf_name: all
    split: train
    text_field: text
    weight: 6
    streaming: true

# You can add more sources here (news, legal, biomedical, code, arXiv, Common Crawl variants, etc.).
# Example template:
#  - name: your_source_name
#    hf_path: your_org/your_dataset
#    hf_name: optional_subset
#    split: train
#    text_field: text
#    weight: 1
#    streaming: true