gsaltintas committed on
Commit
cb2dd0e
·
verified ·
1 Parent(s): 5516ffc

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +84 -0
config.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Config for super vocab built from tokenizers: flexitok/maddition_jpn_Jpan_2000, flexitok/maddition_cmn_Hani_2000, flexitok/maddition_ita_Latn_2000, flexitok/maddition_por_Latn_2000, flexitok/maddition_swe_Latn_2000, flexitok/maddition_deu_Latn_2000, flexitok/maddition_ind_Latn_2000, flexitok/maddition_fas_Arab_2000, flexitok/maddition_vie_Latn_2000, flexitok/maddition_nld_Latn_2000, flexitok/maddition_fra_Latn_2000, flexitok/maddition_spa_Latn_2000, flexitok/maddition_dan_Latn_2000, flexitok/maddition_eng_Latn_2000, flexitok/maddition_arb_Arab_2000, flexitok/maddition_ces_Latn_2000, flexitok/maddition_hun_Latn_2000, flexitok/maddition_tur_Latn_2000, flexitok/maddition_pol_Latn_2000, flexitok/maddition_ell_Grek_2000, flexitok/maddition_rus_Cyrl_2000, flexitok/maddition_digit_2000
3
+ ## Training superset tokenizer with individual tokenizers trained on Fineweb-2-hq
4
+ # use with apps/main/configs/flexitok/llama_1b_base.yaml
5
+ ## TODO: add router details and make sure the name is correct
6
+ model:
7
+ vocab_size: 13290
8
+ name: maddition
9
+ dump_dir: /fsx/craffel/lingua_logs/maddition
10
+ data:
11
+ tokenizer:
12
+ name: supertokenizer
13
+ seed: 42
14
+ superset_code_name: maddition
15
+ n_words: 13290
16
+ tokenizers:
17
+ - name: huggingface
18
+ path: flexitok/maddition_jpn_Jpan_2000
19
+ load_supermapping: true
20
+ - name: huggingface
21
+ path: flexitok/maddition_cmn_Hani_2000
22
+ load_supermapping: true
23
+ - name: huggingface
24
+ path: flexitok/maddition_ita_Latn_2000
25
+ load_supermapping: true
26
+ - name: huggingface
27
+ path: flexitok/maddition_por_Latn_2000
28
+ load_supermapping: true
29
+ - name: huggingface
30
+ path: flexitok/maddition_swe_Latn_2000
31
+ load_supermapping: true
32
+ - name: huggingface
33
+ path: flexitok/maddition_deu_Latn_2000
34
+ load_supermapping: true
35
+ - name: huggingface
36
+ path: flexitok/maddition_ind_Latn_2000
37
+ load_supermapping: true
38
+ - name: huggingface
39
+ path: flexitok/maddition_fas_Arab_2000
40
+ load_supermapping: true
41
+ - name: huggingface
42
+ path: flexitok/maddition_vie_Latn_2000
43
+ load_supermapping: true
44
+ - name: huggingface
45
+ path: flexitok/maddition_nld_Latn_2000
46
+ load_supermapping: true
47
+ - name: huggingface
48
+ path: flexitok/maddition_fra_Latn_2000
49
+ load_supermapping: true
50
+ - name: huggingface
51
+ path: flexitok/maddition_spa_Latn_2000
52
+ load_supermapping: true
53
+ - name: huggingface
54
+ path: flexitok/maddition_dan_Latn_2000
55
+ load_supermapping: true
56
+ - name: huggingface
57
+ path: flexitok/maddition_eng_Latn_2000
58
+ load_supermapping: true
59
+ - name: huggingface
60
+ path: flexitok/maddition_arb_Arab_2000
61
+ load_supermapping: true
62
+ - name: huggingface
63
+ path: flexitok/maddition_ces_Latn_2000
64
+ load_supermapping: true
65
+ - name: huggingface
66
+ path: flexitok/maddition_hun_Latn_2000
67
+ load_supermapping: true
68
+ - name: huggingface
69
+ path: flexitok/maddition_tur_Latn_2000
70
+ load_supermapping: true
71
+ - name: huggingface
72
+ path: flexitok/maddition_pol_Latn_2000
73
+ load_supermapping: true
74
+ - name: huggingface
75
+ path: flexitok/maddition_ell_Grek_2000
76
+ load_supermapping: true
77
+ - name: huggingface
78
+ path: flexitok/maddition_rus_Cyrl_2000
79
+ load_supermapping: true
80
+ - name: huggingface
81
+ path: flexitok/maddition_digit_2000
82
+ load_supermapping: true
83
+ checkpoint:
84
+ path: /fsx/craffel/lingua_logs/checkpoints/maddition