gsaltintas committed on
Commit
8ac49b7
·
verified ·
1 Parent(s): c5995be

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +81 -0
config.yaml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Config for super vocab built from tokenizers: flexitok/bpe_arb_Arab_8000, flexitok/bpe_ces_Latn_8000, flexitok/bpe_ltr_cmn_Hani_8000_v2, flexitok/bpe_dan_Latn_8000, flexitok/bpe_deu_Latn_8000, flexitok/bpe_ell_Grek_8000, flexitok/bpe_fas_Arab_8000, flexitok/bpe_fra_Latn_8000, flexitok/bpe_fw_edu_8000, flexitok/bpe_hun_Latn_8000, flexitok/bpe_ind_Latn_8000, flexitok/bpe_ita_Latn_8000, flexitok/bpe_ltr_jpn_Jpan_8000_v2, flexitok/bpe_nld_Latn_8000, flexitok/bpe_pol_Latn_8000, flexitok/bpe_por_Latn_8000, flexitok/bpe_rus_Cyrl_8000, flexitok/bpe_spa_Latn_8000, flexitok/bpe_swe_Latn_8000, flexitok/bpe_ltr_tur_Latn_8000_v2, flexitok/bpe_vie_Latn_8000
3
+ ## Training superset tokenizer with individual tokenizers trained on Fineweb-2-hq
4
+ # use with apps/main/configs/flexitok/llama_1b_base.yaml
5
+ ## TODO: add router details and make sure the name is correct
6
+ model:
7
+ vocab_size: 122554
8
+ name: safe
9
+ dump_dir: /fsx/craffel/lingua_logs/safe
10
+ data:
11
+ tokenizer:
12
+ name: supertokenizer
13
+ seed: 42
14
+ superset_code_name: safe
15
+ n_words: 122554
16
+ tokenizers:
17
+ - name: huggingface
18
+ path: flexitok/bpe_arb_Arab_8000
19
+ load_supermapping: true
20
+ - name: huggingface
21
+ path: flexitok/bpe_ces_Latn_8000
22
+ load_supermapping: true
23
+ - name: huggingface
24
+ path: flexitok/bpe_ltr_cmn_Hani_8000_v2
25
+ load_supermapping: true
26
+ - name: huggingface
27
+ path: flexitok/bpe_dan_Latn_8000
28
+ load_supermapping: true
29
+ - name: huggingface
30
+ path: flexitok/bpe_deu_Latn_8000
31
+ load_supermapping: true
32
+ - name: huggingface
33
+ path: flexitok/bpe_ell_Grek_8000
34
+ load_supermapping: true
35
+ - name: huggingface
36
+ path: flexitok/bpe_fas_Arab_8000
37
+ load_supermapping: true
38
+ - name: huggingface
39
+ path: flexitok/bpe_fra_Latn_8000
40
+ load_supermapping: true
41
+ - name: huggingface
42
+ path: flexitok/bpe_fw_edu_8000
43
+ load_supermapping: true
44
+ - name: huggingface
45
+ path: flexitok/bpe_hun_Latn_8000
46
+ load_supermapping: true
47
+ - name: huggingface
48
+ path: flexitok/bpe_ind_Latn_8000
49
+ load_supermapping: true
50
+ - name: huggingface
51
+ path: flexitok/bpe_ita_Latn_8000
52
+ load_supermapping: true
53
+ - name: huggingface
54
+ path: flexitok/bpe_ltr_jpn_Jpan_8000_v2
55
+ load_supermapping: true
56
+ - name: huggingface
57
+ path: flexitok/bpe_nld_Latn_8000
58
+ load_supermapping: true
59
+ - name: huggingface
60
+ path: flexitok/bpe_pol_Latn_8000
61
+ load_supermapping: true
62
+ - name: huggingface
63
+ path: flexitok/bpe_por_Latn_8000
64
+ load_supermapping: true
65
+ - name: huggingface
66
+ path: flexitok/bpe_rus_Cyrl_8000
67
+ load_supermapping: true
68
+ - name: huggingface
69
+ path: flexitok/bpe_spa_Latn_8000
70
+ load_supermapping: true
71
+ - name: huggingface
72
+ path: flexitok/bpe_swe_Latn_8000
73
+ load_supermapping: true
74
+ - name: huggingface
75
+ path: flexitok/bpe_ltr_tur_Latn_8000_v2
76
+ load_supermapping: true
77
+ - name: huggingface
78
+ path: flexitok/bpe_vie_Latn_8000
79
+ load_supermapping: true
80
+ checkpoint:
81
+ path: /fsx/craffel/lingua_logs/checkpoints/safe