gsaltintas committed on
Commit
4d2a76c
·
verified ·
1 Parent(s): f1d064e

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +51 -0
config.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Config for super vocab built from tokenizers: flexitok/bpe_script_Arab_16000, flexitok/bpe_script_CmJp_16000, flexitok/bpe_ltr_ell_Grek_8000_v2, flexitok/bpe_ltr_fw_edu_32000_v2, flexitok/bpe_ltr_hun_Latn_8000_v2, flexitok/bpe_ltr_rus_Cyrl_16000_v2, flexitok/bpe_ltr_tur_Latn_8000_v2, flexitok/bpe_script_Germ_32000, flexitok/bpe_script_Roma_32000, flexitok/bpe_script_SEAS_16000, flexitok/bpe_script_Slav_16000
3
+ ## Training superset tokenizer with individual tokenizers trained on Fineweb-2-hq
4
+ # use with apps/main/configs/flexitok/llama_1b_base.yaml
5
+ ## TODO: add router details and make sure the name is correct
6
+ model:
7
+ vocab_size: 163711
8
+ name: script_1
9
+ dump_dir: /fsx/craffel/lingua_logs/script_1
10
+ data:
11
+ tokenizer:
12
+ name: supertokenizer
13
+ seed: 42
14
+ superset_code_name: script_1
15
+ n_words: 163711
16
+ tokenizers:
17
+ - name: huggingface
18
+ path: flexitok/bpe_script_Arab_16000
19
+ load_supermapping: true
20
+ - name: huggingface
21
+ path: flexitok/bpe_script_CmJp_16000
22
+ load_supermapping: true
23
+ - name: huggingface
24
+ path: flexitok/bpe_ltr_ell_Grek_8000_v2
25
+ load_supermapping: true
26
+ - name: huggingface
27
+ path: flexitok/bpe_ltr_fw_edu_32000_v2
28
+ load_supermapping: true
29
+ - name: huggingface
30
+ path: flexitok/bpe_ltr_hun_Latn_8000_v2
31
+ load_supermapping: true
32
+ - name: huggingface
33
+ path: flexitok/bpe_ltr_rus_Cyrl_16000_v2
34
+ load_supermapping: true
35
+ - name: huggingface
36
+ path: flexitok/bpe_ltr_tur_Latn_8000_v2
37
+ load_supermapping: true
38
+ - name: huggingface
39
+ path: flexitok/bpe_script_Germ_32000
40
+ load_supermapping: true
41
+ - name: huggingface
42
+ path: flexitok/bpe_script_Roma_32000
43
+ load_supermapping: true
44
+ - name: huggingface
45
+ path: flexitok/bpe_script_SEAS_16000
46
+ load_supermapping: true
47
+ - name: huggingface
48
+ path: flexitok/bpe_script_Slav_16000
49
+ load_supermapping: true
50
+ checkpoint:
51
+ path: /fsx/craffel/lingua_logs/checkpoints/script_1