craffel HF Staff committed on
Commit
7eeb894
·
verified ·
1 Parent(s): 7fe10d3

Upload 16k_v2/config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. 16k_v2/config.yaml +275 -0
16k_v2/config.yaml ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 16k_v2
2
+ dump_dir: /fsx/craffel/lingua_logs/16k_v2
3
+ seed: 777
4
+ grad_acc_steps: 8
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 100000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/flexitok/
10
+ sources:
11
+ fw_edu: 0.4
12
+ dan_Latn: 0.0216582869670702
13
+ swe_Latn: 0.0216359765418466
14
+ vie_Latn: 0.0197485510268674
15
+ hun_Latn: 0.0247194573562308
16
+ fas_Arab: 0.0205634624231076
17
+ tur_Latn: 0.0235455794841729
18
+ ces_Latn: 0.0248024455266208
19
+ arb_Arab: 0.0234323706569333
20
+ ell_Grek: 0.0233670886888026
21
+ ind_Latn: 0.0269322054593488
22
+ nld_Latn: 0.0277796326621489
23
+ pol_Latn: 0.0294120104572311
24
+ por_Latn: 0.0301413168306825
25
+ ita_Latn: 0.0324056371021865
26
+ jpn_Jpan: 0.03553104151369
27
+ fra_Latn: 0.0381835560678536
28
+ spa_Latn: 0.0387222793083669
29
+ deu_Latn: 0.0419925340453022
30
+ cmn_Hani: 0.0454067521384114
31
+ rus_Cyrl: 0.0500198157431261
32
+ batch_size: 4
33
+ seq_len: 4096
34
+ n_views: 2
35
+ seed: 42
36
+ add_bos: true
37
+ add_eos: true
38
+ load_async: true
39
+ prefetch_size: 1024
40
+ tokenizer:
41
+ name: supertokenizer
42
+ path: meta-llama/Llama-3.2-1B
43
+ tokenizers:
44
+ - name: huggingface
45
+ path: flexitok/bpe_ltr_arb_Arab_16000_v2
46
+ load_supermapping: true
47
+ - name: huggingface
48
+ path: flexitok/bpe_ltr_ces_Latn_16000_v2
49
+ load_supermapping: true
50
+ - name: huggingface
51
+ path: flexitok/bpe_ltr_cmn_Hani_16000_v2
52
+ load_supermapping: true
53
+ - name: huggingface
54
+ path: flexitok/bpe_ltr_dan_Latn_16000_v2
55
+ load_supermapping: true
56
+ - name: huggingface
57
+ path: flexitok/bpe_ltr_deu_Latn_16000_v2
58
+ load_supermapping: true
59
+ - name: huggingface
60
+ path: flexitok/bpe_ltr_ell_Grek_16000_v2
61
+ load_supermapping: true
62
+ - name: huggingface
63
+ path: flexitok/bpe_ltr_fas_Arab_16000_v2
64
+ load_supermapping: true
65
+ - name: huggingface
66
+ path: flexitok/bpe_ltr_fra_Latn_16000_v2
67
+ load_supermapping: true
68
+ - name: huggingface
69
+ path: flexitok/bpe_ltr_fw_edu_16000_v2
70
+ load_supermapping: true
71
+ - name: huggingface
72
+ path: flexitok/bpe_ltr_hun_Latn_16000_v2
73
+ load_supermapping: true
74
+ - name: huggingface
75
+ path: flexitok/bpe_ltr_ind_Latn_16000_v2
76
+ load_supermapping: true
77
+ - name: huggingface
78
+ path: flexitok/bpe_ltr_ita_Latn_16000_v2
79
+ load_supermapping: true
80
+ - name: huggingface
81
+ path: flexitok/bpe_ltr_jpn_Jpan_16000_v2
82
+ load_supermapping: true
83
+ - name: huggingface
84
+ path: flexitok/bpe_ltr_nld_Latn_16000_v2
85
+ load_supermapping: true
86
+ - name: huggingface
87
+ path: flexitok/bpe_ltr_pol_Latn_16000_v2
88
+ load_supermapping: true
89
+ - name: huggingface
90
+ path: flexitok/bpe_ltr_por_Latn_16000_v2
91
+ load_supermapping: true
92
+ - name: huggingface
93
+ path: flexitok/bpe_ltr_rus_Cyrl_16000_v2
94
+ load_supermapping: true
95
+ - name: huggingface
96
+ path: flexitok/bpe_ltr_spa_Latn_16000_v2
97
+ load_supermapping: true
98
+ - name: huggingface
99
+ path: flexitok/bpe_ltr_swe_Latn_16000_v2
100
+ load_supermapping: true
101
+ - name: huggingface
102
+ path: flexitok/bpe_ltr_tur_Latn_16000_v2
103
+ load_supermapping: true
104
+ - name: huggingface
105
+ path: flexitok/bpe_ltr_vie_Latn_16000_v2
106
+ load_supermapping: true
107
+ load_supermapping: false
108
+ dropout: 0.0
109
+ seed: 42
110
+ superset_code_name: 16k_v2
111
+ n_words: 245153
112
+ routing:
113
+ source_to_tokenizer:
114
+ arb_Arab: flexitok/bpe_ltr_arb_Arab_16000_v2
115
+ ces_Latn: flexitok/bpe_ltr_ces_Latn_16000_v2
116
+ cmn_Hani: flexitok/bpe_ltr_cmn_Hani_16000_v2
117
+ dan_Latn: flexitok/bpe_ltr_dan_Latn_16000_v2
118
+ deu_Latn: flexitok/bpe_ltr_deu_Latn_16000_v2
119
+ ell_Grek: flexitok/bpe_ltr_ell_Grek_16000_v2
120
+ fas_Arab: flexitok/bpe_ltr_fas_Arab_16000_v2
121
+ fra_Latn: flexitok/bpe_ltr_fra_Latn_16000_v2
122
+ fw_edu: flexitok/bpe_ltr_fw_edu_16000_v2
123
+ hun_Latn: flexitok/bpe_ltr_hun_Latn_16000_v2
124
+ ind_Latn: flexitok/bpe_ltr_ind_Latn_16000_v2
125
+ ita_Latn: flexitok/bpe_ltr_ita_Latn_16000_v2
126
+ jpn_Jpan: flexitok/bpe_ltr_jpn_Jpan_16000_v2
127
+ nld_Latn: flexitok/bpe_ltr_nld_Latn_16000_v2
128
+ pol_Latn: flexitok/bpe_ltr_pol_Latn_16000_v2
129
+ por_Latn: flexitok/bpe_ltr_por_Latn_16000_v2
130
+ rus_Cyrl: flexitok/bpe_ltr_rus_Cyrl_16000_v2
131
+ spa_Latn: flexitok/bpe_ltr_spa_Latn_16000_v2
132
+ swe_Latn: flexitok/bpe_ltr_swe_Latn_16000_v2
133
+ tur_Latn: flexitok/bpe_ltr_tur_Latn_16000_v2
134
+ vie_Latn: flexitok/bpe_ltr_vie_Latn_16000_v2
135
+ task_to_tokenizer: {}
136
+ suitable_tokenizer_probability: 1.0
137
+ optim:
138
+ lr: 0.001
139
+ weight_decay: 0.1
140
+ epsilon: 1.0e-08
141
+ beta1: 0.9
142
+ beta2: 0.95
143
+ clip: 1.0
144
+ scheduler: cosine
145
+ warmup: 2000
146
+ lr_min_ratio: 1.0e-06
147
+ cycle_length: 1.0
148
+ cosine_theta: 1.0
149
+ annealing_step: 1000
150
+ decay_fraction: 0.1
151
+ exp_factor: 0.5
152
+ model:
153
+ dim: 2048
154
+ n_layers: 25
155
+ head_dim: null
156
+ n_heads: 16
157
+ n_kv_heads: null
158
+ ffn_dim_multiplier: null
159
+ multiple_of: 256
160
+ norm_eps: 1.0e-05
161
+ rope_theta: 10000.0
162
+ init_base_std: null
163
+ init_std_factor: disabled
164
+ max_seqlen: 4096
165
+ seed: 42
166
+ vocab_size: 245153
167
+ weight_tying: false
168
+ sliding_window: null
169
+ use_factorized_embeddings: false
170
+ factorized_embedding_dim: 0
171
+ distributed:
172
+ dp_shard: 1
173
+ dp_replicate: 8
174
+ tp_size: 1
175
+ selective_activation_checkpointing: false
176
+ compile: true
177
+ fsdp_type: full_shard
178
+ model_dtype: bf16
179
+ float8_recipe: null
180
+ float8_filter: layers\.[0-9]+\.
181
+ matmul_allow_tf32: false
182
+ detect_anomaly: false
183
+ compile_cache_size_limit: 8
184
+ spawn_method: forkserver
185
+ env:
186
+ MKL_SERVICE_FORCE_INTEL: GNU
187
+ OMP_NUM_THREADS: '1'
188
+ MKL_NUM_THREADS: '1'
189
+ ENABLE_INTRA_NODE_COMM: '1'
190
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
191
+ NCCL_IB_TIMEOUT: '22'
192
+ NCCL_DEBUG: INFO
193
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
194
+ checkpoint:
195
+ dump:
196
+ every: 10000
197
+ keep: -1
198
+ eval:
199
+ every: 10000
200
+ keep: -1
201
+ path: /fsx/craffel/lingua_logs/checkpoints/16k_v2
202
+ init_ckpt_path: null
203
+ load_init_optimizer_state: false
204
+ save_init_ckpt: false
205
+ profiling:
206
+ run: true
207
+ trace_folder: profiling
208
+ mem_warmup: 0
209
+ mem_steps: 4
210
+ profile_warmup: 100
211
+ profile_steps: 4
212
+ logging:
213
+ freq: 1
214
+ acc_freq: null
215
+ wandb: null
216
+ async_eval_gpus: 8
217
+ eval:
218
+ harness:
219
+ tasks:
220
+ - hellaswag
221
+ - piqa
222
+ - arc_easy
223
+ - arc_challenge
224
+ - include_base_44_arabic
225
+ - include_base_44_chinese
226
+ - include_base_44_german
227
+ - include_base_44_greek
228
+ - include_base_44_persian
229
+ - include_base_44_french
230
+ - include_base_44_hungarian
231
+ - include_base_44_indonesian
232
+ - include_base_44_italian
233
+ - include_base_44_japanese
234
+ - include_base_44_dutch
235
+ - include_base_44_polish
236
+ - include_base_44_portuguese
237
+ - include_base_44_russian
238
+ - include_base_44_spanish
239
+ - include_base_44_turkish
240
+ - include_base_44_vietnamese
241
+ - belebele_arb_Arab
242
+ - belebele_ces_Latn
243
+ - belebele_zho_Hans
244
+ - belebele_dan_Latn
245
+ - belebele_deu_Latn
246
+ - belebele_ell_Grek
247
+ - belebele_pes_Arab
248
+ - belebele_fra_Latn
249
+ - belebele_hun_Latn
250
+ - belebele_ind_Latn
251
+ - belebele_ita_Latn
252
+ - belebele_jpn_Jpan
253
+ - belebele_nld_Latn
254
+ - belebele_pol_Latn
255
+ - belebele_por_Latn
256
+ - belebele_rus_Cyrl
257
+ - belebele_spa_Latn
258
+ - belebele_swe_Latn
259
+ - belebele_tur_Latn
260
+ - belebele_vie_Latn
261
+ - belebele_eng_Latn
262
+ - xnli_ar
263
+ - xnli_zh
264
+ - xnli_de
265
+ - xnli_el
266
+ - xnli_en
267
+ - xnli_es
268
+ - xnli_fr
269
+ - xnli_ru
270
+ - xnli_tr
271
+ - xnli_vi
272
+ generator:
273
+ max_tokens: 16384
274
+ dtype: bf16
275
+ add_bos: false