craffel HF Staff committed on
Commit
b98b098
·
verified ·
1 Parent(s): a3efb3d

Upload flexitok_superset_albert_w_xglm/config.yaml with huggingface_hub

Browse files
flexitok_superset_albert_w_xglm/config.yaml ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: flexitok_superset_albert_w_xglm
2
+ dump_dir: /fsx/craffel/lingua_logs/flexitok_superset_albert_w_xglm
3
+ seed: 777
4
+ grad_acc_steps: 16
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 100000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/flexitok/
10
+ sources:
11
+ fw_edu: 0.4
12
+ dan_Latn: 0.0216582869670702
13
+ swe_Latn: 0.0216359765418466
14
+ vie_Latn: 0.0197485510268674
15
+ hun_Latn: 0.0247194573562308
16
+ fas_Arab: 0.0205634624231076
17
+ tur_Latn: 0.0235455794841729
18
+ ces_Latn: 0.0248024455266208
19
+ arb_Arab: 0.0234323706569333
20
+ ell_Grek: 0.0233670886888026
21
+ ind_Latn: 0.0269322054593488
22
+ nld_Latn: 0.0277796326621489
23
+ pol_Latn: 0.0294120104572311
24
+ por_Latn: 0.0301413168306825
25
+ ita_Latn: 0.0324056371021865
26
+ jpn_Jpan: 0.03553104151369
27
+ fra_Latn: 0.0381835560678536
28
+ spa_Latn: 0.0387222793083669
29
+ deu_Latn: 0.0419925340453022
30
+ cmn_Hani: 0.0454067521384114
31
+ rus_Cyrl: 0.0500198157431261
32
+ batch_size: 2
33
+ seq_len: 4096
34
+ n_views: 2
35
+ seed: 42
36
+ add_bos: true
37
+ add_eos: true
38
+ load_async: true
39
+ prefetch_size: 1024
40
+ tokenizer:
41
+ name: supertokenizer
42
+ path: meta-llama/Llama-3.2-1B
43
+ tokenizers:
44
+ - name: huggingface
45
+ path: flexitok/reindexed-CohereLabs-aya-expanse-8b
46
+ load_supermapping: false
47
+ - name: huggingface
48
+ path: flexitok/reindexed-bigscience-bloom
49
+ load_supermapping: false
50
+ - name: huggingface
51
+ path: flexitok/reindexed-common-pile-comma-v0.1
52
+ load_supermapping: false
53
+ - name: huggingface
54
+ path: flexitok/reindexed-google-gemma-2-2b
55
+ load_supermapping: false
56
+ - name: huggingface
57
+ path: flexitok/reindexed-gpt2
58
+ load_supermapping: false
59
+ - name: huggingface
60
+ path: flexitok/reindexed-meta-llama-Llama-3.2-1B
61
+ load_supermapping: false
62
+ - name: huggingface
63
+ path: flexitok/reindexed-google-bert-bert-base-multilingual-cased
64
+ load_supermapping: false
65
+ - name: huggingface
66
+ path: flexitok/reindexed-microsoft-Phi-3-mini-4k-instruct
67
+ load_supermapping: false
68
+ - name: huggingface
69
+ path: flexitok/reindexed-Qwen-Qwen3-8B
70
+ load_supermapping: false
71
+ - name: huggingface
72
+ path: facebook/xglm-564M
73
+ load_supermapping: true
74
+ - name: huggingface
75
+ path: google/byt5-small
76
+ load_supermapping: true
77
+ - name: tiktoken
78
+ path: gpt-4o
79
+ load_supermapping: true
80
+ - name: tekken
81
+ path: tekken
82
+ load_supermapping: true
83
+ - name: tokenmonster
84
+ path: englishcode-32000-consistent-v1
85
+ load_supermapping: true
86
+ load_supermapping: false
87
+ dropout: 0.0
88
+ seed: 42
89
+ optim:
90
+ lr: 0.001
91
+ weight_decay: 0.1
92
+ epsilon: 1.0e-08
93
+ beta1: 0.9
94
+ beta2: 0.95
95
+ clip: 1.0
96
+ scheduler: cosine
97
+ warmup: 2000
98
+ lr_min_ratio: 1.0e-06
99
+ cycle_length: 1.0
100
+ cosine_theta: 1.0
101
+ annealing_step: 1000
102
+ decay_fraction: 0.1
103
+ exp_factor: 0.5
104
+ model:
105
+ dim: 2048
106
+ n_layers: 25
107
+ head_dim: null
108
+ n_heads: 16
109
+ n_kv_heads: null
110
+ ffn_dim_multiplier: null
111
+ multiple_of: 256
112
+ norm_eps: 1.0e-05
113
+ rope_theta: 10000.0
114
+ init_base_std: null
115
+ init_std_factor: disabled
116
+ max_seqlen: 4096
117
+ seed: 42
118
+ vocab_size: 851586
119
+ weight_tying: false
120
+ sliding_window: null
121
+ use_factorized_embeddings: true
122
+ factorized_embedding_dim: 512
123
+ distributed:
124
+ dp_shard: 1
125
+ dp_replicate: 8
126
+ tp_size: 1
127
+ selective_activation_checkpointing: false
128
+ compile: true
129
+ fsdp_type: full_shard
130
+ model_dtype: bf16
131
+ float8_recipe: null
132
+ float8_filter: layers\.[0-9]+\.
133
+ matmul_allow_tf32: false
134
+ detect_anomaly: false
135
+ compile_cache_size_limit: 8
136
+ spawn_method: forkserver
137
+ env:
138
+ MKL_SERVICE_FORCE_INTEL: GNU
139
+ OMP_NUM_THREADS: '1'
140
+ MKL_NUM_THREADS: '1'
141
+ ENABLE_INTRA_NODE_COMM: '1'
142
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
143
+ NCCL_IB_TIMEOUT: '22'
144
+ NCCL_DEBUG: INFO
145
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
146
+ checkpoint:
147
+ dump:
148
+ every: 10000
149
+ keep: -1
150
+ eval:
151
+ every: 10000
152
+ keep: -1
153
+ path: /fsx/craffel/lingua_logs/flexitok_superset_albert_w_xglm/checkpoints
154
+ init_ckpt_path: null
155
+ load_init_optimizer_state: false
156
+ save_init_ckpt: false
157
+ profiling:
158
+ run: true
159
+ trace_folder: profiling
160
+ mem_warmup: 0
161
+ mem_steps: 4
162
+ profile_warmup: 100
163
+ profile_steps: 4
164
+ logging:
165
+ freq: 1
166
+ acc_freq: null
167
+ wandb: null
168
+ async_eval_gpus: 8
169
+ eval:
170
+ harness:
171
+ tasks:
172
+ - hellaswag
173
+ - piqa
174
+ - arc_easy
175
+ - arc_challenge
176
+ - include_base_44_arabic
177
+ - include_base_44_chinese
178
+ - include_base_44_german
179
+ - include_base_44_greek
180
+ - include_base_44_persian
181
+ - include_base_44_french
182
+ - include_base_44_hungarian
183
+ - include_base_44_indonesian
184
+ - include_base_44_italian
185
+ - include_base_44_japanese
186
+ - include_base_44_dutch
187
+ - include_base_44_polish
188
+ - include_base_44_portuguese
189
+ - include_base_44_russian
190
+ - include_base_44_spanish
191
+ - include_base_44_turkish
192
+ - include_base_44_vietnamese
193
+ - belebele_arb_Arab
194
+ - belebele_ces_Latn
195
+ - belebele_zho_Hans
196
+ - belebele_dan_Latn
197
+ - belebele_deu_Latn
198
+ - belebele_ell_Grek
199
+ - belebele_pes_Arab
200
+ - belebele_fra_Latn
201
+ - belebele_hun_Latn
202
+ - belebele_ind_Latn
203
+ - belebele_ita_Latn
204
+ - belebele_jpn_Jpan
205
+ - belebele_nld_Latn
206
+ - belebele_pol_Latn
207
+ - belebele_por_Latn
208
+ - belebele_rus_Cyrl
209
+ - belebele_spa_Latn
210
+ - belebele_swe_Latn
211
+ - belebele_tur_Latn
212
+ - belebele_vie_Latn
213
+ - belebele_eng_Latn
214
+ - xnli_ar
215
+ - xnli_zh
216
+ - xnli_de
217
+ - xnli_el
218
+ - xnli_en
219
+ - xnli_es
220
+ - xnli_fr
221
+ - xnli_hi
222
+ - xnli_ru
223
+ - xnli_tr
224
+ - xnli_vi
225
+ generator:
226
+ max_tokens: 16384
227
+ dtype: bf16
228
+ add_bos: false