craffel HF Staff committed on
Commit
94f47ec
·
verified ·
1 Parent(s): 5f132ca

Upload flexitok_subword_regularization/config.yaml with huggingface_hub

Browse files
flexitok_subword_regularization/config.yaml ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: flexitok_subword_regularization
2
+ dump_dir: /fsx/craffel/lingua_logs/flexitok_subword_regularization
3
+ seed: 777
4
+ grad_acc_steps: 8
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 100000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/flexitok/
10
+ sources:
11
+ fw_edu: 0.4
12
+ dan_Latn: 0.0216582869670702
13
+ swe_Latn: 0.0216359765418466
14
+ vie_Latn: 0.0197485510268674
15
+ hun_Latn: 0.0247194573562308
16
+ fas_Arab: 0.0205634624231076
17
+ tur_Latn: 0.0235455794841729
18
+ ces_Latn: 0.0248024455266208
19
+ arb_Arab: 0.0234323706569333
20
+ ell_Grek: 0.0233670886888026
21
+ ind_Latn: 0.0269322054593488
22
+ nld_Latn: 0.0277796326621489
23
+ pol_Latn: 0.0294120104572311
24
+ por_Latn: 0.0301413168306825
25
+ ita_Latn: 0.0324056371021865
26
+ jpn_Jpan: 0.03553104151369
27
+ fra_Latn: 0.0381835560678536
28
+ spa_Latn: 0.0387222793083669
29
+ deu_Latn: 0.0419925340453022
30
+ cmn_Hani: 0.0454067521384114
31
+ rus_Cyrl: 0.0500198157431261
32
+ batch_size: 4
33
+ seq_len: 4096
34
+ n_views: 2
35
+ seed: 42
36
+ add_bos: true
37
+ add_eos: true
38
+ load_async: true
39
+ prefetch_size: 1024
40
+ tokenizer:
41
+ name: sp
42
+ path: /fsx/craffel/lingua/tokenizers/xglm-564M-sentencepiece.bpe.model
43
+ tokenizers: null
44
+ load_supermapping: false
45
+ dropout: 0.1
46
+ seed: 42
47
+ optim:
48
+ lr: 0.001
49
+ weight_decay: 0.1
50
+ epsilon: 1.0e-08
51
+ beta1: 0.9
52
+ beta2: 0.95
53
+ clip: 1.0
54
+ scheduler: cosine
55
+ warmup: 2000
56
+ lr_min_ratio: 1.0e-06
57
+ cycle_length: 1.0
58
+ cosine_theta: 1.0
59
+ annealing_step: 1000
60
+ decay_fraction: 0.1
61
+ exp_factor: 0.5
62
+ model:
63
+ dim: 2048
64
+ n_layers: 25
65
+ head_dim: null
66
+ n_heads: 16
67
+ n_kv_heads: null
68
+ ffn_dim_multiplier: null
69
+ multiple_of: 256
70
+ norm_eps: 1.0e-05
71
+ rope_theta: 10000.0
72
+ init_base_std: null
73
+ init_std_factor: disabled
74
+ max_seqlen: 4096
75
+ seed: 42
76
+ vocab_size: 256000
77
+ weight_tying: false
78
+ sliding_window: null
79
+ use_factorized_embeddings: false
80
+ factorized_embedding_dim: 0
81
+ distributed:
82
+ dp_shard: 1
83
+ dp_replicate: 8
84
+ tp_size: 1
85
+ selective_activation_checkpointing: false
86
+ compile: true
87
+ fsdp_type: full_shard
88
+ model_dtype: bf16
89
+ float8_recipe: null
90
+ float8_filter: layers\.[0-9]+\.
91
+ matmul_allow_tf32: false
92
+ detect_anomaly: false
93
+ compile_cache_size_limit: 8
94
+ spawn_method: forkserver
95
+ env:
96
+ MKL_SERVICE_FORCE_INTEL: GNU
97
+ OMP_NUM_THREADS: '1'
98
+ MKL_NUM_THREADS: '1'
99
+ ENABLE_INTRA_NODE_COMM: '1'
100
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
101
+ NCCL_IB_TIMEOUT: '22'
102
+ NCCL_DEBUG: INFO
103
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
104
+ checkpoint:
105
+ dump:
106
+ every: 10000
107
+ keep: -1
108
+ eval:
109
+ every: 10000
110
+ keep: -1
111
+ path: /fsx/craffel/lingua_logs/flexitok_subword_regularization/checkpoints
112
+ init_ckpt_path: null
113
+ load_init_optimizer_state: false
114
+ save_init_ckpt: false
115
+ profiling:
116
+ run: true
117
+ trace_folder: profiling
118
+ mem_warmup: 0
119
+ mem_steps: 4
120
+ profile_warmup: 100
121
+ profile_steps: 4
122
+ logging:
123
+ freq: 1
124
+ acc_freq: null
125
+ wandb: null
126
+ async_eval_gpus: 8
127
+ eval:
128
+ harness:
129
+ tasks:
130
+ - hellaswag
131
+ - piqa
132
+ - arc_easy
133
+ - arc_challenge
134
+ - include_base_44_arabic
135
+ - include_base_44_chinese
136
+ - include_base_44_german
137
+ - include_base_44_greek
138
+ - include_base_44_persian
139
+ - include_base_44_french
140
+ - include_base_44_hungarian
141
+ - include_base_44_indonesian
142
+ - include_base_44_italian
143
+ - include_base_44_japanese
144
+ - include_base_44_dutch
145
+ - include_base_44_polish
146
+ - include_base_44_portuguese
147
+ - include_base_44_russian
148
+ - include_base_44_spanish
149
+ - include_base_44_turkish
150
+ - include_base_44_vietnamese
151
+ - belebele_arb_Arab
152
+ - belebele_ces_Latn
153
+ - belebele_zho_Hans
154
+ - belebele_dan_Latn
155
+ - belebele_deu_Latn
156
+ - belebele_ell_Grek
157
+ - belebele_pes_Arab
158
+ - belebele_fra_Latn
159
+ - belebele_hun_Latn
160
+ - belebele_ind_Latn
161
+ - belebele_ita_Latn
162
+ - belebele_jpn_Jpan
163
+ - belebele_nld_Latn
164
+ - belebele_pol_Latn
165
+ - belebele_por_Latn
166
+ - belebele_rus_Cyrl
167
+ - belebele_spa_Latn
168
+ - belebele_swe_Latn
169
+ - belebele_tur_Latn
170
+ - belebele_vie_Latn
171
+ - belebele_eng_Latn
172
+ - xnli_ar
173
+ - xnli_zh
174
+ - xnli_de
175
+ - xnli_el
176
+ - xnli_en
177
+ - xnli_es
178
+ - xnli_fr
179
+ - xnli_hi
180
+ - xnli_ru
181
+ - xnli_tr
182
+ - xnli_vi
183
+ generator:
184
+ max_tokens: 16384
185
+ dtype: bf16
186
+ add_bos: false