zuol committed on
Commit
eea9054
·
verified ·
1 Parent(s): 7aecd3d

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitattributes +4 -0
  2. config.yaml +423 -0
  3. model.pt +3 -0
  4. optim.pt +3 -0
  5. train.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ config.yaml filter=lfs diff=lfs merge=lfs -text
37
+ model.pt filter=lfs diff=lfs merge=lfs -text
38
+ optim.pt filter=lfs diff=lfs merge=lfs -text
39
+ train.pt filter=lfs diff=lfs merge=lfs -text
config.yaml ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-300M
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 1536
7
+ n_heads: 8
8
+ n_kv_heads: 4
9
+ clip_qkv: null
10
+ n_layers: 8
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ flash_attention: true
21
+ attention_dropout: 0.0
22
+ multi_query_attention: null
23
+ attention_layer_norm: false
24
+ residual_dropout: 0.0
25
+ embedding_dropout: 0.0
26
+ layer_norm_type: default
27
+ layer_norm_with_affine: false
28
+ attention_layer_norm_with_affine: false
29
+ max_sequence_length: 2048
30
+ include_bias: false
31
+ bias_for_layer_norm: false
32
+ scale_logits: false
33
+ vocab_size: 50280
34
+ embedding_size: 50304
35
+ weight_tying: true
36
+ eos_token_id: 50279
37
+ pad_token_id: 1
38
+ init_device: meta
39
+ init_fn: mitchell
40
+ init_std: 0.02
41
+ init_cutoff_factor: null
42
+ precision: amp_bf16
43
+ use_moe: false
44
+ moe_num_experts: 6
45
+ moe_top_k: 2
46
+ use_mod: false
47
+ mod_capacity_factor: 0.125
48
+ mod_every: 2
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0005
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ no_decay_norm_and_bias: null
57
+ decay_norm_and_bias: false
58
+ decay_embeddings: false
59
+ metrics_log_interval: 10
60
+ scheduler:
61
+ name: cosine_with_warmup
62
+ units: steps
63
+ t_warmup: 2000
64
+ t_max: null
65
+ alpha_f: 0.1
66
+ grad_clip_warmup_steps: null
67
+ grad_clip_warmup_factor: null
68
+ data:
69
+ paths:
70
+ - ../tokenized_data/text/part-0-00000.npy
71
+ - ../tokenized_data/text/part-1-00000.npy
72
+ - ../tokenized_data/text/part-2-00000.npy
73
+ - ../tokenized_data/text/part-3-00000.npy
74
+ datasets: null
75
+ label_mask_paths: null
76
+ pad_direction: right
77
+ generate_attention_mask: false
78
+ num_workers: 0
79
+ drop_last: true
80
+ pin_memory: true
81
+ prefetch_factor: 32
82
+ persistent_workers: true
83
+ timeout: 0
84
+ seed: null
85
+ restore_dataloader: true
86
+ fast_forward_batches: null
87
+ evaluators:
88
+ - label: v3-small-ppl-validation
89
+ type: lm
90
+ data:
91
+ paths: null
92
+ datasets:
93
+ v3-small-c4_en-validation:
94
+ - ./validation/v3/c4_en/part-0-00000.npy
95
+ v3-small-dolma_books-validation:
96
+ - ./validation/v3/dolma_books/part-0-00000.npy
97
+ v3-small-dolma_common-crawl-validation:
98
+ - ./validation/v3/dolma_common-crawl/part-0-00000.npy
99
+ v3-small-dolma_pes2o-validation:
100
+ - ./validation/v3/dolma_pes2o/part-0-00000.npy
101
+ v3-small-dolma_reddit-validation:
102
+ - ./validation/v3/dolma_reddit/part-0-00000.npy
103
+ v3-small-dolma_stack-validation:
104
+ - ./validation/v3/dolma_stack/part-0-00000.npy
105
+ v3-small-dolma_wiki-validation:
106
+ - ./validation/v3/dolma_wiki/part-0-00000.npy
107
+ v3-small-ice-validation:
108
+ - ./validation/v3/ice/part-0-00000.npy
109
+ v3-small-m2d2_s2orc-validation:
110
+ - ./validation/v3/m2d2_s2orc/part-0-00000.npy
111
+ v3-small-pile-validation:
112
+ - ./validation/v3/pile/part-0-00000.npy
113
+ v3-small-wikitext_103-validation:
114
+ - ./validation/v3/wikitext_103/part-0-00000.npy
115
+ label_mask_paths: null
116
+ pad_direction: right
117
+ generate_attention_mask: false
118
+ num_workers: 0
119
+ drop_last: true
120
+ pin_memory: false
121
+ prefetch_factor: null
122
+ persistent_workers: false
123
+ timeout: 0
124
+ seed: null
125
+ device_eval_batch_size: null
126
+ subset_num_batches: null
127
+ - label: v2-small-ppl-validation
128
+ type: lm
129
+ data:
130
+ paths: null
131
+ datasets:
132
+ v2-small-4chan-validation:
133
+ - ./validation/v2/4chan/val.npy
134
+ v2-small-c4_100_domains-validation:
135
+ - ./validation/v2/c4_100_domains/val.npy
136
+ v2-small-c4_en-validation:
137
+ - ./validation/v2/c4_en/val.npy
138
+ v2-small-gab-validation:
139
+ - ./validation/v2/gab/val.npy
140
+ v2-small-ice-validation:
141
+ - ./validation/v2/ice/val.npy
142
+ v2-small-m2d2_s2orc-validation:
143
+ - ./validation/v2/m2d2_s2orc/val.npy
144
+ v2-small-m2d2_wiki-validation:
145
+ - ./validation/v2/m2d2_wiki/val.npy
146
+ v2-small-manosphere-validation:
147
+ - ./validation/v2/manosphere/val.npy
148
+ v2-small-mc4_en-validation:
149
+ - ./validation/v2/mc4_en/val.npy
150
+ v2-small-pile-validation:
151
+ - ./validation/v2/pile/val.npy
152
+ v2-small-ptb-validation:
153
+ - ./validation/v2/ptb/val.npy
154
+ v2-small-twitterAEE-validation:
155
+ - ./validation/v2/twitterAEE/val.npy
156
+ v2-small-wikitext_103-validation:
157
+ - ./validation/v2/wikitext_103/val.npy
158
+ label_mask_paths: null
159
+ pad_direction: right
160
+ generate_attention_mask: false
161
+ num_workers: 0
162
+ drop_last: true
163
+ pin_memory: false
164
+ prefetch_factor: null
165
+ persistent_workers: false
166
+ timeout: 0
167
+ seed: null
168
+ device_eval_batch_size: null
169
+ subset_num_batches: null
170
+ - label: piqa
171
+ type: downstream
172
+ data:
173
+ paths: null
174
+ datasets: null
175
+ label_mask_paths: null
176
+ pad_direction: right
177
+ generate_attention_mask: false
178
+ num_workers: 0
179
+ drop_last: false
180
+ pin_memory: false
181
+ prefetch_factor: null
182
+ persistent_workers: false
183
+ timeout: 0
184
+ seed: null
185
+ device_eval_batch_size: null
186
+ subset_num_batches: null
187
+ - label: hellaswag
188
+ type: downstream
189
+ data:
190
+ paths: null
191
+ datasets: null
192
+ label_mask_paths: null
193
+ pad_direction: right
194
+ generate_attention_mask: false
195
+ num_workers: 0
196
+ drop_last: false
197
+ pin_memory: false
198
+ prefetch_factor: null
199
+ persistent_workers: false
200
+ timeout: 0
201
+ seed: null
202
+ device_eval_batch_size: null
203
+ subset_num_batches: null
204
+ - label: winogrande
205
+ type: downstream
206
+ data:
207
+ paths: null
208
+ datasets: null
209
+ label_mask_paths: null
210
+ pad_direction: right
211
+ generate_attention_mask: false
212
+ num_workers: 0
213
+ drop_last: false
214
+ pin_memory: false
215
+ prefetch_factor: null
216
+ persistent_workers: false
217
+ timeout: 0
218
+ seed: null
219
+ device_eval_batch_size: null
220
+ subset_num_batches: null
221
+ - label: openbook_qa
222
+ type: downstream
223
+ data:
224
+ paths: null
225
+ datasets: null
226
+ label_mask_paths: null
227
+ pad_direction: right
228
+ generate_attention_mask: false
229
+ num_workers: 0
230
+ drop_last: false
231
+ pin_memory: false
232
+ prefetch_factor: null
233
+ persistent_workers: false
234
+ timeout: 0
235
+ seed: null
236
+ device_eval_batch_size: null
237
+ subset_num_batches: null
238
+ - label: sciq
239
+ type: downstream
240
+ data:
241
+ paths: null
242
+ datasets: null
243
+ label_mask_paths: null
244
+ pad_direction: right
245
+ generate_attention_mask: false
246
+ num_workers: 0
247
+ drop_last: false
248
+ pin_memory: false
249
+ prefetch_factor: null
250
+ persistent_workers: false
251
+ timeout: 0
252
+ seed: null
253
+ device_eval_batch_size: null
254
+ subset_num_batches: null
255
+ - label: arc_easy
256
+ type: downstream
257
+ data:
258
+ paths: null
259
+ datasets: null
260
+ label_mask_paths: null
261
+ pad_direction: right
262
+ generate_attention_mask: false
263
+ num_workers: 0
264
+ drop_last: false
265
+ pin_memory: false
266
+ prefetch_factor: null
267
+ persistent_workers: false
268
+ timeout: 0
269
+ seed: null
270
+ device_eval_batch_size: null
271
+ subset_num_batches: null
272
+ - label: copa
273
+ type: downstream
274
+ data:
275
+ paths: null
276
+ datasets: null
277
+ label_mask_paths: null
278
+ pad_direction: right
279
+ generate_attention_mask: false
280
+ num_workers: 0
281
+ drop_last: false
282
+ pin_memory: false
283
+ prefetch_factor: null
284
+ persistent_workers: false
285
+ timeout: 0
286
+ seed: null
287
+ device_eval_batch_size: null
288
+ subset_num_batches: null
289
+ - label: rte
290
+ type: downstream
291
+ data:
292
+ paths: null
293
+ datasets: null
294
+ label_mask_paths: null
295
+ pad_direction: right
296
+ generate_attention_mask: false
297
+ num_workers: 0
298
+ drop_last: false
299
+ pin_memory: false
300
+ prefetch_factor: null
301
+ persistent_workers: false
302
+ timeout: 0
303
+ seed: null
304
+ device_eval_batch_size: null
305
+ subset_num_batches: null
306
+ - label: commitment_bank
307
+ type: downstream
308
+ data:
309
+ paths: null
310
+ datasets: null
311
+ label_mask_paths: null
312
+ pad_direction: right
313
+ generate_attention_mask: false
314
+ num_workers: 0
315
+ drop_last: false
316
+ pin_memory: false
317
+ prefetch_factor: null
318
+ persistent_workers: false
319
+ timeout: 0
320
+ seed: null
321
+ device_eval_batch_size: null
322
+ subset_num_batches: null
323
+ - label: mrpc
324
+ type: downstream
325
+ data:
326
+ paths: null
327
+ datasets: null
328
+ label_mask_paths: null
329
+ pad_direction: right
330
+ generate_attention_mask: false
331
+ num_workers: 0
332
+ drop_last: false
333
+ pin_memory: false
334
+ prefetch_factor: null
335
+ persistent_workers: false
336
+ timeout: 0
337
+ seed: null
338
+ device_eval_batch_size: null
339
+ subset_num_batches: null
340
+ - label: sst2
341
+ type: downstream
342
+ data:
343
+ paths: null
344
+ datasets: null
345
+ label_mask_paths: null
346
+ pad_direction: right
347
+ generate_attention_mask: false
348
+ num_workers: 0
349
+ drop_last: false
350
+ pin_memory: false
351
+ prefetch_factor: null
352
+ persistent_workers: false
353
+ timeout: 0
354
+ seed: null
355
+ device_eval_batch_size: null
356
+ subset_num_batches: null
357
+ eval_interval: 1000
358
+ tokenizer:
359
+ identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
360
+ truncate_direction: right
361
+ save_folder: ./checkpoints/olmo-tiny/OLMo-300M
362
+ remote_save_folder: null
363
+ canceled_check_interval: 50
364
+ save_interval: 2000
365
+ save_interval_unsharded: 10000
366
+ save_interval_ephemeral: null
367
+ save_num_checkpoints_to_keep: 9
368
+ save_num_unsharded_checkpoints_to_keep: -1
369
+ save_overwrite: false
370
+ force_save_unsharded: false
371
+ no_pre_train_checkpoint: false
372
+ load_path: null
373
+ load_path_sharded_checkpointer: null
374
+ reset_optimizer_state: false
375
+ reset_trainer_state: false
376
+ sharded_checkpointer: torch_legacy
377
+ new_style_checkpoints: null
378
+ max_duration: 7393280
379
+ global_train_batch_size: 20
380
+ device_train_batch_size: 20
381
+ device_train_microbatch_size: 10
382
+ device_eval_batch_size: 10
383
+ eval_subset_num_batches: -1
384
+ eval_on_load: false
385
+ device_train_grad_accum: 2
386
+ max_grad_norm: 1.0
387
+ max_grad_norm_ratio: null
388
+ precision: amp_bf16
389
+ use_msamp: false
390
+ wandb:
391
+ project: olmo-300m
392
+ entity: doosen
393
+ group: null
394
+ name: OLMo-300M
395
+ tags:
396
+ - watching
397
+ log_artifacts: false
398
+ rank_zero_only: true
399
+ log_interval: 1
400
+ speed_monitor:
401
+ window_size: 20
402
+ gpu_flops_available: null
403
+ console_log_interval: 1
404
+ compile:
405
+ mode: null
406
+ fullgraph: false
407
+ backend: inductor
408
+ fsdp:
409
+ use_orig_params: true
410
+ sharding_strategy: FULL_SHARD
411
+ wrapping_strategy: null
412
+ precision: mixed
413
+ softmax_auxiliary_loss: false
414
+ time_limit: 171000.0
415
+ extra_steps_after_cancel: 10
416
+ early_stopping_factor: null
417
+ save_data_indices: true
418
+ python_profiling: false
419
+ torch_profiling: false
420
+ stop_at: null
421
+ stop_after: null
422
+ activation_checkpointing: null
423
+ fused_loss: null
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae926b71ee1d1efee8833bbb2d14ddd29320f508084f5f37a51aa1279ac679d2
3
+ size 1441545641
optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ec769e81a3790d1c582c2684f71743447e476d111bc43932cf087a34ab4a22
3
+ size 2883088543
train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca89481ad2adaee0150f46db04032554fcec69730dbc4473559219e6f3cefd96
3
+ size 14604