DatPySci commited on
Commit
cd574bb
·
verified ·
1 Parent(s): 16956da

upload pretrain 150M

Browse files
OLMo-150M-different-lr/OLMo-150M-constant-1e3/step60000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant-1e3
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.001
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant-1e3
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: true
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant-1e3
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M-different-lr/OLMo-150M-constant-1e3/step60000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a636a65452611e9847ab079f6bc3e85281cbbcbe044e9122825185dbf791114
3
+ size 649612628
OLMo-150M-different-lr/OLMo-150M-constant-1e3/step60000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10f2ae64a3467b4823c30b3e4679510ffc0c315577f80faa4188ff374dd3edef
3
+ size 1299223890
OLMo-150M-different-lr/OLMo-150M-constant-1e3/step60000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c00cd5c4ec998cdfd012019e3cfe67a081d3ea1aa330cdc84057ce567c02f2a2
3
+ size 15244
OLMo-150M-different-lr/OLMo-150M-constant-3e3/step60000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant-3e3
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant-3e3
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: true
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant-3e3
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M-different-lr/OLMo-150M-constant-3e3/step60000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1292c5a0153222b4582901ed1ae66794c9e01a60e962ba0f442111a203582c74
3
+ size 649612628
OLMo-150M-different-lr/OLMo-150M-constant-3e3/step60000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab8560d8d81457bdb8e5176b7ca7d5b62ad2ce59080133d86e84f138de500fc4
3
+ size 1299223890
OLMo-150M-different-lr/OLMo-150M-constant-3e3/step60000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80fcab166fdb3a4eb6d8dd5f789f8621800a8012c3cb7a1049f695051374eabf
3
+ size 15244
OLMo-150M-different-lr/OLMo-150M-constant-6e4/step60000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant-6e4
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0006
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant-6e4
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: true
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant-6e4
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M-different-lr/OLMo-150M-constant-6e4/step60000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7390872cc7f6cbfbdd2274322dad24adefa8533e23b6941fa38f30890b43c846
3
+ size 649612628
OLMo-150M-different-lr/OLMo-150M-constant-6e4/step60000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e91982593a34e6704b074464037c3421b85ef718ff7426608b30dcb46e93ef
3
+ size 1299223890
OLMo-150M-different-lr/OLMo-150M-constant-6e4/step60000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d843cb275cf4f8170353213179d8552c9cd967d6a572f5e1d621040f10f100a0
3
+ size 15244