DatPySci commited on
Commit
0a99e39
·
verified ·
1 Parent(s): 6dd57a1

upload pretrain 150M

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +7 -0
  2. OLMo-150M/OLMo-150M-constant/config.yaml +263 -0
  3. OLMo-150M/OLMo-150M-constant/data-indices/rank0.tsv.gz +3 -0
  4. OLMo-150M/OLMo-150M-constant/data-indices/rank1.tsv.gz +3 -0
  5. OLMo-150M/OLMo-150M-constant/data-indices/rank2.tsv.gz +3 -0
  6. OLMo-150M/OLMo-150M-constant/data-indices/rank3.tsv.gz +3 -0
  7. OLMo-150M/OLMo-150M-constant/step0-unsharded/config.yaml +263 -0
  8. OLMo-150M/OLMo-150M-constant/step0-unsharded/model.pt +3 -0
  9. OLMo-150M/OLMo-150M-constant/step0-unsharded/optim.pt +3 -0
  10. OLMo-150M/OLMo-150M-constant/step0-unsharded/train.pt +3 -0
  11. OLMo-150M/OLMo-150M-constant/step12000-unsharded/config.yaml +263 -0
  12. OLMo-150M/OLMo-150M-constant/step12000-unsharded/model.pt +3 -0
  13. OLMo-150M/OLMo-150M-constant/step12000-unsharded/optim.pt +3 -0
  14. OLMo-150M/OLMo-150M-constant/step12000-unsharded/train.pt +3 -0
  15. OLMo-150M/OLMo-150M-constant/step15000-unsharded/config.yaml +263 -0
  16. OLMo-150M/OLMo-150M-constant/step15000-unsharded/model.pt +3 -0
  17. OLMo-150M/OLMo-150M-constant/step15000-unsharded/optim.pt +3 -0
  18. OLMo-150M/OLMo-150M-constant/step15000-unsharded/train.pt +3 -0
  19. OLMo-150M/OLMo-150M-constant/step18000-unsharded/config.yaml +263 -0
  20. OLMo-150M/OLMo-150M-constant/step18000-unsharded/model.pt +3 -0
  21. OLMo-150M/OLMo-150M-constant/step18000-unsharded/optim.pt +3 -0
  22. OLMo-150M/OLMo-150M-constant/step18000-unsharded/train.pt +3 -0
  23. OLMo-150M/OLMo-150M-constant/step21000-unsharded/config.yaml +263 -0
  24. OLMo-150M/OLMo-150M-constant/step21000-unsharded/model.pt +3 -0
  25. OLMo-150M/OLMo-150M-constant/step21000-unsharded/optim.pt +3 -0
  26. OLMo-150M/OLMo-150M-constant/step21000-unsharded/train.pt +3 -0
  27. OLMo-150M/OLMo-150M-constant/step24000-unsharded/config.yaml +263 -0
  28. OLMo-150M/OLMo-150M-constant/step24000-unsharded/model.pt +3 -0
  29. OLMo-150M/OLMo-150M-constant/step24000-unsharded/optim.pt +3 -0
  30. OLMo-150M/OLMo-150M-constant/step24000-unsharded/train.pt +3 -0
  31. OLMo-150M/OLMo-150M-constant/step27000-unsharded/config.yaml +263 -0
  32. OLMo-150M/OLMo-150M-constant/step27000-unsharded/model.pt +3 -0
  33. OLMo-150M/OLMo-150M-constant/step27000-unsharded/optim.pt +3 -0
  34. OLMo-150M/OLMo-150M-constant/step27000-unsharded/train.pt +3 -0
  35. OLMo-150M/OLMo-150M-constant/step3000-unsharded/config.yaml +263 -0
  36. OLMo-150M/OLMo-150M-constant/step3000-unsharded/model.pt +3 -0
  37. OLMo-150M/OLMo-150M-constant/step3000-unsharded/optim.pt +3 -0
  38. OLMo-150M/OLMo-150M-constant/step3000-unsharded/train.pt +3 -0
  39. OLMo-150M/OLMo-150M-constant/step30000-unsharded/config.yaml +263 -0
  40. OLMo-150M/OLMo-150M-constant/step30000-unsharded/model.pt +3 -0
  41. OLMo-150M/OLMo-150M-constant/step30000-unsharded/optim.pt +3 -0
  42. OLMo-150M/OLMo-150M-constant/step30000-unsharded/train.pt +3 -0
  43. OLMo-150M/OLMo-150M-constant/step33000-unsharded/config.yaml +263 -0
  44. OLMo-150M/OLMo-150M-constant/step33000-unsharded/model.pt +3 -0
  45. OLMo-150M/OLMo-150M-constant/step33000-unsharded/optim.pt +3 -0
  46. OLMo-150M/OLMo-150M-constant/step33000-unsharded/train.pt +3 -0
  47. OLMo-150M/OLMo-150M-constant/step36000-unsharded/config.yaml +263 -0
  48. OLMo-150M/OLMo-150M-constant/step36000-unsharded/model.pt +3 -0
  49. OLMo-150M/OLMo-150M-constant/step36000-unsharded/optim.pt +3 -0
  50. OLMo-150M/OLMo-150M-constant/step36000-unsharded/train.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ OLMo-150M/OLMo-150M-constant/wandb/wandb/run-20260430_192520-m8kf4lt8/files/output.log filter=lfs diff=lfs merge=lfs -text
37
+ OLMo-150M/OLMo-150M-constant/wandb/wandb/run-20260430_192520-m8kf4lt8/run-m8kf4lt8.wandb filter=lfs diff=lfs merge=lfs -text
38
+ OLMo-150M/OLMo-150M-constant/wandb/wandb/run-20260502_155120-rmngugh9/files/output.log filter=lfs diff=lfs merge=lfs -text
39
+ OLMo-150M/OLMo-150M-constant/wandb/wandb/run-20260502_155120-rmngugh9/run-rmngugh9.wandb filter=lfs diff=lfs merge=lfs -text
40
+ OLMo-150M/OLMo-150M-cosine/wandb/wandb/run-20260501_103343-9yhu352f/run-9yhu352f.wandb filter=lfs diff=lfs merge=lfs -text
41
+ OLMo-150M/OLMo-150M-cosine/wandb/wandb/run-20260502_155121-vorxelbs/files/output.log filter=lfs diff=lfs merge=lfs -text
42
+ OLMo-150M/OLMo-150M-cosine/wandb/wandb/run-20260502_155121-vorxelbs/run-vorxelbs.wandb filter=lfs diff=lfs merge=lfs -text
OLMo-150M/OLMo-150M-constant/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: null
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/data-indices/rank0.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ae84355d0730b6a114f047e43ccb7c811f2f4d765a127b6c3d0d69457a82ca4
3
+ size 32434788
OLMo-150M/OLMo-150M-constant/data-indices/rank1.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece72bfa45b475283e9aecf33238f4c0284c0f47e19fef9ae561b81cc6a55e14
3
+ size 32434924
OLMo-150M/OLMo-150M-constant/data-indices/rank2.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b7623ecc167d77931d7f10d1045dbb06c143f17f4898443fa730e935bc16339
3
+ size 32433050
OLMo-150M/OLMo-150M-constant/data-indices/rank3.tsv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62608527cdbcf09869bb7eebd0302737c1533febfd4ede40b66988eba377328
3
+ size 32436069
OLMo-150M/OLMo-150M-constant/step0-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: null
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step0-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fe02330ebcf589840b78a3eba94f41002e99a3eabb6e6328423498c99c1ac6b
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step0-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1095767beff59f2b264783e12eb2ae6267f313f1c53ff613a19147376ddd001f
3
+ size 3736
OLMo-150M/OLMo-150M-constant/step0-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cad05ff35183aad480225eeb9464447d9b4d66a8627b0cab8bd68d366192ef1
3
+ size 14476
OLMo-150M/OLMo-150M-constant/step12000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step12000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:814029d566d13ef11604ddc7c30c1ee110fd2c51cfaa7c2baa039c78de430464
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step12000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66ee56b5b5bd8464065771e3e6f37bd3cec1858e4f6d5f8681df274824633c66
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step12000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d8ba6c2151c4576fab74051338f274d10dca6fe504055c08cfa0611655a4d47
3
+ size 14668
OLMo-150M/OLMo-150M-constant/step15000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step15000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872db658283f5a6304967d6a4f72dd361988f316eb530354509dbd30417cd5df
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step15000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65a5b4b12e3e4879c85a86a2a480439a90a8e2c7570c0aaebb0ae9aab14112a0
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step15000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f7bc1d9a0c40b1b63cb49df01a6d833e574a7cc241c8ebd364a1fc83b65d6e9
3
+ size 14732
OLMo-150M/OLMo-150M-constant/step18000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step18000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:855c090c6c8cd4da941772ae7a68ef88fa01bde8cd0b5fce77c978d271bceb4a
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step18000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d403cf02f0bfffc718e360b322f47006439ae5da1434df60243d3924a960f42d
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step18000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bf149a2559122b821e76c1488e40a0e4f7646bfd69673619d5eb406c906c116
3
+ size 14732
OLMo-150M/OLMo-150M-constant/step21000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step21000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c6e762b272484e12c9ab21dd275bfe823fa78fc23102ba827d14d7a8bd44215
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step21000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47199457d0fa7bb063f79f7f3e520a3ee56f3c0bed3b8e218aca06d6e2d108c3
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step21000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14a06d3efdf385119386b67a5b2db608641871728471f96ac4561298498b485d
3
+ size 14796
OLMo-150M/OLMo-150M-constant/step24000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step24000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa2956de0dcfd5d5b8d0d235d3dd0b8be95cf55150a1d14bc7331183dde01760
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step24000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01a29ef4a0e0df0d7e6df517ae27638760a075855043d5da0012efc66f890901
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step24000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7deb37f08223397eefcdd761253c86a57157f2c70c28c29e15e52faebf2d5ebb
3
+ size 14796
OLMo-150M/OLMo-150M-constant/step27000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step27000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81706e1197b277011cc0bac96e041fcc045f3677ba34c2d92046799db22cd87e
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step27000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b7b424827c6d4ce803a36b8c5ac3ae8b444b78061a13fa0d49baafaa0c8b752
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step27000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43ce74e41a4ab0dbf481aeb1d4298078a76f3fd65993a7fba55e2bcabc7578a
3
+ size 14860
OLMo-150M/OLMo-150M-constant/step3000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step3000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1f9d73a36dbe87643ff1efcb50eca268bf40d3cdbadb48ae4559f20b5d964f1
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step3000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7447691a5b73ea2fa475cab52438509cdc1962d7735e78d9cc6fd7895876c8c
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step3000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf281ecd523bc3bab053057261ba0affaf1941b6f7518dc4480ae5d913807378
3
+ size 14540
OLMo-150M/OLMo-150M-constant/step30000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step30000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac4ca0a5e679f06fd718d58b1796b501fc573a568c3d907b04cddc1c1a034986
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step30000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:667c75645b9abe8dda3a4ea221a86617c4a7f18cfced871e9379bc6ba90b3c38
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step30000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dded90d1b0b58562b1eaa085449cfbd6e731d658c36e7f116b8778c5b1e8cd4c
3
+ size 14924
OLMo-150M/OLMo-150M-constant/step33000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step33000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63263fbcd40aeef796152a4b199a6b4d419c3caf16984f58629c4735097ce001
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step33000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:137f131d56cc296b8c41b56d8624cdae05ea8a192870fa3d40e49162aee041e7
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step33000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de39c2ff2721e2ea81574b477debb64ef01123cdd37bf814d58136c4ca7af2b9
3
+ size 14924
OLMo-150M/OLMo-150M-constant/step36000-unsharded/config.yaml ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: OLMo-150M-constant
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 768
7
+ n_heads: 12
8
+ n_kv_heads: null
9
+ clip_qkv: null
10
+ n_layers: 12
11
+ mlp_ratio: 8
12
+ mlp_hidden_size: null
13
+ activation_type: swiglu
14
+ block_type: sequential
15
+ block_group_size: 1
16
+ alibi: false
17
+ alibi_bias_max: 8.0
18
+ rope: true
19
+ rope_full_precision: true
20
+ rope_theta: 10000
21
+ flash_attention: true
22
+ attention_dropout: 0.0
23
+ multi_query_attention: null
24
+ attention_layer_norm: false
25
+ residual_dropout: 0.0
26
+ embedding_dropout: 0.0
27
+ embedding_layer_norm: false
28
+ layer_norm_type: default
29
+ layer_norm_with_affine: false
30
+ layer_norm_eps: 1.0e-05
31
+ attention_layer_norm_with_affine: false
32
+ max_sequence_length: 2048
33
+ include_bias: false
34
+ bias_for_layer_norm: false
35
+ scale_logits: false
36
+ vocab_size: 32000
37
+ embedding_size: 32000
38
+ weight_tying: false
39
+ eos_token_id: 0
40
+ pad_token_id: 1
41
+ init_device: cuda
42
+ init_fn: normal
43
+ init_std: 0.02
44
+ init_cutoff_factor: 3.0
45
+ precision: amp_bf16
46
+ scale_emb_init: false
47
+ emb_init_std: null
48
+ norm_after: false
49
+ optimizer:
50
+ name: adamw
51
+ learning_rate: 0.0003
52
+ weight_decay: 0.1
53
+ betas:
54
+ - 0.9
55
+ - 0.95
56
+ eps: 1.0e-08
57
+ no_decay_norm_and_bias: null
58
+ selective_updates: false
59
+ decay_norm_and_bias: true
60
+ decay_embeddings: true
61
+ metrics_log_interval: 10
62
+ record_update_metrics: false
63
+ scheduler:
64
+ name: constant_with_warmup
65
+ units: steps
66
+ t_warmup: 5000
67
+ t_max: null
68
+ alpha_f: 0.1
69
+ grad_clip_warmup_steps: null
70
+ grad_clip_warmup_factor: null
71
+ warmup_min_lr: 0.0
72
+ data:
73
+ paths:
74
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00000_00000_doc_shuffled.ds
75
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00001_00000_doc_shuffled.ds
76
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00002_00000_doc_shuffled.ds
77
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00003_00000_doc_shuffled.ds
78
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00004_00000_doc_shuffled.ds
79
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00005_00000_doc_shuffled.ds
80
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00006_00000_doc_shuffled.ds
81
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00007_00000_doc_shuffled.ds
82
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00008_00000_doc_shuffled.ds
83
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00009_00000_doc_shuffled.ds
84
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00010_00000_doc_shuffled.ds
85
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00011_00000_doc_shuffled.ds
86
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00012_00000_doc_shuffled.ds
87
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00013_00000_doc_shuffled.ds
88
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00014_00000_doc_shuffled.ds
89
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00015_00000_doc_shuffled.ds
90
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00016_00000_doc_shuffled.ds
91
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00017_00000_doc_shuffled.ds
92
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00018_00000_doc_shuffled.ds
93
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00020_00000_doc_shuffled.ds
94
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00021_00000_doc_shuffled.ds
95
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00022_00000_doc_shuffled.ds
96
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00023_00000_doc_shuffled.ds
97
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00024_00000_doc_shuffled.ds
98
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00025_00000_doc_shuffled.ds
99
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00026_00000_doc_shuffled.ds
100
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00027_00000_doc_shuffled.ds
101
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00028_00000_doc_shuffled.ds
102
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00029_00000_doc_shuffled.ds
103
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00030_00000_doc_shuffled.ds
104
+ - data/as_fm3_omi1_omi2_tinygsm/algebraic-stack-tokenized/00031_00000_doc_shuffled.ds
105
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00000_00000_doc_shuffled.ds
106
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00001_00000_doc_shuffled.ds
107
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00002_00000_doc_shuffled.ds
108
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00003_00000_doc_shuffled.ds
109
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00004_00000_doc_shuffled.ds
110
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00005_00000_doc_shuffled.ds
111
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00006_00000_doc_shuffled.ds
112
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00007_00000_doc_shuffled.ds
113
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00008_00000_doc_shuffled.ds
114
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00009_00000_doc_shuffled.ds
115
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00010_00000_doc_shuffled.ds
116
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00011_00000_doc_shuffled.ds
117
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00012_00000_doc_shuffled.ds
118
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00013_00000_doc_shuffled.ds
119
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00014_00000_doc_shuffled.ds
120
+ - data/as_fm3_omi1_omi2_tinygsm/finemath3-tokenized/00015_00000_doc_shuffled.ds
121
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct1-tokenized/00000_00000_doc_shuffled.ds
122
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00000_00000_doc_shuffled.ds
123
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00001_00000_doc_shuffled.ds
124
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00002_00000_doc_shuffled.ds
125
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00003_00000_doc_shuffled.ds
126
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00004_00000_doc_shuffled.ds
127
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00005_00000_doc_shuffled.ds
128
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00006_00000_doc_shuffled.ds
129
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00007_00000_doc_shuffled.ds
130
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00008_00000_doc_shuffled.ds
131
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00009_00000_doc_shuffled.ds
132
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00010_00000_doc_shuffled.ds
133
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00011_00000_doc_shuffled.ds
134
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00012_00000_doc_shuffled.ds
135
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00013_00000_doc_shuffled.ds
136
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00014_00000_doc_shuffled.ds
137
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00015_00000_doc_shuffled.ds
138
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00016_00000_doc_shuffled.ds
139
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00017_00000_doc_shuffled.ds
140
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00018_00000_doc_shuffled.ds
141
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00019_00000_doc_shuffled.ds
142
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00020_00000_doc_shuffled.ds
143
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00021_00000_doc_shuffled.ds
144
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00022_00000_doc_shuffled.ds
145
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00023_00000_doc_shuffled.ds
146
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00024_00000_doc_shuffled.ds
147
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00025_00000_doc_shuffled.ds
148
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00026_00000_doc_shuffled.ds
149
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00027_00000_doc_shuffled.ds
150
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00028_00000_doc_shuffled.ds
151
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00029_00000_doc_shuffled.ds
152
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00030_00000_doc_shuffled.ds
153
+ - data/as_fm3_omi1_omi2_tinygsm/openmathinstruct2-tokenized/00031_00000_doc_shuffled.ds
154
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00000_00000_doc_shuffled.ds
155
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00001_00000_doc_shuffled.ds
156
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00002_00000_doc_shuffled.ds
157
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00003_00000_doc_shuffled.ds
158
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00004_00000_doc_shuffled.ds
159
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00005_00000_doc_shuffled.ds
160
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00006_00000_doc_shuffled.ds
161
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00007_00000_doc_shuffled.ds
162
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00008_00000_doc_shuffled.ds
163
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00009_00000_doc_shuffled.ds
164
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00010_00000_doc_shuffled.ds
165
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00011_00000_doc_shuffled.ds
166
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00012_00000_doc_shuffled.ds
167
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00013_00000_doc_shuffled.ds
168
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00014_00000_doc_shuffled.ds
169
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00015_00000_doc_shuffled.ds
170
+ - data/as_fm3_omi1_omi2_tinygsm/tinygsm-tokenized/00016_00000_doc_shuffled.ds
171
+ memmap_dtype: uint16
172
+ datasets: null
173
+ label_mask_paths: null
174
+ pad_direction: right
175
+ generate_attention_mask: false
176
+ generate_doc_lengths: false
177
+ num_workers: 32
178
+ drop_last: true
179
+ pin_memory: true
180
+ prefetch_factor: 8
181
+ persistent_workers: true
182
+ timeout: 0
183
+ seed: null
184
+ instance_filter: null
185
+ custom_dataset: null
186
+ restore_dataloader: true
187
+ fast_forward_batches: null
188
+ evaluators: []
189
+ eval_interval: 5000
190
+ tokenizer:
191
+ identifier: meta-llama/Llama-2-7b-hf
192
+ truncate_direction: right
193
+ save_folder: checkpoints/OLMo-150M-constant
194
+ remote_save_folder: null
195
+ canceled_check_interval: 50
196
+ save_interval: 3000
197
+ save_interval_unsharded: 3000
198
+ save_interval_ephemeral: null
199
+ save_num_checkpoints_to_keep: 48
200
+ save_num_unsharded_checkpoints_to_keep: 48
201
+ save_overwrite: true
202
+ force_save_unsharded: false
203
+ no_pre_train_checkpoint: false
204
+ load_path: null
205
+ load_path_sharded_checkpointer: null
206
+ try_load_latest_save: false
207
+ reset_optimizer_state: false
208
+ reset_trainer_state: false
209
+ sharded_checkpointer: torch_legacy
210
+ new_style_checkpoints: null
211
+ max_duration: 1ep
212
+ global_train_batch_size: 512
213
+ device_train_batch_size: 128
214
+ device_train_microbatch_size: 16
215
+ device_eval_batch_size: 16
216
+ eval_subset_num_batches: -1
217
+ eval_on_load: false
218
+ device_train_grad_accum: 8
219
+ max_grad_norm: 1.0
220
+ max_grad_norm_ratio: null
221
+ precision: amp_bf16
222
+ wandb:
223
+ project: olmo-pretrain
224
+ entity: marksmans
225
+ group: null
226
+ name: OLMo-150M-constant
227
+ tags:
228
+ - watching
229
+ log_artifacts: false
230
+ rank_zero_only: true
231
+ log_interval: 1
232
+ speed_monitor:
233
+ window_size: 20
234
+ gpu_flops_available: null
235
+ console_log_interval: 1
236
+ gen1_gc_interval: 1
237
+ compile: null
238
+ distributed_strategy: ddp
239
+ fsdp:
240
+ use_orig_params: true
241
+ sharding_strategy: FULL_SHARD
242
+ wrapping_strategy: null
243
+ precision: pure
244
+ hybrid_sharding_num_model_replicas: null
245
+ ddp:
246
+ grad_sync_mode: batch
247
+ find_unused_params: false
248
+ single:
249
+ device: auto
250
+ softmax_auxiliary_loss: false
251
+ auxiliary_loss_multiplier: 0.0001
252
+ time_limit: null
253
+ extra_steps_after_cancel: 10
254
+ early_stopping_factor: null
255
+ save_data_indices: true
256
+ python_profiling: false
257
+ torch_profiling: false
258
+ stop_at: 61787
259
+ stop_after: null
260
+ activation_checkpointing: null
261
+ fused_loss: null
262
+ hf_datasets_cache_dir: null
263
+ module_outputs_save_steps: null
OLMo-150M/OLMo-150M-constant/step36000-unsharded/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b37117ae18377853d7e05de03526afa8647178f54c4b333b9fba877cb43947c9
3
+ size 649612628
OLMo-150M/OLMo-150M-constant/step36000-unsharded/optim.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a02ac9aa7ed7da1f77f6711488801a6e6f524a93b5b982c34b998e296e4fac3
3
+ size 1299223890
OLMo-150M/OLMo-150M-constant/step36000-unsharded/train.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35a34bfc67495913ced39844632367af62ca81b87c011dcbb19942755b12737c
3
+ size 14988