Text Generation
Transformers
Safetensors
English
bolmo
custom_code
benjamin committed on
Commit
bf8dcf8
·
verified ·
1 Parent(s): d691ab2

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +129 -0
  2. olmo_core/.metadata.json +1 -0
  3. olmo_core/config.json +349 -0
  4. olmo_core/model_and_optim/.metadata +3 -0
  5. olmo_core/model_and_optim/__0_0.distcp +3 -0
  6. olmo_core/model_and_optim/__0_1.distcp +3 -0
  7. olmo_core/model_and_optim/__0_10.distcp +3 -0
  8. olmo_core/model_and_optim/__0_11.distcp +3 -0
  9. olmo_core/model_and_optim/__0_12.distcp +3 -0
  10. olmo_core/model_and_optim/__0_13.distcp +3 -0
  11. olmo_core/model_and_optim/__0_14.distcp +3 -0
  12. olmo_core/model_and_optim/__0_15.distcp +3 -0
  13. olmo_core/model_and_optim/__0_2.distcp +3 -0
  14. olmo_core/model_and_optim/__0_3.distcp +3 -0
  15. olmo_core/model_and_optim/__0_4.distcp +3 -0
  16. olmo_core/model_and_optim/__0_5.distcp +3 -0
  17. olmo_core/model_and_optim/__0_6.distcp +3 -0
  18. olmo_core/model_and_optim/__0_7.distcp +3 -0
  19. olmo_core/model_and_optim/__0_8.distcp +3 -0
  20. olmo_core/model_and_optim/__0_9.distcp +3 -0
  21. olmo_core/model_and_optim/__1_0.distcp +3 -0
  22. olmo_core/model_and_optim/__1_1.distcp +3 -0
  23. olmo_core/model_and_optim/__1_10.distcp +3 -0
  24. olmo_core/model_and_optim/__1_11.distcp +3 -0
  25. olmo_core/model_and_optim/__1_12.distcp +3 -0
  26. olmo_core/model_and_optim/__1_13.distcp +3 -0
  27. olmo_core/model_and_optim/__1_14.distcp +3 -0
  28. olmo_core/model_and_optim/__1_15.distcp +3 -0
  29. olmo_core/model_and_optim/__1_2.distcp +3 -0
  30. olmo_core/model_and_optim/__1_3.distcp +3 -0
  31. olmo_core/model_and_optim/__1_4.distcp +3 -0
  32. olmo_core/model_and_optim/__1_5.distcp +3 -0
  33. olmo_core/model_and_optim/__1_6.distcp +3 -0
  34. olmo_core/model_and_optim/__1_7.distcp +3 -0
  35. olmo_core/model_and_optim/__1_8.distcp +3 -0
  36. olmo_core/model_and_optim/__1_9.distcp +3 -0
  37. olmo_core/model_and_optim/__2_0.distcp +3 -0
  38. olmo_core/model_and_optim/__2_1.distcp +3 -0
  39. olmo_core/model_and_optim/__2_10.distcp +3 -0
  40. olmo_core/model_and_optim/__2_11.distcp +3 -0
  41. olmo_core/model_and_optim/__2_12.distcp +3 -0
  42. olmo_core/model_and_optim/__2_13.distcp +3 -0
  43. olmo_core/model_and_optim/__2_14.distcp +3 -0
  44. olmo_core/model_and_optim/__2_15.distcp +3 -0
  45. olmo_core/model_and_optim/__2_2.distcp +3 -0
  46. olmo_core/model_and_optim/__2_3.distcp +3 -0
  47. olmo_core/model_and_optim/__2_4.distcp +3 -0
  48. olmo_core/model_and_optim/__2_5.distcp +3 -0
  49. olmo_core/model_and_optim/__2_6.distcp +3 -0
  50. olmo_core/model_and_optim/__2_7.distcp +3 -0
.gitattributes CHANGED
@@ -33,3 +33,132 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ olmo_core/model_and_optim/.metadata filter=lfs diff=lfs merge=lfs -text
37
+ olmo_core/model_and_optim/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ olmo_core/model_and_optim/__0_1.distcp filter=lfs diff=lfs merge=lfs -text
39
+ olmo_core/model_and_optim/__0_10.distcp filter=lfs diff=lfs merge=lfs -text
40
+ olmo_core/model_and_optim/__0_11.distcp filter=lfs diff=lfs merge=lfs -text
41
+ olmo_core/model_and_optim/__0_12.distcp filter=lfs diff=lfs merge=lfs -text
42
+ olmo_core/model_and_optim/__0_13.distcp filter=lfs diff=lfs merge=lfs -text
43
+ olmo_core/model_and_optim/__0_14.distcp filter=lfs diff=lfs merge=lfs -text
44
+ olmo_core/model_and_optim/__0_15.distcp filter=lfs diff=lfs merge=lfs -text
45
+ olmo_core/model_and_optim/__0_2.distcp filter=lfs diff=lfs merge=lfs -text
46
+ olmo_core/model_and_optim/__0_3.distcp filter=lfs diff=lfs merge=lfs -text
47
+ olmo_core/model_and_optim/__0_4.distcp filter=lfs diff=lfs merge=lfs -text
48
+ olmo_core/model_and_optim/__0_5.distcp filter=lfs diff=lfs merge=lfs -text
49
+ olmo_core/model_and_optim/__0_6.distcp filter=lfs diff=lfs merge=lfs -text
50
+ olmo_core/model_and_optim/__0_7.distcp filter=lfs diff=lfs merge=lfs -text
51
+ olmo_core/model_and_optim/__0_8.distcp filter=lfs diff=lfs merge=lfs -text
52
+ olmo_core/model_and_optim/__0_9.distcp filter=lfs diff=lfs merge=lfs -text
53
+ olmo_core/model_and_optim/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
54
+ olmo_core/model_and_optim/__1_1.distcp filter=lfs diff=lfs merge=lfs -text
55
+ olmo_core/model_and_optim/__1_10.distcp filter=lfs diff=lfs merge=lfs -text
56
+ olmo_core/model_and_optim/__1_11.distcp filter=lfs diff=lfs merge=lfs -text
57
+ olmo_core/model_and_optim/__1_12.distcp filter=lfs diff=lfs merge=lfs -text
58
+ olmo_core/model_and_optim/__1_13.distcp filter=lfs diff=lfs merge=lfs -text
59
+ olmo_core/model_and_optim/__1_14.distcp filter=lfs diff=lfs merge=lfs -text
60
+ olmo_core/model_and_optim/__1_15.distcp filter=lfs diff=lfs merge=lfs -text
61
+ olmo_core/model_and_optim/__1_2.distcp filter=lfs diff=lfs merge=lfs -text
62
+ olmo_core/model_and_optim/__1_3.distcp filter=lfs diff=lfs merge=lfs -text
63
+ olmo_core/model_and_optim/__1_4.distcp filter=lfs diff=lfs merge=lfs -text
64
+ olmo_core/model_and_optim/__1_5.distcp filter=lfs diff=lfs merge=lfs -text
65
+ olmo_core/model_and_optim/__1_6.distcp filter=lfs diff=lfs merge=lfs -text
66
+ olmo_core/model_and_optim/__1_7.distcp filter=lfs diff=lfs merge=lfs -text
67
+ olmo_core/model_and_optim/__1_8.distcp filter=lfs diff=lfs merge=lfs -text
68
+ olmo_core/model_and_optim/__1_9.distcp filter=lfs diff=lfs merge=lfs -text
69
+ olmo_core/model_and_optim/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
70
+ olmo_core/model_and_optim/__2_1.distcp filter=lfs diff=lfs merge=lfs -text
71
+ olmo_core/model_and_optim/__2_10.distcp filter=lfs diff=lfs merge=lfs -text
72
+ olmo_core/model_and_optim/__2_11.distcp filter=lfs diff=lfs merge=lfs -text
73
+ olmo_core/model_and_optim/__2_12.distcp filter=lfs diff=lfs merge=lfs -text
74
+ olmo_core/model_and_optim/__2_13.distcp filter=lfs diff=lfs merge=lfs -text
75
+ olmo_core/model_and_optim/__2_14.distcp filter=lfs diff=lfs merge=lfs -text
76
+ olmo_core/model_and_optim/__2_15.distcp filter=lfs diff=lfs merge=lfs -text
77
+ olmo_core/model_and_optim/__2_2.distcp filter=lfs diff=lfs merge=lfs -text
78
+ olmo_core/model_and_optim/__2_3.distcp filter=lfs diff=lfs merge=lfs -text
79
+ olmo_core/model_and_optim/__2_4.distcp filter=lfs diff=lfs merge=lfs -text
80
+ olmo_core/model_and_optim/__2_5.distcp filter=lfs diff=lfs merge=lfs -text
81
+ olmo_core/model_and_optim/__2_6.distcp filter=lfs diff=lfs merge=lfs -text
82
+ olmo_core/model_and_optim/__2_7.distcp filter=lfs diff=lfs merge=lfs -text
83
+ olmo_core/model_and_optim/__2_8.distcp filter=lfs diff=lfs merge=lfs -text
84
+ olmo_core/model_and_optim/__2_9.distcp filter=lfs diff=lfs merge=lfs -text
85
+ olmo_core/model_and_optim/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
86
+ olmo_core/model_and_optim/__3_1.distcp filter=lfs diff=lfs merge=lfs -text
87
+ olmo_core/model_and_optim/__3_10.distcp filter=lfs diff=lfs merge=lfs -text
88
+ olmo_core/model_and_optim/__3_11.distcp filter=lfs diff=lfs merge=lfs -text
89
+ olmo_core/model_and_optim/__3_12.distcp filter=lfs diff=lfs merge=lfs -text
90
+ olmo_core/model_and_optim/__3_13.distcp filter=lfs diff=lfs merge=lfs -text
91
+ olmo_core/model_and_optim/__3_14.distcp filter=lfs diff=lfs merge=lfs -text
92
+ olmo_core/model_and_optim/__3_15.distcp filter=lfs diff=lfs merge=lfs -text
93
+ olmo_core/model_and_optim/__3_2.distcp filter=lfs diff=lfs merge=lfs -text
94
+ olmo_core/model_and_optim/__3_3.distcp filter=lfs diff=lfs merge=lfs -text
95
+ olmo_core/model_and_optim/__3_4.distcp filter=lfs diff=lfs merge=lfs -text
96
+ olmo_core/model_and_optim/__3_5.distcp filter=lfs diff=lfs merge=lfs -text
97
+ olmo_core/model_and_optim/__3_6.distcp filter=lfs diff=lfs merge=lfs -text
98
+ olmo_core/model_and_optim/__3_7.distcp filter=lfs diff=lfs merge=lfs -text
99
+ olmo_core/model_and_optim/__3_8.distcp filter=lfs diff=lfs merge=lfs -text
100
+ olmo_core/model_and_optim/__3_9.distcp filter=lfs diff=lfs merge=lfs -text
101
+ olmo_core/model_and_optim/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
102
+ olmo_core/model_and_optim/__4_1.distcp filter=lfs diff=lfs merge=lfs -text
103
+ olmo_core/model_and_optim/__4_10.distcp filter=lfs diff=lfs merge=lfs -text
104
+ olmo_core/model_and_optim/__4_11.distcp filter=lfs diff=lfs merge=lfs -text
105
+ olmo_core/model_and_optim/__4_12.distcp filter=lfs diff=lfs merge=lfs -text
106
+ olmo_core/model_and_optim/__4_13.distcp filter=lfs diff=lfs merge=lfs -text
107
+ olmo_core/model_and_optim/__4_14.distcp filter=lfs diff=lfs merge=lfs -text
108
+ olmo_core/model_and_optim/__4_15.distcp filter=lfs diff=lfs merge=lfs -text
109
+ olmo_core/model_and_optim/__4_2.distcp filter=lfs diff=lfs merge=lfs -text
110
+ olmo_core/model_and_optim/__4_3.distcp filter=lfs diff=lfs merge=lfs -text
111
+ olmo_core/model_and_optim/__4_4.distcp filter=lfs diff=lfs merge=lfs -text
112
+ olmo_core/model_and_optim/__4_5.distcp filter=lfs diff=lfs merge=lfs -text
113
+ olmo_core/model_and_optim/__4_6.distcp filter=lfs diff=lfs merge=lfs -text
114
+ olmo_core/model_and_optim/__4_7.distcp filter=lfs diff=lfs merge=lfs -text
115
+ olmo_core/model_and_optim/__4_8.distcp filter=lfs diff=lfs merge=lfs -text
116
+ olmo_core/model_and_optim/__4_9.distcp filter=lfs diff=lfs merge=lfs -text
117
+ olmo_core/model_and_optim/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
118
+ olmo_core/model_and_optim/__5_1.distcp filter=lfs diff=lfs merge=lfs -text
119
+ olmo_core/model_and_optim/__5_10.distcp filter=lfs diff=lfs merge=lfs -text
120
+ olmo_core/model_and_optim/__5_11.distcp filter=lfs diff=lfs merge=lfs -text
121
+ olmo_core/model_and_optim/__5_12.distcp filter=lfs diff=lfs merge=lfs -text
122
+ olmo_core/model_and_optim/__5_13.distcp filter=lfs diff=lfs merge=lfs -text
123
+ olmo_core/model_and_optim/__5_14.distcp filter=lfs diff=lfs merge=lfs -text
124
+ olmo_core/model_and_optim/__5_15.distcp filter=lfs diff=lfs merge=lfs -text
125
+ olmo_core/model_and_optim/__5_2.distcp filter=lfs diff=lfs merge=lfs -text
126
+ olmo_core/model_and_optim/__5_3.distcp filter=lfs diff=lfs merge=lfs -text
127
+ olmo_core/model_and_optim/__5_4.distcp filter=lfs diff=lfs merge=lfs -text
128
+ olmo_core/model_and_optim/__5_5.distcp filter=lfs diff=lfs merge=lfs -text
129
+ olmo_core/model_and_optim/__5_6.distcp filter=lfs diff=lfs merge=lfs -text
130
+ olmo_core/model_and_optim/__5_7.distcp filter=lfs diff=lfs merge=lfs -text
131
+ olmo_core/model_and_optim/__5_8.distcp filter=lfs diff=lfs merge=lfs -text
132
+ olmo_core/model_and_optim/__5_9.distcp filter=lfs diff=lfs merge=lfs -text
133
+ olmo_core/model_and_optim/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
134
+ olmo_core/model_and_optim/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
135
+ olmo_core/model_and_optim/__6_10.distcp filter=lfs diff=lfs merge=lfs -text
136
+ olmo_core/model_and_optim/__6_11.distcp filter=lfs diff=lfs merge=lfs -text
137
+ olmo_core/model_and_optim/__6_12.distcp filter=lfs diff=lfs merge=lfs -text
138
+ olmo_core/model_and_optim/__6_13.distcp filter=lfs diff=lfs merge=lfs -text
139
+ olmo_core/model_and_optim/__6_14.distcp filter=lfs diff=lfs merge=lfs -text
140
+ olmo_core/model_and_optim/__6_15.distcp filter=lfs diff=lfs merge=lfs -text
141
+ olmo_core/model_and_optim/__6_2.distcp filter=lfs diff=lfs merge=lfs -text
142
+ olmo_core/model_and_optim/__6_3.distcp filter=lfs diff=lfs merge=lfs -text
143
+ olmo_core/model_and_optim/__6_4.distcp filter=lfs diff=lfs merge=lfs -text
144
+ olmo_core/model_and_optim/__6_5.distcp filter=lfs diff=lfs merge=lfs -text
145
+ olmo_core/model_and_optim/__6_6.distcp filter=lfs diff=lfs merge=lfs -text
146
+ olmo_core/model_and_optim/__6_7.distcp filter=lfs diff=lfs merge=lfs -text
147
+ olmo_core/model_and_optim/__6_8.distcp filter=lfs diff=lfs merge=lfs -text
148
+ olmo_core/model_and_optim/__6_9.distcp filter=lfs diff=lfs merge=lfs -text
149
+ olmo_core/model_and_optim/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
150
+ olmo_core/model_and_optim/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
151
+ olmo_core/model_and_optim/__7_10.distcp filter=lfs diff=lfs merge=lfs -text
152
+ olmo_core/model_and_optim/__7_11.distcp filter=lfs diff=lfs merge=lfs -text
153
+ olmo_core/model_and_optim/__7_12.distcp filter=lfs diff=lfs merge=lfs -text
154
+ olmo_core/model_and_optim/__7_13.distcp filter=lfs diff=lfs merge=lfs -text
155
+ olmo_core/model_and_optim/__7_14.distcp filter=lfs diff=lfs merge=lfs -text
156
+ olmo_core/model_and_optim/__7_15.distcp filter=lfs diff=lfs merge=lfs -text
157
+ olmo_core/model_and_optim/__7_2.distcp filter=lfs diff=lfs merge=lfs -text
158
+ olmo_core/model_and_optim/__7_3.distcp filter=lfs diff=lfs merge=lfs -text
159
+ olmo_core/model_and_optim/__7_4.distcp filter=lfs diff=lfs merge=lfs -text
160
+ olmo_core/model_and_optim/__7_5.distcp filter=lfs diff=lfs merge=lfs -text
161
+ olmo_core/model_and_optim/__7_6.distcp filter=lfs diff=lfs merge=lfs -text
162
+ olmo_core/model_and_optim/__7_7.distcp filter=lfs diff=lfs merge=lfs -text
163
+ olmo_core/model_and_optim/__7_8.distcp filter=lfs diff=lfs merge=lfs -text
164
+ olmo_core/model_and_optim/__7_9.distcp filter=lfs diff=lfs merge=lfs -text
olmo_core/.metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version": "2.3.0"}
olmo_core/config.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "d_model": 2048,
4
+ "vocab_size": 640,
5
+ "n_layers": 16,
6
+ "block": {
7
+ "attention": {
8
+ "name": "default",
9
+ "n_heads": 16,
10
+ "bias": false,
11
+ "rope": {
12
+ "name": "default",
13
+ "theta": 500000,
14
+ "full_precision": true,
15
+ "_CLASS_": "olmo_core.nn.rope.RoPEConfig"
16
+ },
17
+ "qk_norm": {
18
+ "name": "rms",
19
+ "eps": 1e-06,
20
+ "bias": false,
21
+ "dtype": "float32",
22
+ "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
23
+ },
24
+ "use_flash": true,
25
+ "dtype": "float32",
26
+ "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
27
+ },
28
+ "layer_norm": {
29
+ "name": "rms",
30
+ "eps": 1e-06,
31
+ "bias": false,
32
+ "dtype": "float32",
33
+ "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
34
+ },
35
+ "feed_forward": {
36
+ "hidden_size": 8192,
37
+ "name": "default",
38
+ "bias": false,
39
+ "dtype": "float32",
40
+ "act_name": "silu",
41
+ "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
42
+ },
43
+ "name": "reordered_norm",
44
+ "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
45
+ },
46
+ "lm_head": {
47
+ "name": "default",
48
+ "layer_norm": {
49
+ "name": "rms",
50
+ "eps": 1e-06,
51
+ "bias": false,
52
+ "dtype": "float32",
53
+ "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
54
+ },
55
+ "bias": false,
56
+ "dtype": "float32",
57
+ "loss_implementation": "default",
58
+ "_CLASS_": "olmo_core.nn.lm_head.LMHeadConfig"
59
+ },
60
+ "name": "bolmo_distill",
61
+ "dtype": "float32",
62
+ "init_method": "normal",
63
+ "init_seed": 0,
64
+ "init_std": 0.02,
65
+ "freeze_params": [
66
+ "boundary_predictor.*",
67
+ "teacher_embeddings.*"
68
+ ],
69
+ "local_encoder": {
70
+ "sliding_window_size": 0,
71
+ "d_model": 2048,
72
+ "n_layers": 1,
73
+ "block_config": {
74
+ "attention": {
75
+ "name": "default",
76
+ "n_heads": 16,
77
+ "dtype": "float32",
78
+ "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
79
+ },
80
+ "layer_norm": {
81
+ "name": "rms",
82
+ "eps": 1e-06,
83
+ "bias": false,
84
+ "dtype": "float32",
85
+ "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
86
+ },
87
+ "feed_forward": {
88
+ "hidden_size": 2816,
89
+ "name": "default",
90
+ "bias": false,
91
+ "dtype": "float32",
92
+ "act_name": "silu",
93
+ "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
94
+ },
95
+ "xlstm": {
96
+ "num_heads": 16,
97
+ "dtype": "float32",
98
+ "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
99
+ },
100
+ "name": "xlstm",
101
+ "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
102
+ },
103
+ "cross_attn_n_heads": 0,
104
+ "cross_attn_do_project": true,
105
+ "cross_attn_init_pooling": "amax",
106
+ "pooling": "hnet",
107
+ "add_hash_embeddings": false,
108
+ "add_expanded_embeddings": true,
109
+ "hash_byte_group_size": [
110
+ 3,
111
+ 4,
112
+ 5,
113
+ 6,
114
+ 7,
115
+ 8
116
+ ],
117
+ "hash_byte_group_vocab": [
118
+ 1536,
119
+ 3072,
120
+ 6144,
121
+ 12288,
122
+ 24576,
123
+ 49152
124
+ ],
125
+ "hash_byte_group_nb_functions": 1,
126
+ "add_norm_after_last_block": true,
127
+ "add_norm_after_pool": false,
128
+ "add_out_projection": true,
129
+ "boundary_predictor": "hnet",
130
+ "boundary_predictor_lookahead": 1,
131
+ "represent_bytes_with_embeddings": false,
132
+ "represent_bytes_with_last_mixed_out": false,
133
+ "blt_compat": false,
134
+ "dtype": "float32",
135
+ "_CLASS_": "olmo_core.nn.bolmo.config.LocalEncoderConfig"
136
+ },
137
+ "local_decoder": {
138
+ "sliding_window_size": 0,
139
+ "d_model": 2048,
140
+ "n_layers": 4,
141
+ "cross_attn_n_heads": 0,
142
+ "block_config": {
143
+ "attention": {
144
+ "name": "default",
145
+ "n_heads": 16,
146
+ "dtype": "float32",
147
+ "_CLASS_": "olmo_core.nn.attention.AttentionConfig"
148
+ },
149
+ "layer_norm": {
150
+ "name": "rms",
151
+ "eps": 1e-06,
152
+ "bias": false,
153
+ "dtype": "float32",
154
+ "_CLASS_": "olmo_core.nn.layer_norm.LayerNormConfig"
155
+ },
156
+ "feed_forward": {
157
+ "hidden_size": 2816,
158
+ "name": "default",
159
+ "bias": false,
160
+ "dtype": "float32",
161
+ "act_name": "silu",
162
+ "_CLASS_": "olmo_core.nn.feed_forward.FeedForwardConfig"
163
+ },
164
+ "xlstm": {
165
+ "num_heads": 16,
166
+ "dtype": "float32",
167
+ "_CLASS_": "olmo_core.nn.xlstm.XLSTMConfig"
168
+ },
169
+ "name": "xlstm",
170
+ "_CLASS_": "olmo_core.nn.transformer.config.TransformerBlockConfig"
171
+ },
172
+ "depooling": "hnet",
173
+ "add_norm_before_first_block": true,
174
+ "add_norm_onto_residual": false,
175
+ "add_in_projection": true,
176
+ "add_projected_patch_residuals": false,
177
+ "hnet_smooth": false,
178
+ "hnet_smooth_ste": false,
179
+ "hnet_modulate": false,
180
+ "blt_compat": false,
181
+ "fuse_boundaries": true,
182
+ "no_boundaries": false,
183
+ "dtype": "float32",
184
+ "_CLASS_": "olmo_core.nn.bolmo.config.LocalDecoderConfig"
185
+ },
186
+ "share_blocks_between_teacher_and_student": false,
187
+ "_CLASS_": "olmo_core.nn.transformer.config.TransformerConfig"
188
+ },
189
+ "dataset": {
190
+ "tokenizer": {
191
+ "vocab_size": 520,
192
+ "eos_token_id": 1,
193
+ "pad_token_id": 0,
194
+ "bos_token_id": 1,
195
+ "special_tokens": [
196
+ "<pad>",
197
+ "<bos>",
198
+ "<eos>",
199
+ "<bpe_token_end>"
200
+ ],
201
+ "special_tokens_first": true,
202
+ "original_identifier": "allenai/dolma2-tokenizer",
203
+ "bpe_token_end_id": 3,
204
+ "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
205
+ },
206
+ "paths": [],
207
+ "expand_glob": false,
208
+ "include_instance_metadata": true,
209
+ "work_dir": "",
210
+ "ignore_fingerprint_mismatch": false,
211
+ "sequence_length": 4096,
212
+ "generate_doc_lengths": false,
213
+ "byte_sequence_length": 24576,
214
+ "_CLASS_": "olmo_core.data.numpy_dataset.NumpyByteFSLDatasetConfig"
215
+ },
216
+ "data_loader": {
217
+ "global_batch_size": 1572864,
218
+ "seed": 1234,
219
+ "num_workers": 24,
220
+ "ignore_fingerprint_mismatch": false,
221
+ "_CLASS_": "olmo_core.data.data_loader.NumpyDataLoaderConfig"
222
+ },
223
+ "train_module": {
224
+ "rank_microbatch_size": 98304,
225
+ "max_sequence_length": 24576,
226
+ "optim": {
227
+ "group_overrides": [
228
+ {
229
+ "params": [
230
+ "local_encoder.embedding.weight",
231
+ "local_encoder.expanded_embeddings.weight"
232
+ ],
233
+ "opts": {
234
+ "weight_decay": 0.0
235
+ },
236
+ "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
237
+ },
238
+ {
239
+ "params": [
240
+ "blocks.*"
241
+ ],
242
+ "opts": {
243
+ "lr": 2.6e-05
244
+ },
245
+ "_CLASS_": "olmo_core.optim.config.OptimGroupOverride"
246
+ }
247
+ ],
248
+ "compile": false,
249
+ "fixed_fields": [
250
+ "initial_lr"
251
+ ],
252
+ "lr": 5.2e-05,
253
+ "betas": [
254
+ 0.9,
255
+ 0.95
256
+ ],
257
+ "eps": 1e-08,
258
+ "weight_decay": 0.1,
259
+ "_CLASS_": "olmo_core.optim.adamw.AdamWConfig"
260
+ },
261
+ "max_grad_norm": 0.5,
262
+ "scheduler": {
263
+ "lr_field": "lr",
264
+ "initial_lr_field": "initial_lr",
265
+ "units": "steps",
266
+ "alpha_f": 0.0,
267
+ "warmup_fraction": 0.1,
268
+ "warmup_min_lr": 0.0,
269
+ "_CLASS_": "olmo_core.optim.scheduler.LinearWithWarmup"
270
+ },
271
+ "compile_model": true,
272
+ "float8_config": {
273
+ "enabled": false,
274
+ "_CLASS_": "olmo_core.float8.Float8Config"
275
+ },
276
+ "dp_config": {
277
+ "name": "fsdp",
278
+ "param_dtype": "bfloat16",
279
+ "reduce_dtype": "float32",
280
+ "wrapping_strategy": "full",
281
+ "prefetch_factor": 0,
282
+ "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerDataParallelConfig"
283
+ },
284
+ "bolmo_config": {
285
+ "tokenizer": {
286
+ "vocab_size": 520,
287
+ "eos_token_id": 1,
288
+ "pad_token_id": 0,
289
+ "bos_token_id": 1,
290
+ "special_tokens": [
291
+ "<pad>",
292
+ "<bos>",
293
+ "<eos>",
294
+ "<bpe_token_end>"
295
+ ],
296
+ "special_tokens_first": true,
297
+ "original_identifier": "allenai/dolma2-tokenizer",
298
+ "bpe_token_end_id": 3,
299
+ "_CLASS_": "olmo_core.data.tokenizer.ByteTokenizerConfig"
300
+ },
301
+ "losses": [
302
+ "ce",
303
+ "boundary"
304
+ ],
305
+ "loss_weights": [
306
+ 1.0,
307
+ 4.0
308
+ ],
309
+ "binarization_temp": 1.0,
310
+ "temperature": 1.0,
311
+ "div_fn": "tvd_temp_limit",
312
+ "boundary_mode": "end",
313
+ "merge_boundary_loss": false,
314
+ "use_output_boundary_jsd": false,
315
+ "eval_add_boundary_logp": false,
316
+ "do_alm_debiasing": false,
317
+ "rep_compare_fn": "l2",
318
+ "start_ratio": 4.3,
319
+ "target_ratio": 4.3,
320
+ "gradual_boundary_compression_steps": 150000,
321
+ "encoder_loss_lookahead": 0,
322
+ "encoder_loss_no_lookahead_weight": 1.0,
323
+ "encoder_loss_lookahead_weights": [],
324
+ "patching": "dolma2",
325
+ "epsilon": 1e-06,
326
+ "skip_blocks": false,
327
+ "skip_teacher_blocks": false,
328
+ "skip_teacher": true,
329
+ "compute_teacher_ce": false,
330
+ "use_student_patch_reps_for_teacher": false,
331
+ "use_oracle_patch_reps": false,
332
+ "teacher_blocks_no_grad": true,
333
+ "student_blocks_no_grad": false,
334
+ "decoder_backprop_through_encoder": true,
335
+ "decoder_backprop_through_boundary_predictor": true,
336
+ "boundary_predictor_backprop_through_encoder": true,
337
+ "teacher_force_boundaries": false,
338
+ "boundary_threshold": "sample:0",
339
+ "xlstm_igate_bias_init": -10.0,
340
+ "skip_boundary_before_eos": true,
341
+ "_CLASS_": "olmo_core.nn.bolmo.config.BolmoConfig"
342
+ },
343
+ "label_ignore_index": -100,
344
+ "_CLASS_": "olmo_core.train.train_module.transformer.config.TransformerTrainModuleConfig"
345
+ },
346
+ "trainer": {},
347
+ "init_seed": 12536,
348
+ "_CLASS_": "__main__.ExperimentConfig"
349
+ }
olmo_core/model_and_optim/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8ac9dd55708f4a36911565f33e7e223a9423ef21c79b868869c14bbd0b6cfa
3
+ size 2464795
olmo_core/model_and_optim/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9241563abb2e63f5e76e9047b32d1b46adf82bc18bba2fc33dcff6c1caf4e856
3
+ size 138224729
olmo_core/model_and_optim/__0_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6ced12b46582166b9e74e636518a332aff6fa25ca3295baa81a9fb61d7844ac
3
+ size 138224729
olmo_core/model_and_optim/__0_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7afb9d5ccf88ad2e9c0f3a4db73aab41f04065731bef1cb9e3e632a2d32a3bd
3
+ size 138054699
olmo_core/model_and_optim/__0_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0285c42fd4de086b601dacbcc1b6042c30156a2ad0569c4c49e3864f99fdbb75
3
+ size 138053122
olmo_core/model_and_optim/__0_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d727be4cca2724990cfb0f8e45ee9d71538a48b93a812f26246a3112391d55a0
3
+ size 137924481
olmo_core/model_and_optim/__0_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c10f064d3c4a5a8f3b2020a8e630bc0270bb6dc50e7de8ea32a6c23e0f917ba5
3
+ size 137924481
olmo_core/model_and_optim/__0_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f66920be4bb54652f8ebb8db3b01b448ad6ff467e2caf300dd74df04e780e50f
3
+ size 137926058
olmo_core/model_and_optim/__0_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89eececcc6343abcd58709df669e959793180a439bedd5a9269028997c3b95f4
3
+ size 137926058
olmo_core/model_and_optim/__0_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:452c356d9c7dfca4d617d3c41e2b758c516515e4319149770c9d4cee22751a98
3
+ size 138224729
olmo_core/model_and_optim/__0_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93056e9e58af6bf3efce9901b41424a7bea1f3937c4fa69567d7ff904ec9cc3b
3
+ size 138448453
olmo_core/model_and_optim/__0_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165e64ab61aac760c9ac06bc259d989f7ba6caf9dc80a320040e2094ac4031c7
3
+ size 138319274
olmo_core/model_and_optim/__0_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06bb7b56204a7491b3471bf4ce34eebe72a483fea29dd2ca6d950a1b69c5a4b2
3
+ size 138319274
olmo_core/model_and_optim/__0_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8febe1110dd2aa1c63e8f4112934e0432e7cb120ad379705b6bd6f69c900fff4
3
+ size 138319274
olmo_core/model_and_optim/__0_7.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f5377b1855852505822287811f35dca870bd9a89711c6dd91e053917cfc798
3
+ size 137902754
olmo_core/model_and_optim/__0_8.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6943898c5bb21c2b2249dad0168defbfbdcb63a7ad0973d751b468ec07d3f40
3
+ size 137902754
olmo_core/model_and_optim/__0_9.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46b7bc5207027ea7fd5d98242a338a2bc0c70dd3350d2e39851ca7e6688dfc1
3
+ size 138054699
olmo_core/model_and_optim/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e30b56e0f4cdea3fcad9e636a0131caf679e2a5d66ad69fca0d60608001ce3
3
+ size 137975051
olmo_core/model_and_optim/__1_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f19ba4c47b4093ee13f16632f9be4f04af4f068b7462db621139bbf1c2ba2678
3
+ size 137975051
olmo_core/model_and_optim/__1_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd8a8524fca46aea6135faec75282ecae3ecc451b0f693614192f0b8a19a59c1
3
+ size 137667506
olmo_core/model_and_optim/__1_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf27a46ff7598b81b3bd5f8663147542d7a16d6d31a44ed20968d06ff2b334c2
3
+ size 137667506
olmo_core/model_and_optim/__1_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7de5fbb6ddd3855373b1311eb09827f7bce000b9cec11255f10ee17fcbf05eb
3
+ size 137676064
olmo_core/model_and_optim/__1_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ec75d600a68102f796d9d8fce2fdb0b9898dff43595cad570b60efcdb269b0
3
+ size 137676064
olmo_core/model_and_optim/__1_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b66435c1ed78293f1af1f31bedbd97033cc797597391ab045bc584d58797f58
3
+ size 137677641
olmo_core/model_and_optim/__1_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff36c0a40ac9b5137b10958759d3c3eb7fe1f64d0d9699ef27f44d1a7104ee43
3
+ size 137677641
olmo_core/model_and_optim/__1_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb835fa07e895daeef6b90d4528fe24de85f1d3950db6b123342594bcd41387
3
+ size 137975051
olmo_core/model_and_optim/__1_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67679cdff1de9eb8b5574a2941ada5f5d98e7e3f7d8d8fac8cf6e934879db0aa
3
+ size 138198775
olmo_core/model_and_optim/__1_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27723cefd17c891e7d891610acffeaf7ac2e8d31f109118b92bfee9c55016ba0
3
+ size 138070857
olmo_core/model_and_optim/__1_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad447e78e2efadf7eab55bac518eb88714645b477a93f222048b5a856f7b15c0
3
+ size 138070857
olmo_core/model_and_optim/__1_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8efabe23bc8209221a31ec2c2c0dd0ac4a24444cfb53eea1423d402406891cb7
3
+ size 138070857
olmo_core/model_and_optim/__1_7.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:340d3f0e71d0e35e23a7a34c035339edfecb1268b9fdbcb6370e18be2754adc0
3
+ size 137654337
olmo_core/model_and_optim/__1_8.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b4d3f88074fd110a32ca232d5264a3848f8caa11f5334ff71c74df8e999d00
3
+ size 137654337
olmo_core/model_and_optim/__1_9.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118cdbe40e52906b81456b66df6d4a5743a9183f6dffc9afbba01a596a547faa
3
+ size 137667506
olmo_core/model_and_optim/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f6a41c5048cde6511c0b50bba5684c39f6922fafb5b11931e01a2a4160d8604
3
+ size 137975051
olmo_core/model_and_optim/__2_1.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:328dd39d60e3663225c6c6fdb38e9aa600426dca217c0d962208e7af954d49f1
3
+ size 137975051
olmo_core/model_and_optim/__2_10.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0578b542929035283b915bb5ec4883c42b9dd801f7988b3bef83033816d9ed35
3
+ size 137667506
olmo_core/model_and_optim/__2_11.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da49896618a8f8303b681ec064c6355088b58fafd33b4c62b6cc740f1038851f
3
+ size 137667506
olmo_core/model_and_optim/__2_12.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c79ff34c25d25fe399d9a289fcf6d1031142a88d62c50dc50b150aa02aa4956
3
+ size 137676064
olmo_core/model_and_optim/__2_13.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb401a50823a9ed8038e1968d9a3e222d05509c33079c2b8b16e282cd67f5b41
3
+ size 137676064
olmo_core/model_and_optim/__2_14.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb63c9c083f627e150c041a2d3de9282d6f643dae1e95e1a46691684fe406459
3
+ size 137677641
olmo_core/model_and_optim/__2_15.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1205969c3caa7aaf7073f5afb49061fc7d646f86704cd4fab2caf44d28c00a6
3
+ size 137677641
olmo_core/model_and_optim/__2_2.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c4181a8c59c612f46ff9ce6520da3c74f841bc20a137ddac9b2d2e5f3f7690
3
+ size 137975051
olmo_core/model_and_optim/__2_3.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94dd1a66905a5908f094413497af69d448d1d38159a2f97dfe3e78143f892c14
3
+ size 138198775
olmo_core/model_and_optim/__2_4.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daf72d44ed90913fce12cfbe2626044c68edf87b14f5ebb3a48eb77591a80447
3
+ size 138070857
olmo_core/model_and_optim/__2_5.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fcb095aa0987410995ae325c87eb5e01bcdf0dfa35eae629e1ef2a11a76b3f
3
+ size 138070857
olmo_core/model_and_optim/__2_6.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8c5f22aabf49f0afc77bd3871ad5939f429d088814ede8bd602ae7f9e52055
3
+ size 138070857
olmo_core/model_and_optim/__2_7.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc9faf4b47e1ea02e5a1b0fa8e8cddea7ac92bda6d230945722d594ece8b82bc
3
+ size 137654337