sha000 commited on
Commit
a00487c
·
verified ·
1 Parent(s): 6c5e03c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "dtype": "float32",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "transformers_version": "4.57.6",
51
+ "use_bidirectional_attention": false,
52
+ "use_cache": true,
53
+ "vocab_size": 262144
54
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1
7
+ ],
8
+ "pad_token_id": 0,
9
+ "top_k": 64,
10
+ "top_p": 0.95,
11
+ "transformers_version": "4.57.6"
12
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3b88c64f18b372c87e056f71ae2992d35beffee75489de18eaabac1e7ac3f8
3
+ size 1072419256
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4b03c9cd20d14da9630165d32d9ccef73b2882fdc1aa75b34614214d8c4763
3
+ size 2144987083
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c66988972fe7d155c16e4cf837db4d3fc078e095271b2271e57d816b6069445
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,986 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 939,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.471254324913025,
14
+ "epoch": 0.03194888178913738,
15
+ "grad_norm": 8.043270111083984,
16
+ "learning_rate": 1.9808306709265177e-05,
17
+ "loss": 1.1636,
18
+ "mean_token_accuracy": 0.6890625,
19
+ "num_tokens": 12480.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.90457843542099,
24
+ "epoch": 0.06389776357827476,
25
+ "grad_norm": 10.676708221435547,
26
+ "learning_rate": 1.959531416400426e-05,
27
+ "loss": 0.3864,
28
+ "mean_token_accuracy": 0.8375,
29
+ "num_tokens": 24960.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.027526319026947,
34
+ "epoch": 0.09584664536741214,
35
+ "grad_norm": 5.594727039337158,
36
+ "learning_rate": 1.9382321618743344e-05,
37
+ "loss": 0.3759,
38
+ "mean_token_accuracy": 0.83671875,
39
+ "num_tokens": 37440.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.192073893547058,
44
+ "epoch": 0.12779552715654952,
45
+ "grad_norm": 3.208804130554199,
46
+ "learning_rate": 1.916932907348243e-05,
47
+ "loss": 0.3341,
48
+ "mean_token_accuracy": 0.85703125,
49
+ "num_tokens": 49920.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 1.1430011987686157,
54
+ "epoch": 0.1597444089456869,
55
+ "grad_norm": 19.184051513671875,
56
+ "learning_rate": 1.895633652822151e-05,
57
+ "loss": 0.3429,
58
+ "mean_token_accuracy": 0.85078125,
59
+ "num_tokens": 62400.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 1.1257157444953918,
64
+ "epoch": 0.19169329073482427,
65
+ "grad_norm": 5.893524646759033,
66
+ "learning_rate": 1.87433439829606e-05,
67
+ "loss": 0.2334,
68
+ "mean_token_accuracy": 0.8875,
69
+ "num_tokens": 74880.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.992573595046997,
74
+ "epoch": 0.22364217252396165,
75
+ "grad_norm": 15.351304054260254,
76
+ "learning_rate": 1.8530351437699682e-05,
77
+ "loss": 0.1187,
78
+ "mean_token_accuracy": 0.96328125,
79
+ "num_tokens": 87360.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.805773138999939,
84
+ "epoch": 0.25559105431309903,
85
+ "grad_norm": 74.02106475830078,
86
+ "learning_rate": 1.8317358892438765e-05,
87
+ "loss": 0.1924,
88
+ "mean_token_accuracy": 0.93125,
89
+ "num_tokens": 99840.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.8376959323883056,
94
+ "epoch": 0.28753993610223644,
95
+ "grad_norm": 9.446106910705566,
96
+ "learning_rate": 1.8104366347177852e-05,
97
+ "loss": 0.0837,
98
+ "mean_token_accuracy": 0.96796875,
99
+ "num_tokens": 112320.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.6883749544620514,
104
+ "epoch": 0.3194888178913738,
105
+ "grad_norm": 29.95865249633789,
106
+ "learning_rate": 1.7891373801916932e-05,
107
+ "loss": 0.0712,
108
+ "mean_token_accuracy": 0.9671875,
109
+ "num_tokens": 124800.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.5861309468746185,
114
+ "epoch": 0.3514376996805112,
115
+ "grad_norm": 0.981063723564148,
116
+ "learning_rate": 1.767838125665602e-05,
117
+ "loss": 0.0339,
118
+ "mean_token_accuracy": 0.9890625,
119
+ "num_tokens": 137280.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 0.6167496562004089,
124
+ "epoch": 0.38338658146964855,
125
+ "grad_norm": 0.3446030020713806,
126
+ "learning_rate": 1.7465388711395103e-05,
127
+ "loss": 0.019,
128
+ "mean_token_accuracy": 0.9953125,
129
+ "num_tokens": 149760.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 0.6116879105567932,
134
+ "epoch": 0.41533546325878595,
135
+ "grad_norm": 7.9384846687316895,
136
+ "learning_rate": 1.7252396166134186e-05,
137
+ "loss": 0.0179,
138
+ "mean_token_accuracy": 0.99453125,
139
+ "num_tokens": 162240.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 0.5835295200347901,
144
+ "epoch": 0.4472843450479233,
145
+ "grad_norm": 15.288229942321777,
146
+ "learning_rate": 1.7039403620873273e-05,
147
+ "loss": 0.0144,
148
+ "mean_token_accuracy": 0.99375,
149
+ "num_tokens": 174720.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 0.5895743370056152,
154
+ "epoch": 0.4792332268370607,
155
+ "grad_norm": 8.906089782714844,
156
+ "learning_rate": 1.6826411075612353e-05,
157
+ "loss": 0.0277,
158
+ "mean_token_accuracy": 0.9953125,
159
+ "num_tokens": 187200.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 0.6350247144699097,
164
+ "epoch": 0.5111821086261981,
165
+ "grad_norm": 12.155186653137207,
166
+ "learning_rate": 1.661341853035144e-05,
167
+ "loss": 0.009,
168
+ "mean_token_accuracy": 0.9984375,
169
+ "num_tokens": 199680.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 0.6250557661056518,
174
+ "epoch": 0.5431309904153354,
175
+ "grad_norm": 1.7694993019104004,
176
+ "learning_rate": 1.6400425985090524e-05,
177
+ "loss": 0.0297,
178
+ "mean_token_accuracy": 0.9890625,
179
+ "num_tokens": 212160.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 0.6015866935253144,
184
+ "epoch": 0.5750798722044729,
185
+ "grad_norm": 24.392311096191406,
186
+ "learning_rate": 1.6187433439829607e-05,
187
+ "loss": 0.0199,
188
+ "mean_token_accuracy": 0.9921875,
189
+ "num_tokens": 224640.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 0.6133609235286712,
194
+ "epoch": 0.6070287539936102,
195
+ "grad_norm": 0.03015461377799511,
196
+ "learning_rate": 1.5974440894568694e-05,
197
+ "loss": 0.0131,
198
+ "mean_token_accuracy": 0.996875,
199
+ "num_tokens": 237120.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 0.6456725597381592,
204
+ "epoch": 0.6389776357827476,
205
+ "grad_norm": 18.586185455322266,
206
+ "learning_rate": 1.5761448349307774e-05,
207
+ "loss": 0.0133,
208
+ "mean_token_accuracy": 0.99609375,
209
+ "num_tokens": 249600.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 0.6170299232006073,
214
+ "epoch": 0.670926517571885,
215
+ "grad_norm": 49.949588775634766,
216
+ "learning_rate": 1.554845580404686e-05,
217
+ "loss": 0.0231,
218
+ "mean_token_accuracy": 0.9921875,
219
+ "num_tokens": 262080.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.5970291554927826,
224
+ "epoch": 0.7028753993610224,
225
+ "grad_norm": 0.9214933514595032,
226
+ "learning_rate": 1.5335463258785944e-05,
227
+ "loss": 0.0179,
228
+ "mean_token_accuracy": 0.990625,
229
+ "num_tokens": 274560.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.580757600069046,
234
+ "epoch": 0.7348242811501597,
235
+ "grad_norm": 8.092296600341797,
236
+ "learning_rate": 1.5122470713525028e-05,
237
+ "loss": 0.0349,
238
+ "mean_token_accuracy": 0.98984375,
239
+ "num_tokens": 287040.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.5787507772445679,
244
+ "epoch": 0.7667731629392971,
245
+ "grad_norm": 10.055787086486816,
246
+ "learning_rate": 1.4909478168264111e-05,
247
+ "loss": 0.0065,
248
+ "mean_token_accuracy": 0.99609375,
249
+ "num_tokens": 299520.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.5758024156093597,
254
+ "epoch": 0.7987220447284346,
255
+ "grad_norm": 61.38268280029297,
256
+ "learning_rate": 1.4696485623003197e-05,
257
+ "loss": 0.0424,
258
+ "mean_token_accuracy": 0.9875,
259
+ "num_tokens": 312000.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 0.6395232379436493,
264
+ "epoch": 0.8306709265175719,
265
+ "grad_norm": 2.0960898399353027,
266
+ "learning_rate": 1.4483493077742282e-05,
267
+ "loss": 0.1762,
268
+ "mean_token_accuracy": 0.95,
269
+ "num_tokens": 324480.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 0.6194823384284973,
274
+ "epoch": 0.8626198083067093,
275
+ "grad_norm": 2.2915937900543213,
276
+ "learning_rate": 1.4270500532481364e-05,
277
+ "loss": 0.0054,
278
+ "mean_token_accuracy": 0.9984375,
279
+ "num_tokens": 336960.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 0.553073239326477,
284
+ "epoch": 0.8945686900958466,
285
+ "grad_norm": 0.6241616606712341,
286
+ "learning_rate": 1.4057507987220449e-05,
287
+ "loss": 0.0121,
288
+ "mean_token_accuracy": 0.99765625,
289
+ "num_tokens": 349440.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 0.588155323266983,
294
+ "epoch": 0.9265175718849841,
295
+ "grad_norm": 0.8865500688552856,
296
+ "learning_rate": 1.3844515441959532e-05,
297
+ "loss": 0.0044,
298
+ "mean_token_accuracy": 0.99765625,
299
+ "num_tokens": 361920.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 0.6138588011264801,
304
+ "epoch": 0.9584664536741214,
305
+ "grad_norm": 0.16805018484592438,
306
+ "learning_rate": 1.3631522896698617e-05,
307
+ "loss": 0.001,
308
+ "mean_token_accuracy": 0.99921875,
309
+ "num_tokens": 374400.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 0.6157549917697906,
314
+ "epoch": 0.9904153354632588,
315
+ "grad_norm": 11.855587005615234,
316
+ "learning_rate": 1.3418530351437703e-05,
317
+ "loss": 0.0051,
318
+ "mean_token_accuracy": 0.9984375,
319
+ "num_tokens": 386880.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "epoch": 1.0,
324
+ "eval_entropy": 0.5923710940759394,
325
+ "eval_loss": 0.01570574752986431,
326
+ "eval_mean_token_accuracy": 0.995253164556962,
327
+ "eval_num_tokens": 389844.0,
328
+ "eval_runtime": 13.2845,
329
+ "eval_samples_per_second": 188.189,
330
+ "eval_steps_per_second": 5.947,
331
+ "step": 313
332
+ },
333
+ {
334
+ "entropy": 0.5902234852313996,
335
+ "epoch": 1.0223642172523961,
336
+ "grad_norm": 0.026217741891741753,
337
+ "learning_rate": 1.3205537806176784e-05,
338
+ "loss": 0.0056,
339
+ "mean_token_accuracy": 0.9984375,
340
+ "num_tokens": 398580.0,
341
+ "step": 320
342
+ },
343
+ {
344
+ "entropy": 0.562452882528305,
345
+ "epoch": 1.0543130990415335,
346
+ "grad_norm": 22.588623046875,
347
+ "learning_rate": 1.299254526091587e-05,
348
+ "loss": 0.0152,
349
+ "mean_token_accuracy": 0.9921875,
350
+ "num_tokens": 411060.0,
351
+ "step": 330
352
+ },
353
+ {
354
+ "entropy": 0.553607851266861,
355
+ "epoch": 1.0862619808306708,
356
+ "grad_norm": 2.0158348083496094,
357
+ "learning_rate": 1.2779552715654953e-05,
358
+ "loss": 0.0202,
359
+ "mean_token_accuracy": 0.99453125,
360
+ "num_tokens": 423540.0,
361
+ "step": 340
362
+ },
363
+ {
364
+ "entropy": 0.602299690246582,
365
+ "epoch": 1.1182108626198084,
366
+ "grad_norm": 0.03288736939430237,
367
+ "learning_rate": 1.2566560170394038e-05,
368
+ "loss": 0.0265,
369
+ "mean_token_accuracy": 0.9921875,
370
+ "num_tokens": 436020.0,
371
+ "step": 350
372
+ },
373
+ {
374
+ "entropy": 0.5916118025779724,
375
+ "epoch": 1.1501597444089458,
376
+ "grad_norm": 0.2714002728462219,
377
+ "learning_rate": 1.235356762513312e-05,
378
+ "loss": 0.0049,
379
+ "mean_token_accuracy": 0.996875,
380
+ "num_tokens": 448500.0,
381
+ "step": 360
382
+ },
383
+ {
384
+ "entropy": 0.6014433860778808,
385
+ "epoch": 1.182108626198083,
386
+ "grad_norm": 0.8565823435783386,
387
+ "learning_rate": 1.2140575079872205e-05,
388
+ "loss": 0.0029,
389
+ "mean_token_accuracy": 0.99921875,
390
+ "num_tokens": 460980.0,
391
+ "step": 370
392
+ },
393
+ {
394
+ "entropy": 0.558870005607605,
395
+ "epoch": 1.2140575079872205,
396
+ "grad_norm": 0.4954104721546173,
397
+ "learning_rate": 1.192758253461129e-05,
398
+ "loss": 0.0021,
399
+ "mean_token_accuracy": 0.99921875,
400
+ "num_tokens": 473460.0,
401
+ "step": 380
402
+ },
403
+ {
404
+ "entropy": 0.5449843347072602,
405
+ "epoch": 1.2460063897763578,
406
+ "grad_norm": 0.0184471495449543,
407
+ "learning_rate": 1.1714589989350374e-05,
408
+ "loss": 0.0012,
409
+ "mean_token_accuracy": 1.0,
410
+ "num_tokens": 485940.0,
411
+ "step": 390
412
+ },
413
+ {
414
+ "entropy": 0.5302130222320557,
415
+ "epoch": 1.2779552715654952,
416
+ "grad_norm": 0.5405293107032776,
417
+ "learning_rate": 1.1501597444089459e-05,
418
+ "loss": 0.0021,
419
+ "mean_token_accuracy": 0.99921875,
420
+ "num_tokens": 498420.0,
421
+ "step": 400
422
+ },
423
+ {
424
+ "entropy": 0.5257469773292541,
425
+ "epoch": 1.3099041533546325,
426
+ "grad_norm": 14.052752494812012,
427
+ "learning_rate": 1.1288604898828541e-05,
428
+ "loss": 0.0054,
429
+ "mean_token_accuracy": 0.9984375,
430
+ "num_tokens": 510900.0,
431
+ "step": 410
432
+ },
433
+ {
434
+ "entropy": 0.5204551070928574,
435
+ "epoch": 1.34185303514377,
436
+ "grad_norm": 0.041872043162584305,
437
+ "learning_rate": 1.1075612353567626e-05,
438
+ "loss": 0.0028,
439
+ "mean_token_accuracy": 0.9984375,
440
+ "num_tokens": 523380.0,
441
+ "step": 420
442
+ },
443
+ {
444
+ "entropy": 0.5394512295722962,
445
+ "epoch": 1.3738019169329074,
446
+ "grad_norm": 0.06288646906614304,
447
+ "learning_rate": 1.086261980830671e-05,
448
+ "loss": 0.0003,
449
+ "mean_token_accuracy": 1.0,
450
+ "num_tokens": 535860.0,
451
+ "step": 430
452
+ },
453
+ {
454
+ "entropy": 0.5291238784790039,
455
+ "epoch": 1.4057507987220448,
456
+ "grad_norm": 0.0030913001392036676,
457
+ "learning_rate": 1.0649627263045795e-05,
458
+ "loss": 0.0007,
459
+ "mean_token_accuracy": 0.99921875,
460
+ "num_tokens": 548340.0,
461
+ "step": 440
462
+ },
463
+ {
464
+ "entropy": 0.5285849571228027,
465
+ "epoch": 1.4376996805111821,
466
+ "grad_norm": 1.8844810724258423,
467
+ "learning_rate": 1.043663471778488e-05,
468
+ "loss": 0.0003,
469
+ "mean_token_accuracy": 1.0,
470
+ "num_tokens": 560820.0,
471
+ "step": 450
472
+ },
473
+ {
474
+ "entropy": 0.5385882794857025,
475
+ "epoch": 1.4696485623003195,
476
+ "grad_norm": 0.11690080910921097,
477
+ "learning_rate": 1.0223642172523962e-05,
478
+ "loss": 0.0003,
479
+ "mean_token_accuracy": 1.0,
480
+ "num_tokens": 573300.0,
481
+ "step": 460
482
+ },
483
+ {
484
+ "entropy": 0.5441839516162872,
485
+ "epoch": 1.5015974440894568,
486
+ "grad_norm": 0.0011391988955438137,
487
+ "learning_rate": 1.0010649627263047e-05,
488
+ "loss": 0.0007,
489
+ "mean_token_accuracy": 0.99921875,
490
+ "num_tokens": 585780.0,
491
+ "step": 470
492
+ },
493
+ {
494
+ "entropy": 0.5367125928401947,
495
+ "epoch": 1.5335463258785942,
496
+ "grad_norm": 0.595458984375,
497
+ "learning_rate": 9.79765708200213e-06,
498
+ "loss": 0.0002,
499
+ "mean_token_accuracy": 1.0,
500
+ "num_tokens": 598260.0,
501
+ "step": 480
502
+ },
503
+ {
504
+ "entropy": 0.5380379557609558,
505
+ "epoch": 1.5654952076677318,
506
+ "grad_norm": 0.0110127292573452,
507
+ "learning_rate": 9.584664536741216e-06,
508
+ "loss": 0.0006,
509
+ "mean_token_accuracy": 1.0,
510
+ "num_tokens": 610740.0,
511
+ "step": 490
512
+ },
513
+ {
514
+ "entropy": 0.5656424820423126,
515
+ "epoch": 1.5974440894568689,
516
+ "grad_norm": 0.018918879330158234,
517
+ "learning_rate": 9.3716719914803e-06,
518
+ "loss": 0.0002,
519
+ "mean_token_accuracy": 1.0,
520
+ "num_tokens": 623220.0,
521
+ "step": 500
522
+ },
523
+ {
524
+ "entropy": 0.5534205734729767,
525
+ "epoch": 1.6293929712460065,
526
+ "grad_norm": 0.0005970252677798271,
527
+ "learning_rate": 9.158679446219383e-06,
528
+ "loss": 0.0002,
529
+ "mean_token_accuracy": 1.0,
530
+ "num_tokens": 635700.0,
531
+ "step": 510
532
+ },
533
+ {
534
+ "entropy": 0.5591952800750732,
535
+ "epoch": 1.6613418530351438,
536
+ "grad_norm": 0.23496565222740173,
537
+ "learning_rate": 8.945686900958466e-06,
538
+ "loss": 0.0007,
539
+ "mean_token_accuracy": 1.0,
540
+ "num_tokens": 648180.0,
541
+ "step": 520
542
+ },
543
+ {
544
+ "entropy": 0.5553164839744568,
545
+ "epoch": 1.6932907348242812,
546
+ "grad_norm": 0.015620424412190914,
547
+ "learning_rate": 8.732694355697551e-06,
548
+ "loss": 0.0006,
549
+ "mean_token_accuracy": 1.0,
550
+ "num_tokens": 660660.0,
551
+ "step": 530
552
+ },
553
+ {
554
+ "entropy": 0.5558278143405915,
555
+ "epoch": 1.7252396166134185,
556
+ "grad_norm": 0.013437892310321331,
557
+ "learning_rate": 8.519701810436637e-06,
558
+ "loss": 0.0,
559
+ "mean_token_accuracy": 1.0,
560
+ "num_tokens": 673140.0,
561
+ "step": 540
562
+ },
563
+ {
564
+ "entropy": 0.5494430124759674,
565
+ "epoch": 1.7571884984025559,
566
+ "grad_norm": 0.05179116502404213,
567
+ "learning_rate": 8.30670926517572e-06,
568
+ "loss": 0.0,
569
+ "mean_token_accuracy": 1.0,
570
+ "num_tokens": 685620.0,
571
+ "step": 550
572
+ },
573
+ {
574
+ "entropy": 0.5591476142406464,
575
+ "epoch": 1.7891373801916934,
576
+ "grad_norm": 0.001572166453115642,
577
+ "learning_rate": 8.093716719914804e-06,
578
+ "loss": 0.0,
579
+ "mean_token_accuracy": 1.0,
580
+ "num_tokens": 698100.0,
581
+ "step": 560
582
+ },
583
+ {
584
+ "entropy": 0.5543432533740997,
585
+ "epoch": 1.8210862619808306,
586
+ "grad_norm": 0.0029468077700585127,
587
+ "learning_rate": 7.880724174653887e-06,
588
+ "loss": 0.0,
589
+ "mean_token_accuracy": 1.0,
590
+ "num_tokens": 710580.0,
591
+ "step": 570
592
+ },
593
+ {
594
+ "entropy": 0.5561375498771668,
595
+ "epoch": 1.8530351437699681,
596
+ "grad_norm": 7.772324897814542e-05,
597
+ "learning_rate": 7.667731629392972e-06,
598
+ "loss": 0.0,
599
+ "mean_token_accuracy": 1.0,
600
+ "num_tokens": 723060.0,
601
+ "step": 580
602
+ },
603
+ {
604
+ "entropy": 0.554932814836502,
605
+ "epoch": 1.8849840255591053,
606
+ "grad_norm": 0.023860394954681396,
607
+ "learning_rate": 7.454739084132056e-06,
608
+ "loss": 0.0,
609
+ "mean_token_accuracy": 1.0,
610
+ "num_tokens": 735540.0,
611
+ "step": 590
612
+ },
613
+ {
614
+ "entropy": 0.5592103660106659,
615
+ "epoch": 1.9169329073482428,
616
+ "grad_norm": 6.846313772257417e-05,
617
+ "learning_rate": 7.241746538871141e-06,
618
+ "loss": 0.0,
619
+ "mean_token_accuracy": 1.0,
620
+ "num_tokens": 748020.0,
621
+ "step": 600
622
+ },
623
+ {
624
+ "entropy": 0.5566479444503785,
625
+ "epoch": 1.9488817891373802,
626
+ "grad_norm": 0.00017782168288249522,
627
+ "learning_rate": 7.028753993610224e-06,
628
+ "loss": 0.0,
629
+ "mean_token_accuracy": 1.0,
630
+ "num_tokens": 760500.0,
631
+ "step": 610
632
+ },
633
+ {
634
+ "entropy": 0.560238641500473,
635
+ "epoch": 1.9808306709265175,
636
+ "grad_norm": 0.0009979789610952139,
637
+ "learning_rate": 6.815761448349309e-06,
638
+ "loss": 0.0,
639
+ "mean_token_accuracy": 1.0,
640
+ "num_tokens": 772980.0,
641
+ "step": 620
642
+ },
643
+ {
644
+ "epoch": 2.0,
645
+ "eval_entropy": 0.5581134014491793,
646
+ "eval_loss": 6.910775482538156e-06,
647
+ "eval_mean_token_accuracy": 1.0,
648
+ "eval_num_tokens": 779688.0,
649
+ "eval_runtime": 13.4056,
650
+ "eval_samples_per_second": 186.489,
651
+ "eval_steps_per_second": 5.893,
652
+ "step": 626
653
+ },
654
+ {
655
+ "entropy": 0.5542679131031036,
656
+ "epoch": 2.012779552715655,
657
+ "grad_norm": 0.0001906445249915123,
658
+ "learning_rate": 6.602768903088392e-06,
659
+ "loss": 0.0,
660
+ "mean_token_accuracy": 1.0,
661
+ "num_tokens": 784680.0,
662
+ "step": 630
663
+ },
664
+ {
665
+ "entropy": 0.5601355612277985,
666
+ "epoch": 2.0447284345047922,
667
+ "grad_norm": 8.866995631251484e-05,
668
+ "learning_rate": 6.3897763578274765e-06,
669
+ "loss": 0.0,
670
+ "mean_token_accuracy": 1.0,
671
+ "num_tokens": 797160.0,
672
+ "step": 640
673
+ },
674
+ {
675
+ "entropy": 0.5562943339347839,
676
+ "epoch": 2.07667731629393,
677
+ "grad_norm": 4.927597183268517e-05,
678
+ "learning_rate": 6.17678381256656e-06,
679
+ "loss": 0.0,
680
+ "mean_token_accuracy": 1.0,
681
+ "num_tokens": 809640.0,
682
+ "step": 650
683
+ },
684
+ {
685
+ "entropy": 0.5592762529850006,
686
+ "epoch": 2.108626198083067,
687
+ "grad_norm": 0.0003652777522802353,
688
+ "learning_rate": 5.963791267305645e-06,
689
+ "loss": 0.0,
690
+ "mean_token_accuracy": 1.0,
691
+ "num_tokens": 822120.0,
692
+ "step": 660
693
+ },
694
+ {
695
+ "entropy": 0.560578465461731,
696
+ "epoch": 2.1405750798722045,
697
+ "grad_norm": 0.0005100357229821384,
698
+ "learning_rate": 5.7507987220447296e-06,
699
+ "loss": 0.0,
700
+ "mean_token_accuracy": 1.0,
701
+ "num_tokens": 834600.0,
702
+ "step": 670
703
+ },
704
+ {
705
+ "entropy": 0.5573894202709198,
706
+ "epoch": 2.1725239616613417,
707
+ "grad_norm": 0.0007649549515917897,
708
+ "learning_rate": 5.537806176783813e-06,
709
+ "loss": 0.0,
710
+ "mean_token_accuracy": 1.0,
711
+ "num_tokens": 847080.0,
712
+ "step": 680
713
+ },
714
+ {
715
+ "entropy": 0.5623638391494751,
716
+ "epoch": 2.2044728434504792,
717
+ "grad_norm": 0.007040001451969147,
718
+ "learning_rate": 5.324813631522897e-06,
719
+ "loss": 0.0,
720
+ "mean_token_accuracy": 1.0,
721
+ "num_tokens": 859560.0,
722
+ "step": 690
723
+ },
724
+ {
725
+ "entropy": 0.5608678042888642,
726
+ "epoch": 2.236421725239617,
727
+ "grad_norm": 0.0008389271097257733,
728
+ "learning_rate": 5.111821086261981e-06,
729
+ "loss": 0.0,
730
+ "mean_token_accuracy": 1.0,
731
+ "num_tokens": 872040.0,
732
+ "step": 700
733
+ },
734
+ {
735
+ "entropy": 0.562554806470871,
736
+ "epoch": 2.268370607028754,
737
+ "grad_norm": 0.0008370543946512043,
738
+ "learning_rate": 4.898828541001065e-06,
739
+ "loss": 0.0,
740
+ "mean_token_accuracy": 1.0,
741
+ "num_tokens": 884520.0,
742
+ "step": 710
743
+ },
744
+ {
745
+ "entropy": 0.5613407075405121,
746
+ "epoch": 2.3003194888178915,
747
+ "grad_norm": 3.100551475654356e-05,
748
+ "learning_rate": 4.68583599574015e-06,
749
+ "loss": 0.0,
750
+ "mean_token_accuracy": 1.0,
751
+ "num_tokens": 897000.0,
752
+ "step": 720
753
+ },
754
+ {
755
+ "entropy": 0.5588717699050904,
756
+ "epoch": 2.3322683706070286,
757
+ "grad_norm": 0.0035649905912578106,
758
+ "learning_rate": 4.472843450479233e-06,
759
+ "loss": 0.0,
760
+ "mean_token_accuracy": 1.0,
761
+ "num_tokens": 909480.0,
762
+ "step": 730
763
+ },
764
+ {
765
+ "entropy": 0.5609096884727478,
766
+ "epoch": 2.364217252396166,
767
+ "grad_norm": 0.0003579799085855484,
768
+ "learning_rate": 4.259850905218318e-06,
769
+ "loss": 0.0,
770
+ "mean_token_accuracy": 1.0,
771
+ "num_tokens": 921960.0,
772
+ "step": 740
773
+ },
774
+ {
775
+ "entropy": 0.5581632852554321,
776
+ "epoch": 2.3961661341853033,
777
+ "grad_norm": 0.00018412918143440038,
778
+ "learning_rate": 4.046858359957402e-06,
779
+ "loss": 0.0,
780
+ "mean_token_accuracy": 1.0,
781
+ "num_tokens": 934440.0,
782
+ "step": 750
783
+ },
784
+ {
785
+ "entropy": 0.5592087864875793,
786
+ "epoch": 2.428115015974441,
787
+ "grad_norm": 0.001302594318985939,
788
+ "learning_rate": 3.833865814696486e-06,
789
+ "loss": 0.0,
790
+ "mean_token_accuracy": 1.0,
791
+ "num_tokens": 946920.0,
792
+ "step": 760
793
+ },
794
+ {
795
+ "entropy": 0.5602552175521851,
796
+ "epoch": 2.460063897763578,
797
+ "grad_norm": 0.0001967909629456699,
798
+ "learning_rate": 3.6208732694355704e-06,
799
+ "loss": 0.0,
800
+ "mean_token_accuracy": 1.0,
801
+ "num_tokens": 959400.0,
802
+ "step": 770
803
+ },
804
+ {
805
+ "entropy": 0.5587169051170349,
806
+ "epoch": 2.4920127795527156,
807
+ "grad_norm": 3.6476201785262674e-05,
808
+ "learning_rate": 3.4078807241746544e-06,
809
+ "loss": 0.0,
810
+ "mean_token_accuracy": 1.0,
811
+ "num_tokens": 971880.0,
812
+ "step": 780
813
+ },
814
+ {
815
+ "entropy": 0.5619259059429169,
816
+ "epoch": 2.523961661341853,
817
+ "grad_norm": 0.00010852525883819908,
818
+ "learning_rate": 3.1948881789137383e-06,
819
+ "loss": 0.0,
820
+ "mean_token_accuracy": 1.0,
821
+ "num_tokens": 984360.0,
822
+ "step": 790
823
+ },
824
+ {
825
+ "entropy": 0.5560723125934601,
826
+ "epoch": 2.5559105431309903,
827
+ "grad_norm": 7.974612526595592e-05,
828
+ "learning_rate": 2.9818956336528226e-06,
829
+ "loss": 0.0,
830
+ "mean_token_accuracy": 1.0,
831
+ "num_tokens": 996840.0,
832
+ "step": 800
833
+ },
834
+ {
835
+ "entropy": 0.5587869763374329,
836
+ "epoch": 2.587859424920128,
837
+ "grad_norm": 0.0005656637367792428,
838
+ "learning_rate": 2.7689030883919065e-06,
839
+ "loss": 0.0,
840
+ "mean_token_accuracy": 1.0,
841
+ "num_tokens": 1009320.0,
842
+ "step": 810
843
+ },
844
+ {
845
+ "entropy": 0.5632799625396728,
846
+ "epoch": 2.619808306709265,
847
+ "grad_norm": 6.125601794337854e-05,
848
+ "learning_rate": 2.5559105431309904e-06,
849
+ "loss": 0.0,
850
+ "mean_token_accuracy": 1.0,
851
+ "num_tokens": 1021800.0,
852
+ "step": 820
853
+ },
854
+ {
855
+ "entropy": 0.558579832315445,
856
+ "epoch": 2.6517571884984026,
857
+ "grad_norm": 0.0008585830801166594,
858
+ "learning_rate": 2.342917997870075e-06,
859
+ "loss": 0.0,
860
+ "mean_token_accuracy": 1.0,
861
+ "num_tokens": 1034280.0,
862
+ "step": 830
863
+ },
864
+ {
865
+ "entropy": 0.5579914152622223,
866
+ "epoch": 2.68370607028754,
867
+ "grad_norm": 5.5771433835616335e-05,
868
+ "learning_rate": 2.129925452609159e-06,
869
+ "loss": 0.0,
870
+ "mean_token_accuracy": 1.0,
871
+ "num_tokens": 1046760.0,
872
+ "step": 840
873
+ },
874
+ {
875
+ "entropy": 0.5599809646606445,
876
+ "epoch": 2.7156549520766773,
877
+ "grad_norm": 0.00012791369226761162,
878
+ "learning_rate": 1.916932907348243e-06,
879
+ "loss": 0.0,
880
+ "mean_token_accuracy": 1.0,
881
+ "num_tokens": 1059240.0,
882
+ "step": 850
883
+ },
884
+ {
885
+ "entropy": 0.5608228087425232,
886
+ "epoch": 2.747603833865815,
887
+ "grad_norm": 8.307035022880882e-05,
888
+ "learning_rate": 1.7039403620873272e-06,
889
+ "loss": 0.0,
890
+ "mean_token_accuracy": 1.0,
891
+ "num_tokens": 1071720.0,
892
+ "step": 860
893
+ },
894
+ {
895
+ "entropy": 0.5607754468917847,
896
+ "epoch": 2.779552715654952,
897
+ "grad_norm": 5.250581671134569e-05,
898
+ "learning_rate": 1.4909478168264113e-06,
899
+ "loss": 0.0,
900
+ "mean_token_accuracy": 1.0,
901
+ "num_tokens": 1084200.0,
902
+ "step": 870
903
+ },
904
+ {
905
+ "entropy": 0.5697051167488099,
906
+ "epoch": 2.8115015974440896,
907
+ "grad_norm": 0.0002477150410413742,
908
+ "learning_rate": 1.2779552715654952e-06,
909
+ "loss": 0.0,
910
+ "mean_token_accuracy": 1.0,
911
+ "num_tokens": 1096680.0,
912
+ "step": 880
913
+ },
914
+ {
915
+ "entropy": 0.5599372982978821,
916
+ "epoch": 2.8434504792332267,
917
+ "grad_norm": 9.851283539319411e-05,
918
+ "learning_rate": 1.0649627263045796e-06,
919
+ "loss": 0.0,
920
+ "mean_token_accuracy": 1.0,
921
+ "num_tokens": 1109160.0,
922
+ "step": 890
923
+ },
924
+ {
925
+ "entropy": 0.5631425619125366,
926
+ "epoch": 2.8753993610223643,
927
+ "grad_norm": 5.103146395413205e-05,
928
+ "learning_rate": 8.519701810436636e-07,
929
+ "loss": 0.0,
930
+ "mean_token_accuracy": 1.0,
931
+ "num_tokens": 1121640.0,
932
+ "step": 900
933
+ },
934
+ {
935
+ "entropy": 0.5630890011787415,
936
+ "epoch": 2.9073482428115014,
937
+ "grad_norm": 0.0011606919579207897,
938
+ "learning_rate": 6.389776357827476e-07,
939
+ "loss": 0.0,
940
+ "mean_token_accuracy": 1.0,
941
+ "num_tokens": 1134120.0,
942
+ "step": 910
943
+ },
944
+ {
945
+ "entropy": 0.5582900941371918,
946
+ "epoch": 2.939297124600639,
947
+ "grad_norm": 0.0002894483332056552,
948
+ "learning_rate": 4.259850905218318e-07,
949
+ "loss": 0.0,
950
+ "mean_token_accuracy": 1.0,
951
+ "num_tokens": 1146600.0,
952
+ "step": 920
953
+ },
954
+ {
955
+ "entropy": 0.5616473734378815,
956
+ "epoch": 2.9712460063897765,
957
+ "grad_norm": 4.966451888321899e-05,
958
+ "learning_rate": 2.129925452609159e-07,
959
+ "loss": 0.0008,
960
+ "mean_token_accuracy": 0.99921875,
961
+ "num_tokens": 1159080.0,
962
+ "step": 930
963
+ }
964
+ ],
965
+ "logging_steps": 10,
966
+ "max_steps": 939,
967
+ "num_input_tokens_seen": 0,
968
+ "num_train_epochs": 3,
969
+ "save_steps": 500,
970
+ "stateful_callbacks": {
971
+ "TrainerControl": {
972
+ "args": {
973
+ "should_epoch_stop": false,
974
+ "should_evaluate": false,
975
+ "should_log": false,
976
+ "should_save": true,
977
+ "should_training_stop": true
978
+ },
979
+ "attributes": {}
980
+ }
981
+ },
982
+ "total_flos": 704006916867072.0,
983
+ "train_batch_size": 32,
984
+ "trial_name": null,
985
+ "trial_params": null
986
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38c3ca7367bcc9f38dbe905e8479e53106f87f537886253c3a785e1a377c8c53
3
+ size 6481