sinem02 commited on
Commit
23f39e5
·
verified ·
1 Parent(s): 3d989a0

Upload Qwen2.5-Coder-1.5B-LoRA-Deep_training_logs.json

Browse files
Qwen2.5-Coder-1.5B-LoRA-Deep_training_logs.json ADDED
@@ -0,0 +1,1168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss":1.2008,
4
+ "grad_norm":0.3822754323,
5
+ "learning_rate":0.0001407407,
6
+ "entropy":1.0346003115,
7
+ "num_tokens":322124.0,
8
+ "mean_token_accuracy":0.7109046429,
9
+ "epoch":0.0673400673,
10
+ "step":20,
11
+ "eval_loss":null,
12
+ "eval_runtime":null,
13
+ "eval_samples_per_second":null,
14
+ "eval_steps_per_second":null,
15
+ "eval_entropy":null,
16
+ "eval_num_tokens":null,
17
+ "eval_mean_token_accuracy":null,
18
+ "train_runtime":null,
19
+ "train_samples_per_second":null,
20
+ "train_steps_per_second":null,
21
+ "total_flos":null,
22
+ "train_loss":null
23
+ },
24
+ {
25
+ "loss":0.9385,
26
+ "grad_norm":0.2103841156,
27
+ "learning_rate":0.0001999048,
28
+ "entropy":0.9494877957,
29
+ "num_tokens":642995.0,
30
+ "mean_token_accuracy":0.7539383888,
31
+ "epoch":0.1346801347,
32
+ "step":40,
33
+ "eval_loss":null,
34
+ "eval_runtime":null,
35
+ "eval_samples_per_second":null,
36
+ "eval_steps_per_second":null,
37
+ "eval_entropy":null,
38
+ "eval_num_tokens":null,
39
+ "eval_mean_token_accuracy":null,
40
+ "train_runtime":null,
41
+ "train_samples_per_second":null,
42
+ "train_steps_per_second":null,
43
+ "total_flos":null,
44
+ "train_loss":null
45
+ },
46
+ {
47
+ "loss":0.915,
48
+ "grad_norm":0.206428811,
49
+ "learning_rate":0.0001993238,
50
+ "entropy":0.9191693425,
51
+ "num_tokens":966396.0,
52
+ "mean_token_accuracy":0.7584572025,
53
+ "epoch":0.202020202,
54
+ "step":60,
55
+ "eval_loss":null,
56
+ "eval_runtime":null,
57
+ "eval_samples_per_second":null,
58
+ "eval_steps_per_second":null,
59
+ "eval_entropy":null,
60
+ "eval_num_tokens":null,
61
+ "eval_mean_token_accuracy":null,
62
+ "train_runtime":null,
63
+ "train_samples_per_second":null,
64
+ "train_steps_per_second":null,
65
+ "total_flos":null,
66
+ "train_loss":null
67
+ },
68
+ {
69
+ "loss":0.8441,
70
+ "grad_norm":0.3027354181,
71
+ "learning_rate":0.0001982178,
72
+ "entropy":0.8431956261,
73
+ "num_tokens":1287058.0,
74
+ "mean_token_accuracy":0.7713396206,
75
+ "epoch":0.2693602694,
76
+ "step":80,
77
+ "eval_loss":null,
78
+ "eval_runtime":null,
79
+ "eval_samples_per_second":null,
80
+ "eval_steps_per_second":null,
81
+ "eval_entropy":null,
82
+ "eval_num_tokens":null,
83
+ "eval_mean_token_accuracy":null,
84
+ "train_runtime":null,
85
+ "train_samples_per_second":null,
86
+ "train_steps_per_second":null,
87
+ "total_flos":null,
88
+ "train_loss":null
89
+ },
90
+ {
91
+ "loss":0.8408,
92
+ "grad_norm":0.2982031703,
93
+ "learning_rate":0.0001965926,
94
+ "entropy":0.8472392239,
95
+ "num_tokens":1607723.0,
96
+ "mean_token_accuracy":0.7738652974,
97
+ "epoch":0.3367003367,
98
+ "step":100,
99
+ "eval_loss":null,
100
+ "eval_runtime":null,
101
+ "eval_samples_per_second":null,
102
+ "eval_steps_per_second":null,
103
+ "eval_entropy":null,
104
+ "eval_num_tokens":null,
105
+ "eval_mean_token_accuracy":null,
106
+ "train_runtime":null,
107
+ "train_samples_per_second":null,
108
+ "train_steps_per_second":null,
109
+ "total_flos":null,
110
+ "train_loss":null
111
+ },
112
+ {
113
+ "loss":null,
114
+ "grad_norm":null,
115
+ "learning_rate":null,
116
+ "entropy":null,
117
+ "num_tokens":null,
118
+ "mean_token_accuracy":null,
119
+ "epoch":0.3367003367,
120
+ "step":100,
121
+ "eval_loss":0.8332510591,
122
+ "eval_runtime":10.3382,
123
+ "eval_samples_per_second":24.182,
124
+ "eval_steps_per_second":3.095,
125
+ "eval_entropy":0.8475092333,
126
+ "eval_num_tokens":1607723.0,
127
+ "eval_mean_token_accuracy":0.7752955835,
128
+ "train_runtime":null,
129
+ "train_samples_per_second":null,
130
+ "train_steps_per_second":null,
131
+ "total_flos":null,
132
+ "train_loss":null
133
+ },
134
+ {
135
+ "loss":0.8262,
136
+ "grad_norm":0.2738818824,
137
+ "learning_rate":0.0001944568,
138
+ "entropy":0.8286631659,
139
+ "num_tokens":1928620.0,
140
+ "mean_token_accuracy":0.7755305201,
141
+ "epoch":0.404040404,
142
+ "step":120,
143
+ "eval_loss":null,
144
+ "eval_runtime":null,
145
+ "eval_samples_per_second":null,
146
+ "eval_steps_per_second":null,
147
+ "eval_entropy":null,
148
+ "eval_num_tokens":null,
149
+ "eval_mean_token_accuracy":null,
150
+ "train_runtime":null,
151
+ "train_samples_per_second":null,
152
+ "train_steps_per_second":null,
153
+ "total_flos":null,
154
+ "train_loss":null
155
+ },
156
+ {
157
+ "loss":0.8089,
158
+ "grad_norm":0.2727711201,
159
+ "learning_rate":0.0001918216,
160
+ "entropy":0.8132541452,
161
+ "num_tokens":2249401.0,
162
+ "mean_token_accuracy":0.779610493,
163
+ "epoch":0.4713804714,
164
+ "step":140,
165
+ "eval_loss":null,
166
+ "eval_runtime":null,
167
+ "eval_samples_per_second":null,
168
+ "eval_steps_per_second":null,
169
+ "eval_entropy":null,
170
+ "eval_num_tokens":null,
171
+ "eval_mean_token_accuracy":null,
172
+ "train_runtime":null,
173
+ "train_samples_per_second":null,
174
+ "train_steps_per_second":null,
175
+ "total_flos":null,
176
+ "train_loss":null
177
+ },
178
+ {
179
+ "loss":0.7815,
180
+ "grad_norm":0.253259182,
181
+ "learning_rate":0.0001887011,
182
+ "entropy":0.7838059939,
183
+ "num_tokens":2571041.0,
184
+ "mean_token_accuracy":0.785765557,
185
+ "epoch":0.5387205387,
186
+ "step":160,
187
+ "eval_loss":null,
188
+ "eval_runtime":null,
189
+ "eval_samples_per_second":null,
190
+ "eval_steps_per_second":null,
191
+ "eval_entropy":null,
192
+ "eval_num_tokens":null,
193
+ "eval_mean_token_accuracy":null,
194
+ "train_runtime":null,
195
+ "train_samples_per_second":null,
196
+ "train_steps_per_second":null,
197
+ "total_flos":null,
198
+ "train_loss":null
199
+ },
200
+ {
201
+ "loss":0.763,
202
+ "grad_norm":0.2851669788,
203
+ "learning_rate":0.0001851117,
204
+ "entropy":0.7674662221,
205
+ "num_tokens":2890814.0,
206
+ "mean_token_accuracy":0.7893050611,
207
+ "epoch":0.6060606061,
208
+ "step":180,
209
+ "eval_loss":null,
210
+ "eval_runtime":null,
211
+ "eval_samples_per_second":null,
212
+ "eval_steps_per_second":null,
213
+ "eval_entropy":null,
214
+ "eval_num_tokens":null,
215
+ "eval_mean_token_accuracy":null,
216
+ "train_runtime":null,
217
+ "train_samples_per_second":null,
218
+ "train_steps_per_second":null,
219
+ "total_flos":null,
220
+ "train_loss":null
221
+ },
222
+ {
223
+ "loss":0.7434,
224
+ "grad_norm":0.2782152891,
225
+ "learning_rate":0.0001810723,
226
+ "entropy":0.7478979569,
227
+ "num_tokens":3212811.0,
228
+ "mean_token_accuracy":0.7946783796,
229
+ "epoch":0.6734006734,
230
+ "step":200,
231
+ "eval_loss":null,
232
+ "eval_runtime":null,
233
+ "eval_samples_per_second":null,
234
+ "eval_steps_per_second":null,
235
+ "eval_entropy":null,
236
+ "eval_num_tokens":null,
237
+ "eval_mean_token_accuracy":null,
238
+ "train_runtime":null,
239
+ "train_samples_per_second":null,
240
+ "train_steps_per_second":null,
241
+ "total_flos":null,
242
+ "train_loss":null
243
+ },
244
+ {
245
+ "loss":null,
246
+ "grad_norm":null,
247
+ "learning_rate":null,
248
+ "entropy":null,
249
+ "num_tokens":null,
250
+ "mean_token_accuracy":null,
251
+ "epoch":0.6734006734,
252
+ "step":200,
253
+ "eval_loss":0.7540781498,
254
+ "eval_runtime":10.3368,
255
+ "eval_samples_per_second":24.185,
256
+ "eval_steps_per_second":3.096,
257
+ "eval_entropy":0.7548957299,
258
+ "eval_num_tokens":3212811.0,
259
+ "eval_mean_token_accuracy":0.7921991255,
260
+ "train_runtime":null,
261
+ "train_samples_per_second":null,
262
+ "train_steps_per_second":null,
263
+ "total_flos":null,
264
+ "train_loss":null
265
+ },
266
+ {
267
+ "loss":0.718,
268
+ "grad_norm":0.2911323905,
269
+ "learning_rate":0.0001766044,
270
+ "entropy":0.7216884721,
271
+ "num_tokens":3534962.0,
272
+ "mean_token_accuracy":0.8007057041,
273
+ "epoch":0.7407407407,
274
+ "step":220,
275
+ "eval_loss":null,
276
+ "eval_runtime":null,
277
+ "eval_samples_per_second":null,
278
+ "eval_steps_per_second":null,
279
+ "eval_entropy":null,
280
+ "eval_num_tokens":null,
281
+ "eval_mean_token_accuracy":null,
282
+ "train_runtime":null,
283
+ "train_samples_per_second":null,
284
+ "train_steps_per_second":null,
285
+ "total_flos":null,
286
+ "train_loss":null
287
+ },
288
+ {
289
+ "loss":0.7015,
290
+ "grad_norm":0.3469219804,
291
+ "learning_rate":0.0001717316,
292
+ "entropy":0.7073224507,
293
+ "num_tokens":3855519.0,
294
+ "mean_token_accuracy":0.8033309393,
295
+ "epoch":0.8080808081,
296
+ "step":240,
297
+ "eval_loss":null,
298
+ "eval_runtime":null,
299
+ "eval_samples_per_second":null,
300
+ "eval_steps_per_second":null,
301
+ "eval_entropy":null,
302
+ "eval_num_tokens":null,
303
+ "eval_mean_token_accuracy":null,
304
+ "train_runtime":null,
305
+ "train_samples_per_second":null,
306
+ "train_steps_per_second":null,
307
+ "total_flos":null,
308
+ "train_loss":null
309
+ },
310
+ {
311
+ "loss":0.7066,
312
+ "grad_norm":0.3413038254,
313
+ "learning_rate":0.0001664796,
314
+ "entropy":0.7131307989,
315
+ "num_tokens":4174694.0,
316
+ "mean_token_accuracy":0.8030782551,
317
+ "epoch":0.8754208754,
318
+ "step":260,
319
+ "eval_loss":null,
320
+ "eval_runtime":null,
321
+ "eval_samples_per_second":null,
322
+ "eval_steps_per_second":null,
323
+ "eval_entropy":null,
324
+ "eval_num_tokens":null,
325
+ "eval_mean_token_accuracy":null,
326
+ "train_runtime":null,
327
+ "train_samples_per_second":null,
328
+ "train_steps_per_second":null,
329
+ "total_flos":null,
330
+ "train_loss":null
331
+ },
332
+ {
333
+ "loss":0.6725,
334
+ "grad_norm":0.3970124125,
335
+ "learning_rate":0.0001608761,
336
+ "entropy":0.6751278345,
337
+ "num_tokens":4495214.0,
338
+ "mean_token_accuracy":0.8109409161,
339
+ "epoch":0.9427609428,
340
+ "step":280,
341
+ "eval_loss":null,
342
+ "eval_runtime":null,
343
+ "eval_samples_per_second":null,
344
+ "eval_steps_per_second":null,
345
+ "eval_entropy":null,
346
+ "eval_num_tokens":null,
347
+ "eval_mean_token_accuracy":null,
348
+ "train_runtime":null,
349
+ "train_samples_per_second":null,
350
+ "train_steps_per_second":null,
351
+ "total_flos":null,
352
+ "train_loss":null
353
+ },
354
+ {
355
+ "loss":0.6567,
356
+ "grad_norm":0.4383921921,
357
+ "learning_rate":0.0001549509,
358
+ "entropy":0.6729893133,
359
+ "num_tokens":4815033.0,
360
+ "mean_token_accuracy":0.8148056932,
361
+ "epoch":1.0101010101,
362
+ "step":300,
363
+ "eval_loss":null,
364
+ "eval_runtime":null,
365
+ "eval_samples_per_second":null,
366
+ "eval_steps_per_second":null,
367
+ "eval_entropy":null,
368
+ "eval_num_tokens":null,
369
+ "eval_mean_token_accuracy":null,
370
+ "train_runtime":null,
371
+ "train_samples_per_second":null,
372
+ "train_steps_per_second":null,
373
+ "total_flos":null,
374
+ "train_loss":null
375
+ },
376
+ {
377
+ "loss":null,
378
+ "grad_norm":null,
379
+ "learning_rate":null,
380
+ "entropy":null,
381
+ "num_tokens":null,
382
+ "mean_token_accuracy":null,
383
+ "epoch":1.0101010101,
384
+ "step":300,
385
+ "eval_loss":0.6720606685,
386
+ "eval_runtime":10.367,
387
+ "eval_samples_per_second":24.115,
388
+ "eval_steps_per_second":3.087,
389
+ "eval_entropy":0.6333643645,
390
+ "eval_num_tokens":4815033.0,
391
+ "eval_mean_token_accuracy":0.8116748761,
392
+ "train_runtime":null,
393
+ "train_samples_per_second":null,
394
+ "train_steps_per_second":null,
395
+ "total_flos":null,
396
+ "train_loss":null
397
+ },
398
+ {
399
+ "loss":0.5649,
400
+ "grad_norm":0.4388367832,
401
+ "learning_rate":0.0001487352,
402
+ "entropy":0.5757804383,
403
+ "num_tokens":5135569.0,
404
+ "mean_token_accuracy":0.8381757662,
405
+ "epoch":1.0774410774,
406
+ "step":320,
407
+ "eval_loss":null,
408
+ "eval_runtime":null,
409
+ "eval_samples_per_second":null,
410
+ "eval_steps_per_second":null,
411
+ "eval_entropy":null,
412
+ "eval_num_tokens":null,
413
+ "eval_mean_token_accuracy":null,
414
+ "train_runtime":null,
415
+ "train_samples_per_second":null,
416
+ "train_steps_per_second":null,
417
+ "total_flos":null,
418
+ "train_loss":null
419
+ },
420
+ {
421
+ "loss":0.564,
422
+ "grad_norm":0.4527507126,
423
+ "learning_rate":0.0001422618,
424
+ "entropy":0.5801134199,
425
+ "num_tokens":5456292.0,
426
+ "mean_token_accuracy":0.8384027012,
427
+ "epoch":1.1447811448,
428
+ "step":340,
429
+ "eval_loss":null,
430
+ "eval_runtime":null,
431
+ "eval_samples_per_second":null,
432
+ "eval_steps_per_second":null,
433
+ "eval_entropy":null,
434
+ "eval_num_tokens":null,
435
+ "eval_mean_token_accuracy":null,
436
+ "train_runtime":null,
437
+ "train_samples_per_second":null,
438
+ "train_steps_per_second":null,
439
+ "total_flos":null,
440
+ "train_loss":null
441
+ },
442
+ {
443
+ "loss":0.5403,
444
+ "grad_norm":0.6442076564,
445
+ "learning_rate":0.0001355651,
446
+ "entropy":0.5545659784,
447
+ "num_tokens":5779926.0,
448
+ "mean_token_accuracy":0.8451263145,
449
+ "epoch":1.2121212121,
450
+ "step":360,
451
+ "eval_loss":null,
452
+ "eval_runtime":null,
453
+ "eval_samples_per_second":null,
454
+ "eval_steps_per_second":null,
455
+ "eval_entropy":null,
456
+ "eval_num_tokens":null,
457
+ "eval_mean_token_accuracy":null,
458
+ "train_runtime":null,
459
+ "train_samples_per_second":null,
460
+ "train_steps_per_second":null,
461
+ "total_flos":null,
462
+ "train_loss":null
463
+ },
464
+ {
465
+ "loss":0.5554,
466
+ "grad_norm":0.5305426717,
467
+ "learning_rate":0.0001286803,
468
+ "entropy":0.5719649505,
469
+ "num_tokens":6100921.0,
470
+ "mean_token_accuracy":0.8403378457,
471
+ "epoch":1.2794612795,
472
+ "step":380,
473
+ "eval_loss":null,
474
+ "eval_runtime":null,
475
+ "eval_samples_per_second":null,
476
+ "eval_steps_per_second":null,
477
+ "eval_entropy":null,
478
+ "eval_num_tokens":null,
479
+ "eval_mean_token_accuracy":null,
480
+ "train_runtime":null,
481
+ "train_samples_per_second":null,
482
+ "train_steps_per_second":null,
483
+ "total_flos":null,
484
+ "train_loss":null
485
+ },
486
+ {
487
+ "loss":0.5345,
488
+ "grad_norm":0.5867527723,
489
+ "learning_rate":0.000121644,
490
+ "entropy":0.5514425825,
491
+ "num_tokens":6423622.0,
492
+ "mean_token_accuracy":0.8455459923,
493
+ "epoch":1.3468013468,
494
+ "step":400,
495
+ "eval_loss":null,
496
+ "eval_runtime":null,
497
+ "eval_samples_per_second":null,
498
+ "eval_steps_per_second":null,
499
+ "eval_entropy":null,
500
+ "eval_num_tokens":null,
501
+ "eval_mean_token_accuracy":null,
502
+ "train_runtime":null,
503
+ "train_samples_per_second":null,
504
+ "train_steps_per_second":null,
505
+ "total_flos":null,
506
+ "train_loss":null
507
+ },
508
+ {
509
+ "loss":null,
510
+ "grad_norm":null,
511
+ "learning_rate":null,
512
+ "entropy":null,
513
+ "num_tokens":null,
514
+ "mean_token_accuracy":null,
515
+ "epoch":1.3468013468,
516
+ "step":400,
517
+ "eval_loss":0.5978295803,
518
+ "eval_runtime":10.3395,
519
+ "eval_samples_per_second":24.179,
520
+ "eval_steps_per_second":3.095,
521
+ "eval_entropy":0.5523942402,
522
+ "eval_num_tokens":6423622.0,
523
+ "eval_mean_token_accuracy":0.8328636196,
524
+ "train_runtime":null,
525
+ "train_samples_per_second":null,
526
+ "train_steps_per_second":null,
527
+ "total_flos":null,
528
+ "train_loss":null
529
+ },
530
+ {
531
+ "loss":0.5122,
532
+ "grad_norm":0.5380092859,
533
+ "learning_rate":0.0001144932,
534
+ "entropy":0.5361574471,
535
+ "num_tokens":6744569.0,
536
+ "mean_token_accuracy":0.8533119515,
537
+ "epoch":1.4141414141,
538
+ "step":420,
539
+ "eval_loss":null,
540
+ "eval_runtime":null,
541
+ "eval_samples_per_second":null,
542
+ "eval_steps_per_second":null,
543
+ "eval_entropy":null,
544
+ "eval_num_tokens":null,
545
+ "eval_mean_token_accuracy":null,
546
+ "train_runtime":null,
547
+ "train_samples_per_second":null,
548
+ "train_steps_per_second":null,
549
+ "total_flos":null,
550
+ "train_loss":null
551
+ },
552
+ {
553
+ "loss":0.4923,
554
+ "grad_norm":0.5738714933,
555
+ "learning_rate":0.0001072658,
556
+ "entropy":0.5068911854,
557
+ "num_tokens":7065251.0,
558
+ "mean_token_accuracy":0.8583682023,
559
+ "epoch":1.4814814815,
560
+ "step":440,
561
+ "eval_loss":null,
562
+ "eval_runtime":null,
563
+ "eval_samples_per_second":null,
564
+ "eval_steps_per_second":null,
565
+ "eval_entropy":null,
566
+ "eval_num_tokens":null,
567
+ "eval_mean_token_accuracy":null,
568
+ "train_runtime":null,
569
+ "train_samples_per_second":null,
570
+ "train_steps_per_second":null,
571
+ "total_flos":null,
572
+ "train_loss":null
573
+ },
574
+ {
575
+ "loss":0.4808,
576
+ "grad_norm":0.5104277134,
577
+ "learning_rate":0.0001,
578
+ "entropy":0.504258769,
579
+ "num_tokens":7385952.0,
580
+ "mean_token_accuracy":0.861315985,
581
+ "epoch":1.5488215488,
582
+ "step":460,
583
+ "eval_loss":null,
584
+ "eval_runtime":null,
585
+ "eval_samples_per_second":null,
586
+ "eval_steps_per_second":null,
587
+ "eval_entropy":null,
588
+ "eval_num_tokens":null,
589
+ "eval_mean_token_accuracy":null,
590
+ "train_runtime":null,
591
+ "train_samples_per_second":null,
592
+ "train_steps_per_second":null,
593
+ "total_flos":null,
594
+ "train_loss":null
595
+ },
596
+ {
597
+ "loss":0.4867,
598
+ "grad_norm":0.5913535357,
599
+ "learning_rate":0.0000927342,
600
+ "entropy":0.5113813952,
601
+ "num_tokens":7704982.0,
602
+ "mean_token_accuracy":0.8592018247,
603
+ "epoch":1.6161616162,
604
+ "step":480,
605
+ "eval_loss":null,
606
+ "eval_runtime":null,
607
+ "eval_samples_per_second":null,
608
+ "eval_steps_per_second":null,
609
+ "eval_entropy":null,
610
+ "eval_num_tokens":null,
611
+ "eval_mean_token_accuracy":null,
612
+ "train_runtime":null,
613
+ "train_samples_per_second":null,
614
+ "train_steps_per_second":null,
615
+ "total_flos":null,
616
+ "train_loss":null
617
+ },
618
+ {
619
+ "loss":0.4591,
620
+ "grad_norm":0.5065989494,
621
+ "learning_rate":0.0000855068,
622
+ "entropy":0.4817487616,
623
+ "num_tokens":8026316.0,
624
+ "mean_token_accuracy":0.8679369375,
625
+ "epoch":1.6835016835,
626
+ "step":500,
627
+ "eval_loss":null,
628
+ "eval_runtime":null,
629
+ "eval_samples_per_second":null,
630
+ "eval_steps_per_second":null,
631
+ "eval_entropy":null,
632
+ "eval_num_tokens":null,
633
+ "eval_mean_token_accuracy":null,
634
+ "train_runtime":null,
635
+ "train_samples_per_second":null,
636
+ "train_steps_per_second":null,
637
+ "total_flos":null,
638
+ "train_loss":null
639
+ },
640
+ {
641
+ "loss":null,
642
+ "grad_norm":null,
643
+ "learning_rate":null,
644
+ "entropy":null,
645
+ "num_tokens":null,
646
+ "mean_token_accuracy":null,
647
+ "epoch":1.6835016835,
648
+ "step":500,
649
+ "eval_loss":0.511384666,
650
+ "eval_runtime":10.3657,
651
+ "eval_samples_per_second":24.118,
652
+ "eval_steps_per_second":3.087,
653
+ "eval_entropy":0.514307227,
654
+ "eval_num_tokens":8026316.0,
655
+ "eval_mean_token_accuracy":0.8547733743,
656
+ "train_runtime":null,
657
+ "train_samples_per_second":null,
658
+ "train_steps_per_second":null,
659
+ "total_flos":null,
660
+ "train_loss":null
661
+ },
662
+ {
663
+ "loss":0.4296,
664
+ "grad_norm":0.5839767456,
665
+ "learning_rate":0.000078356,
666
+ "entropy":0.4638194107,
667
+ "num_tokens":8348213.0,
668
+ "mean_token_accuracy":0.8757635169,
669
+ "epoch":1.7508417508,
670
+ "step":520,
671
+ "eval_loss":null,
672
+ "eval_runtime":null,
673
+ "eval_samples_per_second":null,
674
+ "eval_steps_per_second":null,
675
+ "eval_entropy":null,
676
+ "eval_num_tokens":null,
677
+ "eval_mean_token_accuracy":null,
678
+ "train_runtime":null,
679
+ "train_samples_per_second":null,
680
+ "train_steps_per_second":null,
681
+ "total_flos":null,
682
+ "train_loss":null
683
+ },
684
+ {
685
+ "loss":0.4351,
686
+ "grad_norm":0.6890075207,
687
+ "learning_rate":0.0000713197,
688
+ "entropy":0.4628964256,
689
+ "num_tokens":8671513.0,
690
+ "mean_token_accuracy":0.8734818839,
691
+ "epoch":1.8181818182,
692
+ "step":540,
693
+ "eval_loss":null,
694
+ "eval_runtime":null,
695
+ "eval_samples_per_second":null,
696
+ "eval_steps_per_second":null,
697
+ "eval_entropy":null,
698
+ "eval_num_tokens":null,
699
+ "eval_mean_token_accuracy":null,
700
+ "train_runtime":null,
701
+ "train_samples_per_second":null,
702
+ "train_steps_per_second":null,
703
+ "total_flos":null,
704
+ "train_loss":null
705
+ },
706
+ {
707
+ "loss":0.4173,
708
+ "grad_norm":0.5538685918,
709
+ "learning_rate":0.0000644349,
710
+ "entropy":0.4462708168,
711
+ "num_tokens":8990602.0,
712
+ "mean_token_accuracy":0.8780993037,
713
+ "epoch":1.8855218855,
714
+ "step":560,
715
+ "eval_loss":null,
716
+ "eval_runtime":null,
717
+ "eval_samples_per_second":null,
718
+ "eval_steps_per_second":null,
719
+ "eval_entropy":null,
720
+ "eval_num_tokens":null,
721
+ "eval_mean_token_accuracy":null,
722
+ "train_runtime":null,
723
+ "train_samples_per_second":null,
724
+ "train_steps_per_second":null,
725
+ "total_flos":null,
726
+ "train_loss":null
727
+ },
728
+ {
729
+ "loss":0.4249,
730
+ "grad_norm":0.6900932789,
731
+ "learning_rate":0.0000577382,
732
+ "entropy":0.4559292875,
733
+ "num_tokens":9310065.0,
734
+ "mean_token_accuracy":0.8769208066,
735
+ "epoch":1.9528619529,
736
+ "step":580,
737
+ "eval_loss":null,
738
+ "eval_runtime":null,
739
+ "eval_samples_per_second":null,
740
+ "eval_steps_per_second":null,
741
+ "eval_entropy":null,
742
+ "eval_num_tokens":null,
743
+ "eval_mean_token_accuracy":null,
744
+ "train_runtime":null,
745
+ "train_samples_per_second":null,
746
+ "train_steps_per_second":null,
747
+ "total_flos":null,
748
+ "train_loss":null
749
+ },
750
+ {
751
+ "loss":0.3863,
752
+ "grad_norm":0.6177843809,
753
+ "learning_rate":0.0000512648,
754
+ "entropy":0.4270478457,
755
+ "num_tokens":9629281.0,
756
+ "mean_token_accuracy":0.8871142037,
757
+ "epoch":2.0202020202,
758
+ "step":600,
759
+ "eval_loss":null,
760
+ "eval_runtime":null,
761
+ "eval_samples_per_second":null,
762
+ "eval_steps_per_second":null,
763
+ "eval_entropy":null,
764
+ "eval_num_tokens":null,
765
+ "eval_mean_token_accuracy":null,
766
+ "train_runtime":null,
767
+ "train_samples_per_second":null,
768
+ "train_steps_per_second":null,
769
+ "total_flos":null,
770
+ "train_loss":null
771
+ },
772
+ {
773
+ "loss":null,
774
+ "grad_norm":null,
775
+ "learning_rate":null,
776
+ "entropy":null,
777
+ "num_tokens":null,
778
+ "mean_token_accuracy":null,
779
+ "epoch":2.0202020202,
780
+ "step":600,
781
+ "eval_loss":0.4490914941,
782
+ "eval_runtime":10.371,
783
+ "eval_samples_per_second":24.106,
784
+ "eval_steps_per_second":3.086,
785
+ "eval_entropy":0.4390519308,
786
+ "eval_num_tokens":9629281.0,
787
+ "eval_mean_token_accuracy":0.8718043752,
788
+ "train_runtime":null,
789
+ "train_samples_per_second":null,
790
+ "train_steps_per_second":null,
791
+ "total_flos":null,
792
+ "train_loss":null
793
+ },
794
+ {
795
+ "loss":0.3554,
796
+ "grad_norm":0.5899857879,
797
+ "learning_rate":0.0000450491,
798
+ "entropy":0.396466079,
799
+ "num_tokens":9951673.0,
800
+ "mean_token_accuracy":0.8959640451,
801
+ "epoch":2.0875420875,
802
+ "step":620,
803
+ "eval_loss":null,
804
+ "eval_runtime":null,
805
+ "eval_samples_per_second":null,
806
+ "eval_steps_per_second":null,
807
+ "eval_entropy":null,
808
+ "eval_num_tokens":null,
809
+ "eval_mean_token_accuracy":null,
810
+ "train_runtime":null,
811
+ "train_samples_per_second":null,
812
+ "train_steps_per_second":null,
813
+ "total_flos":null,
814
+ "train_loss":null
815
+ },
816
+ {
817
+ "loss":0.3401,
818
+ "grad_norm":0.6384023428,
819
+ "learning_rate":0.0000391239,
820
+ "entropy":0.3796210378,
821
+ "num_tokens":10273617.0,
822
+ "mean_token_accuracy":0.8999493234,
823
+ "epoch":2.1548821549,
824
+ "step":640,
825
+ "eval_loss":null,
826
+ "eval_runtime":null,
827
+ "eval_samples_per_second":null,
828
+ "eval_steps_per_second":null,
829
+ "eval_entropy":null,
830
+ "eval_num_tokens":null,
831
+ "eval_mean_token_accuracy":null,
832
+ "train_runtime":null,
833
+ "train_samples_per_second":null,
834
+ "train_steps_per_second":null,
835
+ "total_flos":null,
836
+ "train_loss":null
837
+ },
838
+ {
839
+ "loss":0.3281,
840
+ "grad_norm":0.6890760064,
841
+ "learning_rate":0.0000335204,
842
+ "entropy":0.3717882721,
843
+ "num_tokens":10594830.0,
844
+ "mean_token_accuracy":0.9037643224,
845
+ "epoch":2.2222222222,
846
+ "step":660,
847
+ "eval_loss":null,
848
+ "eval_runtime":null,
849
+ "eval_samples_per_second":null,
850
+ "eval_steps_per_second":null,
851
+ "eval_entropy":null,
852
+ "eval_num_tokens":null,
853
+ "eval_mean_token_accuracy":null,
854
+ "train_runtime":null,
855
+ "train_samples_per_second":null,
856
+ "train_steps_per_second":null,
857
+ "total_flos":null,
858
+ "train_loss":null
859
+ },
860
+ {
861
+ "loss":0.32,
862
+ "grad_norm":0.6508978605,
863
+ "learning_rate":0.0000282684,
864
+ "entropy":0.3625029052,
865
+ "num_tokens":10916597.0,
866
+ "mean_token_accuracy":0.9063778028,
867
+ "epoch":2.2895622896,
868
+ "step":680,
869
+ "eval_loss":null,
870
+ "eval_runtime":null,
871
+ "eval_samples_per_second":null,
872
+ "eval_steps_per_second":null,
873
+ "eval_entropy":null,
874
+ "eval_num_tokens":null,
875
+ "eval_mean_token_accuracy":null,
876
+ "train_runtime":null,
877
+ "train_samples_per_second":null,
878
+ "train_steps_per_second":null,
879
+ "total_flos":null,
880
+ "train_loss":null
881
+ },
882
+ {
883
+ "loss":0.3189,
884
+ "grad_norm":0.6131536961,
885
+ "learning_rate":0.0000233956,
886
+ "entropy":0.3583062481,
887
+ "num_tokens":11235743.0,
888
+ "mean_token_accuracy":0.9068948857,
889
+ "epoch":2.3569023569,
890
+ "step":700,
891
+ "eval_loss":null,
892
+ "eval_runtime":null,
893
+ "eval_samples_per_second":null,
894
+ "eval_steps_per_second":null,
895
+ "eval_entropy":null,
896
+ "eval_num_tokens":null,
897
+ "eval_mean_token_accuracy":null,
898
+ "train_runtime":null,
899
+ "train_samples_per_second":null,
900
+ "train_steps_per_second":null,
901
+ "total_flos":null,
902
+ "train_loss":null
903
+ },
904
+ {
905
+ "loss":null,
906
+ "grad_norm":null,
907
+ "learning_rate":null,
908
+ "entropy":null,
909
+ "num_tokens":null,
910
+ "mean_token_accuracy":null,
911
+ "epoch":2.3569023569,
912
+ "step":700,
913
+ "eval_loss":0.4149619639,
914
+ "eval_runtime":10.3707,
915
+ "eval_samples_per_second":24.106,
916
+ "eval_steps_per_second":3.086,
917
+ "eval_entropy":0.4037288642,
918
+ "eval_num_tokens":11235743.0,
919
+ "eval_mean_token_accuracy":0.8824688997,
920
+ "train_runtime":null,
921
+ "train_samples_per_second":null,
922
+ "train_steps_per_second":null,
923
+ "total_flos":null,
924
+ "train_loss":null
925
+ },
926
+ {
927
+ "loss":0.3248,
928
+ "grad_norm":0.5035169125,
929
+ "learning_rate":0.0000189277,
930
+ "entropy":0.3696210571,
931
+ "num_tokens":11556934.0,
932
+ "mean_token_accuracy":0.9046057545,
933
+ "epoch":2.4242424242,
934
+ "step":720,
935
+ "eval_loss":null,
936
+ "eval_runtime":null,
937
+ "eval_samples_per_second":null,
938
+ "eval_steps_per_second":null,
939
+ "eval_entropy":null,
940
+ "eval_num_tokens":null,
941
+ "eval_mean_token_accuracy":null,
942
+ "train_runtime":null,
943
+ "train_samples_per_second":null,
944
+ "train_steps_per_second":null,
945
+ "total_flos":null,
946
+ "train_loss":null
947
+ },
948
+ {
949
+ "loss":0.3126,
950
+ "grad_norm":0.5420159698,
951
+ "learning_rate":0.0000148883,
952
+ "entropy":0.3530109294,
953
+ "num_tokens":11879049.0,
954
+ "mean_token_accuracy":0.910102234,
955
+ "epoch":2.4915824916,
956
+ "step":740,
957
+ "eval_loss":null,
958
+ "eval_runtime":null,
959
+ "eval_samples_per_second":null,
960
+ "eval_steps_per_second":null,
961
+ "eval_entropy":null,
962
+ "eval_num_tokens":null,
963
+ "eval_mean_token_accuracy":null,
964
+ "train_runtime":null,
965
+ "train_samples_per_second":null,
966
+ "train_steps_per_second":null,
967
+ "total_flos":null,
968
+ "train_loss":null
969
+ },
970
+ {
971
+ "loss":0.3017,
972
+ "grad_norm":0.4808464348,
973
+ "learning_rate":0.0000112989,
974
+ "entropy":0.3429520307,
975
+ "num_tokens":12199559.0,
976
+ "mean_token_accuracy":0.9116890863,
977
+ "epoch":2.5589225589,
978
+ "step":760,
979
+ "eval_loss":null,
980
+ "eval_runtime":null,
981
+ "eval_samples_per_second":null,
982
+ "eval_steps_per_second":null,
983
+ "eval_entropy":null,
984
+ "eval_num_tokens":null,
985
+ "eval_mean_token_accuracy":null,
986
+ "train_runtime":null,
987
+ "train_samples_per_second":null,
988
+ "train_steps_per_second":null,
989
+ "total_flos":null,
990
+ "train_loss":null
991
+ },
992
+ {
993
+ "loss":0.2944,
994
+ "grad_norm":0.5233286023,
995
+ "learning_rate":0.0000081784,
996
+ "entropy":0.3373699239,
997
+ "num_tokens":12518745.0,
998
+ "mean_token_accuracy":0.9141617462,
999
+ "epoch":2.6262626263,
1000
+ "step":780,
1001
+ "eval_loss":null,
1002
+ "eval_runtime":null,
1003
+ "eval_samples_per_second":null,
1004
+ "eval_steps_per_second":null,
1005
+ "eval_entropy":null,
1006
+ "eval_num_tokens":null,
1007
+ "eval_mean_token_accuracy":null,
1008
+ "train_runtime":null,
1009
+ "train_samples_per_second":null,
1010
+ "train_steps_per_second":null,
1011
+ "total_flos":null,
1012
+ "train_loss":null
1013
+ },
1014
+ {
1015
+ "loss":0.3132,
1016
+ "grad_norm":0.540781498,
1017
+ "learning_rate":0.0000055432,
1018
+ "entropy":0.3541788673,
1019
+ "num_tokens":12839323.0,
1020
+ "mean_token_accuracy":0.9097139165,
1021
+ "epoch":2.6936026936,
1022
+ "step":800,
1023
+ "eval_loss":null,
1024
+ "eval_runtime":null,
1025
+ "eval_samples_per_second":null,
1026
+ "eval_steps_per_second":null,
1027
+ "eval_entropy":null,
1028
+ "eval_num_tokens":null,
1029
+ "eval_mean_token_accuracy":null,
1030
+ "train_runtime":null,
1031
+ "train_samples_per_second":null,
1032
+ "train_steps_per_second":null,
1033
+ "total_flos":null,
1034
+ "train_loss":null
1035
+ },
1036
+ {
1037
+ "loss":null,
1038
+ "grad_norm":null,
1039
+ "learning_rate":null,
1040
+ "entropy":null,
1041
+ "num_tokens":null,
1042
+ "mean_token_accuracy":null,
1043
+ "epoch":2.6936026936,
1044
+ "step":800,
1045
+ "eval_loss":0.3966158926,
1046
+ "eval_runtime":10.3484,
1047
+ "eval_samples_per_second":24.158,
1048
+ "eval_steps_per_second":3.092,
1049
+ "eval_entropy":0.3902668599,
1050
+ "eval_num_tokens":12839323.0,
1051
+ "eval_mean_token_accuracy":0.8880477473,
1052
+ "train_runtime":null,
1053
+ "train_samples_per_second":null,
1054
+ "train_steps_per_second":null,
1055
+ "total_flos":null,
1056
+ "train_loss":null
1057
+ },
1058
+ {
1059
+ "loss":0.3082,
1060
+ "grad_norm":0.5258508921,
1061
+ "learning_rate":0.0000034074,
1062
+ "entropy":0.3506111713,
1063
+ "num_tokens":13160627.0,
1064
+ "mean_token_accuracy":0.9098422483,
1065
+ "epoch":2.7609427609,
1066
+ "step":820,
1067
+ "eval_loss":null,
1068
+ "eval_runtime":null,
1069
+ "eval_samples_per_second":null,
1070
+ "eval_steps_per_second":null,
1071
+ "eval_entropy":null,
1072
+ "eval_num_tokens":null,
1073
+ "eval_mean_token_accuracy":null,
1074
+ "train_runtime":null,
1075
+ "train_samples_per_second":null,
1076
+ "train_steps_per_second":null,
1077
+ "total_flos":null,
1078
+ "train_loss":null
1079
+ },
1080
+ {
1081
+ "loss":0.3079,
1082
+ "grad_norm":0.4996784031,
1083
+ "learning_rate":0.0000017822,
1084
+ "entropy":0.3474921705,
1085
+ "num_tokens":13481114.0,
1086
+ "mean_token_accuracy":0.9094100349,
1087
+ "epoch":2.8282828283,
1088
+ "step":840,
1089
+ "eval_loss":null,
1090
+ "eval_runtime":null,
1091
+ "eval_samples_per_second":null,
1092
+ "eval_steps_per_second":null,
1093
+ "eval_entropy":null,
1094
+ "eval_num_tokens":null,
1095
+ "eval_mean_token_accuracy":null,
1096
+ "train_runtime":null,
1097
+ "train_samples_per_second":null,
1098
+ "train_steps_per_second":null,
1099
+ "total_flos":null,
1100
+ "train_loss":null
1101
+ },
1102
+ {
1103
+ "loss":0.3092,
1104
+ "grad_norm":0.4853805304,
1105
+ "learning_rate":0.0000006762,
1106
+ "entropy":0.3521205258,
1107
+ "num_tokens":13803114.0,
1108
+ "mean_token_accuracy":0.9098572351,
1109
+ "epoch":2.8956228956,
1110
+ "step":860,
1111
+ "eval_loss":null,
1112
+ "eval_runtime":null,
1113
+ "eval_samples_per_second":null,
1114
+ "eval_steps_per_second":null,
1115
+ "eval_entropy":null,
1116
+ "eval_num_tokens":null,
1117
+ "eval_mean_token_accuracy":null,
1118
+ "train_runtime":null,
1119
+ "train_samples_per_second":null,
1120
+ "train_steps_per_second":null,
1121
+ "total_flos":null,
1122
+ "train_loss":null
1123
+ },
1124
+ {
1125
+ "loss":0.3038,
1126
+ "grad_norm":0.5111385584,
1127
+ "learning_rate":0.0000000952,
1128
+ "entropy":0.3490505032,
1129
+ "num_tokens":14125289.0,
1130
+ "mean_token_accuracy":0.9108231679,
1131
+ "epoch":2.962962963,
1132
+ "step":880,
1133
+ "eval_loss":null,
1134
+ "eval_runtime":null,
1135
+ "eval_samples_per_second":null,
1136
+ "eval_steps_per_second":null,
1137
+ "eval_entropy":null,
1138
+ "eval_num_tokens":null,
1139
+ "eval_mean_token_accuracy":null,
1140
+ "train_runtime":null,
1141
+ "train_samples_per_second":null,
1142
+ "train_steps_per_second":null,
1143
+ "total_flos":null,
1144
+ "train_loss":null
1145
+ },
1146
+ {
1147
+ "loss":null,
1148
+ "grad_norm":null,
1149
+ "learning_rate":null,
1150
+ "entropy":null,
1151
+ "num_tokens":null,
1152
+ "mean_token_accuracy":null,
1153
+ "epoch":3.0,
1154
+ "step":891,
1155
+ "eval_loss":null,
1156
+ "eval_runtime":null,
1157
+ "eval_samples_per_second":null,
1158
+ "eval_steps_per_second":null,
1159
+ "eval_entropy":null,
1160
+ "eval_num_tokens":null,
1161
+ "eval_mean_token_accuracy":null,
1162
+ "train_runtime":1912.4699,
1163
+ "train_samples_per_second":7.451,
1164
+ "train_steps_per_second":0.466,
1165
+ "total_flos":1.163395683e+17,
1166
+ "train_loss":0.5388089069
1167
+ }
1168
+ ]