PurplelinkPL commited on
Commit
91d9c5b
·
verified ·
1 Parent(s): 0cb0da3

Upload 9 files

Browse files
Files changed (6) hide show
  1. config.json +1 -1
  2. model.safetensors +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1132 -1132
  6. training_args.bin +1 -1
config.json CHANGED
@@ -41,6 +41,6 @@
41
  "sep_token_id": 50282,
42
  "sparse_pred_ignore_index": -100,
43
  "sparse_prediction": false,
44
- "transformers_version": "4.56.0",
45
  "vocab_size": 50368
46
  }
 
41
  "sep_token_id": 50282,
42
  "sparse_pred_ignore_index": -100,
43
  "sparse_prediction": false,
44
+ "transformers_version": "4.56.1",
45
  "vocab_size": 50368
46
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:954170b1fad48a196e31d3546782ff96db097725c8af15848d7f16d81276ef73
3
  size 1583544840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c513c87136b7061f89a0058cf57e10feabc8eaa6dc84ac77ff0f5a223c2f19c
3
  size 1583544840
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d98adafc4abe7b7680c12defd6f5a399e2070f42e277fc2f67f1547179234cd
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:449e44f9adf4d083aec6625b9110f6a9a09baba982e3a32de94ff0c135c00f4d
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ed749c4fee6e9346a27fe219e7901c7d0d1eadfb8abff3040bfb1e1b1961b12
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d0df6d48ac6c8d2a3fe965d9b7a645f9b425ec23c31765b3bbc57f64cf0fee9
3
  size 1465
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7142857142857143,
6
  "eval_steps": 1000,
7
  "global_step": 25000,
8
  "is_hyper_param_search": false,
@@ -10,1965 +10,1965 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 2.857142857142857e-05,
14
- "grad_norm": 1.1264785528182983,
15
  "learning_rate": 0.0,
16
- "loss": 1.4622,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.002857142857142857,
21
- "grad_norm": 1.0415701866149902,
22
- "learning_rate": 1.4142857142857144e-06,
23
- "loss": 1.4319,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.005714285714285714,
28
- "grad_norm": 1.1746091842651367,
29
- "learning_rate": 2.8428571428571432e-06,
30
- "loss": 1.4189,
31
  "step": 200
32
  },
33
  {
34
- "epoch": 0.008571428571428572,
35
- "grad_norm": 1.1301525831222534,
36
- "learning_rate": 4.271428571428572e-06,
37
- "loss": 1.4293,
38
  "step": 300
39
  },
40
  {
41
- "epoch": 0.011428571428571429,
42
- "grad_norm": 1.0607796907424927,
43
- "learning_rate": 4.9999753285470756e-06,
44
- "loss": 1.4205,
45
  "step": 400
46
  },
47
  {
48
- "epoch": 0.014285714285714285,
49
- "grad_norm": 1.1491715908050537,
50
- "learning_rate": 4.999771876927458e-06,
51
- "loss": 1.4197,
52
  "step": 500
53
  },
54
  {
55
- "epoch": 0.017142857142857144,
56
- "grad_norm": 1.0873078107833862,
57
- "learning_rate": 4.999362935318198e-06,
58
- "loss": 1.4364,
59
  "step": 600
60
  },
61
  {
62
- "epoch": 0.02,
63
- "grad_norm": 1.0659881830215454,
64
- "learning_rate": 4.998748537335728e-06,
65
- "loss": 1.4507,
66
  "step": 700
67
  },
68
  {
69
- "epoch": 0.022857142857142857,
70
- "grad_norm": 1.1764490604400635,
71
- "learning_rate": 4.99792873348571e-06,
72
- "loss": 1.4398,
73
  "step": 800
74
  },
75
  {
76
- "epoch": 0.025714285714285714,
77
- "grad_norm": 1.0576765537261963,
78
- "learning_rate": 4.996903591158886e-06,
79
- "loss": 1.4203,
80
  "step": 900
81
  },
82
  {
83
- "epoch": 0.02857142857142857,
84
- "grad_norm": 1.111843228340149,
85
- "learning_rate": 4.995673194625541e-06,
86
- "loss": 1.4203,
87
  "step": 1000
88
  },
89
  {
90
- "epoch": 0.02857142857142857,
91
- "eval_loss": 1.4505008459091187,
92
- "eval_runtime": 103.009,
93
- "eval_samples_per_second": 132.95,
94
- "eval_steps_per_second": 2.077,
95
  "step": 1000
96
  },
97
  {
98
- "epoch": 0.03142857142857143,
99
- "grad_norm": 1.037828803062439,
100
- "learning_rate": 4.994237645028573e-06,
101
- "loss": 1.443,
102
  "step": 1100
103
  },
104
  {
105
- "epoch": 0.03428571428571429,
106
- "grad_norm": 1.1225452423095703,
107
- "learning_rate": 4.992597060375177e-06,
108
- "loss": 1.4519,
109
  "step": 1200
110
  },
111
  {
112
- "epoch": 0.037142857142857144,
113
- "grad_norm": 1.032313346862793,
114
- "learning_rate": 4.990751575527151e-06,
115
- "loss": 1.4358,
116
  "step": 1300
117
  },
118
  {
119
- "epoch": 0.04,
120
- "grad_norm": 1.1252490282058716,
121
- "learning_rate": 4.988701342189802e-06,
122
- "loss": 1.4102,
123
  "step": 1400
124
  },
125
  {
126
- "epoch": 0.04285714285714286,
127
- "grad_norm": 1.0545426607131958,
128
- "learning_rate": 4.986446528899478e-06,
129
- "loss": 1.4142,
130
  "step": 1500
131
  },
132
  {
133
- "epoch": 0.045714285714285714,
134
- "grad_norm": 1.08208429813385,
135
- "learning_rate": 4.983987321009718e-06,
136
- "loss": 1.4247,
137
  "step": 1600
138
  },
139
  {
140
- "epoch": 0.04857142857142857,
141
- "grad_norm": 1.042827844619751,
142
- "learning_rate": 4.98132392067601e-06,
143
- "loss": 1.4078,
144
  "step": 1700
145
  },
146
  {
147
- "epoch": 0.05142857142857143,
148
- "grad_norm": 1.029168725013733,
149
- "learning_rate": 4.978456546839175e-06,
150
- "loss": 1.4255,
151
  "step": 1800
152
  },
153
  {
154
- "epoch": 0.054285714285714284,
155
- "grad_norm": 1.1674017906188965,
156
- "learning_rate": 4.975385435207367e-06,
157
- "loss": 1.4428,
158
  "step": 1900
159
  },
160
  {
161
- "epoch": 0.05714285714285714,
162
- "grad_norm": 1.2838454246520996,
163
- "learning_rate": 4.972110838236704e-06,
164
- "loss": 1.4327,
165
  "step": 2000
166
  },
167
  {
168
- "epoch": 0.05714285714285714,
169
- "eval_loss": 1.4425562620162964,
170
- "eval_runtime": 97.037,
171
- "eval_samples_per_second": 141.132,
172
- "eval_steps_per_second": 2.205,
173
  "step": 2000
174
  },
175
  {
176
- "epoch": 0.06,
177
- "grad_norm": 1.0720206499099731,
178
- "learning_rate": 4.968633025110507e-06,
179
- "loss": 1.4312,
180
  "step": 2100
181
  },
182
  {
183
- "epoch": 0.06285714285714286,
184
- "grad_norm": 1.0312304496765137,
185
- "learning_rate": 4.964952281717177e-06,
186
- "loss": 1.4405,
187
  "step": 2200
188
  },
189
  {
190
- "epoch": 0.06571428571428571,
191
- "grad_norm": 1.0791317224502563,
192
- "learning_rate": 4.961068910626692e-06,
193
- "loss": 1.4407,
194
  "step": 2300
195
  },
196
  {
197
- "epoch": 0.06857142857142857,
198
- "grad_norm": 1.0809016227722168,
199
- "learning_rate": 4.956983231065733e-06,
200
- "loss": 1.434,
201
  "step": 2400
202
  },
203
  {
204
- "epoch": 0.07142857142857142,
205
- "grad_norm": 1.059635043144226,
206
- "learning_rate": 4.952695578891449e-06,
207
- "loss": 1.4114,
208
  "step": 2500
209
  },
210
  {
211
- "epoch": 0.07428571428571429,
212
- "grad_norm": 1.0659129619598389,
213
- "learning_rate": 4.948206306563842e-06,
214
- "loss": 1.4374,
215
  "step": 2600
216
  },
217
  {
218
- "epoch": 0.07714285714285714,
219
- "grad_norm": 1.0818511247634888,
220
- "learning_rate": 4.943515783116794e-06,
221
- "loss": 1.4196,
222
  "step": 2700
223
  },
224
  {
225
- "epoch": 0.08,
226
- "grad_norm": 1.1003646850585938,
227
- "learning_rate": 4.9386243941277374e-06,
228
- "loss": 1.4508,
229
  "step": 2800
230
  },
231
  {
232
- "epoch": 0.08285714285714285,
233
- "grad_norm": 1.086207628250122,
234
- "learning_rate": 4.933532541685949e-06,
235
- "loss": 1.4354,
236
  "step": 2900
237
  },
238
  {
239
- "epoch": 0.08571428571428572,
240
- "grad_norm": 1.0702838897705078,
241
- "learning_rate": 4.928240644359507e-06,
242
- "loss": 1.4262,
243
  "step": 3000
244
  },
245
  {
246
- "epoch": 0.08571428571428572,
247
- "eval_loss": 1.438844919204712,
248
- "eval_runtime": 96.9421,
249
- "eval_samples_per_second": 141.27,
250
- "eval_steps_per_second": 2.208,
251
  "step": 3000
252
  },
253
  {
254
- "epoch": 0.08857142857142856,
255
- "grad_norm": 1.1206424236297607,
256
- "learning_rate": 4.922749137160875e-06,
257
- "loss": 1.4445,
258
  "step": 3100
259
  },
260
  {
261
- "epoch": 0.09142857142857143,
262
- "grad_norm": 1.0971518754959106,
263
- "learning_rate": 4.917058471511149e-06,
264
- "loss": 1.4117,
265
  "step": 3200
266
  },
267
  {
268
- "epoch": 0.09428571428571429,
269
- "grad_norm": 1.1263982057571411,
270
- "learning_rate": 4.9111691152029436e-06,
271
- "loss": 1.4294,
272
  "step": 3300
273
  },
274
  {
275
- "epoch": 0.09714285714285714,
276
- "grad_norm": 1.0150455236434937,
277
- "learning_rate": 4.905081552361943e-06,
278
- "loss": 1.4357,
279
  "step": 3400
280
  },
281
  {
282
- "epoch": 0.1,
283
- "grad_norm": 1.0511361360549927,
284
- "learning_rate": 4.898796283407099e-06,
285
- "loss": 1.438,
286
  "step": 3500
287
  },
288
  {
289
- "epoch": 0.10285714285714286,
290
- "grad_norm": 1.1033008098602295,
291
- "learning_rate": 4.892313825009499e-06,
292
- "loss": 1.4162,
293
  "step": 3600
294
  },
295
  {
296
- "epoch": 0.10571428571428572,
297
- "grad_norm": 1.1107470989227295,
298
- "learning_rate": 4.885634710049891e-06,
299
- "loss": 1.4267,
300
  "step": 3700
301
  },
302
  {
303
- "epoch": 0.10857142857142857,
304
- "grad_norm": 1.0580041408538818,
305
- "learning_rate": 4.878759487574882e-06,
306
- "loss": 1.4287,
307
  "step": 3800
308
  },
309
  {
310
- "epoch": 0.11142857142857143,
311
- "grad_norm": 1.0198274850845337,
312
- "learning_rate": 4.871688722751799e-06,
313
- "loss": 1.438,
314
  "step": 3900
315
  },
316
  {
317
- "epoch": 0.11428571428571428,
318
- "grad_norm": 1.1063220500946045,
319
- "learning_rate": 4.864422996822239e-06,
320
- "loss": 1.4078,
321
  "step": 4000
322
  },
323
  {
324
- "epoch": 0.11428571428571428,
325
- "eval_loss": 1.4405826330184937,
326
- "eval_runtime": 97.1575,
327
- "eval_samples_per_second": 140.957,
328
- "eval_steps_per_second": 2.203,
329
  "step": 4000
330
  },
331
  {
332
- "epoch": 0.11714285714285715,
333
- "grad_norm": 1.0360065698623657,
334
- "learning_rate": 4.8569629070542775e-06,
335
- "loss": 1.414,
336
  "step": 4100
337
  },
338
  {
339
- "epoch": 0.12,
340
- "grad_norm": 1.0539647340774536,
341
- "learning_rate": 4.849309066693382e-06,
342
- "loss": 1.3992,
343
  "step": 4200
344
  },
345
  {
346
- "epoch": 0.12285714285714286,
347
- "grad_norm": 1.0678602457046509,
348
- "learning_rate": 4.8414621049119935e-06,
349
- "loss": 1.4226,
350
  "step": 4300
351
  },
352
  {
353
- "epoch": 0.12571428571428572,
354
- "grad_norm": 1.1174051761627197,
355
- "learning_rate": 4.833422666757811e-06,
356
- "loss": 1.4149,
357
  "step": 4400
358
  },
359
  {
360
- "epoch": 0.12857142857142856,
361
- "grad_norm": 1.1076269149780273,
362
- "learning_rate": 4.825191413100764e-06,
363
- "loss": 1.4219,
364
  "step": 4500
365
  },
366
  {
367
- "epoch": 0.13142857142857142,
368
- "grad_norm": 1.0237882137298584,
369
- "learning_rate": 4.816769020578685e-06,
370
- "loss": 1.4063,
371
  "step": 4600
372
  },
373
  {
374
- "epoch": 0.13428571428571429,
375
- "grad_norm": 1.0634537935256958,
376
- "learning_rate": 4.808156181541694e-06,
377
- "loss": 1.4077,
378
  "step": 4700
379
  },
380
  {
381
- "epoch": 0.13714285714285715,
382
- "grad_norm": 1.1134625673294067,
383
- "learning_rate": 4.799353603995275e-06,
384
- "loss": 1.4589,
385
  "step": 4800
386
  },
387
  {
388
- "epoch": 0.14,
389
- "grad_norm": 1.069698691368103,
390
- "learning_rate": 4.790362011542085e-06,
391
- "loss": 1.4063,
392
  "step": 4900
393
  },
394
  {
395
- "epoch": 0.14285714285714285,
396
- "grad_norm": 1.1093010902404785,
397
- "learning_rate": 4.7811821433224665e-06,
398
- "loss": 1.4225,
399
  "step": 5000
400
  },
401
  {
402
- "epoch": 0.14285714285714285,
403
- "eval_loss": 1.4438061714172363,
404
- "eval_runtime": 98.2976,
405
- "eval_samples_per_second": 139.322,
406
- "eval_steps_per_second": 2.177,
407
  "step": 5000
408
  },
409
  {
410
- "epoch": 0.1457142857142857,
411
- "grad_norm": 1.0884599685668945,
412
- "learning_rate": 4.7718147539536865e-06,
413
- "loss": 1.4347,
414
  "step": 5100
415
  },
416
  {
417
- "epoch": 0.14857142857142858,
418
- "grad_norm": 1.0088622570037842,
419
- "learning_rate": 4.762260613467909e-06,
420
- "loss": 1.4254,
421
  "step": 5200
422
  },
423
  {
424
- "epoch": 0.15142857142857144,
425
- "grad_norm": 1.1340473890304565,
426
- "learning_rate": 4.75252050724889e-06,
427
- "loss": 1.4101,
428
  "step": 5300
429
  },
430
  {
431
- "epoch": 0.15428571428571428,
432
- "grad_norm": 1.093491554260254,
433
- "learning_rate": 4.7425952359674225e-06,
434
- "loss": 1.4256,
435
  "step": 5400
436
  },
437
  {
438
- "epoch": 0.15714285714285714,
439
- "grad_norm": 1.0808088779449463,
440
- "learning_rate": 4.732485615515511e-06,
441
- "loss": 1.4093,
442
  "step": 5500
443
  },
444
  {
445
- "epoch": 0.16,
446
- "grad_norm": 1.100080132484436,
447
- "learning_rate": 4.722192476939309e-06,
448
- "loss": 1.4263,
449
  "step": 5600
450
  },
451
  {
452
- "epoch": 0.16285714285714287,
453
- "grad_norm": 1.096901774406433,
454
- "learning_rate": 4.7117166663708025e-06,
455
- "loss": 1.4084,
456
  "step": 5700
457
  },
458
  {
459
- "epoch": 0.1657142857142857,
460
- "grad_norm": 1.1885929107666016,
461
- "learning_rate": 4.7010590449582525e-06,
462
- "loss": 1.4146,
463
  "step": 5800
464
  },
465
  {
466
- "epoch": 0.16857142857142857,
467
- "grad_norm": 1.082043170928955,
468
- "learning_rate": 4.690220488795406e-06,
469
- "loss": 1.4201,
470
  "step": 5900
471
  },
472
  {
473
- "epoch": 0.17142857142857143,
474
- "grad_norm": 1.0647767782211304,
475
- "learning_rate": 4.679201888849481e-06,
476
- "loss": 1.436,
477
  "step": 6000
478
  },
479
  {
480
- "epoch": 0.17142857142857143,
481
- "eval_loss": 1.4336808919906616,
482
- "eval_runtime": 97.8921,
483
- "eval_samples_per_second": 139.899,
484
- "eval_steps_per_second": 2.186,
485
  "step": 6000
486
  },
487
  {
488
- "epoch": 0.1742857142857143,
489
- "grad_norm": 1.1146217584609985,
490
- "learning_rate": 4.668004150887924e-06,
491
- "loss": 1.4132,
492
  "step": 6100
493
  },
494
  {
495
- "epoch": 0.17714285714285713,
496
- "grad_norm": 1.0890520811080933,
497
- "learning_rate": 4.656628195403952e-06,
498
- "loss": 1.4047,
499
  "step": 6200
500
  },
501
  {
502
- "epoch": 0.18,
503
- "grad_norm": 1.033389687538147,
504
- "learning_rate": 4.645074957540887e-06,
505
- "loss": 1.4272,
506
  "step": 6300
507
  },
508
  {
509
- "epoch": 0.18285714285714286,
510
- "grad_norm": 1.1013028621673584,
511
- "learning_rate": 4.63334538701528e-06,
512
- "loss": 1.4402,
513
  "step": 6400
514
  },
515
  {
516
- "epoch": 0.18571428571428572,
517
- "grad_norm": 1.0814400911331177,
518
- "learning_rate": 4.6214404480388455e-06,
519
- "loss": 1.4031,
520
  "step": 6500
521
  },
522
  {
523
- "epoch": 0.18857142857142858,
524
- "grad_norm": 1.0447463989257812,
525
- "learning_rate": 4.609361119239197e-06,
526
- "loss": 1.4453,
527
  "step": 6600
528
  },
529
  {
530
- "epoch": 0.19142857142857142,
531
- "grad_norm": 1.1220800876617432,
532
- "learning_rate": 4.5971083935794026e-06,
533
- "loss": 1.4148,
534
  "step": 6700
535
  },
536
  {
537
- "epoch": 0.19428571428571428,
538
- "grad_norm": 1.107762098312378,
539
- "learning_rate": 4.584683278276356e-06,
540
- "loss": 1.4285,
541
  "step": 6800
542
  },
543
  {
544
- "epoch": 0.19714285714285715,
545
- "grad_norm": 1.1005544662475586,
546
- "learning_rate": 4.572086794717985e-06,
547
- "loss": 1.4328,
548
  "step": 6900
549
  },
550
  {
551
- "epoch": 0.2,
552
- "grad_norm": 1.033148169517517,
553
- "learning_rate": 4.559319978379287e-06,
554
- "loss": 1.4111,
555
  "step": 7000
556
  },
557
  {
558
- "epoch": 0.2,
559
- "eval_loss": 1.4392390251159668,
560
- "eval_runtime": 97.5914,
561
- "eval_samples_per_second": 140.33,
562
- "eval_steps_per_second": 2.193,
563
  "step": 7000
564
  },
565
  {
566
- "epoch": 0.20285714285714285,
567
- "grad_norm": 1.052509069442749,
568
- "learning_rate": 4.546383878737207e-06,
569
- "loss": 1.4113,
570
  "step": 7100
571
  },
572
  {
573
- "epoch": 0.2057142857142857,
574
- "grad_norm": 1.0561904907226562,
575
- "learning_rate": 4.533279559184373e-06,
576
- "loss": 1.4275,
577
  "step": 7200
578
  },
579
  {
580
- "epoch": 0.20857142857142857,
581
- "grad_norm": 1.0787992477416992,
582
- "learning_rate": 4.520008096941676e-06,
583
- "loss": 1.4084,
584
  "step": 7300
585
  },
586
  {
587
- "epoch": 0.21142857142857144,
588
- "grad_norm": 1.0198429822921753,
589
- "learning_rate": 4.506570582969719e-06,
590
- "loss": 1.4029,
591
  "step": 7400
592
  },
593
  {
594
- "epoch": 0.21428571428571427,
595
- "grad_norm": 1.0664575099945068,
596
- "learning_rate": 4.492968121879142e-06,
597
- "loss": 1.4049,
598
  "step": 7500
599
  },
600
  {
601
- "epoch": 0.21714285714285714,
602
- "grad_norm": 1.0929675102233887,
603
- "learning_rate": 4.479201831839812e-06,
604
- "loss": 1.4169,
605
  "step": 7600
606
  },
607
  {
608
- "epoch": 0.22,
609
- "grad_norm": 1.1445673704147339,
610
- "learning_rate": 4.465272844488908e-06,
611
- "loss": 1.4033,
612
  "step": 7700
613
  },
614
  {
615
- "epoch": 0.22285714285714286,
616
- "grad_norm": 1.064433217048645,
617
- "learning_rate": 4.4511823048378986e-06,
618
- "loss": 1.43,
619
  "step": 7800
620
  },
621
  {
622
- "epoch": 0.2257142857142857,
623
- "grad_norm": 1.0845831632614136,
624
- "learning_rate": 4.436931371178416e-06,
625
- "loss": 1.4441,
626
  "step": 7900
627
  },
628
  {
629
- "epoch": 0.22857142857142856,
630
- "grad_norm": 1.0980095863342285,
631
- "learning_rate": 4.42252121498704e-06,
632
- "loss": 1.4015,
633
  "step": 8000
634
  },
635
  {
636
- "epoch": 0.22857142857142856,
637
- "eval_loss": 1.4309405088424683,
638
- "eval_runtime": 97.7381,
639
- "eval_samples_per_second": 140.119,
640
- "eval_steps_per_second": 2.19,
641
  "step": 8000
642
  },
643
  {
644
- "epoch": 0.23142857142857143,
645
- "grad_norm": 1.1431641578674316,
646
- "learning_rate": 4.407953020829001e-06,
647
- "loss": 1.4249,
648
  "step": 8100
649
  },
650
  {
651
- "epoch": 0.2342857142857143,
652
- "grad_norm": 1.0139048099517822,
653
- "learning_rate": 4.393227986260801e-06,
654
- "loss": 1.3958,
655
  "step": 8200
656
  },
657
  {
658
- "epoch": 0.23714285714285716,
659
- "grad_norm": 1.0676871538162231,
660
- "learning_rate": 4.378347321731773e-06,
661
- "loss": 1.4204,
662
  "step": 8300
663
  },
664
  {
665
- "epoch": 0.24,
666
- "grad_norm": 1.1097986698150635,
667
- "learning_rate": 4.363312250484577e-06,
668
- "loss": 1.4335,
669
  "step": 8400
670
  },
671
  {
672
- "epoch": 0.24285714285714285,
673
- "grad_norm": 1.083742380142212,
674
- "learning_rate": 4.348124008454644e-06,
675
- "loss": 1.436,
676
  "step": 8500
677
  },
678
  {
679
- "epoch": 0.24571428571428572,
680
- "grad_norm": 1.072716474533081,
681
- "learning_rate": 4.332783844168581e-06,
682
- "loss": 1.424,
683
  "step": 8600
684
  },
685
  {
686
- "epoch": 0.24857142857142858,
687
- "grad_norm": 1.1168031692504883,
688
- "learning_rate": 4.317293018641536e-06,
689
- "loss": 1.4262,
690
  "step": 8700
691
  },
692
  {
693
- "epoch": 0.25142857142857145,
694
- "grad_norm": 1.1102938652038574,
695
- "learning_rate": 4.301652805273535e-06,
696
- "loss": 1.4141,
697
  "step": 8800
698
  },
699
  {
700
- "epoch": 0.2542857142857143,
701
- "grad_norm": 1.1052049398422241,
702
- "learning_rate": 4.285864489744809e-06,
703
- "loss": 1.4221,
704
  "step": 8900
705
  },
706
  {
707
- "epoch": 0.2571428571428571,
708
- "grad_norm": 1.0475815534591675,
709
- "learning_rate": 4.269929369910103e-06,
710
- "loss": 1.4145,
711
  "step": 9000
712
  },
713
  {
714
- "epoch": 0.2571428571428571,
715
- "eval_loss": 1.428357481956482,
716
- "eval_runtime": 98.1292,
717
- "eval_samples_per_second": 139.561,
718
- "eval_steps_per_second": 2.181,
719
  "step": 9000
720
  },
721
  {
722
- "epoch": 0.26,
723
- "grad_norm": 1.0066262483596802,
724
- "learning_rate": 4.253848755691992e-06,
725
- "loss": 1.4049,
726
  "step": 9100
727
  },
728
  {
729
- "epoch": 0.26285714285714284,
730
- "grad_norm": 1.131996512413025,
731
- "learning_rate": 4.2376239689731955e-06,
732
- "loss": 1.3991,
733
  "step": 9200
734
  },
735
  {
736
- "epoch": 0.26571428571428574,
737
- "grad_norm": 1.1413109302520752,
738
- "learning_rate": 4.2212563434879175e-06,
739
- "loss": 1.3744,
740
  "step": 9300
741
  },
742
  {
743
- "epoch": 0.26857142857142857,
744
- "grad_norm": 1.073792576789856,
745
- "learning_rate": 4.204747224712209e-06,
746
- "loss": 1.422,
747
  "step": 9400
748
  },
749
  {
750
- "epoch": 0.2714285714285714,
751
- "grad_norm": 1.0397651195526123,
752
- "learning_rate": 4.188097969753363e-06,
753
- "loss": 1.4064,
754
  "step": 9500
755
  },
756
  {
757
- "epoch": 0.2742857142857143,
758
- "grad_norm": 1.1306557655334473,
759
- "learning_rate": 4.171309947238357e-06,
760
- "loss": 1.4408,
761
  "step": 9600
762
  },
763
  {
764
- "epoch": 0.27714285714285714,
765
- "grad_norm": 1.1982935667037964,
766
- "learning_rate": 4.154384537201347e-06,
767
- "loss": 1.4151,
768
  "step": 9700
769
  },
770
  {
771
- "epoch": 0.28,
772
- "grad_norm": 1.1465263366699219,
773
- "learning_rate": 4.137323130970225e-06,
774
- "loss": 1.4211,
775
  "step": 9800
776
  },
777
  {
778
- "epoch": 0.28285714285714286,
779
- "grad_norm": 0.9817516803741455,
780
- "learning_rate": 4.120127131052244e-06,
781
- "loss": 1.4089,
782
  "step": 9900
783
  },
784
  {
785
- "epoch": 0.2857142857142857,
786
- "grad_norm": 1.150546908378601,
787
- "learning_rate": 4.1027979510187285e-06,
788
- "loss": 1.4191,
789
  "step": 10000
790
  },
791
  {
792
- "epoch": 0.2857142857142857,
793
- "eval_loss": 1.429203748703003,
794
- "eval_runtime": 98.6571,
795
- "eval_samples_per_second": 138.814,
796
- "eval_steps_per_second": 2.169,
797
  "step": 10000
798
  },
799
  {
800
- "epoch": 0.2885714285714286,
801
- "grad_norm": 1.0928316116333008,
802
- "learning_rate": 4.085337015388876e-06,
803
- "loss": 1.4155,
804
  "step": 10100
805
  },
806
  {
807
- "epoch": 0.2914285714285714,
808
- "grad_norm": 1.1372452974319458,
809
- "learning_rate": 4.067745759512654e-06,
810
- "loss": 1.4229,
811
  "step": 10200
812
  },
813
  {
814
- "epoch": 0.29428571428571426,
815
- "grad_norm": 1.1249101161956787,
816
- "learning_rate": 4.0500256294528084e-06,
817
- "loss": 1.4178,
818
  "step": 10300
819
  },
820
  {
821
- "epoch": 0.29714285714285715,
822
- "grad_norm": 1.1340339183807373,
823
- "learning_rate": 4.032178081865995e-06,
824
- "loss": 1.4125,
825
  "step": 10400
826
  },
827
  {
828
- "epoch": 0.3,
829
- "grad_norm": 1.0652027130126953,
830
- "learning_rate": 4.014204583883038e-06,
831
- "loss": 1.4283,
832
  "step": 10500
833
  },
834
  {
835
- "epoch": 0.3028571428571429,
836
- "grad_norm": 1.1057724952697754,
837
- "learning_rate": 3.996106612988321e-06,
838
- "loss": 1.4046,
839
  "step": 10600
840
  },
841
  {
842
- "epoch": 0.3057142857142857,
843
- "grad_norm": 1.089181661605835,
844
- "learning_rate": 3.977885656898337e-06,
845
- "loss": 1.4199,
846
  "step": 10700
847
  },
848
  {
849
- "epoch": 0.30857142857142855,
850
- "grad_norm": 1.0804879665374756,
851
- "learning_rate": 3.959543213439393e-06,
852
- "loss": 1.4259,
853
  "step": 10800
854
  },
855
  {
856
- "epoch": 0.31142857142857144,
857
- "grad_norm": 1.0948872566223145,
858
- "learning_rate": 3.941080790424483e-06,
859
- "loss": 1.4143,
860
  "step": 10900
861
  },
862
  {
863
- "epoch": 0.3142857142857143,
864
- "grad_norm": 1.1653496026992798,
865
- "learning_rate": 3.92249990552934e-06,
866
- "loss": 1.4343,
867
  "step": 11000
868
  },
869
  {
870
- "epoch": 0.3142857142857143,
871
- "eval_loss": 1.4226573705673218,
872
- "eval_runtime": 98.9066,
873
- "eval_samples_per_second": 138.464,
874
- "eval_steps_per_second": 2.164,
875
  "step": 11000
876
  },
877
  {
878
- "epoch": 0.3171428571428571,
879
- "grad_norm": 1.0654685497283936,
880
- "learning_rate": 3.903802086167676e-06,
881
- "loss": 1.4102,
882
  "step": 11100
883
  },
884
  {
885
- "epoch": 0.32,
886
- "grad_norm": 1.01749849319458,
887
- "learning_rate": 3.884988869365626e-06,
888
- "loss": 1.408,
889
  "step": 11200
890
  },
891
  {
892
- "epoch": 0.32285714285714284,
893
- "grad_norm": 1.1105825901031494,
894
- "learning_rate": 3.866061801635399e-06,
895
- "loss": 1.4276,
896
  "step": 11300
897
  },
898
  {
899
- "epoch": 0.32571428571428573,
900
- "grad_norm": 1.0666981935501099,
901
- "learning_rate": 3.8470224388481485e-06,
902
- "loss": 1.3964,
903
  "step": 11400
904
  },
905
  {
906
- "epoch": 0.32857142857142857,
907
- "grad_norm": 1.090728759765625,
908
- "learning_rate": 3.827872346106073e-06,
909
- "loss": 1.3981,
910
  "step": 11500
911
  },
912
  {
913
- "epoch": 0.3314285714285714,
914
- "grad_norm": 1.069846272468567,
915
- "learning_rate": 3.808613097613759e-06,
916
- "loss": 1.4053,
917
  "step": 11600
918
  },
919
  {
920
- "epoch": 0.3342857142857143,
921
- "grad_norm": 1.1465699672698975,
922
- "learning_rate": 3.7892462765487836e-06,
923
- "loss": 1.3944,
924
  "step": 11700
925
  },
926
  {
927
- "epoch": 0.33714285714285713,
928
- "grad_norm": 1.068352222442627,
929
- "learning_rate": 3.769773474931558e-06,
930
- "loss": 1.4284,
931
  "step": 11800
932
  },
933
  {
934
- "epoch": 0.34,
935
- "grad_norm": 1.1487313508987427,
936
- "learning_rate": 3.7501962934944704e-06,
937
- "loss": 1.3894,
938
  "step": 11900
939
  },
940
  {
941
- "epoch": 0.34285714285714286,
942
- "grad_norm": 1.1034648418426514,
943
- "learning_rate": 3.7305163415502936e-06,
944
- "loss": 1.4184,
945
  "step": 12000
946
  },
947
  {
948
- "epoch": 0.34285714285714286,
949
- "eval_loss": 1.4204550981521606,
950
- "eval_runtime": 99.0133,
951
- "eval_samples_per_second": 138.315,
952
- "eval_steps_per_second": 2.161,
953
  "step": 12000
954
  },
955
  {
956
- "epoch": 0.3457142857142857,
957
- "grad_norm": 1.08807373046875,
958
- "learning_rate": 3.710735236859895e-06,
959
- "loss": 1.4208,
960
  "step": 12100
961
  },
962
  {
963
- "epoch": 0.3485714285714286,
964
- "grad_norm": 1.142823338508606,
965
- "learning_rate": 3.6908546054992523e-06,
966
- "loss": 1.4292,
967
  "step": 12200
968
  },
969
  {
970
- "epoch": 0.3514285714285714,
971
- "grad_norm": 1.0997464656829834,
972
- "learning_rate": 3.670876081725784e-06,
973
- "loss": 1.4058,
974
  "step": 12300
975
  },
976
  {
977
- "epoch": 0.35428571428571426,
978
- "grad_norm": 1.1083920001983643,
979
- "learning_rate": 3.650801307844004e-06,
980
- "loss": 1.4152,
981
  "step": 12400
982
  },
983
  {
984
- "epoch": 0.35714285714285715,
985
- "grad_norm": 1.1371042728424072,
986
- "learning_rate": 3.630631934070527e-06,
987
- "loss": 1.4259,
988
  "step": 12500
989
  },
990
  {
991
- "epoch": 0.36,
992
- "grad_norm": 1.0470432043075562,
993
- "learning_rate": 3.610369618398404e-06,
994
- "loss": 1.3952,
995
  "step": 12600
996
  },
997
  {
998
- "epoch": 0.3628571428571429,
999
- "grad_norm": 1.0853626728057861,
1000
- "learning_rate": 3.5900160264608395e-06,
1001
- "loss": 1.4005,
1002
  "step": 12700
1003
  },
1004
  {
1005
- "epoch": 0.3657142857142857,
1006
- "grad_norm": 1.0409729480743408,
1007
- "learning_rate": 3.569572831394265e-06,
1008
- "loss": 1.431,
1009
  "step": 12800
1010
  },
1011
  {
1012
- "epoch": 0.36857142857142855,
1013
- "grad_norm": 1.1226378679275513,
1014
- "learning_rate": 3.5490417137007997e-06,
1015
- "loss": 1.4112,
1016
  "step": 12900
1017
  },
1018
  {
1019
- "epoch": 0.37142857142857144,
1020
- "grad_norm": 1.0430322885513306,
1021
- "learning_rate": 3.528424361110115e-06,
1022
- "loss": 1.3999,
1023
  "step": 13000
1024
  },
1025
  {
1026
- "epoch": 0.37142857142857144,
1027
- "eval_loss": 1.423007845878601,
1028
- "eval_runtime": 99.2113,
1029
- "eval_samples_per_second": 138.039,
1030
- "eval_steps_per_second": 2.157,
1031
  "step": 13000
1032
  },
1033
  {
1034
- "epoch": 0.3742857142857143,
1035
- "grad_norm": 1.1154820919036865,
1036
- "learning_rate": 3.507722468440688e-06,
1037
- "loss": 1.4097,
1038
  "step": 13100
1039
  },
1040
  {
1041
- "epoch": 0.37714285714285717,
1042
- "grad_norm": 1.1299182176589966,
1043
- "learning_rate": 3.4869377374604886e-06,
1044
- "loss": 1.4064,
1045
  "step": 13200
1046
  },
1047
  {
1048
- "epoch": 0.38,
1049
- "grad_norm": 1.1046215295791626,
1050
- "learning_rate": 3.4660718767470854e-06,
1051
- "loss": 1.4234,
1052
  "step": 13300
1053
  },
1054
  {
1055
- "epoch": 0.38285714285714284,
1056
- "grad_norm": 1.0251668691635132,
1057
- "learning_rate": 3.445126601547193e-06,
1058
- "loss": 1.4097,
1059
  "step": 13400
1060
  },
1061
  {
1062
- "epoch": 0.38571428571428573,
1063
- "grad_norm": 1.0839489698410034,
1064
- "learning_rate": 3.4241036336356757e-06,
1065
- "loss": 1.401,
1066
  "step": 13500
1067
  },
1068
  {
1069
- "epoch": 0.38857142857142857,
1070
- "grad_norm": 1.0709606409072876,
1071
- "learning_rate": 3.40300470117401e-06,
1072
- "loss": 1.4164,
1073
  "step": 13600
1074
  },
1075
  {
1076
- "epoch": 0.3914285714285714,
1077
- "grad_norm": 1.0628767013549805,
1078
- "learning_rate": 3.3818315385682255e-06,
1079
- "loss": 1.409,
1080
  "step": 13700
1081
  },
1082
  {
1083
- "epoch": 0.3942857142857143,
1084
- "grad_norm": 1.0831209421157837,
1085
- "learning_rate": 3.3605858863263274e-06,
1086
- "loss": 1.4073,
1087
  "step": 13800
1088
  },
1089
  {
1090
- "epoch": 0.39714285714285713,
1091
- "grad_norm": 1.1459494829177856,
1092
- "learning_rate": 3.339269490915223e-06,
1093
- "loss": 1.4147,
1094
  "step": 13900
1095
  },
1096
  {
1097
- "epoch": 0.4,
1098
- "grad_norm": 1.0614882707595825,
1099
- "learning_rate": 3.317884104617155e-06,
1100
- "loss": 1.4089,
1101
  "step": 14000
1102
  },
1103
  {
1104
- "epoch": 0.4,
1105
- "eval_loss": 1.4181102514266968,
1106
- "eval_runtime": 99.8701,
1107
- "eval_samples_per_second": 137.128,
1108
- "eval_steps_per_second": 2.143,
1109
  "step": 14000
1110
  },
1111
  {
1112
- "epoch": 0.40285714285714286,
1113
- "grad_norm": 1.0587329864501953,
1114
- "learning_rate": 3.2964314853856593e-06,
1115
- "loss": 1.3895,
1116
  "step": 14100
1117
  },
1118
  {
1119
- "epoch": 0.4057142857142857,
1120
- "grad_norm": 1.1020365953445435,
1121
- "learning_rate": 3.2749133967010545e-06,
1122
- "loss": 1.4037,
1123
  "step": 14200
1124
  },
1125
  {
1126
- "epoch": 0.4085714285714286,
1127
- "grad_norm": 1.1230683326721191,
1128
- "learning_rate": 3.253331607425475e-06,
1129
- "loss": 1.4018,
1130
  "step": 14300
1131
  },
1132
  {
1133
- "epoch": 0.4114285714285714,
1134
- "grad_norm": 1.0774966478347778,
1135
- "learning_rate": 3.231687891657469e-06,
1136
- "loss": 1.4087,
1137
  "step": 14400
1138
  },
1139
  {
1140
- "epoch": 0.4142857142857143,
1141
- "grad_norm": 1.0514012575149536,
1142
- "learning_rate": 3.209984028586157e-06,
1143
- "loss": 1.3861,
1144
  "step": 14500
1145
  },
1146
  {
1147
- "epoch": 0.41714285714285715,
1148
- "grad_norm": 1.1025465726852417,
1149
- "learning_rate": 3.188221802344978e-06,
1150
- "loss": 1.4038,
1151
  "step": 14600
1152
  },
1153
  {
1154
- "epoch": 0.42,
1155
- "grad_norm": 1.139419436454773,
1156
- "learning_rate": 3.16640300186503e-06,
1157
- "loss": 1.4033,
1158
  "step": 14700
1159
  },
1160
  {
1161
- "epoch": 0.4228571428571429,
1162
- "grad_norm": 1.043289303779602,
1163
- "learning_rate": 3.1445294207280093e-06,
1164
- "loss": 1.3867,
1165
  "step": 14800
1166
  },
1167
  {
1168
- "epoch": 0.4257142857142857,
1169
- "grad_norm": 1.101967453956604,
1170
- "learning_rate": 3.1226028570187737e-06,
1171
- "loss": 1.391,
1172
  "step": 14900
1173
  },
1174
  {
1175
- "epoch": 0.42857142857142855,
1176
- "grad_norm": 1.0626415014266968,
1177
- "learning_rate": 3.1006251131775342e-06,
1178
- "loss": 1.3949,
1179
  "step": 15000
1180
  },
1181
  {
1182
- "epoch": 0.42857142857142855,
1183
- "eval_loss": 1.4195818901062012,
1184
- "eval_runtime": 99.5817,
1185
- "eval_samples_per_second": 137.525,
1186
- "eval_steps_per_second": 2.149,
1187
  "step": 15000
1188
  },
1189
  {
1190
- "epoch": 0.43142857142857144,
1191
- "grad_norm": 1.1212193965911865,
1192
- "learning_rate": 3.078597995851689e-06,
1193
- "loss": 1.4007,
1194
  "step": 15100
1195
  },
1196
  {
1197
- "epoch": 0.4342857142857143,
1198
- "grad_norm": 1.0601767301559448,
1199
- "learning_rate": 3.056523315747308e-06,
1200
- "loss": 1.4098,
1201
  "step": 15200
1202
  },
1203
  {
1204
- "epoch": 0.43714285714285717,
1205
- "grad_norm": 1.0668915510177612,
1206
- "learning_rate": 3.034402887480287e-06,
1207
- "loss": 1.3885,
1208
  "step": 15300
1209
  },
1210
  {
1211
- "epoch": 0.44,
1212
- "grad_norm": 1.0714190006256104,
1213
- "learning_rate": 3.012238529427181e-06,
1214
- "loss": 1.4018,
1215
  "step": 15400
1216
  },
1217
  {
1218
- "epoch": 0.44285714285714284,
1219
- "grad_norm": 1.1230597496032715,
1220
- "learning_rate": 2.9900320635757293e-06,
1221
- "loss": 1.4086,
1222
  "step": 15500
1223
  },
1224
  {
1225
- "epoch": 0.44571428571428573,
1226
- "grad_norm": 1.0094853639602661,
1227
- "learning_rate": 2.9677853153750763e-06,
1228
- "loss": 1.3801,
1229
  "step": 15600
1230
  },
1231
  {
1232
- "epoch": 0.44857142857142857,
1233
- "grad_norm": 1.0972274541854858,
1234
- "learning_rate": 2.9455001135857194e-06,
1235
- "loss": 1.3985,
1236
  "step": 15700
1237
  },
1238
  {
1239
- "epoch": 0.4514285714285714,
1240
- "grad_norm": 1.0266581773757935,
1241
- "learning_rate": 2.9231782901291726e-06,
1242
- "loss": 1.4124,
1243
  "step": 15800
1244
  },
1245
  {
1246
- "epoch": 0.4542857142857143,
1247
- "grad_norm": 1.138675332069397,
1248
- "learning_rate": 2.900821679937382e-06,
1249
- "loss": 1.4173,
1250
  "step": 15900
1251
  },
1252
  {
1253
- "epoch": 0.45714285714285713,
1254
- "grad_norm": 1.1691060066223145,
1255
- "learning_rate": 2.8784321208018817e-06,
1256
- "loss": 1.4123,
1257
  "step": 16000
1258
  },
1259
  {
1260
- "epoch": 0.45714285714285713,
1261
- "eval_loss": 1.4248454570770264,
1262
- "eval_runtime": 99.8569,
1263
- "eval_samples_per_second": 137.146,
1264
- "eval_steps_per_second": 2.143,
1265
  "step": 16000
1266
  },
1267
  {
1268
- "epoch": 0.46,
1269
- "grad_norm": 1.1149132251739502,
1270
- "learning_rate": 2.8560114532227262e-06,
1271
- "loss": 1.4171,
1272
  "step": 16100
1273
  },
1274
  {
1275
- "epoch": 0.46285714285714286,
1276
- "grad_norm": 1.0276226997375488,
1277
- "learning_rate": 2.8335615202571927e-06,
1278
- "loss": 1.4177,
1279
  "step": 16200
1280
  },
1281
  {
1282
- "epoch": 0.4657142857142857,
1283
- "grad_norm": 1.0828535556793213,
1284
- "learning_rate": 2.811084167368276e-06,
1285
- "loss": 1.3762,
1286
  "step": 16300
1287
  },
1288
  {
1289
- "epoch": 0.4685714285714286,
1290
- "grad_norm": 1.171616554260254,
1291
- "learning_rate": 2.788581242272983e-06,
1292
- "loss": 1.3965,
1293
  "step": 16400
1294
  },
1295
  {
1296
- "epoch": 0.4714285714285714,
1297
- "grad_norm": 1.0692201852798462,
1298
- "learning_rate": 2.7660545947904464e-06,
1299
- "loss": 1.4066,
1300
  "step": 16500
1301
  },
1302
  {
1303
- "epoch": 0.4742857142857143,
1304
- "grad_norm": 1.1563397645950317,
1305
- "learning_rate": 2.7435060766898614e-06,
1306
- "loss": 1.4008,
1307
  "step": 16600
1308
  },
1309
  {
1310
- "epoch": 0.47714285714285715,
1311
- "grad_norm": 1.1032534837722778,
1312
- "learning_rate": 2.7209375415382655e-06,
1313
- "loss": 1.3905,
1314
  "step": 16700
1315
  },
1316
  {
1317
- "epoch": 0.48,
1318
- "grad_norm": 1.1357022523880005,
1319
- "learning_rate": 2.698350844548168e-06,
1320
- "loss": 1.406,
1321
  "step": 16800
1322
  },
1323
  {
1324
- "epoch": 0.4828571428571429,
1325
- "grad_norm": 1.0574637651443481,
1326
- "learning_rate": 2.6757478424250417e-06,
1327
- "loss": 1.4049,
1328
  "step": 16900
1329
  },
1330
  {
1331
- "epoch": 0.4857142857142857,
1332
- "grad_norm": 1.0180025100708008,
1333
- "learning_rate": 2.653130393214702e-06,
1334
- "loss": 1.3979,
1335
  "step": 17000
1336
  },
1337
  {
1338
- "epoch": 0.4857142857142857,
1339
- "eval_loss": 1.4195657968521118,
1340
- "eval_runtime": 99.9485,
1341
- "eval_samples_per_second": 137.021,
1342
- "eval_steps_per_second": 2.141,
1343
  "step": 17000
1344
  },
1345
  {
1346
- "epoch": 0.48857142857142855,
1347
- "grad_norm": 1.0153673887252808,
1348
- "learning_rate": 2.630500356150565e-06,
1349
- "loss": 1.4138,
1350
  "step": 17100
1351
  },
1352
  {
1353
- "epoch": 0.49142857142857144,
1354
- "grad_norm": 1.0832693576812744,
1355
- "learning_rate": 2.6078595915008096e-06,
1356
- "loss": 1.3934,
1357
  "step": 17200
1358
  },
1359
  {
1360
- "epoch": 0.4942857142857143,
1361
- "grad_norm": 1.1552319526672363,
1362
- "learning_rate": 2.585209960415464e-06,
1363
- "loss": 1.414,
1364
  "step": 17300
1365
  },
1366
  {
1367
- "epoch": 0.49714285714285716,
1368
- "grad_norm": 1.1260509490966797,
1369
- "learning_rate": 2.562553324773404e-06,
1370
- "loss": 1.3988,
1371
  "step": 17400
1372
  },
1373
  {
1374
- "epoch": 0.5,
1375
- "grad_norm": 1.1187398433685303,
1376
- "learning_rate": 2.5398915470293077e-06,
1377
- "loss": 1.4048,
1378
  "step": 17500
1379
  },
1380
  {
1381
- "epoch": 0.5028571428571429,
1382
- "grad_norm": 1.0673401355743408,
1383
- "learning_rate": 2.5172264900605497e-06,
1384
- "loss": 1.4012,
1385
  "step": 17600
1386
  },
1387
  {
1388
- "epoch": 0.5057142857142857,
1389
- "grad_norm": 1.098514199256897,
1390
- "learning_rate": 2.49456001701407e-06,
1391
- "loss": 1.4021,
1392
  "step": 17700
1393
  },
1394
  {
1395
- "epoch": 0.5085714285714286,
1396
- "grad_norm": 1.1217247247695923,
1397
- "learning_rate": 2.471893991153216e-06,
1398
- "loss": 1.4041,
1399
  "step": 17800
1400
  },
1401
  {
1402
- "epoch": 0.5114285714285715,
1403
- "grad_norm": 1.1324173212051392,
1404
- "learning_rate": 2.4492302757045705e-06,
1405
- "loss": 1.3942,
1406
  "step": 17900
1407
  },
1408
  {
1409
- "epoch": 0.5142857142857142,
1410
- "grad_norm": 1.1281129121780396,
1411
- "learning_rate": 2.426570733704798e-06,
1412
- "loss": 1.4046,
1413
  "step": 18000
1414
  },
1415
  {
1416
- "epoch": 0.5142857142857142,
1417
- "eval_loss": 1.4171615839004517,
1418
- "eval_runtime": 99.7442,
1419
- "eval_samples_per_second": 137.301,
1420
- "eval_steps_per_second": 2.145,
1421
  "step": 18000
1422
  },
1423
  {
1424
- "epoch": 0.5171428571428571,
1425
- "grad_norm": 1.084283471107483,
1426
- "learning_rate": 2.4039172278474864e-06,
1427
- "loss": 1.4183,
1428
  "step": 18100
1429
  },
1430
  {
1431
- "epoch": 0.52,
1432
- "grad_norm": 1.0714788436889648,
1433
- "learning_rate": 2.381271620330034e-06,
1434
- "loss": 1.3793,
1435
  "step": 18200
1436
  },
1437
  {
1438
- "epoch": 0.5228571428571429,
1439
- "grad_norm": 1.1440812349319458,
1440
- "learning_rate": 2.358635772700567e-06,
1441
- "loss": 1.3765,
1442
  "step": 18300
1443
  },
1444
  {
1445
- "epoch": 0.5257142857142857,
1446
- "grad_norm": 1.0656503438949585,
1447
- "learning_rate": 2.336011545704916e-06,
1448
- "loss": 1.4153,
1449
  "step": 18400
1450
  },
1451
  {
1452
- "epoch": 0.5285714285714286,
1453
- "grad_norm": 1.1328638792037964,
1454
- "learning_rate": 2.3134007991336523e-06,
1455
- "loss": 1.3873,
1456
  "step": 18500
1457
  },
1458
  {
1459
- "epoch": 0.5314285714285715,
1460
- "grad_norm": 1.0806158781051636,
1461
- "learning_rate": 2.290805391669212e-06,
1462
- "loss": 1.3774,
1463
  "step": 18600
1464
  },
1465
  {
1466
- "epoch": 0.5342857142857143,
1467
- "grad_norm": 1.069150686264038,
1468
- "learning_rate": 2.2682271807331003e-06,
1469
- "loss": 1.3918,
1470
  "step": 18700
1471
  },
1472
  {
1473
- "epoch": 0.5371428571428571,
1474
- "grad_norm": 1.1267215013504028,
1475
- "learning_rate": 2.2456680223332103e-06,
1476
- "loss": 1.3845,
1477
  "step": 18800
1478
  },
1479
  {
1480
- "epoch": 0.54,
1481
- "grad_norm": 1.142121434211731,
1482
- "learning_rate": 2.2231297709112496e-06,
1483
- "loss": 1.4109,
1484
  "step": 18900
1485
  },
1486
  {
1487
- "epoch": 0.5428571428571428,
1488
- "grad_norm": 1.0814783573150635,
1489
- "learning_rate": 2.2006142791902957e-06,
1490
- "loss": 1.4098,
1491
  "step": 19000
1492
  },
1493
  {
1494
- "epoch": 0.5428571428571428,
1495
- "eval_loss": 1.416707158088684,
1496
- "eval_runtime": 100.0528,
1497
- "eval_samples_per_second": 136.878,
1498
- "eval_steps_per_second": 2.139,
1499
  "step": 19000
1500
  },
1501
  {
1502
- "epoch": 0.5457142857142857,
1503
- "grad_norm": 1.0706247091293335,
1504
- "learning_rate": 2.1781233980225035e-06,
1505
- "loss": 1.4188,
1506
  "step": 19100
1507
  },
1508
  {
1509
- "epoch": 0.5485714285714286,
1510
- "grad_norm": 1.021273136138916,
1511
- "learning_rate": 2.1556589762369518e-06,
1512
- "loss": 1.3989,
1513
  "step": 19200
1514
  },
1515
  {
1516
- "epoch": 0.5514285714285714,
1517
- "grad_norm": 1.1904112100601196,
1518
- "learning_rate": 2.133222860487667e-06,
1519
- "loss": 1.4393,
1520
  "step": 19300
1521
  },
1522
  {
1523
- "epoch": 0.5542857142857143,
1524
- "grad_norm": 1.1062791347503662,
1525
- "learning_rate": 2.1108168951018186e-06,
1526
- "loss": 1.4045,
1527
  "step": 19400
1528
  },
1529
  {
1530
- "epoch": 0.5571428571428572,
1531
- "grad_norm": 1.1809172630310059,
1532
- "learning_rate": 2.088442921928113e-06,
1533
- "loss": 1.3958,
1534
  "step": 19500
1535
  },
1536
  {
1537
- "epoch": 0.56,
1538
- "grad_norm": 1.0156745910644531,
1539
- "learning_rate": 2.066102780185383e-06,
1540
- "loss": 1.398,
1541
  "step": 19600
1542
  },
1543
  {
1544
- "epoch": 0.5628571428571428,
1545
- "grad_norm": 1.1121779680252075,
1546
- "learning_rate": 2.0437983063114013e-06,
1547
- "loss": 1.4122,
1548
  "step": 19700
1549
  },
1550
  {
1551
- "epoch": 0.5657142857142857,
1552
- "grad_norm": 1.0523419380187988,
1553
- "learning_rate": 2.021531333811914e-06,
1554
- "loss": 1.4063,
1555
  "step": 19800
1556
  },
1557
  {
1558
- "epoch": 0.5685714285714286,
1559
- "grad_norm": 1.099584698677063,
1560
- "learning_rate": 1.9993036931099265e-06,
1561
- "loss": 1.409,
1562
  "step": 19900
1563
  },
1564
  {
1565
- "epoch": 0.5714285714285714,
1566
- "grad_norm": 1.1999467611312866,
1567
- "learning_rate": 1.9771172113952327e-06,
1568
- "loss": 1.4,
1569
  "step": 20000
1570
  },
1571
  {
1572
- "epoch": 0.5714285714285714,
1573
- "eval_loss": 1.415099024772644,
1574
- "eval_runtime": 99.755,
1575
- "eval_samples_per_second": 137.286,
1576
- "eval_steps_per_second": 2.145,
1577
  "step": 20000
1578
  },
1579
  {
1580
- "epoch": 0.5742857142857143,
1581
- "grad_norm": 1.0494403839111328,
1582
- "learning_rate": 1.9549737124742104e-06,
1583
- "loss": 1.4095,
1584
  "step": 20100
1585
  },
1586
  {
1587
- "epoch": 0.5771428571428572,
1588
- "grad_norm": 1.1081063747406006,
1589
- "learning_rate": 1.9328750166199046e-06,
1590
- "loss": 1.3992,
1591
  "step": 20200
1592
  },
1593
  {
1594
- "epoch": 0.58,
1595
- "grad_norm": 1.1197865009307861,
1596
- "learning_rate": 1.91082294042239e-06,
1597
- "loss": 1.3917,
1598
  "step": 20300
1599
  },
1600
  {
1601
- "epoch": 0.5828571428571429,
1602
- "grad_norm": 1.140148639678955,
1603
- "learning_rate": 1.8888192966394448e-06,
1604
- "loss": 1.3907,
1605
  "step": 20400
1606
  },
1607
  {
1608
- "epoch": 0.5857142857142857,
1609
- "grad_norm": 1.0425162315368652,
1610
- "learning_rate": 1.8668658940475298e-06,
1611
- "loss": 1.4006,
1612
  "step": 20500
1613
  },
1614
  {
1615
- "epoch": 0.5885714285714285,
1616
- "grad_norm": 1.1035826206207275,
1617
- "learning_rate": 1.8449645372931068e-06,
1618
- "loss": 1.4033,
1619
  "step": 20600
1620
  },
1621
  {
1622
- "epoch": 0.5914285714285714,
1623
- "grad_norm": 1.1139192581176758,
1624
- "learning_rate": 1.823117026744287e-06,
1625
- "loss": 1.3964,
1626
  "step": 20700
1627
  },
1628
  {
1629
- "epoch": 0.5942857142857143,
1630
- "grad_norm": 1.1130657196044922,
1631
- "learning_rate": 1.8013251583428366e-06,
1632
- "loss": 1.3972,
1633
  "step": 20800
1634
  },
1635
  {
1636
- "epoch": 0.5971428571428572,
1637
- "grad_norm": 1.0860106945037842,
1638
- "learning_rate": 1.7795907234565385e-06,
1639
- "loss": 1.3931,
1640
  "step": 20900
1641
  },
1642
  {
1643
- "epoch": 0.6,
1644
- "grad_norm": 1.05580472946167,
1645
- "learning_rate": 1.7579155087319443e-06,
1646
- "loss": 1.3874,
1647
  "step": 21000
1648
  },
1649
  {
1650
- "epoch": 0.6,
1651
- "eval_loss": 1.4096276760101318,
1652
- "eval_runtime": 99.8984,
1653
- "eval_samples_per_second": 137.089,
1654
- "eval_steps_per_second": 2.142,
1655
  "step": 21000
1656
  },
1657
  {
1658
- "epoch": 0.6028571428571429,
1659
- "grad_norm": 1.1223632097244263,
1660
- "learning_rate": 1.7363012959475e-06,
1661
- "loss": 1.3793,
1662
  "step": 21100
1663
  },
1664
  {
1665
- "epoch": 0.6057142857142858,
1666
- "grad_norm": 1.115355372428894,
1667
- "learning_rate": 1.7147498618670778e-06,
1668
- "loss": 1.4093,
1669
  "step": 21200
1670
  },
1671
  {
1672
- "epoch": 0.6085714285714285,
1673
- "grad_norm": 1.0437370538711548,
1674
- "learning_rate": 1.6932629780939225e-06,
1675
- "loss": 1.3875,
1676
  "step": 21300
1677
  },
1678
  {
1679
- "epoch": 0.6114285714285714,
1680
- "grad_norm": 1.0260958671569824,
1681
- "learning_rate": 1.6718424109250154e-06,
1682
- "loss": 1.4035,
1683
  "step": 21400
1684
  },
1685
  {
1686
- "epoch": 0.6142857142857143,
1687
- "grad_norm": 0.9281340837478638,
1688
- "learning_rate": 1.6504899212058837e-06,
1689
- "loss": 1.3853,
1690
  "step": 21500
1691
  },
1692
  {
1693
- "epoch": 0.6171428571428571,
1694
- "grad_norm": 1.1064680814743042,
1695
- "learning_rate": 1.6292072641858478e-06,
1696
- "loss": 1.4016,
1697
  "step": 21600
1698
  },
1699
  {
1700
- "epoch": 0.62,
1701
- "grad_norm": 1.0704963207244873,
1702
- "learning_rate": 1.6079961893737384e-06,
1703
- "loss": 1.3848,
1704
  "step": 21700
1705
  },
1706
  {
1707
- "epoch": 0.6228571428571429,
1708
- "grad_norm": 1.0652328729629517,
1709
- "learning_rate": 1.5868584403940768e-06,
1710
- "loss": 1.3749,
1711
  "step": 21800
1712
  },
1713
  {
1714
- "epoch": 0.6257142857142857,
1715
- "grad_norm": 1.0742926597595215,
1716
- "learning_rate": 1.5657957548437447e-06,
1717
- "loss": 1.404,
1718
  "step": 21900
1719
  },
1720
  {
1721
- "epoch": 0.6285714285714286,
1722
- "grad_norm": 1.0579770803451538,
1723
- "learning_rate": 1.5448098641491487e-06,
1724
- "loss": 1.4036,
1725
  "step": 22000
1726
  },
1727
  {
1728
- "epoch": 0.6285714285714286,
1729
- "eval_loss": 1.4158315658569336,
1730
- "eval_runtime": 100.3868,
1731
- "eval_samples_per_second": 136.422,
1732
- "eval_steps_per_second": 2.132,
1733
  "step": 22000
1734
  },
1735
  {
1736
- "epoch": 0.6314285714285715,
1737
- "grad_norm": 1.101526141166687,
1738
- "learning_rate": 1.5239024934238874e-06,
1739
- "loss": 1.4188,
1740
  "step": 22100
1741
  },
1742
  {
1743
- "epoch": 0.6342857142857142,
1744
- "grad_norm": 1.0752556324005127,
1745
- "learning_rate": 1.5030753613269455e-06,
1746
- "loss": 1.3847,
1747
  "step": 22200
1748
  },
1749
  {
1750
- "epoch": 0.6371428571428571,
1751
- "grad_norm": 1.0786316394805908,
1752
- "learning_rate": 1.4823301799214101e-06,
1753
- "loss": 1.3867,
1754
  "step": 22300
1755
  },
1756
  {
1757
- "epoch": 0.64,
1758
- "grad_norm": 1.0339590311050415,
1759
- "learning_rate": 1.4616686545337374e-06,
1760
- "loss": 1.3662,
1761
  "step": 22400
1762
  },
1763
  {
1764
- "epoch": 0.6428571428571429,
1765
- "grad_norm": 1.0901203155517578,
1766
- "learning_rate": 1.4410924836135625e-06,
1767
- "loss": 1.3836,
1768
  "step": 22500
1769
  },
1770
  {
1771
- "epoch": 0.6457142857142857,
1772
- "grad_norm": 1.0557289123535156,
1773
- "learning_rate": 1.4206033585940895e-06,
1774
- "loss": 1.375,
1775
  "step": 22600
1776
  },
1777
  {
1778
- "epoch": 0.6485714285714286,
1779
- "grad_norm": 1.049706220626831,
1780
- "learning_rate": 1.40020296375304e-06,
1781
- "loss": 1.3977,
1782
  "step": 22700
1783
  },
1784
  {
1785
- "epoch": 0.6514285714285715,
1786
- "grad_norm": 1.170900821685791,
1787
- "learning_rate": 1.379892976074209e-06,
1788
- "loss": 1.384,
1789
  "step": 22800
1790
  },
1791
  {
1792
- "epoch": 0.6542857142857142,
1793
- "grad_norm": 1.10288667678833,
1794
- "learning_rate": 1.3596750651096047e-06,
1795
- "loss": 1.4045,
1796
  "step": 22900
1797
  },
1798
  {
1799
- "epoch": 0.6571428571428571,
1800
- "grad_norm": 1.0909626483917236,
1801
- "learning_rate": 1.3395508928422074e-06,
1802
- "loss": 1.4018,
1803
  "step": 23000
1804
  },
1805
  {
1806
- "epoch": 0.6571428571428571,
1807
- "eval_loss": 1.4156948328018188,
1808
- "eval_runtime": 100.678,
1809
- "eval_samples_per_second": 136.028,
1810
- "eval_steps_per_second": 2.126,
1811
  "step": 23000
1812
  },
1813
  {
1814
- "epoch": 0.66,
1815
- "grad_norm": 1.1391985416412354,
1816
- "learning_rate": 1.3195221135493503e-06,
1817
- "loss": 1.372,
1818
  "step": 23100
1819
  },
1820
  {
1821
- "epoch": 0.6628571428571428,
1822
- "grad_norm": 1.124377965927124,
1823
- "learning_rate": 1.2995903736667267e-06,
1824
- "loss": 1.3998,
1825
  "step": 23200
1826
  },
1827
  {
1828
- "epoch": 0.6657142857142857,
1829
- "grad_norm": 1.1056832075119019,
1830
- "learning_rate": 1.279757311653056e-06,
1831
- "loss": 1.3677,
1832
  "step": 23300
1833
  },
1834
  {
1835
- "epoch": 0.6685714285714286,
1836
- "grad_norm": 1.0959793329238892,
1837
- "learning_rate": 1.2600245578553866e-06,
1838
- "loss": 1.3801,
1839
  "step": 23400
1840
  },
1841
  {
1842
- "epoch": 0.6714285714285714,
1843
- "grad_norm": 1.0466543436050415,
1844
- "learning_rate": 1.240393734375086e-06,
1845
- "loss": 1.3866,
1846
  "step": 23500
1847
  },
1848
  {
1849
- "epoch": 0.6742857142857143,
1850
- "grad_norm": 1.0811994075775146,
1851
- "learning_rate": 1.2208664549344884e-06,
1852
- "loss": 1.3885,
1853
  "step": 23600
1854
  },
1855
  {
1856
- "epoch": 0.6771428571428572,
1857
- "grad_norm": 1.1533517837524414,
1858
- "learning_rate": 1.2014443247442498e-06,
1859
- "loss": 1.3684,
1860
  "step": 23700
1861
  },
1862
  {
1863
- "epoch": 0.68,
1864
- "grad_norm": 1.0400276184082031,
1865
- "learning_rate": 1.1821289403713865e-06,
1866
- "loss": 1.3733,
1867
  "step": 23800
1868
  },
1869
  {
1870
- "epoch": 0.6828571428571428,
1871
- "grad_norm": 1.0742322206497192,
1872
- "learning_rate": 1.1629218896080382e-06,
1873
- "loss": 1.3884,
1874
  "step": 23900
1875
  },
1876
  {
1877
- "epoch": 0.6857142857142857,
1878
- "grad_norm": 1.0781975984573364,
1879
- "learning_rate": 1.1438247513409423e-06,
1880
- "loss": 1.3611,
1881
  "step": 24000
1882
  },
1883
  {
1884
- "epoch": 0.6857142857142857,
1885
- "eval_loss": 1.4142818450927734,
1886
- "eval_runtime": 100.8413,
1887
- "eval_samples_per_second": 135.807,
1888
- "eval_steps_per_second": 2.122,
1889
  "step": 24000
1890
  },
1891
  {
1892
- "epoch": 0.6885714285714286,
1893
- "grad_norm": 1.1411370038986206,
1894
- "learning_rate": 1.1248390954216437e-06,
1895
- "loss": 1.3838,
1896
  "step": 24100
1897
  },
1898
  {
1899
- "epoch": 0.6914285714285714,
1900
- "grad_norm": 1.0584548711776733,
1901
- "learning_rate": 1.1059664825374511e-06,
1902
- "loss": 1.3482,
1903
  "step": 24200
1904
  },
1905
  {
1906
- "epoch": 0.6942857142857143,
1907
- "grad_norm": 1.096170425415039,
1908
- "learning_rate": 1.0872084640831356e-06,
1909
- "loss": 1.3704,
1910
  "step": 24300
1911
  },
1912
  {
1913
- "epoch": 0.6971428571428572,
1914
- "grad_norm": 1.0241279602050781,
1915
- "learning_rate": 1.068566582033411e-06,
1916
- "loss": 1.3735,
1917
  "step": 24400
1918
  },
1919
  {
1920
- "epoch": 0.7,
1921
- "grad_norm": 1.0666210651397705,
1922
- "learning_rate": 1.050042368816168e-06,
1923
- "loss": 1.3893,
1924
  "step": 24500
1925
  },
1926
  {
1927
- "epoch": 0.7028571428571428,
1928
- "grad_norm": 1.0070935487747192,
1929
- "learning_rate": 1.0316373471865108e-06,
1930
- "loss": 1.3807,
1931
  "step": 24600
1932
  },
1933
  {
1934
- "epoch": 0.7057142857142857,
1935
- "grad_norm": 1.0485628843307495,
1936
- "learning_rate": 1.013353030101576e-06,
1937
- "loss": 1.3817,
1938
  "step": 24700
1939
  },
1940
  {
1941
- "epoch": 0.7085714285714285,
1942
- "grad_norm": 0.9520274996757507,
1943
- "learning_rate": 9.951909205961665e-07,
1944
- "loss": 1.3201,
1945
  "step": 24800
1946
  },
1947
  {
1948
- "epoch": 0.7114285714285714,
1949
- "grad_norm": 1.0479100942611694,
1950
- "learning_rate": 9.77152511659194e-07,
1951
- "loss": 1.2627,
1952
  "step": 24900
1953
  },
1954
  {
1955
- "epoch": 0.7142857142857143,
1956
- "grad_norm": 1.0204826593399048,
1957
- "learning_rate": 9.59239286110952e-07,
1958
- "loss": 1.2352,
1959
  "step": 25000
1960
  },
1961
  {
1962
- "epoch": 0.7142857142857143,
1963
- "eval_loss": 1.4112086296081543,
1964
- "eval_runtime": 101.0868,
1965
- "eval_samples_per_second": 135.478,
1966
- "eval_steps_per_second": 2.117,
1967
  "step": 25000
1968
  }
1969
  ],
1970
  "logging_steps": 100,
1971
- "max_steps": 35000,
1972
  "num_input_tokens_seen": 0,
1973
  "num_train_epochs": 9223372036854775807,
1974
  "save_steps": 5000,
@@ -1979,12 +1979,12 @@
1979
  "should_evaluate": false,
1980
  "should_log": false,
1981
  "should_save": true,
1982
- "should_training_stop": false
1983
  },
1984
  "attributes": {}
1985
  }
1986
  },
1987
- "total_flos": 3.3846542204928e+18,
1988
  "train_batch_size": 64,
1989
  "trial_name": null,
1990
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.08784,
6
  "eval_steps": 1000,
7
  "global_step": 25000,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 4e-05,
14
+ "grad_norm": 0.911555290222168,
15
  "learning_rate": 0.0,
16
+ "loss": 0.7505,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.004,
21
+ "grad_norm": 1.2557882070541382,
22
+ "learning_rate": 9.9e-07,
23
+ "loss": 0.831,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.008,
28
+ "grad_norm": 0.9086900353431702,
29
+ "learning_rate": 1.9900000000000004e-06,
30
+ "loss": 0.8295,
31
  "step": 200
32
  },
33
  {
34
+ "epoch": 0.012,
35
+ "grad_norm": 0.9221948385238647,
36
+ "learning_rate": 2.4999758220143106e-06,
37
+ "loss": 0.8411,
38
  "step": 300
39
  },
40
  {
41
+ "epoch": 0.016,
42
+ "grad_norm": 0.8809811472892761,
43
+ "learning_rate": 2.4997764426529066e-06,
44
+ "loss": 0.8288,
45
  "step": 400
46
  },
47
  {
48
+ "epoch": 0.02,
49
+ "grad_norm": 1.3145067691802979,
50
+ "learning_rate": 2.499375702067717e-06,
51
+ "loss": 0.8312,
52
  "step": 500
53
  },
54
  {
55
+ "epoch": 0.024,
56
+ "grad_norm": 0.9034631252288818,
57
+ "learning_rate": 2.4987736648251815e-06,
58
+ "loss": 0.8385,
59
  "step": 600
60
  },
61
  {
62
+ "epoch": 0.028,
63
+ "grad_norm": 0.8681179881095886,
64
+ "learning_rate": 2.497970427924213e-06,
65
+ "loss": 0.8175,
66
  "step": 700
67
  },
68
  {
69
+ "epoch": 0.032,
70
+ "grad_norm": 0.9303165674209595,
71
+ "learning_rate": 2.496966120780569e-06,
72
+ "loss": 0.8281,
73
  "step": 800
74
  },
75
  {
76
+ "epoch": 0.036,
77
+ "grad_norm": 0.9573058485984802,
78
+ "learning_rate": 2.4957609052060012e-06,
79
+ "loss": 0.8326,
80
  "step": 900
81
  },
82
  {
83
+ "epoch": 0.04,
84
+ "grad_norm": 0.9730055928230286,
85
+ "learning_rate": 2.4943549753821847e-06,
86
+ "loss": 0.8391,
87
  "step": 1000
88
  },
89
  {
90
+ "epoch": 0.04,
91
+ "eval_loss": 1.5264503955841064,
92
+ "eval_runtime": 104.8997,
93
+ "eval_samples_per_second": 130.553,
94
+ "eval_steps_per_second": 2.04,
95
  "step": 1000
96
  },
97
  {
98
+ "epoch": 0.044,
99
+ "grad_norm": 0.8237825632095337,
100
+ "learning_rate": 2.4927485578294313e-06,
101
+ "loss": 0.8176,
102
  "step": 1100
103
  },
104
  {
105
+ "epoch": 0.048,
106
+ "grad_norm": 0.9133234620094299,
107
+ "learning_rate": 2.4909419113701947e-06,
108
+ "loss": 0.8303,
109
  "step": 1200
110
  },
111
  {
112
+ "epoch": 0.052,
113
+ "grad_norm": 0.9377557635307312,
114
+ "learning_rate": 2.4889353270873663e-06,
115
+ "loss": 0.8159,
116
  "step": 1300
117
  },
118
  {
119
+ "epoch": 0.056,
120
+ "grad_norm": 0.9034435749053955,
121
+ "learning_rate": 2.4867291282773805e-06,
122
+ "loss": 0.8145,
123
  "step": 1400
124
  },
125
  {
126
+ "epoch": 0.06,
127
+ "grad_norm": 1.0601003170013428,
128
+ "learning_rate": 2.4843236703981235e-06,
129
+ "loss": 0.8317,
130
  "step": 1500
131
  },
132
  {
133
+ "epoch": 0.064,
134
+ "grad_norm": 0.9157763719558716,
135
+ "learning_rate": 2.481719341011662e-06,
136
+ "loss": 0.8355,
137
  "step": 1600
138
  },
139
  {
140
+ "epoch": 0.068,
141
+ "grad_norm": 0.9011576175689697,
142
+ "learning_rate": 2.4789165597218035e-06,
143
+ "loss": 0.8319,
144
  "step": 1700
145
  },
146
  {
147
+ "epoch": 0.072,
148
+ "grad_norm": 0.8954268097877502,
149
+ "learning_rate": 2.475915778106486e-06,
150
+ "loss": 0.8156,
151
  "step": 1800
152
  },
153
  {
154
+ "epoch": 0.076,
155
+ "grad_norm": 0.8911709189414978,
156
+ "learning_rate": 2.4727174796450266e-06,
157
+ "loss": 0.8365,
158
  "step": 1900
159
  },
160
  {
161
+ "epoch": 0.08,
162
+ "grad_norm": 0.9407449960708618,
163
+ "learning_rate": 2.4693221796402166e-06,
164
+ "loss": 0.8288,
165
  "step": 2000
166
  },
167
  {
168
+ "epoch": 0.08,
169
+ "eval_loss": 1.5217734575271606,
170
+ "eval_runtime": 98.2235,
171
+ "eval_samples_per_second": 139.427,
172
+ "eval_steps_per_second": 2.179,
173
  "step": 2000
174
  },
175
  {
176
+ "epoch": 0.084,
177
+ "grad_norm": 0.8769101500511169,
178
+ "learning_rate": 2.4657304251353047e-06,
179
+ "loss": 0.8131,
180
  "step": 2100
181
  },
182
  {
183
+ "epoch": 0.088,
184
+ "grad_norm": 0.8608514070510864,
185
+ "learning_rate": 2.4619427948258547e-06,
186
+ "loss": 0.8088,
187
  "step": 2200
188
  },
189
  {
190
+ "epoch": 0.092,
191
+ "grad_norm": 0.9365686178207397,
192
+ "learning_rate": 2.4579598989665065e-06,
193
+ "loss": 0.8286,
194
  "step": 2300
195
  },
196
  {
197
+ "epoch": 0.096,
198
+ "grad_norm": 0.928945779800415,
199
+ "learning_rate": 2.453782379272657e-06,
200
+ "loss": 0.8109,
201
  "step": 2400
202
  },
203
  {
204
+ "epoch": 0.1,
205
+ "grad_norm": 0.9162323474884033,
206
+ "learning_rate": 2.449410908817064e-06,
207
+ "loss": 0.806,
208
  "step": 2500
209
  },
210
  {
211
+ "epoch": 0.104,
212
+ "grad_norm": 0.9436105489730835,
213
+ "learning_rate": 2.444846191921406e-06,
214
+ "loss": 0.7969,
215
  "step": 2600
216
  },
217
  {
218
+ "epoch": 0.108,
219
+ "grad_norm": 0.9459385871887207,
220
+ "learning_rate": 2.4400889640427992e-06,
221
+ "loss": 0.8315,
222
  "step": 2700
223
  },
224
  {
225
+ "epoch": 0.112,
226
+ "grad_norm": 0.9575082063674927,
227
+ "learning_rate": 2.435139991655308e-06,
228
+ "loss": 0.8324,
229
  "step": 2800
230
  },
231
  {
232
+ "epoch": 0.116,
233
+ "grad_norm": 0.927148163318634,
234
+ "learning_rate": 2.4300000721264466e-06,
235
+ "loss": 0.8267,
236
  "step": 2900
237
  },
238
  {
239
+ "epoch": 0.12,
240
+ "grad_norm": 0.9774505496025085,
241
+ "learning_rate": 2.4246700335887123e-06,
242
+ "loss": 0.8262,
243
  "step": 3000
244
  },
245
  {
246
+ "epoch": 0.12,
247
+ "eval_loss": 1.5202959775924683,
248
+ "eval_runtime": 98.5199,
249
+ "eval_samples_per_second": 139.007,
250
+ "eval_steps_per_second": 2.172,
251
  "step": 3000
252
  },
253
  {
254
+ "epoch": 0.124,
255
+ "grad_norm": 0.9433075785636902,
256
+ "learning_rate": 2.4191507348061575e-06,
257
+ "loss": 0.803,
258
  "step": 3100
259
  },
260
  {
261
+ "epoch": 0.128,
262
+ "grad_norm": 0.9418466091156006,
263
+ "learning_rate": 2.4134430650360284e-06,
264
+ "loss": 0.8088,
265
  "step": 3200
266
  },
267
  {
268
+ "epoch": 0.132,
269
+ "grad_norm": 0.9223436713218689,
270
+ "learning_rate": 2.407547943885489e-06,
271
+ "loss": 0.8116,
272
  "step": 3300
273
  },
274
  {
275
+ "epoch": 0.136,
276
+ "grad_norm": 0.9359924793243408,
277
+ "learning_rate": 2.4014663211634552e-06,
278
+ "loss": 0.8232,
279
  "step": 3400
280
  },
281
  {
282
+ "epoch": 0.14,
283
+ "grad_norm": 0.9347231388092041,
284
+ "learning_rate": 2.395199176727567e-06,
285
+ "loss": 0.8131,
286
  "step": 3500
287
  },
288
  {
289
+ "epoch": 0.144,
290
+ "grad_norm": 0.9255951046943665,
291
+ "learning_rate": 2.388747520326311e-06,
292
+ "loss": 0.8064,
293
  "step": 3600
294
  },
295
  {
296
+ "epoch": 0.148,
297
+ "grad_norm": 0.8580342531204224,
298
+ "learning_rate": 2.3821123914363374e-06,
299
+ "loss": 0.8247,
300
  "step": 3700
301
  },
302
  {
303
+ "epoch": 0.152,
304
+ "grad_norm": 0.8920683860778809,
305
+ "learning_rate": 2.3752948590949766e-06,
306
+ "loss": 0.8058,
307
  "step": 3800
308
  },
309
  {
310
+ "epoch": 0.156,
311
+ "grad_norm": 0.8848472237586975,
312
+ "learning_rate": 2.368296021728002e-06,
313
+ "loss": 0.8209,
314
  "step": 3900
315
  },
316
  {
317
+ "epoch": 0.16,
318
+ "grad_norm": 0.9708815217018127,
319
+ "learning_rate": 2.3611170069726532e-06,
320
+ "loss": 0.8216,
321
  "step": 4000
322
  },
323
  {
324
+ "epoch": 0.16,
325
+ "eval_loss": 1.5283503532409668,
326
+ "eval_runtime": 98.9755,
327
+ "eval_samples_per_second": 138.368,
328
+ "eval_steps_per_second": 2.162,
329
  "step": 4000
330
  },
331
  {
332
+ "epoch": 0.164,
333
+ "grad_norm": 0.8715313673019409,
334
+ "learning_rate": 2.3537589714959523e-06,
335
+ "loss": 0.8185,
336
  "step": 4100
337
  },
338
  {
339
+ "epoch": 0.168,
340
+ "grad_norm": 0.9748795032501221,
341
+ "learning_rate": 2.346223100808346e-06,
342
+ "loss": 0.8172,
343
  "step": 4200
344
  },
345
  {
346
+ "epoch": 0.172,
347
+ "grad_norm": 0.900182843208313,
348
+ "learning_rate": 2.3385106090726974e-06,
349
+ "loss": 0.8101,
350
  "step": 4300
351
  },
352
  {
353
+ "epoch": 0.176,
354
+ "grad_norm": 0.8882376551628113,
355
+ "learning_rate": 2.330622738908663e-06,
356
+ "loss": 0.8004,
357
  "step": 4400
358
  },
359
  {
360
+ "epoch": 0.18,
361
+ "grad_norm": 0.9087768793106079,
362
+ "learning_rate": 2.322560761192485e-06,
363
+ "loss": 0.8028,
364
  "step": 4500
365
  },
366
  {
367
+ "epoch": 0.184,
368
+ "grad_norm": 0.9928045868873596,
369
+ "learning_rate": 2.3143259748522308e-06,
370
+ "loss": 0.8257,
371
  "step": 4600
372
  },
373
  {
374
+ "epoch": 0.188,
375
+ "grad_norm": 0.9519675970077515,
376
+ "learning_rate": 2.3059197066585126e-06,
377
+ "loss": 0.817,
378
  "step": 4700
379
  },
380
  {
381
+ "epoch": 0.192,
382
+ "grad_norm": 0.970738410949707,
383
+ "learning_rate": 2.297343311010719e-06,
384
+ "loss": 0.8109,
385
  "step": 4800
386
  },
387
  {
388
+ "epoch": 0.196,
389
+ "grad_norm": 0.9740980267524719,
390
+ "learning_rate": 2.2885981697188002e-06,
391
+ "loss": 0.8168,
392
  "step": 4900
393
  },
394
  {
395
+ "epoch": 0.2,
396
+ "grad_norm": 0.9454805850982666,
397
+ "learning_rate": 2.2796856917806313e-06,
398
+ "loss": 0.8305,
399
  "step": 5000
400
  },
401
  {
402
+ "epoch": 0.2,
403
+ "eval_loss": 1.5317082405090332,
404
+ "eval_runtime": 98.9715,
405
+ "eval_samples_per_second": 138.373,
406
+ "eval_steps_per_second": 2.162,
407
  "step": 5000
408
  },
409
  {
410
+ "epoch": 0.204,
411
+ "grad_norm": 0.9181498289108276,
412
+ "learning_rate": 2.270607313155e-06,
413
+ "loss": 0.807,
414
  "step": 5100
415
  },
416
  {
417
+ "epoch": 0.208,
418
+ "grad_norm": 0.8452897071838379,
419
+ "learning_rate": 2.2613644965302456e-06,
420
+ "loss": 0.802,
421
  "step": 5200
422
  },
423
  {
424
+ "epoch": 0.212,
425
+ "grad_norm": 0.8827036619186401,
426
+ "learning_rate": 2.251958731088596e-06,
427
+ "loss": 0.8001,
428
  "step": 5300
429
  },
430
  {
431
+ "epoch": 0.216,
432
+ "grad_norm": 0.8728039264678955,
433
+ "learning_rate": 2.242391532266232e-06,
434
+ "loss": 0.8211,
435
  "step": 5400
436
  },
437
  {
438
+ "epoch": 0.22,
439
+ "grad_norm": 0.9410618543624878,
440
+ "learning_rate": 2.2326644415091264e-06,
441
+ "loss": 0.7996,
442
  "step": 5500
443
  },
444
  {
445
+ "epoch": 0.224,
446
+ "grad_norm": 0.9829330444335938,
447
+ "learning_rate": 2.2227790260246856e-06,
448
+ "loss": 0.7971,
449
  "step": 5600
450
  },
451
  {
452
+ "epoch": 0.228,
453
+ "grad_norm": 0.9688398241996765,
454
+ "learning_rate": 2.2127368785292484e-06,
455
+ "loss": 0.7854,
456
  "step": 5700
457
  },
458
  {
459
+ "epoch": 0.232,
460
+ "grad_norm": 0.864470362663269,
461
+ "learning_rate": 2.2025396169914697e-06,
462
+ "loss": 0.8192,
463
  "step": 5800
464
  },
465
  {
466
+ "epoch": 0.236,
467
+ "grad_norm": 0.9038395881652832,
468
+ "learning_rate": 2.1921888843716356e-06,
469
+ "loss": 0.8005,
470
  "step": 5900
471
  },
472
  {
473
+ "epoch": 0.24,
474
+ "grad_norm": 0.8807651996612549,
475
+ "learning_rate": 2.181686348356955e-06,
476
+ "loss": 0.806,
477
  "step": 6000
478
  },
479
  {
480
+ "epoch": 0.24,
481
+ "eval_loss": 1.524116039276123,
482
+ "eval_runtime": 99.2477,
483
+ "eval_samples_per_second": 137.988,
484
+ "eval_steps_per_second": 2.156,
485
  "step": 6000
486
  },
487
  {
488
+ "epoch": 0.244,
489
+ "grad_norm": 1.0644515752792358,
490
+ "learning_rate": 2.1710337010928655e-06,
491
+ "loss": 0.8232,
492
  "step": 6100
493
  },
494
  {
495
+ "epoch": 0.248,
496
+ "grad_norm": 0.9187564253807068,
497
+ "learning_rate": 2.1602326589103967e-06,
498
+ "loss": 0.8036,
499
  "step": 6200
500
  },
501
  {
502
+ "epoch": 0.252,
503
+ "grad_norm": 0.9233301877975464,
504
+ "learning_rate": 2.1492849620496414e-06,
505
+ "loss": 0.8118,
506
  "step": 6300
507
  },
508
  {
509
+ "epoch": 0.256,
510
+ "grad_norm": 0.9559895396232605,
511
+ "learning_rate": 2.13819237437937e-06,
512
+ "loss": 0.7959,
513
  "step": 6400
514
  },
515
  {
516
+ "epoch": 0.26,
517
+ "grad_norm": 0.8455320000648499,
518
+ "learning_rate": 2.126956683112842e-06,
519
+ "loss": 0.8254,
520
  "step": 6500
521
  },
522
  {
523
+ "epoch": 0.264,
524
+ "grad_norm": 0.942471444606781,
525
+ "learning_rate": 2.1155796985198495e-06,
526
+ "loss": 0.808,
527
  "step": 6600
528
  },
529
  {
530
+ "epoch": 0.268,
531
+ "grad_norm": 0.8535305261611938,
532
+ "learning_rate": 2.1040632536350573e-06,
533
+ "loss": 0.8182,
534
  "step": 6700
535
  },
536
  {
537
+ "epoch": 0.272,
538
+ "grad_norm": 0.8879380226135254,
539
+ "learning_rate": 2.092409203962663e-06,
540
+ "loss": 0.8177,
541
  "step": 6800
542
  },
543
  {
544
+ "epoch": 0.276,
545
+ "grad_norm": 0.8684147000312805,
546
+ "learning_rate": 2.080619427177443e-06,
547
+ "loss": 0.7982,
548
  "step": 6900
549
  },
550
  {
551
+ "epoch": 0.28,
552
+ "grad_norm": 0.9437069892883301,
553
+ "learning_rate": 2.0686958228222298e-06,
554
+ "loss": 0.7984,
555
  "step": 7000
556
  },
557
  {
558
+ "epoch": 0.28,
559
+ "eval_loss": 1.530232548713684,
560
+ "eval_runtime": 99.3518,
561
+ "eval_samples_per_second": 137.844,
562
+ "eval_steps_per_second": 2.154,
563
  "step": 7000
564
  },
565
  {
566
+ "epoch": 0.284,
567
+ "grad_norm": 0.9226755499839783,
568
+ "learning_rate": 2.056640312001856e-06,
569
+ "loss": 0.8072,
570
  "step": 7100
571
  },
572
  {
573
+ "epoch": 0.288,
574
+ "grad_norm": 0.9192745685577393,
575
+ "learning_rate": 2.0444548370736335e-06,
576
+ "loss": 0.8081,
577
  "step": 7200
578
  },
579
  {
580
+ "epoch": 0.292,
581
+ "grad_norm": 1.026985764503479,
582
+ "learning_rate": 2.032141361334406e-06,
583
+ "loss": 0.8074,
584
  "step": 7300
585
  },
586
  {
587
+ "epoch": 0.296,
588
+ "grad_norm": 0.8428290486335754,
589
+ "learning_rate": 2.019701868704224e-06,
590
+ "loss": 0.8081,
591
  "step": 7400
592
  },
593
  {
594
+ "epoch": 0.3,
595
+ "grad_norm": 0.9866459369659424,
596
+ "learning_rate": 2.007138363406702e-06,
597
+ "loss": 0.8241,
598
  "step": 7500
599
  },
600
  {
601
+ "epoch": 0.304,
602
+ "grad_norm": 0.9240759015083313,
603
+ "learning_rate": 1.9944528696461016e-06,
604
+ "loss": 0.8089,
605
  "step": 7600
606
  },
607
  {
608
+ "epoch": 0.308,
609
+ "grad_norm": 0.8980386853218079,
610
+ "learning_rate": 1.9816474312811984e-06,
611
+ "loss": 0.7995,
612
  "step": 7700
613
  },
614
  {
615
+ "epoch": 0.312,
616
+ "grad_norm": 0.9766695499420166,
617
+ "learning_rate": 1.9687241114959753e-06,
618
+ "loss": 0.7969,
619
  "step": 7800
620
  },
621
  {
622
+ "epoch": 0.316,
623
+ "grad_norm": 0.8739997148513794,
624
+ "learning_rate": 1.955684992467211e-06,
625
+ "loss": 0.8053,
626
  "step": 7900
627
  },
628
  {
629
+ "epoch": 0.32,
630
+ "grad_norm": 0.9071422219276428,
631
+ "learning_rate": 1.942532175029003e-06,
632
+ "loss": 0.7896,
633
  "step": 8000
634
  },
635
  {
636
+ "epoch": 0.32,
637
+ "eval_loss": 1.5243619680404663,
638
+ "eval_runtime": 99.5243,
639
+ "eval_samples_per_second": 137.605,
640
+ "eval_steps_per_second": 2.15,
641
  "step": 8000
642
  },
643
  {
644
+ "epoch": 0.324,
645
+ "grad_norm": 0.9778127670288086,
646
+ "learning_rate": 1.929267778334285e-06,
647
+ "loss": 0.7878,
648
  "step": 8100
649
  },
650
  {
651
+ "epoch": 0.328,
652
+ "grad_norm": 0.9122934937477112,
653
+ "learning_rate": 1.915893939513396e-06,
654
+ "loss": 0.7967,
655
  "step": 8200
656
  },
657
  {
658
+ "epoch": 0.332,
659
+ "grad_norm": 0.90513676404953,
660
+ "learning_rate": 1.9024128133297467e-06,
661
+ "loss": 0.8048,
662
  "step": 8300
663
  },
664
  {
665
+ "epoch": 0.336,
666
+ "grad_norm": 0.9107154607772827,
667
+ "learning_rate": 1.8888265718326532e-06,
668
+ "loss": 0.7944,
669
  "step": 8400
670
  },
671
  {
672
+ "epoch": 0.34,
673
+ "grad_norm": 0.8964477777481079,
674
+ "learning_rate": 1.8751374040073774e-06,
675
+ "loss": 0.7958,
676
  "step": 8500
677
  },
678
  {
679
+ "epoch": 0.344,
680
+ "grad_norm": 0.9018213152885437,
681
+ "learning_rate": 1.8613475154224456e-06,
682
+ "loss": 0.8065,
683
  "step": 8600
684
  },
685
  {
686
+ "epoch": 0.348,
687
+ "grad_norm": 0.9653429985046387,
688
+ "learning_rate": 1.8474591278742894e-06,
689
+ "loss": 0.8194,
690
  "step": 8700
691
  },
692
  {
693
+ "epoch": 0.352,
694
+ "grad_norm": 0.9324017763137817,
695
+ "learning_rate": 1.8334744790292766e-06,
696
+ "loss": 0.796,
697
  "step": 8800
698
  },
699
  {
700
+ "epoch": 0.356,
701
+ "grad_norm": 1.0298709869384766,
702
+ "learning_rate": 1.8193958220631833e-06,
703
+ "loss": 0.8268,
704
  "step": 8900
705
  },
706
  {
707
+ "epoch": 0.36,
708
+ "grad_norm": 0.8846196532249451,
709
+ "learning_rate": 1.805225425298166e-06,
710
+ "loss": 0.825,
711
  "step": 9000
712
  },
713
  {
714
+ "epoch": 0.36,
715
+ "eval_loss": 1.5243308544158936,
716
+ "eval_runtime": 100.5198,
717
+ "eval_samples_per_second": 136.242,
718
+ "eval_steps_per_second": 2.129,
719
  "step": 9000
720
  },
721
  {
722
+ "epoch": 0.364,
723
+ "grad_norm": 0.8830705881118774,
724
+ "learning_rate": 1.790965571837296e-06,
725
+ "loss": 0.8233,
726
  "step": 9100
727
  },
728
  {
729
+ "epoch": 0.368,
730
+ "grad_norm": 0.9197975993156433,
731
+ "learning_rate": 1.7766185591967092e-06,
732
+ "loss": 0.8299,
733
  "step": 9200
734
  },
735
  {
736
+ "epoch": 0.372,
737
+ "grad_norm": 1.0428673028945923,
738
+ "learning_rate": 1.762186698935437e-06,
739
+ "loss": 0.8182,
740
  "step": 9300
741
  },
742
  {
743
+ "epoch": 0.376,
744
+ "grad_norm": 0.9466006755828857,
745
+ "learning_rate": 1.7476723162829723e-06,
746
+ "loss": 0.8255,
747
  "step": 9400
748
  },
749
  {
750
+ "epoch": 0.38,
751
+ "grad_norm": 0.9237021803855896,
752
+ "learning_rate": 1.7330777497646328e-06,
753
+ "loss": 0.7672,
754
  "step": 9500
755
  },
756
  {
757
+ "epoch": 0.384,
758
+ "grad_norm": 0.917202353477478,
759
+ "learning_rate": 1.7184053508247853e-06,
760
+ "loss": 0.8427,
761
  "step": 9600
762
  },
763
  {
764
+ "epoch": 0.388,
765
+ "grad_norm": 0.9462612271308899,
766
+ "learning_rate": 1.703657483447983e-06,
767
+ "loss": 0.8409,
768
  "step": 9700
769
  },
770
  {
771
+ "epoch": 0.392,
772
+ "grad_norm": 0.8924245834350586,
773
+ "learning_rate": 1.6888365237780886e-06,
774
+ "loss": 0.8335,
775
  "step": 9800
776
  },
777
  {
778
+ "epoch": 0.396,
779
+ "grad_norm": 0.9719087481498718,
780
+ "learning_rate": 1.6739448597354327e-06,
781
+ "loss": 0.826,
782
  "step": 9900
783
  },
784
  {
785
+ "epoch": 0.4,
786
+ "grad_norm": 0.8893173336982727,
787
+ "learning_rate": 1.6589848906320794e-06,
788
+ "loss": 0.8326,
789
  "step": 10000
790
  },
791
  {
792
+ "epoch": 0.4,
793
+ "eval_loss": 1.5264792442321777,
794
+ "eval_runtime": 101.5699,
795
+ "eval_samples_per_second": 134.833,
796
+ "eval_steps_per_second": 2.107,
797
  "step": 10000
798
  },
799
  {
800
+ "epoch": 0.404,
801
+ "grad_norm": 0.8719335198402405,
802
+ "learning_rate": 1.6439590267852528e-06,
803
+ "loss": 0.8198,
804
  "step": 10100
805
  },
806
  {
807
+ "epoch": 0.408,
808
+ "grad_norm": 0.8997857570648193,
809
+ "learning_rate": 1.6288696891289938e-06,
810
+ "loss": 0.8103,
811
  "step": 10200
812
  },
813
  {
814
+ "epoch": 0.412,
815
+ "grad_norm": 0.9756138920783997,
816
+ "learning_rate": 1.6137193088241021e-06,
817
+ "loss": 0.8245,
818
  "step": 10300
819
  },
820
  {
821
+ "epoch": 0.416,
822
+ "grad_norm": 1.009027123451233,
823
+ "learning_rate": 1.598510326866435e-06,
824
+ "loss": 0.8226,
825
  "step": 10400
826
  },
827
  {
828
+ "epoch": 0.42,
829
+ "grad_norm": 0.9941139221191406,
830
+ "learning_rate": 1.583245193693619e-06,
831
+ "loss": 0.8154,
832
  "step": 10500
833
  },
834
  {
835
+ "epoch": 0.424,
836
+ "grad_norm": 0.9156614542007446,
837
+ "learning_rate": 1.5679263687902402e-06,
838
+ "loss": 0.8194,
839
  "step": 10600
840
  },
841
  {
842
+ "epoch": 0.428,
843
+ "grad_norm": 0.9270005226135254,
844
+ "learning_rate": 1.552556320291578e-06,
845
+ "loss": 0.8144,
846
  "step": 10700
847
  },
848
  {
849
+ "epoch": 0.432,
850
+ "grad_norm": 0.9664807915687561,
851
+ "learning_rate": 1.5371375245859446e-06,
852
+ "loss": 0.823,
853
  "step": 10800
854
  },
855
  {
856
+ "epoch": 0.436,
857
+ "grad_norm": 0.9909628629684448,
858
+ "learning_rate": 1.5216724659156944e-06,
859
+ "loss": 0.8319,
860
  "step": 10900
861
  },
862
  {
863
+ "epoch": 0.44,
864
+ "grad_norm": 1.0144808292388916,
865
+ "learning_rate": 1.506163635976969e-06,
866
+ "loss": 0.8272,
867
  "step": 11000
868
  },
869
  {
870
+ "epoch": 0.44,
871
+ "eval_loss": 1.5209919214248657,
872
+ "eval_runtime": 101.3638,
873
+ "eval_samples_per_second": 135.107,
874
+ "eval_steps_per_second": 2.111,
875
  "step": 11000
876
  },
877
  {
878
+ "epoch": 0.444,
879
+ "grad_norm": 0.9689117074012756,
880
+ "learning_rate": 1.49061353351824e-06,
881
+ "loss": 0.8408,
882
  "step": 11100
883
  },
884
  {
885
+ "epoch": 0.448,
886
+ "grad_norm": 1.0267921686172485,
887
+ "learning_rate": 1.4750246639377161e-06,
888
+ "loss": 0.8362,
889
  "step": 11200
890
  },
891
  {
892
+ "epoch": 0.452,
893
+ "grad_norm": 0.920600175857544,
894
+ "learning_rate": 1.4593995388796797e-06,
895
+ "loss": 0.8343,
896
  "step": 11300
897
  },
898
  {
899
+ "epoch": 0.456,
900
+ "grad_norm": 1.025995135307312,
901
+ "learning_rate": 1.4437406758298156e-06,
902
+ "loss": 0.8255,
903
  "step": 11400
904
  },
905
  {
906
+ "epoch": 0.46,
907
+ "grad_norm": 0.889402449131012,
908
+ "learning_rate": 1.428050597709599e-06,
909
+ "loss": 0.839,
910
  "step": 11500
911
  },
912
  {
913
+ "epoch": 0.464,
914
+ "grad_norm": 0.8957056999206543,
915
+ "learning_rate": 1.412331832469809e-06,
916
+ "loss": 0.8304,
917
  "step": 11600
918
  },
919
  {
920
+ "epoch": 0.468,
921
+ "grad_norm": 0.9389684796333313,
922
+ "learning_rate": 1.39658691268323e-06,
923
+ "loss": 0.8523,
924
  "step": 11700
925
  },
926
  {
927
+ "epoch": 0.472,
928
+ "grad_norm": 0.9115435481071472,
929
+ "learning_rate": 1.3808183751366089e-06,
930
+ "loss": 0.8421,
931
  "step": 11800
932
  },
933
  {
934
+ "epoch": 0.476,
935
+ "grad_norm": 0.9521908164024353,
936
+ "learning_rate": 1.3650287604219342e-06,
937
+ "loss": 0.8704,
938
  "step": 11900
939
  },
940
  {
941
+ "epoch": 0.48,
942
+ "grad_norm": 0.9166862964630127,
943
+ "learning_rate": 1.3492206125271016e-06,
944
+ "loss": 0.8527,
945
  "step": 12000
946
  },
947
  {
948
+ "epoch": 0.48,
949
+ "eval_loss": 1.5229912996292114,
950
+ "eval_runtime": 101.6086,
951
+ "eval_samples_per_second": 134.782,
952
+ "eval_steps_per_second": 2.106,
953
  "step": 12000
954
  },
955
  {
956
+ "epoch": 0.484,
957
+ "grad_norm": 0.9557492733001709,
958
+ "learning_rate": 1.333396478426031e-06,
959
+ "loss": 0.8499,
960
  "step": 12100
961
  },
962
  {
963
+ "epoch": 0.488,
964
+ "grad_norm": 0.9957550764083862,
965
+ "learning_rate": 1.317558907668306e-06,
966
+ "loss": 0.8534,
967
  "step": 12200
968
  },
969
  {
970
+ "epoch": 0.492,
971
+ "grad_norm": 1.1370068788528442,
972
+ "learning_rate": 1.3017104519683932e-06,
973
+ "loss": 0.8336,
974
  "step": 12300
975
  },
976
  {
977
+ "epoch": 0.496,
978
+ "grad_norm": 0.9006808400154114,
979
+ "learning_rate": 1.285853664794518e-06,
980
+ "loss": 0.8196,
981
  "step": 12400
982
  },
983
  {
984
+ "epoch": 0.5,
985
+ "grad_norm": 0.9441719651222229,
986
+ "learning_rate": 1.269991100957254e-06,
987
+ "loss": 0.844,
988
  "step": 12500
989
  },
990
  {
991
+ "epoch": 0.504,
992
+ "grad_norm": 0.8616164922714233,
993
+ "learning_rate": 1.2541253161978986e-06,
994
+ "loss": 0.8319,
995
  "step": 12600
996
  },
997
  {
998
+ "epoch": 0.508,
999
+ "grad_norm": 0.9243165850639343,
1000
+ "learning_rate": 1.238258866776697e-06,
1001
+ "loss": 0.8307,
1002
  "step": 12700
1003
  },
1004
  {
1005
+ "epoch": 0.512,
1006
+ "grad_norm": 0.9617546796798706,
1007
+ "learning_rate": 1.222394309060982e-06,
1008
+ "loss": 0.8562,
1009
  "step": 12800
1010
  },
1011
  {
1012
+ "epoch": 0.516,
1013
+ "grad_norm": 0.8897221684455872,
1014
+ "learning_rate": 1.2065341991133013e-06,
1015
+ "loss": 0.8344,
1016
  "step": 12900
1017
  },
1018
  {
1019
+ "epoch": 0.52,
1020
+ "grad_norm": 0.8364721536636353,
1021
+ "learning_rate": 1.1906810922795864e-06,
1022
+ "loss": 0.8389,
1023
  "step": 13000
1024
  },
1025
  {
1026
+ "epoch": 0.52,
1027
+ "eval_loss": 1.5288289785385132,
1028
+ "eval_runtime": 101.5647,
1029
+ "eval_samples_per_second": 134.84,
1030
+ "eval_steps_per_second": 2.107,
1031
  "step": 13000
1032
  },
1033
  {
1034
+ "epoch": 0.524,
1035
+ "grad_norm": 1.0084967613220215,
1036
+ "learning_rate": 1.1748375427774422e-06,
1037
+ "loss": 0.8498,
1038
  "step": 13100
1039
  },
1040
  {
1041
+ "epoch": 0.528,
1042
+ "grad_norm": 0.9439749717712402,
1043
+ "learning_rate": 1.1590061032846182e-06,
1044
+ "loss": 0.8509,
1045
  "step": 13200
1046
  },
1047
  {
1048
+ "epoch": 0.532,
1049
+ "grad_norm": 0.8930461406707764,
1050
+ "learning_rate": 1.1431893245277262e-06,
1051
+ "loss": 0.8384,
1052
  "step": 13300
1053
  },
1054
  {
1055
+ "epoch": 0.536,
1056
+ "grad_norm": 1.0605283975601196,
1057
+ "learning_rate": 1.1273897548712726e-06,
1058
+ "loss": 0.8557,
1059
  "step": 13400
1060
  },
1061
  {
1062
+ "epoch": 0.54,
1063
+ "grad_norm": 0.8892098069190979,
1064
+ "learning_rate": 1.11160993990707e-06,
1065
+ "loss": 0.8378,
1066
  "step": 13500
1067
  },
1068
  {
1069
+ "epoch": 0.544,
1070
+ "grad_norm": 0.9008782505989075,
1071
+ "learning_rate": 1.0958524220440999e-06,
1072
+ "loss": 0.8437,
1073
  "step": 13600
1074
  },
1075
  {
1076
+ "epoch": 0.548,
1077
+ "grad_norm": 0.8771668672561646,
1078
+ "learning_rate": 1.0801197400988838e-06,
1079
+ "loss": 0.8512,
1080
  "step": 13700
1081
  },
1082
  {
1083
+ "epoch": 0.552,
1084
+ "grad_norm": 0.9245998859405518,
1085
+ "learning_rate": 1.0644144288864352e-06,
1086
+ "loss": 0.8671,
1087
  "step": 13800
1088
  },
1089
  {
1090
+ "epoch": 0.556,
1091
+ "grad_norm": 0.9122968912124634,
1092
+ "learning_rate": 1.048739018811855e-06,
1093
+ "loss": 0.8328,
1094
  "step": 13900
1095
  },
1096
  {
1097
+ "epoch": 0.56,
1098
+ "grad_norm": 0.9968782067298889,
1099
+ "learning_rate": 1.0330960354626384e-06,
1100
+ "loss": 0.851,
1101
  "step": 14000
1102
  },
1103
  {
1104
+ "epoch": 0.56,
1105
+ "eval_loss": 1.5260618925094604,
1106
+ "eval_runtime": 101.9042,
1107
+ "eval_samples_per_second": 134.391,
1108
+ "eval_steps_per_second": 2.1,
1109
  "step": 14000
1110
  },
1111
  {
1112
+ "epoch": 0.564,
1113
+ "grad_norm": 1.0338596105575562,
1114
+ "learning_rate": 1.0174879992017586e-06,
1115
+ "loss": 0.8374,
1116
  "step": 14100
1117
  },
1118
  {
1119
+ "epoch": 0.568,
1120
+ "grad_norm": 0.9291728734970093,
1121
+ "learning_rate": 1.0019174247615919e-06,
1122
+ "loss": 0.8356,
1123
  "step": 14200
1124
  },
1125
  {
1126
+ "epoch": 0.572,
1127
+ "grad_norm": 0.8955647945404053,
1128
+ "learning_rate": 9.863868208387473e-07,
1129
+ "loss": 0.839,
1130
  "step": 14300
1131
  },
1132
  {
1133
+ "epoch": 0.576,
1134
+ "grad_norm": 0.9726178050041199,
1135
+ "learning_rate": 9.708986896898727e-07,
1136
+ "loss": 0.8396,
1137
  "step": 14400
1138
  },
1139
  {
1140
+ "epoch": 0.58,
1141
+ "grad_norm": 0.9720205068588257,
1142
+ "learning_rate": 9.554555267284956e-07,
1143
+ "loss": 0.8334,
1144
  "step": 14500
1145
  },
1146
  {
1147
+ "epoch": 0.584,
1148
+ "grad_norm": 0.9503899216651917,
1149
+ "learning_rate": 9.400598201229705e-07,
1150
+ "loss": 0.8165,
1151
  "step": 14600
1152
  },
1153
  {
1154
+ "epoch": 0.588,
1155
+ "grad_norm": 0.8789735436439514,
1156
+ "learning_rate": 9.247140503955863e-07,
1157
+ "loss": 0.8262,
1158
  "step": 14700
1159
  },
1160
  {
1161
+ "epoch": 0.592,
1162
+ "grad_norm": 1.4387589693069458,
1163
+ "learning_rate": 9.09420690022913e-07,
1164
+ "loss": 0.8378,
1165
  "step": 14800
1166
  },
1167
  {
1168
+ "epoch": 0.596,
1169
+ "grad_norm": 1.1762765645980835,
1170
+ "learning_rate": 8.941822030374405e-07,
1171
+ "loss": 0.8428,
1172
  "step": 14900
1173
  },
1174
  {
1175
+ "epoch": 0.6,
1176
+ "grad_norm": 0.880807638168335,
1177
+ "learning_rate": 8.790010446305814e-07,
1178
+ "loss": 0.8254,
1179
  "step": 15000
1180
  },
1181
  {
1182
+ "epoch": 0.6,
1183
+ "eval_loss": 1.5283499956130981,
1184
+ "eval_runtime": 103.2419,
1185
+ "eval_samples_per_second": 132.65,
1186
+ "eval_steps_per_second": 2.073,
1187
  "step": 15000
1188
  },
1189
  {
1190
+ "epoch": 0.604,
1191
+ "grad_norm": 0.9635188579559326,
1192
+ "learning_rate": 8.63879660757092e-07,
1193
+ "loss": 0.798,
1194
  "step": 15100
1195
  },
1196
  {
1197
+ "epoch": 0.608,
1198
+ "grad_norm": 0.9472705721855164,
1199
+ "learning_rate": 8.488204877409884e-07,
1200
+ "loss": 0.8033,
1201
  "step": 15200
1202
  },
1203
  {
1204
+ "epoch": 0.612,
1205
+ "grad_norm": 0.8378113508224487,
1206
+ "learning_rate": 8.338259518830106e-07,
1207
+ "loss": 0.8012,
1208
  "step": 15300
1209
  },
1210
  {
1211
+ "epoch": 0.616,
1212
+ "grad_norm": 0.9451029300689697,
1213
+ "learning_rate": 8.18898469069703e-07,
1214
+ "loss": 0.8047,
1215
  "step": 15400
1216
  },
1217
  {
1218
+ "epoch": 0.62,
1219
+ "grad_norm": 0.9609344005584717,
1220
+ "learning_rate": 8.040404443841701e-07,
1221
+ "loss": 0.7927,
1222
  "step": 15500
1223
  },
1224
  {
1225
+ "epoch": 0.624,
1226
+ "grad_norm": 0.8947242498397827,
1227
+ "learning_rate": 7.892542717185766e-07,
1228
+ "loss": 0.7885,
1229
  "step": 15600
1230
  },
1231
  {
1232
+ "epoch": 0.628,
1233
+ "grad_norm": 0.9105751514434814,
1234
+ "learning_rate": 7.745423333884483e-07,
1235
+ "loss": 0.801,
1236
  "step": 15700
1237
  },
1238
  {
1239
+ "epoch": 0.632,
1240
+ "grad_norm": 0.899936854839325,
1241
+ "learning_rate": 7.599069997488386e-07,
1242
+ "loss": 0.8005,
1243
  "step": 15800
1244
  },
1245
  {
1246
+ "epoch": 0.636,
1247
+ "grad_norm": 1.0273375511169434,
1248
+ "learning_rate": 7.453506288124224e-07,
1249
+ "loss": 0.8015,
1250
  "step": 15900
1251
  },
1252
  {
1253
+ "epoch": 0.64,
1254
+ "grad_norm": 0.8960332274436951,
1255
+ "learning_rate": 7.308755658695775e-07,
1256
+ "loss": 0.8074,
1257
  "step": 16000
1258
  },
1259
  {
1260
+ "epoch": 0.64,
1261
+ "eval_loss": 1.5343570709228516,
1262
+ "eval_runtime": 102.3372,
1263
+ "eval_samples_per_second": 133.822,
1264
+ "eval_steps_per_second": 2.091,
1265
  "step": 16000
1266
  },
1267
  {
1268
+ "epoch": 0.644,
1269
+ "grad_norm": 0.8942509293556213,
1270
+ "learning_rate": 7.164841431105172e-07,
1271
+ "loss": 0.796,
1272
  "step": 16100
1273
  },
1274
  {
1275
+ "epoch": 0.648,
1276
+ "grad_norm": 0.9353269934654236,
1277
+ "learning_rate": 7.021786792495325e-07,
1278
+ "loss": 0.8196,
1279
  "step": 16200
1280
  },
1281
  {
1282
+ "epoch": 0.652,
1283
+ "grad_norm": 0.985683262348175,
1284
+ "learning_rate": 6.879614791514075e-07,
1285
+ "loss": 0.808,
1286
  "step": 16300
1287
  },
1288
  {
1289
+ "epoch": 0.656,
1290
+ "grad_norm": 0.8981220722198486,
1291
+ "learning_rate": 6.738348334600634e-07,
1292
+ "loss": 0.8015,
1293
  "step": 16400
1294
  },
1295
  {
1296
+ "epoch": 0.66,
1297
+ "grad_norm": 0.9412031173706055,
1298
+ "learning_rate": 6.598010182294938e-07,
1299
+ "loss": 0.8009,
1300
  "step": 16500
1301
  },
1302
  {
1303
+ "epoch": 0.664,
1304
+ "grad_norm": 0.8926331996917725,
1305
+ "learning_rate": 6.458622945570538e-07,
1306
+ "loss": 0.783,
1307
  "step": 16600
1308
  },
1309
  {
1310
+ "epoch": 0.668,
1311
+ "grad_norm": 0.8715830445289612,
1312
+ "learning_rate": 6.320209082191569e-07,
1313
+ "loss": 0.8127,
1314
  "step": 16700
1315
  },
1316
  {
1317
+ "epoch": 0.672,
1318
+ "grad_norm": 0.8215272426605225,
1319
+ "learning_rate": 6.182790893094402e-07,
1320
+ "loss": 0.7958,
1321
  "step": 16800
1322
  },
1323
  {
1324
+ "epoch": 0.676,
1325
+ "grad_norm": 0.9258244037628174,
1326
+ "learning_rate": 6.046390518794556e-07,
1327
+ "loss": 0.7931,
1328
  "step": 16900
1329
  },
1330
  {
1331
+ "epoch": 0.68,
1332
+ "grad_norm": 0.8930866122245789,
1333
+ "learning_rate": 5.911029935819468e-07,
1334
+ "loss": 0.7811,
1335
  "step": 17000
1336
  },
1337
  {
1338
+ "epoch": 0.68,
1339
+ "eval_loss": 1.5324440002441406,
1340
+ "eval_runtime": 102.3251,
1341
+ "eval_samples_per_second": 133.838,
1342
+ "eval_steps_per_second": 2.091,
1343
  "step": 17000
1344
  },
1345
  {
1346
+ "epoch": 0.684,
1347
+ "grad_norm": 0.9415869116783142,
1348
+ "learning_rate": 5.776730953167705e-07,
1349
+ "loss": 0.8003,
1350
  "step": 17100
1351
  },
1352
  {
1353
+ "epoch": 0.688,
1354
+ "grad_norm": 0.892819344997406,
1355
+ "learning_rate": 5.643515208795141e-07,
1356
+ "loss": 0.7943,
1357
  "step": 17200
1358
  },
1359
  {
1360
+ "epoch": 0.692,
1361
+ "grad_norm": 0.9383297562599182,
1362
+ "learning_rate": 5.511404166128647e-07,
1363
+ "loss": 0.7998,
1364
  "step": 17300
1365
  },
1366
  {
1367
+ "epoch": 0.696,
1368
+ "grad_norm": 0.8630228638648987,
1369
+ "learning_rate": 5.380419110608033e-07,
1370
+ "loss": 0.7949,
1371
  "step": 17400
1372
  },
1373
  {
1374
+ "epoch": 0.7,
1375
+ "grad_norm": 0.9032106995582581,
1376
+ "learning_rate": 5.250581146256524e-07,
1377
+ "loss": 0.7928,
1378
  "step": 17500
1379
  },
1380
  {
1381
+ "epoch": 0.704,
1382
+ "grad_norm": 0.9039574265480042,
1383
+ "learning_rate": 5.121911192280557e-07,
1384
+ "loss": 0.8012,
1385
  "step": 17600
1386
  },
1387
  {
1388
+ "epoch": 0.708,
1389
+ "grad_norm": 0.9616802334785461,
1390
+ "learning_rate": 4.994429979699302e-07,
1391
+ "loss": 0.7964,
1392
  "step": 17700
1393
  },
1394
  {
1395
+ "epoch": 0.712,
1396
+ "grad_norm": 0.9427072405815125,
1397
+ "learning_rate": 4.868158048004537e-07,
1398
+ "loss": 0.805,
1399
  "step": 17800
1400
  },
1401
  {
1402
+ "epoch": 0.716,
1403
+ "grad_norm": 0.9399961829185486,
1404
+ "learning_rate": 4.743115741851383e-07,
1405
+ "loss": 0.7913,
1406
  "step": 17900
1407
  },
1408
  {
1409
+ "epoch": 0.72,
1410
+ "grad_norm": 0.8884769678115845,
1411
+ "learning_rate": 4.6193232077804006e-07,
1412
+ "loss": 0.7985,
1413
  "step": 18000
1414
  },
1415
  {
1416
+ "epoch": 0.72,
1417
+ "eval_loss": 1.5309633016586304,
1418
+ "eval_runtime": 102.8962,
1419
+ "eval_samples_per_second": 133.095,
1420
+ "eval_steps_per_second": 2.08,
1421
  "step": 18000
1422
  },
1423
  {
1424
+ "epoch": 0.724,
1425
+ "grad_norm": 0.9725548028945923,
1426
+ "learning_rate": 4.4968003909716243e-07,
1427
+ "loss": 0.8162,
1428
  "step": 18100
1429
  },
1430
  {
1431
+ "epoch": 0.728,
1432
+ "grad_norm": 1.0075186491012573,
1433
+ "learning_rate": 4.3755670320310443e-07,
1434
+ "loss": 0.8054,
1435
  "step": 18200
1436
  },
1437
  {
1438
+ "epoch": 0.732,
1439
+ "grad_norm": 0.8749048113822937,
1440
+ "learning_rate": 4.2556426638100555e-07,
1441
+ "loss": 0.8056,
1442
  "step": 18300
1443
  },
1444
  {
1445
+ "epoch": 0.736,
1446
+ "grad_norm": 0.9941290616989136,
1447
+ "learning_rate": 4.1370466082583353e-07,
1448
+ "loss": 0.8052,
1449
  "step": 18400
1450
  },
1451
  {
1452
+ "epoch": 0.74,
1453
+ "grad_norm": 0.8676705956459045,
1454
+ "learning_rate": 4.0197979733107755e-07,
1455
+ "loss": 0.7861,
1456
  "step": 18500
1457
  },
1458
  {
1459
+ "epoch": 0.744,
1460
+ "grad_norm": 0.9036993980407715,
1461
+ "learning_rate": 3.903915649808812e-07,
1462
+ "loss": 0.8081,
1463
  "step": 18600
1464
  },
1465
  {
1466
+ "epoch": 0.748,
1467
+ "grad_norm": 0.9067134261131287,
1468
+ "learning_rate": 3.789418308456812e-07,
1469
+ "loss": 0.7956,
1470
  "step": 18700
1471
  },
1472
  {
1473
+ "epoch": 0.752,
1474
+ "grad_norm": 0.8146563768386841,
1475
+ "learning_rate": 3.676324396813856e-07,
1476
+ "loss": 0.8031,
1477
  "step": 18800
1478
  },
1479
  {
1480
+ "epoch": 0.756,
1481
+ "grad_norm": 0.9973321557044983,
1482
+ "learning_rate": 3.5646521363215447e-07,
1483
+ "loss": 0.794,
1484
  "step": 18900
1485
  },
1486
  {
1487
+ "epoch": 0.76,
1488
+ "grad_norm": 0.9761902689933777,
1489
+ "learning_rate": 3.4544195193681615e-07,
1490
+ "loss": 0.7816,
1491
  "step": 19000
1492
  },
1493
  {
1494
+ "epoch": 0.76,
1495
+ "eval_loss": 1.5294893980026245,
1496
+ "eval_runtime": 102.4113,
1497
+ "eval_samples_per_second": 133.726,
1498
+ "eval_steps_per_second": 2.09,
1499
  "step": 19000
1500
  },
1501
  {
1502
+ "epoch": 0.764,
1503
+ "grad_norm": 0.8643273115158081,
1504
+ "learning_rate": 3.3456443063898157e-07,
1505
+ "loss": 0.7917,
1506
  "step": 19100
1507
  },
1508
  {
1509
+ "epoch": 0.768,
1510
+ "grad_norm": 0.9306071400642395,
1511
+ "learning_rate": 3.238344023008888e-07,
1512
+ "loss": 0.8012,
1513
  "step": 19200
1514
  },
1515
  {
1516
+ "epoch": 0.772,
1517
+ "grad_norm": 0.9324482083320618,
1518
+ "learning_rate": 3.132535957210366e-07,
1519
+ "loss": 0.7929,
1520
  "step": 19300
1521
  },
1522
  {
1523
+ "epoch": 0.776,
1524
+ "grad_norm": 0.8625467419624329,
1525
+ "learning_rate": 3.0282371565564324e-07,
1526
+ "loss": 0.7815,
1527
  "step": 19400
1528
  },
1529
  {
1530
+ "epoch": 0.78,
1531
+ "grad_norm": 0.8669098019599915,
1532
+ "learning_rate": 2.925464425439789e-07,
1533
+ "loss": 0.8214,
1534
  "step": 19500
1535
  },
1536
  {
1537
+ "epoch": 0.784,
1538
+ "grad_norm": 0.8781657814979553,
1539
+ "learning_rate": 2.824234322376185e-07,
1540
+ "loss": 0.7941,
1541
  "step": 19600
1542
  },
1543
  {
1544
+ "epoch": 0.788,
1545
+ "grad_norm": 0.8899013996124268,
1546
+ "learning_rate": 2.724563157336542e-07,
1547
+ "loss": 0.7966,
1548
  "step": 19700
1549
  },
1550
  {
1551
+ "epoch": 0.792,
1552
+ "grad_norm": 0.9773925542831421,
1553
+ "learning_rate": 2.626466989119131e-07,
1554
+ "loss": 0.8009,
1555
  "step": 19800
1556
  },
1557
  {
1558
+ "epoch": 0.796,
1559
+ "grad_norm": 0.912438690662384,
1560
+ "learning_rate": 2.5299616227621946e-07,
1561
+ "loss": 0.7902,
1562
  "step": 19900
1563
  },
1564
  {
1565
+ "epoch": 0.8,
1566
+ "grad_norm": 0.9557161927223206,
1567
+ "learning_rate": 2.435062606997499e-07,
1568
+ "loss": 0.7889,
1569
  "step": 20000
1570
  },
1571
  {
1572
+ "epoch": 0.8,
1573
+ "eval_loss": 1.5292094945907593,
1574
+ "eval_runtime": 102.5763,
1575
+ "eval_samples_per_second": 133.51,
1576
+ "eval_steps_per_second": 2.086,
1577
  "step": 20000
1578
  },
1579
  {
1580
+ "epoch": 0.804,
1581
+ "grad_norm": 0.8561129570007324,
1582
+ "learning_rate": 2.3417852317451418e-07,
1583
+ "loss": 0.8033,
1584
  "step": 20100
1585
  },
1586
  {
1587
+ "epoch": 0.808,
1588
+ "grad_norm": 0.9422599673271179,
1589
+ "learning_rate": 2.250144525650086e-07,
1590
+ "loss": 0.7985,
1591
  "step": 20200
1592
  },
1593
  {
1594
+ "epoch": 0.812,
1595
+ "grad_norm": 0.8980026245117188,
1596
+ "learning_rate": 2.160155253660759e-07,
1597
+ "loss": 0.7951,
1598
  "step": 20300
1599
  },
1600
  {
1601
+ "epoch": 0.816,
1602
+ "grad_norm": 0.8675551414489746,
1603
+ "learning_rate": 2.071831914650173e-07,
1604
+ "loss": 0.7994,
1605
  "step": 20400
1606
  },
1607
  {
1608
+ "epoch": 0.82,
1609
+ "grad_norm": 0.8988806009292603,
1610
+ "learning_rate": 1.9851887390798922e-07,
1611
+ "loss": 0.7875,
1612
  "step": 20500
1613
  },
1614
  {
1615
+ "epoch": 0.824,
1616
+ "grad_norm": 0.9102202653884888,
1617
+ "learning_rate": 1.9002396867072587e-07,
1618
+ "loss": 0.7993,
1619
  "step": 20600
1620
  },
1621
  {
1622
+ "epoch": 0.828,
1623
+ "grad_norm": 0.9096868634223938,
1624
+ "learning_rate": 1.816998444336214e-07,
1625
+ "loss": 0.7704,
1626
  "step": 20700
1627
  },
1628
  {
1629
+ "epoch": 0.832,
1630
+ "grad_norm": 0.9461880922317505,
1631
+ "learning_rate": 1.7354784236121206e-07,
1632
+ "loss": 0.7853,
1633
  "step": 20800
1634
  },
1635
  {
1636
+ "epoch": 0.836,
1637
+ "grad_norm": 0.9219881296157837,
1638
+ "learning_rate": 1.6556927588609078e-07,
1639
+ "loss": 0.7857,
1640
  "step": 20900
1641
  },
1642
  {
1643
+ "epoch": 0.84,
1644
+ "grad_norm": 0.8964762687683105,
1645
+ "learning_rate": 1.577654304972899e-07,
1646
+ "loss": 0.7872,
1647
  "step": 21000
1648
  },
1649
  {
1650
+ "epoch": 0.84,
1651
+ "eval_loss": 1.524131178855896,
1652
+ "eval_runtime": 102.4749,
1653
+ "eval_samples_per_second": 133.642,
1654
+ "eval_steps_per_second": 2.088,
1655
  "step": 21000
1656
  },
1657
  {
1658
+ "epoch": 0.844,
1659
+ "grad_norm": 0.9355736970901489,
1660
+ "learning_rate": 1.501375635331652e-07,
1661
+ "loss": 0.7957,
1662
  "step": 21100
1663
  },
1664
  {
1665
+ "epoch": 0.848,
1666
+ "grad_norm": 0.8686819076538086,
1667
+ "learning_rate": 1.4268690397881675e-07,
1668
+ "loss": 0.793,
1669
  "step": 21200
1670
  },
1671
  {
1672
+ "epoch": 0.852,
1673
+ "grad_norm": 0.874756395816803,
1674
+ "learning_rate": 1.3541465226807813e-07,
1675
+ "loss": 0.7878,
1676
  "step": 21300
1677
  },
1678
  {
1679
+ "epoch": 0.856,
1680
+ "grad_norm": 0.9285154342651367,
1681
+ "learning_rate": 1.283219800901045e-07,
1682
+ "loss": 0.7547,
1683
  "step": 21400
1684
  },
1685
  {
1686
+ "epoch": 0.86,
1687
+ "grad_norm": 0.9496791958808899,
1688
+ "learning_rate": 1.2141003020059273e-07,
1689
+ "loss": 0.7885,
1690
  "step": 21500
1691
  },
1692
  {
1693
+ "epoch": 0.864,
1694
+ "grad_norm": 0.879410445690155,
1695
+ "learning_rate": 1.1467991623766287e-07,
1696
+ "loss": 0.8123,
1697
  "step": 21600
1698
  },
1699
  {
1700
+ "epoch": 0.868,
1701
+ "grad_norm": 0.942361056804657,
1702
+ "learning_rate": 1.081327225424321e-07,
1703
+ "loss": 0.817,
1704
  "step": 21700
1705
  },
1706
  {
1707
+ "epoch": 0.872,
1708
+ "grad_norm": 0.9548047184944153,
1709
+ "learning_rate": 1.0176950398430752e-07,
1710
+ "loss": 0.7925,
1711
  "step": 21800
1712
  },
1713
  {
1714
+ "epoch": 0.876,
1715
+ "grad_norm": 0.8643764853477478,
1716
+ "learning_rate": 9.559128579102767e-08,
1717
+ "loss": 0.7985,
1718
  "step": 21900
1719
  },
1720
  {
1721
+ "epoch": 0.88,
1722
+ "grad_norm": 0.9450801014900208,
1723
+ "learning_rate": 8.959906338348007e-08,
1724
+ "loss": 0.7975,
1725
  "step": 22000
1726
  },
1727
  {
1728
+ "epoch": 0.88,
1729
+ "eval_loss": 1.5321519374847412,
1730
+ "eval_runtime": 103.5374,
1731
+ "eval_samples_per_second": 132.271,
1732
+ "eval_steps_per_second": 2.067,
1733
  "step": 22000
1734
  },
1735
  {
1736
+ "epoch": 0.884,
1737
+ "grad_norm": 0.9130359292030334,
1738
+ "learning_rate": 8.37938022153223e-08,
1739
+ "loss": 0.8005,
1740
  "step": 22100
1741
  },
1742
  {
1743
+ "epoch": 0.888,
1744
+ "grad_norm": 0.8732690215110779,
1745
+ "learning_rate": 7.817643761742891e-08,
1746
+ "loss": 0.7857,
1747
  "step": 22200
1748
  },
1749
  {
1750
+ "epoch": 0.892,
1751
+ "grad_norm": 0.9094323515892029,
1752
+ "learning_rate": 7.274787464719338e-08,
1753
+ "loss": 0.8096,
1754
  "step": 22300
1755
  },
1756
  {
1757
+ "epoch": 0.896,
1758
+ "grad_norm": 0.8987523913383484,
1759
+ "learning_rate": 6.75089879427078e-08,
1760
+ "loss": 0.8072,
1761
  "step": 22400
1762
  },
1763
  {
1764
+ "epoch": 0.9,
1765
+ "grad_norm": 0.9105306267738342,
1766
+ "learning_rate": 6.246062158184241e-08,
1767
+ "loss": 0.7968,
1768
  "step": 22500
1769
  },
1770
  {
1771
+ "epoch": 0.904,
1772
+ "grad_norm": 0.8889061808586121,
1773
+ "learning_rate": 5.7603588946250064e-08,
1774
+ "loss": 0.7971,
1775
  "step": 22600
1776
  },
1777
  {
1778
+ "epoch": 0.908,
1779
+ "grad_norm": 0.9296440482139587,
1780
+ "learning_rate": 5.293867259031568e-08,
1781
+ "loss": 0.7896,
1782
  "step": 22700
1783
  },
1784
  {
1785
+ "epoch": 0.912,
1786
+ "grad_norm": 1.0374181270599365,
1787
+ "learning_rate": 4.8466624115073164e-08,
1788
+ "loss": 0.808,
1789
  "step": 22800
1790
  },
1791
  {
1792
+ "epoch": 1.00384,
1793
+ "grad_norm": 0.8791893124580383,
1794
+ "learning_rate": 4.4188164047108403e-08,
1795
+ "loss": 0.7835,
1796
  "step": 22900
1797
  },
1798
  {
1799
+ "epoch": 1.00784,
1800
+ "grad_norm": 0.8789498209953308,
1801
+ "learning_rate": 4.010398172247104e-08,
1802
+ "loss": 0.7987,
1803
  "step": 23000
1804
  },
1805
  {
1806
+ "epoch": 1.00784,
1807
+ "eval_loss": 1.5310029983520508,
1808
+ "eval_runtime": 101.8479,
1809
+ "eval_samples_per_second": 134.465,
1810
+ "eval_steps_per_second": 2.101,
1811
  "step": 23000
1812
  },
1813
  {
1814
+ "epoch": 1.01184,
1815
+ "grad_norm": 0.9262071847915649,
1816
+ "learning_rate": 3.6214735175608004e-08,
1817
+ "loss": 0.7966,
1818
  "step": 23100
1819
  },
1820
  {
1821
+ "epoch": 1.01584,
1822
+ "grad_norm": 0.8986383676528931,
1823
+ "learning_rate": 3.252105103334499e-08,
1824
+ "loss": 0.7954,
1825
  "step": 23200
1826
  },
1827
  {
1828
+ "epoch": 1.01984,
1829
+ "grad_norm": 0.9548205733299255,
1830
+ "learning_rate": 2.9023524413923365e-08,
1831
+ "loss": 0.7934,
1832
  "step": 23300
1833
  },
1834
  {
1835
+ "epoch": 1.02384,
1836
+ "grad_norm": 0.9211428165435791,
1837
+ "learning_rate": 2.5722718831117656e-08,
1838
+ "loss": 0.8068,
1839
  "step": 23400
1840
  },
1841
  {
1842
+ "epoch": 1.02784,
1843
+ "grad_norm": 0.9240931272506714,
1844
+ "learning_rate": 2.26191661034425e-08,
1845
+ "loss": 0.787,
1846
  "step": 23500
1847
  },
1848
  {
1849
+ "epoch": 1.03184,
1850
+ "grad_norm": 0.9866804480552673,
1851
+ "learning_rate": 1.9713366268468148e-08,
1852
+ "loss": 0.7929,
1853
  "step": 23600
1854
  },
1855
  {
1856
+ "epoch": 1.03584,
1857
+ "grad_norm": 0.9947385787963867,
1858
+ "learning_rate": 1.700578750225432e-08,
1859
+ "loss": 0.7973,
1860
  "step": 23700
1861
  },
1862
  {
1863
+ "epoch": 1.03984,
1864
+ "grad_norm": 0.8872534036636353,
1865
+ "learning_rate": 1.4496866043919865e-08,
1866
+ "loss": 0.7995,
1867
  "step": 23800
1868
  },
1869
  {
1870
+ "epoch": 1.04384,
1871
+ "grad_norm": 0.8726480007171631,
1872
+ "learning_rate": 1.2187006125356087e-08,
1873
+ "loss": 0.7929,
1874
  "step": 23900
1875
  },
1876
  {
1877
+ "epoch": 1.04784,
1878
+ "grad_norm": 0.881963849067688,
1879
+ "learning_rate": 1.0076579906098255e-08,
1880
+ "loss": 0.8044,
1881
  "step": 24000
1882
  },
1883
  {
1884
+ "epoch": 1.04784,
1885
+ "eval_loss": 1.5276943445205688,
1886
+ "eval_runtime": 99.4561,
1887
+ "eval_samples_per_second": 137.699,
1888
+ "eval_steps_per_second": 2.152,
1889
  "step": 24000
1890
  },
1891
  {
1892
+ "epoch": 1.0518399999999999,
1893
+ "grad_norm": 0.8809722065925598,
1894
+ "learning_rate": 8.16592741336386e-09,
1895
+ "loss": 0.7832,
1896
  "step": 24100
1897
  },
1898
  {
1899
+ "epoch": 1.05584,
1900
+ "grad_norm": 0.8471363186836243,
1901
+ "learning_rate": 6.455356487267833e-09,
1902
+ "loss": 0.7815,
1903
  "step": 24200
1904
  },
1905
  {
1906
+ "epoch": 1.05984,
1907
+ "grad_norm": 0.9595879912376404,
1908
+ "learning_rate": 4.9451427312251224e-09,
1909
+ "loss": 0.7943,
1910
  "step": 24300
1911
  },
1912
  {
1913
+ "epoch": 1.06384,
1914
+ "grad_norm": 0.8937146663665771,
1915
+ "learning_rate": 3.635529467544696e-09,
1916
+ "loss": 0.8066,
1917
  "step": 24400
1918
  },
1919
  {
1920
+ "epoch": 1.06784,
1921
+ "grad_norm": 0.9749945998191833,
1922
+ "learning_rate": 2.526727698227288e-09,
1923
+ "loss": 0.802,
1924
  "step": 24500
1925
  },
1926
  {
1927
+ "epoch": 1.07184,
1928
+ "grad_norm": 0.919170081615448,
1929
+ "learning_rate": 1.6189160709680074e-09,
1930
+ "loss": 0.79,
1931
  "step": 24600
1932
  },
1933
  {
1934
+ "epoch": 1.07584,
1935
+ "grad_norm": 0.9579231142997742,
1936
+ "learning_rate": 9.122408503739466e-10,
1937
+ "loss": 0.8092,
1938
  "step": 24700
1939
  },
1940
  {
1941
+ "epoch": 1.07984,
1942
+ "grad_norm": 0.8257275223731995,
1943
+ "learning_rate": 4.0681589439789395e-10,
1944
+ "loss": 0.8028,
1945
  "step": 24800
1946
  },
1947
  {
1948
+ "epoch": 1.08384,
1949
+ "grad_norm": 0.8641030788421631,
1950
+ "learning_rate": 1.0272263599411803e-10,
1951
+ "loss": 0.7852,
1952
  "step": 24900
1953
  },
1954
  {
1955
+ "epoch": 1.08784,
1956
+ "grad_norm": 0.929093062877655,
1957
+ "learning_rate": 1.006999733599301e-14,
1958
+ "loss": 0.7867,
1959
  "step": 25000
1960
  },
1961
  {
1962
+ "epoch": 1.08784,
1963
+ "eval_loss": 1.5288399457931519,
1964
+ "eval_runtime": 99.9402,
1965
+ "eval_samples_per_second": 137.032,
1966
+ "eval_steps_per_second": 2.141,
1967
  "step": 25000
1968
  }
1969
  ],
1970
  "logging_steps": 100,
1971
+ "max_steps": 25000,
1972
  "num_input_tokens_seen": 0,
1973
  "num_train_epochs": 9223372036854775807,
1974
  "save_steps": 5000,
 
1979
  "should_evaluate": false,
1980
  "should_log": false,
1981
  "should_save": true,
1982
+ "should_training_stop": true
1983
  },
1984
  "attributes": {}
1985
  }
1986
  },
1987
+ "total_flos": 3.3846277778817024e+18,
1988
  "train_batch_size": 64,
1989
  "trial_name": null,
1990
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:399a4e8079b62b554066d23eba050b8905a3c51a8603b79dbe178069ff2eff81
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8468e002e14d69abcb2f7de8e401f9fa2561c9e9f59ee528d9d623ec438f38ae
3
  size 5841