DanJoshua commited on
Commit
2e10691
·
verified ·
1 Parent(s): 8459bc7

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +5 -10
  2. train_results.json +5 -5
  3. trainer_state.json +524 -1790
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 14.066666666666666,
3
- "eval_accuracy": 0.8975,
4
- "eval_loss": 0.47002196311950684,
5
- "eval_runtime": 423.5464,
6
- "eval_samples_per_second": 1.889,
7
- "eval_steps_per_second": 0.236,
8
  "total_flos": 0.0,
9
- "train_loss": 0.23767406691715381,
10
- "train_runtime": 18748.4047,
11
- "train_samples_per_second": 1.216,
12
- "train_steps_per_second": 0.152
13
  }
 
1
  {
2
+ "epoch": 5.125,
 
 
 
 
 
3
  "total_flos": 0.0,
4
+ "train_loss": 0.43041260364024264,
5
+ "train_runtime": 5887.2517,
6
+ "train_samples_per_second": 2.065,
7
+ "train_steps_per_second": 0.258
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 14.066666666666666,
3
  "total_flos": 0.0,
4
- "train_loss": 0.23767406691715381,
5
- "train_runtime": 18748.4047,
6
- "train_samples_per_second": 1.216,
7
- "train_steps_per_second": 0.152
8
  }
 
1
  {
2
+ "epoch": 5.125,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.43041260364024264,
5
+ "train_runtime": 5887.2517,
6
+ "train_samples_per_second": 2.065,
7
+ "train_steps_per_second": 0.258
8
  }
trainer_state.json CHANGED
@@ -1,2168 +1,902 @@
1
  {
2
- "best_metric": 0.925,
3
- "best_model_checkpoint": "mvit_v2_rwf-2000/checkpoint-760",
4
- "epoch": 14.066666666666666,
5
  "eval_steps": 500,
6
- "global_step": 2850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0035087719298245615,
13
- "grad_norm": 38.48279571533203,
14
- "learning_rate": 1.992982456140351e-05,
15
- "loss": 6.4374,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.007017543859649123,
20
- "grad_norm": 40.122596740722656,
21
- "learning_rate": 1.9859649122807017e-05,
22
- "loss": 5.1283,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.010526315789473684,
27
- "grad_norm": 36.445030212402344,
28
- "learning_rate": 1.9789473684210528e-05,
29
- "loss": 3.3545,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.014035087719298246,
34
- "grad_norm": 24.644166946411133,
35
- "learning_rate": 1.9719298245614036e-05,
36
- "loss": 1.9933,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.017543859649122806,
41
- "grad_norm": 15.512025833129883,
42
- "learning_rate": 1.9649122807017544e-05,
43
- "loss": 1.4073,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.021052631578947368,
48
- "grad_norm": 11.587557792663574,
49
- "learning_rate": 1.9578947368421055e-05,
50
- "loss": 0.8724,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.02456140350877193,
55
- "grad_norm": 17.1866512298584,
56
- "learning_rate": 1.9508771929824562e-05,
57
- "loss": 0.8025,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.028070175438596492,
62
- "grad_norm": 15.993489265441895,
63
- "learning_rate": 1.9438596491228074e-05,
64
- "loss": 0.6614,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.031578947368421054,
69
- "grad_norm": 20.174253463745117,
70
- "learning_rate": 1.936842105263158e-05,
71
- "loss": 0.773,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.03508771929824561,
76
- "grad_norm": 12.386943817138672,
77
- "learning_rate": 1.929824561403509e-05,
78
- "loss": 0.6383,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.03859649122807018,
83
- "grad_norm": 15.069538116455078,
84
- "learning_rate": 1.9228070175438597e-05,
85
- "loss": 0.4782,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.042105263157894736,
90
- "grad_norm": 18.363399505615234,
91
- "learning_rate": 1.9157894736842108e-05,
92
- "loss": 0.517,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.0456140350877193,
97
- "grad_norm": 6.734167575836182,
98
- "learning_rate": 1.9087719298245616e-05,
99
- "loss": 0.6264,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.04912280701754386,
104
- "grad_norm": 9.45508098602295,
105
- "learning_rate": 1.9017543859649123e-05,
106
- "loss": 0.5572,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.05263157894736842,
111
- "grad_norm": 7.9156622886657715,
112
- "learning_rate": 1.894736842105263e-05,
113
- "loss": 0.3201,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.056140350877192984,
118
- "grad_norm": 12.281410217285156,
119
- "learning_rate": 1.8877192982456142e-05,
120
- "loss": 0.3903,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.05964912280701754,
125
- "grad_norm": 11.644484519958496,
126
- "learning_rate": 1.880701754385965e-05,
127
- "loss": 0.3503,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.06315789473684211,
132
- "grad_norm": 1.892181158065796,
133
- "learning_rate": 1.873684210526316e-05,
134
- "loss": 0.5198,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.06666666666666667,
139
- "grad_norm": 10.875889778137207,
140
- "learning_rate": 1.866666666666667e-05,
141
- "loss": 0.6151,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.06666666666666667,
146
- "eval_accuracy": 0.86875,
147
- "eval_loss": 0.37832513451576233,
148
- "eval_runtime": 81.5575,
149
- "eval_samples_per_second": 1.962,
150
- "eval_steps_per_second": 0.245,
 
 
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.0035087719298246,
155
- "grad_norm": 17.829296112060547,
156
- "learning_rate": 1.8596491228070176e-05,
157
- "loss": 0.5231,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.0070175438596491,
162
- "grad_norm": 5.5297980308532715,
163
- "learning_rate": 1.8526315789473684e-05,
164
- "loss": 0.2097,
165
  "step": 210
166
  },
167
  {
168
- "epoch": 1.0105263157894737,
169
- "grad_norm": 16.311006546020508,
170
- "learning_rate": 1.8456140350877195e-05,
171
- "loss": 0.3499,
172
  "step": 220
173
  },
174
  {
175
- "epoch": 1.0140350877192983,
176
- "grad_norm": 14.499452590942383,
177
- "learning_rate": 1.8385964912280703e-05,
178
- "loss": 0.4439,
179
  "step": 230
180
  },
181
  {
182
- "epoch": 1.0175438596491229,
183
- "grad_norm": 13.696078300476074,
184
- "learning_rate": 1.831578947368421e-05,
185
- "loss": 0.3348,
186
  "step": 240
187
  },
188
  {
189
- "epoch": 1.0210526315789474,
190
- "grad_norm": 2.477189302444458,
191
- "learning_rate": 1.824561403508772e-05,
192
- "loss": 0.2746,
193
  "step": 250
194
  },
195
  {
196
- "epoch": 1.024561403508772,
197
- "grad_norm": 35.30912399291992,
198
- "learning_rate": 1.817543859649123e-05,
199
- "loss": 0.3349,
200
  "step": 260
201
  },
202
  {
203
- "epoch": 1.0280701754385966,
204
- "grad_norm": 1.255654215812683,
205
- "learning_rate": 1.810526315789474e-05,
206
- "loss": 0.392,
207
  "step": 270
208
  },
209
  {
210
- "epoch": 1.0315789473684212,
211
- "grad_norm": 5.34593391418457,
212
- "learning_rate": 1.8035087719298248e-05,
213
- "loss": 0.3443,
214
  "step": 280
215
  },
216
  {
217
- "epoch": 1.0350877192982457,
218
- "grad_norm": 1.489880919456482,
219
- "learning_rate": 1.7964912280701756e-05,
220
- "loss": 0.246,
221
  "step": 290
222
  },
223
  {
224
- "epoch": 1.03859649122807,
225
- "grad_norm": 1.138655424118042,
226
- "learning_rate": 1.7894736842105264e-05,
227
- "loss": 0.205,
228
  "step": 300
229
  },
230
  {
231
- "epoch": 1.0421052631578946,
232
- "grad_norm": 7.539694786071777,
233
- "learning_rate": 1.7824561403508775e-05,
234
- "loss": 0.2926,
235
  "step": 310
236
  },
237
  {
238
- "epoch": 1.0456140350877192,
239
- "grad_norm": 11.403474807739258,
240
- "learning_rate": 1.7754385964912283e-05,
241
- "loss": 0.3764,
242
  "step": 320
243
  },
244
  {
245
- "epoch": 1.0491228070175438,
246
- "grad_norm": 17.022748947143555,
247
- "learning_rate": 1.768421052631579e-05,
248
- "loss": 0.351,
249
  "step": 330
250
  },
251
  {
252
- "epoch": 1.0526315789473684,
253
- "grad_norm": 2.676985025405884,
254
- "learning_rate": 1.7614035087719298e-05,
255
- "loss": 0.2847,
256
  "step": 340
257
  },
258
  {
259
- "epoch": 1.056140350877193,
260
- "grad_norm": 3.728642225265503,
261
- "learning_rate": 1.754385964912281e-05,
262
- "loss": 0.2979,
263
  "step": 350
264
  },
265
  {
266
- "epoch": 1.0596491228070175,
267
- "grad_norm": 1.4747868776321411,
268
- "learning_rate": 1.7473684210526317e-05,
269
- "loss": 0.4769,
270
  "step": 360
271
  },
272
  {
273
- "epoch": 1.063157894736842,
274
- "grad_norm": 21.361783981323242,
275
- "learning_rate": 1.7403508771929828e-05,
276
- "loss": 0.1665,
277
  "step": 370
278
  },
279
  {
280
- "epoch": 1.0666666666666667,
281
- "grad_norm": 26.084320068359375,
282
- "learning_rate": 1.7333333333333336e-05,
283
- "loss": 0.3347,
284
  "step": 380
285
  },
286
  {
287
- "epoch": 1.0666666666666667,
288
- "eval_accuracy": 0.90625,
289
- "eval_loss": 0.28085756301879883,
290
- "eval_runtime": 80.3329,
291
- "eval_samples_per_second": 1.992,
292
- "eval_steps_per_second": 0.249,
 
 
293
  "step": 380
294
  },
295
  {
296
- "epoch": 2.0035087719298246,
297
- "grad_norm": 1.256848931312561,
298
- "learning_rate": 1.7263157894736843e-05,
299
- "loss": 0.2939,
300
  "step": 390
301
  },
302
  {
303
- "epoch": 2.007017543859649,
304
- "grad_norm": 15.828075408935547,
305
- "learning_rate": 1.719298245614035e-05,
306
- "loss": 0.1886,
307
  "step": 400
308
  },
309
  {
310
- "epoch": 2.0105263157894737,
311
- "grad_norm": 0.37902504205703735,
312
- "learning_rate": 1.7122807017543862e-05,
313
- "loss": 0.2517,
314
  "step": 410
315
  },
316
  {
317
- "epoch": 2.0140350877192983,
318
- "grad_norm": 0.3434399664402008,
319
- "learning_rate": 1.705263157894737e-05,
320
- "loss": 0.1779,
321
  "step": 420
322
  },
323
  {
324
- "epoch": 2.017543859649123,
325
- "grad_norm": 3.215871572494507,
326
- "learning_rate": 1.6982456140350878e-05,
327
- "loss": 0.214,
328
  "step": 430
329
  },
330
  {
331
- "epoch": 2.0210526315789474,
332
- "grad_norm": 51.95631408691406,
333
- "learning_rate": 1.6912280701754385e-05,
334
- "loss": 0.4919,
335
  "step": 440
336
  },
337
  {
338
- "epoch": 2.024561403508772,
339
- "grad_norm": 1.8221938610076904,
340
- "learning_rate": 1.6842105263157896e-05,
341
- "loss": 0.3032,
342
  "step": 450
343
  },
344
  {
345
- "epoch": 2.0280701754385966,
346
- "grad_norm": 1.5179495811462402,
347
- "learning_rate": 1.6771929824561408e-05,
348
- "loss": 0.2598,
349
  "step": 460
350
  },
351
  {
352
- "epoch": 2.031578947368421,
353
- "grad_norm": 6.242626667022705,
354
- "learning_rate": 1.6701754385964915e-05,
355
- "loss": 0.0454,
356
  "step": 470
357
  },
358
  {
359
- "epoch": 2.0350877192982457,
360
- "grad_norm": 26.637720108032227,
361
- "learning_rate": 1.6631578947368423e-05,
362
- "loss": 0.315,
363
  "step": 480
364
  },
365
  {
366
- "epoch": 2.0385964912280703,
367
- "grad_norm": 0.4115073084831238,
368
- "learning_rate": 1.656140350877193e-05,
369
- "loss": 0.3634,
370
  "step": 490
371
  },
372
  {
373
- "epoch": 2.042105263157895,
374
- "grad_norm": 29.888004302978516,
375
- "learning_rate": 1.649122807017544e-05,
376
- "loss": 0.2089,
377
  "step": 500
378
  },
379
  {
380
- "epoch": 2.0456140350877194,
381
- "grad_norm": 25.771451950073242,
382
- "learning_rate": 1.642105263157895e-05,
383
- "loss": 0.4124,
384
  "step": 510
385
  },
386
  {
387
- "epoch": 2.049122807017544,
388
- "grad_norm": 1.3790911436080933,
389
- "learning_rate": 1.6350877192982457e-05,
390
- "loss": 0.195,
391
  "step": 520
392
  },
393
  {
394
- "epoch": 2.0526315789473686,
395
- "grad_norm": 14.140546798706055,
396
- "learning_rate": 1.6280701754385965e-05,
397
- "loss": 0.2252,
398
  "step": 530
399
  },
400
  {
401
- "epoch": 2.056140350877193,
402
- "grad_norm": 9.343216896057129,
403
- "learning_rate": 1.6210526315789473e-05,
404
- "loss": 0.0905,
405
  "step": 540
406
  },
407
  {
408
- "epoch": 2.0596491228070177,
409
- "grad_norm": 21.790802001953125,
410
- "learning_rate": 1.6140350877192984e-05,
411
- "loss": 0.3495,
412
  "step": 550
413
  },
414
  {
415
- "epoch": 2.0631578947368423,
416
- "grad_norm": 4.165024280548096,
417
- "learning_rate": 1.6070175438596495e-05,
418
- "loss": 0.1162,
419
  "step": 560
420
  },
421
  {
422
- "epoch": 2.066666666666667,
423
- "grad_norm": 6.51143217086792,
424
- "learning_rate": 1.6000000000000003e-05,
425
- "loss": 0.2167,
426
  "step": 570
427
  },
428
  {
429
- "epoch": 2.066666666666667,
430
- "eval_accuracy": 0.89375,
431
- "eval_loss": 0.44960325956344604,
432
- "eval_runtime": 80.4193,
433
- "eval_samples_per_second": 1.99,
434
- "eval_steps_per_second": 0.249,
 
 
435
  "step": 570
436
  },
437
  {
438
- "epoch": 3.0035087719298246,
439
- "grad_norm": 33.92047119140625,
440
- "learning_rate": 1.592982456140351e-05,
441
- "loss": 0.1101,
442
  "step": 580
443
  },
444
  {
445
- "epoch": 3.007017543859649,
446
- "grad_norm": 14.30198860168457,
447
- "learning_rate": 1.5859649122807018e-05,
448
- "loss": 0.3499,
449
  "step": 590
450
  },
451
  {
452
- "epoch": 3.0105263157894737,
453
- "grad_norm": 17.972110748291016,
454
- "learning_rate": 1.578947368421053e-05,
455
- "loss": 0.2152,
456
  "step": 600
457
  },
458
  {
459
- "epoch": 3.0140350877192983,
460
- "grad_norm": 20.626901626586914,
461
- "learning_rate": 1.5719298245614037e-05,
462
- "loss": 0.2415,
463
  "step": 610
464
  },
465
  {
466
- "epoch": 3.017543859649123,
467
- "grad_norm": 21.53983497619629,
468
- "learning_rate": 1.5649122807017545e-05,
469
- "loss": 0.2904,
470
  "step": 620
471
  },
472
  {
473
- "epoch": 3.0210526315789474,
474
- "grad_norm": 2.215639114379883,
475
- "learning_rate": 1.5578947368421052e-05,
476
- "loss": 0.1822,
477
  "step": 630
478
  },
479
  {
480
- "epoch": 3.024561403508772,
481
- "grad_norm": 1.557974100112915,
482
- "learning_rate": 1.5508771929824563e-05,
483
- "loss": 0.186,
484
  "step": 640
485
  },
486
  {
487
- "epoch": 3.0280701754385966,
488
- "grad_norm": 23.085500717163086,
489
- "learning_rate": 1.543859649122807e-05,
490
- "loss": 0.1562,
491
  "step": 650
492
  },
493
  {
494
- "epoch": 3.031578947368421,
495
- "grad_norm": 22.804210662841797,
496
- "learning_rate": 1.536842105263158e-05,
497
- "loss": 0.3565,
498
  "step": 660
499
  },
500
  {
501
- "epoch": 3.0350877192982457,
502
- "grad_norm": 27.2755126953125,
503
- "learning_rate": 1.529824561403509e-05,
504
- "loss": 0.1267,
505
  "step": 670
506
  },
507
  {
508
- "epoch": 3.0385964912280703,
509
- "grad_norm": 22.491836547851562,
510
- "learning_rate": 1.5228070175438598e-05,
511
- "loss": 0.2473,
512
  "step": 680
513
  },
514
  {
515
- "epoch": 3.042105263157895,
516
- "grad_norm": 9.781085014343262,
517
- "learning_rate": 1.5157894736842107e-05,
518
- "loss": 0.1112,
519
  "step": 690
520
  },
521
  {
522
- "epoch": 3.0456140350877194,
523
- "grad_norm": 13.67097282409668,
524
- "learning_rate": 1.5087719298245615e-05,
525
- "loss": 0.1913,
526
  "step": 700
527
  },
528
  {
529
- "epoch": 3.049122807017544,
530
- "grad_norm": 38.54108428955078,
531
- "learning_rate": 1.5017543859649124e-05,
532
- "loss": 0.1275,
533
  "step": 710
534
  },
535
  {
536
- "epoch": 3.0526315789473686,
537
- "grad_norm": 0.3990094065666199,
538
- "learning_rate": 1.4947368421052632e-05,
539
- "loss": 0.169,
540
  "step": 720
541
  },
542
  {
543
- "epoch": 3.056140350877193,
544
- "grad_norm": 0.46114712953567505,
545
- "learning_rate": 1.4877192982456141e-05,
546
- "loss": 0.2143,
547
  "step": 730
548
  },
549
  {
550
- "epoch": 3.0596491228070177,
551
- "grad_norm": 29.63218879699707,
552
- "learning_rate": 1.4807017543859649e-05,
553
- "loss": 0.1604,
554
  "step": 740
555
  },
556
  {
557
- "epoch": 3.0631578947368423,
558
- "grad_norm": 0.25956010818481445,
559
- "learning_rate": 1.4736842105263159e-05,
560
- "loss": 0.1813,
561
  "step": 750
562
  },
563
  {
564
- "epoch": 3.066666666666667,
565
- "grad_norm": 23.94867515563965,
566
- "learning_rate": 1.4666666666666666e-05,
567
- "loss": 0.3886,
568
  "step": 760
569
  },
570
  {
571
- "epoch": 3.066666666666667,
572
- "eval_accuracy": 0.925,
573
- "eval_loss": 0.36359649896621704,
574
- "eval_runtime": 79.1785,
575
- "eval_samples_per_second": 2.021,
576
- "eval_steps_per_second": 0.253,
 
 
577
  "step": 760
578
  },
579
  {
580
- "epoch": 4.003508771929824,
581
- "grad_norm": 41.33153533935547,
582
- "learning_rate": 1.4596491228070177e-05,
583
- "loss": 0.3154,
584
  "step": 770
585
  },
586
  {
587
- "epoch": 4.007017543859649,
588
- "grad_norm": 48.22314453125,
589
- "learning_rate": 1.4526315789473687e-05,
590
- "loss": 0.3626,
591
  "step": 780
592
  },
593
  {
594
- "epoch": 4.010526315789473,
595
- "grad_norm": 11.335643768310547,
596
- "learning_rate": 1.4456140350877195e-05,
597
- "loss": 0.1304,
598
  "step": 790
599
  },
600
  {
601
- "epoch": 4.014035087719298,
602
- "grad_norm": 32.79051971435547,
603
- "learning_rate": 1.4385964912280704e-05,
604
- "loss": 0.4769,
605
  "step": 800
606
  },
607
  {
608
- "epoch": 4.017543859649122,
609
- "grad_norm": 0.31978896260261536,
610
- "learning_rate": 1.4315789473684212e-05,
611
- "loss": 0.0934,
612
  "step": 810
613
  },
614
  {
615
- "epoch": 4.021052631578947,
616
- "grad_norm": 0.9937857985496521,
617
- "learning_rate": 1.4245614035087721e-05,
618
- "loss": 0.2003,
619
  "step": 820
620
  },
621
  {
622
- "epoch": 4.024561403508772,
623
- "grad_norm": 31.888282775878906,
624
- "learning_rate": 1.4175438596491229e-05,
625
- "loss": 0.205,
626
  "step": 830
627
  },
628
  {
629
- "epoch": 4.028070175438597,
630
- "grad_norm": 21.987398147583008,
631
- "learning_rate": 1.4105263157894738e-05,
632
- "loss": 0.3816,
633
  "step": 840
634
  },
635
  {
636
- "epoch": 4.031578947368421,
637
- "grad_norm": 0.6919679045677185,
638
- "learning_rate": 1.4035087719298246e-05,
639
- "loss": 0.0419,
640
  "step": 850
641
  },
642
  {
643
- "epoch": 4.035087719298246,
644
- "grad_norm": 25.4196720123291,
645
- "learning_rate": 1.3964912280701755e-05,
646
- "loss": 0.2052,
647
  "step": 860
648
  },
649
  {
650
- "epoch": 4.03859649122807,
651
- "grad_norm": 27.873807907104492,
652
- "learning_rate": 1.3894736842105265e-05,
653
- "loss": 0.1663,
654
  "step": 870
655
  },
656
  {
657
- "epoch": 4.042105263157895,
658
- "grad_norm": 0.03928101807832718,
659
- "learning_rate": 1.3824561403508774e-05,
660
- "loss": 0.2424,
661
  "step": 880
662
  },
663
  {
664
- "epoch": 4.045614035087719,
665
- "grad_norm": 23.581205368041992,
666
- "learning_rate": 1.3754385964912282e-05,
667
- "loss": 0.2602,
668
  "step": 890
669
  },
670
  {
671
- "epoch": 4.049122807017544,
672
- "grad_norm": 0.08026744425296783,
673
- "learning_rate": 1.3684210526315791e-05,
674
- "loss": 0.1061,
675
  "step": 900
676
  },
677
  {
678
- "epoch": 4.052631578947368,
679
- "grad_norm": 0.03813062608242035,
680
- "learning_rate": 1.3614035087719299e-05,
681
- "loss": 0.1803,
682
  "step": 910
683
  },
684
  {
685
- "epoch": 4.056140350877193,
686
- "grad_norm": 30.048786163330078,
687
- "learning_rate": 1.3543859649122808e-05,
688
- "loss": 0.259,
689
  "step": 920
690
  },
691
  {
692
- "epoch": 4.059649122807017,
693
- "grad_norm": 26.78702163696289,
694
- "learning_rate": 1.3473684210526316e-05,
695
- "loss": 0.0982,
696
  "step": 930
697
  },
698
  {
699
- "epoch": 4.063157894736842,
700
- "grad_norm": 24.856910705566406,
701
- "learning_rate": 1.3403508771929826e-05,
702
- "loss": 0.3255,
703
  "step": 940
704
  },
705
  {
706
- "epoch": 4.066666666666666,
707
- "grad_norm": 3.1492743492126465,
708
- "learning_rate": 1.3333333333333333e-05,
709
- "loss": 0.3391,
710
  "step": 950
711
  },
712
  {
713
- "epoch": 4.066666666666666,
714
- "eval_accuracy": 0.9125,
715
- "eval_loss": 0.36482366919517517,
716
- "eval_runtime": 80.9254,
717
- "eval_samples_per_second": 1.977,
718
- "eval_steps_per_second": 0.247,
 
 
719
  "step": 950
720
  },
721
  {
722
- "epoch": 5.003508771929824,
723
- "grad_norm": 0.903612494468689,
724
- "learning_rate": 1.3263157894736843e-05,
725
- "loss": 0.1713,
726
  "step": 960
727
  },
728
  {
729
- "epoch": 5.007017543859649,
730
- "grad_norm": 0.05947212502360344,
731
- "learning_rate": 1.3192982456140354e-05,
732
- "loss": 0.0272,
733
  "step": 970
734
  },
735
  {
736
- "epoch": 5.010526315789473,
737
- "grad_norm": 4.998480319976807,
738
- "learning_rate": 1.3122807017543862e-05,
739
- "loss": 0.0885,
740
  "step": 980
741
  },
742
  {
743
- "epoch": 5.014035087719298,
744
- "grad_norm": 0.13530714809894562,
745
- "learning_rate": 1.305263157894737e-05,
746
- "loss": 0.1824,
747
  "step": 990
748
  },
749
  {
750
- "epoch": 5.017543859649122,
751
- "grad_norm": 5.782525539398193,
752
- "learning_rate": 1.2982456140350879e-05,
753
- "loss": 0.2345,
754
  "step": 1000
755
  },
756
  {
757
- "epoch": 5.021052631578947,
758
- "grad_norm": 22.056116104125977,
759
- "learning_rate": 1.2912280701754386e-05,
760
- "loss": 0.0706,
761
  "step": 1010
762
  },
763
  {
764
- "epoch": 5.024561403508772,
765
- "grad_norm": 17.097532272338867,
766
- "learning_rate": 1.2842105263157896e-05,
767
- "loss": 0.1151,
768
  "step": 1020
769
  },
770
  {
771
- "epoch": 5.028070175438597,
772
- "grad_norm": 0.3872256577014923,
773
- "learning_rate": 1.2771929824561404e-05,
774
- "loss": 0.2927,
775
  "step": 1030
776
  },
777
  {
778
- "epoch": 5.031578947368421,
779
- "grad_norm": 9.663734436035156,
780
- "learning_rate": 1.2701754385964913e-05,
781
- "loss": 0.202,
782
  "step": 1040
783
  },
784
  {
785
- "epoch": 5.035087719298246,
786
- "grad_norm": 4.37908935546875,
787
- "learning_rate": 1.263157894736842e-05,
788
- "loss": 0.0968,
789
  "step": 1050
790
  },
791
  {
792
- "epoch": 5.03859649122807,
793
- "grad_norm": 28.826305389404297,
794
- "learning_rate": 1.256140350877193e-05,
795
- "loss": 0.1524,
796
  "step": 1060
797
  },
798
  {
799
- "epoch": 5.042105263157895,
800
- "grad_norm": 3.279170274734497,
801
- "learning_rate": 1.2491228070175441e-05,
802
- "loss": 0.2658,
803
  "step": 1070
804
  },
805
  {
806
- "epoch": 5.045614035087719,
807
- "grad_norm": 19.066734313964844,
808
- "learning_rate": 1.2421052631578949e-05,
809
- "loss": 0.0729,
810
  "step": 1080
811
  },
812
  {
813
- "epoch": 5.049122807017544,
814
- "grad_norm": 25.409595489501953,
815
- "learning_rate": 1.2350877192982458e-05,
816
- "loss": 0.2967,
817
  "step": 1090
818
  },
819
  {
820
- "epoch": 5.052631578947368,
821
- "grad_norm": 0.39942100644111633,
822
- "learning_rate": 1.2280701754385966e-05,
823
- "loss": 0.2999,
824
  "step": 1100
825
  },
826
  {
827
- "epoch": 5.056140350877193,
828
- "grad_norm": 0.22568103671073914,
829
- "learning_rate": 1.2210526315789475e-05,
830
- "loss": 0.1757,
831
  "step": 1110
832
  },
833
  {
834
- "epoch": 5.059649122807017,
835
- "grad_norm": 1.2673927545547485,
836
- "learning_rate": 1.2140350877192983e-05,
837
- "loss": 0.3092,
838
  "step": 1120
839
  },
840
  {
841
- "epoch": 5.063157894736842,
842
- "grad_norm": 0.3143061697483063,
843
- "learning_rate": 1.2070175438596493e-05,
844
- "loss": 0.1556,
845
  "step": 1130
846
  },
847
  {
848
- "epoch": 5.066666666666666,
849
- "grad_norm": 0.23581956326961517,
850
- "learning_rate": 1.2e-05,
851
- "loss": 0.0884,
852
  "step": 1140
853
  },
854
  {
855
- "epoch": 5.066666666666666,
856
- "eval_accuracy": 0.9125,
857
- "eval_loss": 0.4376702308654785,
858
- "eval_runtime": 79.3551,
859
- "eval_samples_per_second": 2.016,
860
- "eval_steps_per_second": 0.252,
 
 
861
  "step": 1140
862
  },
863
  {
864
- "epoch": 6.003508771929824,
865
- "grad_norm": 1.794795274734497,
866
- "learning_rate": 1.192982456140351e-05,
867
- "loss": 0.0246,
868
- "step": 1150
869
- },
870
- {
871
- "epoch": 6.007017543859649,
872
- "grad_norm": 0.8937227129936218,
873
- "learning_rate": 1.1859649122807017e-05,
874
- "loss": 0.2805,
875
- "step": 1160
876
- },
877
- {
878
- "epoch": 6.010526315789473,
879
- "grad_norm": 0.779798150062561,
880
- "learning_rate": 1.1789473684210527e-05,
881
- "loss": 0.0737,
882
- "step": 1170
883
- },
884
- {
885
- "epoch": 6.014035087719298,
886
- "grad_norm": 0.1962108314037323,
887
- "learning_rate": 1.1719298245614036e-05,
888
- "loss": 0.3239,
889
- "step": 1180
890
- },
891
- {
892
- "epoch": 6.017543859649122,
893
- "grad_norm": 0.8909149765968323,
894
- "learning_rate": 1.1649122807017546e-05,
895
- "loss": 0.031,
896
- "step": 1190
897
- },
898
- {
899
- "epoch": 6.021052631578947,
900
- "grad_norm": 1.4533624649047852,
901
- "learning_rate": 1.1578947368421053e-05,
902
- "loss": 0.0936,
903
- "step": 1200
904
- },
905
- {
906
- "epoch": 6.024561403508772,
907
- "grad_norm": 6.9985785484313965,
908
- "learning_rate": 1.1508771929824563e-05,
909
- "loss": 0.0783,
910
- "step": 1210
911
- },
912
- {
913
- "epoch": 6.028070175438597,
914
- "grad_norm": 0.06940655410289764,
915
- "learning_rate": 1.143859649122807e-05,
916
- "loss": 0.2152,
917
- "step": 1220
918
- },
919
- {
920
- "epoch": 6.031578947368421,
921
- "grad_norm": 1.0425422191619873,
922
- "learning_rate": 1.136842105263158e-05,
923
- "loss": 0.1117,
924
- "step": 1230
925
- },
926
- {
927
- "epoch": 6.035087719298246,
928
- "grad_norm": 1.4716694355010986,
929
- "learning_rate": 1.1298245614035088e-05,
930
- "loss": 0.0857,
931
- "step": 1240
932
- },
933
- {
934
- "epoch": 6.03859649122807,
935
- "grad_norm": 17.07855224609375,
936
- "learning_rate": 1.1228070175438597e-05,
937
- "loss": 0.371,
938
- "step": 1250
939
- },
940
- {
941
- "epoch": 6.042105263157895,
942
- "grad_norm": 27.737403869628906,
943
- "learning_rate": 1.1157894736842105e-05,
944
- "loss": 0.3054,
945
- "step": 1260
946
- },
947
- {
948
- "epoch": 6.045614035087719,
949
- "grad_norm": 0.04294845834374428,
950
- "learning_rate": 1.1087719298245614e-05,
951
- "loss": 0.2088,
952
- "step": 1270
953
- },
954
- {
955
- "epoch": 6.049122807017544,
956
- "grad_norm": 80.5639877319336,
957
- "learning_rate": 1.1017543859649125e-05,
958
- "loss": 0.1758,
959
- "step": 1280
960
- },
961
- {
962
- "epoch": 6.052631578947368,
963
- "grad_norm": 0.319958359003067,
964
- "learning_rate": 1.0947368421052633e-05,
965
- "loss": 0.1048,
966
- "step": 1290
967
- },
968
- {
969
- "epoch": 6.056140350877193,
970
- "grad_norm": 0.6600483059883118,
971
- "learning_rate": 1.0877192982456142e-05,
972
- "loss": 0.0745,
973
- "step": 1300
974
- },
975
- {
976
- "epoch": 6.059649122807017,
977
- "grad_norm": 1.0790461301803589,
978
- "learning_rate": 1.080701754385965e-05,
979
- "loss": 0.2404,
980
- "step": 1310
981
- },
982
- {
983
- "epoch": 6.063157894736842,
984
- "grad_norm": 0.02302667126059532,
985
- "learning_rate": 1.073684210526316e-05,
986
- "loss": 0.1674,
987
- "step": 1320
988
- },
989
- {
990
- "epoch": 6.066666666666666,
991
- "grad_norm": 0.017043210566043854,
992
- "learning_rate": 1.0666666666666667e-05,
993
- "loss": 0.2528,
994
- "step": 1330
995
- },
996
- {
997
- "epoch": 6.066666666666666,
998
- "eval_accuracy": 0.9125,
999
- "eval_loss": 0.5012751817703247,
1000
- "eval_runtime": 80.6613,
1001
- "eval_samples_per_second": 1.984,
1002
- "eval_steps_per_second": 0.248,
1003
- "step": 1330
1004
- },
1005
- {
1006
- "epoch": 7.003508771929824,
1007
- "grad_norm": 0.11117155849933624,
1008
- "learning_rate": 1.0596491228070177e-05,
1009
- "loss": 0.016,
1010
- "step": 1340
1011
- },
1012
- {
1013
- "epoch": 7.007017543859649,
1014
- "grad_norm": 0.05332833155989647,
1015
- "learning_rate": 1.0526315789473684e-05,
1016
- "loss": 0.0475,
1017
- "step": 1350
1018
- },
1019
- {
1020
- "epoch": 7.010526315789473,
1021
- "grad_norm": 5.294000148773193,
1022
- "learning_rate": 1.0456140350877194e-05,
1023
- "loss": 0.0664,
1024
- "step": 1360
1025
- },
1026
- {
1027
- "epoch": 7.014035087719298,
1028
- "grad_norm": 36.36144256591797,
1029
- "learning_rate": 1.0385964912280702e-05,
1030
- "loss": 0.0834,
1031
- "step": 1370
1032
- },
1033
- {
1034
- "epoch": 7.017543859649122,
1035
- "grad_norm": 0.4908476769924164,
1036
- "learning_rate": 1.0315789473684213e-05,
1037
- "loss": 0.0731,
1038
- "step": 1380
1039
- },
1040
- {
1041
- "epoch": 7.021052631578947,
1042
- "grad_norm": 0.3506104052066803,
1043
- "learning_rate": 1.024561403508772e-05,
1044
- "loss": 0.1426,
1045
- "step": 1390
1046
- },
1047
- {
1048
- "epoch": 7.024561403508772,
1049
- "grad_norm": 0.016969937831163406,
1050
- "learning_rate": 1.017543859649123e-05,
1051
- "loss": 0.0807,
1052
- "step": 1400
1053
- },
1054
- {
1055
- "epoch": 7.028070175438597,
1056
- "grad_norm": 14.8442964553833,
1057
- "learning_rate": 1.0105263157894738e-05,
1058
- "loss": 0.1258,
1059
- "step": 1410
1060
- },
1061
- {
1062
- "epoch": 7.031578947368421,
1063
- "grad_norm": 0.3821958601474762,
1064
- "learning_rate": 1.0035087719298247e-05,
1065
- "loss": 0.1482,
1066
- "step": 1420
1067
- },
1068
- {
1069
- "epoch": 7.035087719298246,
1070
- "grad_norm": 9.797504425048828,
1071
- "learning_rate": 9.964912280701755e-06,
1072
- "loss": 0.1026,
1073
- "step": 1430
1074
- },
1075
- {
1076
- "epoch": 7.03859649122807,
1077
- "grad_norm": 1.286016583442688,
1078
- "learning_rate": 9.894736842105264e-06,
1079
- "loss": 0.0226,
1080
- "step": 1440
1081
- },
1082
- {
1083
- "epoch": 7.042105263157895,
1084
- "grad_norm": 43.57596969604492,
1085
- "learning_rate": 9.824561403508772e-06,
1086
- "loss": 0.2212,
1087
- "step": 1450
1088
- },
1089
- {
1090
- "epoch": 7.045614035087719,
1091
- "grad_norm": 2.1723453998565674,
1092
- "learning_rate": 9.754385964912281e-06,
1093
- "loss": 0.1333,
1094
- "step": 1460
1095
- },
1096
- {
1097
- "epoch": 7.049122807017544,
1098
- "grad_norm": 24.372819900512695,
1099
- "learning_rate": 9.68421052631579e-06,
1100
- "loss": 0.1644,
1101
- "step": 1470
1102
- },
1103
- {
1104
- "epoch": 7.052631578947368,
1105
- "grad_norm": 0.023473242297768593,
1106
- "learning_rate": 9.614035087719298e-06,
1107
- "loss": 0.0843,
1108
- "step": 1480
1109
- },
1110
- {
1111
- "epoch": 7.056140350877193,
1112
- "grad_norm": 29.83729362487793,
1113
- "learning_rate": 9.543859649122808e-06,
1114
- "loss": 0.2588,
1115
- "step": 1490
1116
- },
1117
- {
1118
- "epoch": 7.059649122807017,
1119
- "grad_norm": 0.08369996398687363,
1120
- "learning_rate": 9.473684210526315e-06,
1121
- "loss": 0.0662,
1122
- "step": 1500
1123
- },
1124
- {
1125
- "epoch": 7.063157894736842,
1126
- "grad_norm": 38.48130798339844,
1127
- "learning_rate": 9.403508771929825e-06,
1128
- "loss": 0.0767,
1129
- "step": 1510
1130
- },
1131
- {
1132
- "epoch": 7.066666666666666,
1133
- "grad_norm": 0.0697094202041626,
1134
- "learning_rate": 9.333333333333334e-06,
1135
- "loss": 0.0272,
1136
- "step": 1520
1137
- },
1138
- {
1139
- "epoch": 7.066666666666666,
1140
- "eval_accuracy": 0.925,
1141
- "eval_loss": 0.4899701178073883,
1142
- "eval_runtime": 82.0136,
1143
- "eval_samples_per_second": 1.951,
1144
- "eval_steps_per_second": 0.244,
1145
- "step": 1520
1146
- },
1147
- {
1148
- "epoch": 8.003508771929825,
1149
- "grad_norm": 0.15197695791721344,
1150
- "learning_rate": 9.263157894736842e-06,
1151
- "loss": 0.1707,
1152
- "step": 1530
1153
- },
1154
- {
1155
- "epoch": 8.007017543859648,
1156
- "grad_norm": 0.8387830853462219,
1157
- "learning_rate": 9.192982456140351e-06,
1158
- "loss": 0.1568,
1159
- "step": 1540
1160
- },
1161
- {
1162
- "epoch": 8.010526315789473,
1163
- "grad_norm": 38.247314453125,
1164
- "learning_rate": 9.12280701754386e-06,
1165
- "loss": 0.1991,
1166
- "step": 1550
1167
- },
1168
- {
1169
- "epoch": 8.014035087719298,
1170
- "grad_norm": 0.011660151183605194,
1171
- "learning_rate": 9.05263157894737e-06,
1172
- "loss": 0.1268,
1173
- "step": 1560
1174
- },
1175
- {
1176
- "epoch": 8.017543859649123,
1177
- "grad_norm": 24.12862205505371,
1178
- "learning_rate": 8.982456140350878e-06,
1179
- "loss": 0.1796,
1180
- "step": 1570
1181
- },
1182
- {
1183
- "epoch": 8.021052631578947,
1184
- "grad_norm": 0.23101183772087097,
1185
- "learning_rate": 8.912280701754387e-06,
1186
- "loss": 0.0037,
1187
- "step": 1580
1188
- },
1189
- {
1190
- "epoch": 8.024561403508772,
1191
- "grad_norm": 0.017889101058244705,
1192
- "learning_rate": 8.842105263157895e-06,
1193
- "loss": 0.1605,
1194
- "step": 1590
1195
- },
1196
- {
1197
- "epoch": 8.028070175438597,
1198
- "grad_norm": 0.035311147570610046,
1199
- "learning_rate": 8.771929824561405e-06,
1200
- "loss": 0.0528,
1201
- "step": 1600
1202
- },
1203
- {
1204
- "epoch": 8.031578947368422,
1205
- "grad_norm": 0.06620756536722183,
1206
- "learning_rate": 8.701754385964914e-06,
1207
- "loss": 0.0355,
1208
- "step": 1610
1209
- },
1210
- {
1211
- "epoch": 8.035087719298245,
1212
- "grad_norm": 0.06222343072295189,
1213
- "learning_rate": 8.631578947368422e-06,
1214
- "loss": 0.1415,
1215
- "step": 1620
1216
- },
1217
- {
1218
- "epoch": 8.03859649122807,
1219
- "grad_norm": 0.01348515972495079,
1220
- "learning_rate": 8.561403508771931e-06,
1221
- "loss": 0.1125,
1222
- "step": 1630
1223
- },
1224
- {
1225
- "epoch": 8.042105263157895,
1226
- "grad_norm": 27.071624755859375,
1227
- "learning_rate": 8.491228070175439e-06,
1228
- "loss": 0.2027,
1229
- "step": 1640
1230
- },
1231
- {
1232
- "epoch": 8.04561403508772,
1233
- "grad_norm": 11.8720064163208,
1234
- "learning_rate": 8.421052631578948e-06,
1235
- "loss": 0.1943,
1236
- "step": 1650
1237
- },
1238
- {
1239
- "epoch": 8.049122807017543,
1240
- "grad_norm": 0.32566535472869873,
1241
- "learning_rate": 8.350877192982458e-06,
1242
- "loss": 0.1909,
1243
- "step": 1660
1244
- },
1245
- {
1246
- "epoch": 8.052631578947368,
1247
- "grad_norm": 0.4174618721008301,
1248
- "learning_rate": 8.280701754385965e-06,
1249
- "loss": 0.1284,
1250
- "step": 1670
1251
- },
1252
- {
1253
- "epoch": 8.056140350877193,
1254
- "grad_norm": 11.002934455871582,
1255
- "learning_rate": 8.210526315789475e-06,
1256
- "loss": 0.1926,
1257
- "step": 1680
1258
- },
1259
- {
1260
- "epoch": 8.059649122807018,
1261
- "grad_norm": 1.0391727685928345,
1262
- "learning_rate": 8.140350877192983e-06,
1263
- "loss": 0.1868,
1264
- "step": 1690
1265
- },
1266
- {
1267
- "epoch": 8.063157894736841,
1268
- "grad_norm": 18.734594345092773,
1269
- "learning_rate": 8.070175438596492e-06,
1270
- "loss": 0.191,
1271
- "step": 1700
1272
- },
1273
- {
1274
- "epoch": 8.066666666666666,
1275
- "grad_norm": 44.64786911010742,
1276
- "learning_rate": 8.000000000000001e-06,
1277
- "loss": 0.291,
1278
- "step": 1710
1279
- },
1280
- {
1281
- "epoch": 8.066666666666666,
1282
- "eval_accuracy": 0.9125,
1283
- "eval_loss": 0.5223789215087891,
1284
- "eval_runtime": 80.1013,
1285
- "eval_samples_per_second": 1.997,
1286
- "eval_steps_per_second": 0.25,
1287
- "step": 1710
1288
- },
1289
- {
1290
- "epoch": 9.003508771929825,
1291
- "grad_norm": 0.26973825693130493,
1292
- "learning_rate": 7.929824561403509e-06,
1293
- "loss": 0.0497,
1294
- "step": 1720
1295
- },
1296
- {
1297
- "epoch": 9.007017543859648,
1298
- "grad_norm": 0.040555402636528015,
1299
- "learning_rate": 7.859649122807018e-06,
1300
- "loss": 0.1145,
1301
- "step": 1730
1302
- },
1303
- {
1304
- "epoch": 9.010526315789473,
1305
- "grad_norm": 0.05395271256566048,
1306
- "learning_rate": 7.789473684210526e-06,
1307
- "loss": 0.1051,
1308
- "step": 1740
1309
- },
1310
- {
1311
- "epoch": 9.014035087719298,
1312
- "grad_norm": 0.7126879096031189,
1313
- "learning_rate": 7.719298245614036e-06,
1314
- "loss": 0.1499,
1315
- "step": 1750
1316
- },
1317
- {
1318
- "epoch": 9.017543859649123,
1319
- "grad_norm": 27.24796485900879,
1320
- "learning_rate": 7.649122807017545e-06,
1321
- "loss": 0.1365,
1322
- "step": 1760
1323
- },
1324
- {
1325
- "epoch": 9.021052631578947,
1326
- "grad_norm": 0.22679656744003296,
1327
- "learning_rate": 7.578947368421054e-06,
1328
- "loss": 0.1239,
1329
- "step": 1770
1330
- },
1331
- {
1332
- "epoch": 9.024561403508772,
1333
- "grad_norm": 11.224397659301758,
1334
- "learning_rate": 7.508771929824562e-06,
1335
- "loss": 0.0112,
1336
- "step": 1780
1337
- },
1338
- {
1339
- "epoch": 9.028070175438597,
1340
- "grad_norm": 0.019182119518518448,
1341
- "learning_rate": 7.438596491228071e-06,
1342
- "loss": 0.1523,
1343
- "step": 1790
1344
- },
1345
- {
1346
- "epoch": 9.031578947368422,
1347
- "grad_norm": 8.09163761138916,
1348
- "learning_rate": 7.368421052631579e-06,
1349
- "loss": 0.0313,
1350
- "step": 1800
1351
- },
1352
- {
1353
- "epoch": 9.035087719298245,
1354
- "grad_norm": 0.012492038309574127,
1355
- "learning_rate": 7.298245614035089e-06,
1356
- "loss": 0.1956,
1357
- "step": 1810
1358
- },
1359
- {
1360
- "epoch": 9.03859649122807,
1361
- "grad_norm": 7.786038398742676,
1362
- "learning_rate": 7.228070175438597e-06,
1363
- "loss": 0.074,
1364
- "step": 1820
1365
- },
1366
- {
1367
- "epoch": 9.042105263157895,
1368
- "grad_norm": 27.997344970703125,
1369
- "learning_rate": 7.157894736842106e-06,
1370
- "loss": 0.0531,
1371
- "step": 1830
1372
- },
1373
- {
1374
- "epoch": 9.04561403508772,
1375
- "grad_norm": 3.2020115852355957,
1376
- "learning_rate": 7.087719298245614e-06,
1377
- "loss": 0.0392,
1378
- "step": 1840
1379
- },
1380
- {
1381
- "epoch": 9.049122807017543,
1382
- "grad_norm": 0.26066985726356506,
1383
- "learning_rate": 7.017543859649123e-06,
1384
- "loss": 0.0755,
1385
- "step": 1850
1386
- },
1387
- {
1388
- "epoch": 9.052631578947368,
1389
- "grad_norm": 0.8559029698371887,
1390
- "learning_rate": 6.947368421052632e-06,
1391
- "loss": 0.0497,
1392
- "step": 1860
1393
- },
1394
- {
1395
- "epoch": 9.056140350877193,
1396
- "grad_norm": 1.5701016187667847,
1397
- "learning_rate": 6.877192982456141e-06,
1398
- "loss": 0.0245,
1399
- "step": 1870
1400
- },
1401
- {
1402
- "epoch": 9.059649122807018,
1403
- "grad_norm": 0.4311680793762207,
1404
- "learning_rate": 6.8070175438596495e-06,
1405
- "loss": 0.0173,
1406
- "step": 1880
1407
- },
1408
- {
1409
- "epoch": 9.063157894736841,
1410
- "grad_norm": 0.3006422519683838,
1411
- "learning_rate": 6.736842105263158e-06,
1412
- "loss": 0.2069,
1413
- "step": 1890
1414
- },
1415
- {
1416
- "epoch": 9.066666666666666,
1417
- "grad_norm": 0.2408071756362915,
1418
- "learning_rate": 6.666666666666667e-06,
1419
- "loss": 0.0597,
1420
- "step": 1900
1421
- },
1422
- {
1423
- "epoch": 9.066666666666666,
1424
- "eval_accuracy": 0.9125,
1425
- "eval_loss": 0.6106778383255005,
1426
- "eval_runtime": 81.1134,
1427
- "eval_samples_per_second": 1.973,
1428
- "eval_steps_per_second": 0.247,
1429
- "step": 1900
1430
- },
1431
- {
1432
- "epoch": 10.003508771929825,
1433
- "grad_norm": 2.007347583770752,
1434
- "learning_rate": 6.596491228070177e-06,
1435
- "loss": 0.1263,
1436
- "step": 1910
1437
- },
1438
- {
1439
- "epoch": 10.007017543859648,
1440
- "grad_norm": 0.05921289697289467,
1441
- "learning_rate": 6.526315789473685e-06,
1442
- "loss": 0.0126,
1443
- "step": 1920
1444
- },
1445
- {
1446
- "epoch": 10.010526315789473,
1447
- "grad_norm": 0.1697661280632019,
1448
- "learning_rate": 6.456140350877193e-06,
1449
- "loss": 0.1105,
1450
- "step": 1930
1451
- },
1452
- {
1453
- "epoch": 10.014035087719298,
1454
- "grad_norm": 4.040391445159912,
1455
- "learning_rate": 6.385964912280702e-06,
1456
- "loss": 0.073,
1457
- "step": 1940
1458
- },
1459
- {
1460
- "epoch": 10.017543859649123,
1461
- "grad_norm": 4.27597188949585,
1462
- "learning_rate": 6.31578947368421e-06,
1463
- "loss": 0.0857,
1464
- "step": 1950
1465
- },
1466
- {
1467
- "epoch": 10.021052631578947,
1468
- "grad_norm": 1.0044366121292114,
1469
- "learning_rate": 6.245614035087721e-06,
1470
- "loss": 0.0379,
1471
- "step": 1960
1472
- },
1473
- {
1474
- "epoch": 10.024561403508772,
1475
- "grad_norm": 17.36284637451172,
1476
- "learning_rate": 6.175438596491229e-06,
1477
- "loss": 0.175,
1478
- "step": 1970
1479
- },
1480
- {
1481
- "epoch": 10.028070175438597,
1482
- "grad_norm": 0.5226487517356873,
1483
- "learning_rate": 6.105263157894738e-06,
1484
- "loss": 0.1461,
1485
- "step": 1980
1486
- },
1487
- {
1488
- "epoch": 10.031578947368422,
1489
- "grad_norm": 0.017955180257558823,
1490
- "learning_rate": 6.035087719298246e-06,
1491
- "loss": 0.0366,
1492
- "step": 1990
1493
- },
1494
- {
1495
- "epoch": 10.035087719298245,
1496
- "grad_norm": 22.40249252319336,
1497
- "learning_rate": 5.964912280701755e-06,
1498
- "loss": 0.1114,
1499
- "step": 2000
1500
- },
1501
- {
1502
- "epoch": 10.03859649122807,
1503
- "grad_norm": 26.304452896118164,
1504
- "learning_rate": 5.8947368421052634e-06,
1505
- "loss": 0.0683,
1506
- "step": 2010
1507
- },
1508
- {
1509
- "epoch": 10.042105263157895,
1510
- "grad_norm": 0.20418749749660492,
1511
- "learning_rate": 5.824561403508773e-06,
1512
- "loss": 0.0703,
1513
- "step": 2020
1514
- },
1515
- {
1516
- "epoch": 10.04561403508772,
1517
- "grad_norm": 17.021589279174805,
1518
- "learning_rate": 5.754385964912281e-06,
1519
- "loss": 0.1972,
1520
- "step": 2030
1521
- },
1522
- {
1523
- "epoch": 10.049122807017543,
1524
- "grad_norm": 0.07942705601453781,
1525
- "learning_rate": 5.68421052631579e-06,
1526
- "loss": 0.1112,
1527
- "step": 2040
1528
- },
1529
- {
1530
- "epoch": 10.052631578947368,
1531
- "grad_norm": 0.6547494530677795,
1532
- "learning_rate": 5.6140350877192985e-06,
1533
- "loss": 0.0109,
1534
- "step": 2050
1535
- },
1536
- {
1537
- "epoch": 10.056140350877193,
1538
- "grad_norm": 45.802181243896484,
1539
- "learning_rate": 5.543859649122807e-06,
1540
- "loss": 0.2281,
1541
- "step": 2060
1542
- },
1543
- {
1544
- "epoch": 10.059649122807018,
1545
- "grad_norm": 0.15139073133468628,
1546
- "learning_rate": 5.4736842105263165e-06,
1547
- "loss": 0.0229,
1548
- "step": 2070
1549
- },
1550
- {
1551
- "epoch": 10.063157894736841,
1552
- "grad_norm": 0.6947728395462036,
1553
- "learning_rate": 5.403508771929825e-06,
1554
- "loss": 0.1189,
1555
- "step": 2080
1556
- },
1557
- {
1558
- "epoch": 10.066666666666666,
1559
- "grad_norm": 2.7859318256378174,
1560
- "learning_rate": 5.333333333333334e-06,
1561
- "loss": 0.1327,
1562
- "step": 2090
1563
- },
1564
- {
1565
- "epoch": 10.066666666666666,
1566
- "eval_accuracy": 0.91875,
1567
- "eval_loss": 0.6103284358978271,
1568
- "eval_runtime": 78.896,
1569
- "eval_samples_per_second": 2.028,
1570
- "eval_steps_per_second": 0.253,
1571
- "step": 2090
1572
- },
1573
- {
1574
- "epoch": 11.003508771929825,
1575
- "grad_norm": 0.11758152395486832,
1576
- "learning_rate": 5.263157894736842e-06,
1577
- "loss": 0.0357,
1578
- "step": 2100
1579
- },
1580
- {
1581
- "epoch": 11.007017543859648,
1582
- "grad_norm": 13.015204429626465,
1583
- "learning_rate": 5.192982456140351e-06,
1584
- "loss": 0.3564,
1585
- "step": 2110
1586
- },
1587
- {
1588
- "epoch": 11.010526315789473,
1589
- "grad_norm": 3.9868524074554443,
1590
- "learning_rate": 5.12280701754386e-06,
1591
- "loss": 0.1582,
1592
- "step": 2120
1593
- },
1594
- {
1595
- "epoch": 11.014035087719298,
1596
- "grad_norm": 18.87368392944336,
1597
- "learning_rate": 5.052631578947369e-06,
1598
- "loss": 0.0427,
1599
- "step": 2130
1600
- },
1601
- {
1602
- "epoch": 11.017543859649123,
1603
- "grad_norm": 0.04656202346086502,
1604
- "learning_rate": 4.982456140350877e-06,
1605
- "loss": 0.0223,
1606
- "step": 2140
1607
- },
1608
- {
1609
- "epoch": 11.021052631578947,
1610
- "grad_norm": 0.09466197341680527,
1611
- "learning_rate": 4.912280701754386e-06,
1612
- "loss": 0.1413,
1613
- "step": 2150
1614
- },
1615
- {
1616
- "epoch": 11.024561403508772,
1617
- "grad_norm": 0.03507490083575249,
1618
- "learning_rate": 4.842105263157895e-06,
1619
- "loss": 0.0847,
1620
- "step": 2160
1621
- },
1622
- {
1623
- "epoch": 11.028070175438597,
1624
- "grad_norm": 0.029148323461413383,
1625
- "learning_rate": 4.771929824561404e-06,
1626
- "loss": 0.1364,
1627
- "step": 2170
1628
- },
1629
- {
1630
- "epoch": 11.031578947368422,
1631
- "grad_norm": 0.020161481574177742,
1632
- "learning_rate": 4.7017543859649125e-06,
1633
- "loss": 0.0025,
1634
- "step": 2180
1635
- },
1636
- {
1637
- "epoch": 11.035087719298245,
1638
- "grad_norm": 0.05569101870059967,
1639
- "learning_rate": 4.631578947368421e-06,
1640
- "loss": 0.0425,
1641
- "step": 2190
1642
- },
1643
- {
1644
- "epoch": 11.03859649122807,
1645
- "grad_norm": 0.028240151703357697,
1646
- "learning_rate": 4.56140350877193e-06,
1647
- "loss": 0.0477,
1648
- "step": 2200
1649
- },
1650
- {
1651
- "epoch": 11.042105263157895,
1652
- "grad_norm": 0.011267263442277908,
1653
- "learning_rate": 4.491228070175439e-06,
1654
- "loss": 0.0603,
1655
- "step": 2210
1656
- },
1657
- {
1658
- "epoch": 11.04561403508772,
1659
- "grad_norm": 0.05064944550395012,
1660
- "learning_rate": 4.4210526315789476e-06,
1661
- "loss": 0.1647,
1662
- "step": 2220
1663
- },
1664
- {
1665
- "epoch": 11.049122807017543,
1666
- "grad_norm": 20.04745864868164,
1667
- "learning_rate": 4.350877192982457e-06,
1668
- "loss": 0.0739,
1669
- "step": 2230
1670
- },
1671
- {
1672
- "epoch": 11.052631578947368,
1673
- "grad_norm": 0.2934364378452301,
1674
- "learning_rate": 4.2807017543859656e-06,
1675
- "loss": 0.0542,
1676
- "step": 2240
1677
- },
1678
- {
1679
- "epoch": 11.056140350877193,
1680
- "grad_norm": 0.038585782051086426,
1681
- "learning_rate": 4.210526315789474e-06,
1682
- "loss": 0.1208,
1683
- "step": 2250
1684
- },
1685
- {
1686
- "epoch": 11.059649122807018,
1687
- "grad_norm": 0.014354056678712368,
1688
- "learning_rate": 4.140350877192983e-06,
1689
- "loss": 0.0879,
1690
- "step": 2260
1691
- },
1692
- {
1693
- "epoch": 11.063157894736841,
1694
- "grad_norm": 0.4161794185638428,
1695
- "learning_rate": 4.070175438596491e-06,
1696
- "loss": 0.083,
1697
- "step": 2270
1698
- },
1699
- {
1700
- "epoch": 11.066666666666666,
1701
- "grad_norm": 16.042465209960938,
1702
- "learning_rate": 4.000000000000001e-06,
1703
- "loss": 0.1942,
1704
- "step": 2280
1705
- },
1706
- {
1707
- "epoch": 11.066666666666666,
1708
- "eval_accuracy": 0.90625,
1709
- "eval_loss": 0.608224630355835,
1710
- "eval_runtime": 79.0963,
1711
- "eval_samples_per_second": 2.023,
1712
- "eval_steps_per_second": 0.253,
1713
- "step": 2280
1714
- },
1715
- {
1716
- "epoch": 12.003508771929825,
1717
- "grad_norm": 2.9245848655700684,
1718
- "learning_rate": 3.929824561403509e-06,
1719
- "loss": 0.1737,
1720
- "step": 2290
1721
- },
1722
- {
1723
- "epoch": 12.007017543859648,
1724
- "grad_norm": 0.012533026747405529,
1725
- "learning_rate": 3.859649122807018e-06,
1726
- "loss": 0.0383,
1727
- "step": 2300
1728
- },
1729
- {
1730
- "epoch": 12.010526315789473,
1731
- "grad_norm": 0.03444502130150795,
1732
- "learning_rate": 3.789473684210527e-06,
1733
- "loss": 0.0517,
1734
- "step": 2310
1735
- },
1736
- {
1737
- "epoch": 12.014035087719298,
1738
- "grad_norm": 4.755786418914795,
1739
- "learning_rate": 3.7192982456140354e-06,
1740
- "loss": 0.0255,
1741
- "step": 2320
1742
- },
1743
- {
1744
- "epoch": 12.017543859649123,
1745
- "grad_norm": 43.459564208984375,
1746
- "learning_rate": 3.6491228070175443e-06,
1747
- "loss": 0.1303,
1748
- "step": 2330
1749
- },
1750
- {
1751
- "epoch": 12.021052631578947,
1752
- "grad_norm": 11.069656372070312,
1753
- "learning_rate": 3.578947368421053e-06,
1754
- "loss": 0.1322,
1755
- "step": 2340
1756
- },
1757
- {
1758
- "epoch": 12.024561403508772,
1759
- "grad_norm": 0.8341348171234131,
1760
- "learning_rate": 3.5087719298245615e-06,
1761
- "loss": 0.042,
1762
- "step": 2350
1763
- },
1764
- {
1765
- "epoch": 12.028070175438597,
1766
- "grad_norm": 12.14256763458252,
1767
- "learning_rate": 3.4385964912280705e-06,
1768
- "loss": 0.0088,
1769
- "step": 2360
1770
- },
1771
- {
1772
- "epoch": 12.031578947368422,
1773
- "grad_norm": 0.03645330294966698,
1774
- "learning_rate": 3.368421052631579e-06,
1775
- "loss": 0.0504,
1776
- "step": 2370
1777
- },
1778
- {
1779
- "epoch": 12.035087719298245,
1780
- "grad_norm": 0.14884474873542786,
1781
- "learning_rate": 3.2982456140350885e-06,
1782
- "loss": 0.0772,
1783
- "step": 2380
1784
- },
1785
- {
1786
- "epoch": 12.03859649122807,
1787
- "grad_norm": 36.239627838134766,
1788
- "learning_rate": 3.2280701754385966e-06,
1789
- "loss": 0.0407,
1790
- "step": 2390
1791
- },
1792
- {
1793
- "epoch": 12.042105263157895,
1794
- "grad_norm": 3.6581294536590576,
1795
- "learning_rate": 3.157894736842105e-06,
1796
- "loss": 0.0538,
1797
- "step": 2400
1798
- },
1799
- {
1800
- "epoch": 12.04561403508772,
1801
- "grad_norm": 0.029139500111341476,
1802
- "learning_rate": 3.0877192982456146e-06,
1803
- "loss": 0.0484,
1804
- "step": 2410
1805
- },
1806
- {
1807
- "epoch": 12.049122807017543,
1808
- "grad_norm": 0.011205335147678852,
1809
- "learning_rate": 3.017543859649123e-06,
1810
- "loss": 0.0471,
1811
- "step": 2420
1812
- },
1813
- {
1814
- "epoch": 12.052631578947368,
1815
- "grad_norm": 0.31373676657676697,
1816
- "learning_rate": 2.9473684210526317e-06,
1817
- "loss": 0.1412,
1818
- "step": 2430
1819
- },
1820
- {
1821
- "epoch": 12.056140350877193,
1822
- "grad_norm": 0.03848983347415924,
1823
- "learning_rate": 2.8771929824561407e-06,
1824
- "loss": 0.0769,
1825
- "step": 2440
1826
- },
1827
- {
1828
- "epoch": 12.059649122807018,
1829
- "grad_norm": 0.22592990100383759,
1830
- "learning_rate": 2.8070175438596493e-06,
1831
- "loss": 0.1461,
1832
- "step": 2450
1833
- },
1834
- {
1835
- "epoch": 12.063157894736841,
1836
- "grad_norm": 0.6783035397529602,
1837
- "learning_rate": 2.7368421052631583e-06,
1838
- "loss": 0.0748,
1839
- "step": 2460
1840
- },
1841
- {
1842
- "epoch": 12.066666666666666,
1843
- "grad_norm": 0.008166045881807804,
1844
- "learning_rate": 2.666666666666667e-06,
1845
- "loss": 0.3295,
1846
- "step": 2470
1847
- },
1848
- {
1849
- "epoch": 12.066666666666666,
1850
- "eval_accuracy": 0.90625,
1851
- "eval_loss": 0.6353408098220825,
1852
- "eval_runtime": 80.6006,
1853
- "eval_samples_per_second": 1.985,
1854
- "eval_steps_per_second": 0.248,
1855
- "step": 2470
1856
- },
1857
- {
1858
- "epoch": 13.003508771929825,
1859
- "grad_norm": 0.017932241782546043,
1860
- "learning_rate": 2.5964912280701754e-06,
1861
- "loss": 0.0974,
1862
- "step": 2480
1863
- },
1864
- {
1865
- "epoch": 13.007017543859648,
1866
- "grad_norm": 0.06463440507650375,
1867
- "learning_rate": 2.5263157894736844e-06,
1868
- "loss": 0.0408,
1869
- "step": 2490
1870
- },
1871
- {
1872
- "epoch": 13.010526315789473,
1873
- "grad_norm": 0.07242042571306229,
1874
- "learning_rate": 2.456140350877193e-06,
1875
- "loss": 0.0486,
1876
- "step": 2500
1877
- },
1878
- {
1879
- "epoch": 13.014035087719298,
1880
- "grad_norm": 0.45470118522644043,
1881
- "learning_rate": 2.385964912280702e-06,
1882
- "loss": 0.1951,
1883
- "step": 2510
1884
- },
1885
- {
1886
- "epoch": 13.017543859649123,
1887
- "grad_norm": 4.542286396026611,
1888
- "learning_rate": 2.3157894736842105e-06,
1889
- "loss": 0.1041,
1890
- "step": 2520
1891
- },
1892
- {
1893
- "epoch": 13.021052631578947,
1894
- "grad_norm": 53.463401794433594,
1895
- "learning_rate": 2.2456140350877195e-06,
1896
- "loss": 0.1823,
1897
- "step": 2530
1898
- },
1899
- {
1900
- "epoch": 13.024561403508772,
1901
- "grad_norm": 1.6816776990890503,
1902
- "learning_rate": 2.1754385964912285e-06,
1903
- "loss": 0.1033,
1904
- "step": 2540
1905
- },
1906
- {
1907
- "epoch": 13.028070175438597,
1908
- "grad_norm": 0.356115460395813,
1909
- "learning_rate": 2.105263157894737e-06,
1910
- "loss": 0.1217,
1911
- "step": 2550
1912
- },
1913
- {
1914
- "epoch": 13.031578947368422,
1915
- "grad_norm": 0.18997782468795776,
1916
- "learning_rate": 2.0350877192982456e-06,
1917
- "loss": 0.0044,
1918
- "step": 2560
1919
- },
1920
- {
1921
- "epoch": 13.035087719298245,
1922
- "grad_norm": 9.70322036743164,
1923
- "learning_rate": 1.9649122807017546e-06,
1924
- "loss": 0.0338,
1925
- "step": 2570
1926
- },
1927
- {
1928
- "epoch": 13.03859649122807,
1929
- "grad_norm": 19.50843620300293,
1930
- "learning_rate": 1.8947368421052634e-06,
1931
- "loss": 0.0769,
1932
- "step": 2580
1933
- },
1934
- {
1935
- "epoch": 13.042105263157895,
1936
- "grad_norm": 0.1590365469455719,
1937
- "learning_rate": 1.8245614035087722e-06,
1938
- "loss": 0.0048,
1939
- "step": 2590
1940
- },
1941
- {
1942
- "epoch": 13.04561403508772,
1943
- "grad_norm": 0.012651736848056316,
1944
- "learning_rate": 1.7543859649122807e-06,
1945
- "loss": 0.0993,
1946
- "step": 2600
1947
- },
1948
- {
1949
- "epoch": 13.049122807017543,
1950
- "grad_norm": 0.02113044075667858,
1951
- "learning_rate": 1.6842105263157895e-06,
1952
- "loss": 0.012,
1953
- "step": 2610
1954
- },
1955
- {
1956
- "epoch": 13.052631578947368,
1957
- "grad_norm": 0.007876179181039333,
1958
- "learning_rate": 1.6140350877192983e-06,
1959
- "loss": 0.0133,
1960
- "step": 2620
1961
- },
1962
- {
1963
- "epoch": 13.056140350877193,
1964
- "grad_norm": 14.243346214294434,
1965
- "learning_rate": 1.5438596491228073e-06,
1966
- "loss": 0.0686,
1967
- "step": 2630
1968
- },
1969
- {
1970
- "epoch": 13.059649122807018,
1971
- "grad_norm": 0.13937072455883026,
1972
- "learning_rate": 1.4736842105263159e-06,
1973
- "loss": 0.2213,
1974
- "step": 2640
1975
- },
1976
- {
1977
- "epoch": 13.063157894736841,
1978
- "grad_norm": 1.7151681184768677,
1979
- "learning_rate": 1.4035087719298246e-06,
1980
- "loss": 0.2929,
1981
- "step": 2650
1982
- },
1983
- {
1984
- "epoch": 13.066666666666666,
1985
- "grad_norm": 0.04530133679509163,
1986
- "learning_rate": 1.3333333333333334e-06,
1987
- "loss": 0.0117,
1988
- "step": 2660
1989
- },
1990
- {
1991
- "epoch": 13.066666666666666,
1992
- "eval_accuracy": 0.91875,
1993
- "eval_loss": 0.6410062313079834,
1994
- "eval_runtime": 81.4044,
1995
- "eval_samples_per_second": 1.965,
1996
- "eval_steps_per_second": 0.246,
1997
- "step": 2660
1998
- },
1999
- {
2000
- "epoch": 14.003508771929825,
2001
- "grad_norm": 0.012890544719994068,
2002
- "learning_rate": 1.2631578947368422e-06,
2003
- "loss": 0.1506,
2004
- "step": 2670
2005
- },
2006
- {
2007
- "epoch": 14.007017543859648,
2008
- "grad_norm": 1.0323655605316162,
2009
- "learning_rate": 1.192982456140351e-06,
2010
- "loss": 0.0419,
2011
- "step": 2680
2012
- },
2013
- {
2014
- "epoch": 14.010526315789473,
2015
- "grad_norm": 0.03734529763460159,
2016
- "learning_rate": 1.1228070175438598e-06,
2017
- "loss": 0.0395,
2018
- "step": 2690
2019
- },
2020
- {
2021
- "epoch": 14.014035087719298,
2022
- "grad_norm": 0.011295179836452007,
2023
- "learning_rate": 1.0526315789473685e-06,
2024
- "loss": 0.0694,
2025
- "step": 2700
2026
- },
2027
- {
2028
- "epoch": 14.017543859649123,
2029
- "grad_norm": 0.10645683854818344,
2030
- "learning_rate": 9.824561403508773e-07,
2031
- "loss": 0.2579,
2032
- "step": 2710
2033
- },
2034
- {
2035
- "epoch": 14.021052631578947,
2036
- "grad_norm": 0.6168031096458435,
2037
- "learning_rate": 9.122807017543861e-07,
2038
- "loss": 0.0487,
2039
- "step": 2720
2040
- },
2041
- {
2042
- "epoch": 14.024561403508772,
2043
- "grad_norm": 0.12228459864854813,
2044
- "learning_rate": 8.421052631578948e-07,
2045
- "loss": 0.0028,
2046
- "step": 2730
2047
- },
2048
- {
2049
- "epoch": 14.028070175438597,
2050
- "grad_norm": 0.5748558640480042,
2051
- "learning_rate": 7.719298245614036e-07,
2052
- "loss": 0.0619,
2053
- "step": 2740
2054
- },
2055
- {
2056
- "epoch": 14.031578947368422,
2057
- "grad_norm": 0.24423201382160187,
2058
- "learning_rate": 7.017543859649123e-07,
2059
- "loss": 0.0306,
2060
- "step": 2750
2061
- },
2062
- {
2063
- "epoch": 14.035087719298245,
2064
- "grad_norm": 0.2773297131061554,
2065
- "learning_rate": 6.315789473684211e-07,
2066
- "loss": 0.0952,
2067
- "step": 2760
2068
- },
2069
- {
2070
- "epoch": 14.03859649122807,
2071
- "grad_norm": 3.161602258682251,
2072
- "learning_rate": 5.614035087719299e-07,
2073
- "loss": 0.1977,
2074
- "step": 2770
2075
- },
2076
- {
2077
- "epoch": 14.042105263157895,
2078
- "grad_norm": 0.02265220880508423,
2079
- "learning_rate": 4.912280701754387e-07,
2080
- "loss": 0.1725,
2081
- "step": 2780
2082
- },
2083
- {
2084
- "epoch": 14.04561403508772,
2085
- "grad_norm": 0.028731781989336014,
2086
- "learning_rate": 4.210526315789474e-07,
2087
- "loss": 0.043,
2088
- "step": 2790
2089
- },
2090
- {
2091
- "epoch": 14.049122807017543,
2092
- "grad_norm": 0.2788037657737732,
2093
- "learning_rate": 3.5087719298245616e-07,
2094
- "loss": 0.0601,
2095
- "step": 2800
2096
- },
2097
- {
2098
- "epoch": 14.052631578947368,
2099
- "grad_norm": 0.011185847222805023,
2100
- "learning_rate": 2.8070175438596494e-07,
2101
- "loss": 0.0245,
2102
- "step": 2810
2103
- },
2104
- {
2105
- "epoch": 14.056140350877193,
2106
- "grad_norm": 0.12891526520252228,
2107
- "learning_rate": 2.105263157894737e-07,
2108
- "loss": 0.0771,
2109
- "step": 2820
2110
- },
2111
- {
2112
- "epoch": 14.059649122807018,
2113
- "grad_norm": 0.09125010669231415,
2114
- "learning_rate": 1.4035087719298247e-07,
2115
- "loss": 0.0382,
2116
- "step": 2830
2117
- },
2118
- {
2119
- "epoch": 14.063157894736841,
2120
- "grad_norm": 0.01801062375307083,
2121
- "learning_rate": 7.017543859649123e-08,
2122
- "loss": 0.1891,
2123
- "step": 2840
2124
- },
2125
- {
2126
- "epoch": 14.066666666666666,
2127
- "grad_norm": 1.261662483215332,
2128
- "learning_rate": 0.0,
2129
- "loss": 0.0905,
2130
- "step": 2850
2131
- },
2132
- {
2133
- "epoch": 14.066666666666666,
2134
- "eval_accuracy": 0.9125,
2135
- "eval_loss": 0.6544592976570129,
2136
- "eval_runtime": 80.7865,
2137
- "eval_samples_per_second": 1.981,
2138
- "eval_steps_per_second": 0.248,
2139
- "step": 2850
2140
- },
2141
- {
2142
- "epoch": 14.066666666666666,
2143
- "step": 2850,
2144
  "total_flos": 0.0,
2145
- "train_loss": 0.23767406691715381,
2146
- "train_runtime": 18748.4047,
2147
- "train_samples_per_second": 1.216,
2148
- "train_steps_per_second": 0.152
2149
- },
2150
- {
2151
- "epoch": 14.066666666666666,
2152
- "eval_accuracy": 0.8975,
2153
- "eval_loss": 0.47002196311950684,
2154
- "eval_runtime": 423.5464,
2155
- "eval_samples_per_second": 1.889,
2156
- "eval_steps_per_second": 0.236,
2157
- "step": 2850
2158
  }
2159
  ],
2160
  "logging_steps": 10,
2161
- "max_steps": 2850,
2162
  "num_input_tokens_seen": 0,
2163
  "num_train_epochs": 9223372036854775807,
2164
  "save_steps": 500,
2165
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
2166
  "TrainerControl": {
2167
  "args": {
2168
  "should_epoch_stop": false,
 
1
  {
2
+ "best_metric": 0.93125,
3
+ "best_model_checkpoint": "mvit_v2_rwf-2000/checkpoint-570",
4
+ "epoch": 5.125,
5
  "eval_steps": 500,
6
+ "global_step": 1140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.006578947368421052,
13
+ "grad_norm": 36.64044189453125,
14
+ "learning_rate": 1.986842105263158e-05,
15
+ "loss": 6.5592,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.013157894736842105,
20
+ "grad_norm": 43.76960372924805,
21
+ "learning_rate": 1.9736842105263158e-05,
22
+ "loss": 5.0333,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.019736842105263157,
27
+ "grad_norm": 29.054157257080078,
28
+ "learning_rate": 1.960526315789474e-05,
29
+ "loss": 3.3536,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.02631578947368421,
34
+ "grad_norm": 26.586713790893555,
35
+ "learning_rate": 1.9473684210526318e-05,
36
+ "loss": 2.0178,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.03289473684210526,
41
+ "grad_norm": 13.628296852111816,
42
+ "learning_rate": 1.9342105263157896e-05,
43
+ "loss": 1.4669,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.039473684210526314,
48
+ "grad_norm": 12.267337799072266,
49
+ "learning_rate": 1.9210526315789474e-05,
50
+ "loss": 0.9024,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.046052631578947366,
55
+ "grad_norm": 13.692243576049805,
56
+ "learning_rate": 1.9078947368421056e-05,
57
+ "loss": 0.676,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.05263157894736842,
62
+ "grad_norm": 15.4345064163208,
63
+ "learning_rate": 1.894736842105263e-05,
64
+ "loss": 0.8188,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.05921052631578947,
69
+ "grad_norm": 18.368070602416992,
70
+ "learning_rate": 1.8815789473684213e-05,
71
+ "loss": 0.6296,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.06578947368421052,
76
+ "grad_norm": 12.882184982299805,
77
+ "learning_rate": 1.868421052631579e-05,
78
+ "loss": 0.6014,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.07236842105263158,
83
+ "grad_norm": 13.11134147644043,
84
+ "learning_rate": 1.8552631578947373e-05,
85
+ "loss": 0.6352,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.07894736842105263,
90
+ "grad_norm": 8.479208946228027,
91
+ "learning_rate": 1.8421052631578947e-05,
92
+ "loss": 0.6688,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.08552631578947369,
97
+ "grad_norm": 9.394298553466797,
98
+ "learning_rate": 1.828947368421053e-05,
99
+ "loss": 0.5578,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.09210526315789473,
104
+ "grad_norm": 13.005159378051758,
105
+ "learning_rate": 1.8157894736842107e-05,
106
+ "loss": 0.5352,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.09868421052631579,
111
+ "grad_norm": 21.03794288635254,
112
+ "learning_rate": 1.8026315789473685e-05,
113
+ "loss": 0.3291,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.10526315789473684,
118
+ "grad_norm": 20.376361846923828,
119
+ "learning_rate": 1.7894736842105264e-05,
120
+ "loss": 0.3126,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.1118421052631579,
125
+ "grad_norm": 9.78145694732666,
126
+ "learning_rate": 1.7763157894736845e-05,
127
+ "loss": 0.4688,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.11842105263157894,
132
+ "grad_norm": 10.771368026733398,
133
+ "learning_rate": 1.763157894736842e-05,
134
+ "loss": 0.3389,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.125,
139
+ "grad_norm": 1.7945446968078613,
140
+ "learning_rate": 1.7500000000000002e-05,
141
+ "loss": 0.4242,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.125,
146
+ "eval_accuracy": 0.8375,
147
+ "eval_f1": 0.8365807668133248,
148
+ "eval_loss": 0.42270001769065857,
149
+ "eval_precision": 0.8452685421994885,
150
+ "eval_runtime": 69.4902,
151
+ "eval_samples_per_second": 2.302,
152
+ "eval_steps_per_second": 0.288,
153
  "step": 190
154
  },
155
  {
156
+ "epoch": 1.006578947368421,
157
+ "grad_norm": 26.072763442993164,
158
+ "learning_rate": 1.736842105263158e-05,
159
+ "loss": 0.2755,
160
  "step": 200
161
  },
162
  {
163
+ "epoch": 1.013157894736842,
164
+ "grad_norm": 16.94261932373047,
165
+ "learning_rate": 1.723684210526316e-05,
166
+ "loss": 0.4916,
167
  "step": 210
168
  },
169
  {
170
+ "epoch": 1.019736842105263,
171
+ "grad_norm": 11.323436737060547,
172
+ "learning_rate": 1.7105263157894737e-05,
173
+ "loss": 0.2864,
174
  "step": 220
175
  },
176
  {
177
+ "epoch": 1.0263157894736843,
178
+ "grad_norm": 5.436691761016846,
179
+ "learning_rate": 1.6973684210526318e-05,
180
+ "loss": 0.3343,
181
  "step": 230
182
  },
183
  {
184
+ "epoch": 1.0328947368421053,
185
+ "grad_norm": 19.71637725830078,
186
+ "learning_rate": 1.6842105263157896e-05,
187
+ "loss": 0.3933,
188
  "step": 240
189
  },
190
  {
191
+ "epoch": 1.0394736842105263,
192
+ "grad_norm": 7.336604595184326,
193
+ "learning_rate": 1.6710526315789475e-05,
194
+ "loss": 0.3754,
195
  "step": 250
196
  },
197
  {
198
+ "epoch": 1.0460526315789473,
199
+ "grad_norm": 23.727394104003906,
200
+ "learning_rate": 1.6578947368421053e-05,
201
+ "loss": 0.2446,
202
  "step": 260
203
  },
204
  {
205
+ "epoch": 1.0526315789473684,
206
+ "grad_norm": 21.048952102661133,
207
+ "learning_rate": 1.644736842105263e-05,
208
+ "loss": 0.3691,
209
  "step": 270
210
  },
211
  {
212
+ "epoch": 1.0592105263157894,
213
+ "grad_norm": 1.7281363010406494,
214
+ "learning_rate": 1.6315789473684213e-05,
215
+ "loss": 0.363,
216
  "step": 280
217
  },
218
  {
219
+ "epoch": 1.0657894736842106,
220
+ "grad_norm": 17.512372970581055,
221
+ "learning_rate": 1.618421052631579e-05,
222
+ "loss": 0.365,
223
  "step": 290
224
  },
225
  {
226
+ "epoch": 1.0723684210526316,
227
+ "grad_norm": 4.898046016693115,
228
+ "learning_rate": 1.605263157894737e-05,
229
+ "loss": 0.1919,
230
  "step": 300
231
  },
232
  {
233
+ "epoch": 1.0789473684210527,
234
+ "grad_norm": 23.86078643798828,
235
+ "learning_rate": 1.5921052631578948e-05,
236
+ "loss": 0.3222,
237
  "step": 310
238
  },
239
  {
240
+ "epoch": 1.0855263157894737,
241
+ "grad_norm": 14.71253490447998,
242
+ "learning_rate": 1.578947368421053e-05,
243
+ "loss": 0.3996,
244
  "step": 320
245
  },
246
  {
247
+ "epoch": 1.0921052631578947,
248
+ "grad_norm": 15.617661476135254,
249
+ "learning_rate": 1.5657894736842107e-05,
250
+ "loss": 0.2916,
251
  "step": 330
252
  },
253
  {
254
+ "epoch": 1.0986842105263157,
255
+ "grad_norm": 16.566120147705078,
256
+ "learning_rate": 1.5526315789473686e-05,
257
+ "loss": 0.2138,
258
  "step": 340
259
  },
260
  {
261
+ "epoch": 1.1052631578947367,
262
+ "grad_norm": 19.48283576965332,
263
+ "learning_rate": 1.5394736842105264e-05,
264
+ "loss": 0.3296,
265
  "step": 350
266
  },
267
  {
268
+ "epoch": 1.111842105263158,
269
+ "grad_norm": 25.25094223022461,
270
+ "learning_rate": 1.5263157894736846e-05,
271
+ "loss": 0.5412,
272
  "step": 360
273
  },
274
  {
275
+ "epoch": 1.118421052631579,
276
+ "grad_norm": 2.2683284282684326,
277
+ "learning_rate": 1.5131578947368422e-05,
278
+ "loss": 0.2906,
279
  "step": 370
280
  },
281
  {
282
+ "epoch": 1.125,
283
+ "grad_norm": 31.231639862060547,
284
+ "learning_rate": 1.5000000000000002e-05,
285
+ "loss": 0.3906,
286
  "step": 380
287
  },
288
  {
289
+ "epoch": 1.125,
290
+ "eval_accuracy": 0.9,
291
+ "eval_f1": 0.8999374609130708,
292
+ "eval_loss": 0.2932564616203308,
293
+ "eval_precision": 0.9010025062656641,
294
+ "eval_runtime": 73.6902,
295
+ "eval_samples_per_second": 2.171,
296
+ "eval_steps_per_second": 0.271,
297
  "step": 380
298
  },
299
  {
300
+ "epoch": 2.0065789473684212,
301
+ "grad_norm": 0.302801251411438,
302
+ "learning_rate": 1.486842105263158e-05,
303
+ "loss": 0.148,
304
  "step": 390
305
  },
306
  {
307
+ "epoch": 2.013157894736842,
308
+ "grad_norm": 2.899660587310791,
309
+ "learning_rate": 1.4736842105263159e-05,
310
+ "loss": 0.3116,
311
  "step": 400
312
  },
313
  {
314
+ "epoch": 2.0197368421052633,
315
+ "grad_norm": 0.4640880227088928,
316
+ "learning_rate": 1.4605263157894739e-05,
317
+ "loss": 0.131,
318
  "step": 410
319
  },
320
  {
321
+ "epoch": 2.026315789473684,
322
+ "grad_norm": 28.330896377563477,
323
+ "learning_rate": 1.4473684210526317e-05,
324
+ "loss": 0.1221,
325
  "step": 420
326
  },
327
  {
328
+ "epoch": 2.0328947368421053,
329
+ "grad_norm": 0.6787762641906738,
330
+ "learning_rate": 1.4342105263157895e-05,
331
+ "loss": 0.1547,
332
  "step": 430
333
  },
334
  {
335
+ "epoch": 2.039473684210526,
336
+ "grad_norm": 27.384422302246094,
337
+ "learning_rate": 1.4210526315789475e-05,
338
+ "loss": 0.2902,
339
  "step": 440
340
  },
341
  {
342
+ "epoch": 2.0460526315789473,
343
+ "grad_norm": 18.884803771972656,
344
+ "learning_rate": 1.4078947368421055e-05,
345
+ "loss": 0.2381,
346
  "step": 450
347
  },
348
  {
349
+ "epoch": 2.0526315789473686,
350
+ "grad_norm": 2.0699939727783203,
351
+ "learning_rate": 1.3947368421052631e-05,
352
+ "loss": 0.1437,
353
  "step": 460
354
  },
355
  {
356
+ "epoch": 2.0592105263157894,
357
+ "grad_norm": 0.6573389768600464,
358
+ "learning_rate": 1.3815789473684211e-05,
359
+ "loss": 0.54,
360
  "step": 470
361
  },
362
  {
363
+ "epoch": 2.0657894736842106,
364
+ "grad_norm": 24.888164520263672,
365
+ "learning_rate": 1.3684210526315791e-05,
366
+ "loss": 0.2222,
367
  "step": 480
368
  },
369
  {
370
+ "epoch": 2.0723684210526314,
371
+ "grad_norm": 5.3757643699646,
372
+ "learning_rate": 1.3552631578947371e-05,
373
+ "loss": 0.3766,
374
  "step": 490
375
  },
376
  {
377
+ "epoch": 2.0789473684210527,
378
+ "grad_norm": 8.543233871459961,
379
+ "learning_rate": 1.3421052631578948e-05,
380
+ "loss": 0.2016,
381
  "step": 500
382
  },
383
  {
384
+ "epoch": 2.085526315789474,
385
+ "grad_norm": 24.547653198242188,
386
+ "learning_rate": 1.3289473684210528e-05,
387
+ "loss": 0.1867,
388
  "step": 510
389
  },
390
  {
391
+ "epoch": 2.0921052631578947,
392
+ "grad_norm": 9.149017333984375,
393
+ "learning_rate": 1.3157894736842108e-05,
394
+ "loss": 0.1007,
395
  "step": 520
396
  },
397
  {
398
+ "epoch": 2.098684210526316,
399
+ "grad_norm": 0.6297352313995361,
400
+ "learning_rate": 1.3026315789473684e-05,
401
+ "loss": 0.0951,
402
  "step": 530
403
  },
404
  {
405
+ "epoch": 2.1052631578947367,
406
+ "grad_norm": 17.117029190063477,
407
+ "learning_rate": 1.2894736842105264e-05,
408
+ "loss": 0.405,
409
  "step": 540
410
  },
411
  {
412
+ "epoch": 2.111842105263158,
413
+ "grad_norm": 1.1509796380996704,
414
+ "learning_rate": 1.2763157894736844e-05,
415
+ "loss": 0.3637,
416
  "step": 550
417
  },
418
  {
419
+ "epoch": 2.1184210526315788,
420
+ "grad_norm": 13.600915908813477,
421
+ "learning_rate": 1.263157894736842e-05,
422
+ "loss": 0.2626,
423
  "step": 560
424
  },
425
  {
426
+ "epoch": 2.125,
427
+ "grad_norm": 3.0041847229003906,
428
+ "learning_rate": 1.25e-05,
429
+ "loss": 0.3199,
430
  "step": 570
431
  },
432
  {
433
+ "epoch": 2.125,
434
+ "eval_accuracy": 0.93125,
435
+ "eval_f1": 0.931247314348217,
436
+ "eval_loss": 0.30343881249427795,
437
+ "eval_precision": 0.9313173933427098,
438
+ "eval_runtime": 65.8695,
439
+ "eval_samples_per_second": 2.429,
440
+ "eval_steps_per_second": 0.304,
441
  "step": 570
442
  },
443
  {
444
+ "epoch": 3.0065789473684212,
445
+ "grad_norm": 3.123574733734131,
446
+ "learning_rate": 1.236842105263158e-05,
447
+ "loss": 0.1695,
448
  "step": 580
449
  },
450
  {
451
+ "epoch": 3.013157894736842,
452
+ "grad_norm": 30.406293869018555,
453
+ "learning_rate": 1.2236842105263159e-05,
454
+ "loss": 0.2813,
455
  "step": 590
456
  },
457
  {
458
+ "epoch": 3.0197368421052633,
459
+ "grad_norm": 14.729782104492188,
460
+ "learning_rate": 1.2105263157894737e-05,
461
+ "loss": 0.2084,
462
  "step": 600
463
  },
464
  {
465
+ "epoch": 3.026315789473684,
466
+ "grad_norm": 6.453456401824951,
467
+ "learning_rate": 1.1973684210526317e-05,
468
+ "loss": 0.3726,
469
  "step": 610
470
  },
471
  {
472
+ "epoch": 3.0328947368421053,
473
+ "grad_norm": 0.5487316846847534,
474
+ "learning_rate": 1.1842105263157895e-05,
475
+ "loss": 0.3515,
476
  "step": 620
477
  },
478
  {
479
+ "epoch": 3.039473684210526,
480
+ "grad_norm": 2.270763635635376,
481
+ "learning_rate": 1.1710526315789475e-05,
482
+ "loss": 0.0221,
483
  "step": 630
484
  },
485
  {
486
+ "epoch": 3.0460526315789473,
487
+ "grad_norm": 13.998810768127441,
488
+ "learning_rate": 1.1578947368421053e-05,
489
+ "loss": 0.33,
490
  "step": 640
491
  },
492
  {
493
+ "epoch": 3.0526315789473686,
494
+ "grad_norm": 8.822827339172363,
495
+ "learning_rate": 1.1447368421052632e-05,
496
+ "loss": 0.161,
497
  "step": 650
498
  },
499
  {
500
+ "epoch": 3.0592105263157894,
501
+ "grad_norm": 29.708253860473633,
502
+ "learning_rate": 1.1315789473684212e-05,
503
+ "loss": 0.2569,
504
  "step": 660
505
  },
506
  {
507
+ "epoch": 3.0657894736842106,
508
+ "grad_norm": 23.062585830688477,
509
+ "learning_rate": 1.1184210526315792e-05,
510
+ "loss": 0.2207,
511
  "step": 670
512
  },
513
  {
514
+ "epoch": 3.0723684210526314,
515
+ "grad_norm": 10.31871509552002,
516
+ "learning_rate": 1.105263157894737e-05,
517
+ "loss": 0.3339,
518
  "step": 680
519
  },
520
  {
521
+ "epoch": 3.0789473684210527,
522
+ "grad_norm": 2.75856351852417,
523
+ "learning_rate": 1.0921052631578948e-05,
524
+ "loss": 0.0798,
525
  "step": 690
526
  },
527
  {
528
+ "epoch": 3.085526315789474,
529
+ "grad_norm": 1.439271330833435,
530
+ "learning_rate": 1.0789473684210528e-05,
531
+ "loss": 0.107,
532
  "step": 700
533
  },
534
  {
535
+ "epoch": 3.0921052631578947,
536
+ "grad_norm": 0.4239494502544403,
537
+ "learning_rate": 1.0657894736842108e-05,
538
+ "loss": 0.1158,
539
  "step": 710
540
  },
541
  {
542
+ "epoch": 3.098684210526316,
543
+ "grad_norm": 16.95366859436035,
544
+ "learning_rate": 1.0526315789473684e-05,
545
+ "loss": 0.2155,
546
  "step": 720
547
  },
548
  {
549
+ "epoch": 3.1052631578947367,
550
+ "grad_norm": 20.484813690185547,
551
+ "learning_rate": 1.0394736842105264e-05,
552
+ "loss": 0.2505,
553
  "step": 730
554
  },
555
  {
556
+ "epoch": 3.111842105263158,
557
+ "grad_norm": 31.08599853515625,
558
+ "learning_rate": 1.0263157894736844e-05,
559
+ "loss": 0.4975,
560
  "step": 740
561
  },
562
  {
563
+ "epoch": 3.1184210526315788,
564
+ "grad_norm": 22.264848709106445,
565
+ "learning_rate": 1.0131578947368421e-05,
566
+ "loss": 0.2602,
567
  "step": 750
568
  },
569
  {
570
+ "epoch": 3.125,
571
+ "grad_norm": 20.796369552612305,
572
+ "learning_rate": 1e-05,
573
+ "loss": 0.2239,
574
  "step": 760
575
  },
576
  {
577
+ "epoch": 3.125,
578
+ "eval_accuracy": 0.9125,
579
+ "eval_f1": 0.912445278298937,
580
+ "eval_loss": 0.3610968291759491,
581
+ "eval_precision": 0.9135338345864662,
582
+ "eval_runtime": 68.362,
583
+ "eval_samples_per_second": 2.34,
584
+ "eval_steps_per_second": 0.293,
585
  "step": 760
586
  },
587
  {
588
+ "epoch": 4.006578947368421,
589
+ "grad_norm": 0.06610066443681717,
590
+ "learning_rate": 9.868421052631579e-06,
591
+ "loss": 0.2882,
592
  "step": 770
593
  },
594
  {
595
+ "epoch": 4.0131578947368425,
596
+ "grad_norm": 3.3229968547821045,
597
+ "learning_rate": 9.736842105263159e-06,
598
+ "loss": 0.148,
599
  "step": 780
600
  },
601
  {
602
+ "epoch": 4.019736842105263,
603
+ "grad_norm": 18.369155883789062,
604
+ "learning_rate": 9.605263157894737e-06,
605
+ "loss": 0.2034,
606
  "step": 790
607
  },
608
  {
609
+ "epoch": 4.026315789473684,
610
+ "grad_norm": 0.4686603844165802,
611
+ "learning_rate": 9.473684210526315e-06,
612
+ "loss": 0.1949,
613
  "step": 800
614
  },
615
  {
616
+ "epoch": 4.032894736842105,
617
+ "grad_norm": 0.3468804359436035,
618
+ "learning_rate": 9.342105263157895e-06,
619
+ "loss": 0.1468,
620
  "step": 810
621
  },
622
  {
623
+ "epoch": 4.0394736842105265,
624
+ "grad_norm": 5.2816243171691895,
625
+ "learning_rate": 9.210526315789474e-06,
626
+ "loss": 0.1253,
627
  "step": 820
628
  },
629
  {
630
+ "epoch": 4.046052631578948,
631
+ "grad_norm": 3.5091466903686523,
632
+ "learning_rate": 9.078947368421054e-06,
633
+ "loss": 0.0314,
634
  "step": 830
635
  },
636
  {
637
+ "epoch": 4.052631578947368,
638
+ "grad_norm": 38.08211135864258,
639
+ "learning_rate": 8.947368421052632e-06,
640
+ "loss": 0.3176,
641
  "step": 840
642
  },
643
  {
644
+ "epoch": 4.059210526315789,
645
+ "grad_norm": 0.1611049771308899,
646
+ "learning_rate": 8.81578947368421e-06,
647
+ "loss": 0.1931,
648
  "step": 850
649
  },
650
  {
651
+ "epoch": 4.065789473684211,
652
+ "grad_norm": 0.5490935444831848,
653
+ "learning_rate": 8.68421052631579e-06,
654
+ "loss": 0.4194,
655
  "step": 860
656
  },
657
  {
658
+ "epoch": 4.072368421052632,
659
+ "grad_norm": 4.411422252655029,
660
+ "learning_rate": 8.552631578947368e-06,
661
+ "loss": 0.1544,
662
  "step": 870
663
  },
664
  {
665
+ "epoch": 4.078947368421052,
666
+ "grad_norm": 0.4949572682380676,
667
+ "learning_rate": 8.421052631578948e-06,
668
+ "loss": 0.3998,
669
  "step": 880
670
  },
671
  {
672
+ "epoch": 4.0855263157894735,
673
+ "grad_norm": 30.750051498413086,
674
+ "learning_rate": 8.289473684210526e-06,
675
+ "loss": 0.1656,
676
  "step": 890
677
  },
678
  {
679
+ "epoch": 4.092105263157895,
680
+ "grad_norm": 16.065580368041992,
681
+ "learning_rate": 8.157894736842106e-06,
682
+ "loss": 0.0652,
683
  "step": 900
684
  },
685
  {
686
+ "epoch": 4.098684210526316,
687
+ "grad_norm": 21.529672622680664,
688
+ "learning_rate": 8.026315789473685e-06,
689
+ "loss": 0.1112,
690
  "step": 910
691
  },
692
  {
693
+ "epoch": 4.105263157894737,
694
+ "grad_norm": 1.2219691276550293,
695
+ "learning_rate": 7.894736842105265e-06,
696
+ "loss": 0.2536,
697
  "step": 920
698
  },
699
  {
700
+ "epoch": 4.1118421052631575,
701
+ "grad_norm": 1.4607069492340088,
702
+ "learning_rate": 7.763157894736843e-06,
703
+ "loss": 0.0822,
704
  "step": 930
705
  },
706
  {
707
+ "epoch": 4.118421052631579,
708
+ "grad_norm": 35.09339904785156,
709
+ "learning_rate": 7.631578947368423e-06,
710
+ "loss": 0.3982,
711
  "step": 940
712
  },
713
  {
714
+ "epoch": 4.125,
715
+ "grad_norm": 4.976933479309082,
716
+ "learning_rate": 7.500000000000001e-06,
717
+ "loss": 0.1747,
718
  "step": 950
719
  },
720
  {
721
+ "epoch": 4.125,
722
+ "eval_accuracy": 0.93125,
723
+ "eval_f1": 0.931247314348217,
724
+ "eval_loss": 0.34753698110580444,
725
+ "eval_precision": 0.9313173933427098,
726
+ "eval_runtime": 73.1814,
727
+ "eval_samples_per_second": 2.186,
728
+ "eval_steps_per_second": 0.273,
729
  "step": 950
730
  },
731
  {
732
+ "epoch": 5.006578947368421,
733
+ "grad_norm": 0.45159298181533813,
734
+ "learning_rate": 7.368421052631579e-06,
735
+ "loss": 0.1255,
736
  "step": 960
737
  },
738
  {
739
+ "epoch": 5.0131578947368425,
740
+ "grad_norm": 29.889768600463867,
741
+ "learning_rate": 7.236842105263158e-06,
742
+ "loss": 0.1549,
743
  "step": 970
744
  },
745
  {
746
+ "epoch": 5.019736842105263,
747
+ "grad_norm": 6.2291412353515625,
748
+ "learning_rate": 7.1052631578947375e-06,
749
+ "loss": 0.1345,
750
  "step": 980
751
  },
752
  {
753
+ "epoch": 5.026315789473684,
754
+ "grad_norm": 17.853551864624023,
755
+ "learning_rate": 6.973684210526316e-06,
756
+ "loss": 0.0786,
757
  "step": 990
758
  },
759
  {
760
+ "epoch": 5.032894736842105,
761
+ "grad_norm": 0.2519562244415283,
762
+ "learning_rate": 6.842105263157896e-06,
763
+ "loss": 0.1226,
764
  "step": 1000
765
  },
766
  {
767
+ "epoch": 5.0394736842105265,
768
+ "grad_norm": 17.962093353271484,
769
+ "learning_rate": 6.710526315789474e-06,
770
+ "loss": 0.2208,
771
  "step": 1010
772
  },
773
  {
774
+ "epoch": 5.046052631578948,
775
+ "grad_norm": 0.25263816118240356,
776
+ "learning_rate": 6.578947368421054e-06,
777
+ "loss": 0.0662,
778
  "step": 1020
779
  },
780
  {
781
+ "epoch": 5.052631578947368,
782
+ "grad_norm": 0.3767828345298767,
783
+ "learning_rate": 6.447368421052632e-06,
784
+ "loss": 0.1212,
785
  "step": 1030
786
  },
787
  {
788
+ "epoch": 5.059210526315789,
789
+ "grad_norm": 31.959402084350586,
790
+ "learning_rate": 6.31578947368421e-06,
791
+ "loss": 0.4038,
792
  "step": 1040
793
  },
794
  {
795
+ "epoch": 5.065789473684211,
796
+ "grad_norm": 28.180463790893555,
797
+ "learning_rate": 6.18421052631579e-06,
798
+ "loss": 0.1677,
799
  "step": 1050
800
  },
801
  {
802
+ "epoch": 5.072368421052632,
803
+ "grad_norm": 0.06592092663049698,
804
+ "learning_rate": 6.0526315789473685e-06,
805
+ "loss": 0.1123,
806
  "step": 1060
807
  },
808
  {
809
+ "epoch": 5.078947368421052,
810
+ "grad_norm": 19.177854537963867,
811
+ "learning_rate": 5.921052631578948e-06,
812
+ "loss": 0.4745,
813
  "step": 1070
814
  },
815
  {
816
+ "epoch": 5.0855263157894735,
817
+ "grad_norm": 39.3027458190918,
818
+ "learning_rate": 5.789473684210527e-06,
819
+ "loss": 0.2614,
820
  "step": 1080
821
  },
822
  {
823
+ "epoch": 5.092105263157895,
824
+ "grad_norm": 12.32589054107666,
825
+ "learning_rate": 5.657894736842106e-06,
826
+ "loss": 0.0943,
827
  "step": 1090
828
  },
829
  {
830
+ "epoch": 5.098684210526316,
831
+ "grad_norm": 17.65743064880371,
832
+ "learning_rate": 5.526315789473685e-06,
833
+ "loss": 0.1302,
834
  "step": 1100
835
  },
836
  {
837
+ "epoch": 5.105263157894737,
838
+ "grad_norm": 0.037535008043050766,
839
+ "learning_rate": 5.394736842105264e-06,
840
+ "loss": 0.1401,
841
  "step": 1110
842
  },
843
  {
844
+ "epoch": 5.1118421052631575,
845
+ "grad_norm": 1.1971192359924316,
846
+ "learning_rate": 5.263157894736842e-06,
847
+ "loss": 0.1045,
848
  "step": 1120
849
  },
850
  {
851
+ "epoch": 5.118421052631579,
852
+ "grad_norm": 0.809428334236145,
853
+ "learning_rate": 5.131578947368422e-06,
854
+ "loss": 0.2402,
855
  "step": 1130
856
  },
857
  {
858
+ "epoch": 5.125,
859
+ "grad_norm": 0.5052193999290466,
860
+ "learning_rate": 5e-06,
861
+ "loss": 0.1702,
862
  "step": 1140
863
  },
864
  {
865
+ "epoch": 5.125,
866
+ "eval_accuracy": 0.93125,
867
+ "eval_f1": 0.931247314348217,
868
+ "eval_loss": 0.36672312021255493,
869
+ "eval_precision": 0.9313173933427098,
870
+ "eval_runtime": 73.77,
871
+ "eval_samples_per_second": 2.169,
872
+ "eval_steps_per_second": 0.271,
873
  "step": 1140
874
  },
875
  {
876
+ "epoch": 5.125,
877
+ "step": 1140,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878
  "total_flos": 0.0,
879
+ "train_loss": 0.43041260364024264,
880
+ "train_runtime": 5887.2517,
881
+ "train_samples_per_second": 2.065,
882
+ "train_steps_per_second": 0.258
 
 
 
 
 
 
 
 
 
883
  }
884
  ],
885
  "logging_steps": 10,
886
+ "max_steps": 1520,
887
  "num_input_tokens_seen": 0,
888
  "num_train_epochs": 9223372036854775807,
889
  "save_steps": 500,
890
  "stateful_callbacks": {
891
+ "EarlyStoppingCallback": {
892
+ "args": {
893
+ "early_stopping_patience": 3,
894
+ "early_stopping_threshold": 0.01
895
+ },
896
+ "attributes": {
897
+ "early_stopping_patience_counter": 0
898
+ }
899
+ },
900
  "TrainerControl": {
901
  "args": {
902
  "should_epoch_stop": false,