dchen0 commited on
Commit
496d5c8
·
verified ·
1 Parent(s): 4a37e4e

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -2509
trainer_state.json DELETED
@@ -1,2509 +0,0 @@
1
- {
2
- "best_global_step": 2750,
3
- "best_metric": 0.8479926448053938,
4
- "best_model_checkpoint": "dinov2-fonts-with-subfonts-2/checkpoint-2750",
5
- "epoch": 1.0,
6
- "eval_steps": 50,
7
- "global_step": 2752,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.003633720930232558,
14
- "grad_norm": 15.892566680908203,
15
- "learning_rate": 9.967296511627907e-05,
16
- "loss": 2.5094,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.007267441860465116,
21
- "grad_norm": 13.693017959594727,
22
- "learning_rate": 9.930959302325582e-05,
23
- "loss": 2.4643,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.010901162790697675,
28
- "grad_norm": 16.248794555664062,
29
- "learning_rate": 9.894622093023256e-05,
30
- "loss": 2.4288,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.014534883720930232,
35
- "grad_norm": 14.924266815185547,
36
- "learning_rate": 9.85828488372093e-05,
37
- "loss": 2.1981,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.018168604651162792,
42
- "grad_norm": 17.095857620239258,
43
- "learning_rate": 9.821947674418606e-05,
44
- "loss": 2.3055,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.018168604651162792,
49
- "eval_accuracy": 0.28256205945448976,
50
- "eval_loss": 2.221597671508789,
51
- "eval_model_preparation_time": 0.0015,
52
- "eval_runtime": 45.2864,
53
- "eval_samples_per_second": 72.053,
54
- "eval_steps_per_second": 2.252,
55
- "step": 50
56
- },
57
- {
58
- "epoch": 0.02180232558139535,
59
- "grad_norm": 15.624826431274414,
60
- "learning_rate": 9.78561046511628e-05,
61
- "loss": 2.2321,
62
- "step": 60
63
- },
64
- {
65
- "epoch": 0.025436046511627907,
66
- "grad_norm": 15.980948448181152,
67
- "learning_rate": 9.749273255813955e-05,
68
- "loss": 2.1783,
69
- "step": 70
70
- },
71
- {
72
- "epoch": 0.029069767441860465,
73
- "grad_norm": 16.875125885009766,
74
- "learning_rate": 9.712936046511628e-05,
75
- "loss": 2.1456,
76
- "step": 80
77
- },
78
- {
79
- "epoch": 0.032703488372093026,
80
- "grad_norm": 14.524847030639648,
81
- "learning_rate": 9.676598837209303e-05,
82
- "loss": 2.0414,
83
- "step": 90
84
- },
85
- {
86
- "epoch": 0.036337209302325583,
87
- "grad_norm": 19.023685455322266,
88
- "learning_rate": 9.640261627906977e-05,
89
- "loss": 2.0327,
90
- "step": 100
91
- },
92
- {
93
- "epoch": 0.036337209302325583,
94
- "eval_accuracy": 0.3444682807232608,
95
- "eval_loss": 2.0124428272247314,
96
- "eval_model_preparation_time": 0.0015,
97
- "eval_runtime": 55.6946,
98
- "eval_samples_per_second": 58.587,
99
- "eval_steps_per_second": 1.831,
100
- "step": 100
101
- },
102
- {
103
- "epoch": 0.03997093023255814,
104
- "grad_norm": 18.07843589782715,
105
- "learning_rate": 9.603924418604651e-05,
106
- "loss": 1.9096,
107
- "step": 110
108
- },
109
- {
110
- "epoch": 0.0436046511627907,
111
- "grad_norm": 15.953253746032715,
112
- "learning_rate": 9.567587209302325e-05,
113
- "loss": 1.9289,
114
- "step": 120
115
- },
116
- {
117
- "epoch": 0.047238372093023256,
118
- "grad_norm": 16.393022537231445,
119
- "learning_rate": 9.53125e-05,
120
- "loss": 1.898,
121
- "step": 130
122
- },
123
- {
124
- "epoch": 0.050872093023255814,
125
- "grad_norm": 15.503422737121582,
126
- "learning_rate": 9.494912790697676e-05,
127
- "loss": 1.9764,
128
- "step": 140
129
- },
130
- {
131
- "epoch": 0.05450581395348837,
132
- "grad_norm": 17.98733139038086,
133
- "learning_rate": 9.458575581395349e-05,
134
- "loss": 1.8097,
135
- "step": 150
136
- },
137
- {
138
- "epoch": 0.05450581395348837,
139
- "eval_accuracy": 0.3781795893349678,
140
- "eval_loss": 1.8244749307632446,
141
- "eval_model_preparation_time": 0.0015,
142
- "eval_runtime": 50.7499,
143
- "eval_samples_per_second": 64.296,
144
- "eval_steps_per_second": 2.01,
145
- "step": 150
146
- },
147
- {
148
- "epoch": 0.05813953488372093,
149
- "grad_norm": 22.832233428955078,
150
- "learning_rate": 9.422238372093024e-05,
151
- "loss": 1.8411,
152
- "step": 160
153
- },
154
- {
155
- "epoch": 0.06177325581395349,
156
- "grad_norm": 17.715545654296875,
157
- "learning_rate": 9.385901162790698e-05,
158
- "loss": 1.7486,
159
- "step": 170
160
- },
161
- {
162
- "epoch": 0.06540697674418605,
163
- "grad_norm": 18.01825714111328,
164
- "learning_rate": 9.349563953488372e-05,
165
- "loss": 1.7325,
166
- "step": 180
167
- },
168
- {
169
- "epoch": 0.0690406976744186,
170
- "grad_norm": 17.555883407592773,
171
- "learning_rate": 9.313226744186046e-05,
172
- "loss": 1.772,
173
- "step": 190
174
- },
175
- {
176
- "epoch": 0.07267441860465117,
177
- "grad_norm": 14.410489082336426,
178
- "learning_rate": 9.276889534883722e-05,
179
- "loss": 1.7379,
180
- "step": 200
181
- },
182
- {
183
- "epoch": 0.07267441860465117,
184
- "eval_accuracy": 0.4112779650628256,
185
- "eval_loss": 1.7361937761306763,
186
- "eval_model_preparation_time": 0.0015,
187
- "eval_runtime": 47.8089,
188
- "eval_samples_per_second": 68.251,
189
- "eval_steps_per_second": 2.133,
190
- "step": 200
191
- },
192
- {
193
- "epoch": 0.07630813953488372,
194
- "grad_norm": 17.80829429626465,
195
- "learning_rate": 9.240552325581396e-05,
196
- "loss": 1.7533,
197
- "step": 210
198
- },
199
- {
200
- "epoch": 0.07994186046511628,
201
- "grad_norm": 19.89047622680664,
202
- "learning_rate": 9.20421511627907e-05,
203
- "loss": 1.7556,
204
- "step": 220
205
- },
206
- {
207
- "epoch": 0.08357558139534883,
208
- "grad_norm": 17.062580108642578,
209
- "learning_rate": 9.167877906976745e-05,
210
- "loss": 1.6221,
211
- "step": 230
212
- },
213
- {
214
- "epoch": 0.0872093023255814,
215
- "grad_norm": 20.477750778198242,
216
- "learning_rate": 9.131540697674419e-05,
217
- "loss": 1.6176,
218
- "step": 240
219
- },
220
- {
221
- "epoch": 0.09084302325581395,
222
- "grad_norm": 19.113245010375977,
223
- "learning_rate": 9.095203488372093e-05,
224
- "loss": 1.7862,
225
- "step": 250
226
- },
227
- {
228
- "epoch": 0.09084302325581395,
229
- "eval_accuracy": 0.4511186025130248,
230
- "eval_loss": 1.5864135026931763,
231
- "eval_model_preparation_time": 0.0015,
232
- "eval_runtime": 47.5524,
233
- "eval_samples_per_second": 68.619,
234
- "eval_steps_per_second": 2.145,
235
- "step": 250
236
- },
237
- {
238
- "epoch": 0.09447674418604651,
239
- "grad_norm": 17.91750144958496,
240
- "learning_rate": 9.058866279069767e-05,
241
- "loss": 1.5788,
242
- "step": 260
243
- },
244
- {
245
- "epoch": 0.09811046511627906,
246
- "grad_norm": 19.93201446533203,
247
- "learning_rate": 9.022529069767443e-05,
248
- "loss": 1.6572,
249
- "step": 270
250
- },
251
- {
252
- "epoch": 0.10174418604651163,
253
- "grad_norm": 14.712456703186035,
254
- "learning_rate": 8.986191860465117e-05,
255
- "loss": 1.6514,
256
- "step": 280
257
- },
258
- {
259
- "epoch": 0.10537790697674419,
260
- "grad_norm": 17.98506736755371,
261
- "learning_rate": 8.94985465116279e-05,
262
- "loss": 1.555,
263
- "step": 290
264
- },
265
- {
266
- "epoch": 0.10901162790697674,
267
- "grad_norm": 18.016740798950195,
268
- "learning_rate": 8.913517441860465e-05,
269
- "loss": 1.4337,
270
- "step": 300
271
- },
272
- {
273
- "epoch": 0.10901162790697674,
274
- "eval_accuracy": 0.4566349984676678,
275
- "eval_loss": 1.5329961776733398,
276
- "eval_model_preparation_time": 0.0015,
277
- "eval_runtime": 49.8886,
278
- "eval_samples_per_second": 65.406,
279
- "eval_steps_per_second": 2.045,
280
- "step": 300
281
- },
282
- {
283
- "epoch": 0.11264534883720931,
284
- "grad_norm": 18.44303321838379,
285
- "learning_rate": 8.87718023255814e-05,
286
- "loss": 1.6438,
287
- "step": 310
288
- },
289
- {
290
- "epoch": 0.11627906976744186,
291
- "grad_norm": 19.64215660095215,
292
- "learning_rate": 8.840843023255815e-05,
293
- "loss": 1.4625,
294
- "step": 320
295
- },
296
- {
297
- "epoch": 0.11991279069767442,
298
- "grad_norm": 23.59469985961914,
299
- "learning_rate": 8.804505813953488e-05,
300
- "loss": 1.5685,
301
- "step": 330
302
- },
303
- {
304
- "epoch": 0.12354651162790697,
305
- "grad_norm": 19.073352813720703,
306
- "learning_rate": 8.768168604651164e-05,
307
- "loss": 1.4693,
308
- "step": 340
309
- },
310
- {
311
- "epoch": 0.12718023255813954,
312
- "grad_norm": 16.183441162109375,
313
- "learning_rate": 8.731831395348838e-05,
314
- "loss": 1.4731,
315
- "step": 350
316
- },
317
- {
318
- "epoch": 0.12718023255813954,
319
- "eval_accuracy": 0.5099601593625498,
320
- "eval_loss": 1.4183763265609741,
321
- "eval_model_preparation_time": 0.0015,
322
- "eval_runtime": 47.7302,
323
- "eval_samples_per_second": 68.363,
324
- "eval_steps_per_second": 2.137,
325
- "step": 350
326
- },
327
- {
328
- "epoch": 0.1308139534883721,
329
- "grad_norm": 22.20075035095215,
330
- "learning_rate": 8.695494186046512e-05,
331
- "loss": 1.4671,
332
- "step": 360
333
- },
334
- {
335
- "epoch": 0.13444767441860464,
336
- "grad_norm": 19.052248001098633,
337
- "learning_rate": 8.659156976744186e-05,
338
- "loss": 1.4519,
339
- "step": 370
340
- },
341
- {
342
- "epoch": 0.1380813953488372,
343
- "grad_norm": 17.498851776123047,
344
- "learning_rate": 8.622819767441861e-05,
345
- "loss": 1.4895,
346
- "step": 380
347
- },
348
- {
349
- "epoch": 0.14171511627906977,
350
- "grad_norm": 17.81024932861328,
351
- "learning_rate": 8.586482558139536e-05,
352
- "loss": 1.5058,
353
- "step": 390
354
- },
355
- {
356
- "epoch": 0.14534883720930233,
357
- "grad_norm": 22.155494689941406,
358
- "learning_rate": 8.550145348837209e-05,
359
- "loss": 1.4192,
360
- "step": 400
361
- },
362
- {
363
- "epoch": 0.14534883720930233,
364
- "eval_accuracy": 0.5163959546429666,
365
- "eval_loss": 1.3699551820755005,
366
- "eval_model_preparation_time": 0.0015,
367
- "eval_runtime": 47.8531,
368
- "eval_samples_per_second": 68.188,
369
- "eval_steps_per_second": 2.132,
370
- "step": 400
371
- },
372
- {
373
- "epoch": 0.14898255813953487,
374
- "grad_norm": 17.518573760986328,
375
- "learning_rate": 8.513808139534885e-05,
376
- "loss": 1.3937,
377
- "step": 410
378
- },
379
- {
380
- "epoch": 0.15261627906976744,
381
- "grad_norm": 18.192092895507812,
382
- "learning_rate": 8.477470930232559e-05,
383
- "loss": 1.3298,
384
- "step": 420
385
- },
386
- {
387
- "epoch": 0.15625,
388
- "grad_norm": 19.992328643798828,
389
- "learning_rate": 8.441133720930233e-05,
390
- "loss": 1.3802,
391
- "step": 430
392
- },
393
- {
394
- "epoch": 0.15988372093023256,
395
- "grad_norm": 27.304393768310547,
396
- "learning_rate": 8.404796511627907e-05,
397
- "loss": 1.3234,
398
- "step": 440
399
- },
400
- {
401
- "epoch": 0.16351744186046513,
402
- "grad_norm": 24.619159698486328,
403
- "learning_rate": 8.368459302325582e-05,
404
- "loss": 1.415,
405
- "step": 450
406
- },
407
- {
408
- "epoch": 0.16351744186046513,
409
- "eval_accuracy": 0.5262028807845541,
410
- "eval_loss": 1.3008671998977661,
411
- "eval_model_preparation_time": 0.0015,
412
- "eval_runtime": 48.1216,
413
- "eval_samples_per_second": 67.807,
414
- "eval_steps_per_second": 2.12,
415
- "step": 450
416
- },
417
- {
418
- "epoch": 0.16715116279069767,
419
- "grad_norm": 19.95876121520996,
420
- "learning_rate": 8.332122093023256e-05,
421
- "loss": 1.3458,
422
- "step": 460
423
- },
424
- {
425
- "epoch": 0.17078488372093023,
426
- "grad_norm": 18.42659568786621,
427
- "learning_rate": 8.29578488372093e-05,
428
- "loss": 1.4143,
429
- "step": 470
430
- },
431
- {
432
- "epoch": 0.1744186046511628,
433
- "grad_norm": 19.090171813964844,
434
- "learning_rate": 8.259447674418606e-05,
435
- "loss": 1.3685,
436
- "step": 480
437
- },
438
- {
439
- "epoch": 0.17805232558139536,
440
- "grad_norm": 23.138212203979492,
441
- "learning_rate": 8.22311046511628e-05,
442
- "loss": 1.3492,
443
- "step": 490
444
- },
445
- {
446
- "epoch": 0.1816860465116279,
447
- "grad_norm": 24.334434509277344,
448
- "learning_rate": 8.186773255813954e-05,
449
- "loss": 1.4102,
450
- "step": 500
451
- },
452
- {
453
- "epoch": 0.1816860465116279,
454
- "eval_accuracy": 0.5455102666258045,
455
- "eval_loss": 1.2436403036117554,
456
- "eval_model_preparation_time": 0.0015,
457
- "eval_runtime": 48.5657,
458
- "eval_samples_per_second": 67.187,
459
- "eval_steps_per_second": 2.1,
460
- "step": 500
461
- },
462
- {
463
- "epoch": 0.18531976744186046,
464
- "grad_norm": 20.45575714111328,
465
- "learning_rate": 8.150436046511628e-05,
466
- "loss": 1.2621,
467
- "step": 510
468
- },
469
- {
470
- "epoch": 0.18895348837209303,
471
- "grad_norm": 15.790328979492188,
472
- "learning_rate": 8.114098837209303e-05,
473
- "loss": 1.2724,
474
- "step": 520
475
- },
476
- {
477
- "epoch": 0.1925872093023256,
478
- "grad_norm": 21.53169059753418,
479
- "learning_rate": 8.077761627906977e-05,
480
- "loss": 1.3039,
481
- "step": 530
482
- },
483
- {
484
- "epoch": 0.19622093023255813,
485
- "grad_norm": 17.059200286865234,
486
- "learning_rate": 8.041424418604651e-05,
487
- "loss": 1.2784,
488
- "step": 540
489
- },
490
- {
491
- "epoch": 0.1998546511627907,
492
- "grad_norm": 19.423311233520508,
493
- "learning_rate": 8.005087209302325e-05,
494
- "loss": 1.2175,
495
- "step": 550
496
- },
497
- {
498
- "epoch": 0.1998546511627907,
499
- "eval_accuracy": 0.5675758504443763,
500
- "eval_loss": 1.1887367963790894,
501
- "eval_model_preparation_time": 0.0015,
502
- "eval_runtime": 48.5535,
503
- "eval_samples_per_second": 67.204,
504
- "eval_steps_per_second": 2.101,
505
- "step": 550
506
- },
507
- {
508
- "epoch": 0.20348837209302326,
509
- "grad_norm": 21.524168014526367,
510
- "learning_rate": 7.96875e-05,
511
- "loss": 1.1391,
512
- "step": 560
513
- },
514
- {
515
- "epoch": 0.20712209302325582,
516
- "grad_norm": 15.95803165435791,
517
- "learning_rate": 7.932412790697675e-05,
518
- "loss": 1.1714,
519
- "step": 570
520
- },
521
- {
522
- "epoch": 0.21075581395348839,
523
- "grad_norm": 17.44819450378418,
524
- "learning_rate": 7.896075581395349e-05,
525
- "loss": 1.2085,
526
- "step": 580
527
- },
528
- {
529
- "epoch": 0.21438953488372092,
530
- "grad_norm": 20.14234161376953,
531
- "learning_rate": 7.859738372093024e-05,
532
- "loss": 1.1547,
533
- "step": 590
534
- },
535
- {
536
- "epoch": 0.2180232558139535,
537
- "grad_norm": 22.286245346069336,
538
- "learning_rate": 7.823401162790698e-05,
539
- "loss": 1.1583,
540
- "step": 600
541
- },
542
- {
543
- "epoch": 0.2180232558139535,
544
- "eval_accuracy": 0.5770763101440393,
545
- "eval_loss": 1.1267926692962646,
546
- "eval_model_preparation_time": 0.0015,
547
- "eval_runtime": 47.6542,
548
- "eval_samples_per_second": 68.472,
549
- "eval_steps_per_second": 2.14,
550
- "step": 600
551
- },
552
- {
553
- "epoch": 0.22165697674418605,
554
- "grad_norm": 24.456897735595703,
555
- "learning_rate": 7.787063953488372e-05,
556
- "loss": 1.1663,
557
- "step": 610
558
- },
559
- {
560
- "epoch": 0.22529069767441862,
561
- "grad_norm": 19.741336822509766,
562
- "learning_rate": 7.750726744186046e-05,
563
- "loss": 1.1761,
564
- "step": 620
565
- },
566
- {
567
- "epoch": 0.22892441860465115,
568
- "grad_norm": 19.847604751586914,
569
- "learning_rate": 7.714389534883722e-05,
570
- "loss": 1.1423,
571
- "step": 630
572
- },
573
- {
574
- "epoch": 0.23255813953488372,
575
- "grad_norm": 16.832901000976562,
576
- "learning_rate": 7.678052325581396e-05,
577
- "loss": 1.0611,
578
- "step": 640
579
- },
580
- {
581
- "epoch": 0.23619186046511628,
582
- "grad_norm": 19.98552703857422,
583
- "learning_rate": 7.64171511627907e-05,
584
- "loss": 1.1858,
585
- "step": 650
586
- },
587
- {
588
- "epoch": 0.23619186046511628,
589
- "eval_accuracy": 0.6196751455715599,
590
- "eval_loss": 1.0860530138015747,
591
- "eval_model_preparation_time": 0.0015,
592
- "eval_runtime": 50.2713,
593
- "eval_samples_per_second": 64.908,
594
- "eval_steps_per_second": 2.029,
595
- "step": 650
596
- },
597
- {
598
- "epoch": 0.23982558139534885,
599
- "grad_norm": 22.90323257446289,
600
- "learning_rate": 7.605377906976745e-05,
601
- "loss": 1.2555,
602
- "step": 660
603
- },
604
- {
605
- "epoch": 0.24345930232558138,
606
- "grad_norm": 18.252866744995117,
607
- "learning_rate": 7.569040697674419e-05,
608
- "loss": 1.1472,
609
- "step": 670
610
- },
611
- {
612
- "epoch": 0.24709302325581395,
613
- "grad_norm": 29.458932876586914,
614
- "learning_rate": 7.532703488372093e-05,
615
- "loss": 1.2043,
616
- "step": 680
617
- },
618
- {
619
- "epoch": 0.2507267441860465,
620
- "grad_norm": 26.31637954711914,
621
- "learning_rate": 7.496366279069767e-05,
622
- "loss": 1.1212,
623
- "step": 690
624
- },
625
- {
626
- "epoch": 0.2543604651162791,
627
- "grad_norm": 19.821483612060547,
628
- "learning_rate": 7.460029069767443e-05,
629
- "loss": 1.0821,
630
- "step": 700
631
- },
632
- {
633
- "epoch": 0.2543604651162791,
634
- "eval_accuracy": 0.622739809990806,
635
- "eval_loss": 1.0653626918792725,
636
- "eval_model_preparation_time": 0.0015,
637
- "eval_runtime": 47.867,
638
- "eval_samples_per_second": 68.168,
639
- "eval_steps_per_second": 2.131,
640
- "step": 700
641
- },
642
- {
643
- "epoch": 0.25799418604651164,
644
- "grad_norm": 20.264490127563477,
645
- "learning_rate": 7.423691860465117e-05,
646
- "loss": 1.0817,
647
- "step": 710
648
- },
649
- {
650
- "epoch": 0.2616279069767442,
651
- "grad_norm": 16.901201248168945,
652
- "learning_rate": 7.38735465116279e-05,
653
- "loss": 1.0836,
654
- "step": 720
655
- },
656
- {
657
- "epoch": 0.26526162790697677,
658
- "grad_norm": 21.42996597290039,
659
- "learning_rate": 7.351017441860465e-05,
660
- "loss": 1.0995,
661
- "step": 730
662
- },
663
- {
664
- "epoch": 0.2688953488372093,
665
- "grad_norm": 27.091205596923828,
666
- "learning_rate": 7.31468023255814e-05,
667
- "loss": 1.2028,
668
- "step": 740
669
- },
670
- {
671
- "epoch": 0.27252906976744184,
672
- "grad_norm": 26.723642349243164,
673
- "learning_rate": 7.278343023255814e-05,
674
- "loss": 1.1347,
675
- "step": 750
676
- },
677
- {
678
- "epoch": 0.27252906976744184,
679
- "eval_accuracy": 0.6356114005516396,
680
- "eval_loss": 1.023024082183838,
681
- "eval_model_preparation_time": 0.0015,
682
- "eval_runtime": 48.1088,
683
- "eval_samples_per_second": 67.825,
684
- "eval_steps_per_second": 2.12,
685
- "step": 750
686
- },
687
- {
688
- "epoch": 0.2761627906976744,
689
- "grad_norm": 15.63192367553711,
690
- "learning_rate": 7.242005813953488e-05,
691
- "loss": 1.1222,
692
- "step": 760
693
- },
694
- {
695
- "epoch": 0.279796511627907,
696
- "grad_norm": 16.611923217773438,
697
- "learning_rate": 7.205668604651164e-05,
698
- "loss": 1.1061,
699
- "step": 770
700
- },
701
- {
702
- "epoch": 0.28343023255813954,
703
- "grad_norm": 21.829530715942383,
704
- "learning_rate": 7.169331395348838e-05,
705
- "loss": 1.0047,
706
- "step": 780
707
- },
708
- {
709
- "epoch": 0.2870639534883721,
710
- "grad_norm": 22.85479736328125,
711
- "learning_rate": 7.132994186046512e-05,
712
- "loss": 1.1656,
713
- "step": 790
714
- },
715
- {
716
- "epoch": 0.29069767441860467,
717
- "grad_norm": 24.8774356842041,
718
- "learning_rate": 7.096656976744186e-05,
719
- "loss": 1.0365,
720
- "step": 800
721
- },
722
- {
723
- "epoch": 0.29069767441860467,
724
- "eval_accuracy": 0.6472571253447748,
725
- "eval_loss": 0.9910984635353088,
726
- "eval_model_preparation_time": 0.0015,
727
- "eval_runtime": 49.0197,
728
- "eval_samples_per_second": 66.565,
729
- "eval_steps_per_second": 2.081,
730
- "step": 800
731
- },
732
- {
733
- "epoch": 0.29433139534883723,
734
- "grad_norm": 24.75929069519043,
735
- "learning_rate": 7.060319767441861e-05,
736
- "loss": 1.0438,
737
- "step": 810
738
- },
739
- {
740
- "epoch": 0.29796511627906974,
741
- "grad_norm": 28.923444747924805,
742
- "learning_rate": 7.023982558139535e-05,
743
- "loss": 1.0488,
744
- "step": 820
745
- },
746
- {
747
- "epoch": 0.3015988372093023,
748
- "grad_norm": 18.869667053222656,
749
- "learning_rate": 6.987645348837209e-05,
750
- "loss": 1.0892,
751
- "step": 830
752
- },
753
- {
754
- "epoch": 0.30523255813953487,
755
- "grad_norm": 18.0045108795166,
756
- "learning_rate": 6.951308139534885e-05,
757
- "loss": 1.0249,
758
- "step": 840
759
- },
760
- {
761
- "epoch": 0.30886627906976744,
762
- "grad_norm": 18.63671875,
763
- "learning_rate": 6.914970930232559e-05,
764
- "loss": 0.9398,
765
- "step": 850
766
- },
767
- {
768
- "epoch": 0.30886627906976744,
769
- "eval_accuracy": 0.6420471958320564,
770
- "eval_loss": 0.9642446041107178,
771
- "eval_model_preparation_time": 0.0015,
772
- "eval_runtime": 47.8675,
773
- "eval_samples_per_second": 68.167,
774
- "eval_steps_per_second": 2.131,
775
- "step": 850
776
- },
777
- {
778
- "epoch": 0.3125,
779
- "grad_norm": 22.47454261779785,
780
- "learning_rate": 6.878633720930233e-05,
781
- "loss": 1.0515,
782
- "step": 860
783
- },
784
- {
785
- "epoch": 0.31613372093023256,
786
- "grad_norm": 21.17203712463379,
787
- "learning_rate": 6.842296511627907e-05,
788
- "loss": 0.973,
789
- "step": 870
790
- },
791
- {
792
- "epoch": 0.31976744186046513,
793
- "grad_norm": 20.848777770996094,
794
- "learning_rate": 6.805959302325582e-05,
795
- "loss": 0.9193,
796
- "step": 880
797
- },
798
- {
799
- "epoch": 0.3234011627906977,
800
- "grad_norm": 25.05883026123047,
801
- "learning_rate": 6.769622093023256e-05,
802
- "loss": 0.9351,
803
- "step": 890
804
- },
805
- {
806
- "epoch": 0.32703488372093026,
807
- "grad_norm": 22.536279678344727,
808
- "learning_rate": 6.73328488372093e-05,
809
- "loss": 1.0203,
810
- "step": 900
811
- },
812
- {
813
- "epoch": 0.32703488372093026,
814
- "eval_accuracy": 0.6736132393502912,
815
- "eval_loss": 0.9405654072761536,
816
- "eval_model_preparation_time": 0.0015,
817
- "eval_runtime": 49.3771,
818
- "eval_samples_per_second": 66.083,
819
- "eval_steps_per_second": 2.066,
820
- "step": 900
821
- },
822
- {
823
- "epoch": 0.33066860465116277,
824
- "grad_norm": 14.16507339477539,
825
- "learning_rate": 6.696947674418606e-05,
826
- "loss": 1.0585,
827
- "step": 910
828
- },
829
- {
830
- "epoch": 0.33430232558139533,
831
- "grad_norm": 19.16563606262207,
832
- "learning_rate": 6.66061046511628e-05,
833
- "loss": 0.9585,
834
- "step": 920
835
- },
836
- {
837
- "epoch": 0.3379360465116279,
838
- "grad_norm": 26.094350814819336,
839
- "learning_rate": 6.624273255813954e-05,
840
- "loss": 1.0055,
841
- "step": 930
842
- },
843
- {
844
- "epoch": 0.34156976744186046,
845
- "grad_norm": 18.31908416748047,
846
- "learning_rate": 6.587936046511628e-05,
847
- "loss": 0.9208,
848
- "step": 940
849
- },
850
- {
851
- "epoch": 0.345203488372093,
852
- "grad_norm": 31.46238136291504,
853
- "learning_rate": 6.551598837209303e-05,
854
- "loss": 1.0836,
855
- "step": 950
856
- },
857
- {
858
- "epoch": 0.345203488372093,
859
- "eval_accuracy": 0.6647257125344774,
860
- "eval_loss": 0.9281411170959473,
861
- "eval_model_preparation_time": 0.0015,
862
- "eval_runtime": 50.41,
863
- "eval_samples_per_second": 64.729,
864
- "eval_steps_per_second": 2.023,
865
- "step": 950
866
- },
867
- {
868
- "epoch": 0.3488372093023256,
869
- "grad_norm": 16.519235610961914,
870
- "learning_rate": 6.515261627906977e-05,
871
- "loss": 0.9886,
872
- "step": 960
873
- },
874
- {
875
- "epoch": 0.35247093023255816,
876
- "grad_norm": 21.729778289794922,
877
- "learning_rate": 6.478924418604651e-05,
878
- "loss": 1.0044,
879
- "step": 970
880
- },
881
- {
882
- "epoch": 0.3561046511627907,
883
- "grad_norm": 19.362590789794922,
884
- "learning_rate": 6.442587209302325e-05,
885
- "loss": 0.9829,
886
- "step": 980
887
- },
888
- {
889
- "epoch": 0.35973837209302323,
890
- "grad_norm": 19.202898025512695,
891
- "learning_rate": 6.40625e-05,
892
- "loss": 1.0145,
893
- "step": 990
894
- },
895
- {
896
- "epoch": 0.3633720930232558,
897
- "grad_norm": 17.671977996826172,
898
- "learning_rate": 6.369912790697675e-05,
899
- "loss": 0.9329,
900
- "step": 1000
901
- },
902
- {
903
- "epoch": 0.3633720930232558,
904
- "eval_accuracy": 0.6923076923076923,
905
- "eval_loss": 0.8783280849456787,
906
- "eval_model_preparation_time": 0.0015,
907
- "eval_runtime": 49.9909,
908
- "eval_samples_per_second": 65.272,
909
- "eval_steps_per_second": 2.04,
910
- "step": 1000
911
- },
912
- {
913
- "epoch": 0.36700581395348836,
914
- "grad_norm": 22.96895408630371,
915
- "learning_rate": 6.333575581395349e-05,
916
- "loss": 0.9242,
917
- "step": 1010
918
- },
919
- {
920
- "epoch": 0.3706395348837209,
921
- "grad_norm": 19.682958602905273,
922
- "learning_rate": 6.297238372093024e-05,
923
- "loss": 0.9181,
924
- "step": 1020
925
- },
926
- {
927
- "epoch": 0.3742732558139535,
928
- "grad_norm": 21.612184524536133,
929
- "learning_rate": 6.260901162790698e-05,
930
- "loss": 0.8911,
931
- "step": 1030
932
- },
933
- {
934
- "epoch": 0.37790697674418605,
935
- "grad_norm": 17.922510147094727,
936
- "learning_rate": 6.224563953488372e-05,
937
- "loss": 0.8801,
938
- "step": 1040
939
- },
940
- {
941
- "epoch": 0.3815406976744186,
942
- "grad_norm": 19.79427719116211,
943
- "learning_rate": 6.188226744186046e-05,
944
- "loss": 0.9331,
945
- "step": 1050
946
- },
947
- {
948
- "epoch": 0.3815406976744186,
949
- "eval_accuracy": 0.6972111553784861,
950
- "eval_loss": 0.8626508712768555,
951
- "eval_model_preparation_time": 0.0015,
952
- "eval_runtime": 48.7203,
953
- "eval_samples_per_second": 66.974,
954
- "eval_steps_per_second": 2.094,
955
- "step": 1050
956
- },
957
- {
958
- "epoch": 0.3851744186046512,
959
- "grad_norm": 23.669225692749023,
960
- "learning_rate": 6.151889534883722e-05,
961
- "loss": 0.9045,
962
- "step": 1060
963
- },
964
- {
965
- "epoch": 0.38880813953488375,
966
- "grad_norm": 19.99860191345215,
967
- "learning_rate": 6.115552325581396e-05,
968
- "loss": 0.9222,
969
- "step": 1070
970
- },
971
- {
972
- "epoch": 0.39244186046511625,
973
- "grad_norm": 43.26332092285156,
974
- "learning_rate": 6.07921511627907e-05,
975
- "loss": 0.9973,
976
- "step": 1080
977
- },
978
- {
979
- "epoch": 0.3960755813953488,
980
- "grad_norm": 25.616634368896484,
981
- "learning_rate": 6.042877906976745e-05,
982
- "loss": 0.9616,
983
- "step": 1090
984
- },
985
- {
986
- "epoch": 0.3997093023255814,
987
- "grad_norm": 21.20777702331543,
988
- "learning_rate": 6.006540697674419e-05,
989
- "loss": 0.9042,
990
- "step": 1100
991
- },
992
- {
993
- "epoch": 0.3997093023255814,
994
- "eval_accuracy": 0.7269384002451732,
995
- "eval_loss": 0.832632303237915,
996
- "eval_model_preparation_time": 0.0015,
997
- "eval_runtime": 47.8314,
998
- "eval_samples_per_second": 68.219,
999
- "eval_steps_per_second": 2.132,
1000
- "step": 1100
1001
- },
1002
- {
1003
- "epoch": 0.40334302325581395,
1004
- "grad_norm": 25.138526916503906,
1005
- "learning_rate": 5.970203488372094e-05,
1006
- "loss": 0.9166,
1007
- "step": 1110
1008
- },
1009
- {
1010
- "epoch": 0.4069767441860465,
1011
- "grad_norm": 31.42925453186035,
1012
- "learning_rate": 5.933866279069767e-05,
1013
- "loss": 0.8452,
1014
- "step": 1120
1015
- },
1016
- {
1017
- "epoch": 0.4106104651162791,
1018
- "grad_norm": 21.83854866027832,
1019
- "learning_rate": 5.8975290697674425e-05,
1020
- "loss": 0.9061,
1021
- "step": 1130
1022
- },
1023
- {
1024
- "epoch": 0.41424418604651164,
1025
- "grad_norm": 24.000688552856445,
1026
- "learning_rate": 5.861191860465116e-05,
1027
- "loss": 0.9286,
1028
- "step": 1140
1029
- },
1030
- {
1031
- "epoch": 0.4178779069767442,
1032
- "grad_norm": 20.03739356994629,
1033
- "learning_rate": 5.824854651162791e-05,
1034
- "loss": 0.8758,
1035
- "step": 1150
1036
- },
1037
- {
1038
- "epoch": 0.4178779069767442,
1039
- "eval_accuracy": 0.7247931351517009,
1040
- "eval_loss": 0.8062854409217834,
1041
- "eval_model_preparation_time": 0.0015,
1042
- "eval_runtime": 49.4415,
1043
- "eval_samples_per_second": 65.997,
1044
- "eval_steps_per_second": 2.063,
1045
- "step": 1150
1046
- },
1047
- {
1048
- "epoch": 0.42151162790697677,
1049
- "grad_norm": 22.991212844848633,
1050
- "learning_rate": 5.7885174418604646e-05,
1051
- "loss": 0.8388,
1052
- "step": 1160
1053
- },
1054
- {
1055
- "epoch": 0.4251453488372093,
1056
- "grad_norm": 14.57451343536377,
1057
- "learning_rate": 5.75218023255814e-05,
1058
- "loss": 0.8124,
1059
- "step": 1170
1060
- },
1061
- {
1062
- "epoch": 0.42877906976744184,
1063
- "grad_norm": 16.236392974853516,
1064
- "learning_rate": 5.715843023255815e-05,
1065
- "loss": 0.9013,
1066
- "step": 1180
1067
- },
1068
- {
1069
- "epoch": 0.4324127906976744,
1070
- "grad_norm": 13.36828327178955,
1071
- "learning_rate": 5.679505813953489e-05,
1072
- "loss": 0.8201,
1073
- "step": 1190
1074
- },
1075
- {
1076
- "epoch": 0.436046511627907,
1077
- "grad_norm": 25.648473739624023,
1078
- "learning_rate": 5.6431686046511635e-05,
1079
- "loss": 0.857,
1080
- "step": 1200
1081
- },
1082
- {
1083
- "epoch": 0.436046511627907,
1084
- "eval_accuracy": 0.6993564204719583,
1085
- "eval_loss": 0.8466485738754272,
1086
- "eval_model_preparation_time": 0.0015,
1087
- "eval_runtime": 48.354,
1088
- "eval_samples_per_second": 67.482,
1089
- "eval_steps_per_second": 2.109,
1090
- "step": 1200
1091
- },
1092
- {
1093
- "epoch": 0.43968023255813954,
1094
- "grad_norm": 18.110530853271484,
1095
- "learning_rate": 5.606831395348837e-05,
1096
- "loss": 0.9745,
1097
- "step": 1210
1098
- },
1099
- {
1100
- "epoch": 0.4433139534883721,
1101
- "grad_norm": 25.08212661743164,
1102
- "learning_rate": 5.570494186046512e-05,
1103
- "loss": 0.8266,
1104
- "step": 1220
1105
- },
1106
- {
1107
- "epoch": 0.44694767441860467,
1108
- "grad_norm": 17.428245544433594,
1109
- "learning_rate": 5.5341569767441856e-05,
1110
- "loss": 0.8776,
1111
- "step": 1230
1112
- },
1113
- {
1114
- "epoch": 0.45058139534883723,
1115
- "grad_norm": 20.17462158203125,
1116
- "learning_rate": 5.497819767441861e-05,
1117
- "loss": 0.9266,
1118
- "step": 1240
1119
- },
1120
- {
1121
- "epoch": 0.45421511627906974,
1122
- "grad_norm": 19.790435791015625,
1123
- "learning_rate": 5.461482558139536e-05,
1124
- "loss": 0.8733,
1125
- "step": 1250
1126
- },
1127
- {
1128
- "epoch": 0.45421511627906974,
1129
- "eval_accuracy": 0.7223414036163041,
1130
- "eval_loss": 0.787076473236084,
1131
- "eval_model_preparation_time": 0.0015,
1132
- "eval_runtime": 51.1791,
1133
- "eval_samples_per_second": 63.756,
1134
- "eval_steps_per_second": 1.993,
1135
- "step": 1250
1136
- },
1137
- {
1138
- "epoch": 0.4578488372093023,
1139
- "grad_norm": 17.204328536987305,
1140
- "learning_rate": 5.42514534883721e-05,
1141
- "loss": 0.8393,
1142
- "step": 1260
1143
- },
1144
- {
1145
- "epoch": 0.46148255813953487,
1146
- "grad_norm": 16.980710983276367,
1147
- "learning_rate": 5.3888081395348845e-05,
1148
- "loss": 0.8185,
1149
- "step": 1270
1150
- },
1151
- {
1152
- "epoch": 0.46511627906976744,
1153
- "grad_norm": 21.34583854675293,
1154
- "learning_rate": 5.352470930232558e-05,
1155
- "loss": 0.8666,
1156
- "step": 1280
1157
- },
1158
- {
1159
- "epoch": 0.46875,
1160
- "grad_norm": 15.561858177185059,
1161
- "learning_rate": 5.316133720930233e-05,
1162
- "loss": 0.8883,
1163
- "step": 1290
1164
- },
1165
- {
1166
- "epoch": 0.47238372093023256,
1167
- "grad_norm": 24.96396255493164,
1168
- "learning_rate": 5.2797965116279066e-05,
1169
- "loss": 0.7988,
1170
- "step": 1300
1171
- },
1172
- {
1173
- "epoch": 0.47238372093023256,
1174
- "eval_accuracy": 0.7486975176218205,
1175
- "eval_loss": 0.7476297616958618,
1176
- "eval_model_preparation_time": 0.0015,
1177
- "eval_runtime": 48.5075,
1178
- "eval_samples_per_second": 67.268,
1179
- "eval_steps_per_second": 2.103,
1180
- "step": 1300
1181
- },
1182
- {
1183
- "epoch": 0.47601744186046513,
1184
- "grad_norm": 25.246776580810547,
1185
- "learning_rate": 5.243459302325582e-05,
1186
- "loss": 0.9025,
1187
- "step": 1310
1188
- },
1189
- {
1190
- "epoch": 0.4796511627906977,
1191
- "grad_norm": 18.316572189331055,
1192
- "learning_rate": 5.2071220930232554e-05,
1193
- "loss": 0.7593,
1194
- "step": 1320
1195
- },
1196
- {
1197
- "epoch": 0.48328488372093026,
1198
- "grad_norm": 22.122472763061523,
1199
- "learning_rate": 5.170784883720931e-05,
1200
- "loss": 0.8634,
1201
- "step": 1330
1202
- },
1203
- {
1204
- "epoch": 0.48691860465116277,
1205
- "grad_norm": 24.7249698638916,
1206
- "learning_rate": 5.1344476744186055e-05,
1207
- "loss": 0.7407,
1208
- "step": 1340
1209
- },
1210
- {
1211
- "epoch": 0.49055232558139533,
1212
- "grad_norm": 27.893335342407227,
1213
- "learning_rate": 5.0981104651162795e-05,
1214
- "loss": 0.9492,
1215
- "step": 1350
1216
- },
1217
- {
1218
- "epoch": 0.49055232558139533,
1219
- "eval_accuracy": 0.6944529574011645,
1220
- "eval_loss": 0.8117865920066833,
1221
- "eval_model_preparation_time": 0.0015,
1222
- "eval_runtime": 48.5657,
1223
- "eval_samples_per_second": 67.187,
1224
- "eval_steps_per_second": 2.1,
1225
- "step": 1350
1226
- },
1227
- {
1228
- "epoch": 0.4941860465116279,
1229
- "grad_norm": 19.129026412963867,
1230
- "learning_rate": 5.061773255813954e-05,
1231
- "loss": 0.9274,
1232
- "step": 1360
1233
- },
1234
- {
1235
- "epoch": 0.49781976744186046,
1236
- "grad_norm": 18.911165237426758,
1237
- "learning_rate": 5.0254360465116276e-05,
1238
- "loss": 0.807,
1239
- "step": 1370
1240
- },
1241
- {
1242
- "epoch": 0.501453488372093,
1243
- "grad_norm": 19.536846160888672,
1244
- "learning_rate": 4.989098837209303e-05,
1245
- "loss": 0.8404,
1246
- "step": 1380
1247
- },
1248
- {
1249
- "epoch": 0.5050872093023255,
1250
- "grad_norm": 23.16112518310547,
1251
- "learning_rate": 4.952761627906977e-05,
1252
- "loss": 0.812,
1253
- "step": 1390
1254
- },
1255
- {
1256
- "epoch": 0.5087209302325582,
1257
- "grad_norm": 24.2547664642334,
1258
- "learning_rate": 4.916424418604652e-05,
1259
- "loss": 0.8047,
1260
- "step": 1400
1261
- },
1262
- {
1263
- "epoch": 0.5087209302325582,
1264
- "eval_accuracy": 0.7391970579221575,
1265
- "eval_loss": 0.7529688477516174,
1266
- "eval_model_preparation_time": 0.0015,
1267
- "eval_runtime": 50.0516,
1268
- "eval_samples_per_second": 65.193,
1269
- "eval_steps_per_second": 2.038,
1270
- "step": 1400
1271
- },
1272
- {
1273
- "epoch": 0.5123546511627907,
1274
- "grad_norm": 21.53682518005371,
1275
- "learning_rate": 4.880087209302326e-05,
1276
- "loss": 0.799,
1277
- "step": 1410
1278
- },
1279
- {
1280
- "epoch": 0.5159883720930233,
1281
- "grad_norm": 23.544010162353516,
1282
- "learning_rate": 4.8437500000000005e-05,
1283
- "loss": 0.7933,
1284
- "step": 1420
1285
- },
1286
- {
1287
- "epoch": 0.5196220930232558,
1288
- "grad_norm": 19.342622756958008,
1289
- "learning_rate": 4.8074127906976745e-05,
1290
- "loss": 0.8245,
1291
- "step": 1430
1292
- },
1293
- {
1294
- "epoch": 0.5232558139534884,
1295
- "grad_norm": 20.08898162841797,
1296
- "learning_rate": 4.771075581395349e-05,
1297
- "loss": 0.8104,
1298
- "step": 1440
1299
- },
1300
- {
1301
- "epoch": 0.5268895348837209,
1302
- "grad_norm": 36.42573165893555,
1303
- "learning_rate": 4.734738372093023e-05,
1304
- "loss": 0.8601,
1305
- "step": 1450
1306
- },
1307
- {
1308
- "epoch": 0.5268895348837209,
1309
- "eval_accuracy": 0.7477781182960466,
1310
- "eval_loss": 0.7395877838134766,
1311
- "eval_model_preparation_time": 0.0015,
1312
- "eval_runtime": 48.2379,
1313
- "eval_samples_per_second": 67.644,
1314
- "eval_steps_per_second": 2.115,
1315
- "step": 1450
1316
- },
1317
- {
1318
- "epoch": 0.5305232558139535,
1319
- "grad_norm": 18.966615676879883,
1320
- "learning_rate": 4.6984011627906973e-05,
1321
- "loss": 0.8142,
1322
- "step": 1460
1323
- },
1324
- {
1325
- "epoch": 0.534156976744186,
1326
- "grad_norm": 19.256010055541992,
1327
- "learning_rate": 4.662063953488373e-05,
1328
- "loss": 0.7733,
1329
- "step": 1470
1330
- },
1331
- {
1332
- "epoch": 0.5377906976744186,
1333
- "grad_norm": 17.08919334411621,
1334
- "learning_rate": 4.625726744186047e-05,
1335
- "loss": 0.7369,
1336
- "step": 1480
1337
- },
1338
- {
1339
- "epoch": 0.5414244186046512,
1340
- "grad_norm": 30.637798309326172,
1341
- "learning_rate": 4.5893895348837215e-05,
1342
- "loss": 0.7535,
1343
- "step": 1490
1344
- },
1345
- {
1346
- "epoch": 0.5450581395348837,
1347
- "grad_norm": 16.82142448425293,
1348
- "learning_rate": 4.5530523255813955e-05,
1349
- "loss": 0.7873,
1350
- "step": 1500
1351
- },
1352
- {
1353
- "epoch": 0.5450581395348837,
1354
- "eval_accuracy": 0.7618755746245786,
1355
- "eval_loss": 0.7191266417503357,
1356
- "eval_model_preparation_time": 0.0015,
1357
- "eval_runtime": 47.9425,
1358
- "eval_samples_per_second": 68.061,
1359
- "eval_steps_per_second": 2.128,
1360
- "step": 1500
1361
- },
1362
- {
1363
- "epoch": 0.5486918604651163,
1364
- "grad_norm": 15.000924110412598,
1365
- "learning_rate": 4.51671511627907e-05,
1366
- "loss": 0.7462,
1367
- "step": 1510
1368
- },
1369
- {
1370
- "epoch": 0.5523255813953488,
1371
- "grad_norm": 19.328227996826172,
1372
- "learning_rate": 4.480377906976744e-05,
1373
- "loss": 0.7997,
1374
- "step": 1520
1375
- },
1376
- {
1377
- "epoch": 0.5559593023255814,
1378
- "grad_norm": 17.77477264404297,
1379
- "learning_rate": 4.444040697674418e-05,
1380
- "loss": 0.7797,
1381
- "step": 1530
1382
- },
1383
- {
1384
- "epoch": 0.559593023255814,
1385
- "grad_norm": 19.144107818603516,
1386
- "learning_rate": 4.407703488372093e-05,
1387
- "loss": 0.7292,
1388
- "step": 1540
1389
- },
1390
- {
1391
- "epoch": 0.5632267441860465,
1392
- "grad_norm": 23.17219352722168,
1393
- "learning_rate": 4.371366279069768e-05,
1394
- "loss": 0.704,
1395
- "step": 1550
1396
- },
1397
- {
1398
- "epoch": 0.5632267441860465,
1399
- "eval_accuracy": 0.7581979773214833,
1400
- "eval_loss": 0.6839959621429443,
1401
- "eval_model_preparation_time": 0.0015,
1402
- "eval_runtime": 48.3014,
1403
- "eval_samples_per_second": 67.555,
1404
- "eval_steps_per_second": 2.112,
1405
- "step": 1550
1406
- },
1407
- {
1408
- "epoch": 0.5668604651162791,
1409
- "grad_norm": 17.988855361938477,
1410
- "learning_rate": 4.3350290697674425e-05,
1411
- "loss": 0.8192,
1412
- "step": 1560
1413
- },
1414
- {
1415
- "epoch": 0.5704941860465116,
1416
- "grad_norm": 39.58876037597656,
1417
- "learning_rate": 4.2986918604651165e-05,
1418
- "loss": 0.7845,
1419
- "step": 1570
1420
- },
1421
- {
1422
- "epoch": 0.5741279069767442,
1423
- "grad_norm": 24.698930740356445,
1424
- "learning_rate": 4.262354651162791e-05,
1425
- "loss": 0.7521,
1426
- "step": 1580
1427
- },
1428
- {
1429
- "epoch": 0.5777616279069767,
1430
- "grad_norm": 17.967649459838867,
1431
- "learning_rate": 4.226017441860465e-05,
1432
- "loss": 0.8063,
1433
- "step": 1590
1434
- },
1435
- {
1436
- "epoch": 0.5813953488372093,
1437
- "grad_norm": 25.17499351501465,
1438
- "learning_rate": 4.18968023255814e-05,
1439
- "loss": 0.8536,
1440
- "step": 1600
1441
- },
1442
- {
1443
- "epoch": 0.5813953488372093,
1444
- "eval_accuracy": 0.7581979773214833,
1445
- "eval_loss": 0.6965886950492859,
1446
- "eval_model_preparation_time": 0.0015,
1447
- "eval_runtime": 48.257,
1448
- "eval_samples_per_second": 67.617,
1449
- "eval_steps_per_second": 2.114,
1450
- "step": 1600
1451
- },
1452
- {
1453
- "epoch": 0.5850290697674418,
1454
- "grad_norm": 27.656429290771484,
1455
- "learning_rate": 4.153343023255814e-05,
1456
- "loss": 0.7501,
1457
- "step": 1610
1458
- },
1459
- {
1460
- "epoch": 0.5886627906976745,
1461
- "grad_norm": 19.678503036499023,
1462
- "learning_rate": 4.117005813953488e-05,
1463
- "loss": 0.7515,
1464
- "step": 1620
1465
- },
1466
- {
1467
- "epoch": 0.592296511627907,
1468
- "grad_norm": 19.597240447998047,
1469
- "learning_rate": 4.080668604651163e-05,
1470
- "loss": 0.7465,
1471
- "step": 1630
1472
- },
1473
- {
1474
- "epoch": 0.5959302325581395,
1475
- "grad_norm": 24.52513885498047,
1476
- "learning_rate": 4.0443313953488375e-05,
1477
- "loss": 0.6374,
1478
- "step": 1640
1479
- },
1480
- {
1481
- "epoch": 0.5995639534883721,
1482
- "grad_norm": 27.33843231201172,
1483
- "learning_rate": 4.007994186046512e-05,
1484
- "loss": 0.7612,
1485
- "step": 1650
1486
- },
1487
- {
1488
- "epoch": 0.5995639534883721,
1489
- "eval_accuracy": 0.7894575543977934,
1490
- "eval_loss": 0.6561300754547119,
1491
- "eval_model_preparation_time": 0.0015,
1492
- "eval_runtime": 53.5897,
1493
- "eval_samples_per_second": 60.889,
1494
- "eval_steps_per_second": 1.903,
1495
- "step": 1650
1496
- },
1497
- {
1498
- "epoch": 0.6031976744186046,
1499
- "grad_norm": 17.966524124145508,
1500
- "learning_rate": 3.971656976744186e-05,
1501
- "loss": 0.6416,
1502
- "step": 1660
1503
- },
1504
- {
1505
- "epoch": 0.6068313953488372,
1506
- "grad_norm": 24.99458885192871,
1507
- "learning_rate": 3.935319767441861e-05,
1508
- "loss": 0.7307,
1509
- "step": 1670
1510
- },
1511
- {
1512
- "epoch": 0.6104651162790697,
1513
- "grad_norm": 17.375545501708984,
1514
- "learning_rate": 3.898982558139535e-05,
1515
- "loss": 0.6845,
1516
- "step": 1680
1517
- },
1518
- {
1519
- "epoch": 0.6140988372093024,
1520
- "grad_norm": 21.309844970703125,
1521
- "learning_rate": 3.86264534883721e-05,
1522
- "loss": 0.7197,
1523
- "step": 1690
1524
- },
1525
- {
1526
- "epoch": 0.6177325581395349,
1527
- "grad_norm": 19.636783599853516,
1528
- "learning_rate": 3.826308139534884e-05,
1529
- "loss": 0.7701,
1530
- "step": 1700
1531
- },
1532
- {
1533
- "epoch": 0.6177325581395349,
1534
- "eval_accuracy": 0.7931351517008888,
1535
- "eval_loss": 0.6381626725196838,
1536
- "eval_model_preparation_time": 0.0015,
1537
- "eval_runtime": 51.3975,
1538
- "eval_samples_per_second": 63.486,
1539
- "eval_steps_per_second": 1.985,
1540
- "step": 1700
1541
- },
1542
- {
1543
- "epoch": 0.6213662790697675,
1544
- "grad_norm": 16.005521774291992,
1545
- "learning_rate": 3.789970930232558e-05,
1546
- "loss": 0.73,
1547
- "step": 1710
1548
- },
1549
- {
1550
- "epoch": 0.625,
1551
- "grad_norm": 22.36250114440918,
1552
- "learning_rate": 3.7536337209302325e-05,
1553
- "loss": 0.7021,
1554
- "step": 1720
1555
- },
1556
- {
1557
- "epoch": 0.6286337209302325,
1558
- "grad_norm": 27.990943908691406,
1559
- "learning_rate": 3.717296511627907e-05,
1560
- "loss": 0.6955,
1561
- "step": 1730
1562
- },
1563
- {
1564
- "epoch": 0.6322674418604651,
1565
- "grad_norm": 16.45665168762207,
1566
- "learning_rate": 3.680959302325582e-05,
1567
- "loss": 0.7612,
1568
- "step": 1740
1569
- },
1570
- {
1571
- "epoch": 0.6359011627906976,
1572
- "grad_norm": 37.34355163574219,
1573
- "learning_rate": 3.644622093023256e-05,
1574
- "loss": 0.7538,
1575
- "step": 1750
1576
- },
1577
- {
1578
- "epoch": 0.6359011627906976,
1579
- "eval_accuracy": 0.7821023597916028,
1580
- "eval_loss": 0.6540101766586304,
1581
- "eval_model_preparation_time": 0.0015,
1582
- "eval_runtime": 50.9202,
1583
- "eval_samples_per_second": 64.081,
1584
- "eval_steps_per_second": 2.003,
1585
- "step": 1750
1586
- },
1587
- {
1588
- "epoch": 0.6395348837209303,
1589
- "grad_norm": 22.030466079711914,
1590
- "learning_rate": 3.608284883720931e-05,
1591
- "loss": 0.7048,
1592
- "step": 1760
1593
- },
1594
- {
1595
- "epoch": 0.6431686046511628,
1596
- "grad_norm": 16.459775924682617,
1597
- "learning_rate": 3.571947674418605e-05,
1598
- "loss": 0.7338,
1599
- "step": 1770
1600
- },
1601
- {
1602
- "epoch": 0.6468023255813954,
1603
- "grad_norm": 20.406429290771484,
1604
- "learning_rate": 3.535610465116279e-05,
1605
- "loss": 0.7017,
1606
- "step": 1780
1607
- },
1608
- {
1609
- "epoch": 0.6504360465116279,
1610
- "grad_norm": 32.876670837402344,
1611
- "learning_rate": 3.4992732558139535e-05,
1612
- "loss": 0.7467,
1613
- "step": 1790
1614
- },
1615
- {
1616
- "epoch": 0.6540697674418605,
1617
- "grad_norm": 21.809101104736328,
1618
- "learning_rate": 3.4629360465116276e-05,
1619
- "loss": 0.7496,
1620
- "step": 1800
1621
- },
1622
- {
1623
- "epoch": 0.6540697674418605,
1624
- "eval_accuracy": 0.7735212994177137,
1625
- "eval_loss": 0.6603609323501587,
1626
- "eval_model_preparation_time": 0.0015,
1627
- "eval_runtime": 54.6689,
1628
- "eval_samples_per_second": 59.687,
1629
- "eval_steps_per_second": 1.866,
1630
- "step": 1800
1631
- },
1632
- {
1633
- "epoch": 0.657703488372093,
1634
- "grad_norm": 17.124296188354492,
1635
- "learning_rate": 3.426598837209303e-05,
1636
- "loss": 0.6712,
1637
- "step": 1810
1638
- },
1639
- {
1640
- "epoch": 0.6613372093023255,
1641
- "grad_norm": 17.997486114501953,
1642
- "learning_rate": 3.390261627906977e-05,
1643
- "loss": 0.6917,
1644
- "step": 1820
1645
- },
1646
- {
1647
- "epoch": 0.6649709302325582,
1648
- "grad_norm": 30.947837829589844,
1649
- "learning_rate": 3.353924418604652e-05,
1650
- "loss": 0.7091,
1651
- "step": 1830
1652
- },
1653
- {
1654
- "epoch": 0.6686046511627907,
1655
- "grad_norm": 24.643381118774414,
1656
- "learning_rate": 3.317587209302326e-05,
1657
- "loss": 0.6695,
1658
- "step": 1840
1659
- },
1660
- {
1661
- "epoch": 0.6722383720930233,
1662
- "grad_norm": 23.651065826416016,
1663
- "learning_rate": 3.2812500000000005e-05,
1664
- "loss": 0.7512,
1665
- "step": 1850
1666
- },
1667
- {
1668
- "epoch": 0.6722383720930233,
1669
- "eval_accuracy": 0.8032485442844008,
1670
- "eval_loss": 0.6195615530014038,
1671
- "eval_model_preparation_time": 0.0015,
1672
- "eval_runtime": 51.8657,
1673
- "eval_samples_per_second": 62.913,
1674
- "eval_steps_per_second": 1.967,
1675
- "step": 1850
1676
- },
1677
- {
1678
- "epoch": 0.6758720930232558,
1679
- "grad_norm": 19.081098556518555,
1680
- "learning_rate": 3.2449127906976745e-05,
1681
- "loss": 0.6343,
1682
- "step": 1860
1683
- },
1684
- {
1685
- "epoch": 0.6795058139534884,
1686
- "grad_norm": 23.336320877075195,
1687
- "learning_rate": 3.2085755813953486e-05,
1688
- "loss": 0.7375,
1689
- "step": 1870
1690
- },
1691
- {
1692
- "epoch": 0.6831395348837209,
1693
- "grad_norm": 19.397146224975586,
1694
- "learning_rate": 3.172238372093023e-05,
1695
- "loss": 0.6674,
1696
- "step": 1880
1697
- },
1698
- {
1699
- "epoch": 0.6867732558139535,
1700
- "grad_norm": 21.171167373657227,
1701
- "learning_rate": 3.135901162790697e-05,
1702
- "loss": 0.7242,
1703
- "step": 1890
1704
- },
1705
- {
1706
- "epoch": 0.690406976744186,
1707
- "grad_norm": 24.170560836791992,
1708
- "learning_rate": 3.099563953488373e-05,
1709
- "loss": 0.6773,
1710
- "step": 1900
1711
- },
1712
- {
1713
- "epoch": 0.690406976744186,
1714
- "eval_accuracy": 0.8075390744713454,
1715
- "eval_loss": 0.6085334420204163,
1716
- "eval_model_preparation_time": 0.0015,
1717
- "eval_runtime": 50.933,
1718
- "eval_samples_per_second": 64.065,
1719
- "eval_steps_per_second": 2.003,
1720
- "step": 1900
1721
- },
1722
- {
1723
- "epoch": 0.6940406976744186,
1724
- "grad_norm": 19.82205581665039,
1725
- "learning_rate": 3.063226744186047e-05,
1726
- "loss": 0.7194,
1727
- "step": 1910
1728
- },
1729
- {
1730
- "epoch": 0.6976744186046512,
1731
- "grad_norm": 19.169923782348633,
1732
- "learning_rate": 3.026889534883721e-05,
1733
- "loss": 0.6967,
1734
- "step": 1920
1735
- },
1736
- {
1737
- "epoch": 0.7013081395348837,
1738
- "grad_norm": 18.116230010986328,
1739
- "learning_rate": 2.9905523255813955e-05,
1740
- "loss": 0.705,
1741
- "step": 1930
1742
- },
1743
- {
1744
- "epoch": 0.7049418604651163,
1745
- "grad_norm": 16.55962562561035,
1746
- "learning_rate": 2.95421511627907e-05,
1747
- "loss": 0.6809,
1748
- "step": 1940
1749
- },
1750
- {
1751
- "epoch": 0.7085755813953488,
1752
- "grad_norm": 34.48362731933594,
1753
- "learning_rate": 2.9178779069767443e-05,
1754
- "loss": 0.6655,
1755
- "step": 1950
1756
- },
1757
- {
1758
- "epoch": 0.7085755813953488,
1759
- "eval_accuracy": 0.7995709469813056,
1760
- "eval_loss": 0.6168962717056274,
1761
- "eval_model_preparation_time": 0.0015,
1762
- "eval_runtime": 49.9889,
1763
- "eval_samples_per_second": 65.274,
1764
- "eval_steps_per_second": 2.04,
1765
- "step": 1950
1766
- },
1767
- {
1768
- "epoch": 0.7122093023255814,
1769
- "grad_norm": 27.100082397460938,
1770
- "learning_rate": 2.8815406976744186e-05,
1771
- "loss": 0.6525,
1772
- "step": 1960
1773
- },
1774
- {
1775
- "epoch": 0.715843023255814,
1776
- "grad_norm": 15.424944877624512,
1777
- "learning_rate": 2.845203488372093e-05,
1778
- "loss": 0.7398,
1779
- "step": 1970
1780
- },
1781
- {
1782
- "epoch": 0.7194767441860465,
1783
- "grad_norm": 18.602054595947266,
1784
- "learning_rate": 2.8088662790697677e-05,
1785
- "loss": 0.7341,
1786
- "step": 1980
1787
- },
1788
- {
1789
- "epoch": 0.7231104651162791,
1790
- "grad_norm": 18.60715675354004,
1791
- "learning_rate": 2.772529069767442e-05,
1792
- "loss": 0.62,
1793
- "step": 1990
1794
- },
1795
- {
1796
- "epoch": 0.7267441860465116,
1797
- "grad_norm": 18.766788482666016,
1798
- "learning_rate": 2.7361918604651165e-05,
1799
- "loss": 0.5997,
1800
- "step": 2000
1801
- },
1802
- {
1803
- "epoch": 0.7267441860465116,
1804
- "eval_accuracy": 0.8004903463070794,
1805
- "eval_loss": 0.6034106016159058,
1806
- "eval_model_preparation_time": 0.0015,
1807
- "eval_runtime": 50.3413,
1808
- "eval_samples_per_second": 64.818,
1809
- "eval_steps_per_second": 2.026,
1810
- "step": 2000
1811
- },
1812
- {
1813
- "epoch": 0.7303779069767442,
1814
- "grad_norm": 20.883230209350586,
1815
- "learning_rate": 2.699854651162791e-05,
1816
- "loss": 0.6445,
1817
- "step": 2010
1818
- },
1819
- {
1820
- "epoch": 0.7340116279069767,
1821
- "grad_norm": 17.367563247680664,
1822
- "learning_rate": 2.6635174418604652e-05,
1823
- "loss": 0.6727,
1824
- "step": 2020
1825
- },
1826
- {
1827
- "epoch": 0.7376453488372093,
1828
- "grad_norm": 21.541458129882812,
1829
- "learning_rate": 2.6271802325581396e-05,
1830
- "loss": 0.6984,
1831
- "step": 2030
1832
- },
1833
- {
1834
- "epoch": 0.7412790697674418,
1835
- "grad_norm": 25.181142807006836,
1836
- "learning_rate": 2.590843023255814e-05,
1837
- "loss": 0.6668,
1838
- "step": 2040
1839
- },
1840
- {
1841
- "epoch": 0.7449127906976745,
1842
- "grad_norm": 19.112144470214844,
1843
- "learning_rate": 2.5545058139534884e-05,
1844
- "loss": 0.6144,
1845
- "step": 2050
1846
- },
1847
- {
1848
- "epoch": 0.7449127906976745,
1849
- "eval_accuracy": 0.8075390744713454,
1850
- "eval_loss": 0.6008254885673523,
1851
- "eval_model_preparation_time": 0.0015,
1852
- "eval_runtime": 50.7754,
1853
- "eval_samples_per_second": 64.263,
1854
- "eval_steps_per_second": 2.009,
1855
- "step": 2050
1856
- },
1857
- {
1858
- "epoch": 0.748546511627907,
1859
- "grad_norm": 23.80943489074707,
1860
- "learning_rate": 2.5181686046511628e-05,
1861
- "loss": 0.6887,
1862
- "step": 2060
1863
- },
1864
- {
1865
- "epoch": 0.7521802325581395,
1866
- "grad_norm": 22.5864200592041,
1867
- "learning_rate": 2.481831395348837e-05,
1868
- "loss": 0.6269,
1869
- "step": 2070
1870
- },
1871
- {
1872
- "epoch": 0.7558139534883721,
1873
- "grad_norm": 20.330726623535156,
1874
- "learning_rate": 2.4454941860465115e-05,
1875
- "loss": 0.6929,
1876
- "step": 2080
1877
- },
1878
- {
1879
- "epoch": 0.7594476744186046,
1880
- "grad_norm": 24.566627502441406,
1881
- "learning_rate": 2.4091569767441862e-05,
1882
- "loss": 0.5906,
1883
- "step": 2090
1884
- },
1885
- {
1886
- "epoch": 0.7630813953488372,
1887
- "grad_norm": 23.221717834472656,
1888
- "learning_rate": 2.3728197674418606e-05,
1889
- "loss": 0.6519,
1890
- "step": 2100
1891
- },
1892
- {
1893
- "epoch": 0.7630813953488372,
1894
- "eval_accuracy": 0.8231688630095004,
1895
- "eval_loss": 0.5798885822296143,
1896
- "eval_model_preparation_time": 0.0015,
1897
- "eval_runtime": 52.3499,
1898
- "eval_samples_per_second": 62.331,
1899
- "eval_steps_per_second": 1.948,
1900
- "step": 2100
1901
- },
1902
- {
1903
- "epoch": 0.7667151162790697,
1904
- "grad_norm": 25.42754364013672,
1905
- "learning_rate": 2.336482558139535e-05,
1906
- "loss": 0.6025,
1907
- "step": 2110
1908
- },
1909
- {
1910
- "epoch": 0.7703488372093024,
1911
- "grad_norm": 17.46138572692871,
1912
- "learning_rate": 2.3001453488372094e-05,
1913
- "loss": 0.6702,
1914
- "step": 2120
1915
- },
1916
- {
1917
- "epoch": 0.7739825581395349,
1918
- "grad_norm": 19.8096923828125,
1919
- "learning_rate": 2.263808139534884e-05,
1920
- "loss": 0.6543,
1921
- "step": 2130
1922
- },
1923
- {
1924
- "epoch": 0.7776162790697675,
1925
- "grad_norm": 18.788450241088867,
1926
- "learning_rate": 2.2274709302325585e-05,
1927
- "loss": 0.704,
1928
- "step": 2140
1929
- },
1930
- {
1931
- "epoch": 0.78125,
1932
- "grad_norm": 21.987895965576172,
1933
- "learning_rate": 2.1911337209302325e-05,
1934
- "loss": 0.6834,
1935
- "step": 2150
1936
- },
1937
- {
1938
- "epoch": 0.78125,
1939
- "eval_accuracy": 0.8292981918479927,
1940
- "eval_loss": 0.5684976577758789,
1941
- "eval_model_preparation_time": 0.0015,
1942
- "eval_runtime": 51.8943,
1943
- "eval_samples_per_second": 62.878,
1944
- "eval_steps_per_second": 1.966,
1945
- "step": 2150
1946
- },
1947
- {
1948
- "epoch": 0.7848837209302325,
1949
- "grad_norm": 22.958097457885742,
1950
- "learning_rate": 2.154796511627907e-05,
1951
- "loss": 0.631,
1952
- "step": 2160
1953
- },
1954
- {
1955
- "epoch": 0.7885174418604651,
1956
- "grad_norm": 20.562345504760742,
1957
- "learning_rate": 2.1184593023255813e-05,
1958
- "loss": 0.6287,
1959
- "step": 2170
1960
- },
1961
- {
1962
- "epoch": 0.7921511627906976,
1963
- "grad_norm": 14.870391845703125,
1964
- "learning_rate": 2.082122093023256e-05,
1965
- "loss": 0.5629,
1966
- "step": 2180
1967
- },
1968
- {
1969
- "epoch": 0.7957848837209303,
1970
- "grad_norm": 23.68880844116211,
1971
- "learning_rate": 2.0457848837209304e-05,
1972
- "loss": 0.6285,
1973
- "step": 2190
1974
- },
1975
- {
1976
- "epoch": 0.7994186046511628,
1977
- "grad_norm": 23.51922035217285,
1978
- "learning_rate": 2.0094476744186047e-05,
1979
- "loss": 0.6258,
1980
- "step": 2200
1981
- },
1982
- {
1983
- "epoch": 0.7994186046511628,
1984
- "eval_accuracy": 0.8240882623352743,
1985
- "eval_loss": 0.5687205791473389,
1986
- "eval_model_preparation_time": 0.0015,
1987
- "eval_runtime": 51.3481,
1988
- "eval_samples_per_second": 63.547,
1989
- "eval_steps_per_second": 1.986,
1990
- "step": 2200
1991
- },
1992
- {
1993
- "epoch": 0.8030523255813954,
1994
- "grad_norm": 18.42901611328125,
1995
- "learning_rate": 1.973110465116279e-05,
1996
- "loss": 0.6338,
1997
- "step": 2210
1998
- },
1999
- {
2000
- "epoch": 0.8066860465116279,
2001
- "grad_norm": 24.76369857788086,
2002
- "learning_rate": 1.936773255813954e-05,
2003
- "loss": 0.6247,
2004
- "step": 2220
2005
- },
2006
- {
2007
- "epoch": 0.8103197674418605,
2008
- "grad_norm": 21.530895233154297,
2009
- "learning_rate": 1.9004360465116282e-05,
2010
- "loss": 0.5696,
2011
- "step": 2230
2012
- },
2013
- {
2014
- "epoch": 0.813953488372093,
2015
- "grad_norm": 24.415956497192383,
2016
- "learning_rate": 1.8640988372093023e-05,
2017
- "loss": 0.621,
2018
- "step": 2240
2019
- },
2020
- {
2021
- "epoch": 0.8175872093023255,
2022
- "grad_norm": 24.22767448425293,
2023
- "learning_rate": 1.8277616279069766e-05,
2024
- "loss": 0.6685,
2025
- "step": 2250
2026
- },
2027
- {
2028
- "epoch": 0.8175872093023255,
2029
- "eval_accuracy": 0.8243947287771989,
2030
- "eval_loss": 0.5570237636566162,
2031
- "eval_model_preparation_time": 0.0015,
2032
- "eval_runtime": 52.4926,
2033
- "eval_samples_per_second": 62.161,
2034
- "eval_steps_per_second": 1.943,
2035
- "step": 2250
2036
- },
2037
- {
2038
- "epoch": 0.8212209302325582,
2039
- "grad_norm": 15.582283973693848,
2040
- "learning_rate": 1.7914244186046513e-05,
2041
- "loss": 0.6792,
2042
- "step": 2260
2043
- },
2044
- {
2045
- "epoch": 0.8248546511627907,
2046
- "grad_norm": 21.79721450805664,
2047
- "learning_rate": 1.7550872093023257e-05,
2048
- "loss": 0.5503,
2049
- "step": 2270
2050
- },
2051
- {
2052
- "epoch": 0.8284883720930233,
2053
- "grad_norm": 17.022436141967773,
2054
- "learning_rate": 1.71875e-05,
2055
- "loss": 0.5733,
2056
- "step": 2280
2057
- },
2058
- {
2059
- "epoch": 0.8321220930232558,
2060
- "grad_norm": 19.0545711517334,
2061
- "learning_rate": 1.6824127906976745e-05,
2062
- "loss": 0.5846,
2063
- "step": 2290
2064
- },
2065
- {
2066
- "epoch": 0.8357558139534884,
2067
- "grad_norm": 23.020610809326172,
2068
- "learning_rate": 1.646075581395349e-05,
2069
- "loss": 0.6844,
2070
- "step": 2300
2071
- },
2072
- {
2073
- "epoch": 0.8357558139534884,
2074
- "eval_accuracy": 0.8253141281029728,
2075
- "eval_loss": 0.5579020977020264,
2076
- "eval_model_preparation_time": 0.0015,
2077
- "eval_runtime": 50.8057,
2078
- "eval_samples_per_second": 64.225,
2079
- "eval_steps_per_second": 2.008,
2080
- "step": 2300
2081
- },
2082
- {
2083
- "epoch": 0.8393895348837209,
2084
- "grad_norm": 20.574384689331055,
2085
- "learning_rate": 1.6097383720930236e-05,
2086
- "loss": 0.6167,
2087
- "step": 2310
2088
- },
2089
- {
2090
- "epoch": 0.8430232558139535,
2091
- "grad_norm": 28.64992332458496,
2092
- "learning_rate": 1.5734011627906976e-05,
2093
- "loss": 0.6587,
2094
- "step": 2320
2095
- },
2096
- {
2097
- "epoch": 0.846656976744186,
2098
- "grad_norm": 15.830301284790039,
2099
- "learning_rate": 1.537063953488372e-05,
2100
- "loss": 0.6342,
2101
- "step": 2330
2102
- },
2103
- {
2104
- "epoch": 0.8502906976744186,
2105
- "grad_norm": 18.624250411987305,
2106
- "learning_rate": 1.5007267441860465e-05,
2107
- "loss": 0.6572,
2108
- "step": 2340
2109
- },
2110
- {
2111
- "epoch": 0.8539244186046512,
2112
- "grad_norm": 30.454252243041992,
2113
- "learning_rate": 1.4643895348837211e-05,
2114
- "loss": 0.6446,
2115
- "step": 2350
2116
- },
2117
- {
2118
- "epoch": 0.8539244186046512,
2119
- "eval_accuracy": 0.8243947287771989,
2120
- "eval_loss": 0.5529566407203674,
2121
- "eval_model_preparation_time": 0.0015,
2122
- "eval_runtime": 51.0771,
2123
- "eval_samples_per_second": 63.884,
2124
- "eval_steps_per_second": 1.997,
2125
- "step": 2350
2126
- },
2127
- {
2128
- "epoch": 0.8575581395348837,
2129
- "grad_norm": 25.79448127746582,
2130
- "learning_rate": 1.4280523255813955e-05,
2131
- "loss": 0.6686,
2132
- "step": 2360
2133
- },
2134
- {
2135
- "epoch": 0.8611918604651163,
2136
- "grad_norm": 17.485763549804688,
2137
- "learning_rate": 1.3917151162790698e-05,
2138
- "loss": 0.5882,
2139
- "step": 2370
2140
- },
2141
- {
2142
- "epoch": 0.8648255813953488,
2143
- "grad_norm": 19.48297691345215,
2144
- "learning_rate": 1.3553779069767442e-05,
2145
- "loss": 0.6424,
2146
- "step": 2380
2147
- },
2148
- {
2149
- "epoch": 0.8684593023255814,
2150
- "grad_norm": 28.619121551513672,
2151
- "learning_rate": 1.3190406976744188e-05,
2152
- "loss": 0.5695,
2153
- "step": 2390
2154
- },
2155
- {
2156
- "epoch": 0.872093023255814,
2157
- "grad_norm": 16.906198501586914,
2158
- "learning_rate": 1.2827034883720932e-05,
2159
- "loss": 0.6243,
2160
- "step": 2400
2161
- },
2162
- {
2163
- "epoch": 0.872093023255814,
2164
- "eval_accuracy": 0.8277658596383696,
2165
- "eval_loss": 0.5536515712738037,
2166
- "eval_model_preparation_time": 0.0015,
2167
- "eval_runtime": 50.8517,
2168
- "eval_samples_per_second": 64.167,
2169
- "eval_steps_per_second": 2.006,
2170
- "step": 2400
2171
- },
2172
- {
2173
- "epoch": 0.8757267441860465,
2174
- "grad_norm": 21.633602142333984,
2175
- "learning_rate": 1.2463662790697675e-05,
2176
- "loss": 0.5708,
2177
- "step": 2410
2178
- },
2179
- {
2180
- "epoch": 0.8793604651162791,
2181
- "grad_norm": 28.101240158081055,
2182
- "learning_rate": 1.2100290697674419e-05,
2183
- "loss": 0.6923,
2184
- "step": 2420
2185
- },
2186
- {
2187
- "epoch": 0.8829941860465116,
2188
- "grad_norm": 16.087495803833008,
2189
- "learning_rate": 1.1736918604651163e-05,
2190
- "loss": 0.4912,
2191
- "step": 2430
2192
- },
2193
- {
2194
- "epoch": 0.8866279069767442,
2195
- "grad_norm": 18.913463592529297,
2196
- "learning_rate": 1.1373546511627907e-05,
2197
- "loss": 0.583,
2198
- "step": 2440
2199
- },
2200
- {
2201
- "epoch": 0.8902616279069767,
2202
- "grad_norm": 26.900217056274414,
2203
- "learning_rate": 1.1010174418604652e-05,
2204
- "loss": 0.617,
2205
- "step": 2450
2206
- },
2207
- {
2208
- "epoch": 0.8902616279069767,
2209
- "eval_accuracy": 0.8372663193380325,
2210
- "eval_loss": 0.5327870845794678,
2211
- "eval_model_preparation_time": 0.0015,
2212
- "eval_runtime": 51.6187,
2213
- "eval_samples_per_second": 63.214,
2214
- "eval_steps_per_second": 1.976,
2215
- "step": 2450
2216
- },
2217
- {
2218
- "epoch": 0.8938953488372093,
2219
- "grad_norm": 18.022111892700195,
2220
- "learning_rate": 1.0646802325581396e-05,
2221
- "loss": 0.547,
2222
- "step": 2460
2223
- },
2224
- {
2225
- "epoch": 0.8975290697674418,
2226
- "grad_norm": 16.78557777404785,
2227
- "learning_rate": 1.028343023255814e-05,
2228
- "loss": 0.5687,
2229
- "step": 2470
2230
- },
2231
- {
2232
- "epoch": 0.9011627906976745,
2233
- "grad_norm": 20.011295318603516,
2234
- "learning_rate": 9.920058139534884e-06,
2235
- "loss": 0.6218,
2236
- "step": 2480
2237
- },
2238
- {
2239
- "epoch": 0.904796511627907,
2240
- "grad_norm": 22.754287719726562,
2241
- "learning_rate": 9.556686046511629e-06,
2242
- "loss": 0.6328,
2243
- "step": 2490
2244
- },
2245
- {
2246
- "epoch": 0.9084302325581395,
2247
- "grad_norm": 19.715274810791016,
2248
- "learning_rate": 9.193313953488373e-06,
2249
- "loss": 0.5724,
2250
- "step": 2500
2251
- },
2252
- {
2253
- "epoch": 0.9084302325581395,
2254
- "eval_accuracy": 0.840024517315354,
2255
- "eval_loss": 0.5281281471252441,
2256
- "eval_model_preparation_time": 0.0015,
2257
- "eval_runtime": 52.2053,
2258
- "eval_samples_per_second": 62.503,
2259
- "eval_steps_per_second": 1.954,
2260
- "step": 2500
2261
- },
2262
- {
2263
- "epoch": 0.9120639534883721,
2264
- "grad_norm": 19.64166831970215,
2265
- "learning_rate": 8.829941860465117e-06,
2266
- "loss": 0.5446,
2267
- "step": 2510
2268
- },
2269
- {
2270
- "epoch": 0.9156976744186046,
2271
- "grad_norm": 19.133466720581055,
2272
- "learning_rate": 8.46656976744186e-06,
2273
- "loss": 0.569,
2274
- "step": 2520
2275
- },
2276
- {
2277
- "epoch": 0.9193313953488372,
2278
- "grad_norm": 20.076690673828125,
2279
- "learning_rate": 8.103197674418606e-06,
2280
- "loss": 0.6545,
2281
- "step": 2530
2282
- },
2283
- {
2284
- "epoch": 0.9229651162790697,
2285
- "grad_norm": 18.778846740722656,
2286
- "learning_rate": 7.73982558139535e-06,
2287
- "loss": 0.5915,
2288
- "step": 2540
2289
- },
2290
- {
2291
- "epoch": 0.9265988372093024,
2292
- "grad_norm": 18.11453628540039,
2293
- "learning_rate": 7.376453488372094e-06,
2294
- "loss": 0.4931,
2295
- "step": 2550
2296
- },
2297
- {
2298
- "epoch": 0.9265988372093024,
2299
- "eval_accuracy": 0.8440085810603739,
2300
- "eval_loss": 0.5274325609207153,
2301
- "eval_model_preparation_time": 0.0015,
2302
- "eval_runtime": 52.3555,
2303
- "eval_samples_per_second": 62.324,
2304
- "eval_steps_per_second": 1.948,
2305
- "step": 2550
2306
- },
2307
- {
2308
- "epoch": 0.9302325581395349,
2309
- "grad_norm": 26.267868041992188,
2310
- "learning_rate": 7.013081395348837e-06,
2311
- "loss": 0.5701,
2312
- "step": 2560
2313
- },
2314
- {
2315
- "epoch": 0.9338662790697675,
2316
- "grad_norm": 34.12333679199219,
2317
- "learning_rate": 6.649709302325581e-06,
2318
- "loss": 0.5949,
2319
- "step": 2570
2320
- },
2321
- {
2322
- "epoch": 0.9375,
2323
- "grad_norm": 18.76346778869629,
2324
- "learning_rate": 6.286337209302326e-06,
2325
- "loss": 0.5934,
2326
- "step": 2580
2327
- },
2328
- {
2329
- "epoch": 0.9411337209302325,
2330
- "grad_norm": 28.602293014526367,
2331
- "learning_rate": 5.92296511627907e-06,
2332
- "loss": 0.5537,
2333
- "step": 2590
2334
- },
2335
- {
2336
- "epoch": 0.9447674418604651,
2337
- "grad_norm": 24.983774185180664,
2338
- "learning_rate": 5.559593023255814e-06,
2339
- "loss": 0.6459,
2340
- "step": 2600
2341
- },
2342
- {
2343
- "epoch": 0.9447674418604651,
2344
- "eval_accuracy": 0.8446215139442231,
2345
- "eval_loss": 0.5270681977272034,
2346
- "eval_model_preparation_time": 0.0015,
2347
- "eval_runtime": 51.3715,
2348
- "eval_samples_per_second": 63.518,
2349
- "eval_steps_per_second": 1.986,
2350
- "step": 2600
2351
- },
2352
- {
2353
- "epoch": 0.9484011627906976,
2354
- "grad_norm": 20.338668823242188,
2355
- "learning_rate": 5.196220930232559e-06,
2356
- "loss": 0.5579,
2357
- "step": 2610
2358
- },
2359
- {
2360
- "epoch": 0.9520348837209303,
2361
- "grad_norm": 13.569022178649902,
2362
- "learning_rate": 4.832848837209302e-06,
2363
- "loss": 0.5295,
2364
- "step": 2620
2365
- },
2366
- {
2367
- "epoch": 0.9556686046511628,
2368
- "grad_norm": 17.16480827331543,
2369
- "learning_rate": 4.469476744186047e-06,
2370
- "loss": 0.5583,
2371
- "step": 2630
2372
- },
2373
- {
2374
- "epoch": 0.9593023255813954,
2375
- "grad_norm": 25.31192398071289,
2376
- "learning_rate": 4.106104651162791e-06,
2377
- "loss": 0.5767,
2378
- "step": 2640
2379
- },
2380
- {
2381
- "epoch": 0.9629360465116279,
2382
- "grad_norm": 14.07887077331543,
2383
- "learning_rate": 3.7427325581395346e-06,
2384
- "loss": 0.5954,
2385
- "step": 2650
2386
- },
2387
- {
2388
- "epoch": 0.9629360465116279,
2389
- "eval_accuracy": 0.8446215139442231,
2390
- "eval_loss": 0.5231106281280518,
2391
- "eval_model_preparation_time": 0.0015,
2392
- "eval_runtime": 52.8556,
2393
- "eval_samples_per_second": 61.734,
2394
- "eval_steps_per_second": 1.93,
2395
- "step": 2650
2396
- },
2397
- {
2398
- "epoch": 0.9665697674418605,
2399
- "grad_norm": 17.648242950439453,
2400
- "learning_rate": 3.379360465116279e-06,
2401
- "loss": 0.5344,
2402
- "step": 2660
2403
- },
2404
- {
2405
- "epoch": 0.970203488372093,
2406
- "grad_norm": 15.185200691223145,
2407
- "learning_rate": 3.0159883720930235e-06,
2408
- "loss": 0.6055,
2409
- "step": 2670
2410
- },
2411
- {
2412
- "epoch": 0.9738372093023255,
2413
- "grad_norm": 23.906478881835938,
2414
- "learning_rate": 2.6526162790697672e-06,
2415
- "loss": 0.5212,
2416
- "step": 2680
2417
- },
2418
- {
2419
- "epoch": 0.9774709302325582,
2420
- "grad_norm": 22.67589569091797,
2421
- "learning_rate": 2.2892441860465114e-06,
2422
- "loss": 0.6287,
2423
- "step": 2690
2424
- },
2425
- {
2426
- "epoch": 0.9811046511627907,
2427
- "grad_norm": 18.85926055908203,
2428
- "learning_rate": 1.9258720930232557e-06,
2429
- "loss": 0.5407,
2430
- "step": 2700
2431
- },
2432
- {
2433
- "epoch": 0.9811046511627907,
2434
- "eval_accuracy": 0.8467667790376954,
2435
- "eval_loss": 0.5167918801307678,
2436
- "eval_model_preparation_time": 0.0015,
2437
- "eval_runtime": 51.0533,
2438
- "eval_samples_per_second": 63.914,
2439
- "eval_steps_per_second": 1.998,
2440
- "step": 2700
2441
- },
2442
- {
2443
- "epoch": 0.9847383720930233,
2444
- "grad_norm": 17.184297561645508,
2445
- "learning_rate": 1.5625e-06,
2446
- "loss": 0.653,
2447
- "step": 2710
2448
- },
2449
- {
2450
- "epoch": 0.9883720930232558,
2451
- "grad_norm": 27.768354415893555,
2452
- "learning_rate": 1.1991279069767443e-06,
2453
- "loss": 0.5727,
2454
- "step": 2720
2455
- },
2456
- {
2457
- "epoch": 0.9920058139534884,
2458
- "grad_norm": 22.3969783782959,
2459
- "learning_rate": 8.357558139534884e-07,
2460
- "loss": 0.603,
2461
- "step": 2730
2462
- },
2463
- {
2464
- "epoch": 0.9956395348837209,
2465
- "grad_norm": 17.257413864135742,
2466
- "learning_rate": 4.7238372093023254e-07,
2467
- "loss": 0.5879,
2468
- "step": 2740
2469
- },
2470
- {
2471
- "epoch": 0.9992732558139535,
2472
- "grad_norm": 28.563993453979492,
2473
- "learning_rate": 1.0901162790697675e-07,
2474
- "loss": 0.5729,
2475
- "step": 2750
2476
- },
2477
- {
2478
- "epoch": 0.9992732558139535,
2479
- "eval_accuracy": 0.8479926448053938,
2480
- "eval_loss": 0.5158767700195312,
2481
- "eval_model_preparation_time": 0.0015,
2482
- "eval_runtime": 51.352,
2483
- "eval_samples_per_second": 63.542,
2484
- "eval_steps_per_second": 1.986,
2485
- "step": 2750
2486
- }
2487
- ],
2488
- "logging_steps": 10,
2489
- "max_steps": 2752,
2490
- "num_input_tokens_seen": 0,
2491
- "num_train_epochs": 1,
2492
- "save_steps": 50,
2493
- "stateful_callbacks": {
2494
- "TrainerControl": {
2495
- "args": {
2496
- "should_epoch_stop": false,
2497
- "should_evaluate": false,
2498
- "should_log": false,
2499
- "should_save": true,
2500
- "should_training_stop": true
2501
- },
2502
- "attributes": {}
2503
- }
2504
- },
2505
- "total_flos": 9.055112855342285e+18,
2506
- "train_batch_size": 32,
2507
- "trial_name": null,
2508
- "trial_params": null
2509
- }