Pamreth commited on
Commit
6f3cbf8
·
verified ·
1 Parent(s): 4d25f1c

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: microsoft/swin-base-simmim-window6-192
5
  tags:
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,11 +17,11 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # swin-ena24
18
 
19
- This model is a fine-tuned version of [microsoft/swin-base-simmim-window6-192](https://huggingface.co/microsoft/swin-base-simmim-window6-192) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 4.5154
22
- - Accuracy: 0.5820
23
- - F1 Macro: 0.5067
24
 
25
  ## Model description
26
 
 
3
  license: apache-2.0
4
  base_model: microsoft/swin-base-simmim-window6-192
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  metrics:
9
  - accuracy
 
17
 
18
  # swin-ena24
19
 
20
+ This model is a fine-tuned version of [microsoft/swin-base-simmim-window6-192](https://huggingface.co/microsoft/swin-base-simmim-window6-192) on the ena24 dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 2.4677
23
+ - Accuracy: 0.5146
24
+ - F1 Macro: 0.4328
25
 
26
  ## Model description
27
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 4.501259445843829,
3
- "eval_accuracy": 0.453125,
4
- "eval_f1_macro": 0.42038287499285415,
5
- "eval_loss": 2.6776063442230225,
6
- "eval_runtime": 12.4042,
7
- "eval_samples_per_second": 82.553,
8
- "eval_steps_per_second": 5.16,
9
- "total_flos": 1.645322107388756e+18,
10
- "train_loss": 0.348135751808895,
11
- "train_runtime": 1025.0873,
12
- "train_samples_per_second": 27.876,
13
- "train_steps_per_second": 1.743
14
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_accuracy": 0.5146484375,
4
+ "eval_f1_macro": 0.432831730682114,
5
+ "eval_loss": 2.467733144760132,
6
+ "eval_runtime": 12.1136,
7
+ "eval_samples_per_second": 84.533,
8
+ "eval_steps_per_second": 5.283,
9
+ "total_flos": 2.5585840915697664e+18,
10
+ "train_loss": 0.4836694428009911,
11
+ "train_runtime": 1538.4023,
12
+ "train_samples_per_second": 28.894,
13
+ "train_steps_per_second": 1.806
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.501259445843829,
3
- "eval_accuracy": 0.453125,
4
- "eval_f1_macro": 0.42038287499285415,
5
- "eval_loss": 2.6776063442230225,
6
- "eval_runtime": 12.4042,
7
- "eval_samples_per_second": 82.553,
8
- "eval_steps_per_second": 5.16
9
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_accuracy": 0.5146484375,
4
+ "eval_f1_macro": 0.432831730682114,
5
+ "eval_loss": 2.467733144760132,
6
+ "eval_runtime": 12.1136,
7
+ "eval_samples_per_second": 84.533,
8
+ "eval_steps_per_second": 5.283
9
  }
runs/May26_21-06-28_35d69dd4580d/events.out.tfevents.1748295298.35d69dd4580d.35.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88af87eb188944eb9bcf29288441ae963bd95b848a78b14454c88cc062382539
3
+ size 463
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.501259445843829,
3
- "total_flos": 1.645322107388756e+18,
4
- "train_loss": 0.348135751808895,
5
- "train_runtime": 1025.0873,
6
- "train_samples_per_second": 27.876,
7
- "train_steps_per_second": 1.743
8
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "total_flos": 2.5585840915697664e+18,
4
+ "train_loss": 0.4836694428009911,
5
+ "train_runtime": 1538.4023,
6
+ "train_samples_per_second": 28.894,
7
+ "train_steps_per_second": 1.806
8
  }
trainer_state.json CHANGED
@@ -1,1444 +1,2237 @@
1
  {
2
- "best_global_step": 600,
3
- "best_metric": 2.6776063442230225,
4
- "best_model_checkpoint": "./swin-ena24/checkpoint-600",
5
- "epoch": 4.501259445843829,
6
  "eval_steps": 100,
7
- "global_step": 1787,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.02518891687657431,
14
- "grad_norm": 722647.75,
15
- "learning_rate": 0.00019899272523782877,
16
- "loss": 1.2825,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.05037783375314862,
21
- "grad_norm": 1145054.375,
22
- "learning_rate": 0.0001978735310576385,
23
- "loss": 1.1453,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.07556675062972293,
28
- "grad_norm": 907092.375,
29
- "learning_rate": 0.00019675433687744824,
30
- "loss": 1.0663,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.10075566750629723,
35
- "grad_norm": 1349867.75,
36
- "learning_rate": 0.00019563514269725797,
37
- "loss": 0.978,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.12594458438287154,
42
- "grad_norm": 1561374.75,
43
- "learning_rate": 0.0001945159485170677,
44
- "loss": 1.1605,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.15113350125944586,
49
- "grad_norm": 615556.875,
50
- "learning_rate": 0.00019339675433687744,
51
- "loss": 0.9696,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.17632241813602015,
56
- "grad_norm": 1404338.125,
57
- "learning_rate": 0.00019227756015668718,
58
- "loss": 1.2168,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.20151133501259447,
63
- "grad_norm": 525871.375,
64
- "learning_rate": 0.0001911583659764969,
65
- "loss": 0.9943,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.22670025188916876,
70
- "grad_norm": 609385.125,
71
- "learning_rate": 0.00019003917179630665,
72
- "loss": 0.9651,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.2518891687657431,
77
- "grad_norm": 447139.09375,
78
- "learning_rate": 0.0001889199776161164,
79
- "loss": 0.6803,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.2518891687657431,
84
- "eval_accuracy": 0.3671875,
85
- "eval_f1_macro": 0.3084548897402726,
86
- "eval_loss": 2.8848822116851807,
87
- "eval_runtime": 12.3338,
88
- "eval_samples_per_second": 83.024,
89
- "eval_steps_per_second": 5.189,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.2770780856423174,
94
- "grad_norm": 971148.875,
95
- "learning_rate": 0.00018780078343592614,
96
- "loss": 0.8191,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.3022670025188917,
101
- "grad_norm": 1061247.5,
102
- "learning_rate": 0.00018668158925573588,
103
- "loss": 0.8493,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.327455919395466,
108
- "grad_norm": 839266.125,
109
- "learning_rate": 0.0001855623950755456,
110
- "loss": 1.009,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.3526448362720403,
115
- "grad_norm": 930034.625,
116
- "learning_rate": 0.00018444320089535537,
117
- "loss": 0.8423,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.3778337531486146,
122
- "grad_norm": 399340.1875,
123
- "learning_rate": 0.0001833240067151651,
124
- "loss": 0.8548,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.40302267002518893,
129
- "grad_norm": 566613.125,
130
- "learning_rate": 0.00018220481253497484,
131
- "loss": 0.8475,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.4282115869017632,
136
- "grad_norm": 758996.75,
137
- "learning_rate": 0.00018108561835478458,
138
- "loss": 1.015,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.4534005037783375,
143
- "grad_norm": 1068383.625,
144
- "learning_rate": 0.0001799664241745943,
145
- "loss": 1.0068,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.47858942065491183,
150
- "grad_norm": 601097.5,
151
- "learning_rate": 0.00017884722999440405,
152
- "loss": 1.0784,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.5037783375314862,
157
- "grad_norm": 426435.375,
158
- "learning_rate": 0.00017772803581421378,
159
- "loss": 0.7695,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.5037783375314862,
164
- "eval_accuracy": 0.3466796875,
165
- "eval_f1_macro": 0.32332588313582183,
166
- "eval_loss": 2.980027675628662,
167
- "eval_runtime": 12.101,
168
- "eval_samples_per_second": 84.621,
169
- "eval_steps_per_second": 5.289,
170
  "step": 200
171
  },
172
  {
173
  "epoch": 0.5289672544080605,
174
- "grad_norm": 799709.6875,
175
- "learning_rate": 0.00017660884163402352,
176
- "loss": 0.795,
177
  "step": 210
178
  },
179
  {
180
  "epoch": 0.5541561712846348,
181
- "grad_norm": 423194.71875,
182
- "learning_rate": 0.00017548964745383325,
183
- "loss": 0.7358,
184
  "step": 220
185
  },
186
  {
187
  "epoch": 0.5793450881612091,
188
- "grad_norm": 1420062.375,
189
- "learning_rate": 0.00017437045327364298,
190
- "loss": 0.7564,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 0.6045340050377834,
195
- "grad_norm": 315543.90625,
196
- "learning_rate": 0.00017325125909345272,
197
- "loss": 0.6731,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 0.6297229219143576,
202
- "grad_norm": 617961.875,
203
- "learning_rate": 0.00017213206491326245,
204
- "loss": 0.7648,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.654911838790932,
209
- "grad_norm": 848650.6875,
210
- "learning_rate": 0.0001710128707330722,
211
- "loss": 0.9092,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 0.6801007556675063,
216
- "grad_norm": 569121.125,
217
- "learning_rate": 0.00016989367655288192,
218
- "loss": 0.6555,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 0.7052896725440806,
223
- "grad_norm": 469067.625,
224
- "learning_rate": 0.00016877448237269166,
225
- "loss": 0.6177,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 0.7304785894206549,
230
- "grad_norm": 529740.25,
231
- "learning_rate": 0.0001676552881925014,
232
- "loss": 0.5253,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 0.7556675062972292,
237
- "grad_norm": 267644.59375,
238
- "learning_rate": 0.00016653609401231113,
239
- "loss": 0.4746,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 0.7556675062972292,
244
- "eval_accuracy": 0.41015625,
245
- "eval_f1_macro": 0.3715782586739612,
246
- "eval_loss": 2.728983163833618,
247
- "eval_runtime": 12.1019,
248
- "eval_samples_per_second": 84.615,
249
- "eval_steps_per_second": 5.288,
250
  "step": 300
251
  },
252
  {
253
  "epoch": 0.7808564231738035,
254
- "grad_norm": 543288.625,
255
- "learning_rate": 0.0001654168998321209,
256
- "loss": 0.6678,
257
  "step": 310
258
  },
259
  {
260
  "epoch": 0.8060453400503779,
261
- "grad_norm": 535119.4375,
262
- "learning_rate": 0.00016429770565193062,
263
- "loss": 0.7071,
264
  "step": 320
265
  },
266
  {
267
  "epoch": 0.8312342569269522,
268
- "grad_norm": 399674.84375,
269
- "learning_rate": 0.00016317851147174036,
270
- "loss": 0.7075,
271
  "step": 330
272
  },
273
  {
274
  "epoch": 0.8564231738035264,
275
- "grad_norm": 393063.6875,
276
- "learning_rate": 0.0001620593172915501,
277
- "loss": 0.5218,
278
  "step": 340
279
  },
280
  {
281
  "epoch": 0.8816120906801007,
282
- "grad_norm": 808876.125,
283
- "learning_rate": 0.00016094012311135983,
284
- "loss": 0.699,
285
  "step": 350
286
  },
287
  {
288
  "epoch": 0.906801007556675,
289
- "grad_norm": 611903.4375,
290
- "learning_rate": 0.00015982092893116956,
291
- "loss": 0.5328,
292
  "step": 360
293
  },
294
  {
295
  "epoch": 0.9319899244332494,
296
- "grad_norm": 841311.8125,
297
- "learning_rate": 0.0001587017347509793,
298
- "loss": 0.6082,
299
  "step": 370
300
  },
301
  {
302
  "epoch": 0.9571788413098237,
303
- "grad_norm": 506197.6875,
304
- "learning_rate": 0.00015758254057078906,
305
- "loss": 0.462,
306
  "step": 380
307
  },
308
  {
309
  "epoch": 0.982367758186398,
310
- "grad_norm": 400107.0,
311
- "learning_rate": 0.0001564633463905988,
312
- "loss": 0.6311,
313
  "step": 390
314
  },
315
  {
316
  "epoch": 1.0075566750629723,
317
- "grad_norm": 341860.5625,
318
- "learning_rate": 0.00015534415221040852,
319
- "loss": 0.4341,
320
  "step": 400
321
  },
322
  {
323
  "epoch": 1.0075566750629723,
324
- "eval_accuracy": 0.4443359375,
325
- "eval_f1_macro": 0.3756943082865848,
326
- "eval_loss": 2.714715003967285,
327
- "eval_runtime": 12.3938,
328
- "eval_samples_per_second": 82.622,
329
- "eval_steps_per_second": 5.164,
330
  "step": 400
331
  },
332
  {
333
  "epoch": 1.0327455919395465,
334
- "grad_norm": 416256.65625,
335
- "learning_rate": 0.00015422495803021826,
336
- "loss": 0.4053,
337
  "step": 410
338
  },
339
  {
340
  "epoch": 1.057934508816121,
341
- "grad_norm": 294220.53125,
342
- "learning_rate": 0.000153105763850028,
343
- "loss": 0.3937,
344
  "step": 420
345
  },
346
  {
347
  "epoch": 1.0831234256926952,
348
- "grad_norm": 612743.9375,
349
- "learning_rate": 0.00015198656966983773,
350
- "loss": 0.3495,
351
  "step": 430
352
  },
353
  {
354
  "epoch": 1.1083123425692696,
355
- "grad_norm": 650256.0,
356
- "learning_rate": 0.00015086737548964746,
357
- "loss": 0.3416,
358
  "step": 440
359
  },
360
  {
361
  "epoch": 1.1335012594458438,
362
- "grad_norm": 520329.25,
363
- "learning_rate": 0.0001497481813094572,
364
- "loss": 0.4317,
365
  "step": 450
366
  },
367
  {
368
  "epoch": 1.1586901763224182,
369
- "grad_norm": 1286625.625,
370
- "learning_rate": 0.00014862898712926693,
371
- "loss": 0.4651,
372
  "step": 460
373
  },
374
  {
375
  "epoch": 1.1838790931989924,
376
- "grad_norm": 628032.5625,
377
- "learning_rate": 0.00014750979294907667,
378
- "loss": 0.5113,
379
  "step": 470
380
  },
381
  {
382
  "epoch": 1.2090680100755669,
383
- "grad_norm": 214764.890625,
384
- "learning_rate": 0.0001463905987688864,
385
- "loss": 0.5569,
386
  "step": 480
387
  },
388
  {
389
  "epoch": 1.234256926952141,
390
- "grad_norm": 379592.4375,
391
- "learning_rate": 0.00014527140458869614,
392
- "loss": 0.5934,
393
  "step": 490
394
  },
395
  {
396
  "epoch": 1.2594458438287153,
397
- "grad_norm": 149916.71875,
398
- "learning_rate": 0.00014415221040850587,
399
- "loss": 0.4169,
400
  "step": 500
401
  },
402
  {
403
  "epoch": 1.2594458438287153,
404
- "eval_accuracy": 0.4716796875,
405
- "eval_f1_macro": 0.42709945905854985,
406
- "eval_loss": 2.783041000366211,
407
- "eval_runtime": 12.5706,
408
- "eval_samples_per_second": 81.46,
409
- "eval_steps_per_second": 5.091,
410
  "step": 500
411
  },
412
  {
413
  "epoch": 1.2846347607052897,
414
- "grad_norm": 89028.7265625,
415
- "learning_rate": 0.00014303301622831563,
416
- "loss": 0.4572,
417
  "step": 510
418
  },
419
  {
420
  "epoch": 1.309823677581864,
421
- "grad_norm": 1493811.125,
422
- "learning_rate": 0.00014191382204812537,
423
- "loss": 0.6389,
424
  "step": 520
425
  },
426
  {
427
  "epoch": 1.3350125944584383,
428
- "grad_norm": 703393.125,
429
- "learning_rate": 0.0001407946278679351,
430
- "loss": 0.5807,
431
  "step": 530
432
  },
433
  {
434
  "epoch": 1.3602015113350125,
435
- "grad_norm": 1209954.0,
436
- "learning_rate": 0.00013967543368774484,
437
- "loss": 0.4739,
438
  "step": 540
439
  },
440
  {
441
  "epoch": 1.385390428211587,
442
- "grad_norm": 364055.90625,
443
- "learning_rate": 0.00013855623950755457,
444
- "loss": 0.5227,
445
  "step": 550
446
  },
447
  {
448
  "epoch": 1.4105793450881612,
449
- "grad_norm": 674074.5,
450
- "learning_rate": 0.0001374370453273643,
451
- "loss": 0.4125,
452
  "step": 560
453
  },
454
  {
455
  "epoch": 1.4357682619647356,
456
- "grad_norm": 522579.625,
457
- "learning_rate": 0.00013631785114717404,
458
- "loss": 0.5498,
459
  "step": 570
460
  },
461
  {
462
  "epoch": 1.4609571788413098,
463
- "grad_norm": 492744.28125,
464
- "learning_rate": 0.00013519865696698377,
465
- "loss": 0.566,
466
  "step": 580
467
  },
468
  {
469
  "epoch": 1.486146095717884,
470
- "grad_norm": 1123395.625,
471
- "learning_rate": 0.0001340794627867935,
472
- "loss": 0.4968,
473
  "step": 590
474
  },
475
  {
476
  "epoch": 1.5113350125944585,
477
- "grad_norm": 240168.359375,
478
- "learning_rate": 0.00013296026860660324,
479
- "loss": 0.4753,
480
  "step": 600
481
  },
482
  {
483
  "epoch": 1.5113350125944585,
484
- "eval_accuracy": 0.453125,
485
- "eval_f1_macro": 0.42038287499285415,
486
- "eval_loss": 2.6776063442230225,
487
- "eval_runtime": 12.1269,
488
- "eval_samples_per_second": 84.441,
489
- "eval_steps_per_second": 5.278,
490
  "step": 600
491
  },
492
  {
493
  "epoch": 1.536523929471033,
494
- "grad_norm": 717263.0625,
495
- "learning_rate": 0.00013184107442641298,
496
- "loss": 0.5654,
497
  "step": 610
498
  },
499
  {
500
  "epoch": 1.561712846347607,
501
- "grad_norm": 394923.28125,
502
- "learning_rate": 0.0001307218802462227,
503
- "loss": 0.4827,
504
  "step": 620
505
  },
506
  {
507
  "epoch": 1.5869017632241813,
508
- "grad_norm": 652397.875,
509
- "learning_rate": 0.00012960268606603245,
510
- "loss": 0.4257,
511
  "step": 630
512
  },
513
  {
514
  "epoch": 1.6120906801007555,
515
- "grad_norm": 534140.0,
516
- "learning_rate": 0.00012848349188584218,
517
- "loss": 0.4803,
518
  "step": 640
519
  },
520
  {
521
  "epoch": 1.63727959697733,
522
- "grad_norm": 939360.3125,
523
- "learning_rate": 0.00012736429770565194,
524
- "loss": 0.3417,
525
  "step": 650
526
  },
527
  {
528
  "epoch": 1.6624685138539044,
529
- "grad_norm": 414164.28125,
530
- "learning_rate": 0.00012624510352546168,
531
- "loss": 0.4478,
532
  "step": 660
533
  },
534
  {
535
  "epoch": 1.6876574307304786,
536
- "grad_norm": 443836.5625,
537
- "learning_rate": 0.0001251259093452714,
538
- "loss": 0.3038,
539
  "step": 670
540
  },
541
  {
542
  "epoch": 1.7128463476070528,
543
- "grad_norm": 383613.28125,
544
- "learning_rate": 0.00012400671516508115,
545
- "loss": 0.3046,
546
  "step": 680
547
  },
548
  {
549
  "epoch": 1.7380352644836272,
550
- "grad_norm": 311289.09375,
551
- "learning_rate": 0.00012288752098489088,
552
- "loss": 0.5118,
553
  "step": 690
554
  },
555
  {
556
  "epoch": 1.7632241813602016,
557
- "grad_norm": 315275.5625,
558
- "learning_rate": 0.00012176832680470063,
559
- "loss": 0.39,
560
  "step": 700
561
  },
562
  {
563
  "epoch": 1.7632241813602016,
564
- "eval_accuracy": 0.44140625,
565
- "eval_f1_macro": 0.39664059012360764,
566
- "eval_loss": 3.111250877380371,
567
- "eval_runtime": 11.9797,
568
- "eval_samples_per_second": 85.478,
569
- "eval_steps_per_second": 5.342,
570
  "step": 700
571
  },
572
  {
573
  "epoch": 1.7884130982367759,
574
- "grad_norm": 492566.125,
575
- "learning_rate": 0.00012064913262451036,
576
- "loss": 0.5712,
577
  "step": 710
578
  },
579
  {
580
  "epoch": 1.81360201511335,
581
- "grad_norm": 705778.1875,
582
- "learning_rate": 0.0001195299384443201,
583
- "loss": 0.4067,
584
  "step": 720
585
  },
586
  {
587
  "epoch": 1.8387909319899243,
588
- "grad_norm": 557869.0,
589
- "learning_rate": 0.00011841074426412983,
590
- "loss": 0.365,
591
  "step": 730
592
  },
593
  {
594
  "epoch": 1.8639798488664987,
595
- "grad_norm": 457722.875,
596
- "learning_rate": 0.00011729155008393957,
597
- "loss": 0.4615,
598
  "step": 740
599
  },
600
  {
601
  "epoch": 1.8891687657430731,
602
- "grad_norm": 540578.625,
603
- "learning_rate": 0.00011617235590374931,
604
- "loss": 0.4853,
605
  "step": 750
606
  },
607
  {
608
  "epoch": 1.9143576826196473,
609
- "grad_norm": 326453.71875,
610
- "learning_rate": 0.00011505316172355905,
611
- "loss": 0.4549,
612
  "step": 760
613
  },
614
  {
615
  "epoch": 1.9395465994962215,
616
- "grad_norm": 440919.15625,
617
- "learning_rate": 0.00011393396754336878,
618
- "loss": 0.5766,
619
  "step": 770
620
  },
621
  {
622
  "epoch": 1.964735516372796,
623
- "grad_norm": 432765.0625,
624
- "learning_rate": 0.00011281477336317852,
625
- "loss": 0.3215,
626
  "step": 780
627
  },
628
  {
629
  "epoch": 1.9899244332493704,
630
- "grad_norm": 556068.75,
631
- "learning_rate": 0.00011169557918298825,
632
- "loss": 0.4189,
633
  "step": 790
634
  },
635
  {
636
  "epoch": 2.0151133501259446,
637
- "grad_norm": 625197.0625,
638
- "learning_rate": 0.00011057638500279799,
639
- "loss": 0.2894,
640
  "step": 800
641
  },
642
  {
643
  "epoch": 2.0151133501259446,
644
  "eval_accuracy": 0.484375,
645
- "eval_f1_macro": 0.43064344829980045,
646
- "eval_loss": 2.688737154006958,
647
- "eval_runtime": 12.7085,
648
- "eval_samples_per_second": 80.576,
649
- "eval_steps_per_second": 5.036,
650
  "step": 800
651
  },
652
  {
653
  "epoch": 2.040302267002519,
654
- "grad_norm": 161002.921875,
655
- "learning_rate": 0.00010945719082260772,
656
- "loss": 0.2777,
657
  "step": 810
658
  },
659
  {
660
  "epoch": 2.065491183879093,
661
- "grad_norm": 494643.84375,
662
- "learning_rate": 0.00010833799664241746,
663
- "loss": 0.1907,
664
  "step": 820
665
  },
666
  {
667
  "epoch": 2.0906801007556677,
668
- "grad_norm": 401399.3125,
669
- "learning_rate": 0.0001072188024622272,
670
- "loss": 0.2712,
671
  "step": 830
672
  },
673
  {
674
  "epoch": 2.115869017632242,
675
- "grad_norm": 377969.53125,
676
- "learning_rate": 0.00010609960828203694,
677
- "loss": 0.2549,
678
  "step": 840
679
  },
680
  {
681
  "epoch": 2.141057934508816,
682
- "grad_norm": 119198.296875,
683
- "learning_rate": 0.00010498041410184667,
684
- "loss": 0.2477,
685
  "step": 850
686
  },
687
  {
688
  "epoch": 2.1662468513853903,
689
- "grad_norm": 47976.99609375,
690
- "learning_rate": 0.00010386121992165641,
691
- "loss": 0.3046,
692
  "step": 860
693
  },
694
  {
695
  "epoch": 2.1914357682619645,
696
- "grad_norm": 219181.703125,
697
- "learning_rate": 0.00010274202574146614,
698
- "loss": 0.1819,
699
  "step": 870
700
  },
701
  {
702
  "epoch": 2.216624685138539,
703
- "grad_norm": 639062.5,
704
- "learning_rate": 0.00010162283156127588,
705
- "loss": 0.1144,
706
  "step": 880
707
  },
708
  {
709
  "epoch": 2.2418136020151134,
710
- "grad_norm": 194537.953125,
711
- "learning_rate": 0.00010050363738108561,
712
- "loss": 0.2128,
713
  "step": 890
714
  },
715
  {
716
  "epoch": 2.2670025188916876,
717
- "grad_norm": 453326.8125,
718
- "learning_rate": 9.938444320089536e-05,
719
- "loss": 0.1688,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 2.2670025188916876,
724
- "eval_accuracy": 0.484375,
725
- "eval_f1_macro": 0.44583362196028875,
726
- "eval_loss": 2.7481908798217773,
727
- "eval_runtime": 12.286,
728
- "eval_samples_per_second": 83.347,
729
- "eval_steps_per_second": 5.209,
730
  "step": 900
731
  },
732
  {
733
  "epoch": 2.292191435768262,
734
- "grad_norm": 84822.6328125,
735
- "learning_rate": 9.82652490207051e-05,
736
- "loss": 0.2347,
737
  "step": 910
738
  },
739
  {
740
  "epoch": 2.3173803526448364,
741
- "grad_norm": 195546.609375,
742
- "learning_rate": 9.714605484051483e-05,
743
- "loss": 0.2993,
744
  "step": 920
745
  },
746
  {
747
  "epoch": 2.3425692695214106,
748
- "grad_norm": 438621.5625,
749
- "learning_rate": 9.602686066032456e-05,
750
- "loss": 0.226,
751
  "step": 930
752
  },
753
  {
754
  "epoch": 2.367758186397985,
755
- "grad_norm": 780395.5625,
756
- "learning_rate": 9.490766648013431e-05,
757
- "loss": 0.2522,
758
  "step": 940
759
  },
760
  {
761
  "epoch": 2.392947103274559,
762
- "grad_norm": 490075.65625,
763
- "learning_rate": 9.378847229994404e-05,
764
- "loss": 0.1968,
765
  "step": 950
766
  },
767
  {
768
  "epoch": 2.4181360201511337,
769
- "grad_norm": 272662.71875,
770
- "learning_rate": 9.266927811975378e-05,
771
- "loss": 0.2001,
772
  "step": 960
773
  },
774
  {
775
  "epoch": 2.443324937027708,
776
- "grad_norm": 141303.65625,
777
- "learning_rate": 9.155008393956351e-05,
778
- "loss": 0.2019,
779
  "step": 970
780
  },
781
  {
782
  "epoch": 2.468513853904282,
783
- "grad_norm": 527202.1875,
784
- "learning_rate": 9.043088975937326e-05,
785
- "loss": 0.2352,
786
  "step": 980
787
  },
788
  {
789
  "epoch": 2.4937027707808563,
790
- "grad_norm": 447107.84375,
791
- "learning_rate": 8.9311695579183e-05,
792
- "loss": 0.3576,
793
  "step": 990
794
  },
795
  {
796
  "epoch": 2.5188916876574305,
797
- "grad_norm": 566260.5,
798
- "learning_rate": 8.819250139899273e-05,
799
- "loss": 0.2888,
800
  "step": 1000
801
  },
802
  {
803
  "epoch": 2.5188916876574305,
804
- "eval_accuracy": 0.486328125,
805
- "eval_f1_macro": 0.4256721212133858,
806
- "eval_loss": 2.939445734024048,
807
- "eval_runtime": 12.4275,
808
- "eval_samples_per_second": 82.398,
809
- "eval_steps_per_second": 5.15,
810
  "step": 1000
811
  },
812
  {
813
  "epoch": 2.544080604534005,
814
- "grad_norm": 608117.25,
815
- "learning_rate": 8.707330721880247e-05,
816
- "loss": 0.3085,
817
  "step": 1010
818
  },
819
  {
820
  "epoch": 2.5692695214105794,
821
- "grad_norm": 326933.9375,
822
- "learning_rate": 8.59541130386122e-05,
823
- "loss": 0.1651,
824
  "step": 1020
825
  },
826
  {
827
  "epoch": 2.5944584382871536,
828
- "grad_norm": 189686.5,
829
- "learning_rate": 8.483491885842193e-05,
830
- "loss": 0.0999,
831
  "step": 1030
832
  },
833
  {
834
  "epoch": 2.619647355163728,
835
- "grad_norm": 306902.0625,
836
- "learning_rate": 8.371572467823168e-05,
837
- "loss": 0.1963,
838
  "step": 1040
839
  },
840
  {
841
  "epoch": 2.644836272040302,
842
- "grad_norm": 329261.125,
843
- "learning_rate": 8.259653049804142e-05,
844
- "loss": 0.169,
845
  "step": 1050
846
  },
847
  {
848
  "epoch": 2.6700251889168767,
849
- "grad_norm": 42989.2734375,
850
- "learning_rate": 8.147733631785115e-05,
851
- "loss": 0.1491,
852
  "step": 1060
853
  },
854
  {
855
  "epoch": 2.695214105793451,
856
- "grad_norm": 76116.8515625,
857
- "learning_rate": 8.035814213766089e-05,
858
- "loss": 0.2026,
859
  "step": 1070
860
  },
861
  {
862
  "epoch": 2.720403022670025,
863
- "grad_norm": 498388.03125,
864
- "learning_rate": 7.923894795747062e-05,
865
- "loss": 0.1291,
866
  "step": 1080
867
  },
868
  {
869
  "epoch": 2.7455919395465997,
870
- "grad_norm": 468509.4375,
871
- "learning_rate": 7.811975377728036e-05,
872
- "loss": 0.2243,
873
  "step": 1090
874
  },
875
  {
876
  "epoch": 2.770780856423174,
877
- "grad_norm": 650361.5625,
878
- "learning_rate": 7.700055959709009e-05,
879
- "loss": 0.324,
880
  "step": 1100
881
  },
882
  {
883
  "epoch": 2.770780856423174,
884
- "eval_accuracy": 0.5048828125,
885
- "eval_f1_macro": 0.4221348831502289,
886
- "eval_loss": 2.9311883449554443,
887
- "eval_runtime": 12.5375,
888
- "eval_samples_per_second": 81.675,
889
- "eval_steps_per_second": 5.105,
890
  "step": 1100
891
  },
892
  {
893
  "epoch": 2.795969773299748,
894
- "grad_norm": 26658.416015625,
895
- "learning_rate": 7.588136541689984e-05,
896
- "loss": 0.0959,
897
  "step": 1110
898
  },
899
  {
900
  "epoch": 2.8211586901763224,
901
- "grad_norm": 1624137.0,
902
- "learning_rate": 7.476217123670957e-05,
903
- "loss": 0.2074,
904
  "step": 1120
905
  },
906
  {
907
  "epoch": 2.8463476070528966,
908
- "grad_norm": 103690.328125,
909
- "learning_rate": 7.36429770565193e-05,
910
- "loss": 0.2154,
911
  "step": 1130
912
  },
913
  {
914
  "epoch": 2.8715365239294712,
915
- "grad_norm": 514053.46875,
916
- "learning_rate": 7.252378287632905e-05,
917
- "loss": 0.2469,
918
  "step": 1140
919
  },
920
  {
921
  "epoch": 2.8967254408060454,
922
- "grad_norm": 357004.5,
923
- "learning_rate": 7.140458869613879e-05,
924
- "loss": 0.1763,
925
  "step": 1150
926
  },
927
  {
928
  "epoch": 2.9219143576826196,
929
- "grad_norm": 586420.9375,
930
- "learning_rate": 7.028539451594852e-05,
931
- "loss": 0.2131,
932
  "step": 1160
933
  },
934
  {
935
  "epoch": 2.947103274559194,
936
- "grad_norm": 480120.375,
937
- "learning_rate": 6.916620033575826e-05,
938
- "loss": 0.2259,
939
  "step": 1170
940
  },
941
  {
942
  "epoch": 2.972292191435768,
943
- "grad_norm": 400242.75,
944
- "learning_rate": 6.804700615556799e-05,
945
- "loss": 0.1008,
946
  "step": 1180
947
  },
948
  {
949
  "epoch": 2.9974811083123427,
950
- "grad_norm": 218580.453125,
951
- "learning_rate": 6.692781197537773e-05,
952
- "loss": 0.1216,
953
  "step": 1190
954
  },
955
  {
956
  "epoch": 3.022670025188917,
957
- "grad_norm": 361961.6875,
958
- "learning_rate": 6.580861779518746e-05,
959
- "loss": 0.1116,
960
  "step": 1200
961
  },
962
  {
963
  "epoch": 3.022670025188917,
964
- "eval_accuracy": 0.490234375,
965
- "eval_f1_macro": 0.4544818016574019,
966
- "eval_loss": 3.1089882850646973,
967
- "eval_runtime": 12.359,
968
- "eval_samples_per_second": 82.854,
969
- "eval_steps_per_second": 5.178,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 3.047858942065491,
974
- "grad_norm": 135584.640625,
975
- "learning_rate": 6.46894236149972e-05,
976
- "loss": 0.0556,
977
  "step": 1210
978
  },
979
  {
980
  "epoch": 3.0730478589420653,
981
- "grad_norm": 3213.19189453125,
982
- "learning_rate": 6.357022943480694e-05,
983
- "loss": 0.0488,
984
  "step": 1220
985
  },
986
  {
987
  "epoch": 3.09823677581864,
988
- "grad_norm": 62310.16015625,
989
- "learning_rate": 6.245103525461668e-05,
990
- "loss": 0.0927,
991
  "step": 1230
992
  },
993
  {
994
  "epoch": 3.123425692695214,
995
- "grad_norm": 53239.93359375,
996
- "learning_rate": 6.133184107442641e-05,
997
- "loss": 0.1118,
998
  "step": 1240
999
  },
1000
  {
1001
  "epoch": 3.1486146095717884,
1002
- "grad_norm": 9139.4677734375,
1003
- "learning_rate": 6.0212646894236155e-05,
1004
- "loss": 0.1502,
1005
  "step": 1250
1006
  },
1007
  {
1008
  "epoch": 3.1738035264483626,
1009
- "grad_norm": 41054.84375,
1010
- "learning_rate": 5.9093452714045896e-05,
1011
- "loss": 0.0324,
1012
  "step": 1260
1013
  },
1014
  {
1015
  "epoch": 3.1989924433249373,
1016
- "grad_norm": 677185.0625,
1017
- "learning_rate": 5.797425853385563e-05,
1018
- "loss": 0.0757,
1019
  "step": 1270
1020
  },
1021
  {
1022
  "epoch": 3.2241813602015115,
1023
- "grad_norm": 82874.671875,
1024
- "learning_rate": 5.6855064353665365e-05,
1025
- "loss": 0.1237,
1026
  "step": 1280
1027
  },
1028
  {
1029
  "epoch": 3.2493702770780857,
1030
- "grad_norm": 125875.4296875,
1031
- "learning_rate": 5.57358701734751e-05,
1032
- "loss": 0.0216,
1033
  "step": 1290
1034
  },
1035
  {
1036
  "epoch": 3.27455919395466,
1037
- "grad_norm": 9467.65234375,
1038
- "learning_rate": 5.461667599328484e-05,
1039
- "loss": 0.038,
1040
  "step": 1300
1041
  },
1042
  {
1043
  "epoch": 3.27455919395466,
1044
- "eval_accuracy": 0.482421875,
1045
- "eval_f1_macro": 0.43847535887927364,
1046
- "eval_loss": 3.4505765438079834,
1047
- "eval_runtime": 12.1113,
1048
- "eval_samples_per_second": 84.549,
1049
- "eval_steps_per_second": 5.284,
1050
  "step": 1300
1051
  },
1052
  {
1053
  "epoch": 3.299748110831234,
1054
- "grad_norm": 60840.53515625,
1055
- "learning_rate": 5.3497481813094575e-05,
1056
- "loss": 0.0469,
1057
  "step": 1310
1058
  },
1059
  {
1060
  "epoch": 3.3249370277078087,
1061
- "grad_norm": 547180.5625,
1062
- "learning_rate": 5.237828763290431e-05,
1063
- "loss": 0.0693,
1064
  "step": 1320
1065
  },
1066
  {
1067
  "epoch": 3.350125944584383,
1068
- "grad_norm": 83035.671875,
1069
- "learning_rate": 5.1259093452714044e-05,
1070
- "loss": 0.0886,
1071
  "step": 1330
1072
  },
1073
  {
1074
  "epoch": 3.375314861460957,
1075
- "grad_norm": 16501.109375,
1076
- "learning_rate": 5.0139899272523786e-05,
1077
- "loss": 0.0809,
1078
  "step": 1340
1079
  },
1080
  {
1081
  "epoch": 3.4005037783375314,
1082
- "grad_norm": 442814.75,
1083
- "learning_rate": 4.902070509233353e-05,
1084
- "loss": 0.0672,
1085
  "step": 1350
1086
  },
1087
  {
1088
  "epoch": 3.4256926952141056,
1089
- "grad_norm": 7077.60986328125,
1090
- "learning_rate": 4.790151091214326e-05,
1091
- "loss": 0.0529,
1092
  "step": 1360
1093
  },
1094
  {
1095
  "epoch": 3.4508816120906802,
1096
- "grad_norm": 5218.708984375,
1097
- "learning_rate": 4.6782316731952996e-05,
1098
- "loss": 0.0875,
1099
  "step": 1370
1100
  },
1101
  {
1102
  "epoch": 3.4760705289672544,
1103
- "grad_norm": 84720.59375,
1104
- "learning_rate": 4.566312255176273e-05,
1105
- "loss": 0.1393,
1106
  "step": 1380
1107
  },
1108
  {
1109
  "epoch": 3.5012594458438286,
1110
- "grad_norm": 105196.2890625,
1111
- "learning_rate": 4.454392837157247e-05,
1112
- "loss": 0.0471,
1113
  "step": 1390
1114
  },
1115
  {
1116
  "epoch": 3.5264483627204033,
1117
- "grad_norm": 1474484.625,
1118
- "learning_rate": 4.3424734191382206e-05,
1119
- "loss": 0.1514,
1120
  "step": 1400
1121
  },
1122
  {
1123
  "epoch": 3.5264483627204033,
1124
- "eval_accuracy": 0.4658203125,
1125
- "eval_f1_macro": 0.42776168656822045,
1126
- "eval_loss": 3.7711305618286133,
1127
- "eval_runtime": 12.5329,
1128
- "eval_samples_per_second": 81.705,
1129
- "eval_steps_per_second": 5.107,
1130
  "step": 1400
1131
  },
1132
  {
1133
  "epoch": 3.551637279596977,
1134
- "grad_norm": 1280.7615966796875,
1135
- "learning_rate": 4.230554001119195e-05,
1136
- "loss": 0.0247,
1137
  "step": 1410
1138
  },
1139
  {
1140
  "epoch": 3.5768261964735517,
1141
- "grad_norm": 147165.875,
1142
- "learning_rate": 4.118634583100168e-05,
1143
- "loss": 0.0793,
1144
  "step": 1420
1145
  },
1146
  {
1147
  "epoch": 3.602015113350126,
1148
- "grad_norm": 602996.625,
1149
- "learning_rate": 4.0067151650811416e-05,
1150
- "loss": 0.0822,
1151
  "step": 1430
1152
  },
1153
  {
1154
  "epoch": 3.6272040302267,
1155
- "grad_norm": 238983.75,
1156
- "learning_rate": 3.894795747062116e-05,
1157
- "loss": 0.1207,
1158
  "step": 1440
1159
  },
1160
  {
1161
  "epoch": 3.652392947103275,
1162
- "grad_norm": 6159.01025390625,
1163
- "learning_rate": 3.782876329043089e-05,
1164
- "loss": 0.1041,
1165
  "step": 1450
1166
  },
1167
  {
1168
  "epoch": 3.677581863979849,
1169
- "grad_norm": 28944.771484375,
1170
- "learning_rate": 3.670956911024063e-05,
1171
- "loss": 0.0233,
1172
  "step": 1460
1173
  },
1174
  {
1175
  "epoch": 3.702770780856423,
1176
- "grad_norm": 10967.2275390625,
1177
- "learning_rate": 3.559037493005036e-05,
1178
- "loss": 0.0122,
1179
  "step": 1470
1180
  },
1181
  {
1182
  "epoch": 3.7279596977329974,
1183
- "grad_norm": 13209.2763671875,
1184
- "learning_rate": 3.44711807498601e-05,
1185
- "loss": 0.067,
1186
  "step": 1480
1187
  },
1188
  {
1189
  "epoch": 3.7531486146095716,
1190
- "grad_norm": 834.6673583984375,
1191
- "learning_rate": 3.3351986569669844e-05,
1192
- "loss": 0.062,
1193
  "step": 1490
1194
  },
1195
  {
1196
  "epoch": 3.7783375314861463,
1197
- "grad_norm": 17447.7265625,
1198
- "learning_rate": 3.223279238947958e-05,
1199
- "loss": 0.0463,
1200
  "step": 1500
1201
  },
1202
  {
1203
  "epoch": 3.7783375314861463,
1204
- "eval_accuracy": 0.5087890625,
1205
- "eval_f1_macro": 0.45826871981986883,
1206
- "eval_loss": 3.459321975708008,
1207
- "eval_runtime": 12.1286,
1208
- "eval_samples_per_second": 84.429,
1209
- "eval_steps_per_second": 5.277,
1210
  "step": 1500
1211
  },
1212
  {
1213
  "epoch": 3.8035264483627205,
1214
- "grad_norm": 340515.1875,
1215
- "learning_rate": 3.111359820928931e-05,
1216
- "loss": 0.0475,
1217
  "step": 1510
1218
  },
1219
  {
1220
  "epoch": 3.8287153652392947,
1221
- "grad_norm": 2723.62890625,
1222
- "learning_rate": 2.999440402909905e-05,
1223
- "loss": 0.041,
1224
  "step": 1520
1225
  },
1226
  {
1227
  "epoch": 3.853904282115869,
1228
- "grad_norm": 329.2898254394531,
1229
- "learning_rate": 2.8875209848908785e-05,
1230
- "loss": 0.0553,
1231
  "step": 1530
1232
  },
1233
  {
1234
  "epoch": 3.879093198992443,
1235
- "grad_norm": 77856.5625,
1236
- "learning_rate": 2.775601566871852e-05,
1237
- "loss": 0.0526,
1238
  "step": 1540
1239
  },
1240
  {
1241
  "epoch": 3.9042821158690177,
1242
- "grad_norm": 5539.43408203125,
1243
- "learning_rate": 2.6636821488528264e-05,
1244
- "loss": 0.0466,
1245
  "step": 1550
1246
  },
1247
  {
1248
  "epoch": 3.929471032745592,
1249
- "grad_norm": 59965.44140625,
1250
- "learning_rate": 2.5517627308338e-05,
1251
- "loss": 0.0963,
1252
  "step": 1560
1253
  },
1254
  {
1255
  "epoch": 3.954659949622166,
1256
- "grad_norm": 1198.622314453125,
1257
- "learning_rate": 2.4398433128147733e-05,
1258
- "loss": 0.1066,
1259
  "step": 1570
1260
  },
1261
  {
1262
  "epoch": 3.979848866498741,
1263
- "grad_norm": 7364.1904296875,
1264
- "learning_rate": 2.327923894795747e-05,
1265
- "loss": 0.1056,
1266
  "step": 1580
1267
  },
1268
  {
1269
  "epoch": 4.005037783375315,
1270
- "grad_norm": 52357.25390625,
1271
- "learning_rate": 2.2160044767767206e-05,
1272
- "loss": 0.072,
1273
  "step": 1590
1274
  },
1275
  {
1276
  "epoch": 4.030226700251889,
1277
- "grad_norm": 14521.083984375,
1278
- "learning_rate": 2.1040850587576947e-05,
1279
- "loss": 0.0032,
1280
  "step": 1600
1281
  },
1282
  {
1283
  "epoch": 4.030226700251889,
1284
- "eval_accuracy": 0.515625,
1285
- "eval_f1_macro": 0.4618955788683866,
1286
- "eval_loss": 3.532505512237549,
1287
- "eval_runtime": 12.192,
1288
- "eval_samples_per_second": 83.99,
1289
- "eval_steps_per_second": 5.249,
1290
  "step": 1600
1291
  },
1292
  {
1293
  "epoch": 4.055415617128464,
1294
- "grad_norm": 65696.1171875,
1295
- "learning_rate": 1.9921656407386682e-05,
1296
- "loss": 0.0244,
1297
  "step": 1610
1298
  },
1299
  {
1300
  "epoch": 4.080604534005038,
1301
- "grad_norm": 3767.174560546875,
1302
- "learning_rate": 1.880246222719642e-05,
1303
- "loss": 0.0062,
1304
  "step": 1620
1305
  },
1306
  {
1307
  "epoch": 4.105793450881612,
1308
- "grad_norm": 1467.1162109375,
1309
- "learning_rate": 1.7683268047006157e-05,
1310
- "loss": 0.0117,
1311
  "step": 1630
1312
  },
1313
  {
1314
  "epoch": 4.130982367758186,
1315
- "grad_norm": 262.5705261230469,
1316
- "learning_rate": 1.6564073866815892e-05,
1317
- "loss": 0.0306,
1318
  "step": 1640
1319
  },
1320
  {
1321
  "epoch": 4.156171284634761,
1322
- "grad_norm": 436.18170166015625,
1323
- "learning_rate": 1.544487968662563e-05,
1324
- "loss": 0.0025,
1325
  "step": 1650
1326
  },
1327
  {
1328
  "epoch": 4.181360201511335,
1329
- "grad_norm": 5583.3583984375,
1330
- "learning_rate": 1.4325685506435368e-05,
1331
- "loss": 0.0065,
1332
  "step": 1660
1333
  },
1334
  {
1335
  "epoch": 4.206549118387909,
1336
- "grad_norm": 551.452880859375,
1337
- "learning_rate": 1.3206491326245104e-05,
1338
- "loss": 0.0014,
1339
  "step": 1670
1340
  },
1341
  {
1342
  "epoch": 4.231738035264484,
1343
- "grad_norm": 224.07012939453125,
1344
- "learning_rate": 1.208729714605484e-05,
1345
- "loss": 0.027,
1346
  "step": 1680
1347
  },
1348
  {
1349
  "epoch": 4.2569269521410575,
1350
- "grad_norm": 9847.1767578125,
1351
- "learning_rate": 1.0968102965864578e-05,
1352
- "loss": 0.0007,
1353
  "step": 1690
1354
  },
1355
  {
1356
  "epoch": 4.282115869017632,
1357
- "grad_norm": 292.0694274902344,
1358
- "learning_rate": 9.848908785674316e-06,
1359
- "loss": 0.0033,
1360
  "step": 1700
1361
  },
1362
  {
1363
  "epoch": 4.282115869017632,
1364
- "eval_accuracy": 0.544921875,
1365
- "eval_f1_macro": 0.5009764520106917,
1366
- "eval_loss": 3.5591835975646973,
1367
- "eval_runtime": 12.5398,
1368
- "eval_samples_per_second": 81.66,
1369
- "eval_steps_per_second": 5.104,
1370
  "step": 1700
1371
  },
1372
  {
1373
  "epoch": 4.307304785894207,
1374
- "grad_norm": 24245.0234375,
1375
- "learning_rate": 8.729714605484052e-06,
1376
- "loss": 0.0227,
1377
  "step": 1710
1378
  },
1379
  {
1380
  "epoch": 4.332493702770781,
1381
- "grad_norm": 1062.8597412109375,
1382
- "learning_rate": 7.610520425293789e-06,
1383
- "loss": 0.0353,
1384
  "step": 1720
1385
  },
1386
  {
1387
  "epoch": 4.357682619647355,
1388
- "grad_norm": 2904.12060546875,
1389
- "learning_rate": 6.4913262451035254e-06,
1390
- "loss": 0.0411,
1391
  "step": 1730
1392
  },
1393
  {
1394
  "epoch": 4.382871536523929,
1395
- "grad_norm": 373621.8125,
1396
- "learning_rate": 5.3721320649132625e-06,
1397
- "loss": 0.0484,
1398
  "step": 1740
1399
  },
1400
  {
1401
  "epoch": 4.408060453400504,
1402
- "grad_norm": 220.9502410888672,
1403
- "learning_rate": 4.2529378847229995e-06,
1404
- "loss": 0.0209,
1405
  "step": 1750
1406
  },
1407
  {
1408
  "epoch": 4.433249370277078,
1409
- "grad_norm": 205.74693298339844,
1410
- "learning_rate": 3.1337437045327366e-06,
1411
- "loss": 0.0033,
1412
  "step": 1760
1413
  },
1414
  {
1415
  "epoch": 4.458438287153652,
1416
- "grad_norm": 1567.0152587890625,
1417
- "learning_rate": 2.0145495243424736e-06,
1418
- "loss": 0.0024,
1419
  "step": 1770
1420
  },
1421
  {
1422
  "epoch": 4.483627204030227,
1423
- "grad_norm": 1867.4505615234375,
1424
- "learning_rate": 8.953553441522105e-07,
1425
- "loss": 0.0005,
1426
  "step": 1780
1427
  },
1428
  {
1429
- "epoch": 4.501259445843829,
1430
- "step": 1787,
1431
- "total_flos": 1.645322107388756e+18,
1432
- "train_loss": 0.348135751808895,
1433
- "train_runtime": 1025.0873,
1434
- "train_samples_per_second": 27.876,
1435
- "train_steps_per_second": 1.743
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1436
  }
1437
  ],
1438
  "logging_steps": 10,
1439
- "max_steps": 1787,
1440
  "num_input_tokens_seen": 0,
1441
- "num_train_epochs": 5,
1442
  "save_steps": 100,
1443
  "stateful_callbacks": {
1444
  "TrainerControl": {
@@ -1452,7 +2245,7 @@
1452
  "attributes": {}
1453
  }
1454
  },
1455
- "total_flos": 1.645322107388756e+18,
1456
  "train_batch_size": 16,
1457
  "trial_name": null,
1458
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1100,
3
+ "best_metric": 2.467733144760132,
4
+ "best_model_checkpoint": "./swin-ena24/checkpoint-1100",
5
+ "epoch": 7.0,
6
  "eval_steps": 100,
7
+ "global_step": 2779,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.02518891687657431,
14
+ "grad_norm": 254123.9375,
15
+ "learning_rate": 0.00019935228499460238,
16
+ "loss": 2.9691,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.05037783375314862,
21
+ "grad_norm": 426010.46875,
22
+ "learning_rate": 0.00019863260165527168,
23
+ "loss": 2.8919,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.07556675062972293,
28
+ "grad_norm": 438250.125,
29
+ "learning_rate": 0.000197912918315941,
30
+ "loss": 2.7195,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.10075566750629723,
35
+ "grad_norm": 643403.375,
36
+ "learning_rate": 0.00019719323497661032,
37
+ "loss": 2.567,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.12594458438287154,
42
+ "grad_norm": 416875.5,
43
+ "learning_rate": 0.0001964735516372796,
44
+ "loss": 2.6196,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.15113350125944586,
49
+ "grad_norm": 412576.78125,
50
+ "learning_rate": 0.0001957538682979489,
51
+ "loss": 2.4359,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.17632241813602015,
56
+ "grad_norm": 371569.9375,
57
+ "learning_rate": 0.00019503418495861824,
58
+ "loss": 2.4815,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.20151133501259447,
63
+ "grad_norm": 297408.5625,
64
+ "learning_rate": 0.0001943145016192875,
65
+ "loss": 2.508,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.22670025188916876,
70
+ "grad_norm": 405330.71875,
71
+ "learning_rate": 0.00019359481827995682,
72
+ "loss": 2.385,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.2518891687657431,
77
+ "grad_norm": 578770.75,
78
+ "learning_rate": 0.00019287513494062612,
79
+ "loss": 1.9888,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.2518891687657431,
84
+ "eval_accuracy": 0.1630859375,
85
+ "eval_f1_macro": 0.08928735942932195,
86
+ "eval_loss": 3.412175178527832,
87
+ "eval_runtime": 11.0442,
88
+ "eval_samples_per_second": 92.718,
89
+ "eval_steps_per_second": 5.795,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.2770780856423174,
94
+ "grad_norm": 510874.125,
95
+ "learning_rate": 0.00019215545160129545,
96
+ "loss": 2.188,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.3022670025188917,
101
+ "grad_norm": 513859.9375,
102
+ "learning_rate": 0.00019143576826196473,
103
+ "loss": 2.1743,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.327455919395466,
108
+ "grad_norm": 527887.5,
109
+ "learning_rate": 0.00019071608492263404,
110
+ "loss": 2.2732,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.3526448362720403,
115
+ "grad_norm": 460399.375,
116
+ "learning_rate": 0.00018999640158330337,
117
+ "loss": 2.1564,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.3778337531486146,
122
+ "grad_norm": 569056.8125,
123
+ "learning_rate": 0.00018927671824397267,
124
+ "loss": 1.8225,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.40302267002518893,
129
+ "grad_norm": 909920.6875,
130
+ "learning_rate": 0.00018855703490464195,
131
+ "loss": 1.771,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.4282115869017632,
136
+ "grad_norm": 494884.5,
137
+ "learning_rate": 0.00018783735156531128,
138
+ "loss": 1.8349,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.4534005037783375,
143
+ "grad_norm": 550597.375,
144
+ "learning_rate": 0.0001871176682259806,
145
+ "loss": 1.5891,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.47858942065491183,
150
+ "grad_norm": 462934.75,
151
+ "learning_rate": 0.00018639798488664987,
152
+ "loss": 1.8926,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.5037783375314862,
157
+ "grad_norm": 724803.5,
158
+ "learning_rate": 0.00018567830154731917,
159
+ "loss": 1.6111,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.5037783375314862,
164
+ "eval_accuracy": 0.2578125,
165
+ "eval_f1_macro": 0.15353256242114557,
166
+ "eval_loss": 2.9077086448669434,
167
+ "eval_runtime": 11.368,
168
+ "eval_samples_per_second": 90.078,
169
+ "eval_steps_per_second": 5.63,
170
  "step": 200
171
  },
172
  {
173
  "epoch": 0.5289672544080605,
174
+ "grad_norm": 559555.8125,
175
+ "learning_rate": 0.0001849586182079885,
176
+ "loss": 1.6636,
177
  "step": 210
178
  },
179
  {
180
  "epoch": 0.5541561712846348,
181
+ "grad_norm": 1021320.3125,
182
+ "learning_rate": 0.0001842389348686578,
183
+ "loss": 1.462,
184
  "step": 220
185
  },
186
  {
187
  "epoch": 0.5793450881612091,
188
+ "grad_norm": 692250.375,
189
+ "learning_rate": 0.0001835192515293271,
190
+ "loss": 1.5007,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 0.6045340050377834,
195
+ "grad_norm": 774498.1875,
196
+ "learning_rate": 0.00018279956818999642,
197
+ "loss": 1.5122,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 0.6297229219143576,
202
+ "grad_norm": 497269.53125,
203
+ "learning_rate": 0.00018207988485066572,
204
+ "loss": 1.3379,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.654911838790932,
209
+ "grad_norm": 786711.4375,
210
+ "learning_rate": 0.00018136020151133503,
211
+ "loss": 1.5591,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 0.6801007556675063,
216
+ "grad_norm": 561642.375,
217
+ "learning_rate": 0.00018064051817200433,
218
+ "loss": 1.3499,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 0.7052896725440806,
223
+ "grad_norm": 372517.21875,
224
+ "learning_rate": 0.00017992083483267364,
225
+ "loss": 1.3296,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 0.7304785894206549,
230
+ "grad_norm": 783448.4375,
231
+ "learning_rate": 0.00017920115149334294,
232
+ "loss": 1.2106,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 0.7556675062972292,
237
+ "grad_norm": 748525.0,
238
+ "learning_rate": 0.00017848146815401222,
239
+ "loss": 1.1276,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 0.7556675062972292,
244
+ "eval_accuracy": 0.357421875,
245
+ "eval_f1_macro": 0.28272669387224364,
246
+ "eval_loss": 2.6503801345825195,
247
+ "eval_runtime": 11.5522,
248
+ "eval_samples_per_second": 88.641,
249
+ "eval_steps_per_second": 5.54,
250
  "step": 300
251
  },
252
  {
253
  "epoch": 0.7808564231738035,
254
+ "grad_norm": 481374.1875,
255
+ "learning_rate": 0.00017776178481468155,
256
+ "loss": 1.1727,
257
  "step": 310
258
  },
259
  {
260
  "epoch": 0.8060453400503779,
261
+ "grad_norm": 1038731.1875,
262
+ "learning_rate": 0.00017704210147535086,
263
+ "loss": 1.1783,
264
  "step": 320
265
  },
266
  {
267
  "epoch": 0.8312342569269522,
268
+ "grad_norm": 369614.8125,
269
+ "learning_rate": 0.00017632241813602016,
270
+ "loss": 1.4703,
271
  "step": 330
272
  },
273
  {
274
  "epoch": 0.8564231738035264,
275
+ "grad_norm": 411708.9375,
276
+ "learning_rate": 0.00017560273479668947,
277
+ "loss": 1.0237,
278
  "step": 340
279
  },
280
  {
281
  "epoch": 0.8816120906801007,
282
+ "grad_norm": 777554.375,
283
+ "learning_rate": 0.00017488305145735877,
284
+ "loss": 1.3113,
285
  "step": 350
286
  },
287
  {
288
  "epoch": 0.906801007556675,
289
+ "grad_norm": 589113.125,
290
+ "learning_rate": 0.00017416336811802808,
291
+ "loss": 1.4282,
292
  "step": 360
293
  },
294
  {
295
  "epoch": 0.9319899244332494,
296
+ "grad_norm": 537991.9375,
297
+ "learning_rate": 0.00017344368477869738,
298
+ "loss": 1.0802,
299
  "step": 370
300
  },
301
  {
302
  "epoch": 0.9571788413098237,
303
+ "grad_norm": 745339.875,
304
+ "learning_rate": 0.0001727240014393667,
305
+ "loss": 1.1078,
306
  "step": 380
307
  },
308
  {
309
  "epoch": 0.982367758186398,
310
+ "grad_norm": 515597.875,
311
+ "learning_rate": 0.000172004318100036,
312
+ "loss": 1.3261,
313
  "step": 390
314
  },
315
  {
316
  "epoch": 1.0075566750629723,
317
+ "grad_norm": 438800.96875,
318
+ "learning_rate": 0.0001712846347607053,
319
+ "loss": 1.0234,
320
  "step": 400
321
  },
322
  {
323
  "epoch": 1.0075566750629723,
324
+ "eval_accuracy": 0.390625,
325
+ "eval_f1_macro": 0.31557618159267287,
326
+ "eval_loss": 2.572810173034668,
327
+ "eval_runtime": 11.7819,
328
+ "eval_samples_per_second": 86.913,
329
+ "eval_steps_per_second": 5.432,
330
  "step": 400
331
  },
332
  {
333
  "epoch": 1.0327455919395465,
334
+ "grad_norm": 360076.0,
335
+ "learning_rate": 0.0001705649514213746,
336
+ "loss": 1.0415,
337
  "step": 410
338
  },
339
  {
340
  "epoch": 1.057934508816121,
341
+ "grad_norm": 468955.34375,
342
+ "learning_rate": 0.0001698452680820439,
343
+ "loss": 0.8579,
344
  "step": 420
345
  },
346
  {
347
  "epoch": 1.0831234256926952,
348
+ "grad_norm": 765571.4375,
349
+ "learning_rate": 0.0001691255847427132,
350
+ "loss": 0.9408,
351
  "step": 430
352
  },
353
  {
354
  "epoch": 1.1083123425692696,
355
+ "grad_norm": 682154.8125,
356
+ "learning_rate": 0.00016840590140338252,
357
+ "loss": 1.0962,
358
  "step": 440
359
  },
360
  {
361
  "epoch": 1.1335012594458438,
362
+ "grad_norm": 861960.5,
363
+ "learning_rate": 0.00016768621806405182,
364
+ "loss": 0.8124,
365
  "step": 450
366
  },
367
  {
368
  "epoch": 1.1586901763224182,
369
+ "grad_norm": 583161.8125,
370
+ "learning_rate": 0.00016696653472472113,
371
+ "loss": 1.1014,
372
  "step": 460
373
  },
374
  {
375
  "epoch": 1.1838790931989924,
376
+ "grad_norm": 390205.40625,
377
+ "learning_rate": 0.00016624685138539046,
378
+ "loss": 0.8475,
379
  "step": 470
380
  },
381
  {
382
  "epoch": 1.2090680100755669,
383
+ "grad_norm": 293254.3125,
384
+ "learning_rate": 0.00016552716804605974,
385
+ "loss": 1.0482,
386
  "step": 480
387
  },
388
  {
389
  "epoch": 1.234256926952141,
390
+ "grad_norm": 571449.25,
391
+ "learning_rate": 0.00016480748470672904,
392
+ "loss": 0.8776,
393
  "step": 490
394
  },
395
  {
396
  "epoch": 1.2594458438287153,
397
+ "grad_norm": 362773.25,
398
+ "learning_rate": 0.00016408780136739835,
399
+ "loss": 0.8909,
400
  "step": 500
401
  },
402
  {
403
  "epoch": 1.2594458438287153,
404
+ "eval_accuracy": 0.421875,
405
+ "eval_f1_macro": 0.3388453334601594,
406
+ "eval_loss": 2.500704765319824,
407
+ "eval_runtime": 11.4557,
408
+ "eval_samples_per_second": 89.388,
409
+ "eval_steps_per_second": 5.587,
410
  "step": 500
411
  },
412
  {
413
  "epoch": 1.2846347607052897,
414
+ "grad_norm": 351467.09375,
415
+ "learning_rate": 0.00016336811802806765,
416
+ "loss": 0.8244,
417
  "step": 510
418
  },
419
  {
420
  "epoch": 1.309823677581864,
421
+ "grad_norm": 894287.0625,
422
+ "learning_rate": 0.00016264843468873696,
423
+ "loss": 1.048,
424
  "step": 520
425
  },
426
  {
427
  "epoch": 1.3350125944584383,
428
+ "grad_norm": 474119.75,
429
+ "learning_rate": 0.00016192875134940626,
430
+ "loss": 0.9196,
431
  "step": 530
432
  },
433
  {
434
  "epoch": 1.3602015113350125,
435
+ "grad_norm": 1069011.125,
436
+ "learning_rate": 0.0001612090680100756,
437
+ "loss": 0.804,
438
  "step": 540
439
  },
440
  {
441
  "epoch": 1.385390428211587,
442
+ "grad_norm": 587531.6875,
443
+ "learning_rate": 0.00016048938467074487,
444
+ "loss": 0.6696,
445
  "step": 550
446
  },
447
  {
448
  "epoch": 1.4105793450881612,
449
+ "grad_norm": 879147.25,
450
+ "learning_rate": 0.00015976970133141418,
451
+ "loss": 0.7831,
452
  "step": 560
453
  },
454
  {
455
  "epoch": 1.4357682619647356,
456
+ "grad_norm": 219215.859375,
457
+ "learning_rate": 0.0001590500179920835,
458
+ "loss": 0.5681,
459
  "step": 570
460
  },
461
  {
462
  "epoch": 1.4609571788413098,
463
+ "grad_norm": 447798.375,
464
+ "learning_rate": 0.00015833033465275279,
465
+ "loss": 0.8663,
466
  "step": 580
467
  },
468
  {
469
  "epoch": 1.486146095717884,
470
+ "grad_norm": 708370.625,
471
+ "learning_rate": 0.0001576106513134221,
472
+ "loss": 0.7913,
473
  "step": 590
474
  },
475
  {
476
  "epoch": 1.5113350125944585,
477
+ "grad_norm": 822008.0625,
478
+ "learning_rate": 0.0001568909679740914,
479
+ "loss": 0.8008,
480
  "step": 600
481
  },
482
  {
483
  "epoch": 1.5113350125944585,
484
+ "eval_accuracy": 0.404296875,
485
+ "eval_f1_macro": 0.3618779716937303,
486
+ "eval_loss": 2.7039053440093994,
487
+ "eval_runtime": 11.8087,
488
+ "eval_samples_per_second": 86.716,
489
+ "eval_steps_per_second": 5.42,
490
  "step": 600
491
  },
492
  {
493
  "epoch": 1.536523929471033,
494
+ "grad_norm": 467169.0625,
495
+ "learning_rate": 0.00015617128463476073,
496
+ "loss": 0.7902,
497
  "step": 610
498
  },
499
  {
500
  "epoch": 1.561712846347607,
501
+ "grad_norm": 385000.3125,
502
+ "learning_rate": 0.00015545160129543,
503
+ "loss": 0.7146,
504
  "step": 620
505
  },
506
  {
507
  "epoch": 1.5869017632241813,
508
+ "grad_norm": 366275.21875,
509
+ "learning_rate": 0.0001547319179560993,
510
+ "loss": 0.6558,
511
  "step": 630
512
  },
513
  {
514
  "epoch": 1.6120906801007555,
515
+ "grad_norm": 432902.1875,
516
+ "learning_rate": 0.00015401223461676864,
517
+ "loss": 0.7669,
518
  "step": 640
519
  },
520
  {
521
  "epoch": 1.63727959697733,
522
+ "grad_norm": 426131.71875,
523
+ "learning_rate": 0.00015329255127743795,
524
+ "loss": 0.7121,
525
  "step": 650
526
  },
527
  {
528
  "epoch": 1.6624685138539044,
529
+ "grad_norm": 426302.0,
530
+ "learning_rate": 0.00015257286793810722,
531
+ "loss": 0.9564,
532
  "step": 660
533
  },
534
  {
535
  "epoch": 1.6876574307304786,
536
+ "grad_norm": 405949.34375,
537
+ "learning_rate": 0.00015185318459877656,
538
+ "loss": 0.6072,
539
  "step": 670
540
  },
541
  {
542
  "epoch": 1.7128463476070528,
543
+ "grad_norm": 319128.53125,
544
+ "learning_rate": 0.00015113350125944586,
545
+ "loss": 0.5537,
546
  "step": 680
547
  },
548
  {
549
  "epoch": 1.7380352644836272,
550
+ "grad_norm": 405533.625,
551
+ "learning_rate": 0.00015041381792011514,
552
+ "loss": 0.8143,
553
  "step": 690
554
  },
555
  {
556
  "epoch": 1.7632241813602016,
557
+ "grad_norm": 357302.3125,
558
+ "learning_rate": 0.00014969413458078447,
559
+ "loss": 0.6885,
560
  "step": 700
561
  },
562
  {
563
  "epoch": 1.7632241813602016,
564
+ "eval_accuracy": 0.3701171875,
565
+ "eval_f1_macro": 0.2926079223907959,
566
+ "eval_loss": 3.1089859008789062,
567
+ "eval_runtime": 11.4817,
568
+ "eval_samples_per_second": 89.186,
569
+ "eval_steps_per_second": 5.574,
570
  "step": 700
571
  },
572
  {
573
  "epoch": 1.7884130982367759,
574
+ "grad_norm": 448783.3125,
575
+ "learning_rate": 0.00014897445124145378,
576
+ "loss": 0.8981,
577
  "step": 710
578
  },
579
  {
580
  "epoch": 1.81360201511335,
581
+ "grad_norm": 663808.1875,
582
+ "learning_rate": 0.00014825476790212308,
583
+ "loss": 0.6493,
584
  "step": 720
585
  },
586
  {
587
  "epoch": 1.8387909319899243,
588
+ "grad_norm": 1166550.75,
589
+ "learning_rate": 0.00014753508456279236,
590
+ "loss": 0.6288,
591
  "step": 730
592
  },
593
  {
594
  "epoch": 1.8639798488664987,
595
+ "grad_norm": 774232.6875,
596
+ "learning_rate": 0.0001468154012234617,
597
+ "loss": 0.8717,
598
  "step": 740
599
  },
600
  {
601
  "epoch": 1.8891687657430731,
602
+ "grad_norm": 565763.0,
603
+ "learning_rate": 0.000146095717884131,
604
+ "loss": 0.7199,
605
  "step": 750
606
  },
607
  {
608
  "epoch": 1.9143576826196473,
609
+ "grad_norm": 365545.625,
610
+ "learning_rate": 0.0001453760345448003,
611
+ "loss": 0.721,
612
  "step": 760
613
  },
614
  {
615
  "epoch": 1.9395465994962215,
616
+ "grad_norm": 616748.25,
617
+ "learning_rate": 0.0001446563512054696,
618
+ "loss": 0.6793,
619
  "step": 770
620
  },
621
  {
622
  "epoch": 1.964735516372796,
623
+ "grad_norm": 503789.65625,
624
+ "learning_rate": 0.0001439366678661389,
625
+ "loss": 0.5703,
626
  "step": 780
627
  },
628
  {
629
  "epoch": 1.9899244332493704,
630
+ "grad_norm": 619143.4375,
631
+ "learning_rate": 0.00014321698452680822,
632
+ "loss": 0.679,
633
  "step": 790
634
  },
635
  {
636
  "epoch": 2.0151133501259446,
637
+ "grad_norm": 661451.3125,
638
+ "learning_rate": 0.00014249730118747752,
639
+ "loss": 0.839,
640
  "step": 800
641
  },
642
  {
643
  "epoch": 2.0151133501259446,
644
  "eval_accuracy": 0.484375,
645
+ "eval_f1_macro": 0.414907768165119,
646
+ "eval_loss": 2.584502935409546,
647
+ "eval_runtime": 11.5153,
648
+ "eval_samples_per_second": 88.925,
649
+ "eval_steps_per_second": 5.558,
650
  "step": 800
651
  },
652
  {
653
  "epoch": 2.040302267002519,
654
+ "grad_norm": 316634.78125,
655
+ "learning_rate": 0.00014177761784814683,
656
+ "loss": 0.4317,
657
  "step": 810
658
  },
659
  {
660
  "epoch": 2.065491183879093,
661
+ "grad_norm": 347729.625,
662
+ "learning_rate": 0.00014105793450881613,
663
+ "loss": 0.3792,
664
  "step": 820
665
  },
666
  {
667
  "epoch": 2.0906801007556677,
668
+ "grad_norm": 659445.4375,
669
+ "learning_rate": 0.00014033825116948544,
670
+ "loss": 0.452,
671
  "step": 830
672
  },
673
  {
674
  "epoch": 2.115869017632242,
675
+ "grad_norm": 735795.1875,
676
+ "learning_rate": 0.00013961856783015474,
677
+ "loss": 0.5447,
678
  "step": 840
679
  },
680
  {
681
  "epoch": 2.141057934508816,
682
+ "grad_norm": 107777.5390625,
683
+ "learning_rate": 0.00013889888449082404,
684
+ "loss": 0.5247,
685
  "step": 850
686
  },
687
  {
688
  "epoch": 2.1662468513853903,
689
+ "grad_norm": 577874.5,
690
+ "learning_rate": 0.00013817920115149335,
691
+ "loss": 0.5108,
692
  "step": 860
693
  },
694
  {
695
  "epoch": 2.1914357682619645,
696
+ "grad_norm": 441668.625,
697
+ "learning_rate": 0.00013745951781216265,
698
+ "loss": 0.4324,
699
  "step": 870
700
  },
701
  {
702
  "epoch": 2.216624685138539,
703
+ "grad_norm": 489653.59375,
704
+ "learning_rate": 0.00013673983447283196,
705
+ "loss": 0.4427,
706
  "step": 880
707
  },
708
  {
709
  "epoch": 2.2418136020151134,
710
+ "grad_norm": 296346.84375,
711
+ "learning_rate": 0.00013602015113350126,
712
+ "loss": 0.5105,
713
  "step": 890
714
  },
715
  {
716
  "epoch": 2.2670025188916876,
717
+ "grad_norm": 187145.796875,
718
+ "learning_rate": 0.00013530046779417057,
719
+ "loss": 0.325,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 2.2670025188916876,
724
+ "eval_accuracy": 0.5068359375,
725
+ "eval_f1_macro": 0.41278416064519763,
726
+ "eval_loss": 2.5142855644226074,
727
+ "eval_runtime": 12.173,
728
+ "eval_samples_per_second": 84.12,
729
+ "eval_steps_per_second": 5.258,
730
  "step": 900
731
  },
732
  {
733
  "epoch": 2.292191435768262,
734
+ "grad_norm": 406714.6875,
735
+ "learning_rate": 0.00013458078445483987,
736
+ "loss": 0.4576,
737
  "step": 910
738
  },
739
  {
740
  "epoch": 2.3173803526448364,
741
+ "grad_norm": 379892.40625,
742
+ "learning_rate": 0.00013386110111550918,
743
+ "loss": 0.4179,
744
  "step": 920
745
  },
746
  {
747
  "epoch": 2.3425692695214106,
748
+ "grad_norm": 540661.9375,
749
+ "learning_rate": 0.00013314141777617848,
750
+ "loss": 0.3386,
751
  "step": 930
752
  },
753
  {
754
  "epoch": 2.367758186397985,
755
+ "grad_norm": 850949.25,
756
+ "learning_rate": 0.0001324217344368478,
757
+ "loss": 0.5695,
758
  "step": 940
759
  },
760
  {
761
  "epoch": 2.392947103274559,
762
+ "grad_norm": 363627.53125,
763
+ "learning_rate": 0.0001317020510975171,
764
+ "loss": 0.3362,
765
  "step": 950
766
  },
767
  {
768
  "epoch": 2.4181360201511337,
769
+ "grad_norm": 644468.0625,
770
+ "learning_rate": 0.0001309823677581864,
771
+ "loss": 0.444,
772
  "step": 960
773
  },
774
  {
775
  "epoch": 2.443324937027708,
776
+ "grad_norm": 164236.78125,
777
+ "learning_rate": 0.00013026268441885573,
778
+ "loss": 0.285,
779
  "step": 970
780
  },
781
  {
782
  "epoch": 2.468513853904282,
783
+ "grad_norm": 688494.375,
784
+ "learning_rate": 0.000129543001079525,
785
+ "loss": 0.4024,
786
  "step": 980
787
  },
788
  {
789
  "epoch": 2.4937027707808563,
790
+ "grad_norm": 486211.0625,
791
+ "learning_rate": 0.0001288233177401943,
792
+ "loss": 0.4544,
793
  "step": 990
794
  },
795
  {
796
  "epoch": 2.5188916876574305,
797
+ "grad_norm": 273390.03125,
798
+ "learning_rate": 0.00012810363440086365,
799
+ "loss": 0.4501,
800
  "step": 1000
801
  },
802
  {
803
  "epoch": 2.5188916876574305,
804
+ "eval_accuracy": 0.4482421875,
805
+ "eval_f1_macro": 0.40562802574511003,
806
+ "eval_loss": 2.7683629989624023,
807
+ "eval_runtime": 11.8873,
808
+ "eval_samples_per_second": 86.142,
809
+ "eval_steps_per_second": 5.384,
810
  "step": 1000
811
  },
812
  {
813
  "epoch": 2.544080604534005,
814
+ "grad_norm": 730712.125,
815
+ "learning_rate": 0.00012738395106153292,
816
+ "loss": 0.5234,
817
  "step": 1010
818
  },
819
  {
820
  "epoch": 2.5692695214105794,
821
+ "grad_norm": 369909.34375,
822
+ "learning_rate": 0.00012666426772220223,
823
+ "loss": 0.4051,
824
  "step": 1020
825
  },
826
  {
827
  "epoch": 2.5944584382871536,
828
+ "grad_norm": 507635.78125,
829
+ "learning_rate": 0.00012594458438287153,
830
+ "loss": 0.2967,
831
  "step": 1030
832
  },
833
  {
834
  "epoch": 2.619647355163728,
835
+ "grad_norm": 688013.125,
836
+ "learning_rate": 0.00012522490104354087,
837
+ "loss": 0.5321,
838
  "step": 1040
839
  },
840
  {
841
  "epoch": 2.644836272040302,
842
+ "grad_norm": 505216.40625,
843
+ "learning_rate": 0.00012450521770421014,
844
+ "loss": 0.4807,
845
  "step": 1050
846
  },
847
  {
848
  "epoch": 2.6700251889168767,
849
+ "grad_norm": 252679.53125,
850
+ "learning_rate": 0.00012378553436487945,
851
+ "loss": 0.4088,
852
  "step": 1060
853
  },
854
  {
855
  "epoch": 2.695214105793451,
856
+ "grad_norm": 235546.9375,
857
+ "learning_rate": 0.00012306585102554878,
858
+ "loss": 0.393,
859
  "step": 1070
860
  },
861
  {
862
  "epoch": 2.720403022670025,
863
+ "grad_norm": 398018.84375,
864
+ "learning_rate": 0.00012234616768621808,
865
+ "loss": 0.3694,
866
  "step": 1080
867
  },
868
  {
869
  "epoch": 2.7455919395465997,
870
+ "grad_norm": 374467.3125,
871
+ "learning_rate": 0.00012162648434688738,
872
+ "loss": 0.3599,
873
  "step": 1090
874
  },
875
  {
876
  "epoch": 2.770780856423174,
877
+ "grad_norm": 533788.9375,
878
+ "learning_rate": 0.0001209068010075567,
879
+ "loss": 0.3191,
880
  "step": 1100
881
  },
882
  {
883
  "epoch": 2.770780856423174,
884
+ "eval_accuracy": 0.5146484375,
885
+ "eval_f1_macro": 0.432831730682114,
886
+ "eval_loss": 2.467733144760132,
887
+ "eval_runtime": 11.9402,
888
+ "eval_samples_per_second": 85.761,
889
+ "eval_steps_per_second": 5.36,
890
  "step": 1100
891
  },
892
  {
893
  "epoch": 2.795969773299748,
894
+ "grad_norm": 27070.849609375,
895
+ "learning_rate": 0.00012018711766822599,
896
+ "loss": 0.2222,
897
  "step": 1110
898
  },
899
  {
900
  "epoch": 2.8211586901763224,
901
+ "grad_norm": 742011.0625,
902
+ "learning_rate": 0.00011946743432889529,
903
+ "loss": 0.3194,
904
  "step": 1120
905
  },
906
  {
907
  "epoch": 2.8463476070528966,
908
+ "grad_norm": 508084.53125,
909
+ "learning_rate": 0.00011874775098956458,
910
+ "loss": 0.5068,
911
  "step": 1130
912
  },
913
  {
914
  "epoch": 2.8715365239294712,
915
+ "grad_norm": 472090.21875,
916
+ "learning_rate": 0.00011802806765023391,
917
+ "loss": 0.3457,
918
  "step": 1140
919
  },
920
  {
921
  "epoch": 2.8967254408060454,
922
+ "grad_norm": 345946.625,
923
+ "learning_rate": 0.0001173083843109032,
924
+ "loss": 0.4024,
925
  "step": 1150
926
  },
927
  {
928
  "epoch": 2.9219143576826196,
929
+ "grad_norm": 116557.78125,
930
+ "learning_rate": 0.00011658870097157251,
931
+ "loss": 0.3248,
932
  "step": 1160
933
  },
934
  {
935
  "epoch": 2.947103274559194,
936
+ "grad_norm": 544007.125,
937
+ "learning_rate": 0.00011586901763224183,
938
+ "loss": 0.2957,
939
  "step": 1170
940
  },
941
  {
942
  "epoch": 2.972292191435768,
943
+ "grad_norm": 558989.0,
944
+ "learning_rate": 0.00011514933429291112,
945
+ "loss": 0.3386,
946
  "step": 1180
947
  },
948
  {
949
  "epoch": 2.9974811083123427,
950
+ "grad_norm": 509623.65625,
951
+ "learning_rate": 0.00011442965095358043,
952
+ "loss": 0.4525,
953
  "step": 1190
954
  },
955
  {
956
  "epoch": 3.022670025188917,
957
+ "grad_norm": 374462.28125,
958
+ "learning_rate": 0.00011370996761424974,
959
+ "loss": 0.1664,
960
  "step": 1200
961
  },
962
  {
963
  "epoch": 3.022670025188917,
964
+ "eval_accuracy": 0.5361328125,
965
+ "eval_f1_macro": 0.4597358092759388,
966
+ "eval_loss": 2.477670192718506,
967
+ "eval_runtime": 11.8885,
968
+ "eval_samples_per_second": 86.134,
969
+ "eval_steps_per_second": 5.383,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 3.047858942065491,
974
+ "grad_norm": 417895.8125,
975
+ "learning_rate": 0.00011299028427491905,
976
+ "loss": 0.2631,
977
  "step": 1210
978
  },
979
  {
980
  "epoch": 3.0730478589420653,
981
+ "grad_norm": 307081.21875,
982
+ "learning_rate": 0.00011227060093558834,
983
+ "loss": 0.1544,
984
  "step": 1220
985
  },
986
  {
987
  "epoch": 3.09823677581864,
988
+ "grad_norm": 1034528.125,
989
+ "learning_rate": 0.00011155091759625764,
990
+ "loss": 0.2107,
991
  "step": 1230
992
  },
993
  {
994
  "epoch": 3.123425692695214,
995
+ "grad_norm": 659800.0625,
996
+ "learning_rate": 0.00011083123425692696,
997
+ "loss": 0.3405,
998
  "step": 1240
999
  },
1000
  {
1001
  "epoch": 3.1486146095717884,
1002
+ "grad_norm": 129759.8671875,
1003
+ "learning_rate": 0.00011011155091759627,
1004
+ "loss": 0.2225,
1005
  "step": 1250
1006
  },
1007
  {
1008
  "epoch": 3.1738035264483626,
1009
+ "grad_norm": 560157.25,
1010
+ "learning_rate": 0.00010939186757826556,
1011
+ "loss": 0.1545,
1012
  "step": 1260
1013
  },
1014
  {
1015
  "epoch": 3.1989924433249373,
1016
+ "grad_norm": 253132.671875,
1017
+ "learning_rate": 0.00010867218423893488,
1018
+ "loss": 0.3073,
1019
  "step": 1270
1020
  },
1021
  {
1022
  "epoch": 3.2241813602015115,
1023
+ "grad_norm": 407181.90625,
1024
+ "learning_rate": 0.00010795250089960418,
1025
+ "loss": 0.3247,
1026
  "step": 1280
1027
  },
1028
  {
1029
  "epoch": 3.2493702770780857,
1030
+ "grad_norm": 433259.09375,
1031
+ "learning_rate": 0.00010723281756027347,
1032
+ "loss": 0.127,
1033
  "step": 1290
1034
  },
1035
  {
1036
  "epoch": 3.27455919395466,
1037
+ "grad_norm": 187479.984375,
1038
+ "learning_rate": 0.0001065131342209428,
1039
+ "loss": 0.1469,
1040
  "step": 1300
1041
  },
1042
  {
1043
  "epoch": 3.27455919395466,
1044
+ "eval_accuracy": 0.5205078125,
1045
+ "eval_f1_macro": 0.4495261806770087,
1046
+ "eval_loss": 2.6402528285980225,
1047
+ "eval_runtime": 11.5773,
1048
+ "eval_samples_per_second": 88.449,
1049
+ "eval_steps_per_second": 5.528,
1050
  "step": 1300
1051
  },
1052
  {
1053
  "epoch": 3.299748110831234,
1054
+ "grad_norm": 736520.0625,
1055
+ "learning_rate": 0.0001057934508816121,
1056
+ "loss": 0.2567,
1057
  "step": 1310
1058
  },
1059
  {
1060
  "epoch": 3.3249370277078087,
1061
+ "grad_norm": 376107.03125,
1062
+ "learning_rate": 0.0001050737675422814,
1063
+ "loss": 0.1442,
1064
  "step": 1320
1065
  },
1066
  {
1067
  "epoch": 3.350125944584383,
1068
+ "grad_norm": 723020.3125,
1069
+ "learning_rate": 0.0001043540842029507,
1070
+ "loss": 0.1422,
1071
  "step": 1330
1072
  },
1073
  {
1074
  "epoch": 3.375314861460957,
1075
+ "grad_norm": 60796.48046875,
1076
+ "learning_rate": 0.00010363440086362001,
1077
+ "loss": 0.2077,
1078
  "step": 1340
1079
  },
1080
  {
1081
  "epoch": 3.4005037783375314,
1082
+ "grad_norm": 624072.0,
1083
+ "learning_rate": 0.00010291471752428932,
1084
+ "loss": 0.1422,
1085
  "step": 1350
1086
  },
1087
  {
1088
  "epoch": 3.4256926952141056,
1089
+ "grad_norm": 191143.5625,
1090
+ "learning_rate": 0.00010219503418495862,
1091
+ "loss": 0.2425,
1092
  "step": 1360
1093
  },
1094
  {
1095
  "epoch": 3.4508816120906802,
1096
+ "grad_norm": 291693.75,
1097
+ "learning_rate": 0.00010147535084562794,
1098
+ "loss": 0.2693,
1099
  "step": 1370
1100
  },
1101
  {
1102
  "epoch": 3.4760705289672544,
1103
+ "grad_norm": 516212.4375,
1104
+ "learning_rate": 0.00010075566750629723,
1105
+ "loss": 0.2397,
1106
  "step": 1380
1107
  },
1108
  {
1109
  "epoch": 3.5012594458438286,
1110
+ "grad_norm": 273365.9375,
1111
+ "learning_rate": 0.00010003598416696654,
1112
+ "loss": 0.1774,
1113
  "step": 1390
1114
  },
1115
  {
1116
  "epoch": 3.5264483627204033,
1117
+ "grad_norm": 518260.21875,
1118
+ "learning_rate": 9.931630082763584e-05,
1119
+ "loss": 0.3063,
1120
  "step": 1400
1121
  },
1122
  {
1123
  "epoch": 3.5264483627204033,
1124
+ "eval_accuracy": 0.5009765625,
1125
+ "eval_f1_macro": 0.441499104615912,
1126
+ "eval_loss": 2.7999606132507324,
1127
+ "eval_runtime": 11.7898,
1128
+ "eval_samples_per_second": 86.855,
1129
+ "eval_steps_per_second": 5.428,
1130
  "step": 1400
1131
  },
1132
  {
1133
  "epoch": 3.551637279596977,
1134
+ "grad_norm": 552209.5,
1135
+ "learning_rate": 9.859661748830516e-05,
1136
+ "loss": 0.2254,
1137
  "step": 1410
1138
  },
1139
  {
1140
  "epoch": 3.5768261964735517,
1141
+ "grad_norm": 7195.90625,
1142
+ "learning_rate": 9.787693414897445e-05,
1143
+ "loss": 0.1378,
1144
  "step": 1420
1145
  },
1146
  {
1147
  "epoch": 3.602015113350126,
1148
+ "grad_norm": 879434.0625,
1149
+ "learning_rate": 9.715725080964376e-05,
1150
+ "loss": 0.2466,
1151
  "step": 1430
1152
  },
1153
  {
1154
  "epoch": 3.6272040302267,
1155
+ "grad_norm": 116491.9609375,
1156
+ "learning_rate": 9.643756747031306e-05,
1157
+ "loss": 0.2094,
1158
  "step": 1440
1159
  },
1160
  {
1161
  "epoch": 3.652392947103275,
1162
+ "grad_norm": 524990.0625,
1163
+ "learning_rate": 9.571788413098237e-05,
1164
+ "loss": 0.2522,
1165
  "step": 1450
1166
  },
1167
  {
1168
  "epoch": 3.677581863979849,
1169
+ "grad_norm": 939990.8125,
1170
+ "learning_rate": 9.499820079165168e-05,
1171
+ "loss": 0.2382,
1172
  "step": 1460
1173
  },
1174
  {
1175
  "epoch": 3.702770780856423,
1176
+ "grad_norm": 337334.625,
1177
+ "learning_rate": 9.427851745232098e-05,
1178
+ "loss": 0.299,
1179
  "step": 1470
1180
  },
1181
  {
1182
  "epoch": 3.7279596977329974,
1183
+ "grad_norm": 322309.28125,
1184
+ "learning_rate": 9.35588341129903e-05,
1185
+ "loss": 0.2307,
1186
  "step": 1480
1187
  },
1188
  {
1189
  "epoch": 3.7531486146095716,
1190
+ "grad_norm": 80700.328125,
1191
+ "learning_rate": 9.283915077365959e-05,
1192
+ "loss": 0.1816,
1193
  "step": 1490
1194
  },
1195
  {
1196
  "epoch": 3.7783375314861463,
1197
+ "grad_norm": 667783.125,
1198
+ "learning_rate": 9.21194674343289e-05,
1199
+ "loss": 0.1786,
1200
  "step": 1500
1201
  },
1202
  {
1203
  "epoch": 3.7783375314861463,
1204
+ "eval_accuracy": 0.533203125,
1205
+ "eval_f1_macro": 0.45250014527358284,
1206
+ "eval_loss": 2.8164846897125244,
1207
+ "eval_runtime": 11.728,
1208
+ "eval_samples_per_second": 87.312,
1209
+ "eval_steps_per_second": 5.457,
1210
  "step": 1500
1211
  },
1212
  {
1213
  "epoch": 3.8035264483627205,
1214
+ "grad_norm": 340034.53125,
1215
+ "learning_rate": 9.139978409499821e-05,
1216
+ "loss": 0.1934,
1217
  "step": 1510
1218
  },
1219
  {
1220
  "epoch": 3.8287153652392947,
1221
+ "grad_norm": 569343.8125,
1222
+ "learning_rate": 9.068010075566751e-05,
1223
+ "loss": 0.3571,
1224
  "step": 1520
1225
  },
1226
  {
1227
  "epoch": 3.853904282115869,
1228
+ "grad_norm": 73828.2578125,
1229
+ "learning_rate": 8.996041741633682e-05,
1230
+ "loss": 0.2595,
1231
  "step": 1530
1232
  },
1233
  {
1234
  "epoch": 3.879093198992443,
1235
+ "grad_norm": 610885.5625,
1236
+ "learning_rate": 8.924073407700611e-05,
1237
+ "loss": 0.2454,
1238
  "step": 1540
1239
  },
1240
  {
1241
  "epoch": 3.9042821158690177,
1242
+ "grad_norm": 549705.5,
1243
+ "learning_rate": 8.852105073767543e-05,
1244
+ "loss": 0.2246,
1245
  "step": 1550
1246
  },
1247
  {
1248
  "epoch": 3.929471032745592,
1249
+ "grad_norm": 946495.1875,
1250
+ "learning_rate": 8.780136739834473e-05,
1251
+ "loss": 0.3156,
1252
  "step": 1560
1253
  },
1254
  {
1255
  "epoch": 3.954659949622166,
1256
+ "grad_norm": 89126.0546875,
1257
+ "learning_rate": 8.708168405901404e-05,
1258
+ "loss": 0.127,
1259
  "step": 1570
1260
  },
1261
  {
1262
  "epoch": 3.979848866498741,
1263
+ "grad_norm": 364322.40625,
1264
+ "learning_rate": 8.636200071968334e-05,
1265
+ "loss": 0.2114,
1266
  "step": 1580
1267
  },
1268
  {
1269
  "epoch": 4.005037783375315,
1270
+ "grad_norm": 646444.625,
1271
+ "learning_rate": 8.564231738035265e-05,
1272
+ "loss": 0.1633,
1273
  "step": 1590
1274
  },
1275
  {
1276
  "epoch": 4.030226700251889,
1277
+ "grad_norm": 281335.90625,
1278
+ "learning_rate": 8.492263404102195e-05,
1279
+ "loss": 0.0687,
1280
  "step": 1600
1281
  },
1282
  {
1283
  "epoch": 4.030226700251889,
1284
+ "eval_accuracy": 0.568359375,
1285
+ "eval_f1_macro": 0.49420184108051124,
1286
+ "eval_loss": 2.9026849269866943,
1287
+ "eval_runtime": 11.365,
1288
+ "eval_samples_per_second": 90.101,
1289
+ "eval_steps_per_second": 5.631,
1290
  "step": 1600
1291
  },
1292
  {
1293
  "epoch": 4.055415617128464,
1294
+ "grad_norm": 12626.9794921875,
1295
+ "learning_rate": 8.420295070169126e-05,
1296
+ "loss": 0.0641,
1297
  "step": 1610
1298
  },
1299
  {
1300
  "epoch": 4.080604534005038,
1301
+ "grad_norm": 55986.578125,
1302
+ "learning_rate": 8.348326736236056e-05,
1303
+ "loss": 0.0647,
1304
  "step": 1620
1305
  },
1306
  {
1307
  "epoch": 4.105793450881612,
1308
+ "grad_norm": 509303.3125,
1309
+ "learning_rate": 8.276358402302987e-05,
1310
+ "loss": 0.0746,
1311
  "step": 1630
1312
  },
1313
  {
1314
  "epoch": 4.130982367758186,
1315
+ "grad_norm": 21481.740234375,
1316
+ "learning_rate": 8.204390068369917e-05,
1317
+ "loss": 0.0746,
1318
  "step": 1640
1319
  },
1320
  {
1321
  "epoch": 4.156171284634761,
1322
+ "grad_norm": 11360.7412109375,
1323
+ "learning_rate": 8.132421734436848e-05,
1324
+ "loss": 0.0555,
1325
  "step": 1650
1326
  },
1327
  {
1328
  "epoch": 4.181360201511335,
1329
+ "grad_norm": 396739.0,
1330
+ "learning_rate": 8.06045340050378e-05,
1331
+ "loss": 0.0595,
1332
  "step": 1660
1333
  },
1334
  {
1335
  "epoch": 4.206549118387909,
1336
+ "grad_norm": 8099.65478515625,
1337
+ "learning_rate": 7.988485066570709e-05,
1338
+ "loss": 0.1391,
1339
  "step": 1670
1340
  },
1341
  {
1342
  "epoch": 4.231738035264484,
1343
+ "grad_norm": 208596.65625,
1344
+ "learning_rate": 7.916516732637639e-05,
1345
+ "loss": 0.1038,
1346
  "step": 1680
1347
  },
1348
  {
1349
  "epoch": 4.2569269521410575,
1350
+ "grad_norm": 359488.34375,
1351
+ "learning_rate": 7.84454839870457e-05,
1352
+ "loss": 0.025,
1353
  "step": 1690
1354
  },
1355
  {
1356
  "epoch": 4.282115869017632,
1357
+ "grad_norm": 247004.875,
1358
+ "learning_rate": 7.7725800647715e-05,
1359
+ "loss": 0.0427,
1360
  "step": 1700
1361
  },
1362
  {
1363
  "epoch": 4.282115869017632,
1364
+ "eval_accuracy": 0.4912109375,
1365
+ "eval_f1_macro": 0.4362345681944286,
1366
+ "eval_loss": 3.321627616882324,
1367
+ "eval_runtime": 11.8095,
1368
+ "eval_samples_per_second": 86.71,
1369
+ "eval_steps_per_second": 5.419,
1370
  "step": 1700
1371
  },
1372
  {
1373
  "epoch": 4.307304785894207,
1374
+ "grad_norm": 22134.611328125,
1375
+ "learning_rate": 7.700611730838432e-05,
1376
+ "loss": 0.1015,
1377
  "step": 1710
1378
  },
1379
  {
1380
  "epoch": 4.332493702770781,
1381
+ "grad_norm": 163005.84375,
1382
+ "learning_rate": 7.628643396905361e-05,
1383
+ "loss": 0.0524,
1384
  "step": 1720
1385
  },
1386
  {
1387
  "epoch": 4.357682619647355,
1388
+ "grad_norm": 25011.78515625,
1389
+ "learning_rate": 7.556675062972293e-05,
1390
+ "loss": 0.1,
1391
  "step": 1730
1392
  },
1393
  {
1394
  "epoch": 4.382871536523929,
1395
+ "grad_norm": 606683.75,
1396
+ "learning_rate": 7.484706729039224e-05,
1397
+ "loss": 0.1503,
1398
  "step": 1740
1399
  },
1400
  {
1401
  "epoch": 4.408060453400504,
1402
+ "grad_norm": 411167.28125,
1403
+ "learning_rate": 7.412738395106154e-05,
1404
+ "loss": 0.1075,
1405
  "step": 1750
1406
  },
1407
  {
1408
  "epoch": 4.433249370277078,
1409
+ "grad_norm": 20846.5390625,
1410
+ "learning_rate": 7.340770061173085e-05,
1411
+ "loss": 0.0793,
1412
  "step": 1760
1413
  },
1414
  {
1415
  "epoch": 4.458438287153652,
1416
+ "grad_norm": 35871.06640625,
1417
+ "learning_rate": 7.268801727240015e-05,
1418
+ "loss": 0.0607,
1419
  "step": 1770
1420
  },
1421
  {
1422
  "epoch": 4.483627204030227,
1423
+ "grad_norm": 58382.01171875,
1424
+ "learning_rate": 7.196833393306946e-05,
1425
+ "loss": 0.082,
1426
  "step": 1780
1427
  },
1428
  {
1429
+ "epoch": 4.508816120906801,
1430
+ "grad_norm": 13096.5205078125,
1431
+ "learning_rate": 7.124865059373876e-05,
1432
+ "loss": 0.0754,
1433
+ "step": 1790
1434
+ },
1435
+ {
1436
+ "epoch": 4.534005037783375,
1437
+ "grad_norm": 21618.96875,
1438
+ "learning_rate": 7.052896725440807e-05,
1439
+ "loss": 0.1825,
1440
+ "step": 1800
1441
+ },
1442
+ {
1443
+ "epoch": 4.534005037783375,
1444
+ "eval_accuracy": 0.53125,
1445
+ "eval_f1_macro": 0.4663662196170286,
1446
+ "eval_loss": 3.1456074714660645,
1447
+ "eval_runtime": 11.9035,
1448
+ "eval_samples_per_second": 86.025,
1449
+ "eval_steps_per_second": 5.377,
1450
+ "step": 1800
1451
+ },
1452
+ {
1453
+ "epoch": 4.55919395465995,
1454
+ "grad_norm": 97136.0546875,
1455
+ "learning_rate": 6.980928391507737e-05,
1456
+ "loss": 0.0821,
1457
+ "step": 1810
1458
+ },
1459
+ {
1460
+ "epoch": 4.584382871536524,
1461
+ "grad_norm": 14197.41796875,
1462
+ "learning_rate": 6.908960057574667e-05,
1463
+ "loss": 0.1245,
1464
+ "step": 1820
1465
+ },
1466
+ {
1467
+ "epoch": 4.609571788413098,
1468
+ "grad_norm": 792662.5,
1469
+ "learning_rate": 6.836991723641598e-05,
1470
+ "loss": 0.0678,
1471
+ "step": 1830
1472
+ },
1473
+ {
1474
+ "epoch": 4.634760705289673,
1475
+ "grad_norm": 70302.1484375,
1476
+ "learning_rate": 6.765023389708528e-05,
1477
+ "loss": 0.0369,
1478
+ "step": 1840
1479
+ },
1480
+ {
1481
+ "epoch": 4.659949622166247,
1482
+ "grad_norm": 315541.75,
1483
+ "learning_rate": 6.693055055775459e-05,
1484
+ "loss": 0.086,
1485
+ "step": 1850
1486
+ },
1487
+ {
1488
+ "epoch": 4.685138539042821,
1489
+ "grad_norm": 426322.5625,
1490
+ "learning_rate": 6.62108672184239e-05,
1491
+ "loss": 0.1367,
1492
+ "step": 1860
1493
+ },
1494
+ {
1495
+ "epoch": 4.710327455919396,
1496
+ "grad_norm": 5201.7265625,
1497
+ "learning_rate": 6.54911838790932e-05,
1498
+ "loss": 0.0518,
1499
+ "step": 1870
1500
+ },
1501
+ {
1502
+ "epoch": 4.73551637279597,
1503
+ "grad_norm": 16552.916015625,
1504
+ "learning_rate": 6.47715005397625e-05,
1505
+ "loss": 0.0857,
1506
+ "step": 1880
1507
+ },
1508
+ {
1509
+ "epoch": 4.760705289672544,
1510
+ "grad_norm": 74746.0234375,
1511
+ "learning_rate": 6.405181720043182e-05,
1512
+ "loss": 0.178,
1513
+ "step": 1890
1514
+ },
1515
+ {
1516
+ "epoch": 4.785894206549118,
1517
+ "grad_norm": 30216.1328125,
1518
+ "learning_rate": 6.333213386110111e-05,
1519
+ "loss": 0.0758,
1520
+ "step": 1900
1521
+ },
1522
+ {
1523
+ "epoch": 4.785894206549118,
1524
+ "eval_accuracy": 0.5546875,
1525
+ "eval_f1_macro": 0.4578418057053371,
1526
+ "eval_loss": 3.2782468795776367,
1527
+ "eval_runtime": 11.4304,
1528
+ "eval_samples_per_second": 89.586,
1529
+ "eval_steps_per_second": 5.599,
1530
+ "step": 1900
1531
+ },
1532
+ {
1533
+ "epoch": 4.811083123425693,
1534
+ "grad_norm": 122862.0703125,
1535
+ "learning_rate": 6.261245052177043e-05,
1536
+ "loss": 0.1008,
1537
+ "step": 1910
1538
+ },
1539
+ {
1540
+ "epoch": 4.836272040302267,
1541
+ "grad_norm": 355039.96875,
1542
+ "learning_rate": 6.189276718243972e-05,
1543
+ "loss": 0.0619,
1544
+ "step": 1920
1545
+ },
1546
+ {
1547
+ "epoch": 4.861460957178841,
1548
+ "grad_norm": 180546.546875,
1549
+ "learning_rate": 6.117308384310904e-05,
1550
+ "loss": 0.212,
1551
+ "step": 1930
1552
+ },
1553
+ {
1554
+ "epoch": 4.886649874055416,
1555
+ "grad_norm": 1149145.875,
1556
+ "learning_rate": 6.045340050377835e-05,
1557
+ "loss": 0.1821,
1558
+ "step": 1940
1559
+ },
1560
+ {
1561
+ "epoch": 4.91183879093199,
1562
+ "grad_norm": 197082.046875,
1563
+ "learning_rate": 5.9733717164447645e-05,
1564
+ "loss": 0.0539,
1565
+ "step": 1950
1566
+ },
1567
+ {
1568
+ "epoch": 4.937027707808564,
1569
+ "grad_norm": 361.24676513671875,
1570
+ "learning_rate": 5.901403382511696e-05,
1571
+ "loss": 0.1526,
1572
+ "step": 1960
1573
+ },
1574
+ {
1575
+ "epoch": 4.962216624685139,
1576
+ "grad_norm": 362799.34375,
1577
+ "learning_rate": 5.8294350485786255e-05,
1578
+ "loss": 0.0838,
1579
+ "step": 1970
1580
+ },
1581
+ {
1582
+ "epoch": 4.987405541561713,
1583
+ "grad_norm": 4683.50732421875,
1584
+ "learning_rate": 5.757466714645556e-05,
1585
+ "loss": 0.0995,
1586
+ "step": 1980
1587
+ },
1588
+ {
1589
+ "epoch": 5.012594458438287,
1590
+ "grad_norm": 5118.85986328125,
1591
+ "learning_rate": 5.685498380712487e-05,
1592
+ "loss": 0.0488,
1593
+ "step": 1990
1594
+ },
1595
+ {
1596
+ "epoch": 5.037783375314861,
1597
+ "grad_norm": 6831.2880859375,
1598
+ "learning_rate": 5.613530046779417e-05,
1599
+ "loss": 0.0471,
1600
+ "step": 2000
1601
+ },
1602
+ {
1603
+ "epoch": 5.037783375314861,
1604
+ "eval_accuracy": 0.5517578125,
1605
+ "eval_f1_macro": 0.4725336026660133,
1606
+ "eval_loss": 3.334784507751465,
1607
+ "eval_runtime": 11.4554,
1608
+ "eval_samples_per_second": 89.39,
1609
+ "eval_steps_per_second": 5.587,
1610
+ "step": 2000
1611
+ },
1612
+ {
1613
+ "epoch": 5.062972292191436,
1614
+ "grad_norm": 88275.8359375,
1615
+ "learning_rate": 5.541561712846348e-05,
1616
+ "loss": 0.0503,
1617
+ "step": 2010
1618
+ },
1619
+ {
1620
+ "epoch": 5.08816120906801,
1621
+ "grad_norm": 1161589.25,
1622
+ "learning_rate": 5.469593378913278e-05,
1623
+ "loss": 0.0326,
1624
+ "step": 2020
1625
+ },
1626
+ {
1627
+ "epoch": 5.113350125944584,
1628
+ "grad_norm": 263008.125,
1629
+ "learning_rate": 5.397625044980209e-05,
1630
+ "loss": 0.103,
1631
+ "step": 2030
1632
+ },
1633
+ {
1634
+ "epoch": 5.138539042821159,
1635
+ "grad_norm": 2268.762939453125,
1636
+ "learning_rate": 5.32565671104714e-05,
1637
+ "loss": 0.0093,
1638
+ "step": 2040
1639
+ },
1640
+ {
1641
+ "epoch": 5.163727959697733,
1642
+ "grad_norm": 530844.6875,
1643
+ "learning_rate": 5.25368837711407e-05,
1644
+ "loss": 0.0919,
1645
+ "step": 2050
1646
+ },
1647
+ {
1648
+ "epoch": 5.188916876574307,
1649
+ "grad_norm": 181022.703125,
1650
+ "learning_rate": 5.1817200431810006e-05,
1651
+ "loss": 0.049,
1652
+ "step": 2060
1653
+ },
1654
+ {
1655
+ "epoch": 5.214105793450882,
1656
+ "grad_norm": 1360.029541015625,
1657
+ "learning_rate": 5.109751709247931e-05,
1658
+ "loss": 0.0656,
1659
+ "step": 2070
1660
+ },
1661
+ {
1662
+ "epoch": 5.239294710327456,
1663
+ "grad_norm": 541578.0625,
1664
+ "learning_rate": 5.0377833753148616e-05,
1665
+ "loss": 0.0287,
1666
+ "step": 2080
1667
+ },
1668
+ {
1669
+ "epoch": 5.26448362720403,
1670
+ "grad_norm": 8820.35546875,
1671
+ "learning_rate": 4.965815041381792e-05,
1672
+ "loss": 0.0252,
1673
+ "step": 2090
1674
+ },
1675
+ {
1676
+ "epoch": 5.289672544080605,
1677
+ "grad_norm": 50239.13671875,
1678
+ "learning_rate": 4.8938467074487226e-05,
1679
+ "loss": 0.0512,
1680
+ "step": 2100
1681
+ },
1682
+ {
1683
+ "epoch": 5.289672544080605,
1684
+ "eval_accuracy": 0.5283203125,
1685
+ "eval_f1_macro": 0.45143425422893263,
1686
+ "eval_loss": 3.718236207962036,
1687
+ "eval_runtime": 11.3333,
1688
+ "eval_samples_per_second": 90.353,
1689
+ "eval_steps_per_second": 5.647,
1690
+ "step": 2100
1691
+ },
1692
+ {
1693
+ "epoch": 5.314861460957179,
1694
+ "grad_norm": 1060627.125,
1695
+ "learning_rate": 4.821878373515653e-05,
1696
+ "loss": 0.0412,
1697
+ "step": 2110
1698
+ },
1699
+ {
1700
+ "epoch": 5.340050377833753,
1701
+ "grad_norm": 258190.28125,
1702
+ "learning_rate": 4.749910039582584e-05,
1703
+ "loss": 0.1177,
1704
+ "step": 2120
1705
+ },
1706
+ {
1707
+ "epoch": 5.365239294710327,
1708
+ "grad_norm": 126000.2265625,
1709
+ "learning_rate": 4.677941705649515e-05,
1710
+ "loss": 0.0049,
1711
+ "step": 2130
1712
+ },
1713
+ {
1714
+ "epoch": 5.390428211586902,
1715
+ "grad_norm": 2938.684814453125,
1716
+ "learning_rate": 4.605973371716445e-05,
1717
+ "loss": 0.1016,
1718
+ "step": 2140
1719
+ },
1720
+ {
1721
+ "epoch": 5.415617128463476,
1722
+ "grad_norm": 191.32302856445312,
1723
+ "learning_rate": 4.534005037783376e-05,
1724
+ "loss": 0.0051,
1725
+ "step": 2150
1726
+ },
1727
+ {
1728
+ "epoch": 5.44080604534005,
1729
+ "grad_norm": 70.82958984375,
1730
+ "learning_rate": 4.4620367038503055e-05,
1731
+ "loss": 0.0335,
1732
+ "step": 2160
1733
+ },
1734
+ {
1735
+ "epoch": 5.465994962216625,
1736
+ "grad_norm": 2224.07568359375,
1737
+ "learning_rate": 4.390068369917237e-05,
1738
+ "loss": 0.0019,
1739
+ "step": 2170
1740
+ },
1741
+ {
1742
+ "epoch": 5.491183879093199,
1743
+ "grad_norm": 7065.70703125,
1744
+ "learning_rate": 4.318100035984167e-05,
1745
+ "loss": 0.0489,
1746
+ "step": 2180
1747
+ },
1748
+ {
1749
+ "epoch": 5.516372795969773,
1750
+ "grad_norm": 218.8759765625,
1751
+ "learning_rate": 4.246131702051098e-05,
1752
+ "loss": 0.0761,
1753
+ "step": 2190
1754
+ },
1755
+ {
1756
+ "epoch": 5.541561712846348,
1757
+ "grad_norm": 2789.82763671875,
1758
+ "learning_rate": 4.174163368118028e-05,
1759
+ "loss": 0.0095,
1760
+ "step": 2200
1761
+ },
1762
+ {
1763
+ "epoch": 5.541561712846348,
1764
+ "eval_accuracy": 0.5341796875,
1765
+ "eval_f1_macro": 0.478510457034419,
1766
+ "eval_loss": 3.902801036834717,
1767
+ "eval_runtime": 11.7659,
1768
+ "eval_samples_per_second": 87.031,
1769
+ "eval_steps_per_second": 5.439,
1770
+ "step": 2200
1771
+ },
1772
+ {
1773
+ "epoch": 5.566750629722922,
1774
+ "grad_norm": 33521.52734375,
1775
+ "learning_rate": 4.1021950341849587e-05,
1776
+ "loss": 0.0445,
1777
+ "step": 2210
1778
+ },
1779
+ {
1780
+ "epoch": 5.591939546599496,
1781
+ "grad_norm": 339091.78125,
1782
+ "learning_rate": 4.03022670025189e-05,
1783
+ "loss": 0.0717,
1784
+ "step": 2220
1785
+ },
1786
+ {
1787
+ "epoch": 5.617128463476071,
1788
+ "grad_norm": 570797.6875,
1789
+ "learning_rate": 3.9582583663188196e-05,
1790
+ "loss": 0.0664,
1791
+ "step": 2230
1792
+ },
1793
+ {
1794
+ "epoch": 5.642317380352645,
1795
+ "grad_norm": 638298.25,
1796
+ "learning_rate": 3.88629003238575e-05,
1797
+ "loss": 0.0152,
1798
+ "step": 2240
1799
+ },
1800
+ {
1801
+ "epoch": 5.667506297229219,
1802
+ "grad_norm": 1308.215087890625,
1803
+ "learning_rate": 3.8143216984526806e-05,
1804
+ "loss": 0.0029,
1805
+ "step": 2250
1806
+ },
1807
+ {
1808
+ "epoch": 5.692695214105793,
1809
+ "grad_norm": 47.38713836669922,
1810
+ "learning_rate": 3.742353364519612e-05,
1811
+ "loss": 0.0199,
1812
+ "step": 2260
1813
+ },
1814
+ {
1815
+ "epoch": 5.717884130982368,
1816
+ "grad_norm": 56138.89453125,
1817
+ "learning_rate": 3.670385030586542e-05,
1818
+ "loss": 0.0748,
1819
+ "step": 2270
1820
+ },
1821
+ {
1822
+ "epoch": 5.7430730478589425,
1823
+ "grad_norm": 49.50386047363281,
1824
+ "learning_rate": 3.598416696653473e-05,
1825
+ "loss": 0.0174,
1826
+ "step": 2280
1827
+ },
1828
+ {
1829
+ "epoch": 5.768261964735516,
1830
+ "grad_norm": 7627.2783203125,
1831
+ "learning_rate": 3.526448362720403e-05,
1832
+ "loss": 0.002,
1833
+ "step": 2290
1834
+ },
1835
+ {
1836
+ "epoch": 5.793450881612091,
1837
+ "grad_norm": 123.52435302734375,
1838
+ "learning_rate": 3.454480028787334e-05,
1839
+ "loss": 0.0247,
1840
+ "step": 2300
1841
+ },
1842
+ {
1843
+ "epoch": 5.793450881612091,
1844
+ "eval_accuracy": 0.5712890625,
1845
+ "eval_f1_macro": 0.48791698609028533,
1846
+ "eval_loss": 3.960580348968506,
1847
+ "eval_runtime": 11.8204,
1848
+ "eval_samples_per_second": 86.63,
1849
+ "eval_steps_per_second": 5.414,
1850
+ "step": 2300
1851
+ },
1852
+ {
1853
+ "epoch": 5.818639798488665,
1854
+ "grad_norm": 534.7195434570312,
1855
+ "learning_rate": 3.382511694854264e-05,
1856
+ "loss": 0.0496,
1857
+ "step": 2310
1858
+ },
1859
+ {
1860
+ "epoch": 5.843828715365239,
1861
+ "grad_norm": 17011.57421875,
1862
+ "learning_rate": 3.310543360921195e-05,
1863
+ "loss": 0.1476,
1864
+ "step": 2320
1865
+ },
1866
+ {
1867
+ "epoch": 5.869017632241814,
1868
+ "grad_norm": 386354.28125,
1869
+ "learning_rate": 3.238575026988125e-05,
1870
+ "loss": 0.0593,
1871
+ "step": 2330
1872
+ },
1873
+ {
1874
+ "epoch": 5.894206549118388,
1875
+ "grad_norm": 102287.3828125,
1876
+ "learning_rate": 3.166606693055056e-05,
1877
+ "loss": 0.0117,
1878
+ "step": 2340
1879
+ },
1880
+ {
1881
+ "epoch": 5.919395465994962,
1882
+ "grad_norm": 29.17066764831543,
1883
+ "learning_rate": 3.094638359121986e-05,
1884
+ "loss": 0.0702,
1885
+ "step": 2350
1886
+ },
1887
+ {
1888
+ "epoch": 5.944584382871536,
1889
+ "grad_norm": 1611.7230224609375,
1890
+ "learning_rate": 3.0226700251889174e-05,
1891
+ "loss": 0.0687,
1892
+ "step": 2360
1893
+ },
1894
+ {
1895
+ "epoch": 5.969773299748111,
1896
+ "grad_norm": 168.02320861816406,
1897
+ "learning_rate": 2.950701691255848e-05,
1898
+ "loss": 0.0872,
1899
+ "step": 2370
1900
+ },
1901
+ {
1902
+ "epoch": 5.994962216624685,
1903
+ "grad_norm": 1058.51904296875,
1904
+ "learning_rate": 2.878733357322778e-05,
1905
+ "loss": 0.0008,
1906
+ "step": 2380
1907
+ },
1908
+ {
1909
+ "epoch": 6.020151133501259,
1910
+ "grad_norm": 23.99247932434082,
1911
+ "learning_rate": 2.8067650233897085e-05,
1912
+ "loss": 0.0526,
1913
+ "step": 2390
1914
+ },
1915
+ {
1916
+ "epoch": 6.045340050377834,
1917
+ "grad_norm": 2630.893798828125,
1918
+ "learning_rate": 2.734796689456639e-05,
1919
+ "loss": 0.0008,
1920
+ "step": 2400
1921
+ },
1922
+ {
1923
+ "epoch": 6.045340050377834,
1924
+ "eval_accuracy": 0.5654296875,
1925
+ "eval_f1_macro": 0.49182304765519874,
1926
+ "eval_loss": 4.129029750823975,
1927
+ "eval_runtime": 11.7471,
1928
+ "eval_samples_per_second": 87.17,
1929
+ "eval_steps_per_second": 5.448,
1930
+ "step": 2400
1931
+ },
1932
+ {
1933
+ "epoch": 6.0705289672544085,
1934
+ "grad_norm": 3671.589111328125,
1935
+ "learning_rate": 2.66282835552357e-05,
1936
+ "loss": 0.0009,
1937
+ "step": 2410
1938
+ },
1939
+ {
1940
+ "epoch": 6.095717884130982,
1941
+ "grad_norm": 246.3783721923828,
1942
+ "learning_rate": 2.5908600215905003e-05,
1943
+ "loss": 0.0279,
1944
+ "step": 2420
1945
+ },
1946
+ {
1947
+ "epoch": 6.120906801007557,
1948
+ "grad_norm": 250481.90625,
1949
+ "learning_rate": 2.5188916876574308e-05,
1950
+ "loss": 0.0367,
1951
+ "step": 2430
1952
+ },
1953
+ {
1954
+ "epoch": 6.146095717884131,
1955
+ "grad_norm": 5997.31396484375,
1956
+ "learning_rate": 2.4469233537243613e-05,
1957
+ "loss": 0.0002,
1958
+ "step": 2440
1959
+ },
1960
+ {
1961
+ "epoch": 6.171284634760705,
1962
+ "grad_norm": 294020.6875,
1963
+ "learning_rate": 2.374955019791292e-05,
1964
+ "loss": 0.0447,
1965
+ "step": 2450
1966
+ },
1967
+ {
1968
+ "epoch": 6.19647355163728,
1969
+ "grad_norm": 5173.7607421875,
1970
+ "learning_rate": 2.3029866858582226e-05,
1971
+ "loss": 0.0006,
1972
+ "step": 2460
1973
+ },
1974
+ {
1975
+ "epoch": 6.221662468513854,
1976
+ "grad_norm": 4.966667175292969,
1977
+ "learning_rate": 2.2310183519251528e-05,
1978
+ "loss": 0.0394,
1979
+ "step": 2470
1980
+ },
1981
+ {
1982
+ "epoch": 6.246851385390428,
1983
+ "grad_norm": 328896.6875,
1984
+ "learning_rate": 2.1590500179920836e-05,
1985
+ "loss": 0.0725,
1986
+ "step": 2480
1987
+ },
1988
+ {
1989
+ "epoch": 6.272040302267002,
1990
+ "grad_norm": 855.643798828125,
1991
+ "learning_rate": 2.087081684059014e-05,
1992
+ "loss": 0.0001,
1993
+ "step": 2490
1994
+ },
1995
+ {
1996
+ "epoch": 6.297229219143577,
1997
+ "grad_norm": 18.617643356323242,
1998
+ "learning_rate": 2.015113350125945e-05,
1999
+ "loss": 0.0024,
2000
+ "step": 2500
2001
+ },
2002
+ {
2003
+ "epoch": 6.297229219143577,
2004
+ "eval_accuracy": 0.5654296875,
2005
+ "eval_f1_macro": 0.4862593826724424,
2006
+ "eval_loss": 4.414713382720947,
2007
+ "eval_runtime": 11.3987,
2008
+ "eval_samples_per_second": 89.835,
2009
+ "eval_steps_per_second": 5.615,
2010
+ "step": 2500
2011
+ },
2012
+ {
2013
+ "epoch": 6.3224181360201515,
2014
+ "grad_norm": 43.19185256958008,
2015
+ "learning_rate": 1.943145016192875e-05,
2016
+ "loss": 0.005,
2017
+ "step": 2510
2018
+ },
2019
+ {
2020
+ "epoch": 6.347607052896725,
2021
+ "grad_norm": 636742.8125,
2022
+ "learning_rate": 1.871176682259806e-05,
2023
+ "loss": 0.0187,
2024
+ "step": 2520
2025
+ },
2026
+ {
2027
+ "epoch": 6.3727959697733,
2028
+ "grad_norm": 14.576433181762695,
2029
+ "learning_rate": 1.7992083483267364e-05,
2030
+ "loss": 0.0008,
2031
+ "step": 2530
2032
+ },
2033
+ {
2034
+ "epoch": 6.3979848866498745,
2035
+ "grad_norm": 22.115917205810547,
2036
+ "learning_rate": 1.727240014393667e-05,
2037
+ "loss": 0.0038,
2038
+ "step": 2540
2039
+ },
2040
+ {
2041
+ "epoch": 6.423173803526448,
2042
+ "grad_norm": 618.1704711914062,
2043
+ "learning_rate": 1.6552716804605974e-05,
2044
+ "loss": 0.0685,
2045
+ "step": 2550
2046
+ },
2047
+ {
2048
+ "epoch": 6.448362720403023,
2049
+ "grad_norm": 279531.71875,
2050
+ "learning_rate": 1.583303346527528e-05,
2051
+ "loss": 0.0035,
2052
+ "step": 2560
2053
+ },
2054
+ {
2055
+ "epoch": 6.473551637279597,
2056
+ "grad_norm": 265.0247802734375,
2057
+ "learning_rate": 1.5113350125944587e-05,
2058
+ "loss": 0.0245,
2059
+ "step": 2570
2060
+ },
2061
+ {
2062
+ "epoch": 6.498740554156171,
2063
+ "grad_norm": 635805.875,
2064
+ "learning_rate": 1.439366678661389e-05,
2065
+ "loss": 0.0052,
2066
+ "step": 2580
2067
+ },
2068
+ {
2069
+ "epoch": 6.523929471032746,
2070
+ "grad_norm": 8.843326568603516,
2071
+ "learning_rate": 1.3673983447283195e-05,
2072
+ "loss": 0.0,
2073
+ "step": 2590
2074
+ },
2075
+ {
2076
+ "epoch": 6.54911838790932,
2077
+ "grad_norm": 87.60523986816406,
2078
+ "learning_rate": 1.2954300107952502e-05,
2079
+ "loss": 0.0002,
2080
+ "step": 2600
2081
+ },
2082
+ {
2083
+ "epoch": 6.54911838790932,
2084
+ "eval_accuracy": 0.5654296875,
2085
+ "eval_f1_macro": 0.4913330578351924,
2086
+ "eval_loss": 4.520939826965332,
2087
+ "eval_runtime": 11.8845,
2088
+ "eval_samples_per_second": 86.163,
2089
+ "eval_steps_per_second": 5.385,
2090
+ "step": 2600
2091
+ },
2092
+ {
2093
+ "epoch": 6.574307304785894,
2094
+ "grad_norm": 8.957763671875,
2095
+ "learning_rate": 1.2234616768621806e-05,
2096
+ "loss": 0.0,
2097
+ "step": 2610
2098
+ },
2099
+ {
2100
+ "epoch": 6.599496221662468,
2101
+ "grad_norm": 6.1030144691467285,
2102
+ "learning_rate": 1.1514933429291113e-05,
2103
+ "loss": 0.0009,
2104
+ "step": 2620
2105
+ },
2106
+ {
2107
+ "epoch": 6.624685138539043,
2108
+ "grad_norm": 4.862893581390381,
2109
+ "learning_rate": 1.0795250089960418e-05,
2110
+ "loss": 0.1231,
2111
+ "step": 2630
2112
+ },
2113
+ {
2114
+ "epoch": 6.6498740554156175,
2115
+ "grad_norm": 598.0420532226562,
2116
+ "learning_rate": 1.0075566750629725e-05,
2117
+ "loss": 0.0001,
2118
+ "step": 2640
2119
+ },
2120
+ {
2121
+ "epoch": 6.675062972292191,
2122
+ "grad_norm": 5.682479381561279,
2123
+ "learning_rate": 9.35588341129903e-06,
2124
+ "loss": 0.0365,
2125
+ "step": 2650
2126
+ },
2127
+ {
2128
+ "epoch": 6.700251889168766,
2129
+ "grad_norm": 1.800890564918518,
2130
+ "learning_rate": 8.636200071968334e-06,
2131
+ "loss": 0.133,
2132
+ "step": 2660
2133
+ },
2134
+ {
2135
+ "epoch": 6.72544080604534,
2136
+ "grad_norm": 718.6954345703125,
2137
+ "learning_rate": 7.91651673263764e-06,
2138
+ "loss": 0.0,
2139
+ "step": 2670
2140
+ },
2141
+ {
2142
+ "epoch": 6.750629722921914,
2143
+ "grad_norm": 338166.46875,
2144
+ "learning_rate": 7.196833393306945e-06,
2145
+ "loss": 0.034,
2146
+ "step": 2680
2147
+ },
2148
+ {
2149
+ "epoch": 6.775818639798489,
2150
+ "grad_norm": 471.62005615234375,
2151
+ "learning_rate": 6.477150053976251e-06,
2152
+ "loss": 0.0599,
2153
+ "step": 2690
2154
+ },
2155
+ {
2156
+ "epoch": 6.801007556675063,
2157
+ "grad_norm": 308.09417724609375,
2158
+ "learning_rate": 5.7574667146455565e-06,
2159
+ "loss": 0.0055,
2160
+ "step": 2700
2161
+ },
2162
+ {
2163
+ "epoch": 6.801007556675063,
2164
+ "eval_accuracy": 0.58203125,
2165
+ "eval_f1_macro": 0.5067161880167751,
2166
+ "eval_loss": 4.515384197235107,
2167
+ "eval_runtime": 11.8883,
2168
+ "eval_samples_per_second": 86.135,
2169
+ "eval_steps_per_second": 5.383,
2170
+ "step": 2700
2171
+ },
2172
+ {
2173
+ "epoch": 6.826196473551637,
2174
+ "grad_norm": 407.3280334472656,
2175
+ "learning_rate": 5.037783375314862e-06,
2176
+ "loss": 0.0,
2177
+ "step": 2710
2178
+ },
2179
+ {
2180
+ "epoch": 6.851385390428211,
2181
+ "grad_norm": 396.4019470214844,
2182
+ "learning_rate": 4.318100035984167e-06,
2183
+ "loss": 0.0015,
2184
+ "step": 2720
2185
+ },
2186
+ {
2187
+ "epoch": 6.876574307304786,
2188
+ "grad_norm": 2.2462317943573,
2189
+ "learning_rate": 3.5984166966534725e-06,
2190
+ "loss": 0.0007,
2191
+ "step": 2730
2192
+ },
2193
+ {
2194
+ "epoch": 6.9017632241813605,
2195
+ "grad_norm": 40.78224182128906,
2196
+ "learning_rate": 2.8787333573227783e-06,
2197
+ "loss": 0.0001,
2198
+ "step": 2740
2199
+ },
2200
+ {
2201
+ "epoch": 6.926952141057934,
2202
+ "grad_norm": 11996.0771484375,
2203
+ "learning_rate": 2.1590500179920836e-06,
2204
+ "loss": 0.0002,
2205
+ "step": 2750
2206
+ },
2207
+ {
2208
+ "epoch": 6.952141057934509,
2209
+ "grad_norm": 2.2435834407806396,
2210
+ "learning_rate": 1.4393666786613891e-06,
2211
+ "loss": 0.0001,
2212
+ "step": 2760
2213
+ },
2214
+ {
2215
+ "epoch": 6.977329974811083,
2216
+ "grad_norm": 2109.901123046875,
2217
+ "learning_rate": 7.196833393306946e-07,
2218
+ "loss": 0.0832,
2219
+ "step": 2770
2220
+ },
2221
+ {
2222
+ "epoch": 7.0,
2223
+ "step": 2779,
2224
+ "total_flos": 2.5585840915697664e+18,
2225
+ "train_loss": 0.4836694428009911,
2226
+ "train_runtime": 1538.4023,
2227
+ "train_samples_per_second": 28.894,
2228
+ "train_steps_per_second": 1.806
2229
  }
2230
  ],
2231
  "logging_steps": 10,
2232
+ "max_steps": 2779,
2233
  "num_input_tokens_seen": 0,
2234
+ "num_train_epochs": 7,
2235
  "save_steps": 100,
2236
  "stateful_callbacks": {
2237
  "TrainerControl": {
 
2245
  "attributes": {}
2246
  }
2247
  },
2248
+ "total_flos": 2.5585840915697664e+18,
2249
  "train_batch_size": 16,
2250
  "trial_name": null,
2251
  "trial_params": null