PaceAhh commited on
Commit
af68c42
·
verified ·
1 Parent(s): 741d30d

Add fine-tuned reverse model

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:514393859ed0d46275228e8f825f8b30494c8b087c4be0aa14131e53baddf126
3
  size 8405472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:785ee56c23465a10b4464b35d71e6e3a37ecea3efc9900199739964a182f8463
3
  size 8405472
checkpoint-1200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9452fdc38ddc13e5d1210d8b52f37e520b0f86c5f808f242671be648544eddc
3
  size 8405472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40325bc6b0d94db4ff1a7099dfde12ccef64d1d09393654f3f65dfc87b57038
3
  size 8405472
checkpoint-1200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf2a9129cb2ae5b0835d3c2b2adb8333cf56b48bf8551978e2c4f6207d0f593c
3
  size 4411194
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae87b3adf150d00d014ba15dca822ab4a737276032c9b389628c5d4c41f2e25f
3
  size 4411194
checkpoint-1200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45e3e7441265ecf1e8b3eda15b5f0ad8e8c8a0667fbf52c460fbf9a0cb01d39c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c80e82a7639e588810bfa55efd6a4dda224b512193918b24627c05ad13a1a2b
3
  size 14244
checkpoint-1200/trainer_state.json CHANGED
@@ -10,842 +10,842 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004062563477554337,
13
- "grad_norm": 0.9910523295402527,
14
  "learning_rate": 9.91876523151909e-05,
15
- "loss": 2.0449,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.008125126955108674,
20
- "grad_norm": 0.8047148585319519,
21
- "learning_rate": 9.83753046303818e-05,
22
- "loss": 1.9782,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.01218769043266301,
27
- "grad_norm": 0.6215866208076477,
28
- "learning_rate": 9.756295694557271e-05,
29
- "loss": 1.7088,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01625025391021735,
34
- "grad_norm": 0.8722022175788879,
35
- "learning_rate": 9.675060926076362e-05,
36
- "loss": 1.6144,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.020312817387771683,
41
- "grad_norm": 2.4887263774871826,
42
- "learning_rate": 9.593826157595452e-05,
43
- "loss": 1.6992,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.02437538086532602,
48
- "grad_norm": 1.4015988111495972,
49
- "learning_rate": 9.528838342810723e-05,
50
- "loss": 1.2877,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.028437944342880356,
55
- "grad_norm": 2.6476988792419434,
56
- "learning_rate": 9.447603574329814e-05,
57
- "loss": 1.4567,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0325005078204347,
62
- "grad_norm": 1.3684196472167969,
63
- "learning_rate": 9.366368805848904e-05,
64
- "loss": 1.4383,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.03656307129798903,
69
- "grad_norm": 3.351715564727783,
70
- "learning_rate": 9.285134037367993e-05,
71
- "loss": 1.4779,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.040625634775543366,
76
- "grad_norm": 1.4918268918991089,
77
- "learning_rate": 9.203899268887084e-05,
78
- "loss": 1.4002,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.04468819825309771,
83
- "grad_norm": 1.6876213550567627,
84
- "learning_rate": 9.122664500406174e-05,
85
- "loss": 1.5442,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.04875076173065204,
90
- "grad_norm": 0.7458980083465576,
91
- "learning_rate": 9.041429731925265e-05,
92
- "loss": 1.3639,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.05281332520820638,
97
- "grad_norm": 0.7894807457923889,
98
- "learning_rate": 8.960194963444355e-05,
99
- "loss": 1.4716,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.05687588868576071,
104
- "grad_norm": 0.49681055545806885,
105
- "learning_rate": 8.878960194963445e-05,
106
- "loss": 1.4235,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.06093845216331505,
111
- "grad_norm": 0.829867959022522,
112
- "learning_rate": 8.797725426482534e-05,
113
- "loss": 1.3474,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0650010156408694,
118
- "grad_norm": 1.1840708255767822,
119
- "learning_rate": 8.716490658001625e-05,
120
- "loss": 1.4501,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.06906357911842373,
125
- "grad_norm": 0.8244519233703613,
126
- "learning_rate": 8.635255889520715e-05,
127
- "loss": 1.3456,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.07312614259597806,
132
- "grad_norm": 0.5561311841011047,
133
- "learning_rate": 8.554021121039806e-05,
134
- "loss": 1.5179,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0771887060735324,
139
- "grad_norm": 0.868900716304779,
140
- "learning_rate": 8.472786352558896e-05,
141
- "loss": 1.4277,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.08125126955108673,
146
- "grad_norm": 3.0395560264587402,
147
- "learning_rate": 8.391551584077985e-05,
148
- "loss": 1.3674,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.08531383302864107,
153
- "grad_norm": 1.3599716424942017,
154
- "learning_rate": 8.310316815597076e-05,
155
- "loss": 1.4285,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.08937639650619542,
160
- "grad_norm": 2.562626838684082,
161
- "learning_rate": 8.229082047116166e-05,
162
- "loss": 1.4496,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.09343895998374975,
167
- "grad_norm": 0.9251193404197693,
168
- "learning_rate": 8.147847278635257e-05,
169
- "loss": 1.3718,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.09750152346130408,
174
- "grad_norm": 1.510450839996338,
175
- "learning_rate": 8.066612510154347e-05,
176
- "loss": 1.3749,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.10156408693885842,
181
- "grad_norm": 0.7255865335464478,
182
- "learning_rate": 7.985377741673436e-05,
183
- "loss": 1.3185,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.10562665041641275,
188
- "grad_norm": 0.9007101655006409,
189
- "learning_rate": 7.904142973192526e-05,
190
- "loss": 1.4268,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.10968921389396709,
195
- "grad_norm": 0.9022626280784607,
196
- "learning_rate": 7.822908204711617e-05,
197
- "loss": 1.3011,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.11375177737152142,
202
- "grad_norm": 1.0873183012008667,
203
- "learning_rate": 7.741673436230708e-05,
204
- "loss": 1.4486,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.11781434084907577,
209
- "grad_norm": 0.7647557854652405,
210
- "learning_rate": 7.660438667749798e-05,
211
- "loss": 1.3715,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.1218769043266301,
216
- "grad_norm": 0.9489114284515381,
217
- "learning_rate": 7.579203899268887e-05,
218
- "loss": 1.4807,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.12593946780418444,
223
- "grad_norm": 1.9156900644302368,
224
- "learning_rate": 7.497969130787977e-05,
225
- "loss": 1.4448,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1300020312817388,
230
- "grad_norm": 2.21012544631958,
231
- "learning_rate": 7.416734362307067e-05,
232
- "loss": 1.6132,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1340645947592931,
237
- "grad_norm": 0.9065983295440674,
238
- "learning_rate": 7.335499593826158e-05,
239
- "loss": 1.2961,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.13812715823684746,
244
- "grad_norm": 0.7284504771232605,
245
- "learning_rate": 7.254264825345249e-05,
246
- "loss": 1.4802,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.14218972171440178,
251
- "grad_norm": 3.0846526622772217,
252
- "learning_rate": 7.173030056864339e-05,
253
- "loss": 1.4462,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.14625228519195613,
258
- "grad_norm": 1.0574288368225098,
259
- "learning_rate": 7.091795288383428e-05,
260
- "loss": 1.3832,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.15031484866951045,
265
- "grad_norm": 1.7289760112762451,
266
- "learning_rate": 7.010560519902518e-05,
267
- "loss": 1.2393,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1543774121470648,
272
- "grad_norm": 0.689789891242981,
273
- "learning_rate": 6.929325751421609e-05,
274
- "loss": 1.3216,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.15843997562461914,
279
- "grad_norm": 1.3808571100234985,
280
- "learning_rate": 6.848090982940699e-05,
281
- "loss": 1.3955,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.16250253910217347,
286
- "grad_norm": 1.0446319580078125,
287
- "learning_rate": 6.76685621445979e-05,
288
- "loss": 1.387,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.16656510257972781,
293
- "grad_norm": 1.325724720954895,
294
- "learning_rate": 6.685621445978879e-05,
295
- "loss": 1.3749,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.17062766605728213,
300
- "grad_norm": 0.9511458873748779,
301
- "learning_rate": 6.604386677497969e-05,
302
- "loss": 1.2309,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.17469022953483648,
307
- "grad_norm": 1.0929605960845947,
308
- "learning_rate": 6.52315190901706e-05,
309
- "loss": 1.5331,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.17875279301239083,
314
- "grad_norm": 2.1583609580993652,
315
- "learning_rate": 6.44191714053615e-05,
316
- "loss": 1.5038,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.18281535648994515,
321
- "grad_norm": 1.5003691911697388,
322
- "learning_rate": 6.36068237205524e-05,
323
- "loss": 1.285,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1868779199674995,
328
- "grad_norm": 3.8774428367614746,
329
- "learning_rate": 6.27944760357433e-05,
330
- "loss": 1.4569,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.19094048344505382,
335
- "grad_norm": 0.5628882646560669,
336
- "learning_rate": 6.19821283509342e-05,
337
- "loss": 1.3662,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.19500304692260817,
342
- "grad_norm": 3.2969565391540527,
343
- "learning_rate": 6.116978066612511e-05,
344
- "loss": 1.3893,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1990656104001625,
349
- "grad_norm": 1.869675636291504,
350
- "learning_rate": 6.035743298131601e-05,
351
- "loss": 1.3349,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.20312817387771684,
356
- "grad_norm": 0.843262791633606,
357
- "learning_rate": 5.954508529650691e-05,
358
- "loss": 1.2563,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.2071907373552712,
363
- "grad_norm": 1.2158210277557373,
364
- "learning_rate": 5.873273761169781e-05,
365
- "loss": 1.4126,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.2112533008328255,
370
- "grad_norm": 1.8899216651916504,
371
- "learning_rate": 5.7920389926888705e-05,
372
- "loss": 1.3794,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.21531586431037986,
377
- "grad_norm": 0.5589442253112793,
378
- "learning_rate": 5.7108042242079615e-05,
379
- "loss": 1.2835,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.21937842778793418,
384
- "grad_norm": 1.7187498807907104,
385
- "learning_rate": 5.629569455727052e-05,
386
- "loss": 1.3301,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.22344099126548853,
391
- "grad_norm": 0.9800445437431335,
392
- "learning_rate": 5.5483346872461415e-05,
393
- "loss": 1.4507,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.22750355474304285,
398
- "grad_norm": 1.1055623292922974,
399
- "learning_rate": 5.467099918765232e-05,
400
- "loss": 1.4316,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.2315661182205972,
405
- "grad_norm": 0.9080348610877991,
406
- "learning_rate": 5.3858651502843216e-05,
407
- "loss": 1.3266,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.23562868169815154,
412
- "grad_norm": 1.9814904928207397,
413
- "learning_rate": 5.3046303818034126e-05,
414
- "loss": 1.4031,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.23969124517570586,
419
- "grad_norm": 1.1566354036331177,
420
- "learning_rate": 5.223395613322502e-05,
421
- "loss": 1.3616,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.2437538086532602,
426
- "grad_norm": 0.4727267324924469,
427
- "learning_rate": 5.1421608448415926e-05,
428
- "loss": 1.519,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.24781637213081453,
433
- "grad_norm": 0.7331522107124329,
434
- "learning_rate": 5.060926076360682e-05,
435
- "loss": 1.4483,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.2518789356083689,
440
- "grad_norm": 0.6916592717170715,
441
- "learning_rate": 4.979691307879773e-05,
442
- "loss": 1.3865,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.25594149908592323,
447
- "grad_norm": 1.3255294561386108,
448
- "learning_rate": 4.898456539398863e-05,
449
- "loss": 1.3529,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2600040625634776,
454
- "grad_norm": 1.6844894886016846,
455
- "learning_rate": 4.817221770917953e-05,
456
- "loss": 1.5159,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.26406662604103187,
461
- "grad_norm": 0.6133401393890381,
462
  "learning_rate": 4.735987002437044e-05,
463
- "loss": 1.4394,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2681291895185862,
468
- "grad_norm": 2.1573598384857178,
469
  "learning_rate": 4.6547522339561334e-05,
470
- "loss": 1.4777,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.27219175299614057,
475
- "grad_norm": 0.7368021607398987,
476
  "learning_rate": 4.573517465475224e-05,
477
- "loss": 1.2622,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2762543164736949,
482
- "grad_norm": 1.5155552625656128,
483
  "learning_rate": 4.492282696994314e-05,
484
  "loss": 1.3557,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.28031687995124926,
489
- "grad_norm": 0.580403208732605,
490
  "learning_rate": 4.411047928513404e-05,
491
- "loss": 1.4629,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.28437944342880356,
496
- "grad_norm": 0.5736162662506104,
497
  "learning_rate": 4.329813160032494e-05,
498
- "loss": 1.415,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2884420069063579,
503
- "grad_norm": 1.124528169631958,
504
  "learning_rate": 4.2485783915515845e-05,
505
- "loss": 1.5554,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.29250457038391225,
510
- "grad_norm": 0.8589762449264526,
511
  "learning_rate": 4.167343623070675e-05,
512
- "loss": 1.3481,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2965671338614666,
517
- "grad_norm": 0.7507007122039795,
518
  "learning_rate": 4.0861088545897645e-05,
519
- "loss": 1.5406,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.3006296973390209,
524
- "grad_norm": 1.1872320175170898,
525
  "learning_rate": 4.004874086108855e-05,
526
- "loss": 1.559,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.30469226081657524,
531
- "grad_norm": 1.1897319555282593,
532
  "learning_rate": 3.923639317627945e-05,
533
- "loss": 1.4123,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.3087548242941296,
538
- "grad_norm": 0.7811614274978638,
539
  "learning_rate": 3.842404549147035e-05,
540
- "loss": 1.2137,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.31281738777168394,
545
- "grad_norm": 1.252016305923462,
546
  "learning_rate": 3.761169780666125e-05,
547
- "loss": 1.4091,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.3168799512492383,
552
- "grad_norm": 1.0863845348358154,
553
  "learning_rate": 3.6799350121852156e-05,
554
- "loss": 1.4903,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.3209425147267926,
559
- "grad_norm": 1.1215578317642212,
560
  "learning_rate": 3.598700243704305e-05,
561
- "loss": 1.2848,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.32500507820434693,
566
- "grad_norm": 1.7457737922668457,
567
  "learning_rate": 3.517465475223396e-05,
568
- "loss": 1.3474,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.3290676416819013,
573
- "grad_norm": 0.7558390498161316,
574
  "learning_rate": 3.436230706742486e-05,
575
- "loss": 1.3283,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.33313020515945563,
580
- "grad_norm": 0.7559828162193298,
581
  "learning_rate": 3.3549959382615757e-05,
582
- "loss": 1.2811,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.33719276863701,
587
- "grad_norm": 0.5059702396392822,
588
  "learning_rate": 3.273761169780667e-05,
589
- "loss": 1.411,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.34125533211456427,
594
- "grad_norm": 0.6929866671562195,
595
  "learning_rate": 3.1925264012997564e-05,
596
- "loss": 1.1623,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.3453178955921186,
601
- "grad_norm": 0.7670132517814636,
602
  "learning_rate": 3.111291632818847e-05,
603
- "loss": 1.4531,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.34938045906967297,
608
- "grad_norm": 0.7590322494506836,
609
  "learning_rate": 3.0300568643379367e-05,
610
- "loss": 1.2966,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.3534430225472273,
615
- "grad_norm": 0.9692467451095581,
616
  "learning_rate": 2.9488220958570267e-05,
617
- "loss": 1.381,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.35750558602478166,
622
- "grad_norm": 1.5862103700637817,
623
  "learning_rate": 2.867587327376117e-05,
624
- "loss": 1.3523,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.36156814950233596,
629
- "grad_norm": 1.0058462619781494,
630
  "learning_rate": 2.786352558895207e-05,
631
- "loss": 1.4259,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.3656307129798903,
636
- "grad_norm": 2.4880478382110596,
637
  "learning_rate": 2.7051177904142978e-05,
638
- "loss": 1.4788,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.36969327645744465,
643
- "grad_norm": 1.9835453033447266,
644
  "learning_rate": 2.6238830219333875e-05,
645
- "loss": 1.4717,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.373755839934999,
650
- "grad_norm": 0.7700700759887695,
651
  "learning_rate": 2.5426482534524775e-05,
652
- "loss": 1.381,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.3778184034125533,
657
- "grad_norm": 1.249335765838623,
658
  "learning_rate": 2.461413484971568e-05,
659
- "loss": 1.3528,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.38188096689010764,
664
- "grad_norm": 0.6663157939910889,
665
  "learning_rate": 2.3801787164906582e-05,
666
- "loss": 1.3832,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.385943530367662,
671
- "grad_norm": 1.822151780128479,
672
  "learning_rate": 2.2989439480097482e-05,
673
- "loss": 1.4578,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.39000609384521634,
678
- "grad_norm": 2.3494231700897217,
679
  "learning_rate": 2.2177091795288386e-05,
680
- "loss": 1.2875,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3940686573227707,
685
- "grad_norm": 2.2792298793792725,
686
  "learning_rate": 2.1364744110479286e-05,
687
- "loss": 1.499,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.398131220800325,
692
- "grad_norm": 1.2619109153747559,
693
  "learning_rate": 2.0552396425670186e-05,
694
- "loss": 1.3482,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.40219378427787933,
699
- "grad_norm": 0.6195743680000305,
700
  "learning_rate": 1.974004874086109e-05,
701
- "loss": 1.3943,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.4062563477554337,
706
- "grad_norm": 0.8362967371940613,
707
  "learning_rate": 1.892770105605199e-05,
708
- "loss": 1.3816,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.410318911232988,
713
- "grad_norm": 0.7627909779548645,
714
  "learning_rate": 1.8115353371242893e-05,
715
- "loss": 1.2967,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.4143814747105424,
720
- "grad_norm": 1.2969974279403687,
721
  "learning_rate": 1.7303005686433797e-05,
722
- "loss": 1.2868,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.41844403818809667,
727
- "grad_norm": 1.1111704111099243,
728
  "learning_rate": 1.6490658001624697e-05,
729
- "loss": 1.3058,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.422506601665651,
734
- "grad_norm": 1.131678819656372,
735
  "learning_rate": 1.5678310316815597e-05,
736
- "loss": 1.3291,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.42656916514320536,
741
- "grad_norm": 0.8244335055351257,
742
  "learning_rate": 1.48659626320065e-05,
743
- "loss": 1.2973,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.4306317286207597,
748
- "grad_norm": 0.7791102528572083,
749
  "learning_rate": 1.4053614947197402e-05,
750
- "loss": 1.3749,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.43469429209831406,
755
- "grad_norm": 0.8316488862037659,
756
  "learning_rate": 1.3241267262388301e-05,
757
- "loss": 1.4213,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.43875685557586835,
762
- "grad_norm": 0.9478839635848999,
763
  "learning_rate": 1.2428919577579204e-05,
764
- "loss": 1.3864,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.4428194190534227,
769
- "grad_norm": 1.5177056789398193,
770
  "learning_rate": 1.1616571892770106e-05,
771
- "loss": 1.2423,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.44688198253097705,
776
- "grad_norm": 0.6691774725914001,
777
  "learning_rate": 1.0804224207961008e-05,
778
- "loss": 1.3767,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.4509445460085314,
783
- "grad_norm": 1.9306799173355103,
784
  "learning_rate": 9.99187652315191e-06,
785
- "loss": 1.4233,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.4550071094860857,
790
- "grad_norm": 1.131913423538208,
791
  "learning_rate": 9.179528838342812e-06,
792
- "loss": 1.3289,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.45906967296364004,
797
- "grad_norm": 0.7975111603736877,
798
  "learning_rate": 8.367181153533712e-06,
799
- "loss": 1.3472,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.4631322364411944,
804
- "grad_norm": 0.5920394659042358,
805
  "learning_rate": 7.554833468724615e-06,
806
- "loss": 1.2458,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.46719479991874874,
811
- "grad_norm": 0.8416665196418762,
812
  "learning_rate": 6.742485783915516e-06,
813
- "loss": 1.4424,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.4712573633963031,
818
- "grad_norm": 0.670789897441864,
819
  "learning_rate": 5.930138099106418e-06,
820
- "loss": 1.2866,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.4753199268738574,
825
- "grad_norm": 1.7391314506530762,
826
  "learning_rate": 5.117790414297319e-06,
827
- "loss": 1.4517,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.4793824903514117,
832
- "grad_norm": 1.0173118114471436,
833
  "learning_rate": 4.305442729488221e-06,
834
- "loss": 1.4109,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.4834450538289661,
839
- "grad_norm": 0.9398189187049866,
840
  "learning_rate": 3.4930950446791225e-06,
841
- "loss": 1.3602,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.4875076173065204,
846
- "grad_norm": 2.0191538333892822,
847
  "learning_rate": 2.6807473598700244e-06,
848
- "loss": 1.4342,
849
  "step": 1200
850
  }
851
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004062563477554337,
13
+ "grad_norm": 1.032738447189331,
14
  "learning_rate": 9.91876523151909e-05,
15
+ "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.008125126955108674,
20
+ "grad_norm": 0.8217566609382629,
21
+ "learning_rate": 9.845653939886271e-05,
22
+ "loss": 1.9828,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.01218769043266301,
27
+ "grad_norm": 0.6819207668304443,
28
+ "learning_rate": 9.764419171405362e-05,
29
+ "loss": 1.7187,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01625025391021735,
34
+ "grad_norm": 0.9606077075004578,
35
+ "learning_rate": 9.683184402924452e-05,
36
+ "loss": 1.6162,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.020312817387771683,
41
+ "grad_norm": 2.6619677543640137,
42
+ "learning_rate": 9.601949634443542e-05,
43
+ "loss": 1.6977,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.02437538086532602,
48
+ "grad_norm": 1.5025429725646973,
49
+ "learning_rate": 9.520714865962633e-05,
50
+ "loss": 1.2874,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.028437944342880356,
55
+ "grad_norm": 2.7792210578918457,
56
+ "learning_rate": 9.439480097481722e-05,
57
+ "loss": 1.4551,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0325005078204347,
62
+ "grad_norm": 1.557523250579834,
63
+ "learning_rate": 9.358245329000814e-05,
64
+ "loss": 1.4412,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.03656307129798903,
69
+ "grad_norm": 3.4479055404663086,
70
+ "learning_rate": 9.277010560519903e-05,
71
+ "loss": 1.4806,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.040625634775543366,
76
+ "grad_norm": 1.282782793045044,
77
+ "learning_rate": 9.195775792038993e-05,
78
+ "loss": 1.3971,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.04468819825309771,
83
+ "grad_norm": 1.6942788362503052,
84
+ "learning_rate": 9.114541023558083e-05,
85
+ "loss": 1.543,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.04875076173065204,
90
+ "grad_norm": 0.7825320363044739,
91
+ "learning_rate": 9.033306255077174e-05,
92
+ "loss": 1.3638,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.05281332520820638,
97
+ "grad_norm": 0.7435455918312073,
98
+ "learning_rate": 8.952071486596265e-05,
99
+ "loss": 1.4731,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.05687588868576071,
104
+ "grad_norm": 0.5097799897193909,
105
+ "learning_rate": 8.870836718115354e-05,
106
+ "loss": 1.4218,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.06093845216331505,
111
+ "grad_norm": 0.8880577087402344,
112
+ "learning_rate": 8.789601949634444e-05,
113
+ "loss": 1.3477,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0650010156408694,
118
+ "grad_norm": 1.1695178747177124,
119
+ "learning_rate": 8.708367181153534e-05,
120
+ "loss": 1.4521,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.06906357911842373,
125
+ "grad_norm": 0.7951045036315918,
126
+ "learning_rate": 8.627132412672623e-05,
127
+ "loss": 1.3417,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.07312614259597806,
132
+ "grad_norm": 0.5302656888961792,
133
+ "learning_rate": 8.545897644191714e-05,
134
+ "loss": 1.5202,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0771887060735324,
139
+ "grad_norm": 0.8830013275146484,
140
+ "learning_rate": 8.464662875710805e-05,
141
+ "loss": 1.4271,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.08125126955108673,
146
+ "grad_norm": 3.1300947666168213,
147
+ "learning_rate": 8.383428107229895e-05,
148
+ "loss": 1.3743,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.08531383302864107,
153
+ "grad_norm": 1.077304720878601,
154
+ "learning_rate": 8.302193338748985e-05,
155
+ "loss": 1.4244,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.08937639650619542,
160
+ "grad_norm": 1.6846965551376343,
161
+ "learning_rate": 8.220958570268074e-05,
162
+ "loss": 1.4471,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.09343895998374975,
167
+ "grad_norm": 1.066351056098938,
168
+ "learning_rate": 8.139723801787165e-05,
169
+ "loss": 1.3723,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.09750152346130408,
174
+ "grad_norm": 1.6999150514602661,
175
+ "learning_rate": 8.058489033306255e-05,
176
+ "loss": 1.368,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.10156408693885842,
181
+ "grad_norm": 0.7261852622032166,
182
+ "learning_rate": 7.977254264825346e-05,
183
+ "loss": 1.3175,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.10562665041641275,
188
+ "grad_norm": 0.8671672344207764,
189
+ "learning_rate": 7.896019496344436e-05,
190
+ "loss": 1.4264,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.10968921389396709,
195
+ "grad_norm": 0.9500339031219482,
196
+ "learning_rate": 7.814784727863525e-05,
197
+ "loss": 1.2985,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.11375177737152142,
202
+ "grad_norm": 1.071378469467163,
203
+ "learning_rate": 7.733549959382617e-05,
204
+ "loss": 1.447,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.11781434084907577,
209
+ "grad_norm": 0.7448930144309998,
210
+ "learning_rate": 7.652315190901706e-05,
211
+ "loss": 1.374,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.1218769043266301,
216
+ "grad_norm": 0.9604855179786682,
217
+ "learning_rate": 7.571080422420796e-05,
218
+ "loss": 1.4803,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.12593946780418444,
223
+ "grad_norm": 1.9951509237289429,
224
+ "learning_rate": 7.489845653939887e-05,
225
+ "loss": 1.4477,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1300020312817388,
230
+ "grad_norm": 2.3266055583953857,
231
+ "learning_rate": 7.408610885458977e-05,
232
+ "loss": 1.6107,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1340645947592931,
237
+ "grad_norm": 0.878081738948822,
238
+ "learning_rate": 7.327376116978068e-05,
239
+ "loss": 1.295,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.13812715823684746,
244
+ "grad_norm": 0.7642931938171387,
245
+ "learning_rate": 7.246141348497157e-05,
246
+ "loss": 1.4776,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.14218972171440178,
251
+ "grad_norm": 3.2081470489501953,
252
+ "learning_rate": 7.164906580016247e-05,
253
+ "loss": 1.448,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.14625228519195613,
258
+ "grad_norm": 0.9164071679115295,
259
+ "learning_rate": 7.083671811535338e-05,
260
+ "loss": 1.3792,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.15031484866951045,
265
+ "grad_norm": 1.9146902561187744,
266
+ "learning_rate": 7.002437043054428e-05,
267
+ "loss": 1.239,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1543774121470648,
272
+ "grad_norm": 0.6990465521812439,
273
+ "learning_rate": 6.921202274573519e-05,
274
+ "loss": 1.3211,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.15843997562461914,
279
+ "grad_norm": 1.7084014415740967,
280
+ "learning_rate": 6.839967506092608e-05,
281
+ "loss": 1.3952,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.16250253910217347,
286
+ "grad_norm": 1.0569523572921753,
287
+ "learning_rate": 6.758732737611698e-05,
288
+ "loss": 1.385,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.16656510257972781,
293
+ "grad_norm": 1.4293252229690552,
294
+ "learning_rate": 6.677497969130788e-05,
295
+ "loss": 1.3736,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.17062766605728213,
300
+ "grad_norm": 1.0318830013275146,
301
+ "learning_rate": 6.596263200649879e-05,
302
+ "loss": 1.2315,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.17469022953483648,
307
+ "grad_norm": 1.0838662385940552,
308
+ "learning_rate": 6.515028432168968e-05,
309
+ "loss": 1.5313,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.17875279301239083,
314
+ "grad_norm": 2.388718366622925,
315
+ "learning_rate": 6.43379366368806e-05,
316
+ "loss": 1.5047,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.18281535648994515,
321
+ "grad_norm": 1.58635413646698,
322
+ "learning_rate": 6.352558895207149e-05,
323
+ "loss": 1.2847,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1868779199674995,
328
+ "grad_norm": 4.313031196594238,
329
+ "learning_rate": 6.271324126726239e-05,
330
+ "loss": 1.46,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.19094048344505382,
335
+ "grad_norm": 0.5860478281974792,
336
+ "learning_rate": 6.190089358245328e-05,
337
+ "loss": 1.3614,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.19500304692260817,
342
+ "grad_norm": 3.0871775150299072,
343
+ "learning_rate": 6.10885458976442e-05,
344
+ "loss": 1.3915,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1990656104001625,
349
+ "grad_norm": 2.025517463684082,
350
+ "learning_rate": 6.02761982128351e-05,
351
+ "loss": 1.3326,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.20312817387771684,
356
+ "grad_norm": 0.8149346709251404,
357
+ "learning_rate": 5.9463850528026e-05,
358
+ "loss": 1.2553,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.2071907373552712,
363
+ "grad_norm": 1.1703840494155884,
364
+ "learning_rate": 5.86515028432169e-05,
365
+ "loss": 1.4137,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.2112533008328255,
370
+ "grad_norm": 2.0241522789001465,
371
+ "learning_rate": 5.7839155158407796e-05,
372
+ "loss": 1.3763,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.21531586431037986,
377
+ "grad_norm": 0.5776875019073486,
378
+ "learning_rate": 5.70268074735987e-05,
379
+ "loss": 1.282,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.21937842778793418,
384
+ "grad_norm": 1.644302487373352,
385
+ "learning_rate": 5.621445978878961e-05,
386
+ "loss": 1.3309,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.22344099126548853,
391
+ "grad_norm": 1.0667213201522827,
392
+ "learning_rate": 5.5402112103980506e-05,
393
+ "loss": 1.4516,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.22750355474304285,
398
+ "grad_norm": 1.1769509315490723,
399
+ "learning_rate": 5.458976441917141e-05,
400
+ "loss": 1.4305,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.2315661182205972,
405
+ "grad_norm": 0.9089193940162659,
406
+ "learning_rate": 5.377741673436231e-05,
407
+ "loss": 1.3263,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.23562868169815154,
412
+ "grad_norm": 1.9467777013778687,
413
+ "learning_rate": 5.2965069049553203e-05,
414
+ "loss": 1.402,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.23969124517570586,
419
+ "grad_norm": 1.313186526298523,
420
+ "learning_rate": 5.2152721364744114e-05,
421
+ "loss": 1.3604,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.2437538086532602,
426
+ "grad_norm": 0.48065802454948425,
427
+ "learning_rate": 5.134037367993502e-05,
428
+ "loss": 1.5186,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.24781637213081453,
433
+ "grad_norm": 0.7379820942878723,
434
+ "learning_rate": 5.0528025995125914e-05,
435
+ "loss": 1.4457,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.2518789356083689,
440
+ "grad_norm": 0.7363265156745911,
441
+ "learning_rate": 4.971567831031682e-05,
442
+ "loss": 1.3843,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.25594149908592323,
447
+ "grad_norm": 1.349368691444397,
448
+ "learning_rate": 4.890333062550772e-05,
449
+ "loss": 1.355,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2600040625634776,
454
+ "grad_norm": 1.6113736629486084,
455
+ "learning_rate": 4.809098294069862e-05,
456
+ "loss": 1.5113,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.26406662604103187,
461
+ "grad_norm": 0.7977277636528015,
462
  "learning_rate": 4.735987002437044e-05,
463
+ "loss": 1.4413,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2681291895185862,
468
+ "grad_norm": 2.2527716159820557,
469
  "learning_rate": 4.6547522339561334e-05,
470
+ "loss": 1.4792,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.27219175299614057,
475
+ "grad_norm": 0.7665427923202515,
476
  "learning_rate": 4.573517465475224e-05,
477
+ "loss": 1.2581,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2762543164736949,
482
+ "grad_norm": 1.6001981496810913,
483
  "learning_rate": 4.492282696994314e-05,
484
  "loss": 1.3557,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.28031687995124926,
489
+ "grad_norm": 0.6028335690498352,
490
  "learning_rate": 4.411047928513404e-05,
491
+ "loss": 1.4625,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.28437944342880356,
496
+ "grad_norm": 0.621457576751709,
497
  "learning_rate": 4.329813160032494e-05,
498
+ "loss": 1.4177,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2884420069063579,
503
+ "grad_norm": 1.08269464969635,
504
  "learning_rate": 4.2485783915515845e-05,
505
+ "loss": 1.5553,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.29250457038391225,
510
+ "grad_norm": 0.7009453177452087,
511
  "learning_rate": 4.167343623070675e-05,
512
+ "loss": 1.3478,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2965671338614666,
517
+ "grad_norm": 0.7686846256256104,
518
  "learning_rate": 4.0861088545897645e-05,
519
+ "loss": 1.5388,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.3006296973390209,
524
+ "grad_norm": 1.193659782409668,
525
  "learning_rate": 4.004874086108855e-05,
526
+ "loss": 1.558,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.30469226081657524,
531
+ "grad_norm": 0.9236335754394531,
532
  "learning_rate": 3.923639317627945e-05,
533
+ "loss": 1.4143,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.3087548242941296,
538
+ "grad_norm": 0.7839071750640869,
539
  "learning_rate": 3.842404549147035e-05,
540
+ "loss": 1.2117,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.31281738777168394,
545
+ "grad_norm": 1.2941004037857056,
546
  "learning_rate": 3.761169780666125e-05,
547
+ "loss": 1.4062,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.3168799512492383,
552
+ "grad_norm": 1.426131010055542,
553
  "learning_rate": 3.6799350121852156e-05,
554
+ "loss": 1.4909,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.3209425147267926,
559
+ "grad_norm": 1.1472172737121582,
560
  "learning_rate": 3.598700243704305e-05,
561
+ "loss": 1.2864,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.32500507820434693,
566
+ "grad_norm": 2.1125993728637695,
567
  "learning_rate": 3.517465475223396e-05,
568
+ "loss": 1.3456,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.3290676416819013,
573
+ "grad_norm": 0.7743927240371704,
574
  "learning_rate": 3.436230706742486e-05,
575
+ "loss": 1.3294,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.33313020515945563,
580
+ "grad_norm": 0.8188801407814026,
581
  "learning_rate": 3.3549959382615757e-05,
582
+ "loss": 1.2803,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.33719276863701,
587
+ "grad_norm": 0.5288794636726379,
588
  "learning_rate": 3.273761169780667e-05,
589
+ "loss": 1.4094,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.34125533211456427,
594
+ "grad_norm": 0.6982353329658508,
595
  "learning_rate": 3.1925264012997564e-05,
596
+ "loss": 1.1621,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.3453178955921186,
601
+ "grad_norm": 0.7790868282318115,
602
  "learning_rate": 3.111291632818847e-05,
603
+ "loss": 1.4534,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.34938045906967297,
608
+ "grad_norm": 0.7421971559524536,
609
  "learning_rate": 3.0300568643379367e-05,
610
+ "loss": 1.2951,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.3534430225472273,
615
+ "grad_norm": 0.9682130813598633,
616
  "learning_rate": 2.9488220958570267e-05,
617
+ "loss": 1.3827,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.35750558602478166,
622
+ "grad_norm": 1.6505218744277954,
623
  "learning_rate": 2.867587327376117e-05,
624
+ "loss": 1.3482,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.36156814950233596,
629
+ "grad_norm": 1.013390064239502,
630
  "learning_rate": 2.786352558895207e-05,
631
+ "loss": 1.4224,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.3656307129798903,
636
+ "grad_norm": 2.4053940773010254,
637
  "learning_rate": 2.7051177904142978e-05,
638
+ "loss": 1.4791,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.36969327645744465,
643
+ "grad_norm": 1.7795937061309814,
644
  "learning_rate": 2.6238830219333875e-05,
645
+ "loss": 1.472,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.373755839934999,
650
+ "grad_norm": 0.7656849026679993,
651
  "learning_rate": 2.5426482534524775e-05,
652
+ "loss": 1.3824,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.3778184034125533,
657
+ "grad_norm": 1.1062685251235962,
658
  "learning_rate": 2.461413484971568e-05,
659
+ "loss": 1.3462,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.38188096689010764,
664
+ "grad_norm": 0.660727858543396,
665
  "learning_rate": 2.3801787164906582e-05,
666
+ "loss": 1.3834,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.385943530367662,
671
+ "grad_norm": 1.8191992044448853,
672
  "learning_rate": 2.2989439480097482e-05,
673
+ "loss": 1.4556,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.39000609384521634,
678
+ "grad_norm": 3.3901174068450928,
679
  "learning_rate": 2.2177091795288386e-05,
680
+ "loss": 1.2865,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3940686573227707,
685
+ "grad_norm": 2.4866228103637695,
686
  "learning_rate": 2.1364744110479286e-05,
687
+ "loss": 1.501,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.398131220800325,
692
+ "grad_norm": 1.41251540184021,
693
  "learning_rate": 2.0552396425670186e-05,
694
+ "loss": 1.3455,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.40219378427787933,
699
+ "grad_norm": 0.645084798336029,
700
  "learning_rate": 1.974004874086109e-05,
701
+ "loss": 1.3918,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.4062563477554337,
706
+ "grad_norm": 0.8688336610794067,
707
  "learning_rate": 1.892770105605199e-05,
708
+ "loss": 1.3823,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.410318911232988,
713
+ "grad_norm": 0.7747243642807007,
714
  "learning_rate": 1.8115353371242893e-05,
715
+ "loss": 1.2988,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.4143814747105424,
720
+ "grad_norm": 1.329567313194275,
721
  "learning_rate": 1.7303005686433797e-05,
722
+ "loss": 1.2876,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.41844403818809667,
727
+ "grad_norm": 1.022557258605957,
728
  "learning_rate": 1.6490658001624697e-05,
729
+ "loss": 1.3036,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.422506601665651,
734
+ "grad_norm": 0.9812535047531128,
735
  "learning_rate": 1.5678310316815597e-05,
736
+ "loss": 1.3278,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.42656916514320536,
741
+ "grad_norm": 0.8079454898834229,
742
  "learning_rate": 1.48659626320065e-05,
743
+ "loss": 1.2957,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.4306317286207597,
748
+ "grad_norm": 0.8280611038208008,
749
  "learning_rate": 1.4053614947197402e-05,
750
+ "loss": 1.3747,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.43469429209831406,
755
+ "grad_norm": 0.85203617811203,
756
  "learning_rate": 1.3241267262388301e-05,
757
+ "loss": 1.423,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.43875685557586835,
762
+ "grad_norm": 0.9492881298065186,
763
  "learning_rate": 1.2428919577579204e-05,
764
+ "loss": 1.3845,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.4428194190534227,
769
+ "grad_norm": 1.6084877252578735,
770
  "learning_rate": 1.1616571892770106e-05,
771
+ "loss": 1.2422,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.44688198253097705,
776
+ "grad_norm": 0.7060315012931824,
777
  "learning_rate": 1.0804224207961008e-05,
778
+ "loss": 1.3758,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.4509445460085314,
783
+ "grad_norm": 2.0847179889678955,
784
  "learning_rate": 9.99187652315191e-06,
785
+ "loss": 1.4202,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.4550071094860857,
790
+ "grad_norm": 1.1350940465927124,
791
  "learning_rate": 9.179528838342812e-06,
792
+ "loss": 1.3273,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.45906967296364004,
797
+ "grad_norm": 0.841096043586731,
798
  "learning_rate": 8.367181153533712e-06,
799
+ "loss": 1.3461,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.4631322364411944,
804
+ "grad_norm": 0.5926047563552856,
805
  "learning_rate": 7.554833468724615e-06,
806
+ "loss": 1.2492,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.46719479991874874,
811
+ "grad_norm": 0.8289461135864258,
812
  "learning_rate": 6.742485783915516e-06,
813
+ "loss": 1.4403,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.4712573633963031,
818
+ "grad_norm": 0.6718817353248596,
819
  "learning_rate": 5.930138099106418e-06,
820
+ "loss": 1.2884,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.4753199268738574,
825
+ "grad_norm": 1.6381056308746338,
826
  "learning_rate": 5.117790414297319e-06,
827
+ "loss": 1.4508,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.4793824903514117,
832
+ "grad_norm": 1.0323442220687866,
833
  "learning_rate": 4.305442729488221e-06,
834
+ "loss": 1.4087,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.4834450538289661,
839
+ "grad_norm": 1.0119701623916626,
840
  "learning_rate": 3.4930950446791225e-06,
841
+ "loss": 1.3611,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.4875076173065204,
846
+ "grad_norm": 2.113036870956421,
847
  "learning_rate": 2.6807473598700244e-06,
848
+ "loss": 1.4355,
849
  "step": 1200
850
  }
851
  ],
checkpoint-1200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d77c4079b78d8ffba36a5b7eea8d68305d2c5af880d94b33c697b01545a3d5f7
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df8064174041f177505ba0d3a66e4fc8f5d15861f3d5535980ace500355480f
3
  size 5176
checkpoint-1231/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:514393859ed0d46275228e8f825f8b30494c8b087c4be0aa14131e53baddf126
3
  size 8405472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:785ee56c23465a10b4464b35d71e6e3a37ecea3efc9900199739964a182f8463
3
  size 8405472
checkpoint-1231/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c54c73d430754e7c0407a9021ce570230b200ab22664af8e6e3a9efbcfdfa8ef
3
  size 4411194
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c30d9b34cc216e0661d95564283e7db79ccc4d29bdc8a770db5b3015e66be07
3
  size 4411194
checkpoint-1231/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9e94864fc6d4c9096ed584ef110f46923f94284a19772ce4dff5295d0818f09
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39d6a8d0d352ee4e87b83934d135a11455b52740783a59a137aa6aadb237215
3
  size 14244
checkpoint-1231/trainer_state.json CHANGED
@@ -10,863 +10,863 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004062563477554337,
13
- "grad_norm": 0.9910523295402527,
14
  "learning_rate": 9.91876523151909e-05,
15
- "loss": 2.0449,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.008125126955108674,
20
- "grad_norm": 0.8047148585319519,
21
- "learning_rate": 9.83753046303818e-05,
22
- "loss": 1.9782,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.01218769043266301,
27
- "grad_norm": 0.6215866208076477,
28
- "learning_rate": 9.756295694557271e-05,
29
- "loss": 1.7088,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01625025391021735,
34
- "grad_norm": 0.8722022175788879,
35
- "learning_rate": 9.675060926076362e-05,
36
- "loss": 1.6144,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.020312817387771683,
41
- "grad_norm": 2.4887263774871826,
42
- "learning_rate": 9.593826157595452e-05,
43
- "loss": 1.6992,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.02437538086532602,
48
- "grad_norm": 1.4015988111495972,
49
- "learning_rate": 9.528838342810723e-05,
50
- "loss": 1.2877,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.028437944342880356,
55
- "grad_norm": 2.6476988792419434,
56
- "learning_rate": 9.447603574329814e-05,
57
- "loss": 1.4567,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0325005078204347,
62
- "grad_norm": 1.3684196472167969,
63
- "learning_rate": 9.366368805848904e-05,
64
- "loss": 1.4383,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.03656307129798903,
69
- "grad_norm": 3.351715564727783,
70
- "learning_rate": 9.285134037367993e-05,
71
- "loss": 1.4779,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.040625634775543366,
76
- "grad_norm": 1.4918268918991089,
77
- "learning_rate": 9.203899268887084e-05,
78
- "loss": 1.4002,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.04468819825309771,
83
- "grad_norm": 1.6876213550567627,
84
- "learning_rate": 9.122664500406174e-05,
85
- "loss": 1.5442,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.04875076173065204,
90
- "grad_norm": 0.7458980083465576,
91
- "learning_rate": 9.041429731925265e-05,
92
- "loss": 1.3639,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.05281332520820638,
97
- "grad_norm": 0.7894807457923889,
98
- "learning_rate": 8.960194963444355e-05,
99
- "loss": 1.4716,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.05687588868576071,
104
- "grad_norm": 0.49681055545806885,
105
- "learning_rate": 8.878960194963445e-05,
106
- "loss": 1.4235,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.06093845216331505,
111
- "grad_norm": 0.829867959022522,
112
- "learning_rate": 8.797725426482534e-05,
113
- "loss": 1.3474,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0650010156408694,
118
- "grad_norm": 1.1840708255767822,
119
- "learning_rate": 8.716490658001625e-05,
120
- "loss": 1.4501,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.06906357911842373,
125
- "grad_norm": 0.8244519233703613,
126
- "learning_rate": 8.635255889520715e-05,
127
- "loss": 1.3456,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.07312614259597806,
132
- "grad_norm": 0.5561311841011047,
133
- "learning_rate": 8.554021121039806e-05,
134
- "loss": 1.5179,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0771887060735324,
139
- "grad_norm": 0.868900716304779,
140
- "learning_rate": 8.472786352558896e-05,
141
- "loss": 1.4277,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.08125126955108673,
146
- "grad_norm": 3.0395560264587402,
147
- "learning_rate": 8.391551584077985e-05,
148
- "loss": 1.3674,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.08531383302864107,
153
- "grad_norm": 1.3599716424942017,
154
- "learning_rate": 8.310316815597076e-05,
155
- "loss": 1.4285,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.08937639650619542,
160
- "grad_norm": 2.562626838684082,
161
- "learning_rate": 8.229082047116166e-05,
162
- "loss": 1.4496,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.09343895998374975,
167
- "grad_norm": 0.9251193404197693,
168
- "learning_rate": 8.147847278635257e-05,
169
- "loss": 1.3718,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.09750152346130408,
174
- "grad_norm": 1.510450839996338,
175
- "learning_rate": 8.066612510154347e-05,
176
- "loss": 1.3749,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.10156408693885842,
181
- "grad_norm": 0.7255865335464478,
182
- "learning_rate": 7.985377741673436e-05,
183
- "loss": 1.3185,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.10562665041641275,
188
- "grad_norm": 0.9007101655006409,
189
- "learning_rate": 7.904142973192526e-05,
190
- "loss": 1.4268,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.10968921389396709,
195
- "grad_norm": 0.9022626280784607,
196
- "learning_rate": 7.822908204711617e-05,
197
- "loss": 1.3011,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.11375177737152142,
202
- "grad_norm": 1.0873183012008667,
203
- "learning_rate": 7.741673436230708e-05,
204
- "loss": 1.4486,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.11781434084907577,
209
- "grad_norm": 0.7647557854652405,
210
- "learning_rate": 7.660438667749798e-05,
211
- "loss": 1.3715,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.1218769043266301,
216
- "grad_norm": 0.9489114284515381,
217
- "learning_rate": 7.579203899268887e-05,
218
- "loss": 1.4807,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.12593946780418444,
223
- "grad_norm": 1.9156900644302368,
224
- "learning_rate": 7.497969130787977e-05,
225
- "loss": 1.4448,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1300020312817388,
230
- "grad_norm": 2.21012544631958,
231
- "learning_rate": 7.416734362307067e-05,
232
- "loss": 1.6132,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1340645947592931,
237
- "grad_norm": 0.9065983295440674,
238
- "learning_rate": 7.335499593826158e-05,
239
- "loss": 1.2961,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.13812715823684746,
244
- "grad_norm": 0.7284504771232605,
245
- "learning_rate": 7.254264825345249e-05,
246
- "loss": 1.4802,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.14218972171440178,
251
- "grad_norm": 3.0846526622772217,
252
- "learning_rate": 7.173030056864339e-05,
253
- "loss": 1.4462,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.14625228519195613,
258
- "grad_norm": 1.0574288368225098,
259
- "learning_rate": 7.091795288383428e-05,
260
- "loss": 1.3832,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.15031484866951045,
265
- "grad_norm": 1.7289760112762451,
266
- "learning_rate": 7.010560519902518e-05,
267
- "loss": 1.2393,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1543774121470648,
272
- "grad_norm": 0.689789891242981,
273
- "learning_rate": 6.929325751421609e-05,
274
- "loss": 1.3216,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.15843997562461914,
279
- "grad_norm": 1.3808571100234985,
280
- "learning_rate": 6.848090982940699e-05,
281
- "loss": 1.3955,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.16250253910217347,
286
- "grad_norm": 1.0446319580078125,
287
- "learning_rate": 6.76685621445979e-05,
288
- "loss": 1.387,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.16656510257972781,
293
- "grad_norm": 1.325724720954895,
294
- "learning_rate": 6.685621445978879e-05,
295
- "loss": 1.3749,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.17062766605728213,
300
- "grad_norm": 0.9511458873748779,
301
- "learning_rate": 6.604386677497969e-05,
302
- "loss": 1.2309,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.17469022953483648,
307
- "grad_norm": 1.0929605960845947,
308
- "learning_rate": 6.52315190901706e-05,
309
- "loss": 1.5331,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.17875279301239083,
314
- "grad_norm": 2.1583609580993652,
315
- "learning_rate": 6.44191714053615e-05,
316
- "loss": 1.5038,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.18281535648994515,
321
- "grad_norm": 1.5003691911697388,
322
- "learning_rate": 6.36068237205524e-05,
323
- "loss": 1.285,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1868779199674995,
328
- "grad_norm": 3.8774428367614746,
329
- "learning_rate": 6.27944760357433e-05,
330
- "loss": 1.4569,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.19094048344505382,
335
- "grad_norm": 0.5628882646560669,
336
- "learning_rate": 6.19821283509342e-05,
337
- "loss": 1.3662,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.19500304692260817,
342
- "grad_norm": 3.2969565391540527,
343
- "learning_rate": 6.116978066612511e-05,
344
- "loss": 1.3893,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1990656104001625,
349
- "grad_norm": 1.869675636291504,
350
- "learning_rate": 6.035743298131601e-05,
351
- "loss": 1.3349,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.20312817387771684,
356
- "grad_norm": 0.843262791633606,
357
- "learning_rate": 5.954508529650691e-05,
358
- "loss": 1.2563,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.2071907373552712,
363
- "grad_norm": 1.2158210277557373,
364
- "learning_rate": 5.873273761169781e-05,
365
- "loss": 1.4126,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.2112533008328255,
370
- "grad_norm": 1.8899216651916504,
371
- "learning_rate": 5.7920389926888705e-05,
372
- "loss": 1.3794,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.21531586431037986,
377
- "grad_norm": 0.5589442253112793,
378
- "learning_rate": 5.7108042242079615e-05,
379
- "loss": 1.2835,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.21937842778793418,
384
- "grad_norm": 1.7187498807907104,
385
- "learning_rate": 5.629569455727052e-05,
386
- "loss": 1.3301,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.22344099126548853,
391
- "grad_norm": 0.9800445437431335,
392
- "learning_rate": 5.5483346872461415e-05,
393
- "loss": 1.4507,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.22750355474304285,
398
- "grad_norm": 1.1055623292922974,
399
- "learning_rate": 5.467099918765232e-05,
400
- "loss": 1.4316,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.2315661182205972,
405
- "grad_norm": 0.9080348610877991,
406
- "learning_rate": 5.3858651502843216e-05,
407
- "loss": 1.3266,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.23562868169815154,
412
- "grad_norm": 1.9814904928207397,
413
- "learning_rate": 5.3046303818034126e-05,
414
- "loss": 1.4031,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.23969124517570586,
419
- "grad_norm": 1.1566354036331177,
420
- "learning_rate": 5.223395613322502e-05,
421
- "loss": 1.3616,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.2437538086532602,
426
- "grad_norm": 0.4727267324924469,
427
- "learning_rate": 5.1421608448415926e-05,
428
- "loss": 1.519,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.24781637213081453,
433
- "grad_norm": 0.7331522107124329,
434
- "learning_rate": 5.060926076360682e-05,
435
- "loss": 1.4483,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.2518789356083689,
440
- "grad_norm": 0.6916592717170715,
441
- "learning_rate": 4.979691307879773e-05,
442
- "loss": 1.3865,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.25594149908592323,
447
- "grad_norm": 1.3255294561386108,
448
- "learning_rate": 4.898456539398863e-05,
449
- "loss": 1.3529,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2600040625634776,
454
- "grad_norm": 1.6844894886016846,
455
- "learning_rate": 4.817221770917953e-05,
456
- "loss": 1.5159,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.26406662604103187,
461
- "grad_norm": 0.6133401393890381,
462
  "learning_rate": 4.735987002437044e-05,
463
- "loss": 1.4394,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2681291895185862,
468
- "grad_norm": 2.1573598384857178,
469
  "learning_rate": 4.6547522339561334e-05,
470
- "loss": 1.4777,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.27219175299614057,
475
- "grad_norm": 0.7368021607398987,
476
  "learning_rate": 4.573517465475224e-05,
477
- "loss": 1.2622,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2762543164736949,
482
- "grad_norm": 1.5155552625656128,
483
  "learning_rate": 4.492282696994314e-05,
484
  "loss": 1.3557,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.28031687995124926,
489
- "grad_norm": 0.580403208732605,
490
  "learning_rate": 4.411047928513404e-05,
491
- "loss": 1.4629,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.28437944342880356,
496
- "grad_norm": 0.5736162662506104,
497
  "learning_rate": 4.329813160032494e-05,
498
- "loss": 1.415,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2884420069063579,
503
- "grad_norm": 1.124528169631958,
504
  "learning_rate": 4.2485783915515845e-05,
505
- "loss": 1.5554,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.29250457038391225,
510
- "grad_norm": 0.8589762449264526,
511
  "learning_rate": 4.167343623070675e-05,
512
- "loss": 1.3481,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2965671338614666,
517
- "grad_norm": 0.7507007122039795,
518
  "learning_rate": 4.0861088545897645e-05,
519
- "loss": 1.5406,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.3006296973390209,
524
- "grad_norm": 1.1872320175170898,
525
  "learning_rate": 4.004874086108855e-05,
526
- "loss": 1.559,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.30469226081657524,
531
- "grad_norm": 1.1897319555282593,
532
  "learning_rate": 3.923639317627945e-05,
533
- "loss": 1.4123,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.3087548242941296,
538
- "grad_norm": 0.7811614274978638,
539
  "learning_rate": 3.842404549147035e-05,
540
- "loss": 1.2137,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.31281738777168394,
545
- "grad_norm": 1.252016305923462,
546
  "learning_rate": 3.761169780666125e-05,
547
- "loss": 1.4091,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.3168799512492383,
552
- "grad_norm": 1.0863845348358154,
553
  "learning_rate": 3.6799350121852156e-05,
554
- "loss": 1.4903,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.3209425147267926,
559
- "grad_norm": 1.1215578317642212,
560
  "learning_rate": 3.598700243704305e-05,
561
- "loss": 1.2848,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.32500507820434693,
566
- "grad_norm": 1.7457737922668457,
567
  "learning_rate": 3.517465475223396e-05,
568
- "loss": 1.3474,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.3290676416819013,
573
- "grad_norm": 0.7558390498161316,
574
  "learning_rate": 3.436230706742486e-05,
575
- "loss": 1.3283,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.33313020515945563,
580
- "grad_norm": 0.7559828162193298,
581
  "learning_rate": 3.3549959382615757e-05,
582
- "loss": 1.2811,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.33719276863701,
587
- "grad_norm": 0.5059702396392822,
588
  "learning_rate": 3.273761169780667e-05,
589
- "loss": 1.411,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.34125533211456427,
594
- "grad_norm": 0.6929866671562195,
595
  "learning_rate": 3.1925264012997564e-05,
596
- "loss": 1.1623,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.3453178955921186,
601
- "grad_norm": 0.7670132517814636,
602
  "learning_rate": 3.111291632818847e-05,
603
- "loss": 1.4531,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.34938045906967297,
608
- "grad_norm": 0.7590322494506836,
609
  "learning_rate": 3.0300568643379367e-05,
610
- "loss": 1.2966,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.3534430225472273,
615
- "grad_norm": 0.9692467451095581,
616
  "learning_rate": 2.9488220958570267e-05,
617
- "loss": 1.381,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.35750558602478166,
622
- "grad_norm": 1.5862103700637817,
623
  "learning_rate": 2.867587327376117e-05,
624
- "loss": 1.3523,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.36156814950233596,
629
- "grad_norm": 1.0058462619781494,
630
  "learning_rate": 2.786352558895207e-05,
631
- "loss": 1.4259,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.3656307129798903,
636
- "grad_norm": 2.4880478382110596,
637
  "learning_rate": 2.7051177904142978e-05,
638
- "loss": 1.4788,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.36969327645744465,
643
- "grad_norm": 1.9835453033447266,
644
  "learning_rate": 2.6238830219333875e-05,
645
- "loss": 1.4717,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.373755839934999,
650
- "grad_norm": 0.7700700759887695,
651
  "learning_rate": 2.5426482534524775e-05,
652
- "loss": 1.381,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.3778184034125533,
657
- "grad_norm": 1.249335765838623,
658
  "learning_rate": 2.461413484971568e-05,
659
- "loss": 1.3528,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.38188096689010764,
664
- "grad_norm": 0.6663157939910889,
665
  "learning_rate": 2.3801787164906582e-05,
666
- "loss": 1.3832,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.385943530367662,
671
- "grad_norm": 1.822151780128479,
672
  "learning_rate": 2.2989439480097482e-05,
673
- "loss": 1.4578,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.39000609384521634,
678
- "grad_norm": 2.3494231700897217,
679
  "learning_rate": 2.2177091795288386e-05,
680
- "loss": 1.2875,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3940686573227707,
685
- "grad_norm": 2.2792298793792725,
686
  "learning_rate": 2.1364744110479286e-05,
687
- "loss": 1.499,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.398131220800325,
692
- "grad_norm": 1.2619109153747559,
693
  "learning_rate": 2.0552396425670186e-05,
694
- "loss": 1.3482,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.40219378427787933,
699
- "grad_norm": 0.6195743680000305,
700
  "learning_rate": 1.974004874086109e-05,
701
- "loss": 1.3943,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.4062563477554337,
706
- "grad_norm": 0.8362967371940613,
707
  "learning_rate": 1.892770105605199e-05,
708
- "loss": 1.3816,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.410318911232988,
713
- "grad_norm": 0.7627909779548645,
714
  "learning_rate": 1.8115353371242893e-05,
715
- "loss": 1.2967,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.4143814747105424,
720
- "grad_norm": 1.2969974279403687,
721
  "learning_rate": 1.7303005686433797e-05,
722
- "loss": 1.2868,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.41844403818809667,
727
- "grad_norm": 1.1111704111099243,
728
  "learning_rate": 1.6490658001624697e-05,
729
- "loss": 1.3058,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.422506601665651,
734
- "grad_norm": 1.131678819656372,
735
  "learning_rate": 1.5678310316815597e-05,
736
- "loss": 1.3291,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.42656916514320536,
741
- "grad_norm": 0.8244335055351257,
742
  "learning_rate": 1.48659626320065e-05,
743
- "loss": 1.2973,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.4306317286207597,
748
- "grad_norm": 0.7791102528572083,
749
  "learning_rate": 1.4053614947197402e-05,
750
- "loss": 1.3749,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.43469429209831406,
755
- "grad_norm": 0.8316488862037659,
756
  "learning_rate": 1.3241267262388301e-05,
757
- "loss": 1.4213,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.43875685557586835,
762
- "grad_norm": 0.9478839635848999,
763
  "learning_rate": 1.2428919577579204e-05,
764
- "loss": 1.3864,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.4428194190534227,
769
- "grad_norm": 1.5177056789398193,
770
  "learning_rate": 1.1616571892770106e-05,
771
- "loss": 1.2423,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.44688198253097705,
776
- "grad_norm": 0.6691774725914001,
777
  "learning_rate": 1.0804224207961008e-05,
778
- "loss": 1.3767,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.4509445460085314,
783
- "grad_norm": 1.9306799173355103,
784
  "learning_rate": 9.99187652315191e-06,
785
- "loss": 1.4233,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.4550071094860857,
790
- "grad_norm": 1.131913423538208,
791
  "learning_rate": 9.179528838342812e-06,
792
- "loss": 1.3289,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.45906967296364004,
797
- "grad_norm": 0.7975111603736877,
798
  "learning_rate": 8.367181153533712e-06,
799
- "loss": 1.3472,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.4631322364411944,
804
- "grad_norm": 0.5920394659042358,
805
  "learning_rate": 7.554833468724615e-06,
806
- "loss": 1.2458,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.46719479991874874,
811
- "grad_norm": 0.8416665196418762,
812
  "learning_rate": 6.742485783915516e-06,
813
- "loss": 1.4424,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.4712573633963031,
818
- "grad_norm": 0.670789897441864,
819
  "learning_rate": 5.930138099106418e-06,
820
- "loss": 1.2866,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.4753199268738574,
825
- "grad_norm": 1.7391314506530762,
826
  "learning_rate": 5.117790414297319e-06,
827
- "loss": 1.4517,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.4793824903514117,
832
- "grad_norm": 1.0173118114471436,
833
  "learning_rate": 4.305442729488221e-06,
834
- "loss": 1.4109,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.4834450538289661,
839
- "grad_norm": 0.9398189187049866,
840
  "learning_rate": 3.4930950446791225e-06,
841
- "loss": 1.3602,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.4875076173065204,
846
- "grad_norm": 2.0191538333892822,
847
  "learning_rate": 2.6807473598700244e-06,
848
- "loss": 1.4342,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.4915701807840748,
853
- "grad_norm": 1.3462674617767334,
854
  "learning_rate": 1.8683996750609262e-06,
855
- "loss": 1.2824,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.49563274426162907,
860
- "grad_norm": 1.729024887084961,
861
  "learning_rate": 1.0560519902518278e-06,
862
- "loss": 1.2663,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.4996953077391834,
867
- "grad_norm": 2.1285784244537354,
868
  "learning_rate": 2.437043054427295e-07,
869
- "loss": 1.4266,
870
  "step": 1230
871
  }
872
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004062563477554337,
13
+ "grad_norm": 1.032738447189331,
14
  "learning_rate": 9.91876523151909e-05,
15
+ "loss": 2.0442,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.008125126955108674,
20
+ "grad_norm": 0.8217566609382629,
21
+ "learning_rate": 9.845653939886271e-05,
22
+ "loss": 1.9828,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.01218769043266301,
27
+ "grad_norm": 0.6819207668304443,
28
+ "learning_rate": 9.764419171405362e-05,
29
+ "loss": 1.7187,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01625025391021735,
34
+ "grad_norm": 0.9606077075004578,
35
+ "learning_rate": 9.683184402924452e-05,
36
+ "loss": 1.6162,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.020312817387771683,
41
+ "grad_norm": 2.6619677543640137,
42
+ "learning_rate": 9.601949634443542e-05,
43
+ "loss": 1.6977,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.02437538086532602,
48
+ "grad_norm": 1.5025429725646973,
49
+ "learning_rate": 9.520714865962633e-05,
50
+ "loss": 1.2874,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.028437944342880356,
55
+ "grad_norm": 2.7792210578918457,
56
+ "learning_rate": 9.439480097481722e-05,
57
+ "loss": 1.4551,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.0325005078204347,
62
+ "grad_norm": 1.557523250579834,
63
+ "learning_rate": 9.358245329000814e-05,
64
+ "loss": 1.4412,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.03656307129798903,
69
+ "grad_norm": 3.4479055404663086,
70
+ "learning_rate": 9.277010560519903e-05,
71
+ "loss": 1.4806,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.040625634775543366,
76
+ "grad_norm": 1.282782793045044,
77
+ "learning_rate": 9.195775792038993e-05,
78
+ "loss": 1.3971,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.04468819825309771,
83
+ "grad_norm": 1.6942788362503052,
84
+ "learning_rate": 9.114541023558083e-05,
85
+ "loss": 1.543,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.04875076173065204,
90
+ "grad_norm": 0.7825320363044739,
91
+ "learning_rate": 9.033306255077174e-05,
92
+ "loss": 1.3638,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.05281332520820638,
97
+ "grad_norm": 0.7435455918312073,
98
+ "learning_rate": 8.952071486596265e-05,
99
+ "loss": 1.4731,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.05687588868576071,
104
+ "grad_norm": 0.5097799897193909,
105
+ "learning_rate": 8.870836718115354e-05,
106
+ "loss": 1.4218,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.06093845216331505,
111
+ "grad_norm": 0.8880577087402344,
112
+ "learning_rate": 8.789601949634444e-05,
113
+ "loss": 1.3477,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.0650010156408694,
118
+ "grad_norm": 1.1695178747177124,
119
+ "learning_rate": 8.708367181153534e-05,
120
+ "loss": 1.4521,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.06906357911842373,
125
+ "grad_norm": 0.7951045036315918,
126
+ "learning_rate": 8.627132412672623e-05,
127
+ "loss": 1.3417,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.07312614259597806,
132
+ "grad_norm": 0.5302656888961792,
133
+ "learning_rate": 8.545897644191714e-05,
134
+ "loss": 1.5202,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.0771887060735324,
139
+ "grad_norm": 0.8830013275146484,
140
+ "learning_rate": 8.464662875710805e-05,
141
+ "loss": 1.4271,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.08125126955108673,
146
+ "grad_norm": 3.1300947666168213,
147
+ "learning_rate": 8.383428107229895e-05,
148
+ "loss": 1.3743,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.08531383302864107,
153
+ "grad_norm": 1.077304720878601,
154
+ "learning_rate": 8.302193338748985e-05,
155
+ "loss": 1.4244,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.08937639650619542,
160
+ "grad_norm": 1.6846965551376343,
161
+ "learning_rate": 8.220958570268074e-05,
162
+ "loss": 1.4471,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.09343895998374975,
167
+ "grad_norm": 1.066351056098938,
168
+ "learning_rate": 8.139723801787165e-05,
169
+ "loss": 1.3723,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.09750152346130408,
174
+ "grad_norm": 1.6999150514602661,
175
+ "learning_rate": 8.058489033306255e-05,
176
+ "loss": 1.368,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.10156408693885842,
181
+ "grad_norm": 0.7261852622032166,
182
+ "learning_rate": 7.977254264825346e-05,
183
+ "loss": 1.3175,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.10562665041641275,
188
+ "grad_norm": 0.8671672344207764,
189
+ "learning_rate": 7.896019496344436e-05,
190
+ "loss": 1.4264,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.10968921389396709,
195
+ "grad_norm": 0.9500339031219482,
196
+ "learning_rate": 7.814784727863525e-05,
197
+ "loss": 1.2985,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.11375177737152142,
202
+ "grad_norm": 1.071378469467163,
203
+ "learning_rate": 7.733549959382617e-05,
204
+ "loss": 1.447,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.11781434084907577,
209
+ "grad_norm": 0.7448930144309998,
210
+ "learning_rate": 7.652315190901706e-05,
211
+ "loss": 1.374,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.1218769043266301,
216
+ "grad_norm": 0.9604855179786682,
217
+ "learning_rate": 7.571080422420796e-05,
218
+ "loss": 1.4803,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.12593946780418444,
223
+ "grad_norm": 1.9951509237289429,
224
+ "learning_rate": 7.489845653939887e-05,
225
+ "loss": 1.4477,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.1300020312817388,
230
+ "grad_norm": 2.3266055583953857,
231
+ "learning_rate": 7.408610885458977e-05,
232
+ "loss": 1.6107,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.1340645947592931,
237
+ "grad_norm": 0.878081738948822,
238
+ "learning_rate": 7.327376116978068e-05,
239
+ "loss": 1.295,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.13812715823684746,
244
+ "grad_norm": 0.7642931938171387,
245
+ "learning_rate": 7.246141348497157e-05,
246
+ "loss": 1.4776,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.14218972171440178,
251
+ "grad_norm": 3.2081470489501953,
252
+ "learning_rate": 7.164906580016247e-05,
253
+ "loss": 1.448,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.14625228519195613,
258
+ "grad_norm": 0.9164071679115295,
259
+ "learning_rate": 7.083671811535338e-05,
260
+ "loss": 1.3792,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.15031484866951045,
265
+ "grad_norm": 1.9146902561187744,
266
+ "learning_rate": 7.002437043054428e-05,
267
+ "loss": 1.239,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1543774121470648,
272
+ "grad_norm": 0.6990465521812439,
273
+ "learning_rate": 6.921202274573519e-05,
274
+ "loss": 1.3211,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.15843997562461914,
279
+ "grad_norm": 1.7084014415740967,
280
+ "learning_rate": 6.839967506092608e-05,
281
+ "loss": 1.3952,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.16250253910217347,
286
+ "grad_norm": 1.0569523572921753,
287
+ "learning_rate": 6.758732737611698e-05,
288
+ "loss": 1.385,
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.16656510257972781,
293
+ "grad_norm": 1.4293252229690552,
294
+ "learning_rate": 6.677497969130788e-05,
295
+ "loss": 1.3736,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.17062766605728213,
300
+ "grad_norm": 1.0318830013275146,
301
+ "learning_rate": 6.596263200649879e-05,
302
+ "loss": 1.2315,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.17469022953483648,
307
+ "grad_norm": 1.0838662385940552,
308
+ "learning_rate": 6.515028432168968e-05,
309
+ "loss": 1.5313,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.17875279301239083,
314
+ "grad_norm": 2.388718366622925,
315
+ "learning_rate": 6.43379366368806e-05,
316
+ "loss": 1.5047,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.18281535648994515,
321
+ "grad_norm": 1.58635413646698,
322
+ "learning_rate": 6.352558895207149e-05,
323
+ "loss": 1.2847,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.1868779199674995,
328
+ "grad_norm": 4.313031196594238,
329
+ "learning_rate": 6.271324126726239e-05,
330
+ "loss": 1.46,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.19094048344505382,
335
+ "grad_norm": 0.5860478281974792,
336
+ "learning_rate": 6.190089358245328e-05,
337
+ "loss": 1.3614,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.19500304692260817,
342
+ "grad_norm": 3.0871775150299072,
343
+ "learning_rate": 6.10885458976442e-05,
344
+ "loss": 1.3915,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.1990656104001625,
349
+ "grad_norm": 2.025517463684082,
350
+ "learning_rate": 6.02761982128351e-05,
351
+ "loss": 1.3326,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.20312817387771684,
356
+ "grad_norm": 0.8149346709251404,
357
+ "learning_rate": 5.9463850528026e-05,
358
+ "loss": 1.2553,
359
  "step": 500
360
  },
361
  {
362
  "epoch": 0.2071907373552712,
363
+ "grad_norm": 1.1703840494155884,
364
+ "learning_rate": 5.86515028432169e-05,
365
+ "loss": 1.4137,
366
  "step": 510
367
  },
368
  {
369
  "epoch": 0.2112533008328255,
370
+ "grad_norm": 2.0241522789001465,
371
+ "learning_rate": 5.7839155158407796e-05,
372
+ "loss": 1.3763,
373
  "step": 520
374
  },
375
  {
376
  "epoch": 0.21531586431037986,
377
+ "grad_norm": 0.5776875019073486,
378
+ "learning_rate": 5.70268074735987e-05,
379
+ "loss": 1.282,
380
  "step": 530
381
  },
382
  {
383
  "epoch": 0.21937842778793418,
384
+ "grad_norm": 1.644302487373352,
385
+ "learning_rate": 5.621445978878961e-05,
386
+ "loss": 1.3309,
387
  "step": 540
388
  },
389
  {
390
  "epoch": 0.22344099126548853,
391
+ "grad_norm": 1.0667213201522827,
392
+ "learning_rate": 5.5402112103980506e-05,
393
+ "loss": 1.4516,
394
  "step": 550
395
  },
396
  {
397
  "epoch": 0.22750355474304285,
398
+ "grad_norm": 1.1769509315490723,
399
+ "learning_rate": 5.458976441917141e-05,
400
+ "loss": 1.4305,
401
  "step": 560
402
  },
403
  {
404
  "epoch": 0.2315661182205972,
405
+ "grad_norm": 0.9089193940162659,
406
+ "learning_rate": 5.377741673436231e-05,
407
+ "loss": 1.3263,
408
  "step": 570
409
  },
410
  {
411
  "epoch": 0.23562868169815154,
412
+ "grad_norm": 1.9467777013778687,
413
+ "learning_rate": 5.2965069049553203e-05,
414
+ "loss": 1.402,
415
  "step": 580
416
  },
417
  {
418
  "epoch": 0.23969124517570586,
419
+ "grad_norm": 1.313186526298523,
420
+ "learning_rate": 5.2152721364744114e-05,
421
+ "loss": 1.3604,
422
  "step": 590
423
  },
424
  {
425
  "epoch": 0.2437538086532602,
426
+ "grad_norm": 0.48065802454948425,
427
+ "learning_rate": 5.134037367993502e-05,
428
+ "loss": 1.5186,
429
  "step": 600
430
  },
431
  {
432
  "epoch": 0.24781637213081453,
433
+ "grad_norm": 0.7379820942878723,
434
+ "learning_rate": 5.0528025995125914e-05,
435
+ "loss": 1.4457,
436
  "step": 610
437
  },
438
  {
439
  "epoch": 0.2518789356083689,
440
+ "grad_norm": 0.7363265156745911,
441
+ "learning_rate": 4.971567831031682e-05,
442
+ "loss": 1.3843,
443
  "step": 620
444
  },
445
  {
446
  "epoch": 0.25594149908592323,
447
+ "grad_norm": 1.349368691444397,
448
+ "learning_rate": 4.890333062550772e-05,
449
+ "loss": 1.355,
450
  "step": 630
451
  },
452
  {
453
  "epoch": 0.2600040625634776,
454
+ "grad_norm": 1.6113736629486084,
455
+ "learning_rate": 4.809098294069862e-05,
456
+ "loss": 1.5113,
457
  "step": 640
458
  },
459
  {
460
  "epoch": 0.26406662604103187,
461
+ "grad_norm": 0.7977277636528015,
462
  "learning_rate": 4.735987002437044e-05,
463
+ "loss": 1.4413,
464
  "step": 650
465
  },
466
  {
467
  "epoch": 0.2681291895185862,
468
+ "grad_norm": 2.2527716159820557,
469
  "learning_rate": 4.6547522339561334e-05,
470
+ "loss": 1.4792,
471
  "step": 660
472
  },
473
  {
474
  "epoch": 0.27219175299614057,
475
+ "grad_norm": 0.7665427923202515,
476
  "learning_rate": 4.573517465475224e-05,
477
+ "loss": 1.2581,
478
  "step": 670
479
  },
480
  {
481
  "epoch": 0.2762543164736949,
482
+ "grad_norm": 1.6001981496810913,
483
  "learning_rate": 4.492282696994314e-05,
484
  "loss": 1.3557,
485
  "step": 680
486
  },
487
  {
488
  "epoch": 0.28031687995124926,
489
+ "grad_norm": 0.6028335690498352,
490
  "learning_rate": 4.411047928513404e-05,
491
+ "loss": 1.4625,
492
  "step": 690
493
  },
494
  {
495
  "epoch": 0.28437944342880356,
496
+ "grad_norm": 0.621457576751709,
497
  "learning_rate": 4.329813160032494e-05,
498
+ "loss": 1.4177,
499
  "step": 700
500
  },
501
  {
502
  "epoch": 0.2884420069063579,
503
+ "grad_norm": 1.08269464969635,
504
  "learning_rate": 4.2485783915515845e-05,
505
+ "loss": 1.5553,
506
  "step": 710
507
  },
508
  {
509
  "epoch": 0.29250457038391225,
510
+ "grad_norm": 0.7009453177452087,
511
  "learning_rate": 4.167343623070675e-05,
512
+ "loss": 1.3478,
513
  "step": 720
514
  },
515
  {
516
  "epoch": 0.2965671338614666,
517
+ "grad_norm": 0.7686846256256104,
518
  "learning_rate": 4.0861088545897645e-05,
519
+ "loss": 1.5388,
520
  "step": 730
521
  },
522
  {
523
  "epoch": 0.3006296973390209,
524
+ "grad_norm": 1.193659782409668,
525
  "learning_rate": 4.004874086108855e-05,
526
+ "loss": 1.558,
527
  "step": 740
528
  },
529
  {
530
  "epoch": 0.30469226081657524,
531
+ "grad_norm": 0.9236335754394531,
532
  "learning_rate": 3.923639317627945e-05,
533
+ "loss": 1.4143,
534
  "step": 750
535
  },
536
  {
537
  "epoch": 0.3087548242941296,
538
+ "grad_norm": 0.7839071750640869,
539
  "learning_rate": 3.842404549147035e-05,
540
+ "loss": 1.2117,
541
  "step": 760
542
  },
543
  {
544
  "epoch": 0.31281738777168394,
545
+ "grad_norm": 1.2941004037857056,
546
  "learning_rate": 3.761169780666125e-05,
547
+ "loss": 1.4062,
548
  "step": 770
549
  },
550
  {
551
  "epoch": 0.3168799512492383,
552
+ "grad_norm": 1.426131010055542,
553
  "learning_rate": 3.6799350121852156e-05,
554
+ "loss": 1.4909,
555
  "step": 780
556
  },
557
  {
558
  "epoch": 0.3209425147267926,
559
+ "grad_norm": 1.1472172737121582,
560
  "learning_rate": 3.598700243704305e-05,
561
+ "loss": 1.2864,
562
  "step": 790
563
  },
564
  {
565
  "epoch": 0.32500507820434693,
566
+ "grad_norm": 2.1125993728637695,
567
  "learning_rate": 3.517465475223396e-05,
568
+ "loss": 1.3456,
569
  "step": 800
570
  },
571
  {
572
  "epoch": 0.3290676416819013,
573
+ "grad_norm": 0.7743927240371704,
574
  "learning_rate": 3.436230706742486e-05,
575
+ "loss": 1.3294,
576
  "step": 810
577
  },
578
  {
579
  "epoch": 0.33313020515945563,
580
+ "grad_norm": 0.8188801407814026,
581
  "learning_rate": 3.3549959382615757e-05,
582
+ "loss": 1.2803,
583
  "step": 820
584
  },
585
  {
586
  "epoch": 0.33719276863701,
587
+ "grad_norm": 0.5288794636726379,
588
  "learning_rate": 3.273761169780667e-05,
589
+ "loss": 1.4094,
590
  "step": 830
591
  },
592
  {
593
  "epoch": 0.34125533211456427,
594
+ "grad_norm": 0.6982353329658508,
595
  "learning_rate": 3.1925264012997564e-05,
596
+ "loss": 1.1621,
597
  "step": 840
598
  },
599
  {
600
  "epoch": 0.3453178955921186,
601
+ "grad_norm": 0.7790868282318115,
602
  "learning_rate": 3.111291632818847e-05,
603
+ "loss": 1.4534,
604
  "step": 850
605
  },
606
  {
607
  "epoch": 0.34938045906967297,
608
+ "grad_norm": 0.7421971559524536,
609
  "learning_rate": 3.0300568643379367e-05,
610
+ "loss": 1.2951,
611
  "step": 860
612
  },
613
  {
614
  "epoch": 0.3534430225472273,
615
+ "grad_norm": 0.9682130813598633,
616
  "learning_rate": 2.9488220958570267e-05,
617
+ "loss": 1.3827,
618
  "step": 870
619
  },
620
  {
621
  "epoch": 0.35750558602478166,
622
+ "grad_norm": 1.6505218744277954,
623
  "learning_rate": 2.867587327376117e-05,
624
+ "loss": 1.3482,
625
  "step": 880
626
  },
627
  {
628
  "epoch": 0.36156814950233596,
629
+ "grad_norm": 1.013390064239502,
630
  "learning_rate": 2.786352558895207e-05,
631
+ "loss": 1.4224,
632
  "step": 890
633
  },
634
  {
635
  "epoch": 0.3656307129798903,
636
+ "grad_norm": 2.4053940773010254,
637
  "learning_rate": 2.7051177904142978e-05,
638
+ "loss": 1.4791,
639
  "step": 900
640
  },
641
  {
642
  "epoch": 0.36969327645744465,
643
+ "grad_norm": 1.7795937061309814,
644
  "learning_rate": 2.6238830219333875e-05,
645
+ "loss": 1.472,
646
  "step": 910
647
  },
648
  {
649
  "epoch": 0.373755839934999,
650
+ "grad_norm": 0.7656849026679993,
651
  "learning_rate": 2.5426482534524775e-05,
652
+ "loss": 1.3824,
653
  "step": 920
654
  },
655
  {
656
  "epoch": 0.3778184034125533,
657
+ "grad_norm": 1.1062685251235962,
658
  "learning_rate": 2.461413484971568e-05,
659
+ "loss": 1.3462,
660
  "step": 930
661
  },
662
  {
663
  "epoch": 0.38188096689010764,
664
+ "grad_norm": 0.660727858543396,
665
  "learning_rate": 2.3801787164906582e-05,
666
+ "loss": 1.3834,
667
  "step": 940
668
  },
669
  {
670
  "epoch": 0.385943530367662,
671
+ "grad_norm": 1.8191992044448853,
672
  "learning_rate": 2.2989439480097482e-05,
673
+ "loss": 1.4556,
674
  "step": 950
675
  },
676
  {
677
  "epoch": 0.39000609384521634,
678
+ "grad_norm": 3.3901174068450928,
679
  "learning_rate": 2.2177091795288386e-05,
680
+ "loss": 1.2865,
681
  "step": 960
682
  },
683
  {
684
  "epoch": 0.3940686573227707,
685
+ "grad_norm": 2.4866228103637695,
686
  "learning_rate": 2.1364744110479286e-05,
687
+ "loss": 1.501,
688
  "step": 970
689
  },
690
  {
691
  "epoch": 0.398131220800325,
692
+ "grad_norm": 1.41251540184021,
693
  "learning_rate": 2.0552396425670186e-05,
694
+ "loss": 1.3455,
695
  "step": 980
696
  },
697
  {
698
  "epoch": 0.40219378427787933,
699
+ "grad_norm": 0.645084798336029,
700
  "learning_rate": 1.974004874086109e-05,
701
+ "loss": 1.3918,
702
  "step": 990
703
  },
704
  {
705
  "epoch": 0.4062563477554337,
706
+ "grad_norm": 0.8688336610794067,
707
  "learning_rate": 1.892770105605199e-05,
708
+ "loss": 1.3823,
709
  "step": 1000
710
  },
711
  {
712
  "epoch": 0.410318911232988,
713
+ "grad_norm": 0.7747243642807007,
714
  "learning_rate": 1.8115353371242893e-05,
715
+ "loss": 1.2988,
716
  "step": 1010
717
  },
718
  {
719
  "epoch": 0.4143814747105424,
720
+ "grad_norm": 1.329567313194275,
721
  "learning_rate": 1.7303005686433797e-05,
722
+ "loss": 1.2876,
723
  "step": 1020
724
  },
725
  {
726
  "epoch": 0.41844403818809667,
727
+ "grad_norm": 1.022557258605957,
728
  "learning_rate": 1.6490658001624697e-05,
729
+ "loss": 1.3036,
730
  "step": 1030
731
  },
732
  {
733
  "epoch": 0.422506601665651,
734
+ "grad_norm": 0.9812535047531128,
735
  "learning_rate": 1.5678310316815597e-05,
736
+ "loss": 1.3278,
737
  "step": 1040
738
  },
739
  {
740
  "epoch": 0.42656916514320536,
741
+ "grad_norm": 0.8079454898834229,
742
  "learning_rate": 1.48659626320065e-05,
743
+ "loss": 1.2957,
744
  "step": 1050
745
  },
746
  {
747
  "epoch": 0.4306317286207597,
748
+ "grad_norm": 0.8280611038208008,
749
  "learning_rate": 1.4053614947197402e-05,
750
+ "loss": 1.3747,
751
  "step": 1060
752
  },
753
  {
754
  "epoch": 0.43469429209831406,
755
+ "grad_norm": 0.85203617811203,
756
  "learning_rate": 1.3241267262388301e-05,
757
+ "loss": 1.423,
758
  "step": 1070
759
  },
760
  {
761
  "epoch": 0.43875685557586835,
762
+ "grad_norm": 0.9492881298065186,
763
  "learning_rate": 1.2428919577579204e-05,
764
+ "loss": 1.3845,
765
  "step": 1080
766
  },
767
  {
768
  "epoch": 0.4428194190534227,
769
+ "grad_norm": 1.6084877252578735,
770
  "learning_rate": 1.1616571892770106e-05,
771
+ "loss": 1.2422,
772
  "step": 1090
773
  },
774
  {
775
  "epoch": 0.44688198253097705,
776
+ "grad_norm": 0.7060315012931824,
777
  "learning_rate": 1.0804224207961008e-05,
778
+ "loss": 1.3758,
779
  "step": 1100
780
  },
781
  {
782
  "epoch": 0.4509445460085314,
783
+ "grad_norm": 2.0847179889678955,
784
  "learning_rate": 9.99187652315191e-06,
785
+ "loss": 1.4202,
786
  "step": 1110
787
  },
788
  {
789
  "epoch": 0.4550071094860857,
790
+ "grad_norm": 1.1350940465927124,
791
  "learning_rate": 9.179528838342812e-06,
792
+ "loss": 1.3273,
793
  "step": 1120
794
  },
795
  {
796
  "epoch": 0.45906967296364004,
797
+ "grad_norm": 0.841096043586731,
798
  "learning_rate": 8.367181153533712e-06,
799
+ "loss": 1.3461,
800
  "step": 1130
801
  },
802
  {
803
  "epoch": 0.4631322364411944,
804
+ "grad_norm": 0.5926047563552856,
805
  "learning_rate": 7.554833468724615e-06,
806
+ "loss": 1.2492,
807
  "step": 1140
808
  },
809
  {
810
  "epoch": 0.46719479991874874,
811
+ "grad_norm": 0.8289461135864258,
812
  "learning_rate": 6.742485783915516e-06,
813
+ "loss": 1.4403,
814
  "step": 1150
815
  },
816
  {
817
  "epoch": 0.4712573633963031,
818
+ "grad_norm": 0.6718817353248596,
819
  "learning_rate": 5.930138099106418e-06,
820
+ "loss": 1.2884,
821
  "step": 1160
822
  },
823
  {
824
  "epoch": 0.4753199268738574,
825
+ "grad_norm": 1.6381056308746338,
826
  "learning_rate": 5.117790414297319e-06,
827
+ "loss": 1.4508,
828
  "step": 1170
829
  },
830
  {
831
  "epoch": 0.4793824903514117,
832
+ "grad_norm": 1.0323442220687866,
833
  "learning_rate": 4.305442729488221e-06,
834
+ "loss": 1.4087,
835
  "step": 1180
836
  },
837
  {
838
  "epoch": 0.4834450538289661,
839
+ "grad_norm": 1.0119701623916626,
840
  "learning_rate": 3.4930950446791225e-06,
841
+ "loss": 1.3611,
842
  "step": 1190
843
  },
844
  {
845
  "epoch": 0.4875076173065204,
846
+ "grad_norm": 2.113036870956421,
847
  "learning_rate": 2.6807473598700244e-06,
848
+ "loss": 1.4355,
849
  "step": 1200
850
  },
851
  {
852
  "epoch": 0.4915701807840748,
853
+ "grad_norm": 1.257258415222168,
854
  "learning_rate": 1.8683996750609262e-06,
855
+ "loss": 1.2809,
856
  "step": 1210
857
  },
858
  {
859
  "epoch": 0.49563274426162907,
860
+ "grad_norm": 1.6237741708755493,
861
  "learning_rate": 1.0560519902518278e-06,
862
+ "loss": 1.2713,
863
  "step": 1220
864
  },
865
  {
866
  "epoch": 0.4996953077391834,
867
+ "grad_norm": 2.050067901611328,
868
  "learning_rate": 2.437043054427295e-07,
869
+ "loss": 1.4235,
870
  "step": 1230
871
  }
872
  ],
checkpoint-1231/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d77c4079b78d8ffba36a5b7eea8d68305d2c5af880d94b33c697b01545a3d5f7
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df8064174041f177505ba0d3a66e4fc8f5d15861f3d5535980ace500355480f
3
  size 5176
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d77c4079b78d8ffba36a5b7eea8d68305d2c5af880d94b33c697b01545a3d5f7
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df8064174041f177505ba0d3a66e4fc8f5d15861f3d5535980ace500355480f
3
  size 5176