gpol13 commited on
Commit
70116e4
·
verified ·
1 Parent(s): 01ca755

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1280f7f2b208c156d2a4a277a7a1a7316c93f88988b5c754c16059bf2e0f289c
3
  size 3558888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b59853002c861613dfba51f1fbd432af1cf3e216a526b636e631933f3c2439
3
  size 3558888
checkpoint-14145/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1280f7f2b208c156d2a4a277a7a1a7316c93f88988b5c754c16059bf2e0f289c
3
  size 3558888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b59853002c861613dfba51f1fbd432af1cf3e216a526b636e631933f3c2439
3
  size 3558888
checkpoint-14145/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e73f5f4a50e8492ca23abad4e74c002d3e1fe0681df8ea443c93e46d7d8cd0
3
  size 7198906
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a297fa43902036f425ebef026e9b07f87a2ecc04deb5e369d0c86cee2cbca96c
3
  size 7198906
checkpoint-14145/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:551d95fa01a350d649318071ec6c30c9643b1415452525afe82187c8c159941c
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe529d2f3b2a2d6f9fa414a8f535de95f1f94c39b13aa854046423d2b6dc21e9
3
  size 988
checkpoint-14145/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:898d0d2ace2142f45ba707687d167a2ef3ebdcb03bdca2695993e6fe1235bc54
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fa001f6a3b5cd0eba852bf7bbee58219ea892df7002a06d792d5e052374efb
3
  size 1064
checkpoint-14145/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_global_step": 14145,
3
- "best_metric": 0.9658045416033947,
4
- "best_model_checkpoint": "/kaggle/working/codet5-k8s-lora/checkpoint-14145",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 14145,
@@ -11,2041 +11,2041 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.017674089784376106,
14
- "grad_norm": 2.7520432472229004,
15
- "learning_rate": 4.983739837398374e-05,
16
- "loss": 4.115,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.03534817956875221,
21
- "grad_norm": 17.1498966217041,
22
- "learning_rate": 4.966065747613998e-05,
23
- "loss": 3.2283,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.053022269353128315,
28
- "grad_norm": 2.4044106006622314,
29
  "learning_rate": 4.948391657829622e-05,
30
- "loss": 2.2704,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.07069635913750442,
35
- "grad_norm": 2.3264973163604736,
36
  "learning_rate": 4.930717568045246e-05,
37
- "loss": 1.963,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.08837044892188052,
42
- "grad_norm": 2.146696090698242,
43
  "learning_rate": 4.91304347826087e-05,
44
- "loss": 1.8358,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.10604453870625663,
49
- "grad_norm": 2.6933417320251465,
50
  "learning_rate": 4.895369388476494e-05,
51
- "loss": 1.7542,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.12371862849063273,
56
- "grad_norm": 2.71453857421875,
57
  "learning_rate": 4.8776952986921177e-05,
58
- "loss": 1.6651,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.14139271827500885,
63
- "grad_norm": 2.239396095275879,
64
  "learning_rate": 4.8600212089077416e-05,
65
- "loss": 1.5549,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.15906680805938495,
70
- "grad_norm": 3.038501262664795,
71
- "learning_rate": 4.8423471191233655e-05,
72
- "loss": 1.4887,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.17674089784376104,
77
- "grad_norm": 2.988568067550659,
78
  "learning_rate": 4.825026511134676e-05,
79
- "loss": 1.4896,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.19441498762813716,
84
- "grad_norm": 2.7871344089508057,
85
  "learning_rate": 4.807352421350301e-05,
86
- "loss": 1.3147,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.21208907741251326,
91
- "grad_norm": 2.9361987113952637,
92
  "learning_rate": 4.789678331565924e-05,
93
- "loss": 1.4059,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.22976316719688936,
98
- "grad_norm": 3.7504327297210693,
99
  "learning_rate": 4.7720042417815487e-05,
100
- "loss": 1.3733,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.24743725698126545,
105
- "grad_norm": 2.8540990352630615,
106
  "learning_rate": 4.7543301519971726e-05,
107
- "loss": 1.2048,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.2651113467656416,
112
- "grad_norm": 3.3986833095550537,
113
  "learning_rate": 4.7366560622127965e-05,
114
- "loss": 1.3204,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.2827854365500177,
119
- "grad_norm": 3.490173816680908,
120
  "learning_rate": 4.7189819724284204e-05,
121
- "loss": 1.2636,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 0.30045952633439377,
126
- "grad_norm": 3.0693671703338623,
127
  "learning_rate": 4.701307882644044e-05,
128
- "loss": 1.1539,
129
  "step": 850
130
  },
131
  {
132
  "epoch": 0.3181336161187699,
133
- "grad_norm": 6.666368007659912,
134
  "learning_rate": 4.683633792859668e-05,
135
- "loss": 1.1835,
136
  "step": 900
137
  },
138
  {
139
  "epoch": 0.335807705903146,
140
- "grad_norm": 3.3494420051574707,
141
  "learning_rate": 4.6659597030752915e-05,
142
- "loss": 1.0956,
143
  "step": 950
144
  },
145
  {
146
  "epoch": 0.3534817956875221,
147
- "grad_norm": 2.6836087703704834,
148
  "learning_rate": 4.648285613290916e-05,
149
- "loss": 1.1311,
150
  "step": 1000
151
  },
152
  {
153
  "epoch": 0.3711558854718982,
154
- "grad_norm": 2.8905951976776123,
155
  "learning_rate": 4.630611523506539e-05,
156
- "loss": 1.0765,
157
  "step": 1050
158
  },
159
  {
160
  "epoch": 0.38882997525627433,
161
- "grad_norm": 3.939500093460083,
162
  "learning_rate": 4.612937433722164e-05,
163
- "loss": 1.0456,
164
  "step": 1100
165
  },
166
  {
167
  "epoch": 0.4065040650406504,
168
- "grad_norm": 3.1198384761810303,
169
  "learning_rate": 4.595263343937787e-05,
170
- "loss": 1.0448,
171
  "step": 1150
172
  },
173
  {
174
  "epoch": 0.4241781548250265,
175
- "grad_norm": 3.0342133045196533,
176
  "learning_rate": 4.577589254153412e-05,
177
- "loss": 0.9943,
178
  "step": 1200
179
  },
180
  {
181
  "epoch": 0.4418522446094026,
182
- "grad_norm": 3.555405616760254,
183
  "learning_rate": 4.559915164369035e-05,
184
- "loss": 1.045,
185
  "step": 1250
186
  },
187
  {
188
  "epoch": 0.4595263343937787,
189
- "grad_norm": 3.0326290130615234,
190
  "learning_rate": 4.542241074584659e-05,
191
- "loss": 0.9807,
192
  "step": 1300
193
  },
194
  {
195
  "epoch": 0.47720042417815484,
196
- "grad_norm": 3.1818552017211914,
197
  "learning_rate": 4.524566984800283e-05,
198
- "loss": 0.9889,
199
  "step": 1350
200
  },
201
  {
202
  "epoch": 0.4948745139625309,
203
- "grad_norm": 2.933518171310425,
204
  "learning_rate": 4.506892895015907e-05,
205
- "loss": 0.9373,
206
  "step": 1400
207
  },
208
  {
209
  "epoch": 0.512548603746907,
210
- "grad_norm": 4.3423566818237305,
211
  "learning_rate": 4.489218805231531e-05,
212
- "loss": 1.0253,
213
  "step": 1450
214
  },
215
  {
216
  "epoch": 0.5302226935312832,
217
- "grad_norm": 3.6613636016845703,
218
  "learning_rate": 4.4715447154471546e-05,
219
- "loss": 0.9458,
220
  "step": 1500
221
  },
222
  {
223
  "epoch": 0.5478967833156593,
224
- "grad_norm": 2.736353635787964,
225
- "learning_rate": 4.454224107458466e-05,
226
- "loss": 0.9205,
227
  "step": 1550
228
  },
229
  {
230
  "epoch": 0.5655708731000354,
231
- "grad_norm": 3.9614179134368896,
232
- "learning_rate": 4.43655001767409e-05,
233
- "loss": 0.8896,
234
  "step": 1600
235
  },
236
  {
237
  "epoch": 0.5832449628844114,
238
- "grad_norm": 3.397909164428711,
239
- "learning_rate": 4.418875927889714e-05,
240
- "loss": 0.9247,
241
  "step": 1650
242
  },
243
  {
244
  "epoch": 0.6009190526687875,
245
- "grad_norm": 3.26153564453125,
246
- "learning_rate": 4.401201838105338e-05,
247
- "loss": 0.8023,
248
  "step": 1700
249
  },
250
  {
251
  "epoch": 0.6185931424531637,
252
- "grad_norm": 4.067619800567627,
253
- "learning_rate": 4.383527748320962e-05,
254
- "loss": 0.8967,
255
  "step": 1750
256
  },
257
  {
258
  "epoch": 0.6362672322375398,
259
- "grad_norm": 3.9123120307922363,
260
- "learning_rate": 4.3658536585365856e-05,
261
- "loss": 0.8668,
262
  "step": 1800
263
  },
264
  {
265
  "epoch": 0.6539413220219159,
266
- "grad_norm": 3.9695286750793457,
267
- "learning_rate": 4.3481795687522095e-05,
268
- "loss": 0.8688,
269
  "step": 1850
270
  },
271
  {
272
  "epoch": 0.671615411806292,
273
- "grad_norm": 3.176691770553589,
274
- "learning_rate": 4.3305054789678334e-05,
275
- "loss": 0.8475,
276
  "step": 1900
277
  },
278
  {
279
  "epoch": 0.689289501590668,
280
- "grad_norm": 4.363101005554199,
281
- "learning_rate": 4.3128313891834574e-05,
282
- "loss": 0.8343,
283
  "step": 1950
284
  },
285
  {
286
  "epoch": 0.7069635913750442,
287
- "grad_norm": 4.429725170135498,
288
- "learning_rate": 4.295157299399081e-05,
289
- "loss": 0.8001,
290
  "step": 2000
291
  },
292
  {
293
  "epoch": 0.7246376811594203,
294
- "grad_norm": 3.035944700241089,
295
- "learning_rate": 4.277483209614705e-05,
296
- "loss": 0.7387,
297
  "step": 2050
298
  },
299
  {
300
  "epoch": 0.7423117709437964,
301
- "grad_norm": 3.1769495010375977,
302
- "learning_rate": 4.259809119830329e-05,
303
- "loss": 0.7837,
304
  "step": 2100
305
  },
306
  {
307
  "epoch": 0.7599858607281725,
308
- "grad_norm": 3.7096972465515137,
309
  "learning_rate": 4.242135030045953e-05,
310
- "loss": 0.8015,
311
  "step": 2150
312
  },
313
  {
314
  "epoch": 0.7776599505125487,
315
- "grad_norm": 4.8830718994140625,
316
  "learning_rate": 4.224460940261576e-05,
317
- "loss": 0.8145,
318
  "step": 2200
319
  },
320
  {
321
  "epoch": 0.7953340402969247,
322
- "grad_norm": 4.858372688293457,
323
  "learning_rate": 4.206786850477201e-05,
324
- "loss": 0.7707,
325
  "step": 2250
326
  },
327
  {
328
  "epoch": 0.8130081300813008,
329
- "grad_norm": 4.0257697105407715,
330
  "learning_rate": 4.189112760692824e-05,
331
- "loss": 0.7365,
332
  "step": 2300
333
  },
334
  {
335
  "epoch": 0.8306822198656769,
336
- "grad_norm": 3.4525439739227295,
337
  "learning_rate": 4.171438670908449e-05,
338
- "loss": 0.6824,
339
  "step": 2350
340
  },
341
  {
342
  "epoch": 0.848356309650053,
343
- "grad_norm": 3.3290514945983887,
344
  "learning_rate": 4.153764581124072e-05,
345
- "loss": 0.696,
346
  "step": 2400
347
  },
348
  {
349
  "epoch": 0.8660303994344292,
350
- "grad_norm": 3.284925937652588,
351
  "learning_rate": 4.1360904913396966e-05,
352
- "loss": 0.7506,
353
  "step": 2450
354
  },
355
  {
356
  "epoch": 0.8837044892188052,
357
- "grad_norm": 7.381216526031494,
358
  "learning_rate": 4.11841640155532e-05,
359
- "loss": 0.7282,
360
  "step": 2500
361
  },
362
  {
363
  "epoch": 0.9013785790031813,
364
- "grad_norm": 3.3384296894073486,
365
  "learning_rate": 4.1007423117709444e-05,
366
- "loss": 0.7632,
367
  "step": 2550
368
  },
369
  {
370
  "epoch": 0.9190526687875574,
371
- "grad_norm": 3.1576461791992188,
372
  "learning_rate": 4.0830682219865676e-05,
373
- "loss": 0.739,
374
  "step": 2600
375
  },
376
  {
377
  "epoch": 0.9367267585719335,
378
- "grad_norm": 3.626567840576172,
379
  "learning_rate": 4.0653941322021916e-05,
380
- "loss": 0.6791,
381
  "step": 2650
382
  },
383
  {
384
  "epoch": 0.9544008483563097,
385
- "grad_norm": 4.196743011474609,
386
  "learning_rate": 4.0477200424178155e-05,
387
- "loss": 0.7332,
388
  "step": 2700
389
  },
390
  {
391
  "epoch": 0.9720749381406858,
392
- "grad_norm": 3.1296284198760986,
393
  "learning_rate": 4.0300459526334394e-05,
394
- "loss": 0.6539,
395
  "step": 2750
396
  },
397
  {
398
  "epoch": 0.9897490279250618,
399
- "grad_norm": 5.278796672821045,
400
  "learning_rate": 4.012371862849063e-05,
401
- "loss": 0.737,
402
  "step": 2800
403
  },
404
  {
405
  "epoch": 1.0,
406
- "eval_bertscore_f1": 0.9560778172188761,
407
- "eval_bleu": 0.4824577747321971,
408
- "eval_loss": 0.5060375928878784,
409
- "eval_meteor": 0.6530790735036477,
410
- "eval_rouge1": 0.7971610347699927,
411
- "eval_rouge2": 0.6895819892909911,
412
- "eval_runtime": 1389.2459,
413
- "eval_samples_per_second": 4.653,
414
- "eval_steps_per_second": 0.582,
415
  "step": 2829
416
  },
417
  {
418
  "epoch": 1.007423117709438,
419
- "grad_norm": 3.9626195430755615,
420
  "learning_rate": 3.994697773064687e-05,
421
- "loss": 0.7198,
422
  "step": 2850
423
  },
424
  {
425
  "epoch": 1.025097207493814,
426
- "grad_norm": 4.509051322937012,
427
  "learning_rate": 3.977023683280312e-05,
428
- "loss": 0.6777,
429
  "step": 2900
430
  },
431
  {
432
  "epoch": 1.0427712972781902,
433
- "grad_norm": 3.044351816177368,
434
  "learning_rate": 3.959349593495935e-05,
435
- "loss": 0.6675,
436
  "step": 2950
437
  },
438
  {
439
  "epoch": 1.0604453870625663,
440
- "grad_norm": 3.598339319229126,
441
  "learning_rate": 3.941675503711559e-05,
442
- "loss": 0.7256,
443
  "step": 3000
444
  },
445
  {
446
  "epoch": 1.0781194768469424,
447
- "grad_norm": 5.248291015625,
448
  "learning_rate": 3.924001413927183e-05,
449
- "loss": 0.5862,
450
  "step": 3050
451
  },
452
  {
453
  "epoch": 1.0957935666313185,
454
- "grad_norm": 3.2210874557495117,
455
  "learning_rate": 3.906327324142807e-05,
456
- "loss": 0.7295,
457
  "step": 3100
458
  },
459
  {
460
  "epoch": 1.1134676564156947,
461
- "grad_norm": 3.6727230548858643,
462
  "learning_rate": 3.888653234358431e-05,
463
- "loss": 0.6206,
464
  "step": 3150
465
  },
466
  {
467
  "epoch": 1.1311417462000706,
468
- "grad_norm": 4.341032981872559,
469
  "learning_rate": 3.870979144574055e-05,
470
- "loss": 0.676,
471
  "step": 3200
472
  },
473
  {
474
  "epoch": 1.148815835984447,
475
- "grad_norm": 3.0379395484924316,
476
  "learning_rate": 3.8533050547896786e-05,
477
- "loss": 0.6372,
478
  "step": 3250
479
  },
480
  {
481
  "epoch": 1.1664899257688228,
482
- "grad_norm": 2.846511125564575,
483
  "learning_rate": 3.8356309650053025e-05,
484
- "loss": 0.6758,
485
  "step": 3300
486
  },
487
  {
488
  "epoch": 1.184164015553199,
489
- "grad_norm": 3.267794132232666,
490
  "learning_rate": 3.817956875220926e-05,
491
- "loss": 0.6035,
492
  "step": 3350
493
  },
494
  {
495
  "epoch": 1.201838105337575,
496
- "grad_norm": 5.214766025543213,
497
  "learning_rate": 3.8002827854365503e-05,
498
- "loss": 0.6907,
499
  "step": 3400
500
  },
501
  {
502
  "epoch": 1.2195121951219512,
503
- "grad_norm": 5.761065483093262,
504
  "learning_rate": 3.7826086956521736e-05,
505
- "loss": 0.6092,
506
  "step": 3450
507
  },
508
  {
509
  "epoch": 1.2371862849063273,
510
- "grad_norm": 4.127236843109131,
511
  "learning_rate": 3.764934605867798e-05,
512
- "loss": 0.6127,
513
  "step": 3500
514
  },
515
  {
516
  "epoch": 1.2548603746907034,
517
- "grad_norm": 2.9047141075134277,
518
  "learning_rate": 3.747260516083422e-05,
519
- "loss": 0.6486,
520
  "step": 3550
521
  },
522
  {
523
  "epoch": 1.2725344644750796,
524
- "grad_norm": 3.972148895263672,
525
  "learning_rate": 3.729586426299046e-05,
526
- "loss": 0.6933,
527
  "step": 3600
528
  },
529
  {
530
  "epoch": 1.2902085542594557,
531
- "grad_norm": 3.2735204696655273,
532
  "learning_rate": 3.71191233651467e-05,
533
- "loss": 0.6808,
534
  "step": 3650
535
  },
536
  {
537
  "epoch": 1.3078826440438318,
538
- "grad_norm": 5.26752233505249,
539
  "learning_rate": 3.694238246730294e-05,
540
- "loss": 0.6672,
541
  "step": 3700
542
  },
543
  {
544
  "epoch": 1.3255567338282077,
545
- "grad_norm": 3.852576971054077,
546
  "learning_rate": 3.676564156945918e-05,
547
- "loss": 0.6458,
548
  "step": 3750
549
  },
550
  {
551
  "epoch": 1.343230823612584,
552
- "grad_norm": 5.333845138549805,
553
  "learning_rate": 3.658890067161541e-05,
554
- "loss": 0.5771,
555
  "step": 3800
556
  },
557
  {
558
  "epoch": 1.36090491339696,
559
- "grad_norm": 3.3407108783721924,
560
  "learning_rate": 3.6412159773771656e-05,
561
- "loss": 0.6281,
562
  "step": 3850
563
  },
564
  {
565
  "epoch": 1.378579003181336,
566
- "grad_norm": 3.874502658843994,
567
  "learning_rate": 3.623541887592789e-05,
568
- "loss": 0.6716,
569
  "step": 3900
570
  },
571
  {
572
  "epoch": 1.3962530929657122,
573
- "grad_norm": 23.51529884338379,
574
  "learning_rate": 3.6058677978084134e-05,
575
- "loss": 0.5915,
576
  "step": 3950
577
  },
578
  {
579
  "epoch": 1.4139271827500883,
580
- "grad_norm": 4.40012788772583,
581
  "learning_rate": 3.588193708024037e-05,
582
- "loss": 0.6098,
583
  "step": 4000
584
  },
585
  {
586
  "epoch": 1.4316012725344645,
587
- "grad_norm": 2.98525071144104,
588
  "learning_rate": 3.570519618239661e-05,
589
- "loss": 0.5767,
590
  "step": 4050
591
  },
592
  {
593
  "epoch": 1.4492753623188406,
594
- "grad_norm": 3.8279759883880615,
595
  "learning_rate": 3.5528455284552845e-05,
596
- "loss": 0.5843,
597
  "step": 4100
598
  },
599
  {
600
  "epoch": 1.4669494521032167,
601
- "grad_norm": 2.8104283809661865,
602
  "learning_rate": 3.5351714386709084e-05,
603
- "loss": 0.5314,
604
  "step": 4150
605
  },
606
  {
607
  "epoch": 1.4846235418875928,
608
- "grad_norm": 3.081321954727173,
609
  "learning_rate": 3.5174973488865324e-05,
610
- "loss": 0.5187,
611
  "step": 4200
612
  },
613
  {
614
  "epoch": 1.502297631671969,
615
- "grad_norm": 2.8093416690826416,
616
  "learning_rate": 3.499823259102156e-05,
617
- "loss": 0.5663,
618
  "step": 4250
619
  },
620
  {
621
  "epoch": 1.5199717214563448,
622
- "grad_norm": 3.7971787452697754,
623
  "learning_rate": 3.48214916931778e-05,
624
- "loss": 0.5069,
625
  "step": 4300
626
  },
627
  {
628
  "epoch": 1.5376458112407212,
629
- "grad_norm": 3.116645336151123,
630
  "learning_rate": 3.464475079533404e-05,
631
- "loss": 0.4945,
632
  "step": 4350
633
  },
634
  {
635
  "epoch": 1.555319901025097,
636
- "grad_norm": 2.9984517097473145,
637
  "learning_rate": 3.446800989749028e-05,
638
- "loss": 0.5399,
639
  "step": 4400
640
  },
641
  {
642
  "epoch": 1.5729939908094734,
643
- "grad_norm": 3.3107683658599854,
644
  "learning_rate": 3.429126899964652e-05,
645
- "loss": 0.5507,
646
  "step": 4450
647
  },
648
  {
649
  "epoch": 1.5906680805938493,
650
- "grad_norm": 4.328573226928711,
651
  "learning_rate": 3.411452810180276e-05,
652
- "loss": 0.5695,
653
  "step": 4500
654
  },
655
  {
656
  "epoch": 1.6083421703782255,
657
- "grad_norm": 4.086219787597656,
658
  "learning_rate": 3.3937787203959e-05,
659
- "loss": 0.5234,
660
  "step": 4550
661
  },
662
  {
663
  "epoch": 1.6260162601626016,
664
- "grad_norm": 3.971590280532837,
665
  "learning_rate": 3.376104630611524e-05,
666
- "loss": 0.6107,
667
  "step": 4600
668
  },
669
  {
670
  "epoch": 1.6436903499469777,
671
- "grad_norm": 3.0131218433380127,
672
  "learning_rate": 3.3584305408271476e-05,
673
- "loss": 0.5763,
674
  "step": 4650
675
  },
676
  {
677
  "epoch": 1.6613644397313538,
678
- "grad_norm": 3.862619161605835,
679
  "learning_rate": 3.3407564510427716e-05,
680
- "loss": 0.5422,
681
  "step": 4700
682
  },
683
  {
684
  "epoch": 1.67903852951573,
685
- "grad_norm": 4.276158332824707,
686
  "learning_rate": 3.3230823612583955e-05,
687
- "loss": 0.6326,
688
  "step": 4750
689
  },
690
  {
691
  "epoch": 1.696712619300106,
692
- "grad_norm": 4.451511383056641,
693
  "learning_rate": 3.3054082714740194e-05,
694
- "loss": 0.5883,
695
  "step": 4800
696
  },
697
  {
698
  "epoch": 1.714386709084482,
699
- "grad_norm": 3.8793303966522217,
700
  "learning_rate": 3.2877341816896426e-05,
701
- "loss": 0.4958,
702
  "step": 4850
703
  },
704
  {
705
  "epoch": 1.7320607988688583,
706
- "grad_norm": 3.0720949172973633,
707
  "learning_rate": 3.270060091905267e-05,
708
- "loss": 0.5962,
709
  "step": 4900
710
  },
711
  {
712
  "epoch": 1.7497348886532342,
713
- "grad_norm": 3.674368143081665,
714
  "learning_rate": 3.2523860021208905e-05,
715
- "loss": 0.5179,
716
  "step": 4950
717
  },
718
  {
719
  "epoch": 1.7674089784376106,
720
- "grad_norm": 4.063836574554443,
721
  "learning_rate": 3.234711912336515e-05,
722
- "loss": 0.5588,
723
  "step": 5000
724
  },
725
  {
726
  "epoch": 1.7850830682219865,
727
- "grad_norm": 5.352965354919434,
728
  "learning_rate": 3.217037822552138e-05,
729
- "loss": 0.5762,
730
  "step": 5050
731
  },
732
  {
733
  "epoch": 1.8027571580063628,
734
- "grad_norm": 4.00150203704834,
735
  "learning_rate": 3.199363732767763e-05,
736
- "loss": 0.5546,
737
  "step": 5100
738
  },
739
  {
740
  "epoch": 1.8204312477907387,
741
- "grad_norm": 4.444275856018066,
742
  "learning_rate": 3.181689642983386e-05,
743
- "loss": 0.5321,
744
  "step": 5150
745
  },
746
  {
747
  "epoch": 1.8381053375751149,
748
- "grad_norm": 2.783729076385498,
749
  "learning_rate": 3.164015553199011e-05,
750
- "loss": 0.5859,
751
  "step": 5200
752
  },
753
  {
754
  "epoch": 1.855779427359491,
755
- "grad_norm": 4.168649673461914,
756
  "learning_rate": 3.146341463414634e-05,
757
- "loss": 0.5353,
758
  "step": 5250
759
  },
760
  {
761
  "epoch": 1.873453517143867,
762
- "grad_norm": 4.757116794586182,
763
  "learning_rate": 3.128667373630258e-05,
764
- "loss": 0.4766,
765
  "step": 5300
766
  },
767
  {
768
  "epoch": 1.8911276069282432,
769
- "grad_norm": 2.5472869873046875,
770
  "learning_rate": 3.1109932838458825e-05,
771
- "loss": 0.4687,
772
  "step": 5350
773
  },
774
  {
775
  "epoch": 1.9088016967126193,
776
- "grad_norm": 2.9616148471832275,
777
  "learning_rate": 3.093319194061506e-05,
778
- "loss": 0.5018,
779
  "step": 5400
780
  },
781
  {
782
  "epoch": 1.9264757864969955,
783
- "grad_norm": 3.773808240890503,
784
  "learning_rate": 3.0756451042771303e-05,
785
- "loss": 0.5798,
786
  "step": 5450
787
  },
788
  {
789
  "epoch": 1.9441498762813714,
790
- "grad_norm": 3.725301742553711,
791
  "learning_rate": 3.0579710144927536e-05,
792
- "loss": 0.5041,
793
  "step": 5500
794
  },
795
  {
796
  "epoch": 1.9618239660657477,
797
- "grad_norm": 3.422393560409546,
798
  "learning_rate": 3.040296924708378e-05,
799
- "loss": 0.5053,
800
  "step": 5550
801
  },
802
  {
803
  "epoch": 1.9794980558501236,
804
- "grad_norm": 3.7830734252929688,
805
  "learning_rate": 3.0226228349240014e-05,
806
- "loss": 0.5573,
807
  "step": 5600
808
  },
809
  {
810
  "epoch": 1.9971721456345,
811
- "grad_norm": 2.845203161239624,
812
  "learning_rate": 3.0049487451396253e-05,
813
- "loss": 0.4995,
814
  "step": 5650
815
  },
816
  {
817
  "epoch": 2.0,
818
- "eval_bertscore_f1": 0.9619662560301252,
819
- "eval_bleu": 0.5490499087973257,
820
- "eval_loss": 0.3714849352836609,
821
- "eval_meteor": 0.7078770382751671,
822
- "eval_rouge1": 0.8227979006513153,
823
- "eval_rouge2": 0.7467554150541201,
824
- "eval_runtime": 1347.4627,
825
- "eval_samples_per_second": 4.797,
826
- "eval_steps_per_second": 0.6,
827
  "step": 5658
828
  },
829
  {
830
  "epoch": 2.014846235418876,
831
- "grad_norm": 3.069786787033081,
832
  "learning_rate": 2.9872746553552493e-05,
833
- "loss": 0.4827,
834
  "step": 5700
835
  },
836
  {
837
  "epoch": 2.032520325203252,
838
- "grad_norm": 3.7215096950531006,
839
  "learning_rate": 2.9696005655708732e-05,
840
- "loss": 0.5742,
841
  "step": 5750
842
  },
843
  {
844
  "epoch": 2.050194414987628,
845
- "grad_norm": 4.543232440948486,
846
  "learning_rate": 2.9519264757864974e-05,
847
- "loss": 0.478,
848
  "step": 5800
849
  },
850
  {
851
  "epoch": 2.0678685047720045,
852
- "grad_norm": 3.076716423034668,
853
  "learning_rate": 2.934252386002121e-05,
854
- "loss": 0.4619,
855
  "step": 5850
856
  },
857
  {
858
  "epoch": 2.0855425945563804,
859
- "grad_norm": 3.951244592666626,
860
  "learning_rate": 2.9165782962177453e-05,
861
- "loss": 0.513,
862
  "step": 5900
863
  },
864
  {
865
  "epoch": 2.1032166843407563,
866
- "grad_norm": 2.552813768386841,
867
  "learning_rate": 2.898904206433369e-05,
868
- "loss": 0.491,
869
  "step": 5950
870
  },
871
  {
872
  "epoch": 2.1208907741251326,
873
- "grad_norm": 2.2147703170776367,
874
  "learning_rate": 2.8812301166489924e-05,
875
- "loss": 0.4333,
876
  "step": 6000
877
  },
878
  {
879
  "epoch": 2.1385648639095085,
880
- "grad_norm": 2.9662623405456543,
881
  "learning_rate": 2.8635560268646167e-05,
882
- "loss": 0.4287,
883
  "step": 6050
884
  },
885
  {
886
  "epoch": 2.156238953693885,
887
- "grad_norm": 1.9441404342651367,
888
  "learning_rate": 2.8458819370802403e-05,
889
- "loss": 0.5373,
890
  "step": 6100
891
  },
892
  {
893
  "epoch": 2.1739130434782608,
894
- "grad_norm": 4.967250823974609,
895
  "learning_rate": 2.8282078472958645e-05,
896
- "loss": 0.4971,
897
  "step": 6150
898
  },
899
  {
900
  "epoch": 2.191587133262637,
901
- "grad_norm": 3.1946051120758057,
902
  "learning_rate": 2.810533757511488e-05,
903
- "loss": 0.5078,
904
  "step": 6200
905
  },
906
  {
907
  "epoch": 2.209261223047013,
908
- "grad_norm": 2.941650152206421,
909
  "learning_rate": 2.7928596677271124e-05,
910
- "loss": 0.5376,
911
  "step": 6250
912
  },
913
  {
914
  "epoch": 2.2269353128313893,
915
- "grad_norm": 4.430084705352783,
916
  "learning_rate": 2.775185577942736e-05,
917
- "loss": 0.5443,
918
  "step": 6300
919
  },
920
  {
921
  "epoch": 2.2446094026157652,
922
- "grad_norm": 4.810621738433838,
923
  "learning_rate": 2.7575114881583602e-05,
924
- "loss": 0.5101,
925
  "step": 6350
926
  },
927
  {
928
  "epoch": 2.262283492400141,
929
- "grad_norm": 4.557118892669678,
930
- "learning_rate": 2.7401908801696713e-05,
931
- "loss": 0.5837,
932
  "step": 6400
933
  },
934
  {
935
  "epoch": 2.2799575821845175,
936
- "grad_norm": 3.7677035331726074,
937
- "learning_rate": 2.7225167903852955e-05,
938
- "loss": 0.4693,
939
  "step": 6450
940
  },
941
  {
942
  "epoch": 2.297631671968894,
943
- "grad_norm": 2.5755605697631836,
944
- "learning_rate": 2.704842700600919e-05,
945
- "loss": 0.4601,
946
  "step": 6500
947
  },
948
  {
949
  "epoch": 2.3153057617532697,
950
- "grad_norm": 4.114721775054932,
951
- "learning_rate": 2.6871686108165427e-05,
952
- "loss": 0.5209,
953
  "step": 6550
954
  },
955
  {
956
  "epoch": 2.3329798515376456,
957
- "grad_norm": 4.422333717346191,
958
- "learning_rate": 2.669494521032167e-05,
959
- "loss": 0.5097,
960
  "step": 6600
961
  },
962
  {
963
  "epoch": 2.350653941322022,
964
- "grad_norm": 2.2840325832366943,
965
- "learning_rate": 2.6518204312477905e-05,
966
- "loss": 0.4896,
967
  "step": 6650
968
  },
969
  {
970
  "epoch": 2.368328031106398,
971
- "grad_norm": 4.739809036254883,
972
- "learning_rate": 2.6341463414634148e-05,
973
- "loss": 0.5377,
974
  "step": 6700
975
  },
976
  {
977
  "epoch": 2.3860021208907742,
978
- "grad_norm": 4.174150466918945,
979
- "learning_rate": 2.6164722516790384e-05,
980
- "loss": 0.4754,
981
  "step": 6750
982
  },
983
  {
984
  "epoch": 2.40367621067515,
985
- "grad_norm": 3.5064797401428223,
986
- "learning_rate": 2.5987981618946626e-05,
987
- "loss": 0.4375,
988
  "step": 6800
989
  },
990
  {
991
  "epoch": 2.4213503004595265,
992
- "grad_norm": 5.094990253448486,
993
- "learning_rate": 2.5811240721102865e-05,
994
- "loss": 0.5027,
995
  "step": 6850
996
  },
997
  {
998
  "epoch": 2.4390243902439024,
999
- "grad_norm": 6.338164329528809,
1000
- "learning_rate": 2.5634499823259105e-05,
1001
- "loss": 0.5016,
1002
  "step": 6900
1003
  },
1004
  {
1005
  "epoch": 2.4566984800282787,
1006
- "grad_norm": 3.988973379135132,
1007
- "learning_rate": 2.5457758925415344e-05,
1008
- "loss": 0.4173,
1009
  "step": 6950
1010
  },
1011
  {
1012
  "epoch": 2.4743725698126546,
1013
- "grad_norm": 2.3877015113830566,
1014
- "learning_rate": 2.528101802757158e-05,
1015
- "loss": 0.4857,
1016
  "step": 7000
1017
  },
1018
  {
1019
  "epoch": 2.4920466595970305,
1020
- "grad_norm": 4.041755199432373,
1021
- "learning_rate": 2.5104277129727822e-05,
1022
- "loss": 0.4463,
1023
  "step": 7050
1024
  },
1025
  {
1026
  "epoch": 2.509720749381407,
1027
- "grad_norm": 3.5311896800994873,
1028
- "learning_rate": 2.492753623188406e-05,
1029
- "loss": 0.4899,
1030
  "step": 7100
1031
  },
1032
  {
1033
  "epoch": 2.5273948391657832,
1034
- "grad_norm": 2.9291367530822754,
1035
- "learning_rate": 2.4750795334040297e-05,
1036
- "loss": 0.5041,
1037
  "step": 7150
1038
  },
1039
  {
1040
  "epoch": 2.545068928950159,
1041
- "grad_norm": 3.6040406227111816,
1042
- "learning_rate": 2.4574054436196536e-05,
1043
- "loss": 0.4374,
1044
  "step": 7200
1045
  },
1046
  {
1047
  "epoch": 2.562743018734535,
1048
- "grad_norm": 4.614346981048584,
1049
- "learning_rate": 2.4397313538352776e-05,
1050
- "loss": 0.4659,
1051
  "step": 7250
1052
  },
1053
  {
1054
  "epoch": 2.5804171085189114,
1055
- "grad_norm": 5.035871505737305,
1056
- "learning_rate": 2.4220572640509015e-05,
1057
- "loss": 0.4419,
1058
  "step": 7300
1059
  },
1060
  {
1061
  "epoch": 2.5980911983032873,
1062
- "grad_norm": 3.1209981441497803,
1063
- "learning_rate": 2.4043831742665254e-05,
1064
- "loss": 0.4863,
1065
  "step": 7350
1066
  },
1067
  {
1068
  "epoch": 2.6157652880876636,
1069
- "grad_norm": 2.7074010372161865,
1070
- "learning_rate": 2.3867090844821493e-05,
1071
- "loss": 0.4899,
1072
  "step": 7400
1073
  },
1074
  {
1075
  "epoch": 2.6334393778720395,
1076
- "grad_norm": 4.56402587890625,
1077
- "learning_rate": 2.3690349946977732e-05,
1078
- "loss": 0.5104,
1079
  "step": 7450
1080
  },
1081
  {
1082
  "epoch": 2.6511134676564154,
1083
- "grad_norm": 3.637251377105713,
1084
- "learning_rate": 2.3513609049133968e-05,
1085
- "loss": 0.4638,
1086
  "step": 7500
1087
  },
1088
  {
1089
  "epoch": 2.6687875574407918,
1090
- "grad_norm": 2.912982702255249,
1091
- "learning_rate": 2.3336868151290207e-05,
1092
- "loss": 0.4572,
1093
  "step": 7550
1094
  },
1095
  {
1096
  "epoch": 2.686461647225168,
1097
- "grad_norm": 2.9806952476501465,
1098
- "learning_rate": 2.3160127253446447e-05,
1099
- "loss": 0.4544,
1100
  "step": 7600
1101
  },
1102
  {
1103
  "epoch": 2.704135737009544,
1104
- "grad_norm": 3.6673879623413086,
1105
- "learning_rate": 2.2983386355602686e-05,
1106
- "loss": 0.4428,
1107
  "step": 7650
1108
  },
1109
  {
1110
  "epoch": 2.72180982679392,
1111
- "grad_norm": 7.318435192108154,
1112
- "learning_rate": 2.280664545775893e-05,
1113
- "loss": 0.4137,
1114
  "step": 7700
1115
  },
1116
  {
1117
  "epoch": 2.7394839165782963,
1118
- "grad_norm": 7.58805513381958,
1119
- "learning_rate": 2.2629904559915168e-05,
1120
- "loss": 0.4501,
1121
  "step": 7750
1122
  },
1123
  {
1124
  "epoch": 2.757158006362672,
1125
- "grad_norm": 3.000596046447754,
1126
- "learning_rate": 2.2453163662071407e-05,
1127
- "loss": 0.4104,
1128
  "step": 7800
1129
  },
1130
  {
1131
  "epoch": 2.7748320961470485,
1132
- "grad_norm": 6.188124656677246,
1133
- "learning_rate": 2.2276422764227646e-05,
1134
- "loss": 0.4833,
1135
  "step": 7850
1136
  },
1137
  {
1138
  "epoch": 2.7925061859314244,
1139
- "grad_norm": 2.526710033416748,
1140
- "learning_rate": 2.209968186638388e-05,
1141
- "loss": 0.4377,
1142
  "step": 7900
1143
  },
1144
  {
1145
  "epoch": 2.8101802757158008,
1146
- "grad_norm": 3.080709934234619,
1147
- "learning_rate": 2.192294096854012e-05,
1148
- "loss": 0.4289,
1149
  "step": 7950
1150
  },
1151
  {
1152
  "epoch": 2.8278543655001767,
1153
- "grad_norm": 2.859811544418335,
1154
- "learning_rate": 2.174620007069636e-05,
1155
- "loss": 0.4856,
1156
  "step": 8000
1157
  },
1158
  {
1159
  "epoch": 2.845528455284553,
1160
- "grad_norm": 4.68251371383667,
1161
- "learning_rate": 2.15694591728526e-05,
1162
- "loss": 0.4901,
1163
  "step": 8050
1164
  },
1165
  {
1166
  "epoch": 2.863202545068929,
1167
- "grad_norm": 4.8537211418151855,
1168
- "learning_rate": 2.139271827500884e-05,
1169
- "loss": 0.3815,
1170
  "step": 8100
1171
  },
1172
  {
1173
  "epoch": 2.880876634853305,
1174
- "grad_norm": 3.1829328536987305,
1175
- "learning_rate": 2.1215977377165078e-05,
1176
- "loss": 0.4713,
1177
  "step": 8150
1178
  },
1179
  {
1180
  "epoch": 2.898550724637681,
1181
- "grad_norm": 4.081786155700684,
1182
- "learning_rate": 2.1039236479321317e-05,
1183
- "loss": 0.4494,
1184
  "step": 8200
1185
  },
1186
  {
1187
  "epoch": 2.9162248144220575,
1188
- "grad_norm": 4.547771453857422,
1189
- "learning_rate": 2.0862495581477553e-05,
1190
- "loss": 0.4306,
1191
  "step": 8250
1192
  },
1193
  {
1194
  "epoch": 2.9338989042064334,
1195
- "grad_norm": 2.5716054439544678,
1196
- "learning_rate": 2.0685754683633792e-05,
1197
- "loss": 0.3865,
1198
  "step": 8300
1199
  },
1200
  {
1201
  "epoch": 2.9515729939908093,
1202
- "grad_norm": 3.1603822708129883,
1203
- "learning_rate": 2.050901378579003e-05,
1204
- "loss": 0.4292,
1205
  "step": 8350
1206
  },
1207
  {
1208
  "epoch": 2.9692470837751856,
1209
- "grad_norm": 3.2110049724578857,
1210
- "learning_rate": 2.033227288794627e-05,
1211
- "loss": 0.4612,
1212
  "step": 8400
1213
  },
1214
  {
1215
  "epoch": 2.9869211735595615,
1216
- "grad_norm": 3.1889193058013916,
1217
- "learning_rate": 2.015553199010251e-05,
1218
- "loss": 0.4432,
1219
  "step": 8450
1220
  },
1221
  {
1222
  "epoch": 3.0,
1223
- "eval_bertscore_f1": 0.9641102957124315,
1224
- "eval_bleu": 0.5720155666086876,
1225
- "eval_loss": 0.32330864667892456,
1226
- "eval_meteor": 0.7255114484352385,
1227
- "eval_rouge1": 0.8322775652472418,
1228
- "eval_rouge2": 0.7664804459108749,
1229
- "eval_runtime": 1341.3557,
1230
- "eval_samples_per_second": 4.819,
1231
- "eval_steps_per_second": 0.602,
1232
  "step": 8487
1233
  },
1234
  {
1235
  "epoch": 3.004595263343938,
1236
- "grad_norm": 3.6943013668060303,
1237
- "learning_rate": 1.997879109225875e-05,
1238
- "loss": 0.4008,
1239
  "step": 8500
1240
  },
1241
  {
1242
  "epoch": 3.022269353128314,
1243
- "grad_norm": 3.6444859504699707,
1244
- "learning_rate": 1.9802050194414988e-05,
1245
- "loss": 0.4435,
1246
  "step": 8550
1247
  },
1248
  {
1249
  "epoch": 3.03994344291269,
1250
- "grad_norm": 2.0229413509368896,
1251
- "learning_rate": 1.962530929657123e-05,
1252
- "loss": 0.4479,
1253
  "step": 8600
1254
  },
1255
  {
1256
  "epoch": 3.057617532697066,
1257
- "grad_norm": 2.651965618133545,
1258
- "learning_rate": 1.9448568398727466e-05,
1259
- "loss": 0.4386,
1260
  "step": 8650
1261
  },
1262
  {
1263
  "epoch": 3.0752916224814424,
1264
- "grad_norm": 3.405768394470215,
1265
- "learning_rate": 1.9271827500883705e-05,
1266
- "loss": 0.3994,
1267
  "step": 8700
1268
  },
1269
  {
1270
  "epoch": 3.0929657122658183,
1271
- "grad_norm": 4.544278144836426,
1272
- "learning_rate": 1.9095086603039945e-05,
1273
- "loss": 0.4002,
1274
  "step": 8750
1275
  },
1276
  {
1277
  "epoch": 3.110639802050194,
1278
- "grad_norm": 4.592613220214844,
1279
  "learning_rate": 1.8918345705196184e-05,
1280
- "loss": 0.4925,
1281
  "step": 8800
1282
  },
1283
  {
1284
  "epoch": 3.1283138918345705,
1285
- "grad_norm": 4.989655017852783,
1286
  "learning_rate": 1.8741604807352423e-05,
1287
- "loss": 0.411,
1288
  "step": 8850
1289
  },
1290
  {
1291
  "epoch": 3.1459879816189464,
1292
- "grad_norm": 5.274810791015625,
1293
- "learning_rate": 1.8568398727465537e-05,
1294
- "loss": 0.4002,
1295
  "step": 8900
1296
  },
1297
  {
1298
  "epoch": 3.163662071403323,
1299
- "grad_norm": 3.1220662593841553,
1300
- "learning_rate": 1.8391657829621776e-05,
1301
- "loss": 0.4138,
1302
  "step": 8950
1303
  },
1304
  {
1305
  "epoch": 3.1813361611876987,
1306
- "grad_norm": 3.1085612773895264,
1307
- "learning_rate": 1.8214916931778015e-05,
1308
- "loss": 0.4498,
1309
  "step": 9000
1310
  },
1311
  {
1312
  "epoch": 3.199010250972075,
1313
- "grad_norm": 2.929586410522461,
1314
- "learning_rate": 1.8038176033934255e-05,
1315
- "loss": 0.4271,
1316
  "step": 9050
1317
  },
1318
  {
1319
  "epoch": 3.216684340756451,
1320
- "grad_norm": 3.6650078296661377,
1321
- "learning_rate": 1.7861435136090494e-05,
1322
- "loss": 0.4642,
1323
  "step": 9100
1324
  },
1325
  {
1326
  "epoch": 3.2343584305408273,
1327
- "grad_norm": 4.111539363861084,
1328
- "learning_rate": 1.7684694238246733e-05,
1329
- "loss": 0.3974,
1330
  "step": 9150
1331
  },
1332
  {
1333
  "epoch": 3.252032520325203,
1334
- "grad_norm": 3.7882914543151855,
1335
- "learning_rate": 1.750795334040297e-05,
1336
- "loss": 0.4961,
1337
  "step": 9200
1338
  },
1339
  {
1340
  "epoch": 3.2697066101095795,
1341
- "grad_norm": 3.846184253692627,
1342
- "learning_rate": 1.7331212442559208e-05,
1343
- "loss": 0.4681,
1344
  "step": 9250
1345
  },
1346
  {
1347
  "epoch": 3.2873806998939554,
1348
- "grad_norm": 2.3030924797058105,
1349
- "learning_rate": 1.7154471544715447e-05,
1350
- "loss": 0.3746,
1351
  "step": 9300
1352
  },
1353
  {
1354
  "epoch": 3.3050547896783318,
1355
- "grad_norm": 5.14872407913208,
1356
- "learning_rate": 1.6977730646871686e-05,
1357
- "loss": 0.3876,
1358
  "step": 9350
1359
  },
1360
  {
1361
  "epoch": 3.3227288794627077,
1362
- "grad_norm": 3.8709867000579834,
1363
- "learning_rate": 1.6800989749027926e-05,
1364
- "loss": 0.4326,
1365
  "step": 9400
1366
  },
1367
  {
1368
  "epoch": 3.3404029692470836,
1369
- "grad_norm": 2.4771230220794678,
1370
- "learning_rate": 1.6624248851184165e-05,
1371
- "loss": 0.4111,
1372
  "step": 9450
1373
  },
1374
  {
1375
  "epoch": 3.35807705903146,
1376
- "grad_norm": 4.154597282409668,
1377
- "learning_rate": 1.6447507953340404e-05,
1378
- "loss": 0.514,
1379
  "step": 9500
1380
  },
1381
  {
1382
  "epoch": 3.375751148815836,
1383
- "grad_norm": 3.861116409301758,
1384
- "learning_rate": 1.6270767055496643e-05,
1385
- "loss": 0.4171,
1386
  "step": 9550
1387
  },
1388
  {
1389
  "epoch": 3.393425238600212,
1390
- "grad_norm": 3.292591094970703,
1391
- "learning_rate": 1.609402615765288e-05,
1392
- "loss": 0.4192,
1393
  "step": 9600
1394
  },
1395
  {
1396
  "epoch": 3.411099328384588,
1397
- "grad_norm": 3.9203121662139893,
1398
- "learning_rate": 1.591728525980912e-05,
1399
- "loss": 0.4036,
1400
  "step": 9650
1401
  },
1402
  {
1403
  "epoch": 3.4287734181689644,
1404
- "grad_norm": 3.337324857711792,
1405
- "learning_rate": 1.574054436196536e-05,
1406
- "loss": 0.3511,
1407
  "step": 9700
1408
  },
1409
  {
1410
  "epoch": 3.4464475079533403,
1411
- "grad_norm": 2.305972099304199,
1412
- "learning_rate": 1.55638034641216e-05,
1413
- "loss": 0.4891,
1414
  "step": 9750
1415
  },
1416
  {
1417
  "epoch": 3.4641215977377167,
1418
- "grad_norm": 4.972841739654541,
1419
- "learning_rate": 1.538706256627784e-05,
1420
- "loss": 0.4172,
1421
  "step": 9800
1422
  },
1423
  {
1424
  "epoch": 3.4817956875220926,
1425
- "grad_norm": 3.111032009124756,
1426
- "learning_rate": 1.5210321668434077e-05,
1427
- "loss": 0.4401,
1428
  "step": 9850
1429
  },
1430
  {
1431
  "epoch": 3.499469777306469,
1432
- "grad_norm": 1.9676620960235596,
1433
  "learning_rate": 1.5033580770590316e-05,
1434
- "loss": 0.4398,
1435
  "step": 9900
1436
  },
1437
  {
1438
  "epoch": 3.517143867090845,
1439
- "grad_norm": 3.4216668605804443,
1440
  "learning_rate": 1.4856839872746553e-05,
1441
- "loss": 0.4015,
1442
  "step": 9950
1443
  },
1444
  {
1445
  "epoch": 3.534817956875221,
1446
- "grad_norm": 3.161693811416626,
1447
  "learning_rate": 1.4680098974902792e-05,
1448
- "loss": 0.4216,
1449
  "step": 10000
1450
  },
1451
  {
1452
  "epoch": 3.552492046659597,
1453
- "grad_norm": 5.103592395782471,
1454
  "learning_rate": 1.4503358077059032e-05,
1455
- "loss": 0.4489,
1456
  "step": 10050
1457
  },
1458
  {
1459
  "epoch": 3.570166136443973,
1460
- "grad_norm": 3.90478777885437,
1461
  "learning_rate": 1.432661717921527e-05,
1462
- "loss": 0.4565,
1463
  "step": 10100
1464
  },
1465
  {
1466
  "epoch": 3.5878402262283493,
1467
- "grad_norm": 2.7845191955566406,
1468
  "learning_rate": 1.414987628137151e-05,
1469
- "loss": 0.4312,
1470
  "step": 10150
1471
  },
1472
  {
1473
  "epoch": 3.605514316012725,
1474
- "grad_norm": 4.3978729248046875,
1475
  "learning_rate": 1.397313538352775e-05,
1476
- "loss": 0.5477,
1477
  "step": 10200
1478
  },
1479
  {
1480
  "epoch": 3.6231884057971016,
1481
- "grad_norm": 3.3587982654571533,
1482
  "learning_rate": 1.3796394485683988e-05,
1483
- "loss": 0.4373,
1484
  "step": 10250
1485
  },
1486
  {
1487
  "epoch": 3.6408624955814775,
1488
- "grad_norm": 2.463456392288208,
1489
  "learning_rate": 1.3619653587840228e-05,
1490
- "loss": 0.4234,
1491
  "step": 10300
1492
  },
1493
  {
1494
  "epoch": 3.658536585365854,
1495
- "grad_norm": 3.532365560531616,
1496
  "learning_rate": 1.3442912689996465e-05,
1497
- "loss": 0.4115,
1498
  "step": 10350
1499
  },
1500
  {
1501
  "epoch": 3.6762106751502297,
1502
- "grad_norm": 3.4025349617004395,
1503
  "learning_rate": 1.3266171792152704e-05,
1504
- "loss": 0.4462,
1505
  "step": 10400
1506
  },
1507
  {
1508
  "epoch": 3.693884764934606,
1509
- "grad_norm": 1.0425785779953003,
1510
  "learning_rate": 1.3089430894308943e-05,
1511
- "loss": 0.3494,
1512
  "step": 10450
1513
  },
1514
  {
1515
  "epoch": 3.711558854718982,
1516
- "grad_norm": 4.738943099975586,
1517
  "learning_rate": 1.2912689996465183e-05,
1518
- "loss": 0.4316,
1519
  "step": 10500
1520
  },
1521
  {
1522
  "epoch": 3.729232944503358,
1523
- "grad_norm": 2.0041253566741943,
1524
  "learning_rate": 1.2735949098621422e-05,
1525
- "loss": 0.4596,
1526
  "step": 10550
1527
  },
1528
  {
1529
  "epoch": 3.746907034287734,
1530
- "grad_norm": 4.681216239929199,
1531
  "learning_rate": 1.2559208200777661e-05,
1532
- "loss": 0.4013,
1533
  "step": 10600
1534
  },
1535
  {
1536
  "epoch": 3.7645811240721105,
1537
- "grad_norm": 4.387250900268555,
1538
  "learning_rate": 1.2382467302933899e-05,
1539
- "loss": 0.387,
1540
  "step": 10650
1541
  },
1542
  {
1543
  "epoch": 3.7822552138564864,
1544
- "grad_norm": 2.4324512481689453,
1545
  "learning_rate": 1.2205726405090138e-05,
1546
- "loss": 0.4383,
1547
  "step": 10700
1548
  },
1549
  {
1550
  "epoch": 3.7999293036408623,
1551
- "grad_norm": 4.895308494567871,
1552
  "learning_rate": 1.2028985507246379e-05,
1553
- "loss": 0.4567,
1554
  "step": 10750
1555
  },
1556
  {
1557
  "epoch": 3.8176033934252387,
1558
- "grad_norm": 3.3893020153045654,
1559
  "learning_rate": 1.1852244609402616e-05,
1560
- "loss": 0.4316,
1561
  "step": 10800
1562
  },
1563
  {
1564
  "epoch": 3.8352774832096146,
1565
- "grad_norm": 2.8720388412475586,
1566
  "learning_rate": 1.1675503711558855e-05,
1567
- "loss": 0.3712,
1568
  "step": 10850
1569
  },
1570
  {
1571
  "epoch": 3.852951572993991,
1572
- "grad_norm": 2.9342293739318848,
1573
  "learning_rate": 1.1498762813715094e-05,
1574
- "loss": 0.3595,
1575
  "step": 10900
1576
  },
1577
  {
1578
  "epoch": 3.870625662778367,
1579
- "grad_norm": 5.391896724700928,
1580
  "learning_rate": 1.1322021915871334e-05,
1581
- "loss": 0.3945,
1582
  "step": 10950
1583
  },
1584
  {
1585
  "epoch": 3.888299752562743,
1586
- "grad_norm": 2.1623027324676514,
1587
  "learning_rate": 1.1145281018027571e-05,
1588
- "loss": 0.3849,
1589
  "step": 11000
1590
  },
1591
  {
1592
  "epoch": 3.905973842347119,
1593
- "grad_norm": 2.6391725540161133,
1594
  "learning_rate": 1.096854012018381e-05,
1595
- "loss": 0.3837,
1596
  "step": 11050
1597
  },
1598
  {
1599
  "epoch": 3.9236479321314954,
1600
- "grad_norm": 2.8502111434936523,
1601
  "learning_rate": 1.079179922234005e-05,
1602
- "loss": 0.4109,
1603
  "step": 11100
1604
  },
1605
  {
1606
  "epoch": 3.9413220219158713,
1607
- "grad_norm": 3.7358663082122803,
1608
  "learning_rate": 1.0615058324496289e-05,
1609
- "loss": 0.4113,
1610
  "step": 11150
1611
  },
1612
  {
1613
  "epoch": 3.9589961117002472,
1614
- "grad_norm": 3.771918535232544,
1615
  "learning_rate": 1.0438317426652528e-05,
1616
- "loss": 0.3664,
1617
  "step": 11200
1618
  },
1619
  {
1620
  "epoch": 3.9766702014846236,
1621
- "grad_norm": 2.2865407466888428,
1622
  "learning_rate": 1.0261576528808767e-05,
1623
- "loss": 0.427,
1624
  "step": 11250
1625
  },
1626
  {
1627
  "epoch": 3.9943442912689995,
1628
- "grad_norm": 3.386460781097412,
1629
  "learning_rate": 1.0084835630965006e-05,
1630
- "loss": 0.4252,
1631
  "step": 11300
1632
  },
1633
  {
1634
  "epoch": 4.0,
1635
- "eval_bertscore_f1": 0.965401310132503,
1636
- "eval_bleu": 0.5825957176218498,
1637
- "eval_loss": 0.29947343468666077,
1638
- "eval_meteor": 0.7356418884759407,
1639
- "eval_rouge1": 0.8377531126193314,
1640
- "eval_rouge2": 0.7761899021152611,
1641
- "eval_runtime": 1380.5465,
1642
- "eval_samples_per_second": 4.682,
1643
- "eval_steps_per_second": 0.585,
1644
  "step": 11316
1645
  },
1646
  {
1647
  "epoch": 4.012018381053376,
1648
- "grad_norm": 4.861545085906982,
1649
  "learning_rate": 9.908094733121245e-06,
1650
- "loss": 0.4298,
1651
  "step": 11350
1652
  },
1653
  {
1654
  "epoch": 4.029692470837752,
1655
- "grad_norm": 6.035098075866699,
1656
  "learning_rate": 9.731353835277483e-06,
1657
- "loss": 0.3755,
1658
  "step": 11400
1659
  },
1660
  {
1661
  "epoch": 4.047366560622128,
1662
- "grad_norm": 2.295048952102661,
1663
  "learning_rate": 9.554612937433722e-06,
1664
- "loss": 0.3772,
1665
  "step": 11450
1666
  },
1667
  {
1668
  "epoch": 4.065040650406504,
1669
- "grad_norm": 3.09201979637146,
1670
  "learning_rate": 9.377872039589961e-06,
1671
- "loss": 0.4315,
1672
  "step": 11500
1673
  },
1674
  {
1675
  "epoch": 4.08271474019088,
1676
- "grad_norm": 7.435740947723389,
1677
  "learning_rate": 9.2011311417462e-06,
1678
- "loss": 0.4276,
1679
  "step": 11550
1680
  },
1681
  {
1682
  "epoch": 4.100388829975256,
1683
- "grad_norm": 2.680793046951294,
1684
  "learning_rate": 9.02439024390244e-06,
1685
- "loss": 0.3664,
1686
  "step": 11600
1687
  },
1688
  {
1689
  "epoch": 4.118062919759632,
1690
- "grad_norm": 2.200497627258301,
1691
  "learning_rate": 8.847649346058679e-06,
1692
- "loss": 0.4054,
1693
  "step": 11650
1694
  },
1695
  {
1696
  "epoch": 4.135737009544009,
1697
- "grad_norm": 3.864414930343628,
1698
  "learning_rate": 8.670908448214918e-06,
1699
- "loss": 0.4256,
1700
  "step": 11700
1701
  },
1702
  {
1703
  "epoch": 4.153411099328385,
1704
- "grad_norm": 4.263733863830566,
1705
  "learning_rate": 8.494167550371156e-06,
1706
- "loss": 0.4066,
1707
  "step": 11750
1708
  },
1709
  {
1710
  "epoch": 4.171085189112761,
1711
- "grad_norm": 4.246395587921143,
1712
  "learning_rate": 8.317426652527395e-06,
1713
- "loss": 0.3858,
1714
  "step": 11800
1715
  },
1716
  {
1717
  "epoch": 4.188759278897137,
1718
- "grad_norm": 4.842310905456543,
1719
  "learning_rate": 8.140685754683634e-06,
1720
- "loss": 0.4026,
1721
  "step": 11850
1722
  },
1723
  {
1724
  "epoch": 4.2064333686815125,
1725
- "grad_norm": 2.5259275436401367,
1726
  "learning_rate": 7.963944856839873e-06,
1727
- "loss": 0.3624,
1728
  "step": 11900
1729
  },
1730
  {
1731
  "epoch": 4.224107458465889,
1732
- "grad_norm": 2.959528923034668,
1733
  "learning_rate": 7.787203958996112e-06,
1734
- "loss": 0.3971,
1735
  "step": 11950
1736
  },
1737
  {
1738
  "epoch": 4.241781548250265,
1739
- "grad_norm": 2.9087865352630615,
1740
  "learning_rate": 7.610463061152351e-06,
1741
- "loss": 0.4095,
1742
  "step": 12000
1743
  },
1744
  {
1745
  "epoch": 4.259455638034641,
1746
- "grad_norm": 3.8024725914001465,
1747
  "learning_rate": 7.43372216330859e-06,
1748
  "loss": 0.3815,
1749
  "step": 12050
1750
  },
1751
  {
1752
  "epoch": 4.277129727819017,
1753
- "grad_norm": 3.2564175128936768,
1754
  "learning_rate": 7.256981265464829e-06,
1755
- "loss": 0.3752,
1756
  "step": 12100
1757
  },
1758
  {
1759
  "epoch": 4.294803817603394,
1760
- "grad_norm": 1.157267689704895,
1761
  "learning_rate": 7.080240367621067e-06,
1762
- "loss": 0.3769,
1763
  "step": 12150
1764
  },
1765
  {
1766
  "epoch": 4.31247790738777,
1767
- "grad_norm": 11.618633270263672,
1768
  "learning_rate": 6.903499469777307e-06,
1769
- "loss": 0.4042,
1770
  "step": 12200
1771
  },
1772
  {
1773
  "epoch": 4.330151997172146,
1774
- "grad_norm": 4.113063812255859,
1775
  "learning_rate": 6.726758571933546e-06,
1776
- "loss": 0.4441,
1777
  "step": 12250
1778
  },
1779
  {
1780
  "epoch": 4.3478260869565215,
1781
- "grad_norm": 4.125561237335205,
1782
  "learning_rate": 6.550017674089785e-06,
1783
- "loss": 0.421,
1784
  "step": 12300
1785
  },
1786
  {
1787
  "epoch": 4.365500176740898,
1788
- "grad_norm": 3.2781832218170166,
1789
  "learning_rate": 6.373276776246023e-06,
1790
- "loss": 0.46,
1791
  "step": 12350
1792
  },
1793
  {
1794
  "epoch": 4.383174266525274,
1795
- "grad_norm": 8.616064071655273,
1796
  "learning_rate": 6.1965358784022625e-06,
1797
- "loss": 0.3905,
1798
  "step": 12400
1799
  },
1800
  {
1801
  "epoch": 4.40084835630965,
1802
- "grad_norm": 3.173698902130127,
1803
  "learning_rate": 6.019794980558501e-06,
1804
- "loss": 0.3529,
1805
  "step": 12450
1806
  },
1807
  {
1808
  "epoch": 4.418522446094026,
1809
- "grad_norm": 2.7585160732269287,
1810
  "learning_rate": 5.843054082714741e-06,
1811
- "loss": 0.4236,
1812
  "step": 12500
1813
  },
1814
  {
1815
  "epoch": 4.436196535878402,
1816
- "grad_norm": 1.5063729286193848,
1817
  "learning_rate": 5.666313184870979e-06,
1818
- "loss": 0.4182,
1819
  "step": 12550
1820
  },
1821
  {
1822
  "epoch": 4.453870625662779,
1823
- "grad_norm": 5.755438327789307,
1824
  "learning_rate": 5.4895722870272184e-06,
1825
- "loss": 0.3909,
1826
  "step": 12600
1827
  },
1828
  {
1829
  "epoch": 4.471544715447155,
1830
- "grad_norm": 2.465946674346924,
1831
  "learning_rate": 5.312831389183457e-06,
1832
- "loss": 0.4078,
1833
  "step": 12650
1834
  },
1835
  {
1836
  "epoch": 4.4892188052315305,
1837
- "grad_norm": 2.642314910888672,
1838
  "learning_rate": 5.136090491339696e-06,
1839
- "loss": 0.4581,
1840
  "step": 12700
1841
  },
1842
  {
1843
  "epoch": 4.506892895015906,
1844
- "grad_norm": 3.11537766456604,
1845
  "learning_rate": 4.959349593495935e-06,
1846
- "loss": 0.3574,
1847
  "step": 12750
1848
  },
1849
  {
1850
  "epoch": 4.524566984800282,
1851
- "grad_norm": 5.104282855987549,
1852
  "learning_rate": 4.782608695652174e-06,
1853
- "loss": 0.3889,
1854
  "step": 12800
1855
  },
1856
  {
1857
  "epoch": 4.542241074584659,
1858
- "grad_norm": 3.2097325325012207,
1859
  "learning_rate": 4.605867797808413e-06,
1860
- "loss": 0.3723,
1861
  "step": 12850
1862
  },
1863
  {
1864
  "epoch": 4.559915164369035,
1865
- "grad_norm": 2.8303864002227783,
1866
  "learning_rate": 4.429126899964652e-06,
1867
- "loss": 0.4222,
1868
  "step": 12900
1869
  },
1870
  {
1871
  "epoch": 4.577589254153411,
1872
- "grad_norm": 3.508904457092285,
1873
  "learning_rate": 4.252386002120891e-06,
1874
- "loss": 0.4109,
1875
  "step": 12950
1876
  },
1877
  {
1878
  "epoch": 4.595263343937788,
1879
- "grad_norm": 3.8901443481445312,
1880
  "learning_rate": 4.07564510427713e-06,
1881
- "loss": 0.4248,
1882
  "step": 13000
1883
  },
1884
  {
1885
  "epoch": 4.612937433722164,
1886
- "grad_norm": 4.3722920417785645,
1887
  "learning_rate": 3.898904206433369e-06,
1888
- "loss": 0.4139,
1889
  "step": 13050
1890
  },
1891
  {
1892
  "epoch": 4.6306115235065395,
1893
- "grad_norm": 3.87107515335083,
1894
  "learning_rate": 3.722163308589608e-06,
1895
- "loss": 0.3862,
1896
  "step": 13100
1897
  },
1898
  {
1899
  "epoch": 4.648285613290915,
1900
- "grad_norm": 4.208980560302734,
1901
  "learning_rate": 3.5454224107458466e-06,
1902
- "loss": 0.3971,
1903
  "step": 13150
1904
  },
1905
  {
1906
  "epoch": 4.665959703075291,
1907
- "grad_norm": 3.0796680450439453,
1908
  "learning_rate": 3.368681512902086e-06,
1909
- "loss": 0.4466,
1910
  "step": 13200
1911
  },
1912
  {
1913
  "epoch": 4.683633792859668,
1914
- "grad_norm": 3.2411413192749023,
1915
  "learning_rate": 3.1919406150583245e-06,
1916
- "loss": 0.3462,
1917
  "step": 13250
1918
  },
1919
  {
1920
  "epoch": 4.701307882644044,
1921
- "grad_norm": 3.422546625137329,
1922
  "learning_rate": 3.0151997172145637e-06,
1923
- "loss": 0.4942,
1924
  "step": 13300
1925
  },
1926
  {
1927
  "epoch": 4.71898197242842,
1928
- "grad_norm": 5.258462905883789,
1929
  "learning_rate": 2.8384588193708025e-06,
1930
- "loss": 0.3842,
1931
  "step": 13350
1932
  },
1933
  {
1934
  "epoch": 4.736656062212796,
1935
- "grad_norm": 3.634772300720215,
1936
  "learning_rate": 2.6617179215270417e-06,
1937
- "loss": 0.386,
1938
  "step": 13400
1939
  },
1940
  {
1941
  "epoch": 4.754330151997172,
1942
- "grad_norm": 2.493283987045288,
1943
  "learning_rate": 2.4849770236832804e-06,
1944
- "loss": 0.3646,
1945
  "step": 13450
1946
  },
1947
  {
1948
  "epoch": 4.7720042417815485,
1949
- "grad_norm": 3.546058416366577,
1950
  "learning_rate": 2.3082361258395196e-06,
1951
- "loss": 0.4085,
1952
  "step": 13500
1953
  },
1954
  {
1955
  "epoch": 4.789678331565924,
1956
- "grad_norm": 2.0962002277374268,
1957
  "learning_rate": 2.1314952279957584e-06,
1958
- "loss": 0.3869,
1959
  "step": 13550
1960
  },
1961
  {
1962
  "epoch": 4.8073524213503,
1963
- "grad_norm": 2.2293384075164795,
1964
  "learning_rate": 1.9547543301519976e-06,
1965
- "loss": 0.4841,
1966
  "step": 13600
1967
  },
1968
  {
1969
  "epoch": 4.825026511134676,
1970
- "grad_norm": 3.2926249504089355,
1971
  "learning_rate": 1.7780134323082363e-06,
1972
- "loss": 0.4599,
1973
  "step": 13650
1974
  },
1975
  {
1976
  "epoch": 4.842700600919053,
1977
- "grad_norm": 5.047961235046387,
1978
- "learning_rate": 1.6012725344644753e-06,
1979
- "loss": 0.3796,
1980
  "step": 13700
1981
  },
1982
  {
1983
  "epoch": 4.860374690703429,
1984
- "grad_norm": 3.179448366165161,
1985
- "learning_rate": 1.424531636620714e-06,
1986
- "loss": 0.3898,
1987
  "step": 13750
1988
  },
1989
  {
1990
  "epoch": 4.878048780487805,
1991
- "grad_norm": 5.14663028717041,
1992
- "learning_rate": 1.247790738776953e-06,
1993
- "loss": 0.383,
1994
  "step": 13800
1995
  },
1996
  {
1997
  "epoch": 4.895722870272181,
1998
- "grad_norm": 2.7722623348236084,
1999
- "learning_rate": 1.071049840933192e-06,
2000
- "loss": 0.3923,
2001
  "step": 13850
2002
  },
2003
  {
2004
  "epoch": 4.9133969600565575,
2005
- "grad_norm": 4.3328447341918945,
2006
- "learning_rate": 8.94308943089431e-07,
2007
- "loss": 0.3859,
2008
  "step": 13900
2009
  },
2010
  {
2011
  "epoch": 4.931071049840933,
2012
- "grad_norm": 3.5014865398406982,
2013
- "learning_rate": 7.175680452456699e-07,
2014
- "loss": 0.3909,
2015
  "step": 13950
2016
  },
2017
  {
2018
  "epoch": 4.948745139625309,
2019
- "grad_norm": 4.449154376983643,
2020
- "learning_rate": 5.408271474019089e-07,
2021
- "loss": 0.4711,
2022
  "step": 14000
2023
  },
2024
  {
2025
  "epoch": 4.966419229409685,
2026
- "grad_norm": 2.2578201293945312,
2027
- "learning_rate": 3.640862495581478e-07,
2028
- "loss": 0.3719,
2029
  "step": 14050
2030
  },
2031
  {
2032
  "epoch": 4.984093319194061,
2033
- "grad_norm": 1.688942313194275,
2034
- "learning_rate": 1.8734535171438673e-07,
2035
- "loss": 0.4053,
2036
  "step": 14100
2037
  },
2038
  {
2039
  "epoch": 5.0,
2040
- "eval_bertscore_f1": 0.9658045416033947,
2041
- "eval_bleu": 0.5865134487850142,
2042
- "eval_loss": 0.2939385771751404,
2043
- "eval_meteor": 0.738667698887171,
2044
- "eval_rouge1": 0.8397011041728719,
2045
- "eval_rouge2": 0.7793367916496452,
2046
- "eval_runtime": 1419.0322,
2047
- "eval_samples_per_second": 4.555,
2048
- "eval_steps_per_second": 0.569,
2049
  "step": 14145
2050
  }
2051
  ],
 
1
  {
2
  "best_global_step": 14145,
3
+ "best_metric": 0.9660587414250811,
4
+ "best_model_checkpoint": "/kaggle/working/codet5-k8s-qlora/checkpoint-14145",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 14145,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.017674089784376106,
14
+ "grad_norm": 2.658311605453491,
15
+ "learning_rate": 4.983032873806999e-05,
16
+ "loss": 4.0893,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.03534817956875221,
21
+ "grad_norm": 6.100900173187256,
22
+ "learning_rate": 4.9657122658183106e-05,
23
+ "loss": 3.214,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.053022269353128315,
28
+ "grad_norm": 2.4247324466705322,
29
  "learning_rate": 4.948391657829622e-05,
30
+ "loss": 2.2694,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.07069635913750442,
35
+ "grad_norm": 2.4390416145324707,
36
  "learning_rate": 4.930717568045246e-05,
37
+ "loss": 1.9621,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.08837044892188052,
42
+ "grad_norm": 3.003971576690674,
43
  "learning_rate": 4.91304347826087e-05,
44
+ "loss": 1.8377,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.10604453870625663,
49
+ "grad_norm": 2.6893651485443115,
50
  "learning_rate": 4.895369388476494e-05,
51
+ "loss": 1.7639,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.12371862849063273,
56
+ "grad_norm": 2.8361988067626953,
57
  "learning_rate": 4.8776952986921177e-05,
58
+ "loss": 1.6632,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.14139271827500885,
63
+ "grad_norm": 2.17179012298584,
64
  "learning_rate": 4.8600212089077416e-05,
65
+ "loss": 1.5525,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.15906680805938495,
70
+ "grad_norm": 4.485565185546875,
71
+ "learning_rate": 4.842700600919053e-05,
72
+ "loss": 1.4975,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.17674089784376104,
77
+ "grad_norm": 3.197230577468872,
78
  "learning_rate": 4.825026511134676e-05,
79
+ "loss": 1.4888,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.19441498762813716,
84
+ "grad_norm": 2.8129756450653076,
85
  "learning_rate": 4.807352421350301e-05,
86
+ "loss": 1.32,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.21208907741251326,
91
+ "grad_norm": 2.888892650604248,
92
  "learning_rate": 4.789678331565924e-05,
93
+ "loss": 1.4137,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.22976316719688936,
98
+ "grad_norm": 3.6058623790740967,
99
  "learning_rate": 4.7720042417815487e-05,
100
+ "loss": 1.3793,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.24743725698126545,
105
+ "grad_norm": 3.077688217163086,
106
  "learning_rate": 4.7543301519971726e-05,
107
+ "loss": 1.2157,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.2651113467656416,
112
+ "grad_norm": 4.21675443649292,
113
  "learning_rate": 4.7366560622127965e-05,
114
+ "loss": 1.3435,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.2827854365500177,
119
+ "grad_norm": 3.459958076477051,
120
  "learning_rate": 4.7189819724284204e-05,
121
+ "loss": 1.2747,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 0.30045952633439377,
126
+ "grad_norm": 3.2092440128326416,
127
  "learning_rate": 4.701307882644044e-05,
128
+ "loss": 1.1624,
129
  "step": 850
130
  },
131
  {
132
  "epoch": 0.3181336161187699,
133
+ "grad_norm": 3.1231963634490967,
134
  "learning_rate": 4.683633792859668e-05,
135
+ "loss": 1.1956,
136
  "step": 900
137
  },
138
  {
139
  "epoch": 0.335807705903146,
140
+ "grad_norm": 3.332000970840454,
141
  "learning_rate": 4.6659597030752915e-05,
142
+ "loss": 1.118,
143
  "step": 950
144
  },
145
  {
146
  "epoch": 0.3534817956875221,
147
+ "grad_norm": 2.992741823196411,
148
  "learning_rate": 4.648285613290916e-05,
149
+ "loss": 1.1513,
150
  "step": 1000
151
  },
152
  {
153
  "epoch": 0.3711558854718982,
154
+ "grad_norm": 2.8758022785186768,
155
  "learning_rate": 4.630611523506539e-05,
156
+ "loss": 1.0998,
157
  "step": 1050
158
  },
159
  {
160
  "epoch": 0.38882997525627433,
161
+ "grad_norm": 3.870368480682373,
162
  "learning_rate": 4.612937433722164e-05,
163
+ "loss": 1.0723,
164
  "step": 1100
165
  },
166
  {
167
  "epoch": 0.4065040650406504,
168
+ "grad_norm": 4.177937030792236,
169
  "learning_rate": 4.595263343937787e-05,
170
+ "loss": 1.0612,
171
  "step": 1150
172
  },
173
  {
174
  "epoch": 0.4241781548250265,
175
+ "grad_norm": 2.760124921798706,
176
  "learning_rate": 4.577589254153412e-05,
177
+ "loss": 1.0086,
178
  "step": 1200
179
  },
180
  {
181
  "epoch": 0.4418522446094026,
182
+ "grad_norm": 3.0196070671081543,
183
  "learning_rate": 4.559915164369035e-05,
184
+ "loss": 1.06,
185
  "step": 1250
186
  },
187
  {
188
  "epoch": 0.4595263343937787,
189
+ "grad_norm": 2.649152994155884,
190
  "learning_rate": 4.542241074584659e-05,
191
+ "loss": 0.9997,
192
  "step": 1300
193
  },
194
  {
195
  "epoch": 0.47720042417815484,
196
+ "grad_norm": 3.8896467685699463,
197
  "learning_rate": 4.524566984800283e-05,
198
+ "loss": 1.0067,
199
  "step": 1350
200
  },
201
  {
202
  "epoch": 0.4948745139625309,
203
+ "grad_norm": 3.186890125274658,
204
  "learning_rate": 4.506892895015907e-05,
205
+ "loss": 0.9501,
206
  "step": 1400
207
  },
208
  {
209
  "epoch": 0.512548603746907,
210
+ "grad_norm": 3.991669178009033,
211
  "learning_rate": 4.489218805231531e-05,
212
+ "loss": 1.0347,
213
  "step": 1450
214
  },
215
  {
216
  "epoch": 0.5302226935312832,
217
+ "grad_norm": 11.242384910583496,
218
  "learning_rate": 4.4715447154471546e-05,
219
+ "loss": 0.9635,
220
  "step": 1500
221
  },
222
  {
223
  "epoch": 0.5478967833156593,
224
+ "grad_norm": 2.5245680809020996,
225
+ "learning_rate": 4.4538706256627785e-05,
226
+ "loss": 0.9248,
227
  "step": 1550
228
  },
229
  {
230
  "epoch": 0.5655708731000354,
231
+ "grad_norm": 4.0713114738464355,
232
+ "learning_rate": 4.4361965358784024e-05,
233
+ "loss": 0.906,
234
  "step": 1600
235
  },
236
  {
237
  "epoch": 0.5832449628844114,
238
+ "grad_norm": 3.434156656265259,
239
+ "learning_rate": 4.4185224460940264e-05,
240
+ "loss": 0.9438,
241
  "step": 1650
242
  },
243
  {
244
  "epoch": 0.6009190526687875,
245
+ "grad_norm": 3.6341230869293213,
246
+ "learning_rate": 4.40084835630965e-05,
247
+ "loss": 0.8156,
248
  "step": 1700
249
  },
250
  {
251
  "epoch": 0.6185931424531637,
252
+ "grad_norm": 4.359820365905762,
253
+ "learning_rate": 4.383174266525274e-05,
254
+ "loss": 0.9052,
255
  "step": 1750
256
  },
257
  {
258
  "epoch": 0.6362672322375398,
259
+ "grad_norm": 3.804647445678711,
260
+ "learning_rate": 4.365500176740898e-05,
261
+ "loss": 0.8758,
262
  "step": 1800
263
  },
264
  {
265
  "epoch": 0.6539413220219159,
266
+ "grad_norm": 21.193464279174805,
267
+ "learning_rate": 4.347826086956522e-05,
268
+ "loss": 0.8776,
269
  "step": 1850
270
  },
271
  {
272
  "epoch": 0.671615411806292,
273
+ "grad_norm": 3.002357244491577,
274
+ "learning_rate": 4.330151997172146e-05,
275
+ "loss": 0.8658,
276
  "step": 1900
277
  },
278
  {
279
  "epoch": 0.689289501590668,
280
+ "grad_norm": 4.116176605224609,
281
+ "learning_rate": 4.31247790738777e-05,
282
+ "loss": 0.8419,
283
  "step": 1950
284
  },
285
  {
286
  "epoch": 0.7069635913750442,
287
+ "grad_norm": 6.561131954193115,
288
+ "learning_rate": 4.294803817603394e-05,
289
+ "loss": 0.8204,
290
  "step": 2000
291
  },
292
  {
293
  "epoch": 0.7246376811594203,
294
+ "grad_norm": 3.203460931777954,
295
+ "learning_rate": 4.277129727819018e-05,
296
+ "loss": 0.7557,
297
  "step": 2050
298
  },
299
  {
300
  "epoch": 0.7423117709437964,
301
+ "grad_norm": 4.0467705726623535,
302
+ "learning_rate": 4.259455638034641e-05,
303
+ "loss": 0.8053,
304
  "step": 2100
305
  },
306
  {
307
  "epoch": 0.7599858607281725,
308
+ "grad_norm": 3.575634002685547,
309
  "learning_rate": 4.242135030045953e-05,
310
+ "loss": 0.8128,
311
  "step": 2150
312
  },
313
  {
314
  "epoch": 0.7776599505125487,
315
+ "grad_norm": 5.7353363037109375,
316
  "learning_rate": 4.224460940261576e-05,
317
+ "loss": 0.8339,
318
  "step": 2200
319
  },
320
  {
321
  "epoch": 0.7953340402969247,
322
+ "grad_norm": 4.916664123535156,
323
  "learning_rate": 4.206786850477201e-05,
324
+ "loss": 0.7758,
325
  "step": 2250
326
  },
327
  {
328
  "epoch": 0.8130081300813008,
329
+ "grad_norm": 4.233948230743408,
330
  "learning_rate": 4.189112760692824e-05,
331
+ "loss": 0.7485,
332
  "step": 2300
333
  },
334
  {
335
  "epoch": 0.8306822198656769,
336
+ "grad_norm": 3.754826545715332,
337
  "learning_rate": 4.171438670908449e-05,
338
+ "loss": 0.6988,
339
  "step": 2350
340
  },
341
  {
342
  "epoch": 0.848356309650053,
343
+ "grad_norm": 3.185098171234131,
344
  "learning_rate": 4.153764581124072e-05,
345
+ "loss": 0.7067,
346
  "step": 2400
347
  },
348
  {
349
  "epoch": 0.8660303994344292,
350
+ "grad_norm": 3.515683650970459,
351
  "learning_rate": 4.1360904913396966e-05,
352
+ "loss": 0.7559,
353
  "step": 2450
354
  },
355
  {
356
  "epoch": 0.8837044892188052,
357
+ "grad_norm": 4.783038139343262,
358
  "learning_rate": 4.11841640155532e-05,
359
+ "loss": 0.7444,
360
  "step": 2500
361
  },
362
  {
363
  "epoch": 0.9013785790031813,
364
+ "grad_norm": 3.4242937564849854,
365
  "learning_rate": 4.1007423117709444e-05,
366
+ "loss": 0.7676,
367
  "step": 2550
368
  },
369
  {
370
  "epoch": 0.9190526687875574,
371
+ "grad_norm": 3.3563663959503174,
372
  "learning_rate": 4.0830682219865676e-05,
373
+ "loss": 0.7416,
374
  "step": 2600
375
  },
376
  {
377
  "epoch": 0.9367267585719335,
378
+ "grad_norm": 21.883926391601562,
379
  "learning_rate": 4.0653941322021916e-05,
380
+ "loss": 0.6892,
381
  "step": 2650
382
  },
383
  {
384
  "epoch": 0.9544008483563097,
385
+ "grad_norm": 3.8259048461914062,
386
  "learning_rate": 4.0477200424178155e-05,
387
+ "loss": 0.7489,
388
  "step": 2700
389
  },
390
  {
391
  "epoch": 0.9720749381406858,
392
+ "grad_norm": 3.026655912399292,
393
  "learning_rate": 4.0300459526334394e-05,
394
+ "loss": 0.6679,
395
  "step": 2750
396
  },
397
  {
398
  "epoch": 0.9897490279250618,
399
+ "grad_norm": 7.62285041809082,
400
  "learning_rate": 4.012371862849063e-05,
401
+ "loss": 0.7393,
402
  "step": 2800
403
  },
404
  {
405
  "epoch": 1.0,
406
+ "eval_bertscore_f1": 0.9551081777928342,
407
+ "eval_bleu": 0.47406093922979725,
408
+ "eval_loss": 0.5141507983207703,
409
+ "eval_meteor": 0.6443492142009581,
410
+ "eval_rouge1": 0.7935683439864762,
411
+ "eval_rouge2": 0.6796198647957756,
412
+ "eval_runtime": 1335.9702,
413
+ "eval_samples_per_second": 4.838,
414
+ "eval_steps_per_second": 0.605,
415
  "step": 2829
416
  },
417
  {
418
  "epoch": 1.007423117709438,
419
+ "grad_norm": 3.7401936054229736,
420
  "learning_rate": 3.994697773064687e-05,
421
+ "loss": 0.7272,
422
  "step": 2850
423
  },
424
  {
425
  "epoch": 1.025097207493814,
426
+ "grad_norm": 4.575202941894531,
427
  "learning_rate": 3.977023683280312e-05,
428
+ "loss": 0.6891,
429
  "step": 2900
430
  },
431
  {
432
  "epoch": 1.0427712972781902,
433
+ "grad_norm": 2.909268379211426,
434
  "learning_rate": 3.959349593495935e-05,
435
+ "loss": 0.6751,
436
  "step": 2950
437
  },
438
  {
439
  "epoch": 1.0604453870625663,
440
+ "grad_norm": 5.258713722229004,
441
  "learning_rate": 3.941675503711559e-05,
442
+ "loss": 0.7308,
443
  "step": 3000
444
  },
445
  {
446
  "epoch": 1.0781194768469424,
447
+ "grad_norm": 4.8982462882995605,
448
  "learning_rate": 3.924001413927183e-05,
449
+ "loss": 0.5938,
450
  "step": 3050
451
  },
452
  {
453
  "epoch": 1.0957935666313185,
454
+ "grad_norm": 3.7401649951934814,
455
  "learning_rate": 3.906327324142807e-05,
456
+ "loss": 0.7358,
457
  "step": 3100
458
  },
459
  {
460
  "epoch": 1.1134676564156947,
461
+ "grad_norm": 2.2274134159088135,
462
  "learning_rate": 3.888653234358431e-05,
463
+ "loss": 0.6251,
464
  "step": 3150
465
  },
466
  {
467
  "epoch": 1.1311417462000706,
468
+ "grad_norm": 4.285720348358154,
469
  "learning_rate": 3.870979144574055e-05,
470
+ "loss": 0.6773,
471
  "step": 3200
472
  },
473
  {
474
  "epoch": 1.148815835984447,
475
+ "grad_norm": 3.1202948093414307,
476
  "learning_rate": 3.8533050547896786e-05,
477
+ "loss": 0.6485,
478
  "step": 3250
479
  },
480
  {
481
  "epoch": 1.1664899257688228,
482
+ "grad_norm": 2.96162486076355,
483
  "learning_rate": 3.8356309650053025e-05,
484
+ "loss": 0.6733,
485
  "step": 3300
486
  },
487
  {
488
  "epoch": 1.184164015553199,
489
+ "grad_norm": 6.456724166870117,
490
  "learning_rate": 3.817956875220926e-05,
491
+ "loss": 0.6142,
492
  "step": 3350
493
  },
494
  {
495
  "epoch": 1.201838105337575,
496
+ "grad_norm": 5.0712690353393555,
497
  "learning_rate": 3.8002827854365503e-05,
498
+ "loss": 0.6952,
499
  "step": 3400
500
  },
501
  {
502
  "epoch": 1.2195121951219512,
503
+ "grad_norm": 5.074472904205322,
504
  "learning_rate": 3.7826086956521736e-05,
505
+ "loss": 0.6147,
506
  "step": 3450
507
  },
508
  {
509
  "epoch": 1.2371862849063273,
510
+ "grad_norm": 4.572699546813965,
511
  "learning_rate": 3.764934605867798e-05,
512
+ "loss": 0.6172,
513
  "step": 3500
514
  },
515
  {
516
  "epoch": 1.2548603746907034,
517
+ "grad_norm": 3.24722957611084,
518
  "learning_rate": 3.747260516083422e-05,
519
+ "loss": 0.6657,
520
  "step": 3550
521
  },
522
  {
523
  "epoch": 1.2725344644750796,
524
+ "grad_norm": 3.6657183170318604,
525
  "learning_rate": 3.729586426299046e-05,
526
+ "loss": 0.6999,
527
  "step": 3600
528
  },
529
  {
530
  "epoch": 1.2902085542594557,
531
+ "grad_norm": 3.2770209312438965,
532
  "learning_rate": 3.71191233651467e-05,
533
+ "loss": 0.6882,
534
  "step": 3650
535
  },
536
  {
537
  "epoch": 1.3078826440438318,
538
+ "grad_norm": 4.611114501953125,
539
  "learning_rate": 3.694238246730294e-05,
540
+ "loss": 0.6767,
541
  "step": 3700
542
  },
543
  {
544
  "epoch": 1.3255567338282077,
545
+ "grad_norm": 3.4801883697509766,
546
  "learning_rate": 3.676564156945918e-05,
547
+ "loss": 0.6503,
548
  "step": 3750
549
  },
550
  {
551
  "epoch": 1.343230823612584,
552
+ "grad_norm": 4.582475185394287,
553
  "learning_rate": 3.658890067161541e-05,
554
+ "loss": 0.5833,
555
  "step": 3800
556
  },
557
  {
558
  "epoch": 1.36090491339696,
559
+ "grad_norm": 3.0982961654663086,
560
  "learning_rate": 3.6412159773771656e-05,
561
+ "loss": 0.6271,
562
  "step": 3850
563
  },
564
  {
565
  "epoch": 1.378579003181336,
566
+ "grad_norm": 3.592360734939575,
567
  "learning_rate": 3.623541887592789e-05,
568
+ "loss": 0.6688,
569
  "step": 3900
570
  },
571
  {
572
  "epoch": 1.3962530929657122,
573
+ "grad_norm": 4.296905994415283,
574
  "learning_rate": 3.6058677978084134e-05,
575
+ "loss": 0.5931,
576
  "step": 3950
577
  },
578
  {
579
  "epoch": 1.4139271827500883,
580
+ "grad_norm": 3.616574764251709,
581
  "learning_rate": 3.588193708024037e-05,
582
+ "loss": 0.6297,
583
  "step": 4000
584
  },
585
  {
586
  "epoch": 1.4316012725344645,
587
+ "grad_norm": 3.1819770336151123,
588
  "learning_rate": 3.570519618239661e-05,
589
+ "loss": 0.5801,
590
  "step": 4050
591
  },
592
  {
593
  "epoch": 1.4492753623188406,
594
+ "grad_norm": 3.5812184810638428,
595
  "learning_rate": 3.5528455284552845e-05,
596
+ "loss": 0.5826,
597
  "step": 4100
598
  },
599
  {
600
  "epoch": 1.4669494521032167,
601
+ "grad_norm": 2.889911651611328,
602
  "learning_rate": 3.5351714386709084e-05,
603
+ "loss": 0.5396,
604
  "step": 4150
605
  },
606
  {
607
  "epoch": 1.4846235418875928,
608
+ "grad_norm": 3.532849073410034,
609
  "learning_rate": 3.5174973488865324e-05,
610
+ "loss": 0.5218,
611
  "step": 4200
612
  },
613
  {
614
  "epoch": 1.502297631671969,
615
+ "grad_norm": 2.939161777496338,
616
  "learning_rate": 3.499823259102156e-05,
617
+ "loss": 0.5701,
618
  "step": 4250
619
  },
620
  {
621
  "epoch": 1.5199717214563448,
622
+ "grad_norm": 3.500262975692749,
623
  "learning_rate": 3.48214916931778e-05,
624
+ "loss": 0.5117,
625
  "step": 4300
626
  },
627
  {
628
  "epoch": 1.5376458112407212,
629
+ "grad_norm": 3.612431526184082,
630
  "learning_rate": 3.464475079533404e-05,
631
+ "loss": 0.5067,
632
  "step": 4350
633
  },
634
  {
635
  "epoch": 1.555319901025097,
636
+ "grad_norm": 3.3735318183898926,
637
  "learning_rate": 3.446800989749028e-05,
638
+ "loss": 0.5403,
639
  "step": 4400
640
  },
641
  {
642
  "epoch": 1.5729939908094734,
643
+ "grad_norm": 28.255231857299805,
644
  "learning_rate": 3.429126899964652e-05,
645
+ "loss": 0.5442,
646
  "step": 4450
647
  },
648
  {
649
  "epoch": 1.5906680805938493,
650
+ "grad_norm": 4.424487113952637,
651
  "learning_rate": 3.411452810180276e-05,
652
+ "loss": 0.5769,
653
  "step": 4500
654
  },
655
  {
656
  "epoch": 1.6083421703782255,
657
+ "grad_norm": 4.6517109870910645,
658
  "learning_rate": 3.3937787203959e-05,
659
+ "loss": 0.5291,
660
  "step": 4550
661
  },
662
  {
663
  "epoch": 1.6260162601626016,
664
+ "grad_norm": 4.276078701019287,
665
  "learning_rate": 3.376104630611524e-05,
666
+ "loss": 0.6207,
667
  "step": 4600
668
  },
669
  {
670
  "epoch": 1.6436903499469777,
671
+ "grad_norm": 3.1325790882110596,
672
  "learning_rate": 3.3584305408271476e-05,
673
+ "loss": 0.5807,
674
  "step": 4650
675
  },
676
  {
677
  "epoch": 1.6613644397313538,
678
+ "grad_norm": 3.2780227661132812,
679
  "learning_rate": 3.3407564510427716e-05,
680
+ "loss": 0.5487,
681
  "step": 4700
682
  },
683
  {
684
  "epoch": 1.67903852951573,
685
+ "grad_norm": 3.9542007446289062,
686
  "learning_rate": 3.3230823612583955e-05,
687
+ "loss": 0.6385,
688
  "step": 4750
689
  },
690
  {
691
  "epoch": 1.696712619300106,
692
+ "grad_norm": 4.091352462768555,
693
  "learning_rate": 3.3054082714740194e-05,
694
+ "loss": 0.5845,
695
  "step": 4800
696
  },
697
  {
698
  "epoch": 1.714386709084482,
699
+ "grad_norm": 2.3576905727386475,
700
  "learning_rate": 3.2877341816896426e-05,
701
+ "loss": 0.4949,
702
  "step": 4850
703
  },
704
  {
705
  "epoch": 1.7320607988688583,
706
+ "grad_norm": 3.200242519378662,
707
  "learning_rate": 3.270060091905267e-05,
708
+ "loss": 0.5922,
709
  "step": 4900
710
  },
711
  {
712
  "epoch": 1.7497348886532342,
713
+ "grad_norm": 3.1346006393432617,
714
  "learning_rate": 3.2523860021208905e-05,
715
+ "loss": 0.5259,
716
  "step": 4950
717
  },
718
  {
719
  "epoch": 1.7674089784376106,
720
+ "grad_norm": 3.4066524505615234,
721
  "learning_rate": 3.234711912336515e-05,
722
+ "loss": 0.562,
723
  "step": 5000
724
  },
725
  {
726
  "epoch": 1.7850830682219865,
727
+ "grad_norm": 5.18930196762085,
728
  "learning_rate": 3.217037822552138e-05,
729
+ "loss": 0.5825,
730
  "step": 5050
731
  },
732
  {
733
  "epoch": 1.8027571580063628,
734
+ "grad_norm": 4.159862995147705,
735
  "learning_rate": 3.199363732767763e-05,
736
+ "loss": 0.5616,
737
  "step": 5100
738
  },
739
  {
740
  "epoch": 1.8204312477907387,
741
+ "grad_norm": 4.439573287963867,
742
  "learning_rate": 3.181689642983386e-05,
743
+ "loss": 0.5334,
744
  "step": 5150
745
  },
746
  {
747
  "epoch": 1.8381053375751149,
748
+ "grad_norm": 6.196533203125,
749
  "learning_rate": 3.164015553199011e-05,
750
+ "loss": 0.5887,
751
  "step": 5200
752
  },
753
  {
754
  "epoch": 1.855779427359491,
755
+ "grad_norm": 3.715372323989868,
756
  "learning_rate": 3.146341463414634e-05,
757
+ "loss": 0.5379,
758
  "step": 5250
759
  },
760
  {
761
  "epoch": 1.873453517143867,
762
+ "grad_norm": 4.34264612197876,
763
  "learning_rate": 3.128667373630258e-05,
764
+ "loss": 0.4827,
765
  "step": 5300
766
  },
767
  {
768
  "epoch": 1.8911276069282432,
769
+ "grad_norm": 2.337557315826416,
770
  "learning_rate": 3.1109932838458825e-05,
771
+ "loss": 0.4685,
772
  "step": 5350
773
  },
774
  {
775
  "epoch": 1.9088016967126193,
776
+ "grad_norm": 3.325277805328369,
777
  "learning_rate": 3.093319194061506e-05,
778
+ "loss": 0.4983,
779
  "step": 5400
780
  },
781
  {
782
  "epoch": 1.9264757864969955,
783
+ "grad_norm": 2.976592540740967,
784
  "learning_rate": 3.0756451042771303e-05,
785
+ "loss": 0.5814,
786
  "step": 5450
787
  },
788
  {
789
  "epoch": 1.9441498762813714,
790
+ "grad_norm": 9.608305931091309,
791
  "learning_rate": 3.0579710144927536e-05,
792
+ "loss": 0.5062,
793
  "step": 5500
794
  },
795
  {
796
  "epoch": 1.9618239660657477,
797
+ "grad_norm": 3.443791151046753,
798
  "learning_rate": 3.040296924708378e-05,
799
+ "loss": 0.5092,
800
  "step": 5550
801
  },
802
  {
803
  "epoch": 1.9794980558501236,
804
+ "grad_norm": 3.4817845821380615,
805
  "learning_rate": 3.0226228349240014e-05,
806
+ "loss": 0.5584,
807
  "step": 5600
808
  },
809
  {
810
  "epoch": 1.9971721456345,
811
+ "grad_norm": 3.2107975482940674,
812
  "learning_rate": 3.0049487451396253e-05,
813
+ "loss": 0.506,
814
  "step": 5650
815
  },
816
  {
817
  "epoch": 2.0,
818
+ "eval_bertscore_f1": 0.9621683897930059,
819
+ "eval_bleu": 0.5494076455991572,
820
+ "eval_loss": 0.37199869751930237,
821
+ "eval_meteor": 0.7077566730507359,
822
+ "eval_rouge1": 0.8237541199852757,
823
+ "eval_rouge2": 0.7474098813874757,
824
+ "eval_runtime": 1281.6235,
825
+ "eval_samples_per_second": 5.044,
826
+ "eval_steps_per_second": 0.63,
827
  "step": 5658
828
  },
829
  {
830
  "epoch": 2.014846235418876,
831
+ "grad_norm": 3.7251229286193848,
832
  "learning_rate": 2.9872746553552493e-05,
833
+ "loss": 0.4928,
834
  "step": 5700
835
  },
836
  {
837
  "epoch": 2.032520325203252,
838
+ "grad_norm": 3.801664113998413,
839
  "learning_rate": 2.9696005655708732e-05,
840
+ "loss": 0.5748,
841
  "step": 5750
842
  },
843
  {
844
  "epoch": 2.050194414987628,
845
+ "grad_norm": 5.817806243896484,
846
  "learning_rate": 2.9519264757864974e-05,
847
+ "loss": 0.4844,
848
  "step": 5800
849
  },
850
  {
851
  "epoch": 2.0678685047720045,
852
+ "grad_norm": 3.028961658477783,
853
  "learning_rate": 2.934252386002121e-05,
854
+ "loss": 0.4626,
855
  "step": 5850
856
  },
857
  {
858
  "epoch": 2.0855425945563804,
859
+ "grad_norm": 3.974060297012329,
860
  "learning_rate": 2.9165782962177453e-05,
861
+ "loss": 0.5274,
862
  "step": 5900
863
  },
864
  {
865
  "epoch": 2.1032166843407563,
866
+ "grad_norm": 2.532444953918457,
867
  "learning_rate": 2.898904206433369e-05,
868
+ "loss": 0.4887,
869
  "step": 5950
870
  },
871
  {
872
  "epoch": 2.1208907741251326,
873
+ "grad_norm": 2.0569326877593994,
874
  "learning_rate": 2.8812301166489924e-05,
875
+ "loss": 0.4353,
876
  "step": 6000
877
  },
878
  {
879
  "epoch": 2.1385648639095085,
880
+ "grad_norm": 3.0496156215667725,
881
  "learning_rate": 2.8635560268646167e-05,
882
+ "loss": 0.4347,
883
  "step": 6050
884
  },
885
  {
886
  "epoch": 2.156238953693885,
887
+ "grad_norm": 2.635395050048828,
888
  "learning_rate": 2.8458819370802403e-05,
889
+ "loss": 0.5406,
890
  "step": 6100
891
  },
892
  {
893
  "epoch": 2.1739130434782608,
894
+ "grad_norm": 4.091008186340332,
895
  "learning_rate": 2.8282078472958645e-05,
896
+ "loss": 0.4885,
897
  "step": 6150
898
  },
899
  {
900
  "epoch": 2.191587133262637,
901
+ "grad_norm": 3.228792905807495,
902
  "learning_rate": 2.810533757511488e-05,
903
+ "loss": 0.5072,
904
  "step": 6200
905
  },
906
  {
907
  "epoch": 2.209261223047013,
908
+ "grad_norm": 2.479149341583252,
909
  "learning_rate": 2.7928596677271124e-05,
910
+ "loss": 0.5475,
911
  "step": 6250
912
  },
913
  {
914
  "epoch": 2.2269353128313893,
915
+ "grad_norm": 4.617306709289551,
916
  "learning_rate": 2.775185577942736e-05,
917
+ "loss": 0.5468,
918
  "step": 6300
919
  },
920
  {
921
  "epoch": 2.2446094026157652,
922
+ "grad_norm": 4.416631698608398,
923
  "learning_rate": 2.7575114881583602e-05,
924
+ "loss": 0.5125,
925
  "step": 6350
926
  },
927
  {
928
  "epoch": 2.262283492400141,
929
+ "grad_norm": 3.7900924682617188,
930
+ "learning_rate": 2.7398373983739838e-05,
931
+ "loss": 0.5824,
932
  "step": 6400
933
  },
934
  {
935
  "epoch": 2.2799575821845175,
936
+ "grad_norm": 3.695364236831665,
937
+ "learning_rate": 2.7221633085896077e-05,
938
+ "loss": 0.4806,
939
  "step": 6450
940
  },
941
  {
942
  "epoch": 2.297631671968894,
943
+ "grad_norm": 2.609520196914673,
944
+ "learning_rate": 2.7044892188052316e-05,
945
+ "loss": 0.4537,
946
  "step": 6500
947
  },
948
  {
949
  "epoch": 2.3153057617532697,
950
+ "grad_norm": 4.006641864776611,
951
+ "learning_rate": 2.6868151290208555e-05,
952
+ "loss": 0.5172,
953
  "step": 6550
954
  },
955
  {
956
  "epoch": 2.3329798515376456,
957
+ "grad_norm": 3.581960439682007,
958
+ "learning_rate": 2.6691410392364795e-05,
959
+ "loss": 0.5089,
960
  "step": 6600
961
  },
962
  {
963
  "epoch": 2.350653941322022,
964
+ "grad_norm": 2.6414718627929688,
965
+ "learning_rate": 2.6514669494521034e-05,
966
+ "loss": 0.4936,
967
  "step": 6650
968
  },
969
  {
970
  "epoch": 2.368328031106398,
971
+ "grad_norm": 3.3889434337615967,
972
+ "learning_rate": 2.6337928596677276e-05,
973
+ "loss": 0.535,
974
  "step": 6700
975
  },
976
  {
977
  "epoch": 2.3860021208907742,
978
+ "grad_norm": 4.371047496795654,
979
+ "learning_rate": 2.6161187698833512e-05,
980
+ "loss": 0.4651,
981
  "step": 6750
982
  },
983
  {
984
  "epoch": 2.40367621067515,
985
+ "grad_norm": 4.057021617889404,
986
+ "learning_rate": 2.5984446800989748e-05,
987
+ "loss": 0.4369,
988
  "step": 6800
989
  },
990
  {
991
  "epoch": 2.4213503004595265,
992
+ "grad_norm": 4.6812615394592285,
993
+ "learning_rate": 2.580770590314599e-05,
994
+ "loss": 0.5067,
995
  "step": 6850
996
  },
997
  {
998
  "epoch": 2.4390243902439024,
999
+ "grad_norm": 6.067279815673828,
1000
+ "learning_rate": 2.5630965005302226e-05,
1001
+ "loss": 0.4901,
1002
  "step": 6900
1003
  },
1004
  {
1005
  "epoch": 2.4566984800282787,
1006
+ "grad_norm": 3.8635661602020264,
1007
+ "learning_rate": 2.545422410745847e-05,
1008
+ "loss": 0.415,
1009
  "step": 6950
1010
  },
1011
  {
1012
  "epoch": 2.4743725698126546,
1013
+ "grad_norm": 1.7011466026306152,
1014
+ "learning_rate": 2.5277483209614705e-05,
1015
+ "loss": 0.4893,
1016
  "step": 7000
1017
  },
1018
  {
1019
  "epoch": 2.4920466595970305,
1020
+ "grad_norm": 3.8497934341430664,
1021
+ "learning_rate": 2.5100742311770947e-05,
1022
+ "loss": 0.4504,
1023
  "step": 7050
1024
  },
1025
  {
1026
  "epoch": 2.509720749381407,
1027
+ "grad_norm": 3.670374631881714,
1028
+ "learning_rate": 2.4924001413927183e-05,
1029
+ "loss": 0.4883,
1030
  "step": 7100
1031
  },
1032
  {
1033
  "epoch": 2.5273948391657832,
1034
+ "grad_norm": 3.130357503890991,
1035
+ "learning_rate": 2.4747260516083422e-05,
1036
+ "loss": 0.5031,
1037
  "step": 7150
1038
  },
1039
  {
1040
  "epoch": 2.545068928950159,
1041
+ "grad_norm": 3.647500514984131,
1042
+ "learning_rate": 2.457051961823966e-05,
1043
+ "loss": 0.4368,
1044
  "step": 7200
1045
  },
1046
  {
1047
  "epoch": 2.562743018734535,
1048
+ "grad_norm": 3.6657369136810303,
1049
+ "learning_rate": 2.43937787203959e-05,
1050
+ "loss": 0.4686,
1051
  "step": 7250
1052
  },
1053
  {
1054
  "epoch": 2.5804171085189114,
1055
+ "grad_norm": 5.371551036834717,
1056
+ "learning_rate": 2.421703782255214e-05,
1057
+ "loss": 0.4433,
1058
  "step": 7300
1059
  },
1060
  {
1061
  "epoch": 2.5980911983032873,
1062
+ "grad_norm": 3.593418598175049,
1063
+ "learning_rate": 2.404029692470838e-05,
1064
+ "loss": 0.4901,
1065
  "step": 7350
1066
  },
1067
  {
1068
  "epoch": 2.6157652880876636,
1069
+ "grad_norm": 3.1181206703186035,
1070
+ "learning_rate": 2.3863556026864618e-05,
1071
+ "loss": 0.4834,
1072
  "step": 7400
1073
  },
1074
  {
1075
  "epoch": 2.6334393778720395,
1076
+ "grad_norm": 4.218138217926025,
1077
+ "learning_rate": 2.3686815129020857e-05,
1078
+ "loss": 0.5013,
1079
  "step": 7450
1080
  },
1081
  {
1082
  "epoch": 2.6511134676564154,
1083
+ "grad_norm": 3.5063066482543945,
1084
+ "learning_rate": 2.3510074231177097e-05,
1085
+ "loss": 0.4705,
1086
  "step": 7500
1087
  },
1088
  {
1089
  "epoch": 2.6687875574407918,
1090
+ "grad_norm": 2.8965365886688232,
1091
+ "learning_rate": 2.3333333333333336e-05,
1092
+ "loss": 0.464,
1093
  "step": 7550
1094
  },
1095
  {
1096
  "epoch": 2.686461647225168,
1097
+ "grad_norm": 2.336358070373535,
1098
+ "learning_rate": 2.3156592435489575e-05,
1099
+ "loss": 0.4591,
1100
  "step": 7600
1101
  },
1102
  {
1103
  "epoch": 2.704135737009544,
1104
+ "grad_norm": 3.5483410358428955,
1105
+ "learning_rate": 2.2979851537645814e-05,
1106
+ "loss": 0.4419,
1107
  "step": 7650
1108
  },
1109
  {
1110
  "epoch": 2.72180982679392,
1111
+ "grad_norm": 4.550882816314697,
1112
+ "learning_rate": 2.280311063980205e-05,
1113
+ "loss": 0.4181,
1114
  "step": 7700
1115
  },
1116
  {
1117
  "epoch": 2.7394839165782963,
1118
+ "grad_norm": 4.471234321594238,
1119
+ "learning_rate": 2.262636974195829e-05,
1120
+ "loss": 0.4558,
1121
  "step": 7750
1122
  },
1123
  {
1124
  "epoch": 2.757158006362672,
1125
+ "grad_norm": 3.0595200061798096,
1126
+ "learning_rate": 2.244962884411453e-05,
1127
+ "loss": 0.4188,
1128
  "step": 7800
1129
  },
1130
  {
1131
  "epoch": 2.7748320961470485,
1132
+ "grad_norm": 7.5111403465271,
1133
+ "learning_rate": 2.2272887946270768e-05,
1134
+ "loss": 0.4834,
1135
  "step": 7850
1136
  },
1137
  {
1138
  "epoch": 2.7925061859314244,
1139
+ "grad_norm": 2.2414655685424805,
1140
+ "learning_rate": 2.2096147048427007e-05,
1141
+ "loss": 0.442,
1142
  "step": 7900
1143
  },
1144
  {
1145
  "epoch": 2.8101802757158008,
1146
+ "grad_norm": 4.036431789398193,
1147
+ "learning_rate": 2.1919406150583246e-05,
1148
+ "loss": 0.4254,
1149
  "step": 7950
1150
  },
1151
  {
1152
  "epoch": 2.8278543655001767,
1153
+ "grad_norm": 3.3172266483306885,
1154
+ "learning_rate": 2.1742665252739485e-05,
1155
+ "loss": 0.4852,
1156
  "step": 8000
1157
  },
1158
  {
1159
  "epoch": 2.845528455284553,
1160
+ "grad_norm": 4.143049240112305,
1161
+ "learning_rate": 2.1565924354895724e-05,
1162
+ "loss": 0.4858,
1163
  "step": 8050
1164
  },
1165
  {
1166
  "epoch": 2.863202545068929,
1167
+ "grad_norm": 5.017402172088623,
1168
+ "learning_rate": 2.138918345705196e-05,
1169
+ "loss": 0.3824,
1170
  "step": 8100
1171
  },
1172
  {
1173
  "epoch": 2.880876634853305,
1174
+ "grad_norm": 2.974952459335327,
1175
+ "learning_rate": 2.1212442559208203e-05,
1176
+ "loss": 0.4777,
1177
  "step": 8150
1178
  },
1179
  {
1180
  "epoch": 2.898550724637681,
1181
+ "grad_norm": 7.074586868286133,
1182
+ "learning_rate": 2.1035701661364442e-05,
1183
+ "loss": 0.4465,
1184
  "step": 8200
1185
  },
1186
  {
1187
  "epoch": 2.9162248144220575,
1188
+ "grad_norm": 3.585792064666748,
1189
+ "learning_rate": 2.085896076352068e-05,
1190
+ "loss": 0.4307,
1191
  "step": 8250
1192
  },
1193
  {
1194
  "epoch": 2.9338989042064334,
1195
+ "grad_norm": 1.6561566591262817,
1196
+ "learning_rate": 2.068221986567692e-05,
1197
+ "loss": 0.3917,
1198
  "step": 8300
1199
  },
1200
  {
1201
  "epoch": 2.9515729939908093,
1202
+ "grad_norm": 4.920962810516357,
1203
+ "learning_rate": 2.050547896783316e-05,
1204
+ "loss": 0.4334,
1205
  "step": 8350
1206
  },
1207
  {
1208
  "epoch": 2.9692470837751856,
1209
+ "grad_norm": 2.6819636821746826,
1210
+ "learning_rate": 2.03287380699894e-05,
1211
+ "loss": 0.4679,
1212
  "step": 8400
1213
  },
1214
  {
1215
  "epoch": 2.9869211735595615,
1216
+ "grad_norm": 3.442260265350342,
1217
+ "learning_rate": 2.0151997172145634e-05,
1218
+ "loss": 0.4466,
1219
  "step": 8450
1220
  },
1221
  {
1222
  "epoch": 3.0,
1223
+ "eval_bertscore_f1": 0.9646675708510055,
1224
+ "eval_bleu": 0.5742982540038749,
1225
+ "eval_loss": 0.3243306279182434,
1226
+ "eval_meteor": 0.7281699575301964,
1227
+ "eval_rouge1": 0.8328916554556949,
1228
+ "eval_rouge2": 0.7666932565109175,
1229
+ "eval_runtime": 1288.8847,
1230
+ "eval_samples_per_second": 5.015,
1231
+ "eval_steps_per_second": 0.627,
1232
  "step": 8487
1233
  },
1234
  {
1235
  "epoch": 3.004595263343938,
1236
+ "grad_norm": 3.7356512546539307,
1237
+ "learning_rate": 1.9975256274301874e-05,
1238
+ "loss": 0.394,
1239
  "step": 8500
1240
  },
1241
  {
1242
  "epoch": 3.022269353128314,
1243
+ "grad_norm": 3.7725515365600586,
1244
+ "learning_rate": 1.9798515376458113e-05,
1245
+ "loss": 0.4484,
1246
  "step": 8550
1247
  },
1248
  {
1249
  "epoch": 3.03994344291269,
1250
+ "grad_norm": 2.475839138031006,
1251
+ "learning_rate": 1.9621774478614352e-05,
1252
+ "loss": 0.4463,
1253
  "step": 8600
1254
  },
1255
  {
1256
  "epoch": 3.057617532697066,
1257
+ "grad_norm": 2.853266716003418,
1258
+ "learning_rate": 1.944503358077059e-05,
1259
+ "loss": 0.4398,
1260
  "step": 8650
1261
  },
1262
  {
1263
  "epoch": 3.0752916224814424,
1264
+ "grad_norm": 2.7079474925994873,
1265
+ "learning_rate": 1.926829268292683e-05,
1266
+ "loss": 0.4021,
1267
  "step": 8700
1268
  },
1269
  {
1270
  "epoch": 3.0929657122658183,
1271
+ "grad_norm": 5.04539680480957,
1272
+ "learning_rate": 1.909155178508307e-05,
1273
+ "loss": 0.3996,
1274
  "step": 8750
1275
  },
1276
  {
1277
  "epoch": 3.110639802050194,
1278
+ "grad_norm": 4.626221656799316,
1279
  "learning_rate": 1.8918345705196184e-05,
1280
+ "loss": 0.4948,
1281
  "step": 8800
1282
  },
1283
  {
1284
  "epoch": 3.1283138918345705,
1285
+ "grad_norm": 4.644408226013184,
1286
  "learning_rate": 1.8741604807352423e-05,
1287
+ "loss": 0.4156,
1288
  "step": 8850
1289
  },
1290
  {
1291
  "epoch": 3.1459879816189464,
1292
+ "grad_norm": 4.299105167388916,
1293
+ "learning_rate": 1.8564863909508662e-05,
1294
+ "loss": 0.3977,
1295
  "step": 8900
1296
  },
1297
  {
1298
  "epoch": 3.163662071403323,
1299
+ "grad_norm": 4.650149345397949,
1300
+ "learning_rate": 1.83881230116649e-05,
1301
+ "loss": 0.4229,
1302
  "step": 8950
1303
  },
1304
  {
1305
  "epoch": 3.1813361611876987,
1306
+ "grad_norm": 2.89013409614563,
1307
+ "learning_rate": 1.821138211382114e-05,
1308
+ "loss": 0.4506,
1309
  "step": 9000
1310
  },
1311
  {
1312
  "epoch": 3.199010250972075,
1313
+ "grad_norm": 2.281370162963867,
1314
+ "learning_rate": 1.8034641215977376e-05,
1315
+ "loss": 0.4288,
1316
  "step": 9050
1317
  },
1318
  {
1319
  "epoch": 3.216684340756451,
1320
+ "grad_norm": 4.948707103729248,
1321
+ "learning_rate": 1.7857900318133615e-05,
1322
+ "loss": 0.4633,
1323
  "step": 9100
1324
  },
1325
  {
1326
  "epoch": 3.2343584305408273,
1327
+ "grad_norm": 3.5856571197509766,
1328
+ "learning_rate": 1.7681159420289855e-05,
1329
+ "loss": 0.3965,
1330
  "step": 9150
1331
  },
1332
  {
1333
  "epoch": 3.252032520325203,
1334
+ "grad_norm": 3.416271686553955,
1335
+ "learning_rate": 1.7504418522446094e-05,
1336
+ "loss": 0.4904,
1337
  "step": 9200
1338
  },
1339
  {
1340
  "epoch": 3.2697066101095795,
1341
+ "grad_norm": 3.599717617034912,
1342
+ "learning_rate": 1.7327677624602333e-05,
1343
+ "loss": 0.4648,
1344
  "step": 9250
1345
  },
1346
  {
1347
  "epoch": 3.2873806998939554,
1348
+ "grad_norm": 2.8439853191375732,
1349
+ "learning_rate": 1.7150936726758572e-05,
1350
+ "loss": 0.3734,
1351
  "step": 9300
1352
  },
1353
  {
1354
  "epoch": 3.3050547896783318,
1355
+ "grad_norm": 4.0927863121032715,
1356
+ "learning_rate": 1.697419582891481e-05,
1357
+ "loss": 0.3913,
1358
  "step": 9350
1359
  },
1360
  {
1361
  "epoch": 3.3227288794627077,
1362
+ "grad_norm": 4.16766881942749,
1363
+ "learning_rate": 1.679745493107105e-05,
1364
+ "loss": 0.4303,
1365
  "step": 9400
1366
  },
1367
  {
1368
  "epoch": 3.3404029692470836,
1369
+ "grad_norm": 3.417738199234009,
1370
+ "learning_rate": 1.662071403322729e-05,
1371
+ "loss": 0.4068,
1372
  "step": 9450
1373
  },
1374
  {
1375
  "epoch": 3.35807705903146,
1376
+ "grad_norm": 4.66575813293457,
1377
+ "learning_rate": 1.644397313538353e-05,
1378
+ "loss": 0.5116,
1379
  "step": 9500
1380
  },
1381
  {
1382
  "epoch": 3.375751148815836,
1383
+ "grad_norm": 6.112340927124023,
1384
+ "learning_rate": 1.6267232237539768e-05,
1385
+ "loss": 0.4244,
1386
  "step": 9550
1387
  },
1388
  {
1389
  "epoch": 3.393425238600212,
1390
+ "grad_norm": 3.322610378265381,
1391
+ "learning_rate": 1.6090491339696007e-05,
1392
+ "loss": 0.4252,
1393
  "step": 9600
1394
  },
1395
  {
1396
  "epoch": 3.411099328384588,
1397
+ "grad_norm": 4.941850185394287,
1398
+ "learning_rate": 1.5913750441852247e-05,
1399
+ "loss": 0.402,
1400
  "step": 9650
1401
  },
1402
  {
1403
  "epoch": 3.4287734181689644,
1404
+ "grad_norm": 2.177600860595703,
1405
+ "learning_rate": 1.5737009544008486e-05,
1406
+ "loss": 0.3437,
1407
  "step": 9700
1408
  },
1409
  {
1410
  "epoch": 3.4464475079533403,
1411
+ "grad_norm": 2.1570093631744385,
1412
+ "learning_rate": 1.5560268646164725e-05,
1413
+ "loss": 0.4871,
1414
  "step": 9750
1415
  },
1416
  {
1417
  "epoch": 3.4641215977377167,
1418
+ "grad_norm": 7.6717305183410645,
1419
+ "learning_rate": 1.538352774832096e-05,
1420
+ "loss": 0.4224,
1421
  "step": 9800
1422
  },
1423
  {
1424
  "epoch": 3.4817956875220926,
1425
+ "grad_norm": 3.082805871963501,
1426
+ "learning_rate": 1.52067868504772e-05,
1427
+ "loss": 0.4474,
1428
  "step": 9850
1429
  },
1430
  {
1431
  "epoch": 3.499469777306469,
1432
+ "grad_norm": 2.8141167163848877,
1433
  "learning_rate": 1.5033580770590316e-05,
1434
+ "loss": 0.4476,
1435
  "step": 9900
1436
  },
1437
  {
1438
  "epoch": 3.517143867090845,
1439
+ "grad_norm": 3.179436206817627,
1440
  "learning_rate": 1.4856839872746553e-05,
1441
+ "loss": 0.3936,
1442
  "step": 9950
1443
  },
1444
  {
1445
  "epoch": 3.534817956875221,
1446
+ "grad_norm": 3.908020257949829,
1447
  "learning_rate": 1.4680098974902792e-05,
1448
+ "loss": 0.4164,
1449
  "step": 10000
1450
  },
1451
  {
1452
  "epoch": 3.552492046659597,
1453
+ "grad_norm": 4.998553276062012,
1454
  "learning_rate": 1.4503358077059032e-05,
1455
+ "loss": 0.4534,
1456
  "step": 10050
1457
  },
1458
  {
1459
  "epoch": 3.570166136443973,
1460
+ "grad_norm": 4.064126014709473,
1461
  "learning_rate": 1.432661717921527e-05,
1462
+ "loss": 0.458,
1463
  "step": 10100
1464
  },
1465
  {
1466
  "epoch": 3.5878402262283493,
1467
+ "grad_norm": 2.2527036666870117,
1468
  "learning_rate": 1.414987628137151e-05,
1469
+ "loss": 0.4285,
1470
  "step": 10150
1471
  },
1472
  {
1473
  "epoch": 3.605514316012725,
1474
+ "grad_norm": 3.3799755573272705,
1475
  "learning_rate": 1.397313538352775e-05,
1476
+ "loss": 0.5488,
1477
  "step": 10200
1478
  },
1479
  {
1480
  "epoch": 3.6231884057971016,
1481
+ "grad_norm": 3.4317479133605957,
1482
  "learning_rate": 1.3796394485683988e-05,
1483
+ "loss": 0.4326,
1484
  "step": 10250
1485
  },
1486
  {
1487
  "epoch": 3.6408624955814775,
1488
+ "grad_norm": 2.245337724685669,
1489
  "learning_rate": 1.3619653587840228e-05,
1490
+ "loss": 0.4279,
1491
  "step": 10300
1492
  },
1493
  {
1494
  "epoch": 3.658536585365854,
1495
+ "grad_norm": 2.9092109203338623,
1496
  "learning_rate": 1.3442912689996465e-05,
1497
+ "loss": 0.4116,
1498
  "step": 10350
1499
  },
1500
  {
1501
  "epoch": 3.6762106751502297,
1502
+ "grad_norm": 2.79837965965271,
1503
  "learning_rate": 1.3266171792152704e-05,
1504
+ "loss": 0.4506,
1505
  "step": 10400
1506
  },
1507
  {
1508
  "epoch": 3.693884764934606,
1509
+ "grad_norm": 1.416994333267212,
1510
  "learning_rate": 1.3089430894308943e-05,
1511
+ "loss": 0.3532,
1512
  "step": 10450
1513
  },
1514
  {
1515
  "epoch": 3.711558854718982,
1516
+ "grad_norm": 4.927233695983887,
1517
  "learning_rate": 1.2912689996465183e-05,
1518
+ "loss": 0.4302,
1519
  "step": 10500
1520
  },
1521
  {
1522
  "epoch": 3.729232944503358,
1523
+ "grad_norm": 2.069500684738159,
1524
  "learning_rate": 1.2735949098621422e-05,
1525
+ "loss": 0.4608,
1526
  "step": 10550
1527
  },
1528
  {
1529
  "epoch": 3.746907034287734,
1530
+ "grad_norm": 3.3507018089294434,
1531
  "learning_rate": 1.2559208200777661e-05,
1532
+ "loss": 0.4024,
1533
  "step": 10600
1534
  },
1535
  {
1536
  "epoch": 3.7645811240721105,
1537
+ "grad_norm": 2.64599871635437,
1538
  "learning_rate": 1.2382467302933899e-05,
1539
+ "loss": 0.3817,
1540
  "step": 10650
1541
  },
1542
  {
1543
  "epoch": 3.7822552138564864,
1544
+ "grad_norm": 2.3984270095825195,
1545
  "learning_rate": 1.2205726405090138e-05,
1546
+ "loss": 0.4399,
1547
  "step": 10700
1548
  },
1549
  {
1550
  "epoch": 3.7999293036408623,
1551
+ "grad_norm": 5.132211685180664,
1552
  "learning_rate": 1.2028985507246379e-05,
1553
+ "loss": 0.4537,
1554
  "step": 10750
1555
  },
1556
  {
1557
  "epoch": 3.8176033934252387,
1558
+ "grad_norm": 3.9488821029663086,
1559
  "learning_rate": 1.1852244609402616e-05,
1560
+ "loss": 0.4433,
1561
  "step": 10800
1562
  },
1563
  {
1564
  "epoch": 3.8352774832096146,
1565
+ "grad_norm": 4.978783130645752,
1566
  "learning_rate": 1.1675503711558855e-05,
1567
+ "loss": 0.3722,
1568
  "step": 10850
1569
  },
1570
  {
1571
  "epoch": 3.852951572993991,
1572
+ "grad_norm": 2.1942172050476074,
1573
  "learning_rate": 1.1498762813715094e-05,
1574
+ "loss": 0.3641,
1575
  "step": 10900
1576
  },
1577
  {
1578
  "epoch": 3.870625662778367,
1579
+ "grad_norm": 1.962399959564209,
1580
  "learning_rate": 1.1322021915871334e-05,
1581
+ "loss": 0.3966,
1582
  "step": 10950
1583
  },
1584
  {
1585
  "epoch": 3.888299752562743,
1586
+ "grad_norm": 2.3611438274383545,
1587
  "learning_rate": 1.1145281018027571e-05,
1588
+ "loss": 0.3872,
1589
  "step": 11000
1590
  },
1591
  {
1592
  "epoch": 3.905973842347119,
1593
+ "grad_norm": 2.8562467098236084,
1594
  "learning_rate": 1.096854012018381e-05,
1595
+ "loss": 0.3823,
1596
  "step": 11050
1597
  },
1598
  {
1599
  "epoch": 3.9236479321314954,
1600
+ "grad_norm": 3.315880060195923,
1601
  "learning_rate": 1.079179922234005e-05,
1602
+ "loss": 0.4215,
1603
  "step": 11100
1604
  },
1605
  {
1606
  "epoch": 3.9413220219158713,
1607
+ "grad_norm": 4.15437650680542,
1608
  "learning_rate": 1.0615058324496289e-05,
1609
+ "loss": 0.4126,
1610
  "step": 11150
1611
  },
1612
  {
1613
  "epoch": 3.9589961117002472,
1614
+ "grad_norm": 3.9605205059051514,
1615
  "learning_rate": 1.0438317426652528e-05,
1616
+ "loss": 0.3773,
1617
  "step": 11200
1618
  },
1619
  {
1620
  "epoch": 3.9766702014846236,
1621
+ "grad_norm": 3.106764793395996,
1622
  "learning_rate": 1.0261576528808767e-05,
1623
+ "loss": 0.4297,
1624
  "step": 11250
1625
  },
1626
  {
1627
  "epoch": 3.9943442912689995,
1628
+ "grad_norm": 3.4298675060272217,
1629
  "learning_rate": 1.0084835630965006e-05,
1630
+ "loss": 0.4305,
1631
  "step": 11300
1632
  },
1633
  {
1634
  "epoch": 4.0,
1635
+ "eval_bertscore_f1": 0.9656413255425373,
1636
+ "eval_bleu": 0.5848426882684508,
1637
+ "eval_loss": 0.3005247414112091,
1638
+ "eval_meteor": 0.73697495147188,
1639
+ "eval_rouge1": 0.8370075787215339,
1640
+ "eval_rouge2": 0.7752220988783712,
1641
+ "eval_runtime": 1268.4642,
1642
+ "eval_samples_per_second": 5.096,
1643
+ "eval_steps_per_second": 0.637,
1644
  "step": 11316
1645
  },
1646
  {
1647
  "epoch": 4.012018381053376,
1648
+ "grad_norm": 4.263380527496338,
1649
  "learning_rate": 9.908094733121245e-06,
1650
+ "loss": 0.4285,
1651
  "step": 11350
1652
  },
1653
  {
1654
  "epoch": 4.029692470837752,
1655
+ "grad_norm": 14.104089736938477,
1656
  "learning_rate": 9.731353835277483e-06,
1657
+ "loss": 0.3837,
1658
  "step": 11400
1659
  },
1660
  {
1661
  "epoch": 4.047366560622128,
1662
+ "grad_norm": 2.5981857776641846,
1663
  "learning_rate": 9.554612937433722e-06,
1664
+ "loss": 0.3773,
1665
  "step": 11450
1666
  },
1667
  {
1668
  "epoch": 4.065040650406504,
1669
+ "grad_norm": 4.44357967376709,
1670
  "learning_rate": 9.377872039589961e-06,
1671
+ "loss": 0.4325,
1672
  "step": 11500
1673
  },
1674
  {
1675
  "epoch": 4.08271474019088,
1676
+ "grad_norm": 3.7187113761901855,
1677
  "learning_rate": 9.2011311417462e-06,
1678
+ "loss": 0.427,
1679
  "step": 11550
1680
  },
1681
  {
1682
  "epoch": 4.100388829975256,
1683
+ "grad_norm": 2.364908218383789,
1684
  "learning_rate": 9.02439024390244e-06,
1685
+ "loss": 0.3617,
1686
  "step": 11600
1687
  },
1688
  {
1689
  "epoch": 4.118062919759632,
1690
+ "grad_norm": 2.663651704788208,
1691
  "learning_rate": 8.847649346058679e-06,
1692
+ "loss": 0.4174,
1693
  "step": 11650
1694
  },
1695
  {
1696
  "epoch": 4.135737009544009,
1697
+ "grad_norm": 3.6699295043945312,
1698
  "learning_rate": 8.670908448214918e-06,
1699
+ "loss": 0.4183,
1700
  "step": 11700
1701
  },
1702
  {
1703
  "epoch": 4.153411099328385,
1704
+ "grad_norm": 4.236429214477539,
1705
  "learning_rate": 8.494167550371156e-06,
1706
+ "loss": 0.4074,
1707
  "step": 11750
1708
  },
1709
  {
1710
  "epoch": 4.171085189112761,
1711
+ "grad_norm": 4.3517632484436035,
1712
  "learning_rate": 8.317426652527395e-06,
1713
+ "loss": 0.3905,
1714
  "step": 11800
1715
  },
1716
  {
1717
  "epoch": 4.188759278897137,
1718
+ "grad_norm": 2.440966844558716,
1719
  "learning_rate": 8.140685754683634e-06,
1720
+ "loss": 0.408,
1721
  "step": 11850
1722
  },
1723
  {
1724
  "epoch": 4.2064333686815125,
1725
+ "grad_norm": 3.0445733070373535,
1726
  "learning_rate": 7.963944856839873e-06,
1727
+ "loss": 0.3646,
1728
  "step": 11900
1729
  },
1730
  {
1731
  "epoch": 4.224107458465889,
1732
+ "grad_norm": 3.174678325653076,
1733
  "learning_rate": 7.787203958996112e-06,
1734
+ "loss": 0.4027,
1735
  "step": 11950
1736
  },
1737
  {
1738
  "epoch": 4.241781548250265,
1739
+ "grad_norm": 4.445051193237305,
1740
  "learning_rate": 7.610463061152351e-06,
1741
+ "loss": 0.4111,
1742
  "step": 12000
1743
  },
1744
  {
1745
  "epoch": 4.259455638034641,
1746
+ "grad_norm": 3.7955079078674316,
1747
  "learning_rate": 7.43372216330859e-06,
1748
  "loss": 0.3815,
1749
  "step": 12050
1750
  },
1751
  {
1752
  "epoch": 4.277129727819017,
1753
+ "grad_norm": 3.0276503562927246,
1754
  "learning_rate": 7.256981265464829e-06,
1755
+ "loss": 0.3765,
1756
  "step": 12100
1757
  },
1758
  {
1759
  "epoch": 4.294803817603394,
1760
+ "grad_norm": 1.8871873617172241,
1761
  "learning_rate": 7.080240367621067e-06,
1762
+ "loss": 0.3771,
1763
  "step": 12150
1764
  },
1765
  {
1766
  "epoch": 4.31247790738777,
1767
+ "grad_norm": 9.927197456359863,
1768
  "learning_rate": 6.903499469777307e-06,
1769
+ "loss": 0.4112,
1770
  "step": 12200
1771
  },
1772
  {
1773
  "epoch": 4.330151997172146,
1774
+ "grad_norm": 4.721640586853027,
1775
  "learning_rate": 6.726758571933546e-06,
1776
+ "loss": 0.4451,
1777
  "step": 12250
1778
  },
1779
  {
1780
  "epoch": 4.3478260869565215,
1781
+ "grad_norm": 2.7340986728668213,
1782
  "learning_rate": 6.550017674089785e-06,
1783
+ "loss": 0.4254,
1784
  "step": 12300
1785
  },
1786
  {
1787
  "epoch": 4.365500176740898,
1788
+ "grad_norm": 3.780824661254883,
1789
  "learning_rate": 6.373276776246023e-06,
1790
+ "loss": 0.4557,
1791
  "step": 12350
1792
  },
1793
  {
1794
  "epoch": 4.383174266525274,
1795
+ "grad_norm": 3.429931640625,
1796
  "learning_rate": 6.1965358784022625e-06,
1797
+ "loss": 0.3858,
1798
  "step": 12400
1799
  },
1800
  {
1801
  "epoch": 4.40084835630965,
1802
+ "grad_norm": 3.944438934326172,
1803
  "learning_rate": 6.019794980558501e-06,
1804
+ "loss": 0.3569,
1805
  "step": 12450
1806
  },
1807
  {
1808
  "epoch": 4.418522446094026,
1809
+ "grad_norm": 2.19978666305542,
1810
  "learning_rate": 5.843054082714741e-06,
1811
+ "loss": 0.4232,
1812
  "step": 12500
1813
  },
1814
  {
1815
  "epoch": 4.436196535878402,
1816
+ "grad_norm": 1.6702100038528442,
1817
  "learning_rate": 5.666313184870979e-06,
1818
+ "loss": 0.4216,
1819
  "step": 12550
1820
  },
1821
  {
1822
  "epoch": 4.453870625662779,
1823
+ "grad_norm": 5.39310884475708,
1824
  "learning_rate": 5.4895722870272184e-06,
1825
+ "loss": 0.393,
1826
  "step": 12600
1827
  },
1828
  {
1829
  "epoch": 4.471544715447155,
1830
+ "grad_norm": 2.8727235794067383,
1831
  "learning_rate": 5.312831389183457e-06,
1832
+ "loss": 0.4104,
1833
  "step": 12650
1834
  },
1835
  {
1836
  "epoch": 4.4892188052315305,
1837
+ "grad_norm": 1.9998319149017334,
1838
  "learning_rate": 5.136090491339696e-06,
1839
+ "loss": 0.4592,
1840
  "step": 12700
1841
  },
1842
  {
1843
  "epoch": 4.506892895015906,
1844
+ "grad_norm": 3.140760660171509,
1845
  "learning_rate": 4.959349593495935e-06,
1846
+ "loss": 0.3582,
1847
  "step": 12750
1848
  },
1849
  {
1850
  "epoch": 4.524566984800282,
1851
+ "grad_norm": 4.489378929138184,
1852
  "learning_rate": 4.782608695652174e-06,
1853
+ "loss": 0.3891,
1854
  "step": 12800
1855
  },
1856
  {
1857
  "epoch": 4.542241074584659,
1858
+ "grad_norm": 3.2630345821380615,
1859
  "learning_rate": 4.605867797808413e-06,
1860
+ "loss": 0.3703,
1861
  "step": 12850
1862
  },
1863
  {
1864
  "epoch": 4.559915164369035,
1865
+ "grad_norm": 2.898639440536499,
1866
  "learning_rate": 4.429126899964652e-06,
1867
+ "loss": 0.4201,
1868
  "step": 12900
1869
  },
1870
  {
1871
  "epoch": 4.577589254153411,
1872
+ "grad_norm": 3.266235589981079,
1873
  "learning_rate": 4.252386002120891e-06,
1874
+ "loss": 0.4142,
1875
  "step": 12950
1876
  },
1877
  {
1878
  "epoch": 4.595263343937788,
1879
+ "grad_norm": 3.594919204711914,
1880
  "learning_rate": 4.07564510427713e-06,
1881
+ "loss": 0.4224,
1882
  "step": 13000
1883
  },
1884
  {
1885
  "epoch": 4.612937433722164,
1886
+ "grad_norm": 4.3656439781188965,
1887
  "learning_rate": 3.898904206433369e-06,
1888
+ "loss": 0.4155,
1889
  "step": 13050
1890
  },
1891
  {
1892
  "epoch": 4.6306115235065395,
1893
+ "grad_norm": 3.863250494003296,
1894
  "learning_rate": 3.722163308589608e-06,
1895
+ "loss": 0.3933,
1896
  "step": 13100
1897
  },
1898
  {
1899
  "epoch": 4.648285613290915,
1900
+ "grad_norm": 3.6120657920837402,
1901
  "learning_rate": 3.5454224107458466e-06,
1902
+ "loss": 0.4009,
1903
  "step": 13150
1904
  },
1905
  {
1906
  "epoch": 4.665959703075291,
1907
+ "grad_norm": 3.4946892261505127,
1908
  "learning_rate": 3.368681512902086e-06,
1909
+ "loss": 0.4538,
1910
  "step": 13200
1911
  },
1912
  {
1913
  "epoch": 4.683633792859668,
1914
+ "grad_norm": 3.0893940925598145,
1915
  "learning_rate": 3.1919406150583245e-06,
1916
+ "loss": 0.3429,
1917
  "step": 13250
1918
  },
1919
  {
1920
  "epoch": 4.701307882644044,
1921
+ "grad_norm": 3.190537929534912,
1922
  "learning_rate": 3.0151997172145637e-06,
1923
+ "loss": 0.489,
1924
  "step": 13300
1925
  },
1926
  {
1927
  "epoch": 4.71898197242842,
1928
+ "grad_norm": 5.128622531890869,
1929
  "learning_rate": 2.8384588193708025e-06,
1930
+ "loss": 0.3926,
1931
  "step": 13350
1932
  },
1933
  {
1934
  "epoch": 4.736656062212796,
1935
+ "grad_norm": 3.6680729389190674,
1936
  "learning_rate": 2.6617179215270417e-06,
1937
+ "loss": 0.3859,
1938
  "step": 13400
1939
  },
1940
  {
1941
  "epoch": 4.754330151997172,
1942
+ "grad_norm": 2.93373703956604,
1943
  "learning_rate": 2.4849770236832804e-06,
1944
+ "loss": 0.3637,
1945
  "step": 13450
1946
  },
1947
  {
1948
  "epoch": 4.7720042417815485,
1949
+ "grad_norm": 3.649087429046631,
1950
  "learning_rate": 2.3082361258395196e-06,
1951
+ "loss": 0.4155,
1952
  "step": 13500
1953
  },
1954
  {
1955
  "epoch": 4.789678331565924,
1956
+ "grad_norm": 1.2933834791183472,
1957
  "learning_rate": 2.1314952279957584e-06,
1958
+ "loss": 0.3868,
1959
  "step": 13550
1960
  },
1961
  {
1962
  "epoch": 4.8073524213503,
1963
+ "grad_norm": 2.177612781524658,
1964
  "learning_rate": 1.9547543301519976e-06,
1965
+ "loss": 0.4799,
1966
  "step": 13600
1967
  },
1968
  {
1969
  "epoch": 4.825026511134676,
1970
+ "grad_norm": 3.7405126094818115,
1971
  "learning_rate": 1.7780134323082363e-06,
1972
+ "loss": 0.455,
1973
  "step": 13650
1974
  },
1975
  {
1976
  "epoch": 4.842700600919053,
1977
+ "grad_norm": 6.44041633605957,
1978
+ "learning_rate": 1.6048073524213503e-06,
1979
+ "loss": 0.3766,
1980
  "step": 13700
1981
  },
1982
  {
1983
  "epoch": 4.860374690703429,
1984
+ "grad_norm": 2.510866165161133,
1985
+ "learning_rate": 1.4280664545775892e-06,
1986
+ "loss": 0.3779,
1987
  "step": 13750
1988
  },
1989
  {
1990
  "epoch": 4.878048780487805,
1991
+ "grad_norm": 4.789300441741943,
1992
+ "learning_rate": 1.2513255567338282e-06,
1993
+ "loss": 0.3892,
1994
  "step": 13800
1995
  },
1996
  {
1997
  "epoch": 4.895722870272181,
1998
+ "grad_norm": 2.6004765033721924,
1999
+ "learning_rate": 1.0745846588900672e-06,
2000
+ "loss": 0.3897,
2001
  "step": 13850
2002
  },
2003
  {
2004
  "epoch": 4.9133969600565575,
2005
+ "grad_norm": 4.115776062011719,
2006
+ "learning_rate": 8.978437610463062e-07,
2007
+ "loss": 0.3896,
2008
  "step": 13900
2009
  },
2010
  {
2011
  "epoch": 4.931071049840933,
2012
+ "grad_norm": 9.11878776550293,
2013
+ "learning_rate": 7.211028632025451e-07,
2014
+ "loss": 0.3918,
2015
  "step": 13950
2016
  },
2017
  {
2018
  "epoch": 4.948745139625309,
2019
+ "grad_norm": 3.8003361225128174,
2020
+ "learning_rate": 5.44361965358784e-07,
2021
+ "loss": 0.476,
2022
  "step": 14000
2023
  },
2024
  {
2025
  "epoch": 4.966419229409685,
2026
+ "grad_norm": 2.117197275161743,
2027
+ "learning_rate": 3.67621067515023e-07,
2028
+ "loss": 0.373,
2029
  "step": 14050
2030
  },
2031
  {
2032
  "epoch": 4.984093319194061,
2033
+ "grad_norm": 1.8130935430526733,
2034
+ "learning_rate": 1.9088016967126194e-07,
2035
+ "loss": 0.4102,
2036
  "step": 14100
2037
  },
2038
  {
2039
  "epoch": 5.0,
2040
+ "eval_bertscore_f1": 0.9660587414250811,
2041
+ "eval_bleu": 0.5882761107143478,
2042
+ "eval_loss": 0.29442909359931946,
2043
+ "eval_meteor": 0.7392640094761435,
2044
+ "eval_rouge1": 0.8386605714105622,
2045
+ "eval_rouge2": 0.7781271007162897,
2046
+ "eval_runtime": 1266.3046,
2047
+ "eval_samples_per_second": 5.105,
2048
+ "eval_steps_per_second": 0.638,
2049
  "step": 14145
2050
  }
2051
  ],
checkpoint-14145/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84eb264dc96b91d8b1c338a7f669bf17dafa7f32a2801b1215e7c0b8df1ea575
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04cb124e4f637bd6bd64f8ac591c0fec6ae1ea5fb92454e4e04a6ba2de4b3e45
3
  size 5432