gpol13 commited on
Commit
01ca755
·
verified ·
1 Parent(s): 9eb7f59

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.15.2
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.14.0
adapter_config.json CHANGED
@@ -3,19 +3,18 @@
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "Salesforce/codet5-base",
5
  "bias": "none",
6
- "corda_config": null,
7
  "eva_config": null,
8
  "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
11
- "init_lora_weights": true,
12
  "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -28,7 +27,6 @@
28
  "v"
29
  ],
30
  "task_type": "SEQ_2_SEQ_LM",
31
- "trainable_token_indices": null,
32
  "use_dora": false,
33
  "use_rslora": false
34
  }
 
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "Salesforce/codet5-base",
5
  "bias": "none",
 
6
  "eva_config": null,
7
  "exclude_modules": null,
8
  "fan_in_fan_out": false,
9
  "inference_mode": true,
10
+ "init_lora_weights": "gaussian",
11
  "layer_replication": null,
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
  "lora_alpha": 16,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
 
27
  "v"
28
  ],
29
  "task_type": "SEQ_2_SEQ_LM",
 
30
  "use_dora": false,
31
  "use_rslora": false
32
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc6a6711d2467cf8cc0d624e48bbe0ad9250fa2646d0816d7c83796afe2c35d7
3
  size 3558888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1280f7f2b208c156d2a4a277a7a1a7316c93f88988b5c754c16059bf2e0f289c
3
  size 3558888
checkpoint-14145/README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.15.2
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.14.0
checkpoint-14145/adapter_config.json CHANGED
@@ -3,19 +3,18 @@
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "Salesforce/codet5-base",
5
  "bias": "none",
6
- "corda_config": null,
7
  "eva_config": null,
8
  "exclude_modules": null,
9
  "fan_in_fan_out": false,
10
  "inference_mode": true,
11
- "init_lora_weights": true,
12
  "layer_replication": null,
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -28,7 +27,6 @@
28
  "v"
29
  ],
30
  "task_type": "SEQ_2_SEQ_LM",
31
- "trainable_token_indices": null,
32
  "use_dora": false,
33
  "use_rslora": false
34
  }
 
3
  "auto_mapping": null,
4
  "base_model_name_or_path": "Salesforce/codet5-base",
5
  "bias": "none",
 
6
  "eva_config": null,
7
  "exclude_modules": null,
8
  "fan_in_fan_out": false,
9
  "inference_mode": true,
10
+ "init_lora_weights": "gaussian",
11
  "layer_replication": null,
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
  "lora_alpha": 16,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
 
27
  "v"
28
  ],
29
  "task_type": "SEQ_2_SEQ_LM",
 
30
  "use_dora": false,
31
  "use_rslora": false
32
  }
checkpoint-14145/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc6a6711d2467cf8cc0d624e48bbe0ad9250fa2646d0816d7c83796afe2c35d7
3
  size 3558888
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1280f7f2b208c156d2a4a277a7a1a7316c93f88988b5c754c16059bf2e0f289c
3
  size 3558888
checkpoint-14145/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53ddcb136257f96c53597b9ba32ee00c9c8be3408fecbfc3a03475b4df41d500
3
  size 7198906
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03e73f5f4a50e8492ca23abad4e74c002d3e1fe0681df8ea443c93e46d7d8cd0
3
  size 7198906
checkpoint-14145/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f5b751b47b6cccbdd2c32abdde3d32c796c18aaf5c957da0ab861aefc469561
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:551d95fa01a350d649318071ec6c30c9643b1415452525afe82187c8c159941c
3
  size 988
checkpoint-14145/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb9c7879fbb6caacd2e719a636a3b1cb107ccb95a6e44e69d0bbc3b31d131786
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:898d0d2ace2142f45ba707687d167a2ef3ebdcb03bdca2695993e6fe1235bc54
3
  size 1064
checkpoint-14145/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_global_step": 14145,
3
- "best_metric": 0.9718361922106383,
4
- "best_model_checkpoint": "./codet5-qlora-k8s/checkpoint-14145",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 14145,
@@ -11,2026 +11,2041 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.017674089784376106,
14
- "grad_norm": 1.1207759380340576,
15
- "learning_rate": 0.00029898197242841994,
16
- "loss": 3.2886,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.03534817956875221,
21
- "grad_norm": 1.2978123426437378,
22
- "learning_rate": 0.0002979215270413573,
23
- "loss": 1.8567,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.053022269353128315,
28
- "grad_norm": 1.624740719795227,
29
- "learning_rate": 0.0002968610816542948,
30
- "loss": 1.5695,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.07069635913750442,
35
- "grad_norm": 1.7711330652236938,
36
- "learning_rate": 0.0002958006362672322,
37
- "loss": 1.4205,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.08837044892188052,
42
- "grad_norm": 1.62517511844635,
43
- "learning_rate": 0.0002947401908801697,
44
- "loss": 1.2732,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.10604453870625663,
49
- "grad_norm": 2.038139820098877,
50
- "learning_rate": 0.00029367974549310706,
51
- "loss": 1.1913,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.12371862849063273,
56
- "grad_norm": 2.262789487838745,
57
- "learning_rate": 0.00029264050901378576,
58
- "loss": 1.117,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.14139271827500885,
63
- "grad_norm": 3.121687650680542,
64
- "learning_rate": 0.0002915800636267232,
65
- "loss": 1.0202,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.15906680805938495,
70
- "grad_norm": 2.0951812267303467,
71
- "learning_rate": 0.0002905196182396606,
72
- "loss": 0.9499,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.17674089784376104,
77
- "grad_norm": 2.670121192932129,
78
- "learning_rate": 0.00028945917285259806,
79
- "loss": 0.9707,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.19441498762813716,
84
- "grad_norm": 2.3631107807159424,
85
- "learning_rate": 0.00028841993637327676,
86
- "loss": 0.7961,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.21208907741251326,
91
- "grad_norm": 2.10772705078125,
92
- "learning_rate": 0.0002873594909862142,
93
- "loss": 0.8912,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.22976316719688936,
98
- "grad_norm": 2.360686779022217,
99
- "learning_rate": 0.00028629904559915163,
100
- "loss": 0.871,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.24743725698126545,
105
- "grad_norm": 2.191119432449341,
106
- "learning_rate": 0.0002852598091198303,
107
- "loss": 0.758,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.2651113467656416,
112
- "grad_norm": 1.7646818161010742,
113
- "learning_rate": 0.00028419936373276776,
114
- "loss": 0.8244,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.2827854365500177,
119
- "grad_norm": 2.3776354789733887,
120
- "learning_rate": 0.00028313891834570514,
121
- "loss": 0.7664,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 0.30045952633439377,
126
- "grad_norm": 2.8682475090026855,
127
- "learning_rate": 0.00028207847295864263,
128
- "loss": 0.6942,
129
  "step": 850
130
  },
131
  {
132
  "epoch": 0.3181336161187699,
133
- "grad_norm": 2.353091239929199,
134
- "learning_rate": 0.00028101802757158,
135
- "loss": 0.7323,
136
  "step": 900
137
  },
138
  {
139
  "epoch": 0.335807705903146,
140
- "grad_norm": 1.9457337856292725,
141
- "learning_rate": 0.00027995758218451745,
142
- "loss": 0.6474,
143
  "step": 950
144
  },
145
  {
146
  "epoch": 0.3534817956875221,
147
- "grad_norm": 2.510075330734253,
148
- "learning_rate": 0.00027889713679745494,
149
- "loss": 0.6801,
150
  "step": 1000
151
  },
152
  {
153
  "epoch": 0.3711558854718982,
154
- "grad_norm": 1.7497014999389648,
155
- "learning_rate": 0.0002778366914103923,
156
- "loss": 0.656,
157
  "step": 1050
158
  },
159
  {
160
  "epoch": 0.38882997525627433,
161
- "grad_norm": 2.862682342529297,
162
- "learning_rate": 0.0002767762460233298,
163
- "loss": 0.6238,
164
  "step": 1100
165
  },
166
  {
167
  "epoch": 0.4065040650406504,
168
- "grad_norm": 1.998961091041565,
169
- "learning_rate": 0.0002757158006362672,
170
- "loss": 0.6306,
171
  "step": 1150
172
  },
173
  {
174
  "epoch": 0.4241781548250265,
175
- "grad_norm": 1.854942798614502,
176
- "learning_rate": 0.0002746553552492047,
177
- "loss": 0.5689,
178
  "step": 1200
179
  },
180
  {
181
  "epoch": 0.4418522446094026,
182
- "grad_norm": 1.8994203805923462,
183
- "learning_rate": 0.00027359490986214206,
184
- "loss": 0.6595,
185
  "step": 1250
186
  },
187
  {
188
  "epoch": 0.4595263343937787,
189
- "grad_norm": 1.6235908269882202,
190
- "learning_rate": 0.0002725344644750795,
191
- "loss": 0.5665,
192
  "step": 1300
193
  },
194
  {
195
  "epoch": 0.47720042417815484,
196
- "grad_norm": 2.291989803314209,
197
- "learning_rate": 0.00027147401908801693,
198
- "loss": 0.5761,
199
  "step": 1350
200
  },
201
  {
202
  "epoch": 0.4948745139625309,
203
- "grad_norm": 1.4632915258407593,
204
- "learning_rate": 0.00027041357370095437,
205
- "loss": 0.5171,
206
  "step": 1400
207
  },
208
  {
209
  "epoch": 0.512548603746907,
210
- "grad_norm": 2.1687259674072266,
211
- "learning_rate": 0.0002693531283138918,
212
- "loss": 0.6183,
213
  "step": 1450
214
  },
215
  {
216
  "epoch": 0.5302226935312832,
217
- "grad_norm": 1.734108805656433,
218
- "learning_rate": 0.00026829268292682924,
219
- "loss": 0.5411,
220
  "step": 1500
221
  },
222
  {
223
  "epoch": 0.5478967833156593,
224
- "grad_norm": 1.3890644311904907,
225
- "learning_rate": 0.00026723223753976667,
226
- "loss": 0.5092,
227
  "step": 1550
228
  },
229
  {
230
  "epoch": 0.5655708731000354,
231
- "grad_norm": 1.98700749874115,
232
- "learning_rate": 0.0002661717921527041,
233
- "loss": 0.4804,
234
  "step": 1600
235
  },
236
  {
237
  "epoch": 0.5832449628844114,
238
- "grad_norm": 1.1181468963623047,
239
- "learning_rate": 0.00026511134676564154,
240
- "loss": 0.5148,
241
  "step": 1650
242
  },
243
  {
244
  "epoch": 0.6009190526687875,
245
- "grad_norm": 1.7994420528411865,
246
- "learning_rate": 0.000264050901378579,
247
- "loss": 0.4231,
248
  "step": 1700
249
  },
250
  {
251
  "epoch": 0.6185931424531637,
252
- "grad_norm": 2.032198667526245,
253
- "learning_rate": 0.0002629904559915164,
254
- "loss": 0.5106,
255
  "step": 1750
256
  },
257
  {
258
  "epoch": 0.6362672322375398,
259
- "grad_norm": 3.585948944091797,
260
- "learning_rate": 0.00026193001060445385,
261
- "loss": 0.4717,
262
  "step": 1800
263
  },
264
  {
265
  "epoch": 0.6539413220219159,
266
- "grad_norm": 1.8610371351242065,
267
- "learning_rate": 0.0002608695652173913,
268
- "loss": 0.4765,
269
  "step": 1850
270
  },
271
  {
272
  "epoch": 0.671615411806292,
273
- "grad_norm": 1.2324624061584473,
274
- "learning_rate": 0.0002598091198303287,
275
- "loss": 0.4643,
276
  "step": 1900
277
  },
278
  {
279
  "epoch": 0.689289501590668,
280
- "grad_norm": 2.391714572906494,
281
- "learning_rate": 0.00025874867444326615,
282
- "loss": 0.4512,
283
  "step": 1950
284
  },
285
  {
286
  "epoch": 0.7069635913750442,
287
- "grad_norm": 1.8863242864608765,
288
- "learning_rate": 0.0002576882290562036,
289
- "loss": 0.4115,
290
  "step": 2000
291
  },
292
  {
293
  "epoch": 0.7246376811594203,
294
- "grad_norm": 0.7850649356842041,
295
- "learning_rate": 0.000256627783669141,
296
- "loss": 0.4341,
297
  "step": 2050
298
  },
299
  {
300
  "epoch": 0.7423117709437964,
301
- "grad_norm": 1.5869959592819214,
302
- "learning_rate": 0.00025556733828207846,
303
- "loss": 0.4172,
304
  "step": 2100
305
  },
306
  {
307
  "epoch": 0.7599858607281725,
308
- "grad_norm": 1.2584971189498901,
309
- "learning_rate": 0.0002545068928950159,
310
- "loss": 0.4384,
311
  "step": 2150
312
  },
313
  {
314
  "epoch": 0.7776599505125487,
315
- "grad_norm": 2.560710906982422,
316
- "learning_rate": 0.00025344644750795333,
317
- "loss": 0.4558,
318
  "step": 2200
319
  },
320
  {
321
  "epoch": 0.7953340402969247,
322
- "grad_norm": 2.2893359661102295,
323
- "learning_rate": 0.00025238600212089076,
324
- "loss": 0.4345,
325
  "step": 2250
326
  },
327
  {
328
  "epoch": 0.8130081300813008,
329
- "grad_norm": 1.5244982242584229,
330
- "learning_rate": 0.0002513255567338282,
331
- "loss": 0.4071,
332
  "step": 2300
333
  },
334
  {
335
  "epoch": 0.8306822198656769,
336
- "grad_norm": 1.384102463722229,
337
- "learning_rate": 0.00025026511134676563,
338
- "loss": 0.3612,
339
  "step": 2350
340
  },
341
  {
342
  "epoch": 0.848356309650053,
343
- "grad_norm": 1.3080965280532837,
344
- "learning_rate": 0.00024920466595970307,
345
- "loss": 0.3556,
346
  "step": 2400
347
  },
348
  {
349
  "epoch": 0.8660303994344292,
350
- "grad_norm": 1.3324400186538696,
351
- "learning_rate": 0.00024814422057264045,
352
- "loss": 0.3985,
353
  "step": 2450
354
  },
355
  {
356
  "epoch": 0.8837044892188052,
357
- "grad_norm": 1.7705445289611816,
358
- "learning_rate": 0.00024708377518557794,
359
- "loss": 0.3895,
360
  "step": 2500
361
  },
362
  {
363
  "epoch": 0.9013785790031813,
364
- "grad_norm": 1.352480173110962,
365
- "learning_rate": 0.0002460233297985153,
366
- "loss": 0.426,
367
  "step": 2550
368
  },
369
  {
370
  "epoch": 0.9190526687875574,
371
- "grad_norm": 1.479979157447815,
372
- "learning_rate": 0.0002449628844114528,
373
- "loss": 0.4057,
374
  "step": 2600
375
  },
376
  {
377
  "epoch": 0.9367267585719335,
378
- "grad_norm": 2.1380653381347656,
379
- "learning_rate": 0.00024390243902439022,
380
- "loss": 0.3689,
381
  "step": 2650
382
  },
383
  {
384
  "epoch": 0.9544008483563097,
385
- "grad_norm": 1.9099682569503784,
386
- "learning_rate": 0.00024284199363732768,
387
- "loss": 0.3991,
388
  "step": 2700
389
  },
390
  {
391
  "epoch": 0.9720749381406858,
392
- "grad_norm": 1.399566411972046,
393
- "learning_rate": 0.0002417815482502651,
394
- "loss": 0.3412,
395
  "step": 2750
396
  },
397
  {
398
  "epoch": 0.9897490279250618,
399
- "grad_norm": 2.508267879486084,
400
- "learning_rate": 0.00024072110286320252,
401
- "loss": 0.3828,
402
  "step": 2800
403
  },
404
  {
405
  "epoch": 1.0,
406
- "eval_bertscore_f1": 0.9667777874331811,
407
- "eval_bleu": 0.5973566262792636,
408
- "eval_loss": 0.27053505182266235,
409
- "eval_runtime": 1054.1237,
410
- "eval_samples_per_second": 6.132,
411
- "eval_steps_per_second": 0.767,
 
 
 
412
  "step": 2829
413
  },
414
  {
415
  "epoch": 1.007423117709438,
416
- "grad_norm": 1.6967344284057617,
417
- "learning_rate": 0.00023966065747613996,
418
- "loss": 0.3787,
419
  "step": 2850
420
  },
421
  {
422
  "epoch": 1.025097207493814,
423
- "grad_norm": 1.7119196653366089,
424
- "learning_rate": 0.0002386002120890774,
425
- "loss": 0.3507,
426
  "step": 2900
427
  },
428
  {
429
  "epoch": 1.0427712972781902,
430
- "grad_norm": 1.5456138849258423,
431
- "learning_rate": 0.00023753976670201483,
432
- "loss": 0.333,
433
  "step": 2950
434
  },
435
  {
436
  "epoch": 1.0604453870625663,
437
- "grad_norm": 1.3519443273544312,
438
- "learning_rate": 0.00023647932131495226,
439
- "loss": 0.3897,
440
  "step": 3000
441
  },
442
  {
443
  "epoch": 1.0781194768469424,
444
- "grad_norm": 1.4092153310775757,
445
- "learning_rate": 0.0002354188759278897,
446
- "loss": 0.3069,
447
  "step": 3050
448
  },
449
  {
450
  "epoch": 1.0957935666313185,
451
- "grad_norm": 1.67427659034729,
452
- "learning_rate": 0.00023435843054082713,
453
- "loss": 0.3876,
454
  "step": 3100
455
  },
456
  {
457
  "epoch": 1.1134676564156947,
458
- "grad_norm": 0.9288003444671631,
459
- "learning_rate": 0.00023329798515376457,
460
- "loss": 0.3052,
461
  "step": 3150
462
  },
463
  {
464
  "epoch": 1.1311417462000706,
465
- "grad_norm": 2.0493695735931396,
466
- "learning_rate": 0.000232237539766702,
467
- "loss": 0.3419,
468
  "step": 3200
469
  },
470
  {
471
  "epoch": 1.148815835984447,
472
- "grad_norm": 1.3473105430603027,
473
- "learning_rate": 0.0002311770943796394,
474
- "loss": 0.351,
475
  "step": 3250
476
  },
477
  {
478
  "epoch": 1.1664899257688228,
479
- "grad_norm": 2.2063777446746826,
480
- "learning_rate": 0.00023011664899257687,
481
- "loss": 0.3732,
482
  "step": 3300
483
  },
484
  {
485
  "epoch": 1.184164015553199,
486
- "grad_norm": 0.7194732427597046,
487
- "learning_rate": 0.00022905620360551428,
488
- "loss": 0.3098,
489
  "step": 3350
490
  },
491
  {
492
  "epoch": 1.201838105337575,
493
- "grad_norm": 1.8693958520889282,
494
- "learning_rate": 0.00022799575821845174,
495
- "loss": 0.3623,
496
  "step": 3400
497
  },
498
  {
499
  "epoch": 1.2195121951219512,
500
- "grad_norm": 1.7452648878097534,
501
- "learning_rate": 0.00022693531283138915,
502
- "loss": 0.2985,
503
  "step": 3450
504
  },
505
  {
506
  "epoch": 1.2371862849063273,
507
- "grad_norm": 2.7502336502075195,
508
- "learning_rate": 0.00022587486744432661,
509
- "loss": 0.2938,
510
  "step": 3500
511
  },
512
  {
513
  "epoch": 1.2548603746907034,
514
- "grad_norm": 1.0220433473587036,
515
- "learning_rate": 0.00022481442205726402,
516
- "loss": 0.3263,
517
  "step": 3550
518
  },
519
  {
520
  "epoch": 1.2725344644750796,
521
- "grad_norm": 1.1841455698013306,
522
- "learning_rate": 0.00022375397667020146,
523
- "loss": 0.3456,
524
  "step": 3600
525
  },
526
  {
527
  "epoch": 1.2902085542594557,
528
- "grad_norm": 1.1220083236694336,
529
- "learning_rate": 0.0002226935312831389,
530
- "loss": 0.3749,
531
  "step": 3650
532
  },
533
  {
534
  "epoch": 1.3078826440438318,
535
- "grad_norm": 2.557077646255493,
536
- "learning_rate": 0.00022163308589607633,
537
- "loss": 0.3479,
538
  "step": 3700
539
  },
540
  {
541
  "epoch": 1.3255567338282077,
542
- "grad_norm": 1.672131061553955,
543
- "learning_rate": 0.00022057264050901376,
544
- "loss": 0.3371,
545
  "step": 3750
546
  },
547
  {
548
  "epoch": 1.343230823612584,
549
- "grad_norm": 1.5530970096588135,
550
- "learning_rate": 0.0002195121951219512,
551
- "loss": 0.3062,
552
  "step": 3800
553
  },
554
  {
555
  "epoch": 1.36090491339696,
556
- "grad_norm": 0.8587738871574402,
557
- "learning_rate": 0.00021845174973488866,
558
- "loss": 0.3458,
559
  "step": 3850
560
  },
561
  {
562
  "epoch": 1.378579003181336,
563
- "grad_norm": 1.2779722213745117,
564
- "learning_rate": 0.00021739130434782607,
565
- "loss": 0.3582,
566
  "step": 3900
567
  },
568
  {
569
  "epoch": 1.3962530929657122,
570
- "grad_norm": 1.7616783380508423,
571
- "learning_rate": 0.00021633085896076348,
572
- "loss": 0.2999,
573
  "step": 3950
574
  },
575
  {
576
  "epoch": 1.4139271827500883,
577
- "grad_norm": 1.2923225164413452,
578
- "learning_rate": 0.00021527041357370094,
579
- "loss": 0.3079,
580
  "step": 4000
581
  },
582
  {
583
  "epoch": 1.4316012725344645,
584
- "grad_norm": 0.7930673360824585,
585
- "learning_rate": 0.00021420996818663835,
586
- "loss": 0.2973,
587
  "step": 4050
588
  },
589
  {
590
  "epoch": 1.4492753623188406,
591
- "grad_norm": 1.5622656345367432,
592
- "learning_rate": 0.0002131495227995758,
593
- "loss": 0.291,
594
  "step": 4100
595
  },
596
  {
597
  "epoch": 1.4669494521032167,
598
- "grad_norm": 0.8834390640258789,
599
- "learning_rate": 0.00021208907741251324,
600
- "loss": 0.2691,
601
  "step": 4150
602
  },
603
  {
604
  "epoch": 1.4846235418875928,
605
- "grad_norm": 1.2596232891082764,
606
- "learning_rate": 0.00021102863202545068,
607
- "loss": 0.247,
608
  "step": 4200
609
  },
610
  {
611
  "epoch": 1.502297631671969,
612
- "grad_norm": 0.7010456323623657,
613
- "learning_rate": 0.00020996818663838811,
614
- "loss": 0.3019,
615
  "step": 4250
616
  },
617
  {
618
  "epoch": 1.5199717214563448,
619
- "grad_norm": 1.071253776550293,
620
- "learning_rate": 0.00020890774125132552,
621
- "loss": 0.2447,
622
  "step": 4300
623
  },
624
  {
625
  "epoch": 1.5376458112407212,
626
- "grad_norm": 0.63275545835495,
627
- "learning_rate": 0.00020784729586426298,
628
- "loss": 0.246,
629
  "step": 4350
630
  },
631
  {
632
  "epoch": 1.555319901025097,
633
- "grad_norm": 1.0832668542861938,
634
- "learning_rate": 0.0002067868504772004,
635
- "loss": 0.249,
636
  "step": 4400
637
  },
638
  {
639
  "epoch": 1.5729939908094734,
640
- "grad_norm": 1.0748353004455566,
641
- "learning_rate": 0.00020572640509013785,
642
- "loss": 0.2585,
643
  "step": 4450
644
  },
645
  {
646
  "epoch": 1.5906680805938493,
647
- "grad_norm": 1.2410573959350586,
648
- "learning_rate": 0.00020466595970307526,
649
- "loss": 0.2821,
650
  "step": 4500
651
  },
652
  {
653
  "epoch": 1.6083421703782255,
654
- "grad_norm": 1.8322285413742065,
655
- "learning_rate": 0.00020360551431601272,
656
- "loss": 0.2642,
657
  "step": 4550
658
  },
659
  {
660
  "epoch": 1.6260162601626016,
661
- "grad_norm": 1.5231540203094482,
662
- "learning_rate": 0.00020254506892895013,
663
- "loss": 0.329,
664
  "step": 4600
665
  },
666
  {
667
  "epoch": 1.6436903499469777,
668
- "grad_norm": 0.8996387124061584,
669
- "learning_rate": 0.0002014846235418876,
670
- "loss": 0.2822,
671
  "step": 4650
672
  },
673
  {
674
  "epoch": 1.6613644397313538,
675
- "grad_norm": 1.107340693473816,
676
- "learning_rate": 0.000200424178154825,
677
- "loss": 0.2647,
678
  "step": 4700
679
  },
680
  {
681
  "epoch": 1.67903852951573,
682
- "grad_norm": 1.44370698928833,
683
- "learning_rate": 0.00019936373276776244,
684
- "loss": 0.3281,
685
  "step": 4750
686
  },
687
  {
688
  "epoch": 1.696712619300106,
689
- "grad_norm": 1.433866024017334,
690
- "learning_rate": 0.00019830328738069987,
691
- "loss": 0.2867,
692
  "step": 4800
693
  },
694
  {
695
  "epoch": 1.714386709084482,
696
- "grad_norm": 0.7778879404067993,
697
- "learning_rate": 0.0001972428419936373,
698
- "loss": 0.2363,
699
  "step": 4850
700
  },
701
  {
702
  "epoch": 1.7320607988688583,
703
- "grad_norm": 1.0693784952163696,
704
- "learning_rate": 0.00019618239660657474,
705
- "loss": 0.2989,
706
  "step": 4900
707
  },
708
  {
709
  "epoch": 1.7497348886532342,
710
- "grad_norm": 0.9680020213127136,
711
- "learning_rate": 0.00019512195121951218,
712
- "loss": 0.2512,
713
  "step": 4950
714
  },
715
  {
716
  "epoch": 1.7674089784376106,
717
- "grad_norm": 0.9300338625907898,
718
- "learning_rate": 0.00019406150583244961,
719
- "loss": 0.2814,
720
  "step": 5000
721
  },
722
  {
723
  "epoch": 1.7850830682219865,
724
- "grad_norm": 1.6086584329605103,
725
- "learning_rate": 0.00019300106044538705,
726
- "loss": 0.2895,
727
  "step": 5050
728
  },
729
  {
730
  "epoch": 1.8027571580063628,
731
- "grad_norm": 1.522153615951538,
732
- "learning_rate": 0.00019194061505832446,
733
- "loss": 0.2804,
734
  "step": 5100
735
  },
736
  {
737
  "epoch": 1.8204312477907387,
738
- "grad_norm": 1.3292605876922607,
739
- "learning_rate": 0.00019088016967126192,
740
- "loss": 0.2676,
741
  "step": 5150
742
  },
743
  {
744
  "epoch": 1.8381053375751149,
745
- "grad_norm": 1.0950225591659546,
746
- "learning_rate": 0.00018981972428419933,
747
- "loss": 0.2991,
748
  "step": 5200
749
  },
750
  {
751
  "epoch": 1.855779427359491,
752
- "grad_norm": 1.8333910703659058,
753
- "learning_rate": 0.0001887592788971368,
754
- "loss": 0.2742,
755
  "step": 5250
756
  },
757
  {
758
  "epoch": 1.873453517143867,
759
- "grad_norm": 1.447016716003418,
760
- "learning_rate": 0.0001876988335100742,
761
- "loss": 0.2125,
762
  "step": 5300
763
  },
764
  {
765
  "epoch": 1.8911276069282432,
766
- "grad_norm": 1.0409213304519653,
767
- "learning_rate": 0.00018663838812301166,
768
- "loss": 0.2372,
769
  "step": 5350
770
  },
771
  {
772
  "epoch": 1.9088016967126193,
773
- "grad_norm": 0.5701714158058167,
774
- "learning_rate": 0.00018557794273594907,
775
- "loss": 0.2332,
776
  "step": 5400
777
  },
778
  {
779
  "epoch": 1.9264757864969955,
780
- "grad_norm": 1.0092428922653198,
781
- "learning_rate": 0.0001845174973488865,
782
- "loss": 0.2897,
783
  "step": 5450
784
  },
785
  {
786
  "epoch": 1.9441498762813714,
787
- "grad_norm": 1.031217098236084,
788
- "learning_rate": 0.00018345705196182397,
789
- "loss": 0.2722,
790
  "step": 5500
791
  },
792
  {
793
  "epoch": 1.9618239660657477,
794
- "grad_norm": 1.2638362646102905,
795
- "learning_rate": 0.00018239660657476137,
796
- "loss": 0.2505,
797
  "step": 5550
798
  },
799
  {
800
  "epoch": 1.9794980558501236,
801
- "grad_norm": 1.3998290300369263,
802
- "learning_rate": 0.00018133616118769884,
803
- "loss": 0.2772,
804
  "step": 5600
805
  },
806
  {
807
  "epoch": 1.9971721456345,
808
- "grad_norm": 1.4681320190429688,
809
- "learning_rate": 0.00018027571580063624,
810
- "loss": 0.25,
811
  "step": 5650
812
  },
813
  {
814
  "epoch": 2.0,
815
- "eval_bertscore_f1": 0.970109825833968,
816
- "eval_bleu": 0.6229920961802436,
817
- "eval_loss": 0.1904931217432022,
818
- "eval_runtime": 1018.8038,
819
- "eval_samples_per_second": 6.345,
820
- "eval_steps_per_second": 0.793,
 
 
 
821
  "step": 5658
822
  },
823
  {
824
  "epoch": 2.014846235418876,
825
- "grad_norm": 1.3499983549118042,
826
- "learning_rate": 0.0001792152704135737,
827
- "loss": 0.2412,
828
  "step": 5700
829
  },
830
  {
831
  "epoch": 2.032520325203252,
832
- "grad_norm": 1.2155545949935913,
833
- "learning_rate": 0.00017815482502651111,
834
- "loss": 0.2919,
835
  "step": 5750
836
  },
837
  {
838
  "epoch": 2.050194414987628,
839
- "grad_norm": 0.9294681549072266,
840
- "learning_rate": 0.00017709437963944858,
841
- "loss": 0.2191,
842
  "step": 5800
843
  },
844
  {
845
  "epoch": 2.0678685047720045,
846
- "grad_norm": 0.8069599270820618,
847
- "learning_rate": 0.00017603393425238598,
848
- "loss": 0.228,
849
  "step": 5850
850
  },
851
  {
852
  "epoch": 2.0855425945563804,
853
- "grad_norm": 1.1825474500656128,
854
- "learning_rate": 0.00017497348886532342,
855
- "loss": 0.2422,
856
  "step": 5900
857
  },
858
  {
859
  "epoch": 2.1032166843407563,
860
- "grad_norm": 1.2947015762329102,
861
- "learning_rate": 0.00017391304347826085,
862
- "loss": 0.2333,
863
  "step": 5950
864
  },
865
  {
866
  "epoch": 2.1208907741251326,
867
- "grad_norm": 1.0622906684875488,
868
- "learning_rate": 0.0001728525980911983,
869
- "loss": 0.2029,
870
  "step": 6000
871
  },
872
  {
873
  "epoch": 2.1385648639095085,
874
- "grad_norm": 0.8785162568092346,
875
- "learning_rate": 0.00017179215270413572,
876
- "loss": 0.2039,
877
  "step": 6050
878
  },
879
  {
880
  "epoch": 2.156238953693885,
881
- "grad_norm": 0.3702610433101654,
882
- "learning_rate": 0.00017073170731707316,
883
- "loss": 0.2631,
884
  "step": 6100
885
  },
886
  {
887
  "epoch": 2.1739130434782608,
888
- "grad_norm": 1.0092154741287231,
889
- "learning_rate": 0.0001696712619300106,
890
- "loss": 0.2325,
891
  "step": 6150
892
  },
893
  {
894
  "epoch": 2.191587133262637,
895
- "grad_norm": 1.648000955581665,
896
- "learning_rate": 0.00016861081654294803,
897
- "loss": 0.2501,
898
  "step": 6200
899
  },
900
  {
901
  "epoch": 2.209261223047013,
902
- "grad_norm": 0.979069173336029,
903
- "learning_rate": 0.00016755037115588544,
904
- "loss": 0.256,
905
  "step": 6250
906
  },
907
  {
908
  "epoch": 2.2269353128313893,
909
- "grad_norm": 1.459558129310608,
910
- "learning_rate": 0.0001664899257688229,
911
- "loss": 0.2603,
912
  "step": 6300
913
  },
914
  {
915
  "epoch": 2.2446094026157652,
916
- "grad_norm": 1.5793472528457642,
917
- "learning_rate": 0.0001654294803817603,
918
- "loss": 0.2564,
919
  "step": 6350
920
  },
921
  {
922
  "epoch": 2.262283492400141,
923
- "grad_norm": 1.1787140369415283,
924
- "learning_rate": 0.00016436903499469777,
925
- "loss": 0.2782,
926
  "step": 6400
927
  },
928
  {
929
  "epoch": 2.2799575821845175,
930
- "grad_norm": 1.041374683380127,
931
- "learning_rate": 0.00016330858960763518,
932
- "loss": 0.2331,
933
  "step": 6450
934
  },
935
  {
936
  "epoch": 2.297631671968894,
937
- "grad_norm": 0.7799555063247681,
938
- "learning_rate": 0.00016224814422057264,
939
- "loss": 0.2338,
940
  "step": 6500
941
  },
942
  {
943
  "epoch": 2.3153057617532697,
944
- "grad_norm": 1.4405689239501953,
945
- "learning_rate": 0.00016118769883351005,
946
- "loss": 0.2737,
947
  "step": 6550
948
  },
949
  {
950
  "epoch": 2.3329798515376456,
951
- "grad_norm": 0.979608416557312,
952
- "learning_rate": 0.00016012725344644748,
953
- "loss": 0.2495,
954
  "step": 6600
955
  },
956
  {
957
  "epoch": 2.350653941322022,
958
- "grad_norm": 0.9300618171691895,
959
- "learning_rate": 0.00015906680805938492,
960
- "loss": 0.2157,
961
  "step": 6650
962
  },
963
  {
964
  "epoch": 2.368328031106398,
965
- "grad_norm": 0.8745370507240295,
966
- "learning_rate": 0.00015800636267232235,
967
- "loss": 0.2837,
968
  "step": 6700
969
  },
970
  {
971
  "epoch": 2.3860021208907742,
972
- "grad_norm": 0.9898168444633484,
973
- "learning_rate": 0.00015694591728525982,
974
- "loss": 0.221,
975
  "step": 6750
976
  },
977
  {
978
  "epoch": 2.40367621067515,
979
- "grad_norm": 0.8933513760566711,
980
- "learning_rate": 0.00015588547189819722,
981
- "loss": 0.1994,
982
  "step": 6800
983
  },
984
  {
985
  "epoch": 2.4213503004595265,
986
- "grad_norm": 1.7144904136657715,
987
- "learning_rate": 0.0001548250265111347,
988
- "loss": 0.2429,
989
  "step": 6850
990
  },
991
  {
992
  "epoch": 2.4390243902439024,
993
- "grad_norm": 1.5800135135650635,
994
- "learning_rate": 0.0001537645811240721,
995
- "loss": 0.2284,
996
  "step": 6900
997
  },
998
  {
999
  "epoch": 2.4566984800282787,
1000
- "grad_norm": 1.0567731857299805,
1001
- "learning_rate": 0.0001527041357370095,
1002
- "loss": 0.2028,
1003
  "step": 6950
1004
  },
1005
  {
1006
  "epoch": 2.4743725698126546,
1007
- "grad_norm": 0.59196537733078,
1008
- "learning_rate": 0.00015164369034994697,
1009
- "loss": 0.228,
1010
  "step": 7000
1011
  },
1012
  {
1013
  "epoch": 2.4920466595970305,
1014
- "grad_norm": 1.0257049798965454,
1015
- "learning_rate": 0.00015058324496288437,
1016
- "loss": 0.2196,
1017
  "step": 7050
1018
  },
1019
  {
1020
  "epoch": 2.509720749381407,
1021
- "grad_norm": 1.500623345375061,
1022
- "learning_rate": 0.00014952279957582184,
1023
- "loss": 0.2351,
1024
  "step": 7100
1025
  },
1026
  {
1027
  "epoch": 2.5273948391657832,
1028
- "grad_norm": 1.1046085357666016,
1029
- "learning_rate": 0.00014846235418875927,
1030
- "loss": 0.2595,
1031
  "step": 7150
1032
  },
1033
  {
1034
  "epoch": 2.545068928950159,
1035
- "grad_norm": 1.2226991653442383,
1036
- "learning_rate": 0.0001474019088016967,
1037
- "loss": 0.1914,
1038
  "step": 7200
1039
  },
1040
  {
1041
  "epoch": 2.562743018734535,
1042
- "grad_norm": 0.6742298007011414,
1043
- "learning_rate": 0.00014634146341463414,
1044
- "loss": 0.2096,
1045
  "step": 7250
1046
  },
1047
  {
1048
  "epoch": 2.5804171085189114,
1049
- "grad_norm": 1.5504461526870728,
1050
- "learning_rate": 0.00014528101802757158,
1051
- "loss": 0.2051,
1052
  "step": 7300
1053
  },
1054
  {
1055
  "epoch": 2.5980911983032873,
1056
- "grad_norm": 0.9681800603866577,
1057
- "learning_rate": 0.000144220572640509,
1058
- "loss": 0.2327,
1059
  "step": 7350
1060
  },
1061
  {
1062
  "epoch": 2.6157652880876636,
1063
- "grad_norm": 0.9383839964866638,
1064
- "learning_rate": 0.00014316012725344645,
1065
- "loss": 0.2344,
1066
  "step": 7400
1067
  },
1068
  {
1069
  "epoch": 2.6334393778720395,
1070
- "grad_norm": 0.6154807209968567,
1071
- "learning_rate": 0.00014209968186638388,
1072
- "loss": 0.2383,
1073
  "step": 7450
1074
  },
1075
  {
1076
  "epoch": 2.6511134676564154,
1077
- "grad_norm": 1.2676986455917358,
1078
- "learning_rate": 0.00014103923647932132,
1079
- "loss": 0.2257,
1080
  "step": 7500
1081
  },
1082
  {
1083
  "epoch": 2.6687875574407918,
1084
- "grad_norm": 1.183440089225769,
1085
- "learning_rate": 0.00013997879109225872,
1086
- "loss": 0.2102,
1087
  "step": 7550
1088
  },
1089
  {
1090
  "epoch": 2.686461647225168,
1091
- "grad_norm": 0.7244306802749634,
1092
- "learning_rate": 0.00013891834570519616,
1093
- "loss": 0.2146,
1094
  "step": 7600
1095
  },
1096
  {
1097
  "epoch": 2.704135737009544,
1098
- "grad_norm": 1.187232494354248,
1099
- "learning_rate": 0.0001378579003181336,
1100
- "loss": 0.2119,
1101
  "step": 7650
1102
  },
1103
  {
1104
  "epoch": 2.72180982679392,
1105
- "grad_norm": 1.4510794878005981,
1106
- "learning_rate": 0.00013679745493107103,
1107
- "loss": 0.1916,
1108
  "step": 7700
1109
  },
1110
  {
1111
  "epoch": 2.7394839165782963,
1112
- "grad_norm": 1.383832335472107,
1113
- "learning_rate": 0.00013573700954400847,
1114
- "loss": 0.2179,
1115
  "step": 7750
1116
  },
1117
  {
1118
  "epoch": 2.757158006362672,
1119
- "grad_norm": 0.9274504780769348,
1120
- "learning_rate": 0.0001346765641569459,
1121
- "loss": 0.199,
1122
  "step": 7800
1123
  },
1124
  {
1125
  "epoch": 2.7748320961470485,
1126
- "grad_norm": 2.6429216861724854,
1127
- "learning_rate": 0.00013361611876988334,
1128
- "loss": 0.2407,
1129
  "step": 7850
1130
  },
1131
  {
1132
  "epoch": 2.7925061859314244,
1133
- "grad_norm": 1.3947652578353882,
1134
- "learning_rate": 0.00013255567338282077,
1135
- "loss": 0.2019,
1136
  "step": 7900
1137
  },
1138
  {
1139
  "epoch": 2.8101802757158008,
1140
- "grad_norm": 1.148478627204895,
1141
- "learning_rate": 0.0001314952279957582,
1142
- "loss": 0.205,
1143
  "step": 7950
1144
  },
1145
  {
1146
  "epoch": 2.8278543655001767,
1147
- "grad_norm": 1.1087610721588135,
1148
- "learning_rate": 0.00013043478260869564,
1149
- "loss": 0.2527,
1150
  "step": 8000
1151
  },
1152
  {
1153
  "epoch": 2.845528455284553,
1154
- "grad_norm": 1.4348084926605225,
1155
- "learning_rate": 0.00012937433722163308,
1156
- "loss": 0.2465,
1157
  "step": 8050
1158
  },
1159
  {
1160
  "epoch": 2.863202545068929,
1161
- "grad_norm": 1.2600926160812378,
1162
- "learning_rate": 0.0001283138918345705,
1163
- "loss": 0.1699,
1164
  "step": 8100
1165
  },
1166
  {
1167
  "epoch": 2.880876634853305,
1168
- "grad_norm": 0.8724793195724487,
1169
- "learning_rate": 0.00012725344644750795,
1170
- "loss": 0.2257,
1171
  "step": 8150
1172
  },
1173
  {
1174
  "epoch": 2.898550724637681,
1175
- "grad_norm": 1.5324125289916992,
1176
- "learning_rate": 0.00012619300106044538,
1177
- "loss": 0.2002,
1178
  "step": 8200
1179
  },
1180
  {
1181
  "epoch": 2.9162248144220575,
1182
- "grad_norm": 1.0066156387329102,
1183
- "learning_rate": 0.00012513255567338282,
1184
- "loss": 0.192,
1185
  "step": 8250
1186
  },
1187
  {
1188
  "epoch": 2.9338989042064334,
1189
- "grad_norm": 0.4273667633533478,
1190
- "learning_rate": 0.00012407211028632022,
1191
- "loss": 0.1758,
1192
  "step": 8300
1193
  },
1194
  {
1195
  "epoch": 2.9515729939908093,
1196
- "grad_norm": 0.6536590456962585,
1197
- "learning_rate": 0.00012301166489925766,
1198
- "loss": 0.1905,
1199
  "step": 8350
1200
  },
1201
  {
1202
  "epoch": 2.9692470837751856,
1203
- "grad_norm": 0.6973742246627808,
1204
- "learning_rate": 0.00012195121951219511,
1205
- "loss": 0.2116,
1206
  "step": 8400
1207
  },
1208
  {
1209
  "epoch": 2.9869211735595615,
1210
- "grad_norm": 0.9764792919158936,
1211
- "learning_rate": 0.00012089077412513254,
1212
- "loss": 0.2098,
1213
  "step": 8450
1214
  },
1215
  {
1216
  "epoch": 3.0,
1217
- "eval_bertscore_f1": 0.9710737692450385,
1218
- "eval_bleu": 0.6330045835354601,
1219
- "eval_loss": 0.15814107656478882,
1220
- "eval_runtime": 1020.076,
1221
- "eval_samples_per_second": 6.337,
1222
- "eval_steps_per_second": 0.792,
 
 
 
1223
  "step": 8487
1224
  },
1225
  {
1226
  "epoch": 3.004595263343938,
1227
- "grad_norm": 1.2752796411514282,
1228
- "learning_rate": 0.00011983032873806998,
1229
- "loss": 0.1815,
1230
  "step": 8500
1231
  },
1232
  {
1233
  "epoch": 3.022269353128314,
1234
- "grad_norm": 1.1779205799102783,
1235
- "learning_rate": 0.00011876988335100741,
1236
- "loss": 0.2125,
1237
  "step": 8550
1238
  },
1239
  {
1240
  "epoch": 3.03994344291269,
1241
- "grad_norm": 0.7505296468734741,
1242
- "learning_rate": 0.00011770943796394485,
1243
- "loss": 0.1989,
1244
  "step": 8600
1245
  },
1246
  {
1247
  "epoch": 3.057617532697066,
1248
- "grad_norm": 4.621654033660889,
1249
- "learning_rate": 0.00011664899257688228,
1250
- "loss": 0.2015,
1251
  "step": 8650
1252
  },
1253
  {
1254
  "epoch": 3.0752916224814424,
1255
- "grad_norm": 0.8159476518630981,
1256
- "learning_rate": 0.0001155885471898197,
1257
- "loss": 0.1685,
1258
  "step": 8700
1259
  },
1260
  {
1261
  "epoch": 3.0929657122658183,
1262
- "grad_norm": 1.5748199224472046,
1263
- "learning_rate": 0.00011452810180275714,
1264
- "loss": 0.1748,
1265
  "step": 8750
1266
  },
1267
  {
1268
  "epoch": 3.110639802050194,
1269
- "grad_norm": 2.3961069583892822,
1270
- "learning_rate": 0.00011346765641569458,
1271
- "loss": 0.2439,
1272
  "step": 8800
1273
  },
1274
  {
1275
  "epoch": 3.1283138918345705,
1276
- "grad_norm": 1.1994341611862183,
1277
- "learning_rate": 0.00011240721102863201,
1278
- "loss": 0.1819,
1279
  "step": 8850
1280
  },
1281
  {
1282
  "epoch": 3.1459879816189464,
1283
- "grad_norm": 1.4692330360412598,
1284
- "learning_rate": 0.00011134676564156945,
1285
- "loss": 0.1757,
1286
  "step": 8900
1287
  },
1288
  {
1289
  "epoch": 3.163662071403323,
1290
- "grad_norm": 0.559505820274353,
1291
- "learning_rate": 0.00011028632025450688,
1292
- "loss": 0.1832,
1293
  "step": 8950
1294
  },
1295
  {
1296
  "epoch": 3.1813361611876987,
1297
- "grad_norm": 0.608403205871582,
1298
- "learning_rate": 0.00010922587486744433,
1299
- "loss": 0.2094,
1300
  "step": 9000
1301
  },
1302
  {
1303
  "epoch": 3.199010250972075,
1304
- "grad_norm": 0.9615042805671692,
1305
- "learning_rate": 0.00010816542948038174,
1306
- "loss": 0.1868,
1307
  "step": 9050
1308
  },
1309
  {
1310
  "epoch": 3.216684340756451,
1311
- "grad_norm": 1.3172391653060913,
1312
- "learning_rate": 0.00010710498409331917,
1313
- "loss": 0.2161,
1314
  "step": 9100
1315
  },
1316
  {
1317
  "epoch": 3.2343584305408273,
1318
- "grad_norm": 1.120377779006958,
1319
- "learning_rate": 0.00010604453870625662,
1320
- "loss": 0.1621,
1321
  "step": 9150
1322
  },
1323
  {
1324
  "epoch": 3.252032520325203,
1325
- "grad_norm": 1.7331315279006958,
1326
- "learning_rate": 0.00010498409331919406,
1327
- "loss": 0.2172,
1328
  "step": 9200
1329
  },
1330
  {
1331
  "epoch": 3.2697066101095795,
1332
- "grad_norm": 1.0575073957443237,
1333
- "learning_rate": 0.00010392364793213149,
1334
- "loss": 0.2013,
1335
  "step": 9250
1336
  },
1337
  {
1338
  "epoch": 3.2873806998939554,
1339
- "grad_norm": 0.9054147601127625,
1340
- "learning_rate": 0.00010286320254506893,
1341
- "loss": 0.1648,
1342
  "step": 9300
1343
  },
1344
  {
1345
  "epoch": 3.3050547896783318,
1346
- "grad_norm": 1.450341820716858,
1347
- "learning_rate": 0.00010180275715800636,
1348
- "loss": 0.1753,
1349
  "step": 9350
1350
  },
1351
  {
1352
  "epoch": 3.3227288794627077,
1353
- "grad_norm": 1.1002529859542847,
1354
- "learning_rate": 0.0001007423117709438,
1355
- "loss": 0.1863,
1356
  "step": 9400
1357
  },
1358
  {
1359
  "epoch": 3.3404029692470836,
1360
- "grad_norm": 0.9334053993225098,
1361
- "learning_rate": 9.968186638388122e-05,
1362
- "loss": 0.1702,
1363
  "step": 9450
1364
  },
1365
  {
1366
  "epoch": 3.35807705903146,
1367
- "grad_norm": 0.8281420469284058,
1368
- "learning_rate": 9.862142099681865e-05,
1369
- "loss": 0.2526,
1370
  "step": 9500
1371
  },
1372
  {
1373
  "epoch": 3.375751148815836,
1374
- "grad_norm": 1.2539646625518799,
1375
- "learning_rate": 9.756097560975609e-05,
1376
- "loss": 0.1894,
1377
  "step": 9550
1378
  },
1379
  {
1380
  "epoch": 3.393425238600212,
1381
- "grad_norm": 0.949944794178009,
1382
- "learning_rate": 9.650053022269352e-05,
1383
- "loss": 0.1936,
1384
  "step": 9600
1385
  },
1386
  {
1387
  "epoch": 3.411099328384588,
1388
- "grad_norm": 1.2891144752502441,
1389
- "learning_rate": 9.544008483563096e-05,
1390
- "loss": 0.1865,
1391
  "step": 9650
1392
  },
1393
  {
1394
  "epoch": 3.4287734181689644,
1395
- "grad_norm": 0.5977984070777893,
1396
- "learning_rate": 9.43796394485684e-05,
1397
- "loss": 0.1484,
1398
  "step": 9700
1399
  },
1400
  {
1401
  "epoch": 3.4464475079533403,
1402
- "grad_norm": 0.3996190130710602,
1403
- "learning_rate": 9.331919406150583e-05,
1404
- "loss": 0.2573,
1405
  "step": 9750
1406
  },
1407
  {
1408
  "epoch": 3.4641215977377167,
1409
- "grad_norm": 1.8965319395065308,
1410
- "learning_rate": 9.225874867444325e-05,
1411
- "loss": 0.195,
1412
  "step": 9800
1413
  },
1414
  {
1415
  "epoch": 3.4817956875220926,
1416
- "grad_norm": 1.197365403175354,
1417
- "learning_rate": 9.119830328738069e-05,
1418
- "loss": 0.2083,
1419
  "step": 9850
1420
  },
1421
  {
1422
  "epoch": 3.499469777306469,
1423
- "grad_norm": 0.6886301040649414,
1424
- "learning_rate": 9.013785790031812e-05,
1425
- "loss": 0.2112,
1426
  "step": 9900
1427
  },
1428
  {
1429
  "epoch": 3.517143867090845,
1430
- "grad_norm": 0.677227795124054,
1431
- "learning_rate": 8.907741251325556e-05,
1432
- "loss": 0.1789,
1433
  "step": 9950
1434
  },
1435
  {
1436
  "epoch": 3.534817956875221,
1437
- "grad_norm": 1.131480097770691,
1438
- "learning_rate": 8.801696712619299e-05,
1439
- "loss": 0.1857,
1440
  "step": 10000
1441
  },
1442
  {
1443
  "epoch": 3.552492046659597,
1444
- "grad_norm": 1.6898012161254883,
1445
- "learning_rate": 8.695652173913043e-05,
1446
- "loss": 0.197,
1447
  "step": 10050
1448
  },
1449
  {
1450
  "epoch": 3.570166136443973,
1451
- "grad_norm": 1.6000021696090698,
1452
- "learning_rate": 8.589607635206786e-05,
1453
- "loss": 0.2028,
1454
  "step": 10100
1455
  },
1456
  {
1457
  "epoch": 3.5878402262283493,
1458
- "grad_norm": 0.59913170337677,
1459
- "learning_rate": 8.48356309650053e-05,
1460
- "loss": 0.2056,
1461
  "step": 10150
1462
  },
1463
  {
1464
  "epoch": 3.605514316012725,
1465
- "grad_norm": 1.2439507246017456,
1466
- "learning_rate": 8.377518557794272e-05,
1467
- "loss": 0.264,
1468
  "step": 10200
1469
  },
1470
  {
1471
  "epoch": 3.6231884057971016,
1472
- "grad_norm": 1.2470340728759766,
1473
- "learning_rate": 8.271474019088015e-05,
1474
- "loss": 0.1818,
1475
  "step": 10250
1476
  },
1477
  {
1478
  "epoch": 3.6408624955814775,
1479
- "grad_norm": 0.8461691737174988,
1480
- "learning_rate": 8.165429480381759e-05,
1481
- "loss": 0.1893,
1482
  "step": 10300
1483
  },
1484
  {
1485
  "epoch": 3.658536585365854,
1486
- "grad_norm": 1.161589503288269,
1487
- "learning_rate": 8.059384941675502e-05,
1488
- "loss": 0.183,
1489
  "step": 10350
1490
  },
1491
  {
1492
  "epoch": 3.6762106751502297,
1493
- "grad_norm": 1.1066420078277588,
1494
- "learning_rate": 7.953340402969246e-05,
1495
- "loss": 0.2059,
1496
  "step": 10400
1497
  },
1498
  {
1499
  "epoch": 3.693884764934606,
1500
- "grad_norm": 0.04225541278719902,
1501
- "learning_rate": 7.847295864262991e-05,
1502
- "loss": 0.1579,
1503
  "step": 10450
1504
  },
1505
  {
1506
  "epoch": 3.711558854718982,
1507
- "grad_norm": 2.536710500717163,
1508
- "learning_rate": 7.741251325556734e-05,
1509
- "loss": 0.2092,
1510
  "step": 10500
1511
  },
1512
  {
1513
  "epoch": 3.729232944503358,
1514
- "grad_norm": 0.49366581439971924,
1515
- "learning_rate": 7.635206786850475e-05,
1516
- "loss": 0.2148,
1517
  "step": 10550
1518
  },
1519
  {
1520
  "epoch": 3.746907034287734,
1521
- "grad_norm": 0.5715583562850952,
1522
- "learning_rate": 7.529162248144219e-05,
1523
- "loss": 0.1697,
1524
  "step": 10600
1525
  },
1526
  {
1527
  "epoch": 3.7645811240721105,
1528
- "grad_norm": 0.6103105545043945,
1529
- "learning_rate": 7.423117709437964e-05,
1530
- "loss": 0.1631,
1531
  "step": 10650
1532
  },
1533
  {
1534
  "epoch": 3.7822552138564864,
1535
- "grad_norm": 0.6263866424560547,
1536
- "learning_rate": 7.317073170731707e-05,
1537
- "loss": 0.211,
1538
  "step": 10700
1539
  },
1540
  {
1541
  "epoch": 3.7999293036408623,
1542
- "grad_norm": 2.3304359912872314,
1543
- "learning_rate": 7.21102863202545e-05,
1544
- "loss": 0.2101,
1545
  "step": 10750
1546
  },
1547
  {
1548
  "epoch": 3.8176033934252387,
1549
- "grad_norm": 0.96124666929245,
1550
- "learning_rate": 7.104984093319194e-05,
1551
- "loss": 0.2033,
1552
  "step": 10800
1553
  },
1554
  {
1555
  "epoch": 3.8352774832096146,
1556
- "grad_norm": 0.8838850855827332,
1557
- "learning_rate": 6.998939554612936e-05,
1558
- "loss": 0.1686,
1559
  "step": 10850
1560
  },
1561
  {
1562
  "epoch": 3.852951572993991,
1563
- "grad_norm": 0.40561985969543457,
1564
- "learning_rate": 6.89289501590668e-05,
1565
- "loss": 0.1584,
1566
  "step": 10900
1567
  },
1568
  {
1569
  "epoch": 3.870625662778367,
1570
- "grad_norm": 0.1844971626996994,
1571
- "learning_rate": 6.788971367974548e-05,
1572
- "loss": 0.1804,
1573
  "step": 10950
1574
  },
1575
  {
1576
  "epoch": 3.888299752562743,
1577
- "grad_norm": 0.16500629484653473,
1578
- "learning_rate": 6.682926829268293e-05,
1579
- "loss": 0.1573,
1580
  "step": 11000
1581
  },
1582
  {
1583
  "epoch": 3.905973842347119,
1584
- "grad_norm": 1.1422902345657349,
1585
- "learning_rate": 6.576882290562035e-05,
1586
- "loss": 0.168,
1587
  "step": 11050
1588
  },
1589
  {
1590
  "epoch": 3.9236479321314954,
1591
- "grad_norm": 1.0575031042099,
1592
- "learning_rate": 6.470837751855779e-05,
1593
- "loss": 0.1839,
1594
  "step": 11100
1595
  },
1596
  {
1597
  "epoch": 3.9413220219158713,
1598
- "grad_norm": 1.222320318222046,
1599
- "learning_rate": 6.364793213149522e-05,
1600
- "loss": 0.184,
1601
  "step": 11150
1602
  },
1603
  {
1604
  "epoch": 3.9589961117002472,
1605
- "grad_norm": 1.2237067222595215,
1606
- "learning_rate": 6.258748674443266e-05,
1607
- "loss": 0.1593,
1608
  "step": 11200
1609
  },
1610
  {
1611
  "epoch": 3.9766702014846236,
1612
- "grad_norm": 0.5730044841766357,
1613
- "learning_rate": 6.152704135737009e-05,
1614
- "loss": 0.1952,
1615
  "step": 11250
1616
  },
1617
  {
1618
  "epoch": 3.9943442912689995,
1619
- "grad_norm": 1.0718284845352173,
1620
- "learning_rate": 6.046659597030752e-05,
1621
- "loss": 0.2006,
1622
  "step": 11300
1623
  },
1624
  {
1625
  "epoch": 4.0,
1626
- "eval_bertscore_f1": 0.971733581422284,
1627
- "eval_bleu": 0.6349849586545804,
1628
- "eval_loss": 0.14258554577827454,
1629
- "eval_runtime": 1019.3837,
1630
- "eval_samples_per_second": 6.341,
1631
- "eval_steps_per_second": 0.793,
 
 
 
1632
  "step": 11316
1633
  },
1634
  {
1635
  "epoch": 4.012018381053376,
1636
- "grad_norm": 1.2913243770599365,
1637
- "learning_rate": 5.940615058324496e-05,
1638
- "loss": 0.1864,
1639
  "step": 11350
1640
  },
1641
  {
1642
  "epoch": 4.029692470837752,
1643
- "grad_norm": 0.7768388390541077,
1644
- "learning_rate": 5.8345705196182385e-05,
1645
- "loss": 0.1761,
1646
  "step": 11400
1647
  },
1648
  {
1649
  "epoch": 4.047366560622128,
1650
- "grad_norm": 1.3466072082519531,
1651
- "learning_rate": 5.728525980911983e-05,
1652
- "loss": 0.149,
1653
  "step": 11450
1654
  },
1655
  {
1656
  "epoch": 4.065040650406504,
1657
- "grad_norm": 0.5589826703071594,
1658
- "learning_rate": 5.622481442205726e-05,
1659
- "loss": 0.1951,
1660
  "step": 11500
1661
  },
1662
  {
1663
  "epoch": 4.08271474019088,
1664
- "grad_norm": 1.1719962358474731,
1665
- "learning_rate": 5.51643690349947e-05,
1666
- "loss": 0.1893,
1667
  "step": 11550
1668
  },
1669
  {
1670
  "epoch": 4.100388829975256,
1671
- "grad_norm": 1.6090291738510132,
1672
- "learning_rate": 5.4103923647932125e-05,
1673
- "loss": 0.1669,
1674
  "step": 11600
1675
  },
1676
  {
1677
  "epoch": 4.118062919759632,
1678
- "grad_norm": 0.7359048128128052,
1679
- "learning_rate": 5.304347826086956e-05,
1680
- "loss": 0.182,
1681
  "step": 11650
1682
  },
1683
  {
1684
  "epoch": 4.135737009544009,
1685
- "grad_norm": 1.337510347366333,
1686
- "learning_rate": 5.1983032873806996e-05,
1687
- "loss": 0.1894,
1688
  "step": 11700
1689
  },
1690
  {
1691
  "epoch": 4.153411099328385,
1692
- "grad_norm": 1.1865614652633667,
1693
- "learning_rate": 5.092258748674443e-05,
1694
- "loss": 0.1898,
1695
  "step": 11750
1696
  },
1697
  {
1698
  "epoch": 4.171085189112761,
1699
- "grad_norm": 1.8255292177200317,
1700
- "learning_rate": 4.986214209968186e-05,
1701
- "loss": 0.1716,
1702
  "step": 11800
1703
  },
1704
  {
1705
  "epoch": 4.188759278897137,
1706
- "grad_norm": 0.624253511428833,
1707
- "learning_rate": 4.8801696712619294e-05,
1708
- "loss": 0.1766,
1709
  "step": 11850
1710
  },
1711
  {
1712
  "epoch": 4.2064333686815125,
1713
- "grad_norm": 0.9569867253303528,
1714
- "learning_rate": 4.774125132555673e-05,
1715
- "loss": 0.1458,
1716
  "step": 11900
1717
  },
1718
  {
1719
  "epoch": 4.224107458465889,
1720
- "grad_norm": 0.9596979022026062,
1721
- "learning_rate": 4.6680805938494165e-05,
1722
- "loss": 0.1808,
1723
  "step": 11950
1724
  },
1725
  {
1726
  "epoch": 4.241781548250265,
1727
- "grad_norm": 1.014739751815796,
1728
- "learning_rate": 4.562036055143159e-05,
1729
- "loss": 0.1918,
1730
  "step": 12000
1731
  },
1732
  {
1733
  "epoch": 4.259455638034641,
1734
- "grad_norm": 1.2566267251968384,
1735
- "learning_rate": 4.455991516436903e-05,
1736
- "loss": 0.1581,
1737
  "step": 12050
1738
  },
1739
  {
1740
  "epoch": 4.277129727819017,
1741
- "grad_norm": 0.7401285171508789,
1742
- "learning_rate": 4.349946977730647e-05,
1743
- "loss": 0.164,
1744
  "step": 12100
1745
  },
1746
  {
1747
  "epoch": 4.294803817603394,
1748
- "grad_norm": 0.43340951204299927,
1749
- "learning_rate": 4.243902439024389e-05,
1750
- "loss": 0.1722,
1751
  "step": 12150
1752
  },
1753
  {
1754
  "epoch": 4.31247790738777,
1755
- "grad_norm": 4.859116554260254,
1756
- "learning_rate": 4.1378579003181334e-05,
1757
- "loss": 0.1986,
1758
  "step": 12200
1759
  },
1760
  {
1761
  "epoch": 4.330151997172146,
1762
- "grad_norm": 1.3189374208450317,
1763
- "learning_rate": 4.031813361611877e-05,
1764
- "loss": 0.2118,
1765
  "step": 12250
1766
  },
1767
  {
1768
  "epoch": 4.3478260869565215,
1769
- "grad_norm": 0.6523744463920593,
1770
- "learning_rate": 3.9257688229056204e-05,
1771
- "loss": 0.1974,
1772
  "step": 12300
1773
  },
1774
  {
1775
  "epoch": 4.365500176740898,
1776
- "grad_norm": 0.5694284439086914,
1777
- "learning_rate": 3.819724284199363e-05,
1778
- "loss": 0.2032,
1779
  "step": 12350
1780
  },
1781
  {
1782
  "epoch": 4.383174266525274,
1783
- "grad_norm": 0.6631605625152588,
1784
- "learning_rate": 3.713679745493107e-05,
1785
- "loss": 0.1648,
1786
  "step": 12400
1787
  },
1788
  {
1789
  "epoch": 4.40084835630965,
1790
- "grad_norm": 1.4959896802902222,
1791
- "learning_rate": 3.60763520678685e-05,
1792
- "loss": 0.1397,
1793
  "step": 12450
1794
  },
1795
  {
1796
  "epoch": 4.418522446094026,
1797
- "grad_norm": 0.8434396982192993,
1798
- "learning_rate": 3.501590668080594e-05,
1799
- "loss": 0.1834,
1800
  "step": 12500
1801
  },
1802
  {
1803
  "epoch": 4.436196535878402,
1804
- "grad_norm": 0.748446524143219,
1805
- "learning_rate": 3.395546129374337e-05,
1806
- "loss": 0.1916,
1807
  "step": 12550
1808
  },
1809
  {
1810
  "epoch": 4.453870625662779,
1811
- "grad_norm": 1.5315219163894653,
1812
- "learning_rate": 3.28950159066808e-05,
1813
- "loss": 0.1696,
1814
  "step": 12600
1815
  },
1816
  {
1817
  "epoch": 4.471544715447155,
1818
- "grad_norm": 0.81038498878479,
1819
- "learning_rate": 3.1834570519618236e-05,
1820
- "loss": 0.1861,
1821
  "step": 12650
1822
  },
1823
  {
1824
  "epoch": 4.4892188052315305,
1825
- "grad_norm": 0.5976743102073669,
1826
- "learning_rate": 3.077412513255567e-05,
1827
- "loss": 0.2113,
1828
  "step": 12700
1829
  },
1830
  {
1831
  "epoch": 4.506892895015906,
1832
- "grad_norm": 0.806030809879303,
1833
- "learning_rate": 2.9713679745493106e-05,
1834
- "loss": 0.1571,
1835
  "step": 12750
1836
  },
1837
  {
1838
  "epoch": 4.524566984800282,
1839
- "grad_norm": 1.6976985931396484,
1840
- "learning_rate": 2.8653234358430538e-05,
1841
- "loss": 0.167,
1842
  "step": 12800
1843
  },
1844
  {
1845
  "epoch": 4.542241074584659,
1846
- "grad_norm": 0.8575769662857056,
1847
- "learning_rate": 2.7592788971367973e-05,
1848
- "loss": 0.1551,
1849
  "step": 12850
1850
  },
1851
  {
1852
  "epoch": 4.559915164369035,
1853
- "grad_norm": 0.8071011900901794,
1854
- "learning_rate": 2.6532343584305405e-05,
1855
- "loss": 0.197,
1856
  "step": 12900
1857
  },
1858
  {
1859
  "epoch": 4.577589254153411,
1860
- "grad_norm": 0.8892520666122437,
1861
- "learning_rate": 2.547189819724284e-05,
1862
- "loss": 0.1766,
1863
  "step": 12950
1864
  },
1865
  {
1866
  "epoch": 4.595263343937788,
1867
- "grad_norm": 1.0027281045913696,
1868
- "learning_rate": 2.4411452810180272e-05,
1869
- "loss": 0.1826,
1870
  "step": 13000
1871
  },
1872
  {
1873
  "epoch": 4.612937433722164,
1874
- "grad_norm": 1.3052587509155273,
1875
- "learning_rate": 2.3351007423117707e-05,
1876
- "loss": 0.1779,
1877
  "step": 13050
1878
  },
1879
  {
1880
  "epoch": 4.6306115235065395,
1881
- "grad_norm": 1.1934298276901245,
1882
- "learning_rate": 2.2290562036055142e-05,
1883
- "loss": 0.1585,
1884
  "step": 13100
1885
  },
1886
  {
1887
  "epoch": 4.648285613290915,
1888
- "grad_norm": 1.305802583694458,
1889
- "learning_rate": 2.1251325556733824e-05,
1890
- "loss": 0.1689,
1891
  "step": 13150
1892
  },
1893
  {
1894
  "epoch": 4.665959703075291,
1895
- "grad_norm": 1.010707974433899,
1896
- "learning_rate": 2.0190880169671262e-05,
1897
- "loss": 0.1975,
1898
  "step": 13200
1899
  },
1900
  {
1901
  "epoch": 4.683633792859668,
1902
- "grad_norm": 0.8816762566566467,
1903
- "learning_rate": 1.9130434782608694e-05,
1904
- "loss": 0.1395,
1905
  "step": 13250
1906
  },
1907
  {
1908
  "epoch": 4.701307882644044,
1909
- "grad_norm": 0.9856722950935364,
1910
- "learning_rate": 1.8069989395546126e-05,
1911
- "loss": 0.2111,
1912
  "step": 13300
1913
  },
1914
  {
1915
  "epoch": 4.71898197242842,
1916
- "grad_norm": 1.1348321437835693,
1917
- "learning_rate": 1.700954400848356e-05,
1918
- "loss": 0.1715,
1919
  "step": 13350
1920
  },
1921
  {
1922
  "epoch": 4.736656062212796,
1923
- "grad_norm": 1.5361067056655884,
1924
- "learning_rate": 1.5949098621420996e-05,
1925
- "loss": 0.1569,
1926
  "step": 13400
1927
  },
1928
  {
1929
  "epoch": 4.754330151997172,
1930
- "grad_norm": 0.6704153418540955,
1931
- "learning_rate": 1.488865323435843e-05,
1932
- "loss": 0.1518,
1933
  "step": 13450
1934
  },
1935
  {
1936
  "epoch": 4.7720042417815485,
1937
- "grad_norm": 1.731184482574463,
1938
- "learning_rate": 1.3828207847295863e-05,
1939
- "loss": 0.1671,
1940
  "step": 13500
1941
  },
1942
  {
1943
  "epoch": 4.789678331565924,
1944
- "grad_norm": 0.18779240548610687,
1945
- "learning_rate": 1.2767762460233296e-05,
1946
- "loss": 0.163,
1947
  "step": 13550
1948
  },
1949
  {
1950
  "epoch": 4.8073524213503,
1951
- "grad_norm": 0.5464810132980347,
1952
- "learning_rate": 1.1707317073170731e-05,
1953
- "loss": 0.1998,
1954
  "step": 13600
1955
  },
1956
  {
1957
  "epoch": 4.825026511134676,
1958
- "grad_norm": 1.0994397401809692,
1959
- "learning_rate": 1.0646871686108165e-05,
1960
- "loss": 0.1997,
1961
  "step": 13650
1962
  },
1963
  {
1964
  "epoch": 4.842700600919053,
1965
- "grad_norm": 1.5267748832702637,
1966
- "learning_rate": 9.586426299045598e-06,
1967
- "loss": 0.1643,
1968
  "step": 13700
1969
  },
1970
  {
1971
  "epoch": 4.860374690703429,
1972
- "grad_norm": 0.586216151714325,
1973
- "learning_rate": 8.525980911983032e-06,
1974
- "loss": 0.1661,
1975
  "step": 13750
1976
  },
1977
  {
1978
  "epoch": 4.878048780487805,
1979
- "grad_norm": 1.5473626852035522,
1980
- "learning_rate": 7.465535524920466e-06,
1981
- "loss": 0.1584,
1982
  "step": 13800
1983
  },
1984
  {
1985
  "epoch": 4.895722870272181,
1986
- "grad_norm": 0.5935518741607666,
1987
- "learning_rate": 6.4050901378578995e-06,
1988
- "loss": 0.1695,
1989
  "step": 13850
1990
  },
1991
  {
1992
  "epoch": 4.9133969600565575,
1993
- "grad_norm": 1.2073270082473755,
1994
- "learning_rate": 5.344644750795333e-06,
1995
- "loss": 0.168,
1996
  "step": 13900
1997
  },
1998
  {
1999
  "epoch": 4.931071049840933,
2000
- "grad_norm": 0.43232324719429016,
2001
- "learning_rate": 4.284199363732767e-06,
2002
- "loss": 0.1631,
2003
  "step": 13950
2004
  },
2005
  {
2006
  "epoch": 4.948745139625309,
2007
- "grad_norm": 1.4200788736343384,
2008
- "learning_rate": 3.223753976670201e-06,
2009
- "loss": 0.1997,
2010
  "step": 14000
2011
  },
2012
  {
2013
  "epoch": 4.966419229409685,
2014
- "grad_norm": 0.20576679706573486,
2015
- "learning_rate": 2.163308589607635e-06,
2016
- "loss": 0.1642,
2017
  "step": 14050
2018
  },
2019
  {
2020
  "epoch": 4.984093319194061,
2021
- "grad_norm": 0.6973247528076172,
2022
- "learning_rate": 1.1028632025450689e-06,
2023
- "loss": 0.1908,
2024
  "step": 14100
2025
  },
2026
  {
2027
  "epoch": 5.0,
2028
- "eval_bertscore_f1": 0.9718361922106383,
2029
- "eval_bleu": 0.6364935942381945,
2030
- "eval_loss": 0.13659338653087616,
2031
- "eval_runtime": 1013.1078,
2032
- "eval_samples_per_second": 6.38,
2033
- "eval_steps_per_second": 0.798,
 
 
 
2034
  "step": 14145
2035
  }
2036
  ],
 
1
  {
2
  "best_global_step": 14145,
3
+ "best_metric": 0.9658045416033947,
4
+ "best_model_checkpoint": "/kaggle/working/codet5-k8s-lora/checkpoint-14145",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 14145,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.017674089784376106,
14
+ "grad_norm": 2.7520432472229004,
15
+ "learning_rate": 4.983739837398374e-05,
16
+ "loss": 4.115,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.03534817956875221,
21
+ "grad_norm": 17.1498966217041,
22
+ "learning_rate": 4.966065747613998e-05,
23
+ "loss": 3.2283,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.053022269353128315,
28
+ "grad_norm": 2.4044106006622314,
29
+ "learning_rate": 4.948391657829622e-05,
30
+ "loss": 2.2704,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.07069635913750442,
35
+ "grad_norm": 2.3264973163604736,
36
+ "learning_rate": 4.930717568045246e-05,
37
+ "loss": 1.963,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.08837044892188052,
42
+ "grad_norm": 2.146696090698242,
43
+ "learning_rate": 4.91304347826087e-05,
44
+ "loss": 1.8358,
45
  "step": 250
46
  },
47
  {
48
  "epoch": 0.10604453870625663,
49
+ "grad_norm": 2.6933417320251465,
50
+ "learning_rate": 4.895369388476494e-05,
51
+ "loss": 1.7542,
52
  "step": 300
53
  },
54
  {
55
  "epoch": 0.12371862849063273,
56
+ "grad_norm": 2.71453857421875,
57
+ "learning_rate": 4.8776952986921177e-05,
58
+ "loss": 1.6651,
59
  "step": 350
60
  },
61
  {
62
  "epoch": 0.14139271827500885,
63
+ "grad_norm": 2.239396095275879,
64
+ "learning_rate": 4.8600212089077416e-05,
65
+ "loss": 1.5549,
66
  "step": 400
67
  },
68
  {
69
  "epoch": 0.15906680805938495,
70
+ "grad_norm": 3.038501262664795,
71
+ "learning_rate": 4.8423471191233655e-05,
72
+ "loss": 1.4887,
73
  "step": 450
74
  },
75
  {
76
  "epoch": 0.17674089784376104,
77
+ "grad_norm": 2.988568067550659,
78
+ "learning_rate": 4.825026511134676e-05,
79
+ "loss": 1.4896,
80
  "step": 500
81
  },
82
  {
83
  "epoch": 0.19441498762813716,
84
+ "grad_norm": 2.7871344089508057,
85
+ "learning_rate": 4.807352421350301e-05,
86
+ "loss": 1.3147,
87
  "step": 550
88
  },
89
  {
90
  "epoch": 0.21208907741251326,
91
+ "grad_norm": 2.9361987113952637,
92
+ "learning_rate": 4.789678331565924e-05,
93
+ "loss": 1.4059,
94
  "step": 600
95
  },
96
  {
97
  "epoch": 0.22976316719688936,
98
+ "grad_norm": 3.7504327297210693,
99
+ "learning_rate": 4.7720042417815487e-05,
100
+ "loss": 1.3733,
101
  "step": 650
102
  },
103
  {
104
  "epoch": 0.24743725698126545,
105
+ "grad_norm": 2.8540990352630615,
106
+ "learning_rate": 4.7543301519971726e-05,
107
+ "loss": 1.2048,
108
  "step": 700
109
  },
110
  {
111
  "epoch": 0.2651113467656416,
112
+ "grad_norm": 3.3986833095550537,
113
+ "learning_rate": 4.7366560622127965e-05,
114
+ "loss": 1.3204,
115
  "step": 750
116
  },
117
  {
118
  "epoch": 0.2827854365500177,
119
+ "grad_norm": 3.490173816680908,
120
+ "learning_rate": 4.7189819724284204e-05,
121
+ "loss": 1.2636,
122
  "step": 800
123
  },
124
  {
125
  "epoch": 0.30045952633439377,
126
+ "grad_norm": 3.0693671703338623,
127
+ "learning_rate": 4.701307882644044e-05,
128
+ "loss": 1.1539,
129
  "step": 850
130
  },
131
  {
132
  "epoch": 0.3181336161187699,
133
+ "grad_norm": 6.666368007659912,
134
+ "learning_rate": 4.683633792859668e-05,
135
+ "loss": 1.1835,
136
  "step": 900
137
  },
138
  {
139
  "epoch": 0.335807705903146,
140
+ "grad_norm": 3.3494420051574707,
141
+ "learning_rate": 4.6659597030752915e-05,
142
+ "loss": 1.0956,
143
  "step": 950
144
  },
145
  {
146
  "epoch": 0.3534817956875221,
147
+ "grad_norm": 2.6836087703704834,
148
+ "learning_rate": 4.648285613290916e-05,
149
+ "loss": 1.1311,
150
  "step": 1000
151
  },
152
  {
153
  "epoch": 0.3711558854718982,
154
+ "grad_norm": 2.8905951976776123,
155
+ "learning_rate": 4.630611523506539e-05,
156
+ "loss": 1.0765,
157
  "step": 1050
158
  },
159
  {
160
  "epoch": 0.38882997525627433,
161
+ "grad_norm": 3.939500093460083,
162
+ "learning_rate": 4.612937433722164e-05,
163
+ "loss": 1.0456,
164
  "step": 1100
165
  },
166
  {
167
  "epoch": 0.4065040650406504,
168
+ "grad_norm": 3.1198384761810303,
169
+ "learning_rate": 4.595263343937787e-05,
170
+ "loss": 1.0448,
171
  "step": 1150
172
  },
173
  {
174
  "epoch": 0.4241781548250265,
175
+ "grad_norm": 3.0342133045196533,
176
+ "learning_rate": 4.577589254153412e-05,
177
+ "loss": 0.9943,
178
  "step": 1200
179
  },
180
  {
181
  "epoch": 0.4418522446094026,
182
+ "grad_norm": 3.555405616760254,
183
+ "learning_rate": 4.559915164369035e-05,
184
+ "loss": 1.045,
185
  "step": 1250
186
  },
187
  {
188
  "epoch": 0.4595263343937787,
189
+ "grad_norm": 3.0326290130615234,
190
+ "learning_rate": 4.542241074584659e-05,
191
+ "loss": 0.9807,
192
  "step": 1300
193
  },
194
  {
195
  "epoch": 0.47720042417815484,
196
+ "grad_norm": 3.1818552017211914,
197
+ "learning_rate": 4.524566984800283e-05,
198
+ "loss": 0.9889,
199
  "step": 1350
200
  },
201
  {
202
  "epoch": 0.4948745139625309,
203
+ "grad_norm": 2.933518171310425,
204
+ "learning_rate": 4.506892895015907e-05,
205
+ "loss": 0.9373,
206
  "step": 1400
207
  },
208
  {
209
  "epoch": 0.512548603746907,
210
+ "grad_norm": 4.3423566818237305,
211
+ "learning_rate": 4.489218805231531e-05,
212
+ "loss": 1.0253,
213
  "step": 1450
214
  },
215
  {
216
  "epoch": 0.5302226935312832,
217
+ "grad_norm": 3.6613636016845703,
218
+ "learning_rate": 4.4715447154471546e-05,
219
+ "loss": 0.9458,
220
  "step": 1500
221
  },
222
  {
223
  "epoch": 0.5478967833156593,
224
+ "grad_norm": 2.736353635787964,
225
+ "learning_rate": 4.454224107458466e-05,
226
+ "loss": 0.9205,
227
  "step": 1550
228
  },
229
  {
230
  "epoch": 0.5655708731000354,
231
+ "grad_norm": 3.9614179134368896,
232
+ "learning_rate": 4.43655001767409e-05,
233
+ "loss": 0.8896,
234
  "step": 1600
235
  },
236
  {
237
  "epoch": 0.5832449628844114,
238
+ "grad_norm": 3.397909164428711,
239
+ "learning_rate": 4.418875927889714e-05,
240
+ "loss": 0.9247,
241
  "step": 1650
242
  },
243
  {
244
  "epoch": 0.6009190526687875,
245
+ "grad_norm": 3.26153564453125,
246
+ "learning_rate": 4.401201838105338e-05,
247
+ "loss": 0.8023,
248
  "step": 1700
249
  },
250
  {
251
  "epoch": 0.6185931424531637,
252
+ "grad_norm": 4.067619800567627,
253
+ "learning_rate": 4.383527748320962e-05,
254
+ "loss": 0.8967,
255
  "step": 1750
256
  },
257
  {
258
  "epoch": 0.6362672322375398,
259
+ "grad_norm": 3.9123120307922363,
260
+ "learning_rate": 4.3658536585365856e-05,
261
+ "loss": 0.8668,
262
  "step": 1800
263
  },
264
  {
265
  "epoch": 0.6539413220219159,
266
+ "grad_norm": 3.9695286750793457,
267
+ "learning_rate": 4.3481795687522095e-05,
268
+ "loss": 0.8688,
269
  "step": 1850
270
  },
271
  {
272
  "epoch": 0.671615411806292,
273
+ "grad_norm": 3.176691770553589,
274
+ "learning_rate": 4.3305054789678334e-05,
275
+ "loss": 0.8475,
276
  "step": 1900
277
  },
278
  {
279
  "epoch": 0.689289501590668,
280
+ "grad_norm": 4.363101005554199,
281
+ "learning_rate": 4.3128313891834574e-05,
282
+ "loss": 0.8343,
283
  "step": 1950
284
  },
285
  {
286
  "epoch": 0.7069635913750442,
287
+ "grad_norm": 4.429725170135498,
288
+ "learning_rate": 4.295157299399081e-05,
289
+ "loss": 0.8001,
290
  "step": 2000
291
  },
292
  {
293
  "epoch": 0.7246376811594203,
294
+ "grad_norm": 3.035944700241089,
295
+ "learning_rate": 4.277483209614705e-05,
296
+ "loss": 0.7387,
297
  "step": 2050
298
  },
299
  {
300
  "epoch": 0.7423117709437964,
301
+ "grad_norm": 3.1769495010375977,
302
+ "learning_rate": 4.259809119830329e-05,
303
+ "loss": 0.7837,
304
  "step": 2100
305
  },
306
  {
307
  "epoch": 0.7599858607281725,
308
+ "grad_norm": 3.7096972465515137,
309
+ "learning_rate": 4.242135030045953e-05,
310
+ "loss": 0.8015,
311
  "step": 2150
312
  },
313
  {
314
  "epoch": 0.7776599505125487,
315
+ "grad_norm": 4.8830718994140625,
316
+ "learning_rate": 4.224460940261576e-05,
317
+ "loss": 0.8145,
318
  "step": 2200
319
  },
320
  {
321
  "epoch": 0.7953340402969247,
322
+ "grad_norm": 4.858372688293457,
323
+ "learning_rate": 4.206786850477201e-05,
324
+ "loss": 0.7707,
325
  "step": 2250
326
  },
327
  {
328
  "epoch": 0.8130081300813008,
329
+ "grad_norm": 4.0257697105407715,
330
+ "learning_rate": 4.189112760692824e-05,
331
+ "loss": 0.7365,
332
  "step": 2300
333
  },
334
  {
335
  "epoch": 0.8306822198656769,
336
+ "grad_norm": 3.4525439739227295,
337
+ "learning_rate": 4.171438670908449e-05,
338
+ "loss": 0.6824,
339
  "step": 2350
340
  },
341
  {
342
  "epoch": 0.848356309650053,
343
+ "grad_norm": 3.3290514945983887,
344
+ "learning_rate": 4.153764581124072e-05,
345
+ "loss": 0.696,
346
  "step": 2400
347
  },
348
  {
349
  "epoch": 0.8660303994344292,
350
+ "grad_norm": 3.284925937652588,
351
+ "learning_rate": 4.1360904913396966e-05,
352
+ "loss": 0.7506,
353
  "step": 2450
354
  },
355
  {
356
  "epoch": 0.8837044892188052,
357
+ "grad_norm": 7.381216526031494,
358
+ "learning_rate": 4.11841640155532e-05,
359
+ "loss": 0.7282,
360
  "step": 2500
361
  },
362
  {
363
  "epoch": 0.9013785790031813,
364
+ "grad_norm": 3.3384296894073486,
365
+ "learning_rate": 4.1007423117709444e-05,
366
+ "loss": 0.7632,
367
  "step": 2550
368
  },
369
  {
370
  "epoch": 0.9190526687875574,
371
+ "grad_norm": 3.1576461791992188,
372
+ "learning_rate": 4.0830682219865676e-05,
373
+ "loss": 0.739,
374
  "step": 2600
375
  },
376
  {
377
  "epoch": 0.9367267585719335,
378
+ "grad_norm": 3.626567840576172,
379
+ "learning_rate": 4.0653941322021916e-05,
380
+ "loss": 0.6791,
381
  "step": 2650
382
  },
383
  {
384
  "epoch": 0.9544008483563097,
385
+ "grad_norm": 4.196743011474609,
386
+ "learning_rate": 4.0477200424178155e-05,
387
+ "loss": 0.7332,
388
  "step": 2700
389
  },
390
  {
391
  "epoch": 0.9720749381406858,
392
+ "grad_norm": 3.1296284198760986,
393
+ "learning_rate": 4.0300459526334394e-05,
394
+ "loss": 0.6539,
395
  "step": 2750
396
  },
397
  {
398
  "epoch": 0.9897490279250618,
399
+ "grad_norm": 5.278796672821045,
400
+ "learning_rate": 4.012371862849063e-05,
401
+ "loss": 0.737,
402
  "step": 2800
403
  },
404
  {
405
  "epoch": 1.0,
406
+ "eval_bertscore_f1": 0.9560778172188761,
407
+ "eval_bleu": 0.4824577747321971,
408
+ "eval_loss": 0.5060375928878784,
409
+ "eval_meteor": 0.6530790735036477,
410
+ "eval_rouge1": 0.7971610347699927,
411
+ "eval_rouge2": 0.6895819892909911,
412
+ "eval_runtime": 1389.2459,
413
+ "eval_samples_per_second": 4.653,
414
+ "eval_steps_per_second": 0.582,
415
  "step": 2829
416
  },
417
  {
418
  "epoch": 1.007423117709438,
419
+ "grad_norm": 3.9626195430755615,
420
+ "learning_rate": 3.994697773064687e-05,
421
+ "loss": 0.7198,
422
  "step": 2850
423
  },
424
  {
425
  "epoch": 1.025097207493814,
426
+ "grad_norm": 4.509051322937012,
427
+ "learning_rate": 3.977023683280312e-05,
428
+ "loss": 0.6777,
429
  "step": 2900
430
  },
431
  {
432
  "epoch": 1.0427712972781902,
433
+ "grad_norm": 3.044351816177368,
434
+ "learning_rate": 3.959349593495935e-05,
435
+ "loss": 0.6675,
436
  "step": 2950
437
  },
438
  {
439
  "epoch": 1.0604453870625663,
440
+ "grad_norm": 3.598339319229126,
441
+ "learning_rate": 3.941675503711559e-05,
442
+ "loss": 0.7256,
443
  "step": 3000
444
  },
445
  {
446
  "epoch": 1.0781194768469424,
447
+ "grad_norm": 5.248291015625,
448
+ "learning_rate": 3.924001413927183e-05,
449
+ "loss": 0.5862,
450
  "step": 3050
451
  },
452
  {
453
  "epoch": 1.0957935666313185,
454
+ "grad_norm": 3.2210874557495117,
455
+ "learning_rate": 3.906327324142807e-05,
456
+ "loss": 0.7295,
457
  "step": 3100
458
  },
459
  {
460
  "epoch": 1.1134676564156947,
461
+ "grad_norm": 3.6727230548858643,
462
+ "learning_rate": 3.888653234358431e-05,
463
+ "loss": 0.6206,
464
  "step": 3150
465
  },
466
  {
467
  "epoch": 1.1311417462000706,
468
+ "grad_norm": 4.341032981872559,
469
+ "learning_rate": 3.870979144574055e-05,
470
+ "loss": 0.676,
471
  "step": 3200
472
  },
473
  {
474
  "epoch": 1.148815835984447,
475
+ "grad_norm": 3.0379395484924316,
476
+ "learning_rate": 3.8533050547896786e-05,
477
+ "loss": 0.6372,
478
  "step": 3250
479
  },
480
  {
481
  "epoch": 1.1664899257688228,
482
+ "grad_norm": 2.846511125564575,
483
+ "learning_rate": 3.8356309650053025e-05,
484
+ "loss": 0.6758,
485
  "step": 3300
486
  },
487
  {
488
  "epoch": 1.184164015553199,
489
+ "grad_norm": 3.267794132232666,
490
+ "learning_rate": 3.817956875220926e-05,
491
+ "loss": 0.6035,
492
  "step": 3350
493
  },
494
  {
495
  "epoch": 1.201838105337575,
496
+ "grad_norm": 5.214766025543213,
497
+ "learning_rate": 3.8002827854365503e-05,
498
+ "loss": 0.6907,
499
  "step": 3400
500
  },
501
  {
502
  "epoch": 1.2195121951219512,
503
+ "grad_norm": 5.761065483093262,
504
+ "learning_rate": 3.7826086956521736e-05,
505
+ "loss": 0.6092,
506
  "step": 3450
507
  },
508
  {
509
  "epoch": 1.2371862849063273,
510
+ "grad_norm": 4.127236843109131,
511
+ "learning_rate": 3.764934605867798e-05,
512
+ "loss": 0.6127,
513
  "step": 3500
514
  },
515
  {
516
  "epoch": 1.2548603746907034,
517
+ "grad_norm": 2.9047141075134277,
518
+ "learning_rate": 3.747260516083422e-05,
519
+ "loss": 0.6486,
520
  "step": 3550
521
  },
522
  {
523
  "epoch": 1.2725344644750796,
524
+ "grad_norm": 3.972148895263672,
525
+ "learning_rate": 3.729586426299046e-05,
526
+ "loss": 0.6933,
527
  "step": 3600
528
  },
529
  {
530
  "epoch": 1.2902085542594557,
531
+ "grad_norm": 3.2735204696655273,
532
+ "learning_rate": 3.71191233651467e-05,
533
+ "loss": 0.6808,
534
  "step": 3650
535
  },
536
  {
537
  "epoch": 1.3078826440438318,
538
+ "grad_norm": 5.26752233505249,
539
+ "learning_rate": 3.694238246730294e-05,
540
+ "loss": 0.6672,
541
  "step": 3700
542
  },
543
  {
544
  "epoch": 1.3255567338282077,
545
+ "grad_norm": 3.852576971054077,
546
+ "learning_rate": 3.676564156945918e-05,
547
+ "loss": 0.6458,
548
  "step": 3750
549
  },
550
  {
551
  "epoch": 1.343230823612584,
552
+ "grad_norm": 5.333845138549805,
553
+ "learning_rate": 3.658890067161541e-05,
554
+ "loss": 0.5771,
555
  "step": 3800
556
  },
557
  {
558
  "epoch": 1.36090491339696,
559
+ "grad_norm": 3.3407108783721924,
560
+ "learning_rate": 3.6412159773771656e-05,
561
+ "loss": 0.6281,
562
  "step": 3850
563
  },
564
  {
565
  "epoch": 1.378579003181336,
566
+ "grad_norm": 3.874502658843994,
567
+ "learning_rate": 3.623541887592789e-05,
568
+ "loss": 0.6716,
569
  "step": 3900
570
  },
571
  {
572
  "epoch": 1.3962530929657122,
573
+ "grad_norm": 23.51529884338379,
574
+ "learning_rate": 3.6058677978084134e-05,
575
+ "loss": 0.5915,
576
  "step": 3950
577
  },
578
  {
579
  "epoch": 1.4139271827500883,
580
+ "grad_norm": 4.40012788772583,
581
+ "learning_rate": 3.588193708024037e-05,
582
+ "loss": 0.6098,
583
  "step": 4000
584
  },
585
  {
586
  "epoch": 1.4316012725344645,
587
+ "grad_norm": 2.98525071144104,
588
+ "learning_rate": 3.570519618239661e-05,
589
+ "loss": 0.5767,
590
  "step": 4050
591
  },
592
  {
593
  "epoch": 1.4492753623188406,
594
+ "grad_norm": 3.8279759883880615,
595
+ "learning_rate": 3.5528455284552845e-05,
596
+ "loss": 0.5843,
597
  "step": 4100
598
  },
599
  {
600
  "epoch": 1.4669494521032167,
601
+ "grad_norm": 2.8104283809661865,
602
+ "learning_rate": 3.5351714386709084e-05,
603
+ "loss": 0.5314,
604
  "step": 4150
605
  },
606
  {
607
  "epoch": 1.4846235418875928,
608
+ "grad_norm": 3.081321954727173,
609
+ "learning_rate": 3.5174973488865324e-05,
610
+ "loss": 0.5187,
611
  "step": 4200
612
  },
613
  {
614
  "epoch": 1.502297631671969,
615
+ "grad_norm": 2.8093416690826416,
616
+ "learning_rate": 3.499823259102156e-05,
617
+ "loss": 0.5663,
618
  "step": 4250
619
  },
620
  {
621
  "epoch": 1.5199717214563448,
622
+ "grad_norm": 3.7971787452697754,
623
+ "learning_rate": 3.48214916931778e-05,
624
+ "loss": 0.5069,
625
  "step": 4300
626
  },
627
  {
628
  "epoch": 1.5376458112407212,
629
+ "grad_norm": 3.116645336151123,
630
+ "learning_rate": 3.464475079533404e-05,
631
+ "loss": 0.4945,
632
  "step": 4350
633
  },
634
  {
635
  "epoch": 1.555319901025097,
636
+ "grad_norm": 2.9984517097473145,
637
+ "learning_rate": 3.446800989749028e-05,
638
+ "loss": 0.5399,
639
  "step": 4400
640
  },
641
  {
642
  "epoch": 1.5729939908094734,
643
+ "grad_norm": 3.3107683658599854,
644
+ "learning_rate": 3.429126899964652e-05,
645
+ "loss": 0.5507,
646
  "step": 4450
647
  },
648
  {
649
  "epoch": 1.5906680805938493,
650
+ "grad_norm": 4.328573226928711,
651
+ "learning_rate": 3.411452810180276e-05,
652
+ "loss": 0.5695,
653
  "step": 4500
654
  },
655
  {
656
  "epoch": 1.6083421703782255,
657
+ "grad_norm": 4.086219787597656,
658
+ "learning_rate": 3.3937787203959e-05,
659
+ "loss": 0.5234,
660
  "step": 4550
661
  },
662
  {
663
  "epoch": 1.6260162601626016,
664
+ "grad_norm": 3.971590280532837,
665
+ "learning_rate": 3.376104630611524e-05,
666
+ "loss": 0.6107,
667
  "step": 4600
668
  },
669
  {
670
  "epoch": 1.6436903499469777,
671
+ "grad_norm": 3.0131218433380127,
672
+ "learning_rate": 3.3584305408271476e-05,
673
+ "loss": 0.5763,
674
  "step": 4650
675
  },
676
  {
677
  "epoch": 1.6613644397313538,
678
+ "grad_norm": 3.862619161605835,
679
+ "learning_rate": 3.3407564510427716e-05,
680
+ "loss": 0.5422,
681
  "step": 4700
682
  },
683
  {
684
  "epoch": 1.67903852951573,
685
+ "grad_norm": 4.276158332824707,
686
+ "learning_rate": 3.3230823612583955e-05,
687
+ "loss": 0.6326,
688
  "step": 4750
689
  },
690
  {
691
  "epoch": 1.696712619300106,
692
+ "grad_norm": 4.451511383056641,
693
+ "learning_rate": 3.3054082714740194e-05,
694
+ "loss": 0.5883,
695
  "step": 4800
696
  },
697
  {
698
  "epoch": 1.714386709084482,
699
+ "grad_norm": 3.8793303966522217,
700
+ "learning_rate": 3.2877341816896426e-05,
701
+ "loss": 0.4958,
702
  "step": 4850
703
  },
704
  {
705
  "epoch": 1.7320607988688583,
706
+ "grad_norm": 3.0720949172973633,
707
+ "learning_rate": 3.270060091905267e-05,
708
+ "loss": 0.5962,
709
  "step": 4900
710
  },
711
  {
712
  "epoch": 1.7497348886532342,
713
+ "grad_norm": 3.674368143081665,
714
+ "learning_rate": 3.2523860021208905e-05,
715
+ "loss": 0.5179,
716
  "step": 4950
717
  },
718
  {
719
  "epoch": 1.7674089784376106,
720
+ "grad_norm": 4.063836574554443,
721
+ "learning_rate": 3.234711912336515e-05,
722
+ "loss": 0.5588,
723
  "step": 5000
724
  },
725
  {
726
  "epoch": 1.7850830682219865,
727
+ "grad_norm": 5.352965354919434,
728
+ "learning_rate": 3.217037822552138e-05,
729
+ "loss": 0.5762,
730
  "step": 5050
731
  },
732
  {
733
  "epoch": 1.8027571580063628,
734
+ "grad_norm": 4.00150203704834,
735
+ "learning_rate": 3.199363732767763e-05,
736
+ "loss": 0.5546,
737
  "step": 5100
738
  },
739
  {
740
  "epoch": 1.8204312477907387,
741
+ "grad_norm": 4.444275856018066,
742
+ "learning_rate": 3.181689642983386e-05,
743
+ "loss": 0.5321,
744
  "step": 5150
745
  },
746
  {
747
  "epoch": 1.8381053375751149,
748
+ "grad_norm": 2.783729076385498,
749
+ "learning_rate": 3.164015553199011e-05,
750
+ "loss": 0.5859,
751
  "step": 5200
752
  },
753
  {
754
  "epoch": 1.855779427359491,
755
+ "grad_norm": 4.168649673461914,
756
+ "learning_rate": 3.146341463414634e-05,
757
+ "loss": 0.5353,
758
  "step": 5250
759
  },
760
  {
761
  "epoch": 1.873453517143867,
762
+ "grad_norm": 4.757116794586182,
763
+ "learning_rate": 3.128667373630258e-05,
764
+ "loss": 0.4766,
765
  "step": 5300
766
  },
767
  {
768
  "epoch": 1.8911276069282432,
769
+ "grad_norm": 2.5472869873046875,
770
+ "learning_rate": 3.1109932838458825e-05,
771
+ "loss": 0.4687,
772
  "step": 5350
773
  },
774
  {
775
  "epoch": 1.9088016967126193,
776
+ "grad_norm": 2.9616148471832275,
777
+ "learning_rate": 3.093319194061506e-05,
778
+ "loss": 0.5018,
779
  "step": 5400
780
  },
781
  {
782
  "epoch": 1.9264757864969955,
783
+ "grad_norm": 3.773808240890503,
784
+ "learning_rate": 3.0756451042771303e-05,
785
+ "loss": 0.5798,
786
  "step": 5450
787
  },
788
  {
789
  "epoch": 1.9441498762813714,
790
+ "grad_norm": 3.725301742553711,
791
+ "learning_rate": 3.0579710144927536e-05,
792
+ "loss": 0.5041,
793
  "step": 5500
794
  },
795
  {
796
  "epoch": 1.9618239660657477,
797
+ "grad_norm": 3.422393560409546,
798
+ "learning_rate": 3.040296924708378e-05,
799
+ "loss": 0.5053,
800
  "step": 5550
801
  },
802
  {
803
  "epoch": 1.9794980558501236,
804
+ "grad_norm": 3.7830734252929688,
805
+ "learning_rate": 3.0226228349240014e-05,
806
+ "loss": 0.5573,
807
  "step": 5600
808
  },
809
  {
810
  "epoch": 1.9971721456345,
811
+ "grad_norm": 2.845203161239624,
812
+ "learning_rate": 3.0049487451396253e-05,
813
+ "loss": 0.4995,
814
  "step": 5650
815
  },
816
  {
817
  "epoch": 2.0,
818
+ "eval_bertscore_f1": 0.9619662560301252,
819
+ "eval_bleu": 0.5490499087973257,
820
+ "eval_loss": 0.3714849352836609,
821
+ "eval_meteor": 0.7078770382751671,
822
+ "eval_rouge1": 0.8227979006513153,
823
+ "eval_rouge2": 0.7467554150541201,
824
+ "eval_runtime": 1347.4627,
825
+ "eval_samples_per_second": 4.797,
826
+ "eval_steps_per_second": 0.6,
827
  "step": 5658
828
  },
829
  {
830
  "epoch": 2.014846235418876,
831
+ "grad_norm": 3.069786787033081,
832
+ "learning_rate": 2.9872746553552493e-05,
833
+ "loss": 0.4827,
834
  "step": 5700
835
  },
836
  {
837
  "epoch": 2.032520325203252,
838
+ "grad_norm": 3.7215096950531006,
839
+ "learning_rate": 2.9696005655708732e-05,
840
+ "loss": 0.5742,
841
  "step": 5750
842
  },
843
  {
844
  "epoch": 2.050194414987628,
845
+ "grad_norm": 4.543232440948486,
846
+ "learning_rate": 2.9519264757864974e-05,
847
+ "loss": 0.478,
848
  "step": 5800
849
  },
850
  {
851
  "epoch": 2.0678685047720045,
852
+ "grad_norm": 3.076716423034668,
853
+ "learning_rate": 2.934252386002121e-05,
854
+ "loss": 0.4619,
855
  "step": 5850
856
  },
857
  {
858
  "epoch": 2.0855425945563804,
859
+ "grad_norm": 3.951244592666626,
860
+ "learning_rate": 2.9165782962177453e-05,
861
+ "loss": 0.513,
862
  "step": 5900
863
  },
864
  {
865
  "epoch": 2.1032166843407563,
866
+ "grad_norm": 2.552813768386841,
867
+ "learning_rate": 2.898904206433369e-05,
868
+ "loss": 0.491,
869
  "step": 5950
870
  },
871
  {
872
  "epoch": 2.1208907741251326,
873
+ "grad_norm": 2.2147703170776367,
874
+ "learning_rate": 2.8812301166489924e-05,
875
+ "loss": 0.4333,
876
  "step": 6000
877
  },
878
  {
879
  "epoch": 2.1385648639095085,
880
+ "grad_norm": 2.9662623405456543,
881
+ "learning_rate": 2.8635560268646167e-05,
882
+ "loss": 0.4287,
883
  "step": 6050
884
  },
885
  {
886
  "epoch": 2.156238953693885,
887
+ "grad_norm": 1.9441404342651367,
888
+ "learning_rate": 2.8458819370802403e-05,
889
+ "loss": 0.5373,
890
  "step": 6100
891
  },
892
  {
893
  "epoch": 2.1739130434782608,
894
+ "grad_norm": 4.967250823974609,
895
+ "learning_rate": 2.8282078472958645e-05,
896
+ "loss": 0.4971,
897
  "step": 6150
898
  },
899
  {
900
  "epoch": 2.191587133262637,
901
+ "grad_norm": 3.1946051120758057,
902
+ "learning_rate": 2.810533757511488e-05,
903
+ "loss": 0.5078,
904
  "step": 6200
905
  },
906
  {
907
  "epoch": 2.209261223047013,
908
+ "grad_norm": 2.941650152206421,
909
+ "learning_rate": 2.7928596677271124e-05,
910
+ "loss": 0.5376,
911
  "step": 6250
912
  },
913
  {
914
  "epoch": 2.2269353128313893,
915
+ "grad_norm": 4.430084705352783,
916
+ "learning_rate": 2.775185577942736e-05,
917
+ "loss": 0.5443,
918
  "step": 6300
919
  },
920
  {
921
  "epoch": 2.2446094026157652,
922
+ "grad_norm": 4.810621738433838,
923
+ "learning_rate": 2.7575114881583602e-05,
924
+ "loss": 0.5101,
925
  "step": 6350
926
  },
927
  {
928
  "epoch": 2.262283492400141,
929
+ "grad_norm": 4.557118892669678,
930
+ "learning_rate": 2.7401908801696713e-05,
931
+ "loss": 0.5837,
932
  "step": 6400
933
  },
934
  {
935
  "epoch": 2.2799575821845175,
936
+ "grad_norm": 3.7677035331726074,
937
+ "learning_rate": 2.7225167903852955e-05,
938
+ "loss": 0.4693,
939
  "step": 6450
940
  },
941
  {
942
  "epoch": 2.297631671968894,
943
+ "grad_norm": 2.5755605697631836,
944
+ "learning_rate": 2.704842700600919e-05,
945
+ "loss": 0.4601,
946
  "step": 6500
947
  },
948
  {
949
  "epoch": 2.3153057617532697,
950
+ "grad_norm": 4.114721775054932,
951
+ "learning_rate": 2.6871686108165427e-05,
952
+ "loss": 0.5209,
953
  "step": 6550
954
  },
955
  {
956
  "epoch": 2.3329798515376456,
957
+ "grad_norm": 4.422333717346191,
958
+ "learning_rate": 2.669494521032167e-05,
959
+ "loss": 0.5097,
960
  "step": 6600
961
  },
962
  {
963
  "epoch": 2.350653941322022,
964
+ "grad_norm": 2.2840325832366943,
965
+ "learning_rate": 2.6518204312477905e-05,
966
+ "loss": 0.4896,
967
  "step": 6650
968
  },
969
  {
970
  "epoch": 2.368328031106398,
971
+ "grad_norm": 4.739809036254883,
972
+ "learning_rate": 2.6341463414634148e-05,
973
+ "loss": 0.5377,
974
  "step": 6700
975
  },
976
  {
977
  "epoch": 2.3860021208907742,
978
+ "grad_norm": 4.174150466918945,
979
+ "learning_rate": 2.6164722516790384e-05,
980
+ "loss": 0.4754,
981
  "step": 6750
982
  },
983
  {
984
  "epoch": 2.40367621067515,
985
+ "grad_norm": 3.5064797401428223,
986
+ "learning_rate": 2.5987981618946626e-05,
987
+ "loss": 0.4375,
988
  "step": 6800
989
  },
990
  {
991
  "epoch": 2.4213503004595265,
992
+ "grad_norm": 5.094990253448486,
993
+ "learning_rate": 2.5811240721102865e-05,
994
+ "loss": 0.5027,
995
  "step": 6850
996
  },
997
  {
998
  "epoch": 2.4390243902439024,
999
+ "grad_norm": 6.338164329528809,
1000
+ "learning_rate": 2.5634499823259105e-05,
1001
+ "loss": 0.5016,
1002
  "step": 6900
1003
  },
1004
  {
1005
  "epoch": 2.4566984800282787,
1006
+ "grad_norm": 3.988973379135132,
1007
+ "learning_rate": 2.5457758925415344e-05,
1008
+ "loss": 0.4173,
1009
  "step": 6950
1010
  },
1011
  {
1012
  "epoch": 2.4743725698126546,
1013
+ "grad_norm": 2.3877015113830566,
1014
+ "learning_rate": 2.528101802757158e-05,
1015
+ "loss": 0.4857,
1016
  "step": 7000
1017
  },
1018
  {
1019
  "epoch": 2.4920466595970305,
1020
+ "grad_norm": 4.041755199432373,
1021
+ "learning_rate": 2.5104277129727822e-05,
1022
+ "loss": 0.4463,
1023
  "step": 7050
1024
  },
1025
  {
1026
  "epoch": 2.509720749381407,
1027
+ "grad_norm": 3.5311896800994873,
1028
+ "learning_rate": 2.492753623188406e-05,
1029
+ "loss": 0.4899,
1030
  "step": 7100
1031
  },
1032
  {
1033
  "epoch": 2.5273948391657832,
1034
+ "grad_norm": 2.9291367530822754,
1035
+ "learning_rate": 2.4750795334040297e-05,
1036
+ "loss": 0.5041,
1037
  "step": 7150
1038
  },
1039
  {
1040
  "epoch": 2.545068928950159,
1041
+ "grad_norm": 3.6040406227111816,
1042
+ "learning_rate": 2.4574054436196536e-05,
1043
+ "loss": 0.4374,
1044
  "step": 7200
1045
  },
1046
  {
1047
  "epoch": 2.562743018734535,
1048
+ "grad_norm": 4.614346981048584,
1049
+ "learning_rate": 2.4397313538352776e-05,
1050
+ "loss": 0.4659,
1051
  "step": 7250
1052
  },
1053
  {
1054
  "epoch": 2.5804171085189114,
1055
+ "grad_norm": 5.035871505737305,
1056
+ "learning_rate": 2.4220572640509015e-05,
1057
+ "loss": 0.4419,
1058
  "step": 7300
1059
  },
1060
  {
1061
  "epoch": 2.5980911983032873,
1062
+ "grad_norm": 3.1209981441497803,
1063
+ "learning_rate": 2.4043831742665254e-05,
1064
+ "loss": 0.4863,
1065
  "step": 7350
1066
  },
1067
  {
1068
  "epoch": 2.6157652880876636,
1069
+ "grad_norm": 2.7074010372161865,
1070
+ "learning_rate": 2.3867090844821493e-05,
1071
+ "loss": 0.4899,
1072
  "step": 7400
1073
  },
1074
  {
1075
  "epoch": 2.6334393778720395,
1076
+ "grad_norm": 4.56402587890625,
1077
+ "learning_rate": 2.3690349946977732e-05,
1078
+ "loss": 0.5104,
1079
  "step": 7450
1080
  },
1081
  {
1082
  "epoch": 2.6511134676564154,
1083
+ "grad_norm": 3.637251377105713,
1084
+ "learning_rate": 2.3513609049133968e-05,
1085
+ "loss": 0.4638,
1086
  "step": 7500
1087
  },
1088
  {
1089
  "epoch": 2.6687875574407918,
1090
+ "grad_norm": 2.912982702255249,
1091
+ "learning_rate": 2.3336868151290207e-05,
1092
+ "loss": 0.4572,
1093
  "step": 7550
1094
  },
1095
  {
1096
  "epoch": 2.686461647225168,
1097
+ "grad_norm": 2.9806952476501465,
1098
+ "learning_rate": 2.3160127253446447e-05,
1099
+ "loss": 0.4544,
1100
  "step": 7600
1101
  },
1102
  {
1103
  "epoch": 2.704135737009544,
1104
+ "grad_norm": 3.6673879623413086,
1105
+ "learning_rate": 2.2983386355602686e-05,
1106
+ "loss": 0.4428,
1107
  "step": 7650
1108
  },
1109
  {
1110
  "epoch": 2.72180982679392,
1111
+ "grad_norm": 7.318435192108154,
1112
+ "learning_rate": 2.280664545775893e-05,
1113
+ "loss": 0.4137,
1114
  "step": 7700
1115
  },
1116
  {
1117
  "epoch": 2.7394839165782963,
1118
+ "grad_norm": 7.58805513381958,
1119
+ "learning_rate": 2.2629904559915168e-05,
1120
+ "loss": 0.4501,
1121
  "step": 7750
1122
  },
1123
  {
1124
  "epoch": 2.757158006362672,
1125
+ "grad_norm": 3.000596046447754,
1126
+ "learning_rate": 2.2453163662071407e-05,
1127
+ "loss": 0.4104,
1128
  "step": 7800
1129
  },
1130
  {
1131
  "epoch": 2.7748320961470485,
1132
+ "grad_norm": 6.188124656677246,
1133
+ "learning_rate": 2.2276422764227646e-05,
1134
+ "loss": 0.4833,
1135
  "step": 7850
1136
  },
1137
  {
1138
  "epoch": 2.7925061859314244,
1139
+ "grad_norm": 2.526710033416748,
1140
+ "learning_rate": 2.209968186638388e-05,
1141
+ "loss": 0.4377,
1142
  "step": 7900
1143
  },
1144
  {
1145
  "epoch": 2.8101802757158008,
1146
+ "grad_norm": 3.080709934234619,
1147
+ "learning_rate": 2.192294096854012e-05,
1148
+ "loss": 0.4289,
1149
  "step": 7950
1150
  },
1151
  {
1152
  "epoch": 2.8278543655001767,
1153
+ "grad_norm": 2.859811544418335,
1154
+ "learning_rate": 2.174620007069636e-05,
1155
+ "loss": 0.4856,
1156
  "step": 8000
1157
  },
1158
  {
1159
  "epoch": 2.845528455284553,
1160
+ "grad_norm": 4.68251371383667,
1161
+ "learning_rate": 2.15694591728526e-05,
1162
+ "loss": 0.4901,
1163
  "step": 8050
1164
  },
1165
  {
1166
  "epoch": 2.863202545068929,
1167
+ "grad_norm": 4.8537211418151855,
1168
+ "learning_rate": 2.139271827500884e-05,
1169
+ "loss": 0.3815,
1170
  "step": 8100
1171
  },
1172
  {
1173
  "epoch": 2.880876634853305,
1174
+ "grad_norm": 3.1829328536987305,
1175
+ "learning_rate": 2.1215977377165078e-05,
1176
+ "loss": 0.4713,
1177
  "step": 8150
1178
  },
1179
  {
1180
  "epoch": 2.898550724637681,
1181
+ "grad_norm": 4.081786155700684,
1182
+ "learning_rate": 2.1039236479321317e-05,
1183
+ "loss": 0.4494,
1184
  "step": 8200
1185
  },
1186
  {
1187
  "epoch": 2.9162248144220575,
1188
+ "grad_norm": 4.547771453857422,
1189
+ "learning_rate": 2.0862495581477553e-05,
1190
+ "loss": 0.4306,
1191
  "step": 8250
1192
  },
1193
  {
1194
  "epoch": 2.9338989042064334,
1195
+ "grad_norm": 2.5716054439544678,
1196
+ "learning_rate": 2.0685754683633792e-05,
1197
+ "loss": 0.3865,
1198
  "step": 8300
1199
  },
1200
  {
1201
  "epoch": 2.9515729939908093,
1202
+ "grad_norm": 3.1603822708129883,
1203
+ "learning_rate": 2.050901378579003e-05,
1204
+ "loss": 0.4292,
1205
  "step": 8350
1206
  },
1207
  {
1208
  "epoch": 2.9692470837751856,
1209
+ "grad_norm": 3.2110049724578857,
1210
+ "learning_rate": 2.033227288794627e-05,
1211
+ "loss": 0.4612,
1212
  "step": 8400
1213
  },
1214
  {
1215
  "epoch": 2.9869211735595615,
1216
+ "grad_norm": 3.1889193058013916,
1217
+ "learning_rate": 2.015553199010251e-05,
1218
+ "loss": 0.4432,
1219
  "step": 8450
1220
  },
1221
  {
1222
  "epoch": 3.0,
1223
+ "eval_bertscore_f1": 0.9641102957124315,
1224
+ "eval_bleu": 0.5720155666086876,
1225
+ "eval_loss": 0.32330864667892456,
1226
+ "eval_meteor": 0.7255114484352385,
1227
+ "eval_rouge1": 0.8322775652472418,
1228
+ "eval_rouge2": 0.7664804459108749,
1229
+ "eval_runtime": 1341.3557,
1230
+ "eval_samples_per_second": 4.819,
1231
+ "eval_steps_per_second": 0.602,
1232
  "step": 8487
1233
  },
1234
  {
1235
  "epoch": 3.004595263343938,
1236
+ "grad_norm": 3.6943013668060303,
1237
+ "learning_rate": 1.997879109225875e-05,
1238
+ "loss": 0.4008,
1239
  "step": 8500
1240
  },
1241
  {
1242
  "epoch": 3.022269353128314,
1243
+ "grad_norm": 3.6444859504699707,
1244
+ "learning_rate": 1.9802050194414988e-05,
1245
+ "loss": 0.4435,
1246
  "step": 8550
1247
  },
1248
  {
1249
  "epoch": 3.03994344291269,
1250
+ "grad_norm": 2.0229413509368896,
1251
+ "learning_rate": 1.962530929657123e-05,
1252
+ "loss": 0.4479,
1253
  "step": 8600
1254
  },
1255
  {
1256
  "epoch": 3.057617532697066,
1257
+ "grad_norm": 2.651965618133545,
1258
+ "learning_rate": 1.9448568398727466e-05,
1259
+ "loss": 0.4386,
1260
  "step": 8650
1261
  },
1262
  {
1263
  "epoch": 3.0752916224814424,
1264
+ "grad_norm": 3.405768394470215,
1265
+ "learning_rate": 1.9271827500883705e-05,
1266
+ "loss": 0.3994,
1267
  "step": 8700
1268
  },
1269
  {
1270
  "epoch": 3.0929657122658183,
1271
+ "grad_norm": 4.544278144836426,
1272
+ "learning_rate": 1.9095086603039945e-05,
1273
+ "loss": 0.4002,
1274
  "step": 8750
1275
  },
1276
  {
1277
  "epoch": 3.110639802050194,
1278
+ "grad_norm": 4.592613220214844,
1279
+ "learning_rate": 1.8918345705196184e-05,
1280
+ "loss": 0.4925,
1281
  "step": 8800
1282
  },
1283
  {
1284
  "epoch": 3.1283138918345705,
1285
+ "grad_norm": 4.989655017852783,
1286
+ "learning_rate": 1.8741604807352423e-05,
1287
+ "loss": 0.411,
1288
  "step": 8850
1289
  },
1290
  {
1291
  "epoch": 3.1459879816189464,
1292
+ "grad_norm": 5.274810791015625,
1293
+ "learning_rate": 1.8568398727465537e-05,
1294
+ "loss": 0.4002,
1295
  "step": 8900
1296
  },
1297
  {
1298
  "epoch": 3.163662071403323,
1299
+ "grad_norm": 3.1220662593841553,
1300
+ "learning_rate": 1.8391657829621776e-05,
1301
+ "loss": 0.4138,
1302
  "step": 8950
1303
  },
1304
  {
1305
  "epoch": 3.1813361611876987,
1306
+ "grad_norm": 3.1085612773895264,
1307
+ "learning_rate": 1.8214916931778015e-05,
1308
+ "loss": 0.4498,
1309
  "step": 9000
1310
  },
1311
  {
1312
  "epoch": 3.199010250972075,
1313
+ "grad_norm": 2.929586410522461,
1314
+ "learning_rate": 1.8038176033934255e-05,
1315
+ "loss": 0.4271,
1316
  "step": 9050
1317
  },
1318
  {
1319
  "epoch": 3.216684340756451,
1320
+ "grad_norm": 3.6650078296661377,
1321
+ "learning_rate": 1.7861435136090494e-05,
1322
+ "loss": 0.4642,
1323
  "step": 9100
1324
  },
1325
  {
1326
  "epoch": 3.2343584305408273,
1327
+ "grad_norm": 4.111539363861084,
1328
+ "learning_rate": 1.7684694238246733e-05,
1329
+ "loss": 0.3974,
1330
  "step": 9150
1331
  },
1332
  {
1333
  "epoch": 3.252032520325203,
1334
+ "grad_norm": 3.7882914543151855,
1335
+ "learning_rate": 1.750795334040297e-05,
1336
+ "loss": 0.4961,
1337
  "step": 9200
1338
  },
1339
  {
1340
  "epoch": 3.2697066101095795,
1341
+ "grad_norm": 3.846184253692627,
1342
+ "learning_rate": 1.7331212442559208e-05,
1343
+ "loss": 0.4681,
1344
  "step": 9250
1345
  },
1346
  {
1347
  "epoch": 3.2873806998939554,
1348
+ "grad_norm": 2.3030924797058105,
1349
+ "learning_rate": 1.7154471544715447e-05,
1350
+ "loss": 0.3746,
1351
  "step": 9300
1352
  },
1353
  {
1354
  "epoch": 3.3050547896783318,
1355
+ "grad_norm": 5.14872407913208,
1356
+ "learning_rate": 1.6977730646871686e-05,
1357
+ "loss": 0.3876,
1358
  "step": 9350
1359
  },
1360
  {
1361
  "epoch": 3.3227288794627077,
1362
+ "grad_norm": 3.8709867000579834,
1363
+ "learning_rate": 1.6800989749027926e-05,
1364
+ "loss": 0.4326,
1365
  "step": 9400
1366
  },
1367
  {
1368
  "epoch": 3.3404029692470836,
1369
+ "grad_norm": 2.4771230220794678,
1370
+ "learning_rate": 1.6624248851184165e-05,
1371
+ "loss": 0.4111,
1372
  "step": 9450
1373
  },
1374
  {
1375
  "epoch": 3.35807705903146,
1376
+ "grad_norm": 4.154597282409668,
1377
+ "learning_rate": 1.6447507953340404e-05,
1378
+ "loss": 0.514,
1379
  "step": 9500
1380
  },
1381
  {
1382
  "epoch": 3.375751148815836,
1383
+ "grad_norm": 3.861116409301758,
1384
+ "learning_rate": 1.6270767055496643e-05,
1385
+ "loss": 0.4171,
1386
  "step": 9550
1387
  },
1388
  {
1389
  "epoch": 3.393425238600212,
1390
+ "grad_norm": 3.292591094970703,
1391
+ "learning_rate": 1.609402615765288e-05,
1392
+ "loss": 0.4192,
1393
  "step": 9600
1394
  },
1395
  {
1396
  "epoch": 3.411099328384588,
1397
+ "grad_norm": 3.9203121662139893,
1398
+ "learning_rate": 1.591728525980912e-05,
1399
+ "loss": 0.4036,
1400
  "step": 9650
1401
  },
1402
  {
1403
  "epoch": 3.4287734181689644,
1404
+ "grad_norm": 3.337324857711792,
1405
+ "learning_rate": 1.574054436196536e-05,
1406
+ "loss": 0.3511,
1407
  "step": 9700
1408
  },
1409
  {
1410
  "epoch": 3.4464475079533403,
1411
+ "grad_norm": 2.305972099304199,
1412
+ "learning_rate": 1.55638034641216e-05,
1413
+ "loss": 0.4891,
1414
  "step": 9750
1415
  },
1416
  {
1417
  "epoch": 3.4641215977377167,
1418
+ "grad_norm": 4.972841739654541,
1419
+ "learning_rate": 1.538706256627784e-05,
1420
+ "loss": 0.4172,
1421
  "step": 9800
1422
  },
1423
  {
1424
  "epoch": 3.4817956875220926,
1425
+ "grad_norm": 3.111032009124756,
1426
+ "learning_rate": 1.5210321668434077e-05,
1427
+ "loss": 0.4401,
1428
  "step": 9850
1429
  },
1430
  {
1431
  "epoch": 3.499469777306469,
1432
+ "grad_norm": 1.9676620960235596,
1433
+ "learning_rate": 1.5033580770590316e-05,
1434
+ "loss": 0.4398,
1435
  "step": 9900
1436
  },
1437
  {
1438
  "epoch": 3.517143867090845,
1439
+ "grad_norm": 3.4216668605804443,
1440
+ "learning_rate": 1.4856839872746553e-05,
1441
+ "loss": 0.4015,
1442
  "step": 9950
1443
  },
1444
  {
1445
  "epoch": 3.534817956875221,
1446
+ "grad_norm": 3.161693811416626,
1447
+ "learning_rate": 1.4680098974902792e-05,
1448
+ "loss": 0.4216,
1449
  "step": 10000
1450
  },
1451
  {
1452
  "epoch": 3.552492046659597,
1453
+ "grad_norm": 5.103592395782471,
1454
+ "learning_rate": 1.4503358077059032e-05,
1455
+ "loss": 0.4489,
1456
  "step": 10050
1457
  },
1458
  {
1459
  "epoch": 3.570166136443973,
1460
+ "grad_norm": 3.90478777885437,
1461
+ "learning_rate": 1.432661717921527e-05,
1462
+ "loss": 0.4565,
1463
  "step": 10100
1464
  },
1465
  {
1466
  "epoch": 3.5878402262283493,
1467
+ "grad_norm": 2.7845191955566406,
1468
+ "learning_rate": 1.414987628137151e-05,
1469
+ "loss": 0.4312,
1470
  "step": 10150
1471
  },
1472
  {
1473
  "epoch": 3.605514316012725,
1474
+ "grad_norm": 4.3978729248046875,
1475
+ "learning_rate": 1.397313538352775e-05,
1476
+ "loss": 0.5477,
1477
  "step": 10200
1478
  },
1479
  {
1480
  "epoch": 3.6231884057971016,
1481
+ "grad_norm": 3.3587982654571533,
1482
+ "learning_rate": 1.3796394485683988e-05,
1483
+ "loss": 0.4373,
1484
  "step": 10250
1485
  },
1486
  {
1487
  "epoch": 3.6408624955814775,
1488
+ "grad_norm": 2.463456392288208,
1489
+ "learning_rate": 1.3619653587840228e-05,
1490
+ "loss": 0.4234,
1491
  "step": 10300
1492
  },
1493
  {
1494
  "epoch": 3.658536585365854,
1495
+ "grad_norm": 3.532365560531616,
1496
+ "learning_rate": 1.3442912689996465e-05,
1497
+ "loss": 0.4115,
1498
  "step": 10350
1499
  },
1500
  {
1501
  "epoch": 3.6762106751502297,
1502
+ "grad_norm": 3.4025349617004395,
1503
+ "learning_rate": 1.3266171792152704e-05,
1504
+ "loss": 0.4462,
1505
  "step": 10400
1506
  },
1507
  {
1508
  "epoch": 3.693884764934606,
1509
+ "grad_norm": 1.0425785779953003,
1510
+ "learning_rate": 1.3089430894308943e-05,
1511
+ "loss": 0.3494,
1512
  "step": 10450
1513
  },
1514
  {
1515
  "epoch": 3.711558854718982,
1516
+ "grad_norm": 4.738943099975586,
1517
+ "learning_rate": 1.2912689996465183e-05,
1518
+ "loss": 0.4316,
1519
  "step": 10500
1520
  },
1521
  {
1522
  "epoch": 3.729232944503358,
1523
+ "grad_norm": 2.0041253566741943,
1524
+ "learning_rate": 1.2735949098621422e-05,
1525
+ "loss": 0.4596,
1526
  "step": 10550
1527
  },
1528
  {
1529
  "epoch": 3.746907034287734,
1530
+ "grad_norm": 4.681216239929199,
1531
+ "learning_rate": 1.2559208200777661e-05,
1532
+ "loss": 0.4013,
1533
  "step": 10600
1534
  },
1535
  {
1536
  "epoch": 3.7645811240721105,
1537
+ "grad_norm": 4.387250900268555,
1538
+ "learning_rate": 1.2382467302933899e-05,
1539
+ "loss": 0.387,
1540
  "step": 10650
1541
  },
1542
  {
1543
  "epoch": 3.7822552138564864,
1544
+ "grad_norm": 2.4324512481689453,
1545
+ "learning_rate": 1.2205726405090138e-05,
1546
+ "loss": 0.4383,
1547
  "step": 10700
1548
  },
1549
  {
1550
  "epoch": 3.7999293036408623,
1551
+ "grad_norm": 4.895308494567871,
1552
+ "learning_rate": 1.2028985507246379e-05,
1553
+ "loss": 0.4567,
1554
  "step": 10750
1555
  },
1556
  {
1557
  "epoch": 3.8176033934252387,
1558
+ "grad_norm": 3.3893020153045654,
1559
+ "learning_rate": 1.1852244609402616e-05,
1560
+ "loss": 0.4316,
1561
  "step": 10800
1562
  },
1563
  {
1564
  "epoch": 3.8352774832096146,
1565
+ "grad_norm": 2.8720388412475586,
1566
+ "learning_rate": 1.1675503711558855e-05,
1567
+ "loss": 0.3712,
1568
  "step": 10850
1569
  },
1570
  {
1571
  "epoch": 3.852951572993991,
1572
+ "grad_norm": 2.9342293739318848,
1573
+ "learning_rate": 1.1498762813715094e-05,
1574
+ "loss": 0.3595,
1575
  "step": 10900
1576
  },
1577
  {
1578
  "epoch": 3.870625662778367,
1579
+ "grad_norm": 5.391896724700928,
1580
+ "learning_rate": 1.1322021915871334e-05,
1581
+ "loss": 0.3945,
1582
  "step": 10950
1583
  },
1584
  {
1585
  "epoch": 3.888299752562743,
1586
+ "grad_norm": 2.1623027324676514,
1587
+ "learning_rate": 1.1145281018027571e-05,
1588
+ "loss": 0.3849,
1589
  "step": 11000
1590
  },
1591
  {
1592
  "epoch": 3.905973842347119,
1593
+ "grad_norm": 2.6391725540161133,
1594
+ "learning_rate": 1.096854012018381e-05,
1595
+ "loss": 0.3837,
1596
  "step": 11050
1597
  },
1598
  {
1599
  "epoch": 3.9236479321314954,
1600
+ "grad_norm": 2.8502111434936523,
1601
+ "learning_rate": 1.079179922234005e-05,
1602
+ "loss": 0.4109,
1603
  "step": 11100
1604
  },
1605
  {
1606
  "epoch": 3.9413220219158713,
1607
+ "grad_norm": 3.7358663082122803,
1608
+ "learning_rate": 1.0615058324496289e-05,
1609
+ "loss": 0.4113,
1610
  "step": 11150
1611
  },
1612
  {
1613
  "epoch": 3.9589961117002472,
1614
+ "grad_norm": 3.771918535232544,
1615
+ "learning_rate": 1.0438317426652528e-05,
1616
+ "loss": 0.3664,
1617
  "step": 11200
1618
  },
1619
  {
1620
  "epoch": 3.9766702014846236,
1621
+ "grad_norm": 2.2865407466888428,
1622
+ "learning_rate": 1.0261576528808767e-05,
1623
+ "loss": 0.427,
1624
  "step": 11250
1625
  },
1626
  {
1627
  "epoch": 3.9943442912689995,
1628
+ "grad_norm": 3.386460781097412,
1629
+ "learning_rate": 1.0084835630965006e-05,
1630
+ "loss": 0.4252,
1631
  "step": 11300
1632
  },
1633
  {
1634
  "epoch": 4.0,
1635
+ "eval_bertscore_f1": 0.965401310132503,
1636
+ "eval_bleu": 0.5825957176218498,
1637
+ "eval_loss": 0.29947343468666077,
1638
+ "eval_meteor": 0.7356418884759407,
1639
+ "eval_rouge1": 0.8377531126193314,
1640
+ "eval_rouge2": 0.7761899021152611,
1641
+ "eval_runtime": 1380.5465,
1642
+ "eval_samples_per_second": 4.682,
1643
+ "eval_steps_per_second": 0.585,
1644
  "step": 11316
1645
  },
1646
  {
1647
  "epoch": 4.012018381053376,
1648
+ "grad_norm": 4.861545085906982,
1649
+ "learning_rate": 9.908094733121245e-06,
1650
+ "loss": 0.4298,
1651
  "step": 11350
1652
  },
1653
  {
1654
  "epoch": 4.029692470837752,
1655
+ "grad_norm": 6.035098075866699,
1656
+ "learning_rate": 9.731353835277483e-06,
1657
+ "loss": 0.3755,
1658
  "step": 11400
1659
  },
1660
  {
1661
  "epoch": 4.047366560622128,
1662
+ "grad_norm": 2.295048952102661,
1663
+ "learning_rate": 9.554612937433722e-06,
1664
+ "loss": 0.3772,
1665
  "step": 11450
1666
  },
1667
  {
1668
  "epoch": 4.065040650406504,
1669
+ "grad_norm": 3.09201979637146,
1670
+ "learning_rate": 9.377872039589961e-06,
1671
+ "loss": 0.4315,
1672
  "step": 11500
1673
  },
1674
  {
1675
  "epoch": 4.08271474019088,
1676
+ "grad_norm": 7.435740947723389,
1677
+ "learning_rate": 9.2011311417462e-06,
1678
+ "loss": 0.4276,
1679
  "step": 11550
1680
  },
1681
  {
1682
  "epoch": 4.100388829975256,
1683
+ "grad_norm": 2.680793046951294,
1684
+ "learning_rate": 9.02439024390244e-06,
1685
+ "loss": 0.3664,
1686
  "step": 11600
1687
  },
1688
  {
1689
  "epoch": 4.118062919759632,
1690
+ "grad_norm": 2.200497627258301,
1691
+ "learning_rate": 8.847649346058679e-06,
1692
+ "loss": 0.4054,
1693
  "step": 11650
1694
  },
1695
  {
1696
  "epoch": 4.135737009544009,
1697
+ "grad_norm": 3.864414930343628,
1698
+ "learning_rate": 8.670908448214918e-06,
1699
+ "loss": 0.4256,
1700
  "step": 11700
1701
  },
1702
  {
1703
  "epoch": 4.153411099328385,
1704
+ "grad_norm": 4.263733863830566,
1705
+ "learning_rate": 8.494167550371156e-06,
1706
+ "loss": 0.4066,
1707
  "step": 11750
1708
  },
1709
  {
1710
  "epoch": 4.171085189112761,
1711
+ "grad_norm": 4.246395587921143,
1712
+ "learning_rate": 8.317426652527395e-06,
1713
+ "loss": 0.3858,
1714
  "step": 11800
1715
  },
1716
  {
1717
  "epoch": 4.188759278897137,
1718
+ "grad_norm": 4.842310905456543,
1719
+ "learning_rate": 8.140685754683634e-06,
1720
+ "loss": 0.4026,
1721
  "step": 11850
1722
  },
1723
  {
1724
  "epoch": 4.2064333686815125,
1725
+ "grad_norm": 2.5259275436401367,
1726
+ "learning_rate": 7.963944856839873e-06,
1727
+ "loss": 0.3624,
1728
  "step": 11900
1729
  },
1730
  {
1731
  "epoch": 4.224107458465889,
1732
+ "grad_norm": 2.959528923034668,
1733
+ "learning_rate": 7.787203958996112e-06,
1734
+ "loss": 0.3971,
1735
  "step": 11950
1736
  },
1737
  {
1738
  "epoch": 4.241781548250265,
1739
+ "grad_norm": 2.9087865352630615,
1740
+ "learning_rate": 7.610463061152351e-06,
1741
+ "loss": 0.4095,
1742
  "step": 12000
1743
  },
1744
  {
1745
  "epoch": 4.259455638034641,
1746
+ "grad_norm": 3.8024725914001465,
1747
+ "learning_rate": 7.43372216330859e-06,
1748
+ "loss": 0.3815,
1749
  "step": 12050
1750
  },
1751
  {
1752
  "epoch": 4.277129727819017,
1753
+ "grad_norm": 3.2564175128936768,
1754
+ "learning_rate": 7.256981265464829e-06,
1755
+ "loss": 0.3752,
1756
  "step": 12100
1757
  },
1758
  {
1759
  "epoch": 4.294803817603394,
1760
+ "grad_norm": 1.157267689704895,
1761
+ "learning_rate": 7.080240367621067e-06,
1762
+ "loss": 0.3769,
1763
  "step": 12150
1764
  },
1765
  {
1766
  "epoch": 4.31247790738777,
1767
+ "grad_norm": 11.618633270263672,
1768
+ "learning_rate": 6.903499469777307e-06,
1769
+ "loss": 0.4042,
1770
  "step": 12200
1771
  },
1772
  {
1773
  "epoch": 4.330151997172146,
1774
+ "grad_norm": 4.113063812255859,
1775
+ "learning_rate": 6.726758571933546e-06,
1776
+ "loss": 0.4441,
1777
  "step": 12250
1778
  },
1779
  {
1780
  "epoch": 4.3478260869565215,
1781
+ "grad_norm": 4.125561237335205,
1782
+ "learning_rate": 6.550017674089785e-06,
1783
+ "loss": 0.421,
1784
  "step": 12300
1785
  },
1786
  {
1787
  "epoch": 4.365500176740898,
1788
+ "grad_norm": 3.2781832218170166,
1789
+ "learning_rate": 6.373276776246023e-06,
1790
+ "loss": 0.46,
1791
  "step": 12350
1792
  },
1793
  {
1794
  "epoch": 4.383174266525274,
1795
+ "grad_norm": 8.616064071655273,
1796
+ "learning_rate": 6.1965358784022625e-06,
1797
+ "loss": 0.3905,
1798
  "step": 12400
1799
  },
1800
  {
1801
  "epoch": 4.40084835630965,
1802
+ "grad_norm": 3.173698902130127,
1803
+ "learning_rate": 6.019794980558501e-06,
1804
+ "loss": 0.3529,
1805
  "step": 12450
1806
  },
1807
  {
1808
  "epoch": 4.418522446094026,
1809
+ "grad_norm": 2.7585160732269287,
1810
+ "learning_rate": 5.843054082714741e-06,
1811
+ "loss": 0.4236,
1812
  "step": 12500
1813
  },
1814
  {
1815
  "epoch": 4.436196535878402,
1816
+ "grad_norm": 1.5063729286193848,
1817
+ "learning_rate": 5.666313184870979e-06,
1818
+ "loss": 0.4182,
1819
  "step": 12550
1820
  },
1821
  {
1822
  "epoch": 4.453870625662779,
1823
+ "grad_norm": 5.755438327789307,
1824
+ "learning_rate": 5.4895722870272184e-06,
1825
+ "loss": 0.3909,
1826
  "step": 12600
1827
  },
1828
  {
1829
  "epoch": 4.471544715447155,
1830
+ "grad_norm": 2.465946674346924,
1831
+ "learning_rate": 5.312831389183457e-06,
1832
+ "loss": 0.4078,
1833
  "step": 12650
1834
  },
1835
  {
1836
  "epoch": 4.4892188052315305,
1837
+ "grad_norm": 2.642314910888672,
1838
+ "learning_rate": 5.136090491339696e-06,
1839
+ "loss": 0.4581,
1840
  "step": 12700
1841
  },
1842
  {
1843
  "epoch": 4.506892895015906,
1844
+ "grad_norm": 3.11537766456604,
1845
+ "learning_rate": 4.959349593495935e-06,
1846
+ "loss": 0.3574,
1847
  "step": 12750
1848
  },
1849
  {
1850
  "epoch": 4.524566984800282,
1851
+ "grad_norm": 5.104282855987549,
1852
+ "learning_rate": 4.782608695652174e-06,
1853
+ "loss": 0.3889,
1854
  "step": 12800
1855
  },
1856
  {
1857
  "epoch": 4.542241074584659,
1858
+ "grad_norm": 3.2097325325012207,
1859
+ "learning_rate": 4.605867797808413e-06,
1860
+ "loss": 0.3723,
1861
  "step": 12850
1862
  },
1863
  {
1864
  "epoch": 4.559915164369035,
1865
+ "grad_norm": 2.8303864002227783,
1866
+ "learning_rate": 4.429126899964652e-06,
1867
+ "loss": 0.4222,
1868
  "step": 12900
1869
  },
1870
  {
1871
  "epoch": 4.577589254153411,
1872
+ "grad_norm": 3.508904457092285,
1873
+ "learning_rate": 4.252386002120891e-06,
1874
+ "loss": 0.4109,
1875
  "step": 12950
1876
  },
1877
  {
1878
  "epoch": 4.595263343937788,
1879
+ "grad_norm": 3.8901443481445312,
1880
+ "learning_rate": 4.07564510427713e-06,
1881
+ "loss": 0.4248,
1882
  "step": 13000
1883
  },
1884
  {
1885
  "epoch": 4.612937433722164,
1886
+ "grad_norm": 4.3722920417785645,
1887
+ "learning_rate": 3.898904206433369e-06,
1888
+ "loss": 0.4139,
1889
  "step": 13050
1890
  },
1891
  {
1892
  "epoch": 4.6306115235065395,
1893
+ "grad_norm": 3.87107515335083,
1894
+ "learning_rate": 3.722163308589608e-06,
1895
+ "loss": 0.3862,
1896
  "step": 13100
1897
  },
1898
  {
1899
  "epoch": 4.648285613290915,
1900
+ "grad_norm": 4.208980560302734,
1901
+ "learning_rate": 3.5454224107458466e-06,
1902
+ "loss": 0.3971,
1903
  "step": 13150
1904
  },
1905
  {
1906
  "epoch": 4.665959703075291,
1907
+ "grad_norm": 3.0796680450439453,
1908
+ "learning_rate": 3.368681512902086e-06,
1909
+ "loss": 0.4466,
1910
  "step": 13200
1911
  },
1912
  {
1913
  "epoch": 4.683633792859668,
1914
+ "grad_norm": 3.2411413192749023,
1915
+ "learning_rate": 3.1919406150583245e-06,
1916
+ "loss": 0.3462,
1917
  "step": 13250
1918
  },
1919
  {
1920
  "epoch": 4.701307882644044,
1921
+ "grad_norm": 3.422546625137329,
1922
+ "learning_rate": 3.0151997172145637e-06,
1923
+ "loss": 0.4942,
1924
  "step": 13300
1925
  },
1926
  {
1927
  "epoch": 4.71898197242842,
1928
+ "grad_norm": 5.258462905883789,
1929
+ "learning_rate": 2.8384588193708025e-06,
1930
+ "loss": 0.3842,
1931
  "step": 13350
1932
  },
1933
  {
1934
  "epoch": 4.736656062212796,
1935
+ "grad_norm": 3.634772300720215,
1936
+ "learning_rate": 2.6617179215270417e-06,
1937
+ "loss": 0.386,
1938
  "step": 13400
1939
  },
1940
  {
1941
  "epoch": 4.754330151997172,
1942
+ "grad_norm": 2.493283987045288,
1943
+ "learning_rate": 2.4849770236832804e-06,
1944
+ "loss": 0.3646,
1945
  "step": 13450
1946
  },
1947
  {
1948
  "epoch": 4.7720042417815485,
1949
+ "grad_norm": 3.546058416366577,
1950
+ "learning_rate": 2.3082361258395196e-06,
1951
+ "loss": 0.4085,
1952
  "step": 13500
1953
  },
1954
  {
1955
  "epoch": 4.789678331565924,
1956
+ "grad_norm": 2.0962002277374268,
1957
+ "learning_rate": 2.1314952279957584e-06,
1958
+ "loss": 0.3869,
1959
  "step": 13550
1960
  },
1961
  {
1962
  "epoch": 4.8073524213503,
1963
+ "grad_norm": 2.2293384075164795,
1964
+ "learning_rate": 1.9547543301519976e-06,
1965
+ "loss": 0.4841,
1966
  "step": 13600
1967
  },
1968
  {
1969
  "epoch": 4.825026511134676,
1970
+ "grad_norm": 3.2926249504089355,
1971
+ "learning_rate": 1.7780134323082363e-06,
1972
+ "loss": 0.4599,
1973
  "step": 13650
1974
  },
1975
  {
1976
  "epoch": 4.842700600919053,
1977
+ "grad_norm": 5.047961235046387,
1978
+ "learning_rate": 1.6012725344644753e-06,
1979
+ "loss": 0.3796,
1980
  "step": 13700
1981
  },
1982
  {
1983
  "epoch": 4.860374690703429,
1984
+ "grad_norm": 3.179448366165161,
1985
+ "learning_rate": 1.424531636620714e-06,
1986
+ "loss": 0.3898,
1987
  "step": 13750
1988
  },
1989
  {
1990
  "epoch": 4.878048780487805,
1991
+ "grad_norm": 5.14663028717041,
1992
+ "learning_rate": 1.247790738776953e-06,
1993
+ "loss": 0.383,
1994
  "step": 13800
1995
  },
1996
  {
1997
  "epoch": 4.895722870272181,
1998
+ "grad_norm": 2.7722623348236084,
1999
+ "learning_rate": 1.071049840933192e-06,
2000
+ "loss": 0.3923,
2001
  "step": 13850
2002
  },
2003
  {
2004
  "epoch": 4.9133969600565575,
2005
+ "grad_norm": 4.3328447341918945,
2006
+ "learning_rate": 8.94308943089431e-07,
2007
+ "loss": 0.3859,
2008
  "step": 13900
2009
  },
2010
  {
2011
  "epoch": 4.931071049840933,
2012
+ "grad_norm": 3.5014865398406982,
2013
+ "learning_rate": 7.175680452456699e-07,
2014
+ "loss": 0.3909,
2015
  "step": 13950
2016
  },
2017
  {
2018
  "epoch": 4.948745139625309,
2019
+ "grad_norm": 4.449154376983643,
2020
+ "learning_rate": 5.408271474019089e-07,
2021
+ "loss": 0.4711,
2022
  "step": 14000
2023
  },
2024
  {
2025
  "epoch": 4.966419229409685,
2026
+ "grad_norm": 2.2578201293945312,
2027
+ "learning_rate": 3.640862495581478e-07,
2028
+ "loss": 0.3719,
2029
  "step": 14050
2030
  },
2031
  {
2032
  "epoch": 4.984093319194061,
2033
+ "grad_norm": 1.688942313194275,
2034
+ "learning_rate": 1.8734535171438673e-07,
2035
+ "loss": 0.4053,
2036
  "step": 14100
2037
  },
2038
  {
2039
  "epoch": 5.0,
2040
+ "eval_bertscore_f1": 0.9658045416033947,
2041
+ "eval_bleu": 0.5865134487850142,
2042
+ "eval_loss": 0.2939385771751404,
2043
+ "eval_meteor": 0.738667698887171,
2044
+ "eval_rouge1": 0.8397011041728719,
2045
+ "eval_rouge2": 0.7793367916496452,
2046
+ "eval_runtime": 1419.0322,
2047
+ "eval_samples_per_second": 4.555,
2048
+ "eval_steps_per_second": 0.569,
2049
  "step": 14145
2050
  }
2051
  ],
checkpoint-14145/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ee513a34fdce19a0629cc9aa4ea661dbd2cb881304d77bffec8f2b929d97943
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84eb264dc96b91d8b1c338a7f669bf17dafa7f32a2801b1215e7c0b8df1ea575
3
  size 5432