reza-alipour commited on
Commit
329df2f
·
1 Parent(s): 1573f37

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +14 -14
  3. eval_results.json +9 -9
  4. train_results.json +5 -5
  5. trainer_state.json +627 -873
README.md CHANGED
@@ -2,11 +2,23 @@
2
  base_model: reza-alipour/ft5
3
  tags:
4
  - generated_from_trainer
 
 
5
  metrics:
6
  - rouge
7
  model-index:
8
  - name: ft5
9
- results: []
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  # ft5
16
 
17
- This model is a fine-tuned version of [reza-alipour/ft5](https://huggingface.co/reza-alipour/ft5) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
  - Loss: 0.3032
20
  - Rouge1: 86.5313
 
2
  base_model: reza-alipour/ft5
3
  tags:
4
  - generated_from_trainer
5
+ datasets:
6
+ - reza-alipour/Text-Edit-Instruct-Preprocessed-4m
7
  metrics:
8
  - rouge
9
  model-index:
10
  - name: ft5
11
+ results:
12
+ - task:
13
+ name: Summarization
14
+ type: summarization
15
+ dataset:
16
+ name: reza-alipour/Text-Edit-Instruct-Preprocessed-4m
17
+ type: reza-alipour/Text-Edit-Instruct-Preprocessed-4m
18
+ metrics:
19
+ - name: Rouge1
20
+ type: rouge
21
+ value: 86.5313
22
  ---
23
 
24
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
26
 
27
  # ft5
28
 
29
+ This model is a fine-tuned version of [reza-alipour/ft5](https://huggingface.co/reza-alipour/ft5) on the reza-alipour/Text-Edit-Instruct-Preprocessed-4m dataset.
30
  It achieves the following results on the evaluation set:
31
  - Loss: 0.3032
32
  - Rouge1: 86.5313
all_results.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_gen_len": 24.50843661340899,
4
- "eval_loss": 0.3171856105327606,
5
- "eval_rouge1": 86.2684,
6
- "eval_rouge2": 75.1631,
7
- "eval_rougeL": 85.6942,
8
- "eval_rougeLsum": 85.72,
9
- "eval_runtime": 1529.4638,
10
  "eval_samples": 6697,
11
- "eval_samples_per_second": 4.379,
12
- "eval_steps_per_second": 1.095,
13
- "train_loss": 0.3748776986590622,
14
- "train_runtime": 44443.9793,
15
- "train_samples": 1124278,
16
- "train_samples_per_second": 25.297,
17
- "train_steps_per_second": 0.791
18
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_gen_len": 24.568762132298044,
4
+ "eval_loss": 0.3032413423061371,
5
+ "eval_rouge1": 86.5313,
6
+ "eval_rouge2": 75.3351,
7
+ "eval_rougeL": 85.9565,
8
+ "eval_rougeLsum": 85.9785,
9
+ "eval_runtime": 1490.9314,
10
  "eval_samples": 6697,
11
+ "eval_samples_per_second": 4.492,
12
+ "eval_steps_per_second": 1.123,
13
+ "train_loss": 0.35858034201253897,
14
+ "train_runtime": 38927.7212,
15
+ "train_samples": 993343,
16
+ "train_samples_per_second": 25.518,
17
+ "train_steps_per_second": 0.797
18
  }
eval_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_gen_len": 24.50843661340899,
4
- "eval_loss": 0.3171856105327606,
5
- "eval_rouge1": 86.2684,
6
- "eval_rouge2": 75.1631,
7
- "eval_rougeL": 85.6942,
8
- "eval_rougeLsum": 85.72,
9
- "eval_runtime": 1529.4638,
10
  "eval_samples": 6697,
11
- "eval_samples_per_second": 4.379,
12
- "eval_steps_per_second": 1.095
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_gen_len": 24.568762132298044,
4
+ "eval_loss": 0.3032413423061371,
5
+ "eval_rouge1": 86.5313,
6
+ "eval_rouge2": 75.3351,
7
+ "eval_rougeL": 85.9565,
8
+ "eval_rougeLsum": 85.9785,
9
+ "eval_runtime": 1490.9314,
10
  "eval_samples": 6697,
11
+ "eval_samples_per_second": 4.492,
12
+ "eval_steps_per_second": 1.123
13
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.3748776986590622,
4
- "train_runtime": 44443.9793,
5
- "train_samples": 1124278,
6
- "train_samples_per_second": 25.297,
7
- "train_steps_per_second": 0.791
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.35858034201253897,
4
+ "train_runtime": 38927.7212,
5
+ "train_samples": 993343,
6
+ "train_samples_per_second": 25.518,
7
+ "train_steps_per_second": 0.797
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9999786530045897,
5
  "eval_steps": 500,
6
- "global_step": 35133,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,2138 +11,1892 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 7e-05,
14
- "loss": 0.3593,
15
  "step": 100
16
  },
17
  {
18
  "epoch": 0.01,
19
  "learning_rate": 7e-05,
20
- "loss": 0.396,
21
  "step": 200
22
  },
23
  {
24
  "epoch": 0.01,
25
  "learning_rate": 7e-05,
26
- "loss": 0.3826,
27
  "step": 300
28
  },
29
  {
30
  "epoch": 0.01,
31
  "learning_rate": 7e-05,
32
- "loss": 0.3846,
33
  "step": 400
34
  },
35
  {
36
- "epoch": 0.01,
37
  "learning_rate": 7e-05,
38
- "loss": 0.3829,
39
  "step": 500
40
  },
41
  {
42
  "epoch": 0.02,
43
  "learning_rate": 7e-05,
44
- "loss": 0.3793,
45
  "step": 600
46
  },
47
  {
48
  "epoch": 0.02,
49
  "learning_rate": 7e-05,
50
- "loss": 0.3836,
51
  "step": 700
52
  },
53
  {
54
- "epoch": 0.02,
55
  "learning_rate": 7e-05,
56
- "loss": 0.3984,
57
  "step": 800
58
  },
59
  {
60
  "epoch": 0.03,
61
  "learning_rate": 7e-05,
62
- "loss": 0.3792,
63
  "step": 900
64
  },
65
  {
66
  "epoch": 0.03,
67
  "learning_rate": 7e-05,
68
- "loss": 0.3682,
69
  "step": 1000
70
  },
71
  {
72
- "epoch": 0.03,
73
  "learning_rate": 7e-05,
74
- "loss": 0.3816,
75
  "step": 1100
76
  },
77
  {
78
- "epoch": 0.03,
79
  "learning_rate": 7e-05,
80
- "loss": 0.3638,
81
  "step": 1200
82
  },
83
  {
84
  "epoch": 0.04,
85
  "learning_rate": 7e-05,
86
- "loss": 0.3917,
87
  "step": 1300
88
  },
89
  {
90
- "epoch": 0.04,
91
  "learning_rate": 7e-05,
92
- "loss": 0.3974,
93
  "step": 1400
94
  },
95
  {
96
- "epoch": 0.04,
97
  "learning_rate": 7e-05,
98
- "loss": 0.3772,
99
  "step": 1500
100
  },
101
  {
102
  "epoch": 0.05,
103
  "learning_rate": 7e-05,
104
- "loss": 0.3797,
105
  "step": 1600
106
  },
107
  {
108
  "epoch": 0.05,
109
  "learning_rate": 7e-05,
110
- "loss": 0.3709,
111
  "step": 1700
112
  },
113
  {
114
- "epoch": 0.05,
115
  "learning_rate": 7e-05,
116
- "loss": 0.3948,
117
  "step": 1800
118
  },
119
  {
120
- "epoch": 0.05,
121
  "learning_rate": 7e-05,
122
- "loss": 0.3839,
123
  "step": 1900
124
  },
125
  {
126
  "epoch": 0.06,
127
  "learning_rate": 7e-05,
128
- "loss": 0.3836,
129
  "step": 2000
130
  },
131
  {
132
- "epoch": 0.06,
133
  "learning_rate": 7e-05,
134
- "loss": 0.3856,
135
  "step": 2100
136
  },
137
  {
138
- "epoch": 0.06,
139
  "learning_rate": 7e-05,
140
- "loss": 0.3816,
141
  "step": 2200
142
  },
143
  {
144
  "epoch": 0.07,
145
  "learning_rate": 7e-05,
146
- "loss": 0.3916,
147
  "step": 2300
148
  },
149
  {
150
- "epoch": 0.07,
151
  "learning_rate": 7e-05,
152
- "loss": 0.3752,
153
  "step": 2400
154
  },
155
  {
156
- "epoch": 0.07,
157
  "learning_rate": 7e-05,
158
- "loss": 0.3853,
159
  "step": 2500
160
  },
161
  {
162
- "epoch": 0.07,
163
  "learning_rate": 7e-05,
164
- "loss": 0.3708,
165
  "step": 2600
166
  },
167
  {
168
- "epoch": 0.08,
169
  "learning_rate": 7e-05,
170
- "loss": 0.383,
171
  "step": 2700
172
  },
173
  {
174
- "epoch": 0.08,
175
  "learning_rate": 7e-05,
176
- "loss": 0.3845,
177
  "step": 2800
178
  },
179
  {
180
- "epoch": 0.08,
181
  "learning_rate": 7e-05,
182
- "loss": 0.3771,
183
  "step": 2900
184
  },
185
  {
186
- "epoch": 0.09,
187
  "learning_rate": 7e-05,
188
- "loss": 0.3718,
189
  "step": 3000
190
  },
191
  {
192
- "epoch": 0.09,
193
  "learning_rate": 7e-05,
194
- "loss": 0.3593,
195
  "step": 3100
196
  },
197
  {
198
- "epoch": 0.09,
199
  "learning_rate": 7e-05,
200
- "loss": 0.3817,
201
  "step": 3200
202
  },
203
  {
204
- "epoch": 0.09,
205
  "learning_rate": 7e-05,
206
- "loss": 0.3853,
207
  "step": 3300
208
  },
209
  {
210
- "epoch": 0.1,
211
  "learning_rate": 7e-05,
212
- "loss": 0.3724,
213
  "step": 3400
214
  },
215
  {
216
- "epoch": 0.1,
217
  "learning_rate": 7e-05,
218
- "loss": 0.4023,
219
  "step": 3500
220
  },
221
  {
222
- "epoch": 0.1,
223
  "learning_rate": 7e-05,
224
- "loss": 0.4,
225
  "step": 3600
226
  },
227
  {
228
- "epoch": 0.11,
229
  "learning_rate": 7e-05,
230
- "loss": 0.3877,
231
  "step": 3700
232
  },
233
  {
234
- "epoch": 0.11,
235
  "learning_rate": 7e-05,
236
- "loss": 0.3853,
237
  "step": 3800
238
  },
239
  {
240
- "epoch": 0.11,
241
  "learning_rate": 7e-05,
242
- "loss": 0.3843,
243
  "step": 3900
244
  },
245
  {
246
- "epoch": 0.11,
247
  "learning_rate": 7e-05,
248
- "loss": 0.3719,
249
  "step": 4000
250
  },
251
  {
252
- "epoch": 0.12,
253
  "learning_rate": 7e-05,
254
- "loss": 0.3678,
255
  "step": 4100
256
  },
257
  {
258
- "epoch": 0.12,
259
  "learning_rate": 7e-05,
260
- "loss": 0.3774,
261
  "step": 4200
262
  },
263
  {
264
- "epoch": 0.12,
265
  "learning_rate": 7e-05,
266
- "loss": 0.3738,
267
  "step": 4300
268
  },
269
  {
270
- "epoch": 0.13,
271
  "learning_rate": 7e-05,
272
- "loss": 0.3976,
273
  "step": 4400
274
  },
275
  {
276
- "epoch": 0.13,
277
  "learning_rate": 7e-05,
278
- "loss": 0.3949,
279
  "step": 4500
280
  },
281
  {
282
- "epoch": 0.13,
283
  "learning_rate": 7e-05,
284
- "loss": 0.4005,
285
  "step": 4600
286
  },
287
  {
288
- "epoch": 0.13,
289
  "learning_rate": 7e-05,
290
- "loss": 0.3831,
291
  "step": 4700
292
  },
293
  {
294
- "epoch": 0.14,
295
  "learning_rate": 7e-05,
296
- "loss": 0.383,
297
  "step": 4800
298
  },
299
  {
300
- "epoch": 0.14,
301
  "learning_rate": 7e-05,
302
- "loss": 0.4044,
303
  "step": 4900
304
  },
305
  {
306
- "epoch": 0.14,
307
  "learning_rate": 7e-05,
308
- "loss": 0.3757,
309
  "step": 5000
310
  },
311
  {
312
- "epoch": 0.15,
313
  "learning_rate": 7e-05,
314
- "loss": 0.377,
315
  "step": 5100
316
  },
317
  {
318
- "epoch": 0.15,
319
  "learning_rate": 7e-05,
320
- "loss": 0.3713,
321
  "step": 5200
322
  },
323
  {
324
- "epoch": 0.15,
325
  "learning_rate": 7e-05,
326
- "loss": 0.3968,
327
  "step": 5300
328
  },
329
  {
330
- "epoch": 0.15,
331
  "learning_rate": 7e-05,
332
- "loss": 0.3731,
333
  "step": 5400
334
  },
335
  {
336
- "epoch": 0.16,
337
  "learning_rate": 7e-05,
338
- "loss": 0.369,
339
  "step": 5500
340
  },
341
  {
342
- "epoch": 0.16,
343
  "learning_rate": 7e-05,
344
- "loss": 0.4124,
345
  "step": 5600
346
  },
347
  {
348
- "epoch": 0.16,
349
  "learning_rate": 7e-05,
350
- "loss": 0.383,
351
  "step": 5700
352
  },
353
  {
354
- "epoch": 0.17,
355
  "learning_rate": 7e-05,
356
- "loss": 0.3721,
357
  "step": 5800
358
  },
359
  {
360
- "epoch": 0.17,
361
  "learning_rate": 7e-05,
362
- "loss": 0.387,
363
  "step": 5900
364
  },
365
  {
366
- "epoch": 0.17,
367
  "learning_rate": 7e-05,
368
- "loss": 0.3807,
369
  "step": 6000
370
  },
371
  {
372
- "epoch": 0.17,
373
  "learning_rate": 7e-05,
374
- "loss": 0.3817,
375
  "step": 6100
376
  },
377
  {
378
- "epoch": 0.18,
379
  "learning_rate": 7e-05,
380
- "loss": 0.3657,
381
  "step": 6200
382
  },
383
  {
384
- "epoch": 0.18,
385
  "learning_rate": 7e-05,
386
- "loss": 0.3991,
387
  "step": 6300
388
  },
389
  {
390
- "epoch": 0.18,
391
  "learning_rate": 7e-05,
392
- "loss": 0.3972,
393
  "step": 6400
394
  },
395
  {
396
- "epoch": 0.19,
397
  "learning_rate": 7e-05,
398
- "loss": 0.3692,
399
  "step": 6500
400
  },
401
  {
402
- "epoch": 0.19,
403
  "learning_rate": 7e-05,
404
- "loss": 0.3678,
405
  "step": 6600
406
  },
407
  {
408
- "epoch": 0.19,
409
  "learning_rate": 7e-05,
410
- "loss": 0.3794,
411
  "step": 6700
412
  },
413
  {
414
- "epoch": 0.19,
415
  "learning_rate": 7e-05,
416
- "loss": 0.3876,
417
  "step": 6800
418
  },
419
  {
420
- "epoch": 0.2,
421
  "learning_rate": 7e-05,
422
- "loss": 0.3754,
423
  "step": 6900
424
  },
425
  {
426
- "epoch": 0.2,
427
  "learning_rate": 7e-05,
428
- "loss": 0.3695,
429
  "step": 7000
430
  },
431
  {
432
- "epoch": 0.2,
433
  "learning_rate": 7e-05,
434
- "loss": 0.3911,
435
  "step": 7100
436
  },
437
  {
438
- "epoch": 0.2,
439
  "learning_rate": 7e-05,
440
- "loss": 0.3911,
441
  "step": 7200
442
  },
443
  {
444
- "epoch": 0.21,
445
  "learning_rate": 7e-05,
446
- "loss": 0.3895,
447
  "step": 7300
448
  },
449
  {
450
- "epoch": 0.21,
451
  "learning_rate": 7e-05,
452
- "loss": 0.3674,
453
  "step": 7400
454
  },
455
  {
456
- "epoch": 0.21,
457
  "learning_rate": 7e-05,
458
- "loss": 0.376,
459
  "step": 7500
460
  },
461
  {
462
- "epoch": 0.22,
463
  "learning_rate": 7e-05,
464
- "loss": 0.3729,
465
  "step": 7600
466
  },
467
  {
468
- "epoch": 0.22,
469
  "learning_rate": 7e-05,
470
- "loss": 0.3879,
471
  "step": 7700
472
  },
473
  {
474
- "epoch": 0.22,
475
  "learning_rate": 7e-05,
476
- "loss": 0.3893,
477
  "step": 7800
478
  },
479
  {
480
- "epoch": 0.22,
481
  "learning_rate": 7e-05,
482
- "loss": 0.3769,
483
  "step": 7900
484
  },
485
  {
486
- "epoch": 0.23,
487
  "learning_rate": 7e-05,
488
- "loss": 0.3687,
489
  "step": 8000
490
  },
491
  {
492
- "epoch": 0.23,
493
  "learning_rate": 7e-05,
494
- "loss": 0.3548,
495
  "step": 8100
496
  },
497
  {
498
- "epoch": 0.23,
499
  "learning_rate": 7e-05,
500
- "loss": 0.381,
501
  "step": 8200
502
  },
503
  {
504
- "epoch": 0.24,
505
  "learning_rate": 7e-05,
506
- "loss": 0.3818,
507
  "step": 8300
508
  },
509
  {
510
- "epoch": 0.24,
511
  "learning_rate": 7e-05,
512
- "loss": 0.37,
513
  "step": 8400
514
  },
515
  {
516
- "epoch": 0.24,
517
  "learning_rate": 7e-05,
518
- "loss": 0.3858,
519
  "step": 8500
520
  },
521
  {
522
- "epoch": 0.24,
523
  "learning_rate": 7e-05,
524
- "loss": 0.3818,
525
  "step": 8600
526
  },
527
  {
528
- "epoch": 0.25,
529
  "learning_rate": 7e-05,
530
- "loss": 0.3829,
531
  "step": 8700
532
  },
533
  {
534
- "epoch": 0.25,
535
  "learning_rate": 7e-05,
536
- "loss": 0.3758,
537
  "step": 8800
538
  },
539
  {
540
- "epoch": 0.25,
541
  "learning_rate": 7e-05,
542
- "loss": 0.4087,
543
  "step": 8900
544
  },
545
  {
546
- "epoch": 0.26,
547
  "learning_rate": 7e-05,
548
- "loss": 0.3877,
549
  "step": 9000
550
  },
551
  {
552
- "epoch": 0.26,
553
  "learning_rate": 7e-05,
554
- "loss": 0.3549,
555
  "step": 9100
556
  },
557
  {
558
- "epoch": 0.26,
559
  "learning_rate": 7e-05,
560
- "loss": 0.3774,
561
  "step": 9200
562
  },
563
  {
564
- "epoch": 0.26,
565
  "learning_rate": 7e-05,
566
- "loss": 0.3722,
567
  "step": 9300
568
  },
569
  {
570
- "epoch": 0.27,
571
  "learning_rate": 7e-05,
572
- "loss": 0.3807,
573
  "step": 9400
574
  },
575
  {
576
- "epoch": 0.27,
577
  "learning_rate": 7e-05,
578
- "loss": 0.3672,
579
  "step": 9500
580
  },
581
  {
582
- "epoch": 0.27,
583
  "learning_rate": 7e-05,
584
- "loss": 0.375,
585
  "step": 9600
586
  },
587
  {
588
- "epoch": 0.28,
589
  "learning_rate": 7e-05,
590
- "loss": 0.3643,
591
  "step": 9700
592
  },
593
  {
594
- "epoch": 0.28,
595
  "learning_rate": 7e-05,
596
- "loss": 0.396,
597
  "step": 9800
598
  },
599
  {
600
- "epoch": 0.28,
601
  "learning_rate": 7e-05,
602
- "loss": 0.3897,
603
  "step": 9900
604
  },
605
  {
606
- "epoch": 0.28,
607
  "learning_rate": 7e-05,
608
- "loss": 0.3623,
609
  "step": 10000
610
  },
611
  {
612
- "epoch": 0.29,
613
  "learning_rate": 7e-05,
614
- "loss": 0.3719,
615
  "step": 10100
616
  },
617
  {
618
- "epoch": 0.29,
619
  "learning_rate": 7e-05,
620
- "loss": 0.3865,
621
  "step": 10200
622
  },
623
  {
624
- "epoch": 0.29,
625
  "learning_rate": 7e-05,
626
- "loss": 0.399,
627
  "step": 10300
628
  },
629
  {
630
- "epoch": 0.3,
631
  "learning_rate": 7e-05,
632
- "loss": 0.3935,
633
  "step": 10400
634
  },
635
  {
636
- "epoch": 0.3,
637
  "learning_rate": 7e-05,
638
- "loss": 0.3806,
639
  "step": 10500
640
  },
641
  {
642
- "epoch": 0.3,
643
  "learning_rate": 7e-05,
644
- "loss": 0.3802,
645
  "step": 10600
646
  },
647
  {
648
- "epoch": 0.3,
649
  "learning_rate": 7e-05,
650
- "loss": 0.3743,
651
  "step": 10700
652
  },
653
  {
654
- "epoch": 0.31,
655
  "learning_rate": 7e-05,
656
- "loss": 0.3875,
657
  "step": 10800
658
  },
659
  {
660
- "epoch": 0.31,
661
  "learning_rate": 7e-05,
662
- "loss": 0.3914,
663
  "step": 10900
664
  },
665
  {
666
- "epoch": 0.31,
667
  "learning_rate": 7e-05,
668
- "loss": 0.3865,
669
  "step": 11000
670
  },
671
  {
672
- "epoch": 0.32,
673
  "learning_rate": 7e-05,
674
- "loss": 0.3707,
675
  "step": 11100
676
  },
677
  {
678
- "epoch": 0.32,
679
  "learning_rate": 7e-05,
680
- "loss": 0.3953,
681
  "step": 11200
682
  },
683
  {
684
- "epoch": 0.32,
685
  "learning_rate": 7e-05,
686
- "loss": 0.3873,
687
  "step": 11300
688
  },
689
  {
690
- "epoch": 0.32,
691
  "learning_rate": 7e-05,
692
- "loss": 0.3696,
693
  "step": 11400
694
  },
695
  {
696
- "epoch": 0.33,
697
  "learning_rate": 7e-05,
698
- "loss": 0.3685,
699
  "step": 11500
700
  },
701
  {
702
- "epoch": 0.33,
703
  "learning_rate": 7e-05,
704
- "loss": 0.3941,
705
  "step": 11600
706
  },
707
  {
708
- "epoch": 0.33,
709
  "learning_rate": 7e-05,
710
- "loss": 0.3541,
711
  "step": 11700
712
  },
713
  {
714
- "epoch": 0.34,
715
  "learning_rate": 7e-05,
716
- "loss": 0.3726,
717
  "step": 11800
718
  },
719
  {
720
- "epoch": 0.34,
721
  "learning_rate": 7e-05,
722
- "loss": 0.3867,
723
  "step": 11900
724
  },
725
  {
726
- "epoch": 0.34,
727
  "learning_rate": 7e-05,
728
- "loss": 0.4029,
729
  "step": 12000
730
  },
731
  {
732
- "epoch": 0.34,
733
  "learning_rate": 7e-05,
734
- "loss": 0.3653,
735
  "step": 12100
736
  },
737
  {
738
- "epoch": 0.35,
739
  "learning_rate": 7e-05,
740
- "loss": 0.3644,
741
  "step": 12200
742
  },
743
  {
744
- "epoch": 0.35,
745
  "learning_rate": 7e-05,
746
- "loss": 0.3673,
747
  "step": 12300
748
  },
749
  {
750
- "epoch": 0.35,
751
  "learning_rate": 7e-05,
752
- "loss": 0.3961,
753
  "step": 12400
754
  },
755
  {
756
- "epoch": 0.36,
757
  "learning_rate": 7e-05,
758
- "loss": 0.3759,
759
  "step": 12500
760
  },
761
  {
762
- "epoch": 0.36,
763
  "learning_rate": 7e-05,
764
- "loss": 0.3776,
765
  "step": 12600
766
  },
767
  {
768
- "epoch": 0.36,
769
  "learning_rate": 7e-05,
770
- "loss": 0.3964,
771
  "step": 12700
772
  },
773
  {
774
- "epoch": 0.36,
775
  "learning_rate": 7e-05,
776
- "loss": 0.3696,
777
  "step": 12800
778
  },
779
  {
780
- "epoch": 0.37,
781
  "learning_rate": 7e-05,
782
- "loss": 0.389,
783
  "step": 12900
784
  },
785
  {
786
- "epoch": 0.37,
787
  "learning_rate": 7e-05,
788
- "loss": 0.3802,
789
  "step": 13000
790
  },
791
  {
792
- "epoch": 0.37,
793
  "learning_rate": 7e-05,
794
- "loss": 0.3788,
795
  "step": 13100
796
  },
797
  {
798
- "epoch": 0.38,
799
  "learning_rate": 7e-05,
800
- "loss": 0.37,
801
  "step": 13200
802
  },
803
  {
804
- "epoch": 0.38,
805
  "learning_rate": 7e-05,
806
- "loss": 0.373,
807
  "step": 13300
808
  },
809
  {
810
- "epoch": 0.38,
811
  "learning_rate": 7e-05,
812
- "loss": 0.3853,
813
  "step": 13400
814
  },
815
  {
816
- "epoch": 0.38,
817
  "learning_rate": 7e-05,
818
- "loss": 0.3729,
819
  "step": 13500
820
  },
821
  {
822
- "epoch": 0.39,
823
  "learning_rate": 7e-05,
824
- "loss": 0.3758,
825
  "step": 13600
826
  },
827
  {
828
- "epoch": 0.39,
829
  "learning_rate": 7e-05,
830
- "loss": 0.3769,
831
  "step": 13700
832
  },
833
  {
834
- "epoch": 0.39,
835
  "learning_rate": 7e-05,
836
- "loss": 0.3732,
837
  "step": 13800
838
  },
839
  {
840
- "epoch": 0.4,
841
  "learning_rate": 7e-05,
842
- "loss": 0.3651,
843
  "step": 13900
844
  },
845
  {
846
- "epoch": 0.4,
847
  "learning_rate": 7e-05,
848
- "loss": 0.3865,
849
  "step": 14000
850
  },
851
  {
852
- "epoch": 0.4,
853
  "learning_rate": 7e-05,
854
- "loss": 0.376,
855
  "step": 14100
856
  },
857
  {
858
- "epoch": 0.4,
859
  "learning_rate": 7e-05,
860
- "loss": 0.3728,
861
  "step": 14200
862
  },
863
  {
864
- "epoch": 0.41,
865
  "learning_rate": 7e-05,
866
- "loss": 0.3779,
867
  "step": 14300
868
  },
869
  {
870
- "epoch": 0.41,
871
  "learning_rate": 7e-05,
872
- "loss": 0.3939,
873
  "step": 14400
874
  },
875
  {
876
- "epoch": 0.41,
877
  "learning_rate": 7e-05,
878
- "loss": 0.3791,
879
  "step": 14500
880
  },
881
  {
882
- "epoch": 0.42,
883
  "learning_rate": 7e-05,
884
- "loss": 0.3739,
885
  "step": 14600
886
  },
887
  {
888
- "epoch": 0.42,
889
  "learning_rate": 7e-05,
890
- "loss": 0.3596,
891
  "step": 14700
892
  },
893
  {
894
- "epoch": 0.42,
895
  "learning_rate": 7e-05,
896
- "loss": 0.3669,
897
  "step": 14800
898
  },
899
  {
900
- "epoch": 0.42,
901
  "learning_rate": 7e-05,
902
- "loss": 0.3723,
903
  "step": 14900
904
  },
905
  {
906
- "epoch": 0.43,
907
  "learning_rate": 7e-05,
908
- "loss": 0.3833,
909
  "step": 15000
910
  },
911
  {
912
- "epoch": 0.43,
913
  "learning_rate": 7e-05,
914
- "loss": 0.3639,
915
  "step": 15100
916
  },
917
  {
918
- "epoch": 0.43,
919
  "learning_rate": 7e-05,
920
- "loss": 0.3695,
921
  "step": 15200
922
  },
923
  {
924
- "epoch": 0.44,
925
  "learning_rate": 7e-05,
926
- "loss": 0.3815,
927
  "step": 15300
928
  },
929
  {
930
- "epoch": 0.44,
931
  "learning_rate": 7e-05,
932
- "loss": 0.3762,
933
  "step": 15400
934
  },
935
  {
936
- "epoch": 0.44,
937
  "learning_rate": 7e-05,
938
- "loss": 0.3595,
939
  "step": 15500
940
  },
941
  {
942
- "epoch": 0.44,
943
  "learning_rate": 7e-05,
944
- "loss": 0.3665,
945
  "step": 15600
946
  },
947
  {
948
- "epoch": 0.45,
949
  "learning_rate": 7e-05,
950
- "loss": 0.3556,
951
  "step": 15700
952
  },
953
  {
954
- "epoch": 0.45,
955
  "learning_rate": 7e-05,
956
- "loss": 0.374,
957
  "step": 15800
958
  },
959
  {
960
- "epoch": 0.45,
961
  "learning_rate": 7e-05,
962
- "loss": 0.3768,
963
  "step": 15900
964
  },
965
  {
966
- "epoch": 0.46,
967
  "learning_rate": 7e-05,
968
- "loss": 0.3715,
969
  "step": 16000
970
  },
971
  {
972
- "epoch": 0.46,
973
  "learning_rate": 7e-05,
974
- "loss": 0.3664,
975
  "step": 16100
976
  },
977
  {
978
- "epoch": 0.46,
979
  "learning_rate": 7e-05,
980
- "loss": 0.3818,
981
  "step": 16200
982
  },
983
  {
984
- "epoch": 0.46,
985
  "learning_rate": 7e-05,
986
- "loss": 0.3658,
987
  "step": 16300
988
  },
989
  {
990
- "epoch": 0.47,
991
  "learning_rate": 7e-05,
992
- "loss": 0.3706,
993
  "step": 16400
994
  },
995
  {
996
- "epoch": 0.47,
997
  "learning_rate": 7e-05,
998
- "loss": 0.3677,
999
  "step": 16500
1000
  },
1001
  {
1002
- "epoch": 0.47,
1003
  "learning_rate": 7e-05,
1004
- "loss": 0.3646,
1005
  "step": 16600
1006
  },
1007
  {
1008
- "epoch": 0.48,
1009
  "learning_rate": 7e-05,
1010
- "loss": 0.3655,
1011
  "step": 16700
1012
  },
1013
  {
1014
- "epoch": 0.48,
1015
  "learning_rate": 7e-05,
1016
- "loss": 0.3791,
1017
  "step": 16800
1018
  },
1019
  {
1020
- "epoch": 0.48,
1021
  "learning_rate": 7e-05,
1022
- "loss": 0.352,
1023
  "step": 16900
1024
  },
1025
  {
1026
- "epoch": 0.48,
1027
  "learning_rate": 7e-05,
1028
- "loss": 0.3859,
1029
  "step": 17000
1030
  },
1031
  {
1032
- "epoch": 0.49,
1033
  "learning_rate": 7e-05,
1034
- "loss": 0.3687,
1035
  "step": 17100
1036
  },
1037
  {
1038
- "epoch": 0.49,
1039
  "learning_rate": 7e-05,
1040
- "loss": 0.3669,
1041
  "step": 17200
1042
  },
1043
  {
1044
- "epoch": 0.49,
1045
  "learning_rate": 7e-05,
1046
- "loss": 0.3778,
1047
  "step": 17300
1048
  },
1049
  {
1050
- "epoch": 0.5,
1051
  "learning_rate": 7e-05,
1052
- "loss": 0.3824,
1053
  "step": 17400
1054
  },
1055
  {
1056
- "epoch": 0.5,
1057
  "learning_rate": 7e-05,
1058
- "loss": 0.3854,
1059
  "step": 17500
1060
  },
1061
  {
1062
- "epoch": 0.5,
1063
  "learning_rate": 7e-05,
1064
- "loss": 0.3634,
1065
  "step": 17600
1066
  },
1067
  {
1068
- "epoch": 0.5,
1069
  "learning_rate": 7e-05,
1070
- "loss": 0.3617,
1071
  "step": 17700
1072
  },
1073
  {
1074
- "epoch": 0.51,
1075
  "learning_rate": 7e-05,
1076
- "loss": 0.3692,
1077
  "step": 17800
1078
  },
1079
  {
1080
- "epoch": 0.51,
1081
  "learning_rate": 7e-05,
1082
- "loss": 0.3685,
1083
  "step": 17900
1084
  },
1085
  {
1086
- "epoch": 0.51,
1087
  "learning_rate": 7e-05,
1088
- "loss": 0.386,
1089
  "step": 18000
1090
  },
1091
  {
1092
- "epoch": 0.52,
1093
  "learning_rate": 7e-05,
1094
- "loss": 0.3844,
1095
  "step": 18100
1096
  },
1097
  {
1098
- "epoch": 0.52,
1099
  "learning_rate": 7e-05,
1100
- "loss": 0.3601,
1101
  "step": 18200
1102
  },
1103
  {
1104
- "epoch": 0.52,
1105
  "learning_rate": 7e-05,
1106
- "loss": 0.3713,
1107
  "step": 18300
1108
  },
1109
  {
1110
- "epoch": 0.52,
1111
  "learning_rate": 7e-05,
1112
- "loss": 0.3792,
1113
  "step": 18400
1114
  },
1115
  {
1116
- "epoch": 0.53,
1117
  "learning_rate": 7e-05,
1118
- "loss": 0.382,
1119
  "step": 18500
1120
  },
1121
  {
1122
- "epoch": 0.53,
1123
  "learning_rate": 7e-05,
1124
- "loss": 0.3847,
1125
  "step": 18600
1126
  },
1127
  {
1128
- "epoch": 0.53,
1129
  "learning_rate": 7e-05,
1130
- "loss": 0.3844,
1131
  "step": 18700
1132
  },
1133
  {
1134
- "epoch": 0.54,
1135
  "learning_rate": 7e-05,
1136
- "loss": 0.3692,
1137
  "step": 18800
1138
  },
1139
  {
1140
- "epoch": 0.54,
1141
  "learning_rate": 7e-05,
1142
- "loss": 0.3768,
1143
  "step": 18900
1144
  },
1145
  {
1146
- "epoch": 0.54,
1147
  "learning_rate": 7e-05,
1148
- "loss": 0.3669,
1149
  "step": 19000
1150
  },
1151
  {
1152
- "epoch": 0.54,
1153
  "learning_rate": 7e-05,
1154
- "loss": 0.3952,
1155
  "step": 19100
1156
  },
1157
  {
1158
- "epoch": 0.55,
1159
  "learning_rate": 7e-05,
1160
- "loss": 0.3547,
1161
  "step": 19200
1162
  },
1163
  {
1164
- "epoch": 0.55,
1165
  "learning_rate": 7e-05,
1166
- "loss": 0.3765,
1167
  "step": 19300
1168
  },
1169
  {
1170
- "epoch": 0.55,
1171
  "learning_rate": 7e-05,
1172
- "loss": 0.375,
1173
  "step": 19400
1174
  },
1175
  {
1176
- "epoch": 0.56,
1177
  "learning_rate": 7e-05,
1178
- "loss": 0.3712,
1179
  "step": 19500
1180
  },
1181
  {
1182
- "epoch": 0.56,
1183
  "learning_rate": 7e-05,
1184
- "loss": 0.3765,
1185
  "step": 19600
1186
  },
1187
  {
1188
- "epoch": 0.56,
1189
  "learning_rate": 7e-05,
1190
- "loss": 0.3805,
1191
  "step": 19700
1192
  },
1193
  {
1194
- "epoch": 0.56,
1195
  "learning_rate": 7e-05,
1196
- "loss": 0.3712,
1197
  "step": 19800
1198
  },
1199
  {
1200
- "epoch": 0.57,
1201
  "learning_rate": 7e-05,
1202
- "loss": 0.3851,
1203
  "step": 19900
1204
  },
1205
  {
1206
- "epoch": 0.57,
1207
  "learning_rate": 7e-05,
1208
- "loss": 0.3808,
1209
  "step": 20000
1210
  },
1211
  {
1212
- "epoch": 0.57,
1213
  "learning_rate": 7e-05,
1214
- "loss": 0.3806,
1215
  "step": 20100
1216
  },
1217
  {
1218
- "epoch": 0.57,
1219
  "learning_rate": 7e-05,
1220
- "loss": 0.3484,
1221
  "step": 20200
1222
  },
1223
  {
1224
- "epoch": 0.58,
1225
  "learning_rate": 7e-05,
1226
- "loss": 0.3683,
1227
  "step": 20300
1228
  },
1229
  {
1230
- "epoch": 0.58,
1231
  "learning_rate": 7e-05,
1232
- "loss": 0.3686,
1233
  "step": 20400
1234
  },
1235
  {
1236
- "epoch": 0.58,
1237
  "learning_rate": 7e-05,
1238
- "loss": 0.3817,
1239
  "step": 20500
1240
  },
1241
  {
1242
- "epoch": 0.59,
1243
  "learning_rate": 7e-05,
1244
- "loss": 0.3742,
1245
  "step": 20600
1246
  },
1247
  {
1248
- "epoch": 0.59,
1249
  "learning_rate": 7e-05,
1250
- "loss": 0.3886,
1251
  "step": 20700
1252
  },
1253
  {
1254
- "epoch": 0.59,
1255
  "learning_rate": 7e-05,
1256
- "loss": 0.3642,
1257
  "step": 20800
1258
  },
1259
  {
1260
- "epoch": 0.59,
1261
  "learning_rate": 7e-05,
1262
- "loss": 0.3765,
1263
  "step": 20900
1264
  },
1265
  {
1266
- "epoch": 0.6,
1267
  "learning_rate": 7e-05,
1268
- "loss": 0.3704,
1269
  "step": 21000
1270
  },
1271
  {
1272
- "epoch": 0.6,
1273
  "learning_rate": 7e-05,
1274
- "loss": 0.3629,
1275
  "step": 21100
1276
  },
1277
  {
1278
- "epoch": 0.6,
1279
  "learning_rate": 7e-05,
1280
- "loss": 0.3761,
1281
  "step": 21200
1282
  },
1283
  {
1284
- "epoch": 0.61,
1285
  "learning_rate": 7e-05,
1286
- "loss": 0.3778,
1287
  "step": 21300
1288
  },
1289
  {
1290
- "epoch": 0.61,
1291
  "learning_rate": 7e-05,
1292
- "loss": 0.3832,
1293
  "step": 21400
1294
  },
1295
  {
1296
- "epoch": 0.61,
1297
  "learning_rate": 7e-05,
1298
- "loss": 0.3648,
1299
  "step": 21500
1300
  },
1301
  {
1302
- "epoch": 0.61,
1303
  "learning_rate": 7e-05,
1304
- "loss": 0.3765,
1305
  "step": 21600
1306
  },
1307
  {
1308
- "epoch": 0.62,
1309
  "learning_rate": 7e-05,
1310
- "loss": 0.3766,
1311
  "step": 21700
1312
  },
1313
  {
1314
- "epoch": 0.62,
1315
  "learning_rate": 7e-05,
1316
- "loss": 0.3703,
1317
  "step": 21800
1318
  },
1319
  {
1320
- "epoch": 0.62,
1321
  "learning_rate": 7e-05,
1322
- "loss": 0.3657,
1323
  "step": 21900
1324
  },
1325
  {
1326
- "epoch": 0.63,
1327
  "learning_rate": 7e-05,
1328
- "loss": 0.3763,
1329
  "step": 22000
1330
  },
1331
  {
1332
- "epoch": 0.63,
1333
  "learning_rate": 7e-05,
1334
- "loss": 0.3657,
1335
  "step": 22100
1336
  },
1337
  {
1338
- "epoch": 0.63,
1339
  "learning_rate": 7e-05,
1340
- "loss": 0.3736,
1341
  "step": 22200
1342
  },
1343
  {
1344
- "epoch": 0.63,
1345
  "learning_rate": 7e-05,
1346
- "loss": 0.3894,
1347
  "step": 22300
1348
  },
1349
  {
1350
- "epoch": 0.64,
1351
  "learning_rate": 7e-05,
1352
- "loss": 0.3859,
1353
  "step": 22400
1354
  },
1355
  {
1356
- "epoch": 0.64,
1357
  "learning_rate": 7e-05,
1358
- "loss": 0.3474,
1359
  "step": 22500
1360
  },
1361
  {
1362
- "epoch": 0.64,
1363
  "learning_rate": 7e-05,
1364
- "loss": 0.367,
1365
  "step": 22600
1366
  },
1367
  {
1368
- "epoch": 0.65,
1369
  "learning_rate": 7e-05,
1370
- "loss": 0.3561,
1371
  "step": 22700
1372
  },
1373
  {
1374
- "epoch": 0.65,
1375
  "learning_rate": 7e-05,
1376
- "loss": 0.3703,
1377
  "step": 22800
1378
  },
1379
  {
1380
- "epoch": 0.65,
1381
  "learning_rate": 7e-05,
1382
- "loss": 0.365,
1383
  "step": 22900
1384
  },
1385
  {
1386
- "epoch": 0.65,
1387
  "learning_rate": 7e-05,
1388
- "loss": 0.3924,
1389
  "step": 23000
1390
  },
1391
  {
1392
- "epoch": 0.66,
1393
  "learning_rate": 7e-05,
1394
- "loss": 0.3772,
1395
  "step": 23100
1396
  },
1397
  {
1398
- "epoch": 0.66,
1399
  "learning_rate": 7e-05,
1400
- "loss": 0.3783,
1401
  "step": 23200
1402
  },
1403
  {
1404
- "epoch": 0.66,
1405
  "learning_rate": 7e-05,
1406
- "loss": 0.3713,
1407
  "step": 23300
1408
  },
1409
  {
1410
- "epoch": 0.67,
1411
  "learning_rate": 7e-05,
1412
- "loss": 0.3844,
1413
  "step": 23400
1414
  },
1415
  {
1416
- "epoch": 0.67,
1417
  "learning_rate": 7e-05,
1418
- "loss": 0.3734,
1419
  "step": 23500
1420
  },
1421
  {
1422
- "epoch": 0.67,
1423
  "learning_rate": 7e-05,
1424
- "loss": 0.3747,
1425
  "step": 23600
1426
  },
1427
  {
1428
- "epoch": 0.67,
1429
  "learning_rate": 7e-05,
1430
- "loss": 0.3854,
1431
  "step": 23700
1432
  },
1433
  {
1434
- "epoch": 0.68,
1435
  "learning_rate": 7e-05,
1436
- "loss": 0.3752,
1437
  "step": 23800
1438
  },
1439
  {
1440
- "epoch": 0.68,
1441
  "learning_rate": 7e-05,
1442
- "loss": 0.37,
1443
  "step": 23900
1444
  },
1445
  {
1446
- "epoch": 0.68,
1447
  "learning_rate": 7e-05,
1448
- "loss": 0.3841,
1449
  "step": 24000
1450
  },
1451
  {
1452
- "epoch": 0.69,
1453
  "learning_rate": 7e-05,
1454
- "loss": 0.3717,
1455
  "step": 24100
1456
  },
1457
  {
1458
- "epoch": 0.69,
1459
  "learning_rate": 7e-05,
1460
- "loss": 0.3671,
1461
  "step": 24200
1462
  },
1463
  {
1464
- "epoch": 0.69,
1465
  "learning_rate": 7e-05,
1466
- "loss": 0.3668,
1467
  "step": 24300
1468
  },
1469
  {
1470
- "epoch": 0.69,
1471
  "learning_rate": 7e-05,
1472
- "loss": 0.3782,
1473
  "step": 24400
1474
  },
1475
  {
1476
- "epoch": 0.7,
1477
  "learning_rate": 7e-05,
1478
- "loss": 0.3807,
1479
  "step": 24500
1480
  },
1481
  {
1482
- "epoch": 0.7,
1483
  "learning_rate": 7e-05,
1484
- "loss": 0.3916,
1485
  "step": 24600
1486
  },
1487
  {
1488
- "epoch": 0.7,
1489
  "learning_rate": 7e-05,
1490
- "loss": 0.3664,
1491
  "step": 24700
1492
  },
1493
  {
1494
- "epoch": 0.71,
1495
  "learning_rate": 7e-05,
1496
- "loss": 0.3633,
1497
  "step": 24800
1498
  },
1499
  {
1500
- "epoch": 0.71,
1501
  "learning_rate": 7e-05,
1502
- "loss": 0.3628,
1503
  "step": 24900
1504
  },
1505
  {
1506
- "epoch": 0.71,
1507
  "learning_rate": 7e-05,
1508
- "loss": 0.3691,
1509
  "step": 25000
1510
  },
1511
  {
1512
- "epoch": 0.71,
1513
  "learning_rate": 7e-05,
1514
- "loss": 0.3661,
1515
  "step": 25100
1516
  },
1517
  {
1518
- "epoch": 0.72,
1519
  "learning_rate": 7e-05,
1520
- "loss": 0.3654,
1521
  "step": 25200
1522
  },
1523
  {
1524
- "epoch": 0.72,
1525
- "learning_rate": 7e-05,
1526
- "loss": 0.3616,
1527
- "step": 25300
1528
- },
1529
- {
1530
- "epoch": 0.72,
1531
  "learning_rate": 7e-05,
1532
  "loss": 0.3624,
1533
- "step": 25400
1534
- },
1535
- {
1536
- "epoch": 0.73,
1537
- "learning_rate": 7e-05,
1538
- "loss": 0.3717,
1539
- "step": 25500
1540
- },
1541
- {
1542
- "epoch": 0.73,
1543
- "learning_rate": 7e-05,
1544
- "loss": 0.3606,
1545
- "step": 25600
1546
- },
1547
- {
1548
- "epoch": 0.73,
1549
- "learning_rate": 7e-05,
1550
- "loss": 0.3859,
1551
- "step": 25700
1552
- },
1553
- {
1554
- "epoch": 0.73,
1555
- "learning_rate": 7e-05,
1556
- "loss": 0.3844,
1557
- "step": 25800
1558
- },
1559
- {
1560
- "epoch": 0.74,
1561
- "learning_rate": 7e-05,
1562
- "loss": 0.3696,
1563
- "step": 25900
1564
- },
1565
- {
1566
- "epoch": 0.74,
1567
- "learning_rate": 7e-05,
1568
- "loss": 0.379,
1569
- "step": 26000
1570
- },
1571
- {
1572
- "epoch": 0.74,
1573
- "learning_rate": 7e-05,
1574
- "loss": 0.3798,
1575
- "step": 26100
1576
- },
1577
- {
1578
- "epoch": 0.75,
1579
- "learning_rate": 7e-05,
1580
- "loss": 0.3671,
1581
- "step": 26200
1582
- },
1583
- {
1584
- "epoch": 0.75,
1585
- "learning_rate": 7e-05,
1586
- "loss": 0.3764,
1587
- "step": 26300
1588
- },
1589
- {
1590
- "epoch": 0.75,
1591
- "learning_rate": 7e-05,
1592
- "loss": 0.3651,
1593
- "step": 26400
1594
- },
1595
- {
1596
- "epoch": 0.75,
1597
- "learning_rate": 7e-05,
1598
- "loss": 0.3756,
1599
- "step": 26500
1600
- },
1601
- {
1602
- "epoch": 0.76,
1603
- "learning_rate": 7e-05,
1604
- "loss": 0.3603,
1605
- "step": 26600
1606
- },
1607
- {
1608
- "epoch": 0.76,
1609
- "learning_rate": 7e-05,
1610
- "loss": 0.355,
1611
- "step": 26700
1612
- },
1613
- {
1614
- "epoch": 0.76,
1615
- "learning_rate": 7e-05,
1616
- "loss": 0.3756,
1617
- "step": 26800
1618
- },
1619
- {
1620
- "epoch": 0.77,
1621
- "learning_rate": 7e-05,
1622
- "loss": 0.3784,
1623
- "step": 26900
1624
- },
1625
- {
1626
- "epoch": 0.77,
1627
- "learning_rate": 7e-05,
1628
- "loss": 0.3592,
1629
- "step": 27000
1630
- },
1631
- {
1632
- "epoch": 0.77,
1633
- "learning_rate": 7e-05,
1634
- "loss": 0.353,
1635
- "step": 27100
1636
- },
1637
- {
1638
- "epoch": 0.77,
1639
- "learning_rate": 7e-05,
1640
- "loss": 0.3617,
1641
- "step": 27200
1642
- },
1643
- {
1644
- "epoch": 0.78,
1645
- "learning_rate": 7e-05,
1646
- "loss": 0.3548,
1647
- "step": 27300
1648
- },
1649
- {
1650
- "epoch": 0.78,
1651
- "learning_rate": 7e-05,
1652
- "loss": 0.3867,
1653
- "step": 27400
1654
- },
1655
- {
1656
- "epoch": 0.78,
1657
- "learning_rate": 7e-05,
1658
- "loss": 0.3676,
1659
- "step": 27500
1660
- },
1661
- {
1662
- "epoch": 0.79,
1663
- "learning_rate": 7e-05,
1664
- "loss": 0.3761,
1665
- "step": 27600
1666
- },
1667
- {
1668
- "epoch": 0.79,
1669
- "learning_rate": 7e-05,
1670
- "loss": 0.3783,
1671
- "step": 27700
1672
- },
1673
- {
1674
- "epoch": 0.79,
1675
- "learning_rate": 7e-05,
1676
- "loss": 0.3749,
1677
- "step": 27800
1678
- },
1679
- {
1680
- "epoch": 0.79,
1681
- "learning_rate": 7e-05,
1682
- "loss": 0.3588,
1683
- "step": 27900
1684
- },
1685
- {
1686
- "epoch": 0.8,
1687
- "learning_rate": 7e-05,
1688
- "loss": 0.3794,
1689
- "step": 28000
1690
- },
1691
- {
1692
- "epoch": 0.8,
1693
- "learning_rate": 7e-05,
1694
- "loss": 0.3742,
1695
- "step": 28100
1696
- },
1697
- {
1698
- "epoch": 0.8,
1699
- "learning_rate": 7e-05,
1700
- "loss": 0.3627,
1701
- "step": 28200
1702
- },
1703
- {
1704
- "epoch": 0.81,
1705
- "learning_rate": 7e-05,
1706
- "loss": 0.3796,
1707
- "step": 28300
1708
- },
1709
- {
1710
- "epoch": 0.81,
1711
- "learning_rate": 7e-05,
1712
- "loss": 0.3591,
1713
- "step": 28400
1714
- },
1715
- {
1716
- "epoch": 0.81,
1717
- "learning_rate": 7e-05,
1718
- "loss": 0.3696,
1719
- "step": 28500
1720
- },
1721
- {
1722
- "epoch": 0.81,
1723
- "learning_rate": 7e-05,
1724
- "loss": 0.3709,
1725
- "step": 28600
1726
  },
1727
  {
1728
  "epoch": 0.82,
1729
  "learning_rate": 7e-05,
1730
- "loss": 0.3593,
1731
- "step": 28700
1732
  },
1733
  {
1734
  "epoch": 0.82,
1735
  "learning_rate": 7e-05,
1736
- "loss": 0.3864,
1737
- "step": 28800
1738
  },
1739
  {
1740
  "epoch": 0.82,
1741
  "learning_rate": 7e-05,
1742
- "loss": 0.3706,
1743
- "step": 28900
1744
- },
1745
- {
1746
- "epoch": 0.83,
1747
- "learning_rate": 7e-05,
1748
- "loss": 0.3475,
1749
- "step": 29000
1750
  },
1751
  {
1752
  "epoch": 0.83,
1753
  "learning_rate": 7e-05,
1754
- "loss": 0.3738,
1755
- "step": 29100
1756
  },
1757
  {
1758
  "epoch": 0.83,
1759
  "learning_rate": 7e-05,
1760
- "loss": 0.3797,
1761
- "step": 29200
1762
  },
1763
  {
1764
  "epoch": 0.83,
1765
  "learning_rate": 7e-05,
1766
- "loss": 0.37,
1767
- "step": 29300
1768
  },
1769
  {
1770
  "epoch": 0.84,
1771
  "learning_rate": 7e-05,
1772
- "loss": 0.3654,
1773
- "step": 29400
1774
  },
1775
  {
1776
  "epoch": 0.84,
1777
  "learning_rate": 7e-05,
1778
- "loss": 0.353,
1779
- "step": 29500
1780
  },
1781
  {
1782
  "epoch": 0.84,
1783
  "learning_rate": 7e-05,
1784
- "loss": 0.3781,
1785
- "step": 29600
1786
- },
1787
- {
1788
- "epoch": 0.85,
1789
- "learning_rate": 7e-05,
1790
- "loss": 0.3689,
1791
- "step": 29700
1792
  },
1793
  {
1794
  "epoch": 0.85,
1795
  "learning_rate": 7e-05,
1796
- "loss": 0.3801,
1797
- "step": 29800
1798
  },
1799
  {
1800
  "epoch": 0.85,
1801
  "learning_rate": 7e-05,
1802
- "loss": 0.3602,
1803
- "step": 29900
1804
  },
1805
  {
1806
  "epoch": 0.85,
1807
  "learning_rate": 7e-05,
1808
- "loss": 0.3746,
1809
- "step": 30000
1810
  },
1811
  {
1812
  "epoch": 0.86,
1813
  "learning_rate": 7e-05,
1814
- "loss": 0.367,
1815
- "step": 30100
1816
  },
1817
  {
1818
  "epoch": 0.86,
1819
  "learning_rate": 7e-05,
1820
- "loss": 0.3643,
1821
- "step": 30200
1822
  },
1823
  {
1824
  "epoch": 0.86,
1825
  "learning_rate": 7e-05,
1826
- "loss": 0.3739,
1827
- "step": 30300
1828
- },
1829
- {
1830
- "epoch": 0.87,
1831
- "learning_rate": 7e-05,
1832
- "loss": 0.3671,
1833
- "step": 30400
1834
  },
1835
  {
1836
  "epoch": 0.87,
1837
  "learning_rate": 7e-05,
1838
- "loss": 0.3488,
1839
- "step": 30500
1840
  },
1841
  {
1842
  "epoch": 0.87,
1843
  "learning_rate": 7e-05,
1844
- "loss": 0.3694,
1845
- "step": 30600
1846
  },
1847
  {
1848
  "epoch": 0.87,
1849
  "learning_rate": 7e-05,
1850
- "loss": 0.3512,
1851
- "step": 30700
1852
  },
1853
  {
1854
  "epoch": 0.88,
1855
  "learning_rate": 7e-05,
1856
- "loss": 0.3646,
1857
- "step": 30800
1858
  },
1859
  {
1860
  "epoch": 0.88,
1861
  "learning_rate": 7e-05,
1862
- "loss": 0.3593,
1863
- "step": 30900
1864
  },
1865
  {
1866
  "epoch": 0.88,
1867
  "learning_rate": 7e-05,
1868
- "loss": 0.3497,
1869
- "step": 31000
1870
- },
1871
- {
1872
- "epoch": 0.89,
1873
- "learning_rate": 7e-05,
1874
- "loss": 0.3672,
1875
- "step": 31100
1876
  },
1877
  {
1878
  "epoch": 0.89,
1879
  "learning_rate": 7e-05,
1880
- "loss": 0.3637,
1881
- "step": 31200
1882
  },
1883
  {
1884
  "epoch": 0.89,
1885
  "learning_rate": 7e-05,
1886
- "loss": 0.3587,
1887
- "step": 31300
1888
  },
1889
  {
1890
  "epoch": 0.89,
1891
  "learning_rate": 7e-05,
1892
- "loss": 0.3699,
1893
- "step": 31400
1894
  },
1895
  {
1896
  "epoch": 0.9,
1897
  "learning_rate": 7e-05,
1898
- "loss": 0.3773,
1899
- "step": 31500
1900
  },
1901
  {
1902
  "epoch": 0.9,
1903
  "learning_rate": 7e-05,
1904
- "loss": 0.3646,
1905
- "step": 31600
1906
  },
1907
  {
1908
  "epoch": 0.9,
1909
  "learning_rate": 7e-05,
1910
- "loss": 0.3506,
1911
- "step": 31700
1912
  },
1913
  {
1914
  "epoch": 0.91,
1915
  "learning_rate": 7e-05,
1916
- "loss": 0.3759,
1917
- "step": 31800
1918
  },
1919
  {
1920
  "epoch": 0.91,
1921
  "learning_rate": 7e-05,
1922
- "loss": 0.3704,
1923
- "step": 31900
1924
  },
1925
  {
1926
  "epoch": 0.91,
1927
  "learning_rate": 7e-05,
1928
- "loss": 0.3774,
1929
- "step": 32000
1930
  },
1931
  {
1932
  "epoch": 0.91,
1933
  "learning_rate": 7e-05,
1934
- "loss": 0.3789,
1935
- "step": 32100
1936
  },
1937
  {
1938
  "epoch": 0.92,
1939
  "learning_rate": 7e-05,
1940
- "loss": 0.3635,
1941
- "step": 32200
1942
  },
1943
  {
1944
  "epoch": 0.92,
1945
  "learning_rate": 7e-05,
1946
- "loss": 0.364,
1947
- "step": 32300
1948
  },
1949
  {
1950
  "epoch": 0.92,
1951
  "learning_rate": 7e-05,
1952
- "loss": 0.3623,
1953
- "step": 32400
1954
- },
1955
- {
1956
- "epoch": 0.93,
1957
- "learning_rate": 7e-05,
1958
- "loss": 0.3603,
1959
- "step": 32500
1960
  },
1961
  {
1962
  "epoch": 0.93,
1963
  "learning_rate": 7e-05,
1964
- "loss": 0.3743,
1965
- "step": 32600
1966
  },
1967
  {
1968
  "epoch": 0.93,
1969
  "learning_rate": 7e-05,
1970
- "loss": 0.3788,
1971
- "step": 32700
1972
  },
1973
  {
1974
  "epoch": 0.93,
1975
  "learning_rate": 7e-05,
1976
- "loss": 0.3626,
1977
- "step": 32800
1978
- },
1979
- {
1980
- "epoch": 0.94,
1981
- "learning_rate": 7e-05,
1982
- "loss": 0.3643,
1983
- "step": 32900
1984
  },
1985
  {
1986
  "epoch": 0.94,
1987
  "learning_rate": 7e-05,
1988
- "loss": 0.3766,
1989
- "step": 33000
1990
  },
1991
  {
1992
  "epoch": 0.94,
1993
  "learning_rate": 7e-05,
1994
- "loss": 0.3455,
1995
- "step": 33100
1996
  },
1997
  {
1998
  "epoch": 0.94,
1999
  "learning_rate": 7e-05,
2000
- "loss": 0.3594,
2001
- "step": 33200
2002
  },
2003
  {
2004
  "epoch": 0.95,
2005
  "learning_rate": 7e-05,
2006
- "loss": 0.3677,
2007
- "step": 33300
2008
  },
2009
  {
2010
  "epoch": 0.95,
2011
  "learning_rate": 7e-05,
2012
- "loss": 0.369,
2013
- "step": 33400
2014
  },
2015
  {
2016
  "epoch": 0.95,
2017
  "learning_rate": 7e-05,
2018
- "loss": 0.3612,
2019
- "step": 33500
2020
- },
2021
- {
2022
- "epoch": 0.96,
2023
- "learning_rate": 7e-05,
2024
- "loss": 0.3704,
2025
- "step": 33600
2026
  },
2027
  {
2028
  "epoch": 0.96,
2029
  "learning_rate": 7e-05,
2030
- "loss": 0.366,
2031
- "step": 33700
2032
  },
2033
  {
2034
  "epoch": 0.96,
2035
  "learning_rate": 7e-05,
2036
- "loss": 0.3767,
2037
- "step": 33800
2038
  },
2039
  {
2040
  "epoch": 0.96,
2041
  "learning_rate": 7e-05,
2042
- "loss": 0.3712,
2043
- "step": 33900
2044
  },
2045
  {
2046
  "epoch": 0.97,
2047
  "learning_rate": 7e-05,
2048
- "loss": 0.3583,
2049
- "step": 34000
2050
  },
2051
  {
2052
  "epoch": 0.97,
2053
  "learning_rate": 7e-05,
2054
- "loss": 0.3797,
2055
- "step": 34100
2056
  },
2057
  {
2058
  "epoch": 0.97,
2059
  "learning_rate": 7e-05,
2060
- "loss": 0.3683,
2061
- "step": 34200
2062
- },
2063
- {
2064
- "epoch": 0.98,
2065
- "learning_rate": 7e-05,
2066
- "loss": 0.3795,
2067
- "step": 34300
2068
  },
2069
  {
2070
  "epoch": 0.98,
2071
  "learning_rate": 7e-05,
2072
- "loss": 0.3641,
2073
- "step": 34400
2074
  },
2075
  {
2076
  "epoch": 0.98,
2077
  "learning_rate": 7e-05,
2078
- "loss": 0.3785,
2079
- "step": 34500
2080
  },
2081
  {
2082
  "epoch": 0.98,
2083
  "learning_rate": 7e-05,
2084
- "loss": 0.377,
2085
- "step": 34600
2086
  },
2087
  {
2088
  "epoch": 0.99,
2089
  "learning_rate": 7e-05,
2090
- "loss": 0.3799,
2091
- "step": 34700
2092
  },
2093
  {
2094
  "epoch": 0.99,
2095
  "learning_rate": 7e-05,
2096
- "loss": 0.3669,
2097
- "step": 34800
2098
  },
2099
  {
2100
  "epoch": 0.99,
2101
  "learning_rate": 7e-05,
2102
- "loss": 0.3754,
2103
- "step": 34900
2104
  },
2105
  {
2106
  "epoch": 1.0,
2107
  "learning_rate": 7e-05,
2108
- "loss": 0.3579,
2109
- "step": 35000
2110
  },
2111
  {
2112
  "epoch": 1.0,
2113
  "learning_rate": 7e-05,
2114
- "loss": 0.3929,
2115
- "step": 35100
2116
  },
2117
  {
2118
  "epoch": 1.0,
2119
- "eval_gen_len": 24.50843661340899,
2120
- "eval_loss": 0.3171856105327606,
2121
- "eval_rouge1": 86.2684,
2122
- "eval_rouge2": 75.1631,
2123
- "eval_rougeL": 85.6942,
2124
- "eval_rougeLsum": 85.72,
2125
- "eval_runtime": 1501.411,
2126
- "eval_samples_per_second": 4.46,
2127
- "eval_steps_per_second": 1.116,
2128
- "step": 35133
2129
  },
2130
  {
2131
  "epoch": 1.0,
2132
- "step": 35133,
2133
- "total_flos": 9.124518338088468e+17,
2134
- "train_loss": 0.3748776986590622,
2135
- "train_runtime": 44443.9793,
2136
- "train_samples_per_second": 25.297,
2137
- "train_steps_per_second": 0.791
2138
  }
2139
  ],
2140
  "logging_steps": 100,
2141
- "max_steps": 35133,
2142
  "num_input_tokens_seen": 0,
2143
  "num_train_epochs": 1,
2144
  "save_steps": 2500,
2145
- "total_flos": 9.124518338088468e+17,
2146
  "trial_name": null,
2147
  "trial_params": null
2148
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 31042,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 7e-05,
14
+ "loss": 0.3828,
15
  "step": 100
16
  },
17
  {
18
  "epoch": 0.01,
19
  "learning_rate": 7e-05,
20
+ "loss": 0.3667,
21
  "step": 200
22
  },
23
  {
24
  "epoch": 0.01,
25
  "learning_rate": 7e-05,
26
+ "loss": 0.3574,
27
  "step": 300
28
  },
29
  {
30
  "epoch": 0.01,
31
  "learning_rate": 7e-05,
32
+ "loss": 0.3835,
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 0.02,
37
  "learning_rate": 7e-05,
38
+ "loss": 0.3657,
39
  "step": 500
40
  },
41
  {
42
  "epoch": 0.02,
43
  "learning_rate": 7e-05,
44
+ "loss": 0.3542,
45
  "step": 600
46
  },
47
  {
48
  "epoch": 0.02,
49
  "learning_rate": 7e-05,
50
+ "loss": 0.3592,
51
  "step": 700
52
  },
53
  {
54
+ "epoch": 0.03,
55
  "learning_rate": 7e-05,
56
+ "loss": 0.3429,
57
  "step": 800
58
  },
59
  {
60
  "epoch": 0.03,
61
  "learning_rate": 7e-05,
62
+ "loss": 0.3618,
63
  "step": 900
64
  },
65
  {
66
  "epoch": 0.03,
67
  "learning_rate": 7e-05,
68
+ "loss": 0.3658,
69
  "step": 1000
70
  },
71
  {
72
+ "epoch": 0.04,
73
  "learning_rate": 7e-05,
74
+ "loss": 0.3608,
75
  "step": 1100
76
  },
77
  {
78
+ "epoch": 0.04,
79
  "learning_rate": 7e-05,
80
+ "loss": 0.3488,
81
  "step": 1200
82
  },
83
  {
84
  "epoch": 0.04,
85
  "learning_rate": 7e-05,
86
+ "loss": 0.3714,
87
  "step": 1300
88
  },
89
  {
90
+ "epoch": 0.05,
91
  "learning_rate": 7e-05,
92
+ "loss": 0.3791,
93
  "step": 1400
94
  },
95
  {
96
+ "epoch": 0.05,
97
  "learning_rate": 7e-05,
98
+ "loss": 0.3425,
99
  "step": 1500
100
  },
101
  {
102
  "epoch": 0.05,
103
  "learning_rate": 7e-05,
104
+ "loss": 0.3717,
105
  "step": 1600
106
  },
107
  {
108
  "epoch": 0.05,
109
  "learning_rate": 7e-05,
110
+ "loss": 0.3593,
111
  "step": 1700
112
  },
113
  {
114
+ "epoch": 0.06,
115
  "learning_rate": 7e-05,
116
+ "loss": 0.3619,
117
  "step": 1800
118
  },
119
  {
120
+ "epoch": 0.06,
121
  "learning_rate": 7e-05,
122
+ "loss": 0.3634,
123
  "step": 1900
124
  },
125
  {
126
  "epoch": 0.06,
127
  "learning_rate": 7e-05,
128
+ "loss": 0.3694,
129
  "step": 2000
130
  },
131
  {
132
+ "epoch": 0.07,
133
  "learning_rate": 7e-05,
134
+ "loss": 0.3664,
135
  "step": 2100
136
  },
137
  {
138
+ "epoch": 0.07,
139
  "learning_rate": 7e-05,
140
+ "loss": 0.3553,
141
  "step": 2200
142
  },
143
  {
144
  "epoch": 0.07,
145
  "learning_rate": 7e-05,
146
+ "loss": 0.372,
147
  "step": 2300
148
  },
149
  {
150
+ "epoch": 0.08,
151
  "learning_rate": 7e-05,
152
+ "loss": 0.3901,
153
  "step": 2400
154
  },
155
  {
156
+ "epoch": 0.08,
157
  "learning_rate": 7e-05,
158
+ "loss": 0.3713,
159
  "step": 2500
160
  },
161
  {
162
+ "epoch": 0.08,
163
  "learning_rate": 7e-05,
164
+ "loss": 0.3659,
165
  "step": 2600
166
  },
167
  {
168
+ "epoch": 0.09,
169
  "learning_rate": 7e-05,
170
+ "loss": 0.3568,
171
  "step": 2700
172
  },
173
  {
174
+ "epoch": 0.09,
175
  "learning_rate": 7e-05,
176
+ "loss": 0.3533,
177
  "step": 2800
178
  },
179
  {
180
+ "epoch": 0.09,
181
  "learning_rate": 7e-05,
182
+ "loss": 0.365,
183
  "step": 2900
184
  },
185
  {
186
+ "epoch": 0.1,
187
  "learning_rate": 7e-05,
188
+ "loss": 0.3647,
189
  "step": 3000
190
  },
191
  {
192
+ "epoch": 0.1,
193
  "learning_rate": 7e-05,
194
+ "loss": 0.3671,
195
  "step": 3100
196
  },
197
  {
198
+ "epoch": 0.1,
199
  "learning_rate": 7e-05,
200
+ "loss": 0.3423,
201
  "step": 3200
202
  },
203
  {
204
+ "epoch": 0.11,
205
  "learning_rate": 7e-05,
206
+ "loss": 0.3591,
207
  "step": 3300
208
  },
209
  {
210
+ "epoch": 0.11,
211
  "learning_rate": 7e-05,
212
+ "loss": 0.3715,
213
  "step": 3400
214
  },
215
  {
216
+ "epoch": 0.11,
217
  "learning_rate": 7e-05,
218
+ "loss": 0.3561,
219
  "step": 3500
220
  },
221
  {
222
+ "epoch": 0.12,
223
  "learning_rate": 7e-05,
224
+ "loss": 0.369,
225
  "step": 3600
226
  },
227
  {
228
+ "epoch": 0.12,
229
  "learning_rate": 7e-05,
230
+ "loss": 0.3725,
231
  "step": 3700
232
  },
233
  {
234
+ "epoch": 0.12,
235
  "learning_rate": 7e-05,
236
+ "loss": 0.3563,
237
  "step": 3800
238
  },
239
  {
240
+ "epoch": 0.13,
241
  "learning_rate": 7e-05,
242
+ "loss": 0.3738,
243
  "step": 3900
244
  },
245
  {
246
+ "epoch": 0.13,
247
  "learning_rate": 7e-05,
248
+ "loss": 0.355,
249
  "step": 4000
250
  },
251
  {
252
+ "epoch": 0.13,
253
  "learning_rate": 7e-05,
254
+ "loss": 0.3486,
255
  "step": 4100
256
  },
257
  {
258
+ "epoch": 0.14,
259
  "learning_rate": 7e-05,
260
+ "loss": 0.3627,
261
  "step": 4200
262
  },
263
  {
264
+ "epoch": 0.14,
265
  "learning_rate": 7e-05,
266
+ "loss": 0.3324,
267
  "step": 4300
268
  },
269
  {
270
+ "epoch": 0.14,
271
  "learning_rate": 7e-05,
272
+ "loss": 0.3921,
273
  "step": 4400
274
  },
275
  {
276
+ "epoch": 0.14,
277
  "learning_rate": 7e-05,
278
+ "loss": 0.3509,
279
  "step": 4500
280
  },
281
  {
282
+ "epoch": 0.15,
283
  "learning_rate": 7e-05,
284
+ "loss": 0.3671,
285
  "step": 4600
286
  },
287
  {
288
+ "epoch": 0.15,
289
  "learning_rate": 7e-05,
290
+ "loss": 0.363,
291
  "step": 4700
292
  },
293
  {
294
+ "epoch": 0.15,
295
  "learning_rate": 7e-05,
296
+ "loss": 0.3565,
297
  "step": 4800
298
  },
299
  {
300
+ "epoch": 0.16,
301
  "learning_rate": 7e-05,
302
+ "loss": 0.3709,
303
  "step": 4900
304
  },
305
  {
306
+ "epoch": 0.16,
307
  "learning_rate": 7e-05,
308
+ "loss": 0.3825,
309
  "step": 5000
310
  },
311
  {
312
+ "epoch": 0.16,
313
  "learning_rate": 7e-05,
314
+ "loss": 0.3565,
315
  "step": 5100
316
  },
317
  {
318
+ "epoch": 0.17,
319
  "learning_rate": 7e-05,
320
+ "loss": 0.3665,
321
  "step": 5200
322
  },
323
  {
324
+ "epoch": 0.17,
325
  "learning_rate": 7e-05,
326
+ "loss": 0.364,
327
  "step": 5300
328
  },
329
  {
330
+ "epoch": 0.17,
331
  "learning_rate": 7e-05,
332
+ "loss": 0.3676,
333
  "step": 5400
334
  },
335
  {
336
+ "epoch": 0.18,
337
  "learning_rate": 7e-05,
338
+ "loss": 0.3629,
339
  "step": 5500
340
  },
341
  {
342
+ "epoch": 0.18,
343
  "learning_rate": 7e-05,
344
+ "loss": 0.3677,
345
  "step": 5600
346
  },
347
  {
348
+ "epoch": 0.18,
349
  "learning_rate": 7e-05,
350
+ "loss": 0.3749,
351
  "step": 5700
352
  },
353
  {
354
+ "epoch": 0.19,
355
  "learning_rate": 7e-05,
356
+ "loss": 0.3595,
357
  "step": 5800
358
  },
359
  {
360
+ "epoch": 0.19,
361
  "learning_rate": 7e-05,
362
+ "loss": 0.3583,
363
  "step": 5900
364
  },
365
  {
366
+ "epoch": 0.19,
367
  "learning_rate": 7e-05,
368
+ "loss": 0.3677,
369
  "step": 6000
370
  },
371
  {
372
+ "epoch": 0.2,
373
  "learning_rate": 7e-05,
374
+ "loss": 0.3529,
375
  "step": 6100
376
  },
377
  {
378
+ "epoch": 0.2,
379
  "learning_rate": 7e-05,
380
+ "loss": 0.368,
381
  "step": 6200
382
  },
383
  {
384
+ "epoch": 0.2,
385
  "learning_rate": 7e-05,
386
+ "loss": 0.3578,
387
  "step": 6300
388
  },
389
  {
390
+ "epoch": 0.21,
391
  "learning_rate": 7e-05,
392
+ "loss": 0.3463,
393
  "step": 6400
394
  },
395
  {
396
+ "epoch": 0.21,
397
  "learning_rate": 7e-05,
398
+ "loss": 0.3579,
399
  "step": 6500
400
  },
401
  {
402
+ "epoch": 0.21,
403
  "learning_rate": 7e-05,
404
+ "loss": 0.3628,
405
  "step": 6600
406
  },
407
  {
408
+ "epoch": 0.22,
409
  "learning_rate": 7e-05,
410
+ "loss": 0.3662,
411
  "step": 6700
412
  },
413
  {
414
+ "epoch": 0.22,
415
  "learning_rate": 7e-05,
416
+ "loss": 0.3689,
417
  "step": 6800
418
  },
419
  {
420
+ "epoch": 0.22,
421
  "learning_rate": 7e-05,
422
+ "loss": 0.3758,
423
  "step": 6900
424
  },
425
  {
426
+ "epoch": 0.23,
427
  "learning_rate": 7e-05,
428
+ "loss": 0.347,
429
  "step": 7000
430
  },
431
  {
432
+ "epoch": 0.23,
433
  "learning_rate": 7e-05,
434
+ "loss": 0.3525,
435
  "step": 7100
436
  },
437
  {
438
+ "epoch": 0.23,
439
  "learning_rate": 7e-05,
440
+ "loss": 0.3622,
441
  "step": 7200
442
  },
443
  {
444
+ "epoch": 0.24,
445
  "learning_rate": 7e-05,
446
+ "loss": 0.3462,
447
  "step": 7300
448
  },
449
  {
450
+ "epoch": 0.24,
451
  "learning_rate": 7e-05,
452
+ "loss": 0.3592,
453
  "step": 7400
454
  },
455
  {
456
+ "epoch": 0.24,
457
  "learning_rate": 7e-05,
458
+ "loss": 0.3551,
459
  "step": 7500
460
  },
461
  {
462
+ "epoch": 0.24,
463
  "learning_rate": 7e-05,
464
+ "loss": 0.3624,
465
  "step": 7600
466
  },
467
  {
468
+ "epoch": 0.25,
469
  "learning_rate": 7e-05,
470
+ "loss": 0.3676,
471
  "step": 7700
472
  },
473
  {
474
+ "epoch": 0.25,
475
  "learning_rate": 7e-05,
476
+ "loss": 0.3589,
477
  "step": 7800
478
  },
479
  {
480
+ "epoch": 0.25,
481
  "learning_rate": 7e-05,
482
+ "loss": 0.3474,
483
  "step": 7900
484
  },
485
  {
486
+ "epoch": 0.26,
487
  "learning_rate": 7e-05,
488
+ "loss": 0.3467,
489
  "step": 8000
490
  },
491
  {
492
+ "epoch": 0.26,
493
  "learning_rate": 7e-05,
494
+ "loss": 0.3568,
495
  "step": 8100
496
  },
497
  {
498
+ "epoch": 0.26,
499
  "learning_rate": 7e-05,
500
+ "loss": 0.37,
501
  "step": 8200
502
  },
503
  {
504
+ "epoch": 0.27,
505
  "learning_rate": 7e-05,
506
+ "loss": 0.358,
507
  "step": 8300
508
  },
509
  {
510
+ "epoch": 0.27,
511
  "learning_rate": 7e-05,
512
+ "loss": 0.3691,
513
  "step": 8400
514
  },
515
  {
516
+ "epoch": 0.27,
517
  "learning_rate": 7e-05,
518
+ "loss": 0.36,
519
  "step": 8500
520
  },
521
  {
522
+ "epoch": 0.28,
523
  "learning_rate": 7e-05,
524
+ "loss": 0.3666,
525
  "step": 8600
526
  },
527
  {
528
+ "epoch": 0.28,
529
  "learning_rate": 7e-05,
530
+ "loss": 0.3508,
531
  "step": 8700
532
  },
533
  {
534
+ "epoch": 0.28,
535
  "learning_rate": 7e-05,
536
+ "loss": 0.3457,
537
  "step": 8800
538
  },
539
  {
540
+ "epoch": 0.29,
541
  "learning_rate": 7e-05,
542
+ "loss": 0.3658,
543
  "step": 8900
544
  },
545
  {
546
+ "epoch": 0.29,
547
  "learning_rate": 7e-05,
548
+ "loss": 0.37,
549
  "step": 9000
550
  },
551
  {
552
+ "epoch": 0.29,
553
  "learning_rate": 7e-05,
554
+ "loss": 0.3657,
555
  "step": 9100
556
  },
557
  {
558
+ "epoch": 0.3,
559
  "learning_rate": 7e-05,
560
+ "loss": 0.3677,
561
  "step": 9200
562
  },
563
  {
564
+ "epoch": 0.3,
565
  "learning_rate": 7e-05,
566
+ "loss": 0.3692,
567
  "step": 9300
568
  },
569
  {
570
+ "epoch": 0.3,
571
  "learning_rate": 7e-05,
572
+ "loss": 0.366,
573
  "step": 9400
574
  },
575
  {
576
+ "epoch": 0.31,
577
  "learning_rate": 7e-05,
578
+ "loss": 0.3782,
579
  "step": 9500
580
  },
581
  {
582
+ "epoch": 0.31,
583
  "learning_rate": 7e-05,
584
+ "loss": 0.355,
585
  "step": 9600
586
  },
587
  {
588
+ "epoch": 0.31,
589
  "learning_rate": 7e-05,
590
+ "loss": 0.3562,
591
  "step": 9700
592
  },
593
  {
594
+ "epoch": 0.32,
595
  "learning_rate": 7e-05,
596
+ "loss": 0.3615,
597
  "step": 9800
598
  },
599
  {
600
+ "epoch": 0.32,
601
  "learning_rate": 7e-05,
602
+ "loss": 0.3597,
603
  "step": 9900
604
  },
605
  {
606
+ "epoch": 0.32,
607
  "learning_rate": 7e-05,
608
+ "loss": 0.3682,
609
  "step": 10000
610
  },
611
  {
612
+ "epoch": 0.33,
613
  "learning_rate": 7e-05,
614
+ "loss": 0.3818,
615
  "step": 10100
616
  },
617
  {
618
+ "epoch": 0.33,
619
  "learning_rate": 7e-05,
620
+ "loss": 0.3741,
621
  "step": 10200
622
  },
623
  {
624
+ "epoch": 0.33,
625
  "learning_rate": 7e-05,
626
+ "loss": 0.3695,
627
  "step": 10300
628
  },
629
  {
630
+ "epoch": 0.34,
631
  "learning_rate": 7e-05,
632
+ "loss": 0.3604,
633
  "step": 10400
634
  },
635
  {
636
+ "epoch": 0.34,
637
  "learning_rate": 7e-05,
638
+ "loss": 0.3679,
639
  "step": 10500
640
  },
641
  {
642
+ "epoch": 0.34,
643
  "learning_rate": 7e-05,
644
+ "loss": 0.3571,
645
  "step": 10600
646
  },
647
  {
648
+ "epoch": 0.34,
649
  "learning_rate": 7e-05,
650
+ "loss": 0.367,
651
  "step": 10700
652
  },
653
  {
654
+ "epoch": 0.35,
655
  "learning_rate": 7e-05,
656
+ "loss": 0.3479,
657
  "step": 10800
658
  },
659
  {
660
+ "epoch": 0.35,
661
  "learning_rate": 7e-05,
662
+ "loss": 0.3574,
663
  "step": 10900
664
  },
665
  {
666
+ "epoch": 0.35,
667
  "learning_rate": 7e-05,
668
+ "loss": 0.3602,
669
  "step": 11000
670
  },
671
  {
672
+ "epoch": 0.36,
673
  "learning_rate": 7e-05,
674
+ "loss": 0.3455,
675
  "step": 11100
676
  },
677
  {
678
+ "epoch": 0.36,
679
  "learning_rate": 7e-05,
680
+ "loss": 0.3559,
681
  "step": 11200
682
  },
683
  {
684
+ "epoch": 0.36,
685
  "learning_rate": 7e-05,
686
+ "loss": 0.3476,
687
  "step": 11300
688
  },
689
  {
690
+ "epoch": 0.37,
691
  "learning_rate": 7e-05,
692
+ "loss": 0.3659,
693
  "step": 11400
694
  },
695
  {
696
+ "epoch": 0.37,
697
  "learning_rate": 7e-05,
698
+ "loss": 0.3612,
699
  "step": 11500
700
  },
701
  {
702
+ "epoch": 0.37,
703
  "learning_rate": 7e-05,
704
+ "loss": 0.3524,
705
  "step": 11600
706
  },
707
  {
708
+ "epoch": 0.38,
709
  "learning_rate": 7e-05,
710
+ "loss": 0.3663,
711
  "step": 11700
712
  },
713
  {
714
+ "epoch": 0.38,
715
  "learning_rate": 7e-05,
716
+ "loss": 0.346,
717
  "step": 11800
718
  },
719
  {
720
+ "epoch": 0.38,
721
  "learning_rate": 7e-05,
722
+ "loss": 0.3569,
723
  "step": 11900
724
  },
725
  {
726
+ "epoch": 0.39,
727
  "learning_rate": 7e-05,
728
+ "loss": 0.3549,
729
  "step": 12000
730
  },
731
  {
732
+ "epoch": 0.39,
733
  "learning_rate": 7e-05,
734
+ "loss": 0.3698,
735
  "step": 12100
736
  },
737
  {
738
+ "epoch": 0.39,
739
  "learning_rate": 7e-05,
740
+ "loss": 0.3494,
741
  "step": 12200
742
  },
743
  {
744
+ "epoch": 0.4,
745
  "learning_rate": 7e-05,
746
+ "loss": 0.3592,
747
  "step": 12300
748
  },
749
  {
750
+ "epoch": 0.4,
751
  "learning_rate": 7e-05,
752
+ "loss": 0.3365,
753
  "step": 12400
754
  },
755
  {
756
+ "epoch": 0.4,
757
  "learning_rate": 7e-05,
758
+ "loss": 0.3527,
759
  "step": 12500
760
  },
761
  {
762
+ "epoch": 0.41,
763
  "learning_rate": 7e-05,
764
+ "loss": 0.3749,
765
  "step": 12600
766
  },
767
  {
768
+ "epoch": 0.41,
769
  "learning_rate": 7e-05,
770
+ "loss": 0.3476,
771
  "step": 12700
772
  },
773
  {
774
+ "epoch": 0.41,
775
  "learning_rate": 7e-05,
776
+ "loss": 0.377,
777
  "step": 12800
778
  },
779
  {
780
+ "epoch": 0.42,
781
  "learning_rate": 7e-05,
782
+ "loss": 0.3615,
783
  "step": 12900
784
  },
785
  {
786
+ "epoch": 0.42,
787
  "learning_rate": 7e-05,
788
+ "loss": 0.3604,
789
  "step": 13000
790
  },
791
  {
792
+ "epoch": 0.42,
793
  "learning_rate": 7e-05,
794
+ "loss": 0.3534,
795
  "step": 13100
796
  },
797
  {
798
+ "epoch": 0.43,
799
  "learning_rate": 7e-05,
800
+ "loss": 0.3597,
801
  "step": 13200
802
  },
803
  {
804
+ "epoch": 0.43,
805
  "learning_rate": 7e-05,
806
+ "loss": 0.3376,
807
  "step": 13300
808
  },
809
  {
810
+ "epoch": 0.43,
811
  "learning_rate": 7e-05,
812
+ "loss": 0.3576,
813
  "step": 13400
814
  },
815
  {
816
+ "epoch": 0.43,
817
  "learning_rate": 7e-05,
818
+ "loss": 0.357,
819
  "step": 13500
820
  },
821
  {
822
+ "epoch": 0.44,
823
  "learning_rate": 7e-05,
824
+ "loss": 0.3584,
825
  "step": 13600
826
  },
827
  {
828
+ "epoch": 0.44,
829
  "learning_rate": 7e-05,
830
+ "loss": 0.3394,
831
  "step": 13700
832
  },
833
  {
834
+ "epoch": 0.44,
835
  "learning_rate": 7e-05,
836
+ "loss": 0.3629,
837
  "step": 13800
838
  },
839
  {
840
+ "epoch": 0.45,
841
  "learning_rate": 7e-05,
842
+ "loss": 0.3707,
843
  "step": 13900
844
  },
845
  {
846
+ "epoch": 0.45,
847
  "learning_rate": 7e-05,
848
+ "loss": 0.3613,
849
  "step": 14000
850
  },
851
  {
852
+ "epoch": 0.45,
853
  "learning_rate": 7e-05,
854
+ "loss": 0.3557,
855
  "step": 14100
856
  },
857
  {
858
+ "epoch": 0.46,
859
  "learning_rate": 7e-05,
860
+ "loss": 0.3499,
861
  "step": 14200
862
  },
863
  {
864
+ "epoch": 0.46,
865
  "learning_rate": 7e-05,
866
+ "loss": 0.3644,
867
  "step": 14300
868
  },
869
  {
870
+ "epoch": 0.46,
871
  "learning_rate": 7e-05,
872
+ "loss": 0.3624,
873
  "step": 14400
874
  },
875
  {
876
+ "epoch": 0.47,
877
  "learning_rate": 7e-05,
878
+ "loss": 0.3672,
879
  "step": 14500
880
  },
881
  {
882
+ "epoch": 0.47,
883
  "learning_rate": 7e-05,
884
+ "loss": 0.3573,
885
  "step": 14600
886
  },
887
  {
888
+ "epoch": 0.47,
889
  "learning_rate": 7e-05,
890
+ "loss": 0.3572,
891
  "step": 14700
892
  },
893
  {
894
+ "epoch": 0.48,
895
  "learning_rate": 7e-05,
896
+ "loss": 0.3698,
897
  "step": 14800
898
  },
899
  {
900
+ "epoch": 0.48,
901
  "learning_rate": 7e-05,
902
+ "loss": 0.3638,
903
  "step": 14900
904
  },
905
  {
906
+ "epoch": 0.48,
907
  "learning_rate": 7e-05,
908
+ "loss": 0.3541,
909
  "step": 15000
910
  },
911
  {
912
+ "epoch": 0.49,
913
  "learning_rate": 7e-05,
914
+ "loss": 0.3533,
915
  "step": 15100
916
  },
917
  {
918
+ "epoch": 0.49,
919
  "learning_rate": 7e-05,
920
+ "loss": 0.3522,
921
  "step": 15200
922
  },
923
  {
924
+ "epoch": 0.49,
925
  "learning_rate": 7e-05,
926
+ "loss": 0.3643,
927
  "step": 15300
928
  },
929
  {
930
+ "epoch": 0.5,
931
  "learning_rate": 7e-05,
932
+ "loss": 0.355,
933
  "step": 15400
934
  },
935
  {
936
+ "epoch": 0.5,
937
  "learning_rate": 7e-05,
938
+ "loss": 0.3587,
939
  "step": 15500
940
  },
941
  {
942
+ "epoch": 0.5,
943
  "learning_rate": 7e-05,
944
+ "loss": 0.377,
945
  "step": 15600
946
  },
947
  {
948
+ "epoch": 0.51,
949
  "learning_rate": 7e-05,
950
+ "loss": 0.3594,
951
  "step": 15700
952
  },
953
  {
954
+ "epoch": 0.51,
955
  "learning_rate": 7e-05,
956
+ "loss": 0.3567,
957
  "step": 15800
958
  },
959
  {
960
+ "epoch": 0.51,
961
  "learning_rate": 7e-05,
962
+ "loss": 0.3627,
963
  "step": 15900
964
  },
965
  {
966
+ "epoch": 0.52,
967
  "learning_rate": 7e-05,
968
+ "loss": 0.3429,
969
  "step": 16000
970
  },
971
  {
972
+ "epoch": 0.52,
973
  "learning_rate": 7e-05,
974
+ "loss": 0.3555,
975
  "step": 16100
976
  },
977
  {
978
+ "epoch": 0.52,
979
  "learning_rate": 7e-05,
980
+ "loss": 0.3558,
981
  "step": 16200
982
  },
983
  {
984
+ "epoch": 0.53,
985
  "learning_rate": 7e-05,
986
+ "loss": 0.374,
987
  "step": 16300
988
  },
989
  {
990
+ "epoch": 0.53,
991
  "learning_rate": 7e-05,
992
+ "loss": 0.3602,
993
  "step": 16400
994
  },
995
  {
996
+ "epoch": 0.53,
997
  "learning_rate": 7e-05,
998
+ "loss": 0.3603,
999
  "step": 16500
1000
  },
1001
  {
1002
+ "epoch": 0.53,
1003
  "learning_rate": 7e-05,
1004
+ "loss": 0.3642,
1005
  "step": 16600
1006
  },
1007
  {
1008
+ "epoch": 0.54,
1009
  "learning_rate": 7e-05,
1010
+ "loss": 0.3631,
1011
  "step": 16700
1012
  },
1013
  {
1014
+ "epoch": 0.54,
1015
  "learning_rate": 7e-05,
1016
+ "loss": 0.3665,
1017
  "step": 16800
1018
  },
1019
  {
1020
+ "epoch": 0.54,
1021
  "learning_rate": 7e-05,
1022
+ "loss": 0.3725,
1023
  "step": 16900
1024
  },
1025
  {
1026
+ "epoch": 0.55,
1027
  "learning_rate": 7e-05,
1028
+ "loss": 0.3726,
1029
  "step": 17000
1030
  },
1031
  {
1032
+ "epoch": 0.55,
1033
  "learning_rate": 7e-05,
1034
+ "loss": 0.3464,
1035
  "step": 17100
1036
  },
1037
  {
1038
+ "epoch": 0.55,
1039
  "learning_rate": 7e-05,
1040
+ "loss": 0.3689,
1041
  "step": 17200
1042
  },
1043
  {
1044
+ "epoch": 0.56,
1045
  "learning_rate": 7e-05,
1046
+ "loss": 0.3635,
1047
  "step": 17300
1048
  },
1049
  {
1050
+ "epoch": 0.56,
1051
  "learning_rate": 7e-05,
1052
+ "loss": 0.3618,
1053
  "step": 17400
1054
  },
1055
  {
1056
+ "epoch": 0.56,
1057
  "learning_rate": 7e-05,
1058
+ "loss": 0.3706,
1059
  "step": 17500
1060
  },
1061
  {
1062
+ "epoch": 0.57,
1063
  "learning_rate": 7e-05,
1064
+ "loss": 0.3636,
1065
  "step": 17600
1066
  },
1067
  {
1068
+ "epoch": 0.57,
1069
  "learning_rate": 7e-05,
1070
+ "loss": 0.3652,
1071
  "step": 17700
1072
  },
1073
  {
1074
+ "epoch": 0.57,
1075
  "learning_rate": 7e-05,
1076
+ "loss": 0.3519,
1077
  "step": 17800
1078
  },
1079
  {
1080
+ "epoch": 0.58,
1081
  "learning_rate": 7e-05,
1082
+ "loss": 0.3633,
1083
  "step": 17900
1084
  },
1085
  {
1086
+ "epoch": 0.58,
1087
  "learning_rate": 7e-05,
1088
+ "loss": 0.3798,
1089
  "step": 18000
1090
  },
1091
  {
1092
+ "epoch": 0.58,
1093
  "learning_rate": 7e-05,
1094
+ "loss": 0.3414,
1095
  "step": 18100
1096
  },
1097
  {
1098
+ "epoch": 0.59,
1099
  "learning_rate": 7e-05,
1100
+ "loss": 0.3405,
1101
  "step": 18200
1102
  },
1103
  {
1104
+ "epoch": 0.59,
1105
  "learning_rate": 7e-05,
1106
+ "loss": 0.361,
1107
  "step": 18300
1108
  },
1109
  {
1110
+ "epoch": 0.59,
1111
  "learning_rate": 7e-05,
1112
+ "loss": 0.3777,
1113
  "step": 18400
1114
  },
1115
  {
1116
+ "epoch": 0.6,
1117
  "learning_rate": 7e-05,
1118
+ "loss": 0.3564,
1119
  "step": 18500
1120
  },
1121
  {
1122
+ "epoch": 0.6,
1123
  "learning_rate": 7e-05,
1124
+ "loss": 0.3567,
1125
  "step": 18600
1126
  },
1127
  {
1128
+ "epoch": 0.6,
1129
  "learning_rate": 7e-05,
1130
+ "loss": 0.3654,
1131
  "step": 18700
1132
  },
1133
  {
1134
+ "epoch": 0.61,
1135
  "learning_rate": 7e-05,
1136
+ "loss": 0.3418,
1137
  "step": 18800
1138
  },
1139
  {
1140
+ "epoch": 0.61,
1141
  "learning_rate": 7e-05,
1142
+ "loss": 0.3553,
1143
  "step": 18900
1144
  },
1145
  {
1146
+ "epoch": 0.61,
1147
  "learning_rate": 7e-05,
1148
+ "loss": 0.3686,
1149
  "step": 19000
1150
  },
1151
  {
1152
+ "epoch": 0.62,
1153
  "learning_rate": 7e-05,
1154
+ "loss": 0.3579,
1155
  "step": 19100
1156
  },
1157
  {
1158
+ "epoch": 0.62,
1159
  "learning_rate": 7e-05,
1160
+ "loss": 0.3597,
1161
  "step": 19200
1162
  },
1163
  {
1164
+ "epoch": 0.62,
1165
  "learning_rate": 7e-05,
1166
+ "loss": 0.3524,
1167
  "step": 19300
1168
  },
1169
  {
1170
+ "epoch": 0.62,
1171
  "learning_rate": 7e-05,
1172
+ "loss": 0.3459,
1173
  "step": 19400
1174
  },
1175
  {
1176
+ "epoch": 0.63,
1177
  "learning_rate": 7e-05,
1178
+ "loss": 0.3647,
1179
  "step": 19500
1180
  },
1181
  {
1182
+ "epoch": 0.63,
1183
  "learning_rate": 7e-05,
1184
+ "loss": 0.3553,
1185
  "step": 19600
1186
  },
1187
  {
1188
+ "epoch": 0.63,
1189
  "learning_rate": 7e-05,
1190
+ "loss": 0.3351,
1191
  "step": 19700
1192
  },
1193
  {
1194
+ "epoch": 0.64,
1195
  "learning_rate": 7e-05,
1196
+ "loss": 0.3583,
1197
  "step": 19800
1198
  },
1199
  {
1200
+ "epoch": 0.64,
1201
  "learning_rate": 7e-05,
1202
+ "loss": 0.3648,
1203
  "step": 19900
1204
  },
1205
  {
1206
+ "epoch": 0.64,
1207
  "learning_rate": 7e-05,
1208
+ "loss": 0.3533,
1209
  "step": 20000
1210
  },
1211
  {
1212
+ "epoch": 0.65,
1213
  "learning_rate": 7e-05,
1214
+ "loss": 0.3432,
1215
  "step": 20100
1216
  },
1217
  {
1218
+ "epoch": 0.65,
1219
  "learning_rate": 7e-05,
1220
+ "loss": 0.3592,
1221
  "step": 20200
1222
  },
1223
  {
1224
+ "epoch": 0.65,
1225
  "learning_rate": 7e-05,
1226
+ "loss": 0.3663,
1227
  "step": 20300
1228
  },
1229
  {
1230
+ "epoch": 0.66,
1231
  "learning_rate": 7e-05,
1232
+ "loss": 0.3716,
1233
  "step": 20400
1234
  },
1235
  {
1236
+ "epoch": 0.66,
1237
  "learning_rate": 7e-05,
1238
+ "loss": 0.3461,
1239
  "step": 20500
1240
  },
1241
  {
1242
+ "epoch": 0.66,
1243
  "learning_rate": 7e-05,
1244
+ "loss": 0.3683,
1245
  "step": 20600
1246
  },
1247
  {
1248
+ "epoch": 0.67,
1249
  "learning_rate": 7e-05,
1250
+ "loss": 0.3581,
1251
  "step": 20700
1252
  },
1253
  {
1254
+ "epoch": 0.67,
1255
  "learning_rate": 7e-05,
1256
+ "loss": 0.358,
1257
  "step": 20800
1258
  },
1259
  {
1260
+ "epoch": 0.67,
1261
  "learning_rate": 7e-05,
1262
+ "loss": 0.3629,
1263
  "step": 20900
1264
  },
1265
  {
1266
+ "epoch": 0.68,
1267
  "learning_rate": 7e-05,
1268
+ "loss": 0.3568,
1269
  "step": 21000
1270
  },
1271
  {
1272
+ "epoch": 0.68,
1273
  "learning_rate": 7e-05,
1274
+ "loss": 0.3432,
1275
  "step": 21100
1276
  },
1277
  {
1278
+ "epoch": 0.68,
1279
  "learning_rate": 7e-05,
1280
+ "loss": 0.3688,
1281
  "step": 21200
1282
  },
1283
  {
1284
+ "epoch": 0.69,
1285
  "learning_rate": 7e-05,
1286
+ "loss": 0.347,
1287
  "step": 21300
1288
  },
1289
  {
1290
+ "epoch": 0.69,
1291
  "learning_rate": 7e-05,
1292
+ "loss": 0.3672,
1293
  "step": 21400
1294
  },
1295
  {
1296
+ "epoch": 0.69,
1297
  "learning_rate": 7e-05,
1298
+ "loss": 0.3509,
1299
  "step": 21500
1300
  },
1301
  {
1302
+ "epoch": 0.7,
1303
  "learning_rate": 7e-05,
1304
+ "loss": 0.3538,
1305
  "step": 21600
1306
  },
1307
  {
1308
+ "epoch": 0.7,
1309
  "learning_rate": 7e-05,
1310
+ "loss": 0.3493,
1311
  "step": 21700
1312
  },
1313
  {
1314
+ "epoch": 0.7,
1315
  "learning_rate": 7e-05,
1316
+ "loss": 0.3354,
1317
  "step": 21800
1318
  },
1319
  {
1320
+ "epoch": 0.71,
1321
  "learning_rate": 7e-05,
1322
+ "loss": 0.3656,
1323
  "step": 21900
1324
  },
1325
  {
1326
+ "epoch": 0.71,
1327
  "learning_rate": 7e-05,
1328
+ "loss": 0.3581,
1329
  "step": 22000
1330
  },
1331
  {
1332
+ "epoch": 0.71,
1333
  "learning_rate": 7e-05,
1334
+ "loss": 0.3547,
1335
  "step": 22100
1336
  },
1337
  {
1338
+ "epoch": 0.72,
1339
  "learning_rate": 7e-05,
1340
+ "loss": 0.3526,
1341
  "step": 22200
1342
  },
1343
  {
1344
+ "epoch": 0.72,
1345
  "learning_rate": 7e-05,
1346
+ "loss": 0.35,
1347
  "step": 22300
1348
  },
1349
  {
1350
+ "epoch": 0.72,
1351
  "learning_rate": 7e-05,
1352
+ "loss": 0.3452,
1353
  "step": 22400
1354
  },
1355
  {
1356
+ "epoch": 0.72,
1357
  "learning_rate": 7e-05,
1358
+ "loss": 0.3586,
1359
  "step": 22500
1360
  },
1361
  {
1362
+ "epoch": 0.73,
1363
  "learning_rate": 7e-05,
1364
+ "loss": 0.3598,
1365
  "step": 22600
1366
  },
1367
  {
1368
+ "epoch": 0.73,
1369
  "learning_rate": 7e-05,
1370
+ "loss": 0.3608,
1371
  "step": 22700
1372
  },
1373
  {
1374
+ "epoch": 0.73,
1375
  "learning_rate": 7e-05,
1376
+ "loss": 0.3521,
1377
  "step": 22800
1378
  },
1379
  {
1380
+ "epoch": 0.74,
1381
  "learning_rate": 7e-05,
1382
+ "loss": 0.3393,
1383
  "step": 22900
1384
  },
1385
  {
1386
+ "epoch": 0.74,
1387
  "learning_rate": 7e-05,
1388
+ "loss": 0.3425,
1389
  "step": 23000
1390
  },
1391
  {
1392
+ "epoch": 0.74,
1393
  "learning_rate": 7e-05,
1394
+ "loss": 0.3533,
1395
  "step": 23100
1396
  },
1397
  {
1398
+ "epoch": 0.75,
1399
  "learning_rate": 7e-05,
1400
+ "loss": 0.3337,
1401
  "step": 23200
1402
  },
1403
  {
1404
+ "epoch": 0.75,
1405
  "learning_rate": 7e-05,
1406
+ "loss": 0.3693,
1407
  "step": 23300
1408
  },
1409
  {
1410
+ "epoch": 0.75,
1411
  "learning_rate": 7e-05,
1412
+ "loss": 0.3425,
1413
  "step": 23400
1414
  },
1415
  {
1416
+ "epoch": 0.76,
1417
  "learning_rate": 7e-05,
1418
+ "loss": 0.3775,
1419
  "step": 23500
1420
  },
1421
  {
1422
+ "epoch": 0.76,
1423
  "learning_rate": 7e-05,
1424
+ "loss": 0.3319,
1425
  "step": 23600
1426
  },
1427
  {
1428
+ "epoch": 0.76,
1429
  "learning_rate": 7e-05,
1430
+ "loss": 0.3546,
1431
  "step": 23700
1432
  },
1433
  {
1434
+ "epoch": 0.77,
1435
  "learning_rate": 7e-05,
1436
+ "loss": 0.356,
1437
  "step": 23800
1438
  },
1439
  {
1440
+ "epoch": 0.77,
1441
  "learning_rate": 7e-05,
1442
+ "loss": 0.3533,
1443
  "step": 23900
1444
  },
1445
  {
1446
+ "epoch": 0.77,
1447
  "learning_rate": 7e-05,
1448
+ "loss": 0.3589,
1449
  "step": 24000
1450
  },
1451
  {
1452
+ "epoch": 0.78,
1453
  "learning_rate": 7e-05,
1454
+ "loss": 0.3502,
1455
  "step": 24100
1456
  },
1457
  {
1458
+ "epoch": 0.78,
1459
  "learning_rate": 7e-05,
1460
+ "loss": 0.3625,
1461
  "step": 24200
1462
  },
1463
  {
1464
+ "epoch": 0.78,
1465
  "learning_rate": 7e-05,
1466
+ "loss": 0.3531,
1467
  "step": 24300
1468
  },
1469
  {
1470
+ "epoch": 0.79,
1471
  "learning_rate": 7e-05,
1472
+ "loss": 0.3592,
1473
  "step": 24400
1474
  },
1475
  {
1476
+ "epoch": 0.79,
1477
  "learning_rate": 7e-05,
1478
+ "loss": 0.3519,
1479
  "step": 24500
1480
  },
1481
  {
1482
+ "epoch": 0.79,
1483
  "learning_rate": 7e-05,
1484
+ "loss": 0.3729,
1485
  "step": 24600
1486
  },
1487
  {
1488
+ "epoch": 0.8,
1489
  "learning_rate": 7e-05,
1490
+ "loss": 0.3504,
1491
  "step": 24700
1492
  },
1493
  {
1494
+ "epoch": 0.8,
1495
  "learning_rate": 7e-05,
1496
+ "loss": 0.3532,
1497
  "step": 24800
1498
  },
1499
  {
1500
+ "epoch": 0.8,
1501
  "learning_rate": 7e-05,
1502
+ "loss": 0.3552,
1503
  "step": 24900
1504
  },
1505
  {
1506
+ "epoch": 0.81,
1507
  "learning_rate": 7e-05,
1508
+ "loss": 0.3739,
1509
  "step": 25000
1510
  },
1511
  {
1512
+ "epoch": 0.81,
1513
  "learning_rate": 7e-05,
1514
+ "loss": 0.3445,
1515
  "step": 25100
1516
  },
1517
  {
1518
+ "epoch": 0.81,
1519
  "learning_rate": 7e-05,
1520
+ "loss": 0.3482,
1521
  "step": 25200
1522
  },
1523
  {
1524
+ "epoch": 0.82,
 
 
 
 
 
 
1525
  "learning_rate": 7e-05,
1526
  "loss": 0.3624,
1527
+ "step": 25300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1528
  },
1529
  {
1530
  "epoch": 0.82,
1531
  "learning_rate": 7e-05,
1532
+ "loss": 0.343,
1533
+ "step": 25400
1534
  },
1535
  {
1536
  "epoch": 0.82,
1537
  "learning_rate": 7e-05,
1538
+ "loss": 0.3656,
1539
+ "step": 25500
1540
  },
1541
  {
1542
  "epoch": 0.82,
1543
  "learning_rate": 7e-05,
1544
+ "loss": 0.3474,
1545
+ "step": 25600
 
 
 
 
 
 
1546
  },
1547
  {
1548
  "epoch": 0.83,
1549
  "learning_rate": 7e-05,
1550
+ "loss": 0.3582,
1551
+ "step": 25700
1552
  },
1553
  {
1554
  "epoch": 0.83,
1555
  "learning_rate": 7e-05,
1556
+ "loss": 0.3675,
1557
+ "step": 25800
1558
  },
1559
  {
1560
  "epoch": 0.83,
1561
  "learning_rate": 7e-05,
1562
+ "loss": 0.3547,
1563
+ "step": 25900
1564
  },
1565
  {
1566
  "epoch": 0.84,
1567
  "learning_rate": 7e-05,
1568
+ "loss": 0.3549,
1569
+ "step": 26000
1570
  },
1571
  {
1572
  "epoch": 0.84,
1573
  "learning_rate": 7e-05,
1574
+ "loss": 0.3592,
1575
+ "step": 26100
1576
  },
1577
  {
1578
  "epoch": 0.84,
1579
  "learning_rate": 7e-05,
1580
+ "loss": 0.3546,
1581
+ "step": 26200
 
 
 
 
 
 
1582
  },
1583
  {
1584
  "epoch": 0.85,
1585
  "learning_rate": 7e-05,
1586
+ "loss": 0.3508,
1587
+ "step": 26300
1588
  },
1589
  {
1590
  "epoch": 0.85,
1591
  "learning_rate": 7e-05,
1592
+ "loss": 0.339,
1593
+ "step": 26400
1594
  },
1595
  {
1596
  "epoch": 0.85,
1597
  "learning_rate": 7e-05,
1598
+ "loss": 0.3557,
1599
+ "step": 26500
1600
  },
1601
  {
1602
  "epoch": 0.86,
1603
  "learning_rate": 7e-05,
1604
+ "loss": 0.3742,
1605
+ "step": 26600
1606
  },
1607
  {
1608
  "epoch": 0.86,
1609
  "learning_rate": 7e-05,
1610
+ "loss": 0.3491,
1611
+ "step": 26700
1612
  },
1613
  {
1614
  "epoch": 0.86,
1615
  "learning_rate": 7e-05,
1616
+ "loss": 0.3663,
1617
+ "step": 26800
 
 
 
 
 
 
1618
  },
1619
  {
1620
  "epoch": 0.87,
1621
  "learning_rate": 7e-05,
1622
+ "loss": 0.3625,
1623
+ "step": 26900
1624
  },
1625
  {
1626
  "epoch": 0.87,
1627
  "learning_rate": 7e-05,
1628
+ "loss": 0.3421,
1629
+ "step": 27000
1630
  },
1631
  {
1632
  "epoch": 0.87,
1633
  "learning_rate": 7e-05,
1634
+ "loss": 0.3412,
1635
+ "step": 27100
1636
  },
1637
  {
1638
  "epoch": 0.88,
1639
  "learning_rate": 7e-05,
1640
+ "loss": 0.3576,
1641
+ "step": 27200
1642
  },
1643
  {
1644
  "epoch": 0.88,
1645
  "learning_rate": 7e-05,
1646
+ "loss": 0.3635,
1647
+ "step": 27300
1648
  },
1649
  {
1650
  "epoch": 0.88,
1651
  "learning_rate": 7e-05,
1652
+ "loss": 0.3554,
1653
+ "step": 27400
 
 
 
 
 
 
1654
  },
1655
  {
1656
  "epoch": 0.89,
1657
  "learning_rate": 7e-05,
1658
+ "loss": 0.368,
1659
+ "step": 27500
1660
  },
1661
  {
1662
  "epoch": 0.89,
1663
  "learning_rate": 7e-05,
1664
+ "loss": 0.3524,
1665
+ "step": 27600
1666
  },
1667
  {
1668
  "epoch": 0.89,
1669
  "learning_rate": 7e-05,
1670
+ "loss": 0.3484,
1671
+ "step": 27700
1672
  },
1673
  {
1674
  "epoch": 0.9,
1675
  "learning_rate": 7e-05,
1676
+ "loss": 0.3473,
1677
+ "step": 27800
1678
  },
1679
  {
1680
  "epoch": 0.9,
1681
  "learning_rate": 7e-05,
1682
+ "loss": 0.3328,
1683
+ "step": 27900
1684
  },
1685
  {
1686
  "epoch": 0.9,
1687
  "learning_rate": 7e-05,
1688
+ "loss": 0.3554,
1689
+ "step": 28000
1690
  },
1691
  {
1692
  "epoch": 0.91,
1693
  "learning_rate": 7e-05,
1694
+ "loss": 0.3536,
1695
+ "step": 28100
1696
  },
1697
  {
1698
  "epoch": 0.91,
1699
  "learning_rate": 7e-05,
1700
+ "loss": 0.3424,
1701
+ "step": 28200
1702
  },
1703
  {
1704
  "epoch": 0.91,
1705
  "learning_rate": 7e-05,
1706
+ "loss": 0.3631,
1707
+ "step": 28300
1708
  },
1709
  {
1710
  "epoch": 0.91,
1711
  "learning_rate": 7e-05,
1712
+ "loss": 0.333,
1713
+ "step": 28400
1714
  },
1715
  {
1716
  "epoch": 0.92,
1717
  "learning_rate": 7e-05,
1718
+ "loss": 0.3468,
1719
+ "step": 28500
1720
  },
1721
  {
1722
  "epoch": 0.92,
1723
  "learning_rate": 7e-05,
1724
+ "loss": 0.3487,
1725
+ "step": 28600
1726
  },
1727
  {
1728
  "epoch": 0.92,
1729
  "learning_rate": 7e-05,
1730
+ "loss": 0.3542,
1731
+ "step": 28700
 
 
 
 
 
 
1732
  },
1733
  {
1734
  "epoch": 0.93,
1735
  "learning_rate": 7e-05,
1736
+ "loss": 0.3579,
1737
+ "step": 28800
1738
  },
1739
  {
1740
  "epoch": 0.93,
1741
  "learning_rate": 7e-05,
1742
+ "loss": 0.3641,
1743
+ "step": 28900
1744
  },
1745
  {
1746
  "epoch": 0.93,
1747
  "learning_rate": 7e-05,
1748
+ "loss": 0.3729,
1749
+ "step": 29000
 
 
 
 
 
 
1750
  },
1751
  {
1752
  "epoch": 0.94,
1753
  "learning_rate": 7e-05,
1754
+ "loss": 0.3395,
1755
+ "step": 29100
1756
  },
1757
  {
1758
  "epoch": 0.94,
1759
  "learning_rate": 7e-05,
1760
+ "loss": 0.3572,
1761
+ "step": 29200
1762
  },
1763
  {
1764
  "epoch": 0.94,
1765
  "learning_rate": 7e-05,
1766
+ "loss": 0.3525,
1767
+ "step": 29300
1768
  },
1769
  {
1770
  "epoch": 0.95,
1771
  "learning_rate": 7e-05,
1772
+ "loss": 0.357,
1773
+ "step": 29400
1774
  },
1775
  {
1776
  "epoch": 0.95,
1777
  "learning_rate": 7e-05,
1778
+ "loss": 0.3647,
1779
+ "step": 29500
1780
  },
1781
  {
1782
  "epoch": 0.95,
1783
  "learning_rate": 7e-05,
1784
+ "loss": 0.3675,
1785
+ "step": 29600
 
 
 
 
 
 
1786
  },
1787
  {
1788
  "epoch": 0.96,
1789
  "learning_rate": 7e-05,
1790
+ "loss": 0.365,
1791
+ "step": 29700
1792
  },
1793
  {
1794
  "epoch": 0.96,
1795
  "learning_rate": 7e-05,
1796
+ "loss": 0.3439,
1797
+ "step": 29800
1798
  },
1799
  {
1800
  "epoch": 0.96,
1801
  "learning_rate": 7e-05,
1802
+ "loss": 0.3501,
1803
+ "step": 29900
1804
  },
1805
  {
1806
  "epoch": 0.97,
1807
  "learning_rate": 7e-05,
1808
+ "loss": 0.3552,
1809
+ "step": 30000
1810
  },
1811
  {
1812
  "epoch": 0.97,
1813
  "learning_rate": 7e-05,
1814
+ "loss": 0.351,
1815
+ "step": 30100
1816
  },
1817
  {
1818
  "epoch": 0.97,
1819
  "learning_rate": 7e-05,
1820
+ "loss": 0.3591,
1821
+ "step": 30200
 
 
 
 
 
 
1822
  },
1823
  {
1824
  "epoch": 0.98,
1825
  "learning_rate": 7e-05,
1826
+ "loss": 0.3548,
1827
+ "step": 30300
1828
  },
1829
  {
1830
  "epoch": 0.98,
1831
  "learning_rate": 7e-05,
1832
+ "loss": 0.3608,
1833
+ "step": 30400
1834
  },
1835
  {
1836
  "epoch": 0.98,
1837
  "learning_rate": 7e-05,
1838
+ "loss": 0.3504,
1839
+ "step": 30500
1840
  },
1841
  {
1842
  "epoch": 0.99,
1843
  "learning_rate": 7e-05,
1844
+ "loss": 0.3514,
1845
+ "step": 30600
1846
  },
1847
  {
1848
  "epoch": 0.99,
1849
  "learning_rate": 7e-05,
1850
+ "loss": 0.3456,
1851
+ "step": 30700
1852
  },
1853
  {
1854
  "epoch": 0.99,
1855
  "learning_rate": 7e-05,
1856
+ "loss": 0.343,
1857
+ "step": 30800
1858
  },
1859
  {
1860
  "epoch": 1.0,
1861
  "learning_rate": 7e-05,
1862
+ "loss": 0.3548,
1863
+ "step": 30900
1864
  },
1865
  {
1866
  "epoch": 1.0,
1867
  "learning_rate": 7e-05,
1868
+ "loss": 0.3601,
1869
+ "step": 31000
1870
  },
1871
  {
1872
  "epoch": 1.0,
1873
+ "eval_gen_len": 24.568762132298044,
1874
+ "eval_loss": 0.3032413423061371,
1875
+ "eval_rouge1": 86.5313,
1876
+ "eval_rouge2": 75.3351,
1877
+ "eval_rougeL": 85.9565,
1878
+ "eval_rougeLsum": 85.9785,
1879
+ "eval_runtime": 1478.6198,
1880
+ "eval_samples_per_second": 4.529,
1881
+ "eval_steps_per_second": 1.133,
1882
+ "step": 31042
1883
  },
1884
  {
1885
  "epoch": 1.0,
1886
+ "step": 31042,
1887
+ "total_flos": 8.055925371907523e+17,
1888
+ "train_loss": 0.35858034201253897,
1889
+ "train_runtime": 38927.7212,
1890
+ "train_samples_per_second": 25.518,
1891
+ "train_steps_per_second": 0.797
1892
  }
1893
  ],
1894
  "logging_steps": 100,
1895
+ "max_steps": 31042,
1896
  "num_input_tokens_seen": 0,
1897
  "num_train_epochs": 1,
1898
  "save_steps": 2500,
1899
+ "total_flos": 8.055925371907523e+17,
1900
  "trial_name": null,
1901
  "trial_params": null
1902
  }