TinyPixel commited on
Commit
9f851ff
·
1 Parent(s): e5ccd16

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. adapter_model.bin +1 -1
  2. optimizer.pt +2 -2
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +125 -725
  6. training_args.bin +1 -1
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0173e8d640e3b97bf023b7dd7ac27385a039124097c4b6511ee3d2757253f468
3
  size 134264397
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:268392da518a9d270d6334c47b13de4c67de1c1e83c72ceba6580a4c6489b9e8
3
  size 134264397
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d98fb081f8ae8adbfdf66570c1e22ebe2e127ae4f69c688250445cdf755b504
3
- size 268514565
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f34399e6b00072f7f84227fb1e87aea9a737b5245f3500801be9a3412d5ebdb
3
+ size 268514437
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d0d99dfd7d57ed64b060f6f59d69104e2ef41bc6504780895e7c66b7cf413d0
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d2ca86f5a045f8c3244ae93f984c7fb23597360d07087fc81ba1e92306a8eb
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:468a8363eb81d314ece26266458e95441c269841fe873ca22d23f675428e66a4
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4880f3f40e7ab04d8efce4fc909327351c41d3ccafff780222b4189f527a9a0
3
  size 627
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.17627940285352284,
5
  "eval_steps": 500,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,1209 +11,609 @@
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 2e-05,
14
- "loss": 1.9869,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.0,
19
  "learning_rate": 2e-05,
20
- "loss": 2.3031,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 2e-05,
26
- "loss": 2.1553,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 2e-05,
32
- "loss": 1.9117,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 2e-05,
38
- "loss": 2.3573,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.01,
43
  "learning_rate": 2e-05,
44
- "loss": 2.316,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.01,
49
  "learning_rate": 2e-05,
50
- "loss": 2.1818,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.01,
55
  "learning_rate": 2e-05,
56
- "loss": 2.2639,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.01,
61
  "learning_rate": 2e-05,
62
- "loss": 1.7835,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.01,
67
  "learning_rate": 2e-05,
68
- "loss": 2.2783,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.01,
73
  "learning_rate": 2e-05,
74
- "loss": 2.0463,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.01,
79
  "learning_rate": 2e-05,
80
- "loss": 1.8931,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.01,
85
  "learning_rate": 2e-05,
86
- "loss": 1.9732,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.01,
91
  "learning_rate": 2e-05,
92
- "loss": 1.9932,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.01,
97
  "learning_rate": 2e-05,
98
- "loss": 2.0808,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 0.01,
103
  "learning_rate": 2e-05,
104
- "loss": 1.9768,
105
  "step": 32
106
  },
107
  {
108
- "epoch": 0.01,
109
  "learning_rate": 2e-05,
110
- "loss": 1.808,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 0.02,
115
  "learning_rate": 2e-05,
116
- "loss": 1.7387,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 0.02,
121
  "learning_rate": 2e-05,
122
- "loss": 1.9784,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 0.02,
127
  "learning_rate": 2e-05,
128
- "loss": 1.8164,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 0.02,
133
  "learning_rate": 2e-05,
134
- "loss": 1.7091,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 0.02,
139
  "learning_rate": 2e-05,
140
- "loss": 1.7938,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 0.02,
145
  "learning_rate": 2e-05,
146
- "loss": 1.973,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 0.02,
151
  "learning_rate": 2e-05,
152
- "loss": 2.1455,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 0.02,
157
  "learning_rate": 2e-05,
158
- "loss": 1.9081,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 0.02,
163
  "learning_rate": 2e-05,
164
- "loss": 1.7108,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 0.02,
169
  "learning_rate": 2e-05,
170
- "loss": 1.8766,
171
  "step": 54
172
  },
173
  {
174
- "epoch": 0.02,
175
  "learning_rate": 2e-05,
176
- "loss": 1.8224,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 0.03,
181
  "learning_rate": 2e-05,
182
- "loss": 1.6724,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 0.03,
187
  "learning_rate": 2e-05,
188
- "loss": 1.9838,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 0.03,
193
  "learning_rate": 2e-05,
194
- "loss": 1.5954,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 0.03,
199
  "learning_rate": 2e-05,
200
- "loss": 1.6449,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 0.03,
205
  "learning_rate": 2e-05,
206
- "loss": 1.6045,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 0.03,
211
  "learning_rate": 2e-05,
212
- "loss": 1.7874,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 0.03,
217
  "learning_rate": 2e-05,
218
- "loss": 1.5796,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 0.03,
223
  "learning_rate": 2e-05,
224
- "loss": 1.7467,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 0.03,
229
  "learning_rate": 2e-05,
230
- "loss": 1.6555,
231
  "step": 74
232
  },
233
  {
234
- "epoch": 0.03,
235
  "learning_rate": 2e-05,
236
- "loss": 1.6531,
237
  "step": 76
238
  },
239
  {
240
- "epoch": 0.03,
241
  "learning_rate": 2e-05,
242
- "loss": 1.7604,
243
  "step": 78
244
  },
245
  {
246
  "epoch": 0.04,
247
  "learning_rate": 2e-05,
248
- "loss": 1.8428,
249
  "step": 80
250
  },
251
  {
252
  "epoch": 0.04,
253
  "learning_rate": 2e-05,
254
- "loss": 1.6431,
255
  "step": 82
256
  },
257
  {
258
  "epoch": 0.04,
259
  "learning_rate": 2e-05,
260
- "loss": 1.6619,
261
  "step": 84
262
  },
263
  {
264
  "epoch": 0.04,
265
  "learning_rate": 2e-05,
266
- "loss": 1.5432,
267
  "step": 86
268
  },
269
  {
270
  "epoch": 0.04,
271
  "learning_rate": 2e-05,
272
- "loss": 1.6974,
273
  "step": 88
274
  },
275
  {
276
  "epoch": 0.04,
277
  "learning_rate": 2e-05,
278
- "loss": 1.5422,
279
  "step": 90
280
  },
281
  {
282
  "epoch": 0.04,
283
  "learning_rate": 2e-05,
284
- "loss": 1.6739,
285
  "step": 92
286
  },
287
  {
288
  "epoch": 0.04,
289
  "learning_rate": 2e-05,
290
- "loss": 1.5725,
291
  "step": 94
292
  },
293
  {
294
  "epoch": 0.04,
295
  "learning_rate": 2e-05,
296
- "loss": 1.6482,
297
  "step": 96
298
  },
299
  {
300
- "epoch": 0.04,
301
  "learning_rate": 2e-05,
302
- "loss": 1.7447,
303
  "step": 98
304
  },
305
  {
306
- "epoch": 0.04,
307
  "learning_rate": 2e-05,
308
- "loss": 1.7923,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 0.04,
313
  "learning_rate": 2e-05,
314
- "loss": 1.6704,
315
  "step": 102
316
  },
317
  {
318
  "epoch": 0.05,
319
  "learning_rate": 2e-05,
320
- "loss": 1.6213,
321
  "step": 104
322
  },
323
  {
324
  "epoch": 0.05,
325
  "learning_rate": 2e-05,
326
- "loss": 1.5271,
327
  "step": 106
328
  },
329
  {
330
  "epoch": 0.05,
331
  "learning_rate": 2e-05,
332
- "loss": 1.5259,
333
  "step": 108
334
  },
335
  {
336
  "epoch": 0.05,
337
  "learning_rate": 2e-05,
338
- "loss": 1.7935,
339
  "step": 110
340
  },
341
  {
342
  "epoch": 0.05,
343
  "learning_rate": 2e-05,
344
- "loss": 1.7862,
345
  "step": 112
346
  },
347
  {
348
  "epoch": 0.05,
349
  "learning_rate": 2e-05,
350
- "loss": 1.6882,
351
  "step": 114
352
  },
353
  {
354
  "epoch": 0.05,
355
  "learning_rate": 2e-05,
356
- "loss": 1.672,
357
  "step": 116
358
  },
359
  {
360
  "epoch": 0.05,
361
  "learning_rate": 2e-05,
362
- "loss": 1.4945,
363
  "step": 118
364
  },
365
  {
366
- "epoch": 0.05,
367
  "learning_rate": 2e-05,
368
- "loss": 1.7371,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 0.05,
373
  "learning_rate": 2e-05,
374
- "loss": 1.6145,
375
  "step": 122
376
  },
377
  {
378
- "epoch": 0.05,
379
  "learning_rate": 2e-05,
380
- "loss": 1.5366,
381
  "step": 124
382
  },
383
  {
384
  "epoch": 0.06,
385
  "learning_rate": 2e-05,
386
- "loss": 1.5875,
387
  "step": 126
388
  },
389
  {
390
  "epoch": 0.06,
391
  "learning_rate": 2e-05,
392
- "loss": 1.5924,
393
  "step": 128
394
  },
395
  {
396
  "epoch": 0.06,
397
  "learning_rate": 2e-05,
398
- "loss": 1.7048,
399
  "step": 130
400
  },
401
  {
402
  "epoch": 0.06,
403
  "learning_rate": 2e-05,
404
- "loss": 1.5899,
405
  "step": 132
406
  },
407
  {
408
  "epoch": 0.06,
409
  "learning_rate": 2e-05,
410
- "loss": 1.7091,
411
  "step": 134
412
  },
413
  {
414
  "epoch": 0.06,
415
  "learning_rate": 2e-05,
416
- "loss": 1.6705,
417
  "step": 136
418
  },
419
  {
420
  "epoch": 0.06,
421
  "learning_rate": 2e-05,
422
- "loss": 1.7555,
423
  "step": 138
424
  },
425
  {
426
  "epoch": 0.06,
427
  "learning_rate": 2e-05,
428
- "loss": 1.7616,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 0.06,
433
  "learning_rate": 2e-05,
434
- "loss": 1.6282,
435
  "step": 142
436
  },
437
  {
438
- "epoch": 0.06,
439
  "learning_rate": 2e-05,
440
- "loss": 1.6069,
441
  "step": 144
442
  },
443
  {
444
- "epoch": 0.06,
445
  "learning_rate": 2e-05,
446
- "loss": 1.5998,
447
  "step": 146
448
  },
449
  {
450
  "epoch": 0.07,
451
  "learning_rate": 2e-05,
452
- "loss": 1.6762,
453
  "step": 148
454
  },
455
  {
456
  "epoch": 0.07,
457
  "learning_rate": 2e-05,
458
- "loss": 1.7779,
459
  "step": 150
460
  },
461
  {
462
  "epoch": 0.07,
463
  "learning_rate": 2e-05,
464
- "loss": 1.7046,
465
  "step": 152
466
  },
467
  {
468
  "epoch": 0.07,
469
  "learning_rate": 2e-05,
470
- "loss": 1.6704,
471
  "step": 154
472
  },
473
  {
474
  "epoch": 0.07,
475
  "learning_rate": 2e-05,
476
- "loss": 1.5523,
477
  "step": 156
478
  },
479
  {
480
  "epoch": 0.07,
481
  "learning_rate": 2e-05,
482
- "loss": 1.6883,
483
  "step": 158
484
  },
485
  {
486
  "epoch": 0.07,
487
  "learning_rate": 2e-05,
488
- "loss": 1.5095,
489
  "step": 160
490
  },
491
  {
492
  "epoch": 0.07,
493
  "learning_rate": 2e-05,
494
- "loss": 1.5435,
495
  "step": 162
496
  },
497
  {
498
- "epoch": 0.07,
499
  "learning_rate": 2e-05,
500
- "loss": 1.8038,
501
  "step": 164
502
  },
503
  {
504
- "epoch": 0.07,
505
  "learning_rate": 2e-05,
506
- "loss": 1.5684,
507
  "step": 166
508
  },
509
  {
510
- "epoch": 0.07,
511
  "learning_rate": 2e-05,
512
- "loss": 1.7597,
513
  "step": 168
514
  },
515
  {
516
- "epoch": 0.07,
517
  "learning_rate": 2e-05,
518
- "loss": 1.7696,
519
  "step": 170
520
  },
521
  {
522
  "epoch": 0.08,
523
  "learning_rate": 2e-05,
524
- "loss": 1.7651,
525
  "step": 172
526
  },
527
  {
528
  "epoch": 0.08,
529
  "learning_rate": 2e-05,
530
- "loss": 1.4832,
531
  "step": 174
532
  },
533
  {
534
  "epoch": 0.08,
535
  "learning_rate": 2e-05,
536
- "loss": 1.6022,
537
  "step": 176
538
  },
539
  {
540
  "epoch": 0.08,
541
  "learning_rate": 2e-05,
542
- "loss": 1.82,
543
  "step": 178
544
  },
545
  {
546
  "epoch": 0.08,
547
  "learning_rate": 2e-05,
548
- "loss": 1.504,
549
  "step": 180
550
  },
551
  {
552
  "epoch": 0.08,
553
  "learning_rate": 2e-05,
554
- "loss": 1.6136,
555
  "step": 182
556
  },
557
  {
558
  "epoch": 0.08,
559
  "learning_rate": 2e-05,
560
- "loss": 1.5933,
561
  "step": 184
562
  },
563
  {
564
- "epoch": 0.08,
565
  "learning_rate": 2e-05,
566
- "loss": 1.6525,
567
  "step": 186
568
  },
569
  {
570
- "epoch": 0.08,
571
  "learning_rate": 2e-05,
572
- "loss": 1.6809,
573
  "step": 188
574
  },
575
  {
576
- "epoch": 0.08,
577
  "learning_rate": 2e-05,
578
- "loss": 1.4673,
579
  "step": 190
580
  },
581
  {
582
- "epoch": 0.08,
583
  "learning_rate": 2e-05,
584
- "loss": 1.4732,
585
  "step": 192
586
  },
587
  {
588
  "epoch": 0.09,
589
  "learning_rate": 2e-05,
590
- "loss": 1.6401,
591
  "step": 194
592
  },
593
  {
594
  "epoch": 0.09,
595
  "learning_rate": 2e-05,
596
- "loss": 1.4686,
597
  "step": 196
598
  },
599
  {
600
  "epoch": 0.09,
601
  "learning_rate": 2e-05,
602
- "loss": 1.6673,
603
  "step": 198
604
  },
605
  {
606
  "epoch": 0.09,
607
  "learning_rate": 2e-05,
608
- "loss": 1.5662,
609
  "step": 200
610
- },
611
- {
612
- "epoch": 0.09,
613
- "learning_rate": 2e-05,
614
- "loss": 1.3947,
615
- "step": 202
616
- },
617
- {
618
- "epoch": 0.09,
619
- "learning_rate": 2e-05,
620
- "loss": 1.6261,
621
- "step": 204
622
- },
623
- {
624
- "epoch": 0.09,
625
- "learning_rate": 2e-05,
626
- "loss": 1.7449,
627
- "step": 206
628
- },
629
- {
630
- "epoch": 0.09,
631
- "learning_rate": 2e-05,
632
- "loss": 1.514,
633
- "step": 208
634
- },
635
- {
636
- "epoch": 0.09,
637
- "learning_rate": 2e-05,
638
- "loss": 1.5812,
639
- "step": 210
640
- },
641
- {
642
- "epoch": 0.09,
643
- "learning_rate": 2e-05,
644
- "loss": 1.5474,
645
- "step": 212
646
- },
647
- {
648
- "epoch": 0.09,
649
- "learning_rate": 2e-05,
650
- "loss": 1.8105,
651
- "step": 214
652
- },
653
- {
654
- "epoch": 0.1,
655
- "learning_rate": 2e-05,
656
- "loss": 1.8234,
657
- "step": 216
658
- },
659
- {
660
- "epoch": 0.1,
661
- "learning_rate": 2e-05,
662
- "loss": 1.6536,
663
- "step": 218
664
- },
665
- {
666
- "epoch": 0.1,
667
- "learning_rate": 2e-05,
668
- "loss": 1.5396,
669
- "step": 220
670
- },
671
- {
672
- "epoch": 0.1,
673
- "learning_rate": 2e-05,
674
- "loss": 1.6499,
675
- "step": 222
676
- },
677
- {
678
- "epoch": 0.1,
679
- "learning_rate": 2e-05,
680
- "loss": 1.6298,
681
- "step": 224
682
- },
683
- {
684
- "epoch": 0.1,
685
- "learning_rate": 2e-05,
686
- "loss": 1.6265,
687
- "step": 226
688
- },
689
- {
690
- "epoch": 0.1,
691
- "learning_rate": 2e-05,
692
- "loss": 1.6794,
693
- "step": 228
694
- },
695
- {
696
- "epoch": 0.1,
697
- "learning_rate": 2e-05,
698
- "loss": 1.6706,
699
- "step": 230
700
- },
701
- {
702
- "epoch": 0.1,
703
- "learning_rate": 2e-05,
704
- "loss": 1.6217,
705
- "step": 232
706
- },
707
- {
708
- "epoch": 0.1,
709
- "learning_rate": 2e-05,
710
- "loss": 1.5515,
711
- "step": 234
712
- },
713
- {
714
- "epoch": 0.1,
715
- "learning_rate": 2e-05,
716
- "loss": 1.7083,
717
- "step": 236
718
- },
719
- {
720
- "epoch": 0.1,
721
- "learning_rate": 2e-05,
722
- "loss": 1.7026,
723
- "step": 238
724
- },
725
- {
726
- "epoch": 0.11,
727
- "learning_rate": 2e-05,
728
- "loss": 1.4813,
729
- "step": 240
730
- },
731
- {
732
- "epoch": 0.11,
733
- "learning_rate": 2e-05,
734
- "loss": 1.6321,
735
- "step": 242
736
- },
737
- {
738
- "epoch": 0.11,
739
- "learning_rate": 2e-05,
740
- "loss": 1.4753,
741
- "step": 244
742
- },
743
- {
744
- "epoch": 0.11,
745
- "learning_rate": 2e-05,
746
- "loss": 1.4435,
747
- "step": 246
748
- },
749
- {
750
- "epoch": 0.11,
751
- "learning_rate": 2e-05,
752
- "loss": 1.5495,
753
- "step": 248
754
- },
755
- {
756
- "epoch": 0.11,
757
- "learning_rate": 2e-05,
758
- "loss": 1.4777,
759
- "step": 250
760
- },
761
- {
762
- "epoch": 0.11,
763
- "learning_rate": 2e-05,
764
- "loss": 1.4638,
765
- "step": 252
766
- },
767
- {
768
- "epoch": 0.11,
769
- "learning_rate": 2e-05,
770
- "loss": 1.6587,
771
- "step": 254
772
- },
773
- {
774
- "epoch": 0.11,
775
- "learning_rate": 2e-05,
776
- "loss": 1.7546,
777
- "step": 256
778
- },
779
- {
780
- "epoch": 0.11,
781
- "learning_rate": 2e-05,
782
- "loss": 1.5319,
783
- "step": 258
784
- },
785
- {
786
- "epoch": 0.11,
787
- "learning_rate": 2e-05,
788
- "loss": 1.4681,
789
- "step": 260
790
- },
791
- {
792
- "epoch": 0.12,
793
- "learning_rate": 2e-05,
794
- "loss": 1.6264,
795
- "step": 262
796
- },
797
- {
798
- "epoch": 0.12,
799
- "learning_rate": 2e-05,
800
- "loss": 1.5565,
801
- "step": 264
802
- },
803
- {
804
- "epoch": 0.12,
805
- "learning_rate": 2e-05,
806
- "loss": 1.5509,
807
- "step": 266
808
- },
809
- {
810
- "epoch": 0.12,
811
- "learning_rate": 2e-05,
812
- "loss": 1.4856,
813
- "step": 268
814
- },
815
- {
816
- "epoch": 0.12,
817
- "learning_rate": 2e-05,
818
- "loss": 1.6516,
819
- "step": 270
820
- },
821
- {
822
- "epoch": 0.12,
823
- "learning_rate": 2e-05,
824
- "loss": 1.6128,
825
- "step": 272
826
- },
827
- {
828
- "epoch": 0.12,
829
- "learning_rate": 2e-05,
830
- "loss": 1.763,
831
- "step": 274
832
- },
833
- {
834
- "epoch": 0.12,
835
- "learning_rate": 2e-05,
836
- "loss": 1.6703,
837
- "step": 276
838
- },
839
- {
840
- "epoch": 0.12,
841
- "learning_rate": 2e-05,
842
- "loss": 1.6881,
843
- "step": 278
844
- },
845
- {
846
- "epoch": 0.12,
847
- "learning_rate": 2e-05,
848
- "loss": 1.49,
849
- "step": 280
850
- },
851
- {
852
- "epoch": 0.12,
853
- "learning_rate": 2e-05,
854
- "loss": 1.7967,
855
- "step": 282
856
- },
857
- {
858
- "epoch": 0.13,
859
- "learning_rate": 2e-05,
860
- "loss": 1.4738,
861
- "step": 284
862
- },
863
- {
864
- "epoch": 0.13,
865
- "learning_rate": 2e-05,
866
- "loss": 1.5275,
867
- "step": 286
868
- },
869
- {
870
- "epoch": 0.13,
871
- "learning_rate": 2e-05,
872
- "loss": 1.4755,
873
- "step": 288
874
- },
875
- {
876
- "epoch": 0.13,
877
- "learning_rate": 2e-05,
878
- "loss": 1.5535,
879
- "step": 290
880
- },
881
- {
882
- "epoch": 0.13,
883
- "learning_rate": 2e-05,
884
- "loss": 1.6888,
885
- "step": 292
886
- },
887
- {
888
- "epoch": 0.13,
889
- "learning_rate": 2e-05,
890
- "loss": 1.6605,
891
- "step": 294
892
- },
893
- {
894
- "epoch": 0.13,
895
- "learning_rate": 2e-05,
896
- "loss": 1.2232,
897
- "step": 296
898
- },
899
- {
900
- "epoch": 0.13,
901
- "learning_rate": 2e-05,
902
- "loss": 1.5012,
903
- "step": 298
904
- },
905
- {
906
- "epoch": 0.13,
907
- "learning_rate": 2e-05,
908
- "loss": 1.5564,
909
- "step": 300
910
- },
911
- {
912
- "epoch": 0.13,
913
- "learning_rate": 2e-05,
914
- "loss": 1.3169,
915
- "step": 302
916
- },
917
- {
918
- "epoch": 0.13,
919
- "learning_rate": 2e-05,
920
- "loss": 1.6143,
921
- "step": 304
922
- },
923
- {
924
- "epoch": 0.13,
925
- "learning_rate": 2e-05,
926
- "loss": 1.6366,
927
- "step": 306
928
- },
929
- {
930
- "epoch": 0.14,
931
- "learning_rate": 2e-05,
932
- "loss": 1.4345,
933
- "step": 308
934
- },
935
- {
936
- "epoch": 0.14,
937
- "learning_rate": 2e-05,
938
- "loss": 1.5439,
939
- "step": 310
940
- },
941
- {
942
- "epoch": 0.14,
943
- "learning_rate": 2e-05,
944
- "loss": 1.5227,
945
- "step": 312
946
- },
947
- {
948
- "epoch": 0.14,
949
- "learning_rate": 2e-05,
950
- "loss": 1.5976,
951
- "step": 314
952
- },
953
- {
954
- "epoch": 0.14,
955
- "learning_rate": 2e-05,
956
- "loss": 1.6695,
957
- "step": 316
958
- },
959
- {
960
- "epoch": 0.14,
961
- "learning_rate": 2e-05,
962
- "loss": 1.6449,
963
- "step": 318
964
- },
965
- {
966
- "epoch": 0.14,
967
- "learning_rate": 2e-05,
968
- "loss": 1.6323,
969
- "step": 320
970
- },
971
- {
972
- "epoch": 0.14,
973
- "learning_rate": 2e-05,
974
- "loss": 1.3631,
975
- "step": 322
976
- },
977
- {
978
- "epoch": 0.14,
979
- "learning_rate": 2e-05,
980
- "loss": 1.599,
981
- "step": 324
982
- },
983
- {
984
- "epoch": 0.14,
985
- "learning_rate": 2e-05,
986
- "loss": 1.6603,
987
- "step": 326
988
- },
989
- {
990
- "epoch": 0.14,
991
- "learning_rate": 2e-05,
992
- "loss": 1.5663,
993
- "step": 328
994
- },
995
- {
996
- "epoch": 0.15,
997
- "learning_rate": 2e-05,
998
- "loss": 1.4458,
999
- "step": 330
1000
- },
1001
- {
1002
- "epoch": 0.15,
1003
- "learning_rate": 2e-05,
1004
- "loss": 1.4435,
1005
- "step": 332
1006
- },
1007
- {
1008
- "epoch": 0.15,
1009
- "learning_rate": 2e-05,
1010
- "loss": 1.4231,
1011
- "step": 334
1012
- },
1013
- {
1014
- "epoch": 0.15,
1015
- "learning_rate": 2e-05,
1016
- "loss": 1.6965,
1017
- "step": 336
1018
- },
1019
- {
1020
- "epoch": 0.15,
1021
- "learning_rate": 2e-05,
1022
- "loss": 1.7649,
1023
- "step": 338
1024
- },
1025
- {
1026
- "epoch": 0.15,
1027
- "learning_rate": 2e-05,
1028
- "loss": 1.5374,
1029
- "step": 340
1030
- },
1031
- {
1032
- "epoch": 0.15,
1033
- "learning_rate": 2e-05,
1034
- "loss": 1.4524,
1035
- "step": 342
1036
- },
1037
- {
1038
- "epoch": 0.15,
1039
- "learning_rate": 2e-05,
1040
- "loss": 1.4514,
1041
- "step": 344
1042
- },
1043
- {
1044
- "epoch": 0.15,
1045
- "learning_rate": 2e-05,
1046
- "loss": 1.6242,
1047
- "step": 346
1048
- },
1049
- {
1050
- "epoch": 0.15,
1051
- "learning_rate": 2e-05,
1052
- "loss": 1.3011,
1053
- "step": 348
1054
- },
1055
- {
1056
- "epoch": 0.15,
1057
- "learning_rate": 2e-05,
1058
- "loss": 1.5991,
1059
- "step": 350
1060
- },
1061
- {
1062
- "epoch": 0.16,
1063
- "learning_rate": 2e-05,
1064
- "loss": 1.5717,
1065
- "step": 352
1066
- },
1067
- {
1068
- "epoch": 0.16,
1069
- "learning_rate": 2e-05,
1070
- "loss": 1.4342,
1071
- "step": 354
1072
- },
1073
- {
1074
- "epoch": 0.16,
1075
- "learning_rate": 2e-05,
1076
- "loss": 1.5818,
1077
- "step": 356
1078
- },
1079
- {
1080
- "epoch": 0.16,
1081
- "learning_rate": 2e-05,
1082
- "loss": 1.3967,
1083
- "step": 358
1084
- },
1085
- {
1086
- "epoch": 0.16,
1087
- "learning_rate": 2e-05,
1088
- "loss": 1.4011,
1089
- "step": 360
1090
- },
1091
- {
1092
- "epoch": 0.16,
1093
- "learning_rate": 2e-05,
1094
- "loss": 1.5034,
1095
- "step": 362
1096
- },
1097
- {
1098
- "epoch": 0.16,
1099
- "learning_rate": 2e-05,
1100
- "loss": 1.5202,
1101
- "step": 364
1102
- },
1103
- {
1104
- "epoch": 0.16,
1105
- "learning_rate": 2e-05,
1106
- "loss": 1.4779,
1107
- "step": 366
1108
- },
1109
- {
1110
- "epoch": 0.16,
1111
- "learning_rate": 2e-05,
1112
- "loss": 1.6557,
1113
- "step": 368
1114
- },
1115
- {
1116
- "epoch": 0.16,
1117
- "learning_rate": 2e-05,
1118
- "loss": 1.6508,
1119
- "step": 370
1120
- },
1121
- {
1122
- "epoch": 0.16,
1123
- "learning_rate": 2e-05,
1124
- "loss": 1.506,
1125
- "step": 372
1126
- },
1127
- {
1128
- "epoch": 0.16,
1129
- "learning_rate": 2e-05,
1130
- "loss": 1.5586,
1131
- "step": 374
1132
- },
1133
- {
1134
- "epoch": 0.17,
1135
- "learning_rate": 2e-05,
1136
- "loss": 1.5296,
1137
- "step": 376
1138
- },
1139
- {
1140
- "epoch": 0.17,
1141
- "learning_rate": 2e-05,
1142
- "loss": 1.5015,
1143
- "step": 378
1144
- },
1145
- {
1146
- "epoch": 0.17,
1147
- "learning_rate": 2e-05,
1148
- "loss": 1.589,
1149
- "step": 380
1150
- },
1151
- {
1152
- "epoch": 0.17,
1153
- "learning_rate": 2e-05,
1154
- "loss": 1.3286,
1155
- "step": 382
1156
- },
1157
- {
1158
- "epoch": 0.17,
1159
- "learning_rate": 2e-05,
1160
- "loss": 1.5073,
1161
- "step": 384
1162
- },
1163
- {
1164
- "epoch": 0.17,
1165
- "learning_rate": 2e-05,
1166
- "loss": 1.4456,
1167
- "step": 386
1168
- },
1169
- {
1170
- "epoch": 0.17,
1171
- "learning_rate": 2e-05,
1172
- "loss": 1.549,
1173
- "step": 388
1174
- },
1175
- {
1176
- "epoch": 0.17,
1177
- "learning_rate": 2e-05,
1178
- "loss": 1.4319,
1179
- "step": 390
1180
- },
1181
- {
1182
- "epoch": 0.17,
1183
- "learning_rate": 2e-05,
1184
- "loss": 1.5936,
1185
- "step": 392
1186
- },
1187
- {
1188
- "epoch": 0.17,
1189
- "learning_rate": 2e-05,
1190
- "loss": 1.2661,
1191
- "step": 394
1192
- },
1193
- {
1194
- "epoch": 0.17,
1195
- "learning_rate": 2e-05,
1196
- "loss": 1.5003,
1197
- "step": 396
1198
- },
1199
- {
1200
- "epoch": 0.18,
1201
- "learning_rate": 2e-05,
1202
- "loss": 1.5337,
1203
- "step": 398
1204
- },
1205
- {
1206
- "epoch": 0.18,
1207
- "learning_rate": 2e-05,
1208
- "loss": 1.3789,
1209
- "step": 400
1210
  }
1211
  ],
1212
  "logging_steps": 2,
1213
- "max_steps": 6807,
1214
  "num_train_epochs": 3,
1215
  "save_steps": 50,
1216
- "total_flos": 7760442301267968.0,
1217
  "trial_name": null,
1218
  "trial_params": null
1219
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0922535820336149,
5
  "eval_steps": 500,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.0,
13
  "learning_rate": 2e-05,
14
+ "loss": 2.2297,
15
  "step": 2
16
  },
17
  {
18
  "epoch": 0.0,
19
  "learning_rate": 2e-05,
20
+ "loss": 2.4114,
21
  "step": 4
22
  },
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 2e-05,
26
+ "loss": 2.5589,
27
  "step": 6
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 2e-05,
32
+ "loss": 2.6596,
33
  "step": 8
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 2e-05,
38
+ "loss": 2.6748,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.01,
43
  "learning_rate": 2e-05,
44
+ "loss": 2.7331,
45
  "step": 12
46
  },
47
  {
48
  "epoch": 0.01,
49
  "learning_rate": 2e-05,
50
+ "loss": 2.7118,
51
  "step": 14
52
  },
53
  {
54
  "epoch": 0.01,
55
  "learning_rate": 2e-05,
56
+ "loss": 2.6889,
57
  "step": 16
58
  },
59
  {
60
  "epoch": 0.01,
61
  "learning_rate": 2e-05,
62
+ "loss": 2.7385,
63
  "step": 18
64
  },
65
  {
66
  "epoch": 0.01,
67
  "learning_rate": 2e-05,
68
+ "loss": 2.7068,
69
  "step": 20
70
  },
71
  {
72
  "epoch": 0.01,
73
  "learning_rate": 2e-05,
74
+ "loss": 2.6188,
75
  "step": 22
76
  },
77
  {
78
  "epoch": 0.01,
79
  "learning_rate": 2e-05,
80
+ "loss": 2.613,
81
  "step": 24
82
  },
83
  {
84
  "epoch": 0.01,
85
  "learning_rate": 2e-05,
86
+ "loss": 2.5193,
87
  "step": 26
88
  },
89
  {
90
  "epoch": 0.01,
91
  "learning_rate": 2e-05,
92
+ "loss": 2.59,
93
  "step": 28
94
  },
95
  {
96
  "epoch": 0.01,
97
  "learning_rate": 2e-05,
98
+ "loss": 2.4855,
99
  "step": 30
100
  },
101
  {
102
  "epoch": 0.01,
103
  "learning_rate": 2e-05,
104
+ "loss": 2.5357,
105
  "step": 32
106
  },
107
  {
108
+ "epoch": 0.02,
109
  "learning_rate": 2e-05,
110
+ "loss": 2.5067,
111
  "step": 34
112
  },
113
  {
114
  "epoch": 0.02,
115
  "learning_rate": 2e-05,
116
+ "loss": 2.4844,
117
  "step": 36
118
  },
119
  {
120
  "epoch": 0.02,
121
  "learning_rate": 2e-05,
122
+ "loss": 2.4234,
123
  "step": 38
124
  },
125
  {
126
  "epoch": 0.02,
127
  "learning_rate": 2e-05,
128
+ "loss": 2.3767,
129
  "step": 40
130
  },
131
  {
132
  "epoch": 0.02,
133
  "learning_rate": 2e-05,
134
+ "loss": 2.396,
135
  "step": 42
136
  },
137
  {
138
  "epoch": 0.02,
139
  "learning_rate": 2e-05,
140
+ "loss": 2.2662,
141
  "step": 44
142
  },
143
  {
144
  "epoch": 0.02,
145
  "learning_rate": 2e-05,
146
+ "loss": 2.1638,
147
  "step": 46
148
  },
149
  {
150
  "epoch": 0.02,
151
  "learning_rate": 2e-05,
152
+ "loss": 2.1706,
153
  "step": 48
154
  },
155
  {
156
  "epoch": 0.02,
157
  "learning_rate": 2e-05,
158
+ "loss": 1.9274,
159
  "step": 50
160
  },
161
  {
162
  "epoch": 0.02,
163
  "learning_rate": 2e-05,
164
+ "loss": 1.9829,
165
  "step": 52
166
  },
167
  {
168
  "epoch": 0.02,
169
  "learning_rate": 2e-05,
170
+ "loss": 1.937,
171
  "step": 54
172
  },
173
  {
174
+ "epoch": 0.03,
175
  "learning_rate": 2e-05,
176
+ "loss": 1.9749,
177
  "step": 56
178
  },
179
  {
180
  "epoch": 0.03,
181
  "learning_rate": 2e-05,
182
+ "loss": 2.1066,
183
  "step": 58
184
  },
185
  {
186
  "epoch": 0.03,
187
  "learning_rate": 2e-05,
188
+ "loss": 2.073,
189
  "step": 60
190
  },
191
  {
192
  "epoch": 0.03,
193
  "learning_rate": 2e-05,
194
+ "loss": 2.1405,
195
  "step": 62
196
  },
197
  {
198
  "epoch": 0.03,
199
  "learning_rate": 2e-05,
200
+ "loss": 2.0468,
201
  "step": 64
202
  },
203
  {
204
  "epoch": 0.03,
205
  "learning_rate": 2e-05,
206
+ "loss": 1.9769,
207
  "step": 66
208
  },
209
  {
210
  "epoch": 0.03,
211
  "learning_rate": 2e-05,
212
+ "loss": 2.0645,
213
  "step": 68
214
  },
215
  {
216
  "epoch": 0.03,
217
  "learning_rate": 2e-05,
218
+ "loss": 2.1321,
219
  "step": 70
220
  },
221
  {
222
  "epoch": 0.03,
223
  "learning_rate": 2e-05,
224
+ "loss": 2.1039,
225
  "step": 72
226
  },
227
  {
228
  "epoch": 0.03,
229
  "learning_rate": 2e-05,
230
+ "loss": 2.0137,
231
  "step": 74
232
  },
233
  {
234
+ "epoch": 0.04,
235
  "learning_rate": 2e-05,
236
+ "loss": 2.0714,
237
  "step": 76
238
  },
239
  {
240
+ "epoch": 0.04,
241
  "learning_rate": 2e-05,
242
+ "loss": 2.0487,
243
  "step": 78
244
  },
245
  {
246
  "epoch": 0.04,
247
  "learning_rate": 2e-05,
248
+ "loss": 2.0397,
249
  "step": 80
250
  },
251
  {
252
  "epoch": 0.04,
253
  "learning_rate": 2e-05,
254
+ "loss": 2.0557,
255
  "step": 82
256
  },
257
  {
258
  "epoch": 0.04,
259
  "learning_rate": 2e-05,
260
+ "loss": 1.9866,
261
  "step": 84
262
  },
263
  {
264
  "epoch": 0.04,
265
  "learning_rate": 2e-05,
266
+ "loss": 2.0334,
267
  "step": 86
268
  },
269
  {
270
  "epoch": 0.04,
271
  "learning_rate": 2e-05,
272
+ "loss": 1.9322,
273
  "step": 88
274
  },
275
  {
276
  "epoch": 0.04,
277
  "learning_rate": 2e-05,
278
+ "loss": 1.9863,
279
  "step": 90
280
  },
281
  {
282
  "epoch": 0.04,
283
  "learning_rate": 2e-05,
284
+ "loss": 1.9943,
285
  "step": 92
286
  },
287
  {
288
  "epoch": 0.04,
289
  "learning_rate": 2e-05,
290
+ "loss": 1.8851,
291
  "step": 94
292
  },
293
  {
294
  "epoch": 0.04,
295
  "learning_rate": 2e-05,
296
+ "loss": 1.9012,
297
  "step": 96
298
  },
299
  {
300
+ "epoch": 0.05,
301
  "learning_rate": 2e-05,
302
+ "loss": 1.7702,
303
  "step": 98
304
  },
305
  {
306
+ "epoch": 0.05,
307
  "learning_rate": 2e-05,
308
+ "loss": 1.4807,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 0.05,
313
  "learning_rate": 2e-05,
314
+ "loss": 1.829,
315
  "step": 102
316
  },
317
  {
318
  "epoch": 0.05,
319
  "learning_rate": 2e-05,
320
+ "loss": 1.7121,
321
  "step": 104
322
  },
323
  {
324
  "epoch": 0.05,
325
  "learning_rate": 2e-05,
326
+ "loss": 1.913,
327
  "step": 106
328
  },
329
  {
330
  "epoch": 0.05,
331
  "learning_rate": 2e-05,
332
+ "loss": 1.9668,
333
  "step": 108
334
  },
335
  {
336
  "epoch": 0.05,
337
  "learning_rate": 2e-05,
338
+ "loss": 1.9368,
339
  "step": 110
340
  },
341
  {
342
  "epoch": 0.05,
343
  "learning_rate": 2e-05,
344
+ "loss": 1.9563,
345
  "step": 112
346
  },
347
  {
348
  "epoch": 0.05,
349
  "learning_rate": 2e-05,
350
+ "loss": 1.9124,
351
  "step": 114
352
  },
353
  {
354
  "epoch": 0.05,
355
  "learning_rate": 2e-05,
356
+ "loss": 1.9937,
357
  "step": 116
358
  },
359
  {
360
  "epoch": 0.05,
361
  "learning_rate": 2e-05,
362
+ "loss": 1.9534,
363
  "step": 118
364
  },
365
  {
366
+ "epoch": 0.06,
367
  "learning_rate": 2e-05,
368
+ "loss": 1.9646,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 0.06,
373
  "learning_rate": 2e-05,
374
+ "loss": 1.8869,
375
  "step": 122
376
  },
377
  {
378
+ "epoch": 0.06,
379
  "learning_rate": 2e-05,
380
+ "loss": 1.9113,
381
  "step": 124
382
  },
383
  {
384
  "epoch": 0.06,
385
  "learning_rate": 2e-05,
386
+ "loss": 1.8423,
387
  "step": 126
388
  },
389
  {
390
  "epoch": 0.06,
391
  "learning_rate": 2e-05,
392
+ "loss": 1.9589,
393
  "step": 128
394
  },
395
  {
396
  "epoch": 0.06,
397
  "learning_rate": 2e-05,
398
+ "loss": 1.9223,
399
  "step": 130
400
  },
401
  {
402
  "epoch": 0.06,
403
  "learning_rate": 2e-05,
404
+ "loss": 1.8867,
405
  "step": 132
406
  },
407
  {
408
  "epoch": 0.06,
409
  "learning_rate": 2e-05,
410
+ "loss": 1.8103,
411
  "step": 134
412
  },
413
  {
414
  "epoch": 0.06,
415
  "learning_rate": 2e-05,
416
+ "loss": 1.9077,
417
  "step": 136
418
  },
419
  {
420
  "epoch": 0.06,
421
  "learning_rate": 2e-05,
422
+ "loss": 1.8303,
423
  "step": 138
424
  },
425
  {
426
  "epoch": 0.06,
427
  "learning_rate": 2e-05,
428
+ "loss": 1.8349,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 0.07,
433
  "learning_rate": 2e-05,
434
+ "loss": 1.7763,
435
  "step": 142
436
  },
437
  {
438
+ "epoch": 0.07,
439
  "learning_rate": 2e-05,
440
+ "loss": 1.8196,
441
  "step": 144
442
  },
443
  {
444
+ "epoch": 0.07,
445
  "learning_rate": 2e-05,
446
+ "loss": 1.8212,
447
  "step": 146
448
  },
449
  {
450
  "epoch": 0.07,
451
  "learning_rate": 2e-05,
452
+ "loss": 1.6991,
453
  "step": 148
454
  },
455
  {
456
  "epoch": 0.07,
457
  "learning_rate": 2e-05,
458
+ "loss": 1.6176,
459
  "step": 150
460
  },
461
  {
462
  "epoch": 0.07,
463
  "learning_rate": 2e-05,
464
+ "loss": 1.8508,
465
  "step": 152
466
  },
467
  {
468
  "epoch": 0.07,
469
  "learning_rate": 2e-05,
470
+ "loss": 1.8574,
471
  "step": 154
472
  },
473
  {
474
  "epoch": 0.07,
475
  "learning_rate": 2e-05,
476
+ "loss": 1.8352,
477
  "step": 156
478
  },
479
  {
480
  "epoch": 0.07,
481
  "learning_rate": 2e-05,
482
+ "loss": 1.9877,
483
  "step": 158
484
  },
485
  {
486
  "epoch": 0.07,
487
  "learning_rate": 2e-05,
488
+ "loss": 1.9674,
489
  "step": 160
490
  },
491
  {
492
  "epoch": 0.07,
493
  "learning_rate": 2e-05,
494
+ "loss": 1.9151,
495
  "step": 162
496
  },
497
  {
498
+ "epoch": 0.08,
499
  "learning_rate": 2e-05,
500
+ "loss": 1.8725,
501
  "step": 164
502
  },
503
  {
504
+ "epoch": 0.08,
505
  "learning_rate": 2e-05,
506
+ "loss": 1.8877,
507
  "step": 166
508
  },
509
  {
510
+ "epoch": 0.08,
511
  "learning_rate": 2e-05,
512
+ "loss": 1.9685,
513
  "step": 168
514
  },
515
  {
516
+ "epoch": 0.08,
517
  "learning_rate": 2e-05,
518
+ "loss": 1.8729,
519
  "step": 170
520
  },
521
  {
522
  "epoch": 0.08,
523
  "learning_rate": 2e-05,
524
+ "loss": 1.9164,
525
  "step": 172
526
  },
527
  {
528
  "epoch": 0.08,
529
  "learning_rate": 2e-05,
530
+ "loss": 1.9212,
531
  "step": 174
532
  },
533
  {
534
  "epoch": 0.08,
535
  "learning_rate": 2e-05,
536
+ "loss": 1.8108,
537
  "step": 176
538
  },
539
  {
540
  "epoch": 0.08,
541
  "learning_rate": 2e-05,
542
+ "loss": 1.8292,
543
  "step": 178
544
  },
545
  {
546
  "epoch": 0.08,
547
  "learning_rate": 2e-05,
548
+ "loss": 1.7682,
549
  "step": 180
550
  },
551
  {
552
  "epoch": 0.08,
553
  "learning_rate": 2e-05,
554
+ "loss": 1.8177,
555
  "step": 182
556
  },
557
  {
558
  "epoch": 0.08,
559
  "learning_rate": 2e-05,
560
+ "loss": 1.8236,
561
  "step": 184
562
  },
563
  {
564
+ "epoch": 0.09,
565
  "learning_rate": 2e-05,
566
+ "loss": 1.847,
567
  "step": 186
568
  },
569
  {
570
+ "epoch": 0.09,
571
  "learning_rate": 2e-05,
572
+ "loss": 1.8181,
573
  "step": 188
574
  },
575
  {
576
+ "epoch": 0.09,
577
  "learning_rate": 2e-05,
578
+ "loss": 1.7495,
579
  "step": 190
580
  },
581
  {
582
+ "epoch": 0.09,
583
  "learning_rate": 2e-05,
584
+ "loss": 1.7743,
585
  "step": 192
586
  },
587
  {
588
  "epoch": 0.09,
589
  "learning_rate": 2e-05,
590
+ "loss": 1.7222,
591
  "step": 194
592
  },
593
  {
594
  "epoch": 0.09,
595
  "learning_rate": 2e-05,
596
+ "loss": 1.687,
597
  "step": 196
598
  },
599
  {
600
  "epoch": 0.09,
601
  "learning_rate": 2e-05,
602
+ "loss": 1.6177,
603
  "step": 198
604
  },
605
  {
606
  "epoch": 0.09,
607
  "learning_rate": 2e-05,
608
+ "loss": 1.5484,
609
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
  ],
612
  "logging_steps": 2,
613
+ "max_steps": 6501,
614
  "num_train_epochs": 3,
615
  "save_steps": 50,
616
+ "total_flos": 1334487070654464.0,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dda9faee12cc6bf082bfebf31874aacbda67be29b84b710a466281b06e56914c
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf78a82702254c8f8180b7e117ddfd70d0006ee1a647152342b51a05191850a4
3
  size 4027