{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.547511312217196,
  "eval_steps": 500,
  "global_step": 540,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.36199095022624433,
      "grad_norm": 0.5034113526344299,
      "learning_rate": 9.259259259259259e-07,
      "logits/chosen": 1.8096959590911865,
      "logits/rejected": 1.7158682346343994,
      "logps/chosen": -106.03636169433594,
      "logps/rejected": -84.16211700439453,
      "loss": 0.695,
      "rewards/accuracies": 0.3499999940395355,
      "rewards/chosen": -0.00430720392614603,
      "rewards/margins": 0.00014101080887485296,
      "rewards/rejected": -0.004448213614523411,
      "step": 10
    },
    {
      "epoch": 0.7239819004524887,
      "grad_norm": 0.4689481854438782,
      "learning_rate": 1.8518518518518519e-06,
      "logits/chosen": 1.6904691457748413,
      "logits/rejected": 1.6791242361068726,
      "logps/chosen": -82.1568603515625,
      "logps/rejected": -81.06039428710938,
      "loss": 0.6942,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -0.002815738320350647,
      "rewards/margins": 0.005291810259222984,
      "rewards/rejected": -0.008107547648251057,
      "step": 20
    },
    {
      "epoch": 1.085972850678733,
      "grad_norm": 0.5041429996490479,
      "learning_rate": 2.7777777777777783e-06,
      "logits/chosen": 1.877396583557129,
      "logits/rejected": 1.9337213039398193,
      "logps/chosen": -76.86979675292969,
      "logps/rejected": -86.98431396484375,
      "loss": 0.6937,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -0.002632059855386615,
      "rewards/margins": -0.00260960147716105,
      "rewards/rejected": -2.245856376248412e-05,
      "step": 30
    },
    {
      "epoch": 1.4479638009049773,
      "grad_norm": 0.49786925315856934,
      "learning_rate": 3.7037037037037037e-06,
      "logits/chosen": 1.622865915298462,
      "logits/rejected": 1.649457573890686,
      "logps/chosen": -77.10165405273438,
      "logps/rejected": -75.56939697265625,
      "loss": 0.6927,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -0.005956621374934912,
      "rewards/margins": -0.0038306519854813814,
      "rewards/rejected": -0.0021259686909615993,
      "step": 40
    },
    {
      "epoch": 1.8099547511312217,
      "grad_norm": 0.6152529120445251,
      "learning_rate": 4.62962962962963e-06,
      "logits/chosen": 1.7763454914093018,
      "logits/rejected": 1.8556989431381226,
      "logps/chosen": -84.42935943603516,
      "logps/rejected": -97.08312225341797,
      "loss": 0.6937,
      "rewards/accuracies": 0.5625,
      "rewards/chosen": 0.008006314747035503,
      "rewards/margins": 0.005445700138807297,
      "rewards/rejected": 0.0025606155395507812,
      "step": 50
    },
    {
      "epoch": 2.171945701357466,
      "grad_norm": 0.539294958114624,
      "learning_rate": 4.998119881260576e-06,
      "logits/chosen": 1.7218296527862549,
      "logits/rejected": 1.759447455406189,
      "logps/chosen": -71.67170715332031,
      "logps/rejected": -86.7840805053711,
      "loss": 0.6935,
      "rewards/accuracies": 0.4625000059604645,
      "rewards/chosen": 0.0036853079218417406,
      "rewards/margins": -0.002186889760196209,
      "rewards/rejected": 0.005872196517884731,
      "step": 60
    },
    {
      "epoch": 2.5339366515837103,
      "grad_norm": 0.5369167327880859,
      "learning_rate": 4.9866405060165044e-06,
      "logits/chosen": 1.7708947658538818,
      "logits/rejected": 1.8614221811294556,
      "logps/chosen": -86.73070526123047,
      "logps/rejected": -83.09151458740234,
      "loss": 0.6909,
      "rewards/accuracies": 0.5,
      "rewards/chosen": 0.008487144485116005,
      "rewards/margins": 0.004935932345688343,
      "rewards/rejected": 0.0035512119065970182,
      "step": 70
    },
    {
      "epoch": 2.8959276018099547,
      "grad_norm": 0.5544038414955139,
      "learning_rate": 4.964774158361991e-06,
      "logits/chosen": 1.772223711013794,
      "logits/rejected": 1.834602952003479,
      "logps/chosen": -69.32654571533203,
      "logps/rejected": -77.55828094482422,
      "loss": 0.6943,
      "rewards/accuracies": 0.5249999761581421,
      "rewards/chosen": -0.012612676247954369,
      "rewards/margins": -0.011703734286129475,
      "rewards/rejected": -0.0009089424274861813,
      "step": 80
    },
    {
      "epoch": 3.257918552036199,
      "grad_norm": 0.5990457534790039,
      "learning_rate": 4.93261217644956e-06,
      "logits/chosen": 1.8591846227645874,
      "logits/rejected": 1.8804810047149658,
      "logps/chosen": -79.4209213256836,
      "logps/rejected": -76.4483642578125,
      "loss": 0.6927,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -0.0016643519047647715,
      "rewards/margins": 0.006901158951222897,
      "rewards/rejected": -0.008565512485802174,
      "step": 90
    },
    {
      "epoch": 3.6199095022624435,
      "grad_norm": 0.6173596978187561,
      "learning_rate": 4.8902889044347e-06,
      "logits/chosen": 1.777044653892517,
      "logits/rejected": 1.7735064029693604,
      "logps/chosen": -85.6143569946289,
      "logps/rejected": -79.8615493774414,
      "loss": 0.6894,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -0.0019477702444419265,
      "rewards/margins": 0.002171102212741971,
      "rewards/rejected": -0.004118871875107288,
      "step": 100
    },
    {
      "epoch": 3.981900452488688,
      "grad_norm": 0.6671907901763916,
      "learning_rate": 4.837981131305475e-06,
      "logits/chosen": 1.6935539245605469,
      "logits/rejected": 1.7035915851593018,
      "logps/chosen": -83.69059753417969,
      "logps/rejected": -89.67396545410156,
      "loss": 0.6907,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": -0.005695090629160404,
      "rewards/margins": 0.0027313486207276583,
      "rewards/rejected": -0.00842643715441227,
      "step": 110
    },
    {
      "epoch": 4.343891402714932,
      "grad_norm": 0.9499974250793457,
      "learning_rate": 4.775907352415367e-06,
      "logits/chosen": 1.801300287246704,
      "logits/rejected": 1.8080482482910156,
      "logps/chosen": -92.2596206665039,
      "logps/rejected": -85.3951416015625,
      "loss": 0.6863,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": 0.000912027433514595,
      "rewards/margins": 0.017921410501003265,
      "rewards/rejected": -0.01700938306748867,
      "step": 120
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 0.6524670124053955,
      "learning_rate": 4.70432685680402e-06,
      "logits/chosen": 1.8659807443618774,
      "logits/rejected": 1.8858706951141357,
      "logps/chosen": -78.38592529296875,
      "logps/rejected": -81.4170913696289,
      "loss": 0.6873,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": 0.002340498147532344,
      "rewards/margins": 0.014832529239356518,
      "rewards/rejected": -0.01249203272163868,
      "step": 130
    },
    {
      "epoch": 5.067873303167421,
      "grad_norm": 0.7461792826652527,
      "learning_rate": 4.623538644118244e-06,
      "logits/chosen": 1.7999374866485596,
      "logits/rejected": 1.869183897972107,
      "logps/chosen": -71.37133026123047,
      "logps/rejected": -83.36344909667969,
      "loss": 0.6829,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.013370734639465809,
      "rewards/margins": 0.013819178566336632,
      "rewards/rejected": -0.027189914137125015,
      "step": 140
    },
    {
      "epoch": 5.429864253393665,
      "grad_norm": 0.6743089556694031,
      "learning_rate": 4.533880175657419e-06,
      "logits/chosen": 1.8166606426239014,
      "logits/rejected": 1.7869322299957275,
      "logps/chosen": -87.78181457519531,
      "logps/rejected": -71.32334899902344,
      "loss": 0.6758,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.0059204003773629665,
      "rewards/margins": 0.03691873326897621,
      "rewards/rejected": -0.04283912852406502,
      "step": 150
    },
    {
      "epoch": 5.791855203619909,
      "grad_norm": 0.7719553112983704,
      "learning_rate": 4.435725964760331e-06,
      "logits/chosen": 1.7498859167099,
      "logits/rejected": 1.7752033472061157,
      "logps/chosen": -72.18727111816406,
      "logps/rejected": -84.50582122802734,
      "loss": 0.6786,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.023137167096138,
      "rewards/margins": 0.042744893580675125,
      "rewards/rejected": -0.06588206440210342,
      "step": 160
    },
    {
      "epoch": 6.153846153846154,
      "grad_norm": 0.8010989427566528,
      "learning_rate": 4.329486012421531e-06,
      "logits/chosen": 1.7785524129867554,
      "logits/rejected": 1.770937204360962,
      "logps/chosen": -87.91519165039062,
      "logps/rejected": -86.76083374023438,
      "loss": 0.6689,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.019223090261220932,
      "rewards/margins": 0.03557150438427925,
      "rewards/rejected": -0.054794590920209885,
      "step": 170
    },
    {
      "epoch": 6.515837104072398,
      "grad_norm": 0.9732075929641724,
      "learning_rate": 4.215604094671835e-06,
      "logits/chosen": 1.76633620262146,
      "logits/rejected": 1.782987356185913,
      "logps/chosen": -85.80155944824219,
      "logps/rejected": -85.45477294921875,
      "loss": 0.6648,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -0.009860477410256863,
      "rewards/margins": 0.0789995938539505,
      "rewards/rejected": -0.08886007964611053,
      "step": 180
    },
    {
      "epoch": 6.877828054298643,
      "grad_norm": 0.8777466416358948,
      "learning_rate": 4.094555908876765e-06,
      "logits/chosen": 1.8455078601837158,
      "logits/rejected": 1.871913194656372,
      "logps/chosen": -71.23600006103516,
      "logps/rejected": -71.44808197021484,
      "loss": 0.6606,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": -0.04135243222117424,
      "rewards/margins": 0.06673365831375122,
      "rewards/rejected": -0.10808608680963516,
      "step": 190
    },
    {
      "epoch": 7.239819004524887,
      "grad_norm": 0.8959658741950989,
      "learning_rate": 3.966847086696045e-06,
      "logits/chosen": 1.8061046600341797,
      "logits/rejected": 1.8204586505889893,
      "logps/chosen": -80.91191864013672,
      "logps/rejected": -73.68770599365234,
      "loss": 0.6511,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.037331439554691315,
      "rewards/margins": 0.11111048609018326,
      "rewards/rejected": -0.14844192564487457,
      "step": 200
    },
    {
      "epoch": 7.601809954751131,
      "grad_norm": 0.9700360298156738,
      "learning_rate": 3.833011082004229e-06,
      "logits/chosen": 1.7680559158325195,
      "logits/rejected": 1.8058135509490967,
      "logps/chosen": -77.65849304199219,
      "logps/rejected": -78.11155700683594,
      "loss": 0.645,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.0448901504278183,
      "rewards/margins": 0.12249104678630829,
      "rewards/rejected": -0.1673811972141266,
      "step": 210
    },
    {
      "epoch": 7.963800904977376,
      "grad_norm": 1.070133924484253,
      "learning_rate": 3.693606942594873e-06,
      "logits/chosen": 1.7242870330810547,
      "logits/rejected": 1.7760801315307617,
      "logps/chosen": -79.4949722290039,
      "logps/rejected": -84.17713928222656,
      "loss": 0.6516,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.06297416985034943,
      "rewards/margins": 0.1389354169368744,
      "rewards/rejected": -0.20190958678722382,
      "step": 220
    },
    {
      "epoch": 8.32579185520362,
      "grad_norm": 0.880223274230957,
      "learning_rate": 3.549216974976073e-06,
      "logits/chosen": 1.8455660343170166,
      "logits/rejected": 1.8497081995010376,
      "logps/chosen": -82.68794250488281,
      "logps/rejected": -87.30531311035156,
      "loss": 0.6396,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.09025909751653671,
      "rewards/margins": 0.1426868587732315,
      "rewards/rejected": -0.23294594883918762,
      "step": 230
    },
    {
      "epoch": 8.687782805429864,
      "grad_norm": 0.9315020442008972,
      "learning_rate": 3.400444312011776e-06,
      "logits/chosen": 1.7252687215805054,
      "logits/rejected": 1.7745378017425537,
      "logps/chosen": -76.34024047851562,
      "logps/rejected": -89.45391845703125,
      "loss": 0.6325,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.07824709266424179,
      "rewards/margins": 0.14033600687980652,
      "rewards/rejected": -0.21858307719230652,
      "step": 240
    },
    {
      "epoch": 9.049773755656108,
      "grad_norm": 0.9806249141693115,
      "learning_rate": 3.2479103935691047e-06,
      "logits/chosen": 1.7673581838607788,
      "logits/rejected": 1.8124672174453735,
      "logps/chosen": -91.80342102050781,
      "logps/rejected": -91.12406921386719,
      "loss": 0.6314,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.10384390503168106,
      "rewards/margins": 0.189823716878891,
      "rewards/rejected": -0.29366764426231384,
      "step": 250
    },
    {
      "epoch": 9.411764705882353,
      "grad_norm": 1.0357781648635864,
      "learning_rate": 3.092252370695298e-06,
      "logits/chosen": 1.7498366832733154,
      "logits/rejected": 1.6984941959381104,
      "logps/chosen": -95.309326171875,
      "logps/rejected": -80.66096496582031,
      "loss": 0.6213,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.14446459710597992,
      "rewards/margins": 0.1758357733488083,
      "rewards/rejected": -0.3203004002571106,
      "step": 260
    },
    {
      "epoch": 9.773755656108598,
      "grad_norm": 0.9333145022392273,
      "learning_rate": 2.9341204441673267e-06,
      "logits/chosen": 1.8672996759414673,
      "logits/rejected": 1.7658369541168213,
      "logps/chosen": -104.98448181152344,
      "logps/rejected": -81.6786880493164,
      "loss": 0.6131,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": -0.05903264880180359,
      "rewards/margins": 0.22136883437633514,
      "rewards/rejected": -0.28040146827697754,
      "step": 270
    },
    {
      "epoch": 10.135746606334841,
      "grad_norm": 0.9343512654304504,
      "learning_rate": 2.7741751485313295e-06,
      "logits/chosen": 1.7365798950195312,
      "logits/rejected": 1.748958945274353,
      "logps/chosen": -84.08332824707031,
      "logps/rejected": -87.89321899414062,
      "loss": 0.6121,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.13087712228298187,
      "rewards/margins": 0.16090884804725647,
      "rewards/rejected": -0.29178598523139954,
      "step": 280
    },
    {
      "epoch": 10.497737556561086,
      "grad_norm": 0.9784368872642517,
      "learning_rate": 2.6130845929767662e-06,
      "logits/chosen": 1.7028934955596924,
      "logits/rejected": 1.6997534036636353,
      "logps/chosen": -86.77079010009766,
      "logps/rejected": -85.89974975585938,
      "loss": 0.612,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.10491044819355011,
      "rewards/margins": 0.19683709740638733,
      "rewards/rejected": -0.30174753069877625,
      "step": 290
    },
    {
      "epoch": 10.85972850678733,
      "grad_norm": 1.030173659324646,
      "learning_rate": 2.4515216705704396e-06,
      "logits/chosen": 1.8057117462158203,
      "logits/rejected": 1.8746700286865234,
      "logps/chosen": -74.07008361816406,
      "logps/rejected": -89.6046142578125,
      "loss": 0.5997,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.058038175106048584,
      "rewards/margins": 0.2527470886707306,
      "rewards/rejected": -0.3107852637767792,
      "step": 300
    },
    {
      "epoch": 11.221719457013574,
      "grad_norm": 1.0045686960220337,
      "learning_rate": 2.290161247507733e-06,
      "logits/chosen": 1.7842658758163452,
      "logits/rejected": 1.83527410030365,
      "logps/chosen": -85.32085418701172,
      "logps/rejected": -91.45645904541016,
      "loss": 0.5967,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.1375085413455963,
      "rewards/margins": 0.2444867342710495,
      "rewards/rejected": -0.3819952607154846,
      "step": 310
    },
    {
      "epoch": 11.583710407239819,
      "grad_norm": 0.967028796672821,
      "learning_rate": 2.129677344121879e-06,
      "logits/chosen": 1.8002763986587524,
      "logits/rejected": 1.7979824542999268,
      "logps/chosen": -97.67089080810547,
      "logps/rejected": -79.14571380615234,
      "loss": 0.5852,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.1535075604915619,
      "rewards/margins": 0.18517914414405823,
      "rewards/rejected": -0.3386867046356201,
      "step": 320
    },
    {
      "epoch": 11.945701357466064,
      "grad_norm": 1.017149806022644,
      "learning_rate": 1.970740319426474e-06,
      "logits/chosen": 1.8270378112792969,
      "logits/rejected": 1.8545118570327759,
      "logps/chosen": -74.19200897216797,
      "logps/rejected": -86.13984680175781,
      "loss": 0.6015,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.1458810567855835,
      "rewards/margins": 0.20589685440063477,
      "rewards/rejected": -0.35177791118621826,
      "step": 330
    },
    {
      "epoch": 12.307692307692308,
      "grad_norm": 0.9695628881454468,
      "learning_rate": 1.8140140709517467e-06,
      "logits/chosen": 1.745535135269165,
      "logits/rejected": 1.7830098867416382,
      "logps/chosen": -84.69477081298828,
      "logps/rejected": -82.7412109375,
      "loss": 0.5882,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.12116970866918564,
      "rewards/margins": 0.25774163007736206,
      "rewards/rejected": -0.3789113759994507,
      "step": 340
    },
    {
      "epoch": 12.669683257918551,
      "grad_norm": 1.103960394859314,
      "learning_rate": 1.6601532615711452e-06,
      "logits/chosen": 1.777008295059204,
      "logits/rejected": 1.6892492771148682,
      "logps/chosen": -100.1700210571289,
      "logps/rejected": -90.1813735961914,
      "loss": 0.5802,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.10893051326274872,
      "rewards/margins": 0.29678279161453247,
      "rewards/rejected": -0.4057133197784424,
      "step": 350
    },
    {
      "epoch": 13.031674208144796,
      "grad_norm": 1.0589226484298706,
      "learning_rate": 1.509800584902108e-06,
      "logits/chosen": 1.7005256414413452,
      "logits/rejected": 1.7957611083984375,
      "logps/chosen": -79.087158203125,
      "logps/rejected": -102.69563293457031,
      "loss": 0.5814,
      "rewards/accuracies": 0.862500011920929,
      "rewards/chosen": -0.13767623901367188,
      "rewards/margins": 0.3307909071445465,
      "rewards/rejected": -0.4684671461582184,
      "step": 360
    },
    {
      "epoch": 13.393665158371041,
      "grad_norm": 1.056600570678711,
      "learning_rate": 1.3635840807037487e-06,
      "logits/chosen": 1.6859004497528076,
      "logits/rejected": 1.7192614078521729,
      "logps/chosen": -96.27987670898438,
      "logps/rejected": -96.15126037597656,
      "loss": 0.5703,
      "rewards/accuracies": 0.8500000238418579,
      "rewards/chosen": -0.16748157143592834,
      "rewards/margins": 0.38242021203041077,
      "rewards/rejected": -0.5499017834663391,
      "step": 370
    },
    {
      "epoch": 13.755656108597286,
      "grad_norm": 2.783137321472168,
      "learning_rate": 1.2221145114853172e-06,
      "logits/chosen": 1.8700748682022095,
      "logits/rejected": 1.886313796043396,
      "logps/chosen": -82.22146606445312,
      "logps/rejected": -79.1581802368164,
      "loss": 0.5791,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.16777488589286804,
      "rewards/margins": 0.2746582329273224,
      "rewards/rejected": -0.44243311882019043,
      "step": 380
    },
    {
      "epoch": 14.117647058823529,
      "grad_norm": 1.137703776359558,
      "learning_rate": 1.085982811283654e-06,
      "logits/chosen": 1.7121073007583618,
      "logits/rejected": 1.74045729637146,
      "logps/chosen": -81.7417221069336,
      "logps/rejected": -82.1438980102539,
      "loss": 0.5792,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.21657264232635498,
      "rewards/margins": 0.23677067458629608,
      "rewards/rejected": -0.45334330201148987,
      "step": 390
    },
    {
      "epoch": 14.479638009049774,
      "grad_norm": 1.115824580192566,
      "learning_rate": 9.557576172663577e-07,
      "logits/chosen": 1.782957673072815,
      "logits/rejected": 1.740311861038208,
      "logps/chosen": -95.57044982910156,
      "logps/rejected": -87.47151184082031,
      "loss": 0.5648,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.13624358177185059,
      "rewards/margins": 0.3296685218811035,
      "rewards/rejected": -0.4659121632575989,
      "step": 400
    },
    {
      "epoch": 14.841628959276019,
      "grad_norm": 1.0241665840148926,
      "learning_rate": 8.319828944714508e-07,
      "logits/chosen": 1.8231852054595947,
      "logits/rejected": 1.7309335470199585,
      "logps/chosen": -90.45216369628906,
      "logps/rejected": -84.18362426757812,
      "loss": 0.5692,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.21606135368347168,
      "rewards/margins": 0.3016027808189392,
      "rewards/rejected": -0.5176641345024109,
      "step": 410
    },
    {
      "epoch": 15.203619909502262,
      "grad_norm": 1.0937505960464478,
      "learning_rate": 7.151756636052529e-07,
      "logits/chosen": 1.6469671726226807,
      "logits/rejected": 1.755052924156189,
      "logps/chosen": -69.20260620117188,
      "logps/rejected": -91.29935455322266,
      "loss": 0.5813,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.16654367744922638,
      "rewards/margins": 0.29352378845214844,
      "rewards/rejected": -0.4600674510002136,
      "step": 420
    },
    {
      "epoch": 15.565610859728507,
      "grad_norm": 1.189382553100586,
      "learning_rate": 6.058238413897052e-07,
      "logits/chosen": 1.7920663356781006,
      "logits/rejected": 1.767615556716919,
      "logps/chosen": -83.10585021972656,
      "logps/rejected": -87.55345916748047,
      "loss": 0.5717,
      "rewards/accuracies": 0.8500000238418579,
      "rewards/chosen": -0.18599426746368408,
      "rewards/margins": 0.30218419432640076,
      "rewards/rejected": -0.48817843198776245,
      "step": 430
    },
    {
      "epoch": 15.927601809954751,
      "grad_norm": 1.0084723234176636,
      "learning_rate": 5.043842024802675e-07,
      "logits/chosen": 1.6690113544464111,
      "logits/rejected": 1.7104164361953735,
      "logps/chosen": -77.92027282714844,
      "logps/rejected": -88.28632354736328,
      "loss": 0.5616,
      "rewards/accuracies": 0.824999988079071,
      "rewards/chosen": -0.1644773930311203,
      "rewards/margins": 0.31829944252967834,
      "rewards/rejected": -0.48277679085731506,
      "step": 440
    },
    {
      "epoch": 16.289592760180994,
      "grad_norm": 1.099560022354126,
      "learning_rate": 4.1128047146765936e-07,
      "logits/chosen": 1.777606725692749,
      "logits/rejected": 1.7520103454589844,
      "logps/chosen": -92.19166564941406,
      "logps/rejected": -74.17485809326172,
      "loss": 0.5653,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": -0.1612701714038849,
      "rewards/margins": 0.32633036375045776,
      "rewards/rejected": -0.4876004755496979,
      "step": 450
    },
    {
      "epoch": 16.65158371040724,
      "grad_norm": 1.0390541553497314,
      "learning_rate": 3.269015529333805e-07,
      "logits/chosen": 1.6575822830200195,
      "logits/rejected": 1.785485863685608,
      "logps/chosen": -81.9493179321289,
      "logps/rejected": -101.3016128540039,
      "loss": 0.5579,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": -0.20868897438049316,
      "rewards/margins": 0.3946114182472229,
      "rewards/rejected": -0.6033004522323608,
      "step": 460
    },
    {
      "epoch": 17.013574660633484,
      "grad_norm": 1.1404516696929932,
      "learning_rate": 2.515999069522676e-07,
      "logits/chosen": 1.6690752506256104,
      "logits/rejected": 1.6541496515274048,
      "logps/chosen": -88.44367980957031,
      "logps/rejected": -71.12745666503906,
      "loss": 0.5675,
      "rewards/accuracies": 0.824999988079071,
      "rewards/chosen": -0.19705724716186523,
      "rewards/margins": 0.26251187920570374,
      "rewards/rejected": -0.45956912636756897,
      "step": 470
    },
    {
      "epoch": 17.375565610859727,
      "grad_norm": 1.175963282585144,
      "learning_rate": 1.8569007682777417e-07,
      "logits/chosen": 1.6609585285186768,
      "logits/rejected": 1.6719191074371338,
      "logps/chosen": -94.07273864746094,
      "logps/rejected": -76.59893035888672,
      "loss": 0.5678,
      "rewards/accuracies": 0.7749999761581421,
      "rewards/chosen": -0.1740456521511078,
      "rewards/margins": 0.3117365837097168,
      "rewards/rejected": -0.4857822358608246,
      "step": 480
    },
    {
      "epoch": 17.737556561085974,
      "grad_norm": 1.2133598327636719,
      "learning_rate": 1.2944737520980883e-07,
      "logits/chosen": 1.7220537662506104,
      "logits/rejected": 1.7643390893936157,
      "logps/chosen": -83.6407699584961,
      "logps/rejected": -81.10734558105469,
      "loss": 0.5613,
      "rewards/accuracies": 0.862500011920929,
      "rewards/chosen": -0.13465282320976257,
      "rewards/margins": 0.4027688503265381,
      "rewards/rejected": -0.5374216437339783,
      "step": 490
    },
    {
      "epoch": 18.099547511312217,
      "grad_norm": 1.1120809316635132,
      "learning_rate": 8.310673408334496e-08,
      "logits/chosen": 1.5700793266296387,
      "logits/rejected": 1.5635223388671875,
      "logps/chosen": -86.44955444335938,
      "logps/rejected": -86.97091674804688,
      "loss": 0.5626,
      "rewards/accuracies": 0.8374999761581421,
      "rewards/chosen": -0.17045333981513977,
      "rewards/margins": 0.29403942823410034,
      "rewards/rejected": -0.46449270844459534,
      "step": 500
    },
    {
      "epoch": 18.46153846153846,
      "grad_norm": 1.177876353263855,
      "learning_rate": 4.6861723431538273e-08,
      "logits/chosen": 1.5987281799316406,
      "logits/rejected": 1.7516323328018188,
      "logps/chosen": -79.38053131103516,
      "logps/rejected": -97.10954284667969,
      "loss": 0.5634,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": -0.1326012909412384,
      "rewards/margins": 0.40437060594558716,
      "rewards/rejected": -0.5369717478752136,
      "step": 510
    },
    {
      "epoch": 18.823529411764707,
      "grad_norm": 1.2317314147949219,
      "learning_rate": 2.0863742672497244e-08,
      "logits/chosen": 1.71554434299469,
      "logits/rejected": 1.740006446838379,
      "logps/chosen": -83.19001007080078,
      "logps/rejected": -78.78974151611328,
      "loss": 0.573,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.23421506583690643,
      "rewards/margins": 0.25640812516212463,
      "rewards/rejected": -0.4906231760978699,
      "step": 520
    },
    {
      "epoch": 19.18552036199095,
      "grad_norm": 1.118938684463501,
      "learning_rate": 5.221388247169945e-09,
      "logits/chosen": 1.7030197381973267,
      "logits/rejected": 1.7396942377090454,
      "logps/chosen": -86.79795837402344,
      "logps/rejected": -97.96685791015625,
      "loss": 0.5571,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.17865614593029022,
      "rewards/margins": 0.3226766586303711,
      "rewards/rejected": -0.5013328790664673,
      "step": 530
    },
    {
      "epoch": 19.547511312217196,
      "grad_norm": 1.1492843627929688,
      "learning_rate": 0.0,
      "logits/chosen": 1.6407601833343506,
      "logits/rejected": 1.645991325378418,
      "logps/chosen": -74.029296875,
      "logps/rejected": -82.86942291259766,
      "loss": 0.5595,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": -0.228031724691391,
      "rewards/margins": 0.28408557176589966,
      "rewards/rejected": -0.5121172666549683,
      "step": 540
    },
    {
      "epoch": 19.547511312217196,
      "step": 540,
      "total_flos": 1.848581334679683e+18,
      "train_loss": 0.6237324387938888,
      "train_runtime": 3964.2678,
      "train_samples_per_second": 8.915,
      "train_steps_per_second": 0.136
    }
  ],
  "logging_steps": 10,
  "max_steps": 540,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.848581334679683e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}