{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984301412872841,
  "eval_steps": 100,
  "global_step": 477,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0020931449502878076,
      "grad_norm": 4.875121866371553,
      "learning_rate": 4.166666666666666e-09,
      "logits/chosen": -2.238138437271118,
      "logits/rejected": -2.554456949234009,
      "logps/chosen": -443.7523193359375,
      "logps/rejected": -491.8927001953125,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.020931449502878074,
      "grad_norm": 5.553929970393955,
      "learning_rate": 4.166666666666667e-08,
      "logits/chosen": -2.4126930236816406,
      "logits/rejected": -2.5005030632019043,
      "logps/chosen": -418.43328857421875,
      "logps/rejected": -405.0360107421875,
      "loss": 0.6929,
      "rewards/accuracies": 0.4375,
      "rewards/chosen": 0.0017023859545588493,
      "rewards/margins": 0.00048581857117824256,
      "rewards/rejected": 0.0012165673542767763,
      "step": 10
    },
    {
      "epoch": 0.04186289900575615,
      "grad_norm": 4.513029874801273,
      "learning_rate": 8.333333333333334e-08,
      "logits/chosen": -2.208683490753174,
      "logits/rejected": -2.485910415649414,
      "logps/chosen": -428.45208740234375,
      "logps/rejected": -408.13763427734375,
      "loss": 0.6933,
      "rewards/accuracies": 0.44999998807907104,
      "rewards/chosen": 0.0008482746779918671,
      "rewards/margins": -0.00037219192017801106,
      "rewards/rejected": 0.0012204666854813695,
      "step": 20
    },
    {
      "epoch": 0.06279434850863422,
      "grad_norm": 4.637552468831084,
      "learning_rate": 1.25e-07,
      "logits/chosen": -2.224863290786743,
      "logits/rejected": -2.4407901763916016,
      "logps/chosen": -398.6038818359375,
      "logps/rejected": -367.05999755859375,
      "loss": 0.6924,
      "rewards/accuracies": 0.5375000238418579,
      "rewards/chosen": -0.0041518621146678925,
      "rewards/margins": 0.0011339159682393074,
      "rewards/rejected": -0.005285778548568487,
      "step": 30
    },
    {
      "epoch": 0.0837257980115123,
      "grad_norm": 4.657136939144448,
      "learning_rate": 1.6666666666666668e-07,
      "logits/chosen": -2.3235936164855957,
      "logits/rejected": -2.4915928840637207,
      "logps/chosen": -372.97442626953125,
      "logps/rejected": -390.05841064453125,
      "loss": 0.6899,
      "rewards/accuracies": 0.643750011920929,
      "rewards/chosen": -0.019573217257857323,
      "rewards/margins": 0.007190874312072992,
      "rewards/rejected": -0.026764091104269028,
      "step": 40
    },
    {
      "epoch": 0.10465724751439037,
      "grad_norm": 4.947790369246717,
      "learning_rate": 1.9998927475076103e-07,
      "logits/chosen": -2.1541531085968018,
      "logits/rejected": -2.355862855911255,
      "logps/chosen": -408.7329406738281,
      "logps/rejected": -406.50347900390625,
      "loss": 0.6855,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.04146841913461685,
      "rewards/margins": 0.02013658545911312,
      "rewards/rejected": -0.061604999005794525,
      "step": 50
    },
    {
      "epoch": 0.12558869701726844,
      "grad_norm": 6.135445605235113,
      "learning_rate": 1.9961413253717213e-07,
      "logits/chosen": -2.120229482650757,
      "logits/rejected": -2.287370204925537,
      "logps/chosen": -376.740234375,
      "logps/rejected": -386.8778381347656,
      "loss": 0.678,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.08536554872989655,
      "rewards/margins": 0.03690432757139206,
      "rewards/rejected": -0.12226986885070801,
      "step": 60
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 5.2300665585071835,
      "learning_rate": 1.9870502626379125e-07,
      "logits/chosen": -2.208547830581665,
      "logits/rejected": -2.316659927368164,
      "logps/chosen": -425.2916564941406,
      "logps/rejected": -429.31463623046875,
      "loss": 0.6673,
      "rewards/accuracies": 0.6312500238418579,
      "rewards/chosen": -0.14128030836582184,
      "rewards/margins": 0.05471445247530937,
      "rewards/rejected": -0.1959947645664215,
      "step": 70
    },
    {
      "epoch": 0.1674515960230246,
      "grad_norm": 6.361729619349137,
      "learning_rate": 1.9726682903510838e-07,
      "logits/chosen": -1.8886642456054688,
      "logits/rejected": -2.2390127182006836,
      "logps/chosen": -470.6441955566406,
      "logps/rejected": -419.4126892089844,
      "loss": 0.6583,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.2689892053604126,
      "rewards/margins": 0.07578183710575104,
      "rewards/rejected": -0.34477105736732483,
      "step": 80
    },
    {
      "epoch": 0.18838304552590268,
      "grad_norm": 7.250967252041406,
      "learning_rate": 1.9530725005474194e-07,
      "logits/chosen": -2.3355867862701416,
      "logits/rejected": -2.404792070388794,
      "logps/chosen": -411.76806640625,
      "logps/rejected": -441.7333068847656,
      "loss": 0.6355,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -0.42172950506210327,
      "rewards/margins": 0.12971071898937225,
      "rewards/rejected": -0.5514402985572815,
      "step": 90
    },
    {
      "epoch": 0.20931449502878074,
      "grad_norm": 7.1454110672964335,
      "learning_rate": 1.9283679330160724e-07,
      "logits/chosen": -2.2639448642730713,
      "logits/rejected": -2.5537800788879395,
      "logps/chosen": -477.0587463378906,
      "logps/rejected": -489.705810546875,
      "loss": 0.6351,
      "rewards/accuracies": 0.643750011920929,
      "rewards/chosen": -0.6349204778671265,
      "rewards/margins": 0.18245458602905273,
      "rewards/rejected": -0.8173751831054688,
      "step": 100
    },
    {
      "epoch": 0.20931449502878074,
      "eval_logits/chosen": -2.2922377586364746,
      "eval_logits/rejected": -2.4565351009368896,
      "eval_logps/chosen": -472.2982177734375,
      "eval_logps/rejected": -487.7696533203125,
      "eval_loss": 0.6359348893165588,
      "eval_rewards/accuracies": 0.6746031641960144,
      "eval_rewards/chosen": -0.675361156463623,
      "eval_rewards/margins": 0.2425757199525833,
      "eval_rewards/rejected": -0.9179368615150452,
      "eval_runtime": 88.9262,
      "eval_samples_per_second": 22.491,
      "eval_steps_per_second": 0.708,
      "step": 100
    },
    {
      "epoch": 0.2302459445316588,
      "grad_norm": 9.360622478279684,
      "learning_rate": 1.898687012251826e-07,
      "logits/chosen": -2.217447280883789,
      "logits/rejected": -2.3863320350646973,
      "logps/chosen": -481.96990966796875,
      "logps/rejected": -499.48345947265625,
      "loss": 0.6311,
      "rewards/accuracies": 0.6312500238418579,
      "rewards/chosen": -0.7452836036682129,
      "rewards/margins": 0.209157794713974,
      "rewards/rejected": -0.9544414281845093,
      "step": 110
    },
    {
      "epoch": 0.25117739403453687,
      "grad_norm": 7.953755036427896,
      "learning_rate": 1.8641888376168482e-07,
      "logits/chosen": -2.2092318534851074,
      "logits/rejected": -2.2929816246032715,
      "logps/chosen": -454.405517578125,
      "logps/rejected": -497.1351623535156,
      "loss": 0.6209,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.7448408007621765,
      "rewards/margins": 0.29463425278663635,
      "rewards/rejected": -1.0394752025604248,
      "step": 120
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 8.821105331401093,
      "learning_rate": 1.8250583305165094e-07,
      "logits/chosen": -2.2061495780944824,
      "logits/rejected": -2.3711869716644287,
      "logps/chosen": -472.7056579589844,
      "logps/rejected": -487.33880615234375,
      "loss": 0.6204,
      "rewards/accuracies": 0.643750011920929,
      "rewards/chosen": -0.7287603616714478,
      "rewards/margins": 0.2083979845046997,
      "rewards/rejected": -0.9371584057807922,
      "step": 130
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 9.167325969849378,
      "learning_rate": 1.78150524316067e-07,
      "logits/chosen": -2.2468433380126953,
      "logits/rejected": -2.466036319732666,
      "logps/chosen": -501.697021484375,
      "logps/rejected": -497.5772399902344,
      "loss": 0.6195,
      "rewards/accuracies": 0.637499988079071,
      "rewards/chosen": -0.7497612237930298,
      "rewards/margins": 0.30543631315231323,
      "rewards/rejected": -1.0551974773406982,
      "step": 140
    },
    {
      "epoch": 0.3139717425431711,
      "grad_norm": 10.828055616866019,
      "learning_rate": 1.7337630342238038e-07,
      "logits/chosen": -2.163837432861328,
      "logits/rejected": -2.328864574432373,
      "logps/chosen": -474.3462829589844,
      "logps/rejected": -480.0904846191406,
      "loss": 0.621,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": -0.7979869246482849,
      "rewards/margins": 0.22926858067512512,
      "rewards/rejected": -1.0272555351257324,
      "step": 150
    },
    {
      "epoch": 0.3349031920460492,
      "grad_norm": 9.907119624068729,
      "learning_rate": 1.682087617430782e-07,
      "logits/chosen": -2.1256282329559326,
      "logits/rejected": -2.4207208156585693,
      "logps/chosen": -476.00933837890625,
      "logps/rejected": -491.25799560546875,
      "loss": 0.6148,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.8471584320068359,
      "rewards/margins": 0.2906045913696289,
      "rewards/rejected": -1.1377630233764648,
      "step": 160
    },
    {
      "epoch": 0.35583464154892724,
      "grad_norm": 10.130673374633192,
      "learning_rate": 1.6267559897763025e-07,
      "logits/chosen": -2.240748405456543,
      "logits/rejected": -2.3730461597442627,
      "logps/chosen": -466.5884704589844,
      "logps/rejected": -470.2240295410156,
      "loss": 0.6136,
      "rewards/accuracies": 0.6812499761581421,
      "rewards/chosen": -0.8948806524276733,
      "rewards/margins": 0.24292059242725372,
      "rewards/rejected": -1.137801170349121,
      "step": 170
    },
    {
      "epoch": 0.37676609105180536,
      "grad_norm": 12.664244024162585,
      "learning_rate": 1.5680647467311557e-07,
      "logits/chosen": -2.3886361122131348,
      "logits/rejected": -2.48551344871521,
      "logps/chosen": -466.68115234375,
      "logps/rejected": -481.260498046875,
      "loss": 0.589,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.8065212965011597,
      "rewards/margins": 0.28530603647232056,
      "rewards/rejected": -1.091827392578125,
      "step": 180
    },
    {
      "epoch": 0.3976975405546834,
      "grad_norm": 15.413041204374277,
      "learning_rate": 1.506328492394303e-07,
      "logits/chosen": -2.425926685333252,
      "logits/rejected": -2.436657190322876,
      "logps/chosen": -480.2686462402344,
      "logps/rejected": -514.1541137695312,
      "loss": 0.6247,
      "rewards/accuracies": 0.6312500238418579,
      "rewards/chosen": -1.0268957614898682,
      "rewards/margins": 0.26106053590774536,
      "rewards/rejected": -1.2879562377929688,
      "step": 190
    },
    {
      "epoch": 0.4186289900575615,
      "grad_norm": 16.30024056431674,
      "learning_rate": 1.4418781531128634e-07,
      "logits/chosen": -2.3286993503570557,
      "logits/rejected": -2.387202024459839,
      "logps/chosen": -454.547119140625,
      "logps/rejected": -511.773681640625,
      "loss": 0.6101,
      "rewards/accuracies": 0.6937500238418579,
      "rewards/chosen": -0.8713696599006653,
      "rewards/margins": 0.2568342685699463,
      "rewards/rejected": -1.1282037496566772,
      "step": 200
    },
    {
      "epoch": 0.4186289900575615,
      "eval_logits/chosen": -2.293304443359375,
      "eval_logits/rejected": -2.447746753692627,
      "eval_logps/chosen": -484.72442626953125,
      "eval_logps/rejected": -515.6393432617188,
      "eval_loss": 0.5989560484886169,
      "eval_rewards/accuracies": 0.7142857313156128,
      "eval_rewards/chosen": -0.7996230125427246,
      "eval_rewards/margins": 0.39701077342033386,
      "eval_rewards/rejected": -1.1966338157653809,
      "eval_runtime": 88.7991,
      "eval_samples_per_second": 22.523,
      "eval_steps_per_second": 0.709,
      "step": 200
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 12.590959189684769,
      "learning_rate": 1.375059203609562e-07,
      "logits/chosen": -2.251105785369873,
      "logits/rejected": -2.49545955657959,
      "logps/chosen": -514.7989501953125,
      "logps/rejected": -508.8777770996094,
      "loss": 0.6036,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.9383622407913208,
      "rewards/margins": 0.3089975416660309,
      "rewards/rejected": -1.2473597526550293,
      "step": 210
    },
    {
      "epoch": 0.4604918890633176,
      "grad_norm": 32.27211919256004,
      "learning_rate": 1.306229815126159e-07,
      "logits/chosen": -2.374002456665039,
      "logits/rejected": -2.5104002952575684,
      "logps/chosen": -453.17889404296875,
      "logps/rejected": -502.31829833984375,
      "loss": 0.5905,
      "rewards/accuracies": 0.668749988079071,
      "rewards/chosen": -1.0016330480575562,
      "rewards/margins": 0.3531147539615631,
      "rewards/rejected": -1.3547478914260864,
      "step": 220
    },
    {
      "epoch": 0.48142333856619574,
      "grad_norm": 11.074374701972996,
      "learning_rate": 1.2357589355094274e-07,
      "logits/chosen": -2.240893602371216,
      "logits/rejected": -2.4365756511688232,
      "logps/chosen": -464.9483947753906,
      "logps/rejected": -497.55950927734375,
      "loss": 0.6032,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -0.8673335909843445,
      "rewards/margins": 0.4288042187690735,
      "rewards/rejected": -1.2961379289627075,
      "step": 230
    },
    {
      "epoch": 0.5023547880690737,
      "grad_norm": 13.608161796310325,
      "learning_rate": 1.1640243115310217e-07,
      "logits/chosen": -2.263231039047241,
      "logits/rejected": -2.374429225921631,
      "logps/chosen": -483.5979919433594,
      "logps/rejected": -511.7247009277344,
      "loss": 0.5829,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -0.8133866190910339,
      "rewards/margins": 0.35704511404037476,
      "rewards/rejected": -1.1704318523406982,
      "step": 240
    },
    {
      "epoch": 0.5232862375719518,
      "grad_norm": 14.904992006409358,
      "learning_rate": 1.0914104640422679e-07,
      "logits/chosen": -2.312152862548828,
      "logits/rejected": -2.504575490951538,
      "logps/chosen": -487.4195861816406,
      "logps/rejected": -509.62213134765625,
      "loss": 0.5914,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": -0.9289507865905762,
      "rewards/margins": 0.4651150703430176,
      "rewards/rejected": -1.3940656185150146,
      "step": 250
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 32.859126344847056,
      "learning_rate": 1.0183066268176774e-07,
      "logits/chosen": -2.452216863632202,
      "logits/rejected": -2.5787224769592285,
      "logps/chosen": -454.101318359375,
      "logps/rejected": -491.07708740234375,
      "loss": 0.5958,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.8231406211853027,
      "rewards/margins": 0.37211668491363525,
      "rewards/rejected": -1.1952574253082275,
      "step": 260
    },
    {
      "epoch": 0.565149136577708,
      "grad_norm": 16.410575278967542,
      "learning_rate": 9.451046601356724e-08,
      "logits/chosen": -2.4211385250091553,
      "logits/rejected": -2.5718777179718018,
      "logps/chosen": -482.42889404296875,
      "logps/rejected": -517.08447265625,
      "loss": 0.5968,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -0.7412260174751282,
      "rewards/margins": 0.46059027314186096,
      "rewards/rejected": -1.201816439628601,
      "step": 270
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 14.64481409505789,
      "learning_rate": 8.721969502803953e-08,
      "logits/chosen": -2.414080858230591,
      "logits/rejected": -2.641306161880493,
      "logps/chosen": -471.8504943847656,
      "logps/rejected": -492.3824157714844,
      "loss": 0.6088,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": -0.9498642086982727,
      "rewards/margins": 0.3709770143032074,
      "rewards/rejected": -1.3208411931991577,
      "step": 280
    },
    {
      "epoch": 0.6070120355834642,
      "grad_norm": 21.87484189841818,
      "learning_rate": 7.999743062239557e-08,
      "logits/chosen": -2.5216970443725586,
      "logits/rejected": -2.5266430377960205,
      "logps/chosen": -452.1351623535156,
      "logps/rejected": -507.50408935546875,
      "loss": 0.5975,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": -0.9256707429885864,
      "rewards/margins": 0.38739797472953796,
      "rewards/rejected": -1.3130687475204468,
      "step": 290
    },
    {
      "epoch": 0.6279434850863422,
      "grad_norm": 13.23460942074812,
      "learning_rate": 7.28823864763583e-08,
      "logits/chosen": -2.3628604412078857,
      "logits/rejected": -2.5071964263916016,
      "logps/chosen": -530.2737426757812,
      "logps/rejected": -534.9356689453125,
      "loss": 0.5738,
      "rewards/accuracies": 0.706250011920929,
      "rewards/chosen": -0.9033306241035461,
      "rewards/margins": 0.409872442483902,
      "rewards/rejected": -1.313202977180481,
      "step": 300
    },
    {
      "epoch": 0.6279434850863422,
      "eval_logits/chosen": -2.3505780696868896,
      "eval_logits/rejected": -2.500311851501465,
      "eval_logps/chosen": -511.9820861816406,
      "eval_logps/rejected": -562.04541015625,
      "eval_loss": 0.5819065570831299,
      "eval_rewards/accuracies": 0.7142857313156128,
      "eval_rewards/chosen": -1.0721999406814575,
      "eval_rewards/margins": 0.5884942412376404,
      "eval_rewards/rejected": -1.6606942415237427,
      "eval_runtime": 88.8035,
      "eval_samples_per_second": 22.522,
      "eval_steps_per_second": 0.709,
      "step": 300
    },
    {
      "epoch": 0.6488749345892203,
      "grad_norm": 23.240653261962176,
      "learning_rate": 6.591270153428288e-08,
      "logits/chosen": -2.3066353797912598,
      "logits/rejected": -2.4188685417175293,
      "logps/chosen": -530.1605224609375,
      "logps/rejected": -555.5882568359375,
      "loss": 0.5816,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": -1.0851608514785767,
      "rewards/margins": 0.6294665932655334,
      "rewards/rejected": -1.7146275043487549,
      "step": 310
    },
    {
      "epoch": 0.6698063840920984,
      "grad_norm": 12.35925417664361,
      "learning_rate": 5.912573556804452e-08,
      "logits/chosen": -2.4511845111846924,
      "logits/rejected": -2.5960700511932373,
      "logps/chosen": -462.8910217285156,
      "logps/rejected": -492.77459716796875,
      "loss": 0.5721,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -0.9141901135444641,
      "rewards/margins": 0.49542441964149475,
      "rewards/rejected": -1.4096145629882812,
      "step": 320
    },
    {
      "epoch": 0.6907378335949764,
      "grad_norm": 19.635922794228048,
      "learning_rate": 5.255786891654399e-08,
      "logits/chosen": -2.2881722450256348,
      "logits/rejected": -2.3245983123779297,
      "logps/chosen": -490.61956787109375,
      "logps/rejected": -528.5936279296875,
      "loss": 0.5831,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -1.0118191242218018,
      "rewards/margins": 0.3562072217464447,
      "rewards/rejected": -1.3680263757705688,
      "step": 330
    },
    {
      "epoch": 0.7116692830978545,
      "grad_norm": 34.0341920873177,
      "learning_rate": 4.624430747529102e-08,
      "logits/chosen": -2.2541534900665283,
      "logits/rejected": -2.3677923679351807,
      "logps/chosen": -520.711181640625,
      "logps/rejected": -555.8665771484375,
      "loss": 0.5771,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -1.1455854177474976,
      "rewards/margins": 0.44834479689598083,
      "rewards/rejected": -1.5939301252365112,
      "step": 340
    },
    {
      "epoch": 0.7326007326007326,
      "grad_norm": 20.184086200131315,
      "learning_rate": 4.0218893981385925e-08,
      "logits/chosen": -2.336240291595459,
      "logits/rejected": -2.5228190422058105,
      "logps/chosen": -490.032470703125,
      "logps/rejected": -514.3966064453125,
      "loss": 0.5772,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": -1.1221544742584229,
      "rewards/margins": 0.41546517610549927,
      "rewards/rejected": -1.5376195907592773,
      "step": 350
    },
    {
      "epoch": 0.7535321821036107,
      "grad_norm": 14.840705395348046,
      "learning_rate": 3.45139266054715e-08,
      "logits/chosen": -2.3588707447052,
      "logits/rejected": -2.5286855697631836,
      "logps/chosen": -525.8394775390625,
      "logps/rejected": -543.2139892578125,
      "loss": 0.5961,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": -0.9700316190719604,
      "rewards/margins": 0.42892080545425415,
      "rewards/rejected": -1.3989523649215698,
      "step": 360
    },
    {
      "epoch": 0.7744636316064888,
      "grad_norm": 12.56992511385935,
      "learning_rate": 2.9159985823062993e-08,
      "logits/chosen": -2.4362387657165527,
      "logits/rejected": -2.588212251663208,
      "logps/chosen": -469.63018798828125,
      "logps/rejected": -491.34185791015625,
      "loss": 0.5787,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -0.9046362638473511,
      "rewards/margins": 0.42833614349365234,
      "rewards/rejected": -1.332972526550293,
      "step": 370
    },
    {
      "epoch": 0.7953950811093669,
      "grad_norm": 14.216122099186137,
      "learning_rate": 2.4185770493280577e-08,
      "logits/chosen": -2.4785826206207275,
      "logits/rejected": -2.5475876331329346,
      "logps/chosen": -463.3335876464844,
      "logps/rejected": -562.8516235351562,
      "loss": 0.5816,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.0568846464157104,
      "rewards/margins": 0.6403349041938782,
      "rewards/rejected": -1.6972196102142334,
      "step": 380
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 17.166403382209694,
      "learning_rate": 1.9617944023656108e-08,
      "logits/chosen": -2.3412299156188965,
      "logits/rejected": -2.431159257888794,
      "logps/chosen": -569.6896362304688,
      "logps/rejected": -604.4752197265625,
      "loss": 0.5647,
      "rewards/accuracies": 0.71875,
      "rewards/chosen": -1.135259985923767,
      "rewards/margins": 0.5612015724182129,
      "rewards/rejected": -1.6964616775512695,
      "step": 390
    },
    {
      "epoch": 0.837257980115123,
      "grad_norm": 25.5326876410102,
      "learning_rate": 1.5480991445620538e-08,
      "logits/chosen": -2.438910961151123,
      "logits/rejected": -2.621582269668579,
      "logps/chosen": -477.71551513671875,
      "logps/rejected": -516.8345336914062,
      "loss": 0.5808,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -1.0198707580566406,
      "rewards/margins": 0.485908567905426,
      "rewards/rejected": -1.5057791471481323,
      "step": 400
    },
    {
      "epoch": 0.837257980115123,
      "eval_logits/chosen": -2.4454309940338135,
      "eval_logits/rejected": -2.60603404045105,
      "eval_logps/chosen": -509.0269470214844,
      "eval_logps/rejected": -557.9309692382812,
      "eval_loss": 0.5776250958442688,
      "eval_rewards/accuracies": 0.7063491940498352,
      "eval_rewards/chosen": -1.042648196220398,
      "eval_rewards/margins": 0.5769018530845642,
      "eval_rewards/rejected": -1.619550108909607,
      "eval_runtime": 88.8844,
      "eval_samples_per_second": 22.501,
      "eval_steps_per_second": 0.709,
      "step": 400
    },
    {
      "epoch": 0.858189429618001,
      "grad_norm": 12.2363803809367,
      "learning_rate": 1.1797088166794e-08,
      "logits/chosen": -2.327822208404541,
      "logits/rejected": -2.539658308029175,
      "logps/chosen": -523.35693359375,
      "logps/rejected": -556.1873168945312,
      "loss": 0.5837,
      "rewards/accuracies": 0.7124999761581421,
      "rewards/chosen": -1.0230482816696167,
      "rewards/margins": 0.5963117480278015,
      "rewards/rejected": -1.6193599700927734,
      "step": 410
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 17.1630701293683,
      "learning_rate": 8.585981103608341e-09,
      "logits/chosen": -2.3502843379974365,
      "logits/rejected": -2.5074477195739746,
      "logps/chosen": -481.4237365722656,
      "logps/rejected": -559.5806884765625,
      "loss": 0.567,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.0329768657684326,
      "rewards/margins": 0.5681900978088379,
      "rewards/rejected": -1.6011669635772705,
      "step": 420
    },
    {
      "epoch": 0.9000523286237572,
      "grad_norm": 16.184790708379772,
      "learning_rate": 5.864882831430273e-09,
      "logits/chosen": -2.352280378341675,
      "logits/rejected": -2.436026096343994,
      "logps/chosen": -513.5238647460938,
      "logps/rejected": -551.8958129882812,
      "loss": 0.5755,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": -1.0582252740859985,
      "rewards/margins": 0.5332168340682983,
      "rewards/rejected": -1.5914418697357178,
      "step": 430
    },
    {
      "epoch": 0.9209837781266352,
      "grad_norm": 17.526839475687186,
      "learning_rate": 3.6483793195745682e-09,
      "logits/chosen": -2.3311455249786377,
      "logits/rejected": -2.440988063812256,
      "logps/chosen": -482.4281311035156,
      "logps/rejected": -498.60345458984375,
      "loss": 0.5787,
      "rewards/accuracies": 0.625,
      "rewards/chosen": -1.0323375463485718,
      "rewards/margins": 0.4054194390773773,
      "rewards/rejected": -1.4377570152282715,
      "step": 440
    },
    {
      "epoch": 0.9419152276295133,
      "grad_norm": 14.705602904039639,
      "learning_rate": 1.9483517457776433e-09,
      "logits/chosen": -2.2350025177001953,
      "logits/rejected": -2.3830924034118652,
      "logps/chosen": -490.513427734375,
      "logps/rejected": -551.2727661132812,
      "loss": 0.579,
      "rewards/accuracies": 0.75,
      "rewards/chosen": -1.0369895696640015,
      "rewards/margins": 0.5606644153594971,
      "rewards/rejected": -1.597654104232788,
      "step": 450
    },
    {
      "epoch": 0.9628466771323915,
      "grad_norm": 15.228089724513376,
      "learning_rate": 7.739128092312918e-10,
      "logits/chosen": -2.281054973602295,
      "logits/rejected": -2.4768524169921875,
      "logps/chosen": -496.84814453125,
      "logps/rejected": -510.46258544921875,
      "loss": 0.579,
      "rewards/accuracies": 0.731249988079071,
      "rewards/chosen": -1.0984748601913452,
      "rewards/margins": 0.47915878891944885,
      "rewards/rejected": -1.5776336193084717,
      "step": 460
    },
    {
      "epoch": 0.9837781266352695,
      "grad_norm": 17.607957497609636,
      "learning_rate": 1.313578835593465e-10,
      "logits/chosen": -2.3311634063720703,
      "logits/rejected": -2.4415996074676514,
      "logps/chosen": -519.3492431640625,
      "logps/rejected": -541.9041137695312,
      "loss": 0.5694,
      "rewards/accuracies": 0.643750011920929,
      "rewards/chosen": -1.0364539623260498,
      "rewards/margins": 0.33034905791282654,
      "rewards/rejected": -1.3668031692504883,
      "step": 470
    },
    {
      "epoch": 0.9984301412872841,
      "step": 477,
      "total_flos": 0.0,
      "train_loss": 0.6095632167232361,
      "train_runtime": 6900.3625,
      "train_samples_per_second": 8.86,
      "train_steps_per_second": 0.069
    }
  ],
  "logging_steps": 10,
  "max_steps": 477,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}