File size: 28,769 Bytes
e031c07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 375,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "learning_rate": 1.3157894736842104e-08,
      "logits/chosen": -0.5324900150299072,
      "logits/rejected": -0.5734304189682007,
      "logps/chosen": -543.2296752929688,
      "logps/rejected": -325.48358154296875,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/mix_margin": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.08,
      "learning_rate": 1.3157894736842104e-07,
      "logits/chosen": -0.48575523495674133,
      "logits/rejected": -0.5831019878387451,
      "logps/chosen": -334.6309509277344,
      "logps/rejected": -278.2859802246094,
      "loss": 0.6997,
      "rewards/accuracies": 0.4027777910232544,
      "rewards/chosen": -0.012853524647653103,
      "rewards/confidence": -0.0746772438287735,
      "rewards/confidence_mean_diff": 0.0746772438287735,
      "rewards/confidence_moving_diff": 0.0021637948229908943,
      "rewards/margins": -0.007044664584100246,
      "rewards/mix_margin": -0.007044283673167229,
      "rewards/real_percentage": 14.129032135009766,
      "rewards/rejected": -0.005808859597891569,
      "step": 10
    },
    {
      "epoch": 0.16,
      "learning_rate": 2.631578947368421e-07,
      "logits/chosen": -0.45206984877586365,
      "logits/rejected": -0.4436320662498474,
      "logps/chosen": -378.46478271484375,
      "logps/rejected": -291.097412109375,
      "loss": 0.687,
      "rewards/accuracies": 0.48750001192092896,
      "rewards/chosen": 0.019860025495290756,
      "rewards/confidence": -0.07699747383594513,
      "rewards/confidence_mean_diff": 0.07699747383594513,
      "rewards/confidence_moving_diff": -6.244657561182976e-05,
      "rewards/margins": 0.010339610278606415,
      "rewards/mix_margin": 0.010339389555156231,
      "rewards/real_percentage": 11.975000381469727,
      "rewards/rejected": 0.009520411491394043,
      "step": 20
    },
    {
      "epoch": 0.24,
      "learning_rate": 3.9473684210526315e-07,
      "logits/chosen": -0.48425692319869995,
      "logits/rejected": -0.5238968133926392,
      "logps/chosen": -363.4825439453125,
      "logps/rejected": -330.880859375,
      "loss": 0.6746,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.07081757485866547,
      "rewards/confidence": -0.0583333782851696,
      "rewards/confidence_mean_diff": 0.0583333782851696,
      "rewards/confidence_moving_diff": 0.00017116544768214226,
      "rewards/margins": 0.04097529500722885,
      "rewards/mix_margin": 0.04097532853484154,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": 0.029842281714081764,
      "step": 30
    },
    {
      "epoch": 0.32,
      "learning_rate": 4.999565492409831e-07,
      "logits/chosen": -0.47305864095687866,
      "logits/rejected": -0.582284152507782,
      "logps/chosen": -335.81610107421875,
      "logps/rejected": -256.0378723144531,
      "loss": 0.6474,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.16260090470314026,
      "rewards/confidence": -0.11778082698583603,
      "rewards/confidence_mean_diff": 0.11778082698583603,
      "rewards/confidence_moving_diff": 0.0008547043544240296,
      "rewards/margins": 0.05970517918467522,
      "rewards/mix_margin": 0.05970512703061104,
      "rewards/real_percentage": 12.074999809265137,
      "rewards/rejected": 0.10289572179317474,
      "step": 40
    },
    {
      "epoch": 0.4,
      "learning_rate": 4.984373579809777e-07,
      "logits/chosen": -0.5092490911483765,
      "logits/rejected": -0.5690798163414001,
      "logps/chosen": -329.53302001953125,
      "logps/rejected": -295.02294921875,
      "loss": 0.5866,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.3996647000312805,
      "rewards/confidence": -0.14608541131019592,
      "rewards/confidence_mean_diff": 0.14608541131019592,
      "rewards/confidence_moving_diff": -0.00040556181920692325,
      "rewards/margins": 0.20342092216014862,
      "rewards/mix_margin": 0.2034207135438919,
      "rewards/real_percentage": 12.0,
      "rewards/rejected": 0.1962437778711319,
      "step": 50
    },
    {
      "epoch": 0.48,
      "learning_rate": 4.947607089353757e-07,
      "logits/chosen": -0.4855988025665283,
      "logits/rejected": -0.5692173838615417,
      "logps/chosen": -365.7965393066406,
      "logps/rejected": -290.7939147949219,
      "loss": 0.6262,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.6372241973876953,
      "rewards/confidence": -0.27295011281967163,
      "rewards/confidence_mean_diff": 0.27295011281967163,
      "rewards/confidence_moving_diff": -8.605476614320651e-05,
      "rewards/margins": 0.25993281602859497,
      "rewards/mix_margin": 0.2599331736564636,
      "rewards/real_percentage": 12.100000381469727,
      "rewards/rejected": 0.37729138135910034,
      "step": 60
    },
    {
      "epoch": 0.56,
      "learning_rate": 4.889585305354435e-07,
      "logits/chosen": -0.511881411075592,
      "logits/rejected": -0.5559085607528687,
      "logps/chosen": -374.42559814453125,
      "logps/rejected": -350.49285888671875,
      "loss": 0.5776,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.938610851764679,
      "rewards/confidence": -0.17002172768115997,
      "rewards/confidence_mean_diff": 0.17002172768115997,
      "rewards/confidence_moving_diff": 0.006227460689842701,
      "rewards/margins": 0.39301368594169617,
      "rewards/mix_margin": 0.39301276206970215,
      "rewards/real_percentage": 12.199999809265137,
      "rewards/rejected": 0.5455971360206604,
      "step": 70
    },
    {
      "epoch": 0.64,
      "learning_rate": 4.810812095469401e-07,
      "logits/chosen": -0.4341855049133301,
      "logits/rejected": -0.4922330975532532,
      "logps/chosen": -382.85986328125,
      "logps/rejected": -316.0860595703125,
      "loss": 0.4931,
      "rewards/accuracies": 0.8374999761581421,
      "rewards/chosen": 1.085030198097229,
      "rewards/confidence": -0.36220604181289673,
      "rewards/confidence_mean_diff": 0.36220604181289673,
      "rewards/confidence_moving_diff": -0.004994163755327463,
      "rewards/margins": 0.727383017539978,
      "rewards/mix_margin": 0.7273828387260437,
      "rewards/real_percentage": 11.899999618530273,
      "rewards/rejected": 0.3576471507549286,
      "step": 80
    },
    {
      "epoch": 0.72,
      "learning_rate": 4.711971535058109e-07,
      "logits/chosen": -0.4119408130645752,
      "logits/rejected": -0.5046309232711792,
      "logps/chosen": -335.9080810546875,
      "logps/rejected": -228.6096649169922,
      "loss": 0.5641,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.9102222323417664,
      "rewards/confidence": -0.18118831515312195,
      "rewards/confidence_mean_diff": 0.18118831515312195,
      "rewards/confidence_moving_diff": 0.0009786130394786596,
      "rewards/margins": 0.6580663919448853,
      "rewards/mix_margin": 0.6580665707588196,
      "rewards/real_percentage": 12.074999809265137,
      "rewards/rejected": 0.2521558403968811,
      "step": 90
    },
    {
      "epoch": 0.8,
      "learning_rate": 4.593921966594997e-07,
      "logits/chosen": -0.4459192752838135,
      "logits/rejected": -0.4894910454750061,
      "logps/chosen": -371.5416259765625,
      "logps/rejected": -310.64239501953125,
      "loss": 0.5469,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 1.2468544244766235,
      "rewards/confidence": -0.45958179235458374,
      "rewards/confidence_mean_diff": 0.45958179235458374,
      "rewards/confidence_moving_diff": 0.004259251989424229,
      "rewards/margins": 0.7577625513076782,
      "rewards/mix_margin": 0.757762610912323,
      "rewards/real_percentage": 12.100000381469727,
      "rewards/rejected": 0.4890917241573334,
      "step": 100
    },
    {
      "epoch": 0.88,
      "learning_rate": 4.457688545727496e-07,
      "logits/chosen": -0.5113216042518616,
      "logits/rejected": -0.5288140177726746,
      "logps/chosen": -352.3919982910156,
      "logps/rejected": -276.9599304199219,
      "loss": 0.5222,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 1.0378813743591309,
      "rewards/confidence": -0.29791101813316345,
      "rewards/confidence_mean_diff": 0.29791101813316345,
      "rewards/confidence_moving_diff": -0.0015016455436125398,
      "rewards/margins": 0.7134403586387634,
      "rewards/mix_margin": 0.7134405374526978,
      "rewards/real_percentage": 11.949999809265137,
      "rewards/rejected": 0.3244408965110779,
      "step": 110
    },
    {
      "epoch": 0.96,
      "learning_rate": 4.3044543387098026e-07,
      "logits/chosen": -0.5033639669418335,
      "logits/rejected": -0.5167360901832581,
      "logps/chosen": -323.29119873046875,
      "logps/rejected": -265.58221435546875,
      "loss": 0.5039,
      "rewards/accuracies": 0.7875000238418579,
      "rewards/chosen": 1.0098955631256104,
      "rewards/confidence": -0.42422398924827576,
      "rewards/confidence_mean_diff": 0.42422398924827576,
      "rewards/confidence_moving_diff": 0.0008432863396592438,
      "rewards/margins": 0.9732195138931274,
      "rewards/mix_margin": 0.9732197523117065,
      "rewards/real_percentage": 11.949999809265137,
      "rewards/rejected": 0.036676160991191864,
      "step": 120
    },
    {
      "epoch": 1.04,
      "learning_rate": 4.1355500485232917e-07,
      "logits/chosen": -0.4795234203338623,
      "logits/rejected": -0.5551981329917908,
      "logps/chosen": -367.8242492675781,
      "logps/rejected": -284.45062255859375,
      "loss": 0.381,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 1.1803219318389893,
      "rewards/confidence": -0.1641966998577118,
      "rewards/confidence_mean_diff": 0.1641966998577118,
      "rewards/confidence_moving_diff": -0.009292250499129295,
      "rewards/margins": 1.2318060398101807,
      "rewards/mix_margin": 1.2318063974380493,
      "rewards/real_percentage": 11.875,
      "rewards/rejected": -0.051483988761901855,
      "step": 130
    },
    {
      "epoch": 1.12,
      "learning_rate": 3.9524424589030863e-07,
      "logits/chosen": -0.47544917464256287,
      "logits/rejected": -0.45598697662353516,
      "logps/chosen": -368.21197509765625,
      "logps/rejected": -327.8885803222656,
      "loss": 0.2637,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.4148461818695068,
      "rewards/confidence": 0.12870559096336365,
      "rewards/confidence_mean_diff": -0.12870559096336365,
      "rewards/confidence_moving_diff": -4.9034319090424106e-05,
      "rewards/margins": 1.949605941772461,
      "rewards/mix_margin": 1.9496057033538818,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": -0.5347597599029541,
      "step": 140
    },
    {
      "epoch": 1.2,
      "learning_rate": 3.7567216966241556e-07,
      "logits/chosen": -0.5132138133049011,
      "logits/rejected": -0.5720852613449097,
      "logps/chosen": -349.05706787109375,
      "logps/rejected": -309.68194580078125,
      "loss": 0.2546,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.3815641403198242,
      "rewards/confidence": 0.15192195773124695,
      "rewards/confidence_mean_diff": -0.15192195773124695,
      "rewards/confidence_moving_diff": -0.004711526446044445,
      "rewards/margins": 1.766579031944275,
      "rewards/mix_margin": 1.766579031944275,
      "rewards/real_percentage": 11.925000190734863,
      "rewards/rejected": -0.3850148320198059,
      "step": 150
    },
    {
      "epoch": 1.28,
      "learning_rate": 3.5500874226626633e-07,
      "logits/chosen": -0.41593313217163086,
      "logits/rejected": -0.47519993782043457,
      "logps/chosen": -424.2110290527344,
      "logps/rejected": -386.99688720703125,
      "loss": 0.2319,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.594541072845459,
      "rewards/confidence": 0.3976772129535675,
      "rewards/confidence_mean_diff": -0.3976772129535675,
      "rewards/confidence_moving_diff": 0.0034655616618692875,
      "rewards/margins": 2.2624146938323975,
      "rewards/mix_margin": 2.2624149322509766,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": -0.667873740196228,
      "step": 160
    },
    {
      "epoch": 1.36,
      "learning_rate": 3.334334072150074e-07,
      "logits/chosen": -0.4277438223361969,
      "logits/rejected": -0.44190508127212524,
      "logps/chosen": -359.75262451171875,
      "logps/rejected": -304.85107421875,
      "loss": 0.244,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 1.3704854249954224,
      "rewards/confidence": 0.43457871675491333,
      "rewards/confidence_mean_diff": -0.43457871675491333,
      "rewards/confidence_moving_diff": 0.0003442527668084949,
      "rewards/margins": 2.137328624725342,
      "rewards/mix_margin": 2.137328624725342,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": -0.766843318939209,
      "step": 170
    },
    {
      "epoch": 1.44,
      "learning_rate": 3.1113352712978995e-07,
      "logits/chosen": -0.4778042733669281,
      "logits/rejected": -0.5502051115036011,
      "logps/chosen": -285.4638671875,
      "logps/rejected": -259.30963134765625,
      "loss": 0.2673,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.1096071004867554,
      "rewards/confidence": 0.36355599761009216,
      "rewards/confidence_mean_diff": -0.36355599761009216,
      "rewards/confidence_moving_diff": 2.6996247470378876e-05,
      "rewards/margins": 1.878248929977417,
      "rewards/mix_margin": 1.878249168395996,
      "rewards/real_percentage": 11.975000381469727,
      "rewards/rejected": -0.7686418294906616,
      "step": 180
    },
    {
      "epoch": 1.52,
      "learning_rate": 2.8830275666182565e-07,
      "logits/chosen": -0.5888835191726685,
      "logits/rejected": -0.5946951508522034,
      "logps/chosen": -345.4639587402344,
      "logps/rejected": -269.433349609375,
      "loss": 0.2581,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.4412751197814941,
      "rewards/confidence": 0.27181780338287354,
      "rewards/confidence_mean_diff": -0.27181780338287354,
      "rewards/confidence_moving_diff": -0.0013337878044694662,
      "rewards/margins": 1.7757008075714111,
      "rewards/mix_margin": 1.7756999731063843,
      "rewards/real_percentage": 11.975000381469727,
      "rewards/rejected": -0.3344256579875946,
      "step": 190
    },
    {
      "epoch": 1.6,
      "learning_rate": 2.651393607737495e-07,
      "logits/chosen": -0.43257981538772583,
      "logits/rejected": -0.5586498975753784,
      "logps/chosen": -332.6167907714844,
      "logps/rejected": -258.5675354003906,
      "loss": 0.2367,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.6590750217437744,
      "rewards/confidence": 0.20384028553962708,
      "rewards/confidence_mean_diff": -0.20384028553962708,
      "rewards/confidence_moving_diff": 0.0033724855165928602,
      "rewards/margins": 2.299750804901123,
      "rewards/mix_margin": 2.299750804901123,
      "rewards/real_percentage": 12.074999809265137,
      "rewards/rejected": -0.6406754851341248,
      "step": 200
    },
    {
      "epoch": 1.68,
      "learning_rate": 2.418444929845241e-07,
      "logits/chosen": -0.5128785371780396,
      "logits/rejected": -0.5602482557296753,
      "logps/chosen": -347.55145263671875,
      "logps/rejected": -316.63287353515625,
      "loss": 0.2366,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.4995661973953247,
      "rewards/confidence": 0.29339924454689026,
      "rewards/confidence_mean_diff": -0.29339924454689026,
      "rewards/confidence_moving_diff": -0.002348523121327162,
      "rewards/margins": 2.240088939666748,
      "rewards/mix_margin": 2.2400896549224854,
      "rewards/real_percentage": 11.949999809265137,
      "rewards/rejected": -0.7405228018760681,
      "step": 210
    },
    {
      "epoch": 1.76,
      "learning_rate": 2.186204485297965e-07,
      "logits/chosen": -0.5206685066223145,
      "logits/rejected": -0.49740782380104065,
      "logps/chosen": -327.6163024902344,
      "logps/rejected": -312.23345947265625,
      "loss": 0.259,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.3457434177398682,
      "rewards/confidence": 0.5061102509498596,
      "rewards/confidence_mean_diff": -0.5061102509498596,
      "rewards/confidence_moving_diff": 0.0032404728699475527,
      "rewards/margins": 2.1838455200195312,
      "rewards/mix_margin": 2.183845281600952,
      "rewards/real_percentage": 12.125,
      "rewards/rejected": -0.8381019830703735,
      "step": 220
    },
    {
      "epoch": 1.84,
      "learning_rate": 1.956689076074607e-07,
      "logits/chosen": -0.47606563568115234,
      "logits/rejected": -0.5649515986442566,
      "logps/chosen": -359.9063415527344,
      "logps/rejected": -272.35333251953125,
      "loss": 0.2392,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.2465178966522217,
      "rewards/confidence": 0.15909627079963684,
      "rewards/confidence_mean_diff": -0.15909627079963684,
      "rewards/confidence_moving_diff": -0.005671085324138403,
      "rewards/margins": 2.0437004566192627,
      "rewards/mix_margin": 2.043700695037842,
      "rewards/real_percentage": 11.899999618530273,
      "rewards/rejected": -0.7971823811531067,
      "step": 230
    },
    {
      "epoch": 1.92,
      "learning_rate": 1.7318918396427674e-07,
      "logits/chosen": -0.5379046201705933,
      "logits/rejected": -0.5706161260604858,
      "logps/chosen": -386.26861572265625,
      "logps/rejected": -303.8609619140625,
      "loss": 0.2138,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 1.7834584712982178,
      "rewards/confidence": 0.13540206849575043,
      "rewards/confidence_mean_diff": -0.13540206849575043,
      "rewards/confidence_moving_diff": 0.0014076533261686563,
      "rewards/margins": 2.3947689533233643,
      "rewards/mix_margin": 2.394768476486206,
      "rewards/real_percentage": 11.975000381469727,
      "rewards/rejected": -0.6113101840019226,
      "step": 240
    },
    {
      "epoch": 2.0,
      "learning_rate": 1.513764940330155e-07,
      "logits/chosen": -0.39151811599731445,
      "logits/rejected": -0.473433256149292,
      "logps/chosen": -336.6163024902344,
      "logps/rejected": -306.16046142578125,
      "loss": 0.2558,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 1.611301064491272,
      "rewards/confidence": 0.2504025101661682,
      "rewards/confidence_mean_diff": -0.2504025101661682,
      "rewards/confidence_moving_diff": 0.000840538355987519,
      "rewards/margins": 2.209441661834717,
      "rewards/mix_margin": 2.209441661834717,
      "rewards/real_percentage": 12.050000190734863,
      "rewards/rejected": -0.5981408357620239,
      "step": 250
    },
    {
      "epoch": 2.08,
      "learning_rate": 1.304202616511362e-07,
      "logits/chosen": -0.5112959742546082,
      "logits/rejected": -0.5279114842414856,
      "logps/chosen": -377.9098815917969,
      "logps/rejected": -316.77825927734375,
      "loss": 0.1627,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 2.204784631729126,
      "rewards/confidence": 0.5538536906242371,
      "rewards/confidence_mean_diff": -0.5538536906242371,
      "rewards/confidence_moving_diff": -0.005331903696060181,
      "rewards/margins": 2.7410061359405518,
      "rewards/mix_margin": 2.7410056591033936,
      "rewards/real_percentage": 11.774999618530273,
      "rewards/rejected": -0.5362212657928467,
      "step": 260
    },
    {
      "epoch": 2.16,
      "learning_rate": 1.1050247308300944e-07,
      "logits/chosen": -0.48956188559532166,
      "logits/rejected": -0.5282370448112488,
      "logps/chosen": -370.67767333984375,
      "logps/rejected": -370.94476318359375,
      "loss": 0.1444,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.979029893875122,
      "rewards/confidence": 0.7768798470497131,
      "rewards/confidence_mean_diff": -0.7768798470497131,
      "rewards/confidence_moving_diff": -0.008751118555665016,
      "rewards/margins": 5.2986369132995605,
      "rewards/mix_margin": 5.298637866973877,
      "rewards/real_percentage": 11.875,
      "rewards/rejected": -3.3196072578430176,
      "step": 270
    },
    {
      "epoch": 2.24,
      "learning_rate": 9.179609663085594e-08,
      "logits/chosen": -0.478290855884552,
      "logits/rejected": -0.5842245817184448,
      "logps/chosen": -354.32220458984375,
      "logps/rejected": -323.82830810546875,
      "loss": 0.1632,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 2.003836154937744,
      "rewards/confidence": 1.0290786027908325,
      "rewards/confidence_mean_diff": -1.0290786027908325,
      "rewards/confidence_moving_diff": 0.009490849450230598,
      "rewards/margins": 2.7980473041534424,
      "rewards/mix_margin": 2.7980475425720215,
      "rewards/real_percentage": 12.175000190734863,
      "rewards/rejected": -0.7942115068435669,
      "step": 280
    },
    {
      "epoch": 2.32,
      "learning_rate": 7.446358055867688e-08,
      "logits/chosen": -0.4719129502773285,
      "logits/rejected": -0.5351340174674988,
      "logps/chosen": -284.57977294921875,
      "logps/rejected": -244.1188507080078,
      "loss": 0.1959,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 1.3237833976745605,
      "rewards/confidence": 0.36271917819976807,
      "rewards/confidence_mean_diff": -0.36271917819976807,
      "rewards/confidence_moving_diff": 0.002586688846349716,
      "rewards/margins": 2.3666577339172363,
      "rewards/mix_margin": 2.3666574954986572,
      "rewards/real_percentage": 12.074999809265137,
      "rewards/rejected": -1.0428742170333862,
      "step": 290
    },
    {
      "epoch": 2.4,
      "learning_rate": 5.8655442373371164e-08,
      "logits/chosen": -0.581800639629364,
      "logits/rejected": -0.6199262142181396,
      "logps/chosen": -420.638671875,
      "logps/rejected": -355.60736083984375,
      "loss": 0.152,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.8940696716308594,
      "rewards/confidence": 0.7953528165817261,
      "rewards/confidence_mean_diff": -0.7953528165817261,
      "rewards/confidence_moving_diff": -0.006861658301204443,
      "rewards/margins": 2.8314507007598877,
      "rewards/mix_margin": 2.831450939178467,
      "rewards/real_percentage": 11.875,
      "rewards/rejected": -0.9373809695243835,
      "step": 300
    },
    {
      "epoch": 2.48,
      "learning_rate": 4.450896171388219e-08,
      "logits/chosen": -0.5456718802452087,
      "logits/rejected": -0.5629149079322815,
      "logps/chosen": -384.376953125,
      "logps/rejected": -332.3739318847656,
      "loss": 0.1365,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 2.1317899227142334,
      "rewards/confidence": 0.8724759221076965,
      "rewards/confidence_mean_diff": -0.8724759221076965,
      "rewards/confidence_moving_diff": 0.006575644016265869,
      "rewards/margins": 3.160860300064087,
      "rewards/mix_margin": 3.160860061645508,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": -1.0290701389312744,
      "step": 310
    },
    {
      "epoch": 2.56,
      "learning_rate": 3.214698819946879e-08,
      "logits/chosen": -0.5238803625106812,
      "logits/rejected": -0.5871630907058716,
      "logps/chosen": -375.8655090332031,
      "logps/rejected": -300.987548828125,
      "loss": 0.1748,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 1.7292721271514893,
      "rewards/confidence": 0.5153323411941528,
      "rewards/confidence_mean_diff": -0.5153323411941528,
      "rewards/confidence_moving_diff": 0.0004902526852674782,
      "rewards/margins": 2.5365805625915527,
      "rewards/mix_margin": 2.5365803241729736,
      "rewards/real_percentage": 11.949999809265137,
      "rewards/rejected": -0.8073086738586426,
      "step": 320
    },
    {
      "epoch": 2.64,
      "learning_rate": 2.1676874589879908e-08,
      "logits/chosen": -0.49646130204200745,
      "logits/rejected": -0.5325660705566406,
      "logps/chosen": -361.79986572265625,
      "logps/rejected": -292.0426940917969,
      "loss": 0.1878,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 1.4343092441558838,
      "rewards/confidence": 0.5628241300582886,
      "rewards/confidence_mean_diff": -0.5628241300582886,
      "rewards/confidence_moving_diff": 0.0001227855682373047,
      "rewards/margins": 2.628760576248169,
      "rewards/mix_margin": 2.628760814666748,
      "rewards/real_percentage": 12.024999618530273,
      "rewards/rejected": -1.1944514513015747,
      "step": 330
    },
    {
      "epoch": 2.72,
      "learning_rate": 1.3189544521990032e-08,
      "logits/chosen": -0.5395928025245667,
      "logits/rejected": -0.5778788328170776,
      "logps/chosen": -332.5323791503906,
      "logps/rejected": -296.8447265625,
      "loss": 0.1826,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.4775941371917725,
      "rewards/confidence": 0.5103145837783813,
      "rewards/confidence_mean_diff": -0.5103145837783813,
      "rewards/confidence_moving_diff": -0.003172731725499034,
      "rewards/margins": 2.3591160774230957,
      "rewards/mix_margin": 2.3591160774230957,
      "rewards/real_percentage": 11.875,
      "rewards/rejected": -0.8815220594406128,
      "step": 340
    },
    {
      "epoch": 2.8,
      "learning_rate": 6.7587029187732014e-09,
      "logits/chosen": -0.5066567659378052,
      "logits/rejected": -0.5228812098503113,
      "logps/chosen": -346.0731201171875,
      "logps/rejected": -309.6691589355469,
      "loss": 0.1769,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 1.6007888317108154,
      "rewards/confidence": 0.6578723788261414,
      "rewards/confidence_mean_diff": -0.6578723788261414,
      "rewards/confidence_moving_diff": 0.004312982317060232,
      "rewards/margins": 2.8280742168426514,
      "rewards/mix_margin": 2.8280739784240723,
      "rewards/real_percentage": 12.125,
      "rewards/rejected": -1.2272855043411255,
      "step": 350
    },
    {
      "epoch": 2.88,
      "learning_rate": 2.4401959275140437e-09,
      "logits/chosen": -0.4290226399898529,
      "logits/rejected": -0.4782096743583679,
      "logps/chosen": -323.8050231933594,
      "logps/rejected": -286.50225830078125,
      "loss": 0.1644,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.5553691387176514,
      "rewards/confidence": 0.5026761889457703,
      "rewards/confidence_mean_diff": -0.5026761889457703,
      "rewards/confidence_moving_diff": -0.0073528410866856575,
      "rewards/margins": 2.44854474067688,
      "rewards/mix_margin": 2.4485442638397217,
      "rewards/real_percentage": 11.899999618530273,
      "rewards/rejected": -0.8931753039360046,
      "step": 360
    },
    {
      "epoch": 2.96,
      "learning_rate": 2.715259456224084e-10,
      "logits/chosen": -0.5070622563362122,
      "logits/rejected": -0.5159127712249756,
      "logps/chosen": -368.29248046875,
      "logps/rejected": -357.6094665527344,
      "loss": 0.1676,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 1.714816689491272,
      "rewards/confidence": 0.8204771876335144,
      "rewards/confidence_mean_diff": -0.8204771876335144,
      "rewards/confidence_moving_diff": 0.002654359443113208,
      "rewards/margins": 2.7493791580200195,
      "rewards/mix_margin": 2.7493796348571777,
      "rewards/real_percentage": 12.125,
      "rewards/rejected": -1.034562587738037,
      "step": 370
    },
    {
      "epoch": 3.0,
      "step": 375,
      "total_flos": 0.0,
      "train_loss": 0.3348727149963379,
      "train_runtime": 2536.3599,
      "train_samples_per_second": 2.366,
      "train_steps_per_second": 0.148
    }
  ],
  "logging_steps": 10,
  "max_steps": 375,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}