codeReasoningGPT-v2 / last-checkpoint /trainer_state.json
AymenELKani's picture
Training in progress, step 3354, checkpoint
e189061 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 3354,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005963918294319368,
"grad_norm": 19.664148330688477,
"learning_rate": 4.5e-06,
"logits/chosen": -61.68421173095703,
"logits/rejected": -59.81378936767578,
"logps/chosen": -3847.377685546875,
"logps/rejected": -3732.838623046875,
"loss": 0.6957,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.009157716296613216,
"rewards/margins": 0.04523131996393204,
"rewards/rejected": -0.036073606461286545,
"step": 10
},
{
"epoch": 0.011927836588638736,
"grad_norm": 27.623851776123047,
"learning_rate": 9.5e-06,
"logits/chosen": -62.0186767578125,
"logits/rejected": -62.229949951171875,
"logps/chosen": -3975.751953125,
"logps/rejected": -3489.30615234375,
"loss": 0.6545,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.3238146901130676,
"rewards/margins": 0.1601802110671997,
"rewards/rejected": 0.1636344939470291,
"step": 20
},
{
"epoch": 0.017891754882958102,
"grad_norm": 27.4714412689209,
"learning_rate": 1.45e-05,
"logits/chosen": -61.2970085144043,
"logits/rejected": -60.396812438964844,
"logps/chosen": -3610.776123046875,
"logps/rejected": -3742.642578125,
"loss": 0.582,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 1.0766934156417847,
"rewards/margins": 0.32587307691574097,
"rewards/rejected": 0.7508202791213989,
"step": 30
},
{
"epoch": 0.02385567317727747,
"grad_norm": 11.324883460998535,
"learning_rate": 1.9500000000000003e-05,
"logits/chosen": -57.98284149169922,
"logits/rejected": -58.55870819091797,
"logps/chosen": -3499.75732421875,
"logps/rejected": -3155.265625,
"loss": 0.465,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.3674192428588867,
"rewards/margins": 0.7249017357826233,
"rewards/rejected": 1.6425174474716187,
"step": 40
},
{
"epoch": 0.029819591471596837,
"grad_norm": 20.27655601501465,
"learning_rate": 2.45e-05,
"logits/chosen": -58.64543914794922,
"logits/rejected": -56.548179626464844,
"logps/chosen": -3525.67431640625,
"logps/rejected": -3034.49267578125,
"loss": 0.5633,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 6.382194519042969,
"rewards/margins": 2.046049118041992,
"rewards/rejected": 4.336145877838135,
"step": 50
},
{
"epoch": 0.035783509765916204,
"grad_norm": 65.18791961669922,
"learning_rate": 2.95e-05,
"logits/chosen": -57.381561279296875,
"logits/rejected": -56.21010208129883,
"logps/chosen": -3934.571533203125,
"logps/rejected": -3834.8515625,
"loss": 0.2975,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 10.051956176757812,
"rewards/margins": 2.9109206199645996,
"rewards/rejected": 7.141035556793213,
"step": 60
},
{
"epoch": 0.04174742806023558,
"grad_norm": 3.803753614425659,
"learning_rate": 3.45e-05,
"logits/chosen": -58.149757385253906,
"logits/rejected": -57.435768127441406,
"logps/chosen": -3799.5859375,
"logps/rejected": -3451.617919921875,
"loss": 0.6205,
"rewards/accuracies": 0.875,
"rewards/chosen": 9.363242149353027,
"rewards/margins": 3.4164345264434814,
"rewards/rejected": 5.946808815002441,
"step": 70
},
{
"epoch": 0.04771134635455494,
"grad_norm": 27.553604125976562,
"learning_rate": 3.9500000000000005e-05,
"logits/chosen": -59.83305740356445,
"logits/rejected": -59.072486877441406,
"logps/chosen": -3933.164794921875,
"logps/rejected": -3912.46240234375,
"loss": 0.5561,
"rewards/accuracies": 0.875,
"rewards/chosen": 8.799053192138672,
"rewards/margins": 4.589492321014404,
"rewards/rejected": 4.209560871124268,
"step": 80
},
{
"epoch": 0.05367526464887431,
"grad_norm": 4.916882514953613,
"learning_rate": 4.4500000000000004e-05,
"logits/chosen": -58.64013671875,
"logits/rejected": -58.137168884277344,
"logps/chosen": -3592.069580078125,
"logps/rejected": -3692.88427734375,
"loss": 0.2698,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 6.715191841125488,
"rewards/margins": 4.236520290374756,
"rewards/rejected": 2.4786713123321533,
"step": 90
},
{
"epoch": 0.059639182943193675,
"grad_norm": 39.796302795410156,
"learning_rate": 4.9500000000000004e-05,
"logits/chosen": -58.97795867919922,
"logits/rejected": -60.2847785949707,
"logps/chosen": -3994.854248046875,
"logps/rejected": -4313.3271484375,
"loss": 0.2365,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 8.663000106811523,
"rewards/margins": 4.546173095703125,
"rewards/rejected": 4.11682653427124,
"step": 100
},
{
"epoch": 0.06560310123751305,
"grad_norm": 3.1577208042144775,
"learning_rate": 4.9999056250036984e-05,
"logits/chosen": -58.68115234375,
"logits/rejected": -58.9907341003418,
"logps/chosen": -3749.045654296875,
"logps/rejected": -3762.167236328125,
"loss": 0.0366,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.467626571655273,
"rewards/margins": 5.942251682281494,
"rewards/rejected": 5.525373935699463,
"step": 110
},
{
"epoch": 0.07156701953183241,
"grad_norm": 3.2147419452667236,
"learning_rate": 4.999579399596396e-05,
"logits/chosen": -56.03290939331055,
"logits/rejected": -56.16686248779297,
"logps/chosen": -3634.969482421875,
"logps/rejected": -3742.016845703125,
"loss": 0.2662,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 18.00886344909668,
"rewards/margins": 8.250906944274902,
"rewards/rejected": 9.757956504821777,
"step": 120
},
{
"epoch": 0.07753093782615178,
"grad_norm": 0.006837156601250172,
"learning_rate": 4.9990201890548246e-05,
"logits/chosen": -53.0700569152832,
"logits/rejected": -53.876380920410156,
"logps/chosen": -4084.875732421875,
"logps/rejected": -4197.57958984375,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.550527572631836,
"rewards/margins": 10.616544723510742,
"rewards/rejected": 7.933982849121094,
"step": 130
},
{
"epoch": 0.08349485612047115,
"grad_norm": 0.585788905620575,
"learning_rate": 4.998228045502851e-05,
"logits/chosen": -55.47416305541992,
"logits/rejected": -55.35027313232422,
"logps/chosen": -3845.14453125,
"logps/rejected": -3608.837158203125,
"loss": 0.0453,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.1895112991333,
"rewards/margins": 8.870885848999023,
"rewards/rejected": 6.318624973297119,
"step": 140
},
{
"epoch": 0.08945877441479051,
"grad_norm": 0.07510236650705338,
"learning_rate": 4.9972030427759666e-05,
"logits/chosen": -50.84516906738281,
"logits/rejected": -50.26861572265625,
"logps/chosen": -3661.830078125,
"logps/rejected": -3803.436767578125,
"loss": 0.2181,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 15.063519477844238,
"rewards/margins": 11.220986366271973,
"rewards/rejected": 3.842533826828003,
"step": 150
},
{
"epoch": 0.09542269270910989,
"grad_norm": 0.06167513504624367,
"learning_rate": 4.995945276414404e-05,
"logits/chosen": -52.945655822753906,
"logits/rejected": -53.59800338745117,
"logps/chosen": -3374.52197265625,
"logps/rejected": -3389.505126953125,
"loss": 0.0903,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 8.205995559692383,
"rewards/margins": 9.442131996154785,
"rewards/rejected": -1.2361366748809814,
"step": 160
},
{
"epoch": 0.10138661100342926,
"grad_norm": 4.309830188751221,
"learning_rate": 4.994454863654233e-05,
"logits/chosen": -58.374549865722656,
"logits/rejected": -57.894004821777344,
"logps/chosen": -4359.32177734375,
"logps/rejected": -4773.193359375,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.901937961578369,
"rewards/margins": 13.444661140441895,
"rewards/rejected": -6.542723178863525,
"step": 170
},
{
"epoch": 0.10735052929774862,
"grad_norm": 0.1345188468694687,
"learning_rate": 4.992731943416432e-05,
"logits/chosen": -56.97514724731445,
"logits/rejected": -57.7388801574707,
"logps/chosen": -3933.59716796875,
"logps/rejected": -4406.06103515625,
"loss": 0.093,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 7.6058669090271,
"rewards/margins": 12.426251411437988,
"rewards/rejected": -4.8203840255737305,
"step": 180
},
{
"epoch": 0.11331444759206799,
"grad_norm": 0.4823387563228607,
"learning_rate": 4.990776676293941e-05,
"logits/chosen": -60.002403259277344,
"logits/rejected": -61.1885871887207,
"logps/chosen": -4209.791015625,
"logps/rejected": -4174.68212890625,
"loss": 0.1487,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 3.5269577503204346,
"rewards/margins": 8.545251846313477,
"rewards/rejected": -5.018294811248779,
"step": 190
},
{
"epoch": 0.11927836588638735,
"grad_norm": 1.821547031402588,
"learning_rate": 4.98858924453669e-05,
"logits/chosen": -60.311546325683594,
"logits/rejected": -61.352203369140625,
"logps/chosen": -3726.235595703125,
"logps/rejected": -3612.451904296875,
"loss": 0.0083,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0089493989944458,
"rewards/margins": 10.570683479309082,
"rewards/rejected": -9.561734199523926,
"step": 200
},
{
"epoch": 0.12524228418070674,
"grad_norm": 0.010515117086470127,
"learning_rate": 4.9861698520346155e-05,
"logits/chosen": -58.907005310058594,
"logits/rejected": -58.76160430908203,
"logps/chosen": -3922.297607421875,
"logps/rejected": -3831.158935546875,
"loss": 0.0106,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.980830669403076,
"rewards/margins": 11.232372283935547,
"rewards/rejected": -5.251542091369629,
"step": 210
},
{
"epoch": 0.1312062024750261,
"grad_norm": 0.13617613911628723,
"learning_rate": 4.983518724298652e-05,
"logits/chosen": -57.034385681152344,
"logits/rejected": -61.353546142578125,
"logps/chosen": -3717.315185546875,
"logps/rejected": -4125.3466796875,
"loss": 0.0403,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 11.408642768859863,
"rewards/margins": 12.029062271118164,
"rewards/rejected": -0.6204200983047485,
"step": 220
},
{
"epoch": 0.13717012076934545,
"grad_norm": 0.11460208892822266,
"learning_rate": 4.980636108439712e-05,
"logits/chosen": -60.95244216918945,
"logits/rejected": -62.050682067871094,
"logps/chosen": -3819.20654296875,
"logps/rejected": -3770.43701171875,
"loss": 0.2529,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.9785029888153076,
"rewards/margins": 11.722539901733398,
"rewards/rejected": -14.701044082641602,
"step": 230
},
{
"epoch": 0.14313403906366481,
"grad_norm": 0.010561717674136162,
"learning_rate": 4.97752227314566e-05,
"logits/chosen": -61.96656036376953,
"logits/rejected": -63.59345245361328,
"logps/chosen": -3559.760986328125,
"logps/rejected": -3238.27099609375,
"loss": 0.0605,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.5705113410949707,
"rewards/margins": 10.212821960449219,
"rewards/rejected": -13.783332824707031,
"step": 240
},
{
"epoch": 0.1490979573579842,
"grad_norm": 0.020034722983837128,
"learning_rate": 4.9741775086562576e-05,
"logits/chosen": -58.21870040893555,
"logits/rejected": -59.43880081176758,
"logps/chosen": -3953.829345703125,
"logps/rejected": -4362.8837890625,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.396712779998779,
"rewards/margins": 12.898712158203125,
"rewards/rejected": -7.5019989013671875,
"step": 250
},
{
"epoch": 0.15506187565230356,
"grad_norm": 0.0008314030710607767,
"learning_rate": 4.970602126736118e-05,
"logits/chosen": -58.50162887573242,
"logits/rejected": -59.48447799682617,
"logps/chosen": -3606.218017578125,
"logps/rejected": -3528.481689453125,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.148120880126953,
"rewards/margins": 12.437234878540039,
"rewards/rejected": -3.2891151905059814,
"step": 260
},
{
"epoch": 0.16102579394662292,
"grad_norm": 1.2535442113876343,
"learning_rate": 4.966796460645644e-05,
"logits/chosen": -59.570526123046875,
"logits/rejected": -63.26567459106445,
"logps/chosen": -3971.397705078125,
"logps/rejected": -4289.9853515625,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.893758296966553,
"rewards/margins": 15.240409851074219,
"rewards/rejected": -10.346652030944824,
"step": 270
},
{
"epoch": 0.1669897122409423,
"grad_norm": 0.0032445124816149473,
"learning_rate": 4.962760865109964e-05,
"logits/chosen": -56.13077926635742,
"logits/rejected": -57.030982971191406,
"logps/chosen": -3904.32177734375,
"logps/rejected": -3896.450439453125,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.48493766784668,
"rewards/margins": 15.189654350280762,
"rewards/rejected": -4.704716205596924,
"step": 280
},
{
"epoch": 0.17295363053526167,
"grad_norm": 0.0017609696369618177,
"learning_rate": 4.95849571628587e-05,
"logits/chosen": -57.822837829589844,
"logits/rejected": -56.972389221191406,
"logps/chosen": -3716.139892578125,
"logps/rejected": -3481.774658203125,
"loss": 0.0103,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.023122787475586,
"rewards/margins": 13.122570991516113,
"rewards/rejected": 0.9005520939826965,
"step": 290
},
{
"epoch": 0.17891754882958102,
"grad_norm": 1.900453805923462,
"learning_rate": 4.954001411726755e-05,
"logits/chosen": -57.431907653808594,
"logits/rejected": -60.528594970703125,
"logps/chosen": -4033.428955078125,
"logps/rejected": -4488.1083984375,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.56352710723877,
"rewards/margins": 13.36528491973877,
"rewards/rejected": -3.8017585277557373,
"step": 300
},
{
"epoch": 0.1848814671239004,
"grad_norm": 0.022368023172020912,
"learning_rate": 4.949278370345558e-05,
"logits/chosen": -57.648948669433594,
"logits/rejected": -61.04389190673828,
"logps/chosen": -3662.002685546875,
"logps/rejected": -3926.045654296875,
"loss": 0.2272,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 6.355870246887207,
"rewards/margins": 15.903188705444336,
"rewards/rejected": -9.547318458557129,
"step": 310
},
{
"epoch": 0.19084538541821977,
"grad_norm": 0.0004944842658005655,
"learning_rate": 4.944327032375716e-05,
"logits/chosen": -59.660667419433594,
"logits/rejected": -62.95856857299805,
"logps/chosen": -3589.598388671875,
"logps/rejected": -3629.106201171875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.476762294769287,
"rewards/margins": 13.896219253540039,
"rewards/rejected": -7.419455528259277,
"step": 320
},
{
"epoch": 0.19680930371253913,
"grad_norm": 0.00029502142569981515,
"learning_rate": 4.93914785933013e-05,
"logits/chosen": -55.67151641845703,
"logits/rejected": -57.00590896606445,
"logps/chosen": -3930.76025390625,
"logps/rejected": -3948.296142578125,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.375188827514648,
"rewards/margins": 14.264836311340332,
"rewards/rejected": -2.889647960662842,
"step": 330
},
{
"epoch": 0.20277322200685852,
"grad_norm": 0.003186926245689392,
"learning_rate": 4.9337413339581494e-05,
"logits/chosen": -55.91204833984375,
"logits/rejected": -55.10901641845703,
"logps/chosen": -3886.98583984375,
"logps/rejected": -3999.08251953125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.179794311523438,
"rewards/margins": 14.276512145996094,
"rewards/rejected": -0.09671908617019653,
"step": 340
},
{
"epoch": 0.20873714030117788,
"grad_norm": 0.0017121023265644908,
"learning_rate": 4.928107960200573e-05,
"logits/chosen": -52.622962951660156,
"logits/rejected": -53.773536682128906,
"logps/chosen": -3416.02392578125,
"logps/rejected": -3253.180419921875,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.344406127929688,
"rewards/margins": 12.657001495361328,
"rewards/rejected": 5.687404632568359,
"step": 350
},
{
"epoch": 0.21470105859549724,
"grad_norm": 0.005526215769350529,
"learning_rate": 4.922248263142678e-05,
"logits/chosen": -56.997161865234375,
"logits/rejected": -56.7293701171875,
"logps/chosen": -3787.90234375,
"logps/rejected": -3675.44677734375,
"loss": 0.0075,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.720577239990234,
"rewards/margins": 16.067211151123047,
"rewards/rejected": 4.653366565704346,
"step": 360
},
{
"epoch": 0.22066497688981662,
"grad_norm": 0.00579728651791811,
"learning_rate": 4.916162788965275e-05,
"logits/chosen": -55.556427001953125,
"logits/rejected": -57.451927185058594,
"logps/chosen": -3876.120361328125,
"logps/rejected": -4547.07275390625,
"loss": 0.058,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 19.52408218383789,
"rewards/margins": 14.548626899719238,
"rewards/rejected": 4.975453853607178,
"step": 370
},
{
"epoch": 0.22662889518413598,
"grad_norm": 0.002147270832210779,
"learning_rate": 4.909852104893803e-05,
"logits/chosen": -55.28306198120117,
"logits/rejected": -57.85760498046875,
"logps/chosen": -3812.12646484375,
"logps/rejected": -3860.81787109375,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.829846382141113,
"rewards/margins": 15.175271987915039,
"rewards/rejected": -0.34542423486709595,
"step": 380
},
{
"epoch": 0.23259281347845534,
"grad_norm": 2.6284790237696143e-06,
"learning_rate": 4.903316799145453e-05,
"logits/chosen": -56.578338623046875,
"logits/rejected": -61.98085403442383,
"logps/chosen": -3731.21484375,
"logps/rejected": -3811.68017578125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.379071235656738,
"rewards/margins": 15.874186515808105,
"rewards/rejected": -0.4951143264770508,
"step": 390
},
{
"epoch": 0.2385567317727747,
"grad_norm": 0.0006998952012509108,
"learning_rate": 4.896557480874345e-05,
"logits/chosen": -55.025390625,
"logits/rejected": -56.02961349487305,
"logps/chosen": -4079.14990234375,
"logps/rejected": -3936.82666015625,
"loss": 0.0222,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 16.036401748657227,
"rewards/margins": 15.827476501464844,
"rewards/rejected": 0.20892485976219177,
"step": 400
},
{
"epoch": 0.2445206500670941,
"grad_norm": 0.0015518699074164033,
"learning_rate": 4.889574780114745e-05,
"logits/chosen": -57.45948028564453,
"logits/rejected": -58.848793029785156,
"logps/chosen": -3700.404296875,
"logps/rejected": -3442.11572265625,
"loss": 0.0643,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.649378776550293,
"rewards/margins": 14.64953899383545,
"rewards/rejected": 0.9998385310173035,
"step": 410
},
{
"epoch": 0.2504845683614135,
"grad_norm": 1.4911309480667114,
"learning_rate": 4.8823693477223444e-05,
"logits/chosen": -58.846778869628906,
"logits/rejected": -59.60619354248047,
"logps/chosen": -3987.731201171875,
"logps/rejected": -4248.55078125,
"loss": 0.0723,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.464131355285645,
"rewards/margins": 15.123265266418457,
"rewards/rejected": 0.34086543321609497,
"step": 420
},
{
"epoch": 0.25644848665573283,
"grad_norm": 0.001055889530107379,
"learning_rate": 4.874941855313587e-05,
"logits/chosen": -56.497413635253906,
"logits/rejected": -57.141944885253906,
"logps/chosen": -3793.345703125,
"logps/rejected": -3607.841796875,
"loss": 0.129,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 7.237372398376465,
"rewards/margins": 13.780862808227539,
"rewards/rejected": -6.543488502502441,
"step": 430
},
{
"epoch": 0.2624124049500522,
"grad_norm": 8.529239181598314e-08,
"learning_rate": 4.8672929952030764e-05,
"logits/chosen": -59.98900604248047,
"logits/rejected": -62.46282958984375,
"logps/chosen": -4084.49755859375,
"logps/rejected": -4386.53271484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.959494113922119,
"rewards/margins": 17.883264541625977,
"rewards/rejected": -12.923771858215332,
"step": 440
},
{
"epoch": 0.26837632324437155,
"grad_norm": 0.012090430594980717,
"learning_rate": 4.8594234803390384e-05,
"logits/chosen": -58.39624786376953,
"logits/rejected": -60.7776985168457,
"logps/chosen": -4043.174560546875,
"logps/rejected": -4138.63037109375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.332669258117676,
"rewards/margins": 14.487439155578613,
"rewards/rejected": -10.154768943786621,
"step": 450
},
{
"epoch": 0.2743402415386909,
"grad_norm": 0.011768614873290062,
"learning_rate": 4.851334044236871e-05,
"logits/chosen": -57.09410858154297,
"logits/rejected": -58.455528259277344,
"logps/chosen": -3805.43994140625,
"logps/rejected": -3812.210205078125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.533733367919922,
"rewards/margins": 15.815759658813477,
"rewards/rejected": -6.282026767730713,
"step": 460
},
{
"epoch": 0.28030415983301027,
"grad_norm": 0.23742778599262238,
"learning_rate": 4.84302544091077e-05,
"logits/chosen": -57.45296096801758,
"logits/rejected": -58.879905700683594,
"logps/chosen": -3779.80419921875,
"logps/rejected": -3724.149169921875,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.725826263427734,
"rewards/margins": 13.637672424316406,
"rewards/rejected": -3.91184663772583,
"step": 470
},
{
"epoch": 0.28626807812732963,
"grad_norm": 0.25321099162101746,
"learning_rate": 4.8344984448034555e-05,
"logits/chosen": -57.193397521972656,
"logits/rejected": -58.29435348510742,
"logps/chosen": -3555.61181640625,
"logps/rejected": -3608.22265625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.781957626342773,
"rewards/margins": 17.517807006835938,
"rewards/rejected": -6.7358503341674805,
"step": 480
},
{
"epoch": 0.29223199642164904,
"grad_norm": 0.0003638965426944196,
"learning_rate": 4.825753850713977e-05,
"logits/chosen": -55.068214416503906,
"logits/rejected": -55.34920120239258,
"logps/chosen": -3720.23486328125,
"logps/rejected": -3710.431640625,
"loss": 0.0142,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.105955123901367,
"rewards/margins": 15.404626846313477,
"rewards/rejected": -4.298670768737793,
"step": 490
},
{
"epoch": 0.2981959147159684,
"grad_norm": 0.5876765251159668,
"learning_rate": 4.816792473723633e-05,
"logits/chosen": -54.507286071777344,
"logits/rejected": -58.03179168701172,
"logps/chosen": -3657.91650390625,
"logps/rejected": -3473.89013671875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.856443405151367,
"rewards/margins": 16.580219268798828,
"rewards/rejected": -3.7237751483917236,
"step": 500
},
{
"epoch": 0.30415983301028776,
"grad_norm": 1.0686131872716942e-06,
"learning_rate": 4.807615149120004e-05,
"logits/chosen": -53.92388916015625,
"logits/rejected": -56.692657470703125,
"logps/chosen": -3720.27685546875,
"logps/rejected": -3769.602783203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.57264232635498,
"rewards/margins": 18.110240936279297,
"rewards/rejected": -3.537597179412842,
"step": 510
},
{
"epoch": 0.3101237513046071,
"grad_norm": 6.154350558063015e-05,
"learning_rate": 4.7982227323190845e-05,
"logits/chosen": -52.49439239501953,
"logits/rejected": -55.725608825683594,
"logps/chosen": -3611.25341796875,
"logps/rejected": -3967.747314453125,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.996103286743164,
"rewards/margins": 17.36039161682129,
"rewards/rejected": 0.6357126235961914,
"step": 520
},
{
"epoch": 0.3160876695989265,
"grad_norm": 1.5112824769403232e-07,
"learning_rate": 4.788616098785561e-05,
"logits/chosen": -51.7737922668457,
"logits/rejected": -54.860435485839844,
"logps/chosen": -3689.40869140625,
"logps/rejected": -3673.685546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.106679916381836,
"rewards/margins": 18.57061767578125,
"rewards/rejected": 0.5360631942749023,
"step": 530
},
{
"epoch": 0.32205158789324584,
"grad_norm": 6.17673921585083,
"learning_rate": 4.778796143951202e-05,
"logits/chosen": -53.8176155090332,
"logits/rejected": -57.342063903808594,
"logps/chosen": -3829.150390625,
"logps/rejected": -4027.96142578125,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.951896667480469,
"rewards/margins": 18.080907821655273,
"rewards/rejected": -4.129012107849121,
"step": 540
},
{
"epoch": 0.32801550618756525,
"grad_norm": 11.827872276306152,
"learning_rate": 4.768763783131397e-05,
"logits/chosen": -53.82781219482422,
"logits/rejected": -53.793556213378906,
"logps/chosen": -4161.9482421875,
"logps/rejected": -4243.11376953125,
"loss": 0.0095,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.491607666015625,
"rewards/margins": 15.793841361999512,
"rewards/rejected": 4.697766304016113,
"step": 550
},
{
"epoch": 0.3339794244818846,
"grad_norm": 0.006199230439960957,
"learning_rate": 4.7585199514398444e-05,
"logits/chosen": -54.75727462768555,
"logits/rejected": -56.78379440307617,
"logps/chosen": -3925.2890625,
"logps/rejected": -4292.37109375,
"loss": 0.2788,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 25.08880043029785,
"rewards/margins": 15.867774963378906,
"rewards/rejected": 9.221026420593262,
"step": 560
},
{
"epoch": 0.33994334277620397,
"grad_norm": 0.0003209487476851791,
"learning_rate": 4.7480656037013836e-05,
"logits/chosen": -53.4384651184082,
"logits/rejected": -55.634620666503906,
"logps/chosen": -3506.94384765625,
"logps/rejected": -3504.106201171875,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.662946701049805,
"rewards/margins": 15.725804328918457,
"rewards/rejected": -3.062856435775757,
"step": 570
},
{
"epoch": 0.34590726107052333,
"grad_norm": 7.712010119576007e-05,
"learning_rate": 4.7374017143630026e-05,
"logits/chosen": -57.10976028442383,
"logits/rejected": -59.99627685546875,
"logps/chosen": -4153.18603515625,
"logps/rejected": -4331.6298828125,
"loss": 0.0146,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0889763832092285,
"rewards/margins": 18.375919342041016,
"rewards/rejected": -14.286943435668945,
"step": 580
},
{
"epoch": 0.3518711793648427,
"grad_norm": 9.873609815258533e-05,
"learning_rate": 4.726529277403001e-05,
"logits/chosen": -57.93046951293945,
"logits/rejected": -58.8515625,
"logps/chosen": -4072.68115234375,
"logps/rejected": -4284.86962890625,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.040389537811279,
"rewards/margins": 16.860654830932617,
"rewards/rejected": -10.820265769958496,
"step": 590
},
{
"epoch": 0.35783509765916205,
"grad_norm": 2.0305640646256506e-05,
"learning_rate": 4.7154493062383534e-05,
"logits/chosen": -59.621116638183594,
"logits/rejected": -61.127525329589844,
"logps/chosen": -3728.64697265625,
"logps/rejected": -4261.62939453125,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.971760272979736,
"rewards/margins": 19.21591567993164,
"rewards/rejected": -14.244155883789062,
"step": 600
},
{
"epoch": 0.36379901595348146,
"grad_norm": 7.320449367398396e-05,
"learning_rate": 4.704162833630237e-05,
"logits/chosen": -58.1140251159668,
"logits/rejected": -60.18278884887695,
"logps/chosen": -3717.621826171875,
"logps/rejected": -3631.44482421875,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6981580257415771,
"rewards/margins": 17.43337631225586,
"rewards/rejected": -15.735217094421387,
"step": 610
},
{
"epoch": 0.3697629342478008,
"grad_norm": 4.593440532684326,
"learning_rate": 4.692670911587778e-05,
"logits/chosen": -59.49884796142578,
"logits/rejected": -63.38677978515625,
"logps/chosen": -3749.25048828125,
"logps/rejected": -4039.53564453125,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.481489658355713,
"rewards/margins": 20.123058319091797,
"rewards/rejected": -17.64156723022461,
"step": 620
},
{
"epoch": 0.3757268525421202,
"grad_norm": 8.068655006354675e-05,
"learning_rate": 4.680974611269987e-05,
"logits/chosen": -55.29735565185547,
"logits/rejected": -56.667335510253906,
"logps/chosen": -4161.16796875,
"logps/rejected": -4411.65234375,
"loss": 0.1013,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 7.400763034820557,
"rewards/margins": 19.986141204833984,
"rewards/rejected": -12.585378646850586,
"step": 630
},
{
"epoch": 0.38169077083643954,
"grad_norm": 0.007735874503850937,
"learning_rate": 4.669075022885923e-05,
"logits/chosen": -59.70928192138672,
"logits/rejected": -61.80042266845703,
"logps/chosen": -3695.89306640625,
"logps/rejected": -3571.295654296875,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.748845338821411,
"rewards/margins": 17.76656150817871,
"rewards/rejected": -15.017715454101562,
"step": 640
},
{
"epoch": 0.3876546891307589,
"grad_norm": 0.0010597106302157044,
"learning_rate": 4.6569732555930664e-05,
"logits/chosen": -60.23158645629883,
"logits/rejected": -62.561614990234375,
"logps/chosen": -3549.76171875,
"logps/rejected": -3746.29443359375,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5340225100517273,
"rewards/margins": 19.62533187866211,
"rewards/rejected": -20.159353256225586,
"step": 650
},
{
"epoch": 0.39361860742507826,
"grad_norm": 6.7051691985398065e-06,
"learning_rate": 4.6446704373939474e-05,
"logits/chosen": -59.58567428588867,
"logits/rejected": -64.33174133300781,
"logps/chosen": -3639.188720703125,
"logps/rejected": -4331.96484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.348783016204834,
"rewards/margins": 21.183481216430664,
"rewards/rejected": -15.834698677062988,
"step": 660
},
{
"epoch": 0.3995825257193976,
"grad_norm": 0.0003712655452545732,
"learning_rate": 4.632167715030992e-05,
"logits/chosen": -58.96162796020508,
"logits/rejected": -60.83256912231445,
"logps/chosen": -3720.49169921875,
"logps/rejected": -3933.169189453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.91277813911438,
"rewards/margins": 18.707401275634766,
"rewards/rejected": -14.794624328613281,
"step": 670
},
{
"epoch": 0.40554644401371703,
"grad_norm": 0.000252110738074407,
"learning_rate": 4.619466253879643e-05,
"logits/chosen": -58.954345703125,
"logits/rejected": -61.925331115722656,
"logps/chosen": -3840.400390625,
"logps/rejected": -3943.953857421875,
"loss": 1.1078,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 10.758131980895996,
"rewards/margins": 18.48508644104004,
"rewards/rejected": -7.72695255279541,
"step": 680
},
{
"epoch": 0.4115103623080364,
"grad_norm": 1.9803482587121835e-07,
"learning_rate": 4.606567237839733e-05,
"logits/chosen": -58.53533172607422,
"logits/rejected": -62.5938720703125,
"logps/chosen": -3771.753173828125,
"logps/rejected": -3718.02587890625,
"loss": 0.0131,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.095252990722656,
"rewards/margins": 18.99433708190918,
"rewards/rejected": -2.899085760116577,
"step": 690
},
{
"epoch": 0.41747428060235575,
"grad_norm": 0.0626341700553894,
"learning_rate": 4.593471869225132e-05,
"logits/chosen": -54.60516357421875,
"logits/rejected": -55.525299072265625,
"logps/chosen": -3515.046875,
"logps/rejected": -3357.25390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.759082794189453,
"rewards/margins": 17.699445724487305,
"rewards/rejected": 1.0596368312835693,
"step": 700
},
{
"epoch": 0.4234381988966751,
"grad_norm": 4.832664126297459e-05,
"learning_rate": 4.580181368651683e-05,
"logits/chosen": -54.49121856689453,
"logits/rejected": -55.52031326293945,
"logps/chosen": -3684.757080078125,
"logps/rejected": -3753.23828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.624637603759766,
"rewards/margins": 19.6689395904541,
"rewards/rejected": 0.955697238445282,
"step": 710
},
{
"epoch": 0.42940211719099447,
"grad_norm": 5.976962142995035e-07,
"learning_rate": 4.5666969749234276e-05,
"logits/chosen": -54.036834716796875,
"logits/rejected": -56.30643844604492,
"logps/chosen": -3887.684814453125,
"logps/rejected": -4103.453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.976490020751953,
"rewards/margins": 20.665203094482422,
"rewards/rejected": -0.6887091398239136,
"step": 720
},
{
"epoch": 0.43536603548531383,
"grad_norm": 2.6839097699848935e-05,
"learning_rate": 4.553019944917135e-05,
"logits/chosen": -53.194984436035156,
"logits/rejected": -55.72241973876953,
"logps/chosen": -3147.961669921875,
"logps/rejected": -3584.89990234375,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.091827392578125,
"rewards/margins": 18.645496368408203,
"rewards/rejected": 0.44633132219314575,
"step": 730
},
{
"epoch": 0.44132995377963324,
"grad_norm": 5.41134322702419e-05,
"learning_rate": 4.539151553465154e-05,
"logits/chosen": -57.11003875732422,
"logits/rejected": -57.11518096923828,
"logps/chosen": -3673.337158203125,
"logps/rejected": -3887.12646484375,
"loss": 0.0199,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 16.159154891967773,
"rewards/margins": 20.49015998840332,
"rewards/rejected": -4.3310017585754395,
"step": 740
},
{
"epoch": 0.4472938720739526,
"grad_norm": 0.000161544798174873,
"learning_rate": 4.52509309323658e-05,
"logits/chosen": -56.69603729248047,
"logits/rejected": -59.89275360107422,
"logps/chosen": -3596.99462890625,
"logps/rejected": -4027.516357421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.372100830078125,
"rewards/margins": 25.469730377197266,
"rewards/rejected": -12.09763240814209,
"step": 750
},
{
"epoch": 0.45325779036827196,
"grad_norm": 1.5759195548525895e-06,
"learning_rate": 4.510845874616769e-05,
"logits/chosen": -61.1575813293457,
"logits/rejected": -63.29572296142578,
"logps/chosen": -4064.885498046875,
"logps/rejected": -4054.22216796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.724843978881836,
"rewards/margins": 20.249258041381836,
"rewards/rejected": -7.524415016174316,
"step": 760
},
{
"epoch": 0.4592217086625913,
"grad_norm": 0.00013319715799298137,
"learning_rate": 4.4964112255852e-05,
"logits/chosen": -55.76922607421875,
"logits/rejected": -58.15706253051758,
"logps/chosen": -3776.48046875,
"logps/rejected": -3697.975341796875,
"loss": 0.1818,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 12.983065605163574,
"rewards/margins": 20.102405548095703,
"rewards/rejected": -7.119339942932129,
"step": 770
},
{
"epoch": 0.4651856269569107,
"grad_norm": 0.03267952799797058,
"learning_rate": 4.481790491591687e-05,
"logits/chosen": -57.81508255004883,
"logits/rejected": -60.3544807434082,
"logps/chosen": -3629.84765625,
"logps/rejected": -3470.53662109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.1411714553833,
"rewards/margins": 16.54482650756836,
"rewards/rejected": -3.4036548137664795,
"step": 780
},
{
"epoch": 0.47114954525123004,
"grad_norm": 0.003748134011402726,
"learning_rate": 4.466985035430977e-05,
"logits/chosen": -58.23984909057617,
"logits/rejected": -60.508758544921875,
"logps/chosen": -4077.05029296875,
"logps/rejected": -4333.125,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.593130111694336,
"rewards/margins": 17.254772186279297,
"rewards/rejected": -1.6616401672363281,
"step": 790
},
{
"epoch": 0.4771134635455494,
"grad_norm": 0.0014063880080357194,
"learning_rate": 4.4519962371157196e-05,
"logits/chosen": -58.69189453125,
"logits/rejected": -59.80534744262695,
"logps/chosen": -3442.610595703125,
"logps/rejected": -3336.97119140625,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.450828552246094,
"rewards/margins": 16.80379867553711,
"rewards/rejected": -6.352969646453857,
"step": 800
},
{
"epoch": 0.4830773818398688,
"grad_norm": 4.745288606500253e-05,
"learning_rate": 4.4368254937478394e-05,
"logits/chosen": -57.631553649902344,
"logits/rejected": -62.973785400390625,
"logps/chosen": -4004.979736328125,
"logps/rejected": -4168.11767578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.3682861328125,
"rewards/margins": 22.90373420715332,
"rewards/rejected": -14.535449028015137,
"step": 810
},
{
"epoch": 0.4890413001341882,
"grad_norm": 0.0005291652050800622,
"learning_rate": 4.4214742193883094e-05,
"logits/chosen": -60.228851318359375,
"logits/rejected": -61.08269500732422,
"logps/chosen": -4001.78173828125,
"logps/rejected": -4179.3408203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.373010635375977,
"rewards/margins": 20.33767318725586,
"rewards/rejected": -10.964665412902832,
"step": 820
},
{
"epoch": 0.49500521842850753,
"grad_norm": 0.00031022998155094683,
"learning_rate": 4.40594384492535e-05,
"logits/chosen": -57.28828048706055,
"logits/rejected": -59.451202392578125,
"logps/chosen": -3300.813232421875,
"logps/rejected": -3466.85205078125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.398109436035156,
"rewards/margins": 20.157407760620117,
"rewards/rejected": -10.759299278259277,
"step": 830
},
{
"epoch": 0.500969136722827,
"grad_norm": 6.490218311228091e-06,
"learning_rate": 4.390235817941054e-05,
"logits/chosen": -59.27235794067383,
"logits/rejected": -61.06171798706055,
"logps/chosen": -4139.15234375,
"logps/rejected": -3869.400390625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.081242561340332,
"rewards/margins": 19.477825164794922,
"rewards/rejected": -10.396584510803223,
"step": 840
},
{
"epoch": 0.5069330550171463,
"grad_norm": 2.243270955659682e-06,
"learning_rate": 4.37435160257646e-05,
"logits/chosen": -59.277854919433594,
"logits/rejected": -61.45380783081055,
"logps/chosen": -3748.130126953125,
"logps/rejected": -3891.33349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.908781051635742,
"rewards/margins": 19.269939422607422,
"rewards/rejected": -10.36115837097168,
"step": 850
},
{
"epoch": 0.5128969733114657,
"grad_norm": 0.005396808031946421,
"learning_rate": 4.358292679395077e-05,
"logits/chosen": -61.35847091674805,
"logits/rejected": -60.853233337402344,
"logps/chosen": -4117.12109375,
"logps/rejected": -4071.62109375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.670493125915527,
"rewards/margins": 17.932058334350586,
"rewards/rejected": -7.261567115783691,
"step": 860
},
{
"epoch": 0.518860891605785,
"grad_norm": 4.6453762479359284e-05,
"learning_rate": 4.342060545244886e-05,
"logits/chosen": -59.957664489746094,
"logits/rejected": -59.93918991088867,
"logps/chosen": -4070.168701171875,
"logps/rejected": -4095.42236328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.734319686889648,
"rewards/margins": 20.455835342407227,
"rewards/rejected": -4.721514701843262,
"step": 870
},
{
"epoch": 0.5248248099001044,
"grad_norm": 2.395652813902416e-07,
"learning_rate": 4.3256567131188136e-05,
"logits/chosen": -55.78017044067383,
"logits/rejected": -59.07427978515625,
"logps/chosen": -3826.934326171875,
"logps/rejected": -3976.23486328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.891937255859375,
"rewards/margins": 23.67508888244629,
"rewards/rejected": -6.783150672912598,
"step": 880
},
{
"epoch": 0.5307887281944237,
"grad_norm": 2.516942004149314e-06,
"learning_rate": 4.3090827120137114e-05,
"logits/chosen": -55.632896423339844,
"logits/rejected": -59.044334411621094,
"logps/chosen": -3536.372314453125,
"logps/rejected": -3534.555419921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.21674919128418,
"rewards/margins": 20.339038848876953,
"rewards/rejected": -2.1222903728485107,
"step": 890
},
{
"epoch": 0.5367526464887431,
"grad_norm": 2.6818201149581e-05,
"learning_rate": 4.292340086787834e-05,
"logits/chosen": -57.886573791503906,
"logits/rejected": -58.13624954223633,
"logps/chosen": -3513.440673828125,
"logps/rejected": -3463.99169921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.707967758178711,
"rewards/margins": 19.617687225341797,
"rewards/rejected": -3.9097201824188232,
"step": 900
},
{
"epoch": 0.5427165647830625,
"grad_norm": 9.664769459050149e-05,
"learning_rate": 4.2754303980168495e-05,
"logits/chosen": -59.3498420715332,
"logits/rejected": -61.16081619262695,
"logps/chosen": -4103.5751953125,
"logps/rejected": -4111.462890625,
"loss": 0.1983,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 7.029057502746582,
"rewards/margins": 19.05350112915039,
"rewards/rejected": -12.024443626403809,
"step": 910
},
{
"epoch": 0.5486804830773818,
"grad_norm": 0.0024496586993336678,
"learning_rate": 4.2583552218483725e-05,
"logits/chosen": -58.522064208984375,
"logits/rejected": -61.4221305847168,
"logps/chosen": -4057.762451171875,
"logps/rejected": -4270.6669921875,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0752465724945068,
"rewards/margins": 21.978342056274414,
"rewards/rejected": -20.903093338012695,
"step": 920
},
{
"epoch": 0.5546444013717012,
"grad_norm": 9.883645361696836e-06,
"learning_rate": 4.241116149855053e-05,
"logits/chosen": -62.64072799682617,
"logits/rejected": -65.83187103271484,
"logps/chosen": -4190.4921875,
"logps/rejected": -4857.25439453125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.829843044281006,
"rewards/margins": 26.19070816040039,
"rewards/rejected": -33.02054977416992,
"step": 930
},
{
"epoch": 0.5606083196660205,
"grad_norm": 0.00028690064209513366,
"learning_rate": 4.2237147888862305e-05,
"logits/chosen": -63.4923210144043,
"logits/rejected": -66.00108337402344,
"logps/chosen": -3985.04833984375,
"logps/rejected": -4251.2177734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.499919891357422,
"rewards/margins": 20.861995697021484,
"rewards/rejected": -30.361913681030273,
"step": 940
},
{
"epoch": 0.56657223796034,
"grad_norm": 0.00011731364793376997,
"learning_rate": 4.206152760918154e-05,
"logits/chosen": -62.154945373535156,
"logits/rejected": -65.93985748291016,
"logps/chosen": -3877.466796875,
"logps/rejected": -3978.805419921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.469512939453125,
"rewards/margins": 22.071298599243164,
"rewards/rejected": -31.54081153869629,
"step": 950
},
{
"epoch": 0.5725361562546593,
"grad_norm": 8.252065163105726e-05,
"learning_rate": 4.188431702902803e-05,
"logits/chosen": -64.38203430175781,
"logits/rejected": -67.57643127441406,
"logps/chosen": -4045.811279296875,
"logps/rejected": -4326.49267578125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.671667098999023,
"rewards/margins": 22.858341217041016,
"rewards/rejected": -31.53000831604004,
"step": 960
},
{
"epoch": 0.5785000745489787,
"grad_norm": 4.934684625368391e-07,
"learning_rate": 4.1705532666153036e-05,
"logits/chosen": -61.06531524658203,
"logits/rejected": -64.74785614013672,
"logps/chosen": -3762.716064453125,
"logps/rejected": -4029.44775390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9471113681793213,
"rewards/margins": 22.45229148864746,
"rewards/rejected": -20.50518226623535,
"step": 970
},
{
"epoch": 0.5844639928432981,
"grad_norm": 0.006156011950224638,
"learning_rate": 4.152519118499971e-05,
"logits/chosen": -62.082862854003906,
"logits/rejected": -65.42506408691406,
"logps/chosen": -3859.382080078125,
"logps/rejected": -4022.57861328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.43494659662246704,
"rewards/margins": 22.814266204833984,
"rewards/rejected": -22.379318237304688,
"step": 980
},
{
"epoch": 0.5904279111376174,
"grad_norm": 3.552740110990271e-07,
"learning_rate": 4.134330939514979e-05,
"logits/chosen": -61.07642364501953,
"logits/rejected": -64.37804412841797,
"logps/chosen": -4356.12841796875,
"logps/rejected": -4619.02490234375,
"loss": 1.3263,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -3.0648064613342285,
"rewards/margins": 20.497941970825195,
"rewards/rejected": -23.562747955322266,
"step": 990
},
{
"epoch": 0.5963918294319368,
"grad_norm": 1.0799766414493206e-07,
"learning_rate": 4.1159904249756755e-05,
"logits/chosen": -60.018157958984375,
"logits/rejected": -63.62800216674805,
"logps/chosen": -4144.43505859375,
"logps/rejected": -4186.046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4197044372558594,
"rewards/margins": 20.778461456298828,
"rewards/rejected": -18.35875701904297,
"step": 1000
},
{
"epoch": 0.6023557477262561,
"grad_norm": 8.552977305953391e-06,
"learning_rate": 4.097499284396567e-05,
"logits/chosen": -61.17185592651367,
"logits/rejected": -65.2453384399414,
"logps/chosen": -3919.401611328125,
"logps/rejected": -3826.82861328125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.23002290725708,
"rewards/margins": 21.679697036743164,
"rewards/rejected": -16.449674606323242,
"step": 1010
},
{
"epoch": 0.6083196660205755,
"grad_norm": 1.2926037015859038e-05,
"learning_rate": 4.0788592413319724e-05,
"logits/chosen": -57.3408317565918,
"logits/rejected": -57.7392578125,
"logps/chosen": -4130.8193359375,
"logps/rejected": -3755.291748046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.651645660400391,
"rewards/margins": 19.993789672851562,
"rewards/rejected": -12.342143058776855,
"step": 1020
},
{
"epoch": 0.6142835843148949,
"grad_norm": 1.5771300923006493e-06,
"learning_rate": 4.060072033215373e-05,
"logits/chosen": -55.617210388183594,
"logits/rejected": -58.800628662109375,
"logps/chosen": -4015.65771484375,
"logps/rejected": -4279.2431640625,
"loss": 0.0821,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 7.365814208984375,
"rewards/margins": 22.02432632446289,
"rewards/rejected": -14.6585111618042,
"step": 1030
},
{
"epoch": 0.6202475026092142,
"grad_norm": 5.314045483828522e-05,
"learning_rate": 4.0411394111974646e-05,
"logits/chosen": -57.569549560546875,
"logits/rejected": -60.29054641723633,
"logps/chosen": -3956.905517578125,
"logps/rejected": -4089.600341796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.769119262695312,
"rewards/margins": 19.566356658935547,
"rewards/rejected": -8.797237396240234,
"step": 1040
},
{
"epoch": 0.6262114209035337,
"grad_norm": 9.0191870185663e-06,
"learning_rate": 4.022063139982934e-05,
"logits/chosen": -55.201271057128906,
"logits/rejected": -58.512351989746094,
"logps/chosen": -3667.98486328125,
"logps/rejected": -3769.97998046875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.191717147827148,
"rewards/margins": 19.362136840820312,
"rewards/rejected": -8.170419692993164,
"step": 1050
},
{
"epoch": 0.632175339197853,
"grad_norm": 2.962595999633777e-06,
"learning_rate": 4.0028449976659724e-05,
"logits/chosen": -57.57762908935547,
"logits/rejected": -61.270423889160156,
"logps/chosen": -3651.904296875,
"logps/rejected": -3825.788330078125,
"loss": 0.0782,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 12.59385871887207,
"rewards/margins": 21.37427520751953,
"rewards/rejected": -8.780416488647461,
"step": 1060
},
{
"epoch": 0.6381392574921724,
"grad_norm": 3.167168927120656e-07,
"learning_rate": 3.983486775564539e-05,
"logits/chosen": -52.46897506713867,
"logits/rejected": -56.435447692871094,
"logps/chosen": -3674.512451171875,
"logps/rejected": -4028.725341796875,
"loss": 0.0197,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 10.90526294708252,
"rewards/margins": 22.505237579345703,
"rewards/rejected": -11.599977493286133,
"step": 1070
},
{
"epoch": 0.6441031757864917,
"grad_norm": 7.120500231394544e-05,
"learning_rate": 3.963990278053392e-05,
"logits/chosen": -56.88371658325195,
"logits/rejected": -58.082679748535156,
"logps/chosen": -3925.76953125,
"logps/rejected": -4151.24560546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.153375625610352,
"rewards/margins": 23.492019653320312,
"rewards/rejected": -10.33864688873291,
"step": 1080
},
{
"epoch": 0.6500670940808111,
"grad_norm": 3.851781560371137e-09,
"learning_rate": 3.944357322395905e-05,
"logits/chosen": -58.075218200683594,
"logits/rejected": -61.56608963012695,
"logps/chosen": -4175.98583984375,
"logps/rejected": -4781.39599609375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.212366104125977,
"rewards/margins": 27.55126953125,
"rewards/rejected": -17.338903427124023,
"step": 1090
},
{
"epoch": 0.6560310123751305,
"grad_norm": 5.771942568344457e-08,
"learning_rate": 3.9245897385746775e-05,
"logits/chosen": -56.85009765625,
"logits/rejected": -59.448028564453125,
"logps/chosen": -3711.79296875,
"logps/rejected": -3940.455810546875,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.879262924194336,
"rewards/margins": 24.20357322692871,
"rewards/rejected": -11.324310302734375,
"step": 1100
},
{
"epoch": 0.6619949306694498,
"grad_norm": 2.9022641683695838e-05,
"learning_rate": 3.9046893691209664e-05,
"logits/chosen": -53.4720458984375,
"logits/rejected": -57.07532501220703,
"logps/chosen": -3489.55224609375,
"logps/rejected": -3556.66845703125,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.83867359161377,
"rewards/margins": 22.73272705078125,
"rewards/rejected": -8.894055366516113,
"step": 1110
},
{
"epoch": 0.6679588489637692,
"grad_norm": 3.5439442491735917e-09,
"learning_rate": 3.884658068942941e-05,
"logits/chosen": -51.687355041503906,
"logits/rejected": -57.01649856567383,
"logps/chosen": -3604.44384765625,
"logps/rejected": -3905.239501953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.09690284729004,
"rewards/margins": 26.312124252319336,
"rewards/rejected": -10.21522045135498,
"step": 1120
},
{
"epoch": 0.6739227672580885,
"grad_norm": 2.719633585002157e-08,
"learning_rate": 3.8644977051527885e-05,
"logits/chosen": -54.46210861206055,
"logits/rejected": -57.665122985839844,
"logps/chosen": -3400.2109375,
"logps/rejected": -3510.215576171875,
"loss": 0.0207,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.349239349365234,
"rewards/margins": 22.681480407714844,
"rewards/rejected": -5.332241058349609,
"step": 1130
},
{
"epoch": 0.6798866855524079,
"grad_norm": 2.1326864043658134e-06,
"learning_rate": 3.844210156892683e-05,
"logits/chosen": -56.72893142700195,
"logits/rejected": -58.62841796875,
"logps/chosen": -3941.629638671875,
"logps/rejected": -3959.418701171875,
"loss": 0.0106,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.112319946289062,
"rewards/margins": 23.076122283935547,
"rewards/rejected": -11.96380615234375,
"step": 1140
},
{
"epoch": 0.6858506038467272,
"grad_norm": 1.068567740958315e-07,
"learning_rate": 3.823797315159629e-05,
"logits/chosen": -55.89997482299805,
"logits/rejected": -58.0466423034668,
"logps/chosen": -4139.759765625,
"logps/rejected": -3862.365966796875,
"loss": 0.0232,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 10.52987289428711,
"rewards/margins": 23.36825180053711,
"rewards/rejected": -12.83838176727295,
"step": 1150
},
{
"epoch": 0.6918145221410467,
"grad_norm": 1.9215246993553592e-07,
"learning_rate": 3.803261082629198e-05,
"logits/chosen": -57.30939865112305,
"logits/rejected": -60.035255432128906,
"logps/chosen": -3841.99853515625,
"logps/rejected": -3897.645263671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.77658462524414,
"rewards/margins": 23.096725463867188,
"rewards/rejected": -12.320140838623047,
"step": 1160
},
{
"epoch": 0.6977784404353661,
"grad_norm": 1.6344721416317043e-06,
"learning_rate": 3.782603373478194e-05,
"logits/chosen": -59.83994674682617,
"logits/rejected": -62.819435119628906,
"logps/chosen": -3778.887939453125,
"logps/rejected": -3805.174560546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.7517805099487305,
"rewards/margins": 21.68671417236328,
"rewards/rejected": -13.934931755065918,
"step": 1170
},
{
"epoch": 0.7037423587296854,
"grad_norm": 0.0002575635153334588,
"learning_rate": 3.761826113206216e-05,
"logits/chosen": -54.88861846923828,
"logits/rejected": -58.307289123535156,
"logps/chosen": -3845.460205078125,
"logps/rejected": -3828.728515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.72022819519043,
"rewards/margins": 23.8989200592041,
"rewards/rejected": -15.178689956665039,
"step": 1180
},
{
"epoch": 0.7097062770240048,
"grad_norm": 8.875089406501502e-05,
"learning_rate": 3.740931238456195e-05,
"logits/chosen": -56.36833572387695,
"logits/rejected": -59.8969612121582,
"logps/chosen": -3818.005126953125,
"logps/rejected": -3839.862060546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.653704643249512,
"rewards/margins": 23.86166763305664,
"rewards/rejected": -15.207963943481445,
"step": 1190
},
{
"epoch": 0.7156701953183241,
"grad_norm": 1.4342495887831319e-05,
"learning_rate": 3.7199206968338776e-05,
"logits/chosen": -57.47324752807617,
"logits/rejected": -61.022796630859375,
"logps/chosen": -3948.418701171875,
"logps/rejected": -4151.45849609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.350122451782227,
"rewards/margins": 23.93897247314453,
"rewards/rejected": -13.588849067687988,
"step": 1200
},
{
"epoch": 0.7216341136126435,
"grad_norm": 0.13247327506542206,
"learning_rate": 3.6987964467262866e-05,
"logits/chosen": -57.539756774902344,
"logits/rejected": -60.251495361328125,
"logps/chosen": -3663.866455078125,
"logps/rejected": -3746.822265625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.292342185974121,
"rewards/margins": 21.296642303466797,
"rewards/rejected": -13.004300117492676,
"step": 1210
},
{
"epoch": 0.7275980319069629,
"grad_norm": 1.1016503776772879e-05,
"learning_rate": 3.6775604571191835e-05,
"logits/chosen": -57.985084533691406,
"logits/rejected": -62.3299446105957,
"logps/chosen": -3958.30029296875,
"logps/rejected": -4056.701171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.709458351135254,
"rewards/margins": 25.354206085205078,
"rewards/rejected": -16.644746780395508,
"step": 1220
},
{
"epoch": 0.7335619502012822,
"grad_norm": 5.239522238298377e-07,
"learning_rate": 3.6562147074135395e-05,
"logits/chosen": -55.77289581298828,
"logits/rejected": -56.620445251464844,
"logps/chosen": -3935.78173828125,
"logps/rejected": -4204.76025390625,
"loss": 0.014,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.306615829467773,
"rewards/margins": 23.17990493774414,
"rewards/rejected": -10.873285293579102,
"step": 1230
},
{
"epoch": 0.7395258684956016,
"grad_norm": 0.005283840466290712,
"learning_rate": 3.6347611872410347e-05,
"logits/chosen": -56.01280975341797,
"logits/rejected": -59.32996368408203,
"logps/chosen": -3594.85009765625,
"logps/rejected": -3652.62646484375,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.890737533569336,
"rewards/margins": 21.618831634521484,
"rewards/rejected": -9.728094100952148,
"step": 1240
},
{
"epoch": 0.745489786789921,
"grad_norm": 4.326713315094821e-05,
"learning_rate": 3.6132018962786066e-05,
"logits/chosen": -55.68181610107422,
"logits/rejected": -60.5903434753418,
"logps/chosen": -3649.29248046875,
"logps/rejected": -4155.4248046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.527814865112305,
"rewards/margins": 24.51072883605957,
"rewards/rejected": -9.982913970947266,
"step": 1250
},
{
"epoch": 0.7514537050842404,
"grad_norm": 4.07174782779407e-09,
"learning_rate": 3.591538844062058e-05,
"logits/chosen": -56.679443359375,
"logits/rejected": -58.407135009765625,
"logps/chosen": -3861.89453125,
"logps/rejected": -3911.80078125,
"loss": 0.182,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 14.151501655578613,
"rewards/margins": 23.692068099975586,
"rewards/rejected": -9.540567398071289,
"step": 1260
},
{
"epoch": 0.7574176233785597,
"grad_norm": 1.2413269701028184e-07,
"learning_rate": 3.5697740497987554e-05,
"logits/chosen": -53.7273063659668,
"logits/rejected": -59.340187072753906,
"logps/chosen": -3629.626220703125,
"logps/rejected": -3897.79833984375,
"loss": 0.0219,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 11.900304794311523,
"rewards/margins": 24.943315505981445,
"rewards/rejected": -13.043011665344238,
"step": 1270
},
{
"epoch": 0.7633815416728791,
"grad_norm": 4.0087169850266946e-07,
"learning_rate": 3.5479095421794087e-05,
"logits/chosen": -58.437957763671875,
"logits/rejected": -59.74829864501953,
"logps/chosen": -4079.52685546875,
"logps/rejected": -4265.56298828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.92692756652832,
"rewards/margins": 24.08592414855957,
"rewards/rejected": -15.158998489379883,
"step": 1280
},
{
"epoch": 0.7693454599671985,
"grad_norm": 5.956236464044196e-07,
"learning_rate": 3.525947359188988e-05,
"logits/chosen": -56.96044158935547,
"logits/rejected": -60.473365783691406,
"logps/chosen": -3507.78662109375,
"logps/rejected": -3547.66455078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.140953063964844,
"rewards/margins": 25.788040161132812,
"rewards/rejected": -17.6470890045166,
"step": 1290
},
{
"epoch": 0.7753093782615178,
"grad_norm": 5.90286333590484e-07,
"learning_rate": 3.503889547916757e-05,
"logits/chosen": -56.5598258972168,
"logits/rejected": -60.83135223388672,
"logps/chosen": -3679.897216796875,
"logps/rejected": -4072.563720703125,
"loss": 0.0251,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 8.379316329956055,
"rewards/margins": 24.321413040161133,
"rewards/rejected": -15.942098617553711,
"step": 1300
},
{
"epoch": 0.7812732965558372,
"grad_norm": 0.00029456091579049826,
"learning_rate": 3.4817381643654656e-05,
"logits/chosen": -54.76763153076172,
"logits/rejected": -57.302955627441406,
"logps/chosen": -3611.793701171875,
"logps/rejected": -3881.207763671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.57978343963623,
"rewards/margins": 24.082733154296875,
"rewards/rejected": -12.502950668334961,
"step": 1310
},
{
"epoch": 0.7872372148501565,
"grad_norm": 2.6790543117272136e-08,
"learning_rate": 3.4594952732597114e-05,
"logits/chosen": -57.267723083496094,
"logits/rejected": -61.67658233642578,
"logps/chosen": -3998.71630859375,
"logps/rejected": -4348.85205078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.019363403320312,
"rewards/margins": 25.229736328125,
"rewards/rejected": -10.210372924804688,
"step": 1320
},
{
"epoch": 0.7932011331444759,
"grad_norm": 2.403813823548262e-07,
"learning_rate": 3.437162947853488e-05,
"logits/chosen": -52.67829132080078,
"logits/rejected": -57.03126907348633,
"logps/chosen": -3397.01953125,
"logps/rejected": -3500.949951171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.576359748840332,
"rewards/margins": 23.891281127929688,
"rewards/rejected": -10.314921379089355,
"step": 1330
},
{
"epoch": 0.7991650514387952,
"grad_norm": 1.207563855132321e-06,
"learning_rate": 3.4147432697369366e-05,
"logits/chosen": -54.901878356933594,
"logits/rejected": -58.046234130859375,
"logps/chosen": -3778.25390625,
"logps/rejected": -4273.1123046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.951375007629395,
"rewards/margins": 22.807498931884766,
"rewards/rejected": -7.8561224937438965,
"step": 1340
},
{
"epoch": 0.8051289697331147,
"grad_norm": 9.455924009671435e-05,
"learning_rate": 3.392238328642319e-05,
"logits/chosen": -56.3574333190918,
"logits/rejected": -60.68909454345703,
"logps/chosen": -3632.43603515625,
"logps/rejected": -3841.61767578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.990877151489258,
"rewards/margins": 24.18165397644043,
"rewards/rejected": -11.190776824951172,
"step": 1350
},
{
"epoch": 0.8110928880274341,
"grad_norm": 1.4930514602440326e-09,
"learning_rate": 3.3696502222492384e-05,
"logits/chosen": -55.69526290893555,
"logits/rejected": -59.13079833984375,
"logps/chosen": -3696.494140625,
"logps/rejected": -3736.537109375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.624422073364258,
"rewards/margins": 24.236011505126953,
"rewards/rejected": -7.611591339111328,
"step": 1360
},
{
"epoch": 0.8170568063217534,
"grad_norm": 2.062861739204891e-07,
"learning_rate": 3.346981055989114e-05,
"logits/chosen": -56.82810592651367,
"logits/rejected": -62.07573318481445,
"logps/chosen": -3906.99267578125,
"logps/rejected": -4055.106689453125,
"loss": 0.6131,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 12.317488670349121,
"rewards/margins": 21.558252334594727,
"rewards/rejected": -9.240762710571289,
"step": 1370
},
{
"epoch": 0.8230207246160728,
"grad_norm": 9.348526509711519e-05,
"learning_rate": 3.324232942848933e-05,
"logits/chosen": -54.5309944152832,
"logits/rejected": -54.803611755371094,
"logps/chosen": -3432.692626953125,
"logps/rejected": -3552.36328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.066215515136719,
"rewards/margins": 22.905208587646484,
"rewards/rejected": -11.838993072509766,
"step": 1380
},
{
"epoch": 0.8289846429103921,
"grad_norm": 2.5315921448054723e-07,
"learning_rate": 3.3014080031743e-05,
"logits/chosen": -53.31890869140625,
"logits/rejected": -56.65520095825195,
"logps/chosen": -3301.90087890625,
"logps/rejected": -3586.98876953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.38410472869873,
"rewards/margins": 27.104639053344727,
"rewards/rejected": -14.720535278320312,
"step": 1390
},
{
"epoch": 0.8349485612047115,
"grad_norm": 1.6616603204511193e-07,
"learning_rate": 3.278508364471801e-05,
"logits/chosen": -57.218421936035156,
"logits/rejected": -59.43006134033203,
"logps/chosen": -3883.231201171875,
"logps/rejected": -4120.4892578125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.72683048248291,
"rewards/margins": 24.0744686126709,
"rewards/rejected": -13.347638130187988,
"step": 1400
},
{
"epoch": 0.8409124794990309,
"grad_norm": 7.670184487551523e-08,
"learning_rate": 3.255536161210699e-05,
"logits/chosen": -55.0363655090332,
"logits/rejected": -57.93296432495117,
"logps/chosen": -3802.41455078125,
"logps/rejected": -3902.47509765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.028858184814453,
"rewards/margins": 22.72267723083496,
"rewards/rejected": -10.693819046020508,
"step": 1410
},
{
"epoch": 0.8468763977933502,
"grad_norm": 1.7279053565744107e-07,
"learning_rate": 3.2324935346239796e-05,
"logits/chosen": -54.202125549316406,
"logits/rejected": -57.15998458862305,
"logps/chosen": -3656.39990234375,
"logps/rejected": -3975.80859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.18800163269043,
"rewards/margins": 24.70977783203125,
"rewards/rejected": -13.52177619934082,
"step": 1420
},
{
"epoch": 0.8528403160876696,
"grad_norm": 2.6494813454291943e-08,
"learning_rate": 3.209382632508768e-05,
"logits/chosen": -55.63017654418945,
"logits/rejected": -59.00508499145508,
"logps/chosen": -4403.7734375,
"logps/rejected": -4720.34228515625,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.121981620788574,
"rewards/margins": 23.280654907226562,
"rewards/rejected": -13.158673286437988,
"step": 1430
},
{
"epoch": 0.8588042343819889,
"grad_norm": 2.5004118242577533e-07,
"learning_rate": 3.1862056090261336e-05,
"logits/chosen": -54.925018310546875,
"logits/rejected": -59.058372497558594,
"logps/chosen": -3932.13427734375,
"logps/rejected": -3746.4765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.681973457336426,
"rewards/margins": 22.887840270996094,
"rewards/rejected": -8.205865859985352,
"step": 1440
},
{
"epoch": 0.8647681526763084,
"grad_norm": 2.1602265576348145e-07,
"learning_rate": 3.162964624500301e-05,
"logits/chosen": -54.48186492919922,
"logits/rejected": -56.77177047729492,
"logps/chosen": -3672.371826171875,
"logps/rejected": -3484.89697265625,
"loss": 0.0206,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 19.161680221557617,
"rewards/margins": 24.562843322753906,
"rewards/rejected": -5.401161193847656,
"step": 1450
},
{
"epoch": 0.8707320709706277,
"grad_norm": 3.4755198612401728e-06,
"learning_rate": 3.139661845217287e-05,
"logits/chosen": -51.368988037109375,
"logits/rejected": -55.37031936645508,
"logps/chosen": -3605.017578125,
"logps/rejected": -3807.35595703125,
"loss": 0.0054,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.435853958129883,
"rewards/margins": 23.906322479248047,
"rewards/rejected": -6.470468044281006,
"step": 1460
},
{
"epoch": 0.8766959892649471,
"grad_norm": 3.5051095892413286e-06,
"learning_rate": 3.11629944322298e-05,
"logits/chosen": -58.4751091003418,
"logits/rejected": -62.437644958496094,
"logps/chosen": -3605.69140625,
"logps/rejected": -4044.111328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.948145866394043,
"rewards/margins": 26.044723510742188,
"rewards/rejected": -11.096578598022461,
"step": 1470
},
{
"epoch": 0.8826599075592665,
"grad_norm": 5.029290406355358e-08,
"learning_rate": 3.092879596120689e-05,
"logits/chosen": -59.90129852294922,
"logits/rejected": -63.65130615234375,
"logps/chosen": -4040.684326171875,
"logps/rejected": -3993.44677734375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.216289520263672,
"rewards/margins": 23.25588607788086,
"rewards/rejected": -13.039596557617188,
"step": 1480
},
{
"epoch": 0.8886238258535858,
"grad_norm": 1.237174153327942,
"learning_rate": 3.06940448686816e-05,
"logits/chosen": -59.22917556762695,
"logits/rejected": -61.15869140625,
"logps/chosen": -3975.68408203125,
"logps/rejected": -4147.9228515625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.068977355957031,
"rewards/margins": 24.409832000732422,
"rewards/rejected": -11.340853691101074,
"step": 1490
},
{
"epoch": 0.8945877441479052,
"grad_norm": 9.023612801684067e-05,
"learning_rate": 3.045876303574116e-05,
"logits/chosen": -59.75891876220703,
"logits/rejected": -62.84197998046875,
"logps/chosen": -3887.1640625,
"logps/rejected": -3652.65869140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.598661422729492,
"rewards/margins": 22.873065948486328,
"rewards/rejected": -11.274404525756836,
"step": 1500
},
{
"epoch": 0.9005516624422245,
"grad_norm": 4.560320121527184e-08,
"learning_rate": 3.0222972392942943e-05,
"logits/chosen": -58.6246337890625,
"logits/rejected": -61.87261962890625,
"logps/chosen": -3715.770263671875,
"logps/rejected": -4073.483154296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.044273376464844,
"rewards/margins": 26.504796981811523,
"rewards/rejected": -16.460525512695312,
"step": 1510
},
{
"epoch": 0.9065155807365439,
"grad_norm": 1.8267148504946817e-07,
"learning_rate": 2.998669491827035e-05,
"logits/chosen": -54.9869499206543,
"logits/rejected": -58.3288459777832,
"logps/chosen": -3689.07373046875,
"logps/rejected": -3788.818359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.276895523071289,
"rewards/margins": 23.46544075012207,
"rewards/rejected": -14.188543319702148,
"step": 1520
},
{
"epoch": 0.9124794990308632,
"grad_norm": 0.07166226208209991,
"learning_rate": 2.9749952635084254e-05,
"logits/chosen": -58.55971145629883,
"logits/rejected": -61.770591735839844,
"logps/chosen": -3907.703125,
"logps/rejected": -3977.048095703125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.65731430053711,
"rewards/margins": 23.61507225036621,
"rewards/rejected": -14.957756042480469,
"step": 1530
},
{
"epoch": 0.9184434173251826,
"grad_norm": 5.0479648052714765e-06,
"learning_rate": 2.9512767610070235e-05,
"logits/chosen": -58.743080139160156,
"logits/rejected": -62.081268310546875,
"logps/chosen": -4091.01171875,
"logps/rejected": -4007.314453125,
"loss": 0.0127,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.914778232574463,
"rewards/margins": 22.504919052124023,
"rewards/rejected": -15.590141296386719,
"step": 1540
},
{
"epoch": 0.9244073356195021,
"grad_norm": 5.116536527793869e-09,
"learning_rate": 2.927516195118167e-05,
"logits/chosen": -58.5189323425293,
"logits/rejected": -60.991737365722656,
"logps/chosen": -3521.76025390625,
"logps/rejected": -3490.95068359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.615170478820801,
"rewards/margins": 23.283008575439453,
"rewards/rejected": -17.667837142944336,
"step": 1550
},
{
"epoch": 0.9303712539138214,
"grad_norm": 3.0113267257547705e-06,
"learning_rate": 2.903715780557915e-05,
"logits/chosen": -58.911537170410156,
"logits/rejected": -61.680030822753906,
"logps/chosen": -3976.016357421875,
"logps/rejected": -3833.219482421875,
"loss": 0.0708,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 5.67651891708374,
"rewards/margins": 25.14394187927246,
"rewards/rejected": -19.467424392700195,
"step": 1560
},
{
"epoch": 0.9363351722081408,
"grad_norm": 7.281344949205959e-08,
"learning_rate": 2.8798777357566102e-05,
"logits/chosen": -62.60260009765625,
"logits/rejected": -64.15509796142578,
"logps/chosen": -4138.4677734375,
"logps/rejected": -3727.09375,
"loss": 0.0231,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 6.600105285644531,
"rewards/margins": 23.73483657836914,
"rewards/rejected": -17.13473129272461,
"step": 1570
},
{
"epoch": 0.9422990905024601,
"grad_norm": 1.6167986416348867e-07,
"learning_rate": 2.8560042826520983e-05,
"logits/chosen": -55.1787223815918,
"logits/rejected": -57.27393341064453,
"logps/chosen": -3631.578125,
"logps/rejected": -3276.724609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.137773513793945,
"rewards/margins": 24.159759521484375,
"rewards/rejected": -9.021984100341797,
"step": 1580
},
{
"epoch": 0.9482630087967795,
"grad_norm": 0.0022099693305790424,
"learning_rate": 2.8320976464826233e-05,
"logits/chosen": -56.308815002441406,
"logits/rejected": -58.647621154785156,
"logps/chosen": -3679.826171875,
"logps/rejected": -3692.447998046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.499637603759766,
"rewards/margins": 25.266708374023438,
"rewards/rejected": -7.767067909240723,
"step": 1590
},
{
"epoch": 0.9542269270910988,
"grad_norm": 14.162381172180176,
"learning_rate": 2.808160055579418e-05,
"logits/chosen": -58.639259338378906,
"logits/rejected": -61.093353271484375,
"logps/chosen": -4223.41552734375,
"logps/rejected": -4175.84814453125,
"loss": 0.0454,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 21.25933265686035,
"rewards/margins": 25.541118621826172,
"rewards/rejected": -4.281786918640137,
"step": 1600
},
{
"epoch": 0.9601908453854182,
"grad_norm": 5.322955871633894e-07,
"learning_rate": 2.784193741158993e-05,
"logits/chosen": -54.426841735839844,
"logits/rejected": -58.14940643310547,
"logps/chosen": -3445.11474609375,
"logps/rejected": -3718.72119140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 18.399213790893555,
"rewards/margins": 24.536479949951172,
"rewards/rejected": -6.137265682220459,
"step": 1610
},
{
"epoch": 0.9661547636797376,
"grad_norm": 5.999038876325358e-06,
"learning_rate": 2.7602009371151717e-05,
"logits/chosen": -57.2712287902832,
"logits/rejected": -61.121788024902344,
"logps/chosen": -4308.03466796875,
"logps/rejected": -4611.1044921875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 22.73969078063965,
"rewards/margins": 24.401287078857422,
"rewards/rejected": -1.6615936756134033,
"step": 1620
},
{
"epoch": 0.9721186819740569,
"grad_norm": 0.005200933199375868,
"learning_rate": 2.7361838798108714e-05,
"logits/chosen": -56.9376220703125,
"logits/rejected": -59.23795700073242,
"logps/chosen": -3882.723876953125,
"logps/rejected": -4050.682373046875,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.68083953857422,
"rewards/margins": 24.391653060913086,
"rewards/rejected": -3.710813045501709,
"step": 1630
},
{
"epoch": 0.9780826002683763,
"grad_norm": 2.6656310936346017e-08,
"learning_rate": 2.7121448078696437e-05,
"logits/chosen": -53.07708740234375,
"logits/rejected": -57.34746170043945,
"logps/chosen": -3162.02685546875,
"logps/rejected": -3019.009033203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.784738540649414,
"rewards/margins": 26.76715087890625,
"rewards/rejected": -5.982410907745361,
"step": 1640
},
{
"epoch": 0.9840465185626956,
"grad_norm": 1.415725847664362e-07,
"learning_rate": 2.6880859619670236e-05,
"logits/chosen": -54.78126907348633,
"logits/rejected": -57.343162536621094,
"logps/chosen": -3768.965576171875,
"logps/rejected": -3766.91259765625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 20.079748153686523,
"rewards/margins": 26.081069946289062,
"rewards/rejected": -6.0013227462768555,
"step": 1650
},
{
"epoch": 0.9900104368570151,
"grad_norm": 1.0123216043211869e-08,
"learning_rate": 2.66400958462167e-05,
"logits/chosen": -55.69786834716797,
"logits/rejected": -60.4398078918457,
"logps/chosen": -3736.546142578125,
"logps/rejected": -4183.6357421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.356075286865234,
"rewards/margins": 28.006738662719727,
"rewards/rejected": -11.650662422180176,
"step": 1660
},
{
"epoch": 0.9959743551513345,
"grad_norm": 2.6384802254142414e-07,
"learning_rate": 2.6399179199863423e-05,
"logits/chosen": -55.284812927246094,
"logits/rejected": -59.626625061035156,
"logps/chosen": -3676.928466796875,
"logps/rejected": -3803.169921875,
"loss": 0.0558,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 14.217809677124023,
"rewards/margins": 24.813610076904297,
"rewards/rejected": -10.595799446105957,
"step": 1670
},
{
"epoch": 1.0017891754882957,
"grad_norm": 7.79418769525364e-06,
"learning_rate": 2.6158132136387247e-05,
"logits/chosen": -57.85382843017578,
"logits/rejected": -59.920597076416016,
"logps/chosen": -3755.565673828125,
"logps/rejected": -4092.383056640625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.796311378479004,
"rewards/margins": 25.673641204833984,
"rewards/rejected": -10.87733268737793,
"step": 1680
},
{
"epoch": 1.0077530937826151,
"grad_norm": 3.236905854464567e-08,
"learning_rate": 2.5916977123721166e-05,
"logits/chosen": -57.10400390625,
"logits/rejected": -59.06767654418945,
"logps/chosen": -3694.768798828125,
"logps/rejected": -4206.1513671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.047399520874023,
"rewards/margins": 24.886272430419922,
"rewards/rejected": -13.838871955871582,
"step": 1690
},
{
"epoch": 1.0137170120769345,
"grad_norm": 9.515994747744116e-08,
"learning_rate": 2.5675736639860077e-05,
"logits/chosen": -55.722320556640625,
"logits/rejected": -59.74749755859375,
"logps/chosen": -3329.92041015625,
"logps/rejected": -3575.55224609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.99412727355957,
"rewards/margins": 26.575632095336914,
"rewards/rejected": -10.581506729125977,
"step": 1700
},
{
"epoch": 1.019680930371254,
"grad_norm": 2.12275141908691e-10,
"learning_rate": 2.5434433170765635e-05,
"logits/chosen": -55.73252487182617,
"logits/rejected": -59.35845947265625,
"logps/chosen": -3671.85693359375,
"logps/rejected": -4161.1396484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.043779373168945,
"rewards/margins": 28.186742782592773,
"rewards/rejected": -13.142965316772461,
"step": 1710
},
{
"epoch": 1.0256448486655734,
"grad_norm": 1.0201654276897898e-06,
"learning_rate": 2.5193089208270332e-05,
"logits/chosen": -55.662010192871094,
"logits/rejected": -60.647422790527344,
"logps/chosen": -3821.21240234375,
"logps/rejected": -3968.89306640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.09891128540039,
"rewards/margins": 27.115798950195312,
"rewards/rejected": -11.016887664794922,
"step": 1720
},
{
"epoch": 1.0316087669598926,
"grad_norm": 4.6147793919537605e-10,
"learning_rate": 2.4951727247981026e-05,
"logits/chosen": -57.985267639160156,
"logits/rejected": -60.434349060058594,
"logps/chosen": -3654.449951171875,
"logps/rejected": -3963.91162109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.611637115478516,
"rewards/margins": 26.88236427307129,
"rewards/rejected": -9.270727157592773,
"step": 1730
},
{
"epoch": 1.037572685254212,
"grad_norm": 6.404677696991712e-05,
"learning_rate": 2.4710369787182163e-05,
"logits/chosen": -57.09203338623047,
"logits/rejected": -61.46424102783203,
"logps/chosen": -3566.01416015625,
"logps/rejected": -3992.688720703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.677534103393555,
"rewards/margins": 24.495372772216797,
"rewards/rejected": -8.81783676147461,
"step": 1740
},
{
"epoch": 1.0435366035485314,
"grad_norm": 3.6316012597126246e-07,
"learning_rate": 2.4469039322738786e-05,
"logits/chosen": -56.2172737121582,
"logits/rejected": -61.31587600708008,
"logps/chosen": -3745.28369140625,
"logps/rejected": -3803.864501953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.529300689697266,
"rewards/margins": 27.05365562438965,
"rewards/rejected": -10.524356842041016,
"step": 1750
},
{
"epoch": 1.0495005218428508,
"grad_norm": 3.816726064087561e-07,
"learning_rate": 2.42277583489996e-05,
"logits/chosen": -59.777565002441406,
"logits/rejected": -63.48384475708008,
"logps/chosen": -3635.927734375,
"logps/rejected": -4089.21630859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.0086088180542,
"rewards/margins": 25.672292709350586,
"rewards/rejected": -11.663687705993652,
"step": 1760
},
{
"epoch": 1.0554644401371702,
"grad_norm": 2.641392484292737e-06,
"learning_rate": 2.3986549355700308e-05,
"logits/chosen": -59.445106506347656,
"logits/rejected": -62.23298263549805,
"logps/chosen": -3913.08447265625,
"logps/rejected": -4078.412109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.499652862548828,
"rewards/margins": 24.941543579101562,
"rewards/rejected": -8.441890716552734,
"step": 1770
},
{
"epoch": 1.0614283584314894,
"grad_norm": 3.112697655183183e-08,
"learning_rate": 2.3745434825867347e-05,
"logits/chosen": -55.928680419921875,
"logits/rejected": -60.84385299682617,
"logps/chosen": -3434.234375,
"logps/rejected": -3640.389892578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.650065422058105,
"rewards/margins": 28.1390323638916,
"rewards/rejected": -12.488967895507812,
"step": 1780
},
{
"epoch": 1.0673922767258088,
"grad_norm": 2.6007089672930306e-06,
"learning_rate": 2.3504437233722214e-05,
"logits/chosen": -56.6685905456543,
"logits/rejected": -62.179771423339844,
"logps/chosen": -3519.731201171875,
"logps/rejected": -3699.080810546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.84507942199707,
"rewards/margins": 26.682373046875,
"rewards/rejected": -10.83729362487793,
"step": 1790
},
{
"epoch": 1.0733561950201282,
"grad_norm": 1.2538286447525024,
"learning_rate": 2.3263579042586697e-05,
"logits/chosen": -54.93901824951172,
"logits/rejected": -57.605445861816406,
"logps/chosen": -3739.166748046875,
"logps/rejected": -3709.137451171875,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.472308158874512,
"rewards/margins": 25.952701568603516,
"rewards/rejected": -10.480390548706055,
"step": 1800
},
{
"epoch": 1.0793201133144477,
"grad_norm": 1.3886967964449326e-11,
"learning_rate": 2.302288270278904e-05,
"logits/chosen": -58.45922088623047,
"logits/rejected": -60.769447326660156,
"logps/chosen": -4227.611328125,
"logps/rejected": -4209.0576171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.69550609588623,
"rewards/margins": 27.71603012084961,
"rewards/rejected": -12.020523071289062,
"step": 1810
},
{
"epoch": 1.0852840316087669,
"grad_norm": 4.680672418544418e-07,
"learning_rate": 2.2782370649571368e-05,
"logits/chosen": -57.41197967529297,
"logits/rejected": -61.311851501464844,
"logps/chosen": -3252.1953125,
"logps/rejected": -3305.15966796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.210145950317383,
"rewards/margins": 26.91498374938965,
"rewards/rejected": -14.704841613769531,
"step": 1820
},
{
"epoch": 1.0912479499030863,
"grad_norm": 1.5242182598740328e-05,
"learning_rate": 2.25420653009985e-05,
"logits/chosen": -60.623023986816406,
"logits/rejected": -65.05506134033203,
"logps/chosen": -3815.992919921875,
"logps/rejected": -3862.602783203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.181077003479004,
"rewards/margins": 28.176227569580078,
"rewards/rejected": -16.995149612426758,
"step": 1830
},
{
"epoch": 1.0972118681974057,
"grad_norm": 0.0001548586442368105,
"learning_rate": 2.2301989055868383e-05,
"logits/chosen": -59.02531051635742,
"logits/rejected": -63.0557975769043,
"logps/chosen": -3652.93701171875,
"logps/rejected": -3930.30419921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.64163875579834,
"rewards/margins": 25.959850311279297,
"rewards/rejected": -17.318214416503906,
"step": 1840
},
{
"epoch": 1.103175786491725,
"grad_norm": 2.7569740268518217e-05,
"learning_rate": 2.2062164291624284e-05,
"logits/chosen": -59.61224365234375,
"logits/rejected": -62.123130798339844,
"logps/chosen": -4048.87939453125,
"logps/rejected": -4120.58837890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.486291885375977,
"rewards/margins": 29.3957462310791,
"rewards/rejected": -17.909452438354492,
"step": 1850
},
{
"epoch": 1.1091397047860445,
"grad_norm": 5.057771423899737e-10,
"learning_rate": 2.1822613362269e-05,
"logits/chosen": -57.71868896484375,
"logits/rejected": -60.3415412902832,
"logps/chosen": -4136.40966796875,
"logps/rejected": -4483.23583984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.933389663696289,
"rewards/margins": 29.532873153686523,
"rewards/rejected": -18.599483489990234,
"step": 1860
},
{
"epoch": 1.1151036230803637,
"grad_norm": 3.651170255025704e-09,
"learning_rate": 2.158335859628126e-05,
"logits/chosen": -58.62504959106445,
"logits/rejected": -62.4349479675293,
"logps/chosen": -4120.52001953125,
"logps/rejected": -4143.6689453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.698260307312012,
"rewards/margins": 27.590999603271484,
"rewards/rejected": -17.892738342285156,
"step": 1870
},
{
"epoch": 1.1210675413746831,
"grad_norm": 1.4945511495056962e-08,
"learning_rate": 2.1344422294534466e-05,
"logits/chosen": -54.32935333251953,
"logits/rejected": -58.437278747558594,
"logps/chosen": -3511.121826171875,
"logps/rejected": -3695.01416015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.38096809387207,
"rewards/margins": 27.27777099609375,
"rewards/rejected": -15.896801948547363,
"step": 1880
},
{
"epoch": 1.1270314596690025,
"grad_norm": 3.889097206410952e-05,
"learning_rate": 2.1105826728218072e-05,
"logits/chosen": -57.310203552246094,
"logits/rejected": -61.566688537597656,
"logps/chosen": -3701.465576171875,
"logps/rejected": -4115.97802734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.651991844177246,
"rewards/margins": 25.42984962463379,
"rewards/rejected": -16.77785873413086,
"step": 1890
},
{
"epoch": 1.132995377963322,
"grad_norm": 3.0905356052102206e-09,
"learning_rate": 2.086759413676166e-05,
"logits/chosen": -59.252281188964844,
"logits/rejected": -61.68608474731445,
"logps/chosen": -4090.481201171875,
"logps/rejected": -4027.00390625,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.865057945251465,
"rewards/margins": 25.79592514038086,
"rewards/rejected": -16.930866241455078,
"step": 1900
},
{
"epoch": 1.1389592962576414,
"grad_norm": 9.62276430982456e-07,
"learning_rate": 2.062974672576203e-05,
"logits/chosen": -55.8050537109375,
"logits/rejected": -59.1325798034668,
"logps/chosen": -3845.809814453125,
"logps/rejected": -3548.93994140625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.28779411315918,
"rewards/margins": 27.17965316772461,
"rewards/rejected": -15.891860961914062,
"step": 1910
},
{
"epoch": 1.1449232145519606,
"grad_norm": 5.546740311501708e-08,
"learning_rate": 2.0392306664913414e-05,
"logits/chosen": -59.28105926513672,
"logits/rejected": -62.87616729736328,
"logps/chosen": -3794.875732421875,
"logps/rejected": -4073.58740234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.75609016418457,
"rewards/margins": 28.09228515625,
"rewards/rejected": -17.336193084716797,
"step": 1920
},
{
"epoch": 1.15088713284628,
"grad_norm": 7.052684480868265e-08,
"learning_rate": 2.015529608594104e-05,
"logits/chosen": -60.80534744262695,
"logits/rejected": -63.97881317138672,
"logps/chosen": -4513.54541015625,
"logps/rejected": -4882.16796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.083196640014648,
"rewards/margins": 27.9643611907959,
"rewards/rejected": -18.88116455078125,
"step": 1930
},
{
"epoch": 1.1568510511405994,
"grad_norm": 0.0006892980891279876,
"learning_rate": 1.991873708053823e-05,
"logits/chosen": -59.05274200439453,
"logits/rejected": -59.57912063598633,
"logps/chosen": -4209.0419921875,
"logps/rejected": -4109.14306640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.459901809692383,
"rewards/margins": 23.901077270507812,
"rewards/rejected": -15.44117546081543,
"step": 1940
},
{
"epoch": 1.1628149694349188,
"grad_norm": 0.014071582816541195,
"learning_rate": 1.968265169830728e-05,
"logits/chosen": -58.680267333984375,
"logits/rejected": -62.526641845703125,
"logps/chosen": -4415.60400390625,
"logps/rejected": -4929.56689453125,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.520208358764648,
"rewards/margins": 27.615558624267578,
"rewards/rejected": -16.09535026550293,
"step": 1950
},
{
"epoch": 1.1687788877292382,
"grad_norm": 1.5595247315758343e-09,
"learning_rate": 1.9447061944704173e-05,
"logits/chosen": -57.07389450073242,
"logits/rejected": -61.6367301940918,
"logps/chosen": -3919.495361328125,
"logps/rejected": -3924.32177734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.67729949951172,
"rewards/margins": 28.186996459960938,
"rewards/rejected": -11.509698867797852,
"step": 1960
},
{
"epoch": 1.1747428060235574,
"grad_norm": 2.3406285265537008e-07,
"learning_rate": 1.9211989778987502e-05,
"logits/chosen": -55.24726486206055,
"logits/rejected": -58.94435501098633,
"logps/chosen": -3895.52197265625,
"logps/rejected": -3826.360107421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.88887596130371,
"rewards/margins": 26.167491912841797,
"rewards/rejected": -8.278615951538086,
"step": 1970
},
{
"epoch": 1.1807067243178768,
"grad_norm": 2.7980338046518227e-09,
"learning_rate": 1.897745711217161e-05,
"logits/chosen": -57.610687255859375,
"logits/rejected": -61.2164192199707,
"logps/chosen": -3854.04736328125,
"logps/rejected": -4032.45703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.306734085083008,
"rewards/margins": 25.79241943359375,
"rewards/rejected": -8.485687255859375,
"step": 1980
},
{
"epoch": 1.1866706426121962,
"grad_norm": 0.0017272484255954623,
"learning_rate": 1.8743485804984294e-05,
"logits/chosen": -56.56272506713867,
"logits/rejected": -60.19001388549805,
"logps/chosen": -3713.989501953125,
"logps/rejected": -3903.66943359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.19795036315918,
"rewards/margins": 26.44158935546875,
"rewards/rejected": -9.243639945983887,
"step": 1990
},
{
"epoch": 1.1926345609065157,
"grad_norm": 4.515972318319683e-10,
"learning_rate": 1.8510097665829177e-05,
"logits/chosen": -54.333839416503906,
"logits/rejected": -58.24871063232422,
"logps/chosen": -3881.78271484375,
"logps/rejected": -3991.203125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.033931732177734,
"rewards/margins": 26.011281967163086,
"rewards/rejected": -8.977351188659668,
"step": 2000
},
{
"epoch": 1.198598479200835,
"grad_norm": 1.3980934454593807e-07,
"learning_rate": 1.827731444875293e-05,
"logits/chosen": -56.24668502807617,
"logits/rejected": -59.58536911010742,
"logps/chosen": -3488.839111328125,
"logps/rejected": -3365.36474609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 17.400005340576172,
"rewards/margins": 26.981313705444336,
"rewards/rejected": -9.581306457519531,
"step": 2010
},
{
"epoch": 1.2045623974951543,
"grad_norm": 5.057323448909301e-09,
"learning_rate": 1.804515785141761e-05,
"logits/chosen": -53.02077102661133,
"logits/rejected": -55.9692497253418,
"logps/chosen": -3452.415283203125,
"logps/rejected": -3728.23046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.993364334106445,
"rewards/margins": 28.094369888305664,
"rewards/rejected": -12.101005554199219,
"step": 2020
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.0057086658198386e-07,
"learning_rate": 1.7813649513078206e-05,
"logits/chosen": -56.12092208862305,
"logits/rejected": -64.37012481689453,
"logps/chosen": -3950.99658203125,
"logps/rejected": -4666.666015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.969812393188477,
"rewards/margins": 27.363088607788086,
"rewards/rejected": -10.393278121948242,
"step": 2030
},
{
"epoch": 1.216490234083793,
"grad_norm": 6.621379355919998e-08,
"learning_rate": 1.758281101256567e-05,
"logits/chosen": -56.147132873535156,
"logits/rejected": -59.15153121948242,
"logps/chosen": -3801.32470703125,
"logps/rejected": -3779.278564453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.260011672973633,
"rewards/margins": 27.139633178710938,
"rewards/rejected": -10.879620552062988,
"step": 2040
},
{
"epoch": 1.2224541523781125,
"grad_norm": 2.4662213036208414e-06,
"learning_rate": 1.735266386627554e-05,
"logits/chosen": -53.46790313720703,
"logits/rejected": -58.09453201293945,
"logps/chosen": -3495.250732421875,
"logps/rejected": -3734.54541015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.44164752960205,
"rewards/margins": 24.200706481933594,
"rewards/rejected": -8.759058952331543,
"step": 2050
},
{
"epoch": 1.2284180706724317,
"grad_norm": 3.332436904202041e-07,
"learning_rate": 1.7123229526162394e-05,
"logits/chosen": -59.822052001953125,
"logits/rejected": -64.50569915771484,
"logps/chosen": -3889.31103515625,
"logps/rejected": -4200.05615234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.452617645263672,
"rewards/margins": 26.027156829833984,
"rewards/rejected": -9.574542999267578,
"step": 2060
},
{
"epoch": 1.234381988966751,
"grad_norm": 4.8211667547093384e-08,
"learning_rate": 1.6894529377740355e-05,
"logits/chosen": -55.5076789855957,
"logits/rejected": -59.86821365356445,
"logps/chosen": -3581.645751953125,
"logps/rejected": -3897.926513671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.29252052307129,
"rewards/margins": 26.74545669555664,
"rewards/rejected": -10.452935218811035,
"step": 2070
},
{
"epoch": 1.2403459072610705,
"grad_norm": 3.915023327749623e-08,
"learning_rate": 1.6666584738089735e-05,
"logits/chosen": -55.1793327331543,
"logits/rejected": -57.694480895996094,
"logps/chosen": -3753.420654296875,
"logps/rejected": -3864.955078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.927478790283203,
"rewards/margins": 26.437557220458984,
"rewards/rejected": -9.51008129119873,
"step": 2080
},
{
"epoch": 1.24630982555539,
"grad_norm": 6.761503641428135e-07,
"learning_rate": 1.6439416853870042e-05,
"logits/chosen": -56.9517936706543,
"logits/rejected": -61.928070068359375,
"logps/chosen": -3902.32421875,
"logps/rejected": -4099.13623046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.528411865234375,
"rewards/margins": 28.16168212890625,
"rewards/rejected": -11.633268356323242,
"step": 2090
},
{
"epoch": 1.2522737438497091,
"grad_norm": 2.90481926706887e-11,
"learning_rate": 1.621304689933967e-05,
"logits/chosen": -56.108123779296875,
"logits/rejected": -61.044708251953125,
"logps/chosen": -3557.475830078125,
"logps/rejected": -3533.578857421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.501747131347656,
"rewards/margins": 24.694103240966797,
"rewards/rejected": -11.19235610961914,
"step": 2100
},
{
"epoch": 1.2582376621440285,
"grad_norm": 2.608700588879742e-09,
"learning_rate": 1.5987495974382154e-05,
"logits/chosen": -56.22504806518555,
"logits/rejected": -59.918357849121094,
"logps/chosen": -3823.78515625,
"logps/rejected": -3600.735107421875,
"loss": 0.0216,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.420328140258789,
"rewards/margins": 26.53145980834961,
"rewards/rejected": -11.11113166809082,
"step": 2110
},
{
"epoch": 1.264201580438348,
"grad_norm": 3.650003321808981e-08,
"learning_rate": 1.5762785102539508e-05,
"logits/chosen": -56.17362594604492,
"logits/rejected": -59.19562530517578,
"logps/chosen": -3761.76708984375,
"logps/rejected": -3929.08740234375,
"loss": 4.0644,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 15.054628372192383,
"rewards/margins": 21.474042892456055,
"rewards/rejected": -6.419415473937988,
"step": 2120
},
{
"epoch": 1.2701654987326674,
"grad_norm": 9.014668123086267e-09,
"learning_rate": 1.5538935229052624e-05,
"logits/chosen": -55.22417068481445,
"logits/rejected": -62.16356658935547,
"logps/chosen": -3842.323486328125,
"logps/rejected": -4168.7177734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.2366361618042,
"rewards/margins": 33.672088623046875,
"rewards/rejected": -18.435455322265625,
"step": 2130
},
{
"epoch": 1.2761294170269868,
"grad_norm": 1.4135798309666825e-08,
"learning_rate": 1.531596721890897e-05,
"logits/chosen": -56.111854553222656,
"logits/rejected": -60.602996826171875,
"logps/chosen": -3569.96875,
"logps/rejected": -3690.09521484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.550596237182617,
"rewards/margins": 27.969738006591797,
"rewards/rejected": -16.419147491455078,
"step": 2140
},
{
"epoch": 1.282093335321306,
"grad_norm": 2.9603810617118143e-05,
"learning_rate": 1.5093901854897745e-05,
"logits/chosen": -58.67523193359375,
"logits/rejected": -60.438629150390625,
"logps/chosen": -4015.98095703125,
"logps/rejected": -4065.284423828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.875455856323242,
"rewards/margins": 27.658016204833984,
"rewards/rejected": -13.782560348510742,
"step": 2150
},
{
"epoch": 1.2880572536156254,
"grad_norm": 5.880232967214738e-10,
"learning_rate": 1.4872759835672755e-05,
"logits/chosen": -54.29423904418945,
"logits/rejected": -58.524017333984375,
"logps/chosen": -3648.724609375,
"logps/rejected": -3542.318359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.874812126159668,
"rewards/margins": 27.214290618896484,
"rewards/rejected": -14.3394775390625,
"step": 2160
},
{
"epoch": 1.2940211719099448,
"grad_norm": 6.31591660749109e-07,
"learning_rate": 1.4652561773823103e-05,
"logits/chosen": -56.8558464050293,
"logits/rejected": -62.6048698425293,
"logps/chosen": -3897.655517578125,
"logps/rejected": -4199.82958984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.988385200500488,
"rewards/margins": 29.572118759155273,
"rewards/rejected": -18.5837345123291,
"step": 2170
},
{
"epoch": 1.2999850902042642,
"grad_norm": 6.179973297548713e-07,
"learning_rate": 1.4433328193951837e-05,
"logits/chosen": -57.0740966796875,
"logits/rejected": -59.751129150390625,
"logps/chosen": -3662.604736328125,
"logps/rejected": -3552.6328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.184779167175293,
"rewards/margins": 25.6380558013916,
"rewards/rejected": -15.453274726867676,
"step": 2180
},
{
"epoch": 1.3059490084985836,
"grad_norm": 3.950074471958942e-07,
"learning_rate": 1.421507953076291e-05,
"logits/chosen": -59.95570755004883,
"logits/rejected": -64.49372863769531,
"logps/chosen": -3971.001953125,
"logps/rejected": -4431.60009765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.856526374816895,
"rewards/margins": 29.100147247314453,
"rewards/rejected": -17.243621826171875,
"step": 2190
},
{
"epoch": 1.3119129267929028,
"grad_norm": 1.1393160720407636e-10,
"learning_rate": 1.3997836127156457e-05,
"logits/chosen": -57.049774169921875,
"logits/rejected": -61.720191955566406,
"logps/chosen": -3714.94189453125,
"logps/rejected": -3634.033935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.39828872680664,
"rewards/margins": 26.935588836669922,
"rewards/rejected": -14.537300109863281,
"step": 2200
},
{
"epoch": 1.3178768450872222,
"grad_norm": 5.23483825731752e-11,
"learning_rate": 1.3781618232332633e-05,
"logits/chosen": -58.066749572753906,
"logits/rejected": -61.83543014526367,
"logps/chosen": -3958.8125,
"logps/rejected": -4085.514892578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.226404190063477,
"rewards/margins": 26.033559799194336,
"rewards/rejected": -14.807156562805176,
"step": 2210
},
{
"epoch": 1.3238407633815417,
"grad_norm": 7.795271272925675e-08,
"learning_rate": 1.3566445999904174e-05,
"logits/chosen": -55.3619270324707,
"logits/rejected": -61.64369583129883,
"logps/chosen": -3446.460205078125,
"logps/rejected": -3815.262939453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.02111530303955,
"rewards/margins": 31.109447479248047,
"rewards/rejected": -17.088333129882812,
"step": 2220
},
{
"epoch": 1.329804681675861,
"grad_norm": 5.124166690961829e-08,
"learning_rate": 1.3352339486017935e-05,
"logits/chosen": -56.43516159057617,
"logits/rejected": -60.59318161010742,
"logps/chosen": -3860.68896484375,
"logps/rejected": -4440.08740234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.916462898254395,
"rewards/margins": 30.730871200561523,
"rewards/rejected": -18.814407348632812,
"step": 2230
},
{
"epoch": 1.3357685999701805,
"grad_norm": 2.5762432187548256e-09,
"learning_rate": 1.3139318647485411e-05,
"logits/chosen": -59.685874938964844,
"logits/rejected": -63.754478454589844,
"logps/chosen": -4016.92333984375,
"logps/rejected": -4015.01025390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.248187065124512,
"rewards/margins": 28.233264923095703,
"rewards/rejected": -15.985074996948242,
"step": 2240
},
{
"epoch": 1.3417325182644997,
"grad_norm": 8.210008672904223e-06,
"learning_rate": 1.2927403339922556e-05,
"logits/chosen": -58.01857376098633,
"logits/rejected": -59.8955078125,
"logps/chosen": -3789.346923828125,
"logps/rejected": -4025.25927734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.103419303894043,
"rewards/margins": 29.54793357849121,
"rewards/rejected": -16.44451332092285,
"step": 2250
},
{
"epoch": 1.347696436558819,
"grad_norm": 1.7891031234285037e-07,
"learning_rate": 1.2716613315899112e-05,
"logits/chosen": -56.45894241333008,
"logits/rejected": -61.15498733520508,
"logps/chosen": -3575.19189453125,
"logps/rejected": -3521.14501953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.165433883666992,
"rewards/margins": 26.995624542236328,
"rewards/rejected": -13.830190658569336,
"step": 2260
},
{
"epoch": 1.3536603548531385,
"grad_norm": 1.800779045879608e-06,
"learning_rate": 1.2506968223097431e-05,
"logits/chosen": -55.29785919189453,
"logits/rejected": -58.99498748779297,
"logps/chosen": -3407.56884765625,
"logps/rejected": -3411.91064453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.742193222045898,
"rewards/margins": 25.165752410888672,
"rewards/rejected": -12.423562049865723,
"step": 2270
},
{
"epoch": 1.359624273147458,
"grad_norm": 2.989838321809657e-06,
"learning_rate": 1.229848760248112e-05,
"logits/chosen": -57.45177459716797,
"logits/rejected": -58.72307205200195,
"logps/chosen": -3568.59765625,
"logps/rejected": -3394.478515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.099716186523438,
"rewards/margins": 27.246999740600586,
"rewards/rejected": -13.147282600402832,
"step": 2280
},
{
"epoch": 1.3655881914417773,
"grad_norm": 5.450796152217663e-07,
"learning_rate": 1.2091190886473644e-05,
"logits/chosen": -56.60668182373047,
"logits/rejected": -61.323768615722656,
"logps/chosen": -3702.658203125,
"logps/rejected": -3887.23193359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.3460693359375,
"rewards/margins": 27.915029525756836,
"rewards/rejected": -14.568957328796387,
"step": 2290
},
{
"epoch": 1.3715521097360965,
"grad_norm": 6.116428608038404e-08,
"learning_rate": 1.1885097397147063e-05,
"logits/chosen": -58.4415168762207,
"logits/rejected": -63.7178840637207,
"logps/chosen": -3674.389892578125,
"logps/rejected": -3665.013671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.25750732421875,
"rewards/margins": 26.048290252685547,
"rewards/rejected": -13.79078197479248,
"step": 2300
},
{
"epoch": 1.377516028030416,
"grad_norm": 2.8665971285590786e-07,
"learning_rate": 1.1680226344420942e-05,
"logits/chosen": -58.254615783691406,
"logits/rejected": -62.49680709838867,
"logps/chosen": -3836.04052734375,
"logps/rejected": -4047.54150390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.538701057434082,
"rewards/margins": 26.963558197021484,
"rewards/rejected": -15.424860954284668,
"step": 2310
},
{
"epoch": 1.3834799463247354,
"grad_norm": 1.2907316886412445e-06,
"learning_rate": 1.147659682427189e-05,
"logits/chosen": -58.32746124267578,
"logits/rejected": -62.26505661010742,
"logps/chosen": -4025.608642578125,
"logps/rejected": -4167.8388671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.876118659973145,
"rewards/margins": 28.321285247802734,
"rewards/rejected": -16.44516944885254,
"step": 2320
},
{
"epoch": 1.3894438646190548,
"grad_norm": 2.1081565648728429e-07,
"learning_rate": 1.1274227816953584e-05,
"logits/chosen": -54.2985954284668,
"logits/rejected": -59.446044921875,
"logps/chosen": -3246.630859375,
"logps/rejected": -3621.38818359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.192728996276855,
"rewards/margins": 27.980316162109375,
"rewards/rejected": -14.78758716583252,
"step": 2330
},
{
"epoch": 1.3954077829133742,
"grad_norm": 4.903237282860573e-08,
"learning_rate": 1.1073138185227638e-05,
"logits/chosen": -58.693580627441406,
"logits/rejected": -61.3970832824707,
"logps/chosen": -3988.88671875,
"logps/rejected": -3924.40625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.711004257202148,
"rewards/margins": 29.38458251953125,
"rewards/rejected": -14.673579216003418,
"step": 2340
},
{
"epoch": 1.4013717012076934,
"grad_norm": 2.6177708605246153e-06,
"learning_rate": 1.0873346672605394e-05,
"logits/chosen": -54.318634033203125,
"logits/rejected": -58.23624801635742,
"logps/chosen": -3739.504638671875,
"logps/rejected": -4247.2490234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.0569429397583,
"rewards/margins": 29.031036376953125,
"rewards/rejected": -15.974093437194824,
"step": 2350
},
{
"epoch": 1.4073356195020128,
"grad_norm": 1.0652929631760344e-06,
"learning_rate": 1.0674871901600886e-05,
"logits/chosen": -57.78883743286133,
"logits/rejected": -61.051246643066406,
"logps/chosen": -4084.44091796875,
"logps/rejected": -4172.6962890625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.95031452178955,
"rewards/margins": 26.569732666015625,
"rewards/rejected": -13.619417190551758,
"step": 2360
},
{
"epoch": 1.4132995377963322,
"grad_norm": 9.611061102532403e-08,
"learning_rate": 1.047773237199497e-05,
"logits/chosen": -57.65983963012695,
"logits/rejected": -60.987876892089844,
"logps/chosen": -3898.65087890625,
"logps/rejected": -4089.94287109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.300247192382812,
"rewards/margins": 28.2935733795166,
"rewards/rejected": -15.993327140808105,
"step": 2370
},
{
"epoch": 1.4192634560906516,
"grad_norm": 1.8469700080459006e-05,
"learning_rate": 1.0281946459111022e-05,
"logits/chosen": -55.288063049316406,
"logits/rejected": -59.08159255981445,
"logps/chosen": -3633.615966796875,
"logps/rejected": -3830.56982421875,
"loss": 0.3076,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 12.845723152160645,
"rewards/margins": 25.96260643005371,
"rewards/rejected": -13.11688232421875,
"step": 2380
},
{
"epoch": 1.425227374384971,
"grad_norm": 6.146466446921295e-09,
"learning_rate": 1.0087532412102171e-05,
"logits/chosen": -54.926902770996094,
"logits/rejected": -59.40453338623047,
"logps/chosen": -3904.94677734375,
"logps/rejected": -4362.1982421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.936139106750488,
"rewards/margins": 26.841259002685547,
"rewards/rejected": -11.905121803283691,
"step": 2390
},
{
"epoch": 1.4311912926792902,
"grad_norm": 2.4089372345770244e-06,
"learning_rate": 9.894508352250281e-06,
"logits/chosen": -56.74151611328125,
"logits/rejected": -59.63338088989258,
"logps/chosen": -3725.58642578125,
"logps/rejected": -3677.700439453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.33104133605957,
"rewards/margins": 25.67470932006836,
"rewards/rejected": -13.343668937683105,
"step": 2400
},
{
"epoch": 1.4371552109736097,
"grad_norm": 5.329222219074836e-08,
"learning_rate": 9.702892271276882e-06,
"logits/chosen": -54.703086853027344,
"logits/rejected": -58.85982131958008,
"logps/chosen": -3626.12158203125,
"logps/rejected": -3548.104736328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.935752868652344,
"rewards/margins": 26.5059871673584,
"rewards/rejected": -13.570233345031738,
"step": 2410
},
{
"epoch": 1.443119129267929,
"grad_norm": 2.663332221430892e-09,
"learning_rate": 9.512702029666165e-06,
"logits/chosen": -58.17033767700195,
"logits/rejected": -60.57352828979492,
"logps/chosen": -4204.1767578125,
"logps/rejected": -4053.822998046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.819692611694336,
"rewards/margins": 27.93819808959961,
"rewards/rejected": -15.118505477905273,
"step": 2420
},
{
"epoch": 1.4490830475622483,
"grad_norm": 3.948618143567728e-08,
"learning_rate": 9.323955355000213e-06,
"logits/chosen": -57.46662521362305,
"logits/rejected": -58.235816955566406,
"logps/chosen": -3560.56298828125,
"logps/rejected": -3385.750732421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 16.08102035522461,
"rewards/margins": 27.879852294921875,
"rewards/rejected": -11.798830032348633,
"step": 2430
},
{
"epoch": 1.455046965856568,
"grad_norm": 5.0093671433160125e-08,
"learning_rate": 9.136669840306617e-06,
"logits/chosen": -54.47039794921875,
"logits/rejected": -57.519554138183594,
"logps/chosen": -3386.38916015625,
"logps/rejected": -3258.2373046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.689840316772461,
"rewards/margins": 28.17424964904785,
"rewards/rejected": -12.484407424926758,
"step": 2440
},
{
"epoch": 1.461010884150887,
"grad_norm": 4.318402080372152e-09,
"learning_rate": 8.950862942418634e-06,
"logits/chosen": -56.27559280395508,
"logits/rejected": -61.70929718017578,
"logps/chosen": -3825.42724609375,
"logps/rejected": -3962.33349609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.570101737976074,
"rewards/margins": 29.67506980895996,
"rewards/rejected": -15.10496711730957,
"step": 2450
},
{
"epoch": 1.4669748024452065,
"grad_norm": 2.655309344845591e-06,
"learning_rate": 8.766551980348035e-06,
"logits/chosen": -56.446922302246094,
"logits/rejected": -59.82023239135742,
"logps/chosen": -3807.442138671875,
"logps/rejected": -4322.15087890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.840948104858398,
"rewards/margins": 27.626562118530273,
"rewards/rejected": -11.785615921020508,
"step": 2460
},
{
"epoch": 1.472938720739526,
"grad_norm": 9.323419902784735e-08,
"learning_rate": 8.583754133670813e-06,
"logits/chosen": -54.80516815185547,
"logits/rejected": -58.1927490234375,
"logps/chosen": -3539.16064453125,
"logps/rejected": -3787.634033203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.8378267288208,
"rewards/margins": 27.45003890991211,
"rewards/rejected": -11.612213134765625,
"step": 2470
},
{
"epoch": 1.4789026390338451,
"grad_norm": 2.1703629045077832e-06,
"learning_rate": 8.402486440925875e-06,
"logits/chosen": -55.11442184448242,
"logits/rejected": -61.09314727783203,
"logps/chosen": -3670.315185546875,
"logps/rejected": -4020.887451171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.911749839782715,
"rewards/margins": 28.087337493896484,
"rewards/rejected": -16.175586700439453,
"step": 2480
},
{
"epoch": 1.4848665573281645,
"grad_norm": 2.716829214932659e-07,
"learning_rate": 8.222765798026888e-06,
"logits/chosen": -56.46864700317383,
"logits/rejected": -61.81829833984375,
"logps/chosen": -3806.19580078125,
"logps/rejected": -3518.6796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.517992973327637,
"rewards/margins": 27.79050636291504,
"rewards/rejected": -14.272509574890137,
"step": 2490
},
{
"epoch": 1.490830475622484,
"grad_norm": 1.8604217189022165e-07,
"learning_rate": 8.044608956687411e-06,
"logits/chosen": -58.91337966918945,
"logits/rejected": -61.38121795654297,
"logps/chosen": -4016.22509765625,
"logps/rejected": -3790.36279296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.345911979675293,
"rewards/margins": 27.247661590576172,
"rewards/rejected": -12.901748657226562,
"step": 2500
},
{
"epoch": 1.4967943939168034,
"grad_norm": 5.806775948968834e-09,
"learning_rate": 7.868032522859466e-06,
"logits/chosen": -54.46747589111328,
"logits/rejected": -58.573265075683594,
"logps/chosen": -3881.350341796875,
"logps/rejected": -4542.3095703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.201573371887207,
"rewards/margins": 29.226207733154297,
"rewards/rejected": -15.024637222290039,
"step": 2510
},
{
"epoch": 1.5027583122111228,
"grad_norm": 4.5620687671998894e-08,
"learning_rate": 7.69305295518572e-06,
"logits/chosen": -59.017433166503906,
"logits/rejected": -62.553138732910156,
"logps/chosen": -3937.80517578125,
"logps/rejected": -4334.24267578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.289695739746094,
"rewards/margins": 28.369304656982422,
"rewards/rejected": -18.079607009887695,
"step": 2520
},
{
"epoch": 1.508722230505442,
"grad_norm": 1.7904097648901995e-10,
"learning_rate": 7.5196865634653614e-06,
"logits/chosen": -56.08478546142578,
"logits/rejected": -60.103553771972656,
"logps/chosen": -3620.850341796875,
"logps/rejected": -3999.59521484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.94933795928955,
"rewards/margins": 27.5977840423584,
"rewards/rejected": -14.648447036743164,
"step": 2530
},
{
"epoch": 1.5146861487997616,
"grad_norm": 4.987779789189517e-07,
"learning_rate": 7.347949507133881e-06,
"logits/chosen": -57.28242874145508,
"logits/rejected": -61.029945373535156,
"logps/chosen": -3839.3828125,
"logps/rejected": -3679.442626953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.256357192993164,
"rewards/margins": 25.953781127929688,
"rewards/rejected": -11.697421073913574,
"step": 2540
},
{
"epoch": 1.5206500670940808,
"grad_norm": 1.5572302336508415e-09,
"learning_rate": 7.17785779375684e-06,
"logits/chosen": -57.10649871826172,
"logits/rejected": -61.82776641845703,
"logps/chosen": -3794.711669921875,
"logps/rejected": -3995.13916015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.982124328613281,
"rewards/margins": 29.219127655029297,
"rewards/rejected": -16.237003326416016,
"step": 2550
},
{
"epoch": 1.5266139853884002,
"grad_norm": 0.0003727490548044443,
"learning_rate": 7.009427277537828e-06,
"logits/chosen": -57.260643005371094,
"logits/rejected": -61.28925704956055,
"logps/chosen": -3952.393798828125,
"logps/rejected": -4409.12548828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.187792778015137,
"rewards/margins": 28.366008758544922,
"rewards/rejected": -15.178217887878418,
"step": 2560
},
{
"epoch": 1.5325779036827196,
"grad_norm": 3.760019032239548e-11,
"learning_rate": 6.842673657840684e-06,
"logits/chosen": -58.93050003051758,
"logits/rejected": -61.820579528808594,
"logps/chosen": -4122.65771484375,
"logps/rejected": -4377.88037109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.47004222869873,
"rewards/margins": 28.11465072631836,
"rewards/rejected": -14.644607543945312,
"step": 2570
},
{
"epoch": 1.5385418219770388,
"grad_norm": 1.33146016878527e-07,
"learning_rate": 6.6776124777261585e-06,
"logits/chosen": -58.728904724121094,
"logits/rejected": -62.30875778198242,
"logps/chosen": -3577.420654296875,
"logps/rejected": -3622.842529296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.283228874206543,
"rewards/margins": 28.720935821533203,
"rewards/rejected": -13.437705993652344,
"step": 2580
},
{
"epoch": 1.5445057402713582,
"grad_norm": 1.0433809372045744e-09,
"learning_rate": 6.514259122503169e-06,
"logits/chosen": -53.648033142089844,
"logits/rejected": -58.789031982421875,
"logps/chosen": -3476.70947265625,
"logps/rejected": -3617.780517578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.440394401550293,
"rewards/margins": 26.173446655273438,
"rewards/rejected": -12.733052253723145,
"step": 2590
},
{
"epoch": 1.5504696585656776,
"grad_norm": 8.736037448997536e-10,
"learning_rate": 6.35262881829472e-06,
"logits/chosen": -59.83369827270508,
"logits/rejected": -60.867530822753906,
"logps/chosen": -3929.28173828125,
"logps/rejected": -4287.845703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.677624702453613,
"rewards/margins": 29.104511260986328,
"rewards/rejected": -15.426889419555664,
"step": 2600
},
{
"epoch": 1.556433576859997,
"grad_norm": 1.1132956956316775e-07,
"learning_rate": 6.1927366306186865e-06,
"logits/chosen": -58.01744842529297,
"logits/rejected": -62.40656661987305,
"logps/chosen": -3270.4072265625,
"logps/rejected": -2893.09130859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.387075424194336,
"rewards/margins": 23.64480972290039,
"rewards/rejected": -10.257735252380371,
"step": 2610
},
{
"epoch": 1.5623974951543165,
"grad_norm": 2.7856225415234803e-07,
"learning_rate": 6.034597462983563e-06,
"logits/chosen": -58.883827209472656,
"logits/rejected": -64.26622009277344,
"logps/chosen": -3842.31640625,
"logps/rejected": -4679.39208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.385858535766602,
"rewards/margins": 28.50748062133789,
"rewards/rejected": -14.121622085571289,
"step": 2620
},
{
"epoch": 1.5683614134486357,
"grad_norm": 1.1248067011138119e-07,
"learning_rate": 5.878226055499308e-06,
"logits/chosen": -60.66514205932617,
"logits/rejected": -65.04862976074219,
"logps/chosen": -3726.667236328125,
"logps/rejected": -4026.20068359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.686662673950195,
"rewards/margins": 27.384693145751953,
"rewards/rejected": -13.698030471801758,
"step": 2630
},
{
"epoch": 1.574325331742955,
"grad_norm": 5.872105361959257e-07,
"learning_rate": 5.72363698350343e-06,
"logits/chosen": -57.18046951293945,
"logits/rejected": -62.109901428222656,
"logps/chosen": -4155.00439453125,
"logps/rejected": -4609.71044921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.978708267211914,
"rewards/margins": 30.07086181640625,
"rewards/rejected": -15.092155456542969,
"step": 2640
},
{
"epoch": 1.5802892500372745,
"grad_norm": 1.7974322474856308e-07,
"learning_rate": 5.570844656202415e-06,
"logits/chosen": -56.97467803955078,
"logits/rejected": -60.40959930419922,
"logps/chosen": -3629.501220703125,
"logps/rejected": -4034.34765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.481059074401855,
"rewards/margins": 26.78216552734375,
"rewards/rejected": -12.301109313964844,
"step": 2650
},
{
"epoch": 1.586253168331594,
"grad_norm": 6.639849781109319e-11,
"learning_rate": 5.419863315328644e-06,
"logits/chosen": -60.214744567871094,
"logits/rejected": -63.88579559326172,
"logps/chosen": -3973.41015625,
"logps/rejected": -4298.8505859375,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.445764541625977,
"rewards/margins": 27.013805389404297,
"rewards/rejected": -12.56804370880127,
"step": 2660
},
{
"epoch": 1.5922170866259133,
"grad_norm": 1.2835856821880043e-09,
"learning_rate": 5.270707033812952e-06,
"logits/chosen": -56.302101135253906,
"logits/rejected": -61.73807907104492,
"logps/chosen": -3850.40234375,
"logps/rejected": -3881.409423828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.187263488769531,
"rewards/margins": 29.23580551147461,
"rewards/rejected": -14.048541069030762,
"step": 2670
},
{
"epoch": 1.5981810049202325,
"grad_norm": 1.238020241878246e-09,
"learning_rate": 5.12338971447284e-06,
"logits/chosen": -56.36561965942383,
"logits/rejected": -60.4167594909668,
"logps/chosen": -3432.209716796875,
"logps/rejected": -3635.707763671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.321012496948242,
"rewards/margins": 28.88069725036621,
"rewards/rejected": -14.559684753417969,
"step": 2680
},
{
"epoch": 1.604144923214552,
"grad_norm": 5.2999196142877736e-09,
"learning_rate": 4.977925088716673e-06,
"logits/chosen": -56.03685760498047,
"logits/rejected": -58.90336227416992,
"logps/chosen": -3657.60498046875,
"logps/rejected": -3899.95751953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.800675392150879,
"rewards/margins": 26.530603408813477,
"rewards/rejected": -15.729925155639648,
"step": 2690
},
{
"epoch": 1.6101088415088713,
"grad_norm": 3.040783553773241e-12,
"learning_rate": 4.834326715263709e-06,
"logits/chosen": -57.493499755859375,
"logits/rejected": -61.91980743408203,
"logps/chosen": -3821.324951171875,
"logps/rejected": -3955.79296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.035219192504883,
"rewards/margins": 31.038372039794922,
"rewards/rejected": -16.003154754638672,
"step": 2700
},
{
"epoch": 1.6160727598031905,
"grad_norm": 5.735748942470309e-08,
"learning_rate": 4.692607978880334e-06,
"logits/chosen": -55.83156204223633,
"logits/rejected": -59.69164276123047,
"logps/chosen": -3918.27734375,
"logps/rejected": -3958.736328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.39517879486084,
"rewards/margins": 28.416271209716797,
"rewards/rejected": -16.02109146118164,
"step": 2710
},
{
"epoch": 1.6220366780975102,
"grad_norm": 1.513937263553089e-06,
"learning_rate": 4.552782089132457e-06,
"logits/chosen": -53.48059844970703,
"logits/rejected": -57.53205490112305,
"logps/chosen": -3965.768310546875,
"logps/rejected": -4178.44384765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.595451354980469,
"rewards/margins": 27.8395938873291,
"rewards/rejected": -14.24414348602295,
"step": 2720
},
{
"epoch": 1.6280005963918294,
"grad_norm": 1.2683202839980368e-05,
"learning_rate": 4.414862079154258e-06,
"logits/chosen": -57.488067626953125,
"logits/rejected": -62.132568359375,
"logps/chosen": -3671.482421875,
"logps/rejected": -3794.837890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.1273832321167,
"rewards/margins": 28.243274688720703,
"rewards/rejected": -16.115894317626953,
"step": 2730
},
{
"epoch": 1.6339645146861488,
"grad_norm": 4.4300445978251446e-08,
"learning_rate": 4.278860804433346e-06,
"logits/chosen": -55.68921661376953,
"logits/rejected": -59.8105583190918,
"logps/chosen": -4055.94580078125,
"logps/rejected": -4559.392578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.716020584106445,
"rewards/margins": 35.52238845825195,
"rewards/rejected": -21.806364059448242,
"step": 2740
},
{
"epoch": 1.6399284329804682,
"grad_norm": 2.3574155960659482e-08,
"learning_rate": 4.144790941612561e-06,
"logits/chosen": -57.612083435058594,
"logits/rejected": -60.485084533691406,
"logps/chosen": -3919.98046875,
"logps/rejected": -3707.94677734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.938611030578613,
"rewards/margins": 27.882177352905273,
"rewards/rejected": -13.943568229675293,
"step": 2750
},
{
"epoch": 1.6458923512747874,
"grad_norm": 1.973355574591551e-05,
"learning_rate": 4.012664987308326e-06,
"logits/chosen": -55.63753128051758,
"logits/rejected": -59.27739715576172,
"logps/chosen": -3538.80078125,
"logps/rejected": -3376.485595703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.036745071411133,
"rewards/margins": 26.072357177734375,
"rewards/rejected": -13.035612106323242,
"step": 2760
},
{
"epoch": 1.651856269569107,
"grad_norm": 2.2980448655118835e-09,
"learning_rate": 3.8824952569458675e-06,
"logits/chosen": -56.71158981323242,
"logits/rejected": -60.6939811706543,
"logps/chosen": -3697.500732421875,
"logps/rejected": -4019.428466796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.426570892333984,
"rewards/margins": 28.051671981811523,
"rewards/rejected": -15.625103950500488,
"step": 2770
},
{
"epoch": 1.6578201878634262,
"grad_norm": 2.817006627964247e-10,
"learning_rate": 3.754293883611307e-06,
"logits/chosen": -56.885704040527344,
"logits/rejected": -60.8448371887207,
"logps/chosen": -3909.04736328125,
"logps/rejected": -4620.33935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.772366523742676,
"rewards/margins": 29.079452514648438,
"rewards/rejected": -16.307085037231445,
"step": 2780
},
{
"epoch": 1.6637841061577456,
"grad_norm": 1.7094137216844274e-09,
"learning_rate": 3.628072816920722e-06,
"logits/chosen": -55.26677703857422,
"logits/rejected": -60.0572395324707,
"logps/chosen": -3623.676513671875,
"logps/rejected": -3435.266357421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.957403182983398,
"rewards/margins": 28.258371353149414,
"rewards/rejected": -14.300966262817383,
"step": 2790
},
{
"epoch": 1.669748024452065,
"grad_norm": 1.561394569193908e-09,
"learning_rate": 3.5038438219063247e-06,
"logits/chosen": -58.582733154296875,
"logits/rejected": -61.522300720214844,
"logps/chosen": -3925.781982421875,
"logps/rejected": -4272.77783203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.12500286102295,
"rewards/margins": 27.7889404296875,
"rewards/rejected": -14.663938522338867,
"step": 2800
},
{
"epoch": 1.6757119427463842,
"grad_norm": 3.270694651291706e-05,
"learning_rate": 3.3816184779198566e-06,
"logits/chosen": -57.79584884643555,
"logits/rejected": -61.09257888793945,
"logps/chosen": -3688.829345703125,
"logps/rejected": -3789.216064453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.383466720581055,
"rewards/margins": 25.65230369567871,
"rewards/rejected": -13.268835067749023,
"step": 2810
},
{
"epoch": 1.6816758610407039,
"grad_norm": 0.0009348949533887208,
"learning_rate": 3.2614081775532935e-06,
"logits/chosen": -57.633819580078125,
"logits/rejected": -61.18571090698242,
"logps/chosen": -3827.751220703125,
"logps/rejected": -4190.44873046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.405567169189453,
"rewards/margins": 26.647628784179688,
"rewards/rejected": -15.24206256866455,
"step": 2820
},
{
"epoch": 1.687639779335023,
"grad_norm": 2.3347972728515742e-06,
"learning_rate": 3.143224125576913e-06,
"logits/chosen": -59.698570251464844,
"logits/rejected": -63.5836067199707,
"logps/chosen": -4135.7724609375,
"logps/rejected": -4294.7646484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.506874084472656,
"rewards/margins": 26.867511749267578,
"rewards/rejected": -16.360633850097656,
"step": 2830
},
{
"epoch": 1.6936036976293425,
"grad_norm": 3.288195557615836e-06,
"learning_rate": 3.0270773378949153e-06,
"logits/chosen": -57.68184280395508,
"logits/rejected": -60.27643966674805,
"logps/chosen": -3976.610595703125,
"logps/rejected": -4323.2333984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.625307083129883,
"rewards/margins": 30.234777450561523,
"rewards/rejected": -16.60947036743164,
"step": 2840
},
{
"epoch": 1.699567615923662,
"grad_norm": 1.8086376840642515e-10,
"learning_rate": 2.9129786405186517e-06,
"logits/chosen": -58.33183670043945,
"logits/rejected": -61.32403564453125,
"logps/chosen": -3671.446533203125,
"logps/rejected": -3879.734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.45142936706543,
"rewards/margins": 27.773706436157227,
"rewards/rejected": -14.32227611541748,
"step": 2850
},
{
"epoch": 1.705531534217981,
"grad_norm": 5.313737716505784e-08,
"learning_rate": 2.8009386685574873e-06,
"logits/chosen": -58.4184684753418,
"logits/rejected": -64.02459716796875,
"logps/chosen": -3511.401611328125,
"logps/rejected": -3506.68798828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.209698677062988,
"rewards/margins": 27.310054779052734,
"rewards/rejected": -15.10035514831543,
"step": 2860
},
{
"epoch": 1.7114954525123007,
"grad_norm": 2.9329979156500485e-07,
"learning_rate": 2.6909678652275617e-06,
"logits/chosen": -54.92981719970703,
"logits/rejected": -63.78539276123047,
"logps/chosen": -3624.64453125,
"logps/rejected": -4150.95263671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.3597993850708,
"rewards/margins": 32.095314025878906,
"rewards/rejected": -18.735517501831055,
"step": 2870
},
{
"epoch": 1.71745937080662,
"grad_norm": 2.407121968417414e-08,
"learning_rate": 2.583076480878352e-06,
"logits/chosen": -55.66535568237305,
"logits/rejected": -59.855445861816406,
"logps/chosen": -3529.068359375,
"logps/rejected": -3593.259765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.202707290649414,
"rewards/margins": 27.2900333404541,
"rewards/rejected": -13.08732795715332,
"step": 2880
},
{
"epoch": 1.7234232891009393,
"grad_norm": 5.434892091926713e-09,
"learning_rate": 2.477274572037236e-06,
"logits/chosen": -59.2020263671875,
"logits/rejected": -62.6593132019043,
"logps/chosen": -4125.734375,
"logps/rejected": -4534.7890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.742546081542969,
"rewards/margins": 28.544530868530273,
"rewards/rejected": -16.801984786987305,
"step": 2890
},
{
"epoch": 1.7293872073952588,
"grad_norm": 1.486068906819682e-11,
"learning_rate": 2.3735720004721325e-06,
"logits/chosen": -54.690345764160156,
"logits/rejected": -58.92433547973633,
"logps/chosen": -3601.94091796875,
"logps/rejected": -3842.4375,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.382122039794922,
"rewards/margins": 26.631744384765625,
"rewards/rejected": -14.249621391296387,
"step": 2900
},
{
"epoch": 1.735351125689578,
"grad_norm": 1.4909307122223936e-10,
"learning_rate": 2.2719784322722954e-06,
"logits/chosen": -58.35798263549805,
"logits/rejected": -61.391815185546875,
"logps/chosen": -3863.544189453125,
"logps/rejected": -3775.06787109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.62255859375,
"rewards/margins": 26.525253295898438,
"rewards/rejected": -13.902692794799805,
"step": 2910
},
{
"epoch": 1.7413150439838976,
"grad_norm": 1.7777512766770087e-05,
"learning_rate": 2.172503336947318e-06,
"logits/chosen": -58.63573455810547,
"logits/rejected": -61.71209716796875,
"logps/chosen": -3985.02197265625,
"logps/rejected": -4420.2685546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.010174751281738,
"rewards/margins": 28.709774017333984,
"rewards/rejected": -14.699602127075195,
"step": 2920
},
{
"epoch": 1.7472789622782168,
"grad_norm": 3.4102259860446793e-07,
"learning_rate": 2.0751559865445137e-06,
"logits/chosen": -55.8213005065918,
"logits/rejected": -58.88740921020508,
"logps/chosen": -3726.633544921875,
"logps/rejected": -3584.02880859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.5889310836792,
"rewards/margins": 27.472326278686523,
"rewards/rejected": -12.883394241333008,
"step": 2930
},
{
"epoch": 1.7532428805725362,
"grad_norm": 3.0110019366702545e-08,
"learning_rate": 1.9799454547846403e-06,
"logits/chosen": -58.080291748046875,
"logits/rejected": -65.03736114501953,
"logps/chosen": -3850.731689453125,
"logps/rejected": -3925.19140625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.993995666503906,
"rewards/margins": 30.477624893188477,
"rewards/rejected": -19.483631134033203,
"step": 2940
},
{
"epoch": 1.7592067988668556,
"grad_norm": 8.025313746884422e-11,
"learning_rate": 1.8868806162161745e-06,
"logits/chosen": -55.89226150512695,
"logits/rejected": -59.2379150390625,
"logps/chosen": -3627.599609375,
"logps/rejected": -3741.21728515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.812711715698242,
"rewards/margins": 27.617528915405273,
"rewards/rejected": -13.804819107055664,
"step": 2950
},
{
"epoch": 1.7651707171611748,
"grad_norm": 9.478714702026991e-08,
"learning_rate": 1.7959701453880845e-06,
"logits/chosen": -53.85209274291992,
"logits/rejected": -58.788795471191406,
"logps/chosen": -3635.917236328125,
"logps/rejected": -3698.177734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.880119323730469,
"rewards/margins": 27.0487117767334,
"rewards/rejected": -13.16859245300293,
"step": 2960
},
{
"epoch": 1.7711346354554942,
"grad_norm": 1.0980542128891102e-06,
"learning_rate": 1.7072225160412987e-06,
"logits/chosen": -55.5919189453125,
"logits/rejected": -59.72821044921875,
"logps/chosen": -3429.30859375,
"logps/rejected": -3813.98291015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.354362487792969,
"rewards/margins": 26.6251277923584,
"rewards/rejected": -13.270769119262695,
"step": 2970
},
{
"epoch": 1.7770985537498136,
"grad_norm": 8.783963556524554e-10,
"learning_rate": 1.6206460003188484e-06,
"logits/chosen": -60.265892028808594,
"logits/rejected": -63.906089782714844,
"logps/chosen": -4072.217529296875,
"logps/rejected": -4386.146484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.840473175048828,
"rewards/margins": 27.794742584228516,
"rewards/rejected": -15.954269409179688,
"step": 2980
},
{
"epoch": 1.783062472044133,
"grad_norm": 2.523800057829817e-09,
"learning_rate": 1.536248667994855e-06,
"logits/chosen": -58.744789123535156,
"logits/rejected": -61.5314826965332,
"logps/chosen": -3976.684814453125,
"logps/rejected": -4438.0166015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.870981216430664,
"rewards/margins": 27.51932716369629,
"rewards/rejected": -14.648345947265625,
"step": 2990
},
{
"epoch": 1.7890263903384525,
"grad_norm": 5.33410729985917e-06,
"learning_rate": 1.4540383857223255e-06,
"logits/chosen": -56.88322830200195,
"logits/rejected": -61.4012565612793,
"logps/chosen": -3691.760498046875,
"logps/rejected": -3509.0,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.339391708374023,
"rewards/margins": 26.1422119140625,
"rewards/rejected": -11.802818298339844,
"step": 3000
},
{
"epoch": 1.7949903086327716,
"grad_norm": 0.003620662959292531,
"learning_rate": 1.3740228162999164e-06,
"logits/chosen": -55.299964904785156,
"logits/rejected": -60.9326171875,
"logps/chosen": -3670.139892578125,
"logps/rejected": -4088.413330078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.467697143554688,
"rewards/margins": 26.333303451538086,
"rewards/rejected": -13.865605354309082,
"step": 3010
},
{
"epoch": 1.800954226927091,
"grad_norm": 1.163293128492171e-09,
"learning_rate": 1.2962094179576723e-06,
"logits/chosen": -58.49831008911133,
"logits/rejected": -64.70086669921875,
"logps/chosen": -3779.00732421875,
"logps/rejected": -4333.802734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.059266090393066,
"rewards/margins": 27.809616088867188,
"rewards/rejected": -17.750350952148438,
"step": 3020
},
{
"epoch": 1.8069181452214105,
"grad_norm": 1.3129453968474536e-08,
"learning_rate": 1.2206054436618624e-06,
"logits/chosen": -54.48987579345703,
"logits/rejected": -60.60657501220703,
"logps/chosen": -3450.112060546875,
"logps/rejected": -3492.62353515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.539463996887207,
"rewards/margins": 27.055255889892578,
"rewards/rejected": -13.515789985656738,
"step": 3030
},
{
"epoch": 1.8128820635157297,
"grad_norm": 1.3139845123077976e-06,
"learning_rate": 1.1472179404389133e-06,
"logits/chosen": -57.91154098510742,
"logits/rejected": -60.720306396484375,
"logps/chosen": -3706.28857421875,
"logps/rejected": -3608.96484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.64875316619873,
"rewards/margins": 25.565021514892578,
"rewards/rejected": -12.916269302368164,
"step": 3040
},
{
"epoch": 1.8188459818100493,
"grad_norm": 1.7036876798215417e-08,
"learning_rate": 1.0760537487185807e-06,
"logits/chosen": -55.9576301574707,
"logits/rejected": -60.521888732910156,
"logps/chosen": -3810.43359375,
"logps/rejected": -3926.3671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.514657974243164,
"rewards/margins": 28.64280128479004,
"rewards/rejected": -15.128148078918457,
"step": 3050
},
{
"epoch": 1.8248099001043685,
"grad_norm": 4.0257355067296885e-06,
"learning_rate": 1.0071195016963381e-06,
"logits/chosen": -62.52231979370117,
"logits/rejected": -66.21368408203125,
"logps/chosen": -4432.63916015625,
"logps/rejected": -4683.30517578125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.339556694030762,
"rewards/margins": 29.099838256835938,
"rewards/rejected": -17.76028060913086,
"step": 3060
},
{
"epoch": 1.830773818398688,
"grad_norm": 6.330567181578317e-09,
"learning_rate": 9.404216247151043e-07,
"logits/chosen": -52.628440856933594,
"logits/rejected": -57.48138427734375,
"logps/chosen": -3616.740966796875,
"logps/rejected": -3667.90234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.859712600708008,
"rewards/margins": 28.531396865844727,
"rewards/rejected": -15.671684265136719,
"step": 3070
},
{
"epoch": 1.8367377366930073,
"grad_norm": 7.075045260762636e-08,
"learning_rate": 8.75966334666345e-07,
"logits/chosen": -56.22023391723633,
"logits/rejected": -59.01957321166992,
"logps/chosen": -3757.162841796875,
"logps/rejected": -3768.07568359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.878870964050293,
"rewards/margins": 27.17599105834961,
"rewards/rejected": -15.297121047973633,
"step": 3080
},
{
"epoch": 1.8427016549873265,
"grad_norm": 1.6106888045541723e-10,
"learning_rate": 8.137596394105884e-07,
"logits/chosen": -57.78483200073242,
"logits/rejected": -62.688690185546875,
"logps/chosen": -3791.12255859375,
"logps/rejected": -4281.9208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.808479309082031,
"rewards/margins": 31.255359649658203,
"rewards/rejected": -18.446876525878906,
"step": 3090
},
{
"epoch": 1.8486655732816462,
"grad_norm": 1.2802472623696382e-11,
"learning_rate": 7.538073372174243e-07,
"logits/chosen": -54.56230926513672,
"logits/rejected": -58.21129608154297,
"logps/chosen": -3462.29345703125,
"logps/rejected": -3649.39794921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.672677993774414,
"rewards/margins": 26.28127670288086,
"rewards/rejected": -14.608599662780762,
"step": 3100
},
{
"epoch": 1.8546294915759653,
"grad_norm": 1.6649872452978443e-11,
"learning_rate": 6.961150162250768e-07,
"logits/chosen": -53.14113235473633,
"logits/rejected": -58.1319694519043,
"logps/chosen": -3658.533203125,
"logps/rejected": -3746.419189453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.092714309692383,
"rewards/margins": 26.7106990814209,
"rewards/rejected": -14.617985725402832,
"step": 3110
},
{
"epoch": 1.8605934098702848,
"grad_norm": 2.5056786398636177e-05,
"learning_rate": 6.406880539195192e-07,
"logits/chosen": -57.97718048095703,
"logits/rejected": -60.9071044921875,
"logps/chosen": -4105.29736328125,
"logps/rejected": -4301.6162109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.045524597167969,
"rewards/margins": 30.137821197509766,
"rewards/rejected": -16.092296600341797,
"step": 3120
},
{
"epoch": 1.8665573281646042,
"grad_norm": 1.519443060260528e-08,
"learning_rate": 5.875316166332301e-07,
"logits/chosen": -52.88419723510742,
"logits/rejected": -56.966552734375,
"logps/chosen": -3373.383544921875,
"logps/rejected": -3637.07861328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.798542022705078,
"rewards/margins": 27.759979248046875,
"rewards/rejected": -15.961441040039062,
"step": 3130
},
{
"epoch": 1.8725212464589234,
"grad_norm": 1.0141298467658544e-08,
"learning_rate": 5.366506590636728e-07,
"logits/chosen": -56.66168975830078,
"logits/rejected": -60.23418426513672,
"logps/chosen": -3948.93212890625,
"logps/rejected": -4152.4853515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.425600051879883,
"rewards/margins": 26.627605438232422,
"rewards/rejected": -13.202006340026855,
"step": 3140
},
{
"epoch": 1.878485164753243,
"grad_norm": 1.0312219522745636e-08,
"learning_rate": 4.880499238114289e-07,
"logits/chosen": -56.00712203979492,
"logits/rejected": -60.405303955078125,
"logps/chosen": -3732.51220703125,
"logps/rejected": -4072.91943359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.824224472045898,
"rewards/margins": 29.8277587890625,
"rewards/rejected": -17.00353240966797,
"step": 3150
},
{
"epoch": 1.8844490830475622,
"grad_norm": 3.670846737691136e-08,
"learning_rate": 4.4173394093816323e-07,
"logits/chosen": -57.95183181762695,
"logits/rejected": -62.4155158996582,
"logps/chosen": -4040.39990234375,
"logps/rejected": -4227.3154296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.566617012023926,
"rewards/margins": 30.018875122070312,
"rewards/rejected": -15.452255249023438,
"step": 3160
},
{
"epoch": 1.8904130013418816,
"grad_norm": 7.669514729968796e-07,
"learning_rate": 3.977070275443889e-07,
"logits/chosen": -54.29924774169922,
"logits/rejected": -58.9632682800293,
"logps/chosen": -3373.657470703125,
"logps/rejected": -3459.740966796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.414060592651367,
"rewards/margins": 27.17255210876465,
"rewards/rejected": -15.758489608764648,
"step": 3170
},
{
"epoch": 1.896376919636201,
"grad_norm": 7.890444742031377e-09,
"learning_rate": 3.5597328736704515e-07,
"logits/chosen": -57.19881057739258,
"logits/rejected": -59.535736083984375,
"logps/chosen": -3508.376220703125,
"logps/rejected": -4312.44189453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.423650741577148,
"rewards/margins": 30.402755737304688,
"rewards/rejected": -16.97910499572754,
"step": 3180
},
{
"epoch": 1.9023408379305202,
"grad_norm": 0.0008170054643414915,
"learning_rate": 3.1653661039700856e-07,
"logits/chosen": -53.190582275390625,
"logits/rejected": -59.55742263793945,
"logps/chosen": -3487.789794921875,
"logps/rejected": -3961.377685546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.425531387329102,
"rewards/margins": 26.997051239013672,
"rewards/rejected": -13.571520805358887,
"step": 3190
},
{
"epoch": 1.9083047562248399,
"grad_norm": 1.0537237926655507e-07,
"learning_rate": 2.794006725165055e-07,
"logits/chosen": -58.54961395263672,
"logits/rejected": -61.74705123901367,
"logps/chosen": -3884.13525390625,
"logps/rejected": -3678.408935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.533299446105957,
"rewards/margins": 25.77750015258789,
"rewards/rejected": -12.244199752807617,
"step": 3200
},
{
"epoch": 1.914268674519159,
"grad_norm": 3.401453625428985e-07,
"learning_rate": 2.4456893515647507e-07,
"logits/chosen": -55.8200569152832,
"logits/rejected": -60.309791564941406,
"logps/chosen": -3641.48876953125,
"logps/rejected": -4029.09130859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.986165046691895,
"rewards/margins": 28.325210571289062,
"rewards/rejected": -15.339044570922852,
"step": 3210
},
{
"epoch": 1.9202325928134785,
"grad_norm": 7.95227883543248e-09,
"learning_rate": 2.1204464497393828e-07,
"logits/chosen": -58.28154754638672,
"logits/rejected": -63.397918701171875,
"logps/chosen": -3738.774169921875,
"logps/rejected": -4127.396484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.601153373718262,
"rewards/margins": 32.9376106262207,
"rewards/rejected": -20.33645248413086,
"step": 3220
},
{
"epoch": 1.9261965111077979,
"grad_norm": 5.5718683142913505e-06,
"learning_rate": 1.818308335493707e-07,
"logits/chosen": -57.85089874267578,
"logits/rejected": -62.124717712402344,
"logps/chosen": -3759.594482421875,
"logps/rejected": -4074.05859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.022012710571289,
"rewards/margins": 27.768798828125,
"rewards/rejected": -15.746786117553711,
"step": 3230
},
{
"epoch": 1.932160429402117,
"grad_norm": 2.9493682518477726e-07,
"learning_rate": 1.539303171041423e-07,
"logits/chosen": -56.495628356933594,
"logits/rejected": -59.803489685058594,
"logps/chosen": -3937.475341796875,
"logps/rejected": -4117.28955078125,
"loss": 0.0181,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 13.402850151062012,
"rewards/margins": 29.56199073791504,
"rewards/rejected": -16.159137725830078,
"step": 3240
},
{
"epoch": 1.9381243476964367,
"grad_norm": 2.0169439451933613e-09,
"learning_rate": 1.2834569623800806e-07,
"logits/chosen": -56.14719772338867,
"logits/rejected": -61.06328201293945,
"logps/chosen": -3900.921142578125,
"logps/rejected": -3950.700439453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.294220924377441,
"rewards/margins": 27.905467987060547,
"rewards/rejected": -16.611248016357422,
"step": 3250
},
{
"epoch": 1.944088265990756,
"grad_norm": 2.454358383197075e-10,
"learning_rate": 1.0507935568670469e-07,
"logits/chosen": -59.91279983520508,
"logits/rejected": -64.65019226074219,
"logps/chosen": -4057.24365234375,
"logps/rejected": -4658.7998046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.831171035766602,
"rewards/margins": 30.55777359008789,
"rewards/rejected": -16.726600646972656,
"step": 3260
},
{
"epoch": 1.9500521842850753,
"grad_norm": 6.98909570928663e-05,
"learning_rate": 8.413346409967548e-08,
"logits/chosen": -57.639564514160156,
"logits/rejected": -60.486106872558594,
"logps/chosen": -3805.05908203125,
"logps/rejected": -3502.89208984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.168159484863281,
"rewards/margins": 26.42257308959961,
"rewards/rejected": -13.254412651062012,
"step": 3270
},
{
"epoch": 1.9560161025793947,
"grad_norm": 9.799998590409587e-10,
"learning_rate": 6.5509973837935e-08,
"logits/chosen": -56.7869987487793,
"logits/rejected": -63.3160285949707,
"logps/chosen": -3626.40771484375,
"logps/rejected": -4573.03466796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.337489128112793,
"rewards/margins": 31.131816864013672,
"rewards/rejected": -16.794330596923828,
"step": 3280
},
{
"epoch": 1.961980020873714,
"grad_norm": 8.010190867935307e-06,
"learning_rate": 4.921062079207839e-08,
"logits/chosen": -58.120140075683594,
"logits/rejected": -62.119712829589844,
"logps/chosen": -3722.432861328125,
"logps/rejected": -4000.43701171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.0454740524292,
"rewards/margins": 28.300750732421875,
"rewards/rejected": -13.255276679992676,
"step": 3290
},
{
"epoch": 1.9679439391680336,
"grad_norm": 0.012528502382338047,
"learning_rate": 3.5236924220494186e-08,
"logits/chosen": -57.49330520629883,
"logits/rejected": -60.7618522644043,
"logps/chosen": -4048.438232421875,
"logps/rejected": -4347.9658203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.4490966796875,
"rewards/margins": 29.223918914794922,
"rewards/rejected": -15.774820327758789,
"step": 3300
},
{
"epoch": 1.9739078574623528,
"grad_norm": 5.5174933066837184e-08,
"learning_rate": 2.3590186607733154e-08,
"logits/chosen": -57.35243606567383,
"logits/rejected": -63.4393310546875,
"logps/chosen": -3737.04150390625,
"logps/rejected": -4001.02978515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.624621391296387,
"rewards/margins": 27.510913848876953,
"rewards/rejected": -12.886293411254883,
"step": 3310
},
{
"epoch": 1.9798717757566722,
"grad_norm": 2.4484758665010986e-10,
"learning_rate": 1.4271493543133174e-08,
"logits/chosen": -59.93109130859375,
"logits/rejected": -63.264732360839844,
"logps/chosen": -3958.082763671875,
"logps/rejected": -4254.8232421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.18006420135498,
"rewards/margins": 28.5799503326416,
"rewards/rejected": -15.399887084960938,
"step": 3320
},
{
"epoch": 1.9858356940509916,
"grad_norm": 4.756313121134781e-09,
"learning_rate": 7.281713619605723e-09,
"logits/chosen": -56.019737243652344,
"logits/rejected": -59.69663619995117,
"logps/chosen": -4050.05908203125,
"logps/rejected": -4002.97802734375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.66505241394043,
"rewards/margins": 27.011245727539062,
"rewards/rejected": -13.346193313598633,
"step": 3330
},
{
"epoch": 1.9917996123453108,
"grad_norm": 1.108175638364628e-05,
"learning_rate": 2.6214983526867686e-09,
"logits/chosen": -56.985069274902344,
"logits/rejected": -62.2716178894043,
"logps/chosen": -3683.37255859375,
"logps/rejected": -3870.92041015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.052160263061523,
"rewards/margins": 27.678844451904297,
"rewards/rejected": -14.626681327819824,
"step": 3340
},
{
"epoch": 1.9977635306396302,
"grad_norm": 2.601581456929125e-07,
"learning_rate": 2.912821198075566e-10,
"logits/chosen": -56.11452102661133,
"logits/rejected": -60.453453063964844,
"logps/chosen": -3346.564453125,
"logps/rejected": -3288.348388671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.876147270202637,
"rewards/margins": 26.675273895263672,
"rewards/rejected": -13.799127578735352,
"step": 3350
}
],
"logging_steps": 10,
"max_steps": 3354,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}