{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7777777777777777,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0044444444444444444,
"grad_norm": 3.6353344917297363,
"learning_rate": 1e-05,
"logits/chosen": -0.4628738462924957,
"logits/rejected": -0.46038827300071716,
"logps/chosen": -305.24371337890625,
"logps/rejected": -217.2339324951172,
"loss": 0.69,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.006762671284377575,
"rewards/margins": 0.005093236453831196,
"rewards/rejected": 0.0016694354126229882,
"step": 5
},
{
"epoch": 0.008888888888888889,
"grad_norm": 3.840994119644165,
"learning_rate": 9.999945685076187e-06,
"logits/chosen": -0.4660520553588867,
"logits/rejected": -0.4597313404083252,
"logps/chosen": -295.14178466796875,
"logps/rejected": -215.0008544921875,
"loss": 0.6585,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.07166890054941177,
"rewards/margins": 0.057903312146663666,
"rewards/rejected": 0.013765583746135235,
"step": 10
},
{
"epoch": 0.013333333333333334,
"grad_norm": 3.3536572456359863,
"learning_rate": 9.99978274148479e-06,
"logits/chosen": -0.46407952904701233,
"logits/rejected": -0.46835923194885254,
"logps/chosen": -307.1814880371094,
"logps/rejected": -220.930908203125,
"loss": 0.6155,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.18775026500225067,
"rewards/margins": 0.13849034905433655,
"rewards/rejected": 0.049259938299655914,
"step": 15
},
{
"epoch": 0.017777777777777778,
"grad_norm": 3.603736400604248,
"learning_rate": 9.999511172765917e-06,
"logits/chosen": -0.4126955568790436,
"logits/rejected": -0.4344128668308258,
"logps/chosen": -295.6951599121094,
"logps/rejected": -219.573974609375,
"loss": 0.572,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.3320372700691223,
"rewards/margins": 0.22264714539051056,
"rewards/rejected": 0.10939009487628937,
"step": 20
},
{
"epoch": 0.022222222222222223,
"grad_norm": 3.7065272331237793,
"learning_rate": 9.999130984819662e-06,
"logits/chosen": -0.42767876386642456,
"logits/rejected": -0.4458894729614258,
"logps/chosen": -331.75592041015625,
"logps/rejected": -231.910400390625,
"loss": 0.5323,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.5343211889266968,
"rewards/margins": 0.32672011852264404,
"rewards/rejected": 0.20760111510753632,
"step": 25
},
{
"epoch": 0.02666666666666667,
"grad_norm": 3.656710624694824,
"learning_rate": 9.998642185905977e-06,
"logits/chosen": -0.44063276052474976,
"logits/rejected": -0.4492092728614807,
"logps/chosen": -311.40277099609375,
"logps/rejected": -233.438720703125,
"loss": 0.5154,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.6043025851249695,
"rewards/margins": 0.3682531714439392,
"rewards/rejected": 0.23604938387870789,
"step": 30
},
{
"epoch": 0.03111111111111111,
"grad_norm": 3.5971853733062744,
"learning_rate": 9.998044786644492e-06,
"logits/chosen": -0.39475446939468384,
"logits/rejected": -0.4055609703063965,
"logps/chosen": -298.6465759277344,
"logps/rejected": -219.363525390625,
"loss": 0.4452,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.8082733154296875,
"rewards/margins": 0.5571426153182983,
"rewards/rejected": 0.25113070011138916,
"step": 35
},
{
"epoch": 0.035555555555555556,
"grad_norm": 3.67197322845459,
"learning_rate": 9.997338800014284e-06,
"logits/chosen": -0.41250643134117126,
"logits/rejected": -0.4259340167045593,
"logps/chosen": -293.3608703613281,
"logps/rejected": -224.9442901611328,
"loss": 0.4586,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.867678165435791,
"rewards/margins": 0.5272501707077026,
"rewards/rejected": 0.34042787551879883,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 5.049458026885986,
"learning_rate": 9.9965242413536e-06,
"logits/chosen": -0.41178879141807556,
"logits/rejected": -0.4304323196411133,
"logps/chosen": -306.1034851074219,
"logps/rejected": -228.5247802734375,
"loss": 0.3777,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 1.235072374343872,
"rewards/margins": 0.7781749367713928,
"rewards/rejected": 0.4568973183631897,
"step": 45
},
{
"epoch": 0.044444444444444446,
"grad_norm": 3.9216673374176025,
"learning_rate": 9.995601128359516e-06,
"logits/chosen": -0.40246009826660156,
"logits/rejected": -0.3950818181037903,
"logps/chosen": -303.0498352050781,
"logps/rejected": -226.4988250732422,
"loss": 0.3999,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 1.3293551206588745,
"rewards/margins": 0.8089650869369507,
"rewards/rejected": 0.5203902721405029,
"step": 50
},
{
"epoch": 0.04888888888888889,
"grad_norm": 4.434176921844482,
"learning_rate": 9.994569481087552e-06,
"logits/chosen": -0.39378249645233154,
"logits/rejected": -0.40684300661087036,
"logps/chosen": -329.46173095703125,
"logps/rejected": -223.7794952392578,
"loss": 0.3168,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 1.6077896356582642,
"rewards/margins": 1.0661401748657227,
"rewards/rejected": 0.5416494607925415,
"step": 55
},
{
"epoch": 0.05333333333333334,
"grad_norm": 4.062209129333496,
"learning_rate": 9.993429321951251e-06,
"logits/chosen": -0.34955719113349915,
"logits/rejected": -0.3819810748100281,
"logps/chosen": -291.4505615234375,
"logps/rejected": -212.7031707763672,
"loss": 0.2348,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.9593639373779297,
"rewards/margins": 1.3969981670379639,
"rewards/rejected": 0.5623658299446106,
"step": 60
},
{
"epoch": 0.057777777777777775,
"grad_norm": 4.022445201873779,
"learning_rate": 9.992180675721671e-06,
"logits/chosen": -0.3607024550437927,
"logits/rejected": -0.3758237659931183,
"logps/chosen": -325.44622802734375,
"logps/rejected": -226.538818359375,
"loss": 0.1438,
"rewards/accuracies": 0.84375,
"rewards/chosen": 2.2623605728149414,
"rewards/margins": 1.6984504461288452,
"rewards/rejected": 0.5639100670814514,
"step": 65
},
{
"epoch": 0.06222222222222222,
"grad_norm": 4.516530513763428,
"learning_rate": 9.990823569526868e-06,
"logits/chosen": -0.3758849501609802,
"logits/rejected": -0.401409387588501,
"logps/chosen": -293.59283447265625,
"logps/rejected": -212.1451873779297,
"loss": 0.2153,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 2.2533986568450928,
"rewards/margins": 1.599491000175476,
"rewards/rejected": 0.6539075374603271,
"step": 70
},
{
"epoch": 0.06666666666666667,
"grad_norm": 4.2673563957214355,
"learning_rate": 9.989358032851283e-06,
"logits/chosen": -0.38496989011764526,
"logits/rejected": -0.4185038208961487,
"logps/chosen": -331.06707763671875,
"logps/rejected": -238.4701690673828,
"loss": 0.2667,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.29125714302063,
"rewards/margins": 1.5516706705093384,
"rewards/rejected": 0.739586353302002,
"step": 75
},
{
"epoch": 0.07111111111111111,
"grad_norm": 4.720420837402344,
"learning_rate": 9.987784097535126e-06,
"logits/chosen": -0.36235010623931885,
"logits/rejected": -0.3792596757411957,
"logps/chosen": -303.70196533203125,
"logps/rejected": -226.2982940673828,
"loss": 0.1933,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 2.5473880767822266,
"rewards/margins": 1.8545589447021484,
"rewards/rejected": 0.6928290128707886,
"step": 80
},
{
"epoch": 0.07555555555555556,
"grad_norm": 7.281383037567139,
"learning_rate": 9.986101797773667e-06,
"logits/chosen": -0.380900114774704,
"logits/rejected": -0.3917911946773529,
"logps/chosen": -295.6128234863281,
"logps/rejected": -229.4749298095703,
"loss": 0.213,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.3351025581359863,
"rewards/margins": 1.7777379751205444,
"rewards/rejected": 0.5573645830154419,
"step": 85
},
{
"epoch": 0.08,
"grad_norm": 4.908195972442627,
"learning_rate": 9.984311170116497e-06,
"logits/chosen": -0.37983238697052,
"logits/rejected": -0.3918471932411194,
"logps/chosen": -291.7480163574219,
"logps/rejected": -220.09652709960938,
"loss": 0.1737,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 2.4933111667633057,
"rewards/margins": 1.936655044555664,
"rewards/rejected": 0.5566561222076416,
"step": 90
},
{
"epoch": 0.08444444444444445,
"grad_norm": 5.108543395996094,
"learning_rate": 9.98241225346674e-06,
"logits/chosen": -0.34952667355537415,
"logits/rejected": -0.3850114643573761,
"logps/chosen": -302.5187683105469,
"logps/rejected": -222.33834838867188,
"loss": 0.1069,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 2.5169692039489746,
"rewards/margins": 2.1484999656677246,
"rewards/rejected": 0.36846891045570374,
"step": 95
},
{
"epoch": 0.08888888888888889,
"grad_norm": 4.840832233428955,
"learning_rate": 9.9804050890802e-06,
"logits/chosen": -0.33506280183792114,
"logits/rejected": -0.3472011089324951,
"logps/chosen": -269.5045471191406,
"logps/rejected": -205.8480987548828,
"loss": 0.1074,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 2.4071407318115234,
"rewards/margins": 2.1650052070617676,
"rewards/rejected": 0.24213531613349915,
"step": 100
},
{
"epoch": 0.09333333333333334,
"grad_norm": 5.977898120880127,
"learning_rate": 9.978289720564471e-06,
"logits/chosen": -0.33771952986717224,
"logits/rejected": -0.3690803050994873,
"logps/chosen": -308.5394287109375,
"logps/rejected": -226.24813842773438,
"loss": 0.2107,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 2.450155735015869,
"rewards/margins": 2.019880771636963,
"rewards/rejected": 0.4302748143672943,
"step": 105
},
{
"epoch": 0.09777777777777778,
"grad_norm": 6.031980514526367,
"learning_rate": 9.976066193877982e-06,
"logits/chosen": -0.347932904958725,
"logits/rejected": -0.3845617175102234,
"logps/chosen": -293.0547790527344,
"logps/rejected": -220.5181427001953,
"loss": 0.1324,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 2.321216106414795,
"rewards/margins": 2.2665910720825195,
"rewards/rejected": 0.054625045508146286,
"step": 110
},
{
"epoch": 0.10222222222222223,
"grad_norm": 4.3759660720825195,
"learning_rate": 9.97373455732901e-06,
"logits/chosen": -0.34049180150032043,
"logits/rejected": -0.35589173436164856,
"logps/chosen": -294.77117919921875,
"logps/rejected": -228.1370086669922,
"loss": 0.0647,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 2.3987247943878174,
"rewards/margins": 2.2897610664367676,
"rewards/rejected": 0.1089634895324707,
"step": 115
},
{
"epoch": 0.10666666666666667,
"grad_norm": 4.7751336097717285,
"learning_rate": 9.971294861574617e-06,
"logits/chosen": -0.3569382429122925,
"logits/rejected": -0.35876479744911194,
"logps/chosen": -285.916748046875,
"logps/rejected": -219.81753540039062,
"loss": 0.0106,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.5361573696136475,
"rewards/margins": 2.6316070556640625,
"rewards/rejected": -0.09544976055622101,
"step": 120
},
{
"epoch": 0.1111111111111111,
"grad_norm": 6.836686134338379,
"learning_rate": 9.968747159619556e-06,
"logits/chosen": -0.3644478917121887,
"logits/rejected": -0.3773222863674164,
"logps/chosen": -301.05084228515625,
"logps/rejected": -231.2653045654297,
"loss": 0.1055,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 2.7265372276306152,
"rewards/margins": 2.74798846244812,
"rewards/rejected": -0.021450763568282127,
"step": 125
},
{
"epoch": 0.11555555555555555,
"grad_norm": 6.012020587921143,
"learning_rate": 9.966091506815128e-06,
"logits/chosen": -0.34487825632095337,
"logits/rejected": -0.3683899939060211,
"logps/chosen": -296.5931701660156,
"logps/rejected": -223.12753295898438,
"loss": 0.0823,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 2.7003228664398193,
"rewards/margins": 2.8483641147613525,
"rewards/rejected": -0.14804117381572723,
"step": 130
},
{
"epoch": 0.12,
"grad_norm": 5.459484100341797,
"learning_rate": 9.963327960857962e-06,
"logits/chosen": -0.3142702579498291,
"logits/rejected": -0.36442944407463074,
"logps/chosen": -310.71368408203125,
"logps/rejected": -211.5228729248047,
"loss": -0.0914,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 3.0668811798095703,
"rewards/margins": 3.4564356803894043,
"rewards/rejected": -0.3895547688007355,
"step": 135
},
{
"epoch": 0.12444444444444444,
"grad_norm": 4.6439080238342285,
"learning_rate": 9.960456581788771e-06,
"logits/chosen": -0.3213174343109131,
"logits/rejected": -0.35702863335609436,
"logps/chosen": -295.9752502441406,
"logps/rejected": -218.5254364013672,
"loss": -0.0509,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 2.7388923168182373,
"rewards/margins": 3.2099480628967285,
"rewards/rejected": -0.47105544805526733,
"step": 140
},
{
"epoch": 0.1288888888888889,
"grad_norm": 7.5686140060424805,
"learning_rate": 9.957477431991053e-06,
"logits/chosen": -0.3489062190055847,
"logits/rejected": -0.38331982493400574,
"logps/chosen": -301.20574951171875,
"logps/rejected": -221.79617309570312,
"loss": 0.0026,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 2.3142552375793457,
"rewards/margins": 3.0163302421569824,
"rewards/rejected": -0.7020750045776367,
"step": 145
},
{
"epoch": 0.13333333333333333,
"grad_norm": 7.296176910400391,
"learning_rate": 9.954390576189726e-06,
"logits/chosen": -0.32641178369522095,
"logits/rejected": -0.3621976673603058,
"logps/chosen": -312.5970153808594,
"logps/rejected": -246.51974487304688,
"loss": 0.1054,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 2.6623806953430176,
"rewards/margins": 2.9366531372070312,
"rewards/rejected": -0.27427244186401367,
"step": 150
},
{
"epoch": 0.13777777777777778,
"grad_norm": 6.089158535003662,
"learning_rate": 9.95119608144972e-06,
"logits/chosen": -0.34094589948654175,
"logits/rejected": -0.36057132482528687,
"logps/chosen": -298.9684143066406,
"logps/rejected": -232.59097290039062,
"loss": 0.0235,
"rewards/accuracies": 0.78125,
"rewards/chosen": 2.0118796825408936,
"rewards/margins": 2.952807664871216,
"rewards/rejected": -0.9409275054931641,
"step": 155
},
{
"epoch": 0.14222222222222222,
"grad_norm": 5.5049662590026855,
"learning_rate": 9.947894017174535e-06,
"logits/chosen": -0.30161410570144653,
"logits/rejected": -0.3480113446712494,
"logps/chosen": -306.0929870605469,
"logps/rejected": -226.927734375,
"loss": -0.0148,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.6642236709594727,
"rewards/margins": 3.523815631866455,
"rewards/rejected": -0.8595919609069824,
"step": 160
},
{
"epoch": 0.14666666666666667,
"grad_norm": 5.161729335784912,
"learning_rate": 9.944484455104716e-06,
"logits/chosen": -0.3171108067035675,
"logits/rejected": -0.35261866450309753,
"logps/chosen": -290.88250732421875,
"logps/rejected": -234.3745880126953,
"loss": 0.085,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 1.7442996501922607,
"rewards/margins": 2.7870707511901855,
"rewards/rejected": -1.0427708625793457,
"step": 165
},
{
"epoch": 0.1511111111111111,
"grad_norm": 5.5866618156433105,
"learning_rate": 9.940967469316307e-06,
"logits/chosen": -0.3179735541343689,
"logits/rejected": -0.3568040728569031,
"logps/chosen": -328.8294372558594,
"logps/rejected": -225.0006561279297,
"loss": -0.2403,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 3.2372944355010986,
"rewards/margins": 4.343171119689941,
"rewards/rejected": -1.105877161026001,
"step": 170
},
{
"epoch": 0.15555555555555556,
"grad_norm": 6.542994499206543,
"learning_rate": 9.937343136219234e-06,
"logits/chosen": -0.2941819429397583,
"logits/rejected": -0.3379240930080414,
"logps/chosen": -301.7574462890625,
"logps/rejected": -217.0867919921875,
"loss": -0.1938,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 2.6589813232421875,
"rewards/margins": 4.12928581237793,
"rewards/rejected": -1.4703044891357422,
"step": 175
},
{
"epoch": 0.16,
"grad_norm": 8.16442584991455,
"learning_rate": 9.933611534555645e-06,
"logits/chosen": -0.3271011710166931,
"logits/rejected": -0.3445083200931549,
"logps/chosen": -319.7223205566406,
"logps/rejected": -247.19967651367188,
"loss": -0.0822,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 2.863480806350708,
"rewards/margins": 3.848937511444092,
"rewards/rejected": -0.9854568243026733,
"step": 180
},
{
"epoch": 0.16444444444444445,
"grad_norm": 5.072612285614014,
"learning_rate": 9.929772745398207e-06,
"logits/chosen": -0.3311443328857422,
"logits/rejected": -0.34855595231056213,
"logps/chosen": -304.24371337890625,
"logps/rejected": -228.12423706054688,
"loss": -0.0953,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 2.729543685913086,
"rewards/margins": 3.911928653717041,
"rewards/rejected": -1.1823843717575073,
"step": 185
},
{
"epoch": 0.1688888888888889,
"grad_norm": 5.6365966796875,
"learning_rate": 9.925826852148332e-06,
"logits/chosen": -0.37482309341430664,
"logits/rejected": -0.37685567140579224,
"logps/chosen": -323.057373046875,
"logps/rejected": -238.3128204345703,
"loss": -0.1351,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.4461398124694824,
"rewards/margins": 4.063778400421143,
"rewards/rejected": -1.6176389455795288,
"step": 190
},
{
"epoch": 0.17333333333333334,
"grad_norm": 7.323070049285889,
"learning_rate": 9.921773940534382e-06,
"logits/chosen": -0.30995437502861023,
"logits/rejected": -0.3514579236507416,
"logps/chosen": -283.3837890625,
"logps/rejected": -228.30905151367188,
"loss": 0.0783,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 1.8092823028564453,
"rewards/margins": 3.168578624725342,
"rewards/rejected": -1.3592965602874756,
"step": 195
},
{
"epoch": 0.17777777777777778,
"grad_norm": 5.961920738220215,
"learning_rate": 9.917614098609786e-06,
"logits/chosen": -0.3327783942222595,
"logits/rejected": -0.36006277799606323,
"logps/chosen": -307.5805358886719,
"logps/rejected": -228.3577880859375,
"loss": -0.0971,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 2.8803937435150146,
"rewards/margins": 3.999189853668213,
"rewards/rejected": -1.1187958717346191,
"step": 200
},
{
"epoch": 0.18222222222222223,
"grad_norm": 8.006556510925293,
"learning_rate": 9.913347416751148e-06,
"logits/chosen": -0.290499210357666,
"logits/rejected": -0.32564371824264526,
"logps/chosen": -310.0920715332031,
"logps/rejected": -222.1564178466797,
"loss": -0.1483,
"rewards/accuracies": 0.8125,
"rewards/chosen": 3.187587022781372,
"rewards/margins": 4.311644554138184,
"rewards/rejected": -1.124057650566101,
"step": 205
},
{
"epoch": 0.18666666666666668,
"grad_norm": 5.825204849243164,
"learning_rate": 9.908973987656263e-06,
"logits/chosen": -0.3070078492164612,
"logits/rejected": -0.3182796239852905,
"logps/chosen": -289.46490478515625,
"logps/rejected": -223.34725952148438,
"loss": -0.2937,
"rewards/accuracies": 0.875,
"rewards/chosen": 3.196002244949341,
"rewards/margins": 4.649127006530762,
"rewards/rejected": -1.453124761581421,
"step": 210
},
{
"epoch": 0.19111111111111112,
"grad_norm": 8.087427139282227,
"learning_rate": 9.904493906342124e-06,
"logits/chosen": -0.284060001373291,
"logits/rejected": -0.3289189636707306,
"logps/chosen": -292.52984619140625,
"logps/rejected": -227.1925811767578,
"loss": -0.0803,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 2.883082628250122,
"rewards/margins": 4.214940071105957,
"rewards/rejected": -1.3318575620651245,
"step": 215
},
{
"epoch": 0.19555555555555557,
"grad_norm": 6.931927680969238,
"learning_rate": 9.899907270142835e-06,
"logits/chosen": -0.29949700832366943,
"logits/rejected": -0.3155062794685364,
"logps/chosen": -305.6365661621094,
"logps/rejected": -233.4442901611328,
"loss": -0.0835,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 2.9822609424591064,
"rewards/margins": 4.396633148193359,
"rewards/rejected": -1.4143723249435425,
"step": 220
},
{
"epoch": 0.2,
"grad_norm": 9.130791664123535,
"learning_rate": 9.895214178707516e-06,
"logits/chosen": -0.31096282601356506,
"logits/rejected": -0.3551832437515259,
"logps/chosen": -313.84320068359375,
"logps/rejected": -236.7030029296875,
"loss": -0.0991,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 2.626882553100586,
"rewards/margins": 4.163486003875732,
"rewards/rejected": -1.536603331565857,
"step": 225
},
{
"epoch": 0.20444444444444446,
"grad_norm": 5.590844631195068,
"learning_rate": 9.890414733998131e-06,
"logits/chosen": -0.2635526657104492,
"logits/rejected": -0.29329806566238403,
"logps/chosen": -297.04638671875,
"logps/rejected": -236.12271118164062,
"loss": 0.0262,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 2.713446855545044,
"rewards/margins": 3.905104875564575,
"rewards/rejected": -1.1916577816009521,
"step": 230
},
{
"epoch": 0.2088888888888889,
"grad_norm": 4.7747979164123535,
"learning_rate": 9.885509040287267e-06,
"logits/chosen": -0.30965957045555115,
"logits/rejected": -0.31621426343917847,
"logps/chosen": -288.59014892578125,
"logps/rejected": -220.62765502929688,
"loss": -0.342,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 3.3641788959503174,
"rewards/margins": 5.042544841766357,
"rewards/rejected": -1.6783654689788818,
"step": 235
},
{
"epoch": 0.21333333333333335,
"grad_norm": 6.6951680183410645,
"learning_rate": 9.880497204155879e-06,
"logits/chosen": -0.27586597204208374,
"logits/rejected": -0.3355752229690552,
"logps/chosen": -316.9598388671875,
"logps/rejected": -244.87149047851562,
"loss": -0.0482,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 2.7237601280212402,
"rewards/margins": 3.941441774368286,
"rewards/rejected": -1.217681646347046,
"step": 240
},
{
"epoch": 0.21777777777777776,
"grad_norm": 5.907822608947754,
"learning_rate": 9.875379334490962e-06,
"logits/chosen": -0.3292551338672638,
"logits/rejected": -0.31635525822639465,
"logps/chosen": -291.8619689941406,
"logps/rejected": -231.01016235351562,
"loss": -0.1414,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 2.5336103439331055,
"rewards/margins": 4.392641067504883,
"rewards/rejected": -1.8590309619903564,
"step": 245
},
{
"epoch": 0.2222222222222222,
"grad_norm": 7.046641826629639,
"learning_rate": 9.870155542483199e-06,
"logits/chosen": -0.3067111372947693,
"logits/rejected": -0.35157865285873413,
"logps/chosen": -319.73187255859375,
"logps/rejected": -236.04483032226562,
"loss": -0.3243,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 3.8645987510681152,
"rewards/margins": 5.3452653884887695,
"rewards/rejected": -1.4806665182113647,
"step": 250
},
{
"epoch": 0.22666666666666666,
"grad_norm": 9.379409790039062,
"learning_rate": 9.864825941624538e-06,
"logits/chosen": -0.267128050327301,
"logits/rejected": -0.2918349802494049,
"logps/chosen": -313.8056335449219,
"logps/rejected": -247.0101776123047,
"loss": -0.1449,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 3.3342041969299316,
"rewards/margins": 4.6097564697265625,
"rewards/rejected": -1.27555251121521,
"step": 255
},
{
"epoch": 0.2311111111111111,
"grad_norm": 7.974717617034912,
"learning_rate": 9.85939064770572e-06,
"logits/chosen": -0.3181043267250061,
"logits/rejected": -0.3094359338283539,
"logps/chosen": -317.86505126953125,
"logps/rejected": -245.4908905029297,
"loss": -0.3156,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 3.143451452255249,
"rewards/margins": 5.211213111877441,
"rewards/rejected": -2.0677614212036133,
"step": 260
},
{
"epoch": 0.23555555555555555,
"grad_norm": 6.511713027954102,
"learning_rate": 9.853849778813777e-06,
"logits/chosen": -0.29388368129730225,
"logits/rejected": -0.3029894530773163,
"logps/chosen": -297.4751892089844,
"logps/rejected": -231.9884796142578,
"loss": -0.167,
"rewards/accuracies": 0.8125,
"rewards/chosen": 3.295167922973633,
"rewards/margins": 5.074382305145264,
"rewards/rejected": -1.77921462059021,
"step": 265
},
{
"epoch": 0.24,
"grad_norm": 6.8637776374816895,
"learning_rate": 9.848203455329459e-06,
"logits/chosen": -0.31308668851852417,
"logits/rejected": -0.3360288441181183,
"logps/chosen": -296.82318115234375,
"logps/rejected": -236.3435821533203,
"loss": -0.2143,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 2.4768126010894775,
"rewards/margins": 4.899205684661865,
"rewards/rejected": -2.422393321990967,
"step": 270
},
{
"epoch": 0.24444444444444444,
"grad_norm": 10.344189643859863,
"learning_rate": 9.842451799924616e-06,
"logits/chosen": -0.2888021171092987,
"logits/rejected": -0.3189722001552582,
"logps/chosen": -321.9690246582031,
"logps/rejected": -239.86520385742188,
"loss": -0.327,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 2.709468364715576,
"rewards/margins": 5.345309257507324,
"rewards/rejected": -2.635840892791748,
"step": 275
},
{
"epoch": 0.24888888888888888,
"grad_norm": 11.444976806640625,
"learning_rate": 9.836594937559541e-06,
"logits/chosen": -0.28263232111930847,
"logits/rejected": -0.2964705526828766,
"logps/chosen": -296.26995849609375,
"logps/rejected": -232.54934692382812,
"loss": 0.0017,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 2.52939772605896,
"rewards/margins": 4.631046295166016,
"rewards/rejected": -2.1016488075256348,
"step": 280
},
{
"epoch": 0.25333333333333335,
"grad_norm": 5.921905040740967,
"learning_rate": 9.830632995480243e-06,
"logits/chosen": -0.26743844151496887,
"logits/rejected": -0.27696385979652405,
"logps/chosen": -298.95477294921875,
"logps/rejected": -236.3730926513672,
"loss": -0.2483,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 2.9082038402557373,
"rewards/margins": 5.20479154586792,
"rewards/rejected": -2.2965879440307617,
"step": 285
},
{
"epoch": 0.2577777777777778,
"grad_norm": 6.96235990524292,
"learning_rate": 9.824566103215697e-06,
"logits/chosen": -0.2472468614578247,
"logits/rejected": -0.2931605279445648,
"logps/chosen": -298.02581787109375,
"logps/rejected": -231.60879516601562,
"loss": -0.2804,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.8193132877349854,
"rewards/margins": 5.249671459197998,
"rewards/rejected": -2.430358648300171,
"step": 290
},
{
"epoch": 0.26222222222222225,
"grad_norm": 8.460125923156738,
"learning_rate": 9.818394392575018e-06,
"logits/chosen": -0.30542343854904175,
"logits/rejected": -0.32763975858688354,
"logps/chosen": -285.7476501464844,
"logps/rejected": -243.5345458984375,
"loss": -0.1747,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 2.6046016216278076,
"rewards/margins": 4.996693134307861,
"rewards/rejected": -2.3920915126800537,
"step": 295
},
{
"epoch": 0.26666666666666666,
"grad_norm": 7.488274097442627,
"learning_rate": 9.812117997644606e-06,
"logits/chosen": -0.2731490731239319,
"logits/rejected": -0.30121108889579773,
"logps/chosen": -284.916259765625,
"logps/rejected": -233.185546875,
"loss": -0.1565,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.169166088104248,
"rewards/margins": 5.066960334777832,
"rewards/rejected": -2.897794246673584,
"step": 300
},
{
"epoch": 0.27111111111111114,
"grad_norm": 8.804214477539062,
"learning_rate": 9.805737054785223e-06,
"logits/chosen": -0.2953334450721741,
"logits/rejected": -0.327360063791275,
"logps/chosen": -300.7308349609375,
"logps/rejected": -236.43685913085938,
"loss": -0.0923,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 2.0204293727874756,
"rewards/margins": 4.94085168838501,
"rewards/rejected": -2.920422315597534,
"step": 305
},
{
"epoch": 0.27555555555555555,
"grad_norm": 8.035072326660156,
"learning_rate": 9.79925170262904e-06,
"logits/chosen": -0.26204347610473633,
"logits/rejected": -0.31125301122665405,
"logps/chosen": -280.49102783203125,
"logps/rejected": -226.66110229492188,
"loss": -0.2221,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 1.1722501516342163,
"rewards/margins": 4.767660140991211,
"rewards/rejected": -3.595409870147705,
"step": 310
},
{
"epoch": 0.28,
"grad_norm": 6.416834831237793,
"learning_rate": 9.792662082076618e-06,
"logits/chosen": -0.2821267247200012,
"logits/rejected": -0.29524296522140503,
"logps/chosen": -313.2020568847656,
"logps/rejected": -236.20578002929688,
"loss": -0.3875,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 2.725848913192749,
"rewards/margins": 5.90293025970459,
"rewards/rejected": -3.17708158493042,
"step": 315
},
{
"epoch": 0.28444444444444444,
"grad_norm": 9.474376678466797,
"learning_rate": 9.785968336293859e-06,
"logits/chosen": -0.2762632966041565,
"logits/rejected": -0.34091368317604065,
"logps/chosen": -315.105224609375,
"logps/rejected": -250.1154327392578,
"loss": -0.2361,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.9460265636444092,
"rewards/margins": 5.044549465179443,
"rewards/rejected": -3.0985231399536133,
"step": 320
},
{
"epoch": 0.28888888888888886,
"grad_norm": 7.876622200012207,
"learning_rate": 9.779170610708872e-06,
"logits/chosen": -0.26600781083106995,
"logits/rejected": -0.2999460697174072,
"logps/chosen": -315.525146484375,
"logps/rejected": -239.6782989501953,
"loss": -0.3024,
"rewards/accuracies": 0.78125,
"rewards/chosen": 2.9207816123962402,
"rewards/margins": 6.140283107757568,
"rewards/rejected": -3.219501495361328,
"step": 325
},
{
"epoch": 0.29333333333333333,
"grad_norm": 9.389948844909668,
"learning_rate": 9.772269053008841e-06,
"logits/chosen": -0.2716449201107025,
"logits/rejected": -0.31395813822746277,
"logps/chosen": -293.0248107910156,
"logps/rejected": -221.9087371826172,
"loss": -0.1898,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.0722451210021973,
"rewards/margins": 5.335482597351074,
"rewards/rejected": -3.263237714767456,
"step": 330
},
{
"epoch": 0.29777777777777775,
"grad_norm": 8.571460723876953,
"learning_rate": 9.765263813136796e-06,
"logits/chosen": -0.27379176020622253,
"logits/rejected": -0.31927746534347534,
"logps/chosen": -306.12799072265625,
"logps/rejected": -229.9273681640625,
"loss": -0.1855,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 1.6866050958633423,
"rewards/margins": 4.964447975158691,
"rewards/rejected": -3.2778429985046387,
"step": 335
},
{
"epoch": 0.3022222222222222,
"grad_norm": 7.001428127288818,
"learning_rate": 9.758155043288367e-06,
"logits/chosen": -0.28565549850463867,
"logits/rejected": -0.3229166865348816,
"logps/chosen": -297.42449951171875,
"logps/rejected": -250.22286987304688,
"loss": -0.2147,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 2.1362271308898926,
"rewards/margins": 5.681948661804199,
"rewards/rejected": -3.5457210540771484,
"step": 340
},
{
"epoch": 0.30666666666666664,
"grad_norm": 8.346474647521973,
"learning_rate": 9.750942897908468e-06,
"logits/chosen": -0.24829097092151642,
"logits/rejected": -0.2842785120010376,
"logps/chosen": -293.74859619140625,
"logps/rejected": -236.0409698486328,
"loss": -0.4852,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 2.535234212875366,
"rewards/margins": 6.377307891845703,
"rewards/rejected": -3.842073917388916,
"step": 345
},
{
"epoch": 0.3111111111111111,
"grad_norm": 6.549100875854492,
"learning_rate": 9.743627533687953e-06,
"logits/chosen": -0.2822897136211395,
"logits/rejected": -0.3249056041240692,
"logps/chosen": -297.6363830566406,
"logps/rejected": -229.268798828125,
"loss": -0.4093,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.632903814315796,
"rewards/margins": 6.515559196472168,
"rewards/rejected": -3.882655620574951,
"step": 350
},
{
"epoch": 0.31555555555555553,
"grad_norm": 6.955848217010498,
"learning_rate": 9.736209109560201e-06,
"logits/chosen": -0.2583480179309845,
"logits/rejected": -0.31130915880203247,
"logps/chosen": -286.8586730957031,
"logps/rejected": -228.97238159179688,
"loss": -0.3784,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 2.1645522117614746,
"rewards/margins": 5.964513301849365,
"rewards/rejected": -3.7999610900878906,
"step": 355
},
{
"epoch": 0.32,
"grad_norm": 9.709640502929688,
"learning_rate": 9.728687786697667e-06,
"logits/chosen": -0.2713499069213867,
"logits/rejected": -0.3231387734413147,
"logps/chosen": -310.96929931640625,
"logps/rejected": -238.51025390625,
"loss": -0.3302,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 2.8049497604370117,
"rewards/margins": 6.414994239807129,
"rewards/rejected": -3.6100432872772217,
"step": 360
},
{
"epoch": 0.3244444444444444,
"grad_norm": 6.710853099822998,
"learning_rate": 9.721063728508384e-06,
"logits/chosen": -0.28875869512557983,
"logits/rejected": -0.32300078868865967,
"logps/chosen": -297.21221923828125,
"logps/rejected": -246.2225799560547,
"loss": -0.3494,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 2.3010525703430176,
"rewards/margins": 6.263821601867676,
"rewards/rejected": -3.9627685546875,
"step": 365
},
{
"epoch": 0.3288888888888889,
"grad_norm": 9.562369346618652,
"learning_rate": 9.713337100632407e-06,
"logits/chosen": -0.23941664397716522,
"logits/rejected": -0.2882528305053711,
"logps/chosen": -297.01116943359375,
"logps/rejected": -246.925048828125,
"loss": -0.2107,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 1.6213099956512451,
"rewards/margins": 5.55633020401001,
"rewards/rejected": -3.935020923614502,
"step": 370
},
{
"epoch": 0.3333333333333333,
"grad_norm": 8.356274604797363,
"learning_rate": 9.705508070938219e-06,
"logits/chosen": -0.26807016134262085,
"logits/rejected": -0.29893961548805237,
"logps/chosen": -310.1183776855469,
"logps/rejected": -234.883544921875,
"loss": -0.6507,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 3.1703178882598877,
"rewards/margins": 7.669167995452881,
"rewards/rejected": -4.498850345611572,
"step": 375
},
{
"epoch": 0.3377777777777778,
"grad_norm": 7.058998107910156,
"learning_rate": 9.697576809519079e-06,
"logits/chosen": -0.2949567139148712,
"logits/rejected": -0.3223188519477844,
"logps/chosen": -311.98773193359375,
"logps/rejected": -245.71194458007812,
"loss": -0.3221,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 1.6648155450820923,
"rewards/margins": 6.265153884887695,
"rewards/rejected": -4.600337982177734,
"step": 380
},
{
"epoch": 0.3422222222222222,
"grad_norm": 8.904199600219727,
"learning_rate": 9.689543488689332e-06,
"logits/chosen": -0.25813308358192444,
"logits/rejected": -0.29112708568573,
"logps/chosen": -301.86834716796875,
"logps/rejected": -247.51974487304688,
"loss": -0.3268,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.9261104464530945,
"rewards/margins": 5.75935173034668,
"rewards/rejected": -4.8332414627075195,
"step": 385
},
{
"epoch": 0.3466666666666667,
"grad_norm": 6.82271671295166,
"learning_rate": 9.68140828298066e-06,
"logits/chosen": -0.27823004126548767,
"logits/rejected": -0.30908042192459106,
"logps/chosen": -302.1865539550781,
"logps/rejected": -234.9086456298828,
"loss": -0.5093,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 1.626147985458374,
"rewards/margins": 6.896539211273193,
"rewards/rejected": -5.270391941070557,
"step": 390
},
{
"epoch": 0.3511111111111111,
"grad_norm": 10.339046478271484,
"learning_rate": 9.673171369138297e-06,
"logits/chosen": -0.2561442255973816,
"logits/rejected": -0.2945733666419983,
"logps/chosen": -304.0384216308594,
"logps/rejected": -242.31137084960938,
"loss": -0.3237,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 1.6995117664337158,
"rewards/margins": 6.307187557220459,
"rewards/rejected": -4.607676029205322,
"step": 395
},
{
"epoch": 0.35555555555555557,
"grad_norm": 7.32875919342041,
"learning_rate": 9.66483292611718e-06,
"logits/chosen": -0.2525383234024048,
"logits/rejected": -0.28177526593208313,
"logps/chosen": -292.54046630859375,
"logps/rejected": -235.5553741455078,
"loss": -0.4031,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 1.865870714187622,
"rewards/margins": 6.680575370788574,
"rewards/rejected": -4.814703941345215,
"step": 400
},
{
"epoch": 0.36,
"grad_norm": 12.206645011901855,
"learning_rate": 9.656393135078067e-06,
"logits/chosen": -0.2548236846923828,
"logits/rejected": -0.30014172196388245,
"logps/chosen": -300.7771911621094,
"logps/rejected": -239.86367797851562,
"loss": -0.538,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 1.971895456314087,
"rewards/margins": 7.470471382141113,
"rewards/rejected": -5.498574733734131,
"step": 405
},
{
"epoch": 0.36444444444444446,
"grad_norm": 8.910218238830566,
"learning_rate": 9.647852179383606e-06,
"logits/chosen": -0.27060994505882263,
"logits/rejected": -0.3155694603919983,
"logps/chosen": -300.47210693359375,
"logps/rejected": -234.78250122070312,
"loss": -0.5798,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 1.5166006088256836,
"rewards/margins": 7.203047275543213,
"rewards/rejected": -5.686446189880371,
"step": 410
},
{
"epoch": 0.3688888888888889,
"grad_norm": 10.572662353515625,
"learning_rate": 9.639210244594335e-06,
"logits/chosen": -0.2864235043525696,
"logits/rejected": -0.30632856488227844,
"logps/chosen": -301.4001159667969,
"logps/rejected": -254.0180206298828,
"loss": -0.4088,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 1.349990963935852,
"rewards/margins": 6.885331630706787,
"rewards/rejected": -5.535341262817383,
"step": 415
},
{
"epoch": 0.37333333333333335,
"grad_norm": 9.515912055969238,
"learning_rate": 9.630467518464666e-06,
"logits/chosen": -0.2558160424232483,
"logits/rejected": -0.2956928312778473,
"logps/chosen": -304.60302734375,
"logps/rejected": -240.9836883544922,
"loss": -0.2976,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.0677791833877563,
"rewards/margins": 6.517449378967285,
"rewards/rejected": -5.44966983795166,
"step": 420
},
{
"epoch": 0.37777777777777777,
"grad_norm": 7.812021255493164,
"learning_rate": 9.621624190938802e-06,
"logits/chosen": -0.2350511997938156,
"logits/rejected": -0.2532605528831482,
"logps/chosen": -320.7872009277344,
"logps/rejected": -254.2796173095703,
"loss": -0.4306,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 1.7731481790542603,
"rewards/margins": 7.587254524230957,
"rewards/rejected": -5.814105987548828,
"step": 425
},
{
"epoch": 0.38222222222222224,
"grad_norm": 7.678309917449951,
"learning_rate": 9.612680454146609e-06,
"logits/chosen": -0.22189001739025116,
"logits/rejected": -0.2614109218120575,
"logps/chosen": -325.14239501953125,
"logps/rejected": -250.6171875,
"loss": -0.6808,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 2.040872573852539,
"rewards/margins": 8.241477012634277,
"rewards/rejected": -6.200604438781738,
"step": 430
},
{
"epoch": 0.38666666666666666,
"grad_norm": 10.421648979187012,
"learning_rate": 9.603636502399436e-06,
"logits/chosen": -0.2654271721839905,
"logits/rejected": -0.302105575799942,
"logps/chosen": -332.26593017578125,
"logps/rejected": -245.6372528076172,
"loss": -0.6012,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 1.5932146310806274,
"rewards/margins": 8.200170516967773,
"rewards/rejected": -6.606956481933594,
"step": 435
},
{
"epoch": 0.39111111111111113,
"grad_norm": 6.699859142303467,
"learning_rate": 9.594492532185909e-06,
"logits/chosen": -0.2850594222545624,
"logits/rejected": -0.3033252954483032,
"logps/chosen": -307.18463134765625,
"logps/rejected": -248.4318389892578,
"loss": -0.3909,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.7229059934616089,
"rewards/margins": 6.767951011657715,
"rewards/rejected": -6.045044422149658,
"step": 440
},
{
"epoch": 0.39555555555555555,
"grad_norm": 9.903278350830078,
"learning_rate": 9.585248742167638e-06,
"logits/chosen": -0.2718963325023651,
"logits/rejected": -0.30517634749412537,
"logps/chosen": -309.89031982421875,
"logps/rejected": -250.9574432373047,
"loss": -0.3108,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4615413546562195,
"rewards/margins": 6.469930171966553,
"rewards/rejected": -6.9314703941345215,
"step": 445
},
{
"epoch": 0.4,
"grad_norm": 5.433151721954346,
"learning_rate": 9.57590533317493e-06,
"logits/chosen": -0.2626163959503174,
"logits/rejected": -0.32341477274894714,
"logps/chosen": -312.3274230957031,
"logps/rejected": -233.1435546875,
"loss": -0.7956,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.924404501914978,
"rewards/margins": 8.514490127563477,
"rewards/rejected": -7.590085029602051,
"step": 450
},
{
"epoch": 0.40444444444444444,
"grad_norm": 8.837213516235352,
"learning_rate": 9.566462508202403e-06,
"logits/chosen": -0.248914435505867,
"logits/rejected": -0.3085024952888489,
"logps/chosen": -311.30328369140625,
"logps/rejected": -252.2725067138672,
"loss": -0.4715,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.2921377718448639,
"rewards/margins": 7.126054286956787,
"rewards/rejected": -6.833916664123535,
"step": 455
},
{
"epoch": 0.4088888888888889,
"grad_norm": 7.693172454833984,
"learning_rate": 9.55692047240458e-06,
"logits/chosen": -0.2304973304271698,
"logits/rejected": -0.28682953119277954,
"logps/chosen": -311.18023681640625,
"logps/rejected": -240.9733123779297,
"loss": -0.6889,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.7987859845161438,
"rewards/margins": 8.30670166015625,
"rewards/rejected": -7.507915496826172,
"step": 460
},
{
"epoch": 0.41333333333333333,
"grad_norm": 8.775394439697266,
"learning_rate": 9.547279433091446e-06,
"logits/chosen": -0.2938714325428009,
"logits/rejected": -0.314927875995636,
"logps/chosen": -307.7293701171875,
"logps/rejected": -249.83523559570312,
"loss": -0.5757,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.2602098286151886,
"rewards/margins": 7.839123725891113,
"rewards/rejected": -7.57891321182251,
"step": 465
},
{
"epoch": 0.4177777777777778,
"grad_norm": 6.385190010070801,
"learning_rate": 9.537539599723924e-06,
"logits/chosen": -0.2282254993915558,
"logits/rejected": -0.29543009400367737,
"logps/chosen": -302.2158508300781,
"logps/rejected": -243.7989501953125,
"loss": -0.7249,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 1.112415075302124,
"rewards/margins": 8.864578247070312,
"rewards/rejected": -7.752162933349609,
"step": 470
},
{
"epoch": 0.4222222222222222,
"grad_norm": 9.705090522766113,
"learning_rate": 9.527701183909336e-06,
"logits/chosen": -0.255817174911499,
"logits/rejected": -0.30061060190200806,
"logps/chosen": -319.11309814453125,
"logps/rejected": -252.5963592529297,
"loss": -0.4242,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.3803827464580536,
"rewards/margins": 7.071600437164307,
"rewards/rejected": -7.4519829750061035,
"step": 475
},
{
"epoch": 0.4266666666666667,
"grad_norm": 10.644119262695312,
"learning_rate": 9.51776439939681e-06,
"logits/chosen": -0.24410729110240936,
"logits/rejected": -0.31472498178482056,
"logps/chosen": -326.570556640625,
"logps/rejected": -254.5313262939453,
"loss": -0.6844,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.5738657116889954,
"rewards/margins": 9.123512268066406,
"rewards/rejected": -8.549646377563477,
"step": 480
},
{
"epoch": 0.4311111111111111,
"grad_norm": 12.36355972290039,
"learning_rate": 9.507729462072615e-06,
"logits/chosen": -0.24467067420482635,
"logits/rejected": -0.3331097364425659,
"logps/chosen": -322.2784729003906,
"logps/rejected": -260.8439025878906,
"loss": -0.5093,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.3209023177623749,
"rewards/margins": 8.416958808898926,
"rewards/rejected": -8.09605598449707,
"step": 485
},
{
"epoch": 0.43555555555555553,
"grad_norm": 11.505626678466797,
"learning_rate": 9.4975965899555e-06,
"logits/chosen": -0.28976163268089294,
"logits/rejected": -0.3090762794017792,
"logps/chosen": -307.57489013671875,
"logps/rejected": -246.5322265625,
"loss": -0.4303,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.3077417612075806,
"rewards/margins": 6.817173004150391,
"rewards/rejected": -8.124914169311523,
"step": 490
},
{
"epoch": 0.44,
"grad_norm": 15.380836486816406,
"learning_rate": 9.48736600319193e-06,
"logits/chosen": -0.2653730809688568,
"logits/rejected": -0.2980864644050598,
"logps/chosen": -319.14019775390625,
"logps/rejected": -263.46063232421875,
"loss": -0.3595,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1717190742492676,
"rewards/margins": 7.247198581695557,
"rewards/rejected": -8.418917655944824,
"step": 495
},
{
"epoch": 0.4444444444444444,
"grad_norm": 7.199528694152832,
"learning_rate": 9.47703792405133e-06,
"logits/chosen": -0.2643812596797943,
"logits/rejected": -0.3057587146759033,
"logps/chosen": -305.49395751953125,
"logps/rejected": -250.5885467529297,
"loss": -0.7595,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -1.0399351119995117,
"rewards/margins": 8.203756332397461,
"rewards/rejected": -9.243691444396973,
"step": 500
},
{
"epoch": 0.4444444444444444,
"eval_logits/chosen": -0.2629312574863434,
"eval_logits/rejected": -0.30713388323783875,
"eval_logps/chosen": -313.8876953125,
"eval_logps/rejected": -254.3212127685547,
"eval_loss": -0.574113667011261,
"eval_rewards/accuracies": 0.828249990940094,
"eval_rewards/chosen": -1.1603001356124878,
"eval_rewards/margins": 8.141514778137207,
"eval_rewards/rejected": -9.301814079284668,
"eval_runtime": 2192.8697,
"eval_samples_per_second": 1.824,
"eval_steps_per_second": 0.912,
"step": 500
},
{
"epoch": 0.4488888888888889,
"grad_norm": 11.993717193603516,
"learning_rate": 9.466612576921223e-06,
"logits/chosen": -0.2699393332004547,
"logits/rejected": -0.3285272717475891,
"logps/chosen": -319.2097473144531,
"logps/rejected": -263.5858154296875,
"loss": -0.394,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1724228858947754,
"rewards/margins": 7.439938545227051,
"rewards/rejected": -8.612360954284668,
"step": 505
},
{
"epoch": 0.4533333333333333,
"grad_norm": 11.781710624694824,
"learning_rate": 9.456090188302389e-06,
"logits/chosen": -0.26111698150634766,
"logits/rejected": -0.28280287981033325,
"logps/chosen": -309.879638671875,
"logps/rejected": -263.6683654785156,
"loss": -0.6619,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2497330904006958,
"rewards/margins": 8.266626358032227,
"rewards/rejected": -9.516359329223633,
"step": 510
},
{
"epoch": 0.4577777777777778,
"grad_norm": 11.226400375366211,
"learning_rate": 9.445470986803922e-06,
"logits/chosen": -0.2626830041408539,
"logits/rejected": -0.3101075291633606,
"logps/chosen": -299.5857238769531,
"logps/rejected": -248.7251434326172,
"loss": -0.6192,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.528692603111267,
"rewards/margins": 8.301843643188477,
"rewards/rejected": -9.830536842346191,
"step": 515
},
{
"epoch": 0.4622222222222222,
"grad_norm": 8.692116737365723,
"learning_rate": 9.434755203138269e-06,
"logits/chosen": -0.27712422609329224,
"logits/rejected": -0.33624228835105896,
"logps/chosen": -341.59759521484375,
"logps/rejected": -257.9901123046875,
"loss": -0.5867,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3974745273590088,
"rewards/margins": 8.428323745727539,
"rewards/rejected": -9.825799942016602,
"step": 520
},
{
"epoch": 0.4666666666666667,
"grad_norm": 7.316524982452393,
"learning_rate": 9.423943070116219e-06,
"logits/chosen": -0.3034690320491791,
"logits/rejected": -0.3194289803504944,
"logps/chosen": -294.5498046875,
"logps/rejected": -247.59463500976562,
"loss": -0.5768,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -2.136320114135742,
"rewards/margins": 7.421705722808838,
"rewards/rejected": -9.558026313781738,
"step": 525
},
{
"epoch": 0.4711111111111111,
"grad_norm": 15.097468376159668,
"learning_rate": 9.413034822641845e-06,
"logits/chosen": -0.29432040452957153,
"logits/rejected": -0.3473649322986603,
"logps/chosen": -314.51007080078125,
"logps/rejected": -255.99453735351562,
"loss": -0.3577,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.039400815963745,
"rewards/margins": 7.780667304992676,
"rewards/rejected": -9.820066452026367,
"step": 530
},
{
"epoch": 0.47555555555555556,
"grad_norm": 10.856350898742676,
"learning_rate": 9.402030697707398e-06,
"logits/chosen": -0.27809661626815796,
"logits/rejected": -0.3084755539894104,
"logps/chosen": -290.03839111328125,
"logps/rejected": -246.6155242919922,
"loss": -0.7648,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.642029047012329,
"rewards/margins": 8.514683723449707,
"rewards/rejected": -10.156713485717773,
"step": 535
},
{
"epoch": 0.48,
"grad_norm": 10.466115951538086,
"learning_rate": 9.390930934388164e-06,
"logits/chosen": -0.25123220682144165,
"logits/rejected": -0.28659194707870483,
"logps/chosen": -310.38702392578125,
"logps/rejected": -264.1778869628906,
"loss": -0.6251,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.053196430206299,
"rewards/margins": 8.635331153869629,
"rewards/rejected": -10.68852710723877,
"step": 540
},
{
"epoch": 0.48444444444444446,
"grad_norm": 14.518105506896973,
"learning_rate": 9.37973577383726e-06,
"logits/chosen": -0.2105627954006195,
"logits/rejected": -0.2725834250450134,
"logps/chosen": -309.6773681640625,
"logps/rejected": -247.98507690429688,
"loss": -0.7806,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.5902966260910034,
"rewards/margins": 9.257573127746582,
"rewards/rejected": -10.847868919372559,
"step": 545
},
{
"epoch": 0.4888888888888889,
"grad_norm": 9.231803894042969,
"learning_rate": 9.368445459280405e-06,
"logits/chosen": -0.26593995094299316,
"logits/rejected": -0.28871750831604004,
"logps/chosen": -315.306884765625,
"logps/rejected": -262.83197021484375,
"loss": -0.5635,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.177497148513794,
"rewards/margins": 7.756557464599609,
"rewards/rejected": -10.934054374694824,
"step": 550
},
{
"epoch": 0.49333333333333335,
"grad_norm": 9.906770706176758,
"learning_rate": 9.357060236010626e-06,
"logits/chosen": -0.25906693935394287,
"logits/rejected": -0.32186049222946167,
"logps/chosen": -326.8785095214844,
"logps/rejected": -273.9308166503906,
"loss": -0.6905,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.1445183753967285,
"rewards/margins": 9.113534927368164,
"rewards/rejected": -11.258054733276367,
"step": 555
},
{
"epoch": 0.49777777777777776,
"grad_norm": 7.245250225067139,
"learning_rate": 9.345580351382939e-06,
"logits/chosen": -0.2802310585975647,
"logits/rejected": -0.2841408848762512,
"logps/chosen": -293.36065673828125,
"logps/rejected": -265.8360900878906,
"loss": -0.7463,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -3.965707778930664,
"rewards/margins": 9.175572395324707,
"rewards/rejected": -13.141279220581055,
"step": 560
},
{
"epoch": 0.5022222222222222,
"grad_norm": 11.543607711791992,
"learning_rate": 9.334006054808966e-06,
"logits/chosen": -0.2962619960308075,
"logits/rejected": -0.3181178569793701,
"logps/chosen": -326.261962890625,
"logps/rejected": -281.9925231933594,
"loss": -0.5012,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -4.311570644378662,
"rewards/margins": 8.469701766967773,
"rewards/rejected": -12.781272888183594,
"step": 565
},
{
"epoch": 0.5066666666666667,
"grad_norm": 18.13285255432129,
"learning_rate": 9.322337597751525e-06,
"logits/chosen": -0.29192933440208435,
"logits/rejected": -0.32068902254104614,
"logps/chosen": -320.00146484375,
"logps/rejected": -267.2696838378906,
"loss": 0.0417,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -5.439437389373779,
"rewards/margins": 6.237511157989502,
"rewards/rejected": -11.676947593688965,
"step": 570
},
{
"epoch": 0.5111111111111111,
"grad_norm": 15.025782585144043,
"learning_rate": 9.310575233719155e-06,
"logits/chosen": -0.2575679421424866,
"logits/rejected": -0.28944242000579834,
"logps/chosen": -312.4665222167969,
"logps/rejected": -266.6491394042969,
"loss": -0.5652,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.7938873767852783,
"rewards/margins": 8.282114028930664,
"rewards/rejected": -11.07600212097168,
"step": 575
},
{
"epoch": 0.5155555555555555,
"grad_norm": 10.936811447143555,
"learning_rate": 9.29871921826062e-06,
"logits/chosen": -0.2927904725074768,
"logits/rejected": -0.35370174050331116,
"logps/chosen": -321.13885498046875,
"logps/rejected": -268.1432800292969,
"loss": -0.2429,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.02617883682251,
"rewards/margins": 8.042525291442871,
"rewards/rejected": -12.068704605102539,
"step": 580
},
{
"epoch": 0.52,
"grad_norm": 12.064530372619629,
"learning_rate": 9.28676980895935e-06,
"logits/chosen": -0.24123439192771912,
"logits/rejected": -0.2889128625392914,
"logps/chosen": -308.1340637207031,
"logps/rejected": -256.29632568359375,
"loss": -0.8401,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.284236431121826,
"rewards/margins": 10.122550010681152,
"rewards/rejected": -13.40678596496582,
"step": 585
},
{
"epoch": 0.5244444444444445,
"grad_norm": 13.778730392456055,
"learning_rate": 9.274727265427849e-06,
"logits/chosen": -0.2769649922847748,
"logits/rejected": -0.31647247076034546,
"logps/chosen": -306.0679626464844,
"logps/rejected": -254.93179321289062,
"loss": -0.7553,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -4.035617828369141,
"rewards/margins": 9.203435897827148,
"rewards/rejected": -13.239053726196289,
"step": 590
},
{
"epoch": 0.5288888888888889,
"grad_norm": 8.471962928771973,
"learning_rate": 9.262591849302049e-06,
"logits/chosen": -0.2713521122932434,
"logits/rejected": -0.3014729619026184,
"logps/chosen": -299.43475341796875,
"logps/rejected": -266.22686767578125,
"loss": -0.4191,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.5517988204956055,
"rewards/margins": 8.98070240020752,
"rewards/rejected": -13.532503128051758,
"step": 595
},
{
"epoch": 0.5333333333333333,
"grad_norm": 12.954193115234375,
"learning_rate": 9.250363824235629e-06,
"logits/chosen": -0.2955438494682312,
"logits/rejected": -0.3413962721824646,
"logps/chosen": -318.26068115234375,
"logps/rejected": -256.75689697265625,
"loss": -0.5025,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.381830930709839,
"rewards/margins": 9.101526260375977,
"rewards/rejected": -12.483358383178711,
"step": 600
},
{
"epoch": 0.5377777777777778,
"grad_norm": 10.307146072387695,
"learning_rate": 9.238043455894294e-06,
"logits/chosen": -0.27938082814216614,
"logits/rejected": -0.3281027674674988,
"logps/chosen": -327.4522399902344,
"logps/rejected": -257.4443054199219,
"loss": -0.7464,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.192465305328369,
"rewards/margins": 8.572819709777832,
"rewards/rejected": -12.765284538269043,
"step": 605
},
{
"epoch": 0.5422222222222223,
"grad_norm": 8.460762023925781,
"learning_rate": 9.225631011949987e-06,
"logits/chosen": -0.2906576991081238,
"logits/rejected": -0.32649320363998413,
"logps/chosen": -341.95928955078125,
"logps/rejected": -275.3990783691406,
"loss": -0.9454,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.693796396255493,
"rewards/margins": 10.737831115722656,
"rewards/rejected": -13.43162727355957,
"step": 610
},
{
"epoch": 0.5466666666666666,
"grad_norm": 18.260358810424805,
"learning_rate": 9.213126762075088e-06,
"logits/chosen": -0.3098008632659912,
"logits/rejected": -0.3394979238510132,
"logps/chosen": -306.67449951171875,
"logps/rejected": -263.02349853515625,
"loss": -0.5332,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -4.892157554626465,
"rewards/margins": 9.199603080749512,
"rewards/rejected": -14.091761589050293,
"step": 615
},
{
"epoch": 0.5511111111111111,
"grad_norm": 12.752532005310059,
"learning_rate": 9.200530977936551e-06,
"logits/chosen": -0.3172837793827057,
"logits/rejected": -0.3619407117366791,
"logps/chosen": -349.4862976074219,
"logps/rejected": -279.5086669921875,
"loss": -0.6241,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.374999046325684,
"rewards/margins": 9.857782363891602,
"rewards/rejected": -14.232782363891602,
"step": 620
},
{
"epoch": 0.5555555555555556,
"grad_norm": 15.203842163085938,
"learning_rate": 9.187843933189994e-06,
"logits/chosen": -0.28893885016441345,
"logits/rejected": -0.3426817059516907,
"logps/chosen": -328.72979736328125,
"logps/rejected": -271.71099853515625,
"loss": -0.8634,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.5763354301452637,
"rewards/margins": 11.408061027526855,
"rewards/rejected": -14.984395980834961,
"step": 625
},
{
"epoch": 0.56,
"grad_norm": 10.816106796264648,
"learning_rate": 9.175065903473769e-06,
"logits/chosen": -0.2791399657726288,
"logits/rejected": -0.2996821403503418,
"logps/chosen": -321.34771728515625,
"logps/rejected": -280.1087341308594,
"loss": -0.5511,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.1469244956970215,
"rewards/margins": 9.314436912536621,
"rewards/rejected": -14.4613618850708,
"step": 630
},
{
"epoch": 0.5644444444444444,
"grad_norm": 8.36387825012207,
"learning_rate": 9.162197166402957e-06,
"logits/chosen": -0.29182273149490356,
"logits/rejected": -0.32408252358436584,
"logps/chosen": -316.68487548828125,
"logps/rejected": -270.219482421875,
"loss": -1.0203,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -4.753455638885498,
"rewards/margins": 10.925695419311523,
"rewards/rejected": -15.679153442382812,
"step": 635
},
{
"epoch": 0.5688888888888889,
"grad_norm": 11.839485168457031,
"learning_rate": 9.149238001563348e-06,
"logits/chosen": -0.30801886320114136,
"logits/rejected": -0.329951673746109,
"logps/chosen": -312.73577880859375,
"logps/rejected": -267.54437255859375,
"loss": -0.5904,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -6.468144416809082,
"rewards/margins": 8.780177116394043,
"rewards/rejected": -15.248323440551758,
"step": 640
},
{
"epoch": 0.5733333333333334,
"grad_norm": 16.650188446044922,
"learning_rate": 9.136188690505363e-06,
"logits/chosen": -0.2637523412704468,
"logits/rejected": -0.31697210669517517,
"logps/chosen": -331.0145568847656,
"logps/rejected": -271.3872985839844,
"loss": -0.5915,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.373086452484131,
"rewards/margins": 8.960186958312988,
"rewards/rejected": -14.333274841308594,
"step": 645
},
{
"epoch": 0.5777777777777777,
"grad_norm": 18.575305938720703,
"learning_rate": 9.123049516737936e-06,
"logits/chosen": -0.3117810785770416,
"logits/rejected": -0.3635488450527191,
"logps/chosen": -328.19989013671875,
"logps/rejected": -272.4068603515625,
"loss": -0.6851,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -5.281620979309082,
"rewards/margins": 10.281137466430664,
"rewards/rejected": -15.562756538391113,
"step": 650
},
{
"epoch": 0.5822222222222222,
"grad_norm": 16.429004669189453,
"learning_rate": 9.109820765722357e-06,
"logits/chosen": -0.27543455362319946,
"logits/rejected": -0.31441715359687805,
"logps/chosen": -335.43609619140625,
"logps/rejected": -286.1941833496094,
"loss": -0.8026,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.2430877685546875,
"rewards/margins": 10.86163330078125,
"rewards/rejected": -16.10472297668457,
"step": 655
},
{
"epoch": 0.5866666666666667,
"grad_norm": 9.399615287780762,
"learning_rate": 9.096502724866067e-06,
"logits/chosen": -0.3014602065086365,
"logits/rejected": -0.35103824734687805,
"logps/chosen": -359.2838439941406,
"logps/rejected": -294.8686828613281,
"loss": -1.0671,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -4.225809574127197,
"rewards/margins": 11.496713638305664,
"rewards/rejected": -15.722521781921387,
"step": 660
},
{
"epoch": 0.5911111111111111,
"grad_norm": 14.605314254760742,
"learning_rate": 9.083095683516414e-06,
"logits/chosen": -0.28259098529815674,
"logits/rejected": -0.32325831055641174,
"logps/chosen": -350.68878173828125,
"logps/rejected": -277.5746154785156,
"loss": -0.9032,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -6.993232727050781,
"rewards/margins": 10.017694473266602,
"rewards/rejected": -17.010927200317383,
"step": 665
},
{
"epoch": 0.5955555555555555,
"grad_norm": 17.158226013183594,
"learning_rate": 9.069599932954371e-06,
"logits/chosen": -0.29114705324172974,
"logits/rejected": -0.3473047912120819,
"logps/chosen": -339.568603515625,
"logps/rejected": -287.64666748046875,
"loss": -0.5308,
"rewards/accuracies": 0.78125,
"rewards/chosen": -8.217833518981934,
"rewards/margins": 8.741876602172852,
"rewards/rejected": -16.9597110748291,
"step": 670
},
{
"epoch": 0.6,
"grad_norm": 16.861963272094727,
"learning_rate": 9.056015766388205e-06,
"logits/chosen": -0.3223651945590973,
"logits/rejected": -0.36349570751190186,
"logps/chosen": -334.6025695800781,
"logps/rejected": -285.73828125,
"loss": -0.7831,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -7.5147528648376465,
"rewards/margins": 10.620495796203613,
"rewards/rejected": -18.1352481842041,
"step": 675
},
{
"epoch": 0.6044444444444445,
"grad_norm": 14.257497787475586,
"learning_rate": 9.042343478947103e-06,
"logits/chosen": -0.3066635727882385,
"logits/rejected": -0.32420462369918823,
"logps/chosen": -337.81097412109375,
"logps/rejected": -286.6188049316406,
"loss": -0.9349,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -6.352471351623535,
"rewards/margins": 11.51642894744873,
"rewards/rejected": -17.868900299072266,
"step": 680
},
{
"epoch": 0.6088888888888889,
"grad_norm": 16.795236587524414,
"learning_rate": 9.028583367674767e-06,
"logits/chosen": -0.34612902998924255,
"logits/rejected": -0.36396104097366333,
"logps/chosen": -332.6644287109375,
"logps/rejected": -287.1361083984375,
"loss": -0.9376,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -5.387200355529785,
"rewards/margins": 11.706459999084473,
"rewards/rejected": -17.093660354614258,
"step": 685
},
{
"epoch": 0.6133333333333333,
"grad_norm": 11.827564239501953,
"learning_rate": 9.014735731522952e-06,
"logits/chosen": -0.3044932782649994,
"logits/rejected": -0.33501502871513367,
"logps/chosen": -331.37835693359375,
"logps/rejected": -279.79718017578125,
"loss": -1.0226,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -5.231140613555908,
"rewards/margins": 12.324139595031738,
"rewards/rejected": -17.555278778076172,
"step": 690
},
{
"epoch": 0.6177777777777778,
"grad_norm": 13.102601051330566,
"learning_rate": 9.00080087134498e-06,
"logits/chosen": -0.31660374999046326,
"logits/rejected": -0.3677740693092346,
"logps/chosen": -340.77081298828125,
"logps/rejected": -275.3927001953125,
"loss": -0.8796,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.748688220977783,
"rewards/margins": 11.431347846984863,
"rewards/rejected": -17.180036544799805,
"step": 695
},
{
"epoch": 0.6222222222222222,
"grad_norm": 28.49679946899414,
"learning_rate": 8.9867790898892e-06,
"logits/chosen": -0.3224649131298065,
"logits/rejected": -0.3925584852695465,
"logps/chosen": -349.48919677734375,
"logps/rejected": -279.79119873046875,
"loss": -0.8641,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -6.573420524597168,
"rewards/margins": 11.63664436340332,
"rewards/rejected": -18.210065841674805,
"step": 700
},
{
"epoch": 0.6266666666666667,
"grad_norm": 17.276578903198242,
"learning_rate": 8.972670691792409e-06,
"logits/chosen": -0.3031178414821625,
"logits/rejected": -0.347816526889801,
"logps/chosen": -332.0860900878906,
"logps/rejected": -278.52264404296875,
"loss": -0.6865,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -7.202242851257324,
"rewards/margins": 10.682793617248535,
"rewards/rejected": -17.88503646850586,
"step": 705
},
{
"epoch": 0.6311111111111111,
"grad_norm": 13.52579402923584,
"learning_rate": 8.958475983573234e-06,
"logits/chosen": -0.3044522702693939,
"logits/rejected": -0.34488362073898315,
"logps/chosen": -337.5535583496094,
"logps/rejected": -295.95428466796875,
"loss": -0.9102,
"rewards/accuracies": 0.84375,
"rewards/chosen": -6.68551778793335,
"rewards/margins": 11.271787643432617,
"rewards/rejected": -17.957305908203125,
"step": 710
},
{
"epoch": 0.6355555555555555,
"grad_norm": 12.732015609741211,
"learning_rate": 8.944195273625472e-06,
"logits/chosen": -0.2973068356513977,
"logits/rejected": -0.35419678688049316,
"logps/chosen": -318.517822265625,
"logps/rejected": -271.3919677734375,
"loss": -0.8689,
"rewards/accuracies": 0.84375,
"rewards/chosen": -7.6208038330078125,
"rewards/margins": 10.748211860656738,
"rewards/rejected": -18.369014739990234,
"step": 715
},
{
"epoch": 0.64,
"grad_norm": 15.25625991821289,
"learning_rate": 8.92982887221139e-06,
"logits/chosen": -0.3091279864311218,
"logits/rejected": -0.3462229073047638,
"logps/chosen": -345.78131103515625,
"logps/rejected": -292.4202575683594,
"loss": -0.4038,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -9.038877487182617,
"rewards/margins": 10.578144073486328,
"rewards/rejected": -19.617021560668945,
"step": 720
},
{
"epoch": 0.6444444444444445,
"grad_norm": 8.678261756896973,
"learning_rate": 8.915377091454992e-06,
"logits/chosen": -0.2622337341308594,
"logits/rejected": -0.3454502820968628,
"logps/chosen": -330.6527404785156,
"logps/rejected": -271.9297790527344,
"loss": -0.9941,
"rewards/accuracies": 0.84375,
"rewards/chosen": -6.34903621673584,
"rewards/margins": 11.77011775970459,
"rewards/rejected": -18.119152069091797,
"step": 725
},
{
"epoch": 0.6488888888888888,
"grad_norm": 18.099544525146484,
"learning_rate": 8.900840245335225e-06,
"logits/chosen": -0.29967910051345825,
"logits/rejected": -0.3550174832344055,
"logps/chosen": -339.3348388671875,
"logps/rejected": -284.2086181640625,
"loss": -0.6141,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -8.690653800964355,
"rewards/margins": 10.278780937194824,
"rewards/rejected": -18.969436645507812,
"step": 730
},
{
"epoch": 0.6533333333333333,
"grad_norm": 17.058128356933594,
"learning_rate": 8.886218649679162e-06,
"logits/chosen": -0.30947160720825195,
"logits/rejected": -0.3345088064670563,
"logps/chosen": -320.3451232910156,
"logps/rejected": -277.7720031738281,
"loss": -1.014,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -8.53862190246582,
"rewards/margins": 12.284268379211426,
"rewards/rejected": -20.822891235351562,
"step": 735
},
{
"epoch": 0.6577777777777778,
"grad_norm": 13.636448860168457,
"learning_rate": 8.871512622155147e-06,
"logits/chosen": -0.2878524363040924,
"logits/rejected": -0.3395880162715912,
"logps/chosen": -362.52899169921875,
"logps/rejected": -303.67620849609375,
"loss": -0.9753,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.569745063781738,
"rewards/margins": 12.254476547241211,
"rewards/rejected": -20.824222564697266,
"step": 740
},
{
"epoch": 0.6622222222222223,
"grad_norm": 13.943758964538574,
"learning_rate": 8.856722482265886e-06,
"logits/chosen": -0.2777239978313446,
"logits/rejected": -0.2970428466796875,
"logps/chosen": -317.4947509765625,
"logps/rejected": -292.8834533691406,
"loss": -0.952,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.092373847961426,
"rewards/margins": 11.070058822631836,
"rewards/rejected": -20.162433624267578,
"step": 745
},
{
"epoch": 0.6666666666666666,
"grad_norm": 10.792975425720215,
"learning_rate": 8.841848551341506e-06,
"logits/chosen": -0.300568550825119,
"logits/rejected": -0.35186997056007385,
"logps/chosen": -342.993408203125,
"logps/rejected": -283.03594970703125,
"loss": -0.9528,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.736288070678711,
"rewards/margins": 11.417104721069336,
"rewards/rejected": -20.153392791748047,
"step": 750
},
{
"epoch": 0.6711111111111111,
"grad_norm": 14.272412300109863,
"learning_rate": 8.826891152532579e-06,
"logits/chosen": -0.24646346271038055,
"logits/rejected": -0.3181930184364319,
"logps/chosen": -330.8204040527344,
"logps/rejected": -290.7005310058594,
"loss": -0.8532,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.7890625,
"rewards/margins": 12.464083671569824,
"rewards/rejected": -21.25314712524414,
"step": 755
},
{
"epoch": 0.6755555555555556,
"grad_norm": 18.93645477294922,
"learning_rate": 8.811850610803094e-06,
"logits/chosen": -0.2739986181259155,
"logits/rejected": -0.3238711357116699,
"logps/chosen": -351.69171142578125,
"logps/rejected": -305.0896301269531,
"loss": -0.9423,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.920026779174805,
"rewards/margins": 12.025522232055664,
"rewards/rejected": -20.945547103881836,
"step": 760
},
{
"epoch": 0.68,
"grad_norm": 18.738351821899414,
"learning_rate": 8.796727252923403e-06,
"logits/chosen": -0.31761056184768677,
"logits/rejected": -0.3448847532272339,
"logps/chosen": -331.40557861328125,
"logps/rejected": -296.1661071777344,
"loss": -0.5371,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -10.65619945526123,
"rewards/margins": 11.044633865356445,
"rewards/rejected": -21.700834274291992,
"step": 765
},
{
"epoch": 0.6844444444444444,
"grad_norm": 17.68587875366211,
"learning_rate": 8.781521407463119e-06,
"logits/chosen": -0.29655805230140686,
"logits/rejected": -0.35701996088027954,
"logps/chosen": -353.18145751953125,
"logps/rejected": -300.9978942871094,
"loss": -1.0413,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -8.621820449829102,
"rewards/margins": 13.096748352050781,
"rewards/rejected": -21.718570709228516,
"step": 770
},
{
"epoch": 0.6888888888888889,
"grad_norm": 8.765332221984863,
"learning_rate": 8.766233404783975e-06,
"logits/chosen": -0.33828821778297424,
"logits/rejected": -0.35216769576072693,
"logps/chosen": -349.8067626953125,
"logps/rejected": -313.57818603515625,
"loss": -0.856,
"rewards/accuracies": 0.8125,
"rewards/chosen": -9.512666702270508,
"rewards/margins": 12.066935539245605,
"rewards/rejected": -21.57960319519043,
"step": 775
},
{
"epoch": 0.6933333333333334,
"grad_norm": 20.219892501831055,
"learning_rate": 8.750863577032652e-06,
"logits/chosen": -0.3195672929286957,
"logits/rejected": -0.3713618218898773,
"logps/chosen": -358.82037353515625,
"logps/rejected": -303.29571533203125,
"loss": -1.2232,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.787614822387695,
"rewards/margins": 13.660125732421875,
"rewards/rejected": -23.447738647460938,
"step": 780
},
{
"epoch": 0.6977777777777778,
"grad_norm": 14.765336036682129,
"learning_rate": 8.735412258133562e-06,
"logits/chosen": -0.3235880136489868,
"logits/rejected": -0.36679068207740784,
"logps/chosen": -354.3951416015625,
"logps/rejected": -299.1098937988281,
"loss": -0.8905,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -10.815263748168945,
"rewards/margins": 12.30003547668457,
"rewards/rejected": -23.115299224853516,
"step": 785
},
{
"epoch": 0.7022222222222222,
"grad_norm": 13.638387680053711,
"learning_rate": 8.719879783781585e-06,
"logits/chosen": -0.33130335807800293,
"logits/rejected": -0.3633490204811096,
"logps/chosen": -351.318603515625,
"logps/rejected": -302.3772888183594,
"loss": -1.0524,
"rewards/accuracies": 0.84375,
"rewards/chosen": -9.303329467773438,
"rewards/margins": 13.00804615020752,
"rewards/rejected": -22.31137466430664,
"step": 790
},
{
"epoch": 0.7066666666666667,
"grad_norm": 10.990659713745117,
"learning_rate": 8.704266491434787e-06,
"logits/chosen": -0.3289201259613037,
"logits/rejected": -0.36471107602119446,
"logps/chosen": -330.88763427734375,
"logps/rejected": -286.8157958984375,
"loss": -0.6598,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -10.67319107055664,
"rewards/margins": 11.411112785339355,
"rewards/rejected": -22.084304809570312,
"step": 795
},
{
"epoch": 0.7111111111111111,
"grad_norm": 12.232583045959473,
"learning_rate": 8.688572720307083e-06,
"logits/chosen": -0.3058468997478485,
"logits/rejected": -0.3826626241207123,
"logps/chosen": -360.0539855957031,
"logps/rejected": -302.0265808105469,
"loss": -0.9798,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -9.977747917175293,
"rewards/margins": 13.34874153137207,
"rewards/rejected": -23.326488494873047,
"step": 800
},
{
"epoch": 0.7155555555555555,
"grad_norm": 28.097835540771484,
"learning_rate": 8.672798811360863e-06,
"logits/chosen": -0.3440350890159607,
"logits/rejected": -0.3669665455818176,
"logps/chosen": -348.4472351074219,
"logps/rejected": -317.9486083984375,
"loss": -1.158,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -9.318201065063477,
"rewards/margins": 13.85308837890625,
"rewards/rejected": -23.17129135131836,
"step": 805
},
{
"epoch": 0.72,
"grad_norm": 13.566072463989258,
"learning_rate": 8.656945107299598e-06,
"logits/chosen": -0.32617539167404175,
"logits/rejected": -0.3627128601074219,
"logps/chosen": -350.51495361328125,
"logps/rejected": -313.4057922363281,
"loss": -1.1211,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -9.836647987365723,
"rewards/margins": 13.414007186889648,
"rewards/rejected": -23.250656127929688,
"step": 810
},
{
"epoch": 0.7244444444444444,
"grad_norm": 10.32918930053711,
"learning_rate": 8.641011952560372e-06,
"logits/chosen": -0.3140029311180115,
"logits/rejected": -0.3582364618778229,
"logps/chosen": -333.9091796875,
"logps/rejected": -282.69061279296875,
"loss": -0.8948,
"rewards/accuracies": 0.84375,
"rewards/chosen": -10.822344779968262,
"rewards/margins": 11.024767875671387,
"rewards/rejected": -21.84711265563965,
"step": 815
},
{
"epoch": 0.7288888888888889,
"grad_norm": 14.581253051757812,
"learning_rate": 8.624999693306422e-06,
"logits/chosen": -0.33729246258735657,
"logits/rejected": -0.3753616213798523,
"logps/chosen": -342.4247741699219,
"logps/rejected": -309.2422790527344,
"loss": -0.9419,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -11.228838920593262,
"rewards/margins": 13.112091064453125,
"rewards/rejected": -24.34092903137207,
"step": 820
},
{
"epoch": 0.7333333333333333,
"grad_norm": 11.523558616638184,
"learning_rate": 8.608908677419606e-06,
"logits/chosen": -0.37991100549697876,
"logits/rejected": -0.40186434984207153,
"logps/chosen": -348.12396240234375,
"logps/rejected": -309.076171875,
"loss": -1.0504,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -12.649127006530762,
"rewards/margins": 12.755289077758789,
"rewards/rejected": -25.404415130615234,
"step": 825
},
{
"epoch": 0.7377777777777778,
"grad_norm": 17.81552505493164,
"learning_rate": 8.592739254492845e-06,
"logits/chosen": -0.362493097782135,
"logits/rejected": -0.4177095293998718,
"logps/chosen": -335.11981201171875,
"logps/rejected": -290.92218017578125,
"loss": -1.1041,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -11.677441596984863,
"rewards/margins": 13.119203567504883,
"rewards/rejected": -24.79664421081543,
"step": 830
},
{
"epoch": 0.7422222222222222,
"grad_norm": 20.329221725463867,
"learning_rate": 8.576491775822527e-06,
"logits/chosen": -0.33437713980674744,
"logits/rejected": -0.39904457330703735,
"logps/chosen": -357.16943359375,
"logps/rejected": -297.7870178222656,
"loss": -0.724,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -11.969806671142578,
"rewards/margins": 12.316507339477539,
"rewards/rejected": -24.28631591796875,
"step": 835
},
{
"epoch": 0.7466666666666667,
"grad_norm": 19.302101135253906,
"learning_rate": 8.560166594400878e-06,
"logits/chosen": -0.3832574486732483,
"logits/rejected": -0.44351863861083984,
"logps/chosen": -352.62115478515625,
"logps/rejected": -304.46124267578125,
"loss": -0.6363,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -12.629277229309082,
"rewards/margins": 11.651094436645508,
"rewards/rejected": -24.280370712280273,
"step": 840
},
{
"epoch": 0.7511111111111111,
"grad_norm": 14.173489570617676,
"learning_rate": 8.543764064908295e-06,
"logits/chosen": -0.34056347608566284,
"logits/rejected": -0.39399194717407227,
"logps/chosen": -340.8840026855469,
"logps/rejected": -307.18603515625,
"loss": -1.2865,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -11.463663101196289,
"rewards/margins": 13.927221298217773,
"rewards/rejected": -25.390884399414062,
"step": 845
},
{
"epoch": 0.7555555555555555,
"grad_norm": 22.398326873779297,
"learning_rate": 8.527284543705631e-06,
"logits/chosen": -0.37620821595191956,
"logits/rejected": -0.4051085412502289,
"logps/chosen": -341.5446472167969,
"logps/rejected": -314.3455810546875,
"loss": -1.1236,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -11.881677627563477,
"rewards/margins": 14.11926555633545,
"rewards/rejected": -26.00094223022461,
"step": 850
},
{
"epoch": 0.76,
"grad_norm": 20.017797470092773,
"learning_rate": 8.510728388826464e-06,
"logits/chosen": -0.33530497550964355,
"logits/rejected": -0.3962380290031433,
"logps/chosen": -341.4028625488281,
"logps/rejected": -305.88824462890625,
"loss": -1.5163,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -10.922323226928711,
"rewards/margins": 15.069772720336914,
"rewards/rejected": -25.992095947265625,
"step": 855
},
{
"epoch": 0.7644444444444445,
"grad_norm": 13.252291679382324,
"learning_rate": 8.494095959969309e-06,
"logits/chosen": -0.34795650839805603,
"logits/rejected": -0.40874728560447693,
"logps/chosen": -348.9808654785156,
"logps/rejected": -308.14349365234375,
"loss": -0.9905,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -13.379107475280762,
"rewards/margins": 12.994186401367188,
"rewards/rejected": -26.373294830322266,
"step": 860
},
{
"epoch": 0.7688888888888888,
"grad_norm": 18.38790512084961,
"learning_rate": 8.477387618489808e-06,
"logits/chosen": -0.3455773890018463,
"logits/rejected": -0.40834465622901917,
"logps/chosen": -343.2769470214844,
"logps/rejected": -297.47784423828125,
"loss": -1.4511,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -10.266695022583008,
"rewards/margins": 15.304100036621094,
"rewards/rejected": -25.5707950592041,
"step": 865
},
{
"epoch": 0.7733333333333333,
"grad_norm": 21.267852783203125,
"learning_rate": 8.460603727392877e-06,
"logits/chosen": -0.35729557275772095,
"logits/rejected": -0.3905247449874878,
"logps/chosen": -370.48577880859375,
"logps/rejected": -324.64532470703125,
"loss": -1.1358,
"rewards/accuracies": 0.8125,
"rewards/chosen": -12.016322135925293,
"rewards/margins": 15.51159381866455,
"rewards/rejected": -27.527912139892578,
"step": 870
},
{
"epoch": 0.7777777777777778,
"grad_norm": 20.68170166015625,
"learning_rate": 8.443744651324828e-06,
"logits/chosen": -0.3603067994117737,
"logits/rejected": -0.40933218598365784,
"logps/chosen": -356.02154541015625,
"logps/rejected": -310.26666259765625,
"loss": -1.0198,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -13.34446907043457,
"rewards/margins": 13.9938325881958,
"rewards/rejected": -27.338302612304688,
"step": 875
},
{
"epoch": 0.7822222222222223,
"grad_norm": 14.795793533325195,
"learning_rate": 8.426810756565428e-06,
"logits/chosen": -0.3585900664329529,
"logits/rejected": -0.42686209082603455,
"logps/chosen": -368.9267272949219,
"logps/rejected": -311.18023681640625,
"loss": -1.5537,
"rewards/accuracies": 0.875,
"rewards/chosen": -13.114725112915039,
"rewards/margins": 16.6258487701416,
"rewards/rejected": -29.74057388305664,
"step": 880
},
{
"epoch": 0.7866666666666666,
"grad_norm": 19.962947845458984,
"learning_rate": 8.409802411019962e-06,
"logits/chosen": -0.347336083650589,
"logits/rejected": -0.4067932665348053,
"logps/chosen": -343.19158935546875,
"logps/rejected": -304.2242126464844,
"loss": -1.3862,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -13.927999496459961,
"rewards/margins": 15.6153564453125,
"rewards/rejected": -29.54335594177246,
"step": 885
},
{
"epoch": 0.7911111111111111,
"grad_norm": 16.915666580200195,
"learning_rate": 8.392719984211228e-06,
"logits/chosen": -0.36178287863731384,
"logits/rejected": -0.42369580268859863,
"logps/chosen": -363.2778625488281,
"logps/rejected": -314.5802001953125,
"loss": -1.3641,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -14.133458137512207,
"rewards/margins": 13.37634563446045,
"rewards/rejected": -27.50980567932129,
"step": 890
},
{
"epoch": 0.7955555555555556,
"grad_norm": 16.34973907470703,
"learning_rate": 8.375563847271506e-06,
"logits/chosen": -0.3902398645877838,
"logits/rejected": -0.4178919792175293,
"logps/chosen": -354.6260070800781,
"logps/rejected": -320.308837890625,
"loss": -1.296,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.261367797851562,
"rewards/margins": 15.514287948608398,
"rewards/rejected": -29.77565574645996,
"step": 895
},
{
"epoch": 0.8,
"grad_norm": 20.18850326538086,
"learning_rate": 8.35833437293451e-06,
"logits/chosen": -0.3586779534816742,
"logits/rejected": -0.3966183066368103,
"logps/chosen": -353.3863830566406,
"logps/rejected": -317.9190979003906,
"loss": -1.0465,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -13.865551948547363,
"rewards/margins": 14.156455993652344,
"rewards/rejected": -28.02200698852539,
"step": 900
},
{
"epoch": 0.8044444444444444,
"grad_norm": 18.74745750427246,
"learning_rate": 8.341031935527267e-06,
"logits/chosen": -0.35274258255958557,
"logits/rejected": -0.4157370626926422,
"logps/chosen": -365.7769470214844,
"logps/rejected": -320.2703552246094,
"loss": -1.0852,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.110136032104492,
"rewards/margins": 15.161088943481445,
"rewards/rejected": -29.271224975585938,
"step": 905
},
{
"epoch": 0.8088888888888889,
"grad_norm": 14.005874633789062,
"learning_rate": 8.323656910962011e-06,
"logits/chosen": -0.40306010842323303,
"logits/rejected": -0.44573473930358887,
"logps/chosen": -346.27105712890625,
"logps/rejected": -315.6506042480469,
"loss": -1.4107,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -13.054827690124512,
"rewards/margins": 15.541888236999512,
"rewards/rejected": -28.596715927124023,
"step": 910
},
{
"epoch": 0.8133333333333334,
"grad_norm": 25.924898147583008,
"learning_rate": 8.306209676727994e-06,
"logits/chosen": -0.3658706545829773,
"logits/rejected": -0.4349114000797272,
"logps/chosen": -358.9135437011719,
"logps/rejected": -326.36090087890625,
"loss": -1.4081,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -12.446220397949219,
"rewards/margins": 17.219030380249023,
"rewards/rejected": -29.66524887084961,
"step": 915
},
{
"epoch": 0.8177777777777778,
"grad_norm": 20.495826721191406,
"learning_rate": 8.288690611883296e-06,
"logits/chosen": -0.39841917157173157,
"logits/rejected": -0.4497374892234802,
"logps/chosen": -353.8162841796875,
"logps/rejected": -313.1631774902344,
"loss": -1.556,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -13.111363410949707,
"rewards/margins": 17.558149337768555,
"rewards/rejected": -30.669513702392578,
"step": 920
},
{
"epoch": 0.8222222222222222,
"grad_norm": 24.87085723876953,
"learning_rate": 8.271100097046585e-06,
"logits/chosen": -0.3760126233100891,
"logits/rejected": -0.42560848593711853,
"logps/chosen": -350.9206237792969,
"logps/rejected": -320.97637939453125,
"loss": -1.4347,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -13.494958877563477,
"rewards/margins": 16.944490432739258,
"rewards/rejected": -30.439449310302734,
"step": 925
},
{
"epoch": 0.8266666666666667,
"grad_norm": 20.46843910217285,
"learning_rate": 8.25343851438885e-06,
"logits/chosen": -0.4249737858772278,
"logits/rejected": -0.4788896441459656,
"logps/chosen": -361.0630187988281,
"logps/rejected": -319.73138427734375,
"loss": -1.3017,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -14.13292407989502,
"rewards/margins": 17.0448055267334,
"rewards/rejected": -31.1777286529541,
"step": 930
},
{
"epoch": 0.8311111111111111,
"grad_norm": 22.38373374938965,
"learning_rate": 8.235706247625098e-06,
"logits/chosen": -0.38224634528160095,
"logits/rejected": -0.4391182065010071,
"logps/chosen": -361.043701171875,
"logps/rejected": -325.9903564453125,
"loss": -1.2866,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -13.751859664916992,
"rewards/margins": 17.840734481811523,
"rewards/rejected": -31.59259605407715,
"step": 935
},
{
"epoch": 0.8355555555555556,
"grad_norm": 21.872596740722656,
"learning_rate": 8.217903682006017e-06,
"logits/chosen": -0.39942440390586853,
"logits/rejected": -0.45849889516830444,
"logps/chosen": -373.39013671875,
"logps/rejected": -337.3862609863281,
"loss": -1.6818,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -15.173101425170898,
"rewards/margins": 18.07097053527832,
"rewards/rejected": -33.24407196044922,
"step": 940
},
{
"epoch": 0.84,
"grad_norm": 21.81854248046875,
"learning_rate": 8.200031204309604e-06,
"logits/chosen": -0.40619197487831116,
"logits/rejected": -0.4568824768066406,
"logps/chosen": -342.34356689453125,
"logps/rejected": -317.7866516113281,
"loss": -1.6939,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -13.93403148651123,
"rewards/margins": 17.663101196289062,
"rewards/rejected": -31.597131729125977,
"step": 945
},
{
"epoch": 0.8444444444444444,
"grad_norm": 22.453853607177734,
"learning_rate": 8.182089202832767e-06,
"logits/chosen": -0.3882743716239929,
"logits/rejected": -0.4640750288963318,
"logps/chosen": -382.75787353515625,
"logps/rejected": -337.13995361328125,
"loss": -2.0499,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -14.779397964477539,
"rewards/margins": 20.105022430419922,
"rewards/rejected": -34.884422302246094,
"step": 950
},
{
"epoch": 0.8488888888888889,
"grad_norm": 18.471513748168945,
"learning_rate": 8.16407806738288e-06,
"logits/chosen": -0.39945605397224426,
"logits/rejected": -0.4585798680782318,
"logps/chosen": -383.2483215332031,
"logps/rejected": -347.448974609375,
"loss": -1.2417,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -15.864675521850586,
"rewards/margins": 16.104869842529297,
"rewards/rejected": -31.969547271728516,
"step": 955
},
{
"epoch": 0.8533333333333334,
"grad_norm": 37.47999954223633,
"learning_rate": 8.145998189269327e-06,
"logits/chosen": -0.4188354015350342,
"logits/rejected": -0.4583558142185211,
"logps/chosen": -377.2878723144531,
"logps/rejected": -350.0664978027344,
"loss": -1.6863,
"rewards/accuracies": 0.84375,
"rewards/chosen": -18.30971336364746,
"rewards/margins": 19.272193908691406,
"rewards/rejected": -37.5819091796875,
"step": 960
},
{
"epoch": 0.8577777777777778,
"grad_norm": 19.40680503845215,
"learning_rate": 8.127849961294984e-06,
"logits/chosen": -0.4305190145969391,
"logits/rejected": -0.477532297372818,
"logps/chosen": -372.54443359375,
"logps/rejected": -340.7271728515625,
"loss": -1.6979,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.950782775878906,
"rewards/margins": 18.499141693115234,
"rewards/rejected": -36.44992446899414,
"step": 965
},
{
"epoch": 0.8622222222222222,
"grad_norm": 28.603803634643555,
"learning_rate": 8.109633777747703e-06,
"logits/chosen": -0.42268872261047363,
"logits/rejected": -0.4787193834781647,
"logps/chosen": -373.359619140625,
"logps/rejected": -347.9590148925781,
"loss": -1.3852,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -20.33183479309082,
"rewards/margins": 17.556093215942383,
"rewards/rejected": -37.88792419433594,
"step": 970
},
{
"epoch": 0.8666666666666667,
"grad_norm": 23.769119262695312,
"learning_rate": 8.091350034391732e-06,
"logits/chosen": -0.40240478515625,
"logits/rejected": -0.48416176438331604,
"logps/chosen": -384.06976318359375,
"logps/rejected": -361.1874694824219,
"loss": -1.5434,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -18.98853874206543,
"rewards/margins": 18.99036979675293,
"rewards/rejected": -37.97890853881836,
"step": 975
},
{
"epoch": 0.8711111111111111,
"grad_norm": 36.155155181884766,
"learning_rate": 8.072999128459119e-06,
"logits/chosen": -0.41284674406051636,
"logits/rejected": -0.4507782459259033,
"logps/chosen": -360.9227600097656,
"logps/rejected": -339.7681884765625,
"loss": -1.3714,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -18.811853408813477,
"rewards/margins": 17.97747230529785,
"rewards/rejected": -36.78932189941406,
"step": 980
},
{
"epoch": 0.8755555555555555,
"grad_norm": 21.57328224182129,
"learning_rate": 8.05458145864109e-06,
"logits/chosen": -0.39822930097579956,
"logits/rejected": -0.4551811218261719,
"logps/chosen": -353.82623291015625,
"logps/rejected": -345.6108093261719,
"loss": -1.2538,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -17.53032875061035,
"rewards/margins": 19.313552856445312,
"rewards/rejected": -36.8438835144043,
"step": 985
},
{
"epoch": 0.88,
"grad_norm": 39.19557571411133,
"learning_rate": 8.036097425079377e-06,
"logits/chosen": -0.38101926445961,
"logits/rejected": -0.4362686276435852,
"logps/chosen": -381.2861328125,
"logps/rejected": -345.8175048828125,
"loss": -1.2588,
"rewards/accuracies": 0.84375,
"rewards/chosen": -19.006837844848633,
"rewards/margins": 17.608064651489258,
"rewards/rejected": -36.61490249633789,
"step": 990
},
{
"epoch": 0.8844444444444445,
"grad_norm": 22.725879669189453,
"learning_rate": 8.017547429357532e-06,
"logits/chosen": -0.3905089497566223,
"logits/rejected": -0.44199681282043457,
"logps/chosen": -367.7683410644531,
"logps/rejected": -347.76812744140625,
"loss": -2.4106,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -15.675936698913574,
"rewards/margins": 22.89352798461914,
"rewards/rejected": -38.56946563720703,
"step": 995
},
{
"epoch": 0.8888888888888888,
"grad_norm": 20.353073120117188,
"learning_rate": 7.998931874492192e-06,
"logits/chosen": -0.37944620847702026,
"logits/rejected": -0.44008979201316833,
"logps/chosen": -352.2929382324219,
"logps/rejected": -333.12530517578125,
"loss": -1.4519,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.595646858215332,
"rewards/margins": 17.53089141845703,
"rewards/rejected": -33.12653732299805,
"step": 1000
},
{
"epoch": 0.8888888888888888,
"eval_logits/chosen": -0.39150622487068176,
"eval_logits/rejected": -0.45033136010169983,
"eval_logps/chosen": -367.9861755371094,
"eval_logps/rejected": -349.3917541503906,
"eval_loss": -1.812597393989563,
"eval_rewards/accuracies": 0.8402500152587891,
"eval_rewards/chosen": -17.389860153198242,
"eval_rewards/margins": 20.433107376098633,
"eval_rewards/rejected": -37.822967529296875,
"eval_runtime": 2196.225,
"eval_samples_per_second": 1.821,
"eval_steps_per_second": 0.911,
"step": 1000
},
{
"epoch": 0.8933333333333333,
"grad_norm": 46.65407943725586,
"learning_rate": 7.980251164924342e-06,
"logits/chosen": -0.36357760429382324,
"logits/rejected": -0.4234141409397125,
"logps/chosen": -398.599853515625,
"logps/rejected": -381.31634521484375,
"loss": -1.5236,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -18.2071475982666,
"rewards/margins": 20.315492630004883,
"rewards/rejected": -38.522640228271484,
"step": 1005
},
{
"epoch": 0.8977777777777778,
"grad_norm": 30.023534774780273,
"learning_rate": 7.9615057065105e-06,
"logits/chosen": -0.38881856203079224,
"logits/rejected": -0.441250741481781,
"logps/chosen": -356.4747619628906,
"logps/rejected": -353.72003173828125,
"loss": -2.1979,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.97440528869629,
"rewards/margins": 23.14920997619629,
"rewards/rejected": -40.123619079589844,
"step": 1010
},
{
"epoch": 0.9022222222222223,
"grad_norm": 37.5120735168457,
"learning_rate": 7.94269590651393e-06,
"logits/chosen": -0.40530771017074585,
"logits/rejected": -0.4659528136253357,
"logps/chosen": -374.06915283203125,
"logps/rejected": -339.68365478515625,
"loss": -1.3756,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -20.926109313964844,
"rewards/margins": 18.391803741455078,
"rewards/rejected": -39.31791305541992,
"step": 1015
},
{
"epoch": 0.9066666666666666,
"grad_norm": 32.38276672363281,
"learning_rate": 7.923822173595773e-06,
"logits/chosen": -0.42920392751693726,
"logits/rejected": -0.47345709800720215,
"logps/chosen": -371.0946350097656,
"logps/rejected": -351.13507080078125,
"loss": -1.1326,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -19.484638214111328,
"rewards/margins": 17.145843505859375,
"rewards/rejected": -36.6304817199707,
"step": 1020
},
{
"epoch": 0.9111111111111111,
"grad_norm": 27.41214942932129,
"learning_rate": 7.904884917806174e-06,
"logits/chosen": -0.4001992642879486,
"logits/rejected": -0.4714701175689697,
"logps/chosen": -381.10882568359375,
"logps/rejected": -369.2865905761719,
"loss": -1.7117,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -19.89790153503418,
"rewards/margins": 20.406051635742188,
"rewards/rejected": -40.303955078125,
"step": 1025
},
{
"epoch": 0.9155555555555556,
"grad_norm": 28.964096069335938,
"learning_rate": 7.885884550575376e-06,
"logits/chosen": -0.4225890636444092,
"logits/rejected": -0.49147137999534607,
"logps/chosen": -391.1488037109375,
"logps/rejected": -357.6933288574219,
"loss": -2.2471,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -20.362295150756836,
"rewards/margins": 22.882240295410156,
"rewards/rejected": -43.244537353515625,
"step": 1030
},
{
"epoch": 0.92,
"grad_norm": 18.16975212097168,
"learning_rate": 7.866821484704777e-06,
"logits/chosen": -0.39086705446243286,
"logits/rejected": -0.4711666703224182,
"logps/chosen": -402.845703125,
"logps/rejected": -357.61785888671875,
"loss": -1.5986,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -21.25181007385254,
"rewards/margins": 19.634967803955078,
"rewards/rejected": -40.886775970458984,
"step": 1035
},
{
"epoch": 0.9244444444444444,
"grad_norm": 24.85601234436035,
"learning_rate": 7.847696134357967e-06,
"logits/chosen": -0.39659881591796875,
"logits/rejected": -0.4599393308162689,
"logps/chosen": -391.4700012207031,
"logps/rejected": -379.650634765625,
"loss": -2.0057,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -20.11394691467285,
"rewards/margins": 23.591705322265625,
"rewards/rejected": -43.70565414428711,
"step": 1040
},
{
"epoch": 0.9288888888888889,
"grad_norm": 18.584766387939453,
"learning_rate": 7.828508915051724e-06,
"logits/chosen": -0.406088650226593,
"logits/rejected": -0.46071720123291016,
"logps/chosen": -375.5560607910156,
"logps/rejected": -376.0016784667969,
"loss": -2.86,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -19.491628646850586,
"rewards/margins": 27.680038452148438,
"rewards/rejected": -47.171669006347656,
"step": 1045
},
{
"epoch": 0.9333333333333333,
"grad_norm": 28.448862075805664,
"learning_rate": 7.80926024364699e-06,
"logits/chosen": -0.42899399995803833,
"logits/rejected": -0.49288374185562134,
"logps/chosen": -403.46722412109375,
"logps/rejected": -376.3447570800781,
"loss": -1.8305,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -22.05727767944336,
"rewards/margins": 24.375003814697266,
"rewards/rejected": -46.43228530883789,
"step": 1050
},
{
"epoch": 0.9377777777777778,
"grad_norm": 68.84838104248047,
"learning_rate": 7.789950538339813e-06,
"logits/chosen": -0.4087978005409241,
"logits/rejected": -0.4491947591304779,
"logps/chosen": -401.5524597167969,
"logps/rejected": -392.5592956542969,
"loss": -2.0026,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -21.574420928955078,
"rewards/margins": 26.205398559570312,
"rewards/rejected": -47.779815673828125,
"step": 1055
},
{
"epoch": 0.9422222222222222,
"grad_norm": 45.94411087036133,
"learning_rate": 7.770580218652262e-06,
"logits/chosen": -0.43011608719825745,
"logits/rejected": -0.4682633876800537,
"logps/chosen": -372.8076171875,
"logps/rejected": -385.06195068359375,
"loss": -2.0194,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.3854923248291,
"rewards/margins": 27.099695205688477,
"rewards/rejected": -49.48518753051758,
"step": 1060
},
{
"epoch": 0.9466666666666667,
"grad_norm": 52.724761962890625,
"learning_rate": 7.751149705423313e-06,
"logits/chosen": -0.4273204207420349,
"logits/rejected": -0.48607999086380005,
"logps/chosen": -369.5546875,
"logps/rejected": -360.80035400390625,
"loss": -1.6839,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -22.321815490722656,
"rewards/margins": 21.776235580444336,
"rewards/rejected": -44.098045349121094,
"step": 1065
},
{
"epoch": 0.9511111111111111,
"grad_norm": 23.580408096313477,
"learning_rate": 7.731659420799704e-06,
"logits/chosen": -0.408935010433197,
"logits/rejected": -0.4614839553833008,
"logps/chosen": -400.769287109375,
"logps/rejected": -375.7509765625,
"loss": -1.2632,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -23.772607803344727,
"rewards/margins": 21.02273178100586,
"rewards/rejected": -44.79533767700195,
"step": 1070
},
{
"epoch": 0.9555555555555556,
"grad_norm": 26.15434455871582,
"learning_rate": 7.712109788226763e-06,
"logits/chosen": -0.4153992235660553,
"logits/rejected": -0.45802217721939087,
"logps/chosen": -383.772216796875,
"logps/rejected": -388.574462890625,
"loss": -2.3232,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -22.940959930419922,
"rewards/margins": 24.761186599731445,
"rewards/rejected": -47.702144622802734,
"step": 1075
},
{
"epoch": 0.96,
"grad_norm": 51.878143310546875,
"learning_rate": 7.692501232439214e-06,
"logits/chosen": -0.4019390940666199,
"logits/rejected": -0.48944348096847534,
"logps/chosen": -374.1484680175781,
"logps/rejected": -386.2460021972656,
"loss": -2.357,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -21.894039154052734,
"rewards/margins": 26.68624496459961,
"rewards/rejected": -48.58028030395508,
"step": 1080
},
{
"epoch": 0.9644444444444444,
"grad_norm": 19.433937072753906,
"learning_rate": 7.672834179451943e-06,
"logits/chosen": -0.4297551214694977,
"logits/rejected": -0.4838961064815521,
"logps/chosen": -375.82952880859375,
"logps/rejected": -371.78631591796875,
"loss": -1.63,
"rewards/accuracies": 0.78125,
"rewards/chosen": -23.77305030822754,
"rewards/margins": 20.549041748046875,
"rewards/rejected": -44.32209014892578,
"step": 1085
},
{
"epoch": 0.9688888888888889,
"grad_norm": 31.28313636779785,
"learning_rate": 7.653109056550741e-06,
"logits/chosen": -0.434882253408432,
"logits/rejected": -0.5161997079849243,
"logps/chosen": -379.7670593261719,
"logps/rejected": -393.16558837890625,
"loss": -3.1634,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -23.529193878173828,
"rewards/margins": 31.59170913696289,
"rewards/rejected": -55.12090301513672,
"step": 1090
},
{
"epoch": 0.9733333333333334,
"grad_norm": 26.750364303588867,
"learning_rate": 7.633326292283028e-06,
"logits/chosen": -0.44604843854904175,
"logits/rejected": -0.5044312477111816,
"logps/chosen": -383.0677185058594,
"logps/rejected": -414.618408203125,
"loss": -3.4819,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -23.556535720825195,
"rewards/margins": 34.202369689941406,
"rewards/rejected": -57.75890350341797,
"step": 1095
},
{
"epoch": 0.9777777777777777,
"grad_norm": 25.017847061157227,
"learning_rate": 7.6134863164485395e-06,
"logits/chosen": -0.47239094972610474,
"logits/rejected": -0.5211464166641235,
"logps/chosen": -429.9480895996094,
"logps/rejected": -426.56463623046875,
"loss": -2.141,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -29.6816349029541,
"rewards/margins": 26.086597442626953,
"rewards/rejected": -55.76823043823242,
"step": 1100
},
{
"epoch": 0.9822222222222222,
"grad_norm": 26.209083557128906,
"learning_rate": 7.593589560089984e-06,
"logits/chosen": -0.46421319246292114,
"logits/rejected": -0.5433587431907654,
"logps/chosen": -405.5035400390625,
"logps/rejected": -399.19378662109375,
"loss": -2.5744,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -28.879892349243164,
"rewards/margins": 26.91481590270996,
"rewards/rejected": -55.794708251953125,
"step": 1105
},
{
"epoch": 0.9866666666666667,
"grad_norm": 67.32453918457031,
"learning_rate": 7.573636455483684e-06,
"logits/chosen": -0.4945516586303711,
"logits/rejected": -0.5519949197769165,
"logps/chosen": -412.07989501953125,
"logps/rejected": -467.892822265625,
"loss": -2.6987,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -31.901325225830078,
"rewards/margins": 39.842533111572266,
"rewards/rejected": -71.74385833740234,
"step": 1110
},
{
"epoch": 0.9911111111111112,
"grad_norm": 35.05012512207031,
"learning_rate": 7.553627436130183e-06,
"logits/chosen": -0.4611131548881531,
"logits/rejected": -0.5199744701385498,
"logps/chosen": -402.3938293457031,
"logps/rejected": -429.143798828125,
"loss": -2.9302,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -28.55450439453125,
"rewards/margins": 32.54138946533203,
"rewards/rejected": -61.09589767456055,
"step": 1115
},
{
"epoch": 0.9955555555555555,
"grad_norm": 39.04290771484375,
"learning_rate": 7.533562936744825e-06,
"logits/chosen": -0.4493132531642914,
"logits/rejected": -0.49818509817123413,
"logps/chosen": -429.8363342285156,
"logps/rejected": -478.5335388183594,
"loss": -4.5909,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -30.265472412109375,
"rewards/margins": 44.172183990478516,
"rewards/rejected": -74.43766021728516,
"step": 1120
},
{
"epoch": 1.0,
"grad_norm": 105.05587768554688,
"learning_rate": 7.513443393248312e-06,
"logits/chosen": -0.44253572821617126,
"logits/rejected": -0.5035872459411621,
"logps/chosen": -416.77484130859375,
"logps/rejected": -463.5193786621094,
"loss": -2.9637,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -30.517858505249023,
"rewards/margins": 42.752220153808594,
"rewards/rejected": -73.27008056640625,
"step": 1125
},
{
"epoch": 1.0044444444444445,
"grad_norm": 143.1883544921875,
"learning_rate": 7.493269242757233e-06,
"logits/chosen": -0.4549011290073395,
"logits/rejected": -0.5123938918113708,
"logps/chosen": -414.7423400878906,
"logps/rejected": -436.71807861328125,
"loss": -1.5439,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -32.691585540771484,
"rewards/margins": 36.47550582885742,
"rewards/rejected": -69.1670913696289,
"step": 1130
},
{
"epoch": 1.008888888888889,
"grad_norm": 81.42022705078125,
"learning_rate": 7.473040923574567e-06,
"logits/chosen": -0.4221878945827484,
"logits/rejected": -0.48623982071876526,
"logps/chosen": -414.17059326171875,
"logps/rejected": -472.69464111328125,
"loss": -2.7306,
"rewards/accuracies": 0.78125,
"rewards/chosen": -31.74398422241211,
"rewards/margins": 41.37885284423828,
"rewards/rejected": -73.12284088134766,
"step": 1135
},
{
"epoch": 1.0133333333333334,
"grad_norm": 35.27888870239258,
"learning_rate": 7.4527588751801606e-06,
"logits/chosen": -0.4145434498786926,
"logits/rejected": -0.47934216260910034,
"logps/chosen": -442.28857421875,
"logps/rejected": -457.786865234375,
"loss": -3.4397,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -31.377155303955078,
"rewards/margins": 37.45344543457031,
"rewards/rejected": -68.83060455322266,
"step": 1140
},
{
"epoch": 1.0177777777777777,
"grad_norm": 33.032012939453125,
"learning_rate": 7.432423538221179e-06,
"logits/chosen": -0.4252205491065979,
"logits/rejected": -0.4945794641971588,
"logps/chosen": -399.1712951660156,
"logps/rejected": -493.27459716796875,
"loss": -5.4394,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -28.82522964477539,
"rewards/margins": 51.07146072387695,
"rewards/rejected": -79.89668273925781,
"step": 1145
},
{
"epoch": 1.0222222222222221,
"grad_norm": 38.360721588134766,
"learning_rate": 7.412035354502532e-06,
"logits/chosen": -0.43832993507385254,
"logits/rejected": -0.48460859060287476,
"logps/chosen": -388.6338806152344,
"logps/rejected": -462.28277587890625,
"loss": -5.0777,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -29.647830963134766,
"rewards/margins": 45.525054931640625,
"rewards/rejected": -75.17288208007812,
"step": 1150
},
{
"epoch": 1.0266666666666666,
"grad_norm": 50.89879608154297,
"learning_rate": 7.391594766977277e-06,
"logits/chosen": -0.45310840010643005,
"logits/rejected": -0.5277084112167358,
"logps/chosen": -419.843017578125,
"logps/rejected": -458.0693359375,
"loss": -4.3999,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -30.20745849609375,
"rewards/margins": 42.24293518066406,
"rewards/rejected": -72.45039367675781,
"step": 1155
},
{
"epoch": 1.031111111111111,
"grad_norm": 51.82854080200195,
"learning_rate": 7.371102219736999e-06,
"logits/chosen": -0.4446256756782532,
"logits/rejected": -0.46639928221702576,
"logps/chosen": -419.08734130859375,
"logps/rejected": -515.2535400390625,
"loss": -5.4518,
"rewards/accuracies": 0.84375,
"rewards/chosen": -33.454734802246094,
"rewards/margins": 49.91343307495117,
"rewards/rejected": -83.36817932128906,
"step": 1160
},
{
"epoch": 1.0355555555555556,
"grad_norm": 51.885841369628906,
"learning_rate": 7.350558158002154e-06,
"logits/chosen": -0.4529612064361572,
"logits/rejected": -0.49919238686561584,
"logps/chosen": -415.342529296875,
"logps/rejected": -537.9119873046875,
"loss": -6.4479,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -35.03794860839844,
"rewards/margins": 59.22160720825195,
"rewards/rejected": -94.25955963134766,
"step": 1165
},
{
"epoch": 1.04,
"grad_norm": 206.30380249023438,
"learning_rate": 7.329963028112399e-06,
"logits/chosen": -0.4479581415653229,
"logits/rejected": -0.48728424310684204,
"logps/chosen": -466.96136474609375,
"logps/rejected": -589.3536987304688,
"loss": -3.1816,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -49.58006286621094,
"rewards/margins": 63.28960418701172,
"rewards/rejected": -112.86966705322266,
"step": 1170
},
{
"epoch": 1.0444444444444445,
"grad_norm": 54.78931427001953,
"learning_rate": 7.3093172775169e-06,
"logits/chosen": -0.40078288316726685,
"logits/rejected": -0.4713813364505768,
"logps/chosen": -478.08856201171875,
"logps/rejected": -565.62255859375,
"loss": -5.2624,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -42.438636779785156,
"rewards/margins": 55.889015197753906,
"rewards/rejected": -98.32765197753906,
"step": 1175
},
{
"epoch": 1.048888888888889,
"grad_norm": 33.327232360839844,
"learning_rate": 7.288621354764605e-06,
"logits/chosen": -0.4297246038913727,
"logits/rejected": -0.46544164419174194,
"logps/chosen": -448.8904724121094,
"logps/rejected": -547.2132568359375,
"loss": -5.0996,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -39.51817321777344,
"rewards/margins": 55.78776931762695,
"rewards/rejected": -95.30594635009766,
"step": 1180
},
{
"epoch": 1.0533333333333332,
"grad_norm": 43.984230041503906,
"learning_rate": 7.2678757094945e-06,
"logits/chosen": -0.359012246131897,
"logits/rejected": -0.4188918173313141,
"logps/chosen": -435.9148864746094,
"logps/rejected": -524.6188354492188,
"loss": -5.3232,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -40.167320251464844,
"rewards/margins": 50.48529815673828,
"rewards/rejected": -90.65261840820312,
"step": 1185
},
{
"epoch": 1.0577777777777777,
"grad_norm": 43.805030822753906,
"learning_rate": 7.2470807924258435e-06,
"logits/chosen": -0.33509600162506104,
"logits/rejected": -0.43126893043518066,
"logps/chosen": -472.0062561035156,
"logps/rejected": -550.6618041992188,
"loss": -4.2012,
"rewards/accuracies": 0.84375,
"rewards/chosen": -43.6344108581543,
"rewards/margins": 54.643653869628906,
"rewards/rejected": -98.27806091308594,
"step": 1190
},
{
"epoch": 1.0622222222222222,
"grad_norm": 55.845645904541016,
"learning_rate": 7.226237055348369e-06,
"logits/chosen": -0.3667193651199341,
"logits/rejected": -0.411059707403183,
"logps/chosen": -445.1033630371094,
"logps/rejected": -565.6658325195312,
"loss": -5.7241,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -44.05376434326172,
"rewards/margins": 60.89931106567383,
"rewards/rejected": -104.95307922363281,
"step": 1195
},
{
"epoch": 1.0666666666666667,
"grad_norm": 52.21657943725586,
"learning_rate": 7.205344951112474e-06,
"logits/chosen": -0.34739190340042114,
"logits/rejected": -0.4000583291053772,
"logps/chosen": -476.917724609375,
"logps/rejected": -596.9302978515625,
"loss": -5.2471,
"rewards/accuracies": 0.84375,
"rewards/chosen": -45.8980827331543,
"rewards/margins": 66.90580749511719,
"rewards/rejected": -112.80389404296875,
"step": 1200
},
{
"epoch": 1.0711111111111111,
"grad_norm": 73.39852905273438,
"learning_rate": 7.184404933619377e-06,
"logits/chosen": -0.3436613082885742,
"logits/rejected": -0.430023193359375,
"logps/chosen": -481.1473083496094,
"logps/rejected": -586.77783203125,
"loss": -3.8824,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -49.4990234375,
"rewards/margins": 60.65178680419922,
"rewards/rejected": -110.15081787109375,
"step": 1205
},
{
"epoch": 1.0755555555555556,
"grad_norm": 235.89935302734375,
"learning_rate": 7.163417457811261e-06,
"logits/chosen": -0.335957795381546,
"logits/rejected": -0.4028739333152771,
"logps/chosen": -488.63800048828125,
"logps/rejected": -557.654052734375,
"loss": -4.055,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -52.452980041503906,
"rewards/margins": 50.59123229980469,
"rewards/rejected": -103.0442123413086,
"step": 1210
},
{
"epoch": 1.08,
"grad_norm": 53.787052154541016,
"learning_rate": 7.142382979661386e-06,
"logits/chosen": -0.34447726607322693,
"logits/rejected": -0.40175333619117737,
"logps/chosen": -459.96429443359375,
"logps/rejected": -552.6686401367188,
"loss": -4.7845,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -45.343711853027344,
"rewards/margins": 51.044227600097656,
"rewards/rejected": -96.387939453125,
"step": 1215
},
{
"epoch": 1.0844444444444445,
"grad_norm": 86.77619934082031,
"learning_rate": 7.121301956164184e-06,
"logits/chosen": -0.29622939229011536,
"logits/rejected": -0.3380669057369232,
"logps/chosen": -496.318359375,
"logps/rejected": -672.7567138671875,
"loss": -5.4564,
"rewards/accuracies": 0.84375,
"rewards/chosen": -50.32990646362305,
"rewards/margins": 73.62047576904297,
"rewards/rejected": -123.95037841796875,
"step": 1220
},
{
"epoch": 1.0888888888888888,
"grad_norm": 92.10472869873047,
"learning_rate": 7.100174845325327e-06,
"logits/chosen": -0.2991761565208435,
"logits/rejected": -0.3294784724712372,
"logps/chosen": -509.4859313964844,
"logps/rejected": -744.8641357421875,
"loss": -9.63,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -58.88629150390625,
"rewards/margins": 92.65065002441406,
"rewards/rejected": -151.5369415283203,
"step": 1225
},
{
"epoch": 1.0933333333333333,
"grad_norm": 188.78851318359375,
"learning_rate": 7.0790021061517825e-06,
"logits/chosen": -0.2575603723526001,
"logits/rejected": -0.34444746375083923,
"logps/chosen": -523.0759887695312,
"logps/rejected": -661.7510375976562,
"loss": -3.843,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -56.90376663208008,
"rewards/margins": 74.44867706298828,
"rewards/rejected": -131.35244750976562,
"step": 1230
},
{
"epoch": 1.0977777777777777,
"grad_norm": 148.79954528808594,
"learning_rate": 7.057784198641835e-06,
"logits/chosen": -0.28366950154304504,
"logits/rejected": -0.3355741500854492,
"logps/chosen": -489.5838928222656,
"logps/rejected": -644.3504638671875,
"loss": -5.428,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -53.00238800048828,
"rewards/margins": 78.2137222290039,
"rewards/rejected": -131.2161102294922,
"step": 1235
},
{
"epoch": 1.1022222222222222,
"grad_norm": 160.3306884765625,
"learning_rate": 7.036521583775099e-06,
"logits/chosen": -0.258393794298172,
"logits/rejected": -0.2814292311668396,
"logps/chosen": -461.08599853515625,
"logps/rejected": -765.3035888671875,
"loss": -12.7607,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -53.90653610229492,
"rewards/margins": 112.190185546875,
"rewards/rejected": -166.0967254638672,
"step": 1240
},
{
"epoch": 1.1066666666666667,
"grad_norm": 242.3454132080078,
"learning_rate": 7.015214723502496e-06,
"logits/chosen": -0.22555121779441833,
"logits/rejected": -0.21216616034507751,
"logps/chosen": -536.2150268554688,
"logps/rejected": -937.1095581054688,
"loss": -15.0884,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -73.13267517089844,
"rewards/margins": 140.1719207763672,
"rewards/rejected": -213.30459594726562,
"step": 1245
},
{
"epoch": 1.1111111111111112,
"grad_norm": 492.08544921875,
"learning_rate": 6.993864080736221e-06,
"logits/chosen": -0.21394245326519012,
"logits/rejected": -0.2340272217988968,
"logps/chosen": -685.1243896484375,
"logps/rejected": -1000.0198974609375,
"loss": -4.4512,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -107.66426849365234,
"rewards/margins": 122.04512786865234,
"rewards/rejected": -229.7093963623047,
"step": 1250
},
{
"epoch": 1.1155555555555556,
"grad_norm": 1650.1431884765625,
"learning_rate": 6.972470119339692e-06,
"logits/chosen": -0.24737460911273956,
"logits/rejected": -0.2552156448364258,
"logps/chosen": -635.1632080078125,
"logps/rejected": -981.98583984375,
"loss": -1.7668,
"rewards/accuracies": 0.78125,
"rewards/chosen": -99.38597106933594,
"rewards/margins": 128.8037109375,
"rewards/rejected": -228.1896514892578,
"step": 1255
},
{
"epoch": 1.12,
"grad_norm": 163.82701110839844,
"learning_rate": 6.9510333041174595e-06,
"logits/chosen": -0.18519486486911774,
"logits/rejected": -0.21833041310310364,
"logps/chosen": -644.9240112304688,
"logps/rejected": -994.1267700195312,
"loss": -3.269,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -101.34546661376953,
"rewards/margins": 130.02304077148438,
"rewards/rejected": -231.36849975585938,
"step": 1260
},
{
"epoch": 1.1244444444444444,
"grad_norm": 288.865478515625,
"learning_rate": 6.929554100805118e-06,
"logits/chosen": -0.15947946906089783,
"logits/rejected": -0.15981920063495636,
"logps/chosen": -641.01611328125,
"logps/rejected": -1218.5843505859375,
"loss": -18.7538,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -103.062255859375,
"rewards/margins": 199.00765991210938,
"rewards/rejected": -302.0699462890625,
"step": 1265
},
{
"epoch": 1.1288888888888888,
"grad_norm": 403.9992980957031,
"learning_rate": 6.908032976059184e-06,
"logits/chosen": -0.13905613124370575,
"logits/rejected": -0.11378375440835953,
"logps/chosen": -689.9586791992188,
"logps/rejected": -1391.485595703125,
"loss": -19.5417,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -115.4753189086914,
"rewards/margins": 238.10592651367188,
"rewards/rejected": -353.58123779296875,
"step": 1270
},
{
"epoch": 1.1333333333333333,
"grad_norm": 415.2798767089844,
"learning_rate": 6.886470397446958e-06,
"logits/chosen": -0.17582398653030396,
"logits/rejected": -0.15832173824310303,
"logps/chosen": -593.4633178710938,
"logps/rejected": -1337.550048828125,
"loss": -27.0786,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -84.53248596191406,
"rewards/margins": 245.6161651611328,
"rewards/rejected": -330.1486511230469,
"step": 1275
},
{
"epoch": 1.1377777777777778,
"grad_norm": 1071.2152099609375,
"learning_rate": 6.864866833436368e-06,
"logits/chosen": -0.14370083808898926,
"logits/rejected": -0.12866979837417603,
"logps/chosen": -737.2032470703125,
"logps/rejected": -1335.3382568359375,
"loss": -11.4925,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -133.48977661132812,
"rewards/margins": 196.9632568359375,
"rewards/rejected": -330.4530029296875,
"step": 1280
},
{
"epoch": 1.1422222222222222,
"grad_norm": 789.3484497070312,
"learning_rate": 6.843222753385785e-06,
"logits/chosen": -0.1296389400959015,
"logits/rejected": -0.10764478147029877,
"logps/chosen": -651.3052978515625,
"logps/rejected": -1448.587646484375,
"loss": -23.9463,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -99.7173080444336,
"rewards/margins": 265.5214538574219,
"rewards/rejected": -365.23876953125,
"step": 1285
},
{
"epoch": 1.1466666666666667,
"grad_norm": 1540.5205078125,
"learning_rate": 6.8215386275338335e-06,
"logits/chosen": -0.08001247048377991,
"logits/rejected": -0.09123299270868301,
"logps/chosen": -698.7423095703125,
"logps/rejected": -1478.103759765625,
"loss": -19.8387,
"rewards/accuracies": 0.8125,
"rewards/chosen": -116.57110595703125,
"rewards/margins": 258.63818359375,
"rewards/rejected": -375.20928955078125,
"step": 1290
},
{
"epoch": 1.1511111111111112,
"grad_norm": 767.0540771484375,
"learning_rate": 6.799814926989171e-06,
"logits/chosen": -0.13497574627399445,
"logits/rejected": -0.04107438400387764,
"logps/chosen": -796.4733276367188,
"logps/rejected": -1904.00390625,
"loss": -26.4269,
"rewards/accuracies": 0.78125,
"rewards/chosen": -151.79910278320312,
"rewards/margins": 352.4693908691406,
"rewards/rejected": -504.2684020996094,
"step": 1295
},
{
"epoch": 1.1555555555555554,
"grad_norm": 695.8399047851562,
"learning_rate": 6.778052123720252e-06,
"logits/chosen": -0.11790412664413452,
"logits/rejected": -0.14769446849822998,
"logps/chosen": -748.3878173828125,
"logps/rejected": -1471.530029296875,
"loss": -12.4276,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -133.96165466308594,
"rewards/margins": 242.260986328125,
"rewards/rejected": -376.22265625,
"step": 1300
},
{
"epoch": 1.16,
"grad_norm": 973.673095703125,
"learning_rate": 6.756250690545079e-06,
"logits/chosen": -0.08396363258361816,
"logits/rejected": -0.05135069414973259,
"logps/chosen": -951.2482299804688,
"logps/rejected": -1985.371826171875,
"loss": -32.127,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -190.14053344726562,
"rewards/margins": 336.322998046875,
"rewards/rejected": -526.4635620117188,
"step": 1305
},
{
"epoch": 1.1644444444444444,
"grad_norm": 1283.2349853515625,
"learning_rate": 6.734411101120925e-06,
"logits/chosen": -0.06799821555614471,
"logits/rejected": -0.07330864667892456,
"logps/chosen": -873.49072265625,
"logps/rejected": -1809.4739990234375,
"loss": -2.5052,
"rewards/accuracies": 0.78125,
"rewards/chosen": -171.44715881347656,
"rewards/margins": 302.28948974609375,
"rewards/rejected": -473.7366638183594,
"step": 1310
},
{
"epoch": 1.1688888888888889,
"grad_norm": 780.3442993164062,
"learning_rate": 6.712533829934042e-06,
"logits/chosen": -0.043098777532577515,
"logits/rejected": -0.024469073861837387,
"logps/chosen": -749.4600830078125,
"logps/rejected": -2043.115966796875,
"loss": -40.6397,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -130.70285034179688,
"rewards/margins": 411.14263916015625,
"rewards/rejected": -541.8453979492188,
"step": 1315
},
{
"epoch": 1.1733333333333333,
"grad_norm": 1000.9935913085938,
"learning_rate": 6.690619352289359e-06,
"logits/chosen": -0.09932423382997513,
"logits/rejected": -0.11026652157306671,
"logps/chosen": -644.5574951171875,
"logps/rejected": -1428.832763671875,
"loss": -21.736,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -99.68866729736328,
"rewards/margins": 255.21646118164062,
"rewards/rejected": -354.9051513671875,
"step": 1320
},
{
"epoch": 1.1777777777777778,
"grad_norm": 2265.180908203125,
"learning_rate": 6.6686681443001485e-06,
"logits/chosen": -0.05305319279432297,
"logits/rejected": -0.033907536417245865,
"logps/chosen": -796.5277709960938,
"logps/rejected": -2201.7998046875,
"loss": -52.5458,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -136.5464630126953,
"rewards/margins": 452.1681213378906,
"rewards/rejected": -588.7145385742188,
"step": 1325
},
{
"epoch": 1.1822222222222223,
"grad_norm": 9328.162109375,
"learning_rate": 6.6466806828776865e-06,
"logits/chosen": -0.028526514768600464,
"logits/rejected": -0.03504006937146187,
"logps/chosen": -988.6253662109375,
"logps/rejected": -2289.334228515625,
"loss": -18.6908,
"rewards/accuracies": 0.78125,
"rewards/chosen": -200.4395751953125,
"rewards/margins": 416.51495361328125,
"rewards/rejected": -616.9544677734375,
"step": 1330
},
{
"epoch": 1.1866666666666668,
"grad_norm": 4739.24951171875,
"learning_rate": 6.62465744572089e-06,
"logits/chosen": -0.04077509418129921,
"logits/rejected": -0.00983515102416277,
"logps/chosen": -994.3863525390625,
"logps/rejected": -2518.66162109375,
"loss": -20.8872,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -204.39039611816406,
"rewards/margins": 485.6438903808594,
"rewards/rejected": -690.0343017578125,
"step": 1335
},
{
"epoch": 1.1911111111111112,
"grad_norm": 1538.55615234375,
"learning_rate": 6.602598911305938e-06,
"logits/chosen": -0.06173365190625191,
"logits/rejected": -0.006297842599451542,
"logps/chosen": -734.0841064453125,
"logps/rejected": -2258.87353515625,
"loss": -43.0722,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -131.23867797851562,
"rewards/margins": 476.96844482421875,
"rewards/rejected": -608.2071533203125,
"step": 1340
},
{
"epoch": 1.1955555555555555,
"grad_norm": 587.7572021484375,
"learning_rate": 6.580505558875878e-06,
"logits/chosen": -0.04299772530794144,
"logits/rejected": -0.048394013196229935,
"logps/chosen": -753.9576416015625,
"logps/rejected": -2047.7181396484375,
"loss": -27.9425,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -135.0194091796875,
"rewards/margins": 414.18634033203125,
"rewards/rejected": -549.2056884765625,
"step": 1345
},
{
"epoch": 1.2,
"grad_norm": 451.1556701660156,
"learning_rate": 6.558377868430211e-06,
"logits/chosen": -0.04246233031153679,
"logits/rejected": -0.043165404349565506,
"logps/chosen": -545.9451904296875,
"logps/rejected": -1733.472900390625,
"loss": -42.4667,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -69.52648162841797,
"rewards/margins": 382.27105712890625,
"rewards/rejected": -451.79754638671875,
"step": 1350
},
{
"epoch": 1.2044444444444444,
"grad_norm": 923.8303833007812,
"learning_rate": 6.536216320714466e-06,
"logits/chosen": -0.013859344646334648,
"logits/rejected": -0.03816484287381172,
"logps/chosen": -593.7230224609375,
"logps/rejected": -1762.0712890625,
"loss": -41.333,
"rewards/accuracies": 0.78125,
"rewards/chosen": -87.8678207397461,
"rewards/margins": 374.93817138671875,
"rewards/rejected": -462.8059997558594,
"step": 1355
},
{
"epoch": 1.208888888888889,
"grad_norm": 1663.5784912109375,
"learning_rate": 6.514021397209751e-06,
"logits/chosen": 0.014377089217305183,
"logits/rejected": -0.030507531017065048,
"logps/chosen": -894.8898315429688,
"logps/rejected": -2128.688720703125,
"loss": -22.929,
"rewards/accuracies": 0.78125,
"rewards/chosen": -165.6055450439453,
"rewards/margins": 399.2110900878906,
"rewards/rejected": -564.816650390625,
"step": 1360
},
{
"epoch": 1.2133333333333334,
"grad_norm": 584.5473022460938,
"learning_rate": 6.491793580122301e-06,
"logits/chosen": 0.002529005752876401,
"logits/rejected": 0.0013079143827781081,
"logps/chosen": -935.2101440429688,
"logps/rejected": -2530.46533203125,
"loss": -29.8881,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -185.2637481689453,
"rewards/margins": 504.73175048828125,
"rewards/rejected": -689.9954833984375,
"step": 1365
},
{
"epoch": 1.2177777777777778,
"grad_norm": 790.5787963867188,
"learning_rate": 6.46953335237299e-06,
"logits/chosen": -0.02872173860669136,
"logits/rejected": -0.0967845544219017,
"logps/chosen": -736.6044921875,
"logps/rejected": -1947.3765869140625,
"loss": -28.6821,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -127.88818359375,
"rewards/margins": 389.64739990234375,
"rewards/rejected": -517.5355834960938,
"step": 1370
},
{
"epoch": 1.2222222222222223,
"grad_norm": 565.301513671875,
"learning_rate": 6.447241197586847e-06,
"logits/chosen": -0.005647065117955208,
"logits/rejected": -0.013025308027863503,
"logps/chosen": -920.2081298828125,
"logps/rejected": -2415.46826171875,
"loss": -44.2469,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -190.19842529296875,
"rewards/margins": 470.56903076171875,
"rewards/rejected": -660.7674560546875,
"step": 1375
},
{
"epoch": 1.2266666666666666,
"grad_norm": 1703.7418212890625,
"learning_rate": 6.424917600082552e-06,
"logits/chosen": -0.04485129565000534,
"logits/rejected": -0.01103221159428358,
"logps/chosen": -715.8340454101562,
"logps/rejected": -2420.53173828125,
"loss": -63.7882,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -120.95291900634766,
"rewards/margins": 534.1937255859375,
"rewards/rejected": -655.1466674804688,
"step": 1380
},
{
"epoch": 1.231111111111111,
"grad_norm": 551.6255493164062,
"learning_rate": 6.402563044861899e-06,
"logits/chosen": 0.053200650960206985,
"logits/rejected": -0.008127940818667412,
"logps/chosen": -1044.637451171875,
"logps/rejected": -2480.965087890625,
"loss": -13.3416,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -219.8478240966797,
"rewards/margins": 458.9497985839844,
"rewards/rejected": -678.797607421875,
"step": 1385
},
{
"epoch": 1.2355555555555555,
"grad_norm": 1526.0687255859375,
"learning_rate": 6.380178017599276e-06,
"logits/chosen": 0.024942180141806602,
"logits/rejected": 0.03292980045080185,
"logps/chosen": -1266.3096923828125,
"logps/rejected": -3159.16064453125,
"loss": -19.9606,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -285.90057373046875,
"rewards/margins": 595.1817016601562,
"rewards/rejected": -881.0822143554688,
"step": 1390
},
{
"epoch": 1.24,
"grad_norm": 861.3545532226562,
"learning_rate": 6.357763004631104e-06,
"logits/chosen": 0.038592465221881866,
"logits/rejected": -0.04844246804714203,
"logps/chosen": -1352.316650390625,
"logps/rejected": -2715.897705078125,
"loss": 27.6993,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -313.6338195800781,
"rewards/margins": 437.67071533203125,
"rewards/rejected": -751.3045654296875,
"step": 1395
},
{
"epoch": 1.2444444444444445,
"grad_norm": 4720.46435546875,
"learning_rate": 6.335318492945271e-06,
"logits/chosen": 0.06307001411914825,
"logits/rejected": 0.050534725189208984,
"logps/chosen": -1334.775390625,
"logps/rejected": -2528.11083984375,
"loss": 41.9707,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -308.42840576171875,
"rewards/margins": 382.95184326171875,
"rewards/rejected": -691.3802490234375,
"step": 1400
},
{
"epoch": 1.248888888888889,
"grad_norm": 4182.30126953125,
"learning_rate": 6.312844970170551e-06,
"logits/chosen": 0.06979052722454071,
"logits/rejected": 0.05389819294214249,
"logps/chosen": -848.5364379882812,
"logps/rejected": -2794.65234375,
"loss": -48.1904,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -164.25741577148438,
"rewards/margins": 609.0851440429688,
"rewards/rejected": -773.3425903320312,
"step": 1405
},
{
"epoch": 1.2533333333333334,
"grad_norm": 634.9495239257812,
"learning_rate": 6.29034292456602e-06,
"logits/chosen": 0.097917839884758,
"logits/rejected": 0.09180790185928345,
"logps/chosen": -1049.5567626953125,
"logps/rejected": -3027.710693359375,
"loss": -14.7226,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -223.29800415039062,
"rewards/margins": 616.1754760742188,
"rewards/rejected": -839.4735107421875,
"step": 1410
},
{
"epoch": 1.2577777777777777,
"grad_norm": 1667.810791015625,
"learning_rate": 6.267812845010431e-06,
"logits/chosen": 0.05149533227086067,
"logits/rejected": 0.01714668609201908,
"logps/chosen": -576.4118041992188,
"logps/rejected": -2054.046630859375,
"loss": -52.7336,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -82.24992370605469,
"rewards/margins": 466.6830139160156,
"rewards/rejected": -548.9329223632812,
"step": 1415
},
{
"epoch": 1.2622222222222224,
"grad_norm": 149.2144317626953,
"learning_rate": 6.245255220991606e-06,
"logits/chosen": 0.027654284611344337,
"logits/rejected": -0.004665301647037268,
"logps/chosen": -588.321533203125,
"logps/rejected": -1803.2874755859375,
"loss": -47.1506,
"rewards/accuracies": 0.8125,
"rewards/chosen": -86.83482360839844,
"rewards/margins": 386.0186462402344,
"rewards/rejected": -472.853515625,
"step": 1420
},
{
"epoch": 1.2666666666666666,
"grad_norm": 860.606689453125,
"learning_rate": 6.2226705425958e-06,
"logits/chosen": 0.010288884863257408,
"logits/rejected": 0.01715211756527424,
"logps/chosen": -876.3726806640625,
"logps/rejected": -3463.96728515625,
"loss": -97.3343,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -168.2960205078125,
"rewards/margins": 805.5137939453125,
"rewards/rejected": -973.8098754882812,
"step": 1425
},
{
"epoch": 1.271111111111111,
"grad_norm": 1589.0054931640625,
"learning_rate": 6.200059300497045e-06,
"logits/chosen": 0.06689944118261337,
"logits/rejected": 0.15388646721839905,
"logps/chosen": -1188.5296630859375,
"logps/rejected": -5079.46044921875,
"loss": -138.2135,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -261.08453369140625,
"rewards/margins": 1197.956298828125,
"rewards/rejected": -1459.0408935546875,
"step": 1430
},
{
"epoch": 1.2755555555555556,
"grad_norm": 2617.210205078125,
"learning_rate": 6.177421985946499e-06,
"logits/chosen": 0.04427279904484749,
"logits/rejected": 0.08278901129961014,
"logps/chosen": -1090.3377685546875,
"logps/rejected": -4322.57958984375,
"loss": -93.149,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -228.5421600341797,
"rewards/margins": 997.9312744140625,
"rewards/rejected": -1226.473388671875,
"step": 1435
},
{
"epoch": 1.28,
"grad_norm": 1109.904296875,
"learning_rate": 6.1547590907617685e-06,
"logits/chosen": 0.08895837515592575,
"logits/rejected": 0.11301213502883911,
"logps/chosen": -742.1561889648438,
"logps/rejected": -3306.692138671875,
"loss": -84.8328,
"rewards/accuracies": 0.8125,
"rewards/chosen": -130.0913543701172,
"rewards/margins": 794.3884887695312,
"rewards/rejected": -924.4798583984375,
"step": 1440
},
{
"epoch": 1.2844444444444445,
"grad_norm": 2034.2410888671875,
"learning_rate": 6.132071107316221e-06,
"logits/chosen": 0.05466142296791077,
"logits/rejected": 0.020903872326016426,
"logps/chosen": -794.6520385742188,
"logps/rejected": -2645.4228515625,
"loss": -50.6485,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -143.0889129638672,
"rewards/margins": 582.6082153320312,
"rewards/rejected": -725.6971435546875,
"step": 1445
},
{
"epoch": 1.2888888888888888,
"grad_norm": 3161.018310546875,
"learning_rate": 6.109358528528296e-06,
"logits/chosen": 0.09831374883651733,
"logits/rejected": 0.045040689408779144,
"logps/chosen": -898.0750732421875,
"logps/rejected": -2734.27490234375,
"loss": -50.0723,
"rewards/accuracies": 0.8125,
"rewards/chosen": -176.62228393554688,
"rewards/margins": 578.7171020507812,
"rewards/rejected": -755.3394165039062,
"step": 1450
},
{
"epoch": 1.2933333333333334,
"grad_norm": 4272.8037109375,
"learning_rate": 6.0866218478507875e-06,
"logits/chosen": 0.034332215785980225,
"logits/rejected": -0.030711542814970016,
"logps/chosen": -887.9432373046875,
"logps/rejected": -2781.74609375,
"loss": -67.6966,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -168.97879028320312,
"rewards/margins": 603.0858154296875,
"rewards/rejected": -772.0645751953125,
"step": 1455
},
{
"epoch": 1.2977777777777777,
"grad_norm": 2452.99609375,
"learning_rate": 6.063861559260127e-06,
"logits/chosen": 0.0846308022737503,
"logits/rejected": -0.018889425322413445,
"logps/chosen": -1020.7131958007812,
"logps/rejected": -2657.43115234375,
"loss": -18.0709,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -213.4755859375,
"rewards/margins": 520.6746215820312,
"rewards/rejected": -734.150146484375,
"step": 1460
},
{
"epoch": 1.3022222222222222,
"grad_norm": 212.89353942871094,
"learning_rate": 6.041078157245649e-06,
"logits/chosen": 0.09322404861450195,
"logits/rejected": 0.08232339471578598,
"logps/chosen": -1002.1947021484375,
"logps/rejected": -3723.547607421875,
"loss": -68.3427,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -206.69680786132812,
"rewards/margins": 846.6780395507812,
"rewards/rejected": -1053.374755859375,
"step": 1465
},
{
"epoch": 1.3066666666666666,
"grad_norm": 2106.11083984375,
"learning_rate": 6.018272136798854e-06,
"logits/chosen": 0.08394975960254669,
"logits/rejected": 0.03536719083786011,
"logps/chosen": -936.6207885742188,
"logps/rejected": -3006.883056640625,
"loss": -47.9597,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -179.7518310546875,
"rewards/margins": 658.16796875,
"rewards/rejected": -837.9197998046875,
"step": 1470
},
{
"epoch": 1.3111111111111111,
"grad_norm": 187.57144165039062,
"learning_rate": 5.995443993402647e-06,
"logits/chosen": 0.08657495677471161,
"logits/rejected": 0.09708672016859055,
"logps/chosen": -720.3692626953125,
"logps/rejected": -3342.55029296875,
"loss": -75.119,
"rewards/accuracies": 0.84375,
"rewards/chosen": -126.44435119628906,
"rewards/margins": 811.7178955078125,
"rewards/rejected": -938.1622314453125,
"step": 1475
},
{
"epoch": 1.3155555555555556,
"grad_norm": 918.5309448242188,
"learning_rate": 5.972594223020575e-06,
"logits/chosen": 0.1325269639492035,
"logits/rejected": 0.12702801823616028,
"logps/chosen": -996.6691284179688,
"logps/rejected": -3892.973876953125,
"loss": -66.2872,
"rewards/accuracies": 0.78125,
"rewards/chosen": -203.91000366210938,
"rewards/margins": 897.1043701171875,
"rewards/rejected": -1101.014404296875,
"step": 1480
},
{
"epoch": 1.32,
"grad_norm": 3301.5615234375,
"learning_rate": 5.949723322086053e-06,
"logits/chosen": 0.11541260778903961,
"logits/rejected": 0.024793455377221107,
"logps/chosen": -983.22021484375,
"logps/rejected": -3393.292236328125,
"loss": -70.6341,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -196.8666534423828,
"rewards/margins": 754.6265258789062,
"rewards/rejected": -951.4931640625,
"step": 1485
},
{
"epoch": 1.3244444444444445,
"grad_norm": 3240.269287109375,
"learning_rate": 5.926831787491577e-06,
"logits/chosen": 0.09748705476522446,
"logits/rejected": 0.07405810058116913,
"logps/chosen": -1009.3624877929688,
"logps/rejected": -3883.693359375,
"loss": -72.7366,
"rewards/accuracies": 0.75,
"rewards/chosen": -210.13650512695312,
"rewards/margins": 885.2335205078125,
"rewards/rejected": -1095.369873046875,
"step": 1490
},
{
"epoch": 1.3288888888888888,
"grad_norm": 1777.92333984375,
"learning_rate": 5.9039201165779315e-06,
"logits/chosen": 0.15679362416267395,
"logits/rejected": 0.06116216257214546,
"logps/chosen": -935.0094604492188,
"logps/rejected": -3276.86474609375,
"loss": -13.9424,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -190.66973876953125,
"rewards/margins": 728.3402099609375,
"rewards/rejected": -919.0099487304688,
"step": 1495
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2631.911376953125,
"learning_rate": 5.880988807123379e-06,
"logits/chosen": 0.13361360132694244,
"logits/rejected": 0.11118870973587036,
"logps/chosen": -867.6259765625,
"logps/rejected": -4221.30810546875,
"loss": -116.1614,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -168.588623046875,
"rewards/margins": 1031.815185546875,
"rewards/rejected": -1200.4039306640625,
"step": 1500
},
{
"epoch": 1.3333333333333333,
"eval_logits/chosen": 0.1006997674703598,
"eval_logits/rejected": 0.056181661784648895,
"eval_logps/chosen": -774.4694213867188,
"eval_logps/rejected": -3444.830078125,
"eval_loss": -85.59445190429688,
"eval_rewards/accuracies": 0.7942500114440918,
"eval_rewards/chosen": -139.33482360839844,
"eval_rewards/margins": 827.11962890625,
"eval_rewards/rejected": -966.4544677734375,
"eval_runtime": 2193.4235,
"eval_samples_per_second": 1.824,
"eval_steps_per_second": 0.912,
"step": 1500
},
{
"epoch": 1.3377777777777777,
"grad_norm": 955.7211303710938,
"learning_rate": 5.858038357332851e-06,
"logits/chosen": 0.10563405603170395,
"logits/rejected": 0.04411952942609787,
"logps/chosen": -717.7381591796875,
"logps/rejected": -2960.67724609375,
"loss": -68.156,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -117.87982177734375,
"rewards/margins": 701.3028564453125,
"rewards/rejected": -819.1826782226562,
"step": 1505
},
{
"epoch": 1.3422222222222222,
"grad_norm": 3122.2939453125,
"learning_rate": 5.835069265827119e-06,
"logits/chosen": 0.0719769075512886,
"logits/rejected": 0.030872393399477005,
"logps/chosen": -808.4874877929688,
"logps/rejected": -3150.03173828125,
"loss": -67.1917,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -145.69776916503906,
"rewards/margins": 730.8069458007812,
"rewards/rejected": -876.5046997070312,
"step": 1510
},
{
"epoch": 1.3466666666666667,
"grad_norm": 1660.158935546875,
"learning_rate": 5.812082031631966e-06,
"logits/chosen": 0.12388893216848373,
"logits/rejected": 0.04534872621297836,
"logps/chosen": -628.2339477539062,
"logps/rejected": -3061.64697265625,
"loss": -91.6737,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -90.30569458007812,
"rewards/margins": 763.7083129882812,
"rewards/rejected": -854.0139770507812,
"step": 1515
},
{
"epoch": 1.3511111111111112,
"grad_norm": 208.55455017089844,
"learning_rate": 5.789077154167342e-06,
"logits/chosen": 0.08742909133434296,
"logits/rejected": 0.057026900351047516,
"logps/chosen": -752.9409790039062,
"logps/rejected": -2982.9560546875,
"loss": -85.7046,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -132.0449981689453,
"rewards/margins": 694.6397705078125,
"rewards/rejected": -826.6846923828125,
"step": 1520
},
{
"epoch": 1.3555555555555556,
"grad_norm": 196.6206817626953,
"learning_rate": 5.766055133236513e-06,
"logits/chosen": 0.12026973813772202,
"logits/rejected": 0.054700933396816254,
"logps/chosen": -761.4303588867188,
"logps/rejected": -3311.610595703125,
"loss": -89.4418,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -136.79627990722656,
"rewards/margins": 790.1121215820312,
"rewards/rejected": -926.9083862304688,
"step": 1525
},
{
"epoch": 1.3599999999999999,
"grad_norm": 3600.595947265625,
"learning_rate": 5.7430164690152045e-06,
"logits/chosen": 0.0973397046327591,
"logits/rejected": -0.010263195261359215,
"logps/chosen": -1011.42236328125,
"logps/rejected": -3372.37744140625,
"loss": -77.1505,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -206.7822265625,
"rewards/margins": 737.5157470703125,
"rewards/rejected": -944.2979736328125,
"step": 1530
},
{
"epoch": 1.3644444444444446,
"grad_norm": 2392.074462890625,
"learning_rate": 5.7199616620407325e-06,
"logits/chosen": 0.2140587866306305,
"logits/rejected": 0.13287585973739624,
"logps/chosen": -1339.725341796875,
"logps/rejected": -4591.2490234375,
"loss": -72.0547,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -304.84576416015625,
"rewards/margins": 1005.3800048828125,
"rewards/rejected": -1310.225830078125,
"step": 1535
},
{
"epoch": 1.3688888888888888,
"grad_norm": 1979.218017578125,
"learning_rate": 5.696891213201134e-06,
"logits/chosen": 0.11510731279850006,
"logits/rejected": 0.04925750941038132,
"logps/chosen": -711.453369140625,
"logps/rejected": -3214.048828125,
"loss": -65.2588,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -123.6042251586914,
"rewards/margins": 775.2196044921875,
"rewards/rejected": -898.8238525390625,
"step": 1540
},
{
"epoch": 1.3733333333333333,
"grad_norm": 1114.5130615234375,
"learning_rate": 5.673805623724272e-06,
"logits/chosen": 0.09844042360782623,
"logits/rejected": 0.06581716239452362,
"logps/chosen": -904.0095825195312,
"logps/rejected": -3654.065673828125,
"loss": -103.2758,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -177.9736785888672,
"rewards/margins": 850.4739379882812,
"rewards/rejected": -1028.447509765625,
"step": 1545
},
{
"epoch": 1.3777777777777778,
"grad_norm": 2146.8916015625,
"learning_rate": 5.650705395166965e-06,
"logits/chosen": 0.1780976802110672,
"logits/rejected": 0.11291356384754181,
"logps/chosen": -1122.606201171875,
"logps/rejected": -3680.157470703125,
"loss": -52.3573,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -240.822509765625,
"rewards/margins": 790.123291015625,
"rewards/rejected": -1030.94580078125,
"step": 1550
},
{
"epoch": 1.3822222222222222,
"grad_norm": 2417.669677734375,
"learning_rate": 5.627591029404072e-06,
"logits/chosen": 0.12985600531101227,
"logits/rejected": 0.0674312636256218,
"logps/chosen": -911.4212036132812,
"logps/rejected": -2673.277099609375,
"loss": -32.7178,
"rewards/accuracies": 0.8125,
"rewards/chosen": -182.8104248046875,
"rewards/margins": 550.80224609375,
"rewards/rejected": -733.6126708984375,
"step": 1555
},
{
"epoch": 1.3866666666666667,
"grad_norm": 874.9458618164062,
"learning_rate": 5.604463028617598e-06,
"logits/chosen": 0.14844806492328644,
"logits/rejected": 0.15728525817394257,
"logps/chosen": -931.70947265625,
"logps/rejected": -5559.9111328125,
"loss": -162.2626,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -186.3575897216797,
"rewards/margins": 1415.8370361328125,
"rewards/rejected": -1602.194580078125,
"step": 1560
},
{
"epoch": 1.3911111111111112,
"grad_norm": 1034.4825439453125,
"learning_rate": 5.581321895285787e-06,
"logits/chosen": 0.18663232028484344,
"logits/rejected": 0.07420190423727036,
"logps/chosen": -999.5822143554688,
"logps/rejected": -4148.65234375,
"loss": -63.8503,
"rewards/accuracies": 0.78125,
"rewards/chosen": -205.96835327148438,
"rewards/margins": 971.0234375,
"rewards/rejected": -1176.9918212890625,
"step": 1565
},
{
"epoch": 1.3955555555555557,
"grad_norm": 5635.2412109375,
"learning_rate": 5.558168132172195e-06,
"logits/chosen": 0.13406811654567719,
"logits/rejected": 0.014459284953773022,
"logps/chosen": -930.2107543945312,
"logps/rejected": -2864.10986328125,
"loss": -40.1761,
"rewards/accuracies": 0.78125,
"rewards/chosen": -183.5532989501953,
"rewards/margins": 603.8217163085938,
"rewards/rejected": -787.3750610351562,
"step": 1570
},
{
"epoch": 1.4,
"grad_norm": 752.8253173828125,
"learning_rate": 5.535002242314772e-06,
"logits/chosen": 0.1415996253490448,
"logits/rejected": 0.05800303816795349,
"logps/chosen": -1115.41796875,
"logps/rejected": -4055.927001953125,
"loss": -56.2453,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -241.4073028564453,
"rewards/margins": 909.2194213867188,
"rewards/rejected": -1150.626708984375,
"step": 1575
},
{
"epoch": 1.4044444444444444,
"grad_norm": 204.28115844726562,
"learning_rate": 5.511824729014936e-06,
"logits/chosen": 0.150762677192688,
"logits/rejected": 0.14514730870723724,
"logps/chosen": -741.7395629882812,
"logps/rejected": -4206.9638671875,
"loss": -117.9318,
"rewards/accuracies": 0.78125,
"rewards/chosen": -126.77288818359375,
"rewards/margins": 1062.399169921875,
"rewards/rejected": -1189.172119140625,
"step": 1580
},
{
"epoch": 1.4088888888888889,
"grad_norm": 6281.19091796875,
"learning_rate": 5.488636095826636e-06,
"logits/chosen": 0.15948662161827087,
"logits/rejected": 0.0816819816827774,
"logps/chosen": -1141.554931640625,
"logps/rejected": -4589.34765625,
"loss": -115.1681,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -243.4787139892578,
"rewards/margins": 1065.435302734375,
"rewards/rejected": -1308.9140625,
"step": 1585
},
{
"epoch": 1.4133333333333333,
"grad_norm": 600.7849731445312,
"learning_rate": 5.465436846545407e-06,
"logits/chosen": 0.1765061318874359,
"logits/rejected": 0.030613476410508156,
"logps/chosen": -1068.354736328125,
"logps/rejected": -3554.58154296875,
"loss": -44.7145,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -224.12246704101562,
"rewards/margins": 772.4940185546875,
"rewards/rejected": -996.6165771484375,
"step": 1590
},
{
"epoch": 1.4177777777777778,
"grad_norm": 657.72021484375,
"learning_rate": 5.4422274851974356e-06,
"logits/chosen": 0.18472027778625488,
"logits/rejected": 0.11286415904760361,
"logps/chosen": -865.9571533203125,
"logps/rejected": -4187.04052734375,
"loss": -97.4628,
"rewards/accuracies": 0.78125,
"rewards/chosen": -167.65736389160156,
"rewards/margins": 1020.7579956054688,
"rewards/rejected": -1188.4154052734375,
"step": 1595
},
{
"epoch": 1.4222222222222223,
"grad_norm": 668.9474487304688,
"learning_rate": 5.419008516028597e-06,
"logits/chosen": 0.1573035717010498,
"logits/rejected": 0.04901648312807083,
"logps/chosen": -925.8907470703125,
"logps/rejected": -3878.40380859375,
"loss": -61.9469,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -186.1890869140625,
"rewards/margins": 912.0675048828125,
"rewards/rejected": -1098.256591796875,
"step": 1600
},
{
"epoch": 1.4266666666666667,
"grad_norm": 3902.2021484375,
"learning_rate": 5.395780443493508e-06,
"logits/chosen": 0.16088075935840607,
"logits/rejected": 0.10695306956768036,
"logps/chosen": -1087.218994140625,
"logps/rejected": -4154.6220703125,
"loss": -104.1147,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -228.36355590820312,
"rewards/margins": 948.61474609375,
"rewards/rejected": -1176.978271484375,
"step": 1605
},
{
"epoch": 1.431111111111111,
"grad_norm": 2289.0302734375,
"learning_rate": 5.372543772244566e-06,
"logits/chosen": 0.17162616550922394,
"logits/rejected": 0.07870938628911972,
"logps/chosen": -1285.9188232421875,
"logps/rejected": -4683.6875,
"loss": -128.7691,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -292.28375244140625,
"rewards/margins": 1045.0806884765625,
"rewards/rejected": -1337.3646240234375,
"step": 1610
},
{
"epoch": 1.4355555555555555,
"grad_norm": 3771.407470703125,
"learning_rate": 5.34929900712098e-06,
"logits/chosen": 0.1444883942604065,
"logits/rejected": 0.07830196619033813,
"logps/chosen": -1254.024169921875,
"logps/rejected": -5026.85693359375,
"loss": -123.4905,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -284.7939147949219,
"rewards/margins": 1156.435302734375,
"rewards/rejected": -1441.229248046875,
"step": 1615
},
{
"epoch": 1.44,
"grad_norm": 4702.58544921875,
"learning_rate": 5.326046653137811e-06,
"logits/chosen": 0.13368460536003113,
"logits/rejected": 0.12430386245250702,
"logps/chosen": -1176.692138671875,
"logps/rejected": -5584.7294921875,
"loss": -166.6501,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -262.0100402832031,
"rewards/margins": 1345.173583984375,
"rewards/rejected": -1607.18359375,
"step": 1620
},
{
"epoch": 1.4444444444444444,
"grad_norm": 569.5634155273438,
"learning_rate": 5.302787215474992e-06,
"logits/chosen": 0.15046411752700806,
"logits/rejected": 0.104924775660038,
"logps/chosen": -986.0426635742188,
"logps/rejected": -5192.7490234375,
"loss": -136.9891,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -200.11141967773438,
"rewards/margins": 1287.0059814453125,
"rewards/rejected": -1487.117431640625,
"step": 1625
},
{
"epoch": 1.448888888888889,
"grad_norm": 27159.068359375,
"learning_rate": 5.279521199466356e-06,
"logits/chosen": 0.1597212851047516,
"logits/rejected": 0.0540970079600811,
"logps/chosen": -1303.693115234375,
"logps/rejected": -3794.32958984375,
"loss": -35.0208,
"rewards/accuracies": 0.75,
"rewards/chosen": -298.74774169921875,
"rewards/margins": 771.4027709960938,
"rewards/rejected": -1070.150634765625,
"step": 1630
},
{
"epoch": 1.4533333333333334,
"grad_norm": 2882.916015625,
"learning_rate": 5.256249110588659e-06,
"logits/chosen": 0.18333426117897034,
"logits/rejected": 0.11001193523406982,
"logps/chosen": -938.8673706054688,
"logps/rejected": -4647.5478515625,
"loss": -113.7993,
"rewards/accuracies": 0.75,
"rewards/chosen": -190.62075805664062,
"rewards/margins": 1137.749267578125,
"rewards/rejected": -1328.3699951171875,
"step": 1635
},
{
"epoch": 1.4577777777777778,
"grad_norm": 1122.654052734375,
"learning_rate": 5.232971454450595e-06,
"logits/chosen": 0.18779829144477844,
"logits/rejected": 0.10752624273300171,
"logps/chosen": -1198.038330078125,
"logps/rejected": -4063.475341796875,
"loss": -76.5747,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -266.9295959472656,
"rewards/margins": 885.2711181640625,
"rewards/rejected": -1152.20068359375,
"step": 1640
},
{
"epoch": 1.462222222222222,
"grad_norm": 26724.4765625,
"learning_rate": 5.209688736781811e-06,
"logits/chosen": 0.2143486738204956,
"logits/rejected": 0.08180849254131317,
"logps/chosen": -982.2023315429688,
"logps/rejected": -5008.22705078125,
"loss": -132.0043,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -193.89479064941406,
"rewards/margins": 1239.5858154296875,
"rewards/rejected": -1433.480712890625,
"step": 1645
},
{
"epoch": 1.4666666666666668,
"grad_norm": 871.9409790039062,
"learning_rate": 5.1864014634219214e-06,
"logits/chosen": 0.19577138125896454,
"logits/rejected": 0.07801645994186401,
"logps/chosen": -1281.193115234375,
"logps/rejected": -5583.1650390625,
"loss": -93.5913,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -290.54290771484375,
"rewards/margins": 1318.56298828125,
"rewards/rejected": -1609.1058349609375,
"step": 1650
},
{
"epoch": 1.471111111111111,
"grad_norm": 633.2246704101562,
"learning_rate": 5.163110140309518e-06,
"logits/chosen": 0.16473805904388428,
"logits/rejected": 0.06993107497692108,
"logps/chosen": -1357.2281494140625,
"logps/rejected": -4072.783935546875,
"loss": 21.3237,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -320.8450622558594,
"rewards/margins": 837.5509643554688,
"rewards/rejected": -1158.3958740234375,
"step": 1655
},
{
"epoch": 1.4755555555555555,
"grad_norm": 533.0902709960938,
"learning_rate": 5.139815273471177e-06,
"logits/chosen": 0.18089079856872559,
"logits/rejected": 0.09585729986429214,
"logps/chosen": -841.4392700195312,
"logps/rejected": -4242.81884765625,
"loss": -109.7265,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -155.72561645507812,
"rewards/margins": 1048.8828125,
"rewards/rejected": -1204.6085205078125,
"step": 1660
},
{
"epoch": 1.48,
"grad_norm": 52428.46484375,
"learning_rate": 5.116517369010467e-06,
"logits/chosen": 0.24841204285621643,
"logits/rejected": 0.09177226573228836,
"logps/chosen": -1188.5052490234375,
"logps/rejected": -3917.296875,
"loss": 13.6613,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -261.0191345214844,
"rewards/margins": 848.3692626953125,
"rewards/rejected": -1109.388427734375,
"step": 1665
},
{
"epoch": 1.4844444444444445,
"grad_norm": 7470.201171875,
"learning_rate": 5.0932169330969464e-06,
"logits/chosen": 0.18534071743488312,
"logits/rejected": 0.12083166837692261,
"logps/chosen": -1179.9267578125,
"logps/rejected": -5664.2998046875,
"loss": -87.2841,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -264.7419128417969,
"rewards/margins": 1368.9412841796875,
"rewards/rejected": -1633.6832275390625,
"step": 1670
},
{
"epoch": 1.488888888888889,
"grad_norm": 20524.2578125,
"learning_rate": 5.069914471955179e-06,
"logits/chosen": 0.20060646533966064,
"logits/rejected": 0.10489257425069809,
"logps/chosen": -981.4915161132812,
"logps/rejected": -3824.27685546875,
"loss": -37.233,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -197.96775817871094,
"rewards/margins": 878.4957275390625,
"rewards/rejected": -1076.463623046875,
"step": 1675
},
{
"epoch": 1.4933333333333334,
"grad_norm": 2312.172119140625,
"learning_rate": 5.046610491853724e-06,
"logits/chosen": 0.20951858162879944,
"logits/rejected": 0.07898414134979248,
"logps/chosen": -573.9847412109375,
"logps/rejected": -3592.535888671875,
"loss": -109.885,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -77.85977172851562,
"rewards/margins": 932.3436279296875,
"rewards/rejected": -1010.2033081054688,
"step": 1680
},
{
"epoch": 1.4977777777777779,
"grad_norm": 1623.2857666015625,
"learning_rate": 5.023305499094145e-06,
"logits/chosen": 0.22633978724479675,
"logits/rejected": 0.08304329216480255,
"logps/chosen": -1290.774169921875,
"logps/rejected": -4456.76416015625,
"loss": -107.5082,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -294.6800537109375,
"rewards/margins": 974.953125,
"rewards/rejected": -1269.6331787109375,
"step": 1685
},
{
"epoch": 1.5022222222222221,
"grad_norm": 1518.488525390625,
"learning_rate": 5e-06,
"logits/chosen": 0.16941127181053162,
"logits/rejected": 0.09130094945430756,
"logps/chosen": -761.7131958007812,
"logps/rejected": -4554.498046875,
"loss": -142.6045,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -138.38058471679688,
"rewards/margins": 1160.93408203125,
"rewards/rejected": -1299.314697265625,
"step": 1690
},
{
"epoch": 1.5066666666666668,
"grad_norm": 311.0038757324219,
"learning_rate": 4.976694500905858e-06,
"logits/chosen": 0.19227799773216248,
"logits/rejected": 0.09216197580099106,
"logps/chosen": -1680.769775390625,
"logps/rejected": -5286.17919921875,
"loss": -36.3603,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -417.07342529296875,
"rewards/margins": 1102.249755859375,
"rewards/rejected": -1519.323486328125,
"step": 1695
},
{
"epoch": 1.511111111111111,
"grad_norm": 341.06793212890625,
"learning_rate": 4.953389508146277e-06,
"logits/chosen": 0.22450792789459229,
"logits/rejected": 0.09371861070394516,
"logps/chosen": -1111.1561279296875,
"logps/rejected": -6192.85595703125,
"loss": -159.6742,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -240.66537475585938,
"rewards/margins": 1548.493896484375,
"rewards/rejected": -1789.159423828125,
"step": 1700
},
{
"epoch": 1.5155555555555555,
"grad_norm": 390.3556213378906,
"learning_rate": 4.930085528044823e-06,
"logits/chosen": 0.1802193522453308,
"logits/rejected": 0.07305122911930084,
"logps/chosen": -787.3092041015625,
"logps/rejected": -4876.71875,
"loss": -154.2313,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -143.25827026367188,
"rewards/margins": 1251.798583984375,
"rewards/rejected": -1395.056884765625,
"step": 1705
},
{
"epoch": 1.52,
"grad_norm": 2948.445068359375,
"learning_rate": 4.906783066903055e-06,
"logits/chosen": 0.2890220582485199,
"logits/rejected": 0.0888415202498436,
"logps/chosen": -826.3502197265625,
"logps/rejected": -5362.0126953125,
"loss": -154.5983,
"rewards/accuracies": 0.8125,
"rewards/chosen": -151.1717071533203,
"rewards/margins": 1392.552001953125,
"rewards/rejected": -1543.7237548828125,
"step": 1710
},
{
"epoch": 1.5244444444444445,
"grad_norm": 2725.235107421875,
"learning_rate": 4.883482630989536e-06,
"logits/chosen": 0.24592271447181702,
"logits/rejected": 0.12680990993976593,
"logps/chosen": -1053.497802734375,
"logps/rejected": -4971.4873046875,
"loss": -44.4038,
"rewards/accuracies": 0.84375,
"rewards/chosen": -220.80410766601562,
"rewards/margins": 1203.4788818359375,
"rewards/rejected": -1424.282958984375,
"step": 1715
},
{
"epoch": 1.528888888888889,
"grad_norm": 2223.574951171875,
"learning_rate": 4.860184726528824e-06,
"logits/chosen": 0.22868971526622772,
"logits/rejected": 0.0975094586610794,
"logps/chosen": -1032.4659423828125,
"logps/rejected": -4410.2880859375,
"loss": -112.415,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -218.31973266601562,
"rewards/margins": 1034.145263671875,
"rewards/rejected": -1252.46484375,
"step": 1720
},
{
"epoch": 1.5333333333333332,
"grad_norm": 3770.7490234375,
"learning_rate": 4.8368898596904834e-06,
"logits/chosen": 0.1960953325033188,
"logits/rejected": 0.10902541875839233,
"logps/chosen": -1435.333251953125,
"logps/rejected": -5620.6708984375,
"loss": -76.7429,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -335.00872802734375,
"rewards/margins": 1281.798583984375,
"rewards/rejected": -1616.8072509765625,
"step": 1725
},
{
"epoch": 1.537777777777778,
"grad_norm": 184.910400390625,
"learning_rate": 4.81359853657808e-06,
"logits/chosen": 0.23400822281837463,
"logits/rejected": 0.0940646380186081,
"logps/chosen": -829.26171875,
"logps/rejected": -3815.18408203125,
"loss": -76.5698,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -158.69537353515625,
"rewards/margins": 921.3673706054688,
"rewards/rejected": -1080.062744140625,
"step": 1730
},
{
"epoch": 1.5422222222222222,
"grad_norm": 1886.403076171875,
"learning_rate": 4.790311263218191e-06,
"logits/chosen": 0.21838542819023132,
"logits/rejected": 0.10093291848897934,
"logps/chosen": -858.345703125,
"logps/rejected": -4288.09326171875,
"loss": -129.9072,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -168.2855682373047,
"rewards/margins": 1054.7025146484375,
"rewards/rejected": -1222.988037109375,
"step": 1735
},
{
"epoch": 1.5466666666666666,
"grad_norm": 520.177978515625,
"learning_rate": 4.767028545549407e-06,
"logits/chosen": 0.18090055882930756,
"logits/rejected": 0.09364360570907593,
"logps/chosen": -1115.906005859375,
"logps/rejected": -4257.9716796875,
"loss": -91.7768,
"rewards/accuracies": 0.78125,
"rewards/chosen": -241.649658203125,
"rewards/margins": 967.1146240234375,
"rewards/rejected": -1208.7642822265625,
"step": 1740
},
{
"epoch": 1.551111111111111,
"grad_norm": 2584.12890625,
"learning_rate": 4.743750889411342e-06,
"logits/chosen": 0.2136339694261551,
"logits/rejected": 0.08267398178577423,
"logps/chosen": -702.8377685546875,
"logps/rejected": -4060.947265625,
"loss": -115.7741,
"rewards/accuracies": 0.8125,
"rewards/chosen": -118.99308776855469,
"rewards/margins": 1030.06689453125,
"rewards/rejected": -1149.0599365234375,
"step": 1745
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2166.552978515625,
"learning_rate": 4.720478800533647e-06,
"logits/chosen": 0.24663996696472168,
"logits/rejected": 0.1067671999335289,
"logps/chosen": -1155.109375,
"logps/rejected": -4144.953125,
"loss": -89.86,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -249.94668579101562,
"rewards/margins": 923.3084106445312,
"rewards/rejected": -1173.255126953125,
"step": 1750
},
{
"epoch": 1.56,
"grad_norm": 3155.211669921875,
"learning_rate": 4.697212784525009e-06,
"logits/chosen": 0.2132018506526947,
"logits/rejected": 0.06178309768438339,
"logps/chosen": -685.2828369140625,
"logps/rejected": -3697.40771484375,
"loss": -107.7679,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -112.0811538696289,
"rewards/margins": 929.0880737304688,
"rewards/rejected": -1041.1693115234375,
"step": 1755
},
{
"epoch": 1.5644444444444443,
"grad_norm": 4135.490234375,
"learning_rate": 4.673953346862189e-06,
"logits/chosen": 0.173181414604187,
"logits/rejected": 0.11724446713924408,
"logps/chosen": -847.1492309570312,
"logps/rejected": -4463.97900390625,
"loss": -133.5609,
"rewards/accuracies": 0.78125,
"rewards/chosen": -164.16900634765625,
"rewards/margins": 1107.0006103515625,
"rewards/rejected": -1271.169677734375,
"step": 1760
},
{
"epoch": 1.568888888888889,
"grad_norm": 4123.27490234375,
"learning_rate": 4.65070099287902e-06,
"logits/chosen": 0.19095957279205322,
"logits/rejected": 0.09138090908527374,
"logps/chosen": -1100.893798828125,
"logps/rejected": -5407.83447265625,
"loss": -141.662,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -233.89419555664062,
"rewards/margins": 1317.0438232421875,
"rewards/rejected": -1550.93798828125,
"step": 1765
},
{
"epoch": 1.5733333333333333,
"grad_norm": 1560.21240234375,
"learning_rate": 4.627456227755435e-06,
"logits/chosen": 0.1787518560886383,
"logits/rejected": 0.035512715578079224,
"logps/chosen": -1368.987060546875,
"logps/rejected": -4401.994140625,
"loss": -1.5697,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -312.0104064941406,
"rewards/margins": 941.6595458984375,
"rewards/rejected": -1253.669921875,
"step": 1770
},
{
"epoch": 1.5777777777777777,
"grad_norm": 32934.18359375,
"learning_rate": 4.604219556506492e-06,
"logits/chosen": 0.2877276539802551,
"logits/rejected": 0.10175907611846924,
"logps/chosen": -1241.767333984375,
"logps/rejected": -5513.44580078125,
"loss": -76.6882,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -280.1549377441406,
"rewards/margins": 1308.2108154296875,
"rewards/rejected": -1588.3658447265625,
"step": 1775
},
{
"epoch": 1.5822222222222222,
"grad_norm": 2151.84814453125,
"learning_rate": 4.580991483971403e-06,
"logits/chosen": 0.1933944970369339,
"logits/rejected": 0.06155434995889664,
"logps/chosen": -987.50244140625,
"logps/rejected": -4560.982421875,
"loss": -105.6609,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -201.29513549804688,
"rewards/margins": 1100.52783203125,
"rewards/rejected": -1301.822998046875,
"step": 1780
},
{
"epoch": 1.5866666666666667,
"grad_norm": 1171.986083984375,
"learning_rate": 4.557772514802564e-06,
"logits/chosen": 0.2200632095336914,
"logits/rejected": 0.12937499582767487,
"logps/chosen": -743.1904907226562,
"logps/rejected": -5043.15576171875,
"loss": -153.4002,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -131.7304229736328,
"rewards/margins": 1311.0921630859375,
"rewards/rejected": -1442.822509765625,
"step": 1785
},
{
"epoch": 1.5911111111111111,
"grad_norm": 1308.8167724609375,
"learning_rate": 4.5345631534545935e-06,
"logits/chosen": 0.29665079712867737,
"logits/rejected": 0.10534970462322235,
"logps/chosen": -745.3150634765625,
"logps/rejected": -5404.919921875,
"loss": -168.1946,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -128.2810516357422,
"rewards/margins": 1427.3328857421875,
"rewards/rejected": -1555.6141357421875,
"step": 1790
},
{
"epoch": 1.5955555555555554,
"grad_norm": 1778.48779296875,
"learning_rate": 4.511363904173366e-06,
"logits/chosen": 0.23913387954235077,
"logits/rejected": 0.07443296164274216,
"logps/chosen": -1030.5181884765625,
"logps/rejected": -5341.0888671875,
"loss": -146.5949,
"rewards/accuracies": 0.8125,
"rewards/chosen": -212.07064819335938,
"rewards/margins": 1324.4898681640625,
"rewards/rejected": -1536.560546875,
"step": 1795
},
{
"epoch": 1.6,
"grad_norm": 8834.0068359375,
"learning_rate": 4.488175270985065e-06,
"logits/chosen": 0.1603962481021881,
"logits/rejected": 0.030650785192847252,
"logps/chosen": -1292.248291015625,
"logps/rejected": -4622.3115234375,
"loss": -96.2908,
"rewards/accuracies": 0.75,
"rewards/chosen": -296.3851318359375,
"rewards/margins": 1023.2731323242188,
"rewards/rejected": -1319.6583251953125,
"step": 1800
},
{
"epoch": 1.6044444444444443,
"grad_norm": 3213.7578125,
"learning_rate": 4.46499775768523e-06,
"logits/chosen": 0.2049437314271927,
"logits/rejected": 0.09727514535188675,
"logps/chosen": -1178.759521484375,
"logps/rejected": -5381.6337890625,
"loss": -151.4572,
"rewards/accuracies": 0.78125,
"rewards/chosen": -260.3944396972656,
"rewards/margins": 1285.650634765625,
"rewards/rejected": -1546.045166015625,
"step": 1805
},
{
"epoch": 1.608888888888889,
"grad_norm": 1180.321044921875,
"learning_rate": 4.441831867827806e-06,
"logits/chosen": 0.19563202559947968,
"logits/rejected": 0.028948839753866196,
"logps/chosen": -1116.917236328125,
"logps/rejected": -4179.2021484375,
"loss": -89.8656,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -243.24179077148438,
"rewards/margins": 945.7950439453125,
"rewards/rejected": -1189.036865234375,
"step": 1810
},
{
"epoch": 1.6133333333333333,
"grad_norm": 32007.529296875,
"learning_rate": 4.418678104714214e-06,
"logits/chosen": 0.18151655793190002,
"logits/rejected": 0.10459226369857788,
"logps/chosen": -1238.7088623046875,
"logps/rejected": -5372.6796875,
"loss": -36.7473,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -275.9144592285156,
"rewards/margins": 1263.199951171875,
"rewards/rejected": -1539.114501953125,
"step": 1815
},
{
"epoch": 1.6177777777777778,
"grad_norm": 4922.98291015625,
"learning_rate": 4.395536971382403e-06,
"logits/chosen": 0.25195056200027466,
"logits/rejected": 0.06397799402475357,
"logps/chosen": -614.5676879882812,
"logps/rejected": -4044.288330078125,
"loss": -129.0167,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -88.91123962402344,
"rewards/margins": 1059.1080322265625,
"rewards/rejected": -1148.0191650390625,
"step": 1820
},
{
"epoch": 1.6222222222222222,
"grad_norm": 18587.265625,
"learning_rate": 4.372408970595931e-06,
"logits/chosen": 0.2288244515657425,
"logits/rejected": 0.12830862402915955,
"logps/chosen": -1080.5850830078125,
"logps/rejected": -4167.451171875,
"loss": -24.3531,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -233.8815460205078,
"rewards/margins": 948.3587036132812,
"rewards/rejected": -1182.2403564453125,
"step": 1825
},
{
"epoch": 1.6266666666666667,
"grad_norm": 1632.0062255859375,
"learning_rate": 4.349294604833037e-06,
"logits/chosen": 0.19971409440040588,
"logits/rejected": 0.09651723504066467,
"logps/chosen": -903.0916748046875,
"logps/rejected": -5288.95068359375,
"loss": -165.1823,
"rewards/accuracies": 0.84375,
"rewards/chosen": -180.99258422851562,
"rewards/margins": 1341.563232421875,
"rewards/rejected": -1522.5560302734375,
"step": 1830
},
{
"epoch": 1.6311111111111112,
"grad_norm": 3647.180908203125,
"learning_rate": 4.326194376275729e-06,
"logits/chosen": 0.20333686470985413,
"logits/rejected": 0.04894101247191429,
"logps/chosen": -1200.640380859375,
"logps/rejected": -4328.55810546875,
"loss": -15.3313,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -267.91546630859375,
"rewards/margins": 959.5001220703125,
"rewards/rejected": -1227.4156494140625,
"step": 1835
},
{
"epoch": 1.6355555555555554,
"grad_norm": 369.36090087890625,
"learning_rate": 4.303108786798869e-06,
"logits/chosen": 0.23952741920948029,
"logits/rejected": 0.11030056327581406,
"logps/chosen": -868.2476806640625,
"logps/rejected": -6594.4208984375,
"loss": -203.4027,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -168.63491821289062,
"rewards/margins": 1745.8909912109375,
"rewards/rejected": -1914.5257568359375,
"step": 1840
},
{
"epoch": 1.6400000000000001,
"grad_norm": 840.448974609375,
"learning_rate": 4.280038337959268e-06,
"logits/chosen": 0.22680750489234924,
"logits/rejected": 0.04391016438603401,
"logps/chosen": -1048.15625,
"logps/rejected": -3534.797607421875,
"loss": -36.2203,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -226.58468627929688,
"rewards/margins": 769.23974609375,
"rewards/rejected": -995.8245239257812,
"step": 1845
},
{
"epoch": 1.6444444444444444,
"grad_norm": 2940.37646484375,
"learning_rate": 4.256983530984797e-06,
"logits/chosen": 0.2374953329563141,
"logits/rejected": 0.07543542981147766,
"logps/chosen": -1364.133056640625,
"logps/rejected": -4755.900390625,
"loss": -75.3059,
"rewards/accuracies": 0.78125,
"rewards/chosen": -317.7399597167969,
"rewards/margins": 1042.538818359375,
"rewards/rejected": -1360.2789306640625,
"step": 1850
},
{
"epoch": 1.6488888888888888,
"grad_norm": 4782.626953125,
"learning_rate": 4.2339448667634885e-06,
"logits/chosen": 0.21303322911262512,
"logits/rejected": 0.08738868683576584,
"logps/chosen": -2001.385498046875,
"logps/rejected": -4847.5869140625,
"loss": -8.6322,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -505.5797424316406,
"rewards/margins": 881.3775634765625,
"rewards/rejected": -1386.957275390625,
"step": 1855
},
{
"epoch": 1.6533333333333333,
"grad_norm": 30556.8828125,
"learning_rate": 4.21092284583266e-06,
"logits/chosen": 0.24975800514221191,
"logits/rejected": 0.08625680953264236,
"logps/chosen": -1471.2796630859375,
"logps/rejected": -5477.4912109375,
"loss": -56.1464,
"rewards/accuracies": 0.8125,
"rewards/chosen": -351.5309753417969,
"rewards/margins": 1227.842529296875,
"rewards/rejected": -1579.3734130859375,
"step": 1860
},
{
"epoch": 1.6577777777777778,
"grad_norm": 9967.26953125,
"learning_rate": 4.187917968368036e-06,
"logits/chosen": 0.24700018763542175,
"logits/rejected": 0.14024746417999268,
"logps/chosen": -1792.168701171875,
"logps/rejected": -6281.21484375,
"loss": -94.6902,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -446.555419921875,
"rewards/margins": 1371.84716796875,
"rewards/rejected": -1818.4027099609375,
"step": 1865
},
{
"epoch": 1.6622222222222223,
"grad_norm": 5255.771484375,
"learning_rate": 4.164930734172884e-06,
"logits/chosen": 0.2706100344657898,
"logits/rejected": 0.10564364492893219,
"logps/chosen": -855.5506591796875,
"logps/rejected": -5278.5625,
"loss": -168.4971,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -161.1185760498047,
"rewards/margins": 1357.52880859375,
"rewards/rejected": -1518.647216796875,
"step": 1870
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1137.33056640625,
"learning_rate": 4.141961642667152e-06,
"logits/chosen": 0.22544869780540466,
"logits/rejected": 0.1046823039650917,
"logps/chosen": -853.8209228515625,
"logps/rejected": -4098.5380859375,
"loss": -98.7491,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -156.70986938476562,
"rewards/margins": 1000.6829223632812,
"rewards/rejected": -1157.392822265625,
"step": 1875
},
{
"epoch": 1.6711111111111112,
"grad_norm": 3251.376708984375,
"learning_rate": 4.119011192876624e-06,
"logits/chosen": 0.2323744297027588,
"logits/rejected": 0.11708340793848038,
"logps/chosen": -1144.6881103515625,
"logps/rejected": -5402.4853515625,
"loss": -121.1345,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -251.96133422851562,
"rewards/margins": 1302.1949462890625,
"rewards/rejected": -1554.1563720703125,
"step": 1880
},
{
"epoch": 1.6755555555555555,
"grad_norm": 3250.385986328125,
"learning_rate": 4.09607988342207e-06,
"logits/chosen": 0.2249506413936615,
"logits/rejected": 0.08312972635030746,
"logps/chosen": -1434.54541015625,
"logps/rejected": -6453.66259765625,
"loss": -130.9487,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -339.8567810058594,
"rewards/margins": 1531.913818359375,
"rewards/rejected": -1871.770751953125,
"step": 1885
},
{
"epoch": 1.6800000000000002,
"grad_norm": 2996.223388671875,
"learning_rate": 4.0731682125084244e-06,
"logits/chosen": 0.2461550533771515,
"logits/rejected": 0.11112775653600693,
"logps/chosen": -1187.622802734375,
"logps/rejected": -5081.6640625,
"loss": -67.5338,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -262.11505126953125,
"rewards/margins": 1196.88916015625,
"rewards/rejected": -1459.004150390625,
"step": 1890
},
{
"epoch": 1.6844444444444444,
"grad_norm": 788.92431640625,
"learning_rate": 4.0502766779139485e-06,
"logits/chosen": 0.2763102948665619,
"logits/rejected": 0.1312466859817505,
"logps/chosen": -644.9693603515625,
"logps/rejected": -4451.1240234375,
"loss": -144.7744,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -97.20635986328125,
"rewards/margins": 1170.815185546875,
"rewards/rejected": -1268.0216064453125,
"step": 1895
},
{
"epoch": 1.6888888888888889,
"grad_norm": 24277.05859375,
"learning_rate": 4.027405776979426e-06,
"logits/chosen": 0.2461940348148346,
"logits/rejected": 0.10517482459545135,
"logps/chosen": -1614.966064453125,
"logps/rejected": -5885.6884765625,
"loss": -23.2786,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -385.0234375,
"rewards/margins": 1312.0137939453125,
"rewards/rejected": -1697.037109375,
"step": 1900
},
{
"epoch": 1.6933333333333334,
"grad_norm": 7921.02783203125,
"learning_rate": 4.0045560065973535e-06,
"logits/chosen": 0.24439683556556702,
"logits/rejected": 0.137715682387352,
"logps/chosen": -907.2028198242188,
"logps/rejected": -6215.12890625,
"loss": -171.6943,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -180.00177001953125,
"rewards/margins": 1617.5052490234375,
"rewards/rejected": -1797.507080078125,
"step": 1905
},
{
"epoch": 1.6977777777777778,
"grad_norm": 500.7940979003906,
"learning_rate": 3.981727863201146e-06,
"logits/chosen": 0.23465153574943542,
"logits/rejected": 0.04516502842307091,
"logps/chosen": -952.7020263671875,
"logps/rejected": -4535.16357421875,
"loss": -93.9728,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -186.6133270263672,
"rewards/margins": 1108.33837890625,
"rewards/rejected": -1294.95166015625,
"step": 1910
},
{
"epoch": 1.7022222222222223,
"grad_norm": 9174.0546875,
"learning_rate": 3.958921842754351e-06,
"logits/chosen": 0.27381208539009094,
"logits/rejected": 0.11198027431964874,
"logps/chosen": -1263.0186767578125,
"logps/rejected": -5507.2158203125,
"loss": -149.7667,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -291.18353271484375,
"rewards/margins": 1298.3118896484375,
"rewards/rejected": -1589.495361328125,
"step": 1915
},
{
"epoch": 1.7066666666666666,
"grad_norm": 1700.24072265625,
"learning_rate": 3.936138440739875e-06,
"logits/chosen": 0.27449166774749756,
"logits/rejected": 0.12847240269184113,
"logps/chosen": -1872.1488037109375,
"logps/rejected": -5714.8662109375,
"loss": -67.8528,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -469.8207092285156,
"rewards/margins": 1178.263916015625,
"rewards/rejected": -1648.0845947265625,
"step": 1920
},
{
"epoch": 1.7111111111111112,
"grad_norm": 700.5916137695312,
"learning_rate": 3.913378152149214e-06,
"logits/chosen": 0.24738919734954834,
"logits/rejected": 0.1326262354850769,
"logps/chosen": -1318.026611328125,
"logps/rejected": -6299.62890625,
"loss": -160.6032,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -301.7664489746094,
"rewards/margins": 1518.3792724609375,
"rewards/rejected": -1820.145751953125,
"step": 1925
},
{
"epoch": 1.7155555555555555,
"grad_norm": 2220.398193359375,
"learning_rate": 3.890641471471706e-06,
"logits/chosen": 0.3000715374946594,
"logits/rejected": 0.1240081787109375,
"logps/chosen": -685.4688110351562,
"logps/rejected": -4482.275390625,
"loss": -130.5446,
"rewards/accuracies": 0.8125,
"rewards/chosen": -113.60041809082031,
"rewards/margins": 1168.1650390625,
"rewards/rejected": -1281.765380859375,
"step": 1930
},
{
"epoch": 1.72,
"grad_norm": 2720.75634765625,
"learning_rate": 3.86792889268378e-06,
"logits/chosen": 0.2833808958530426,
"logits/rejected": 0.1499921977519989,
"logps/chosen": -1256.76953125,
"logps/rejected": -5304.7353515625,
"loss": -38.7009,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -288.4460144042969,
"rewards/margins": 1236.643310546875,
"rewards/rejected": -1525.089599609375,
"step": 1935
},
{
"epoch": 1.7244444444444444,
"grad_norm": 630.7939453125,
"learning_rate": 3.845240909238234e-06,
"logits/chosen": 0.25388604402542114,
"logits/rejected": 0.11848431825637817,
"logps/chosen": -774.7329711914062,
"logps/rejected": -5794.5478515625,
"loss": -183.0251,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -141.75735473632812,
"rewards/margins": 1530.6614990234375,
"rewards/rejected": -1672.418701171875,
"step": 1940
},
{
"epoch": 1.728888888888889,
"grad_norm": 2211.608154296875,
"learning_rate": 3.8225780140535025e-06,
"logits/chosen": 0.2086627185344696,
"logits/rejected": 0.11551016569137573,
"logps/chosen": -911.6271362304688,
"logps/rejected": -5228.11181640625,
"loss": -157.3048,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -186.49317932128906,
"rewards/margins": 1314.8564453125,
"rewards/rejected": -1501.349609375,
"step": 1945
},
{
"epoch": 1.7333333333333334,
"grad_norm": 2765.096435546875,
"learning_rate": 3.7999406995029565e-06,
"logits/chosen": 0.20155318081378937,
"logits/rejected": 0.07800821959972382,
"logps/chosen": -1026.3896484375,
"logps/rejected": -5046.630859375,
"loss": -112.5577,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -213.7878875732422,
"rewards/margins": 1228.40625,
"rewards/rejected": -1442.1942138671875,
"step": 1950
},
{
"epoch": 1.7377777777777776,
"grad_norm": 2560.62939453125,
"learning_rate": 3.777329457404202e-06,
"logits/chosen": 0.25891679525375366,
"logits/rejected": 0.13444481790065765,
"logps/chosen": -1175.992919921875,
"logps/rejected": -7102.63818359375,
"loss": -199.6698,
"rewards/accuracies": 0.78125,
"rewards/chosen": -255.4857177734375,
"rewards/margins": 1806.615966796875,
"rewards/rejected": -2062.101806640625,
"step": 1955
},
{
"epoch": 1.7422222222222223,
"grad_norm": 707.2987670898438,
"learning_rate": 3.754744779008395e-06,
"logits/chosen": 0.22196459770202637,
"logits/rejected": 0.08021806925535202,
"logps/chosen": -1127.932861328125,
"logps/rejected": -4943.876953125,
"loss": -100.5632,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -240.08834838867188,
"rewards/margins": 1175.7769775390625,
"rewards/rejected": -1415.865234375,
"step": 1960
},
{
"epoch": 1.7466666666666666,
"grad_norm": 946.744384765625,
"learning_rate": 3.7321871549895715e-06,
"logits/chosen": 0.27232426404953003,
"logits/rejected": 0.10154370963573456,
"logps/chosen": -945.1385498046875,
"logps/rejected": -4433.72900390625,
"loss": -113.8801,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -188.45480346679688,
"rewards/margins": 1073.018798828125,
"rewards/rejected": -1261.4736328125,
"step": 1965
},
{
"epoch": 1.751111111111111,
"grad_norm": 3697.505126953125,
"learning_rate": 3.709657075433982e-06,
"logits/chosen": 0.2271924763917923,
"logits/rejected": 0.04478111118078232,
"logps/chosen": -1110.8345947265625,
"logps/rejected": -4532.7265625,
"loss": -15.3714,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -241.8577423095703,
"rewards/margins": 1051.829833984375,
"rewards/rejected": -1293.687744140625,
"step": 1970
},
{
"epoch": 1.7555555555555555,
"grad_norm": 27715.974609375,
"learning_rate": 3.68715502982945e-06,
"logits/chosen": 0.24351032078266144,
"logits/rejected": 0.14694446325302124,
"logps/chosen": -1400.335205078125,
"logps/rejected": -7460.48583984375,
"loss": -162.933,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -324.5793151855469,
"rewards/margins": 1843.2681884765625,
"rewards/rejected": -2167.847412109375,
"step": 1975
},
{
"epoch": 1.76,
"grad_norm": 286.2002868652344,
"learning_rate": 3.6646815070547316e-06,
"logits/chosen": 0.20532718300819397,
"logits/rejected": 0.06882862746715546,
"logps/chosen": -878.3626098632812,
"logps/rejected": -6145.6318359375,
"loss": -198.1174,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -170.4589080810547,
"rewards/margins": 1607.0462646484375,
"rewards/rejected": -1777.5052490234375,
"step": 1980
},
{
"epoch": 1.7644444444444445,
"grad_norm": 9633.6513671875,
"learning_rate": 3.6422369953688973e-06,
"logits/chosen": 0.15580406785011292,
"logits/rejected": 0.09763354808092117,
"logps/chosen": -1395.62158203125,
"logps/rejected": -5257.75634765625,
"loss": -126.3119,
"rewards/accuracies": 0.84375,
"rewards/chosen": -325.6045227050781,
"rewards/margins": 1179.4747314453125,
"rewards/rejected": -1505.0791015625,
"step": 1985
},
{
"epoch": 1.7688888888888887,
"grad_norm": 2226.966552734375,
"learning_rate": 3.619821982400725e-06,
"logits/chosen": 0.20389437675476074,
"logits/rejected": 0.0854041799902916,
"logps/chosen": -791.7081298828125,
"logps/rejected": -4680.9716796875,
"loss": -131.5356,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -140.87710571289062,
"rewards/margins": 1196.602783203125,
"rewards/rejected": -1337.47998046875,
"step": 1990
},
{
"epoch": 1.7733333333333334,
"grad_norm": 4158.79150390625,
"learning_rate": 3.5974369551381023e-06,
"logits/chosen": 0.24090011417865753,
"logits/rejected": 0.108218252658844,
"logps/chosen": -835.3377075195312,
"logps/rejected": -4290.2568359375,
"loss": -129.5949,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -161.09817504882812,
"rewards/margins": 1058.4774169921875,
"rewards/rejected": -1219.5758056640625,
"step": 1995
},
{
"epoch": 1.7777777777777777,
"grad_norm": 55221.4765625,
"learning_rate": 3.575082399917451e-06,
"logits/chosen": 0.2529537081718445,
"logits/rejected": 0.13592717051506042,
"logps/chosen": -1170.9078369140625,
"logps/rejected": -4635.1025390625,
"loss": -78.99,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -260.46441650390625,
"rewards/margins": 1063.17529296875,
"rewards/rejected": -1323.6397705078125,
"step": 2000
},
{
"epoch": 1.7777777777777777,
"eval_logits/chosen": 0.25555315613746643,
"eval_logits/rejected": 0.1154949739575386,
"eval_logps/chosen": -962.0708618164062,
"eval_logps/rejected": -5228.919921875,
"eval_loss": -118.59148406982422,
"eval_rewards/accuracies": 0.7950000166893005,
"eval_rewards/chosen": -195.61526489257812,
"eval_rewards/margins": 1306.0662841796875,
"eval_rewards/rejected": -1501.6815185546875,
"eval_runtime": 2189.3694,
"eval_samples_per_second": 1.827,
"eval_steps_per_second": 0.914,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 3375,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}