dpo_test_uhuy / trainer_state.json
bimabk's picture
Upload task output 281ac9fc-853b-477d-8e84-145f9030d732
3a5b7b9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3425393172228615,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019179133103183737,
"grad_norm": 1.7578125,
"learning_rate": 2.468474965491109e-07,
"logits/chosen": -6.4269118309021,
"logits/rejected": -6.423974514007568,
"logps/chosen": -56.09489059448242,
"logps/rejected": -55.90266799926758,
"loss": 0.6935,
"rewards/accuracies": 0.27812498807907104,
"rewards/chosen": -4.531387094175443e-05,
"rewards/margins": -0.0008066934533417225,
"rewards/rejected": 0.0007613796042278409,
"step": 5
},
{
"epoch": 0.0038358266206367474,
"grad_norm": 2.125,
"learning_rate": 5.554068672354994e-07,
"logits/chosen": -6.386446952819824,
"logits/rejected": -6.378155708312988,
"logps/chosen": -57.592018127441406,
"logps/rejected": -56.58235549926758,
"loss": 0.693,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0007413614657707512,
"rewards/margins": 0.0003508913214318454,
"rewards/rejected": 0.00039047005702741444,
"step": 10
},
{
"epoch": 0.005753739930955121,
"grad_norm": 1.921875,
"learning_rate": 8.63966237921888e-07,
"logits/chosen": -6.385983467102051,
"logits/rejected": -6.420374393463135,
"logps/chosen": -60.8471565246582,
"logps/rejected": -59.70293045043945,
"loss": 0.6933,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": -0.0009136783191934228,
"rewards/margins": -0.0002154188259737566,
"rewards/rejected": -0.0006982595077715814,
"step": 15
},
{
"epoch": 0.007671653241273495,
"grad_norm": 2.46875,
"learning_rate": 1.1725256086082765e-06,
"logits/chosen": -6.456655979156494,
"logits/rejected": -6.467028617858887,
"logps/chosen": -59.60247039794922,
"logps/rejected": -58.28697967529297,
"loss": 0.6934,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.0006150425178930163,
"rewards/margins": -0.0006757881492376328,
"rewards/rejected": 0.0012908302014693618,
"step": 20
},
{
"epoch": 0.009589566551591868,
"grad_norm": 1.7421875,
"learning_rate": 1.4810849792946651e-06,
"logits/chosen": -6.433564186096191,
"logits/rejected": -6.4307708740234375,
"logps/chosen": -56.00537872314453,
"logps/rejected": -55.16205978393555,
"loss": 0.6938,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.0005184346809983253,
"rewards/margins": -0.0015976792201399803,
"rewards/rejected": 0.001079244539141655,
"step": 25
},
{
"epoch": 0.011507479861910242,
"grad_norm": 1.9765625,
"learning_rate": 1.7896443499810537e-06,
"logits/chosen": -6.4431304931640625,
"logits/rejected": -6.4442572593688965,
"logps/chosen": -55.598548889160156,
"logps/rejected": -54.446998596191406,
"loss": 0.6934,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.00016300799325108528,
"rewards/margins": -0.0005609053187072277,
"rewards/rejected": 0.00039789750007912517,
"step": 30
},
{
"epoch": 0.013425393172228616,
"grad_norm": 2.203125,
"learning_rate": 2.0982037206674425e-06,
"logits/chosen": -6.4433274269104,
"logits/rejected": -6.450094699859619,
"logps/chosen": -59.896209716796875,
"logps/rejected": -57.89789581298828,
"loss": 0.6929,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": 0.0006533617270179093,
"rewards/margins": 0.0006716603529639542,
"rewards/rejected": -1.8298602299182676e-05,
"step": 35
},
{
"epoch": 0.01534330648254699,
"grad_norm": 2.03125,
"learning_rate": 2.4067630913538307e-06,
"logits/chosen": -6.476667881011963,
"logits/rejected": -6.468405723571777,
"logps/chosen": -58.06646728515625,
"logps/rejected": -56.530914306640625,
"loss": 0.6934,
"rewards/accuracies": 0.4468750059604645,
"rewards/chosen": -0.00031386903719976544,
"rewards/margins": -0.0005673372070305049,
"rewards/rejected": 0.00025346819893456995,
"step": 40
},
{
"epoch": 0.01726121979286536,
"grad_norm": 2.296875,
"learning_rate": 2.7153224620402197e-06,
"logits/chosen": -6.508776664733887,
"logits/rejected": -6.486769676208496,
"logps/chosen": -54.24138259887695,
"logps/rejected": -53.89955520629883,
"loss": 0.6928,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0002679133613128215,
"rewards/margins": 0.000910227361600846,
"rewards/rejected": -0.0006423138547688723,
"step": 45
},
{
"epoch": 0.019179133103183737,
"grad_norm": 2.46875,
"learning_rate": 3.023881832726608e-06,
"logits/chosen": -6.429900169372559,
"logits/rejected": -6.4368720054626465,
"logps/chosen": -59.0792121887207,
"logps/rejected": -57.9002571105957,
"loss": 0.6923,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.001494865515269339,
"rewards/margins": 0.0021004711743444204,
"rewards/rejected": -0.0006056058336980641,
"step": 50
},
{
"epoch": 0.02109704641350211,
"grad_norm": 1.796875,
"learning_rate": 3.332441203412997e-06,
"logits/chosen": -6.480206489562988,
"logits/rejected": -6.4737958908081055,
"logps/chosen": -57.52396774291992,
"logps/rejected": -57.11433029174805,
"loss": 0.6922,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.0009556323057040572,
"rewards/margins": 0.002387548331171274,
"rewards/rejected": -0.0014319162582978606,
"step": 55
},
{
"epoch": 0.023014959723820484,
"grad_norm": 2.25,
"learning_rate": 3.6410005740993855e-06,
"logits/chosen": -6.504572868347168,
"logits/rejected": -6.499182224273682,
"logps/chosen": -58.1120491027832,
"logps/rejected": -57.13053512573242,
"loss": 0.6911,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.001851649023592472,
"rewards/margins": 0.005305788479745388,
"rewards/rejected": -0.0034541389904916286,
"step": 60
},
{
"epoch": 0.024932873034138856,
"grad_norm": 2.0625,
"learning_rate": 3.9495599447857745e-06,
"logits/chosen": -6.491766452789307,
"logits/rejected": -6.455049991607666,
"logps/chosen": -57.039581298828125,
"logps/rejected": -55.7880859375,
"loss": 0.6908,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0029739458113908768,
"rewards/margins": 0.0059506590478122234,
"rewards/rejected": -0.0029767132364213467,
"step": 65
},
{
"epoch": 0.02685078634445723,
"grad_norm": 1.9453125,
"learning_rate": 4.258119315472162e-06,
"logits/chosen": -6.455752372741699,
"logits/rejected": -6.4692840576171875,
"logps/chosen": -53.70838165283203,
"logps/rejected": -54.29150390625,
"loss": 0.6904,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": 0.003416051622480154,
"rewards/margins": 0.006988453213125467,
"rewards/rejected": -0.003572401124984026,
"step": 70
},
{
"epoch": 0.028768699654775604,
"grad_norm": 2.25,
"learning_rate": 4.566678686158551e-06,
"logits/chosen": -6.434661865234375,
"logits/rejected": -6.4351091384887695,
"logps/chosen": -57.85565185546875,
"logps/rejected": -56.18524169921875,
"loss": 0.69,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": 0.0047136032953858376,
"rewards/margins": 0.008176136761903763,
"rewards/rejected": -0.0034625339321792126,
"step": 75
},
{
"epoch": 0.03068661296509398,
"grad_norm": 2.15625,
"learning_rate": 4.875238056844939e-06,
"logits/chosen": -6.418464660644531,
"logits/rejected": -6.435811519622803,
"logps/chosen": -59.70001220703125,
"logps/rejected": -58.532691955566406,
"loss": 0.6882,
"rewards/accuracies": 0.6781250238418579,
"rewards/chosen": 0.007058107294142246,
"rewards/margins": 0.012781137600541115,
"rewards/rejected": -0.005723030772060156,
"step": 80
},
{
"epoch": 0.032604526275412354,
"grad_norm": 1.7109375,
"learning_rate": 5.183797427531328e-06,
"logits/chosen": -6.434415340423584,
"logits/rejected": -6.414219856262207,
"logps/chosen": -56.356300354003906,
"logps/rejected": -55.965721130371094,
"loss": 0.6885,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.006996457930654287,
"rewards/margins": 0.011944364756345749,
"rewards/rejected": -0.004947904963046312,
"step": 85
},
{
"epoch": 0.03452243958573072,
"grad_norm": 2.15625,
"learning_rate": 5.4923567982177174e-06,
"logits/chosen": -6.413939476013184,
"logits/rejected": -6.413136959075928,
"logps/chosen": -57.609031677246094,
"logps/rejected": -56.8945198059082,
"loss": 0.6848,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.012066302821040154,
"rewards/margins": 0.021702740341424942,
"rewards/rejected": -0.009636437520384789,
"step": 90
},
{
"epoch": 0.0364403528960491,
"grad_norm": 1.8203125,
"learning_rate": 5.800916168904105e-06,
"logits/chosen": -6.4709014892578125,
"logits/rejected": -6.410592079162598,
"logps/chosen": -57.7935791015625,
"logps/rejected": -56.025917053222656,
"loss": 0.6847,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.011256908066570759,
"rewards/margins": 0.022287515923380852,
"rewards/rejected": -0.011030609719455242,
"step": 95
},
{
"epoch": 0.038358266206367474,
"grad_norm": 2.015625,
"learning_rate": 6.109475539590495e-06,
"logits/chosen": -6.4798479080200195,
"logits/rejected": -6.4916839599609375,
"logps/chosen": -58.99842071533203,
"logps/rejected": -58.10333251953125,
"loss": 0.6842,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0121616804972291,
"rewards/margins": 0.023794159293174744,
"rewards/rejected": -0.011632479727268219,
"step": 100
},
{
"epoch": 0.04027617951668585,
"grad_norm": 1.9296875,
"learning_rate": 6.418034910276882e-06,
"logits/chosen": -6.448805332183838,
"logits/rejected": -6.4762372970581055,
"logps/chosen": -58.24238967895508,
"logps/rejected": -57.392173767089844,
"loss": 0.6822,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.016423719003796577,
"rewards/margins": 0.029310371726751328,
"rewards/rejected": -0.0128866508603096,
"step": 105
},
{
"epoch": 0.04219409282700422,
"grad_norm": 2.203125,
"learning_rate": 6.726594280963271e-06,
"logits/chosen": -6.426924228668213,
"logits/rejected": -6.42023229598999,
"logps/chosen": -56.317108154296875,
"logps/rejected": -56.5660400390625,
"loss": 0.6817,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": 0.015550317242741585,
"rewards/margins": 0.031005825847387314,
"rewards/rejected": -0.015455508604645729,
"step": 110
},
{
"epoch": 0.04411200613732259,
"grad_norm": 1.9453125,
"learning_rate": 7.03515365164966e-06,
"logits/chosen": -6.488696098327637,
"logits/rejected": -6.494551658630371,
"logps/chosen": -54.70813751220703,
"logps/rejected": -55.6793098449707,
"loss": 0.6792,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": 0.01944402977824211,
"rewards/margins": 0.03836524114012718,
"rewards/rejected": -0.01892121136188507,
"step": 115
},
{
"epoch": 0.04602991944764097,
"grad_norm": 1.9609375,
"learning_rate": 7.343713022336048e-06,
"logits/chosen": -6.468048095703125,
"logits/rejected": -6.445672512054443,
"logps/chosen": -55.66649627685547,
"logps/rejected": -56.139678955078125,
"loss": 0.6761,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": 0.021984096616506577,
"rewards/margins": 0.047887708991765976,
"rewards/rejected": -0.025903616100549698,
"step": 120
},
{
"epoch": 0.04794783275795934,
"grad_norm": 1.6640625,
"learning_rate": 7.652272393022438e-06,
"logits/chosen": -6.430941581726074,
"logits/rejected": -6.431049346923828,
"logps/chosen": -58.177818298339844,
"logps/rejected": -59.124046325683594,
"loss": 0.6745,
"rewards/accuracies": 0.8031250238418579,
"rewards/chosen": 0.023997317999601364,
"rewards/margins": 0.052540235221385956,
"rewards/rejected": -0.028542915359139442,
"step": 125
},
{
"epoch": 0.04986574606827771,
"grad_norm": 1.7578125,
"learning_rate": 7.960831763708825e-06,
"logits/chosen": -6.422308921813965,
"logits/rejected": -6.405043601989746,
"logps/chosen": -54.78827667236328,
"logps/rejected": -53.91057205200195,
"loss": 0.6757,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": 0.023111283779144287,
"rewards/margins": 0.050740379840135574,
"rewards/rejected": -0.027629096060991287,
"step": 130
},
{
"epoch": 0.05178365937859609,
"grad_norm": 1.4765625,
"learning_rate": 8.269391134395214e-06,
"logits/chosen": -6.452818393707275,
"logits/rejected": -6.4319610595703125,
"logps/chosen": -56.06396484375,
"logps/rejected": -55.55432891845703,
"loss": 0.6725,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.028292512521147728,
"rewards/margins": 0.0630292147397995,
"rewards/rejected": -0.03473670035600662,
"step": 135
},
{
"epoch": 0.05370157268891446,
"grad_norm": 1.671875,
"learning_rate": 8.577950505081603e-06,
"logits/chosen": -6.453330993652344,
"logits/rejected": -6.442866325378418,
"logps/chosen": -58.166526794433594,
"logps/rejected": -57.297515869140625,
"loss": 0.6677,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": 0.03513844683766365,
"rewards/margins": 0.0794719010591507,
"rewards/rejected": -0.04433346539735794,
"step": 140
},
{
"epoch": 0.05561948599923283,
"grad_norm": 1.6640625,
"learning_rate": 8.886509875767991e-06,
"logits/chosen": -6.464579105377197,
"logits/rejected": -6.458718776702881,
"logps/chosen": -53.89129638671875,
"logps/rejected": -54.87738037109375,
"loss": 0.6712,
"rewards/accuracies": 0.7906249761581421,
"rewards/chosen": 0.030111517757177353,
"rewards/margins": 0.06265858560800552,
"rewards/rejected": -0.03254706412553787,
"step": 145
},
{
"epoch": 0.05753739930955121,
"grad_norm": 1.6171875,
"learning_rate": 9.19506924645438e-06,
"logits/chosen": -6.414637565612793,
"logits/rejected": -6.429790496826172,
"logps/chosen": -59.0002326965332,
"logps/rejected": -58.41865921020508,
"loss": 0.6654,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.039772503077983856,
"rewards/margins": 0.08216916769742966,
"rewards/rejected": -0.0423966720700264,
"step": 150
},
{
"epoch": 0.05945531261986958,
"grad_norm": 1.4609375,
"learning_rate": 9.503628617140768e-06,
"logits/chosen": -6.430724143981934,
"logits/rejected": -6.439866542816162,
"logps/chosen": -56.56650924682617,
"logps/rejected": -56.43073654174805,
"loss": 0.6646,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": 0.04069245606660843,
"rewards/margins": 0.08599359542131424,
"rewards/rejected": -0.04530114680528641,
"step": 155
},
{
"epoch": 0.06137322593018796,
"grad_norm": 1.625,
"learning_rate": 9.812187987827157e-06,
"logits/chosen": -6.413358211517334,
"logits/rejected": -6.445931434631348,
"logps/chosen": -56.371116638183594,
"logps/rejected": -57.74314498901367,
"loss": 0.6645,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.040298428386449814,
"rewards/margins": 0.0885874405503273,
"rewards/rejected": -0.04828901216387749,
"step": 160
},
{
"epoch": 0.06329113924050633,
"grad_norm": 1.625,
"learning_rate": 1.0120747358513545e-05,
"logits/chosen": -6.423710823059082,
"logits/rejected": -6.452187538146973,
"logps/chosen": -57.91943359375,
"logps/rejected": -58.41001510620117,
"loss": 0.6609,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": 0.04707075282931328,
"rewards/margins": 0.10583791881799698,
"rewards/rejected": -0.058767169713974,
"step": 165
},
{
"epoch": 0.06520905255082471,
"grad_norm": 1.6328125,
"learning_rate": 1.0429306729199934e-05,
"logits/chosen": -6.453238010406494,
"logits/rejected": -6.488819122314453,
"logps/chosen": -56.78459548950195,
"logps/rejected": -57.84309005737305,
"loss": 0.6561,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.054359547793865204,
"rewards/margins": 0.11485730111598969,
"rewards/rejected": -0.06049775332212448,
"step": 170
},
{
"epoch": 0.06712696586114307,
"grad_norm": 1.6484375,
"learning_rate": 1.0737866099886323e-05,
"logits/chosen": -6.370804786682129,
"logits/rejected": -6.389939308166504,
"logps/chosen": -56.378761291503906,
"logps/rejected": -57.62419509887695,
"loss": 0.6549,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": 0.046738989651203156,
"rewards/margins": 0.12162216007709503,
"rewards/rejected": -0.07488318532705307,
"step": 175
},
{
"epoch": 0.06904487917146145,
"grad_norm": 1.8203125,
"learning_rate": 1.1046425470572711e-05,
"logits/chosen": -6.432094573974609,
"logits/rejected": -6.485767364501953,
"logps/chosen": -55.33378982543945,
"logps/rejected": -55.24385452270508,
"loss": 0.653,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.058635931462049484,
"rewards/margins": 0.12551145255565643,
"rewards/rejected": -0.06687554717063904,
"step": 180
},
{
"epoch": 0.07096279248177982,
"grad_norm": 1.5390625,
"learning_rate": 1.13549848412591e-05,
"logits/chosen": -6.448824405670166,
"logits/rejected": -6.453608512878418,
"logps/chosen": -56.46734619140625,
"logps/rejected": -56.84504318237305,
"loss": 0.6488,
"rewards/accuracies": 0.871874988079071,
"rewards/chosen": 0.06262876093387604,
"rewards/margins": 0.14282599091529846,
"rewards/rejected": -0.08019722998142242,
"step": 185
},
{
"epoch": 0.0728807057920982,
"grad_norm": 1.6796875,
"learning_rate": 1.1663544211945488e-05,
"logits/chosen": -6.431854248046875,
"logits/rejected": -6.417555332183838,
"logps/chosen": -52.951072692871094,
"logps/rejected": -53.4705924987793,
"loss": 0.6489,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": 0.06671543419361115,
"rewards/margins": 0.1369214504957199,
"rewards/rejected": -0.07020601630210876,
"step": 190
},
{
"epoch": 0.07479861910241657,
"grad_norm": 1.828125,
"learning_rate": 1.1972103582631877e-05,
"logits/chosen": -6.399285316467285,
"logits/rejected": -6.416836738586426,
"logps/chosen": -59.31772994995117,
"logps/rejected": -59.72453689575195,
"loss": 0.643,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": 0.07494629919528961,
"rewards/margins": 0.15886636078357697,
"rewards/rejected": -0.08392004668712616,
"step": 195
},
{
"epoch": 0.07671653241273495,
"grad_norm": 1.5546875,
"learning_rate": 1.2280662953318267e-05,
"logits/chosen": -6.442248344421387,
"logits/rejected": -6.403599739074707,
"logps/chosen": -56.35515213012695,
"logps/rejected": -57.01416778564453,
"loss": 0.6479,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.06972001492977142,
"rewards/margins": 0.14177976548671722,
"rewards/rejected": -0.0720597356557846,
"step": 200
},
{
"epoch": 0.07863444572305332,
"grad_norm": 1.5546875,
"learning_rate": 1.2589222324004653e-05,
"logits/chosen": -6.4987688064575195,
"logits/rejected": -6.481993198394775,
"logps/chosen": -56.06902313232422,
"logps/rejected": -57.0550537109375,
"loss": 0.6452,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.06869082152843475,
"rewards/margins": 0.14782066643238068,
"rewards/rejected": -0.07912982255220413,
"step": 205
},
{
"epoch": 0.0805523590333717,
"grad_norm": 1.734375,
"learning_rate": 1.2897781694691043e-05,
"logits/chosen": -6.460019588470459,
"logits/rejected": -6.460816383361816,
"logps/chosen": -57.56937789916992,
"logps/rejected": -58.26279830932617,
"loss": 0.641,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.07537061721086502,
"rewards/margins": 0.17141608893871307,
"rewards/rejected": -0.09604547917842865,
"step": 210
},
{
"epoch": 0.08247027234369006,
"grad_norm": 1.578125,
"learning_rate": 1.3206341065377431e-05,
"logits/chosen": -6.431567192077637,
"logits/rejected": -6.415804862976074,
"logps/chosen": -56.94245529174805,
"logps/rejected": -58.52058029174805,
"loss": 0.631,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.08770474791526794,
"rewards/margins": 0.2049822360277176,
"rewards/rejected": -0.11727748811244965,
"step": 215
},
{
"epoch": 0.08438818565400844,
"grad_norm": 1.4375,
"learning_rate": 1.3514900436063818e-05,
"logits/chosen": -6.531057834625244,
"logits/rejected": -6.522589206695557,
"logps/chosen": -56.293739318847656,
"logps/rejected": -56.6605224609375,
"loss": 0.6301,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": 0.09262285381555557,
"rewards/margins": 0.19946810603141785,
"rewards/rejected": -0.10684527456760406,
"step": 220
},
{
"epoch": 0.08630609896432681,
"grad_norm": 1.7734375,
"learning_rate": 1.3823459806750209e-05,
"logits/chosen": -6.420934200286865,
"logits/rejected": -6.435433864593506,
"logps/chosen": -58.42344284057617,
"logps/rejected": -58.990745544433594,
"loss": 0.6275,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.08511321991682053,
"rewards/margins": 0.2056717574596405,
"rewards/rejected": -0.12055854499340057,
"step": 225
},
{
"epoch": 0.08822401227464519,
"grad_norm": 1.6484375,
"learning_rate": 1.4132019177436597e-05,
"logits/chosen": -6.4476728439331055,
"logits/rejected": -6.445916175842285,
"logps/chosen": -57.582061767578125,
"logps/rejected": -58.44621658325195,
"loss": 0.6224,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": 0.09987424314022064,
"rewards/margins": 0.24059633910655975,
"rewards/rejected": -0.14072208106517792,
"step": 230
},
{
"epoch": 0.09014192558496356,
"grad_norm": 1.6953125,
"learning_rate": 1.4440578548122987e-05,
"logits/chosen": -6.518248558044434,
"logits/rejected": -6.512531280517578,
"logps/chosen": -56.6078987121582,
"logps/rejected": -58.96805953979492,
"loss": 0.6151,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.11163216829299927,
"rewards/margins": 0.271917462348938,
"rewards/rejected": -0.16028529405593872,
"step": 235
},
{
"epoch": 0.09205983889528194,
"grad_norm": 1.609375,
"learning_rate": 1.4749137918809374e-05,
"logits/chosen": -6.455610752105713,
"logits/rejected": -6.456049919128418,
"logps/chosen": -58.586448669433594,
"logps/rejected": -59.85078048706055,
"loss": 0.6179,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.09219549596309662,
"rewards/margins": 0.255667120218277,
"rewards/rejected": -0.16347160935401917,
"step": 240
},
{
"epoch": 0.09397775220560031,
"grad_norm": 1.6875,
"learning_rate": 1.5057697289495763e-05,
"logits/chosen": -6.4275312423706055,
"logits/rejected": -6.468707084655762,
"logps/chosen": -55.558929443359375,
"logps/rejected": -57.90214157104492,
"loss": 0.6167,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.10123320668935776,
"rewards/margins": 0.26037126779556274,
"rewards/rejected": -0.15913811326026917,
"step": 245
},
{
"epoch": 0.09589566551591867,
"grad_norm": 1.4609375,
"learning_rate": 1.5366256660182153e-05,
"logits/chosen": -6.539719581604004,
"logits/rejected": -6.5594964027404785,
"logps/chosen": -56.036964416503906,
"logps/rejected": -57.6839485168457,
"loss": 0.6076,
"rewards/accuracies": 0.903124988079071,
"rewards/chosen": 0.10764100402593613,
"rewards/margins": 0.28984779119491577,
"rewards/rejected": -0.18220680952072144,
"step": 250
},
{
"epoch": 0.09781357882623705,
"grad_norm": 1.2890625,
"learning_rate": 1.567481603086854e-05,
"logits/chosen": -6.509421348571777,
"logits/rejected": -6.502968788146973,
"logps/chosen": -57.25380325317383,
"logps/rejected": -58.84259796142578,
"loss": 0.6109,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.11171796172857285,
"rewards/margins": 0.27925556898117065,
"rewards/rejected": -0.1675376147031784,
"step": 255
},
{
"epoch": 0.09973149213655542,
"grad_norm": 1.6328125,
"learning_rate": 1.5983375401554927e-05,
"logits/chosen": -6.527918815612793,
"logits/rejected": -6.5247602462768555,
"logps/chosen": -54.80146408081055,
"logps/rejected": -56.07502365112305,
"loss": 0.6188,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": 0.09086655080318451,
"rewards/margins": 0.25640976428985596,
"rewards/rejected": -0.16554318368434906,
"step": 260
},
{
"epoch": 0.1016494054468738,
"grad_norm": 1.4921875,
"learning_rate": 1.629193477224132e-05,
"logits/chosen": -6.579885005950928,
"logits/rejected": -6.561237335205078,
"logps/chosen": -56.587547302246094,
"logps/rejected": -59.09083938598633,
"loss": 0.5984,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.12236499786376953,
"rewards/margins": 0.32771119475364685,
"rewards/rejected": -0.20534619688987732,
"step": 265
},
{
"epoch": 0.10356731875719218,
"grad_norm": 1.4921875,
"learning_rate": 1.6600494142927704e-05,
"logits/chosen": -6.541518211364746,
"logits/rejected": -6.5189385414123535,
"logps/chosen": -56.556121826171875,
"logps/rejected": -58.61604690551758,
"loss": 0.5879,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.1293206363916397,
"rewards/margins": 0.376719206571579,
"rewards/rejected": -0.24739857017993927,
"step": 270
},
{
"epoch": 0.10548523206751055,
"grad_norm": 1.734375,
"learning_rate": 1.6909053513614093e-05,
"logits/chosen": -6.512734889984131,
"logits/rejected": -6.509590148925781,
"logps/chosen": -58.57611083984375,
"logps/rejected": -59.823936462402344,
"loss": 0.5972,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": 0.11397837102413177,
"rewards/margins": 0.3285695016384125,
"rewards/rejected": -0.2145911455154419,
"step": 275
},
{
"epoch": 0.10740314537782893,
"grad_norm": 1.25,
"learning_rate": 1.7217612884300485e-05,
"logits/chosen": -6.592258453369141,
"logits/rejected": -6.548396110534668,
"logps/chosen": -57.656585693359375,
"logps/rejected": -59.94633865356445,
"loss": 0.581,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.1426272690296173,
"rewards/margins": 0.40652403235435486,
"rewards/rejected": -0.26389676332473755,
"step": 280
},
{
"epoch": 0.1093210586881473,
"grad_norm": 1.484375,
"learning_rate": 1.7526172254986873e-05,
"logits/chosen": -6.576052188873291,
"logits/rejected": -6.558463096618652,
"logps/chosen": -53.556358337402344,
"logps/rejected": -57.3957633972168,
"loss": 0.5937,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.10910562425851822,
"rewards/margins": 0.348541259765625,
"rewards/rejected": -0.23943564295768738,
"step": 285
},
{
"epoch": 0.11123897199846566,
"grad_norm": 1.5546875,
"learning_rate": 1.783473162567326e-05,
"logits/chosen": -6.541784763336182,
"logits/rejected": -6.5498552322387695,
"logps/chosen": -56.6390495300293,
"logps/rejected": -60.371116638183594,
"loss": 0.5724,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": 0.13584202527999878,
"rewards/margins": 0.45434489846229553,
"rewards/rejected": -0.31850284337997437,
"step": 290
},
{
"epoch": 0.11315688530878404,
"grad_norm": 1.6953125,
"learning_rate": 1.814329099635965e-05,
"logits/chosen": -6.512528419494629,
"logits/rejected": -6.557837009429932,
"logps/chosen": -54.72956466674805,
"logps/rejected": -56.99449920654297,
"loss": 0.579,
"rewards/accuracies": 0.871874988079071,
"rewards/chosen": 0.12395240366458893,
"rewards/margins": 0.4107394218444824,
"rewards/rejected": -0.2867870330810547,
"step": 295
},
{
"epoch": 0.11507479861910241,
"grad_norm": 1.5078125,
"learning_rate": 1.845185036704604e-05,
"logits/chosen": -6.576136589050293,
"logits/rejected": -6.607443332672119,
"logps/chosen": -54.656707763671875,
"logps/rejected": -58.98443603515625,
"loss": 0.579,
"rewards/accuracies": 0.8656250238418579,
"rewards/chosen": 0.1109938770532608,
"rewards/margins": 0.4175935387611389,
"rewards/rejected": -0.3065996766090393,
"step": 300
},
{
"epoch": 0.11699271192942079,
"grad_norm": 1.4921875,
"learning_rate": 1.8760409737732424e-05,
"logits/chosen": -6.587884426116943,
"logits/rejected": -6.619060516357422,
"logps/chosen": -58.541290283203125,
"logps/rejected": -63.607521057128906,
"loss": 0.5625,
"rewards/accuracies": 0.871874988079071,
"rewards/chosen": 0.1273353099822998,
"rewards/margins": 0.5049296021461487,
"rewards/rejected": -0.3775942921638489,
"step": 305
},
{
"epoch": 0.11891062523973916,
"grad_norm": 1.59375,
"learning_rate": 1.9068969108418816e-05,
"logits/chosen": -6.658132076263428,
"logits/rejected": -6.647538185119629,
"logps/chosen": -55.23577880859375,
"logps/rejected": -58.95073699951172,
"loss": 0.5774,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": 0.09004654735326767,
"rewards/margins": 0.43088483810424805,
"rewards/rejected": -0.3408382833003998,
"step": 310
},
{
"epoch": 0.12082853855005754,
"grad_norm": 1.421875,
"learning_rate": 1.9377528479105205e-05,
"logits/chosen": -6.616456508636475,
"logits/rejected": -6.5966973304748535,
"logps/chosen": -52.970489501953125,
"logps/rejected": -57.08417510986328,
"loss": 0.5793,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": 0.08462224155664444,
"rewards/margins": 0.41819873452186584,
"rewards/rejected": -0.333576500415802,
"step": 315
},
{
"epoch": 0.12274645186037592,
"grad_norm": 1.5234375,
"learning_rate": 1.968608784979159e-05,
"logits/chosen": -6.626871585845947,
"logits/rejected": -6.633614540100098,
"logps/chosen": -51.43700408935547,
"logps/rejected": -56.098548889160156,
"loss": 0.5566,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.11493581533432007,
"rewards/margins": 0.5229749083518982,
"rewards/rejected": -0.4080390930175781,
"step": 320
},
{
"epoch": 0.12466436517069429,
"grad_norm": 1.78125,
"learning_rate": 1.999464722047798e-05,
"logits/chosen": -6.619255065917969,
"logits/rejected": -6.633837699890137,
"logps/chosen": -56.89793014526367,
"logps/rejected": -63.043235778808594,
"loss": 0.5433,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.10207729041576385,
"rewards/margins": 0.6024643778800964,
"rewards/rejected": -0.5003870725631714,
"step": 325
},
{
"epoch": 0.12658227848101267,
"grad_norm": 1.328125,
"learning_rate": 2.030320659116437e-05,
"logits/chosen": -6.6238603591918945,
"logits/rejected": -6.6286211013793945,
"logps/chosen": -53.7779655456543,
"logps/rejected": -59.46379852294922,
"loss": 0.542,
"rewards/accuracies": 0.903124988079071,
"rewards/chosen": 0.09198246896266937,
"rewards/margins": 0.6294280290603638,
"rewards/rejected": -0.5374454855918884,
"step": 330
},
{
"epoch": 0.12850019179133104,
"grad_norm": 2.40625,
"learning_rate": 2.0611765961850756e-05,
"logits/chosen": -6.664628028869629,
"logits/rejected": -6.719674110412598,
"logps/chosen": -52.02434539794922,
"logps/rejected": -58.489112854003906,
"loss": 0.5397,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": 0.04687722399830818,
"rewards/margins": 0.6034437417984009,
"rewards/rejected": -0.5565665364265442,
"step": 335
},
{
"epoch": 0.13041810510164942,
"grad_norm": 2.515625,
"learning_rate": 2.0920325332537144e-05,
"logits/chosen": -6.698832035064697,
"logits/rejected": -6.721909999847412,
"logps/chosen": -54.917030334472656,
"logps/rejected": -60.1119384765625,
"loss": 0.5401,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.039418745785951614,
"rewards/margins": 0.629611611366272,
"rewards/rejected": -0.6690303683280945,
"step": 340
},
{
"epoch": 0.1323360184119678,
"grad_norm": 2.421875,
"learning_rate": 2.1228884703223536e-05,
"logits/chosen": -6.65041971206665,
"logits/rejected": -6.657183647155762,
"logps/chosen": -60.004638671875,
"logps/rejected": -65.77970123291016,
"loss": 0.5344,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -0.22128959000110626,
"rewards/margins": 0.7207987904548645,
"rewards/rejected": -0.9420884251594543,
"step": 345
},
{
"epoch": 0.13425393172228614,
"grad_norm": 2.140625,
"learning_rate": 2.1537444073909925e-05,
"logits/chosen": -6.71240234375,
"logits/rejected": -6.70929479598999,
"logps/chosen": -60.435508728027344,
"logps/rejected": -68.03807830810547,
"loss": 0.5136,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.13982044160366058,
"rewards/margins": 0.8055053949356079,
"rewards/rejected": -0.9453258514404297,
"step": 350
},
{
"epoch": 0.13617184503260452,
"grad_norm": 1.71875,
"learning_rate": 2.184600344459631e-05,
"logits/chosen": -6.7060980796813965,
"logits/rejected": -6.739293575286865,
"logps/chosen": -58.657188415527344,
"logps/rejected": -66.50128936767578,
"loss": 0.5188,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -0.16930516064167023,
"rewards/margins": 0.7925722002983093,
"rewards/rejected": -0.9618774652481079,
"step": 355
},
{
"epoch": 0.1380897583429229,
"grad_norm": 2.078125,
"learning_rate": 2.2154562815282702e-05,
"logits/chosen": -6.688414096832275,
"logits/rejected": -6.681405067443848,
"logps/chosen": -58.76293182373047,
"logps/rejected": -66.48202514648438,
"loss": 0.5135,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -0.20896108448505402,
"rewards/margins": 0.8072658777236938,
"rewards/rejected": -1.0162270069122314,
"step": 360
},
{
"epoch": 0.14000767165324127,
"grad_norm": 1.890625,
"learning_rate": 2.246312218596909e-05,
"logits/chosen": -6.696159362792969,
"logits/rejected": -6.75206995010376,
"logps/chosen": -56.35298538208008,
"logps/rejected": -63.650901794433594,
"loss": 0.5076,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -0.2087055742740631,
"rewards/margins": 0.8270822763442993,
"rewards/rejected": -1.03578782081604,
"step": 365
},
{
"epoch": 0.14192558496355964,
"grad_norm": 1.8203125,
"learning_rate": 2.2771681556655476e-05,
"logits/chosen": -6.695774078369141,
"logits/rejected": -6.776731967926025,
"logps/chosen": -55.71824264526367,
"logps/rejected": -63.604736328125,
"loss": 0.5042,
"rewards/accuracies": 0.903124988079071,
"rewards/chosen": -0.22564025223255157,
"rewards/margins": 0.8389409780502319,
"rewards/rejected": -1.0645811557769775,
"step": 370
},
{
"epoch": 0.14384349827387802,
"grad_norm": 1.953125,
"learning_rate": 2.3080240927341865e-05,
"logits/chosen": -6.816876411437988,
"logits/rejected": -6.805089473724365,
"logps/chosen": -57.54021453857422,
"logps/rejected": -65.3156509399414,
"loss": 0.5065,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -0.22473874688148499,
"rewards/margins": 0.8491897583007812,
"rewards/rejected": -1.0739285945892334,
"step": 375
},
{
"epoch": 0.1457614115841964,
"grad_norm": 1.9140625,
"learning_rate": 2.3388800298028257e-05,
"logits/chosen": -6.714625358581543,
"logits/rejected": -6.712975978851318,
"logps/chosen": -58.104759216308594,
"logps/rejected": -65.9175033569336,
"loss": 0.4919,
"rewards/accuracies": 0.9281250238418579,
"rewards/chosen": -0.3026445806026459,
"rewards/margins": 0.9062640070915222,
"rewards/rejected": -1.2089087963104248,
"step": 380
},
{
"epoch": 0.14767932489451477,
"grad_norm": 2.859375,
"learning_rate": 2.3697359668714642e-05,
"logits/chosen": -6.743095397949219,
"logits/rejected": -6.772109031677246,
"logps/chosen": -59.0074577331543,
"logps/rejected": -69.05128479003906,
"loss": 0.4822,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.37660008668899536,
"rewards/margins": 0.9904800653457642,
"rewards/rejected": -1.3670800924301147,
"step": 385
},
{
"epoch": 0.14959723820483314,
"grad_norm": 3.125,
"learning_rate": 2.400591903940103e-05,
"logits/chosen": -6.754584312438965,
"logits/rejected": -6.786032676696777,
"logps/chosen": -62.139381408691406,
"logps/rejected": -71.784912109375,
"loss": 0.4866,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5153251886367798,
"rewards/margins": 0.9975606799125671,
"rewards/rejected": -1.5128860473632812,
"step": 390
},
{
"epoch": 0.15151515151515152,
"grad_norm": 2.28125,
"learning_rate": 2.431447841008742e-05,
"logits/chosen": -6.77377986907959,
"logits/rejected": -6.7619805335998535,
"logps/chosen": -62.71953201293945,
"logps/rejected": -71.13737487792969,
"loss": 0.4978,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -0.5491987466812134,
"rewards/margins": 0.8964099884033203,
"rewards/rejected": -1.4456088542938232,
"step": 395
},
{
"epoch": 0.1534330648254699,
"grad_norm": 2.125,
"learning_rate": 2.4623037780773808e-05,
"logits/chosen": -6.71270751953125,
"logits/rejected": -6.765826225280762,
"logps/chosen": -63.85282516479492,
"logps/rejected": -72.63124084472656,
"loss": 0.4761,
"rewards/accuracies": 0.903124988079071,
"rewards/chosen": -0.5679726600646973,
"rewards/margins": 0.9939053654670715,
"rewards/rejected": -1.5618780851364136,
"step": 400
},
{
"epoch": 0.15535097813578827,
"grad_norm": 2.390625,
"learning_rate": 2.4931597151460196e-05,
"logits/chosen": -6.759522914886475,
"logits/rejected": -6.8248186111450195,
"logps/chosen": -63.705726623535156,
"logps/rejected": -74.55303955078125,
"loss": 0.4626,
"rewards/accuracies": 0.9281250238418579,
"rewards/chosen": -0.6370080709457397,
"rewards/margins": 1.109112024307251,
"rewards/rejected": -1.7461200952529907,
"step": 405
},
{
"epoch": 0.15726889144610665,
"grad_norm": 2.984375,
"learning_rate": 2.5240156522146588e-05,
"logits/chosen": -6.789227485656738,
"logits/rejected": -6.754014492034912,
"logps/chosen": -58.8348503112793,
"logps/rejected": -70.08036804199219,
"loss": 0.466,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.4667413830757141,
"rewards/margins": 1.0789754390716553,
"rewards/rejected": -1.5457168817520142,
"step": 410
},
{
"epoch": 0.15918680475642502,
"grad_norm": 4.5,
"learning_rate": 2.5548715892832977e-05,
"logits/chosen": -6.742713928222656,
"logits/rejected": -6.790808200836182,
"logps/chosen": -59.2355842590332,
"logps/rejected": -69.20877075195312,
"loss": 0.4641,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.35354870557785034,
"rewards/margins": 1.1075929403305054,
"rewards/rejected": -1.46114182472229,
"step": 415
},
{
"epoch": 0.1611047180667434,
"grad_norm": 2.875,
"learning_rate": 2.5857275263519365e-05,
"logits/chosen": -6.692159175872803,
"logits/rejected": -6.748051643371582,
"logps/chosen": -64.951416015625,
"logps/rejected": -74.98480224609375,
"loss": 0.4567,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -0.5960937738418579,
"rewards/margins": 1.1240198612213135,
"rewards/rejected": -1.7201133966445923,
"step": 420
},
{
"epoch": 0.16302263137706174,
"grad_norm": 2.0625,
"learning_rate": 2.6165834634205754e-05,
"logits/chosen": -6.632455348968506,
"logits/rejected": -6.703897953033447,
"logps/chosen": -64.6573486328125,
"logps/rejected": -75.8453140258789,
"loss": 0.4431,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.6735265851020813,
"rewards/margins": 1.2322088479995728,
"rewards/rejected": -1.9057356119155884,
"step": 425
},
{
"epoch": 0.16494054468738012,
"grad_norm": 2.875,
"learning_rate": 2.647439400489214e-05,
"logits/chosen": -6.690484523773193,
"logits/rejected": -6.69437313079834,
"logps/chosen": -62.8514404296875,
"logps/rejected": -72.82466888427734,
"loss": 0.4438,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.622053861618042,
"rewards/margins": 1.215821385383606,
"rewards/rejected": -1.8378751277923584,
"step": 430
},
{
"epoch": 0.1668584579976985,
"grad_norm": 3.984375,
"learning_rate": 2.6782953375578528e-05,
"logits/chosen": -6.712440490722656,
"logits/rejected": -6.714225769042969,
"logps/chosen": -65.16419219970703,
"logps/rejected": -77.28175354003906,
"loss": 0.4356,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -0.7391894459724426,
"rewards/margins": 1.278839349746704,
"rewards/rejected": -2.018028736114502,
"step": 435
},
{
"epoch": 0.16877637130801687,
"grad_norm": 3.34375,
"learning_rate": 2.7091512746264916e-05,
"logits/chosen": -6.68596887588501,
"logits/rejected": -6.724273681640625,
"logps/chosen": -64.77339172363281,
"logps/rejected": -76.25152587890625,
"loss": 0.4358,
"rewards/accuracies": 0.9468749761581421,
"rewards/chosen": -0.7368718385696411,
"rewards/margins": 1.236840009689331,
"rewards/rejected": -1.973711609840393,
"step": 440
},
{
"epoch": 0.17069428461833525,
"grad_norm": 2.46875,
"learning_rate": 2.740007211695131e-05,
"logits/chosen": -6.6403093338012695,
"logits/rejected": -6.6900529861450195,
"logps/chosen": -64.81290435791016,
"logps/rejected": -77.3272476196289,
"loss": 0.4259,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.5889671444892883,
"rewards/margins": 1.3001738786697388,
"rewards/rejected": -1.8891408443450928,
"step": 445
},
{
"epoch": 0.17261219792865362,
"grad_norm": 3.828125,
"learning_rate": 2.7708631487637697e-05,
"logits/chosen": -6.612371921539307,
"logits/rejected": -6.659645080566406,
"logps/chosen": -61.89921951293945,
"logps/rejected": -74.57048034667969,
"loss": 0.4253,
"rewards/accuracies": 0.9593750238418579,
"rewards/chosen": -0.665620744228363,
"rewards/margins": 1.361120343208313,
"rewards/rejected": -2.0267410278320312,
"step": 450
},
{
"epoch": 0.174530111238972,
"grad_norm": 4.15625,
"learning_rate": 2.8017190858324086e-05,
"logits/chosen": -6.555197238922119,
"logits/rejected": -6.548722743988037,
"logps/chosen": -67.78192901611328,
"logps/rejected": -80.3170394897461,
"loss": 0.4139,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -0.8611640930175781,
"rewards/margins": 1.415268063545227,
"rewards/rejected": -2.2764322757720947,
"step": 455
},
{
"epoch": 0.17644802454929037,
"grad_norm": 2.625,
"learning_rate": 2.832575022901047e-05,
"logits/chosen": -6.598601341247559,
"logits/rejected": -6.614532470703125,
"logps/chosen": -64.5110092163086,
"logps/rejected": -76.21466827392578,
"loss": 0.4438,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.7278603315353394,
"rewards/margins": 1.279840111732483,
"rewards/rejected": -2.0077004432678223,
"step": 460
},
{
"epoch": 0.17836593785960875,
"grad_norm": 3.046875,
"learning_rate": 2.863430959969686e-05,
"logits/chosen": -6.689515113830566,
"logits/rejected": -6.717551231384277,
"logps/chosen": -62.034454345703125,
"logps/rejected": -74.14480590820312,
"loss": 0.4283,
"rewards/accuracies": 0.953125,
"rewards/chosen": -0.5690684914588928,
"rewards/margins": 1.3283374309539795,
"rewards/rejected": -1.8974058628082275,
"step": 465
},
{
"epoch": 0.18028385116992712,
"grad_norm": 2.71875,
"learning_rate": 2.8942868970383248e-05,
"logits/chosen": -6.623570442199707,
"logits/rejected": -6.677834987640381,
"logps/chosen": -64.6675033569336,
"logps/rejected": -76.27220153808594,
"loss": 0.4101,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -0.6819969415664673,
"rewards/margins": 1.3743338584899902,
"rewards/rejected": -2.056330680847168,
"step": 470
},
{
"epoch": 0.1822017644802455,
"grad_norm": 12.8125,
"learning_rate": 2.9251428341069636e-05,
"logits/chosen": -6.536647796630859,
"logits/rejected": -6.574659824371338,
"logps/chosen": -67.92814636230469,
"logps/rejected": -81.60063934326172,
"loss": 0.3935,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.0204700231552124,
"rewards/margins": 1.5342200994491577,
"rewards/rejected": -2.554690361022949,
"step": 475
},
{
"epoch": 0.18411967779056387,
"grad_norm": 3.125,
"learning_rate": 2.955998771175603e-05,
"logits/chosen": -6.626161098480225,
"logits/rejected": -6.705386161804199,
"logps/chosen": -62.2401008605957,
"logps/rejected": -77.29728698730469,
"loss": 0.4065,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -0.7265374064445496,
"rewards/margins": 1.4822876453399658,
"rewards/rejected": -2.208824634552002,
"step": 480
},
{
"epoch": 0.18603759110088225,
"grad_norm": 2.015625,
"learning_rate": 2.9868547082442417e-05,
"logits/chosen": -6.6063337326049805,
"logits/rejected": -6.613149166107178,
"logps/chosen": -69.5428237915039,
"logps/rejected": -83.10125732421875,
"loss": 0.3982,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -0.9311842918395996,
"rewards/margins": 1.4642444849014282,
"rewards/rejected": -2.3954286575317383,
"step": 485
},
{
"epoch": 0.18795550441120062,
"grad_norm": 6.71875,
"learning_rate": 3.0177106453128806e-05,
"logits/chosen": -6.5636162757873535,
"logits/rejected": -6.586276054382324,
"logps/chosen": -64.01178741455078,
"logps/rejected": -77.87983703613281,
"loss": 0.3923,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -0.789256751537323,
"rewards/margins": 1.5717377662658691,
"rewards/rejected": -2.360994577407837,
"step": 490
},
{
"epoch": 0.189873417721519,
"grad_norm": 2.28125,
"learning_rate": 3.048566582381519e-05,
"logits/chosen": -6.557529449462891,
"logits/rejected": -6.565741539001465,
"logps/chosen": -67.82279968261719,
"logps/rejected": -82.83336639404297,
"loss": 0.3884,
"rewards/accuracies": 0.984375,
"rewards/chosen": -0.9904484748840332,
"rewards/margins": 1.550180196762085,
"rewards/rejected": -2.540628671646118,
"step": 495
},
{
"epoch": 0.19179133103183735,
"grad_norm": 5.625,
"learning_rate": 3.079422519450158e-05,
"logits/chosen": -6.607028961181641,
"logits/rejected": -6.654728889465332,
"logps/chosen": -60.82036209106445,
"logps/rejected": -76.42386627197266,
"loss": 0.4073,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.6843277215957642,
"rewards/margins": 1.5797810554504395,
"rewards/rejected": -2.2641091346740723,
"step": 500
},
{
"epoch": 0.19179133103183735,
"eval_logits/chosen": -6.560471057891846,
"eval_logits/rejected": -6.7035040855407715,
"eval_logps/chosen": -64.2278823852539,
"eval_logps/rejected": -80.88381958007812,
"eval_loss": 0.3989410698413849,
"eval_rewards/accuracies": 0.9549999833106995,
"eval_rewards/chosen": -0.877504289150238,
"eval_rewards/margins": 1.5815826654434204,
"eval_rewards/rejected": -2.4590871334075928,
"eval_runtime": 5.3821,
"eval_samples_per_second": 37.16,
"eval_steps_per_second": 37.16,
"step": 500
},
{
"epoch": 0.19370924434215572,
"grad_norm": 3.765625,
"learning_rate": 3.110278456518797e-05,
"logits/chosen": -6.609318733215332,
"logits/rejected": -6.6263556480407715,
"logps/chosen": -71.6617202758789,
"logps/rejected": -85.51215362548828,
"loss": 0.3906,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.9766203165054321,
"rewards/margins": 1.5865925550460815,
"rewards/rejected": -2.5632128715515137,
"step": 505
},
{
"epoch": 0.1956271576524741,
"grad_norm": 3.625,
"learning_rate": 3.141134393587436e-05,
"logits/chosen": -6.633000373840332,
"logits/rejected": -6.646389961242676,
"logps/chosen": -69.64385986328125,
"logps/rejected": -86.03681945800781,
"loss": 0.3837,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -0.9670968055725098,
"rewards/margins": 1.6623718738555908,
"rewards/rejected": -2.6294689178466797,
"step": 510
},
{
"epoch": 0.19754507096279247,
"grad_norm": 4.78125,
"learning_rate": 3.171990330656075e-05,
"logits/chosen": -6.478829860687256,
"logits/rejected": -6.520793914794922,
"logps/chosen": -70.28801727294922,
"logps/rejected": -86.75557708740234,
"loss": 0.3871,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -1.1147288084030151,
"rewards/margins": 1.6480423212051392,
"rewards/rejected": -2.7627711296081543,
"step": 515
},
{
"epoch": 0.19946298427311085,
"grad_norm": 3.734375,
"learning_rate": 3.202846267724714e-05,
"logits/chosen": -6.553919792175293,
"logits/rejected": -6.541110038757324,
"logps/chosen": -62.589447021484375,
"logps/rejected": -77.74308776855469,
"loss": 0.3918,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -0.7898917198181152,
"rewards/margins": 1.6853268146514893,
"rewards/rejected": -2.4752185344696045,
"step": 520
},
{
"epoch": 0.20138089758342922,
"grad_norm": 2.109375,
"learning_rate": 3.233702204793352e-05,
"logits/chosen": -6.471434593200684,
"logits/rejected": -6.537613868713379,
"logps/chosen": -62.585060119628906,
"logps/rejected": -79.62939453125,
"loss": 0.3703,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.7669985890388489,
"rewards/margins": 1.7265609502792358,
"rewards/rejected": -2.4935598373413086,
"step": 525
},
{
"epoch": 0.2032988108937476,
"grad_norm": 3.25,
"learning_rate": 3.264558141861991e-05,
"logits/chosen": -6.616142272949219,
"logits/rejected": -6.6340813636779785,
"logps/chosen": -63.94733810424805,
"logps/rejected": -81.17515563964844,
"loss": 0.374,
"rewards/accuracies": 0.984375,
"rewards/chosen": -0.783534586429596,
"rewards/margins": 1.6740057468414307,
"rewards/rejected": -2.4575400352478027,
"step": 530
},
{
"epoch": 0.20521672420406598,
"grad_norm": 2.859375,
"learning_rate": 3.29541407893063e-05,
"logits/chosen": -6.532637119293213,
"logits/rejected": -6.545807838439941,
"logps/chosen": -69.32952117919922,
"logps/rejected": -85.47936248779297,
"loss": 0.3789,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -0.9866589307785034,
"rewards/margins": 1.7148005962371826,
"rewards/rejected": -2.7014594078063965,
"step": 535
},
{
"epoch": 0.20713463751438435,
"grad_norm": 1.359375,
"learning_rate": 3.326270015999269e-05,
"logits/chosen": -6.423262596130371,
"logits/rejected": -6.4812421798706055,
"logps/chosen": -63.51520919799805,
"logps/rejected": -80.93110656738281,
"loss": 0.3684,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -0.7824930548667908,
"rewards/margins": 1.7671178579330444,
"rewards/rejected": -2.5496110916137695,
"step": 540
},
{
"epoch": 0.20905255082470273,
"grad_norm": 2.28125,
"learning_rate": 3.357125953067908e-05,
"logits/chosen": -6.493915557861328,
"logits/rejected": -6.586592197418213,
"logps/chosen": -64.7423095703125,
"logps/rejected": -81.45304870605469,
"loss": 0.359,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.8664926290512085,
"rewards/margins": 1.804779052734375,
"rewards/rejected": -2.671271800994873,
"step": 545
},
{
"epoch": 0.2109704641350211,
"grad_norm": 1.59375,
"learning_rate": 3.3879818901365465e-05,
"logits/chosen": -6.5301971435546875,
"logits/rejected": -6.591165065765381,
"logps/chosen": -63.6221809387207,
"logps/rejected": -82.46671295166016,
"loss": 0.3567,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.8773013353347778,
"rewards/margins": 1.8641706705093384,
"rewards/rejected": -2.741472005844116,
"step": 550
},
{
"epoch": 0.21288837744533948,
"grad_norm": 3.359375,
"learning_rate": 3.418837827205186e-05,
"logits/chosen": -6.451764106750488,
"logits/rejected": -6.493180274963379,
"logps/chosen": -66.15047454833984,
"logps/rejected": -83.76017761230469,
"loss": 0.3553,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.870043933391571,
"rewards/margins": 1.9231412410736084,
"rewards/rejected": -2.793184995651245,
"step": 555
},
{
"epoch": 0.21480629075565785,
"grad_norm": 1.75,
"learning_rate": 3.449693764273824e-05,
"logits/chosen": -6.4344892501831055,
"logits/rejected": -6.489321708679199,
"logps/chosen": -70.24534606933594,
"logps/rejected": -87.98912048339844,
"loss": 0.3585,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -1.126381278038025,
"rewards/margins": 1.8942753076553345,
"rewards/rejected": -3.0206568241119385,
"step": 560
},
{
"epoch": 0.21672420406597623,
"grad_norm": 6.25,
"learning_rate": 3.4805497013424635e-05,
"logits/chosen": -6.488032341003418,
"logits/rejected": -6.582372188568115,
"logps/chosen": -66.94841766357422,
"logps/rejected": -84.56275177001953,
"loss": 0.3636,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -0.9088627696037292,
"rewards/margins": 1.8437904119491577,
"rewards/rejected": -2.7526535987854004,
"step": 565
},
{
"epoch": 0.2186421173762946,
"grad_norm": 1.6640625,
"learning_rate": 3.511405638411102e-05,
"logits/chosen": -6.510122776031494,
"logits/rejected": -6.623865604400635,
"logps/chosen": -68.36225128173828,
"logps/rejected": -87.87826538085938,
"loss": 0.3495,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -1.0378071069717407,
"rewards/margins": 1.9340025186538696,
"rewards/rejected": -2.9718098640441895,
"step": 570
},
{
"epoch": 0.22056003068661298,
"grad_norm": 1.8125,
"learning_rate": 3.542261575479741e-05,
"logits/chosen": -6.497043609619141,
"logits/rejected": -6.4754462242126465,
"logps/chosen": -66.7702407836914,
"logps/rejected": -84.40606689453125,
"loss": 0.3512,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.9813149571418762,
"rewards/margins": 1.8828579187393188,
"rewards/rejected": -2.86417293548584,
"step": 575
},
{
"epoch": 0.22247794399693133,
"grad_norm": 0.9140625,
"learning_rate": 3.57311751254838e-05,
"logits/chosen": -6.491125583648682,
"logits/rejected": -6.545646667480469,
"logps/chosen": -65.3485336303711,
"logps/rejected": -84.4004135131836,
"loss": 0.3476,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.8092457056045532,
"rewards/margins": 1.9012104272842407,
"rewards/rejected": -2.710456371307373,
"step": 580
},
{
"epoch": 0.2243958573072497,
"grad_norm": 1.109375,
"learning_rate": 3.603973449617019e-05,
"logits/chosen": -6.469923496246338,
"logits/rejected": -6.535366058349609,
"logps/chosen": -66.1703872680664,
"logps/rejected": -84.5575180053711,
"loss": 0.3404,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8879055976867676,
"rewards/margins": 2.0098865032196045,
"rewards/rejected": -2.897792100906372,
"step": 585
},
{
"epoch": 0.22631377061756808,
"grad_norm": 7.125,
"learning_rate": 3.6348293866856574e-05,
"logits/chosen": -6.473888397216797,
"logits/rejected": -6.5238189697265625,
"logps/chosen": -67.31280517578125,
"logps/rejected": -84.41548156738281,
"loss": 0.3516,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.9976273775100708,
"rewards/margins": 1.9460136890411377,
"rewards/rejected": -2.943641185760498,
"step": 590
},
{
"epoch": 0.22823168392788645,
"grad_norm": 2.625,
"learning_rate": 3.665685323754296e-05,
"logits/chosen": -6.507689476013184,
"logits/rejected": -6.5280866622924805,
"logps/chosen": -67.81330108642578,
"logps/rejected": -87.51911163330078,
"loss": 0.3408,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9383264780044556,
"rewards/margins": 2.0480716228485107,
"rewards/rejected": -2.9863979816436768,
"step": 595
},
{
"epoch": 0.23014959723820483,
"grad_norm": 0.5703125,
"learning_rate": 3.696541260822935e-05,
"logits/chosen": -6.59119176864624,
"logits/rejected": -6.690882682800293,
"logps/chosen": -66.38182067871094,
"logps/rejected": -84.89421844482422,
"loss": 0.3384,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8638726472854614,
"rewards/margins": 2.045499324798584,
"rewards/rejected": -2.909372568130493,
"step": 600
},
{
"epoch": 0.2320675105485232,
"grad_norm": 4.03125,
"learning_rate": 3.727397197891574e-05,
"logits/chosen": -6.517537593841553,
"logits/rejected": -6.529317378997803,
"logps/chosen": -71.48027038574219,
"logps/rejected": -92.27767944335938,
"loss": 0.341,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.1382275819778442,
"rewards/margins": 2.070920705795288,
"rewards/rejected": -3.209148406982422,
"step": 605
},
{
"epoch": 0.23398542385884158,
"grad_norm": 0.9609375,
"learning_rate": 3.758253134960213e-05,
"logits/chosen": -6.474556922912598,
"logits/rejected": -6.567405700683594,
"logps/chosen": -71.85388946533203,
"logps/rejected": -92.21666717529297,
"loss": 0.3391,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -1.1598782539367676,
"rewards/margins": 2.081530809402466,
"rewards/rejected": -3.2414088249206543,
"step": 610
},
{
"epoch": 0.23590333716915995,
"grad_norm": 1.0234375,
"learning_rate": 3.789109072028852e-05,
"logits/chosen": -6.5861101150512695,
"logits/rejected": -6.617328643798828,
"logps/chosen": -64.79547119140625,
"logps/rejected": -86.27708435058594,
"loss": 0.337,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9149611592292786,
"rewards/margins": 2.0861799716949463,
"rewards/rejected": -3.001140832901001,
"step": 615
},
{
"epoch": 0.23782125047947833,
"grad_norm": 0.7109375,
"learning_rate": 3.819965009097491e-05,
"logits/chosen": -6.546975612640381,
"logits/rejected": -6.6057891845703125,
"logps/chosen": -71.35441589355469,
"logps/rejected": -91.72935485839844,
"loss": 0.3345,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0706521272659302,
"rewards/margins": 2.114802837371826,
"rewards/rejected": -3.185454845428467,
"step": 620
},
{
"epoch": 0.2397391637897967,
"grad_norm": 1.765625,
"learning_rate": 3.850820946166129e-05,
"logits/chosen": -6.566898345947266,
"logits/rejected": -6.611680507659912,
"logps/chosen": -64.45490264892578,
"logps/rejected": -84.6996841430664,
"loss": 0.3393,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.8809040784835815,
"rewards/margins": 2.0736887454986572,
"rewards/rejected": -2.9545929431915283,
"step": 625
},
{
"epoch": 0.24165707710011508,
"grad_norm": 4.1875,
"learning_rate": 3.881676883234768e-05,
"logits/chosen": -6.510687351226807,
"logits/rejected": -6.579364776611328,
"logps/chosen": -69.7341537475586,
"logps/rejected": -89.81876373291016,
"loss": 0.3369,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.0892380475997925,
"rewards/margins": 2.1302390098571777,
"rewards/rejected": -3.2194771766662598,
"step": 630
},
{
"epoch": 0.24357499041043346,
"grad_norm": 1.3828125,
"learning_rate": 3.9125328203034075e-05,
"logits/chosen": -6.530572414398193,
"logits/rejected": -6.635471343994141,
"logps/chosen": -63.116294860839844,
"logps/rejected": -82.94602966308594,
"loss": 0.336,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8233559727668762,
"rewards/margins": 2.076512575149536,
"rewards/rejected": -2.8998684883117676,
"step": 635
},
{
"epoch": 0.24549290372075183,
"grad_norm": 0.703125,
"learning_rate": 3.943388757372046e-05,
"logits/chosen": -6.4983673095703125,
"logits/rejected": -6.571094512939453,
"logps/chosen": -67.33332824707031,
"logps/rejected": -87.63394927978516,
"loss": 0.333,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9015899896621704,
"rewards/margins": 2.184861660003662,
"rewards/rejected": -3.086452007293701,
"step": 640
},
{
"epoch": 0.2474108170310702,
"grad_norm": 0.80859375,
"learning_rate": 3.974244694440685e-05,
"logits/chosen": -6.597878932952881,
"logits/rejected": -6.640139579772949,
"logps/chosen": -65.09263610839844,
"logps/rejected": -85.24698638916016,
"loss": 0.3356,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8725703954696655,
"rewards/margins": 2.157386302947998,
"rewards/rejected": -3.029956817626953,
"step": 645
},
{
"epoch": 0.24932873034138858,
"grad_norm": 0.78125,
"learning_rate": 4.005100631509324e-05,
"logits/chosen": -6.532644748687744,
"logits/rejected": -6.656785488128662,
"logps/chosen": -61.70920944213867,
"logps/rejected": -83.08709716796875,
"loss": 0.3481,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.7342279553413391,
"rewards/margins": 2.1510305404663086,
"rewards/rejected": -2.885258197784424,
"step": 650
},
{
"epoch": 0.25124664365170696,
"grad_norm": 2.796875,
"learning_rate": 4.035956568577963e-05,
"logits/chosen": -6.683371067047119,
"logits/rejected": -6.702568054199219,
"logps/chosen": -60.66215896606445,
"logps/rejected": -81.75194549560547,
"loss": 0.334,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7626537680625916,
"rewards/margins": 2.157379627227783,
"rewards/rejected": -2.9200336933135986,
"step": 655
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.9609375,
"learning_rate": 4.0668125056466014e-05,
"logits/chosen": -6.600775718688965,
"logits/rejected": -6.707993507385254,
"logps/chosen": -63.812767028808594,
"logps/rejected": -84.95133209228516,
"loss": 0.3337,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.8133424520492554,
"rewards/margins": 2.163587808609009,
"rewards/rejected": -2.9769299030303955,
"step": 660
},
{
"epoch": 0.2550824702723437,
"grad_norm": 4.875,
"learning_rate": 4.0976684427152406e-05,
"logits/chosen": -6.641859531402588,
"logits/rejected": -6.716789245605469,
"logps/chosen": -61.890541076660156,
"logps/rejected": -82.82421875,
"loss": 0.3364,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.5735510587692261,
"rewards/margins": 2.1314024925231934,
"rewards/rejected": -2.704953908920288,
"step": 665
},
{
"epoch": 0.2570003835826621,
"grad_norm": 0.734375,
"learning_rate": 4.128524379783879e-05,
"logits/chosen": -6.6130547523498535,
"logits/rejected": -6.662297248840332,
"logps/chosen": -63.92805862426758,
"logps/rejected": -85.00175476074219,
"loss": 0.3333,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7270557880401611,
"rewards/margins": 2.136582612991333,
"rewards/rejected": -2.863638401031494,
"step": 670
},
{
"epoch": 0.25891829689298046,
"grad_norm": 0.9375,
"learning_rate": 4.1593803168525184e-05,
"logits/chosen": -6.675729274749756,
"logits/rejected": -6.806771755218506,
"logps/chosen": -63.05427169799805,
"logps/rejected": -84.34765625,
"loss": 0.3301,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6259730458259583,
"rewards/margins": 2.159316062927246,
"rewards/rejected": -2.7852890491485596,
"step": 675
},
{
"epoch": 0.26083621020329883,
"grad_norm": 3.859375,
"learning_rate": 4.190236253921157e-05,
"logits/chosen": -6.6718950271606445,
"logits/rejected": -6.7197771072387695,
"logps/chosen": -59.5505485534668,
"logps/rejected": -79.26701354980469,
"loss": 0.3416,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.5589812994003296,
"rewards/margins": 2.1066300868988037,
"rewards/rejected": -2.6656112670898438,
"step": 680
},
{
"epoch": 0.2627541235136172,
"grad_norm": 1.78125,
"learning_rate": 4.221092190989796e-05,
"logits/chosen": -6.631156921386719,
"logits/rejected": -6.725895881652832,
"logps/chosen": -64.82130432128906,
"logps/rejected": -85.05762481689453,
"loss": 0.3312,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7829705476760864,
"rewards/margins": 2.1255249977111816,
"rewards/rejected": -2.9084956645965576,
"step": 685
},
{
"epoch": 0.2646720368239356,
"grad_norm": 0.765625,
"learning_rate": 4.2519481280584346e-05,
"logits/chosen": -6.656712532043457,
"logits/rejected": -6.712060451507568,
"logps/chosen": -64.23103332519531,
"logps/rejected": -85.4850845336914,
"loss": 0.3392,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.6182041168212891,
"rewards/margins": 2.193781852722168,
"rewards/rejected": -2.811986207962036,
"step": 690
},
{
"epoch": 0.2665899501342539,
"grad_norm": 8.3125,
"learning_rate": 4.282804065127073e-05,
"logits/chosen": -6.607521057128906,
"logits/rejected": -6.724272727966309,
"logps/chosen": -62.71710205078125,
"logps/rejected": -83.67607879638672,
"loss": 0.3385,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.5952963829040527,
"rewards/margins": 2.1687827110290527,
"rewards/rejected": -2.7640790939331055,
"step": 695
},
{
"epoch": 0.2685078634445723,
"grad_norm": 1.2421875,
"learning_rate": 4.313660002195712e-05,
"logits/chosen": -6.672816276550293,
"logits/rejected": -6.760655403137207,
"logps/chosen": -61.5716667175293,
"logps/rejected": -82.35960388183594,
"loss": 0.3339,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.5193628668785095,
"rewards/margins": 2.136096715927124,
"rewards/rejected": -2.6554598808288574,
"step": 700
},
{
"epoch": 0.27042577675489066,
"grad_norm": 6.90625,
"learning_rate": 4.3445159392643515e-05,
"logits/chosen": -6.69228458404541,
"logits/rejected": -6.7472124099731445,
"logps/chosen": -70.73759460449219,
"logps/rejected": -91.33528900146484,
"loss": 0.3363,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.826391339302063,
"rewards/margins": 2.150942802429199,
"rewards/rejected": -2.9773337841033936,
"step": 705
},
{
"epoch": 0.27234369006520903,
"grad_norm": 1.296875,
"learning_rate": 4.37537187633299e-05,
"logits/chosen": -6.696518898010254,
"logits/rejected": -6.7846574783325195,
"logps/chosen": -61.52152633666992,
"logps/rejected": -82.66785430908203,
"loss": 0.3322,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5135270357131958,
"rewards/margins": 2.1553821563720703,
"rewards/rejected": -2.6689088344573975,
"step": 710
},
{
"epoch": 0.2742616033755274,
"grad_norm": 0.828125,
"learning_rate": 4.406227813401629e-05,
"logits/chosen": -6.7538933753967285,
"logits/rejected": -6.772116661071777,
"logps/chosen": -62.2570686340332,
"logps/rejected": -83.71648406982422,
"loss": 0.3309,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6405705213546753,
"rewards/margins": 2.1925716400146484,
"rewards/rejected": -2.8331425189971924,
"step": 715
},
{
"epoch": 0.2761795166858458,
"grad_norm": 1.1015625,
"learning_rate": 4.4370837504702684e-05,
"logits/chosen": -6.731971740722656,
"logits/rejected": -6.773854732513428,
"logps/chosen": -65.49516296386719,
"logps/rejected": -86.7280502319336,
"loss": 0.3311,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7476341724395752,
"rewards/margins": 2.2033274173736572,
"rewards/rejected": -2.9509613513946533,
"step": 720
},
{
"epoch": 0.27809742999616416,
"grad_norm": 0.60546875,
"learning_rate": 4.467939687538906e-05,
"logits/chosen": -6.718592166900635,
"logits/rejected": -6.75787878036499,
"logps/chosen": -60.866233825683594,
"logps/rejected": -82.61892700195312,
"loss": 0.33,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5662195682525635,
"rewards/margins": 2.2021169662475586,
"rewards/rejected": -2.768336534500122,
"step": 725
},
{
"epoch": 0.28001534330648253,
"grad_norm": 5.9375,
"learning_rate": 4.4987956246075455e-05,
"logits/chosen": -6.6921539306640625,
"logits/rejected": -6.746490478515625,
"logps/chosen": -61.933387756347656,
"logps/rejected": -83.12504577636719,
"loss": 0.3304,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.6692690253257751,
"rewards/margins": 2.167955160140991,
"rewards/rejected": -2.8372244834899902,
"step": 730
},
{
"epoch": 0.2819332566168009,
"grad_norm": 1.21875,
"learning_rate": 4.529651561676185e-05,
"logits/chosen": -6.7022576332092285,
"logits/rejected": -6.752945899963379,
"logps/chosen": -64.28565216064453,
"logps/rejected": -86.43739318847656,
"loss": 0.3303,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7322890162467957,
"rewards/margins": 2.194568157196045,
"rewards/rejected": -2.9268569946289062,
"step": 735
},
{
"epoch": 0.2838511699271193,
"grad_norm": 0.45703125,
"learning_rate": 4.560507498744823e-05,
"logits/chosen": -6.755038261413574,
"logits/rejected": -6.783468723297119,
"logps/chosen": -66.19497680664062,
"logps/rejected": -86.26811218261719,
"loss": 0.3334,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.7362627387046814,
"rewards/margins": 2.1654791831970215,
"rewards/rejected": -2.9017419815063477,
"step": 740
},
{
"epoch": 0.28576908323743766,
"grad_norm": 0.54296875,
"learning_rate": 4.5913634358134624e-05,
"logits/chosen": -6.751487731933594,
"logits/rejected": -6.81637716293335,
"logps/chosen": -63.06780242919922,
"logps/rejected": -84.47251892089844,
"loss": 0.3296,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5739712119102478,
"rewards/margins": 2.154513120651245,
"rewards/rejected": -2.7284841537475586,
"step": 745
},
{
"epoch": 0.28768699654775604,
"grad_norm": 0.609375,
"learning_rate": 4.622219372882101e-05,
"logits/chosen": -6.6978349685668945,
"logits/rejected": -6.745043754577637,
"logps/chosen": -63.39635467529297,
"logps/rejected": -82.64900207519531,
"loss": 0.3317,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.5827551484107971,
"rewards/margins": 2.2173619270324707,
"rewards/rejected": -2.800117015838623,
"step": 750
},
{
"epoch": 0.2896049098580744,
"grad_norm": 0.84765625,
"learning_rate": 4.6530753099507394e-05,
"logits/chosen": -6.7169013023376465,
"logits/rejected": -6.784239768981934,
"logps/chosen": -63.23963165283203,
"logps/rejected": -84.25068664550781,
"loss": 0.3285,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.643856406211853,
"rewards/margins": 2.1869945526123047,
"rewards/rejected": -2.8308510780334473,
"step": 755
},
{
"epoch": 0.2915228231683928,
"grad_norm": 0.53515625,
"learning_rate": 4.6839312470193786e-05,
"logits/chosen": -6.725077152252197,
"logits/rejected": -6.785587310791016,
"logps/chosen": -65.50286102294922,
"logps/rejected": -87.09868621826172,
"loss": 0.3333,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.8208054304122925,
"rewards/margins": 2.1953988075256348,
"rewards/rejected": -3.016204357147217,
"step": 760
},
{
"epoch": 0.29344073647871116,
"grad_norm": 0.71875,
"learning_rate": 4.714787184088018e-05,
"logits/chosen": -6.824525356292725,
"logits/rejected": -6.858616828918457,
"logps/chosen": -60.581390380859375,
"logps/rejected": -81.6998062133789,
"loss": 0.3279,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5427888631820679,
"rewards/margins": 2.190028190612793,
"rewards/rejected": -2.732816696166992,
"step": 765
},
{
"epoch": 0.29535864978902954,
"grad_norm": 1.953125,
"learning_rate": 4.7456431211566563e-05,
"logits/chosen": -6.723073482513428,
"logits/rejected": -6.727642059326172,
"logps/chosen": -64.07035827636719,
"logps/rejected": -85.05512237548828,
"loss": 0.3304,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6974167823791504,
"rewards/margins": 2.1716017723083496,
"rewards/rejected": -2.8690185546875,
"step": 770
},
{
"epoch": 0.2972765630993479,
"grad_norm": 0.349609375,
"learning_rate": 4.7764990582252955e-05,
"logits/chosen": -6.807308197021484,
"logits/rejected": -6.894169807434082,
"logps/chosen": -59.81858444213867,
"logps/rejected": -80.54625701904297,
"loss": 0.3289,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5121219754219055,
"rewards/margins": 2.172778606414795,
"rewards/rejected": -2.6849007606506348,
"step": 775
},
{
"epoch": 0.2991944764096663,
"grad_norm": 0.68359375,
"learning_rate": 4.807354995293934e-05,
"logits/chosen": -6.723723411560059,
"logits/rejected": -6.834429740905762,
"logps/chosen": -60.23179244995117,
"logps/rejected": -81.87191772460938,
"loss": 0.329,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4345468580722809,
"rewards/margins": 2.197113275527954,
"rewards/rejected": -2.631659984588623,
"step": 780
},
{
"epoch": 0.30111238971998466,
"grad_norm": 0.47265625,
"learning_rate": 4.832039506658243e-05,
"logits/chosen": -6.752009391784668,
"logits/rejected": -6.826085567474365,
"logps/chosen": -64.78627014160156,
"logps/rejected": -86.14216613769531,
"loss": 0.3291,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7065446972846985,
"rewards/margins": 2.1886019706726074,
"rewards/rejected": -2.8951468467712402,
"step": 785
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.431640625,
"learning_rate": 4.832031166492162e-05,
"logits/chosen": -6.7785844802856445,
"logits/rejected": -6.8263349533081055,
"logps/chosen": -66.33650207519531,
"logps/rejected": -88.02735900878906,
"loss": 0.3273,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6940505504608154,
"rewards/margins": 2.166769504547119,
"rewards/rejected": -2.8608198165893555,
"step": 790
},
{
"epoch": 0.3049482163406214,
"grad_norm": 0.458984375,
"learning_rate": 4.8320109118434643e-05,
"logits/chosen": -6.783939361572266,
"logits/rejected": -6.81668758392334,
"logps/chosen": -59.873023986816406,
"logps/rejected": -80.97574615478516,
"loss": 0.3292,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.49882563948631287,
"rewards/margins": 2.2213451862335205,
"rewards/rejected": -2.720170736312866,
"step": 795
},
{
"epoch": 0.3068661296509398,
"grad_norm": 0.72265625,
"learning_rate": 4.8319787428130454e-05,
"logits/chosen": -6.754337310791016,
"logits/rejected": -6.8234124183654785,
"logps/chosen": -63.73439407348633,
"logps/rejected": -85.82870483398438,
"loss": 0.3271,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6053725481033325,
"rewards/margins": 2.2051076889038086,
"rewards/rejected": -2.8104803562164307,
"step": 800
},
{
"epoch": 0.30878404296125816,
"grad_norm": 0.609375,
"learning_rate": 4.8319346595611474e-05,
"logits/chosen": -6.786177158355713,
"logits/rejected": -6.8356428146362305,
"logps/chosen": -66.29222106933594,
"logps/rejected": -87.39335632324219,
"loss": 0.3287,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7608428001403809,
"rewards/margins": 2.2171874046325684,
"rewards/rejected": -2.978030204772949,
"step": 805
},
{
"epoch": 0.31070195627157654,
"grad_norm": 0.373046875,
"learning_rate": 4.8318786623073625e-05,
"logits/chosen": -6.763392448425293,
"logits/rejected": -6.827507019042969,
"logps/chosen": -64.34239196777344,
"logps/rejected": -85.38350677490234,
"loss": 0.3284,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6473758220672607,
"rewards/margins": 2.150489091873169,
"rewards/rejected": -2.797865390777588,
"step": 810
},
{
"epoch": 0.3126198695818949,
"grad_norm": 0.408203125,
"learning_rate": 4.831810751330627e-05,
"logits/chosen": -6.711167812347412,
"logits/rejected": -6.766715049743652,
"logps/chosen": -62.98204803466797,
"logps/rejected": -84.40223693847656,
"loss": 0.3275,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5519531965255737,
"rewards/margins": 2.2026820182800293,
"rewards/rejected": -2.7546346187591553,
"step": 815
},
{
"epoch": 0.3145377828922133,
"grad_norm": 0.478515625,
"learning_rate": 4.8317309269692265e-05,
"logits/chosen": -6.660008430480957,
"logits/rejected": -6.69751501083374,
"logps/chosen": -63.44572830200195,
"logps/rejected": -85.4597396850586,
"loss": 0.3272,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6921919584274292,
"rewards/margins": 2.1986453533172607,
"rewards/rejected": -2.8908371925354004,
"step": 820
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.158203125,
"learning_rate": 4.831639189620787e-05,
"logits/chosen": -6.792325019836426,
"logits/rejected": -6.890578269958496,
"logps/chosen": -61.67216873168945,
"logps/rejected": -82.24214935302734,
"loss": 0.3265,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4790772497653961,
"rewards/margins": 2.210103750228882,
"rewards/rejected": -2.689181089401245,
"step": 825
},
{
"epoch": 0.31837360951285004,
"grad_norm": 0.2060546875,
"learning_rate": 4.831535539742279e-05,
"logits/chosen": -6.7359466552734375,
"logits/rejected": -6.817502498626709,
"logps/chosen": -66.23463439941406,
"logps/rejected": -87.56221008300781,
"loss": 0.326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6128313541412354,
"rewards/margins": 2.193281650543213,
"rewards/rejected": -2.8061134815216064,
"step": 830
},
{
"epoch": 0.3202915228231684,
"grad_norm": 0.33203125,
"learning_rate": 4.831419977850011e-05,
"logits/chosen": -6.762195587158203,
"logits/rejected": -6.8095879554748535,
"logps/chosen": -61.4901237487793,
"logps/rejected": -82.7410888671875,
"loss": 0.3275,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5622529983520508,
"rewards/margins": 2.2180874347686768,
"rewards/rejected": -2.7803401947021484,
"step": 835
},
{
"epoch": 0.3222094361334868,
"grad_norm": 0.4453125,
"learning_rate": 4.8312925045196294e-05,
"logits/chosen": -6.715124607086182,
"logits/rejected": -6.803443908691406,
"logps/chosen": -62.32609176635742,
"logps/rejected": -82.6832275390625,
"loss": 0.3273,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5614801645278931,
"rewards/margins": 2.209786891937256,
"rewards/rejected": -2.7712676525115967,
"step": 840
},
{
"epoch": 0.3241273494438051,
"grad_norm": 0.2099609375,
"learning_rate": 4.8311531203861164e-05,
"logits/chosen": -6.737015724182129,
"logits/rejected": -6.792696952819824,
"logps/chosen": -63.43701934814453,
"logps/rejected": -85.36714935302734,
"loss": 0.3266,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6197028756141663,
"rewards/margins": 2.200639009475708,
"rewards/rejected": -2.8203418254852295,
"step": 845
},
{
"epoch": 0.3260452627541235,
"grad_norm": 0.283203125,
"learning_rate": 4.83100182614378e-05,
"logits/chosen": -6.828474521636963,
"logits/rejected": -6.907891273498535,
"logps/chosen": -62.228851318359375,
"logps/rejected": -84.7411117553711,
"loss": 0.3261,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.565654456615448,
"rewards/margins": 2.194936752319336,
"rewards/rejected": -2.7605910301208496,
"step": 850
},
{
"epoch": 0.32796317606444186,
"grad_norm": 0.12158203125,
"learning_rate": 4.830838622546262e-05,
"logits/chosen": -6.825277805328369,
"logits/rejected": -6.85458517074585,
"logps/chosen": -61.36362838745117,
"logps/rejected": -83.68403625488281,
"loss": 0.3263,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5196749567985535,
"rewards/margins": 2.208179235458374,
"rewards/rejected": -2.7278542518615723,
"step": 855
},
{
"epoch": 0.32988108937476024,
"grad_norm": 0.2060546875,
"learning_rate": 4.830663510406525e-05,
"logits/chosen": -6.76092004776001,
"logits/rejected": -6.817212104797363,
"logps/chosen": -60.813751220703125,
"logps/rejected": -82.04981994628906,
"loss": 0.326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5437586903572083,
"rewards/margins": 2.1881277561187744,
"rewards/rejected": -2.731886386871338,
"step": 860
},
{
"epoch": 0.3317990026850786,
"grad_norm": 0.208984375,
"learning_rate": 4.8304764905968524e-05,
"logits/chosen": -6.780856132507324,
"logits/rejected": -6.783792018890381,
"logps/chosen": -59.90398025512695,
"logps/rejected": -81.0885009765625,
"loss": 0.3264,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5090753436088562,
"rewards/margins": 2.1934762001037598,
"rewards/rejected": -2.70255184173584,
"step": 865
},
{
"epoch": 0.333716915995397,
"grad_norm": 0.162109375,
"learning_rate": 4.830277564048841e-05,
"logits/chosen": -6.7684478759765625,
"logits/rejected": -6.799691677093506,
"logps/chosen": -61.917884826660156,
"logps/rejected": -82.83894348144531,
"loss": 0.3256,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.520404040813446,
"rewards/margins": 2.187943696975708,
"rewards/rejected": -2.708347797393799,
"step": 870
},
{
"epoch": 0.33563482930571537,
"grad_norm": 0.1298828125,
"learning_rate": 4.8300667317533996e-05,
"logits/chosen": -6.776177883148193,
"logits/rejected": -6.815954685211182,
"logps/chosen": -65.00544738769531,
"logps/rejected": -86.48484802246094,
"loss": 0.3255,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6944643259048462,
"rewards/margins": 2.1923465728759766,
"rewards/rejected": -2.8868112564086914,
"step": 875
},
{
"epoch": 0.33755274261603374,
"grad_norm": 0.12255859375,
"learning_rate": 4.829843994760745e-05,
"logits/chosen": -6.812263488769531,
"logits/rejected": -6.835313320159912,
"logps/chosen": -57.126991271972656,
"logps/rejected": -79.21980285644531,
"loss": 0.3255,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4502105712890625,
"rewards/margins": 2.1968555450439453,
"rewards/rejected": -2.647066116333008,
"step": 880
},
{
"epoch": 0.3394706559263521,
"grad_norm": 0.123046875,
"learning_rate": 4.829609354180392e-05,
"logits/chosen": -6.792092323303223,
"logits/rejected": -6.8232903480529785,
"logps/chosen": -62.5068359375,
"logps/rejected": -82.26536560058594,
"loss": 0.3255,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4687983989715576,
"rewards/margins": 2.2032909393310547,
"rewards/rejected": -2.6720895767211914,
"step": 885
},
{
"epoch": 0.3413885692366705,
"grad_norm": 0.08837890625,
"learning_rate": 4.8293628111811505e-05,
"logits/chosen": -6.755088806152344,
"logits/rejected": -6.835995674133301,
"logps/chosen": -63.63855743408203,
"logps/rejected": -85.75823211669922,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6732860803604126,
"rewards/margins": 2.1958742141723633,
"rewards/rejected": -2.8691601753234863,
"step": 890
},
{
"epoch": 0.34330648254698887,
"grad_norm": 0.1865234375,
"learning_rate": 4.829104366991121e-05,
"logits/chosen": -6.732597351074219,
"logits/rejected": -6.824942588806152,
"logps/chosen": -62.95014572143555,
"logps/rejected": -84.47950744628906,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.569419264793396,
"rewards/margins": 2.2020137310028076,
"rewards/rejected": -2.7714333534240723,
"step": 895
},
{
"epoch": 0.34522439585730724,
"grad_norm": 0.11767578125,
"learning_rate": 4.8288340228976864e-05,
"logits/chosen": -6.789787292480469,
"logits/rejected": -6.844014644622803,
"logps/chosen": -64.68815612792969,
"logps/rejected": -85.33354949951172,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.555423378944397,
"rewards/margins": 2.1973979473114014,
"rewards/rejected": -2.752821445465088,
"step": 900
},
{
"epoch": 0.3471423091676256,
"grad_norm": 0.0703125,
"learning_rate": 4.828551780247507e-05,
"logits/chosen": -6.739270210266113,
"logits/rejected": -6.838677406311035,
"logps/chosen": -63.15021514892578,
"logps/rejected": -85.18204498291016,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6110334992408752,
"rewards/margins": 2.194159507751465,
"rewards/rejected": -2.8051929473876953,
"step": 905
},
{
"epoch": 0.349060222477944,
"grad_norm": 0.1259765625,
"learning_rate": 4.8282576404465136e-05,
"logits/chosen": -6.813556671142578,
"logits/rejected": -6.86798620223999,
"logps/chosen": -61.781394958496094,
"logps/rejected": -83.4185562133789,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4605577886104584,
"rewards/margins": 2.199518918991089,
"rewards/rejected": -2.66007661819458,
"step": 910
},
{
"epoch": 0.35097813578826237,
"grad_norm": 0.06494140625,
"learning_rate": 4.8279516049598964e-05,
"logits/chosen": -6.776249885559082,
"logits/rejected": -6.871140956878662,
"logps/chosen": -61.13481521606445,
"logps/rejected": -82.49009704589844,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.41385921835899353,
"rewards/margins": 2.1962554454803467,
"rewards/rejected": -2.6101150512695312,
"step": 915
},
{
"epoch": 0.35289604909858074,
"grad_norm": 0.10107421875,
"learning_rate": 4.827633675312108e-05,
"logits/chosen": -6.725433349609375,
"logits/rejected": -6.784366607666016,
"logps/chosen": -61.72522735595703,
"logps/rejected": -81.87347412109375,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.49139589071273804,
"rewards/margins": 2.188589572906494,
"rewards/rejected": -2.679985523223877,
"step": 920
},
{
"epoch": 0.3548139624088991,
"grad_norm": 0.123046875,
"learning_rate": 4.827303853086843e-05,
"logits/chosen": -6.812512397766113,
"logits/rejected": -6.855481147766113,
"logps/chosen": -58.620384216308594,
"logps/rejected": -80.5108642578125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.48473721742630005,
"rewards/margins": 2.198575258255005,
"rewards/rejected": -2.68331241607666,
"step": 925
},
{
"epoch": 0.3567318757192175,
"grad_norm": 0.0771484375,
"learning_rate": 4.826962139927038e-05,
"logits/chosen": -6.829216003417969,
"logits/rejected": -6.905343532562256,
"logps/chosen": -63.4243278503418,
"logps/rejected": -84.9180908203125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5410398244857788,
"rewards/margins": 2.1947712898254395,
"rewards/rejected": -2.7358109951019287,
"step": 930
},
{
"epoch": 0.35864978902953587,
"grad_norm": 0.0673828125,
"learning_rate": 4.826608537534864e-05,
"logits/chosen": -6.665771484375,
"logits/rejected": -6.781673431396484,
"logps/chosen": -61.3337516784668,
"logps/rejected": -83.3028335571289,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5410109758377075,
"rewards/margins": 2.192946195602417,
"rewards/rejected": -2.733957290649414,
"step": 935
},
{
"epoch": 0.36056770233985425,
"grad_norm": 0.058349609375,
"learning_rate": 4.826243047671713e-05,
"logits/chosen": -6.8091139793396,
"logits/rejected": -6.85195255279541,
"logps/chosen": -63.64402389526367,
"logps/rejected": -85.47203063964844,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.638896107673645,
"rewards/margins": 2.196115255355835,
"rewards/rejected": -2.8350117206573486,
"step": 940
},
{
"epoch": 0.3624856156501726,
"grad_norm": 0.103515625,
"learning_rate": 4.825865672158193e-05,
"logits/chosen": -6.7124199867248535,
"logits/rejected": -6.7912397384643555,
"logps/chosen": -62.58756637573242,
"logps/rejected": -83.9194107055664,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6223362684249878,
"rewards/margins": 2.1918787956237793,
"rewards/rejected": -2.8142151832580566,
"step": 945
},
{
"epoch": 0.364403528960491,
"grad_norm": 0.06787109375,
"learning_rate": 4.825476412874119e-05,
"logits/chosen": -6.7734575271606445,
"logits/rejected": -6.823586463928223,
"logps/chosen": -62.79764938354492,
"logps/rejected": -84.3304672241211,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5803407430648804,
"rewards/margins": 2.1943869590759277,
"rewards/rejected": -2.774728298187256,
"step": 950
},
{
"epoch": 0.36632144227080937,
"grad_norm": 0.06591796875,
"learning_rate": 4.8250752717584965e-05,
"logits/chosen": -6.757647514343262,
"logits/rejected": -6.80877161026001,
"logps/chosen": -66.12776947021484,
"logps/rejected": -87.11688232421875,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6834413409233093,
"rewards/margins": 2.198050022125244,
"rewards/rejected": -2.8814916610717773,
"step": 955
},
{
"epoch": 0.36823935558112775,
"grad_norm": 0.0771484375,
"learning_rate": 4.8246622508095275e-05,
"logits/chosen": -6.766076564788818,
"logits/rejected": -6.856036186218262,
"logps/chosen": -68.24916076660156,
"logps/rejected": -88.81385040283203,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7616792917251587,
"rewards/margins": 2.1991758346557617,
"rewards/rejected": -2.960855007171631,
"step": 960
},
{
"epoch": 0.3701572688914461,
"grad_norm": 0.08056640625,
"learning_rate": 4.824237352084582e-05,
"logits/chosen": -6.84201192855835,
"logits/rejected": -6.853090763092041,
"logps/chosen": -63.752159118652344,
"logps/rejected": -85.63033294677734,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6369014978408813,
"rewards/margins": 2.1962616443634033,
"rewards/rejected": -2.833163261413574,
"step": 965
},
{
"epoch": 0.3720751822017645,
"grad_norm": 0.058837890625,
"learning_rate": 4.8238005777002006e-05,
"logits/chosen": -6.842381477355957,
"logits/rejected": -6.8631696701049805,
"logps/chosen": -62.98088455200195,
"logps/rejected": -84.48819732666016,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5686295032501221,
"rewards/margins": 2.1978487968444824,
"rewards/rejected": -2.7664780616760254,
"step": 970
},
{
"epoch": 0.3739930955120829,
"grad_norm": 0.06005859375,
"learning_rate": 4.82335192983208e-05,
"logits/chosen": -6.801692008972168,
"logits/rejected": -6.871775150299072,
"logps/chosen": -65.15239715576172,
"logps/rejected": -85.4663314819336,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.561257004737854,
"rewards/margins": 2.1907687187194824,
"rewards/rejected": -2.752026081085205,
"step": 975
},
{
"epoch": 0.37591100882240125,
"grad_norm": 0.059326171875,
"learning_rate": 4.822891410715061e-05,
"logits/chosen": -6.735692024230957,
"logits/rejected": -6.789946556091309,
"logps/chosen": -59.1342887878418,
"logps/rejected": -80.65605926513672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43425020575523376,
"rewards/margins": 2.1964149475097656,
"rewards/rejected": -2.6306653022766113,
"step": 980
},
{
"epoch": 0.3778289221327196,
"grad_norm": 0.04443359375,
"learning_rate": 4.822419022643119e-05,
"logits/chosen": -6.6819329261779785,
"logits/rejected": -6.77020263671875,
"logps/chosen": -65.46723937988281,
"logps/rejected": -85.34056091308594,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5593077540397644,
"rewards/margins": 2.1933341026306152,
"rewards/rejected": -2.7526416778564453,
"step": 985
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.06201171875,
"learning_rate": 4.821934767969352e-05,
"logits/chosen": -6.758429527282715,
"logits/rejected": -6.803321838378906,
"logps/chosen": -58.53925323486328,
"logps/rejected": -80.94148254394531,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5433183908462524,
"rewards/margins": 2.1945109367370605,
"rewards/rejected": -2.7378292083740234,
"step": 990
},
{
"epoch": 0.3816647487533564,
"grad_norm": 0.05126953125,
"learning_rate": 4.8214386491059666e-05,
"logits/chosen": -6.7455244064331055,
"logits/rejected": -6.8056206703186035,
"logps/chosen": -66.12544250488281,
"logps/rejected": -86.32331848144531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6424646377563477,
"rewards/margins": 2.1983883380889893,
"rewards/rejected": -2.840852737426758,
"step": 995
},
{
"epoch": 0.3835826620636747,
"grad_norm": 0.06982421875,
"learning_rate": 4.820930668524273e-05,
"logits/chosen": -6.71005392074585,
"logits/rejected": -6.745247840881348,
"logps/chosen": -66.3671646118164,
"logps/rejected": -87.68939208984375,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6825047731399536,
"rewards/margins": 2.1931846141815186,
"rewards/rejected": -2.8756892681121826,
"step": 1000
},
{
"epoch": 0.3835826620636747,
"eval_logits/chosen": -6.65899658203125,
"eval_logits/rejected": -6.858875751495361,
"eval_logps/chosen": -61.29154968261719,
"eval_logps/rejected": -84.09736633300781,
"eval_loss": 0.3251773715019226,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -0.5838707089424133,
"eval_rewards/margins": 2.196570634841919,
"eval_rewards/rejected": -2.7804412841796875,
"eval_runtime": 5.2595,
"eval_samples_per_second": 38.026,
"eval_steps_per_second": 38.026,
"step": 1000
},
{
"epoch": 0.38550057537399307,
"grad_norm": 0.0966796875,
"learning_rate": 4.8204108287546635e-05,
"logits/chosen": -6.718655586242676,
"logits/rejected": -6.77310037612915,
"logps/chosen": -61.59770584106445,
"logps/rejected": -82.8775863647461,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5203923583030701,
"rewards/margins": 2.196401596069336,
"rewards/rejected": -2.716794013977051,
"step": 1005
},
{
"epoch": 0.38741848868431145,
"grad_norm": 0.0546875,
"learning_rate": 4.8198791323866075e-05,
"logits/chosen": -6.782384395599365,
"logits/rejected": -6.805130958557129,
"logps/chosen": -66.81037139892578,
"logps/rejected": -87.42963409423828,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6235989928245544,
"rewards/margins": 2.195854663848877,
"rewards/rejected": -2.819453716278076,
"step": 1010
},
{
"epoch": 0.3893364019946298,
"grad_norm": 0.0654296875,
"learning_rate": 4.819335582068633e-05,
"logits/chosen": -6.907446384429932,
"logits/rejected": -6.924661159515381,
"logps/chosen": -60.52661895751953,
"logps/rejected": -81.73246765136719,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5020779371261597,
"rewards/margins": 2.1980807781219482,
"rewards/rejected": -2.7001590728759766,
"step": 1015
},
{
"epoch": 0.3912543153049482,
"grad_norm": 0.04833984375,
"learning_rate": 4.818780180508317e-05,
"logits/chosen": -6.781402587890625,
"logits/rejected": -6.855971336364746,
"logps/chosen": -62.72636795043945,
"logps/rejected": -83.95735931396484,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5438644886016846,
"rewards/margins": 2.1982574462890625,
"rewards/rejected": -2.742121934890747,
"step": 1020
},
{
"epoch": 0.3931722286152666,
"grad_norm": 0.06201171875,
"learning_rate": 4.818212930472272e-05,
"logits/chosen": -6.8052659034729,
"logits/rejected": -6.898565769195557,
"logps/chosen": -62.053138732910156,
"logps/rejected": -83.63028717041016,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5735294222831726,
"rewards/margins": 2.195286750793457,
"rewards/rejected": -2.7688159942626953,
"step": 1025
},
{
"epoch": 0.39509014192558495,
"grad_norm": 0.049072265625,
"learning_rate": 4.817633834786127e-05,
"logits/chosen": -6.7843756675720215,
"logits/rejected": -6.809134006500244,
"logps/chosen": -64.53950500488281,
"logps/rejected": -85.89805603027344,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5933750867843628,
"rewards/margins": 2.1912827491760254,
"rewards/rejected": -2.7846579551696777,
"step": 1030
},
{
"epoch": 0.3970080552359033,
"grad_norm": 0.05419921875,
"learning_rate": 4.8170428963345233e-05,
"logits/chosen": -6.7891716957092285,
"logits/rejected": -6.815330505371094,
"logps/chosen": -66.38642883300781,
"logps/rejected": -87.5756607055664,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6958648562431335,
"rewards/margins": 2.1894466876983643,
"rewards/rejected": -2.8853116035461426,
"step": 1035
},
{
"epoch": 0.3989259685462217,
"grad_norm": 0.07763671875,
"learning_rate": 4.81644011806109e-05,
"logits/chosen": -6.720099449157715,
"logits/rejected": -6.809721946716309,
"logps/chosen": -61.158897399902344,
"logps/rejected": -81.80070495605469,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.520710825920105,
"rewards/margins": 2.191119432449341,
"rewards/rejected": -2.7118301391601562,
"step": 1040
},
{
"epoch": 0.4008438818565401,
"grad_norm": 0.05810546875,
"learning_rate": 4.8158255029684364e-05,
"logits/chosen": -6.766349792480469,
"logits/rejected": -6.818787574768066,
"logps/chosen": -63.04213333129883,
"logps/rejected": -85.06871032714844,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6426771283149719,
"rewards/margins": 2.19403076171875,
"rewards/rejected": -2.836707592010498,
"step": 1045
},
{
"epoch": 0.40276179516685845,
"grad_norm": 0.05712890625,
"learning_rate": 4.815199054118132e-05,
"logits/chosen": -6.773943901062012,
"logits/rejected": -6.834680080413818,
"logps/chosen": -62.253082275390625,
"logps/rejected": -83.83917236328125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4995577931404114,
"rewards/margins": 2.1934943199157715,
"rewards/rejected": -2.693052053451538,
"step": 1050
},
{
"epoch": 0.4046797084771768,
"grad_norm": 0.045166015625,
"learning_rate": 4.8145607746306934e-05,
"logits/chosen": -6.594931602478027,
"logits/rejected": -6.683431148529053,
"logps/chosen": -66.52024841308594,
"logps/rejected": -87.12366485595703,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6391206979751587,
"rewards/margins": 2.1987528800964355,
"rewards/rejected": -2.8378734588623047,
"step": 1055
},
{
"epoch": 0.4065976217874952,
"grad_norm": 0.0458984375,
"learning_rate": 4.8139106676855725e-05,
"logits/chosen": -6.760615348815918,
"logits/rejected": -6.792774200439453,
"logps/chosen": -63.51324462890625,
"logps/rejected": -85.3805923461914,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5777386426925659,
"rewards/margins": 2.20072865486145,
"rewards/rejected": -2.7784667015075684,
"step": 1060
},
{
"epoch": 0.4085155350978136,
"grad_norm": 0.052734375,
"learning_rate": 4.813248736521134e-05,
"logits/chosen": -6.811188697814941,
"logits/rejected": -6.867379665374756,
"logps/chosen": -67.62599182128906,
"logps/rejected": -87.9825668334961,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7734149098396301,
"rewards/margins": 2.195405960083008,
"rewards/rejected": -2.9688210487365723,
"step": 1065
},
{
"epoch": 0.41043344840813195,
"grad_norm": 0.052001953125,
"learning_rate": 4.812574984434643e-05,
"logits/chosen": -6.798884391784668,
"logits/rejected": -6.8902130126953125,
"logps/chosen": -60.3090705871582,
"logps/rejected": -81.30734252929688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5456782579421997,
"rewards/margins": 2.198199987411499,
"rewards/rejected": -2.7438783645629883,
"step": 1070
},
{
"epoch": 0.4123513617184503,
"grad_norm": 0.04541015625,
"learning_rate": 4.811889414782247e-05,
"logits/chosen": -6.775029182434082,
"logits/rejected": -6.823780059814453,
"logps/chosen": -62.48878860473633,
"logps/rejected": -83.3646011352539,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5705373883247375,
"rewards/margins": 2.1929373741149902,
"rewards/rejected": -2.763474702835083,
"step": 1075
},
{
"epoch": 0.4142692750287687,
"grad_norm": 0.04833984375,
"learning_rate": 4.811192030978961e-05,
"logits/chosen": -6.747941017150879,
"logits/rejected": -6.830478668212891,
"logps/chosen": -66.92538452148438,
"logps/rejected": -88.23207092285156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7639543414115906,
"rewards/margins": 2.1968750953674316,
"rewards/rejected": -2.960829734802246,
"step": 1080
},
{
"epoch": 0.4161871883390871,
"grad_norm": 0.0517578125,
"learning_rate": 4.810482836498652e-05,
"logits/chosen": -6.778285980224609,
"logits/rejected": -6.8560028076171875,
"logps/chosen": -64.7900161743164,
"logps/rejected": -85.25300598144531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5712297558784485,
"rewards/margins": 2.196739912033081,
"rewards/rejected": -2.767969846725464,
"step": 1085
},
{
"epoch": 0.41810510164940545,
"grad_norm": 0.04736328125,
"learning_rate": 4.809761834874016e-05,
"logits/chosen": -6.8289031982421875,
"logits/rejected": -6.893160820007324,
"logps/chosen": -59.30530548095703,
"logps/rejected": -81.9992446899414,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.48122939467430115,
"rewards/margins": 2.1956143379211426,
"rewards/rejected": -2.6768441200256348,
"step": 1090
},
{
"epoch": 0.42002301495972383,
"grad_norm": 0.048583984375,
"learning_rate": 4.809029029696565e-05,
"logits/chosen": -6.742526054382324,
"logits/rejected": -6.7710466384887695,
"logps/chosen": -70.74293518066406,
"logps/rejected": -91.24531555175781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7574164867401123,
"rewards/margins": 2.1936662197113037,
"rewards/rejected": -2.951082706451416,
"step": 1095
},
{
"epoch": 0.4219409282700422,
"grad_norm": 0.03955078125,
"learning_rate": 4.8082844246166064e-05,
"logits/chosen": -6.797771453857422,
"logits/rejected": -6.8490142822265625,
"logps/chosen": -58.99123001098633,
"logps/rejected": -79.65338134765625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4870396554470062,
"rewards/margins": 2.196023464202881,
"rewards/rejected": -2.68306303024292,
"step": 1100
},
{
"epoch": 0.4238588415803606,
"grad_norm": 0.04736328125,
"learning_rate": 4.8075280233432274e-05,
"logits/chosen": -6.773340702056885,
"logits/rejected": -6.859117031097412,
"logps/chosen": -64.4073486328125,
"logps/rejected": -85.19906616210938,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6140931248664856,
"rewards/margins": 2.196549654006958,
"rewards/rejected": -2.810642719268799,
"step": 1105
},
{
"epoch": 0.42577675489067895,
"grad_norm": 0.04541015625,
"learning_rate": 4.806759829644277e-05,
"logits/chosen": -6.728902339935303,
"logits/rejected": -6.808338165283203,
"logps/chosen": -61.23081588745117,
"logps/rejected": -82.18063354492188,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4775692820549011,
"rewards/margins": 2.1966347694396973,
"rewards/rejected": -2.674203872680664,
"step": 1110
},
{
"epoch": 0.42769466820099733,
"grad_norm": 0.054443359375,
"learning_rate": 4.805979847346342e-05,
"logits/chosen": -6.670124053955078,
"logits/rejected": -6.779568672180176,
"logps/chosen": -63.63054656982422,
"logps/rejected": -84.89369201660156,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5798670053482056,
"rewards/margins": 2.1969175338745117,
"rewards/rejected": -2.7767844200134277,
"step": 1115
},
{
"epoch": 0.4296125815113157,
"grad_norm": 0.04248046875,
"learning_rate": 4.805188080334735e-05,
"logits/chosen": -6.852222442626953,
"logits/rejected": -6.859063625335693,
"logps/chosen": -65.18397521972656,
"logps/rejected": -84.5948486328125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5618688464164734,
"rewards/margins": 2.198063373565674,
"rewards/rejected": -2.759932041168213,
"step": 1120
},
{
"epoch": 0.4315304948216341,
"grad_norm": 0.052978515625,
"learning_rate": 4.80438453255347e-05,
"logits/chosen": -6.80611515045166,
"logits/rejected": -6.918792724609375,
"logps/chosen": -60.48639678955078,
"logps/rejected": -82.47330474853516,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5442097187042236,
"rewards/margins": 2.1984286308288574,
"rewards/rejected": -2.742638111114502,
"step": 1125
},
{
"epoch": 0.43344840813195246,
"grad_norm": 0.052978515625,
"learning_rate": 4.8035692080052436e-05,
"logits/chosen": -6.697965145111084,
"logits/rejected": -6.774870872497559,
"logps/chosen": -64.44297790527344,
"logps/rejected": -85.90459442138672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7526843547821045,
"rewards/margins": 2.1978108882904053,
"rewards/rejected": -2.9504952430725098,
"step": 1130
},
{
"epoch": 0.43536632144227083,
"grad_norm": 0.05029296875,
"learning_rate": 4.802742110751416e-05,
"logits/chosen": -6.803210258483887,
"logits/rejected": -6.829536437988281,
"logps/chosen": -60.73114776611328,
"logps/rejected": -82.74317169189453,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.44673386216163635,
"rewards/margins": 2.194598913192749,
"rewards/rejected": -2.6413326263427734,
"step": 1135
},
{
"epoch": 0.4372842347525892,
"grad_norm": 0.059326171875,
"learning_rate": 4.801903244911993e-05,
"logits/chosen": -6.766633033752441,
"logits/rejected": -6.864239692687988,
"logps/chosen": -63.66731643676758,
"logps/rejected": -82.88044738769531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5311779379844666,
"rewards/margins": 2.1935510635375977,
"rewards/rejected": -2.724729061126709,
"step": 1140
},
{
"epoch": 0.4392021480629076,
"grad_norm": 0.044189453125,
"learning_rate": 4.8010526146656e-05,
"logits/chosen": -6.751435279846191,
"logits/rejected": -6.798174858093262,
"logps/chosen": -68.84373474121094,
"logps/rejected": -90.4491195678711,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7617834210395813,
"rewards/margins": 2.1939213275909424,
"rewards/rejected": -2.9557042121887207,
"step": 1145
},
{
"epoch": 0.44112006137322596,
"grad_norm": 0.04443359375,
"learning_rate": 4.800190224249464e-05,
"logits/chosen": -6.790631294250488,
"logits/rejected": -6.874907493591309,
"logps/chosen": -64.97017669677734,
"logps/rejected": -85.68257904052734,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5560901761054993,
"rewards/margins": 2.1980504989624023,
"rewards/rejected": -2.7541403770446777,
"step": 1150
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.04296875,
"learning_rate": 4.799316077959397e-05,
"logits/chosen": -6.840447902679443,
"logits/rejected": -6.897547721862793,
"logps/chosen": -61.42547607421875,
"logps/rejected": -83.30725860595703,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5857731103897095,
"rewards/margins": 2.1990787982940674,
"rewards/rejected": -2.784851551055908,
"step": 1155
},
{
"epoch": 0.44495588799386265,
"grad_norm": 0.04052734375,
"learning_rate": 4.798430180149765e-05,
"logits/chosen": -6.734255313873291,
"logits/rejected": -6.802587032318115,
"logps/chosen": -63.668617248535156,
"logps/rejected": -86.23512268066406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6360193490982056,
"rewards/margins": 2.197523593902588,
"rewards/rejected": -2.833542823791504,
"step": 1160
},
{
"epoch": 0.44687380130418103,
"grad_norm": 0.046875,
"learning_rate": 4.797532535233475e-05,
"logits/chosen": -6.807023525238037,
"logits/rejected": -6.8437395095825195,
"logps/chosen": -61.99128341674805,
"logps/rejected": -82.16876220703125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5318325161933899,
"rewards/margins": 2.200625419616699,
"rewards/rejected": -2.7324578762054443,
"step": 1165
},
{
"epoch": 0.4487917146144994,
"grad_norm": 0.064453125,
"learning_rate": 4.7966231476819484e-05,
"logits/chosen": -6.760836124420166,
"logits/rejected": -6.797084808349609,
"logps/chosen": -60.612220764160156,
"logps/rejected": -81.6513442993164,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.574898362159729,
"rewards/margins": 2.1929774284362793,
"rewards/rejected": -2.7678756713867188,
"step": 1170
},
{
"epoch": 0.4507096279248178,
"grad_norm": 0.04150390625,
"learning_rate": 4.7957020220251006e-05,
"logits/chosen": -6.7507781982421875,
"logits/rejected": -6.816678524017334,
"logps/chosen": -59.12261199951172,
"logps/rejected": -81.34478759765625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4484129548072815,
"rewards/margins": 2.1968235969543457,
"rewards/rejected": -2.6452364921569824,
"step": 1175
},
{
"epoch": 0.45262754123513615,
"grad_norm": 0.040283203125,
"learning_rate": 4.7947691628513175e-05,
"logits/chosen": -6.8405938148498535,
"logits/rejected": -6.911283016204834,
"logps/chosen": -62.71659469604492,
"logps/rejected": -84.43605041503906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5480086207389832,
"rewards/margins": 2.1955459117889404,
"rewards/rejected": -2.7435545921325684,
"step": 1180
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.051513671875,
"learning_rate": 4.793824574807431e-05,
"logits/chosen": -6.7717742919921875,
"logits/rejected": -6.848321437835693,
"logps/chosen": -62.666900634765625,
"logps/rejected": -82.93745422363281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6121290922164917,
"rewards/margins": 2.195594072341919,
"rewards/rejected": -2.8077232837677,
"step": 1185
},
{
"epoch": 0.4564633678557729,
"grad_norm": 0.06201171875,
"learning_rate": 4.7928682625987026e-05,
"logits/chosen": -6.753211975097656,
"logits/rejected": -6.8527021408081055,
"logps/chosen": -63.77888107299805,
"logps/rejected": -85.04472351074219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6296527981758118,
"rewards/margins": 2.1975722312927246,
"rewards/rejected": -2.8272249698638916,
"step": 1190
},
{
"epoch": 0.4583812811660913,
"grad_norm": 0.046630859375,
"learning_rate": 4.79190023098879e-05,
"logits/chosen": -6.826653957366943,
"logits/rejected": -6.868197441101074,
"logps/chosen": -63.947975158691406,
"logps/rejected": -85.13566589355469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6934822797775269,
"rewards/margins": 2.195202350616455,
"rewards/rejected": -2.8886845111846924,
"step": 1195
},
{
"epoch": 0.46029919447640966,
"grad_norm": 0.0400390625,
"learning_rate": 4.7909204847997314e-05,
"logits/chosen": -6.788220405578613,
"logits/rejected": -6.831966400146484,
"logps/chosen": -60.81682205200195,
"logps/rejected": -81.76251220703125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5235145688056946,
"rewards/margins": 2.1972155570983887,
"rewards/rejected": -2.7207303047180176,
"step": 1200
},
{
"epoch": 0.46221710778672803,
"grad_norm": 0.044921875,
"learning_rate": 4.789929028911919e-05,
"logits/chosen": -6.754181861877441,
"logits/rejected": -6.800840854644775,
"logps/chosen": -64.92515563964844,
"logps/rejected": -86.32291412353516,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7156641483306885,
"rewards/margins": 2.1928858757019043,
"rewards/rejected": -2.9085500240325928,
"step": 1205
},
{
"epoch": 0.4641350210970464,
"grad_norm": 0.04736328125,
"learning_rate": 4.7889258682640706e-05,
"logits/chosen": -6.868006229400635,
"logits/rejected": -6.896157264709473,
"logps/chosen": -60.29206085205078,
"logps/rejected": -81.28080749511719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3792288899421692,
"rewards/margins": 2.199084758758545,
"rewards/rejected": -2.5783140659332275,
"step": 1210
},
{
"epoch": 0.4660529344073648,
"grad_norm": 0.037109375,
"learning_rate": 4.7879110078532146e-05,
"logits/chosen": -6.787661552429199,
"logits/rejected": -6.892359256744385,
"logps/chosen": -60.83192825317383,
"logps/rejected": -82.53497314453125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5688813924789429,
"rewards/margins": 2.2011029720306396,
"rewards/rejected": -2.7699837684631348,
"step": 1215
},
{
"epoch": 0.46797084771768316,
"grad_norm": 0.04736328125,
"learning_rate": 4.7868844527346537e-05,
"logits/chosen": -6.738536834716797,
"logits/rejected": -6.749987602233887,
"logps/chosen": -62.675140380859375,
"logps/rejected": -84.42222595214844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7274782657623291,
"rewards/margins": 2.192878007888794,
"rewards/rejected": -2.920356035232544,
"step": 1220
},
{
"epoch": 0.46988876102800153,
"grad_norm": 0.050537109375,
"learning_rate": 4.785846208021948e-05,
"logits/chosen": -6.793893337249756,
"logits/rejected": -6.861934661865234,
"logps/chosen": -61.73297119140625,
"logps/rejected": -83.82185363769531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5035967230796814,
"rewards/margins": 2.196446657180786,
"rewards/rejected": -2.700043201446533,
"step": 1225
},
{
"epoch": 0.4718066743383199,
"grad_norm": 0.05078125,
"learning_rate": 4.7847962788868864e-05,
"logits/chosen": -6.81466817855835,
"logits/rejected": -6.926538944244385,
"logps/chosen": -59.07057571411133,
"logps/rejected": -80.15919494628906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3567991554737091,
"rewards/margins": 2.197136163711548,
"rewards/rejected": -2.5539352893829346,
"step": 1230
},
{
"epoch": 0.4737245876486383,
"grad_norm": 0.058349609375,
"learning_rate": 4.783734670559462e-05,
"logits/chosen": -6.831311225891113,
"logits/rejected": -6.877817630767822,
"logps/chosen": -64.96932220458984,
"logps/rejected": -85.99290466308594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6482155919075012,
"rewards/margins": 2.1953940391540527,
"rewards/rejected": -2.843609571456909,
"step": 1235
},
{
"epoch": 0.47564250095895666,
"grad_norm": 0.0390625,
"learning_rate": 4.7826613883278425e-05,
"logits/chosen": -6.820085048675537,
"logits/rejected": -6.860791206359863,
"logps/chosen": -59.95061111450195,
"logps/rejected": -81.42887878417969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5063437819480896,
"rewards/margins": 2.197067975997925,
"rewards/rejected": -2.703411817550659,
"step": 1240
},
{
"epoch": 0.47756041426927504,
"grad_norm": 0.0419921875,
"learning_rate": 4.781576437538349e-05,
"logits/chosen": -6.722633361816406,
"logits/rejected": -6.769705772399902,
"logps/chosen": -62.50212860107422,
"logps/rejected": -83.28905487060547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.486052930355072,
"rewards/margins": 2.1938464641571045,
"rewards/rejected": -2.6798996925354004,
"step": 1245
},
{
"epoch": 0.4794783275795934,
"grad_norm": 0.0576171875,
"learning_rate": 4.780479823595428e-05,
"logits/chosen": -6.791558265686035,
"logits/rejected": -6.841597557067871,
"logps/chosen": -60.80095291137695,
"logps/rejected": -82.75154113769531,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5802160501480103,
"rewards/margins": 2.193262815475464,
"rewards/rejected": -2.7734789848327637,
"step": 1250
},
{
"epoch": 0.4813962408899118,
"grad_norm": 0.05322265625,
"learning_rate": 4.7793715519616194e-05,
"logits/chosen": -6.77915096282959,
"logits/rejected": -6.826117515563965,
"logps/chosen": -61.505638122558594,
"logps/rejected": -81.8968505859375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.44169655442237854,
"rewards/margins": 2.194701671600342,
"rewards/rejected": -2.6363978385925293,
"step": 1255
},
{
"epoch": 0.48331415420023016,
"grad_norm": 0.047119140625,
"learning_rate": 4.778251628157537e-05,
"logits/chosen": -6.796322822570801,
"logits/rejected": -6.828359127044678,
"logps/chosen": -63.79240036010742,
"logps/rejected": -84.49711608886719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6860198378562927,
"rewards/margins": 2.1973636150360107,
"rewards/rejected": -2.883383274078369,
"step": 1260
},
{
"epoch": 0.48523206751054854,
"grad_norm": 0.035400390625,
"learning_rate": 4.777120057761836e-05,
"logits/chosen": -6.802372932434082,
"logits/rejected": -6.869441032409668,
"logps/chosen": -60.574119567871094,
"logps/rejected": -81.42657470703125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4548783302307129,
"rewards/margins": 2.194202423095703,
"rewards/rejected": -2.649080753326416,
"step": 1265
},
{
"epoch": 0.4871499808208669,
"grad_norm": 0.048095703125,
"learning_rate": 4.7759768464111865e-05,
"logits/chosen": -6.760685920715332,
"logits/rejected": -6.879847049713135,
"logps/chosen": -65.76459503173828,
"logps/rejected": -85.82694244384766,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5539541244506836,
"rewards/margins": 2.1962039470672607,
"rewards/rejected": -2.7501578330993652,
"step": 1270
},
{
"epoch": 0.4890678941311853,
"grad_norm": 0.049560546875,
"learning_rate": 4.774821999800244e-05,
"logits/chosen": -6.74704647064209,
"logits/rejected": -6.814779758453369,
"logps/chosen": -61.025238037109375,
"logps/rejected": -82.47376251220703,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5795901417732239,
"rewards/margins": 2.198065996170044,
"rewards/rejected": -2.777656078338623,
"step": 1275
},
{
"epoch": 0.49098580744150366,
"grad_norm": 0.044189453125,
"learning_rate": 4.773655523681627e-05,
"logits/chosen": -6.8133039474487305,
"logits/rejected": -6.857518672943115,
"logps/chosen": -59.81324005126953,
"logps/rejected": -81.82405853271484,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5019493103027344,
"rewards/margins": 2.1963508129119873,
"rewards/rejected": -2.6983001232147217,
"step": 1280
},
{
"epoch": 0.49290372075182204,
"grad_norm": 0.0576171875,
"learning_rate": 4.7724774238658787e-05,
"logits/chosen": -6.734938144683838,
"logits/rejected": -6.824878692626953,
"logps/chosen": -64.06251525878906,
"logps/rejected": -85.48880767822266,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5863821506500244,
"rewards/margins": 2.197831630706787,
"rewards/rejected": -2.7842135429382324,
"step": 1285
},
{
"epoch": 0.4948216340621404,
"grad_norm": 0.048583984375,
"learning_rate": 4.7712877062214474e-05,
"logits/chosen": -6.768890380859375,
"logits/rejected": -6.9102044105529785,
"logps/chosen": -60.18365478515625,
"logps/rejected": -81.67802429199219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4272948205471039,
"rewards/margins": 2.1933791637420654,
"rewards/rejected": -2.620673656463623,
"step": 1290
},
{
"epoch": 0.4967395473724588,
"grad_norm": 0.044677734375,
"learning_rate": 4.7700863766746484e-05,
"logits/chosen": -6.8031463623046875,
"logits/rejected": -6.905440330505371,
"logps/chosen": -60.71598434448242,
"logps/rejected": -81.36261749267578,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4217904210090637,
"rewards/margins": 2.193682909011841,
"rewards/rejected": -2.6154732704162598,
"step": 1295
},
{
"epoch": 0.49865746068277716,
"grad_norm": 0.05029296875,
"learning_rate": 4.768873441209644e-05,
"logits/chosen": -6.832518577575684,
"logits/rejected": -6.83789587020874,
"logps/chosen": -62.7836799621582,
"logps/rejected": -82.60746765136719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5582566857337952,
"rewards/margins": 2.1951565742492676,
"rewards/rejected": -2.753412961959839,
"step": 1300
},
{
"epoch": 0.5005753739930955,
"grad_norm": 0.044189453125,
"learning_rate": 4.7676489058684055e-05,
"logits/chosen": -6.725746154785156,
"logits/rejected": -6.810865879058838,
"logps/chosen": -62.290992736816406,
"logps/rejected": -83.63255310058594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4835914671421051,
"rewards/margins": 2.1948485374450684,
"rewards/rejected": -2.6784403324127197,
"step": 1305
},
{
"epoch": 0.5024932873034139,
"grad_norm": 0.05029296875,
"learning_rate": 4.7664127767506884e-05,
"logits/chosen": -6.764736175537109,
"logits/rejected": -6.878215789794922,
"logps/chosen": -59.05699920654297,
"logps/rejected": -80.65967559814453,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5298063158988953,
"rewards/margins": 2.198462724685669,
"rewards/rejected": -2.72826886177063,
"step": 1310
},
{
"epoch": 0.5044112006137322,
"grad_norm": 0.061767578125,
"learning_rate": 4.765165060014e-05,
"logits/chosen": -6.81158447265625,
"logits/rejected": -6.888514041900635,
"logps/chosen": -56.54206466674805,
"logps/rejected": -76.92692565917969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.40915647149086,
"rewards/margins": 2.200356960296631,
"rewards/rejected": -2.609513521194458,
"step": 1315
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.03369140625,
"learning_rate": 4.763905761873566e-05,
"logits/chosen": -6.763751983642578,
"logits/rejected": -6.8651227951049805,
"logps/chosen": -57.99541473388672,
"logps/rejected": -78.6756591796875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4112626910209656,
"rewards/margins": 2.1965606212615967,
"rewards/rejected": -2.607823133468628,
"step": 1320
},
{
"epoch": 0.508247027234369,
"grad_norm": 0.048828125,
"learning_rate": 4.762634888602306e-05,
"logits/chosen": -6.782062530517578,
"logits/rejected": -6.847997188568115,
"logps/chosen": -64.24237823486328,
"logps/rejected": -85.33832550048828,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6039192080497742,
"rewards/margins": 2.1964235305786133,
"rewards/rejected": -2.8003430366516113,
"step": 1325
},
{
"epoch": 0.5101649405446874,
"grad_norm": 0.044921875,
"learning_rate": 4.761352446530797e-05,
"logits/chosen": -6.749083518981934,
"logits/rejected": -6.8179802894592285,
"logps/chosen": -59.93855667114258,
"logps/rejected": -81.1589126586914,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5835517048835754,
"rewards/margins": 2.194997549057007,
"rewards/rejected": -2.7785494327545166,
"step": 1330
},
{
"epoch": 0.5120828538550057,
"grad_norm": 0.0380859375,
"learning_rate": 4.7600584420472416e-05,
"logits/chosen": -6.820675849914551,
"logits/rejected": -6.821556091308594,
"logps/chosen": -62.568939208984375,
"logps/rejected": -83.0500717163086,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6582069993019104,
"rewards/margins": 2.1977479457855225,
"rewards/rejected": -2.855954647064209,
"step": 1335
},
{
"epoch": 0.5140007671653242,
"grad_norm": 0.0400390625,
"learning_rate": 4.758752881597442e-05,
"logits/chosen": -6.764553070068359,
"logits/rejected": -6.8421125411987305,
"logps/chosen": -61.8564567565918,
"logps/rejected": -82.87976837158203,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5464135408401489,
"rewards/margins": 2.1998982429504395,
"rewards/rejected": -2.746311902999878,
"step": 1340
},
{
"epoch": 0.5159186804756425,
"grad_norm": 0.040771484375,
"learning_rate": 4.757435771684761e-05,
"logits/chosen": -6.775679111480713,
"logits/rejected": -6.910268306732178,
"logps/chosen": -62.16166305541992,
"logps/rejected": -85.0639419555664,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5519260764122009,
"rewards/margins": 2.1962175369262695,
"rewards/rejected": -2.7481436729431152,
"step": 1345
},
{
"epoch": 0.5178365937859609,
"grad_norm": 0.05078125,
"learning_rate": 4.756107118870093e-05,
"logits/chosen": -6.817084312438965,
"logits/rejected": -6.920017242431641,
"logps/chosen": -63.124549865722656,
"logps/rejected": -84.37528228759766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5143846273422241,
"rewards/margins": 2.195559024810791,
"rewards/rejected": -2.7099435329437256,
"step": 1350
},
{
"epoch": 0.5197545070962792,
"grad_norm": 0.04541015625,
"learning_rate": 4.754766929771832e-05,
"logits/chosen": -6.7549333572387695,
"logits/rejected": -6.808642387390137,
"logps/chosen": -65.70404052734375,
"logps/rejected": -86.11283874511719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7226360440254211,
"rewards/margins": 2.1955502033233643,
"rewards/rejected": -2.9181859493255615,
"step": 1355
},
{
"epoch": 0.5216724204065977,
"grad_norm": 0.0299072265625,
"learning_rate": 4.7534152110658354e-05,
"logits/chosen": -6.86520528793335,
"logits/rejected": -6.91876220703125,
"logps/chosen": -63.727516174316406,
"logps/rejected": -84.4239273071289,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.576887845993042,
"rewards/margins": 2.19865345954895,
"rewards/rejected": -2.775541067123413,
"step": 1360
},
{
"epoch": 0.523590333716916,
"grad_norm": 0.035888671875,
"learning_rate": 4.752051969485394e-05,
"logits/chosen": -6.792748928070068,
"logits/rejected": -6.829402923583984,
"logps/chosen": -58.34467697143555,
"logps/rejected": -80.06613159179688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4105150103569031,
"rewards/margins": 2.199594259262085,
"rewards/rejected": -2.610109329223633,
"step": 1365
},
{
"epoch": 0.5255082470272344,
"grad_norm": 0.040283203125,
"learning_rate": 4.7506772118211987e-05,
"logits/chosen": -6.792714595794678,
"logits/rejected": -6.8383941650390625,
"logps/chosen": -69.7136459350586,
"logps/rejected": -90.48081970214844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8187990188598633,
"rewards/margins": 2.194049596786499,
"rewards/rejected": -3.0128488540649414,
"step": 1370
},
{
"epoch": 0.5274261603375527,
"grad_norm": 0.036376953125,
"learning_rate": 4.749290944921303e-05,
"logits/chosen": -6.784049987792969,
"logits/rejected": -6.838326454162598,
"logps/chosen": -60.717193603515625,
"logps/rejected": -82.25090026855469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5634230375289917,
"rewards/margins": 2.1975927352905273,
"rewards/rejected": -2.7610154151916504,
"step": 1375
},
{
"epoch": 0.5293440736478712,
"grad_norm": 0.049560546875,
"learning_rate": 4.747893175691092e-05,
"logits/chosen": -6.78844690322876,
"logits/rejected": -6.837592124938965,
"logps/chosen": -62.469459533691406,
"logps/rejected": -83.67124938964844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6941007971763611,
"rewards/margins": 2.196812152862549,
"rewards/rejected": -2.890913248062134,
"step": 1380
},
{
"epoch": 0.5312619869581895,
"grad_norm": 0.0546875,
"learning_rate": 4.7464839110932476e-05,
"logits/chosen": -6.742993354797363,
"logits/rejected": -6.785296440124512,
"logps/chosen": -69.74168395996094,
"logps/rejected": -91.88017272949219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.825350284576416,
"rewards/margins": 2.1931517124176025,
"rewards/rejected": -3.0185019969940186,
"step": 1385
},
{
"epoch": 0.5331799002685078,
"grad_norm": 0.0390625,
"learning_rate": 4.745063158147712e-05,
"logits/chosen": -6.85055685043335,
"logits/rejected": -6.899823188781738,
"logps/chosen": -61.71512985229492,
"logps/rejected": -83.54269409179688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.472137987613678,
"rewards/margins": 2.195171356201172,
"rewards/rejected": -2.667309522628784,
"step": 1390
},
{
"epoch": 0.5350978135788262,
"grad_norm": 0.0400390625,
"learning_rate": 4.743630923931655e-05,
"logits/chosen": -6.822695732116699,
"logits/rejected": -6.846883296966553,
"logps/chosen": -62.36680221557617,
"logps/rejected": -83.02012634277344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5894892811775208,
"rewards/margins": 2.195992946624756,
"rewards/rejected": -2.785482168197632,
"step": 1395
},
{
"epoch": 0.5370157268891446,
"grad_norm": 0.048828125,
"learning_rate": 4.742187215579439e-05,
"logits/chosen": -6.81241512298584,
"logits/rejected": -6.8298821449279785,
"logps/chosen": -63.766090393066406,
"logps/rejected": -85.15402221679688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6096689701080322,
"rewards/margins": 2.1970481872558594,
"rewards/rejected": -2.8067169189453125,
"step": 1400
},
{
"epoch": 0.538933640199463,
"grad_norm": 0.039306640625,
"learning_rate": 4.740732040282581e-05,
"logits/chosen": -6.7713141441345215,
"logits/rejected": -6.855074882507324,
"logps/chosen": -63.4873161315918,
"logps/rejected": -85.61607360839844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.577864944934845,
"rewards/margins": 2.197863817214966,
"rewards/rejected": -2.775728464126587,
"step": 1405
},
{
"epoch": 0.5408515535097813,
"grad_norm": 0.036865234375,
"learning_rate": 4.739265405289716e-05,
"logits/chosen": -6.76360559463501,
"logits/rejected": -6.843583583831787,
"logps/chosen": -61.78276443481445,
"logps/rejected": -83.03706359863281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.523431658744812,
"rewards/margins": 2.1963558197021484,
"rewards/rejected": -2.719787120819092,
"step": 1410
},
{
"epoch": 0.5427694668200997,
"grad_norm": 0.046875,
"learning_rate": 4.737787317906568e-05,
"logits/chosen": -6.7519378662109375,
"logits/rejected": -6.820305824279785,
"logps/chosen": -63.89045333862305,
"logps/rejected": -85.07756042480469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5729597806930542,
"rewards/margins": 2.1944832801818848,
"rewards/rejected": -2.7674431800842285,
"step": 1415
},
{
"epoch": 0.5446873801304181,
"grad_norm": 0.0380859375,
"learning_rate": 4.736297785495903e-05,
"logits/chosen": -6.815577507019043,
"logits/rejected": -6.836461544036865,
"logps/chosen": -60.28411865234375,
"logps/rejected": -82.16222381591797,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6184202432632446,
"rewards/margins": 2.1970183849334717,
"rewards/rejected": -2.815438747406006,
"step": 1420
},
{
"epoch": 0.5466052934407365,
"grad_norm": 0.054443359375,
"learning_rate": 4.734796815477503e-05,
"logits/chosen": -6.764504909515381,
"logits/rejected": -6.821629524230957,
"logps/chosen": -61.06037139892578,
"logps/rejected": -82.24360656738281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5765479207038879,
"rewards/margins": 2.1989150047302246,
"rewards/rejected": -2.775463104248047,
"step": 1425
},
{
"epoch": 0.5485232067510548,
"grad_norm": 0.039306640625,
"learning_rate": 4.73328441532812e-05,
"logits/chosen": -6.749651908874512,
"logits/rejected": -6.841190338134766,
"logps/chosen": -59.021446228027344,
"logps/rejected": -80.57456970214844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4866851270198822,
"rewards/margins": 2.1995737552642822,
"rewards/rejected": -2.6862587928771973,
"step": 1430
},
{
"epoch": 0.5504411200613732,
"grad_norm": 0.049072265625,
"learning_rate": 4.731760592581445e-05,
"logits/chosen": -6.712447166442871,
"logits/rejected": -6.835790157318115,
"logps/chosen": -59.537620544433594,
"logps/rejected": -81.30517578125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4595329165458679,
"rewards/margins": 2.198075771331787,
"rewards/rejected": -2.6576085090637207,
"step": 1435
},
{
"epoch": 0.5523590333716916,
"grad_norm": 0.043212890625,
"learning_rate": 4.7302253548280674e-05,
"logits/chosen": -6.720251560211182,
"logits/rejected": -6.798407077789307,
"logps/chosen": -60.29015350341797,
"logps/rejected": -81.96891784667969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.56800776720047,
"rewards/margins": 2.1948490142822266,
"rewards/rejected": -2.762856960296631,
"step": 1440
},
{
"epoch": 0.55427694668201,
"grad_norm": 0.04736328125,
"learning_rate": 4.728678709715438e-05,
"logits/chosen": -6.74158239364624,
"logits/rejected": -6.872788906097412,
"logps/chosen": -61.41450881958008,
"logps/rejected": -83.51676940917969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4987873136997223,
"rewards/margins": 2.1945414543151855,
"rewards/rejected": -2.693329095840454,
"step": 1445
},
{
"epoch": 0.5561948599923283,
"grad_norm": 0.043212890625,
"learning_rate": 4.72712066494783e-05,
"logits/chosen": -6.711419105529785,
"logits/rejected": -6.823484897613525,
"logps/chosen": -62.261627197265625,
"logps/rejected": -83.39556121826172,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5504243969917297,
"rewards/margins": 2.1979422569274902,
"rewards/rejected": -2.748366594314575,
"step": 1450
},
{
"epoch": 0.5581127733026467,
"grad_norm": 0.0390625,
"learning_rate": 4.725551228286304e-05,
"logits/chosen": -6.845943450927734,
"logits/rejected": -6.932714939117432,
"logps/chosen": -65.34230041503906,
"logps/rejected": -86.31060791015625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5645556449890137,
"rewards/margins": 2.200577735900879,
"rewards/rejected": -2.7651333808898926,
"step": 1455
},
{
"epoch": 0.5600306866129651,
"grad_norm": 0.0419921875,
"learning_rate": 4.723970407548663e-05,
"logits/chosen": -6.818108558654785,
"logits/rejected": -6.868282318115234,
"logps/chosen": -62.125328063964844,
"logps/rejected": -82.881591796875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.45852231979370117,
"rewards/margins": 2.1955528259277344,
"rewards/rejected": -2.6540751457214355,
"step": 1460
},
{
"epoch": 0.5619485999232835,
"grad_norm": 0.060546875,
"learning_rate": 4.7223782106094215e-05,
"logits/chosen": -6.684022426605225,
"logits/rejected": -6.757493495941162,
"logps/chosen": -64.17045593261719,
"logps/rejected": -85.75135803222656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5858643651008606,
"rewards/margins": 2.1960318088531494,
"rewards/rejected": -2.781895875930786,
"step": 1465
},
{
"epoch": 0.5638665132336018,
"grad_norm": 0.05419921875,
"learning_rate": 4.720774645399759e-05,
"logits/chosen": -6.799399375915527,
"logits/rejected": -6.8811469078063965,
"logps/chosen": -58.72871780395508,
"logps/rejected": -79.8555908203125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4386534094810486,
"rewards/margins": 2.194446086883545,
"rewards/rejected": -2.6330995559692383,
"step": 1470
},
{
"epoch": 0.5657844265439202,
"grad_norm": 0.0306396484375,
"learning_rate": 4.719159719907484e-05,
"logits/chosen": -6.7151007652282715,
"logits/rejected": -6.806893348693848,
"logps/chosen": -59.9053840637207,
"logps/rejected": -80.97401428222656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5145848393440247,
"rewards/margins": 2.198782444000244,
"rewards/rejected": -2.713367223739624,
"step": 1475
},
{
"epoch": 0.5677023398542386,
"grad_norm": 0.044189453125,
"learning_rate": 4.7175334421769954e-05,
"logits/chosen": -6.7644171714782715,
"logits/rejected": -6.841060638427734,
"logps/chosen": -60.12373733520508,
"logps/rejected": -80.3787612915039,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5275717973709106,
"rewards/margins": 2.1961476802825928,
"rewards/rejected": -2.723719596862793,
"step": 1480
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.038330078125,
"learning_rate": 4.715895820309239e-05,
"logits/chosen": -6.837026119232178,
"logits/rejected": -6.902268886566162,
"logps/chosen": -63.5143928527832,
"logps/rejected": -85.01191711425781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5784210562705994,
"rewards/margins": 2.199751377105713,
"rewards/rejected": -2.778172492980957,
"step": 1485
},
{
"epoch": 0.5715381664748753,
"grad_norm": 0.051513671875,
"learning_rate": 4.71424686246167e-05,
"logits/chosen": -6.749986171722412,
"logits/rejected": -6.84371280670166,
"logps/chosen": -59.216644287109375,
"logps/rejected": -81.05298614501953,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.37248149514198303,
"rewards/margins": 2.1974806785583496,
"rewards/rejected": -2.5699620246887207,
"step": 1490
},
{
"epoch": 0.5734560797851938,
"grad_norm": 0.045654296875,
"learning_rate": 4.7125865768482113e-05,
"logits/chosen": -6.700467586517334,
"logits/rejected": -6.8221635818481445,
"logps/chosen": -65.43324279785156,
"logps/rejected": -86.24195861816406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6096637845039368,
"rewards/margins": 2.195284366607666,
"rewards/rejected": -2.804947853088379,
"step": 1495
},
{
"epoch": 0.5753739930955121,
"grad_norm": 0.04736328125,
"learning_rate": 4.710914971739211e-05,
"logits/chosen": -6.764548301696777,
"logits/rejected": -6.841375827789307,
"logps/chosen": -64.81352233886719,
"logps/rejected": -86.60743713378906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6613066792488098,
"rewards/margins": 2.1976189613342285,
"rewards/rejected": -2.8589255809783936,
"step": 1500
},
{
"epoch": 0.5753739930955121,
"eval_logits/chosen": -6.663508415222168,
"eval_logits/rejected": -6.8637237548828125,
"eval_logps/chosen": -61.2503776550293,
"eval_logps/rejected": -84.0605239868164,
"eval_loss": 0.32512322068214417,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -0.5797533988952637,
"eval_rewards/margins": 2.197005271911621,
"eval_rewards/rejected": -2.7767584323883057,
"eval_runtime": 5.3083,
"eval_samples_per_second": 37.677,
"eval_steps_per_second": 37.677,
"step": 1500
},
{
"epoch": 0.5772919064058305,
"grad_norm": 0.038818359375,
"learning_rate": 4.709232055461405e-05,
"logits/chosen": -6.675856113433838,
"logits/rejected": -6.763123512268066,
"logps/chosen": -61.63834762573242,
"logps/rejected": -83.7894515991211,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5152076482772827,
"rewards/margins": 2.196242570877075,
"rewards/rejected": -2.7114500999450684,
"step": 1505
},
{
"epoch": 0.5792098197161488,
"grad_norm": 0.0498046875,
"learning_rate": 4.707537836397872e-05,
"logits/chosen": -6.814902305603027,
"logits/rejected": -6.906038761138916,
"logps/chosen": -64.63713836669922,
"logps/rejected": -86.2259521484375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5156192779541016,
"rewards/margins": 2.196268320083618,
"rewards/rejected": -2.7118875980377197,
"step": 1510
},
{
"epoch": 0.5811277330264673,
"grad_norm": 0.0556640625,
"learning_rate": 4.705832322987994e-05,
"logits/chosen": -6.769433498382568,
"logits/rejected": -6.875026702880859,
"logps/chosen": -61.46131134033203,
"logps/rejected": -82.59616088867188,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5708250999450684,
"rewards/margins": 2.193892002105713,
"rewards/rejected": -2.7647171020507812,
"step": 1515
},
{
"epoch": 0.5830456463367856,
"grad_norm": 0.055419921875,
"learning_rate": 4.7041155237274105e-05,
"logits/chosen": -6.731886863708496,
"logits/rejected": -6.820645809173584,
"logps/chosen": -64.55342102050781,
"logps/rejected": -85.70281219482422,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6872868537902832,
"rewards/margins": 2.1945242881774902,
"rewards/rejected": -2.8818111419677734,
"step": 1520
},
{
"epoch": 0.584963559647104,
"grad_norm": 0.052490234375,
"learning_rate": 4.702387447167984e-05,
"logits/chosen": -6.7164740562438965,
"logits/rejected": -6.778362274169922,
"logps/chosen": -62.84291458129883,
"logps/rejected": -84.47712707519531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5710551142692566,
"rewards/margins": 2.1979777812957764,
"rewards/rejected": -2.7690329551696777,
"step": 1525
},
{
"epoch": 0.5868814729574223,
"grad_norm": 0.034912109375,
"learning_rate": 4.700648101917749e-05,
"logits/chosen": -6.749175071716309,
"logits/rejected": -6.815337181091309,
"logps/chosen": -63.7158088684082,
"logps/rejected": -84.00125885009766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6128538846969604,
"rewards/margins": 2.1982455253601074,
"rewards/rejected": -2.8110997676849365,
"step": 1530
},
{
"epoch": 0.5887993862677406,
"grad_norm": 0.033935546875,
"learning_rate": 4.698897496640871e-05,
"logits/chosen": -6.734133720397949,
"logits/rejected": -6.797808647155762,
"logps/chosen": -61.02180862426758,
"logps/rejected": -81.09629821777344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5548717379570007,
"rewards/margins": 2.196436643600464,
"rewards/rejected": -2.7513084411621094,
"step": 1535
},
{
"epoch": 0.5907172995780591,
"grad_norm": 0.05078125,
"learning_rate": 4.697135640057609e-05,
"logits/chosen": -6.733152866363525,
"logits/rejected": -6.8301849365234375,
"logps/chosen": -61.2963752746582,
"logps/rejected": -81.04912567138672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.41952013969421387,
"rewards/margins": 2.194058895111084,
"rewards/rejected": -2.613579034805298,
"step": 1540
},
{
"epoch": 0.5926352128883774,
"grad_norm": 0.037353515625,
"learning_rate": 4.695362540944266e-05,
"logits/chosen": -6.73095178604126,
"logits/rejected": -6.822793006896973,
"logps/chosen": -63.28429412841797,
"logps/rejected": -83.17731475830078,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5410445928573608,
"rewards/margins": 2.1978955268859863,
"rewards/rejected": -2.7389400005340576,
"step": 1545
},
{
"epoch": 0.5945531261986958,
"grad_norm": 0.03564453125,
"learning_rate": 4.693578208133145e-05,
"logits/chosen": -6.678317070007324,
"logits/rejected": -6.7426886558532715,
"logps/chosen": -64.89047241210938,
"logps/rejected": -87.15535736083984,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7630675435066223,
"rewards/margins": 2.196265697479248,
"rewards/rejected": -2.9593334197998047,
"step": 1550
},
{
"epoch": 0.5964710395090141,
"grad_norm": 0.03466796875,
"learning_rate": 4.691782650512511e-05,
"logits/chosen": -6.7095465660095215,
"logits/rejected": -6.760190010070801,
"logps/chosen": -67.8410415649414,
"logps/rejected": -89.27967834472656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8494323492050171,
"rewards/margins": 2.1951375007629395,
"rewards/rejected": -3.0445704460144043,
"step": 1555
},
{
"epoch": 0.5983889528193326,
"grad_norm": 0.036376953125,
"learning_rate": 4.6899758770265416e-05,
"logits/chosen": -6.807528495788574,
"logits/rejected": -6.854405403137207,
"logps/chosen": -64.88179016113281,
"logps/rejected": -85.54252624511719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5705582499504089,
"rewards/margins": 2.196807384490967,
"rewards/rejected": -2.7673659324645996,
"step": 1560
},
{
"epoch": 0.6003068661296509,
"grad_norm": 0.05078125,
"learning_rate": 4.688157896675282e-05,
"logits/chosen": -6.67507791519165,
"logits/rejected": -6.780775547027588,
"logps/chosen": -63.21833419799805,
"logps/rejected": -83.4687728881836,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5824323296546936,
"rewards/margins": 2.2005057334899902,
"rewards/rejected": -2.782937526702881,
"step": 1565
},
{
"epoch": 0.6022247794399693,
"grad_norm": 0.03955078125,
"learning_rate": 4.686328718514604e-05,
"logits/chosen": -6.837525844573975,
"logits/rejected": -6.915156364440918,
"logps/chosen": -62.40456008911133,
"logps/rejected": -84.44121551513672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5634955167770386,
"rewards/margins": 2.195712089538574,
"rewards/rejected": -2.7592074871063232,
"step": 1570
},
{
"epoch": 0.6041426927502876,
"grad_norm": 0.0390625,
"learning_rate": 4.684488351656158e-05,
"logits/chosen": -6.743095397949219,
"logits/rejected": -6.8821210861206055,
"logps/chosen": -58.7745361328125,
"logps/rejected": -79.80195617675781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.47515901923179626,
"rewards/margins": 2.195002317428589,
"rewards/rejected": -2.670161485671997,
"step": 1575
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.04443359375,
"learning_rate": 4.6826368052673295e-05,
"logits/chosen": -6.782208442687988,
"logits/rejected": -6.839322090148926,
"logps/chosen": -58.1090202331543,
"logps/rejected": -79.67631530761719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5011711716651917,
"rewards/margins": 2.196396589279175,
"rewards/rejected": -2.6975674629211426,
"step": 1580
},
{
"epoch": 0.6079785193709244,
"grad_norm": 0.0634765625,
"learning_rate": 4.68077408857119e-05,
"logits/chosen": -6.660303592681885,
"logits/rejected": -6.718382835388184,
"logps/chosen": -64.7911605834961,
"logps/rejected": -86.10942077636719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6913571357727051,
"rewards/margins": 2.1963887214660645,
"rewards/rejected": -2.8877463340759277,
"step": 1585
},
{
"epoch": 0.6098964326812428,
"grad_norm": 0.039306640625,
"learning_rate": 4.678900210846456e-05,
"logits/chosen": -6.826661586761475,
"logits/rejected": -6.863768100738525,
"logps/chosen": -64.81199645996094,
"logps/rejected": -85.13519287109375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5662140846252441,
"rewards/margins": 2.198249340057373,
"rewards/rejected": -2.764463424682617,
"step": 1590
},
{
"epoch": 0.6118143459915611,
"grad_norm": 0.046630859375,
"learning_rate": 4.677015181427439e-05,
"logits/chosen": -6.7675042152404785,
"logits/rejected": -6.877073764801025,
"logps/chosen": -61.42873001098633,
"logps/rejected": -83.35433197021484,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6646031141281128,
"rewards/margins": 2.1970815658569336,
"rewards/rejected": -2.861684799194336,
"step": 1595
},
{
"epoch": 0.6137322593018796,
"grad_norm": 0.039794921875,
"learning_rate": 4.675119009704e-05,
"logits/chosen": -6.88427734375,
"logits/rejected": -6.932794094085693,
"logps/chosen": -61.87055206298828,
"logps/rejected": -82.3925552368164,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.46913057565689087,
"rewards/margins": 2.1977620124816895,
"rewards/rejected": -2.6668922901153564,
"step": 1600
},
{
"epoch": 0.6156501726121979,
"grad_norm": 0.045166015625,
"learning_rate": 4.673211705121503e-05,
"logits/chosen": -6.758817195892334,
"logits/rejected": -6.858736991882324,
"logps/chosen": -61.0307502746582,
"logps/rejected": -82.81901550292969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.506413459777832,
"rewards/margins": 2.196242332458496,
"rewards/rejected": -2.7026560306549072,
"step": 1605
},
{
"epoch": 0.6175680859225163,
"grad_norm": 0.0380859375,
"learning_rate": 4.67129327718077e-05,
"logits/chosen": -6.817410945892334,
"logits/rejected": -6.829085350036621,
"logps/chosen": -62.83549118041992,
"logps/rejected": -83.76074981689453,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6362646818161011,
"rewards/margins": 2.196465015411377,
"rewards/rejected": -2.8327295780181885,
"step": 1610
},
{
"epoch": 0.6194859992328346,
"grad_norm": 0.053466796875,
"learning_rate": 4.669363735438028e-05,
"logits/chosen": -6.759008884429932,
"logits/rejected": -6.868435859680176,
"logps/chosen": -61.52397918701172,
"logps/rejected": -84.11946868896484,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5710245966911316,
"rewards/margins": 2.1951489448547363,
"rewards/rejected": -2.7661736011505127,
"step": 1615
},
{
"epoch": 0.6214039125431531,
"grad_norm": 0.038818359375,
"learning_rate": 4.667423089504868e-05,
"logits/chosen": -6.804877281188965,
"logits/rejected": -6.851701259613037,
"logps/chosen": -61.42131423950195,
"logps/rejected": -82.5312728881836,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5742114186286926,
"rewards/margins": 2.1968421936035156,
"rewards/rejected": -2.7710535526275635,
"step": 1620
},
{
"epoch": 0.6233218258534714,
"grad_norm": 0.03173828125,
"learning_rate": 4.665471349048191e-05,
"logits/chosen": -6.743138790130615,
"logits/rejected": -6.808810234069824,
"logps/chosen": -57.78135299682617,
"logps/rejected": -78.70478057861328,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4692755341529846,
"rewards/margins": 2.198038101196289,
"rewards/rejected": -2.667313814163208,
"step": 1625
},
{
"epoch": 0.6252397391637898,
"grad_norm": 0.052001953125,
"learning_rate": 4.663508523790167e-05,
"logits/chosen": -6.797055244445801,
"logits/rejected": -6.851097106933594,
"logps/chosen": -63.55707550048828,
"logps/rejected": -84.19354248046875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5817860960960388,
"rewards/margins": 2.198854923248291,
"rewards/rejected": -2.7806410789489746,
"step": 1630
},
{
"epoch": 0.6271576524741082,
"grad_norm": 0.05029296875,
"learning_rate": 4.661534623508179e-05,
"logits/chosen": -6.698374271392822,
"logits/rejected": -6.778656005859375,
"logps/chosen": -64.4042739868164,
"logps/rejected": -85.94600677490234,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5719730257987976,
"rewards/margins": 2.195774555206299,
"rewards/rejected": -2.767747402191162,
"step": 1635
},
{
"epoch": 0.6290755657844266,
"grad_norm": 0.0458984375,
"learning_rate": 4.65954965803478e-05,
"logits/chosen": -6.783711910247803,
"logits/rejected": -6.867341041564941,
"logps/chosen": -64.90428161621094,
"logps/rejected": -85.47926330566406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6290830969810486,
"rewards/margins": 2.197962522506714,
"rewards/rejected": -2.827045440673828,
"step": 1640
},
{
"epoch": 0.6309934790947449,
"grad_norm": 0.049560546875,
"learning_rate": 4.657553637257641e-05,
"logits/chosen": -6.754020690917969,
"logits/rejected": -6.789788722991943,
"logps/chosen": -62.860679626464844,
"logps/rejected": -84.67469787597656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.58977210521698,
"rewards/margins": 2.19518780708313,
"rewards/rejected": -2.784959554672241,
"step": 1645
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.0712890625,
"learning_rate": 4.655546571119501e-05,
"logits/chosen": -6.775702476501465,
"logits/rejected": -6.795037746429443,
"logps/chosen": -63.498443603515625,
"logps/rejected": -84.87963104248047,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6724980473518372,
"rewards/margins": 2.198357105255127,
"rewards/rejected": -2.8708553314208984,
"step": 1650
},
{
"epoch": 0.6348293057153817,
"grad_norm": 0.064453125,
"learning_rate": 4.653528469618122e-05,
"logits/chosen": -6.807392120361328,
"logits/rejected": -6.87429141998291,
"logps/chosen": -59.84100341796875,
"logps/rejected": -80.7729721069336,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5027590990066528,
"rewards/margins": 2.1962478160858154,
"rewards/rejected": -2.699007034301758,
"step": 1655
},
{
"epoch": 0.6367472190257001,
"grad_norm": 0.034423828125,
"learning_rate": 4.651499342806235e-05,
"logits/chosen": -6.737414360046387,
"logits/rejected": -6.8177337646484375,
"logps/chosen": -58.65924072265625,
"logps/rejected": -79.90467071533203,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4649263918399811,
"rewards/margins": 2.196115493774414,
"rewards/rejected": -2.661041736602783,
"step": 1660
},
{
"epoch": 0.6386651323360184,
"grad_norm": 0.042236328125,
"learning_rate": 4.64945920079149e-05,
"logits/chosen": -6.836834907531738,
"logits/rejected": -6.920575141906738,
"logps/chosen": -60.32160568237305,
"logps/rejected": -82.49227905273438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5061328411102295,
"rewards/margins": 2.1959569454193115,
"rewards/rejected": -2.70209002494812,
"step": 1665
},
{
"epoch": 0.6405830456463368,
"grad_norm": 0.046630859375,
"learning_rate": 4.6474080537364086e-05,
"logits/chosen": -6.743792533874512,
"logits/rejected": -6.792672157287598,
"logps/chosen": -65.84016418457031,
"logps/rejected": -86.33671569824219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7148057818412781,
"rewards/margins": 2.1939172744750977,
"rewards/rejected": -2.9087231159210205,
"step": 1670
},
{
"epoch": 0.6425009589566552,
"grad_norm": 0.050537109375,
"learning_rate": 4.64534591185833e-05,
"logits/chosen": -6.721620082855225,
"logits/rejected": -6.873713493347168,
"logps/chosen": -59.97218704223633,
"logps/rejected": -82.32026672363281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5144708752632141,
"rewards/margins": 2.1981799602508545,
"rewards/rejected": -2.712650775909424,
"step": 1675
},
{
"epoch": 0.6444188722669736,
"grad_norm": 0.05029296875,
"learning_rate": 4.643272785429364e-05,
"logits/chosen": -6.694830894470215,
"logits/rejected": -6.757704734802246,
"logps/chosen": -62.828887939453125,
"logps/rejected": -85.16447448730469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6964989900588989,
"rewards/margins": 2.196366786956787,
"rewards/rejected": -2.8928656578063965,
"step": 1680
},
{
"epoch": 0.6463367855772919,
"grad_norm": 0.06689453125,
"learning_rate": 4.6411886847763344e-05,
"logits/chosen": -6.788907051086426,
"logits/rejected": -6.820149898529053,
"logps/chosen": -64.8192367553711,
"logps/rejected": -86.21455383300781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7176898717880249,
"rewards/margins": 2.1988344192504883,
"rewards/rejected": -2.9165244102478027,
"step": 1685
},
{
"epoch": 0.6482546988876102,
"grad_norm": 0.038330078125,
"learning_rate": 4.6390936202807337e-05,
"logits/chosen": -6.768073081970215,
"logits/rejected": -6.80368709564209,
"logps/chosen": -69.4419174194336,
"logps/rejected": -89.60987854003906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7591196894645691,
"rewards/margins": 2.195878744125366,
"rewards/rejected": -2.95499849319458,
"step": 1690
},
{
"epoch": 0.6501726121979287,
"grad_norm": 0.046630859375,
"learning_rate": 4.636987602378666e-05,
"logits/chosen": -6.750307559967041,
"logits/rejected": -6.827138423919678,
"logps/chosen": -64.7966537475586,
"logps/rejected": -86.62849426269531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6886340379714966,
"rewards/margins": 2.195281505584717,
"rewards/rejected": -2.883915662765503,
"step": 1695
},
{
"epoch": 0.652090525508247,
"grad_norm": 0.039306640625,
"learning_rate": 4.6348706415607987e-05,
"logits/chosen": -6.8090667724609375,
"logits/rejected": -6.860124111175537,
"logps/chosen": -59.753684997558594,
"logps/rejected": -81.40279388427734,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.585098147392273,
"rewards/margins": 2.1974575519561768,
"rewards/rejected": -2.78255558013916,
"step": 1700
},
{
"epoch": 0.6540084388185654,
"grad_norm": 0.051513671875,
"learning_rate": 4.6327427483723095e-05,
"logits/chosen": -6.84018611907959,
"logits/rejected": -6.8761467933654785,
"logps/chosen": -64.61951446533203,
"logps/rejected": -85.91930389404297,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5915480852127075,
"rewards/margins": 2.1989455223083496,
"rewards/rejected": -2.7904937267303467,
"step": 1705
},
{
"epoch": 0.6559263521288837,
"grad_norm": 0.05712890625,
"learning_rate": 4.6306039334128314e-05,
"logits/chosen": -6.728167533874512,
"logits/rejected": -6.835136413574219,
"logps/chosen": -59.91040802001953,
"logps/rejected": -82.41685485839844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5621598362922668,
"rewards/margins": 2.199091672897339,
"rewards/rejected": -2.761251926422119,
"step": 1710
},
{
"epoch": 0.6578442654392022,
"grad_norm": 0.0546875,
"learning_rate": 4.628454207336403e-05,
"logits/chosen": -6.8048810958862305,
"logits/rejected": -6.874848365783691,
"logps/chosen": -63.48942947387695,
"logps/rejected": -84.7466812133789,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5312366485595703,
"rewards/margins": 2.1963775157928467,
"rewards/rejected": -2.727614402770996,
"step": 1715
},
{
"epoch": 0.6597621787495205,
"grad_norm": 0.040771484375,
"learning_rate": 4.6262935808514154e-05,
"logits/chosen": -6.784109592437744,
"logits/rejected": -6.887749671936035,
"logps/chosen": -57.665740966796875,
"logps/rejected": -79.55059814453125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.33379465341567993,
"rewards/margins": 2.198172092437744,
"rewards/rejected": -2.5319666862487793,
"step": 1720
},
{
"epoch": 0.6616800920598389,
"grad_norm": 0.044677734375,
"learning_rate": 4.624122064720555e-05,
"logits/chosen": -6.82706356048584,
"logits/rejected": -6.852479457855225,
"logps/chosen": -61.39141845703125,
"logps/rejected": -82.49794006347656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5211236476898193,
"rewards/margins": 2.1956381797790527,
"rewards/rejected": -2.716762065887451,
"step": 1725
},
{
"epoch": 0.6635980053701572,
"grad_norm": 0.050048828125,
"learning_rate": 4.621939669760755e-05,
"logits/chosen": -6.765600681304932,
"logits/rejected": -6.7940497398376465,
"logps/chosen": -60.3200798034668,
"logps/rejected": -81.86724090576172,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6085076332092285,
"rewards/margins": 2.1932969093322754,
"rewards/rejected": -2.801804542541504,
"step": 1730
},
{
"epoch": 0.6655159186804757,
"grad_norm": 0.054443359375,
"learning_rate": 4.6197464068431366e-05,
"logits/chosen": -6.755357265472412,
"logits/rejected": -6.797060966491699,
"logps/chosen": -58.92303466796875,
"logps/rejected": -81.09183502197266,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.49806681275367737,
"rewards/margins": 2.1958022117614746,
"rewards/rejected": -2.693869113922119,
"step": 1735
},
{
"epoch": 0.667433831990794,
"grad_norm": 0.039794921875,
"learning_rate": 4.6175422868929615e-05,
"logits/chosen": -6.759666442871094,
"logits/rejected": -6.852987766265869,
"logps/chosen": -61.73881149291992,
"logps/rejected": -82.39242553710938,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4188078045845032,
"rewards/margins": 2.197063446044922,
"rewards/rejected": -2.615870952606201,
"step": 1740
},
{
"epoch": 0.6693517453011124,
"grad_norm": 0.0341796875,
"learning_rate": 4.615327320889568e-05,
"logits/chosen": -6.756011962890625,
"logits/rejected": -6.833889961242676,
"logps/chosen": -65.3582534790039,
"logps/rejected": -86.70683288574219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6901088356971741,
"rewards/margins": 2.198507070541382,
"rewards/rejected": -2.888615846633911,
"step": 1745
},
{
"epoch": 0.6712696586114307,
"grad_norm": 0.04443359375,
"learning_rate": 4.613101519866326e-05,
"logits/chosen": -6.754976749420166,
"logits/rejected": -6.827247619628906,
"logps/chosen": -66.11727905273438,
"logps/rejected": -87.72874450683594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7430967092514038,
"rewards/margins": 2.1992220878601074,
"rewards/rejected": -2.9423186779022217,
"step": 1750
},
{
"epoch": 0.6731875719217492,
"grad_norm": 0.0341796875,
"learning_rate": 4.6108648949105756e-05,
"logits/chosen": -6.783148765563965,
"logits/rejected": -6.81369686126709,
"logps/chosen": -65.33592224121094,
"logps/rejected": -84.68086242675781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6635004281997681,
"rewards/margins": 2.1974689960479736,
"rewards/rejected": -2.860969305038452,
"step": 1755
},
{
"epoch": 0.6751054852320675,
"grad_norm": 0.04736328125,
"learning_rate": 4.608617457163573e-05,
"logits/chosen": -6.7608642578125,
"logits/rejected": -6.836709499359131,
"logps/chosen": -59.4000244140625,
"logps/rejected": -81.574951171875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4980418086051941,
"rewards/margins": 2.1935529708862305,
"rewards/rejected": -2.6915950775146484,
"step": 1760
},
{
"epoch": 0.6770233985423859,
"grad_norm": 0.050537109375,
"learning_rate": 4.606359217820441e-05,
"logits/chosen": -6.732338905334473,
"logits/rejected": -6.816025733947754,
"logps/chosen": -65.36774444580078,
"logps/rejected": -85.60862731933594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5363543629646301,
"rewards/margins": 2.19832706451416,
"rewards/rejected": -2.7346813678741455,
"step": 1765
},
{
"epoch": 0.6789413118527042,
"grad_norm": 0.051025390625,
"learning_rate": 4.6040901881301004e-05,
"logits/chosen": -6.83046817779541,
"logits/rejected": -6.893084526062012,
"logps/chosen": -60.8812141418457,
"logps/rejected": -81.68726348876953,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.41270560026168823,
"rewards/margins": 2.1950905323028564,
"rewards/rejected": -2.6077961921691895,
"step": 1770
},
{
"epoch": 0.6808592251630227,
"grad_norm": 0.0361328125,
"learning_rate": 4.6018103793952287e-05,
"logits/chosen": -6.795252323150635,
"logits/rejected": -6.8598222732543945,
"logps/chosen": -59.79205322265625,
"logps/rejected": -80.0960693359375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.40817388892173767,
"rewards/margins": 2.1988348960876465,
"rewards/rejected": -2.607008695602417,
"step": 1775
},
{
"epoch": 0.682777138473341,
"grad_norm": 0.040771484375,
"learning_rate": 4.599519802972192e-05,
"logits/chosen": -6.790003776550293,
"logits/rejected": -6.81201171875,
"logps/chosen": -62.64923095703125,
"logps/rejected": -83.82284545898438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5379251837730408,
"rewards/margins": 2.197380542755127,
"rewards/rejected": -2.7353057861328125,
"step": 1780
},
{
"epoch": 0.6846950517836594,
"grad_norm": 0.04541015625,
"learning_rate": 4.597218470270997e-05,
"logits/chosen": -6.7385454177856445,
"logits/rejected": -6.783153533935547,
"logps/chosen": -66.6662826538086,
"logps/rejected": -88.35466003417969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7900538444519043,
"rewards/margins": 2.200716018676758,
"rewards/rejected": -2.990769863128662,
"step": 1785
},
{
"epoch": 0.6866129650939777,
"grad_norm": 0.0751953125,
"learning_rate": 4.594906392755229e-05,
"logits/chosen": -6.846865653991699,
"logits/rejected": -6.8920698165893555,
"logps/chosen": -62.57611083984375,
"logps/rejected": -83.23236846923828,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5446496605873108,
"rewards/margins": 2.1978793144226074,
"rewards/rejected": -2.7425286769866943,
"step": 1790
},
{
"epoch": 0.6885308784042962,
"grad_norm": 0.037353515625,
"learning_rate": 4.592583581941994e-05,
"logits/chosen": -6.785799980163574,
"logits/rejected": -6.822165012359619,
"logps/chosen": -62.872100830078125,
"logps/rejected": -83.56089782714844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6173510551452637,
"rewards/margins": 2.195507526397705,
"rewards/rejected": -2.8128585815429688,
"step": 1795
},
{
"epoch": 0.6904487917146145,
"grad_norm": 0.0517578125,
"learning_rate": 4.590250049401866e-05,
"logits/chosen": -6.7071852684021,
"logits/rejected": -6.744346618652344,
"logps/chosen": -59.503631591796875,
"logps/rejected": -82.10621643066406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4692237973213196,
"rewards/margins": 2.197390556335449,
"rewards/rejected": -2.666614294052124,
"step": 1800
},
{
"epoch": 0.6923667050249329,
"grad_norm": 0.06884765625,
"learning_rate": 4.587905806758828e-05,
"logits/chosen": -6.759226322174072,
"logits/rejected": -6.798618316650391,
"logps/chosen": -60.597740173339844,
"logps/rejected": -81.20115661621094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.47514835000038147,
"rewards/margins": 2.1932222843170166,
"rewards/rejected": -2.668370485305786,
"step": 1805
},
{
"epoch": 0.6942846183352512,
"grad_norm": 0.042724609375,
"learning_rate": 4.585550865690211e-05,
"logits/chosen": -6.79116153717041,
"logits/rejected": -6.905958652496338,
"logps/chosen": -65.27635192871094,
"logps/rejected": -87.11880493164062,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6134611368179321,
"rewards/margins": 2.199479818344116,
"rewards/rejected": -2.812941074371338,
"step": 1810
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.06396484375,
"learning_rate": 4.5831852379266374e-05,
"logits/chosen": -6.826533317565918,
"logits/rejected": -6.830357551574707,
"logps/chosen": -61.02429962158203,
"logps/rejected": -82.49093627929688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4633258879184723,
"rewards/margins": 2.1991915702819824,
"rewards/rejected": -2.662517547607422,
"step": 1815
},
{
"epoch": 0.698120444955888,
"grad_norm": 0.044677734375,
"learning_rate": 4.5808089352519645e-05,
"logits/chosen": -6.621363639831543,
"logits/rejected": -6.738465309143066,
"logps/chosen": -62.206825256347656,
"logps/rejected": -82.90311431884766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5602379441261292,
"rewards/margins": 2.1953892707824707,
"rewards/rejected": -2.755626678466797,
"step": 1820
},
{
"epoch": 0.7000383582662064,
"grad_norm": 0.042724609375,
"learning_rate": 4.578421969503224e-05,
"logits/chosen": -6.793525695800781,
"logits/rejected": -6.882171630859375,
"logps/chosen": -62.86467361450195,
"logps/rejected": -84.51292419433594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5820156335830688,
"rewards/margins": 2.1975064277648926,
"rewards/rejected": -2.7795217037200928,
"step": 1825
},
{
"epoch": 0.7019562715765247,
"grad_norm": 0.04541015625,
"learning_rate": 4.576024352570563e-05,
"logits/chosen": -6.7367753982543945,
"logits/rejected": -6.8105878829956055,
"logps/chosen": -64.4591293334961,
"logps/rejected": -84.80027770996094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6178757548332214,
"rewards/margins": 2.1955502033233643,
"rewards/rejected": -2.8134257793426514,
"step": 1830
},
{
"epoch": 0.7038741848868432,
"grad_norm": 0.04736328125,
"learning_rate": 4.573616096397187e-05,
"logits/chosen": -6.7723212242126465,
"logits/rejected": -6.824551582336426,
"logps/chosen": -60.999359130859375,
"logps/rejected": -81.71595764160156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6143304109573364,
"rewards/margins": 2.199537992477417,
"rewards/rejected": -2.8138680458068848,
"step": 1835
},
{
"epoch": 0.7057920981971615,
"grad_norm": 0.038818359375,
"learning_rate": 4.571197212979295e-05,
"logits/chosen": -6.857874870300293,
"logits/rejected": -6.9073357582092285,
"logps/chosen": -65.43289184570312,
"logps/rejected": -86.42829895019531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6608371734619141,
"rewards/margins": 2.197458267211914,
"rewards/rejected": -2.858295440673828,
"step": 1840
},
{
"epoch": 0.7077100115074798,
"grad_norm": 0.0458984375,
"learning_rate": 4.5687677143660254e-05,
"logits/chosen": -6.782164573669434,
"logits/rejected": -6.784448146820068,
"logps/chosen": -63.07709884643555,
"logps/rejected": -83.46958923339844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5670441389083862,
"rewards/margins": 2.1972358226776123,
"rewards/rejected": -2.764279842376709,
"step": 1845
},
{
"epoch": 0.7096279248177982,
"grad_norm": 0.050048828125,
"learning_rate": 4.566327612659394e-05,
"logits/chosen": -6.812560081481934,
"logits/rejected": -6.878233432769775,
"logps/chosen": -61.72986602783203,
"logps/rejected": -83.14173889160156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5427154302597046,
"rewards/margins": 2.1970319747924805,
"rewards/rejected": -2.7397477626800537,
"step": 1850
},
{
"epoch": 0.7115458381281166,
"grad_norm": 0.04541015625,
"learning_rate": 4.563876920014234e-05,
"logits/chosen": -6.7176947593688965,
"logits/rejected": -6.807933807373047,
"logps/chosen": -63.34507369995117,
"logps/rejected": -84.20928955078125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6023356318473816,
"rewards/margins": 2.1956591606140137,
"rewards/rejected": -2.79799485206604,
"step": 1855
},
{
"epoch": 0.713463751438435,
"grad_norm": 0.045166015625,
"learning_rate": 4.561415648638133e-05,
"logits/chosen": -6.859047889709473,
"logits/rejected": -6.9745049476623535,
"logps/chosen": -61.1859016418457,
"logps/rejected": -83.69287109375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5361623167991638,
"rewards/margins": 2.1962532997131348,
"rewards/rejected": -2.732415199279785,
"step": 1860
},
{
"epoch": 0.7153816647487533,
"grad_norm": 0.06884765625,
"learning_rate": 4.5589438107913764e-05,
"logits/chosen": -6.736496925354004,
"logits/rejected": -6.8658857345581055,
"logps/chosen": -61.28631591796875,
"logps/rejected": -83.09465026855469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5119591951370239,
"rewards/margins": 2.195549964904785,
"rewards/rejected": -2.7075092792510986,
"step": 1865
},
{
"epoch": 0.7172995780590717,
"grad_norm": 0.0869140625,
"learning_rate": 4.556461418786884e-05,
"logits/chosen": -6.786721229553223,
"logits/rejected": -6.838905334472656,
"logps/chosen": -60.60860061645508,
"logps/rejected": -82.28132629394531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5649218559265137,
"rewards/margins": 2.1966118812561035,
"rewards/rejected": -2.761533260345459,
"step": 1870
},
{
"epoch": 0.7192174913693901,
"grad_norm": 0.04931640625,
"learning_rate": 4.5539684849901465e-05,
"logits/chosen": -6.7685346603393555,
"logits/rejected": -6.808984279632568,
"logps/chosen": -65.05911254882812,
"logps/rejected": -85.53936767578125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6445679664611816,
"rewards/margins": 2.1987884044647217,
"rewards/rejected": -2.8433563709259033,
"step": 1875
},
{
"epoch": 0.7211354046797085,
"grad_norm": 0.04345703125,
"learning_rate": 4.551465021819168e-05,
"logits/chosen": -6.751245021820068,
"logits/rejected": -6.854135036468506,
"logps/chosen": -62.14875411987305,
"logps/rejected": -83.81892395019531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5632321238517761,
"rewards/margins": 2.1948935985565186,
"rewards/rejected": -2.7581255435943604,
"step": 1880
},
{
"epoch": 0.7230533179900268,
"grad_norm": 0.06591796875,
"learning_rate": 4.548951041744404e-05,
"logits/chosen": -6.735540866851807,
"logits/rejected": -6.806166172027588,
"logps/chosen": -61.762107849121094,
"logps/rejected": -83.3387451171875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4712158739566803,
"rewards/margins": 2.1985621452331543,
"rewards/rejected": -2.6697781085968018,
"step": 1885
},
{
"epoch": 0.7249712313003452,
"grad_norm": 0.039306640625,
"learning_rate": 4.5464265572886934e-05,
"logits/chosen": -6.727081298828125,
"logits/rejected": -6.819952487945557,
"logps/chosen": -66.33174133300781,
"logps/rejected": -87.17375183105469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7186114192008972,
"rewards/margins": 2.196577310562134,
"rewards/rejected": -2.9151885509490967,
"step": 1890
},
{
"epoch": 0.7268891446106636,
"grad_norm": 0.053955078125,
"learning_rate": 4.543891581027205e-05,
"logits/chosen": -6.7953901290893555,
"logits/rejected": -6.884771823883057,
"logps/chosen": -67.89469909667969,
"logps/rejected": -90.35826110839844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7310246229171753,
"rewards/margins": 2.199010133743286,
"rewards/rejected": -2.93003511428833,
"step": 1895
},
{
"epoch": 0.728807057920982,
"grad_norm": 0.05810546875,
"learning_rate": 4.541346125587367e-05,
"logits/chosen": -6.854927062988281,
"logits/rejected": -6.904066562652588,
"logps/chosen": -62.1525764465332,
"logps/rejected": -82.98854064941406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.559751570224762,
"rewards/margins": 2.1974406242370605,
"rewards/rejected": -2.7571921348571777,
"step": 1900
},
{
"epoch": 0.7307249712313003,
"grad_norm": 0.08740234375,
"learning_rate": 4.53879020364881e-05,
"logits/chosen": -6.737257957458496,
"logits/rejected": -6.838744163513184,
"logps/chosen": -62.46352005004883,
"logps/rejected": -84.47117614746094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5287628769874573,
"rewards/margins": 2.199683666229248,
"rewards/rejected": -2.7284464836120605,
"step": 1905
},
{
"epoch": 0.7326428845416187,
"grad_norm": 0.0546875,
"learning_rate": 4.5362238279432966e-05,
"logits/chosen": -6.782439231872559,
"logits/rejected": -6.8341827392578125,
"logps/chosen": -64.24818420410156,
"logps/rejected": -86.03765869140625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5050908327102661,
"rewards/margins": 2.196857213973999,
"rewards/rejected": -2.7019479274749756,
"step": 1910
},
{
"epoch": 0.7345607978519371,
"grad_norm": 0.0556640625,
"learning_rate": 4.533647011254668e-05,
"logits/chosen": -6.7711181640625,
"logits/rejected": -6.8788042068481445,
"logps/chosen": -65.85881042480469,
"logps/rejected": -85.33684539794922,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5805426836013794,
"rewards/margins": 2.1959762573242188,
"rewards/rejected": -2.7765188217163086,
"step": 1915
},
{
"epoch": 0.7364787111622555,
"grad_norm": 0.0576171875,
"learning_rate": 4.531059766418772e-05,
"logits/chosen": -6.808495998382568,
"logits/rejected": -6.879085540771484,
"logps/chosen": -56.28014373779297,
"logps/rejected": -77.42876434326172,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3688265085220337,
"rewards/margins": 2.1956605911254883,
"rewards/rejected": -2.5644869804382324,
"step": 1920
},
{
"epoch": 0.7383966244725738,
"grad_norm": 0.10986328125,
"learning_rate": 4.528462106323401e-05,
"logits/chosen": -6.75214147567749,
"logits/rejected": -6.826302528381348,
"logps/chosen": -63.860877990722656,
"logps/rejected": -85.62739562988281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6718140840530396,
"rewards/margins": 2.194911479949951,
"rewards/rejected": -2.866725444793701,
"step": 1925
},
{
"epoch": 0.7403145377828922,
"grad_norm": 0.037353515625,
"learning_rate": 4.525854043908233e-05,
"logits/chosen": -6.792781829833984,
"logits/rejected": -6.87396240234375,
"logps/chosen": -64.07568359375,
"logps/rejected": -85.14360046386719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6465386152267456,
"rewards/margins": 2.19848370552063,
"rewards/rejected": -2.845022439956665,
"step": 1930
},
{
"epoch": 0.7422324510932106,
"grad_norm": 0.041015625,
"learning_rate": 4.5232355921647566e-05,
"logits/chosen": -6.751712799072266,
"logits/rejected": -6.775839328765869,
"logps/chosen": -60.863189697265625,
"logps/rejected": -81.8580093383789,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4481169283390045,
"rewards/margins": 2.1981563568115234,
"rewards/rejected": -2.646273374557495,
"step": 1935
},
{
"epoch": 0.744150364403529,
"grad_norm": 0.050537109375,
"learning_rate": 4.520606764136218e-05,
"logits/chosen": -6.766876220703125,
"logits/rejected": -6.857789039611816,
"logps/chosen": -68.12163543701172,
"logps/rejected": -89.1402587890625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6213817596435547,
"rewards/margins": 2.1959993839263916,
"rewards/rejected": -2.8173813819885254,
"step": 1940
},
{
"epoch": 0.7460682777138473,
"grad_norm": 0.04541015625,
"learning_rate": 4.517967572917548e-05,
"logits/chosen": -6.843106269836426,
"logits/rejected": -6.927582740783691,
"logps/chosen": -60.88816452026367,
"logps/rejected": -83.1964111328125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5037531852722168,
"rewards/margins": 2.1971211433410645,
"rewards/rejected": -2.7008743286132812,
"step": 1945
},
{
"epoch": 0.7479861910241657,
"grad_norm": 0.0537109375,
"learning_rate": 4.5153180316553e-05,
"logits/chosen": -6.827356815338135,
"logits/rejected": -6.820866584777832,
"logps/chosen": -63.216529846191406,
"logps/rejected": -84.10847473144531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6285332441329956,
"rewards/margins": 2.1946308612823486,
"rewards/rejected": -2.823164224624634,
"step": 1950
},
{
"epoch": 0.7499041043344841,
"grad_norm": 0.048583984375,
"learning_rate": 4.5126581535475836e-05,
"logits/chosen": -6.798864841461182,
"logits/rejected": -6.872601509094238,
"logps/chosen": -66.13003540039062,
"logps/rejected": -87.56874084472656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.689039409160614,
"rewards/margins": 2.1969897747039795,
"rewards/rejected": -2.8860294818878174,
"step": 1955
},
{
"epoch": 0.7518220176448025,
"grad_norm": 0.056396484375,
"learning_rate": 4.5099879518439994e-05,
"logits/chosen": -6.668278694152832,
"logits/rejected": -6.7603960037231445,
"logps/chosen": -64.11713409423828,
"logps/rejected": -84.96097564697266,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5732687711715698,
"rewards/margins": 2.199057102203369,
"rewards/rejected": -2.7723259925842285,
"step": 1960
},
{
"epoch": 0.7537399309551208,
"grad_norm": 0.0419921875,
"learning_rate": 4.5073074398455726e-05,
"logits/chosen": -6.860171318054199,
"logits/rejected": -6.9390549659729,
"logps/chosen": -59.60344696044922,
"logps/rejected": -80.41170501708984,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4485054910182953,
"rewards/margins": 2.1985862255096436,
"rewards/rejected": -2.6470913887023926,
"step": 1965
},
{
"epoch": 0.7556578442654392,
"grad_norm": 0.05224609375,
"learning_rate": 4.504616630904687e-05,
"logits/chosen": -6.770961761474609,
"logits/rejected": -6.811574459075928,
"logps/chosen": -58.90739822387695,
"logps/rejected": -80.26576232910156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.44480863213539124,
"rewards/margins": 2.1979799270629883,
"rewards/rejected": -2.6427886486053467,
"step": 1970
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.061767578125,
"learning_rate": 4.5019155384250175e-05,
"logits/chosen": -6.8337860107421875,
"logits/rejected": -6.892735481262207,
"logps/chosen": -63.18619918823242,
"logps/rejected": -83.08711242675781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5215980410575867,
"rewards/margins": 2.1953835487365723,
"rewards/rejected": -2.7169814109802246,
"step": 1975
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.044677734375,
"learning_rate": 4.4992041758614665e-05,
"logits/chosen": -6.797111511230469,
"logits/rejected": -6.9238691329956055,
"logps/chosen": -61.52330780029297,
"logps/rejected": -82.0053939819336,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4695897698402405,
"rewards/margins": 2.1961278915405273,
"rewards/rejected": -2.665717840194702,
"step": 1980
},
{
"epoch": 0.7614115841963943,
"grad_norm": 0.060791015625,
"learning_rate": 4.4964825567200924e-05,
"logits/chosen": -6.764313697814941,
"logits/rejected": -6.81424617767334,
"logps/chosen": -63.5527229309082,
"logps/rejected": -84.34095764160156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6415268778800964,
"rewards/margins": 2.197373867034912,
"rewards/rejected": -2.838900566101074,
"step": 1985
},
{
"epoch": 0.7633294975067128,
"grad_norm": 0.05419921875,
"learning_rate": 4.493750694558045e-05,
"logits/chosen": -6.801182746887207,
"logits/rejected": -6.829155921936035,
"logps/chosen": -60.45806121826172,
"logps/rejected": -80.76268005371094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5038140416145325,
"rewards/margins": 2.197751522064209,
"rewards/rejected": -2.7015655040740967,
"step": 1990
},
{
"epoch": 0.7652474108170311,
"grad_norm": 0.1923828125,
"learning_rate": 4.4910086029834964e-05,
"logits/chosen": -6.829585075378418,
"logits/rejected": -6.883375644683838,
"logps/chosen": -64.5816879272461,
"logps/rejected": -86.17229461669922,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6635004878044128,
"rewards/margins": 2.1997756958007812,
"rewards/rejected": -2.863276243209839,
"step": 1995
},
{
"epoch": 0.7671653241273494,
"grad_norm": 0.038818359375,
"learning_rate": 4.488256295655578e-05,
"logits/chosen": -6.76079797744751,
"logits/rejected": -6.796362400054932,
"logps/chosen": -62.818328857421875,
"logps/rejected": -83.43183898925781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5418844223022461,
"rewards/margins": 2.197444438934326,
"rewards/rejected": -2.7393288612365723,
"step": 2000
},
{
"epoch": 0.7671653241273494,
"eval_logits/chosen": -6.664936065673828,
"eval_logits/rejected": -6.8608903884887695,
"eval_logps/chosen": -61.263816833496094,
"eval_logps/rejected": -84.09880065917969,
"eval_loss": 0.32511982321739197,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -0.5810979604721069,
"eval_rewards/margins": 2.1994879245758057,
"eval_rewards/rejected": -2.780586004257202,
"eval_runtime": 5.3218,
"eval_samples_per_second": 37.581,
"eval_steps_per_second": 37.581,
"step": 2000
},
{
"epoch": 0.7690832374376678,
"grad_norm": 0.072265625,
"learning_rate": 4.4854937862843045e-05,
"logits/chosen": -6.7290191650390625,
"logits/rejected": -6.806711673736572,
"logps/chosen": -64.05516052246094,
"logps/rejected": -85.09039306640625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5960283279418945,
"rewards/margins": 2.1978256702423096,
"rewards/rejected": -2.793853759765625,
"step": 2005
},
{
"epoch": 0.7710011507479861,
"grad_norm": 0.11181640625,
"learning_rate": 4.482721088630511e-05,
"logits/chosen": -6.806267738342285,
"logits/rejected": -6.852849006652832,
"logps/chosen": -64.03093719482422,
"logps/rejected": -84.20830535888672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5489016771316528,
"rewards/margins": 2.1934478282928467,
"rewards/rejected": -2.742349863052368,
"step": 2010
},
{
"epoch": 0.7729190640583046,
"grad_norm": 0.0419921875,
"learning_rate": 4.479938216505786e-05,
"logits/chosen": -6.806654930114746,
"logits/rejected": -6.920736789703369,
"logps/chosen": -63.2310791015625,
"logps/rejected": -85.70663452148438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5934334993362427,
"rewards/margins": 2.1975131034851074,
"rewards/rejected": -2.7909464836120605,
"step": 2015
},
{
"epoch": 0.7748369773686229,
"grad_norm": 0.1298828125,
"learning_rate": 4.477145183772396e-05,
"logits/chosen": -6.734241485595703,
"logits/rejected": -6.792110443115234,
"logps/chosen": -64.88965606689453,
"logps/rejected": -85.61289978027344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6298405528068542,
"rewards/margins": 2.1967482566833496,
"rewards/rejected": -2.8265888690948486,
"step": 2020
},
{
"epoch": 0.7767548906789413,
"grad_norm": 0.06494140625,
"learning_rate": 4.4743420043432235e-05,
"logits/chosen": -6.755318641662598,
"logits/rejected": -6.848862648010254,
"logps/chosen": -61.517234802246094,
"logps/rejected": -82.68122100830078,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5705845355987549,
"rewards/margins": 2.196594715118408,
"rewards/rejected": -2.767179012298584,
"step": 2025
},
{
"epoch": 0.7786728039892596,
"grad_norm": 0.0556640625,
"learning_rate": 4.471528692181692e-05,
"logits/chosen": -6.737010955810547,
"logits/rejected": -6.783226013183594,
"logps/chosen": -63.67380905151367,
"logps/rejected": -85.3752212524414,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5692884922027588,
"rewards/margins": 2.198531150817871,
"rewards/rejected": -2.767819881439209,
"step": 2030
},
{
"epoch": 0.7805907172995781,
"grad_norm": 0.058837890625,
"learning_rate": 4.468705261301701e-05,
"logits/chosen": -6.7614288330078125,
"logits/rejected": -6.859000205993652,
"logps/chosen": -62.118080139160156,
"logps/rejected": -83.3497085571289,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4895971417427063,
"rewards/margins": 2.193037509918213,
"rewards/rejected": -2.6826348304748535,
"step": 2035
},
{
"epoch": 0.7825086306098964,
"grad_norm": 0.048828125,
"learning_rate": 4.465871725767552e-05,
"logits/chosen": -6.761736869812012,
"logits/rejected": -6.77576208114624,
"logps/chosen": -64.80572509765625,
"logps/rejected": -84.99140930175781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.636228084564209,
"rewards/margins": 2.1979706287384033,
"rewards/rejected": -2.8341987133026123,
"step": 2040
},
{
"epoch": 0.7844265439202148,
"grad_norm": 0.046875,
"learning_rate": 4.4630280996938836e-05,
"logits/chosen": -6.707512855529785,
"logits/rejected": -6.794578552246094,
"logps/chosen": -59.2886848449707,
"logps/rejected": -80.18730163574219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4784001410007477,
"rewards/margins": 2.1976962089538574,
"rewards/rejected": -2.6760964393615723,
"step": 2045
},
{
"epoch": 0.7863444572305331,
"grad_norm": 0.055419921875,
"learning_rate": 4.4601743972455956e-05,
"logits/chosen": -6.743660926818848,
"logits/rejected": -6.8471574783325195,
"logps/chosen": -64.73351287841797,
"logps/rejected": -85.35092163085938,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6120368242263794,
"rewards/margins": 2.1981964111328125,
"rewards/rejected": -2.8102333545684814,
"step": 2050
},
{
"epoch": 0.7882623705408516,
"grad_norm": 0.057861328125,
"learning_rate": 4.457310632637782e-05,
"logits/chosen": -6.833253383636475,
"logits/rejected": -6.930048942565918,
"logps/chosen": -62.33038330078125,
"logps/rejected": -84.3667984008789,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5810059309005737,
"rewards/margins": 2.196547031402588,
"rewards/rejected": -2.777553081512451,
"step": 2055
},
{
"epoch": 0.7901802838511699,
"grad_norm": 0.05126953125,
"learning_rate": 4.45443682013566e-05,
"logits/chosen": -6.695115089416504,
"logits/rejected": -6.742805480957031,
"logps/chosen": -64.67724609375,
"logps/rejected": -85.6129150390625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6756724715232849,
"rewards/margins": 2.1990294456481934,
"rewards/rejected": -2.874701738357544,
"step": 2060
},
{
"epoch": 0.7920981971614883,
"grad_norm": 0.0458984375,
"learning_rate": 4.4515529740544965e-05,
"logits/chosen": -6.786559104919434,
"logits/rejected": -6.889459133148193,
"logps/chosen": -65.39054870605469,
"logps/rejected": -87.1576156616211,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6647933125495911,
"rewards/margins": 2.194797992706299,
"rewards/rejected": -2.859591245651245,
"step": 2065
},
{
"epoch": 0.7940161104718066,
"grad_norm": 0.04638671875,
"learning_rate": 4.44865910875954e-05,
"logits/chosen": -6.708530426025391,
"logits/rejected": -6.796938419342041,
"logps/chosen": -63.345420837402344,
"logps/rejected": -83.50309753417969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5706071257591248,
"rewards/margins": 2.196532726287842,
"rewards/rejected": -2.7671401500701904,
"step": 2070
},
{
"epoch": 0.7959340237821251,
"grad_norm": 0.058837890625,
"learning_rate": 4.445755238665947e-05,
"logits/chosen": -6.803549289703369,
"logits/rejected": -6.857290744781494,
"logps/chosen": -65.19212341308594,
"logps/rejected": -87.05712127685547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6691518425941467,
"rewards/margins": 2.1989452838897705,
"rewards/rejected": -2.8680973052978516,
"step": 2075
},
{
"epoch": 0.7978519370924434,
"grad_norm": 0.053955078125,
"learning_rate": 4.442841378238711e-05,
"logits/chosen": -6.7612199783325195,
"logits/rejected": -6.839663028717041,
"logps/chosen": -62.892539978027344,
"logps/rejected": -84.74234771728516,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6525223851203918,
"rewards/margins": 2.1974222660064697,
"rewards/rejected": -2.849944591522217,
"step": 2080
},
{
"epoch": 0.7997698504027618,
"grad_norm": 0.047119140625,
"learning_rate": 4.4399175419925886e-05,
"logits/chosen": -6.800496578216553,
"logits/rejected": -6.868733882904053,
"logps/chosen": -64.82361602783203,
"logps/rejected": -86.96720123291016,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.632161021232605,
"rewards/margins": 2.1986496448516846,
"rewards/rejected": -2.830810308456421,
"step": 2085
},
{
"epoch": 0.8016877637130801,
"grad_norm": 0.046630859375,
"learning_rate": 4.43698374449203e-05,
"logits/chosen": -6.7268853187561035,
"logits/rejected": -6.8085432052612305,
"logps/chosen": -62.931060791015625,
"logps/rejected": -84.10462951660156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5087457895278931,
"rewards/margins": 2.196925640106201,
"rewards/rejected": -2.705671548843384,
"step": 2090
},
{
"epoch": 0.8036056770233986,
"grad_norm": 0.05859375,
"learning_rate": 4.4340400003511073e-05,
"logits/chosen": -6.776535987854004,
"logits/rejected": -6.851083278656006,
"logps/chosen": -62.702186584472656,
"logps/rejected": -83.18672943115234,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.538625955581665,
"rewards/margins": 2.1954431533813477,
"rewards/rejected": -2.7340688705444336,
"step": 2095
},
{
"epoch": 0.8055235903337169,
"grad_norm": 0.059814453125,
"learning_rate": 4.431086324233436e-05,
"logits/chosen": -6.781048774719238,
"logits/rejected": -6.8217363357543945,
"logps/chosen": -65.83214569091797,
"logps/rejected": -86.54866790771484,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.567939281463623,
"rewards/margins": 2.200836181640625,
"rewards/rejected": -2.768775463104248,
"step": 2100
},
{
"epoch": 0.8074415036440353,
"grad_norm": 0.04736328125,
"learning_rate": 4.4281227308521064e-05,
"logits/chosen": -6.783602714538574,
"logits/rejected": -6.803304195404053,
"logps/chosen": -65.60836029052734,
"logps/rejected": -87.5150146484375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7341927289962769,
"rewards/margins": 2.195850372314453,
"rewards/rejected": -2.9300434589385986,
"step": 2105
},
{
"epoch": 0.8093594169543536,
"grad_norm": 0.07080078125,
"learning_rate": 4.4251492349696115e-05,
"logits/chosen": -6.827857971191406,
"logits/rejected": -6.948951721191406,
"logps/chosen": -60.68585968017578,
"logps/rejected": -81.73527526855469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.37252140045166016,
"rewards/margins": 2.194404125213623,
"rewards/rejected": -2.5669257640838623,
"step": 2110
},
{
"epoch": 0.8112773302646721,
"grad_norm": 0.09326171875,
"learning_rate": 4.42216585139777e-05,
"logits/chosen": -6.7736992835998535,
"logits/rejected": -6.819214820861816,
"logps/chosen": -62.67510986328125,
"logps/rejected": -84.61715698242188,
"loss": 0.3261,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5392805933952332,
"rewards/margins": 2.1884799003601074,
"rewards/rejected": -2.7277605533599854,
"step": 2115
},
{
"epoch": 0.8131952435749904,
"grad_norm": 0.08251953125,
"learning_rate": 4.419172594997653e-05,
"logits/chosen": -6.801031589508057,
"logits/rejected": -6.857367515563965,
"logps/chosen": -62.54511260986328,
"logps/rejected": -83.21441650390625,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4863591194152832,
"rewards/margins": 2.1975858211517334,
"rewards/rejected": -2.6839447021484375,
"step": 2120
},
{
"epoch": 0.8151131568853088,
"grad_norm": 0.099609375,
"learning_rate": 4.4161694806795126e-05,
"logits/chosen": -6.776806831359863,
"logits/rejected": -6.829167366027832,
"logps/chosen": -62.74772262573242,
"logps/rejected": -84.97938537597656,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6619436740875244,
"rewards/margins": 2.1976840496063232,
"rewards/rejected": -2.8596279621124268,
"step": 2125
},
{
"epoch": 0.8170310701956272,
"grad_norm": 0.103515625,
"learning_rate": 4.413156523402706e-05,
"logits/chosen": -6.788256645202637,
"logits/rejected": -6.827269077301025,
"logps/chosen": -64.37828063964844,
"logps/rejected": -85.09651184082031,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6709007620811462,
"rewards/margins": 2.2003588676452637,
"rewards/rejected": -2.8712592124938965,
"step": 2130
},
{
"epoch": 0.8189489835059456,
"grad_norm": 0.0888671875,
"learning_rate": 4.410133738175618e-05,
"logits/chosen": -6.746880531311035,
"logits/rejected": -6.855155944824219,
"logps/chosen": -60.25474166870117,
"logps/rejected": -82.25382995605469,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5617008209228516,
"rewards/margins": 2.1955270767211914,
"rewards/rejected": -2.757227897644043,
"step": 2135
},
{
"epoch": 0.8208668968162639,
"grad_norm": 0.080078125,
"learning_rate": 4.407101140055594e-05,
"logits/chosen": -6.7756781578063965,
"logits/rejected": -6.8511552810668945,
"logps/chosen": -63.31382369995117,
"logps/rejected": -84.40943908691406,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5347599387168884,
"rewards/margins": 2.194408655166626,
"rewards/rejected": -2.729168653488159,
"step": 2140
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.0732421875,
"learning_rate": 4.4040587441488566e-05,
"logits/chosen": -6.786390781402588,
"logits/rejected": -6.8126220703125,
"logps/chosen": -64.05632019042969,
"logps/rejected": -86.46208190917969,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6747733354568481,
"rewards/margins": 2.1963019371032715,
"rewards/rejected": -2.87107515335083,
"step": 2145
},
{
"epoch": 0.8247027234369007,
"grad_norm": 0.076171875,
"learning_rate": 4.401006565610436e-05,
"logits/chosen": -6.741326808929443,
"logits/rejected": -6.817544460296631,
"logps/chosen": -61.1034049987793,
"logps/rejected": -82.86426544189453,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6038416028022766,
"rewards/margins": 2.2050437927246094,
"rewards/rejected": -2.8088855743408203,
"step": 2150
},
{
"epoch": 0.826620636747219,
"grad_norm": 0.10888671875,
"learning_rate": 4.397944619644089e-05,
"logits/chosen": -6.805345058441162,
"logits/rejected": -6.8814568519592285,
"logps/chosen": -62.960472106933594,
"logps/rejected": -84.38675689697266,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5378700494766235,
"rewards/margins": 2.198153495788574,
"rewards/rejected": -2.7360236644744873,
"step": 2155
},
{
"epoch": 0.8285385500575374,
"grad_norm": 0.07373046875,
"learning_rate": 4.394872921502232e-05,
"logits/chosen": -6.7272233963012695,
"logits/rejected": -6.84502649307251,
"logps/chosen": -64.93167114257812,
"logps/rejected": -87.38648223876953,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5903478264808655,
"rewards/margins": 2.1909964084625244,
"rewards/rejected": -2.781344175338745,
"step": 2160
},
{
"epoch": 0.8304564633678557,
"grad_norm": 0.10498046875,
"learning_rate": 4.3917914864858546e-05,
"logits/chosen": -6.765233516693115,
"logits/rejected": -6.857439994812012,
"logps/chosen": -61.6948127746582,
"logps/rejected": -82.59429931640625,
"loss": 0.3284,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.46801313757896423,
"rewards/margins": 2.223072052001953,
"rewards/rejected": -2.6910855770111084,
"step": 2165
},
{
"epoch": 0.8323743766781742,
"grad_norm": 0.1083984375,
"learning_rate": 4.388700329944453e-05,
"logits/chosen": -6.72791051864624,
"logits/rejected": -6.8128252029418945,
"logps/chosen": -60.26092529296875,
"logps/rejected": -81.54100036621094,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4659239649772644,
"rewards/margins": 2.1824138164520264,
"rewards/rejected": -2.6483378410339355,
"step": 2170
},
{
"epoch": 0.8342922899884925,
"grad_norm": 0.10986328125,
"learning_rate": 4.385599467275944e-05,
"logits/chosen": -6.779816627502441,
"logits/rejected": -6.829860687255859,
"logps/chosen": -60.439300537109375,
"logps/rejected": -81.71858215332031,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.546042799949646,
"rewards/margins": 2.1965088844299316,
"rewards/rejected": -2.742551803588867,
"step": 2175
},
{
"epoch": 0.8362102032988109,
"grad_norm": 0.2451171875,
"learning_rate": 4.3824889139265984e-05,
"logits/chosen": -6.751175880432129,
"logits/rejected": -6.8204522132873535,
"logps/chosen": -64.71672058105469,
"logps/rejected": -84.95536804199219,
"loss": 0.3256,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5461716651916504,
"rewards/margins": 2.2114083766937256,
"rewards/rejected": -2.757579803466797,
"step": 2180
},
{
"epoch": 0.8381281166091292,
"grad_norm": 0.1669921875,
"learning_rate": 4.3793686853909556e-05,
"logits/chosen": -6.78512716293335,
"logits/rejected": -6.888462066650391,
"logps/chosen": -63.5004997253418,
"logps/rejected": -83.88385009765625,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6083672642707825,
"rewards/margins": 2.203294038772583,
"rewards/rejected": -2.8116612434387207,
"step": 2185
},
{
"epoch": 0.8400460299194477,
"grad_norm": 0.10693359375,
"learning_rate": 4.376238797211751e-05,
"logits/chosen": -6.882073879241943,
"logits/rejected": -6.960659980773926,
"logps/chosen": -62.78485107421875,
"logps/rejected": -84.0,
"loss": 0.3258,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5518940687179565,
"rewards/margins": 2.181422472000122,
"rewards/rejected": -2.7333168983459473,
"step": 2190
},
{
"epoch": 0.841963943229766,
"grad_norm": 0.10302734375,
"learning_rate": 4.373099264979839e-05,
"logits/chosen": -6.732028961181641,
"logits/rejected": -6.831223487854004,
"logps/chosen": -65.45835876464844,
"logps/rejected": -87.10944366455078,
"loss": 0.3254,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7259451150894165,
"rewards/margins": 2.1964221000671387,
"rewards/rejected": -2.9223673343658447,
"step": 2195
},
{
"epoch": 0.8438818565400844,
"grad_norm": 0.201171875,
"learning_rate": 4.36995010433411e-05,
"logits/chosen": -6.707369804382324,
"logits/rejected": -6.776480197906494,
"logps/chosen": -67.71757507324219,
"logps/rejected": -89.28482055664062,
"loss": 0.326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7898483276367188,
"rewards/margins": 2.1907553672790527,
"rewards/rejected": -2.9806036949157715,
"step": 2200
},
{
"epoch": 0.8457997698504027,
"grad_norm": 0.2734375,
"learning_rate": 4.366791330961419e-05,
"logits/chosen": -6.8558478355407715,
"logits/rejected": -6.885350704193115,
"logps/chosen": -66.4366226196289,
"logps/rejected": -85.9381103515625,
"loss": 0.326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.614000141620636,
"rewards/margins": 2.194479465484619,
"rewards/rejected": -2.8084795475006104,
"step": 2205
},
{
"epoch": 0.8477176831607212,
"grad_norm": 0.2197265625,
"learning_rate": 4.3636229605965046e-05,
"logits/chosen": -6.758575439453125,
"logits/rejected": -6.804045677185059,
"logps/chosen": -66.45049285888672,
"logps/rejected": -88.2242202758789,
"loss": 0.326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.743838906288147,
"rewards/margins": 2.1854729652404785,
"rewards/rejected": -2.929311752319336,
"step": 2210
},
{
"epoch": 0.8496355964710395,
"grad_norm": 0.21484375,
"learning_rate": 4.3604450090219094e-05,
"logits/chosen": -6.822757720947266,
"logits/rejected": -6.8710737228393555,
"logps/chosen": -63.0651741027832,
"logps/rejected": -84.55931854248047,
"loss": 0.3277,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5265265703201294,
"rewards/margins": 2.20011043548584,
"rewards/rejected": -2.7266366481781006,
"step": 2215
},
{
"epoch": 0.8515535097813579,
"grad_norm": 0.62890625,
"learning_rate": 4.357257492067904e-05,
"logits/chosen": -6.8631696701049805,
"logits/rejected": -6.924670219421387,
"logps/chosen": -58.963844299316406,
"logps/rejected": -80.02134704589844,
"loss": 0.3317,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4552484154701233,
"rewards/margins": 2.2213027477264404,
"rewards/rejected": -2.676551103591919,
"step": 2220
},
{
"epoch": 0.8534714230916762,
"grad_norm": 0.82421875,
"learning_rate": 4.3540604256124066e-05,
"logits/chosen": -6.800238132476807,
"logits/rejected": -6.881220817565918,
"logps/chosen": -60.06099319458008,
"logps/rejected": -81.65616607666016,
"loss": 0.3294,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4933759570121765,
"rewards/margins": 2.1853110790252686,
"rewards/rejected": -2.67868709564209,
"step": 2225
},
{
"epoch": 0.8553893364019947,
"grad_norm": 1.0390625,
"learning_rate": 4.3508538255809035e-05,
"logits/chosen": -6.7798614501953125,
"logits/rejected": -6.826601982116699,
"logps/chosen": -61.747901916503906,
"logps/rejected": -83.8855209350586,
"loss": 0.3466,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.39245396852493286,
"rewards/margins": 2.295319080352783,
"rewards/rejected": -2.687772750854492,
"step": 2230
},
{
"epoch": 0.857307249712313,
"grad_norm": 5.9375,
"learning_rate": 4.3476377079463705e-05,
"logits/chosen": -6.831582546234131,
"logits/rejected": -6.855175971984863,
"logps/chosen": -62.176368713378906,
"logps/rejected": -81.49797058105469,
"loss": 0.3641,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -0.527076005935669,
"rewards/margins": 2.0065996646881104,
"rewards/rejected": -2.5336756706237793,
"step": 2235
},
{
"epoch": 0.8592251630226314,
"grad_norm": 2.734375,
"learning_rate": 4.3444120887291936e-05,
"logits/chosen": -6.75543737411499,
"logits/rejected": -6.856991767883301,
"logps/chosen": -60.326416015625,
"logps/rejected": -79.26715850830078,
"loss": 0.3855,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -0.3984132707118988,
"rewards/margins": 1.9175384044647217,
"rewards/rejected": -2.3159518241882324,
"step": 2240
},
{
"epoch": 0.8611430763329497,
"grad_norm": 2.140625,
"learning_rate": 4.34117698399709e-05,
"logits/chosen": -6.790041446685791,
"logits/rejected": -6.8845930099487305,
"logps/chosen": -57.28984832763672,
"logps/rejected": -74.63005065917969,
"loss": 0.3935,
"rewards/accuracies": 0.953125,
"rewards/chosen": -0.3596039116382599,
"rewards/margins": 1.7848408222198486,
"rewards/rejected": -2.144444704055786,
"step": 2245
},
{
"epoch": 0.8630609896432682,
"grad_norm": 5.5625,
"learning_rate": 4.337932409865023e-05,
"logits/chosen": -6.925728797912598,
"logits/rejected": -6.95987606048584,
"logps/chosen": -62.9880256652832,
"logps/rejected": -80.00062561035156,
"loss": 0.4107,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -0.46328768134117126,
"rewards/margins": 1.8457485437393188,
"rewards/rejected": -2.3090367317199707,
"step": 2250
},
{
"epoch": 0.8649789029535865,
"grad_norm": 2.34375,
"learning_rate": 4.33467838249513e-05,
"logits/chosen": -6.822978973388672,
"logits/rejected": -6.83398962020874,
"logps/chosen": -60.968116760253906,
"logps/rejected": -78.74832916259766,
"loss": 0.3894,
"rewards/accuracies": 0.953125,
"rewards/chosen": -0.4434930682182312,
"rewards/margins": 1.8895623683929443,
"rewards/rejected": -2.3330554962158203,
"step": 2255
},
{
"epoch": 0.8668968162639049,
"grad_norm": 2.203125,
"learning_rate": 4.331414918096637e-05,
"logits/chosen": -6.800551414489746,
"logits/rejected": -6.828469753265381,
"logps/chosen": -62.655357360839844,
"logps/rejected": -83.25059509277344,
"loss": 0.36,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.5360097885131836,
"rewards/margins": 2.1591033935546875,
"rewards/rejected": -2.695112943649292,
"step": 2260
},
{
"epoch": 0.8688147295742232,
"grad_norm": 3.328125,
"learning_rate": 4.328142032925777e-05,
"logits/chosen": -6.843760013580322,
"logits/rejected": -6.8942742347717285,
"logps/chosen": -60.87250900268555,
"logps/rejected": -80.40345764160156,
"loss": 0.379,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -0.45352882146835327,
"rewards/margins": 1.9877502918243408,
"rewards/rejected": -2.441279172897339,
"step": 2265
},
{
"epoch": 0.8707326428845417,
"grad_norm": 1.34375,
"learning_rate": 4.32485974328571e-05,
"logits/chosen": -6.862430572509766,
"logits/rejected": -6.91161584854126,
"logps/chosen": -60.5975341796875,
"logps/rejected": -78.65064239501953,
"loss": 0.3711,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -0.32780012488365173,
"rewards/margins": 1.910913109779358,
"rewards/rejected": -2.238713264465332,
"step": 2270
},
{
"epoch": 0.87265055619486,
"grad_norm": 7.03125,
"learning_rate": 4.3215680655264436e-05,
"logits/chosen": -6.953143119812012,
"logits/rejected": -6.958700656890869,
"logps/chosen": -60.2089958190918,
"logps/rejected": -80.94921112060547,
"loss": 0.3767,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.4435577392578125,
"rewards/margins": 2.0195579528808594,
"rewards/rejected": -2.463115692138672,
"step": 2275
},
{
"epoch": 0.8745684695051784,
"grad_norm": 2.375,
"learning_rate": 4.3182670160447495e-05,
"logits/chosen": -6.874422550201416,
"logits/rejected": -6.977646827697754,
"logps/chosen": -58.399261474609375,
"logps/rejected": -80.04866027832031,
"loss": 0.353,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.26949343085289,
"rewards/margins": 2.098801851272583,
"rewards/rejected": -2.368295192718506,
"step": 2280
},
{
"epoch": 0.8764863828154967,
"grad_norm": 3.828125,
"learning_rate": 4.314956611284084e-05,
"logits/chosen": -6.972007751464844,
"logits/rejected": -7.0395026206970215,
"logps/chosen": -61.51708984375,
"logps/rejected": -81.02632141113281,
"loss": 0.3673,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -0.26689666509628296,
"rewards/margins": 2.0594050884246826,
"rewards/rejected": -2.3263020515441895,
"step": 2285
},
{
"epoch": 0.8784042961258152,
"grad_norm": 1.796875,
"learning_rate": 4.311636867734503e-05,
"logits/chosen": -6.955082893371582,
"logits/rejected": -7.046578407287598,
"logps/chosen": -58.28876876831055,
"logps/rejected": -81.63599395751953,
"loss": 0.3619,
"rewards/accuracies": 0.984375,
"rewards/chosen": -0.30293378233909607,
"rewards/margins": 2.241940498352051,
"rewards/rejected": -2.544874429702759,
"step": 2290
},
{
"epoch": 0.8803222094361335,
"grad_norm": 1.3125,
"learning_rate": 4.308307801932584e-05,
"logits/chosen": -6.8621649742126465,
"logits/rejected": -6.930577754974365,
"logps/chosen": -61.332130432128906,
"logps/rejected": -82.65200805664062,
"loss": 0.3517,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.26416295766830444,
"rewards/margins": 2.246364116668701,
"rewards/rejected": -2.5105273723602295,
"step": 2295
},
{
"epoch": 0.8822401227464519,
"grad_norm": 1.0703125,
"learning_rate": 4.304969430461337e-05,
"logits/chosen": -6.878881931304932,
"logits/rejected": -6.98480224609375,
"logps/chosen": -57.89545822143555,
"logps/rejected": -77.03311920166016,
"loss": 0.3547,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.03846656531095505,
"rewards/margins": 2.111886739730835,
"rewards/rejected": -2.1503536701202393,
"step": 2300
},
{
"epoch": 0.8841580360567702,
"grad_norm": 0.8125,
"learning_rate": 4.301621769950129e-05,
"logits/chosen": -6.871652126312256,
"logits/rejected": -6.941521644592285,
"logps/chosen": -52.32462692260742,
"logps/rejected": -73.90729522705078,
"loss": 0.3493,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.026116441935300827,
"rewards/margins": 2.0886855125427246,
"rewards/rejected": -2.1148018836975098,
"step": 2305
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.5703125,
"learning_rate": 4.2982648370746005e-05,
"logits/chosen": -6.835605621337891,
"logits/rejected": -6.929086208343506,
"logps/chosen": -63.22246170043945,
"logps/rejected": -83.64646911621094,
"loss": 0.3502,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.3698285222053528,
"rewards/margins": 2.067920446395874,
"rewards/rejected": -2.437749147415161,
"step": 2310
},
{
"epoch": 0.887993862677407,
"grad_norm": 1.171875,
"learning_rate": 4.294898648556575e-05,
"logits/chosen": -6.868348598480225,
"logits/rejected": -6.924635887145996,
"logps/chosen": -60.72758865356445,
"logps/rejected": -82.21714782714844,
"loss": 0.3501,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.34204745292663574,
"rewards/margins": 2.1199233531951904,
"rewards/rejected": -2.4619712829589844,
"step": 2315
},
{
"epoch": 0.8899117759877253,
"grad_norm": 1.140625,
"learning_rate": 4.2915232211639855e-05,
"logits/chosen": -6.862878322601318,
"logits/rejected": -6.937772274017334,
"logps/chosen": -57.07465744018555,
"logps/rejected": -78.38792419433594,
"loss": 0.3525,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.14980041980743408,
"rewards/margins": 2.179995059967041,
"rewards/rejected": -2.3297953605651855,
"step": 2320
},
{
"epoch": 0.8918296892980437,
"grad_norm": 1.265625,
"learning_rate": 4.288138571710783e-05,
"logits/chosen": -6.880101680755615,
"logits/rejected": -6.914247989654541,
"logps/chosen": -59.02347946166992,
"logps/rejected": -79.57148742675781,
"loss": 0.3485,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -0.11769998073577881,
"rewards/margins": 2.1596789360046387,
"rewards/rejected": -2.277379035949707,
"step": 2325
},
{
"epoch": 0.8937476026083621,
"grad_norm": 1.5234375,
"learning_rate": 4.2847447170568584e-05,
"logits/chosen": -6.755918979644775,
"logits/rejected": -6.846758842468262,
"logps/chosen": -57.86151123046875,
"logps/rejected": -79.17916870117188,
"loss": 0.3405,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.14960043132305145,
"rewards/margins": 2.1391072273254395,
"rewards/rejected": -2.2887072563171387,
"step": 2330
},
{
"epoch": 0.8956655159186805,
"grad_norm": 0.6953125,
"learning_rate": 4.2813416741079564e-05,
"logits/chosen": -6.889958381652832,
"logits/rejected": -6.950301170349121,
"logps/chosen": -58.084312438964844,
"logps/rejected": -78.97642517089844,
"loss": 0.3407,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14899101853370667,
"rewards/margins": 2.1399528980255127,
"rewards/rejected": -2.2889437675476074,
"step": 2335
},
{
"epoch": 0.8975834292289988,
"grad_norm": 0.62890625,
"learning_rate": 4.27792945981559e-05,
"logits/chosen": -6.840428829193115,
"logits/rejected": -6.967151641845703,
"logps/chosen": -56.54399490356445,
"logps/rejected": -76.96357727050781,
"loss": 0.3386,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": 0.008828367106616497,
"rewards/margins": 2.1768367290496826,
"rewards/rejected": -2.168008327484131,
"step": 2340
},
{
"epoch": 0.8995013425393172,
"grad_norm": 0.48046875,
"learning_rate": 4.2745080911769565e-05,
"logits/chosen": -6.883180141448975,
"logits/rejected": -6.966970920562744,
"logps/chosen": -57.99214553833008,
"logps/rejected": -79.50528717041016,
"loss": 0.3409,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.05504138022661209,
"rewards/margins": 2.1709494590759277,
"rewards/rejected": -2.2259907722473145,
"step": 2345
},
{
"epoch": 0.9014192558496356,
"grad_norm": 0.369140625,
"learning_rate": 4.2710775852348576e-05,
"logits/chosen": -6.948625087738037,
"logits/rejected": -6.978739261627197,
"logps/chosen": -62.683006286621094,
"logps/rejected": -84.10035705566406,
"loss": 0.3405,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11399755626916885,
"rewards/margins": 2.2531776428222656,
"rewards/rejected": -2.3671748638153076,
"step": 2350
},
{
"epoch": 0.903337169159954,
"grad_norm": 0.5703125,
"learning_rate": 4.267637959077606e-05,
"logits/chosen": -6.881336212158203,
"logits/rejected": -6.992560386657715,
"logps/chosen": -57.68507766723633,
"logps/rejected": -79.43367767333984,
"loss": 0.3344,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.08601374179124832,
"rewards/margins": 2.2121574878692627,
"rewards/rejected": -2.298171281814575,
"step": 2355
},
{
"epoch": 0.9052550824702723,
"grad_norm": 0.326171875,
"learning_rate": 4.264189229838945e-05,
"logits/chosen": -6.853945255279541,
"logits/rejected": -6.962986946105957,
"logps/chosen": -55.08637237548828,
"logps/rejected": -76.57861328125,
"loss": 0.3395,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": 0.04339681938290596,
"rewards/margins": 2.2207624912261963,
"rewards/rejected": -2.177365779876709,
"step": 2360
},
{
"epoch": 0.9071729957805907,
"grad_norm": 0.396484375,
"learning_rate": 4.2607314146979664e-05,
"logits/chosen": -6.9054412841796875,
"logits/rejected": -7.006344795227051,
"logps/chosen": -55.31730270385742,
"logps/rejected": -77.40642547607422,
"loss": 0.3399,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11814818531274796,
"rewards/margins": 2.2816827297210693,
"rewards/rejected": -2.16353440284729,
"step": 2365
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.59765625,
"learning_rate": 4.257264530879019e-05,
"logits/chosen": -6.856900691986084,
"logits/rejected": -6.938775539398193,
"logps/chosen": -55.32623291015625,
"logps/rejected": -76.9993667602539,
"loss": 0.3337,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.012098994106054306,
"rewards/margins": 2.1912758350372314,
"rewards/rejected": -2.1791763305664062,
"step": 2370
},
{
"epoch": 0.9110088224012275,
"grad_norm": 0.51953125,
"learning_rate": 4.253788595651624e-05,
"logits/chosen": -6.835860252380371,
"logits/rejected": -6.922590732574463,
"logps/chosen": -56.22468185424805,
"logps/rejected": -78.07466125488281,
"loss": 0.3349,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08803171664476395,
"rewards/margins": 2.245025634765625,
"rewards/rejected": -2.156993865966797,
"step": 2375
},
{
"epoch": 0.9129267357115458,
"grad_norm": 0.29296875,
"learning_rate": 4.250303626330394e-05,
"logits/chosen": -6.897280693054199,
"logits/rejected": -6.991764068603516,
"logps/chosen": -54.25004959106445,
"logps/rejected": -75.50594329833984,
"loss": 0.3321,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08122767508029938,
"rewards/margins": 2.2148044109344482,
"rewards/rejected": -2.1335768699645996,
"step": 2380
},
{
"epoch": 0.9148446490218642,
"grad_norm": 0.75,
"learning_rate": 4.246809640274939e-05,
"logits/chosen": -6.8050665855407715,
"logits/rejected": -6.896435737609863,
"logps/chosen": -56.57902145385742,
"logps/rejected": -77.64833068847656,
"loss": 0.3316,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13071151077747345,
"rewards/margins": 2.223146438598633,
"rewards/rejected": -2.0924346446990967,
"step": 2385
},
{
"epoch": 0.9167625623321826,
"grad_norm": 0.310546875,
"learning_rate": 4.243306654889788e-05,
"logits/chosen": -6.833188056945801,
"logits/rejected": -6.917020320892334,
"logps/chosen": -53.51298141479492,
"logps/rejected": -75.99391174316406,
"loss": 0.3292,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15236929059028625,
"rewards/margins": 2.231903314590454,
"rewards/rejected": -2.0795340538024902,
"step": 2390
},
{
"epoch": 0.918680475642501,
"grad_norm": 0.14453125,
"learning_rate": 4.239794687624294e-05,
"logits/chosen": -6.868006229400635,
"logits/rejected": -6.944448947906494,
"logps/chosen": -54.134910583496094,
"logps/rejected": -74.07920837402344,
"loss": 0.3273,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1325669139623642,
"rewards/margins": 2.1751344203948975,
"rewards/rejected": -2.042567729949951,
"step": 2395
},
{
"epoch": 0.9205983889528193,
"grad_norm": 0.25,
"learning_rate": 4.2362737559725526e-05,
"logits/chosen": -6.768338203430176,
"logits/rejected": -6.835727691650391,
"logps/chosen": -55.674278259277344,
"logps/rejected": -76.75242614746094,
"loss": 0.3299,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": 0.09858144074678421,
"rewards/margins": 2.154266834259033,
"rewards/rejected": -2.05568528175354,
"step": 2400
},
{
"epoch": 0.9225163022631377,
"grad_norm": 0.2041015625,
"learning_rate": 4.232743877473316e-05,
"logits/chosen": -6.808007717132568,
"logits/rejected": -6.890481472015381,
"logps/chosen": -51.641632080078125,
"logps/rejected": -72.97166442871094,
"loss": 0.3288,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18179336190223694,
"rewards/margins": 2.2345938682556152,
"rewards/rejected": -2.0528006553649902,
"step": 2405
},
{
"epoch": 0.9244342155734561,
"grad_norm": 0.16015625,
"learning_rate": 4.229205069709898e-05,
"logits/chosen": -6.86145544052124,
"logits/rejected": -6.930682182312012,
"logps/chosen": -55.37908172607422,
"logps/rejected": -76.59054565429688,
"loss": 0.3286,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09439722448587418,
"rewards/margins": 2.2271690368652344,
"rewards/rejected": -2.1327717304229736,
"step": 2410
},
{
"epoch": 0.9263521288837745,
"grad_norm": 0.248046875,
"learning_rate": 4.225657350310099e-05,
"logits/chosen": -6.836122989654541,
"logits/rejected": -6.910861015319824,
"logps/chosen": -59.543251037597656,
"logps/rejected": -80.47911071777344,
"loss": 0.3281,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04849497973918915,
"rewards/margins": 2.2070679664611816,
"rewards/rejected": -2.1585729122161865,
"step": 2415
},
{
"epoch": 0.9282700421940928,
"grad_norm": 0.1591796875,
"learning_rate": 4.222100736946103e-05,
"logits/chosen": -6.886750221252441,
"logits/rejected": -6.970227241516113,
"logps/chosen": -53.348419189453125,
"logps/rejected": -75.61641693115234,
"loss": 0.3282,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17651376128196716,
"rewards/margins": 2.2037792205810547,
"rewards/rejected": -2.027265787124634,
"step": 2420
},
{
"epoch": 0.9301879555044112,
"grad_norm": 0.1220703125,
"learning_rate": 4.218535247334402e-05,
"logits/chosen": -6.816466331481934,
"logits/rejected": -6.905699729919434,
"logps/chosen": -55.218040466308594,
"logps/rejected": -76.34078979492188,
"loss": 0.3269,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1573016345500946,
"rewards/margins": 2.205143451690674,
"rewards/rejected": -2.047841787338257,
"step": 2425
},
{
"epoch": 0.9321058688147296,
"grad_norm": 0.2138671875,
"learning_rate": 4.2149608992357024e-05,
"logits/chosen": -6.868281364440918,
"logits/rejected": -6.949463844299316,
"logps/chosen": -54.75147247314453,
"logps/rejected": -75.82257080078125,
"loss": 0.3266,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17414028942584991,
"rewards/margins": 2.208597183227539,
"rewards/rejected": -2.034456729888916,
"step": 2430
},
{
"epoch": 0.934023782125048,
"grad_norm": 0.1953125,
"learning_rate": 4.211377710454837e-05,
"logits/chosen": -6.814573764801025,
"logits/rejected": -6.940939903259277,
"logps/chosen": -55.294456481933594,
"logps/rejected": -76.1449966430664,
"loss": 0.3266,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1870449334383011,
"rewards/margins": 2.2274434566497803,
"rewards/rejected": -2.040398120880127,
"step": 2435
},
{
"epoch": 0.9359416954353663,
"grad_norm": 0.201171875,
"learning_rate": 4.2077856988406756e-05,
"logits/chosen": -6.854167938232422,
"logits/rejected": -6.9163618087768555,
"logps/chosen": -52.53936767578125,
"logps/rejected": -75.00141906738281,
"loss": 0.3272,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16979984939098358,
"rewards/margins": 2.208627700805664,
"rewards/rejected": -2.038827896118164,
"step": 2440
},
{
"epoch": 0.9378596087456847,
"grad_norm": 0.1123046875,
"learning_rate": 4.204184882286038e-05,
"logits/chosen": -6.842519283294678,
"logits/rejected": -6.933518409729004,
"logps/chosen": -56.68706512451172,
"logps/rejected": -78.18891143798828,
"loss": 0.3259,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09812651574611664,
"rewards/margins": 2.197338581085205,
"rewards/rejected": -2.099212169647217,
"step": 2445
},
{
"epoch": 0.9397775220560031,
"grad_norm": 0.0732421875,
"learning_rate": 4.200575278727604e-05,
"logits/chosen": -6.877575874328613,
"logits/rejected": -6.985260009765625,
"logps/chosen": -54.264122009277344,
"logps/rejected": -74.304931640625,
"loss": 0.3259,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12776386737823486,
"rewards/margins": 2.1849300861358643,
"rewards/rejected": -2.057166337966919,
"step": 2450
},
{
"epoch": 0.9416954353663214,
"grad_norm": 0.09326171875,
"learning_rate": 4.1969569061458224e-05,
"logits/chosen": -6.85354471206665,
"logits/rejected": -6.928282737731934,
"logps/chosen": -58.26618576049805,
"logps/rejected": -79.95372009277344,
"loss": 0.3256,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06239093095064163,
"rewards/margins": 2.1833994388580322,
"rewards/rejected": -2.1210083961486816,
"step": 2455
},
{
"epoch": 0.9436133486766398,
"grad_norm": 0.1328125,
"learning_rate": 4.1933297825648244e-05,
"logits/chosen": -6.8315935134887695,
"logits/rejected": -6.913296699523926,
"logps/chosen": -53.44468307495117,
"logps/rejected": -75.22355651855469,
"loss": 0.3259,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1496339738368988,
"rewards/margins": 2.1975769996643066,
"rewards/rejected": -2.047943353652954,
"step": 2460
},
{
"epoch": 0.9455312619869581,
"grad_norm": 0.1435546875,
"learning_rate": 4.189693926052333e-05,
"logits/chosen": -6.85409688949585,
"logits/rejected": -6.940674781799316,
"logps/chosen": -54.494407653808594,
"logps/rejected": -75.82017517089844,
"loss": 0.3255,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19262854754924774,
"rewards/margins": 2.1881494522094727,
"rewards/rejected": -1.995520830154419,
"step": 2465
},
{
"epoch": 0.9474491752972766,
"grad_norm": 0.07275390625,
"learning_rate": 4.186049354719571e-05,
"logits/chosen": -6.920261383056641,
"logits/rejected": -6.956971645355225,
"logps/chosen": -54.238853454589844,
"logps/rejected": -74.33580017089844,
"loss": 0.3257,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12954334914684296,
"rewards/margins": 2.2072174549102783,
"rewards/rejected": -2.077674150466919,
"step": 2470
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.1474609375,
"learning_rate": 4.182396086721172e-05,
"logits/chosen": -6.929312705993652,
"logits/rejected": -6.987416744232178,
"logps/chosen": -51.33516311645508,
"logps/rejected": -72.56217956542969,
"loss": 0.3259,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18959593772888184,
"rewards/margins": 2.210606098175049,
"rewards/rejected": -2.021010160446167,
"step": 2475
},
{
"epoch": 0.9512850019179133,
"grad_norm": 0.05224609375,
"learning_rate": 4.1787341402550915e-05,
"logits/chosen": -6.868629455566406,
"logits/rejected": -6.9263153076171875,
"logps/chosen": -56.88227462768555,
"logps/rejected": -78.9183578491211,
"loss": 0.3254,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10913822799921036,
"rewards/margins": 2.2045962810516357,
"rewards/rejected": -2.0954582691192627,
"step": 2480
},
{
"epoch": 0.9532029152282316,
"grad_norm": 0.146484375,
"learning_rate": 4.175063533562514e-05,
"logits/chosen": -6.909239292144775,
"logits/rejected": -6.994379997253418,
"logps/chosen": -57.91779708862305,
"logps/rejected": -78.76531219482422,
"loss": 0.3256,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0988362729549408,
"rewards/margins": 2.2019526958465576,
"rewards/rejected": -2.103116273880005,
"step": 2485
},
{
"epoch": 0.9551208285385501,
"grad_norm": 0.05029296875,
"learning_rate": 4.1713842849277634e-05,
"logits/chosen": -6.838289737701416,
"logits/rejected": -6.935790061950684,
"logps/chosen": -53.85325241088867,
"logps/rejected": -75.92013549804688,
"loss": 0.3258,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17410710453987122,
"rewards/margins": 2.203998327255249,
"rewards/rejected": -2.0298912525177,
"step": 2490
},
{
"epoch": 0.9570387418488684,
"grad_norm": 0.040771484375,
"learning_rate": 4.1676964126782105e-05,
"logits/chosen": -6.864720821380615,
"logits/rejected": -6.89766788482666,
"logps/chosen": -54.05454635620117,
"logps/rejected": -74.34647369384766,
"loss": 0.3255,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16576418280601501,
"rewards/margins": 2.1935932636260986,
"rewards/rejected": -2.0278289318084717,
"step": 2495
},
{
"epoch": 0.9589566551591868,
"grad_norm": 0.06298828125,
"learning_rate": 4.1639999351841845e-05,
"logits/chosen": -6.877594947814941,
"logits/rejected": -6.9712982177734375,
"logps/chosen": -57.58259963989258,
"logps/rejected": -78.67808532714844,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15820643305778503,
"rewards/margins": 2.194255828857422,
"rewards/rejected": -2.0360493659973145,
"step": 2500
},
{
"epoch": 0.9589566551591868,
"eval_logits/chosen": -6.7274932861328125,
"eval_logits/rejected": -7.004845142364502,
"eval_logps/chosen": -54.04244613647461,
"eval_logps/rejected": -76.8889389038086,
"eval_loss": 0.32545679807662964,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.14103949069976807,
"eval_rewards/margins": 2.20063853263855,
"eval_rewards/rejected": -2.0595991611480713,
"eval_runtime": 5.2337,
"eval_samples_per_second": 38.214,
"eval_steps_per_second": 38.214,
"step": 2500
},
{
"epoch": 0.9608745684695051,
"grad_norm": 0.062255859375,
"learning_rate": 4.160294870858879e-05,
"logits/chosen": -6.783505916595459,
"logits/rejected": -6.876799583435059,
"logps/chosen": -56.87532424926758,
"logps/rejected": -78.06890869140625,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17683516442775726,
"rewards/margins": 2.1882314682006836,
"rewards/rejected": -2.0113959312438965,
"step": 2505
},
{
"epoch": 0.9627924817798236,
"grad_norm": 0.0625,
"learning_rate": 4.15658123815826e-05,
"logits/chosen": -6.886815071105957,
"logits/rejected": -6.940008640289307,
"logps/chosen": -51.72332000732422,
"logps/rejected": -73.25819396972656,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24257156252861023,
"rewards/margins": 2.1972813606262207,
"rewards/rejected": -1.954709768295288,
"step": 2510
},
{
"epoch": 0.9647103950901419,
"grad_norm": 0.0927734375,
"learning_rate": 4.152859055580976e-05,
"logits/chosen": -6.846029758453369,
"logits/rejected": -6.949192047119141,
"logps/chosen": -56.957305908203125,
"logps/rejected": -79.1841049194336,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10997702181339264,
"rewards/margins": 2.1889004707336426,
"rewards/rejected": -2.0789237022399902,
"step": 2515
},
{
"epoch": 0.9666283084004603,
"grad_norm": 0.05859375,
"learning_rate": 4.1491283416682646e-05,
"logits/chosen": -6.93172550201416,
"logits/rejected": -7.003387451171875,
"logps/chosen": -59.26605224609375,
"logps/rejected": -80.91238403320312,
"loss": 0.3254,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0580294243991375,
"rewards/margins": 2.1971397399902344,
"rewards/rejected": -2.1391100883483887,
"step": 2520
},
{
"epoch": 0.9685462217107786,
"grad_norm": 0.0257568359375,
"learning_rate": 4.145389115003861e-05,
"logits/chosen": -6.888918399810791,
"logits/rejected": -6.952479362487793,
"logps/chosen": -55.66357421875,
"logps/rejected": -77.88643646240234,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.140243262052536,
"rewards/margins": 2.1988415718078613,
"rewards/rejected": -2.058598041534424,
"step": 2525
},
{
"epoch": 0.9704641350210971,
"grad_norm": 0.055908203125,
"learning_rate": 4.141641394213903e-05,
"logits/chosen": -6.902583122253418,
"logits/rejected": -6.9869842529296875,
"logps/chosen": -55.62483596801758,
"logps/rejected": -76.63645935058594,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15657739341259003,
"rewards/margins": 2.1922218799591064,
"rewards/rejected": -2.035644292831421,
"step": 2530
},
{
"epoch": 0.9723820483314154,
"grad_norm": 0.1396484375,
"learning_rate": 4.1378851979668424e-05,
"logits/chosen": -6.8762526512146,
"logits/rejected": -6.9788818359375,
"logps/chosen": -56.33964920043945,
"logps/rejected": -77.26721954345703,
"loss": 0.3256,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14851531386375427,
"rewards/margins": 2.206197738647461,
"rewards/rejected": -2.057682514190674,
"step": 2535
},
{
"epoch": 0.9742999616417338,
"grad_norm": 0.04541015625,
"learning_rate": 4.134120544973349e-05,
"logits/chosen": -6.7452521324157715,
"logits/rejected": -6.859133720397949,
"logps/chosen": -52.94233322143555,
"logps/rejected": -74.87626647949219,
"loss": 0.3254,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1908825784921646,
"rewards/margins": 2.20391845703125,
"rewards/rejected": -2.0130362510681152,
"step": 2540
},
{
"epoch": 0.9762178749520521,
"grad_norm": 0.033203125,
"learning_rate": 4.1303474539862165e-05,
"logits/chosen": -6.876730442047119,
"logits/rejected": -6.973580837249756,
"logps/chosen": -58.592201232910156,
"logps/rejected": -80.33421325683594,
"loss": 0.3254,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0061996979638934135,
"rewards/margins": 2.2044360637664795,
"rewards/rejected": -2.2106356620788574,
"step": 2545
},
{
"epoch": 0.9781357882623706,
"grad_norm": 0.0400390625,
"learning_rate": 4.126565943800272e-05,
"logits/chosen": -6.827844142913818,
"logits/rejected": -6.945103645324707,
"logps/chosen": -51.06730651855469,
"logps/rejected": -73.34208679199219,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17692360281944275,
"rewards/margins": 2.20219087600708,
"rewards/rejected": -2.0252671241760254,
"step": 2550
},
{
"epoch": 0.9800537015726889,
"grad_norm": 0.044921875,
"learning_rate": 4.12277603325228e-05,
"logits/chosen": -6.779177665710449,
"logits/rejected": -6.879153251647949,
"logps/chosen": -56.09104537963867,
"logps/rejected": -77.8788833618164,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1476428508758545,
"rewards/margins": 2.2048072814941406,
"rewards/rejected": -2.0571646690368652,
"step": 2555
},
{
"epoch": 0.9819716148830073,
"grad_norm": 0.056884765625,
"learning_rate": 4.1189777412208516e-05,
"logits/chosen": -6.883273124694824,
"logits/rejected": -6.9658308029174805,
"logps/chosen": -53.4930534362793,
"logps/rejected": -74.64016723632812,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12911777198314667,
"rewards/margins": 2.201110363006592,
"rewards/rejected": -2.0719926357269287,
"step": 2560
},
{
"epoch": 0.9838895281933256,
"grad_norm": 0.029296875,
"learning_rate": 4.1151710866263457e-05,
"logits/chosen": -6.886050224304199,
"logits/rejected": -6.961801052093506,
"logps/chosen": -54.818687438964844,
"logps/rejected": -76.78516387939453,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06988323479890823,
"rewards/margins": 2.1951420307159424,
"rewards/rejected": -2.125258684158325,
"step": 2565
},
{
"epoch": 0.9858074415036441,
"grad_norm": 0.03271484375,
"learning_rate": 4.11135608843078e-05,
"logits/chosen": -6.865797996520996,
"logits/rejected": -6.954972267150879,
"logps/chosen": -55.82950973510742,
"logps/rejected": -76.11198425292969,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16017884016036987,
"rewards/margins": 2.1961967945098877,
"rewards/rejected": -2.036017894744873,
"step": 2570
},
{
"epoch": 0.9877253548139624,
"grad_norm": 0.038818359375,
"learning_rate": 4.107532765637733e-05,
"logits/chosen": -6.876706600189209,
"logits/rejected": -6.931038856506348,
"logps/chosen": -56.75634765625,
"logps/rejected": -77.3016128540039,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18982794880867004,
"rewards/margins": 2.195154905319214,
"rewards/rejected": -2.005326747894287,
"step": 2575
},
{
"epoch": 0.9896432681242808,
"grad_norm": 0.031982421875,
"learning_rate": 4.103701137292248e-05,
"logits/chosen": -6.9785475730896,
"logits/rejected": -7.048098564147949,
"logps/chosen": -58.41059112548828,
"logps/rejected": -78.7070083618164,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11143723875284195,
"rewards/margins": 2.19964861869812,
"rewards/rejected": -2.0882115364074707,
"step": 2580
},
{
"epoch": 0.9915611814345991,
"grad_norm": 0.091796875,
"learning_rate": 4.099861222480745e-05,
"logits/chosen": -6.864126682281494,
"logits/rejected": -6.916762351989746,
"logps/chosen": -53.8144645690918,
"logps/rejected": -74.3333740234375,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16026651859283447,
"rewards/margins": 2.200939655303955,
"rewards/rejected": -2.040672779083252,
"step": 2585
},
{
"epoch": 0.9934790947449176,
"grad_norm": 0.052001953125,
"learning_rate": 4.096013040330918e-05,
"logits/chosen": -6.825551509857178,
"logits/rejected": -6.919713020324707,
"logps/chosen": -55.239952087402344,
"logps/rejected": -76.48191833496094,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1743491291999817,
"rewards/margins": 2.1970267295837402,
"rewards/rejected": -2.0226778984069824,
"step": 2590
},
{
"epoch": 0.9953970080552359,
"grad_norm": 0.052978515625,
"learning_rate": 4.092156610011644e-05,
"logits/chosen": -6.889337062835693,
"logits/rejected": -6.97268009185791,
"logps/chosen": -53.6662712097168,
"logps/rejected": -75.61515045166016,
"loss": 0.3253,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15172195434570312,
"rewards/margins": 2.1960272789001465,
"rewards/rejected": -2.0443053245544434,
"step": 2595
},
{
"epoch": 0.9973149213655543,
"grad_norm": 0.0400390625,
"learning_rate": 4.0882919507328866e-05,
"logits/chosen": -6.899672031402588,
"logits/rejected": -6.974337100982666,
"logps/chosen": -55.918304443359375,
"logps/rejected": -78.09788513183594,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12672047317028046,
"rewards/margins": 2.198251485824585,
"rewards/rejected": -2.071531295776367,
"step": 2600
},
{
"epoch": 0.9992328346758726,
"grad_norm": 0.0308837890625,
"learning_rate": 4.084419081745601e-05,
"logits/chosen": -6.8632493019104,
"logits/rejected": -6.958096981048584,
"logps/chosen": -55.341552734375,
"logps/rejected": -77.59717559814453,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1237340122461319,
"rewards/margins": 2.197999954223633,
"rewards/rejected": -2.074265956878662,
"step": 2605
},
{
"epoch": 0.9996164173379364,
"eval_logits/chosen": -6.729816436767578,
"eval_logits/rejected": -7.007940292358398,
"eval_logps/chosen": -54.07815933227539,
"eval_logps/rejected": -76.93055725097656,
"eval_loss": 0.3251609802246094,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.13746821880340576,
"eval_rewards/margins": 2.2012293338775635,
"eval_rewards/rejected": -2.063760995864868,
"eval_runtime": 5.2222,
"eval_samples_per_second": 38.298,
"eval_steps_per_second": 38.298,
"step": 2606
},
{
"epoch": 1.001150747986191,
"grad_norm": 0.03466796875,
"learning_rate": 4.0805380223416344e-05,
"logits/chosen": -6.883546352386475,
"logits/rejected": -6.957623481750488,
"logps/chosen": -53.838043212890625,
"logps/rejected": -74.10066986083984,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22372904419898987,
"rewards/margins": 2.1984477043151855,
"rewards/rejected": -1.974718689918518,
"step": 2610
},
{
"epoch": 1.0030686612965094,
"grad_norm": 0.041259765625,
"learning_rate": 4.076648791853635e-05,
"logits/chosen": -6.892508506774902,
"logits/rejected": -6.995348930358887,
"logps/chosen": -56.61248016357422,
"logps/rejected": -78.36751556396484,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10622930526733398,
"rewards/margins": 2.1974518299102783,
"rewards/rejected": -2.0912225246429443,
"step": 2615
},
{
"epoch": 1.0049865746068278,
"grad_norm": 0.034912109375,
"learning_rate": 4.072751409654955e-05,
"logits/chosen": -6.887833595275879,
"logits/rejected": -6.990255832672119,
"logps/chosen": -53.994590759277344,
"logps/rejected": -75.97149658203125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17344704270362854,
"rewards/margins": 2.1953394412994385,
"rewards/rejected": -2.0218923091888428,
"step": 2620
},
{
"epoch": 1.006904487917146,
"grad_norm": 0.02197265625,
"learning_rate": 4.068845895159548e-05,
"logits/chosen": -6.899606227874756,
"logits/rejected": -7.0003509521484375,
"logps/chosen": -58.028724670410156,
"logps/rejected": -78.30329895019531,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08123298734426498,
"rewards/margins": 2.1976544857025146,
"rewards/rejected": -2.116421699523926,
"step": 2625
},
{
"epoch": 1.0088224012274645,
"grad_norm": 0.03662109375,
"learning_rate": 4.0649322678218804e-05,
"logits/chosen": -6.895656585693359,
"logits/rejected": -6.965889930725098,
"logps/chosen": -54.9753532409668,
"logps/rejected": -75.52403259277344,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1174411028623581,
"rewards/margins": 2.196857452392578,
"rewards/rejected": -2.079416036605835,
"step": 2630
},
{
"epoch": 1.010740314537783,
"grad_norm": 0.034423828125,
"learning_rate": 4.0610105471368305e-05,
"logits/chosen": -6.923454284667969,
"logits/rejected": -6.931260108947754,
"logps/chosen": -55.9629020690918,
"logps/rejected": -76.51974487304688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2325584888458252,
"rewards/margins": 2.194713592529297,
"rewards/rejected": -1.9621549844741821,
"step": 2635
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.0263671875,
"learning_rate": 4.057080752639591e-05,
"logits/chosen": -6.872107028961182,
"logits/rejected": -6.9589643478393555,
"logps/chosen": -54.76153564453125,
"logps/rejected": -75.4432373046875,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10465948283672333,
"rewards/margins": 2.196145534515381,
"rewards/rejected": -2.0914859771728516,
"step": 2640
},
{
"epoch": 1.0145761411584195,
"grad_norm": 0.0308837890625,
"learning_rate": 4.053142903905573e-05,
"logits/chosen": -6.88769006729126,
"logits/rejected": -6.998019218444824,
"logps/chosen": -55.536720275878906,
"logps/rejected": -77.26806640625,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11749310791492462,
"rewards/margins": 2.1989588737487793,
"rewards/rejected": -2.081465482711792,
"step": 2645
},
{
"epoch": 1.016494054468738,
"grad_norm": 0.029541015625,
"learning_rate": 4.0491970205503084e-05,
"logits/chosen": -6.833132266998291,
"logits/rejected": -6.9340500831604,
"logps/chosen": -54.23346710205078,
"logps/rejected": -74.97486877441406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15681582689285278,
"rewards/margins": 2.198915958404541,
"rewards/rejected": -2.042099952697754,
"step": 2650
},
{
"epoch": 1.0184119677790564,
"grad_norm": 0.025390625,
"learning_rate": 4.04524312222935e-05,
"logits/chosen": -6.784567356109619,
"logits/rejected": -6.9042158126831055,
"logps/chosen": -56.043975830078125,
"logps/rejected": -77.23291015625,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19256003201007843,
"rewards/margins": 2.1978323459625244,
"rewards/rejected": -2.005272150039673,
"step": 2655
},
{
"epoch": 1.0203298810893748,
"grad_norm": 0.0283203125,
"learning_rate": 4.041281228638177e-05,
"logits/chosen": -6.854004859924316,
"logits/rejected": -6.9370012283325195,
"logps/chosen": -54.6614875793457,
"logps/rejected": -76.17295837402344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12523391842842102,
"rewards/margins": 2.196631908416748,
"rewards/rejected": -2.0713982582092285,
"step": 2660
},
{
"epoch": 1.022247794399693,
"grad_norm": 0.0196533203125,
"learning_rate": 4.037311359512096e-05,
"logits/chosen": -6.878549098968506,
"logits/rejected": -6.92901086807251,
"logps/chosen": -55.35753631591797,
"logps/rejected": -75.7037582397461,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18637792766094208,
"rewards/margins": 2.1981968879699707,
"rewards/rejected": -2.011819362640381,
"step": 2665
},
{
"epoch": 1.0241657077100115,
"grad_norm": 0.037841796875,
"learning_rate": 4.0333335346261415e-05,
"logits/chosen": -6.915754795074463,
"logits/rejected": -6.975900173187256,
"logps/chosen": -58.16853713989258,
"logps/rejected": -79.80914306640625,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06968244165182114,
"rewards/margins": 2.1984660625457764,
"rewards/rejected": -2.128783702850342,
"step": 2670
},
{
"epoch": 1.02608362102033,
"grad_norm": 0.0250244140625,
"learning_rate": 4.029347773794975e-05,
"logits/chosen": -6.917555332183838,
"logits/rejected": -6.966570854187012,
"logps/chosen": -58.62860107421875,
"logps/rejected": -78.85334777832031,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09619802236557007,
"rewards/margins": 2.1974024772644043,
"rewards/rejected": -2.1012043952941895,
"step": 2675
},
{
"epoch": 1.0280015343306483,
"grad_norm": 0.030517578125,
"learning_rate": 4.025354096872794e-05,
"logits/chosen": -6.830000877380371,
"logits/rejected": -6.904738426208496,
"logps/chosen": -56.95014572143555,
"logps/rejected": -77.70689392089844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13039973378181458,
"rewards/margins": 2.1978321075439453,
"rewards/rejected": -2.067432403564453,
"step": 2680
},
{
"epoch": 1.0299194476409665,
"grad_norm": 0.025146484375,
"learning_rate": 4.0213525237532235e-05,
"logits/chosen": -6.799073219299316,
"logits/rejected": -6.8913750648498535,
"logps/chosen": -55.946311950683594,
"logps/rejected": -78.06874084472656,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11492273956537247,
"rewards/margins": 2.1941425800323486,
"rewards/rejected": -2.0792198181152344,
"step": 2685
},
{
"epoch": 1.031837360951285,
"grad_norm": 0.02001953125,
"learning_rate": 4.017343074369226e-05,
"logits/chosen": -6.870763301849365,
"logits/rejected": -6.954649448394775,
"logps/chosen": -53.58631134033203,
"logps/rejected": -74.53648376464844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20419125258922577,
"rewards/margins": 2.1963956356048584,
"rewards/rejected": -1.9922046661376953,
"step": 2690
},
{
"epoch": 1.0337552742616034,
"grad_norm": 0.02587890625,
"learning_rate": 4.0133257686929944e-05,
"logits/chosen": -6.9005842208862305,
"logits/rejected": -6.95761251449585,
"logps/chosen": -55.439735412597656,
"logps/rejected": -74.86544036865234,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24408455193042755,
"rewards/margins": 2.200181245803833,
"rewards/rejected": -1.9560966491699219,
"step": 2695
},
{
"epoch": 1.0356731875719218,
"grad_norm": 0.02880859375,
"learning_rate": 4.009300626735859e-05,
"logits/chosen": -6.9174299240112305,
"logits/rejected": -6.982752323150635,
"logps/chosen": -57.56931686401367,
"logps/rejected": -78.64064025878906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07381283491849899,
"rewards/margins": 2.198716640472412,
"rewards/rejected": -2.124903440475464,
"step": 2700
},
{
"epoch": 1.03759110088224,
"grad_norm": 0.02978515625,
"learning_rate": 4.0052676685481814e-05,
"logits/chosen": -6.861070156097412,
"logits/rejected": -6.959973335266113,
"logps/chosen": -53.57646560668945,
"logps/rejected": -74.02730560302734,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18773815035820007,
"rewards/margins": 2.191598892211914,
"rewards/rejected": -2.0038609504699707,
"step": 2705
},
{
"epoch": 1.0395090141925585,
"grad_norm": 0.0302734375,
"learning_rate": 4.001226914219261e-05,
"logits/chosen": -6.911322593688965,
"logits/rejected": -7.01156759262085,
"logps/chosen": -53.097312927246094,
"logps/rejected": -73.89768981933594,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16086049377918243,
"rewards/margins": 2.1958365440368652,
"rewards/rejected": -2.0349762439727783,
"step": 2710
},
{
"epoch": 1.041426927502877,
"grad_norm": 0.02587890625,
"learning_rate": 3.997178383877231e-05,
"logits/chosen": -6.8495354652404785,
"logits/rejected": -6.967142581939697,
"logps/chosen": -56.043922424316406,
"logps/rejected": -78.30987548828125,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15614651143550873,
"rewards/margins": 2.1998848915100098,
"rewards/rejected": -2.04373836517334,
"step": 2715
},
{
"epoch": 1.0433448408131953,
"grad_norm": 0.0286865234375,
"learning_rate": 3.993122097688959e-05,
"logits/chosen": -6.892895698547363,
"logits/rejected": -6.953802585601807,
"logps/chosen": -57.95183181762695,
"logps/rejected": -78.77163696289062,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04726942256093025,
"rewards/margins": 2.1945252418518066,
"rewards/rejected": -2.1472554206848145,
"step": 2720
},
{
"epoch": 1.0452627541235135,
"grad_norm": 0.0260009765625,
"learning_rate": 3.989058075859947e-05,
"logits/chosen": -6.83001184463501,
"logits/rejected": -6.906769752502441,
"logps/chosen": -56.60731887817383,
"logps/rejected": -76.92979431152344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11525243520736694,
"rewards/margins": 2.1981358528137207,
"rewards/rejected": -2.082883596420288,
"step": 2725
},
{
"epoch": 1.047180667433832,
"grad_norm": 0.031494140625,
"learning_rate": 3.984986338634229e-05,
"logits/chosen": -6.8577399253845215,
"logits/rejected": -6.957972526550293,
"logps/chosen": -55.03181076049805,
"logps/rejected": -75.6190414428711,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16855183243751526,
"rewards/margins": 2.197056770324707,
"rewards/rejected": -2.0285048484802246,
"step": 2730
},
{
"epoch": 1.0490985807441504,
"grad_norm": 0.0225830078125,
"learning_rate": 3.980906906294273e-05,
"logits/chosen": -6.934423923492432,
"logits/rejected": -7.023044586181641,
"logps/chosen": -54.798828125,
"logps/rejected": -76.52142333984375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1898016333580017,
"rewards/margins": 2.1955370903015137,
"rewards/rejected": -2.005735397338867,
"step": 2735
},
{
"epoch": 1.0510164940544688,
"grad_norm": 0.027099609375,
"learning_rate": 3.976819799160879e-05,
"logits/chosen": -6.8770341873168945,
"logits/rejected": -6.964162349700928,
"logps/chosen": -54.404808044433594,
"logps/rejected": -75.58030700683594,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17514730989933014,
"rewards/margins": 2.197451114654541,
"rewards/rejected": -2.022303819656372,
"step": 2740
},
{
"epoch": 1.052934407364787,
"grad_norm": 0.0301513671875,
"learning_rate": 3.9727250375930744e-05,
"logits/chosen": -6.848829746246338,
"logits/rejected": -6.935307502746582,
"logps/chosen": -55.02604293823242,
"logps/rejected": -76.05598449707031,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1925317794084549,
"rewards/margins": 2.198523998260498,
"rewards/rejected": -2.0059921741485596,
"step": 2745
},
{
"epoch": 1.0548523206751055,
"grad_norm": 0.0277099609375,
"learning_rate": 3.96862264198802e-05,
"logits/chosen": -6.894994258880615,
"logits/rejected": -6.986319065093994,
"logps/chosen": -57.370811462402344,
"logps/rejected": -79.54820251464844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.026093292981386185,
"rewards/margins": 2.2008121013641357,
"rewards/rejected": -2.1747188568115234,
"step": 2750
},
{
"epoch": 1.056770233985424,
"grad_norm": 0.0230712890625,
"learning_rate": 3.9645126327808997e-05,
"logits/chosen": -6.829963684082031,
"logits/rejected": -6.898384094238281,
"logps/chosen": -50.599239349365234,
"logps/rejected": -71.79393768310547,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.26723015308380127,
"rewards/margins": 2.201087713241577,
"rewards/rejected": -1.9338576793670654,
"step": 2755
},
{
"epoch": 1.0586881472957423,
"grad_norm": 0.0302734375,
"learning_rate": 3.960395030444826e-05,
"logits/chosen": -6.888216972351074,
"logits/rejected": -6.947674751281738,
"logps/chosen": -55.1099739074707,
"logps/rejected": -76.78738403320312,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09314815700054169,
"rewards/margins": 2.196256399154663,
"rewards/rejected": -2.1031081676483154,
"step": 2760
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.027099609375,
"learning_rate": 3.9562698554907324e-05,
"logits/chosen": -6.857504367828369,
"logits/rejected": -6.95501184463501,
"logps/chosen": -51.588653564453125,
"logps/rejected": -73.61457061767578,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19752810895442963,
"rewards/margins": 2.198362350463867,
"rewards/rejected": -2.0008342266082764,
"step": 2765
},
{
"epoch": 1.062523973916379,
"grad_norm": 0.0145263671875,
"learning_rate": 3.952137128467275e-05,
"logits/chosen": -6.8695220947265625,
"logits/rejected": -6.959606170654297,
"logps/chosen": -56.88529586791992,
"logps/rejected": -78.67168426513672,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.061385512351989746,
"rewards/margins": 2.1960301399230957,
"rewards/rejected": -2.1346447467803955,
"step": 2770
},
{
"epoch": 1.0644418872266974,
"grad_norm": 0.021728515625,
"learning_rate": 3.947996869960729e-05,
"logits/chosen": -6.845946311950684,
"logits/rejected": -6.92853307723999,
"logps/chosen": -52.496307373046875,
"logps/rejected": -75.715576171875,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17457488179206848,
"rewards/margins": 2.196068286895752,
"rewards/rejected": -2.021493434906006,
"step": 2775
},
{
"epoch": 1.0663598005370156,
"grad_norm": 0.024658203125,
"learning_rate": 3.943849100594886e-05,
"logits/chosen": -6.955491542816162,
"logits/rejected": -7.005650997161865,
"logps/chosen": -56.628509521484375,
"logps/rejected": -78.13602447509766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04586848244071007,
"rewards/margins": 2.200742244720459,
"rewards/rejected": -2.1548736095428467,
"step": 2780
},
{
"epoch": 1.068277713847334,
"grad_norm": 0.017578125,
"learning_rate": 3.9396938410309515e-05,
"logits/chosen": -6.926751136779785,
"logits/rejected": -6.99490213394165,
"logps/chosen": -57.37287139892578,
"logps/rejected": -79.40666198730469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.052088893949985504,
"rewards/margins": 2.1968274116516113,
"rewards/rejected": -2.14473819732666,
"step": 2785
},
{
"epoch": 1.0701956271576525,
"grad_norm": 0.024658203125,
"learning_rate": 3.9355311119674394e-05,
"logits/chosen": -6.852726936340332,
"logits/rejected": -6.912999629974365,
"logps/chosen": -55.161170959472656,
"logps/rejected": -77.08748626708984,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1337597668170929,
"rewards/margins": 2.1965129375457764,
"rewards/rejected": -2.062753200531006,
"step": 2790
},
{
"epoch": 1.072113540467971,
"grad_norm": 0.020263671875,
"learning_rate": 3.9313609341400757e-05,
"logits/chosen": -6.858818054199219,
"logits/rejected": -6.979428291320801,
"logps/chosen": -53.97002410888672,
"logps/rejected": -76.10984802246094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22703592479228973,
"rewards/margins": 2.198535919189453,
"rewards/rejected": -1.9714998006820679,
"step": 2795
},
{
"epoch": 1.0740314537782891,
"grad_norm": 0.0264892578125,
"learning_rate": 3.9271833283216866e-05,
"logits/chosen": -6.864724159240723,
"logits/rejected": -6.9316606521606445,
"logps/chosen": -55.78950881958008,
"logps/rejected": -78.62726593017578,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14562520384788513,
"rewards/margins": 2.197704315185547,
"rewards/rejected": -2.0520787239074707,
"step": 2800
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.031494140625,
"learning_rate": 3.922998315322101e-05,
"logits/chosen": -6.889589786529541,
"logits/rejected": -6.990285396575928,
"logps/chosen": -52.906951904296875,
"logps/rejected": -73.70419311523438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.26940232515335083,
"rewards/margins": 2.2004714012145996,
"rewards/rejected": -1.9310691356658936,
"step": 2805
},
{
"epoch": 1.077867280398926,
"grad_norm": 0.0196533203125,
"learning_rate": 3.9188059159880463e-05,
"logits/chosen": -6.812591552734375,
"logits/rejected": -6.900485038757324,
"logps/chosen": -59.0921745300293,
"logps/rejected": -80.58056640625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08650780469179153,
"rewards/margins": 2.1971893310546875,
"rewards/rejected": -2.1106815338134766,
"step": 2810
},
{
"epoch": 1.0797851937092444,
"grad_norm": 0.02197265625,
"learning_rate": 3.9146061512030415e-05,
"logits/chosen": -6.838484764099121,
"logits/rejected": -6.911144256591797,
"logps/chosen": -56.846275329589844,
"logps/rejected": -79.10497283935547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.053564321249723434,
"rewards/margins": 2.197330951690674,
"rewards/rejected": -2.1437668800354004,
"step": 2815
},
{
"epoch": 1.0817031070195626,
"grad_norm": 0.0269775390625,
"learning_rate": 3.910399041887295e-05,
"logits/chosen": -6.862179756164551,
"logits/rejected": -6.935473442077637,
"logps/chosen": -53.73378372192383,
"logps/rejected": -75.07929229736328,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15086905658245087,
"rewards/margins": 2.197706699371338,
"rewards/rejected": -2.046837568283081,
"step": 2820
},
{
"epoch": 1.083621020329881,
"grad_norm": 0.017578125,
"learning_rate": 3.9061846089976005e-05,
"logits/chosen": -6.84485387802124,
"logits/rejected": -6.956694602966309,
"logps/chosen": -56.87104034423828,
"logps/rejected": -77.37132263183594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12146338075399399,
"rewards/margins": 2.1992440223693848,
"rewards/rejected": -2.0777807235717773,
"step": 2825
},
{
"epoch": 1.0855389336401995,
"grad_norm": 0.0159912109375,
"learning_rate": 3.901962873527233e-05,
"logits/chosen": -6.868411064147949,
"logits/rejected": -6.946302890777588,
"logps/chosen": -54.27248001098633,
"logps/rejected": -76.27902221679688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18422341346740723,
"rewards/margins": 2.1939854621887207,
"rewards/rejected": -2.0097622871398926,
"step": 2830
},
{
"epoch": 1.087456846950518,
"grad_norm": 0.024658203125,
"learning_rate": 3.897733856505845e-05,
"logits/chosen": -6.850857734680176,
"logits/rejected": -6.960956573486328,
"logps/chosen": -57.13166046142578,
"logps/rejected": -78.78927612304688,
"loss": 0.3252,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.019278164952993393,
"rewards/margins": 2.1967427730560303,
"rewards/rejected": -2.1774649620056152,
"step": 2835
},
{
"epoch": 1.0893747602608361,
"grad_norm": 0.024169921875,
"learning_rate": 3.893497578999357e-05,
"logits/chosen": -6.8516998291015625,
"logits/rejected": -6.964435577392578,
"logps/chosen": -56.83503341674805,
"logps/rejected": -77.79374694824219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13902735710144043,
"rewards/margins": 2.1968703269958496,
"rewards/rejected": -2.05784273147583,
"step": 2840
},
{
"epoch": 1.0912926735711546,
"grad_norm": 0.0224609375,
"learning_rate": 3.8892540621098594e-05,
"logits/chosen": -6.85550594329834,
"logits/rejected": -6.9149274826049805,
"logps/chosen": -58.30366134643555,
"logps/rejected": -79.84410095214844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04130606725811958,
"rewards/margins": 2.1934211254119873,
"rewards/rejected": -2.1521153450012207,
"step": 2845
},
{
"epoch": 1.093210586881473,
"grad_norm": 0.0220947265625,
"learning_rate": 3.885003326975502e-05,
"logits/chosen": -6.817744255065918,
"logits/rejected": -6.904561519622803,
"logps/chosen": -51.51453399658203,
"logps/rejected": -73.3747329711914,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24189969897270203,
"rewards/margins": 2.1980557441711426,
"rewards/rejected": -1.9561560153961182,
"step": 2850
},
{
"epoch": 1.0951285001917914,
"grad_norm": 0.02490234375,
"learning_rate": 3.880745394770392e-05,
"logits/chosen": -6.877533912658691,
"logits/rejected": -6.920339107513428,
"logps/chosen": -59.07216262817383,
"logps/rejected": -80.83231353759766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.059574197977781296,
"rewards/margins": 2.1979880332946777,
"rewards/rejected": -2.138413429260254,
"step": 2855
},
{
"epoch": 1.0970464135021096,
"grad_norm": 0.0244140625,
"learning_rate": 3.876480286704484e-05,
"logits/chosen": -6.807048797607422,
"logits/rejected": -6.901270389556885,
"logps/chosen": -53.65718460083008,
"logps/rejected": -74.63452911376953,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1505444347858429,
"rewards/margins": 2.1971020698547363,
"rewards/rejected": -2.0465571880340576,
"step": 2860
},
{
"epoch": 1.098964326812428,
"grad_norm": 0.0234375,
"learning_rate": 3.872208024023484e-05,
"logits/chosen": -6.927255153656006,
"logits/rejected": -6.956273555755615,
"logps/chosen": -53.692848205566406,
"logps/rejected": -76.09283447265625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15065941214561462,
"rewards/margins": 2.201733112335205,
"rewards/rejected": -2.0510735511779785,
"step": 2865
},
{
"epoch": 1.1008822401227465,
"grad_norm": 0.035888671875,
"learning_rate": 3.867928628008731e-05,
"logits/chosen": -6.816448211669922,
"logits/rejected": -6.911751747131348,
"logps/chosen": -55.2706184387207,
"logps/rejected": -76.91694641113281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16313551366329193,
"rewards/margins": 2.1990082263946533,
"rewards/rejected": -2.035872459411621,
"step": 2870
},
{
"epoch": 1.102800153433065,
"grad_norm": 0.0303955078125,
"learning_rate": 3.8636421199771e-05,
"logits/chosen": -6.858266353607178,
"logits/rejected": -6.889333248138428,
"logps/chosen": -53.993865966796875,
"logps/rejected": -75.5336685180664,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1297408640384674,
"rewards/margins": 2.1988682746887207,
"rewards/rejected": -2.0691275596618652,
"step": 2875
},
{
"epoch": 1.1047180667433831,
"grad_norm": 0.0186767578125,
"learning_rate": 3.859348521280892e-05,
"logits/chosen": -6.896686553955078,
"logits/rejected": -6.956762790679932,
"logps/chosen": -53.74113082885742,
"logps/rejected": -75.0785903930664,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1880251169204712,
"rewards/margins": 2.1969103813171387,
"rewards/rejected": -2.008885622024536,
"step": 2880
},
{
"epoch": 1.1066359800537016,
"grad_norm": 0.0196533203125,
"learning_rate": 3.855047853307728e-05,
"logits/chosen": -6.921117305755615,
"logits/rejected": -7.01132345199585,
"logps/chosen": -54.68280792236328,
"logps/rejected": -76.28700256347656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14484816789627075,
"rewards/margins": 2.1998002529144287,
"rewards/rejected": -2.0549521446228027,
"step": 2885
},
{
"epoch": 1.10855389336402,
"grad_norm": 0.018798828125,
"learning_rate": 3.850740137480447e-05,
"logits/chosen": -6.905709743499756,
"logits/rejected": -6.965888977050781,
"logps/chosen": -58.196983337402344,
"logps/rejected": -79.59626770019531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11491873115301132,
"rewards/margins": 2.1975598335266113,
"rewards/rejected": -2.082641363143921,
"step": 2890
},
{
"epoch": 1.1104718066743384,
"grad_norm": 0.0172119140625,
"learning_rate": 3.846425395256989e-05,
"logits/chosen": -6.784145355224609,
"logits/rejected": -6.9189653396606445,
"logps/chosen": -52.5921516418457,
"logps/rejected": -74.90009307861328,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18322737514972687,
"rewards/margins": 2.1953601837158203,
"rewards/rejected": -2.0121326446533203,
"step": 2895
},
{
"epoch": 1.1123897199846566,
"grad_norm": 0.0184326171875,
"learning_rate": 3.8421036481303e-05,
"logits/chosen": -6.781815528869629,
"logits/rejected": -6.881045341491699,
"logps/chosen": -53.2947998046875,
"logps/rejected": -74.60124206542969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14246733486652374,
"rewards/margins": 2.1933109760284424,
"rewards/rejected": -2.0508437156677246,
"step": 2900
},
{
"epoch": 1.114307633294975,
"grad_norm": 0.0205078125,
"learning_rate": 3.837774917628218e-05,
"logits/chosen": -6.8118181228637695,
"logits/rejected": -6.875295162200928,
"logps/chosen": -54.65224075317383,
"logps/rejected": -75.99118041992188,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14632579684257507,
"rewards/margins": 2.1963367462158203,
"rewards/rejected": -2.050011157989502,
"step": 2905
},
{
"epoch": 1.1162255466052935,
"grad_norm": 0.0206298828125,
"learning_rate": 3.833439225313362e-05,
"logits/chosen": -6.832925319671631,
"logits/rejected": -6.927382469177246,
"logps/chosen": -54.1140251159668,
"logps/rejected": -75.5179214477539,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12394438683986664,
"rewards/margins": 2.195122480392456,
"rewards/rejected": -2.0711779594421387,
"step": 2910
},
{
"epoch": 1.1181434599156117,
"grad_norm": 0.0250244140625,
"learning_rate": 3.829096592783039e-05,
"logits/chosen": -6.910910129547119,
"logits/rejected": -6.9941840171813965,
"logps/chosen": -54.79912185668945,
"logps/rejected": -77.29903411865234,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10858787596225739,
"rewards/margins": 2.1974599361419678,
"rewards/rejected": -2.088872194290161,
"step": 2915
},
{
"epoch": 1.1200613732259301,
"grad_norm": 0.0145263671875,
"learning_rate": 3.82474704166912e-05,
"logits/chosen": -6.777789115905762,
"logits/rejected": -6.893073081970215,
"logps/chosen": -57.53221893310547,
"logps/rejected": -78.73521423339844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1271776258945465,
"rewards/margins": 2.195502758026123,
"rewards/rejected": -2.0683252811431885,
"step": 2920
},
{
"epoch": 1.1219792865362486,
"grad_norm": 0.0244140625,
"learning_rate": 3.8203905936379415e-05,
"logits/chosen": -6.841919898986816,
"logits/rejected": -6.939420223236084,
"logps/chosen": -55.87000274658203,
"logps/rejected": -76.84537506103516,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1004914790391922,
"rewards/margins": 2.1957545280456543,
"rewards/rejected": -2.0952632427215576,
"step": 2925
},
{
"epoch": 1.123897199846567,
"grad_norm": 0.0250244140625,
"learning_rate": 3.8160272703901975e-05,
"logits/chosen": -6.862138271331787,
"logits/rejected": -6.949643135070801,
"logps/chosen": -57.05035400390625,
"logps/rejected": -77.7467041015625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13071158528327942,
"rewards/margins": 2.197261333465576,
"rewards/rejected": -2.06654953956604,
"step": 2930
},
{
"epoch": 1.1258151131568854,
"grad_norm": 0.0263671875,
"learning_rate": 3.8116570936608245e-05,
"logits/chosen": -6.812536716461182,
"logits/rejected": -6.889588832855225,
"logps/chosen": -58.20977783203125,
"logps/rejected": -79.45494079589844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08483082056045532,
"rewards/margins": 2.194852352142334,
"rewards/rejected": -2.1100213527679443,
"step": 2935
},
{
"epoch": 1.1277330264672036,
"grad_norm": 0.01513671875,
"learning_rate": 3.807280085218904e-05,
"logits/chosen": -6.816141605377197,
"logits/rejected": -6.902308464050293,
"logps/chosen": -53.630462646484375,
"logps/rejected": -75.4277114868164,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20424386858940125,
"rewards/margins": 2.1985929012298584,
"rewards/rejected": -1.9943492412567139,
"step": 2940
},
{
"epoch": 1.129650939777522,
"grad_norm": 0.0208740234375,
"learning_rate": 3.8028962668675436e-05,
"logits/chosen": -6.88754940032959,
"logits/rejected": -6.952759742736816,
"logps/chosen": -55.587440490722656,
"logps/rejected": -77.20613098144531,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1409328579902649,
"rewards/margins": 2.198951005935669,
"rewards/rejected": -2.058018207550049,
"step": 2945
},
{
"epoch": 1.1315688530878405,
"grad_norm": 0.03466796875,
"learning_rate": 3.7985056604437754e-05,
"logits/chosen": -6.822312355041504,
"logits/rejected": -6.915589332580566,
"logps/chosen": -56.829490661621094,
"logps/rejected": -77.65740966796875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11516109853982925,
"rewards/margins": 2.199615240097046,
"rewards/rejected": -2.084454298019409,
"step": 2950
},
{
"epoch": 1.1334867663981587,
"grad_norm": 0.023193359375,
"learning_rate": 3.794108287818444e-05,
"logits/chosen": -6.859801292419434,
"logits/rejected": -6.943611145019531,
"logps/chosen": -53.40679931640625,
"logps/rejected": -73.86637878417969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18870747089385986,
"rewards/margins": 2.196364164352417,
"rewards/rejected": -2.0076565742492676,
"step": 2955
},
{
"epoch": 1.1354046797084771,
"grad_norm": 0.0177001953125,
"learning_rate": 3.7897041708961e-05,
"logits/chosen": -6.775750637054443,
"logits/rejected": -6.829716682434082,
"logps/chosen": -56.192054748535156,
"logps/rejected": -77.07792663574219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16598518192768097,
"rewards/margins": 2.196262836456299,
"rewards/rejected": -2.0302774906158447,
"step": 2960
},
{
"epoch": 1.1373225930187956,
"grad_norm": 0.014892578125,
"learning_rate": 3.785293331614884e-05,
"logits/chosen": -6.888825416564941,
"logits/rejected": -6.977200984954834,
"logps/chosen": -58.45457077026367,
"logps/rejected": -80.14210510253906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06244196370244026,
"rewards/margins": 2.196946620941162,
"rewards/rejected": -2.134504795074463,
"step": 2965
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.01806640625,
"learning_rate": 3.78087579194643e-05,
"logits/chosen": -6.927948951721191,
"logits/rejected": -6.9935150146484375,
"logps/chosen": -55.169342041015625,
"logps/rejected": -77.38768005371094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09456191211938858,
"rewards/margins": 2.1975150108337402,
"rewards/rejected": -2.1029531955718994,
"step": 2970
},
{
"epoch": 1.1411584196394322,
"grad_norm": 0.0263671875,
"learning_rate": 3.7764515738957434e-05,
"logits/chosen": -6.886633396148682,
"logits/rejected": -6.958956718444824,
"logps/chosen": -55.48567581176758,
"logps/rejected": -77.4232177734375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1457061469554901,
"rewards/margins": 2.1986184120178223,
"rewards/rejected": -2.0529122352600098,
"step": 2975
},
{
"epoch": 1.1430763329497506,
"grad_norm": 0.0185546875,
"learning_rate": 3.772020699501098e-05,
"logits/chosen": -6.843485355377197,
"logits/rejected": -6.9481916427612305,
"logps/chosen": -53.60565948486328,
"logps/rejected": -75.5847396850586,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21644556522369385,
"rewards/margins": 2.1974103450775146,
"rewards/rejected": -1.9809648990631104,
"step": 2980
},
{
"epoch": 1.144994246260069,
"grad_norm": 0.0179443359375,
"learning_rate": 3.7675831908339234e-05,
"logits/chosen": -6.9442949295043945,
"logits/rejected": -6.987727165222168,
"logps/chosen": -55.760963439941406,
"logps/rejected": -76.97257995605469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08401085436344147,
"rewards/margins": 2.1952691078186035,
"rewards/rejected": -2.1112582683563232,
"step": 2985
},
{
"epoch": 1.1469121595703875,
"grad_norm": 0.0172119140625,
"learning_rate": 3.763139069998699e-05,
"logits/chosen": -6.921295166015625,
"logits/rejected": -6.971589088439941,
"logps/chosen": -57.0744514465332,
"logps/rejected": -77.71913146972656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1222480982542038,
"rewards/margins": 2.1965291500091553,
"rewards/rejected": -2.0742809772491455,
"step": 2990
},
{
"epoch": 1.1488300728807057,
"grad_norm": 0.024658203125,
"learning_rate": 3.7586883591328396e-05,
"logits/chosen": -6.899412631988525,
"logits/rejected": -6.98648738861084,
"logps/chosen": -53.3134765625,
"logps/rejected": -73.20283508300781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2066071331501007,
"rewards/margins": 2.1967947483062744,
"rewards/rejected": -1.9901872873306274,
"step": 2995
},
{
"epoch": 1.1507479861910241,
"grad_norm": 0.017822265625,
"learning_rate": 3.7542310804065875e-05,
"logits/chosen": -6.8181610107421875,
"logits/rejected": -6.9312543869018555,
"logps/chosen": -53.06462860107422,
"logps/rejected": -74.0704345703125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2070445716381073,
"rewards/margins": 2.197538137435913,
"rewards/rejected": -1.9904934167861938,
"step": 3000
},
{
"epoch": 1.1507479861910241,
"eval_logits/chosen": -6.733577728271484,
"eval_logits/rejected": -7.0103278160095215,
"eval_logps/chosen": -54.098121643066406,
"eval_logps/rejected": -76.94671630859375,
"eval_loss": 0.32510754466056824,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.13547228276729584,
"eval_rewards/margins": 2.2008495330810547,
"eval_rewards/rejected": -2.0653772354125977,
"eval_runtime": 5.247,
"eval_samples_per_second": 38.117,
"eval_steps_per_second": 38.117,
"step": 3000
},
{
"epoch": 1.1526658995013426,
"grad_norm": 0.0213623046875,
"learning_rate": 3.7497672560229e-05,
"logits/chosen": -6.835085391998291,
"logits/rejected": -6.877735137939453,
"logps/chosen": -56.40270233154297,
"logps/rejected": -77.52452087402344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18680231273174286,
"rewards/margins": 2.198249340057373,
"rewards/rejected": -2.011446714401245,
"step": 3005
},
{
"epoch": 1.154583812811661,
"grad_norm": 0.013916015625,
"learning_rate": 3.745296908217341e-05,
"logits/chosen": -6.860623359680176,
"logits/rejected": -6.987900733947754,
"logps/chosen": -55.96862030029297,
"logps/rejected": -76.41857147216797,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16915419697761536,
"rewards/margins": 2.1973133087158203,
"rewards/rejected": -2.0281593799591064,
"step": 3010
},
{
"epoch": 1.1565017261219792,
"grad_norm": 0.0205078125,
"learning_rate": 3.74082005925797e-05,
"logits/chosen": -6.845609188079834,
"logits/rejected": -6.985723972320557,
"logps/chosen": -49.98345947265625,
"logps/rejected": -71.40119934082031,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.28592175245285034,
"rewards/margins": 2.197575330734253,
"rewards/rejected": -1.911653757095337,
"step": 3015
},
{
"epoch": 1.1584196394322976,
"grad_norm": 0.0223388671875,
"learning_rate": 3.7363367314452307e-05,
"logits/chosen": -6.875467777252197,
"logits/rejected": -6.945004463195801,
"logps/chosen": -54.1942024230957,
"logps/rejected": -76.0034408569336,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11485148966312408,
"rewards/margins": 2.1981101036071777,
"rewards/rejected": -2.083258867263794,
"step": 3020
},
{
"epoch": 1.160337552742616,
"grad_norm": 0.019287109375,
"learning_rate": 3.731846947111837e-05,
"logits/chosen": -6.876239776611328,
"logits/rejected": -6.9427642822265625,
"logps/chosen": -58.8173828125,
"logps/rejected": -80.17155456542969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1200895681977272,
"rewards/margins": 2.1984190940856934,
"rewards/rejected": -2.07832932472229,
"step": 3025
},
{
"epoch": 1.1622554660529345,
"grad_norm": 0.0157470703125,
"learning_rate": 3.7273507286226684e-05,
"logits/chosen": -6.8489837646484375,
"logits/rejected": -6.969413757324219,
"logps/chosen": -56.049156188964844,
"logps/rejected": -77.49417114257812,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1288760006427765,
"rewards/margins": 2.1958889961242676,
"rewards/rejected": -2.0670130252838135,
"step": 3030
},
{
"epoch": 1.1641733793632527,
"grad_norm": 0.0157470703125,
"learning_rate": 3.722848098374653e-05,
"logits/chosen": -6.9269609451293945,
"logits/rejected": -6.994809150695801,
"logps/chosen": -55.518402099609375,
"logps/rejected": -76.10893249511719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1681075096130371,
"rewards/margins": 2.1947579383850098,
"rewards/rejected": -2.0266504287719727,
"step": 3035
},
{
"epoch": 1.1660912926735711,
"grad_norm": 0.0191650390625,
"learning_rate": 3.718339078796656e-05,
"logits/chosen": -6.861593723297119,
"logits/rejected": -6.984339714050293,
"logps/chosen": -57.30218505859375,
"logps/rejected": -78.30956268310547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07483305037021637,
"rewards/margins": 2.1995906829833984,
"rewards/rejected": -2.1247572898864746,
"step": 3040
},
{
"epoch": 1.1680092059838896,
"grad_norm": 0.01611328125,
"learning_rate": 3.7138236923493745e-05,
"logits/chosen": -6.8836259841918945,
"logits/rejected": -6.949560642242432,
"logps/chosen": -58.995750427246094,
"logps/rejected": -79.87813568115234,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1622563898563385,
"rewards/margins": 2.1966044902801514,
"rewards/rejected": -2.034348249435425,
"step": 3045
},
{
"epoch": 1.1699271192942078,
"grad_norm": 0.0189208984375,
"learning_rate": 3.709301961525215e-05,
"logits/chosen": -6.907643795013428,
"logits/rejected": -6.970582008361816,
"logps/chosen": -52.92273712158203,
"logps/rejected": -74.26122283935547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12806203961372375,
"rewards/margins": 2.198936700820923,
"rewards/rejected": -2.0708744525909424,
"step": 3050
},
{
"epoch": 1.1718450326045262,
"grad_norm": 0.0206298828125,
"learning_rate": 3.7047739088481896e-05,
"logits/chosen": -6.881479740142822,
"logits/rejected": -6.99112606048584,
"logps/chosen": -56.346168518066406,
"logps/rejected": -77.93466186523438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11438564956188202,
"rewards/margins": 2.197819709777832,
"rewards/rejected": -2.0834341049194336,
"step": 3055
},
{
"epoch": 1.1737629459148446,
"grad_norm": 0.01544189453125,
"learning_rate": 3.700239556873803e-05,
"logits/chosen": -6.802818298339844,
"logits/rejected": -6.9134674072265625,
"logps/chosen": -55.84782028198242,
"logps/rejected": -76.99220275878906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14138475060462952,
"rewards/margins": 2.1987390518188477,
"rewards/rejected": -2.05735445022583,
"step": 3060
},
{
"epoch": 1.175680859225163,
"grad_norm": 0.01446533203125,
"learning_rate": 3.695698928188936e-05,
"logits/chosen": -6.930673122406006,
"logits/rejected": -7.025910377502441,
"logps/chosen": -55.19673538208008,
"logps/rejected": -77.19384765625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08800756931304932,
"rewards/margins": 2.198402166366577,
"rewards/rejected": -2.1103944778442383,
"step": 3065
},
{
"epoch": 1.1775987725354815,
"grad_norm": 0.0159912109375,
"learning_rate": 3.6911520454117364e-05,
"logits/chosen": -6.800990104675293,
"logits/rejected": -6.837213039398193,
"logps/chosen": -52.48417282104492,
"logps/rejected": -75.27742767333984,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22398178279399872,
"rewards/margins": 2.1966872215270996,
"rewards/rejected": -1.9727054834365845,
"step": 3070
},
{
"epoch": 1.1795166858457997,
"grad_norm": 0.014892578125,
"learning_rate": 3.686598931191506e-05,
"logits/chosen": -6.877493858337402,
"logits/rejected": -6.961717128753662,
"logps/chosen": -55.0633659362793,
"logps/rejected": -74.63704681396484,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1815473586320877,
"rewards/margins": 2.1940340995788574,
"rewards/rejected": -2.0124869346618652,
"step": 3075
},
{
"epoch": 1.1814345991561181,
"grad_norm": 0.0196533203125,
"learning_rate": 3.6820396082085854e-05,
"logits/chosen": -6.874161720275879,
"logits/rejected": -6.961171627044678,
"logps/chosen": -55.867698669433594,
"logps/rejected": -77.20703125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09498462826013565,
"rewards/margins": 2.1980140209198,
"rewards/rejected": -2.103029489517212,
"step": 3080
},
{
"epoch": 1.1833525124664366,
"grad_norm": 0.019287109375,
"learning_rate": 3.6774740991742456e-05,
"logits/chosen": -6.843630790710449,
"logits/rejected": -6.964916229248047,
"logps/chosen": -54.132240295410156,
"logps/rejected": -74.72636413574219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1745484322309494,
"rewards/margins": 2.1994543075561523,
"rewards/rejected": -2.0249056816101074,
"step": 3085
},
{
"epoch": 1.1852704257767548,
"grad_norm": 0.0177001953125,
"learning_rate": 3.672902426830569e-05,
"logits/chosen": -6.85568904876709,
"logits/rejected": -6.929766654968262,
"logps/chosen": -57.98683547973633,
"logps/rejected": -79.25408172607422,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13423751294612885,
"rewards/margins": 2.1988065242767334,
"rewards/rejected": -2.0645689964294434,
"step": 3090
},
{
"epoch": 1.1871883390870732,
"grad_norm": 0.019287109375,
"learning_rate": 3.668324613950339e-05,
"logits/chosen": -6.860760688781738,
"logits/rejected": -6.9167962074279785,
"logps/chosen": -56.801109313964844,
"logps/rejected": -76.44920349121094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12205035984516144,
"rewards/margins": 2.196124315261841,
"rewards/rejected": -2.0740737915039062,
"step": 3095
},
{
"epoch": 1.1891062523973916,
"grad_norm": 0.02294921875,
"learning_rate": 3.6637406833369336e-05,
"logits/chosen": -6.864434719085693,
"logits/rejected": -6.943790435791016,
"logps/chosen": -55.60688400268555,
"logps/rejected": -75.59327697753906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20054812729358673,
"rewards/margins": 2.1957311630249023,
"rewards/rejected": -1.9951833486557007,
"step": 3100
},
{
"epoch": 1.19102416570771,
"grad_norm": 0.0206298828125,
"learning_rate": 3.659150657824194e-05,
"logits/chosen": -6.883216857910156,
"logits/rejected": -6.926869869232178,
"logps/chosen": -54.942848205566406,
"logps/rejected": -75.64328002929688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.25306662917137146,
"rewards/margins": 2.1989903450012207,
"rewards/rejected": -1.9459235668182373,
"step": 3105
},
{
"epoch": 1.1929420790180283,
"grad_norm": 0.023193359375,
"learning_rate": 3.6545545602763296e-05,
"logits/chosen": -6.765725612640381,
"logits/rejected": -6.863596439361572,
"logps/chosen": -51.03193664550781,
"logps/rejected": -72.6492919921875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24340319633483887,
"rewards/margins": 2.1984338760375977,
"rewards/rejected": -1.9550306797027588,
"step": 3110
},
{
"epoch": 1.1948599923283467,
"grad_norm": 0.0223388671875,
"learning_rate": 3.649952413587796e-05,
"logits/chosen": -6.795783042907715,
"logits/rejected": -6.881206512451172,
"logps/chosen": -59.676666259765625,
"logps/rejected": -80.19944763183594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1134626716375351,
"rewards/margins": 2.1996171474456787,
"rewards/rejected": -2.0861546993255615,
"step": 3115
},
{
"epoch": 1.1967779056386652,
"grad_norm": 0.022705078125,
"learning_rate": 3.645344240683176e-05,
"logits/chosen": -6.884184837341309,
"logits/rejected": -6.944088935852051,
"logps/chosen": -59.5294303894043,
"logps/rejected": -81.12696838378906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05250592157244682,
"rewards/margins": 2.197688102722168,
"rewards/rejected": -2.1451821327209473,
"step": 3120
},
{
"epoch": 1.1986958189489836,
"grad_norm": 0.0211181640625,
"learning_rate": 3.640730064517077e-05,
"logits/chosen": -6.844605922698975,
"logits/rejected": -6.9427008628845215,
"logps/chosen": -58.22944259643555,
"logps/rejected": -79.50534057617188,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05716212838888168,
"rewards/margins": 2.1979916095733643,
"rewards/rejected": -2.140829563140869,
"step": 3125
},
{
"epoch": 1.2006137322593018,
"grad_norm": 0.01446533203125,
"learning_rate": 3.636109908074006e-05,
"logits/chosen": -6.839515686035156,
"logits/rejected": -6.921526908874512,
"logps/chosen": -54.10723114013672,
"logps/rejected": -75.46257781982422,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16128435730934143,
"rewards/margins": 2.1983935832977295,
"rewards/rejected": -2.037109375,
"step": 3130
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.01708984375,
"learning_rate": 3.631483794368261e-05,
"logits/chosen": -6.844223976135254,
"logits/rejected": -6.917840480804443,
"logps/chosen": -57.75761795043945,
"logps/rejected": -78.73417663574219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14469479024410248,
"rewards/margins": 2.1967318058013916,
"rewards/rejected": -2.052037000656128,
"step": 3135
},
{
"epoch": 1.2044495588799387,
"grad_norm": 0.0196533203125,
"learning_rate": 3.626851746443816e-05,
"logits/chosen": -6.834771156311035,
"logits/rejected": -6.892405033111572,
"logps/chosen": -56.64507293701172,
"logps/rejected": -77.17357635498047,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1187446340918541,
"rewards/margins": 2.1978745460510254,
"rewards/rejected": -2.079129934310913,
"step": 3140
},
{
"epoch": 1.206367472190257,
"grad_norm": 0.0166015625,
"learning_rate": 3.622213787374202e-05,
"logits/chosen": -6.868593692779541,
"logits/rejected": -6.975184440612793,
"logps/chosen": -56.99578094482422,
"logps/rejected": -77.51200103759766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16191808879375458,
"rewards/margins": 2.1966910362243652,
"rewards/rejected": -2.0347728729248047,
"step": 3145
},
{
"epoch": 1.2082853855005753,
"grad_norm": 0.0179443359375,
"learning_rate": 3.617569940262397e-05,
"logits/chosen": -6.836582183837891,
"logits/rejected": -6.939114570617676,
"logps/chosen": -52.9292106628418,
"logps/rejected": -74.92284393310547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12345506250858307,
"rewards/margins": 2.1982033252716064,
"rewards/rejected": -2.0747480392456055,
"step": 3150
},
{
"epoch": 1.2102032988108937,
"grad_norm": 0.020263671875,
"learning_rate": 3.61292022824071e-05,
"logits/chosen": -6.889954566955566,
"logits/rejected": -6.915895938873291,
"logps/chosen": -55.65690994262695,
"logps/rejected": -75.88868713378906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11389796435832977,
"rewards/margins": 2.1961417198181152,
"rewards/rejected": -2.0822436809539795,
"step": 3155
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.017578125,
"learning_rate": 3.6082646744706615e-05,
"logits/chosen": -6.902949333190918,
"logits/rejected": -6.99478816986084,
"logps/chosen": -54.92612838745117,
"logps/rejected": -75.23097229003906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16485390067100525,
"rewards/margins": 2.195668935775757,
"rewards/rejected": -2.0308148860931396,
"step": 3160
},
{
"epoch": 1.2140391254315306,
"grad_norm": 0.02099609375,
"learning_rate": 3.6036033021428763e-05,
"logits/chosen": -6.836057186126709,
"logits/rejected": -6.9265947341918945,
"logps/chosen": -55.4571647644043,
"logps/rejected": -77.2686538696289,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1580025851726532,
"rewards/margins": 2.1961824893951416,
"rewards/rejected": -2.038179874420166,
"step": 3165
},
{
"epoch": 1.2159570387418488,
"grad_norm": 0.0157470703125,
"learning_rate": 3.598936134476957e-05,
"logits/chosen": -6.854968070983887,
"logits/rejected": -6.951455593109131,
"logps/chosen": -56.069427490234375,
"logps/rejected": -77.87905883789062,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1228158250451088,
"rewards/margins": 2.196932077407837,
"rewards/rejected": -2.0741162300109863,
"step": 3170
},
{
"epoch": 1.2178749520521672,
"grad_norm": 0.0133056640625,
"learning_rate": 3.59426319472138e-05,
"logits/chosen": -6.809521675109863,
"logits/rejected": -6.926182746887207,
"logps/chosen": -57.12538528442383,
"logps/rejected": -77.77890014648438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09594462811946869,
"rewards/margins": 2.195291042327881,
"rewards/rejected": -2.09934663772583,
"step": 3175
},
{
"epoch": 1.2197928653624857,
"grad_norm": 0.019775390625,
"learning_rate": 3.58958450615337e-05,
"logits/chosen": -6.759768486022949,
"logits/rejected": -6.831299781799316,
"logps/chosen": -56.357330322265625,
"logps/rejected": -78.47567749023438,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24715280532836914,
"rewards/margins": 2.1972153186798096,
"rewards/rejected": -1.9500625133514404,
"step": 3180
},
{
"epoch": 1.2217107786728039,
"grad_norm": 0.0166015625,
"learning_rate": 3.5849000920787916e-05,
"logits/chosen": -6.913460731506348,
"logits/rejected": -6.9834394454956055,
"logps/chosen": -54.47563934326172,
"logps/rejected": -75.08209228515625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0902893990278244,
"rewards/margins": 2.1981558799743652,
"rewards/rejected": -2.1078667640686035,
"step": 3185
},
{
"epoch": 1.2236286919831223,
"grad_norm": 0.01531982421875,
"learning_rate": 3.580209975832027e-05,
"logits/chosen": -6.87200403213501,
"logits/rejected": -6.999584197998047,
"logps/chosen": -55.20698928833008,
"logps/rejected": -77.44094848632812,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12308267503976822,
"rewards/margins": 2.196704387664795,
"rewards/rejected": -2.073621988296509,
"step": 3190
},
{
"epoch": 1.2255466052934407,
"grad_norm": 0.01483154296875,
"learning_rate": 3.5755141807758636e-05,
"logits/chosen": -6.90018367767334,
"logits/rejected": -6.980373382568359,
"logps/chosen": -58.038169860839844,
"logps/rejected": -79.75993347167969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07556581497192383,
"rewards/margins": 2.197662115097046,
"rewards/rejected": -2.122096061706543,
"step": 3195
},
{
"epoch": 1.2274645186037592,
"grad_norm": 0.0186767578125,
"learning_rate": 3.570812730301377e-05,
"logits/chosen": -6.910298824310303,
"logits/rejected": -6.948716163635254,
"logps/chosen": -58.978660583496094,
"logps/rejected": -79.23673248291016,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10630476474761963,
"rewards/margins": 2.1975457668304443,
"rewards/rejected": -2.0912413597106934,
"step": 3200
},
{
"epoch": 1.2293824319140776,
"grad_norm": 0.0166015625,
"learning_rate": 3.5661056478278125e-05,
"logits/chosen": -6.822863578796387,
"logits/rejected": -6.938515663146973,
"logps/chosen": -56.370582580566406,
"logps/rejected": -77.22187805175781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1240546703338623,
"rewards/margins": 2.1947436332702637,
"rewards/rejected": -2.0706887245178223,
"step": 3205
},
{
"epoch": 1.2313003452243958,
"grad_norm": 0.0196533203125,
"learning_rate": 3.561392956802472e-05,
"logits/chosen": -6.785411834716797,
"logits/rejected": -6.905782222747803,
"logps/chosen": -52.2618408203125,
"logps/rejected": -73.18843078613281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20731830596923828,
"rewards/margins": 2.1965816020965576,
"rewards/rejected": -1.9892632961273193,
"step": 3210
},
{
"epoch": 1.2332182585347142,
"grad_norm": 0.0179443359375,
"learning_rate": 3.556674680700593e-05,
"logits/chosen": -6.823803901672363,
"logits/rejected": -6.908896446228027,
"logps/chosen": -53.1720085144043,
"logps/rejected": -75.19302368164062,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1953558772802353,
"rewards/margins": 2.1971781253814697,
"rewards/rejected": -2.0018222332000732,
"step": 3215
},
{
"epoch": 1.2351361718450327,
"grad_norm": 0.0162353515625,
"learning_rate": 3.5519508430252364e-05,
"logits/chosen": -6.835984706878662,
"logits/rejected": -6.891526699066162,
"logps/chosen": -59.92537307739258,
"logps/rejected": -81.62738800048828,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04124726727604866,
"rewards/margins": 2.1987502574920654,
"rewards/rejected": -2.1575026512145996,
"step": 3220
},
{
"epoch": 1.2370540851553509,
"grad_norm": 0.0184326171875,
"learning_rate": 3.5472214673071645e-05,
"logits/chosen": -6.899560451507568,
"logits/rejected": -6.903540134429932,
"logps/chosen": -53.959434509277344,
"logps/rejected": -75.20841979980469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11058112233877182,
"rewards/margins": 2.197727680206299,
"rewards/rejected": -2.087146520614624,
"step": 3225
},
{
"epoch": 1.2389719984656693,
"grad_norm": 0.018310546875,
"learning_rate": 3.542486577104728e-05,
"logits/chosen": -6.880455017089844,
"logits/rejected": -6.971824645996094,
"logps/chosen": -58.014862060546875,
"logps/rejected": -78.8389892578125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1089930385351181,
"rewards/margins": 2.199493646621704,
"rewards/rejected": -2.0905003547668457,
"step": 3230
},
{
"epoch": 1.2408899117759877,
"grad_norm": 0.0177001953125,
"learning_rate": 3.5377461960037454e-05,
"logits/chosen": -6.817750453948975,
"logits/rejected": -6.916102409362793,
"logps/chosen": -56.97007369995117,
"logps/rejected": -78.2486572265625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10282842814922333,
"rewards/margins": 2.1967597007751465,
"rewards/rejected": -2.093931198120117,
"step": 3235
},
{
"epoch": 1.2428078250863062,
"grad_norm": 0.0157470703125,
"learning_rate": 3.533000347617386e-05,
"logits/chosen": -6.9070892333984375,
"logits/rejected": -6.9540510177612305,
"logps/chosen": -54.85721969604492,
"logps/rejected": -75.885986328125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13600504398345947,
"rewards/margins": 2.1978845596313477,
"rewards/rejected": -2.0618793964385986,
"step": 3240
},
{
"epoch": 1.2447257383966246,
"grad_norm": 0.01806640625,
"learning_rate": 3.5282490555860566e-05,
"logits/chosen": -6.82870626449585,
"logits/rejected": -6.905839443206787,
"logps/chosen": -58.199119567871094,
"logps/rejected": -78.6649169921875,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07048743218183517,
"rewards/margins": 2.197709321975708,
"rewards/rejected": -2.1272220611572266,
"step": 3245
},
{
"epoch": 1.2466436517069428,
"grad_norm": 0.0157470703125,
"learning_rate": 3.5234923435772776e-05,
"logits/chosen": -6.8863935470581055,
"logits/rejected": -6.9651618003845215,
"logps/chosen": -56.577064514160156,
"logps/rejected": -78.65019226074219,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006662911735475063,
"rewards/margins": 2.197619915008545,
"rewards/rejected": -2.1909565925598145,
"step": 3250
},
{
"epoch": 1.2485615650172612,
"grad_norm": 0.01434326171875,
"learning_rate": 3.518730235285569e-05,
"logits/chosen": -6.8779778480529785,
"logits/rejected": -6.99672794342041,
"logps/chosen": -54.36248779296875,
"logps/rejected": -75.02043151855469,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16153615713119507,
"rewards/margins": 2.19610857963562,
"rewards/rejected": -2.0345723628997803,
"step": 3255
},
{
"epoch": 1.2504794783275797,
"grad_norm": 0.0172119140625,
"learning_rate": 3.513962754432329e-05,
"logits/chosen": -6.871232509613037,
"logits/rejected": -6.936939239501953,
"logps/chosen": -55.50481033325195,
"logps/rejected": -77.04280090332031,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08255159109830856,
"rewards/margins": 2.1961824893951416,
"rewards/rejected": -2.113631010055542,
"step": 3260
},
{
"epoch": 1.2523973916378979,
"grad_norm": 0.0177001953125,
"learning_rate": 3.509189924765723e-05,
"logits/chosen": -6.956633567810059,
"logits/rejected": -6.980214595794678,
"logps/chosen": -55.857261657714844,
"logps/rejected": -76.93995666503906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07723622769117355,
"rewards/margins": 2.1976943016052246,
"rewards/rejected": -2.1204581260681152,
"step": 3265
},
{
"epoch": 1.2543153049482163,
"grad_norm": 0.0169677734375,
"learning_rate": 3.504411770060553e-05,
"logits/chosen": -6.8603997230529785,
"logits/rejected": -6.952916622161865,
"logps/chosen": -54.337066650390625,
"logps/rejected": -75.54595947265625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19328327476978302,
"rewards/margins": 2.196732759475708,
"rewards/rejected": -2.0034494400024414,
"step": 3270
},
{
"epoch": 1.2562332182585347,
"grad_norm": 0.01226806640625,
"learning_rate": 3.499628314118154e-05,
"logits/chosen": -6.9065842628479,
"logits/rejected": -6.992828369140625,
"logps/chosen": -53.137229919433594,
"logps/rejected": -75.12387084960938,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1256095916032791,
"rewards/margins": 2.197791576385498,
"rewards/rejected": -2.0721821784973145,
"step": 3275
},
{
"epoch": 1.2581511315688532,
"grad_norm": 0.0250244140625,
"learning_rate": 3.4948395807662644e-05,
"logits/chosen": -6.882128715515137,
"logits/rejected": -6.942866325378418,
"logps/chosen": -54.67943572998047,
"logps/rejected": -75.58345031738281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21490740776062012,
"rewards/margins": 2.196397066116333,
"rewards/rejected": -1.9814897775650024,
"step": 3280
},
{
"epoch": 1.2600690448791716,
"grad_norm": 0.015869140625,
"learning_rate": 3.490045593858909e-05,
"logits/chosen": -6.784398555755615,
"logits/rejected": -6.870619297027588,
"logps/chosen": -58.52797317504883,
"logps/rejected": -78.68760681152344,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22775070369243622,
"rewards/margins": 2.1975009441375732,
"rewards/rejected": -1.969750165939331,
"step": 3285
},
{
"epoch": 1.2619869581894898,
"grad_norm": 0.0179443359375,
"learning_rate": 3.485246377276285e-05,
"logits/chosen": -6.8864336013793945,
"logits/rejected": -6.979198455810547,
"logps/chosen": -55.7845458984375,
"logps/rejected": -76.38514709472656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07828356325626373,
"rewards/margins": 2.195582628250122,
"rewards/rejected": -2.1172988414764404,
"step": 3290
},
{
"epoch": 1.2639048714998082,
"grad_norm": 0.01611328125,
"learning_rate": 3.48044195492464e-05,
"logits/chosen": -6.8806962966918945,
"logits/rejected": -6.947422981262207,
"logps/chosen": -55.211936950683594,
"logps/rejected": -76.61866760253906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16678637266159058,
"rewards/margins": 2.1968917846679688,
"rewards/rejected": -2.0301055908203125,
"step": 3295
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.01806640625,
"learning_rate": 3.4756323507361515e-05,
"logits/chosen": -6.963173866271973,
"logits/rejected": -7.004177093505859,
"logps/chosen": -57.901634216308594,
"logps/rejected": -79.7287826538086,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07122094929218292,
"rewards/margins": 2.1957080364227295,
"rewards/rejected": -2.1244869232177734,
"step": 3300
},
{
"epoch": 1.2677406981204449,
"grad_norm": 0.018310546875,
"learning_rate": 3.47081758866881e-05,
"logits/chosen": -6.824810028076172,
"logits/rejected": -6.932721138000488,
"logps/chosen": -56.644142150878906,
"logps/rejected": -78.6021728515625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11658480018377304,
"rewards/margins": 2.1953206062316895,
"rewards/rejected": -2.078735589981079,
"step": 3305
},
{
"epoch": 1.2696586114307633,
"grad_norm": 0.019287109375,
"learning_rate": 3.465997692706299e-05,
"logits/chosen": -6.901789665222168,
"logits/rejected": -7.001550197601318,
"logps/chosen": -53.700408935546875,
"logps/rejected": -75.07813262939453,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17139770090579987,
"rewards/margins": 2.1972126960754395,
"rewards/rejected": -2.0258147716522217,
"step": 3310
},
{
"epoch": 1.2715765247410817,
"grad_norm": 0.0218505859375,
"learning_rate": 3.461172686857874e-05,
"logits/chosen": -6.829520225524902,
"logits/rejected": -6.944082736968994,
"logps/chosen": -53.95747756958008,
"logps/rejected": -75.05677795410156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16962416470050812,
"rewards/margins": 2.1952710151672363,
"rewards/rejected": -2.0256474018096924,
"step": 3315
},
{
"epoch": 1.2734944380514,
"grad_norm": 0.0162353515625,
"learning_rate": 3.456342595158247e-05,
"logits/chosen": -6.842916011810303,
"logits/rejected": -6.947360992431641,
"logps/chosen": -55.469261169433594,
"logps/rejected": -76.36644744873047,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12937235832214355,
"rewards/margins": 2.198073387145996,
"rewards/rejected": -2.0687010288238525,
"step": 3320
},
{
"epoch": 1.2754123513617184,
"grad_norm": 0.022216796875,
"learning_rate": 3.45150744166746e-05,
"logits/chosen": -6.803062438964844,
"logits/rejected": -6.875199794769287,
"logps/chosen": -60.186309814453125,
"logps/rejected": -81.22868347167969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.029813658446073532,
"rewards/margins": 2.200932025909424,
"rewards/rejected": -2.1711182594299316,
"step": 3325
},
{
"epoch": 1.2773302646720368,
"grad_norm": 0.015869140625,
"learning_rate": 3.446667250470774e-05,
"logits/chosen": -6.896180152893066,
"logits/rejected": -6.9682936668396,
"logps/chosen": -57.92467498779297,
"logps/rejected": -79.21630096435547,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07933951914310455,
"rewards/margins": 2.197868824005127,
"rewards/rejected": -2.1185295581817627,
"step": 3330
},
{
"epoch": 1.2792481779823552,
"grad_norm": 0.0196533203125,
"learning_rate": 3.441822045678542e-05,
"logits/chosen": -6.864504337310791,
"logits/rejected": -6.974801540374756,
"logps/chosen": -56.012306213378906,
"logps/rejected": -77.16068267822266,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12609338760375977,
"rewards/margins": 2.196408748626709,
"rewards/rejected": -2.0703155994415283,
"step": 3335
},
{
"epoch": 1.2811660912926737,
"grad_norm": 0.020751953125,
"learning_rate": 3.436971851426089e-05,
"logits/chosen": -6.881219387054443,
"logits/rejected": -6.962426662445068,
"logps/chosen": -54.42626190185547,
"logps/rejected": -75.70896911621094,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16146163642406464,
"rewards/margins": 2.197981595993042,
"rewards/rejected": -2.036520004272461,
"step": 3340
},
{
"epoch": 1.2830840046029919,
"grad_norm": 0.016357421875,
"learning_rate": 3.4321166918735966e-05,
"logits/chosen": -6.941650390625,
"logits/rejected": -6.986546993255615,
"logps/chosen": -57.1054573059082,
"logps/rejected": -78.90348815917969,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11171239614486694,
"rewards/margins": 2.1978182792663574,
"rewards/rejected": -2.0861058235168457,
"step": 3345
},
{
"epoch": 1.2850019179133103,
"grad_norm": 0.0179443359375,
"learning_rate": 3.4272565912059794e-05,
"logits/chosen": -6.926405906677246,
"logits/rejected": -7.031134605407715,
"logps/chosen": -55.7785758972168,
"logps/rejected": -77.1592788696289,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1397164762020111,
"rewards/margins": 2.1975929737091064,
"rewards/rejected": -2.0578763484954834,
"step": 3350
},
{
"epoch": 1.2869198312236287,
"grad_norm": 0.0174560546875,
"learning_rate": 3.4223915736327656e-05,
"logits/chosen": -6.847365379333496,
"logits/rejected": -6.972783088684082,
"logps/chosen": -54.550384521484375,
"logps/rejected": -75.249755859375,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20894715189933777,
"rewards/margins": 2.196265459060669,
"rewards/rejected": -1.9873180389404297,
"step": 3355
},
{
"epoch": 1.288837744533947,
"grad_norm": 0.019775390625,
"learning_rate": 3.417521663387974e-05,
"logits/chosen": -6.9112982749938965,
"logits/rejected": -6.933781623840332,
"logps/chosen": -56.79960250854492,
"logps/rejected": -77.83222961425781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14105048775672913,
"rewards/margins": 2.1970365047454834,
"rewards/rejected": -2.055985927581787,
"step": 3360
},
{
"epoch": 1.2907556578442654,
"grad_norm": 0.018798828125,
"learning_rate": 3.412646884729998e-05,
"logits/chosen": -6.834046840667725,
"logits/rejected": -6.908982276916504,
"logps/chosen": -56.66218948364258,
"logps/rejected": -77.36146545410156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1607808917760849,
"rewards/margins": 2.1996397972106934,
"rewards/rejected": -2.0388588905334473,
"step": 3365
},
{
"epoch": 1.2926735711545838,
"grad_norm": 0.0133056640625,
"learning_rate": 3.407767261941478e-05,
"logits/chosen": -6.865869045257568,
"logits/rejected": -6.947390556335449,
"logps/chosen": -55.22978591918945,
"logps/rejected": -76.95445251464844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16833913326263428,
"rewards/margins": 2.198389768600464,
"rewards/rejected": -2.030050754547119,
"step": 3370
},
{
"epoch": 1.2945914844649022,
"grad_norm": 0.01904296875,
"learning_rate": 3.4028828193291894e-05,
"logits/chosen": -6.868216037750244,
"logits/rejected": -6.96189022064209,
"logps/chosen": -53.82965087890625,
"logps/rejected": -75.89132690429688,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15720410645008087,
"rewards/margins": 2.197294235229492,
"rewards/rejected": -2.0400900840759277,
"step": 3375
},
{
"epoch": 1.2965093977752207,
"grad_norm": 0.01495361328125,
"learning_rate": 3.3979935812239116e-05,
"logits/chosen": -6.839072227478027,
"logits/rejected": -6.940023899078369,
"logps/chosen": -56.325706481933594,
"logps/rejected": -76.52516174316406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1907631903886795,
"rewards/margins": 2.196441888809204,
"rewards/rejected": -2.005678653717041,
"step": 3380
},
{
"epoch": 1.2984273110855389,
"grad_norm": 0.0194091796875,
"learning_rate": 3.393099571980315e-05,
"logits/chosen": -6.903067111968994,
"logits/rejected": -6.9862565994262695,
"logps/chosen": -51.87671661376953,
"logps/rejected": -74.49134826660156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12387783825397491,
"rewards/margins": 2.196930170059204,
"rewards/rejected": -2.073052406311035,
"step": 3385
},
{
"epoch": 1.3003452243958573,
"grad_norm": 0.0152587890625,
"learning_rate": 3.3882008159768344e-05,
"logits/chosen": -6.856507301330566,
"logits/rejected": -6.935732364654541,
"logps/chosen": -52.5578727722168,
"logps/rejected": -73.25543212890625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2329297810792923,
"rewards/margins": 2.1983587741851807,
"rewards/rejected": -1.9654289484024048,
"step": 3390
},
{
"epoch": 1.3022631377061757,
"grad_norm": 0.0135498046875,
"learning_rate": 3.383297337615551e-05,
"logits/chosen": -6.921877861022949,
"logits/rejected": -6.980062007904053,
"logps/chosen": -54.018531799316406,
"logps/rejected": -76.19944763183594,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14331956207752228,
"rewards/margins": 2.1956329345703125,
"rewards/rejected": -2.0523130893707275,
"step": 3395
},
{
"epoch": 1.304181051016494,
"grad_norm": 0.021728515625,
"learning_rate": 3.378389161322069e-05,
"logits/chosen": -6.900376796722412,
"logits/rejected": -6.962699890136719,
"logps/chosen": -59.96956253051758,
"logps/rejected": -81.11237335205078,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.007493323180824518,
"rewards/margins": 2.1981189250946045,
"rewards/rejected": -2.1906256675720215,
"step": 3400
},
{
"epoch": 1.3060989643268124,
"grad_norm": 0.0166015625,
"learning_rate": 3.373476311545394e-05,
"logits/chosen": -6.904867649078369,
"logits/rejected": -6.971893310546875,
"logps/chosen": -56.5262336730957,
"logps/rejected": -76.99403381347656,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11257918179035187,
"rewards/margins": 2.1982243061065674,
"rewards/rejected": -2.0856451988220215,
"step": 3405
},
{
"epoch": 1.3080168776371308,
"grad_norm": 0.015869140625,
"learning_rate": 3.368558812757811e-05,
"logits/chosen": -6.831631660461426,
"logits/rejected": -6.9506120681762695,
"logps/chosen": -60.0141716003418,
"logps/rejected": -81.53129577636719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.057561974972486496,
"rewards/margins": 2.19670033454895,
"rewards/rejected": -2.1391382217407227,
"step": 3410
},
{
"epoch": 1.3099347909474492,
"grad_norm": 0.0152587890625,
"learning_rate": 3.363636689454765e-05,
"logits/chosen": -6.874886989593506,
"logits/rejected": -6.909640312194824,
"logps/chosen": -58.28754425048828,
"logps/rejected": -79.06494140625,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.044596798717975616,
"rewards/margins": 2.1944453716278076,
"rewards/rejected": -2.149848461151123,
"step": 3415
},
{
"epoch": 1.3118527042577677,
"grad_norm": 0.01434326171875,
"learning_rate": 3.358709966154735e-05,
"logits/chosen": -6.809027194976807,
"logits/rejected": -6.925405025482178,
"logps/chosen": -53.523101806640625,
"logps/rejected": -75.57826232910156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17070826888084412,
"rewards/margins": 2.1942429542541504,
"rewards/rejected": -2.0235350131988525,
"step": 3420
},
{
"epoch": 1.3137706175680859,
"grad_norm": 0.0181884765625,
"learning_rate": 3.353778667399114e-05,
"logits/chosen": -6.857409477233887,
"logits/rejected": -6.90288782119751,
"logps/chosen": -56.48028564453125,
"logps/rejected": -77.70547485351562,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18256667256355286,
"rewards/margins": 2.197591781616211,
"rewards/rejected": -2.0150249004364014,
"step": 3425
},
{
"epoch": 1.3156885308784043,
"grad_norm": 0.0185546875,
"learning_rate": 3.348842817752088e-05,
"logits/chosen": -6.897669315338135,
"logits/rejected": -6.962693214416504,
"logps/chosen": -56.440391540527344,
"logps/rejected": -77.97578430175781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.077767513692379,
"rewards/margins": 2.2007930278778076,
"rewards/rejected": -2.1230251789093018,
"step": 3430
},
{
"epoch": 1.3176064441887227,
"grad_norm": 0.0198974609375,
"learning_rate": 3.343902441800511e-05,
"logits/chosen": -6.962320804595947,
"logits/rejected": -7.01629638671875,
"logps/chosen": -50.10385513305664,
"logps/rejected": -71.91188049316406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20694105327129364,
"rewards/margins": 2.1971373558044434,
"rewards/rejected": -1.9901962280273438,
"step": 3435
},
{
"epoch": 1.319524357499041,
"grad_norm": 0.0155029296875,
"learning_rate": 3.338957564153784e-05,
"logits/chosen": -6.870461463928223,
"logits/rejected": -6.979992866516113,
"logps/chosen": -56.268280029296875,
"logps/rejected": -77.19740295410156,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11492518335580826,
"rewards/margins": 2.1967813968658447,
"rewards/rejected": -2.0818562507629395,
"step": 3440
},
{
"epoch": 1.3214422708093594,
"grad_norm": 0.021728515625,
"learning_rate": 3.3340082094437343e-05,
"logits/chosen": -6.830872535705566,
"logits/rejected": -6.9500885009765625,
"logps/chosen": -51.35883712768555,
"logps/rejected": -75.20509338378906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17043998837471008,
"rewards/margins": 2.1950507164001465,
"rewards/rejected": -2.0246105194091797,
"step": 3445
},
{
"epoch": 1.3233601841196778,
"grad_norm": 0.0155029296875,
"learning_rate": 3.329054402324487e-05,
"logits/chosen": -6.876837730407715,
"logits/rejected": -6.953248500823975,
"logps/chosen": -53.9123649597168,
"logps/rejected": -75.28543853759766,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1824209988117218,
"rewards/margins": 2.196981906890869,
"rewards/rejected": -2.0145609378814697,
"step": 3450
},
{
"epoch": 1.325278097429996,
"grad_norm": 0.015869140625,
"learning_rate": 3.3240961674723483e-05,
"logits/chosen": -6.871293067932129,
"logits/rejected": -6.961977958679199,
"logps/chosen": -53.93735885620117,
"logps/rejected": -74.31446838378906,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18244056403636932,
"rewards/margins": 2.195777416229248,
"rewards/rejected": -2.013336658477783,
"step": 3455
},
{
"epoch": 1.3271960107403145,
"grad_norm": 0.0128173828125,
"learning_rate": 3.3191335295856795e-05,
"logits/chosen": -6.839479923248291,
"logits/rejected": -6.9421234130859375,
"logps/chosen": -52.604591369628906,
"logps/rejected": -73.18003845214844,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24276569485664368,
"rewards/margins": 2.200589179992676,
"rewards/rejected": -1.957823395729065,
"step": 3460
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.0157470703125,
"learning_rate": 3.314166513384775e-05,
"logits/chosen": -6.872336387634277,
"logits/rejected": -6.956648826599121,
"logps/chosen": -51.502952575683594,
"logps/rejected": -73.16020202636719,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21097616851329803,
"rewards/margins": 2.1991126537323,
"rewards/rejected": -1.9881365299224854,
"step": 3465
},
{
"epoch": 1.3310318373609513,
"grad_norm": 0.0123291015625,
"learning_rate": 3.309195143611739e-05,
"logits/chosen": -6.865052223205566,
"logits/rejected": -6.946972846984863,
"logps/chosen": -52.97285842895508,
"logps/rejected": -73.24884033203125,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18618887662887573,
"rewards/margins": 2.196037769317627,
"rewards/rejected": -2.0098490715026855,
"step": 3470
},
{
"epoch": 1.3329497506712698,
"grad_norm": 0.01348876953125,
"learning_rate": 3.304219445030361e-05,
"logits/chosen": -6.938165187835693,
"logits/rejected": -6.9916205406188965,
"logps/chosen": -55.28739547729492,
"logps/rejected": -76.15113830566406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13353893160820007,
"rewards/margins": 2.19685697555542,
"rewards/rejected": -2.0633180141448975,
"step": 3475
},
{
"epoch": 1.334867663981588,
"grad_norm": 0.01806640625,
"learning_rate": 3.2992394424259955e-05,
"logits/chosen": -6.886143684387207,
"logits/rejected": -6.981758117675781,
"logps/chosen": -54.95038604736328,
"logps/rejected": -75.45576477050781,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12386339902877808,
"rewards/margins": 2.197850227355957,
"rewards/rejected": -2.073986768722534,
"step": 3480
},
{
"epoch": 1.3367855772919064,
"grad_norm": 0.01287841796875,
"learning_rate": 3.294255160605433e-05,
"logits/chosen": -6.929156303405762,
"logits/rejected": -7.001688480377197,
"logps/chosen": -57.22789764404297,
"logps/rejected": -78.39673614501953,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09558825939893723,
"rewards/margins": 2.1976473331451416,
"rewards/rejected": -2.1020588874816895,
"step": 3485
},
{
"epoch": 1.3387034906022248,
"grad_norm": 0.0137939453125,
"learning_rate": 3.289266624396785e-05,
"logits/chosen": -6.8782854080200195,
"logits/rejected": -6.912919044494629,
"logps/chosen": -57.03832244873047,
"logps/rejected": -77.84246063232422,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10016844421625137,
"rewards/margins": 2.197028875350952,
"rewards/rejected": -2.096860408782959,
"step": 3490
},
{
"epoch": 1.340621403912543,
"grad_norm": 0.0157470703125,
"learning_rate": 3.28427385864935e-05,
"logits/chosen": -6.863548278808594,
"logits/rejected": -6.929646968841553,
"logps/chosen": -55.10469436645508,
"logps/rejected": -77.05055236816406,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10287058353424072,
"rewards/margins": 2.1964352130889893,
"rewards/rejected": -2.093564510345459,
"step": 3495
},
{
"epoch": 1.3425393172228615,
"grad_norm": 0.0159912109375,
"learning_rate": 3.2792768882335e-05,
"logits/chosen": -6.898591041564941,
"logits/rejected": -6.955502510070801,
"logps/chosen": -57.91131591796875,
"logps/rejected": -79.73384094238281,
"loss": 0.3251,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08750198036432266,
"rewards/margins": 2.1968607902526855,
"rewards/rejected": -2.109358787536621,
"step": 3500
},
{
"epoch": 1.3425393172228615,
"eval_logits/chosen": -6.735074043273926,
"eval_logits/rejected": -7.012642860412598,
"eval_logps/chosen": -54.117008209228516,
"eval_logps/rejected": -76.93775177001953,
"eval_loss": 0.325105220079422,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": 0.13358335196971893,
"eval_rewards/margins": 2.1980643272399902,
"eval_rewards/rejected": -2.064480781555176,
"eval_runtime": 5.3108,
"eval_samples_per_second": 37.659,
"eval_steps_per_second": 37.659,
"step": 3500
}
],
"logging_steps": 5,
"max_steps": 7821,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}