openhermes-dpo / trainer_state.json
Jenbenarye's picture
Model save
7d4c0c4 verified
{
"best_global_step": 1200,
"best_metric": 0.4862891137599945,
"best_model_checkpoint": "runs/dpo-OpenHermes-2.5-Mistral-7B-20251120-1236/checkpoints/checkpoint-1200",
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1274,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003925417075564278,
"grad_norm": 6.150045871734619,
"learning_rate": 1.9937205651491366e-05,
"logits/chosen": -3.0840773582458496,
"logits/rejected": -3.0958099365234375,
"logps/chosen": -311.7703857421875,
"logps/rejected": -290.61724853515625,
"loss": 0.6805,
"rewards/accuracies": 0.4833333492279053,
"rewards/chosen": 0.13000372052192688,
"rewards/margins": 0.03325975313782692,
"rewards/rejected": 0.09674396365880966,
"step": 5
},
{
"epoch": 0.007850834151128557,
"grad_norm": 10.357205390930176,
"learning_rate": 1.9858712715855573e-05,
"logits/chosen": -3.0053043365478516,
"logits/rejected": -3.0651307106018066,
"logps/chosen": -339.4639892578125,
"logps/rejected": -330.821533203125,
"loss": 0.6648,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5165472626686096,
"rewards/margins": 0.10410015285015106,
"rewards/rejected": 0.412447065114975,
"step": 10
},
{
"epoch": 0.011776251226692836,
"grad_norm": 5.737811088562012,
"learning_rate": 1.9780219780219784e-05,
"logits/chosen": -3.049772262573242,
"logits/rejected": -2.9993741512298584,
"logps/chosen": -339.0422668457031,
"logps/rejected": -313.1036071777344,
"loss": 0.6335,
"rewards/accuracies": 0.6458333730697632,
"rewards/chosen": 0.5449298620223999,
"rewards/margins": 0.23259714245796204,
"rewards/rejected": 0.31233277916908264,
"step": 15
},
{
"epoch": 0.015701668302257114,
"grad_norm": 5.574727535247803,
"learning_rate": 1.9701726844583988e-05,
"logits/chosen": -3.0041749477386475,
"logits/rejected": -2.9779772758483887,
"logps/chosen": -337.6470642089844,
"logps/rejected": -344.86907958984375,
"loss": 0.5883,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.6041615605354309,
"rewards/margins": 0.3823166787624359,
"rewards/rejected": 0.221844881772995,
"step": 20
},
{
"epoch": 0.019627085377821395,
"grad_norm": 5.843604564666748,
"learning_rate": 1.9623233908948195e-05,
"logits/chosen": -2.9967105388641357,
"logits/rejected": -3.058979034423828,
"logps/chosen": -295.9526062011719,
"logps/rejected": -288.1315002441406,
"loss": 0.5635,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.8546509742736816,
"rewards/margins": 0.5589786767959595,
"rewards/rejected": 0.2956722378730774,
"step": 25
},
{
"epoch": 0.023552502453385672,
"grad_norm": 5.119436264038086,
"learning_rate": 1.9544740973312402e-05,
"logits/chosen": -3.0148398876190186,
"logits/rejected": -2.98360013961792,
"logps/chosen": -310.6913146972656,
"logps/rejected": -300.73724365234375,
"loss": 0.5576,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.528806209564209,
"rewards/margins": 0.6290292739868164,
"rewards/rejected": -0.10022306442260742,
"step": 30
},
{
"epoch": 0.02747791952894995,
"grad_norm": 6.46522855758667,
"learning_rate": 1.9466248037676613e-05,
"logits/chosen": -2.9573066234588623,
"logits/rejected": -3.0088870525360107,
"logps/chosen": -308.3896789550781,
"logps/rejected": -297.2813415527344,
"loss": 0.5858,
"rewards/accuracies": 0.6541667580604553,
"rewards/chosen": 0.3321291506290436,
"rewards/margins": 0.6073407530784607,
"rewards/rejected": -0.2752116024494171,
"step": 35
},
{
"epoch": 0.03140333660451423,
"grad_norm": 5.806535720825195,
"learning_rate": 1.9387755102040817e-05,
"logits/chosen": -2.980921745300293,
"logits/rejected": -3.0631861686706543,
"logps/chosen": -328.13287353515625,
"logps/rejected": -307.7826232910156,
"loss": 0.6097,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.26903384923934937,
"rewards/margins": 0.5272954106330872,
"rewards/rejected": -0.2582615911960602,
"step": 40
},
{
"epoch": 0.03532875368007851,
"grad_norm": 5.124495506286621,
"learning_rate": 1.9309262166405024e-05,
"logits/chosen": -3.0031769275665283,
"logits/rejected": -3.003542900085449,
"logps/chosen": -307.12860107421875,
"logps/rejected": -323.902099609375,
"loss": 0.5483,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -0.10443999618291855,
"rewards/margins": 0.6921336650848389,
"rewards/rejected": -0.7965737581253052,
"step": 45
},
{
"epoch": 0.03925417075564279,
"grad_norm": 5.090153217315674,
"learning_rate": 1.923076923076923e-05,
"logits/chosen": -3.009328842163086,
"logits/rejected": -3.043778657913208,
"logps/chosen": -316.96917724609375,
"logps/rejected": -311.90960693359375,
"loss": 0.5658,
"rewards/accuracies": 0.7041667103767395,
"rewards/chosen": -0.5328065752983093,
"rewards/margins": 0.6153702139854431,
"rewards/rejected": -1.148176670074463,
"step": 50
},
{
"epoch": 0.04317958783120707,
"grad_norm": 5.527870178222656,
"learning_rate": 1.9152276295133442e-05,
"logits/chosen": -3.0257294178009033,
"logits/rejected": -3.0307681560516357,
"logps/chosen": -312.945068359375,
"logps/rejected": -302.48126220703125,
"loss": 0.5258,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -0.4231169819831848,
"rewards/margins": 0.7520371079444885,
"rewards/rejected": -1.1751540899276733,
"step": 55
},
{
"epoch": 0.047105004906771344,
"grad_norm": 5.95127010345459,
"learning_rate": 1.9073783359497646e-05,
"logits/chosen": -2.9485747814178467,
"logits/rejected": -3.040693998336792,
"logps/chosen": -323.712158203125,
"logps/rejected": -296.09710693359375,
"loss": 0.5437,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.07572325319051743,
"rewards/margins": 0.7711307406425476,
"rewards/rejected": -0.846854031085968,
"step": 60
},
{
"epoch": 0.05103042198233562,
"grad_norm": 5.20306396484375,
"learning_rate": 1.8995290423861853e-05,
"logits/chosen": -2.932573080062866,
"logits/rejected": -2.9804799556732178,
"logps/chosen": -320.690185546875,
"logps/rejected": -315.3598327636719,
"loss": 0.5424,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": 0.1321389377117157,
"rewards/margins": 0.7304352521896362,
"rewards/rejected": -0.5982962846755981,
"step": 65
},
{
"epoch": 0.0549558390578999,
"grad_norm": 6.864138603210449,
"learning_rate": 1.891679748822606e-05,
"logits/chosen": -2.9804160594940186,
"logits/rejected": -3.058073043823242,
"logps/chosen": -302.26055908203125,
"logps/rejected": -324.0291442871094,
"loss": 0.5701,
"rewards/accuracies": 0.6833333969116211,
"rewards/chosen": 0.36459389328956604,
"rewards/margins": 0.7426995038986206,
"rewards/rejected": -0.37810567021369934,
"step": 70
},
{
"epoch": 0.058881256133464184,
"grad_norm": 4.591891765594482,
"learning_rate": 1.8838304552590268e-05,
"logits/chosen": -2.9886913299560547,
"logits/rejected": -2.987067699432373,
"logps/chosen": -314.6214904785156,
"logps/rejected": -298.33953857421875,
"loss": 0.5124,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": 0.3623279631137848,
"rewards/margins": 0.8299944996833801,
"rewards/rejected": -0.4676665663719177,
"step": 75
},
{
"epoch": 0.06280667320902845,
"grad_norm": 4.877047538757324,
"learning_rate": 1.8759811616954475e-05,
"logits/chosen": -2.989983081817627,
"logits/rejected": -3.0554869174957275,
"logps/chosen": -312.76019287109375,
"logps/rejected": -337.36920166015625,
"loss": 0.5185,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": 0.21076758205890656,
"rewards/margins": 0.8532983064651489,
"rewards/rejected": -0.6425307393074036,
"step": 80
},
{
"epoch": 0.06673209028459273,
"grad_norm": 6.023036003112793,
"learning_rate": 1.8681318681318682e-05,
"logits/chosen": -2.9612174034118652,
"logits/rejected": -3.071810722351074,
"logps/chosen": -326.7922058105469,
"logps/rejected": -314.8778076171875,
"loss": 0.5549,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": 0.18136966228485107,
"rewards/margins": 0.7432295083999634,
"rewards/rejected": -0.5618598461151123,
"step": 85
},
{
"epoch": 0.07065750736015702,
"grad_norm": 6.082668781280518,
"learning_rate": 1.860282574568289e-05,
"logits/chosen": -2.832726001739502,
"logits/rejected": -2.891904830932617,
"logps/chosen": -330.4433898925781,
"logps/rejected": -299.0811462402344,
"loss": 0.5603,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": 0.12478373199701309,
"rewards/margins": 0.821280300617218,
"rewards/rejected": -0.6964964866638184,
"step": 90
},
{
"epoch": 0.0745829244357213,
"grad_norm": 5.67427396774292,
"learning_rate": 1.8524332810047097e-05,
"logits/chosen": -2.997680902481079,
"logits/rejected": -3.0600008964538574,
"logps/chosen": -325.9932861328125,
"logps/rejected": -297.598388671875,
"loss": 0.5578,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": 0.11321593821048737,
"rewards/margins": 0.7891088724136353,
"rewards/rejected": -0.6758929491043091,
"step": 95
},
{
"epoch": 0.07850834151128558,
"grad_norm": 5.028480052947998,
"learning_rate": 1.8445839874411304e-05,
"logits/chosen": -2.9710261821746826,
"logits/rejected": -2.934241771697998,
"logps/chosen": -332.2840881347656,
"logps/rejected": -323.45794677734375,
"loss": 0.5515,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": -0.20720729231834412,
"rewards/margins": 0.7714598774909973,
"rewards/rejected": -0.9786672592163086,
"step": 100
},
{
"epoch": 0.07850834151128558,
"eval_logits/chosen": -2.980896234512329,
"eval_logits/rejected": -3.010272741317749,
"eval_logps/chosen": -332.7633972167969,
"eval_logps/rejected": -323.36285400390625,
"eval_loss": 0.5452204346656799,
"eval_rewards/accuracies": 0.7120000123977661,
"eval_rewards/chosen": -0.3399922549724579,
"eval_rewards/margins": 0.7534830570220947,
"eval_rewards/rejected": -1.0934752225875854,
"eval_runtime": 171.9076,
"eval_samples_per_second": 11.634,
"eval_steps_per_second": 5.817,
"step": 100
},
{
"epoch": 0.08243375858684986,
"grad_norm": 4.673857688903809,
"learning_rate": 1.836734693877551e-05,
"logits/chosen": -2.955043077468872,
"logits/rejected": -3.010183811187744,
"logps/chosen": -360.6026611328125,
"logps/rejected": -354.76043701171875,
"loss": 0.538,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.5105575919151306,
"rewards/margins": 0.7706912755966187,
"rewards/rejected": -1.2812488079071045,
"step": 105
},
{
"epoch": 0.08635917566241413,
"grad_norm": 6.655648231506348,
"learning_rate": 1.828885400313972e-05,
"logits/chosen": -2.9778573513031006,
"logits/rejected": -2.9207446575164795,
"logps/chosen": -323.5428161621094,
"logps/rejected": -342.148681640625,
"loss": 0.5713,
"rewards/accuracies": 0.6958334445953369,
"rewards/chosen": -0.6803138852119446,
"rewards/margins": 0.697836697101593,
"rewards/rejected": -1.3781505823135376,
"step": 110
},
{
"epoch": 0.09028459273797841,
"grad_norm": 5.369375228881836,
"learning_rate": 1.8210361067503926e-05,
"logits/chosen": -2.9552626609802246,
"logits/rejected": -3.075476884841919,
"logps/chosen": -348.5240173339844,
"logps/rejected": -307.6527404785156,
"loss": 0.5155,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -0.3947049677371979,
"rewards/margins": 0.8081506490707397,
"rewards/rejected": -1.2028557062149048,
"step": 115
},
{
"epoch": 0.09421000981354269,
"grad_norm": 4.998170852661133,
"learning_rate": 1.8131868131868133e-05,
"logits/chosen": -2.8421874046325684,
"logits/rejected": -2.9301371574401855,
"logps/chosen": -320.47760009765625,
"logps/rejected": -307.7643737792969,
"loss": 0.499,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.32407405972480774,
"rewards/margins": 0.8497726321220398,
"rewards/rejected": -1.1738468408584595,
"step": 120
},
{
"epoch": 0.09813542688910697,
"grad_norm": 6.9904890060424805,
"learning_rate": 1.805337519623234e-05,
"logits/chosen": -2.96341872215271,
"logits/rejected": -2.9047999382019043,
"logps/chosen": -350.89495849609375,
"logps/rejected": -367.870361328125,
"loss": 0.5847,
"rewards/accuracies": 0.6958333849906921,
"rewards/chosen": -0.3227779269218445,
"rewards/margins": 0.7198423147201538,
"rewards/rejected": -1.0426201820373535,
"step": 125
},
{
"epoch": 0.10206084396467124,
"grad_norm": 5.14391565322876,
"learning_rate": 1.7974882260596548e-05,
"logits/chosen": -3.0105910301208496,
"logits/rejected": -3.0620298385620117,
"logps/chosen": -341.6379089355469,
"logps/rejected": -306.1999816894531,
"loss": 0.5432,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": -0.4734025001525879,
"rewards/margins": 0.7205262184143066,
"rewards/rejected": -1.1939287185668945,
"step": 130
},
{
"epoch": 0.10598626104023552,
"grad_norm": 5.304475784301758,
"learning_rate": 1.7896389324960755e-05,
"logits/chosen": -2.86970853805542,
"logits/rejected": -2.8937735557556152,
"logps/chosen": -313.49835205078125,
"logps/rejected": -306.93585205078125,
"loss": 0.5462,
"rewards/accuracies": 0.7125000357627869,
"rewards/chosen": -0.5461211800575256,
"rewards/margins": 0.748654305934906,
"rewards/rejected": -1.294775366783142,
"step": 135
},
{
"epoch": 0.1099116781157998,
"grad_norm": 4.115116119384766,
"learning_rate": 1.7817896389324962e-05,
"logits/chosen": -3.0203137397766113,
"logits/rejected": -3.0821175575256348,
"logps/chosen": -354.2901306152344,
"logps/rejected": -316.67205810546875,
"loss": 0.5105,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": -0.45508939027786255,
"rewards/margins": 0.8571721911430359,
"rewards/rejected": -1.3122615814208984,
"step": 140
},
{
"epoch": 0.11383709519136408,
"grad_norm": 5.043489456176758,
"learning_rate": 1.773940345368917e-05,
"logits/chosen": -2.8955276012420654,
"logits/rejected": -2.957524061203003,
"logps/chosen": -318.019775390625,
"logps/rejected": -321.0815124511719,
"loss": 0.5172,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -0.375745952129364,
"rewards/margins": 0.8528572916984558,
"rewards/rejected": -1.2286031246185303,
"step": 145
},
{
"epoch": 0.11776251226692837,
"grad_norm": 6.230247497558594,
"learning_rate": 1.7660910518053377e-05,
"logits/chosen": -2.986295700073242,
"logits/rejected": -3.0270934104919434,
"logps/chosen": -348.29058837890625,
"logps/rejected": -330.9169006347656,
"loss": 0.5398,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": -0.6200595498085022,
"rewards/margins": 0.8251369595527649,
"rewards/rejected": -1.4451963901519775,
"step": 150
},
{
"epoch": 0.12168792934249265,
"grad_norm": 5.081642150878906,
"learning_rate": 1.7582417582417584e-05,
"logits/chosen": -2.935692548751831,
"logits/rejected": -3.0154006481170654,
"logps/chosen": -348.2768249511719,
"logps/rejected": -316.89361572265625,
"loss": 0.556,
"rewards/accuracies": 0.7125000357627869,
"rewards/chosen": -0.7520820498466492,
"rewards/margins": 0.8174117207527161,
"rewards/rejected": -1.5694936513900757,
"step": 155
},
{
"epoch": 0.1256133464180569,
"grad_norm": 5.332559585571289,
"learning_rate": 1.750392464678179e-05,
"logits/chosen": -2.853811740875244,
"logits/rejected": -2.8885927200317383,
"logps/chosen": -339.7810363769531,
"logps/rejected": -333.8861999511719,
"loss": 0.5375,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -0.8646346926689148,
"rewards/margins": 0.8464757800102234,
"rewards/rejected": -1.7111107110977173,
"step": 160
},
{
"epoch": 0.1295387634936212,
"grad_norm": 5.659219741821289,
"learning_rate": 1.7425431711146e-05,
"logits/chosen": -2.9798367023468018,
"logits/rejected": -3.0234591960906982,
"logps/chosen": -369.3193054199219,
"logps/rejected": -320.97088623046875,
"loss": 0.5365,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.5549585223197937,
"rewards/margins": 0.8239587545394897,
"rewards/rejected": -1.3789172172546387,
"step": 165
},
{
"epoch": 0.13346418056918546,
"grad_norm": 6.147792339324951,
"learning_rate": 1.7346938775510206e-05,
"logits/chosen": -2.9105169773101807,
"logits/rejected": -2.9818620681762695,
"logps/chosen": -348.80194091796875,
"logps/rejected": -312.60272216796875,
"loss": 0.5357,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.12175627052783966,
"rewards/margins": 0.814881443977356,
"rewards/rejected": -0.9366377592086792,
"step": 170
},
{
"epoch": 0.13738959764474976,
"grad_norm": 5.922034740447998,
"learning_rate": 1.7268445839874413e-05,
"logits/chosen": -2.961211919784546,
"logits/rejected": -3.006511688232422,
"logps/chosen": -299.61260986328125,
"logps/rejected": -318.53961181640625,
"loss": 0.5611,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.25987696647644043,
"rewards/margins": 0.7301396727561951,
"rewards/rejected": -0.47026267647743225,
"step": 175
},
{
"epoch": 0.14131501472031405,
"grad_norm": 5.806030750274658,
"learning_rate": 1.718995290423862e-05,
"logits/chosen": -2.8726003170013428,
"logits/rejected": -2.9730982780456543,
"logps/chosen": -335.31817626953125,
"logps/rejected": -330.07611083984375,
"loss": 0.4908,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.5076613426208496,
"rewards/margins": 0.9330441355705261,
"rewards/rejected": -0.4253828525543213,
"step": 180
},
{
"epoch": 0.1452404317958783,
"grad_norm": 5.604033470153809,
"learning_rate": 1.7111459968602827e-05,
"logits/chosen": -3.0221714973449707,
"logits/rejected": -3.0379929542541504,
"logps/chosen": -299.20281982421875,
"logps/rejected": -292.1725158691406,
"loss": 0.5898,
"rewards/accuracies": 0.6750000715255737,
"rewards/chosen": 0.12294058501720428,
"rewards/margins": 0.6319655179977417,
"rewards/rejected": -0.5090248584747314,
"step": 185
},
{
"epoch": 0.1491658488714426,
"grad_norm": 5.10993766784668,
"learning_rate": 1.7032967032967035e-05,
"logits/chosen": -2.916008949279785,
"logits/rejected": -2.984748601913452,
"logps/chosen": -313.1849365234375,
"logps/rejected": -306.8742370605469,
"loss": 0.5154,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.03891729563474655,
"rewards/margins": 0.8485898971557617,
"rewards/rejected": -0.8096725344657898,
"step": 190
},
{
"epoch": 0.15309126594700687,
"grad_norm": 4.921535015106201,
"learning_rate": 1.6954474097331242e-05,
"logits/chosen": -3.0675573348999023,
"logits/rejected": -2.9610018730163574,
"logps/chosen": -333.9068908691406,
"logps/rejected": -316.9395446777344,
"loss": 0.524,
"rewards/accuracies": 0.7250000834465027,
"rewards/chosen": -0.31782767176628113,
"rewards/margins": 0.7863305807113647,
"rewards/rejected": -1.1041581630706787,
"step": 195
},
{
"epoch": 0.15701668302257116,
"grad_norm": 5.64870548248291,
"learning_rate": 1.687598116169545e-05,
"logits/chosen": -2.9307010173797607,
"logits/rejected": -2.9969723224639893,
"logps/chosen": -315.3205871582031,
"logps/rejected": -319.23876953125,
"loss": 0.5485,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5450848937034607,
"rewards/margins": 0.7509050965309143,
"rewards/rejected": -1.295989990234375,
"step": 200
},
{
"epoch": 0.15701668302257116,
"eval_logits/chosen": -3.0087897777557373,
"eval_logits/rejected": -3.0380184650421143,
"eval_logps/chosen": -334.6849670410156,
"eval_logps/rejected": -325.60333251953125,
"eval_loss": 0.5298904776573181,
"eval_rewards/accuracies": 0.7354999780654907,
"eval_rewards/chosen": -0.532148003578186,
"eval_rewards/margins": 0.7853737473487854,
"eval_rewards/rejected": -1.3175216913223267,
"eval_runtime": 171.0727,
"eval_samples_per_second": 11.691,
"eval_steps_per_second": 5.845,
"step": 200
},
{
"epoch": 0.16094210009813542,
"grad_norm": 5.222318172454834,
"learning_rate": 1.6797488226059656e-05,
"logits/chosen": -2.896974563598633,
"logits/rejected": -2.8886332511901855,
"logps/chosen": -343.0540466308594,
"logps/rejected": -330.8460388183594,
"loss": 0.5583,
"rewards/accuracies": 0.6791667342185974,
"rewards/chosen": -0.49637308716773987,
"rewards/margins": 0.8196843266487122,
"rewards/rejected": -1.3160574436187744,
"step": 205
},
{
"epoch": 0.1648675171736997,
"grad_norm": 4.988500118255615,
"learning_rate": 1.6718995290423864e-05,
"logits/chosen": -3.0786221027374268,
"logits/rejected": -3.0803046226501465,
"logps/chosen": -364.15679931640625,
"logps/rejected": -344.13348388671875,
"loss": 0.5415,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -0.23943662643432617,
"rewards/margins": 0.7990777492523193,
"rewards/rejected": -1.038514494895935,
"step": 210
},
{
"epoch": 0.16879293424926398,
"grad_norm": 5.479763984680176,
"learning_rate": 1.664050235478807e-05,
"logits/chosen": -2.971986770629883,
"logits/rejected": -2.9459171295166016,
"logps/chosen": -297.9768371582031,
"logps/rejected": -293.61004638671875,
"loss": 0.5346,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": 0.14465472102165222,
"rewards/margins": 0.7910071611404419,
"rewards/rejected": -0.6463524103164673,
"step": 215
},
{
"epoch": 0.17271835132482827,
"grad_norm": 4.327686786651611,
"learning_rate": 1.6562009419152278e-05,
"logits/chosen": -3.0093016624450684,
"logits/rejected": -2.9483301639556885,
"logps/chosen": -312.3398742675781,
"logps/rejected": -321.89990234375,
"loss": 0.5276,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": 0.4244857728481293,
"rewards/margins": 0.7893426418304443,
"rewards/rejected": -0.36485689878463745,
"step": 220
},
{
"epoch": 0.17664376840039253,
"grad_norm": 5.979199409484863,
"learning_rate": 1.6483516483516486e-05,
"logits/chosen": -3.0172836780548096,
"logits/rejected": -3.0143485069274902,
"logps/chosen": -305.2528991699219,
"logps/rejected": -335.7610168457031,
"loss": 0.5053,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": 0.4461596608161926,
"rewards/margins": 0.9256842732429504,
"rewards/rejected": -0.47952452301979065,
"step": 225
},
{
"epoch": 0.18056918547595682,
"grad_norm": 5.040202617645264,
"learning_rate": 1.6405023547880693e-05,
"logits/chosen": -2.9379525184631348,
"logits/rejected": -2.9742045402526855,
"logps/chosen": -293.68841552734375,
"logps/rejected": -288.2654724121094,
"loss": 0.5655,
"rewards/accuracies": 0.6958333253860474,
"rewards/chosen": 0.10342751443386078,
"rewards/margins": 0.7664733529090881,
"rewards/rejected": -0.6630457639694214,
"step": 230
},
{
"epoch": 0.1844946025515211,
"grad_norm": 6.082977294921875,
"learning_rate": 1.63265306122449e-05,
"logits/chosen": -2.972628593444824,
"logits/rejected": -2.9927875995635986,
"logps/chosen": -342.9659118652344,
"logps/rejected": -326.91204833984375,
"loss": 0.589,
"rewards/accuracies": 0.6791666746139526,
"rewards/chosen": -0.3530040681362152,
"rewards/margins": 0.6996762752532959,
"rewards/rejected": -1.0526803731918335,
"step": 235
},
{
"epoch": 0.18842001962708538,
"grad_norm": 5.5625176429748535,
"learning_rate": 1.6248037676609107e-05,
"logits/chosen": -2.9938926696777344,
"logits/rejected": -3.048494338989258,
"logps/chosen": -365.29718017578125,
"logps/rejected": -342.7457275390625,
"loss": 0.56,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": -0.7515178918838501,
"rewards/margins": 0.7500923871994019,
"rewards/rejected": -1.501610279083252,
"step": 240
},
{
"epoch": 0.19234543670264967,
"grad_norm": 4.691596984863281,
"learning_rate": 1.6169544740973315e-05,
"logits/chosen": -3.0218453407287598,
"logits/rejected": -3.0742886066436768,
"logps/chosen": -317.8963928222656,
"logps/rejected": -302.0952453613281,
"loss": 0.4919,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -0.7345434427261353,
"rewards/margins": 0.9255669713020325,
"rewards/rejected": -1.6601102352142334,
"step": 245
},
{
"epoch": 0.19627085377821393,
"grad_norm": 6.012603759765625,
"learning_rate": 1.6091051805337522e-05,
"logits/chosen": -2.928499698638916,
"logits/rejected": -3.021066188812256,
"logps/chosen": -358.77154541015625,
"logps/rejected": -351.7578430175781,
"loss": 0.5549,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6217355728149414,
"rewards/margins": 0.8296122550964355,
"rewards/rejected": -1.4513477087020874,
"step": 250
},
{
"epoch": 0.20019627085377822,
"grad_norm": 5.680856227874756,
"learning_rate": 1.601255886970173e-05,
"logits/chosen": -2.9057984352111816,
"logits/rejected": -2.9277281761169434,
"logps/chosen": -328.77520751953125,
"logps/rejected": -336.87127685546875,
"loss": 0.5115,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.37844187021255493,
"rewards/margins": 0.9464572668075562,
"rewards/rejected": -1.3248990774154663,
"step": 255
},
{
"epoch": 0.2041216879293425,
"grad_norm": 5.254044532775879,
"learning_rate": 1.5934065934065933e-05,
"logits/chosen": -2.89184832572937,
"logits/rejected": -2.9895052909851074,
"logps/chosen": -327.46881103515625,
"logps/rejected": -325.6787414550781,
"loss": 0.5054,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -0.12966637313365936,
"rewards/margins": 1.035685658454895,
"rewards/rejected": -1.1653519868850708,
"step": 260
},
{
"epoch": 0.20804710500490678,
"grad_norm": 5.806418418884277,
"learning_rate": 1.5855572998430144e-05,
"logits/chosen": -2.9994475841522217,
"logits/rejected": -3.0370442867279053,
"logps/chosen": -306.01910400390625,
"logps/rejected": -285.0171203613281,
"loss": 0.5742,
"rewards/accuracies": 0.6958333253860474,
"rewards/chosen": -0.1305859386920929,
"rewards/margins": 0.7369558215141296,
"rewards/rejected": -0.8675416707992554,
"step": 265
},
{
"epoch": 0.21197252208047104,
"grad_norm": 4.36320686340332,
"learning_rate": 1.577708006279435e-05,
"logits/chosen": -2.8925890922546387,
"logits/rejected": -2.969078540802002,
"logps/chosen": -336.0727233886719,
"logps/rejected": -323.86175537109375,
"loss": 0.4837,
"rewards/accuracies": 0.7791666984558105,
"rewards/chosen": -0.15950943529605865,
"rewards/margins": 0.9283342361450195,
"rewards/rejected": -1.087843656539917,
"step": 270
},
{
"epoch": 0.21589793915603533,
"grad_norm": 4.775585174560547,
"learning_rate": 1.5698587127158558e-05,
"logits/chosen": -2.9799633026123047,
"logits/rejected": -2.998260736465454,
"logps/chosen": -339.5887756347656,
"logps/rejected": -322.0613708496094,
"loss": 0.4891,
"rewards/accuracies": 0.7458333969116211,
"rewards/chosen": -0.2023775577545166,
"rewards/margins": 0.9467275738716125,
"rewards/rejected": -1.1491053104400635,
"step": 275
},
{
"epoch": 0.2198233562315996,
"grad_norm": 5.527752876281738,
"learning_rate": 1.5620094191522762e-05,
"logits/chosen": -3.0590403079986572,
"logits/rejected": -3.1203842163085938,
"logps/chosen": -326.7115478515625,
"logps/rejected": -302.5054016113281,
"loss": 0.5579,
"rewards/accuracies": 0.6791666746139526,
"rewards/chosen": -0.37672197818756104,
"rewards/margins": 0.8057243227958679,
"rewards/rejected": -1.1824463605880737,
"step": 280
},
{
"epoch": 0.2237487733071639,
"grad_norm": 4.813480377197266,
"learning_rate": 1.5541601255886973e-05,
"logits/chosen": -3.0462276935577393,
"logits/rejected": -3.101527690887451,
"logps/chosen": -325.5246276855469,
"logps/rejected": -336.7899475097656,
"loss": 0.5218,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": -0.45412200689315796,
"rewards/margins": 0.7958036661148071,
"rewards/rejected": -1.2499258518218994,
"step": 285
},
{
"epoch": 0.22767419038272815,
"grad_norm": 4.553357124328613,
"learning_rate": 1.546310832025118e-05,
"logits/chosen": -2.9580142498016357,
"logits/rejected": -2.963024139404297,
"logps/chosen": -342.618896484375,
"logps/rejected": -353.5760803222656,
"loss": 0.5056,
"rewards/accuracies": 0.7458333373069763,
"rewards/chosen": -0.3053968548774719,
"rewards/margins": 0.9428439140319824,
"rewards/rejected": -1.2482408285140991,
"step": 290
},
{
"epoch": 0.23159960745829244,
"grad_norm": 5.060155391693115,
"learning_rate": 1.5384615384615387e-05,
"logits/chosen": -2.9472720623016357,
"logits/rejected": -3.016634702682495,
"logps/chosen": -344.4608154296875,
"logps/rejected": -318.6928405761719,
"loss": 0.4181,
"rewards/accuracies": 0.79583340883255,
"rewards/chosen": -0.15565678477287292,
"rewards/margins": 1.2021899223327637,
"rewards/rejected": -1.3578466176986694,
"step": 295
},
{
"epoch": 0.23552502453385674,
"grad_norm": 3.455629348754883,
"learning_rate": 1.530612244897959e-05,
"logits/chosen": -2.9951541423797607,
"logits/rejected": -3.020198106765747,
"logps/chosen": -308.458251953125,
"logps/rejected": -329.9710998535156,
"loss": 0.4599,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.03096728026866913,
"rewards/margins": 1.1172107458114624,
"rewards/rejected": -1.148177981376648,
"step": 300
},
{
"epoch": 0.23552502453385674,
"eval_logits/chosen": -3.0017831325531006,
"eval_logits/rejected": -3.031320333480835,
"eval_logps/chosen": -331.4214782714844,
"eval_logps/rejected": -324.1851806640625,
"eval_loss": 0.5177174210548401,
"eval_rewards/accuracies": 0.7275000214576721,
"eval_rewards/chosen": -0.20579998195171356,
"eval_rewards/margins": 0.9699056148529053,
"eval_rewards/rejected": -1.1757057905197144,
"eval_runtime": 170.7357,
"eval_samples_per_second": 11.714,
"eval_steps_per_second": 5.857,
"step": 300
},
{
"epoch": 0.239450441609421,
"grad_norm": 5.653757572174072,
"learning_rate": 1.52276295133438e-05,
"logits/chosen": -2.964921236038208,
"logits/rejected": -2.9607841968536377,
"logps/chosen": -355.35760498046875,
"logps/rejected": -382.9061584472656,
"loss": 0.5153,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": -0.141874298453331,
"rewards/margins": 1.0537707805633545,
"rewards/rejected": -1.1956450939178467,
"step": 305
},
{
"epoch": 0.2433758586849853,
"grad_norm": 5.022225856781006,
"learning_rate": 1.5149136577708007e-05,
"logits/chosen": -2.906406879425049,
"logits/rejected": -3.0163021087646484,
"logps/chosen": -315.5581970214844,
"logps/rejected": -309.3556213378906,
"loss": 0.5086,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.02891678549349308,
"rewards/margins": 1.1372863054275513,
"rewards/rejected": -1.1662030220031738,
"step": 310
},
{
"epoch": 0.24730127576054955,
"grad_norm": 4.488466739654541,
"learning_rate": 1.5070643642072216e-05,
"logits/chosen": -3.0036263465881348,
"logits/rejected": -3.0129470825195312,
"logps/chosen": -334.03955078125,
"logps/rejected": -307.4307861328125,
"loss": 0.5034,
"rewards/accuracies": 0.7458333373069763,
"rewards/chosen": -0.05140721797943115,
"rewards/margins": 1.0299065113067627,
"rewards/rejected": -1.0813137292861938,
"step": 315
},
{
"epoch": 0.2512266928361138,
"grad_norm": 6.025671482086182,
"learning_rate": 1.4992150706436422e-05,
"logits/chosen": -3.0598220825195312,
"logits/rejected": -2.9974725246429443,
"logps/chosen": -336.5476989746094,
"logps/rejected": -314.2373046875,
"loss": 0.5272,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": -0.0364832878112793,
"rewards/margins": 0.9740719795227051,
"rewards/rejected": -1.010555386543274,
"step": 320
},
{
"epoch": 0.25515210991167814,
"grad_norm": 3.7678122520446777,
"learning_rate": 1.4913657770800629e-05,
"logits/chosen": -3.0125625133514404,
"logits/rejected": -3.0665228366851807,
"logps/chosen": -373.77386474609375,
"logps/rejected": -362.3180847167969,
"loss": 0.4491,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.11678016185760498,
"rewards/margins": 1.1747580766677856,
"rewards/rejected": -1.291538119316101,
"step": 325
},
{
"epoch": 0.2590775269872424,
"grad_norm": 6.283595085144043,
"learning_rate": 1.4835164835164836e-05,
"logits/chosen": -2.9614596366882324,
"logits/rejected": -3.0732309818267822,
"logps/chosen": -340.6827392578125,
"logps/rejected": -330.89178466796875,
"loss": 0.5298,
"rewards/accuracies": 0.7583334445953369,
"rewards/chosen": -0.27896976470947266,
"rewards/margins": 1.1068425178527832,
"rewards/rejected": -1.3858124017715454,
"step": 330
},
{
"epoch": 0.26300294406280667,
"grad_norm": 5.159917831420898,
"learning_rate": 1.4756671899529042e-05,
"logits/chosen": -2.9882471561431885,
"logits/rejected": -2.958286762237549,
"logps/chosen": -304.15899658203125,
"logps/rejected": -337.4718933105469,
"loss": 0.5442,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -0.5084502100944519,
"rewards/margins": 0.826800525188446,
"rewards/rejected": -1.3352506160736084,
"step": 335
},
{
"epoch": 0.26692836113837093,
"grad_norm": 5.694521427154541,
"learning_rate": 1.467817896389325e-05,
"logits/chosen": -3.0913736820220947,
"logits/rejected": -3.1289680004119873,
"logps/chosen": -306.4797668457031,
"logps/rejected": -315.7909851074219,
"loss": 0.5478,
"rewards/accuracies": 0.708333432674408,
"rewards/chosen": -0.5964398384094238,
"rewards/margins": 0.8492003679275513,
"rewards/rejected": -1.4456400871276855,
"step": 340
},
{
"epoch": 0.27085377821393525,
"grad_norm": 4.4798665046691895,
"learning_rate": 1.4599686028257458e-05,
"logits/chosen": -2.967850923538208,
"logits/rejected": -3.0410642623901367,
"logps/chosen": -341.167236328125,
"logps/rejected": -345.0897216796875,
"loss": 0.491,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.3792761266231537,
"rewards/margins": 1.023285150527954,
"rewards/rejected": -1.4025614261627197,
"step": 345
},
{
"epoch": 0.2747791952894995,
"grad_norm": 4.080179214477539,
"learning_rate": 1.4521193092621665e-05,
"logits/chosen": -3.078789234161377,
"logits/rejected": -3.0680148601531982,
"logps/chosen": -342.5566711425781,
"logps/rejected": -333.19732666015625,
"loss": 0.4792,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.26053228974342346,
"rewards/margins": 1.0307527780532837,
"rewards/rejected": -1.2912850379943848,
"step": 350
},
{
"epoch": 0.2787046123650638,
"grad_norm": 6.406314849853516,
"learning_rate": 1.4442700156985871e-05,
"logits/chosen": -2.993866443634033,
"logits/rejected": -3.0816330909729004,
"logps/chosen": -325.2431640625,
"logps/rejected": -306.7488708496094,
"loss": 0.5484,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": -0.2325890064239502,
"rewards/margins": 0.7650774121284485,
"rewards/rejected": -0.9976664781570435,
"step": 355
},
{
"epoch": 0.2826300294406281,
"grad_norm": 4.517834663391113,
"learning_rate": 1.436420722135008e-05,
"logits/chosen": -2.9789466857910156,
"logits/rejected": -2.9082839488983154,
"logps/chosen": -326.4145202636719,
"logps/rejected": -335.26019287109375,
"loss": 0.4693,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.11003933846950531,
"rewards/margins": 0.9707590341567993,
"rewards/rejected": -1.0807983875274658,
"step": 360
},
{
"epoch": 0.28655544651619236,
"grad_norm": 3.715141534805298,
"learning_rate": 1.4285714285714287e-05,
"logits/chosen": -2.981189250946045,
"logits/rejected": -3.0649161338806152,
"logps/chosen": -307.39764404296875,
"logps/rejected": -301.6369934082031,
"loss": 0.481,
"rewards/accuracies": 0.7791666984558105,
"rewards/chosen": -0.12946780025959015,
"rewards/margins": 1.1251821517944336,
"rewards/rejected": -1.2546498775482178,
"step": 365
},
{
"epoch": 0.2904808635917566,
"grad_norm": 3.333448648452759,
"learning_rate": 1.4207221350078494e-05,
"logits/chosen": -2.984611988067627,
"logits/rejected": -3.030081272125244,
"logps/chosen": -309.75335693359375,
"logps/rejected": -308.0655517578125,
"loss": 0.4995,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.0332108810544014,
"rewards/margins": 1.0196669101715088,
"rewards/rejected": -1.0528777837753296,
"step": 370
},
{
"epoch": 0.2944062806673209,
"grad_norm": 5.067115306854248,
"learning_rate": 1.41287284144427e-05,
"logits/chosen": -3.0295064449310303,
"logits/rejected": -3.0551486015319824,
"logps/chosen": -306.81561279296875,
"logps/rejected": -293.89971923828125,
"loss": 0.5708,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -0.12991970777511597,
"rewards/margins": 0.8558316230773926,
"rewards/rejected": -0.9857513308525085,
"step": 375
},
{
"epoch": 0.2983316977428852,
"grad_norm": 4.063174247741699,
"learning_rate": 1.4050235478806909e-05,
"logits/chosen": -2.9749984741210938,
"logits/rejected": -3.0381150245666504,
"logps/chosen": -374.1710510253906,
"logps/rejected": -350.29168701171875,
"loss": 0.5021,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.11557143926620483,
"rewards/margins": 1.067694067955017,
"rewards/rejected": -1.1832655668258667,
"step": 380
},
{
"epoch": 0.30225711481844947,
"grad_norm": 4.8487396240234375,
"learning_rate": 1.3971742543171116e-05,
"logits/chosen": -2.9603800773620605,
"logits/rejected": -3.019071102142334,
"logps/chosen": -311.37628173828125,
"logps/rejected": -320.9827575683594,
"loss": 0.4274,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -0.017604345455765724,
"rewards/margins": 1.2022392749786377,
"rewards/rejected": -1.2198436260223389,
"step": 385
},
{
"epoch": 0.30618253189401373,
"grad_norm": 4.178959846496582,
"learning_rate": 1.3893249607535323e-05,
"logits/chosen": -3.040865421295166,
"logits/rejected": -3.0666539669036865,
"logps/chosen": -327.70928955078125,
"logps/rejected": -322.9264221191406,
"loss": 0.4558,
"rewards/accuracies": 0.7750000953674316,
"rewards/chosen": 0.11298879235982895,
"rewards/margins": 1.198480248451233,
"rewards/rejected": -1.085491418838501,
"step": 390
},
{
"epoch": 0.310107948969578,
"grad_norm": 4.6533122062683105,
"learning_rate": 1.3814756671899529e-05,
"logits/chosen": -2.9745497703552246,
"logits/rejected": -3.0309340953826904,
"logps/chosen": -330.2334289550781,
"logps/rejected": -340.81536865234375,
"loss": 0.5296,
"rewards/accuracies": 0.7041667103767395,
"rewards/chosen": -0.015470663085579872,
"rewards/margins": 1.0612616539001465,
"rewards/rejected": -1.0767322778701782,
"step": 395
},
{
"epoch": 0.3140333660451423,
"grad_norm": 5.215358734130859,
"learning_rate": 1.3736263736263738e-05,
"logits/chosen": -3.002040147781372,
"logits/rejected": -3.0286686420440674,
"logps/chosen": -353.7515563964844,
"logps/rejected": -327.62030029296875,
"loss": 0.5766,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2245529592037201,
"rewards/margins": 0.8832836151123047,
"rewards/rejected": -1.1078366041183472,
"step": 400
},
{
"epoch": 0.3140333660451423,
"eval_logits/chosen": -3.016608953475952,
"eval_logits/rejected": -3.0451409816741943,
"eval_logps/chosen": -331.353271484375,
"eval_logps/rejected": -324.7242736816406,
"eval_loss": 0.5065792202949524,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -0.19897931814193726,
"eval_rewards/margins": 1.030638337135315,
"eval_rewards/rejected": -1.2296175956726074,
"eval_runtime": 172.2901,
"eval_samples_per_second": 11.608,
"eval_steps_per_second": 5.804,
"step": 400
},
{
"epoch": 0.3179587831207066,
"grad_norm": 3.946781635284424,
"learning_rate": 1.3657770800627945e-05,
"logits/chosen": -2.966956853866577,
"logits/rejected": -3.056098461151123,
"logps/chosen": -356.13909912109375,
"logps/rejected": -317.91571044921875,
"loss": 0.4387,
"rewards/accuracies": 0.7791666984558105,
"rewards/chosen": -0.06729185581207275,
"rewards/margins": 1.262393832206726,
"rewards/rejected": -1.3296858072280884,
"step": 405
},
{
"epoch": 0.32188420019627084,
"grad_norm": 5.394034385681152,
"learning_rate": 1.357927786499215e-05,
"logits/chosen": -3.05330491065979,
"logits/rejected": -3.038001775741577,
"logps/chosen": -359.85101318359375,
"logps/rejected": -340.84295654296875,
"loss": 0.5456,
"rewards/accuracies": 0.7458333969116211,
"rewards/chosen": -0.33758872747421265,
"rewards/margins": 0.9710124731063843,
"rewards/rejected": -1.3086011409759521,
"step": 410
},
{
"epoch": 0.3258096172718351,
"grad_norm": 5.074666976928711,
"learning_rate": 1.3500784929356358e-05,
"logits/chosen": -3.051016092300415,
"logits/rejected": -3.0877463817596436,
"logps/chosen": -344.28314208984375,
"logps/rejected": -312.5767822265625,
"loss": 0.5506,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.33664119243621826,
"rewards/margins": 0.9169348478317261,
"rewards/rejected": -1.2535761594772339,
"step": 415
},
{
"epoch": 0.3297350343473994,
"grad_norm": 6.106690883636475,
"learning_rate": 1.3422291993720567e-05,
"logits/chosen": -3.0146520137786865,
"logits/rejected": -3.0691580772399902,
"logps/chosen": -326.73046875,
"logps/rejected": -341.7567138671875,
"loss": 0.5092,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -0.4858369827270508,
"rewards/margins": 1.0325465202331543,
"rewards/rejected": -1.5183833837509155,
"step": 420
},
{
"epoch": 0.3336604514229637,
"grad_norm": 4.349857330322266,
"learning_rate": 1.3343799058084774e-05,
"logits/chosen": -3.0472264289855957,
"logits/rejected": -3.1090333461761475,
"logps/chosen": -336.30706787109375,
"logps/rejected": -323.3406982421875,
"loss": 0.4795,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.33309873938560486,
"rewards/margins": 1.1322623491287231,
"rewards/rejected": -1.46536123752594,
"step": 425
},
{
"epoch": 0.33758586849852795,
"grad_norm": 4.5804219245910645,
"learning_rate": 1.326530612244898e-05,
"logits/chosen": -2.928574323654175,
"logits/rejected": -3.0479984283447266,
"logps/chosen": -346.50762939453125,
"logps/rejected": -323.5039978027344,
"loss": 0.4464,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -0.41089972853660583,
"rewards/margins": 1.1351535320281982,
"rewards/rejected": -1.546053409576416,
"step": 430
},
{
"epoch": 0.34151128557409227,
"grad_norm": 4.321537494659424,
"learning_rate": 1.3186813186813187e-05,
"logits/chosen": -3.0598042011260986,
"logits/rejected": -3.1074109077453613,
"logps/chosen": -331.7185363769531,
"logps/rejected": -315.3821716308594,
"loss": 0.5302,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3797026574611664,
"rewards/margins": 0.9000622034072876,
"rewards/rejected": -1.2797647714614868,
"step": 435
},
{
"epoch": 0.34543670264965654,
"grad_norm": 5.391077041625977,
"learning_rate": 1.3108320251177396e-05,
"logits/chosen": -3.0739028453826904,
"logits/rejected": -3.085374355316162,
"logps/chosen": -336.9720458984375,
"logps/rejected": -317.68609619140625,
"loss": 0.4884,
"rewards/accuracies": 0.7625001072883606,
"rewards/chosen": 0.12200820446014404,
"rewards/margins": 1.0954450368881226,
"rewards/rejected": -0.973436713218689,
"step": 440
},
{
"epoch": 0.3493621197252208,
"grad_norm": 4.104001045227051,
"learning_rate": 1.3029827315541603e-05,
"logits/chosen": -3.0037009716033936,
"logits/rejected": -3.0276780128479004,
"logps/chosen": -311.5299377441406,
"logps/rejected": -290.1028137207031,
"loss": 0.5133,
"rewards/accuracies": 0.7291667461395264,
"rewards/chosen": 0.2962660789489746,
"rewards/margins": 1.045003056526184,
"rewards/rejected": -0.7487369775772095,
"step": 445
},
{
"epoch": 0.35328753680078506,
"grad_norm": 4.049530506134033,
"learning_rate": 1.2951334379905809e-05,
"logits/chosen": -3.0438899993896484,
"logits/rejected": -3.1289470195770264,
"logps/chosen": -312.2237548828125,
"logps/rejected": -289.80206298828125,
"loss": 0.4748,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": 0.2921372056007385,
"rewards/margins": 1.1328870058059692,
"rewards/rejected": -0.8407497406005859,
"step": 450
},
{
"epoch": 0.3572129538763494,
"grad_norm": 4.7967753410339355,
"learning_rate": 1.2872841444270016e-05,
"logits/chosen": -3.0214269161224365,
"logits/rejected": -3.032402276992798,
"logps/chosen": -320.35296630859375,
"logps/rejected": -322.9061279296875,
"loss": 0.4614,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": 0.3199036717414856,
"rewards/margins": 1.2129650115966797,
"rewards/rejected": -0.8930614590644836,
"step": 455
},
{
"epoch": 0.36113837095191365,
"grad_norm": 4.228839874267578,
"learning_rate": 1.2794348508634225e-05,
"logits/chosen": -3.0375332832336426,
"logits/rejected": -3.0748536586761475,
"logps/chosen": -338.22369384765625,
"logps/rejected": -323.2271423339844,
"loss": 0.4843,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": 0.07565226405858994,
"rewards/margins": 1.1118285655975342,
"rewards/rejected": -1.036176323890686,
"step": 460
},
{
"epoch": 0.3650637880274779,
"grad_norm": 4.642974376678467,
"learning_rate": 1.271585557299843e-05,
"logits/chosen": -3.008430004119873,
"logits/rejected": -3.081514835357666,
"logps/chosen": -329.63775634765625,
"logps/rejected": -327.0590515136719,
"loss": 0.5383,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.1453629583120346,
"rewards/margins": 1.031752347946167,
"rewards/rejected": -1.1771153211593628,
"step": 465
},
{
"epoch": 0.3689892051030422,
"grad_norm": 5.04410457611084,
"learning_rate": 1.2637362637362638e-05,
"logits/chosen": -3.0850863456726074,
"logits/rejected": -3.0572659969329834,
"logps/chosen": -331.3392028808594,
"logps/rejected": -314.00653076171875,
"loss": 0.4975,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -0.1242925375699997,
"rewards/margins": 1.032974362373352,
"rewards/rejected": -1.1572668552398682,
"step": 470
},
{
"epoch": 0.3729146221786065,
"grad_norm": 6.009474754333496,
"learning_rate": 1.2558869701726845e-05,
"logits/chosen": -2.9685397148132324,
"logits/rejected": -3.083059787750244,
"logps/chosen": -352.71142578125,
"logps/rejected": -327.12872314453125,
"loss": 0.5339,
"rewards/accuracies": 0.6958333849906921,
"rewards/chosen": -0.14256823062896729,
"rewards/margins": 1.011826753616333,
"rewards/rejected": -1.1543948650360107,
"step": 475
},
{
"epoch": 0.37684003925417076,
"grad_norm": 5.437267303466797,
"learning_rate": 1.2480376766091054e-05,
"logits/chosen": -3.053178310394287,
"logits/rejected": -3.0942587852478027,
"logps/chosen": -320.6322021484375,
"logps/rejected": -312.03155517578125,
"loss": 0.5107,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.028355002403259277,
"rewards/margins": 1.0027117729187012,
"rewards/rejected": -1.031066656112671,
"step": 480
},
{
"epoch": 0.380765456329735,
"grad_norm": 5.028485298156738,
"learning_rate": 1.240188383045526e-05,
"logits/chosen": -2.875955104827881,
"logits/rejected": -2.964559555053711,
"logps/chosen": -347.6171569824219,
"logps/rejected": -321.8453369140625,
"loss": 0.523,
"rewards/accuracies": 0.7208333611488342,
"rewards/chosen": -0.030794035643339157,
"rewards/margins": 1.1214120388031006,
"rewards/rejected": -1.1522061824798584,
"step": 485
},
{
"epoch": 0.38469087340529934,
"grad_norm": 4.091563701629639,
"learning_rate": 1.2323390894819467e-05,
"logits/chosen": -2.8418350219726562,
"logits/rejected": -2.8112716674804688,
"logps/chosen": -309.3431091308594,
"logps/rejected": -331.121826171875,
"loss": 0.4678,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": 0.0016341328155249357,
"rewards/margins": 1.139491319656372,
"rewards/rejected": -1.13785719871521,
"step": 490
},
{
"epoch": 0.3886162904808636,
"grad_norm": 4.04428768157959,
"learning_rate": 1.2244897959183674e-05,
"logits/chosen": -3.047217845916748,
"logits/rejected": -3.066889524459839,
"logps/chosen": -317.3092956542969,
"logps/rejected": -299.8882751464844,
"loss": 0.5092,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.00645809480920434,
"rewards/margins": 0.9209384918212891,
"rewards/rejected": -0.9273965954780579,
"step": 495
},
{
"epoch": 0.39254170755642787,
"grad_norm": 4.938584804534912,
"learning_rate": 1.2166405023547883e-05,
"logits/chosen": -3.029782772064209,
"logits/rejected": -3.041254758834839,
"logps/chosen": -329.04168701171875,
"logps/rejected": -326.0158996582031,
"loss": 0.4689,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.10879228264093399,
"rewards/margins": 1.0214567184448242,
"rewards/rejected": -0.9126644134521484,
"step": 500
},
{
"epoch": 0.39254170755642787,
"eval_logits/chosen": -3.0307250022888184,
"eval_logits/rejected": -3.0587258338928223,
"eval_logps/chosen": -328.3224792480469,
"eval_logps/rejected": -321.09954833984375,
"eval_loss": 0.5021634697914124,
"eval_rewards/accuracies": 0.7450000047683716,
"eval_rewards/chosen": 0.10410188138484955,
"eval_rewards/margins": 0.971247136592865,
"eval_rewards/rejected": -0.8671452403068542,
"eval_runtime": 170.3972,
"eval_samples_per_second": 11.737,
"eval_steps_per_second": 5.869,
"step": 500
},
{
"epoch": 0.39646712463199213,
"grad_norm": 6.137747287750244,
"learning_rate": 1.2087912087912089e-05,
"logits/chosen": -3.0541741847991943,
"logits/rejected": -3.036529064178467,
"logps/chosen": -306.2962341308594,
"logps/rejected": -317.01666259765625,
"loss": 0.5514,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02994796633720398,
"rewards/margins": 0.8114150762557983,
"rewards/rejected": -0.7814672589302063,
"step": 505
},
{
"epoch": 0.40039254170755645,
"grad_norm": 5.219884872436523,
"learning_rate": 1.2009419152276296e-05,
"logits/chosen": -3.1144938468933105,
"logits/rejected": -3.1160566806793213,
"logps/chosen": -329.6716613769531,
"logps/rejected": -319.11407470703125,
"loss": 0.5221,
"rewards/accuracies": 0.7625000476837158,
"rewards/chosen": 0.043878063559532166,
"rewards/margins": 1.0103065967559814,
"rewards/rejected": -0.9664285778999329,
"step": 510
},
{
"epoch": 0.4043179587831207,
"grad_norm": 4.543951511383057,
"learning_rate": 1.1930926216640503e-05,
"logits/chosen": -2.979218006134033,
"logits/rejected": -2.968158483505249,
"logps/chosen": -286.9298400878906,
"logps/rejected": -313.0599060058594,
"loss": 0.5016,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.1237020492553711,
"rewards/margins": 0.9319057464599609,
"rewards/rejected": -1.055607795715332,
"step": 515
},
{
"epoch": 0.408243375858685,
"grad_norm": 5.305350303649902,
"learning_rate": 1.1852433281004712e-05,
"logits/chosen": -2.945699691772461,
"logits/rejected": -2.975595474243164,
"logps/chosen": -313.67431640625,
"logps/rejected": -317.87261962890625,
"loss": 0.4597,
"rewards/accuracies": 0.7625000476837158,
"rewards/chosen": 0.10118236392736435,
"rewards/margins": 1.1309287548065186,
"rewards/rejected": -1.0297462940216064,
"step": 520
},
{
"epoch": 0.41216879293424924,
"grad_norm": 4.5991692543029785,
"learning_rate": 1.1773940345368918e-05,
"logits/chosen": -2.9673571586608887,
"logits/rejected": -2.9875621795654297,
"logps/chosen": -318.8524475097656,
"logps/rejected": -301.4873046875,
"loss": 0.4501,
"rewards/accuracies": 0.79583340883255,
"rewards/chosen": -0.11968664824962616,
"rewards/margins": 1.2507131099700928,
"rewards/rejected": -1.370399832725525,
"step": 525
},
{
"epoch": 0.41609421000981356,
"grad_norm": 4.974186420440674,
"learning_rate": 1.1695447409733125e-05,
"logits/chosen": -2.9936716556549072,
"logits/rejected": -3.038217544555664,
"logps/chosen": -349.4075927734375,
"logps/rejected": -314.17413330078125,
"loss": 0.5153,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -0.30208900570869446,
"rewards/margins": 1.0824108123779297,
"rewards/rejected": -1.3844999074935913,
"step": 530
},
{
"epoch": 0.4200196270853778,
"grad_norm": 4.36262845993042,
"learning_rate": 1.1616954474097332e-05,
"logits/chosen": -3.0161938667297363,
"logits/rejected": -3.0261764526367188,
"logps/chosen": -342.5355529785156,
"logps/rejected": -355.38604736328125,
"loss": 0.4833,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.37566858530044556,
"rewards/margins": 1.1315479278564453,
"rewards/rejected": -1.5072165727615356,
"step": 535
},
{
"epoch": 0.4239450441609421,
"grad_norm": 4.01798677444458,
"learning_rate": 1.1538461538461538e-05,
"logits/chosen": -3.040135145187378,
"logits/rejected": -3.107337474822998,
"logps/chosen": -312.9956970214844,
"logps/rejected": -323.22760009765625,
"loss": 0.4472,
"rewards/accuracies": 0.7916667461395264,
"rewards/chosen": -0.5123935341835022,
"rewards/margins": 1.0743831396102905,
"rewards/rejected": -1.5867767333984375,
"step": 540
},
{
"epoch": 0.4278704612365064,
"grad_norm": 5.212751388549805,
"learning_rate": 1.1459968602825747e-05,
"logits/chosen": -3.0182480812072754,
"logits/rejected": -3.0287539958953857,
"logps/chosen": -332.08709716796875,
"logps/rejected": -337.66424560546875,
"loss": 0.5333,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": -0.5322802066802979,
"rewards/margins": 1.01456618309021,
"rewards/rejected": -1.546846628189087,
"step": 545
},
{
"epoch": 0.43179587831207067,
"grad_norm": 4.944022178649902,
"learning_rate": 1.1381475667189954e-05,
"logits/chosen": -2.9780099391937256,
"logits/rejected": -3.069441080093384,
"logps/chosen": -343.2859191894531,
"logps/rejected": -345.21441650390625,
"loss": 0.5462,
"rewards/accuracies": 0.7041666507720947,
"rewards/chosen": -0.5427877306938171,
"rewards/margins": 1.0927588939666748,
"rewards/rejected": -1.6355466842651367,
"step": 550
},
{
"epoch": 0.43572129538763493,
"grad_norm": 3.889643669128418,
"learning_rate": 1.1302982731554161e-05,
"logits/chosen": -2.9730026721954346,
"logits/rejected": -3.022367477416992,
"logps/chosen": -324.47406005859375,
"logps/rejected": -326.12353515625,
"loss": 0.4898,
"rewards/accuracies": 0.7708333134651184,
"rewards/chosen": -0.42287206649780273,
"rewards/margins": 1.078303575515747,
"rewards/rejected": -1.5011756420135498,
"step": 555
},
{
"epoch": 0.4396467124631992,
"grad_norm": 5.789305210113525,
"learning_rate": 1.1224489795918367e-05,
"logits/chosen": -3.0275089740753174,
"logits/rejected": -3.048832654953003,
"logps/chosen": -327.5396423339844,
"logps/rejected": -334.8791198730469,
"loss": 0.4796,
"rewards/accuracies": 0.7625000476837158,
"rewards/chosen": -0.35886624455451965,
"rewards/margins": 1.1369972229003906,
"rewards/rejected": -1.4958635568618774,
"step": 560
},
{
"epoch": 0.4435721295387635,
"grad_norm": 4.372865676879883,
"learning_rate": 1.1145996860282576e-05,
"logits/chosen": -3.0532193183898926,
"logits/rejected": -3.059150457382202,
"logps/chosen": -312.3909606933594,
"logps/rejected": -333.62982177734375,
"loss": 0.5414,
"rewards/accuracies": 0.7000001072883606,
"rewards/chosen": -0.38431116938591003,
"rewards/margins": 0.9614561796188354,
"rewards/rejected": -1.3457673788070679,
"step": 565
},
{
"epoch": 0.4474975466143278,
"grad_norm": 4.2039642333984375,
"learning_rate": 1.1067503924646783e-05,
"logits/chosen": -2.9767494201660156,
"logits/rejected": -3.029498338699341,
"logps/chosen": -304.53497314453125,
"logps/rejected": -317.2914733886719,
"loss": 0.4874,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.24222330749034882,
"rewards/margins": 1.0617117881774902,
"rewards/rejected": -1.303935170173645,
"step": 570
},
{
"epoch": 0.45142296368989204,
"grad_norm": 4.793886661529541,
"learning_rate": 1.098901098901099e-05,
"logits/chosen": -3.0597524642944336,
"logits/rejected": -3.086867332458496,
"logps/chosen": -341.6616516113281,
"logps/rejected": -336.95037841796875,
"loss": 0.4886,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.08540080487728119,
"rewards/margins": 1.0467342138290405,
"rewards/rejected": -1.132135033607483,
"step": 575
},
{
"epoch": 0.4553483807654563,
"grad_norm": 14.698615074157715,
"learning_rate": 1.0910518053375196e-05,
"logits/chosen": -3.0199100971221924,
"logits/rejected": -3.032982349395752,
"logps/chosen": -356.33404541015625,
"logps/rejected": -344.212890625,
"loss": 0.459,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": 0.0503697507083416,
"rewards/margins": 1.1821739673614502,
"rewards/rejected": -1.1318042278289795,
"step": 580
},
{
"epoch": 0.4592737978410206,
"grad_norm": 3.817831516265869,
"learning_rate": 1.0832025117739405e-05,
"logits/chosen": -3.019148349761963,
"logits/rejected": -2.993727207183838,
"logps/chosen": -329.00787353515625,
"logps/rejected": -309.19903564453125,
"loss": 0.4704,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": 0.019623804837465286,
"rewards/margins": 1.1090670824050903,
"rewards/rejected": -1.0894432067871094,
"step": 585
},
{
"epoch": 0.4631992149165849,
"grad_norm": 4.845146656036377,
"learning_rate": 1.0753532182103612e-05,
"logits/chosen": -3.0271801948547363,
"logits/rejected": -3.024869441986084,
"logps/chosen": -352.64697265625,
"logps/rejected": -353.69305419921875,
"loss": 0.494,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.22464075684547424,
"rewards/margins": 1.1474275588989258,
"rewards/rejected": -1.3720684051513672,
"step": 590
},
{
"epoch": 0.46712463199214915,
"grad_norm": 4.333688735961914,
"learning_rate": 1.067503924646782e-05,
"logits/chosen": -3.0770018100738525,
"logits/rejected": -3.119150400161743,
"logps/chosen": -354.69439697265625,
"logps/rejected": -327.85791015625,
"loss": 0.456,
"rewards/accuracies": 0.7791666388511658,
"rewards/chosen": -0.06289488822221756,
"rewards/margins": 1.210386037826538,
"rewards/rejected": -1.2732809782028198,
"step": 595
},
{
"epoch": 0.47105004906771347,
"grad_norm": 4.578314304351807,
"learning_rate": 1.0596546310832025e-05,
"logits/chosen": -3.0018227100372314,
"logits/rejected": -3.0865108966827393,
"logps/chosen": -311.1248779296875,
"logps/rejected": -309.74273681640625,
"loss": 0.4818,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -0.05175580456852913,
"rewards/margins": 1.112265944480896,
"rewards/rejected": -1.1640217304229736,
"step": 600
},
{
"epoch": 0.47105004906771347,
"eval_logits/chosen": -3.0338661670684814,
"eval_logits/rejected": -3.0616049766540527,
"eval_logps/chosen": -330.5699157714844,
"eval_logps/rejected": -324.86346435546875,
"eval_loss": 0.49363288283348083,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": -0.1206398755311966,
"eval_rewards/margins": 1.1228933334350586,
"eval_rewards/rejected": -1.2435332536697388,
"eval_runtime": 171.2479,
"eval_samples_per_second": 11.679,
"eval_steps_per_second": 5.839,
"step": 600
},
{
"epoch": 0.47497546614327774,
"grad_norm": 5.259768009185791,
"learning_rate": 1.0518053375196234e-05,
"logits/chosen": -3.009653091430664,
"logits/rejected": -3.0764992237091064,
"logps/chosen": -346.80059814453125,
"logps/rejected": -324.3446350097656,
"loss": 0.5004,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": -0.03653601557016373,
"rewards/margins": 1.1117885112762451,
"rewards/rejected": -1.1483246088027954,
"step": 605
},
{
"epoch": 0.478900883218842,
"grad_norm": 6.143235206604004,
"learning_rate": 1.0439560439560441e-05,
"logits/chosen": -2.989396810531616,
"logits/rejected": -3.0582377910614014,
"logps/chosen": -337.9186096191406,
"logps/rejected": -332.8787841796875,
"loss": 0.575,
"rewards/accuracies": 0.6875000596046448,
"rewards/chosen": -0.15022191405296326,
"rewards/margins": 1.013187050819397,
"rewards/rejected": -1.163408875465393,
"step": 610
},
{
"epoch": 0.48282630029440626,
"grad_norm": 5.3697590827941895,
"learning_rate": 1.0361067503924647e-05,
"logits/chosen": -3.028064727783203,
"logits/rejected": -3.033423662185669,
"logps/chosen": -343.3203430175781,
"logps/rejected": -327.01202392578125,
"loss": 0.4706,
"rewards/accuracies": 0.7791666984558105,
"rewards/chosen": 0.030805181711912155,
"rewards/margins": 1.2250150442123413,
"rewards/rejected": -1.1942098140716553,
"step": 615
},
{
"epoch": 0.4867517173699706,
"grad_norm": 3.8416597843170166,
"learning_rate": 1.0282574568288854e-05,
"logits/chosen": -2.909081220626831,
"logits/rejected": -2.975391387939453,
"logps/chosen": -323.62725830078125,
"logps/rejected": -332.06524658203125,
"loss": 0.4792,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": -0.17823012173175812,
"rewards/margins": 1.1644293069839478,
"rewards/rejected": -1.3426594734191895,
"step": 620
},
{
"epoch": 0.49067713444553485,
"grad_norm": 5.597027778625488,
"learning_rate": 1.0204081632653063e-05,
"logits/chosen": -3.052537441253662,
"logits/rejected": -3.0111711025238037,
"logps/chosen": -319.8970642089844,
"logps/rejected": -322.13671875,
"loss": 0.5098,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.31647107005119324,
"rewards/margins": 1.0438673496246338,
"rewards/rejected": -1.3603384494781494,
"step": 625
},
{
"epoch": 0.4946025515210991,
"grad_norm": 5.787623405456543,
"learning_rate": 1.012558869701727e-05,
"logits/chosen": -2.932849407196045,
"logits/rejected": -3.022317886352539,
"logps/chosen": -350.6145324707031,
"logps/rejected": -343.1180114746094,
"loss": 0.5072,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": -0.10851552337408066,
"rewards/margins": 1.0356271266937256,
"rewards/rejected": -1.144142746925354,
"step": 630
},
{
"epoch": 0.4985279685966634,
"grad_norm": 6.2099385261535645,
"learning_rate": 1.0047095761381476e-05,
"logits/chosen": -2.9498133659362793,
"logits/rejected": -2.9196791648864746,
"logps/chosen": -315.73150634765625,
"logps/rejected": -315.6263427734375,
"loss": 0.5515,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.16974535584449768,
"rewards/margins": 0.9117358326911926,
"rewards/rejected": -1.0814812183380127,
"step": 635
},
{
"epoch": 0.5024533856722276,
"grad_norm": 4.4105448722839355,
"learning_rate": 9.968602825745683e-06,
"logits/chosen": -2.970156192779541,
"logits/rejected": -2.9927220344543457,
"logps/chosen": -296.1843566894531,
"logps/rejected": -292.82244873046875,
"loss": 0.5219,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.09649379551410675,
"rewards/margins": 0.9780260324478149,
"rewards/rejected": -1.0745197534561157,
"step": 640
},
{
"epoch": 0.5063788027477919,
"grad_norm": 4.673651218414307,
"learning_rate": 9.890109890109892e-06,
"logits/chosen": -3.0400826930999756,
"logits/rejected": -3.0831387042999268,
"logps/chosen": -339.0031433105469,
"logps/rejected": -329.25152587890625,
"loss": 0.5512,
"rewards/accuracies": 0.7041667699813843,
"rewards/chosen": -0.14476463198661804,
"rewards/margins": 0.8560221791267395,
"rewards/rejected": -1.0007867813110352,
"step": 645
},
{
"epoch": 0.5103042198233563,
"grad_norm": 4.254279613494873,
"learning_rate": 9.811616954474098e-06,
"logits/chosen": -3.0292727947235107,
"logits/rejected": -3.0746917724609375,
"logps/chosen": -313.74591064453125,
"logps/rejected": -302.05279541015625,
"loss": 0.5132,
"rewards/accuracies": 0.7166666388511658,
"rewards/chosen": -0.06440563499927521,
"rewards/margins": 1.017377257347107,
"rewards/rejected": -1.0817829370498657,
"step": 650
},
{
"epoch": 0.5142296368989205,
"grad_norm": 4.169983863830566,
"learning_rate": 9.733124018838307e-06,
"logits/chosen": -2.944243907928467,
"logits/rejected": -3.0422754287719727,
"logps/chosen": -288.42559814453125,
"logps/rejected": -296.96026611328125,
"loss": 0.4622,
"rewards/accuracies": 0.7958333492279053,
"rewards/chosen": -0.060607265681028366,
"rewards/margins": 1.0034582614898682,
"rewards/rejected": -1.0640655755996704,
"step": 655
},
{
"epoch": 0.5181550539744848,
"grad_norm": 4.819730281829834,
"learning_rate": 9.654631083202512e-06,
"logits/chosen": -2.9609310626983643,
"logits/rejected": -3.0102219581604004,
"logps/chosen": -329.1315002441406,
"logps/rejected": -305.99737548828125,
"loss": 0.5414,
"rewards/accuracies": 0.6958333849906921,
"rewards/chosen": -0.272332102060318,
"rewards/margins": 0.9030130505561829,
"rewards/rejected": -1.1753450632095337,
"step": 660
},
{
"epoch": 0.5220804710500491,
"grad_norm": 4.765251159667969,
"learning_rate": 9.576138147566721e-06,
"logits/chosen": -2.911606550216675,
"logits/rejected": -3.0551559925079346,
"logps/chosen": -372.60357666015625,
"logps/rejected": -350.260498046875,
"loss": 0.5388,
"rewards/accuracies": 0.7458333969116211,
"rewards/chosen": -0.2132304608821869,
"rewards/margins": 0.8807324171066284,
"rewards/rejected": -1.0939629077911377,
"step": 665
},
{
"epoch": 0.5260058881256133,
"grad_norm": 4.679354667663574,
"learning_rate": 9.497645211930927e-06,
"logits/chosen": -2.996860980987549,
"logits/rejected": -2.969712734222412,
"logps/chosen": -329.93768310546875,
"logps/rejected": -349.2161560058594,
"loss": 0.5549,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": -0.19874221086502075,
"rewards/margins": 0.9041234254837036,
"rewards/rejected": -1.1028656959533691,
"step": 670
},
{
"epoch": 0.5299313052011776,
"grad_norm": 5.291715621948242,
"learning_rate": 9.419152276295134e-06,
"logits/chosen": -3.0396275520324707,
"logits/rejected": -3.0426812171936035,
"logps/chosen": -360.34552001953125,
"logps/rejected": -345.2974853515625,
"loss": 0.561,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": -0.31488001346588135,
"rewards/margins": 0.8902130126953125,
"rewards/rejected": -1.2050931453704834,
"step": 675
},
{
"epoch": 0.5338567222767419,
"grad_norm": 4.548229694366455,
"learning_rate": 9.340659340659341e-06,
"logits/chosen": -2.9637022018432617,
"logits/rejected": -2.995466947555542,
"logps/chosen": -349.22283935546875,
"logps/rejected": -336.8186340332031,
"loss": 0.5911,
"rewards/accuracies": 0.67083340883255,
"rewards/chosen": -0.22216463088989258,
"rewards/margins": 0.75648432970047,
"rewards/rejected": -0.978648841381073,
"step": 680
},
{
"epoch": 0.5377821393523062,
"grad_norm": 3.729768753051758,
"learning_rate": 9.262166405023548e-06,
"logits/chosen": -2.9287047386169434,
"logits/rejected": -2.999833106994629,
"logps/chosen": -338.10162353515625,
"logps/rejected": -330.71282958984375,
"loss": 0.4436,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": 0.11126607656478882,
"rewards/margins": 1.1149643659591675,
"rewards/rejected": -1.003698468208313,
"step": 685
},
{
"epoch": 0.5417075564278705,
"grad_norm": 3.5768096446990967,
"learning_rate": 9.183673469387756e-06,
"logits/chosen": -3.034193754196167,
"logits/rejected": -3.030886650085449,
"logps/chosen": -336.0780944824219,
"logps/rejected": -328.44195556640625,
"loss": 0.4591,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": 0.15392692387104034,
"rewards/margins": 1.142258882522583,
"rewards/rejected": -0.9883320927619934,
"step": 690
},
{
"epoch": 0.5456329735034348,
"grad_norm": 4.376099109649658,
"learning_rate": 9.105180533751963e-06,
"logits/chosen": -2.9962871074676514,
"logits/rejected": -3.0676894187927246,
"logps/chosen": -312.1530456542969,
"logps/rejected": -304.1636657714844,
"loss": 0.4686,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.018533676862716675,
"rewards/margins": 1.1388862133026123,
"rewards/rejected": -1.1574198007583618,
"step": 695
},
{
"epoch": 0.549558390578999,
"grad_norm": 5.068153381347656,
"learning_rate": 9.02668759811617e-06,
"logits/chosen": -3.0012753009796143,
"logits/rejected": -2.9406380653381348,
"logps/chosen": -334.9573059082031,
"logps/rejected": -336.917724609375,
"loss": 0.5127,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.1398693174123764,
"rewards/margins": 1.0154725313186646,
"rewards/rejected": -1.1553419828414917,
"step": 700
},
{
"epoch": 0.549558390578999,
"eval_logits/chosen": -3.029242753982544,
"eval_logits/rejected": -3.0569090843200684,
"eval_logps/chosen": -332.35626220703125,
"eval_logps/rejected": -325.6695861816406,
"eval_loss": 0.49267128109931946,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -0.29927709698677063,
"eval_rewards/margins": 1.0248706340789795,
"eval_rewards/rejected": -1.3241477012634277,
"eval_runtime": 170.0257,
"eval_samples_per_second": 11.763,
"eval_steps_per_second": 5.881,
"step": 700
},
{
"epoch": 0.5534838076545633,
"grad_norm": 5.191127777099609,
"learning_rate": 8.948194662480377e-06,
"logits/chosen": -2.9839444160461426,
"logits/rejected": -3.006364345550537,
"logps/chosen": -337.6143798828125,
"logps/rejected": -352.8746643066406,
"loss": 0.5361,
"rewards/accuracies": 0.7125000357627869,
"rewards/chosen": -0.30696621537208557,
"rewards/margins": 1.0262271165847778,
"rewards/rejected": -1.3331931829452515,
"step": 705
},
{
"epoch": 0.5574092247301276,
"grad_norm": 3.932312250137329,
"learning_rate": 8.869701726844585e-06,
"logits/chosen": -2.9533979892730713,
"logits/rejected": -3.0565037727355957,
"logps/chosen": -316.4660949707031,
"logps/rejected": -302.0240783691406,
"loss": 0.4628,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.23080229759216309,
"rewards/margins": 1.1649229526519775,
"rewards/rejected": -1.3957254886627197,
"step": 710
},
{
"epoch": 0.5613346418056918,
"grad_norm": 6.213256359100342,
"learning_rate": 8.791208791208792e-06,
"logits/chosen": -2.9602789878845215,
"logits/rejected": -2.960151195526123,
"logps/chosen": -283.5656433105469,
"logps/rejected": -263.10260009765625,
"loss": 0.4912,
"rewards/accuracies": 0.7458332777023315,
"rewards/chosen": -0.20947471261024475,
"rewards/margins": 1.0615582466125488,
"rewards/rejected": -1.2710330486297607,
"step": 715
},
{
"epoch": 0.5652600588812562,
"grad_norm": 3.982071876525879,
"learning_rate": 8.712715855573e-06,
"logits/chosen": -2.966439723968506,
"logits/rejected": -3.0421411991119385,
"logps/chosen": -328.9455261230469,
"logps/rejected": -314.2897644042969,
"loss": 0.4648,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.02404719963669777,
"rewards/margins": 1.0856393575668335,
"rewards/rejected": -1.1096864938735962,
"step": 720
},
{
"epoch": 0.5691854759568205,
"grad_norm": 5.195390701293945,
"learning_rate": 8.634222919937206e-06,
"logits/chosen": -3.0160458087921143,
"logits/rejected": -3.0692107677459717,
"logps/chosen": -332.7279357910156,
"logps/rejected": -336.43927001953125,
"loss": 0.5002,
"rewards/accuracies": 0.7291666269302368,
"rewards/chosen": -0.19675110280513763,
"rewards/margins": 0.985478401184082,
"rewards/rejected": -1.1822296380996704,
"step": 725
},
{
"epoch": 0.5731108930323847,
"grad_norm": 4.930713653564453,
"learning_rate": 8.555729984301414e-06,
"logits/chosen": -3.0057716369628906,
"logits/rejected": -3.0269782543182373,
"logps/chosen": -306.15875244140625,
"logps/rejected": -305.45416259765625,
"loss": 0.4791,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.07756291329860687,
"rewards/margins": 1.0487645864486694,
"rewards/rejected": -1.1263275146484375,
"step": 730
},
{
"epoch": 0.577036310107949,
"grad_norm": 3.645521640777588,
"learning_rate": 8.477237048665621e-06,
"logits/chosen": -3.024445056915283,
"logits/rejected": -3.0869052410125732,
"logps/chosen": -303.17327880859375,
"logps/rejected": -301.08905029296875,
"loss": 0.4716,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": -0.038672782480716705,
"rewards/margins": 1.1686241626739502,
"rewards/rejected": -1.2072969675064087,
"step": 735
},
{
"epoch": 0.5809617271835132,
"grad_norm": 4.946695327758789,
"learning_rate": 8.398744113029828e-06,
"logits/chosen": -3.0039095878601074,
"logits/rejected": -3.015929698944092,
"logps/chosen": -322.4432373046875,
"logps/rejected": -309.04107666015625,
"loss": 0.5134,
"rewards/accuracies": 0.7458333373069763,
"rewards/chosen": -0.12716850638389587,
"rewards/margins": 0.9853676557540894,
"rewards/rejected": -1.1125361919403076,
"step": 740
},
{
"epoch": 0.5848871442590775,
"grad_norm": 5.070699214935303,
"learning_rate": 8.320251177394036e-06,
"logits/chosen": -3.0126490592956543,
"logits/rejected": -3.098520278930664,
"logps/chosen": -357.4333190917969,
"logps/rejected": -332.8435363769531,
"loss": 0.5058,
"rewards/accuracies": 0.7625000476837158,
"rewards/chosen": -0.04318712279200554,
"rewards/margins": 1.034911036491394,
"rewards/rejected": -1.078098177909851,
"step": 745
},
{
"epoch": 0.5888125613346418,
"grad_norm": 3.6236932277679443,
"learning_rate": 8.241758241758243e-06,
"logits/chosen": -2.9999072551727295,
"logits/rejected": -2.990403413772583,
"logps/chosen": -357.82733154296875,
"logps/rejected": -326.899169921875,
"loss": 0.4588,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.11650122702121735,
"rewards/margins": 1.1574772596359253,
"rewards/rejected": -1.2739784717559814,
"step": 750
},
{
"epoch": 0.592737978410206,
"grad_norm": 4.285853862762451,
"learning_rate": 8.16326530612245e-06,
"logits/chosen": -3.0003159046173096,
"logits/rejected": -3.048569679260254,
"logps/chosen": -349.8592834472656,
"logps/rejected": -304.07427978515625,
"loss": 0.4784,
"rewards/accuracies": 0.7750000953674316,
"rewards/chosen": -0.07542826980352402,
"rewards/margins": 1.2488701343536377,
"rewards/rejected": -1.32429838180542,
"step": 755
},
{
"epoch": 0.5966633954857704,
"grad_norm": 4.365904331207275,
"learning_rate": 8.084772370486657e-06,
"logits/chosen": -3.0357697010040283,
"logits/rejected": -3.12182879447937,
"logps/chosen": -317.8271484375,
"logps/rejected": -298.142578125,
"loss": 0.4855,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.06173648685216904,
"rewards/margins": 1.079683542251587,
"rewards/rejected": -1.1414198875427246,
"step": 760
},
{
"epoch": 0.6005888125613347,
"grad_norm": 4.72523307800293,
"learning_rate": 8.006279434850865e-06,
"logits/chosen": -3.0452723503112793,
"logits/rejected": -3.0594534873962402,
"logps/chosen": -317.8534240722656,
"logps/rejected": -329.19976806640625,
"loss": 0.5079,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -0.3027920126914978,
"rewards/margins": 1.1086828708648682,
"rewards/rejected": -1.4114749431610107,
"step": 765
},
{
"epoch": 0.6045142296368989,
"grad_norm": 4.864989757537842,
"learning_rate": 7.927786499215072e-06,
"logits/chosen": -3.0068793296813965,
"logits/rejected": -3.017885446548462,
"logps/chosen": -314.66876220703125,
"logps/rejected": -317.1419677734375,
"loss": 0.4965,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -0.18796098232269287,
"rewards/margins": 1.0385706424713135,
"rewards/rejected": -1.2265316247940063,
"step": 770
},
{
"epoch": 0.6084396467124632,
"grad_norm": 4.275363922119141,
"learning_rate": 7.849293563579279e-06,
"logits/chosen": -3.002530574798584,
"logits/rejected": -2.980320453643799,
"logps/chosen": -317.7269287109375,
"logps/rejected": -339.15997314453125,
"loss": 0.4468,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.04108821228146553,
"rewards/margins": 1.3123092651367188,
"rewards/rejected": -1.3533976078033447,
"step": 775
},
{
"epoch": 0.6123650637880275,
"grad_norm": 4.48534631729126,
"learning_rate": 7.770800627943486e-06,
"logits/chosen": -2.961935520172119,
"logits/rejected": -3.0330467224121094,
"logps/chosen": -370.4855041503906,
"logps/rejected": -328.0148010253906,
"loss": 0.5256,
"rewards/accuracies": 0.7791666984558105,
"rewards/chosen": 0.09827003628015518,
"rewards/margins": 1.1447842121124268,
"rewards/rejected": -1.0465141534805298,
"step": 780
},
{
"epoch": 0.6162904808635917,
"grad_norm": 4.550291061401367,
"learning_rate": 7.692307692307694e-06,
"logits/chosen": -3.0217397212982178,
"logits/rejected": -3.0674948692321777,
"logps/chosen": -317.39971923828125,
"logps/rejected": -297.3795471191406,
"loss": 0.5762,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.1348334103822708,
"rewards/margins": 0.9545001983642578,
"rewards/rejected": -1.0893336534500122,
"step": 785
},
{
"epoch": 0.620215897939156,
"grad_norm": 3.8520846366882324,
"learning_rate": 7.6138147566719e-06,
"logits/chosen": -3.0688512325286865,
"logits/rejected": -3.068896770477295,
"logps/chosen": -332.6806640625,
"logps/rejected": -327.0592956542969,
"loss": 0.4392,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.005734431557357311,
"rewards/margins": 1.1970218420028687,
"rewards/rejected": -1.2027562856674194,
"step": 790
},
{
"epoch": 0.6241413150147204,
"grad_norm": 5.522543907165527,
"learning_rate": 7.535321821036108e-06,
"logits/chosen": -3.0281357765197754,
"logits/rejected": -3.069322109222412,
"logps/chosen": -318.053466796875,
"logps/rejected": -313.0615234375,
"loss": 0.5247,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -0.1159566193819046,
"rewards/margins": 0.977625846862793,
"rewards/rejected": -1.0935826301574707,
"step": 795
},
{
"epoch": 0.6280667320902846,
"grad_norm": 5.020391941070557,
"learning_rate": 7.4568288854003145e-06,
"logits/chosen": -2.9681613445281982,
"logits/rejected": -2.948774814605713,
"logps/chosen": -342.89019775390625,
"logps/rejected": -317.9969482421875,
"loss": 0.4847,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -0.22335605323314667,
"rewards/margins": 1.1023824214935303,
"rewards/rejected": -1.3257384300231934,
"step": 800
},
{
"epoch": 0.6280667320902846,
"eval_logits/chosen": -3.0195696353912354,
"eval_logits/rejected": -3.047680139541626,
"eval_logps/chosen": -330.66943359375,
"eval_logps/rejected": -324.49725341796875,
"eval_loss": 0.489461213350296,
"eval_rewards/accuracies": 0.7534999847412109,
"eval_rewards/chosen": -0.13059695065021515,
"eval_rewards/margins": 1.0763192176818848,
"eval_rewards/rejected": -1.2069162130355835,
"eval_runtime": 170.5252,
"eval_samples_per_second": 11.728,
"eval_steps_per_second": 5.864,
"step": 800
},
{
"epoch": 0.6319921491658489,
"grad_norm": 3.8706929683685303,
"learning_rate": 7.378335949764521e-06,
"logits/chosen": -3.020859479904175,
"logits/rejected": -3.0139949321746826,
"logps/chosen": -333.51654052734375,
"logps/rejected": -324.34991455078125,
"loss": 0.5385,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": -0.24091720581054688,
"rewards/margins": 0.9881976246833801,
"rewards/rejected": -1.2291150093078613,
"step": 805
},
{
"epoch": 0.6359175662414132,
"grad_norm": 3.939385175704956,
"learning_rate": 7.299843014128729e-06,
"logits/chosen": -2.999584674835205,
"logits/rejected": -2.953639507293701,
"logps/chosen": -311.6091003417969,
"logps/rejected": -367.5775451660156,
"loss": 0.4001,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.0669659823179245,
"rewards/margins": 1.300567388534546,
"rewards/rejected": -1.3675333261489868,
"step": 810
},
{
"epoch": 0.6398429833169774,
"grad_norm": 3.8747594356536865,
"learning_rate": 7.2213500784929355e-06,
"logits/chosen": -2.9922289848327637,
"logits/rejected": -3.0901122093200684,
"logps/chosen": -351.9647216796875,
"logps/rejected": -324.533935546875,
"loss": 0.4751,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.19494260847568512,
"rewards/margins": 1.1178690195083618,
"rewards/rejected": -1.3128114938735962,
"step": 815
},
{
"epoch": 0.6437684003925417,
"grad_norm": 4.751869201660156,
"learning_rate": 7.1428571428571436e-06,
"logits/chosen": -3.0304911136627197,
"logits/rejected": -3.0640957355499268,
"logps/chosen": -350.48040771484375,
"logps/rejected": -304.4465026855469,
"loss": 0.4689,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.17158253490924835,
"rewards/margins": 1.1375932693481445,
"rewards/rejected": -1.309175968170166,
"step": 820
},
{
"epoch": 0.647693817468106,
"grad_norm": 3.528496503829956,
"learning_rate": 7.06436420722135e-06,
"logits/chosen": -2.954141139984131,
"logits/rejected": -3.0600366592407227,
"logps/chosen": -344.02496337890625,
"logps/rejected": -313.1765441894531,
"loss": 0.4165,
"rewards/accuracies": 0.7958333492279053,
"rewards/chosen": 0.023122036829590797,
"rewards/margins": 1.3192641735076904,
"rewards/rejected": -1.2961422204971313,
"step": 825
},
{
"epoch": 0.6516192345436702,
"grad_norm": 4.554298400878906,
"learning_rate": 6.985871271585558e-06,
"logits/chosen": -2.985323190689087,
"logits/rejected": -3.0694005489349365,
"logps/chosen": -347.07855224609375,
"logps/rejected": -321.8019714355469,
"loss": 0.4487,
"rewards/accuracies": 0.7833333015441895,
"rewards/chosen": 0.15593689680099487,
"rewards/margins": 1.35723078250885,
"rewards/rejected": -1.2012939453125,
"step": 830
},
{
"epoch": 0.6555446516192346,
"grad_norm": 4.962828159332275,
"learning_rate": 6.9073783359497645e-06,
"logits/chosen": -2.939275026321411,
"logits/rejected": -3.06274151802063,
"logps/chosen": -318.2911376953125,
"logps/rejected": -321.3827209472656,
"loss": 0.453,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.16927500069141388,
"rewards/margins": 1.2757118940353394,
"rewards/rejected": -1.106436848640442,
"step": 835
},
{
"epoch": 0.6594700686947988,
"grad_norm": 3.8393807411193848,
"learning_rate": 6.828885400313973e-06,
"logits/chosen": -2.948727607727051,
"logits/rejected": -2.9849319458007812,
"logps/chosen": -278.67950439453125,
"logps/rejected": -289.4703063964844,
"loss": 0.5496,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.003879111958667636,
"rewards/margins": 0.9358898997306824,
"rewards/rejected": -0.9397690892219543,
"step": 840
},
{
"epoch": 0.6633954857703631,
"grad_norm": 4.3185577392578125,
"learning_rate": 6.750392464678179e-06,
"logits/chosen": -2.9654245376586914,
"logits/rejected": -3.0434441566467285,
"logps/chosen": -327.6491394042969,
"logps/rejected": -321.876953125,
"loss": 0.4393,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -0.07706048339605331,
"rewards/margins": 1.2577455043792725,
"rewards/rejected": -1.3348058462142944,
"step": 845
},
{
"epoch": 0.6673209028459274,
"grad_norm": 4.293339252471924,
"learning_rate": 6.671899529042387e-06,
"logits/chosen": -3.1013782024383545,
"logits/rejected": -3.0577914714813232,
"logps/chosen": -321.20611572265625,
"logps/rejected": -330.765380859375,
"loss": 0.4861,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.0720919817686081,
"rewards/margins": 1.2457802295684814,
"rewards/rejected": -1.3178722858428955,
"step": 850
},
{
"epoch": 0.6712463199214916,
"grad_norm": 4.7395734786987305,
"learning_rate": 6.5934065934065935e-06,
"logits/chosen": -2.967613697052002,
"logits/rejected": -3.026918411254883,
"logps/chosen": -309.9462890625,
"logps/rejected": -322.9964904785156,
"loss": 0.5118,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": -0.34576496481895447,
"rewards/margins": 1.138620138168335,
"rewards/rejected": -1.4843851327896118,
"step": 855
},
{
"epoch": 0.6751717369970559,
"grad_norm": 4.396761417388916,
"learning_rate": 6.514913657770802e-06,
"logits/chosen": -2.872307300567627,
"logits/rejected": -2.9442195892333984,
"logps/chosen": -344.57275390625,
"logps/rejected": -361.8569030761719,
"loss": 0.4233,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.27561578154563904,
"rewards/margins": 1.3650743961334229,
"rewards/rejected": -1.6406902074813843,
"step": 860
},
{
"epoch": 0.6790971540726202,
"grad_norm": 5.042901039123535,
"learning_rate": 6.436420722135008e-06,
"logits/chosen": -2.9460458755493164,
"logits/rejected": -2.9742355346679688,
"logps/chosen": -318.40972900390625,
"logps/rejected": -338.845703125,
"loss": 0.5214,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -0.464036762714386,
"rewards/margins": 1.0700992345809937,
"rewards/rejected": -1.5341359376907349,
"step": 865
},
{
"epoch": 0.6830225711481845,
"grad_norm": 5.222243785858154,
"learning_rate": 6.357927786499215e-06,
"logits/chosen": -2.948620080947876,
"logits/rejected": -3.0497138500213623,
"logps/chosen": -319.1834716796875,
"logps/rejected": -319.8209228515625,
"loss": 0.4982,
"rewards/accuracies": 0.7708333730697632,
"rewards/chosen": -0.6082950830459595,
"rewards/margins": 1.108607530593872,
"rewards/rejected": -1.7169023752212524,
"step": 870
},
{
"epoch": 0.6869479882237488,
"grad_norm": 4.216038703918457,
"learning_rate": 6.279434850863423e-06,
"logits/chosen": -2.9881601333618164,
"logits/rejected": -2.9443399906158447,
"logps/chosen": -347.36181640625,
"logps/rejected": -356.3033447265625,
"loss": 0.4577,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.43880724906921387,
"rewards/margins": 1.2593494653701782,
"rewards/rejected": -1.698156714439392,
"step": 875
},
{
"epoch": 0.6908734052993131,
"grad_norm": 4.496317386627197,
"learning_rate": 6.20094191522763e-06,
"logits/chosen": -2.937516450881958,
"logits/rejected": -2.9816195964813232,
"logps/chosen": -338.54888916015625,
"logps/rejected": -316.7308654785156,
"loss": 0.4974,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -0.38299983739852905,
"rewards/margins": 1.1357471942901611,
"rewards/rejected": -1.518747091293335,
"step": 880
},
{
"epoch": 0.6947988223748773,
"grad_norm": 5.088541507720947,
"learning_rate": 6.122448979591837e-06,
"logits/chosen": -2.9346837997436523,
"logits/rejected": -3.0072033405303955,
"logps/chosen": -323.017578125,
"logps/rejected": -321.0302429199219,
"loss": 0.5165,
"rewards/accuracies": 0.7333334684371948,
"rewards/chosen": -0.16942360997200012,
"rewards/margins": 1.0610682964324951,
"rewards/rejected": -1.2304918766021729,
"step": 885
},
{
"epoch": 0.6987242394504416,
"grad_norm": 5.229645729064941,
"learning_rate": 6.043956043956044e-06,
"logits/chosen": -3.0896613597869873,
"logits/rejected": -3.1020538806915283,
"logps/chosen": -340.7657775878906,
"logps/rejected": -308.33831787109375,
"loss": 0.4848,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -0.14006583392620087,
"rewards/margins": 1.063689947128296,
"rewards/rejected": -1.2037558555603027,
"step": 890
},
{
"epoch": 0.7026496565260059,
"grad_norm": 4.50402307510376,
"learning_rate": 5.965463108320252e-06,
"logits/chosen": -2.930692195892334,
"logits/rejected": -3.0338551998138428,
"logps/chosen": -322.0299072265625,
"logps/rejected": -337.92059326171875,
"loss": 0.4844,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -0.06006438657641411,
"rewards/margins": 1.1535618305206299,
"rewards/rejected": -1.2136261463165283,
"step": 895
},
{
"epoch": 0.7065750736015701,
"grad_norm": 3.8728554248809814,
"learning_rate": 5.886970172684459e-06,
"logits/chosen": -2.997563123703003,
"logits/rejected": -3.0621156692504883,
"logps/chosen": -320.8427429199219,
"logps/rejected": -323.98663330078125,
"loss": 0.5245,
"rewards/accuracies": 0.7291667461395264,
"rewards/chosen": -0.022658739238977432,
"rewards/margins": 1.0391814708709717,
"rewards/rejected": -1.061840295791626,
"step": 900
},
{
"epoch": 0.7065750736015701,
"eval_logits/chosen": -3.025604724884033,
"eval_logits/rejected": -3.0542104244232178,
"eval_logps/chosen": -331.0598449707031,
"eval_logps/rejected": -325.0303955078125,
"eval_loss": 0.4869418144226074,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": -0.16963602602481842,
"eval_rewards/margins": 1.0905920267105103,
"eval_rewards/rejected": -1.2602282762527466,
"eval_runtime": 171.3633,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 5.836,
"step": 900
},
{
"epoch": 0.7105004906771345,
"grad_norm": 4.844978332519531,
"learning_rate": 5.808477237048666e-06,
"logits/chosen": -3.054149627685547,
"logits/rejected": -3.120988368988037,
"logps/chosen": -354.59063720703125,
"logps/rejected": -327.55731201171875,
"loss": 0.5112,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.17106744647026062,
"rewards/margins": 1.1445057392120361,
"rewards/rejected": -1.3155733346939087,
"step": 905
},
{
"epoch": 0.7144259077526988,
"grad_norm": 3.322969675064087,
"learning_rate": 5.729984301412873e-06,
"logits/chosen": -2.979074716567993,
"logits/rejected": -3.056349277496338,
"logps/chosen": -341.1482238769531,
"logps/rejected": -330.65411376953125,
"loss": 0.4561,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.04363600164651871,
"rewards/margins": 1.232508659362793,
"rewards/rejected": -1.2761447429656982,
"step": 910
},
{
"epoch": 0.718351324828263,
"grad_norm": 5.024198532104492,
"learning_rate": 5.651491365777081e-06,
"logits/chosen": -3.0509443283081055,
"logits/rejected": -2.996936321258545,
"logps/chosen": -318.218017578125,
"logps/rejected": -317.7236022949219,
"loss": 0.4975,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.0626036748290062,
"rewards/margins": 1.1322224140167236,
"rewards/rejected": -1.1948261260986328,
"step": 915
},
{
"epoch": 0.7222767419038273,
"grad_norm": 4.653200149536133,
"learning_rate": 5.572998430141288e-06,
"logits/chosen": -3.0015101432800293,
"logits/rejected": -3.004070520401001,
"logps/chosen": -353.28912353515625,
"logps/rejected": -330.3291320800781,
"loss": 0.4619,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.03766501322388649,
"rewards/margins": 1.2282439470291138,
"rewards/rejected": -1.2659088373184204,
"step": 920
},
{
"epoch": 0.7262021589793916,
"grad_norm": 4.00246524810791,
"learning_rate": 5.494505494505495e-06,
"logits/chosen": -2.997545003890991,
"logits/rejected": -3.094526767730713,
"logps/chosen": -299.0417785644531,
"logps/rejected": -316.5038146972656,
"loss": 0.4866,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -0.06449007242918015,
"rewards/margins": 1.0651543140411377,
"rewards/rejected": -1.1296443939208984,
"step": 925
},
{
"epoch": 0.7301275760549558,
"grad_norm": 4.44005823135376,
"learning_rate": 5.4160125588697024e-06,
"logits/chosen": -3.0456299781799316,
"logits/rejected": -3.024275541305542,
"logps/chosen": -305.1586608886719,
"logps/rejected": -291.7957763671875,
"loss": 0.4836,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -0.02189583331346512,
"rewards/margins": 0.9891737103462219,
"rewards/rejected": -1.011069655418396,
"step": 930
},
{
"epoch": 0.7340529931305201,
"grad_norm": 5.462418556213379,
"learning_rate": 5.33751962323391e-06,
"logits/chosen": -2.901125907897949,
"logits/rejected": -2.9909684658050537,
"logps/chosen": -326.17791748046875,
"logps/rejected": -334.52166748046875,
"loss": 0.5115,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.09602449834346771,
"rewards/margins": 0.9841717481613159,
"rewards/rejected": -1.0801963806152344,
"step": 935
},
{
"epoch": 0.7379784102060843,
"grad_norm": 4.357146739959717,
"learning_rate": 5.259026687598117e-06,
"logits/chosen": -3.0265889167785645,
"logits/rejected": -3.0476319789886475,
"logps/chosen": -298.66705322265625,
"logps/rejected": -311.0827331542969,
"loss": 0.4936,
"rewards/accuracies": 0.7625000476837158,
"rewards/chosen": -0.18625633418560028,
"rewards/margins": 0.9413064122200012,
"rewards/rejected": -1.1275627613067627,
"step": 940
},
{
"epoch": 0.7419038272816487,
"grad_norm": 5.111855983734131,
"learning_rate": 5.180533751962323e-06,
"logits/chosen": -3.0872814655303955,
"logits/rejected": -3.0954253673553467,
"logps/chosen": -331.3386535644531,
"logps/rejected": -317.2918701171875,
"loss": 0.5034,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.17156757414340973,
"rewards/margins": 1.000931978225708,
"rewards/rejected": -1.172499418258667,
"step": 945
},
{
"epoch": 0.745829244357213,
"grad_norm": 5.880111217498779,
"learning_rate": 5.1020408163265315e-06,
"logits/chosen": -2.9598050117492676,
"logits/rejected": -3.0074758529663086,
"logps/chosen": -311.2569274902344,
"logps/rejected": -295.6765441894531,
"loss": 0.5429,
"rewards/accuracies": 0.6750000715255737,
"rewards/chosen": -0.17758509516716003,
"rewards/margins": 0.9686153531074524,
"rewards/rejected": -1.14620041847229,
"step": 950
},
{
"epoch": 0.7497546614327772,
"grad_norm": 4.473018169403076,
"learning_rate": 5.023547880690738e-06,
"logits/chosen": -2.981584072113037,
"logits/rejected": -3.0066146850585938,
"logps/chosen": -322.27655029296875,
"logps/rejected": -333.5732421875,
"loss": 0.4551,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.10117676109075546,
"rewards/margins": 1.1731947660446167,
"rewards/rejected": -1.2743713855743408,
"step": 955
},
{
"epoch": 0.7536800785083415,
"grad_norm": 5.427559852600098,
"learning_rate": 4.945054945054946e-06,
"logits/chosen": -3.038059949874878,
"logits/rejected": -3.0491480827331543,
"logps/chosen": -304.99285888671875,
"logps/rejected": -331.29669189453125,
"loss": 0.5048,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": -0.1531282365322113,
"rewards/margins": 1.0113624334335327,
"rewards/rejected": -1.1644906997680664,
"step": 960
},
{
"epoch": 0.7576054955839058,
"grad_norm": 5.369307994842529,
"learning_rate": 4.866562009419153e-06,
"logits/chosen": -3.041508197784424,
"logits/rejected": -3.017305374145508,
"logps/chosen": -309.01409912109375,
"logps/rejected": -324.8525390625,
"loss": 0.468,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -0.14672012627124786,
"rewards/margins": 1.1099258661270142,
"rewards/rejected": -1.256645917892456,
"step": 965
},
{
"epoch": 0.76153091265947,
"grad_norm": 4.683850288391113,
"learning_rate": 4.7880690737833605e-06,
"logits/chosen": -3.0513949394226074,
"logits/rejected": -3.1184747219085693,
"logps/chosen": -349.2635192871094,
"logps/rejected": -324.3399963378906,
"loss": 0.54,
"rewards/accuracies": 0.7125000357627869,
"rewards/chosen": -0.11181743443012238,
"rewards/margins": 0.9319450259208679,
"rewards/rejected": -1.0437625646591187,
"step": 970
},
{
"epoch": 0.7654563297350343,
"grad_norm": 4.057793617248535,
"learning_rate": 4.709576138147567e-06,
"logits/chosen": -3.0879323482513428,
"logits/rejected": -3.1031229496002197,
"logps/chosen": -345.81207275390625,
"logps/rejected": -354.1678466796875,
"loss": 0.4916,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": 0.02626100741326809,
"rewards/margins": 1.1445354223251343,
"rewards/rejected": -1.118274450302124,
"step": 975
},
{
"epoch": 0.7693817468105987,
"grad_norm": 3.788588285446167,
"learning_rate": 4.631083202511774e-06,
"logits/chosen": -3.0559887886047363,
"logits/rejected": -2.9645016193389893,
"logps/chosen": -329.69366455078125,
"logps/rejected": -323.6266174316406,
"loss": 0.5673,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.006841002497822046,
"rewards/margins": 1.0048836469650269,
"rewards/rejected": -0.9980427026748657,
"step": 980
},
{
"epoch": 0.7733071638861629,
"grad_norm": 4.364587306976318,
"learning_rate": 4.5525902668759815e-06,
"logits/chosen": -2.923006296157837,
"logits/rejected": -3.013233184814453,
"logps/chosen": -316.62188720703125,
"logps/rejected": -303.6081848144531,
"loss": 0.5261,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -0.10443178564310074,
"rewards/margins": 1.0743087530136108,
"rewards/rejected": -1.1787405014038086,
"step": 985
},
{
"epoch": 0.7772325809617272,
"grad_norm": 5.203098773956299,
"learning_rate": 4.474097331240189e-06,
"logits/chosen": -3.0155367851257324,
"logits/rejected": -3.095081090927124,
"logps/chosen": -320.33074951171875,
"logps/rejected": -339.04791259765625,
"loss": 0.5602,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.04893790930509567,
"rewards/margins": 1.0242021083831787,
"rewards/rejected": -0.9752641916275024,
"step": 990
},
{
"epoch": 0.7811579980372915,
"grad_norm": 5.797107219696045,
"learning_rate": 4.395604395604396e-06,
"logits/chosen": -2.9568419456481934,
"logits/rejected": -3.061304807662964,
"logps/chosen": -323.64459228515625,
"logps/rejected": -306.8099060058594,
"loss": 0.4912,
"rewards/accuracies": 0.7458333969116211,
"rewards/chosen": -0.03198995441198349,
"rewards/margins": 1.1657757759094238,
"rewards/rejected": -1.1977657079696655,
"step": 995
},
{
"epoch": 0.7850834151128557,
"grad_norm": 3.8195958137512207,
"learning_rate": 4.317111459968603e-06,
"logits/chosen": -2.9896445274353027,
"logits/rejected": -2.980637550354004,
"logps/chosen": -349.469482421875,
"logps/rejected": -325.8355712890625,
"loss": 0.5145,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": 0.16838806867599487,
"rewards/margins": 1.0479358434677124,
"rewards/rejected": -0.8795478940010071,
"step": 1000
},
{
"epoch": 0.7850834151128557,
"eval_logits/chosen": -3.02506422996521,
"eval_logits/rejected": -3.0537216663360596,
"eval_logps/chosen": -328.9156188964844,
"eval_logps/rejected": -322.50213623046875,
"eval_loss": 0.48850810527801514,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": 0.0447828434407711,
"eval_rewards/margins": 1.0521847009658813,
"eval_rewards/rejected": -1.0074018239974976,
"eval_runtime": 170.8761,
"eval_samples_per_second": 11.704,
"eval_steps_per_second": 5.852,
"step": 1000
},
{
"epoch": 0.78900883218842,
"grad_norm": 3.5479438304901123,
"learning_rate": 4.2386185243328105e-06,
"logits/chosen": -2.979447841644287,
"logits/rejected": -3.07000994682312,
"logps/chosen": -329.8793640136719,
"logps/rejected": -326.6648864746094,
"loss": 0.5044,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.15325433015823364,
"rewards/margins": 1.0215200185775757,
"rewards/rejected": -0.8682657480239868,
"step": 1005
},
{
"epoch": 0.7929342492639843,
"grad_norm": 5.247315883636475,
"learning_rate": 4.160125588697018e-06,
"logits/chosen": -2.996269702911377,
"logits/rejected": -3.06060528755188,
"logps/chosen": -315.26666259765625,
"logps/rejected": -309.08868408203125,
"loss": 0.5036,
"rewards/accuracies": 0.7750000953674316,
"rewards/chosen": 0.1521468460559845,
"rewards/margins": 1.0418260097503662,
"rewards/rejected": -0.8896790742874146,
"step": 1010
},
{
"epoch": 0.7968596663395485,
"grad_norm": 4.7451324462890625,
"learning_rate": 4.081632653061225e-06,
"logits/chosen": -3.060560464859009,
"logits/rejected": -3.1156129837036133,
"logps/chosen": -307.20135498046875,
"logps/rejected": -289.0655212402344,
"loss": 0.5071,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -0.054080985486507416,
"rewards/margins": 1.019335150718689,
"rewards/rejected": -1.0734161138534546,
"step": 1015
},
{
"epoch": 0.8007850834151129,
"grad_norm": 4.9453325271606445,
"learning_rate": 4.003139717425432e-06,
"logits/chosen": -3.077300548553467,
"logits/rejected": -3.1319777965545654,
"logps/chosen": -320.95050048828125,
"logps/rejected": -289.74591064453125,
"loss": 0.5736,
"rewards/accuracies": 0.6958333849906921,
"rewards/chosen": 0.06706535816192627,
"rewards/margins": 0.8430485725402832,
"rewards/rejected": -0.7759832143783569,
"step": 1020
},
{
"epoch": 0.8047105004906772,
"grad_norm": 5.086551189422607,
"learning_rate": 3.9246467817896395e-06,
"logits/chosen": -2.9904251098632812,
"logits/rejected": -3.015242099761963,
"logps/chosen": -348.0818176269531,
"logps/rejected": -313.29937744140625,
"loss": 0.5274,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.15665733814239502,
"rewards/margins": 1.0903061628341675,
"rewards/rejected": -0.933648943901062,
"step": 1025
},
{
"epoch": 0.8086359175662414,
"grad_norm": 3.6982407569885254,
"learning_rate": 3.846153846153847e-06,
"logits/chosen": -2.8871021270751953,
"logits/rejected": -3.033609390258789,
"logps/chosen": -319.4637145996094,
"logps/rejected": -308.5705871582031,
"loss": 0.4841,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": 0.15460513532161713,
"rewards/margins": 1.0347096920013428,
"rewards/rejected": -0.8801045417785645,
"step": 1030
},
{
"epoch": 0.8125613346418057,
"grad_norm": 4.721177101135254,
"learning_rate": 3.767660910518054e-06,
"logits/chosen": -3.060762882232666,
"logits/rejected": -3.0055036544799805,
"logps/chosen": -320.2731628417969,
"logps/rejected": -321.6552734375,
"loss": 0.5038,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": 0.1908079981803894,
"rewards/margins": 0.9346101880073547,
"rewards/rejected": -0.7438021302223206,
"step": 1035
},
{
"epoch": 0.81648675171737,
"grad_norm": 3.8394100666046143,
"learning_rate": 3.6891679748822605e-06,
"logits/chosen": -3.0200486183166504,
"logits/rejected": -3.0659079551696777,
"logps/chosen": -309.3419494628906,
"logps/rejected": -303.4842834472656,
"loss": 0.4625,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.19585837423801422,
"rewards/margins": 1.1063092947006226,
"rewards/rejected": -0.9104509353637695,
"step": 1040
},
{
"epoch": 0.8204121687929342,
"grad_norm": 4.434594631195068,
"learning_rate": 3.6106750392464677e-06,
"logits/chosen": -3.118605136871338,
"logits/rejected": -3.0567848682403564,
"logps/chosen": -320.66400146484375,
"logps/rejected": -327.2590637207031,
"loss": 0.5165,
"rewards/accuracies": 0.7208333611488342,
"rewards/chosen": 0.15826813876628876,
"rewards/margins": 0.8408550024032593,
"rewards/rejected": -0.6825869083404541,
"step": 1045
},
{
"epoch": 0.8243375858684985,
"grad_norm": 5.180295467376709,
"learning_rate": 3.532182103610675e-06,
"logits/chosen": -3.04954195022583,
"logits/rejected": -3.0940709114074707,
"logps/chosen": -333.8227233886719,
"logps/rejected": -317.74530029296875,
"loss": 0.5001,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": 0.304600328207016,
"rewards/margins": 1.0906543731689453,
"rewards/rejected": -0.7860540151596069,
"step": 1050
},
{
"epoch": 0.8282630029440629,
"grad_norm": 3.821493148803711,
"learning_rate": 3.4536891679748822e-06,
"logits/chosen": -2.9625072479248047,
"logits/rejected": -3.038464069366455,
"logps/chosen": -317.5421142578125,
"logps/rejected": -318.5979309082031,
"loss": 0.4845,
"rewards/accuracies": 0.7541667222976685,
"rewards/chosen": 0.11151299625635147,
"rewards/margins": 1.0481318235397339,
"rewards/rejected": -0.9366186857223511,
"step": 1055
},
{
"epoch": 0.8321884200196271,
"grad_norm": 5.562465190887451,
"learning_rate": 3.3751962323390895e-06,
"logits/chosen": -3.0714111328125,
"logits/rejected": -3.0893895626068115,
"logps/chosen": -363.3538513183594,
"logps/rejected": -348.7355651855469,
"loss": 0.487,
"rewards/accuracies": 0.7458333969116211,
"rewards/chosen": 0.14830578863620758,
"rewards/margins": 1.0616153478622437,
"rewards/rejected": -0.9133096933364868,
"step": 1060
},
{
"epoch": 0.8361138370951914,
"grad_norm": 4.553463459014893,
"learning_rate": 3.2967032967032968e-06,
"logits/chosen": -2.939422845840454,
"logits/rejected": -2.907299518585205,
"logps/chosen": -330.71014404296875,
"logps/rejected": -317.1188049316406,
"loss": 0.5002,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": 0.015831544995307922,
"rewards/margins": 1.0325143337249756,
"rewards/rejected": -1.016682744026184,
"step": 1065
},
{
"epoch": 0.8400392541707556,
"grad_norm": 3.9285788536071777,
"learning_rate": 3.218210361067504e-06,
"logits/chosen": -2.965064287185669,
"logits/rejected": -3.0384631156921387,
"logps/chosen": -334.41192626953125,
"logps/rejected": -332.02899169921875,
"loss": 0.4383,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.18648472428321838,
"rewards/margins": 1.2396255731582642,
"rewards/rejected": -1.0531408786773682,
"step": 1070
},
{
"epoch": 0.8439646712463199,
"grad_norm": 3.849515676498413,
"learning_rate": 3.1397174254317113e-06,
"logits/chosen": -3.0192911624908447,
"logits/rejected": -3.064319610595703,
"logps/chosen": -291.82257080078125,
"logps/rejected": -276.48486328125,
"loss": 0.5327,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -0.07870586216449738,
"rewards/margins": 0.9807574152946472,
"rewards/rejected": -1.059463381767273,
"step": 1075
},
{
"epoch": 0.8478900883218842,
"grad_norm": 4.2341814041137695,
"learning_rate": 3.0612244897959185e-06,
"logits/chosen": -3.021660566329956,
"logits/rejected": -3.0418386459350586,
"logps/chosen": -313.7025451660156,
"logps/rejected": -316.4256591796875,
"loss": 0.5029,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": 0.049659062176942825,
"rewards/margins": 0.9659037590026855,
"rewards/rejected": -0.9162446856498718,
"step": 1080
},
{
"epoch": 0.8518155053974484,
"grad_norm": 3.874643564224243,
"learning_rate": 2.982731554160126e-06,
"logits/chosen": -3.0091512203216553,
"logits/rejected": -3.06247615814209,
"logps/chosen": -333.6044616699219,
"logps/rejected": -305.1171875,
"loss": 0.5009,
"rewards/accuracies": 0.7291666865348816,
"rewards/chosen": 0.03525074943900108,
"rewards/margins": 1.0619693994522095,
"rewards/rejected": -1.0267184972763062,
"step": 1085
},
{
"epoch": 0.8557409224730128,
"grad_norm": 4.802616119384766,
"learning_rate": 2.904238618524333e-06,
"logits/chosen": -3.0582773685455322,
"logits/rejected": -3.0825817584991455,
"logps/chosen": -356.7900695800781,
"logps/rejected": -343.6094665527344,
"loss": 0.4828,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": 0.11294318735599518,
"rewards/margins": 1.103849172592163,
"rewards/rejected": -0.9909059405326843,
"step": 1090
},
{
"epoch": 0.8596663395485771,
"grad_norm": 4.6176838874816895,
"learning_rate": 2.8257456828885403e-06,
"logits/chosen": -2.992724895477295,
"logits/rejected": -3.021177053451538,
"logps/chosen": -326.8610534667969,
"logps/rejected": -327.75445556640625,
"loss": 0.3941,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": 0.09250589460134506,
"rewards/margins": 1.3396714925765991,
"rewards/rejected": -1.247165560722351,
"step": 1095
},
{
"epoch": 0.8635917566241413,
"grad_norm": 4.353200912475586,
"learning_rate": 2.7472527472527476e-06,
"logits/chosen": -3.025132656097412,
"logits/rejected": -3.075371503829956,
"logps/chosen": -323.29315185546875,
"logps/rejected": -304.34423828125,
"loss": 0.456,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": 0.09489820152521133,
"rewards/margins": 1.147526741027832,
"rewards/rejected": -1.0526283979415894,
"step": 1100
},
{
"epoch": 0.8635917566241413,
"eval_logits/chosen": -3.0234742164611816,
"eval_logits/rejected": -3.051736354827881,
"eval_logps/chosen": -328.9091491699219,
"eval_logps/rejected": -322.8738098144531,
"eval_loss": 0.49041956663131714,
"eval_rewards/accuracies": 0.7534999847412109,
"eval_rewards/chosen": 0.04543456435203552,
"eval_rewards/margins": 1.0900031328201294,
"eval_rewards/rejected": -1.044568657875061,
"eval_runtime": 170.9675,
"eval_samples_per_second": 11.698,
"eval_steps_per_second": 5.849,
"step": 1100
},
{
"epoch": 0.8675171736997056,
"grad_norm": 3.9487624168395996,
"learning_rate": 2.668759811616955e-06,
"logits/chosen": -2.9902524948120117,
"logits/rejected": -3.033686876296997,
"logps/chosen": -304.41265869140625,
"logps/rejected": -300.32049560546875,
"loss": 0.4872,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": 0.16270777583122253,
"rewards/margins": 1.078561544418335,
"rewards/rejected": -0.9158536195755005,
"step": 1105
},
{
"epoch": 0.8714425907752699,
"grad_norm": 5.130923748016357,
"learning_rate": 2.5902668759811617e-06,
"logits/chosen": -2.932274580001831,
"logits/rejected": -2.995884656906128,
"logps/chosen": -318.7715148925781,
"logps/rejected": -321.5521240234375,
"loss": 0.5181,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -0.003421901259571314,
"rewards/margins": 1.0905206203460693,
"rewards/rejected": -1.0939425230026245,
"step": 1110
},
{
"epoch": 0.8753680078508341,
"grad_norm": 5.341976642608643,
"learning_rate": 2.511773940345369e-06,
"logits/chosen": -2.9918906688690186,
"logits/rejected": -3.070976734161377,
"logps/chosen": -328.22149658203125,
"logps/rejected": -295.317138671875,
"loss": 0.5221,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": 0.2609195113182068,
"rewards/margins": 1.2272056341171265,
"rewards/rejected": -0.9662860631942749,
"step": 1115
},
{
"epoch": 0.8792934249263984,
"grad_norm": 3.9076426029205322,
"learning_rate": 2.4332810047095766e-06,
"logits/chosen": -3.0119221210479736,
"logits/rejected": -3.0435667037963867,
"logps/chosen": -345.94036865234375,
"logps/rejected": -323.3532409667969,
"loss": 0.4533,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": 0.23655609786510468,
"rewards/margins": 1.205311894416809,
"rewards/rejected": -0.9687557220458984,
"step": 1120
},
{
"epoch": 0.8832188420019627,
"grad_norm": 4.00649356842041,
"learning_rate": 2.3547880690737835e-06,
"logits/chosen": -3.0578651428222656,
"logits/rejected": -3.1050515174865723,
"logps/chosen": -293.8743896484375,
"logps/rejected": -334.2582702636719,
"loss": 0.5158,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": 0.27813708782196045,
"rewards/margins": 1.1314489841461182,
"rewards/rejected": -0.8533117175102234,
"step": 1125
},
{
"epoch": 0.887144259077527,
"grad_norm": 5.170398712158203,
"learning_rate": 2.2762951334379907e-06,
"logits/chosen": -2.9842798709869385,
"logits/rejected": -3.0326454639434814,
"logps/chosen": -319.64111328125,
"logps/rejected": -334.63641357421875,
"loss": 0.5767,
"rewards/accuracies": 0.6958334445953369,
"rewards/chosen": 0.04911806434392929,
"rewards/margins": 1.029329538345337,
"rewards/rejected": -0.9802114367485046,
"step": 1130
},
{
"epoch": 0.8910696761530913,
"grad_norm": 4.750176906585693,
"learning_rate": 2.197802197802198e-06,
"logits/chosen": -3.024641275405884,
"logits/rejected": -3.0461010932922363,
"logps/chosen": -344.88226318359375,
"logps/rejected": -349.8857727050781,
"loss": 0.4812,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": 0.08117427676916122,
"rewards/margins": 1.2805150747299194,
"rewards/rejected": -1.1993409395217896,
"step": 1135
},
{
"epoch": 0.8949950932286556,
"grad_norm": 4.3012471199035645,
"learning_rate": 2.1193092621664052e-06,
"logits/chosen": -2.9693052768707275,
"logits/rejected": -3.012446165084839,
"logps/chosen": -332.40740966796875,
"logps/rejected": -347.2129211425781,
"loss": 0.4836,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": 0.11985665559768677,
"rewards/margins": 1.132505178451538,
"rewards/rejected": -1.012648582458496,
"step": 1140
},
{
"epoch": 0.8989205103042198,
"grad_norm": 4.1170196533203125,
"learning_rate": 2.0408163265306125e-06,
"logits/chosen": -2.9982786178588867,
"logits/rejected": -2.985097646713257,
"logps/chosen": -325.58453369140625,
"logps/rejected": -318.0592346191406,
"loss": 0.4191,
"rewards/accuracies": 0.8125001192092896,
"rewards/chosen": 0.2217942774295807,
"rewards/margins": 1.308205485343933,
"rewards/rejected": -1.0864112377166748,
"step": 1145
},
{
"epoch": 0.9028459273797841,
"grad_norm": 6.188891887664795,
"learning_rate": 1.9623233908948198e-06,
"logits/chosen": -2.999929189682007,
"logits/rejected": -3.010659694671631,
"logps/chosen": -335.7912902832031,
"logps/rejected": -308.9612731933594,
"loss": 0.4935,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.025047356262803078,
"rewards/margins": 1.0598541498184204,
"rewards/rejected": -1.0849015712738037,
"step": 1150
},
{
"epoch": 0.9067713444553483,
"grad_norm": 4.374786376953125,
"learning_rate": 1.883830455259027e-06,
"logits/chosen": -2.9940290451049805,
"logits/rejected": -3.0808780193328857,
"logps/chosen": -332.26287841796875,
"logps/rejected": -301.36590576171875,
"loss": 0.5517,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": 0.07976453751325607,
"rewards/margins": 0.9496763348579407,
"rewards/rejected": -0.8699118494987488,
"step": 1155
},
{
"epoch": 0.9106967615309126,
"grad_norm": 5.303534030914307,
"learning_rate": 1.8053375196232339e-06,
"logits/chosen": -2.9458823204040527,
"logits/rejected": -3.05169939994812,
"logps/chosen": -350.2687072753906,
"logps/rejected": -344.89892578125,
"loss": 0.529,
"rewards/accuracies": 0.7041667103767395,
"rewards/chosen": -0.006747332401573658,
"rewards/margins": 1.0622098445892334,
"rewards/rejected": -1.0689570903778076,
"step": 1160
},
{
"epoch": 0.914622178606477,
"grad_norm": 4.187100887298584,
"learning_rate": 1.7268445839874411e-06,
"logits/chosen": -3.0305941104888916,
"logits/rejected": -3.0799167156219482,
"logps/chosen": -311.58660888671875,
"logps/rejected": -297.4324035644531,
"loss": 0.5227,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": 0.021794170141220093,
"rewards/margins": 0.9995294809341431,
"rewards/rejected": -0.977735161781311,
"step": 1165
},
{
"epoch": 0.9185475956820413,
"grad_norm": 3.8420519828796387,
"learning_rate": 1.6483516483516484e-06,
"logits/chosen": -3.073319911956787,
"logits/rejected": -3.1020355224609375,
"logps/chosen": -323.134521484375,
"logps/rejected": -314.55340576171875,
"loss": 0.453,
"rewards/accuracies": 0.7666667699813843,
"rewards/chosen": 0.005049190018326044,
"rewards/margins": 1.1697251796722412,
"rewards/rejected": -1.1646759510040283,
"step": 1170
},
{
"epoch": 0.9224730127576055,
"grad_norm": 4.952281951904297,
"learning_rate": 1.5698587127158556e-06,
"logits/chosen": -2.9573421478271484,
"logits/rejected": -3.016396999359131,
"logps/chosen": -336.82427978515625,
"logps/rejected": -305.33697509765625,
"loss": 0.4634,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": 0.049565743654966354,
"rewards/margins": 1.13016676902771,
"rewards/rejected": -1.0806009769439697,
"step": 1175
},
{
"epoch": 0.9263984298331698,
"grad_norm": 4.881412029266357,
"learning_rate": 1.491365777080063e-06,
"logits/chosen": -3.013920783996582,
"logits/rejected": -3.0628743171691895,
"logps/chosen": -325.85760498046875,
"logps/rejected": -325.6579284667969,
"loss": 0.5157,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": -0.006006541661918163,
"rewards/margins": 1.0770210027694702,
"rewards/rejected": -1.0830276012420654,
"step": 1180
},
{
"epoch": 0.930323846908734,
"grad_norm": 4.166913986206055,
"learning_rate": 1.4128728414442702e-06,
"logits/chosen": -3.0066945552825928,
"logits/rejected": -3.021183490753174,
"logps/chosen": -298.1296081542969,
"logps/rejected": -306.4847106933594,
"loss": 0.4665,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.030468011274933815,
"rewards/margins": 1.1519229412078857,
"rewards/rejected": -1.1214549541473389,
"step": 1185
},
{
"epoch": 0.9342492639842983,
"grad_norm": 5.559418678283691,
"learning_rate": 1.3343799058084774e-06,
"logits/chosen": -3.032975912094116,
"logits/rejected": -3.018131732940674,
"logps/chosen": -299.36419677734375,
"logps/rejected": -299.8133239746094,
"loss": 0.5393,
"rewards/accuracies": 0.6875000596046448,
"rewards/chosen": 0.018437325954437256,
"rewards/margins": 0.9440910220146179,
"rewards/rejected": -0.9256537556648254,
"step": 1190
},
{
"epoch": 0.9381746810598626,
"grad_norm": 5.020077228546143,
"learning_rate": 1.2558869701726845e-06,
"logits/chosen": -2.986281156539917,
"logits/rejected": -3.06579852104187,
"logps/chosen": -341.192626953125,
"logps/rejected": -319.23809814453125,
"loss": 0.5241,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.07408008724451065,
"rewards/margins": 1.0439367294311523,
"rewards/rejected": -1.1180168390274048,
"step": 1195
},
{
"epoch": 0.9421000981354269,
"grad_norm": 4.814427375793457,
"learning_rate": 1.1773940345368917e-06,
"logits/chosen": -3.0177111625671387,
"logits/rejected": -3.044379711151123,
"logps/chosen": -326.84661865234375,
"logps/rejected": -306.0335388183594,
"loss": 0.4989,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0778479278087616,
"rewards/margins": 1.1605087518692017,
"rewards/rejected": -1.2383568286895752,
"step": 1200
},
{
"epoch": 0.9421000981354269,
"eval_logits/chosen": -3.0254786014556885,
"eval_logits/rejected": -3.0536904335021973,
"eval_logps/chosen": -329.79644775390625,
"eval_logps/rejected": -323.7355651855469,
"eval_loss": 0.4862891137599945,
"eval_rewards/accuracies": 0.7605000138282776,
"eval_rewards/chosen": -0.043296121060848236,
"eval_rewards/margins": 1.0874476432800293,
"eval_rewards/rejected": -1.1307436227798462,
"eval_runtime": 170.5812,
"eval_samples_per_second": 11.725,
"eval_steps_per_second": 5.862,
"step": 1200
},
{
"epoch": 0.9460255152109912,
"grad_norm": 4.624739646911621,
"learning_rate": 1.098901098901099e-06,
"logits/chosen": -3.055946111679077,
"logits/rejected": -3.1179323196411133,
"logps/chosen": -363.6905822753906,
"logps/rejected": -346.05767822265625,
"loss": 0.4542,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -0.008908344432711601,
"rewards/margins": 1.0937221050262451,
"rewards/rejected": -1.102630376815796,
"step": 1205
},
{
"epoch": 0.9499509322865555,
"grad_norm": 3.9496641159057617,
"learning_rate": 1.0204081632653063e-06,
"logits/chosen": -2.9469313621520996,
"logits/rejected": -3.045012950897217,
"logps/chosen": -328.9994201660156,
"logps/rejected": -339.7306213378906,
"loss": 0.4544,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.06163526326417923,
"rewards/margins": 1.2660752534866333,
"rewards/rejected": -1.327710509300232,
"step": 1210
},
{
"epoch": 0.9538763493621197,
"grad_norm": 4.550204753875732,
"learning_rate": 9.419152276295135e-07,
"logits/chosen": -3.0455386638641357,
"logits/rejected": -3.007603168487549,
"logps/chosen": -321.6459655761719,
"logps/rejected": -317.13592529296875,
"loss": 0.518,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.07271427661180496,
"rewards/margins": 1.1276568174362183,
"rewards/rejected": -1.2003710269927979,
"step": 1215
},
{
"epoch": 0.957801766437684,
"grad_norm": 3.7910659313201904,
"learning_rate": 8.634222919937206e-07,
"logits/chosen": -2.887434720993042,
"logits/rejected": -3.0296366214752197,
"logps/chosen": -328.447021484375,
"logps/rejected": -312.25146484375,
"loss": 0.5169,
"rewards/accuracies": 0.7375000715255737,
"rewards/chosen": -0.016386663541197777,
"rewards/margins": 1.084718942642212,
"rewards/rejected": -1.1011055707931519,
"step": 1220
},
{
"epoch": 0.9617271835132483,
"grad_norm": 4.135540008544922,
"learning_rate": 7.849293563579278e-07,
"logits/chosen": -3.051018714904785,
"logits/rejected": -3.042524814605713,
"logps/chosen": -303.7222900390625,
"logps/rejected": -311.6209411621094,
"loss": 0.4563,
"rewards/accuracies": 0.7625001072883606,
"rewards/chosen": 0.08631271123886108,
"rewards/margins": 1.19536554813385,
"rewards/rejected": -1.1090528964996338,
"step": 1225
},
{
"epoch": 0.9656526005888125,
"grad_norm": 4.80025577545166,
"learning_rate": 7.064364207221351e-07,
"logits/chosen": -3.0100882053375244,
"logits/rejected": -3.033210039138794,
"logps/chosen": -297.29388427734375,
"logps/rejected": -312.75390625,
"loss": 0.5043,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": -0.07621364295482635,
"rewards/margins": 0.9946663975715637,
"rewards/rejected": -1.0708800554275513,
"step": 1230
},
{
"epoch": 0.9695780176643768,
"grad_norm": 4.171872138977051,
"learning_rate": 6.279434850863422e-07,
"logits/chosen": -2.984192371368408,
"logits/rejected": -2.9866414070129395,
"logps/chosen": -315.4156799316406,
"logps/rejected": -311.60174560546875,
"loss": 0.5081,
"rewards/accuracies": 0.7291667461395264,
"rewards/chosen": -0.02722536399960518,
"rewards/margins": 1.0488277673721313,
"rewards/rejected": -1.0760531425476074,
"step": 1235
},
{
"epoch": 0.9735034347399412,
"grad_norm": 4.678730487823486,
"learning_rate": 5.494505494505495e-07,
"logits/chosen": -3.045713424682617,
"logits/rejected": -3.0048978328704834,
"logps/chosen": -355.97918701171875,
"logps/rejected": -362.16143798828125,
"loss": 0.4872,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.0551709420979023,
"rewards/margins": 1.1289502382278442,
"rewards/rejected": -1.184121012687683,
"step": 1240
},
{
"epoch": 0.9774288518155054,
"grad_norm": 5.555654048919678,
"learning_rate": 4.7095761381475676e-07,
"logits/chosen": -3.045968532562256,
"logits/rejected": -3.028806686401367,
"logps/chosen": -362.6111755371094,
"logps/rejected": -342.56658935546875,
"loss": 0.4904,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.12567153573036194,
"rewards/margins": 1.1269028186798096,
"rewards/rejected": -1.2525743246078491,
"step": 1245
},
{
"epoch": 0.9813542688910697,
"grad_norm": 4.789409637451172,
"learning_rate": 3.924646781789639e-07,
"logits/chosen": -2.9453186988830566,
"logits/rejected": -3.032778739929199,
"logps/chosen": -335.04803466796875,
"logps/rejected": -338.70782470703125,
"loss": 0.5291,
"rewards/accuracies": 0.7208333611488342,
"rewards/chosen": -0.0959320068359375,
"rewards/margins": 1.0104596614837646,
"rewards/rejected": -1.1063916683197021,
"step": 1250
},
{
"epoch": 0.985279685966634,
"grad_norm": 4.851970672607422,
"learning_rate": 3.139717425431711e-07,
"logits/chosen": -3.0085787773132324,
"logits/rejected": -3.033092975616455,
"logps/chosen": -294.70330810546875,
"logps/rejected": -298.2593994140625,
"loss": 0.4952,
"rewards/accuracies": 0.720833420753479,
"rewards/chosen": -0.07039856910705566,
"rewards/margins": 1.0396531820297241,
"rewards/rejected": -1.1100517511367798,
"step": 1255
},
{
"epoch": 0.9892051030421982,
"grad_norm": 4.407826900482178,
"learning_rate": 2.3547880690737838e-07,
"logits/chosen": -2.992248296737671,
"logits/rejected": -3.090275287628174,
"logps/chosen": -361.5509338378906,
"logps/rejected": -348.03094482421875,
"loss": 0.4522,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -0.02963084913790226,
"rewards/margins": 1.1609312295913696,
"rewards/rejected": -1.1905620098114014,
"step": 1260
},
{
"epoch": 0.9931305201177625,
"grad_norm": 5.248498916625977,
"learning_rate": 1.5698587127158556e-07,
"logits/chosen": -3.013667345046997,
"logits/rejected": -2.995178461074829,
"logps/chosen": -306.033447265625,
"logps/rejected": -311.38470458984375,
"loss": 0.5122,
"rewards/accuracies": 0.7666666507720947,
"rewards/chosen": -0.04827199503779411,
"rewards/margins": 1.0230185985565186,
"rewards/rejected": -1.0712906122207642,
"step": 1265
},
{
"epoch": 0.9970559371933267,
"grad_norm": 4.212546348571777,
"learning_rate": 7.849293563579278e-08,
"logits/chosen": -3.0183610916137695,
"logits/rejected": -3.0684821605682373,
"logps/chosen": -313.4974060058594,
"logps/rejected": -312.05755615234375,
"loss": 0.5292,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.12905262410640717,
"rewards/margins": 0.9748676419258118,
"rewards/rejected": -1.1039202213287354,
"step": 1270
},
{
"epoch": 1.0,
"step": 1274,
"total_flos": 0.0,
"train_loss": 0.5089355802610868,
"train_runtime": 12172.1578,
"train_samples_per_second": 5.023,
"train_steps_per_second": 0.105
}
],
"logging_steps": 5,
"max_steps": 1274,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}