{ "best_global_step": 1200, "best_metric": 0.4862891137599945, "best_model_checkpoint": "runs/dpo-OpenHermes-2.5-Mistral-7B-20251120-1236/checkpoints/checkpoint-1200", "epoch": 1.0, "eval_steps": 100, "global_step": 1274, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003925417075564278, "grad_norm": 6.150045871734619, "learning_rate": 1.9937205651491366e-05, "logits/chosen": -3.0840773582458496, "logits/rejected": -3.0958099365234375, "logps/chosen": -311.7703857421875, "logps/rejected": -290.61724853515625, "loss": 0.6805, "rewards/accuracies": 0.4833333492279053, "rewards/chosen": 0.13000372052192688, "rewards/margins": 0.03325975313782692, "rewards/rejected": 0.09674396365880966, "step": 5 }, { "epoch": 0.007850834151128557, "grad_norm": 10.357205390930176, "learning_rate": 1.9858712715855573e-05, "logits/chosen": -3.0053043365478516, "logits/rejected": -3.0651307106018066, "logps/chosen": -339.4639892578125, "logps/rejected": -330.821533203125, "loss": 0.6648, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5165472626686096, "rewards/margins": 0.10410015285015106, "rewards/rejected": 0.412447065114975, "step": 10 }, { "epoch": 0.011776251226692836, "grad_norm": 5.737811088562012, "learning_rate": 1.9780219780219784e-05, "logits/chosen": -3.049772262573242, "logits/rejected": -2.9993741512298584, "logps/chosen": -339.0422668457031, "logps/rejected": -313.1036071777344, "loss": 0.6335, "rewards/accuracies": 0.6458333730697632, "rewards/chosen": 0.5449298620223999, "rewards/margins": 0.23259714245796204, "rewards/rejected": 0.31233277916908264, "step": 15 }, { "epoch": 0.015701668302257114, "grad_norm": 5.574727535247803, "learning_rate": 1.9701726844583988e-05, "logits/chosen": -3.0041749477386475, "logits/rejected": -2.9779772758483887, "logps/chosen": -337.6470642089844, "logps/rejected": -344.86907958984375, "loss": 0.5883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6041615605354309, "rewards/margins": 0.3823166787624359, "rewards/rejected": 0.221844881772995, "step": 20 }, { "epoch": 0.019627085377821395, "grad_norm": 5.843604564666748, "learning_rate": 1.9623233908948195e-05, "logits/chosen": -2.9967105388641357, "logits/rejected": -3.058979034423828, "logps/chosen": -295.9526062011719, "logps/rejected": -288.1315002441406, "loss": 0.5635, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": 0.8546509742736816, "rewards/margins": 0.5589786767959595, "rewards/rejected": 0.2956722378730774, "step": 25 }, { "epoch": 0.023552502453385672, "grad_norm": 5.119436264038086, "learning_rate": 1.9544740973312402e-05, "logits/chosen": -3.0148398876190186, "logits/rejected": -2.98360013961792, "logps/chosen": -310.6913146972656, "logps/rejected": -300.73724365234375, "loss": 0.5576, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.528806209564209, "rewards/margins": 0.6290292739868164, "rewards/rejected": -0.10022306442260742, "step": 30 }, { "epoch": 0.02747791952894995, "grad_norm": 6.46522855758667, "learning_rate": 1.9466248037676613e-05, "logits/chosen": -2.9573066234588623, "logits/rejected": -3.0088870525360107, "logps/chosen": -308.3896789550781, "logps/rejected": -297.2813415527344, "loss": 0.5858, "rewards/accuracies": 0.6541667580604553, "rewards/chosen": 0.3321291506290436, "rewards/margins": 0.6073407530784607, "rewards/rejected": -0.2752116024494171, "step": 35 }, { "epoch": 0.03140333660451423, "grad_norm": 5.806535720825195, "learning_rate": 1.9387755102040817e-05, "logits/chosen": -2.980921745300293, "logits/rejected": -3.0631861686706543, "logps/chosen": -328.13287353515625, "logps/rejected": -307.7826232910156, "loss": 0.6097, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26903384923934937, "rewards/margins": 0.5272954106330872, "rewards/rejected": -0.2582615911960602, "step": 40 }, { "epoch": 0.03532875368007851, "grad_norm": 5.124495506286621, "learning_rate": 1.9309262166405024e-05, "logits/chosen": -3.0031769275665283, "logits/rejected": -3.003542900085449, "logps/chosen": -307.12860107421875, "logps/rejected": -323.902099609375, "loss": 0.5483, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -0.10443999618291855, "rewards/margins": 0.6921336650848389, "rewards/rejected": -0.7965737581253052, "step": 45 }, { "epoch": 0.03925417075564279, "grad_norm": 5.090153217315674, "learning_rate": 1.923076923076923e-05, "logits/chosen": -3.009328842163086, "logits/rejected": -3.043778657913208, "logps/chosen": -316.96917724609375, "logps/rejected": -311.90960693359375, "loss": 0.5658, "rewards/accuracies": 0.7041667103767395, "rewards/chosen": -0.5328065752983093, "rewards/margins": 0.6153702139854431, "rewards/rejected": -1.148176670074463, "step": 50 }, { "epoch": 0.04317958783120707, "grad_norm": 5.527870178222656, "learning_rate": 1.9152276295133442e-05, "logits/chosen": -3.0257294178009033, "logits/rejected": -3.0307681560516357, "logps/chosen": -312.945068359375, "logps/rejected": -302.48126220703125, "loss": 0.5258, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -0.4231169819831848, "rewards/margins": 0.7520371079444885, "rewards/rejected": -1.1751540899276733, "step": 55 }, { "epoch": 0.047105004906771344, "grad_norm": 5.95127010345459, "learning_rate": 1.9073783359497646e-05, "logits/chosen": -2.9485747814178467, "logits/rejected": -3.040693998336792, "logps/chosen": -323.712158203125, "logps/rejected": -296.09710693359375, "loss": 0.5437, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.07572325319051743, "rewards/margins": 0.7711307406425476, "rewards/rejected": -0.846854031085968, "step": 60 }, { "epoch": 0.05103042198233562, "grad_norm": 5.20306396484375, "learning_rate": 1.8995290423861853e-05, "logits/chosen": -2.932573080062866, "logits/rejected": -2.9804799556732178, "logps/chosen": -320.690185546875, "logps/rejected": -315.3598327636719, "loss": 0.5424, "rewards/accuracies": 0.720833420753479, "rewards/chosen": 0.1321389377117157, "rewards/margins": 0.7304352521896362, "rewards/rejected": -0.5982962846755981, "step": 65 }, { "epoch": 0.0549558390578999, "grad_norm": 6.864138603210449, "learning_rate": 1.891679748822606e-05, "logits/chosen": -2.9804160594940186, "logits/rejected": -3.058073043823242, "logps/chosen": -302.26055908203125, "logps/rejected": -324.0291442871094, "loss": 0.5701, "rewards/accuracies": 0.6833333969116211, "rewards/chosen": 0.36459389328956604, "rewards/margins": 0.7426995038986206, "rewards/rejected": -0.37810567021369934, "step": 70 }, { "epoch": 0.058881256133464184, "grad_norm": 4.591891765594482, "learning_rate": 1.8838304552590268e-05, "logits/chosen": -2.9886913299560547, "logits/rejected": -2.987067699432373, "logps/chosen": -314.6214904785156, "logps/rejected": -298.33953857421875, "loss": 0.5124, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.3623279631137848, "rewards/margins": 0.8299944996833801, "rewards/rejected": -0.4676665663719177, "step": 75 }, { "epoch": 0.06280667320902845, "grad_norm": 4.877047538757324, "learning_rate": 1.8759811616954475e-05, "logits/chosen": -2.989983081817627, "logits/rejected": -3.0554869174957275, "logps/chosen": -312.76019287109375, "logps/rejected": -337.36920166015625, "loss": 0.5185, "rewards/accuracies": 0.720833420753479, "rewards/chosen": 0.21076758205890656, "rewards/margins": 0.8532983064651489, "rewards/rejected": -0.6425307393074036, "step": 80 }, { "epoch": 0.06673209028459273, "grad_norm": 6.023036003112793, "learning_rate": 1.8681318681318682e-05, "logits/chosen": -2.9612174034118652, "logits/rejected": -3.071810722351074, "logps/chosen": -326.7922058105469, "logps/rejected": -314.8778076171875, "loss": 0.5549, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": 0.18136966228485107, "rewards/margins": 0.7432295083999634, "rewards/rejected": -0.5618598461151123, "step": 85 }, { "epoch": 0.07065750736015702, "grad_norm": 6.082668781280518, "learning_rate": 1.860282574568289e-05, "logits/chosen": -2.832726001739502, "logits/rejected": -2.891904830932617, "logps/chosen": -330.4433898925781, "logps/rejected": -299.0811462402344, "loss": 0.5603, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": 0.12478373199701309, "rewards/margins": 0.821280300617218, "rewards/rejected": -0.6964964866638184, "step": 90 }, { "epoch": 0.0745829244357213, "grad_norm": 5.67427396774292, "learning_rate": 1.8524332810047097e-05, "logits/chosen": -2.997680902481079, "logits/rejected": -3.0600008964538574, "logps/chosen": -325.9932861328125, "logps/rejected": -297.598388671875, "loss": 0.5578, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 0.11321593821048737, "rewards/margins": 0.7891088724136353, "rewards/rejected": -0.6758929491043091, "step": 95 }, { "epoch": 0.07850834151128558, "grad_norm": 5.028480052947998, "learning_rate": 1.8445839874411304e-05, "logits/chosen": -2.9710261821746826, "logits/rejected": -2.934241771697998, "logps/chosen": -332.2840881347656, "logps/rejected": -323.45794677734375, "loss": 0.5515, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -0.20720729231834412, "rewards/margins": 0.7714598774909973, "rewards/rejected": -0.9786672592163086, "step": 100 }, { "epoch": 0.07850834151128558, "eval_logits/chosen": -2.980896234512329, "eval_logits/rejected": -3.010272741317749, "eval_logps/chosen": -332.7633972167969, "eval_logps/rejected": -323.36285400390625, "eval_loss": 0.5452204346656799, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": -0.3399922549724579, "eval_rewards/margins": 0.7534830570220947, "eval_rewards/rejected": -1.0934752225875854, "eval_runtime": 171.9076, "eval_samples_per_second": 11.634, "eval_steps_per_second": 5.817, "step": 100 }, { "epoch": 0.08243375858684986, "grad_norm": 4.673857688903809, "learning_rate": 1.836734693877551e-05, "logits/chosen": -2.955043077468872, "logits/rejected": -3.010183811187744, "logps/chosen": -360.6026611328125, "logps/rejected": -354.76043701171875, "loss": 0.538, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.5105575919151306, "rewards/margins": 0.7706912755966187, "rewards/rejected": -1.2812488079071045, "step": 105 }, { "epoch": 0.08635917566241413, "grad_norm": 6.655648231506348, "learning_rate": 1.828885400313972e-05, "logits/chosen": -2.9778573513031006, "logits/rejected": -2.9207446575164795, "logps/chosen": -323.5428161621094, "logps/rejected": -342.148681640625, "loss": 0.5713, "rewards/accuracies": 0.6958334445953369, "rewards/chosen": -0.6803138852119446, "rewards/margins": 0.697836697101593, "rewards/rejected": -1.3781505823135376, "step": 110 }, { "epoch": 0.09028459273797841, "grad_norm": 5.369375228881836, "learning_rate": 1.8210361067503926e-05, "logits/chosen": -2.9552626609802246, "logits/rejected": -3.075476884841919, "logps/chosen": -348.5240173339844, "logps/rejected": -307.6527404785156, "loss": 0.5155, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -0.3947049677371979, "rewards/margins": 0.8081506490707397, "rewards/rejected": -1.2028557062149048, "step": 115 }, { "epoch": 0.09421000981354269, "grad_norm": 4.998170852661133, "learning_rate": 1.8131868131868133e-05, "logits/chosen": -2.8421874046325684, "logits/rejected": -2.9301371574401855, "logps/chosen": -320.47760009765625, "logps/rejected": -307.7643737792969, "loss": 0.499, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.32407405972480774, "rewards/margins": 0.8497726321220398, "rewards/rejected": -1.1738468408584595, "step": 120 }, { "epoch": 0.09813542688910697, "grad_norm": 6.9904890060424805, "learning_rate": 1.805337519623234e-05, "logits/chosen": -2.96341872215271, "logits/rejected": -2.9047999382019043, "logps/chosen": -350.89495849609375, "logps/rejected": -367.870361328125, "loss": 0.5847, "rewards/accuracies": 0.6958333849906921, "rewards/chosen": -0.3227779269218445, "rewards/margins": 0.7198423147201538, "rewards/rejected": -1.0426201820373535, "step": 125 }, { "epoch": 0.10206084396467124, "grad_norm": 5.14391565322876, "learning_rate": 1.7974882260596548e-05, "logits/chosen": -3.0105910301208496, "logits/rejected": -3.0620298385620117, "logps/chosen": -341.6379089355469, "logps/rejected": -306.1999816894531, "loss": 0.5432, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": -0.4734025001525879, "rewards/margins": 0.7205262184143066, "rewards/rejected": -1.1939287185668945, "step": 130 }, { "epoch": 0.10598626104023552, "grad_norm": 5.304475784301758, "learning_rate": 1.7896389324960755e-05, "logits/chosen": -2.86970853805542, "logits/rejected": -2.8937735557556152, "logps/chosen": -313.49835205078125, "logps/rejected": -306.93585205078125, "loss": 0.5462, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.5461211800575256, "rewards/margins": 0.748654305934906, "rewards/rejected": -1.294775366783142, "step": 135 }, { "epoch": 0.1099116781157998, "grad_norm": 4.115116119384766, "learning_rate": 1.7817896389324962e-05, "logits/chosen": -3.0203137397766113, "logits/rejected": -3.0821175575256348, "logps/chosen": -354.2901306152344, "logps/rejected": -316.67205810546875, "loss": 0.5105, "rewards/accuracies": 0.720833420753479, "rewards/chosen": -0.45508939027786255, "rewards/margins": 0.8571721911430359, "rewards/rejected": -1.3122615814208984, "step": 140 }, { "epoch": 0.11383709519136408, "grad_norm": 5.043489456176758, "learning_rate": 1.773940345368917e-05, "logits/chosen": -2.8955276012420654, "logits/rejected": -2.957524061203003, "logps/chosen": -318.019775390625, "logps/rejected": -321.0815124511719, "loss": 0.5172, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": -0.375745952129364, "rewards/margins": 0.8528572916984558, "rewards/rejected": -1.2286031246185303, "step": 145 }, { "epoch": 0.11776251226692837, "grad_norm": 6.230247497558594, "learning_rate": 1.7660910518053377e-05, "logits/chosen": -2.986295700073242, "logits/rejected": -3.0270934104919434, "logps/chosen": -348.29058837890625, "logps/rejected": -330.9169006347656, "loss": 0.5398, "rewards/accuracies": 0.720833420753479, "rewards/chosen": -0.6200595498085022, "rewards/margins": 0.8251369595527649, "rewards/rejected": -1.4451963901519775, "step": 150 }, { "epoch": 0.12168792934249265, "grad_norm": 5.081642150878906, "learning_rate": 1.7582417582417584e-05, "logits/chosen": -2.935692548751831, "logits/rejected": -3.0154006481170654, "logps/chosen": -348.2768249511719, "logps/rejected": -316.89361572265625, "loss": 0.556, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.7520820498466492, "rewards/margins": 0.8174117207527161, "rewards/rejected": -1.5694936513900757, "step": 155 }, { "epoch": 0.1256133464180569, "grad_norm": 5.332559585571289, "learning_rate": 1.750392464678179e-05, "logits/chosen": -2.853811740875244, "logits/rejected": -2.8885927200317383, "logps/chosen": -339.7810363769531, "logps/rejected": -333.8861999511719, "loss": 0.5375, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -0.8646346926689148, "rewards/margins": 0.8464757800102234, "rewards/rejected": -1.7111107110977173, "step": 160 }, { "epoch": 0.1295387634936212, "grad_norm": 5.659219741821289, "learning_rate": 1.7425431711146e-05, "logits/chosen": -2.9798367023468018, "logits/rejected": -3.0234591960906982, "logps/chosen": -369.3193054199219, "logps/rejected": -320.97088623046875, "loss": 0.5365, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.5549585223197937, "rewards/margins": 0.8239587545394897, "rewards/rejected": -1.3789172172546387, "step": 165 }, { "epoch": 0.13346418056918546, "grad_norm": 6.147792339324951, "learning_rate": 1.7346938775510206e-05, "logits/chosen": -2.9105169773101807, "logits/rejected": -2.9818620681762695, "logps/chosen": -348.80194091796875, "logps/rejected": -312.60272216796875, "loss": 0.5357, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.12175627052783966, "rewards/margins": 0.814881443977356, "rewards/rejected": -0.9366377592086792, "step": 170 }, { "epoch": 0.13738959764474976, "grad_norm": 5.922034740447998, "learning_rate": 1.7268445839874413e-05, "logits/chosen": -2.961211919784546, "logits/rejected": -3.006511688232422, "logps/chosen": -299.61260986328125, "logps/rejected": -318.53961181640625, "loss": 0.5611, "rewards/accuracies": 0.6875, "rewards/chosen": 0.25987696647644043, "rewards/margins": 0.7301396727561951, "rewards/rejected": -0.47026267647743225, "step": 175 }, { "epoch": 0.14131501472031405, "grad_norm": 5.806030750274658, "learning_rate": 1.718995290423862e-05, "logits/chosen": -2.8726003170013428, "logits/rejected": -2.9730982780456543, "logps/chosen": -335.31817626953125, "logps/rejected": -330.07611083984375, "loss": 0.4908, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5076613426208496, "rewards/margins": 0.9330441355705261, "rewards/rejected": -0.4253828525543213, "step": 180 }, { "epoch": 0.1452404317958783, "grad_norm": 5.604033470153809, "learning_rate": 1.7111459968602827e-05, "logits/chosen": -3.0221714973449707, "logits/rejected": -3.0379929542541504, "logps/chosen": -299.20281982421875, "logps/rejected": -292.1725158691406, "loss": 0.5898, "rewards/accuracies": 0.6750000715255737, "rewards/chosen": 0.12294058501720428, "rewards/margins": 0.6319655179977417, "rewards/rejected": -0.5090248584747314, "step": 185 }, { "epoch": 0.1491658488714426, "grad_norm": 5.10993766784668, "learning_rate": 1.7032967032967035e-05, "logits/chosen": -2.916008949279785, "logits/rejected": -2.984748601913452, "logps/chosen": -313.1849365234375, "logps/rejected": -306.8742370605469, "loss": 0.5154, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03891729563474655, "rewards/margins": 0.8485898971557617, "rewards/rejected": -0.8096725344657898, "step": 190 }, { "epoch": 0.15309126594700687, "grad_norm": 4.921535015106201, "learning_rate": 1.6954474097331242e-05, "logits/chosen": -3.0675573348999023, "logits/rejected": -2.9610018730163574, "logps/chosen": -333.9068908691406, "logps/rejected": -316.9395446777344, "loss": 0.524, "rewards/accuracies": 0.7250000834465027, "rewards/chosen": -0.31782767176628113, "rewards/margins": 0.7863305807113647, "rewards/rejected": -1.1041581630706787, "step": 195 }, { "epoch": 0.15701668302257116, "grad_norm": 5.64870548248291, "learning_rate": 1.687598116169545e-05, "logits/chosen": -2.9307010173797607, "logits/rejected": -2.9969723224639893, "logps/chosen": -315.3205871582031, "logps/rejected": -319.23876953125, "loss": 0.5485, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5450848937034607, "rewards/margins": 0.7509050965309143, "rewards/rejected": -1.295989990234375, "step": 200 }, { "epoch": 0.15701668302257116, "eval_logits/chosen": -3.0087897777557373, "eval_logits/rejected": -3.0380184650421143, "eval_logps/chosen": -334.6849670410156, "eval_logps/rejected": -325.60333251953125, "eval_loss": 0.5298904776573181, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -0.532148003578186, "eval_rewards/margins": 0.7853737473487854, "eval_rewards/rejected": -1.3175216913223267, "eval_runtime": 171.0727, "eval_samples_per_second": 11.691, "eval_steps_per_second": 5.845, "step": 200 }, { "epoch": 0.16094210009813542, "grad_norm": 5.222318172454834, "learning_rate": 1.6797488226059656e-05, "logits/chosen": -2.896974563598633, "logits/rejected": -2.8886332511901855, "logps/chosen": -343.0540466308594, "logps/rejected": -330.8460388183594, "loss": 0.5583, "rewards/accuracies": 0.6791667342185974, "rewards/chosen": -0.49637308716773987, "rewards/margins": 0.8196843266487122, "rewards/rejected": -1.3160574436187744, "step": 205 }, { "epoch": 0.1648675171736997, "grad_norm": 4.988500118255615, "learning_rate": 1.6718995290423864e-05, "logits/chosen": -3.0786221027374268, "logits/rejected": -3.0803046226501465, "logps/chosen": -364.15679931640625, "logps/rejected": -344.13348388671875, "loss": 0.5415, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -0.23943662643432617, "rewards/margins": 0.7990777492523193, "rewards/rejected": -1.038514494895935, "step": 210 }, { "epoch": 0.16879293424926398, "grad_norm": 5.479763984680176, "learning_rate": 1.664050235478807e-05, "logits/chosen": -2.971986770629883, "logits/rejected": -2.9459171295166016, "logps/chosen": -297.9768371582031, "logps/rejected": -293.61004638671875, "loss": 0.5346, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.14465472102165222, "rewards/margins": 0.7910071611404419, "rewards/rejected": -0.6463524103164673, "step": 215 }, { "epoch": 0.17271835132482827, "grad_norm": 4.327686786651611, "learning_rate": 1.6562009419152278e-05, "logits/chosen": -3.0093016624450684, "logits/rejected": -2.9483301639556885, "logps/chosen": -312.3398742675781, "logps/rejected": -321.89990234375, "loss": 0.5276, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": 0.4244857728481293, "rewards/margins": 0.7893426418304443, "rewards/rejected": -0.36485689878463745, "step": 220 }, { "epoch": 0.17664376840039253, "grad_norm": 5.979199409484863, "learning_rate": 1.6483516483516486e-05, "logits/chosen": -3.0172836780548096, "logits/rejected": -3.0143485069274902, "logps/chosen": -305.2528991699219, "logps/rejected": -335.7610168457031, "loss": 0.5053, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": 0.4461596608161926, "rewards/margins": 0.9256842732429504, "rewards/rejected": -0.47952452301979065, "step": 225 }, { "epoch": 0.18056918547595682, "grad_norm": 5.040202617645264, "learning_rate": 1.6405023547880693e-05, "logits/chosen": -2.9379525184631348, "logits/rejected": -2.9742045402526855, "logps/chosen": -293.68841552734375, "logps/rejected": -288.2654724121094, "loss": 0.5655, "rewards/accuracies": 0.6958333253860474, "rewards/chosen": 0.10342751443386078, "rewards/margins": 0.7664733529090881, "rewards/rejected": -0.6630457639694214, "step": 230 }, { "epoch": 0.1844946025515211, "grad_norm": 6.082977294921875, "learning_rate": 1.63265306122449e-05, "logits/chosen": -2.972628593444824, "logits/rejected": -2.9927875995635986, "logps/chosen": -342.9659118652344, "logps/rejected": -326.91204833984375, "loss": 0.589, "rewards/accuracies": 0.6791666746139526, "rewards/chosen": -0.3530040681362152, "rewards/margins": 0.6996762752532959, "rewards/rejected": -1.0526803731918335, "step": 235 }, { "epoch": 0.18842001962708538, "grad_norm": 5.5625176429748535, "learning_rate": 1.6248037676609107e-05, "logits/chosen": -2.9938926696777344, "logits/rejected": -3.048494338989258, "logps/chosen": -365.29718017578125, "logps/rejected": -342.7457275390625, "loss": 0.56, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": -0.7515178918838501, "rewards/margins": 0.7500923871994019, "rewards/rejected": -1.501610279083252, "step": 240 }, { "epoch": 0.19234543670264967, "grad_norm": 4.691596984863281, "learning_rate": 1.6169544740973315e-05, "logits/chosen": -3.0218453407287598, "logits/rejected": -3.0742886066436768, "logps/chosen": -317.8963928222656, "logps/rejected": -302.0952453613281, "loss": 0.4919, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -0.7345434427261353, "rewards/margins": 0.9255669713020325, "rewards/rejected": -1.6601102352142334, "step": 245 }, { "epoch": 0.19627085377821393, "grad_norm": 6.012603759765625, "learning_rate": 1.6091051805337522e-05, "logits/chosen": -2.928499698638916, "logits/rejected": -3.021066188812256, "logps/chosen": -358.77154541015625, "logps/rejected": -351.7578430175781, "loss": 0.5549, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6217355728149414, "rewards/margins": 0.8296122550964355, "rewards/rejected": -1.4513477087020874, "step": 250 }, { "epoch": 0.20019627085377822, "grad_norm": 5.680856227874756, "learning_rate": 1.601255886970173e-05, "logits/chosen": -2.9057984352111816, "logits/rejected": -2.9277281761169434, "logps/chosen": -328.77520751953125, "logps/rejected": -336.87127685546875, "loss": 0.5115, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.37844187021255493, "rewards/margins": 0.9464572668075562, "rewards/rejected": -1.3248990774154663, "step": 255 }, { "epoch": 0.2041216879293425, "grad_norm": 5.254044532775879, "learning_rate": 1.5934065934065933e-05, "logits/chosen": -2.89184832572937, "logits/rejected": -2.9895052909851074, "logps/chosen": -327.46881103515625, "logps/rejected": -325.6787414550781, "loss": 0.5054, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -0.12966637313365936, "rewards/margins": 1.035685658454895, "rewards/rejected": -1.1653519868850708, "step": 260 }, { "epoch": 0.20804710500490678, "grad_norm": 5.806418418884277, "learning_rate": 1.5855572998430144e-05, "logits/chosen": -2.9994475841522217, "logits/rejected": -3.0370442867279053, "logps/chosen": -306.01910400390625, "logps/rejected": -285.0171203613281, "loss": 0.5742, "rewards/accuracies": 0.6958333253860474, "rewards/chosen": -0.1305859386920929, "rewards/margins": 0.7369558215141296, "rewards/rejected": -0.8675416707992554, "step": 265 }, { "epoch": 0.21197252208047104, "grad_norm": 4.36320686340332, "learning_rate": 1.577708006279435e-05, "logits/chosen": -2.8925890922546387, "logits/rejected": -2.969078540802002, "logps/chosen": -336.0727233886719, "logps/rejected": -323.86175537109375, "loss": 0.4837, "rewards/accuracies": 0.7791666984558105, "rewards/chosen": -0.15950943529605865, "rewards/margins": 0.9283342361450195, "rewards/rejected": -1.087843656539917, "step": 270 }, { "epoch": 0.21589793915603533, "grad_norm": 4.775585174560547, "learning_rate": 1.5698587127158558e-05, "logits/chosen": -2.9799633026123047, "logits/rejected": -2.998260736465454, "logps/chosen": -339.5887756347656, "logps/rejected": -322.0613708496094, "loss": 0.4891, "rewards/accuracies": 0.7458333969116211, "rewards/chosen": -0.2023775577545166, "rewards/margins": 0.9467275738716125, "rewards/rejected": -1.1491053104400635, "step": 275 }, { "epoch": 0.2198233562315996, "grad_norm": 5.527752876281738, "learning_rate": 1.5620094191522762e-05, "logits/chosen": -3.0590403079986572, "logits/rejected": -3.1203842163085938, "logps/chosen": -326.7115478515625, "logps/rejected": -302.5054016113281, "loss": 0.5579, "rewards/accuracies": 0.6791666746139526, "rewards/chosen": -0.37672197818756104, "rewards/margins": 0.8057243227958679, "rewards/rejected": -1.1824463605880737, "step": 280 }, { "epoch": 0.2237487733071639, "grad_norm": 4.813480377197266, "learning_rate": 1.5541601255886973e-05, "logits/chosen": -3.0462276935577393, "logits/rejected": -3.101527690887451, "logps/chosen": -325.5246276855469, "logps/rejected": -336.7899475097656, "loss": 0.5218, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": -0.45412200689315796, "rewards/margins": 0.7958036661148071, "rewards/rejected": -1.2499258518218994, "step": 285 }, { "epoch": 0.22767419038272815, "grad_norm": 4.553357124328613, "learning_rate": 1.546310832025118e-05, "logits/chosen": -2.9580142498016357, "logits/rejected": -2.963024139404297, "logps/chosen": -342.618896484375, "logps/rejected": -353.5760803222656, "loss": 0.5056, "rewards/accuracies": 0.7458333373069763, "rewards/chosen": -0.3053968548774719, "rewards/margins": 0.9428439140319824, "rewards/rejected": -1.2482408285140991, "step": 290 }, { "epoch": 0.23159960745829244, "grad_norm": 5.060155391693115, "learning_rate": 1.5384615384615387e-05, "logits/chosen": -2.9472720623016357, "logits/rejected": -3.016634702682495, "logps/chosen": -344.4608154296875, "logps/rejected": -318.6928405761719, "loss": 0.4181, "rewards/accuracies": 0.79583340883255, "rewards/chosen": -0.15565678477287292, "rewards/margins": 1.2021899223327637, "rewards/rejected": -1.3578466176986694, "step": 295 }, { "epoch": 0.23552502453385674, "grad_norm": 3.455629348754883, "learning_rate": 1.530612244897959e-05, "logits/chosen": -2.9951541423797607, "logits/rejected": -3.020198106765747, "logps/chosen": -308.458251953125, "logps/rejected": -329.9710998535156, "loss": 0.4599, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03096728026866913, "rewards/margins": 1.1172107458114624, "rewards/rejected": -1.148177981376648, "step": 300 }, { "epoch": 0.23552502453385674, "eval_logits/chosen": -3.0017831325531006, "eval_logits/rejected": -3.031320333480835, "eval_logps/chosen": -331.4214782714844, "eval_logps/rejected": -324.1851806640625, "eval_loss": 0.5177174210548401, "eval_rewards/accuracies": 0.7275000214576721, "eval_rewards/chosen": -0.20579998195171356, "eval_rewards/margins": 0.9699056148529053, "eval_rewards/rejected": -1.1757057905197144, "eval_runtime": 170.7357, "eval_samples_per_second": 11.714, "eval_steps_per_second": 5.857, "step": 300 }, { "epoch": 0.239450441609421, "grad_norm": 5.653757572174072, "learning_rate": 1.52276295133438e-05, "logits/chosen": -2.964921236038208, "logits/rejected": -2.9607841968536377, "logps/chosen": -355.35760498046875, "logps/rejected": -382.9061584472656, "loss": 0.5153, "rewards/accuracies": 0.720833420753479, "rewards/chosen": -0.141874298453331, "rewards/margins": 1.0537707805633545, "rewards/rejected": -1.1956450939178467, "step": 305 }, { "epoch": 0.2433758586849853, "grad_norm": 5.022225856781006, "learning_rate": 1.5149136577708007e-05, "logits/chosen": -2.906406879425049, "logits/rejected": -3.0163021087646484, "logps/chosen": -315.5581970214844, "logps/rejected": -309.3556213378906, "loss": 0.5086, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.02891678549349308, "rewards/margins": 1.1372863054275513, "rewards/rejected": -1.1662030220031738, "step": 310 }, { "epoch": 0.24730127576054955, "grad_norm": 4.488466739654541, "learning_rate": 1.5070643642072216e-05, "logits/chosen": -3.0036263465881348, "logits/rejected": -3.0129470825195312, "logps/chosen": -334.03955078125, "logps/rejected": -307.4307861328125, "loss": 0.5034, "rewards/accuracies": 0.7458333373069763, "rewards/chosen": -0.05140721797943115, "rewards/margins": 1.0299065113067627, "rewards/rejected": -1.0813137292861938, "step": 315 }, { "epoch": 0.2512266928361138, "grad_norm": 6.025671482086182, "learning_rate": 1.4992150706436422e-05, "logits/chosen": -3.0598220825195312, "logits/rejected": -2.9974725246429443, "logps/chosen": -336.5476989746094, "logps/rejected": -314.2373046875, "loss": 0.5272, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -0.0364832878112793, "rewards/margins": 0.9740719795227051, "rewards/rejected": -1.010555386543274, "step": 320 }, { "epoch": 0.25515210991167814, "grad_norm": 3.7678122520446777, "learning_rate": 1.4913657770800629e-05, "logits/chosen": -3.0125625133514404, "logits/rejected": -3.0665228366851807, "logps/chosen": -373.77386474609375, "logps/rejected": -362.3180847167969, "loss": 0.4491, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.11678016185760498, "rewards/margins": 1.1747580766677856, "rewards/rejected": -1.291538119316101, "step": 325 }, { "epoch": 0.2590775269872424, "grad_norm": 6.283595085144043, "learning_rate": 1.4835164835164836e-05, "logits/chosen": -2.9614596366882324, "logits/rejected": -3.0732309818267822, "logps/chosen": -340.6827392578125, "logps/rejected": -330.89178466796875, "loss": 0.5298, "rewards/accuracies": 0.7583334445953369, "rewards/chosen": -0.27896976470947266, "rewards/margins": 1.1068425178527832, "rewards/rejected": -1.3858124017715454, "step": 330 }, { "epoch": 0.26300294406280667, "grad_norm": 5.159917831420898, "learning_rate": 1.4756671899529042e-05, "logits/chosen": -2.9882471561431885, "logits/rejected": -2.958286762237549, "logps/chosen": -304.15899658203125, "logps/rejected": -337.4718933105469, "loss": 0.5442, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -0.5084502100944519, "rewards/margins": 0.826800525188446, "rewards/rejected": -1.3352506160736084, "step": 335 }, { "epoch": 0.26692836113837093, "grad_norm": 5.694521427154541, "learning_rate": 1.467817896389325e-05, "logits/chosen": -3.0913736820220947, "logits/rejected": -3.1289680004119873, "logps/chosen": -306.4797668457031, "logps/rejected": -315.7909851074219, "loss": 0.5478, "rewards/accuracies": 0.708333432674408, "rewards/chosen": -0.5964398384094238, "rewards/margins": 0.8492003679275513, "rewards/rejected": -1.4456400871276855, "step": 340 }, { "epoch": 0.27085377821393525, "grad_norm": 4.4798665046691895, "learning_rate": 1.4599686028257458e-05, "logits/chosen": -2.967850923538208, "logits/rejected": -3.0410642623901367, "logps/chosen": -341.167236328125, "logps/rejected": -345.0897216796875, "loss": 0.491, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.3792761266231537, "rewards/margins": 1.023285150527954, "rewards/rejected": -1.4025614261627197, "step": 345 }, { "epoch": 0.2747791952894995, "grad_norm": 4.080179214477539, "learning_rate": 1.4521193092621665e-05, "logits/chosen": -3.078789234161377, "logits/rejected": -3.0680148601531982, "logps/chosen": -342.5566711425781, "logps/rejected": -333.19732666015625, "loss": 0.4792, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.26053228974342346, "rewards/margins": 1.0307527780532837, "rewards/rejected": -1.2912850379943848, "step": 350 }, { "epoch": 0.2787046123650638, "grad_norm": 6.406314849853516, "learning_rate": 1.4442700156985871e-05, "logits/chosen": -2.993866443634033, "logits/rejected": -3.0816330909729004, "logps/chosen": -325.2431640625, "logps/rejected": -306.7488708496094, "loss": 0.5484, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": -0.2325890064239502, "rewards/margins": 0.7650774121284485, "rewards/rejected": -0.9976664781570435, "step": 355 }, { "epoch": 0.2826300294406281, "grad_norm": 4.517834663391113, "learning_rate": 1.436420722135008e-05, "logits/chosen": -2.9789466857910156, "logits/rejected": -2.9082839488983154, "logps/chosen": -326.4145202636719, "logps/rejected": -335.26019287109375, "loss": 0.4693, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.11003933846950531, "rewards/margins": 0.9707590341567993, "rewards/rejected": -1.0807983875274658, "step": 360 }, { "epoch": 0.28655544651619236, "grad_norm": 3.715141534805298, "learning_rate": 1.4285714285714287e-05, "logits/chosen": -2.981189250946045, "logits/rejected": -3.0649161338806152, "logps/chosen": -307.39764404296875, "logps/rejected": -301.6369934082031, "loss": 0.481, "rewards/accuracies": 0.7791666984558105, "rewards/chosen": -0.12946780025959015, "rewards/margins": 1.1251821517944336, "rewards/rejected": -1.2546498775482178, "step": 365 }, { "epoch": 0.2904808635917566, "grad_norm": 3.333448648452759, "learning_rate": 1.4207221350078494e-05, "logits/chosen": -2.984611988067627, "logits/rejected": -3.030081272125244, "logps/chosen": -309.75335693359375, "logps/rejected": -308.0655517578125, "loss": 0.4995, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.0332108810544014, "rewards/margins": 1.0196669101715088, "rewards/rejected": -1.0528777837753296, "step": 370 }, { "epoch": 0.2944062806673209, "grad_norm": 5.067115306854248, "learning_rate": 1.41287284144427e-05, "logits/chosen": -3.0295064449310303, "logits/rejected": -3.0551486015319824, "logps/chosen": -306.81561279296875, "logps/rejected": -293.89971923828125, "loss": 0.5708, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": -0.12991970777511597, "rewards/margins": 0.8558316230773926, "rewards/rejected": -0.9857513308525085, "step": 375 }, { "epoch": 0.2983316977428852, "grad_norm": 4.063174247741699, "learning_rate": 1.4050235478806909e-05, "logits/chosen": -2.9749984741210938, "logits/rejected": -3.0381150245666504, "logps/chosen": -374.1710510253906, "logps/rejected": -350.29168701171875, "loss": 0.5021, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.11557143926620483, "rewards/margins": 1.067694067955017, "rewards/rejected": -1.1832655668258667, "step": 380 }, { "epoch": 0.30225711481844947, "grad_norm": 4.8487396240234375, "learning_rate": 1.3971742543171116e-05, "logits/chosen": -2.9603800773620605, "logits/rejected": -3.019071102142334, "logps/chosen": -311.37628173828125, "logps/rejected": -320.9827575683594, "loss": 0.4274, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": -0.017604345455765724, "rewards/margins": 1.2022392749786377, "rewards/rejected": -1.2198436260223389, "step": 385 }, { "epoch": 0.30618253189401373, "grad_norm": 4.178959846496582, "learning_rate": 1.3893249607535323e-05, "logits/chosen": -3.040865421295166, "logits/rejected": -3.0666539669036865, "logps/chosen": -327.70928955078125, "logps/rejected": -322.9264221191406, "loss": 0.4558, "rewards/accuracies": 0.7750000953674316, "rewards/chosen": 0.11298879235982895, "rewards/margins": 1.198480248451233, "rewards/rejected": -1.085491418838501, "step": 390 }, { "epoch": 0.310107948969578, "grad_norm": 4.6533122062683105, "learning_rate": 1.3814756671899529e-05, "logits/chosen": -2.9745497703552246, "logits/rejected": -3.0309340953826904, "logps/chosen": -330.2334289550781, "logps/rejected": -340.81536865234375, "loss": 0.5296, "rewards/accuracies": 0.7041667103767395, "rewards/chosen": -0.015470663085579872, "rewards/margins": 1.0612616539001465, "rewards/rejected": -1.0767322778701782, "step": 395 }, { "epoch": 0.3140333660451423, "grad_norm": 5.215358734130859, "learning_rate": 1.3736263736263738e-05, "logits/chosen": -3.002040147781372, "logits/rejected": -3.0286686420440674, "logps/chosen": -353.7515563964844, "logps/rejected": -327.62030029296875, "loss": 0.5766, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2245529592037201, "rewards/margins": 0.8832836151123047, "rewards/rejected": -1.1078366041183472, "step": 400 }, { "epoch": 0.3140333660451423, "eval_logits/chosen": -3.016608953475952, "eval_logits/rejected": -3.0451409816741943, "eval_logps/chosen": -331.353271484375, "eval_logps/rejected": -324.7242736816406, "eval_loss": 0.5065792202949524, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -0.19897931814193726, "eval_rewards/margins": 1.030638337135315, "eval_rewards/rejected": -1.2296175956726074, "eval_runtime": 172.2901, "eval_samples_per_second": 11.608, "eval_steps_per_second": 5.804, "step": 400 }, { "epoch": 0.3179587831207066, "grad_norm": 3.946781635284424, "learning_rate": 1.3657770800627945e-05, "logits/chosen": -2.966956853866577, "logits/rejected": -3.056098461151123, "logps/chosen": -356.13909912109375, "logps/rejected": -317.91571044921875, "loss": 0.4387, "rewards/accuracies": 0.7791666984558105, "rewards/chosen": -0.06729185581207275, "rewards/margins": 1.262393832206726, "rewards/rejected": -1.3296858072280884, "step": 405 }, { "epoch": 0.32188420019627084, "grad_norm": 5.394034385681152, "learning_rate": 1.357927786499215e-05, "logits/chosen": -3.05330491065979, "logits/rejected": -3.038001775741577, "logps/chosen": -359.85101318359375, "logps/rejected": -340.84295654296875, "loss": 0.5456, "rewards/accuracies": 0.7458333969116211, "rewards/chosen": -0.33758872747421265, "rewards/margins": 0.9710124731063843, "rewards/rejected": -1.3086011409759521, "step": 410 }, { "epoch": 0.3258096172718351, "grad_norm": 5.074666976928711, "learning_rate": 1.3500784929356358e-05, "logits/chosen": -3.051016092300415, "logits/rejected": -3.0877463817596436, "logps/chosen": -344.28314208984375, "logps/rejected": -312.5767822265625, "loss": 0.5506, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -0.33664119243621826, "rewards/margins": 0.9169348478317261, "rewards/rejected": -1.2535761594772339, "step": 415 }, { "epoch": 0.3297350343473994, "grad_norm": 6.106690883636475, "learning_rate": 1.3422291993720567e-05, "logits/chosen": -3.0146520137786865, "logits/rejected": -3.0691580772399902, "logps/chosen": -326.73046875, "logps/rejected": -341.7567138671875, "loss": 0.5092, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": -0.4858369827270508, "rewards/margins": 1.0325465202331543, "rewards/rejected": -1.5183833837509155, "step": 420 }, { "epoch": 0.3336604514229637, "grad_norm": 4.349857330322266, "learning_rate": 1.3343799058084774e-05, "logits/chosen": -3.0472264289855957, "logits/rejected": -3.1090333461761475, "logps/chosen": -336.30706787109375, "logps/rejected": -323.3406982421875, "loss": 0.4795, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.33309873938560486, "rewards/margins": 1.1322623491287231, "rewards/rejected": -1.46536123752594, "step": 425 }, { "epoch": 0.33758586849852795, "grad_norm": 4.5804219245910645, "learning_rate": 1.326530612244898e-05, "logits/chosen": -2.928574323654175, "logits/rejected": -3.0479984283447266, "logps/chosen": -346.50762939453125, "logps/rejected": -323.5039978027344, "loss": 0.4464, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.41089972853660583, "rewards/margins": 1.1351535320281982, "rewards/rejected": -1.546053409576416, "step": 430 }, { "epoch": 0.34151128557409227, "grad_norm": 4.321537494659424, "learning_rate": 1.3186813186813187e-05, "logits/chosen": -3.0598042011260986, "logits/rejected": -3.1074109077453613, "logps/chosen": -331.7185363769531, "logps/rejected": -315.3821716308594, "loss": 0.5302, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3797026574611664, "rewards/margins": 0.9000622034072876, "rewards/rejected": -1.2797647714614868, "step": 435 }, { "epoch": 0.34543670264965654, "grad_norm": 5.391077041625977, "learning_rate": 1.3108320251177396e-05, "logits/chosen": -3.0739028453826904, "logits/rejected": -3.085374355316162, "logps/chosen": -336.9720458984375, "logps/rejected": -317.68609619140625, "loss": 0.4884, "rewards/accuracies": 0.7625001072883606, "rewards/chosen": 0.12200820446014404, "rewards/margins": 1.0954450368881226, "rewards/rejected": -0.973436713218689, "step": 440 }, { "epoch": 0.3493621197252208, "grad_norm": 4.104001045227051, "learning_rate": 1.3029827315541603e-05, "logits/chosen": -3.0037009716033936, "logits/rejected": -3.0276780128479004, "logps/chosen": -311.5299377441406, "logps/rejected": -290.1028137207031, "loss": 0.5133, "rewards/accuracies": 0.7291667461395264, "rewards/chosen": 0.2962660789489746, "rewards/margins": 1.045003056526184, "rewards/rejected": -0.7487369775772095, "step": 445 }, { "epoch": 0.35328753680078506, "grad_norm": 4.049530506134033, "learning_rate": 1.2951334379905809e-05, "logits/chosen": -3.0438899993896484, "logits/rejected": -3.1289470195770264, "logps/chosen": -312.2237548828125, "logps/rejected": -289.80206298828125, "loss": 0.4748, "rewards/accuracies": 0.783333420753479, "rewards/chosen": 0.2921372056007385, "rewards/margins": 1.1328870058059692, "rewards/rejected": -0.8407497406005859, "step": 450 }, { "epoch": 0.3572129538763494, "grad_norm": 4.7967753410339355, "learning_rate": 1.2872841444270016e-05, "logits/chosen": -3.0214269161224365, "logits/rejected": -3.032402276992798, "logps/chosen": -320.35296630859375, "logps/rejected": -322.9061279296875, "loss": 0.4614, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": 0.3199036717414856, "rewards/margins": 1.2129650115966797, "rewards/rejected": -0.8930614590644836, "step": 455 }, { "epoch": 0.36113837095191365, "grad_norm": 4.228839874267578, "learning_rate": 1.2794348508634225e-05, "logits/chosen": -3.0375332832336426, "logits/rejected": -3.0748536586761475, "logps/chosen": -338.22369384765625, "logps/rejected": -323.2271423339844, "loss": 0.4843, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": 0.07565226405858994, "rewards/margins": 1.1118285655975342, "rewards/rejected": -1.036176323890686, "step": 460 }, { "epoch": 0.3650637880274779, "grad_norm": 4.642974376678467, "learning_rate": 1.271585557299843e-05, "logits/chosen": -3.008430004119873, "logits/rejected": -3.081514835357666, "logps/chosen": -329.63775634765625, "logps/rejected": -327.0590515136719, "loss": 0.5383, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -0.1453629583120346, "rewards/margins": 1.031752347946167, "rewards/rejected": -1.1771153211593628, "step": 465 }, { "epoch": 0.3689892051030422, "grad_norm": 5.04410457611084, "learning_rate": 1.2637362637362638e-05, "logits/chosen": -3.0850863456726074, "logits/rejected": -3.0572659969329834, "logps/chosen": -331.3392028808594, "logps/rejected": -314.00653076171875, "loss": 0.4975, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -0.1242925375699997, "rewards/margins": 1.032974362373352, "rewards/rejected": -1.1572668552398682, "step": 470 }, { "epoch": 0.3729146221786065, "grad_norm": 6.009474754333496, "learning_rate": 1.2558869701726845e-05, "logits/chosen": -2.9685397148132324, "logits/rejected": -3.083059787750244, "logps/chosen": -352.71142578125, "logps/rejected": -327.12872314453125, "loss": 0.5339, "rewards/accuracies": 0.6958333849906921, "rewards/chosen": -0.14256823062896729, "rewards/margins": 1.011826753616333, "rewards/rejected": -1.1543948650360107, "step": 475 }, { "epoch": 0.37684003925417076, "grad_norm": 5.437267303466797, "learning_rate": 1.2480376766091054e-05, "logits/chosen": -3.053178310394287, "logits/rejected": -3.0942587852478027, "logps/chosen": -320.6322021484375, "logps/rejected": -312.03155517578125, "loss": 0.5107, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.028355002403259277, "rewards/margins": 1.0027117729187012, "rewards/rejected": -1.031066656112671, "step": 480 }, { "epoch": 0.380765456329735, "grad_norm": 5.028485298156738, "learning_rate": 1.240188383045526e-05, "logits/chosen": -2.875955104827881, "logits/rejected": -2.964559555053711, "logps/chosen": -347.6171569824219, "logps/rejected": -321.8453369140625, "loss": 0.523, "rewards/accuracies": 0.7208333611488342, "rewards/chosen": -0.030794035643339157, "rewards/margins": 1.1214120388031006, "rewards/rejected": -1.1522061824798584, "step": 485 }, { "epoch": 0.38469087340529934, "grad_norm": 4.091563701629639, "learning_rate": 1.2323390894819467e-05, "logits/chosen": -2.8418350219726562, "logits/rejected": -2.8112716674804688, "logps/chosen": -309.3431091308594, "logps/rejected": -331.121826171875, "loss": 0.4678, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": 0.0016341328155249357, "rewards/margins": 1.139491319656372, "rewards/rejected": -1.13785719871521, "step": 490 }, { "epoch": 0.3886162904808636, "grad_norm": 4.04428768157959, "learning_rate": 1.2244897959183674e-05, "logits/chosen": -3.047217845916748, "logits/rejected": -3.066889524459839, "logps/chosen": -317.3092956542969, "logps/rejected": -299.8882751464844, "loss": 0.5092, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.00645809480920434, "rewards/margins": 0.9209384918212891, "rewards/rejected": -0.9273965954780579, "step": 495 }, { "epoch": 0.39254170755642787, "grad_norm": 4.938584804534912, "learning_rate": 1.2166405023547883e-05, "logits/chosen": -3.029782772064209, "logits/rejected": -3.041254758834839, "logps/chosen": -329.04168701171875, "logps/rejected": -326.0158996582031, "loss": 0.4689, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.10879228264093399, "rewards/margins": 1.0214567184448242, "rewards/rejected": -0.9126644134521484, "step": 500 }, { "epoch": 0.39254170755642787, "eval_logits/chosen": -3.0307250022888184, "eval_logits/rejected": -3.0587258338928223, "eval_logps/chosen": -328.3224792480469, "eval_logps/rejected": -321.09954833984375, "eval_loss": 0.5021634697914124, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": 0.10410188138484955, "eval_rewards/margins": 0.971247136592865, "eval_rewards/rejected": -0.8671452403068542, "eval_runtime": 170.3972, "eval_samples_per_second": 11.737, "eval_steps_per_second": 5.869, "step": 500 }, { "epoch": 0.39646712463199213, "grad_norm": 6.137747287750244, "learning_rate": 1.2087912087912089e-05, "logits/chosen": -3.0541741847991943, "logits/rejected": -3.036529064178467, "logps/chosen": -306.2962341308594, "logps/rejected": -317.01666259765625, "loss": 0.5514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02994796633720398, "rewards/margins": 0.8114150762557983, "rewards/rejected": -0.7814672589302063, "step": 505 }, { "epoch": 0.40039254170755645, "grad_norm": 5.219884872436523, "learning_rate": 1.2009419152276296e-05, "logits/chosen": -3.1144938468933105, "logits/rejected": -3.1160566806793213, "logps/chosen": -329.6716613769531, "logps/rejected": -319.11407470703125, "loss": 0.5221, "rewards/accuracies": 0.7625000476837158, "rewards/chosen": 0.043878063559532166, "rewards/margins": 1.0103065967559814, "rewards/rejected": -0.9664285778999329, "step": 510 }, { "epoch": 0.4043179587831207, "grad_norm": 4.543951511383057, "learning_rate": 1.1930926216640503e-05, "logits/chosen": -2.979218006134033, "logits/rejected": -2.968158483505249, "logps/chosen": -286.9298400878906, "logps/rejected": -313.0599060058594, "loss": 0.5016, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.1237020492553711, "rewards/margins": 0.9319057464599609, "rewards/rejected": -1.055607795715332, "step": 515 }, { "epoch": 0.408243375858685, "grad_norm": 5.305350303649902, "learning_rate": 1.1852433281004712e-05, "logits/chosen": -2.945699691772461, "logits/rejected": -2.975595474243164, "logps/chosen": -313.67431640625, "logps/rejected": -317.87261962890625, "loss": 0.4597, "rewards/accuracies": 0.7625000476837158, "rewards/chosen": 0.10118236392736435, "rewards/margins": 1.1309287548065186, "rewards/rejected": -1.0297462940216064, "step": 520 }, { "epoch": 0.41216879293424924, "grad_norm": 4.5991692543029785, "learning_rate": 1.1773940345368918e-05, "logits/chosen": -2.9673571586608887, "logits/rejected": -2.9875621795654297, "logps/chosen": -318.8524475097656, "logps/rejected": -301.4873046875, "loss": 0.4501, "rewards/accuracies": 0.79583340883255, "rewards/chosen": -0.11968664824962616, "rewards/margins": 1.2507131099700928, "rewards/rejected": -1.370399832725525, "step": 525 }, { "epoch": 0.41609421000981356, "grad_norm": 4.974186420440674, "learning_rate": 1.1695447409733125e-05, "logits/chosen": -2.9936716556549072, "logits/rejected": -3.038217544555664, "logps/chosen": -349.4075927734375, "logps/rejected": -314.17413330078125, "loss": 0.5153, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -0.30208900570869446, "rewards/margins": 1.0824108123779297, "rewards/rejected": -1.3844999074935913, "step": 530 }, { "epoch": 0.4200196270853778, "grad_norm": 4.36262845993042, "learning_rate": 1.1616954474097332e-05, "logits/chosen": -3.0161938667297363, "logits/rejected": -3.0261764526367188, "logps/chosen": -342.5355529785156, "logps/rejected": -355.38604736328125, "loss": 0.4833, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.37566858530044556, "rewards/margins": 1.1315479278564453, "rewards/rejected": -1.5072165727615356, "step": 535 }, { "epoch": 0.4239450441609421, "grad_norm": 4.01798677444458, "learning_rate": 1.1538461538461538e-05, "logits/chosen": -3.040135145187378, "logits/rejected": -3.107337474822998, "logps/chosen": -312.9956970214844, "logps/rejected": -323.22760009765625, "loss": 0.4472, "rewards/accuracies": 0.7916667461395264, "rewards/chosen": -0.5123935341835022, "rewards/margins": 1.0743831396102905, "rewards/rejected": -1.5867767333984375, "step": 540 }, { "epoch": 0.4278704612365064, "grad_norm": 5.212751388549805, "learning_rate": 1.1459968602825747e-05, "logits/chosen": -3.0182480812072754, "logits/rejected": -3.0287539958953857, "logps/chosen": -332.08709716796875, "logps/rejected": -337.66424560546875, "loss": 0.5333, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": -0.5322802066802979, "rewards/margins": 1.01456618309021, "rewards/rejected": -1.546846628189087, "step": 545 }, { "epoch": 0.43179587831207067, "grad_norm": 4.944022178649902, "learning_rate": 1.1381475667189954e-05, "logits/chosen": -2.9780099391937256, "logits/rejected": -3.069441080093384, "logps/chosen": -343.2859191894531, "logps/rejected": -345.21441650390625, "loss": 0.5462, "rewards/accuracies": 0.7041666507720947, "rewards/chosen": -0.5427877306938171, "rewards/margins": 1.0927588939666748, "rewards/rejected": -1.6355466842651367, "step": 550 }, { "epoch": 0.43572129538763493, "grad_norm": 3.889643669128418, "learning_rate": 1.1302982731554161e-05, "logits/chosen": -2.9730026721954346, "logits/rejected": -3.022367477416992, "logps/chosen": -324.47406005859375, "logps/rejected": -326.12353515625, "loss": 0.4898, "rewards/accuracies": 0.7708333134651184, "rewards/chosen": -0.42287206649780273, "rewards/margins": 1.078303575515747, "rewards/rejected": -1.5011756420135498, "step": 555 }, { "epoch": 0.4396467124631992, "grad_norm": 5.789305210113525, "learning_rate": 1.1224489795918367e-05, "logits/chosen": -3.0275089740753174, "logits/rejected": -3.048832654953003, "logps/chosen": -327.5396423339844, "logps/rejected": -334.8791198730469, "loss": 0.4796, "rewards/accuracies": 0.7625000476837158, "rewards/chosen": -0.35886624455451965, "rewards/margins": 1.1369972229003906, "rewards/rejected": -1.4958635568618774, "step": 560 }, { "epoch": 0.4435721295387635, "grad_norm": 4.372865676879883, "learning_rate": 1.1145996860282576e-05, "logits/chosen": -3.0532193183898926, "logits/rejected": -3.059150457382202, "logps/chosen": -312.3909606933594, "logps/rejected": -333.62982177734375, "loss": 0.5414, "rewards/accuracies": 0.7000001072883606, "rewards/chosen": -0.38431116938591003, "rewards/margins": 0.9614561796188354, "rewards/rejected": -1.3457673788070679, "step": 565 }, { "epoch": 0.4474975466143278, "grad_norm": 4.2039642333984375, "learning_rate": 1.1067503924646783e-05, "logits/chosen": -2.9767494201660156, "logits/rejected": -3.029498338699341, "logps/chosen": -304.53497314453125, "logps/rejected": -317.2914733886719, "loss": 0.4874, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.24222330749034882, "rewards/margins": 1.0617117881774902, "rewards/rejected": -1.303935170173645, "step": 570 }, { "epoch": 0.45142296368989204, "grad_norm": 4.793886661529541, "learning_rate": 1.098901098901099e-05, "logits/chosen": -3.0597524642944336, "logits/rejected": -3.086867332458496, "logps/chosen": -341.6616516113281, "logps/rejected": -336.95037841796875, "loss": 0.4886, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.08540080487728119, "rewards/margins": 1.0467342138290405, "rewards/rejected": -1.132135033607483, "step": 575 }, { "epoch": 0.4553483807654563, "grad_norm": 14.698615074157715, "learning_rate": 1.0910518053375196e-05, "logits/chosen": -3.0199100971221924, "logits/rejected": -3.032982349395752, "logps/chosen": -356.33404541015625, "logps/rejected": -344.212890625, "loss": 0.459, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": 0.0503697507083416, "rewards/margins": 1.1821739673614502, "rewards/rejected": -1.1318042278289795, "step": 580 }, { "epoch": 0.4592737978410206, "grad_norm": 3.817831516265869, "learning_rate": 1.0832025117739405e-05, "logits/chosen": -3.019148349761963, "logits/rejected": -2.993727207183838, "logps/chosen": -329.00787353515625, "logps/rejected": -309.19903564453125, "loss": 0.4704, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": 0.019623804837465286, "rewards/margins": 1.1090670824050903, "rewards/rejected": -1.0894432067871094, "step": 585 }, { "epoch": 0.4631992149165849, "grad_norm": 4.845146656036377, "learning_rate": 1.0753532182103612e-05, "logits/chosen": -3.0271801948547363, "logits/rejected": -3.024869441986084, "logps/chosen": -352.64697265625, "logps/rejected": -353.69305419921875, "loss": 0.494, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.22464075684547424, "rewards/margins": 1.1474275588989258, "rewards/rejected": -1.3720684051513672, "step": 590 }, { "epoch": 0.46712463199214915, "grad_norm": 4.333688735961914, "learning_rate": 1.067503924646782e-05, "logits/chosen": -3.0770018100738525, "logits/rejected": -3.119150400161743, "logps/chosen": -354.69439697265625, "logps/rejected": -327.85791015625, "loss": 0.456, "rewards/accuracies": 0.7791666388511658, "rewards/chosen": -0.06289488822221756, "rewards/margins": 1.210386037826538, "rewards/rejected": -1.2732809782028198, "step": 595 }, { "epoch": 0.47105004906771347, "grad_norm": 4.578314304351807, "learning_rate": 1.0596546310832025e-05, "logits/chosen": -3.0018227100372314, "logits/rejected": -3.0865108966827393, "logps/chosen": -311.1248779296875, "logps/rejected": -309.74273681640625, "loss": 0.4818, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.05175580456852913, "rewards/margins": 1.112265944480896, "rewards/rejected": -1.1640217304229736, "step": 600 }, { "epoch": 0.47105004906771347, "eval_logits/chosen": -3.0338661670684814, "eval_logits/rejected": -3.0616049766540527, "eval_logps/chosen": -330.5699157714844, "eval_logps/rejected": -324.86346435546875, "eval_loss": 0.49363288283348083, "eval_rewards/accuracies": 0.7524999976158142, "eval_rewards/chosen": -0.1206398755311966, "eval_rewards/margins": 1.1228933334350586, "eval_rewards/rejected": -1.2435332536697388, "eval_runtime": 171.2479, "eval_samples_per_second": 11.679, "eval_steps_per_second": 5.839, "step": 600 }, { "epoch": 0.47497546614327774, "grad_norm": 5.259768009185791, "learning_rate": 1.0518053375196234e-05, "logits/chosen": -3.009653091430664, "logits/rejected": -3.0764992237091064, "logps/chosen": -346.80059814453125, "logps/rejected": -324.3446350097656, "loss": 0.5004, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": -0.03653601557016373, "rewards/margins": 1.1117885112762451, "rewards/rejected": -1.1483246088027954, "step": 605 }, { "epoch": 0.478900883218842, "grad_norm": 6.143235206604004, "learning_rate": 1.0439560439560441e-05, "logits/chosen": -2.989396810531616, "logits/rejected": -3.0582377910614014, "logps/chosen": -337.9186096191406, "logps/rejected": -332.8787841796875, "loss": 0.575, "rewards/accuracies": 0.6875000596046448, "rewards/chosen": -0.15022191405296326, "rewards/margins": 1.013187050819397, "rewards/rejected": -1.163408875465393, "step": 610 }, { "epoch": 0.48282630029440626, "grad_norm": 5.3697590827941895, "learning_rate": 1.0361067503924647e-05, "logits/chosen": -3.028064727783203, "logits/rejected": -3.033423662185669, "logps/chosen": -343.3203430175781, "logps/rejected": -327.01202392578125, "loss": 0.4706, "rewards/accuracies": 0.7791666984558105, "rewards/chosen": 0.030805181711912155, "rewards/margins": 1.2250150442123413, "rewards/rejected": -1.1942098140716553, "step": 615 }, { "epoch": 0.4867517173699706, "grad_norm": 3.8416597843170166, "learning_rate": 1.0282574568288854e-05, "logits/chosen": -2.909081220626831, "logits/rejected": -2.975391387939453, "logps/chosen": -323.62725830078125, "logps/rejected": -332.06524658203125, "loss": 0.4792, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": -0.17823012173175812, "rewards/margins": 1.1644293069839478, "rewards/rejected": -1.3426594734191895, "step": 620 }, { "epoch": 0.49067713444553485, "grad_norm": 5.597027778625488, "learning_rate": 1.0204081632653063e-05, "logits/chosen": -3.052537441253662, "logits/rejected": -3.0111711025238037, "logps/chosen": -319.8970642089844, "logps/rejected": -322.13671875, "loss": 0.5098, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.31647107005119324, "rewards/margins": 1.0438673496246338, "rewards/rejected": -1.3603384494781494, "step": 625 }, { "epoch": 0.4946025515210991, "grad_norm": 5.787623405456543, "learning_rate": 1.012558869701727e-05, "logits/chosen": -2.932849407196045, "logits/rejected": -3.022317886352539, "logps/chosen": -350.6145324707031, "logps/rejected": -343.1180114746094, "loss": 0.5072, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": -0.10851552337408066, "rewards/margins": 1.0356271266937256, "rewards/rejected": -1.144142746925354, "step": 630 }, { "epoch": 0.4985279685966634, "grad_norm": 6.2099385261535645, "learning_rate": 1.0047095761381476e-05, "logits/chosen": -2.9498133659362793, "logits/rejected": -2.9196791648864746, "logps/chosen": -315.73150634765625, "logps/rejected": -315.6263427734375, "loss": 0.5515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16974535584449768, "rewards/margins": 0.9117358326911926, "rewards/rejected": -1.0814812183380127, "step": 635 }, { "epoch": 0.5024533856722276, "grad_norm": 4.4105448722839355, "learning_rate": 9.968602825745683e-06, "logits/chosen": -2.970156192779541, "logits/rejected": -2.9927220344543457, "logps/chosen": -296.1843566894531, "logps/rejected": -292.82244873046875, "loss": 0.5219, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.09649379551410675, "rewards/margins": 0.9780260324478149, "rewards/rejected": -1.0745197534561157, "step": 640 }, { "epoch": 0.5063788027477919, "grad_norm": 4.673651218414307, "learning_rate": 9.890109890109892e-06, "logits/chosen": -3.0400826930999756, "logits/rejected": -3.0831387042999268, "logps/chosen": -339.0031433105469, "logps/rejected": -329.25152587890625, "loss": 0.5512, "rewards/accuracies": 0.7041667699813843, "rewards/chosen": -0.14476463198661804, "rewards/margins": 0.8560221791267395, "rewards/rejected": -1.0007867813110352, "step": 645 }, { "epoch": 0.5103042198233563, "grad_norm": 4.254279613494873, "learning_rate": 9.811616954474098e-06, "logits/chosen": -3.0292727947235107, "logits/rejected": -3.0746917724609375, "logps/chosen": -313.74591064453125, "logps/rejected": -302.05279541015625, "loss": 0.5132, "rewards/accuracies": 0.7166666388511658, "rewards/chosen": -0.06440563499927521, "rewards/margins": 1.017377257347107, "rewards/rejected": -1.0817829370498657, "step": 650 }, { "epoch": 0.5142296368989205, "grad_norm": 4.169983863830566, "learning_rate": 9.733124018838307e-06, "logits/chosen": -2.944243907928467, "logits/rejected": -3.0422754287719727, "logps/chosen": -288.42559814453125, "logps/rejected": -296.96026611328125, "loss": 0.4622, "rewards/accuracies": 0.7958333492279053, "rewards/chosen": -0.060607265681028366, "rewards/margins": 1.0034582614898682, "rewards/rejected": -1.0640655755996704, "step": 655 }, { "epoch": 0.5181550539744848, "grad_norm": 4.819730281829834, "learning_rate": 9.654631083202512e-06, "logits/chosen": -2.9609310626983643, "logits/rejected": -3.0102219581604004, "logps/chosen": -329.1315002441406, "logps/rejected": -305.99737548828125, "loss": 0.5414, "rewards/accuracies": 0.6958333849906921, "rewards/chosen": -0.272332102060318, "rewards/margins": 0.9030130505561829, "rewards/rejected": -1.1753450632095337, "step": 660 }, { "epoch": 0.5220804710500491, "grad_norm": 4.765251159667969, "learning_rate": 9.576138147566721e-06, "logits/chosen": -2.911606550216675, "logits/rejected": -3.0551559925079346, "logps/chosen": -372.60357666015625, "logps/rejected": -350.260498046875, "loss": 0.5388, "rewards/accuracies": 0.7458333969116211, "rewards/chosen": -0.2132304608821869, "rewards/margins": 0.8807324171066284, "rewards/rejected": -1.0939629077911377, "step": 665 }, { "epoch": 0.5260058881256133, "grad_norm": 4.679354667663574, "learning_rate": 9.497645211930927e-06, "logits/chosen": -2.996860980987549, "logits/rejected": -2.969712734222412, "logps/chosen": -329.93768310546875, "logps/rejected": -349.2161560058594, "loss": 0.5549, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": -0.19874221086502075, "rewards/margins": 0.9041234254837036, "rewards/rejected": -1.1028656959533691, "step": 670 }, { "epoch": 0.5299313052011776, "grad_norm": 5.291715621948242, "learning_rate": 9.419152276295134e-06, "logits/chosen": -3.0396275520324707, "logits/rejected": -3.0426812171936035, "logps/chosen": -360.34552001953125, "logps/rejected": -345.2974853515625, "loss": 0.561, "rewards/accuracies": 0.6916667222976685, "rewards/chosen": -0.31488001346588135, "rewards/margins": 0.8902130126953125, "rewards/rejected": -1.2050931453704834, "step": 675 }, { "epoch": 0.5338567222767419, "grad_norm": 4.548229694366455, "learning_rate": 9.340659340659341e-06, "logits/chosen": -2.9637022018432617, "logits/rejected": -2.995466947555542, "logps/chosen": -349.22283935546875, "logps/rejected": -336.8186340332031, "loss": 0.5911, "rewards/accuracies": 0.67083340883255, "rewards/chosen": -0.22216463088989258, "rewards/margins": 0.75648432970047, "rewards/rejected": -0.978648841381073, "step": 680 }, { "epoch": 0.5377821393523062, "grad_norm": 3.729768753051758, "learning_rate": 9.262166405023548e-06, "logits/chosen": -2.9287047386169434, "logits/rejected": -2.999833106994629, "logps/chosen": -338.10162353515625, "logps/rejected": -330.71282958984375, "loss": 0.4436, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": 0.11126607656478882, "rewards/margins": 1.1149643659591675, "rewards/rejected": -1.003698468208313, "step": 685 }, { "epoch": 0.5417075564278705, "grad_norm": 3.5768096446990967, "learning_rate": 9.183673469387756e-06, "logits/chosen": -3.034193754196167, "logits/rejected": -3.030886650085449, "logps/chosen": -336.0780944824219, "logps/rejected": -328.44195556640625, "loss": 0.4591, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": 0.15392692387104034, "rewards/margins": 1.142258882522583, "rewards/rejected": -0.9883320927619934, "step": 690 }, { "epoch": 0.5456329735034348, "grad_norm": 4.376099109649658, "learning_rate": 9.105180533751963e-06, "logits/chosen": -2.9962871074676514, "logits/rejected": -3.0676894187927246, "logps/chosen": -312.1530456542969, "logps/rejected": -304.1636657714844, "loss": 0.4686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.018533676862716675, "rewards/margins": 1.1388862133026123, "rewards/rejected": -1.1574198007583618, "step": 695 }, { "epoch": 0.549558390578999, "grad_norm": 5.068153381347656, "learning_rate": 9.02668759811617e-06, "logits/chosen": -3.0012753009796143, "logits/rejected": -2.9406380653381348, "logps/chosen": -334.9573059082031, "logps/rejected": -336.917724609375, "loss": 0.5127, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.1398693174123764, "rewards/margins": 1.0154725313186646, "rewards/rejected": -1.1553419828414917, "step": 700 }, { "epoch": 0.549558390578999, "eval_logits/chosen": -3.029242753982544, "eval_logits/rejected": -3.0569090843200684, "eval_logps/chosen": -332.35626220703125, "eval_logps/rejected": -325.6695861816406, "eval_loss": 0.49267128109931946, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.29927709698677063, "eval_rewards/margins": 1.0248706340789795, "eval_rewards/rejected": -1.3241477012634277, "eval_runtime": 170.0257, "eval_samples_per_second": 11.763, "eval_steps_per_second": 5.881, "step": 700 }, { "epoch": 0.5534838076545633, "grad_norm": 5.191127777099609, "learning_rate": 8.948194662480377e-06, "logits/chosen": -2.9839444160461426, "logits/rejected": -3.006364345550537, "logps/chosen": -337.6143798828125, "logps/rejected": -352.8746643066406, "loss": 0.5361, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.30696621537208557, "rewards/margins": 1.0262271165847778, "rewards/rejected": -1.3331931829452515, "step": 705 }, { "epoch": 0.5574092247301276, "grad_norm": 3.932312250137329, "learning_rate": 8.869701726844585e-06, "logits/chosen": -2.9533979892730713, "logits/rejected": -3.0565037727355957, "logps/chosen": -316.4660949707031, "logps/rejected": -302.0240783691406, "loss": 0.4628, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.23080229759216309, "rewards/margins": 1.1649229526519775, "rewards/rejected": -1.3957254886627197, "step": 710 }, { "epoch": 0.5613346418056918, "grad_norm": 6.213256359100342, "learning_rate": 8.791208791208792e-06, "logits/chosen": -2.9602789878845215, "logits/rejected": -2.960151195526123, "logps/chosen": -283.5656433105469, "logps/rejected": -263.10260009765625, "loss": 0.4912, "rewards/accuracies": 0.7458332777023315, "rewards/chosen": -0.20947471261024475, "rewards/margins": 1.0615582466125488, "rewards/rejected": -1.2710330486297607, "step": 715 }, { "epoch": 0.5652600588812562, "grad_norm": 3.982071876525879, "learning_rate": 8.712715855573e-06, "logits/chosen": -2.966439723968506, "logits/rejected": -3.0421411991119385, "logps/chosen": -328.9455261230469, "logps/rejected": -314.2897644042969, "loss": 0.4648, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.02404719963669777, "rewards/margins": 1.0856393575668335, "rewards/rejected": -1.1096864938735962, "step": 720 }, { "epoch": 0.5691854759568205, "grad_norm": 5.195390701293945, "learning_rate": 8.634222919937206e-06, "logits/chosen": -3.0160458087921143, "logits/rejected": -3.0692107677459717, "logps/chosen": -332.7279357910156, "logps/rejected": -336.43927001953125, "loss": 0.5002, "rewards/accuracies": 0.7291666269302368, "rewards/chosen": -0.19675110280513763, "rewards/margins": 0.985478401184082, "rewards/rejected": -1.1822296380996704, "step": 725 }, { "epoch": 0.5731108930323847, "grad_norm": 4.930713653564453, "learning_rate": 8.555729984301414e-06, "logits/chosen": -3.0057716369628906, "logits/rejected": -3.0269782543182373, "logps/chosen": -306.15875244140625, "logps/rejected": -305.45416259765625, "loss": 0.4791, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.07756291329860687, "rewards/margins": 1.0487645864486694, "rewards/rejected": -1.1263275146484375, "step": 730 }, { "epoch": 0.577036310107949, "grad_norm": 3.645521640777588, "learning_rate": 8.477237048665621e-06, "logits/chosen": -3.024445056915283, "logits/rejected": -3.0869052410125732, "logps/chosen": -303.17327880859375, "logps/rejected": -301.08905029296875, "loss": 0.4716, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": -0.038672782480716705, "rewards/margins": 1.1686241626739502, "rewards/rejected": -1.2072969675064087, "step": 735 }, { "epoch": 0.5809617271835132, "grad_norm": 4.946695327758789, "learning_rate": 8.398744113029828e-06, "logits/chosen": -3.0039095878601074, "logits/rejected": -3.015929698944092, "logps/chosen": -322.4432373046875, "logps/rejected": -309.04107666015625, "loss": 0.5134, "rewards/accuracies": 0.7458333373069763, "rewards/chosen": -0.12716850638389587, "rewards/margins": 0.9853676557540894, "rewards/rejected": -1.1125361919403076, "step": 740 }, { "epoch": 0.5848871442590775, "grad_norm": 5.070699214935303, "learning_rate": 8.320251177394036e-06, "logits/chosen": -3.0126490592956543, "logits/rejected": -3.098520278930664, "logps/chosen": -357.4333190917969, "logps/rejected": -332.8435363769531, "loss": 0.5058, "rewards/accuracies": 0.7625000476837158, "rewards/chosen": -0.04318712279200554, "rewards/margins": 1.034911036491394, "rewards/rejected": -1.078098177909851, "step": 745 }, { "epoch": 0.5888125613346418, "grad_norm": 3.6236932277679443, "learning_rate": 8.241758241758243e-06, "logits/chosen": -2.9999072551727295, "logits/rejected": -2.990403413772583, "logps/chosen": -357.82733154296875, "logps/rejected": -326.899169921875, "loss": 0.4588, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.11650122702121735, "rewards/margins": 1.1574772596359253, "rewards/rejected": -1.2739784717559814, "step": 750 }, { "epoch": 0.592737978410206, "grad_norm": 4.285853862762451, "learning_rate": 8.16326530612245e-06, "logits/chosen": -3.0003159046173096, "logits/rejected": -3.048569679260254, "logps/chosen": -349.8592834472656, "logps/rejected": -304.07427978515625, "loss": 0.4784, "rewards/accuracies": 0.7750000953674316, "rewards/chosen": -0.07542826980352402, "rewards/margins": 1.2488701343536377, "rewards/rejected": -1.32429838180542, "step": 755 }, { "epoch": 0.5966633954857704, "grad_norm": 4.365904331207275, "learning_rate": 8.084772370486657e-06, "logits/chosen": -3.0357697010040283, "logits/rejected": -3.12182879447937, "logps/chosen": -317.8271484375, "logps/rejected": -298.142578125, "loss": 0.4855, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.06173648685216904, "rewards/margins": 1.079683542251587, "rewards/rejected": -1.1414198875427246, "step": 760 }, { "epoch": 0.6005888125613347, "grad_norm": 4.72523307800293, "learning_rate": 8.006279434850865e-06, "logits/chosen": -3.0452723503112793, "logits/rejected": -3.0594534873962402, "logps/chosen": -317.8534240722656, "logps/rejected": -329.19976806640625, "loss": 0.5079, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -0.3027920126914978, "rewards/margins": 1.1086828708648682, "rewards/rejected": -1.4114749431610107, "step": 765 }, { "epoch": 0.6045142296368989, "grad_norm": 4.864989757537842, "learning_rate": 7.927786499215072e-06, "logits/chosen": -3.0068793296813965, "logits/rejected": -3.017885446548462, "logps/chosen": -314.66876220703125, "logps/rejected": -317.1419677734375, "loss": 0.4965, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -0.18796098232269287, "rewards/margins": 1.0385706424713135, "rewards/rejected": -1.2265316247940063, "step": 770 }, { "epoch": 0.6084396467124632, "grad_norm": 4.275363922119141, "learning_rate": 7.849293563579279e-06, "logits/chosen": -3.002530574798584, "logits/rejected": -2.980320453643799, "logps/chosen": -317.7269287109375, "logps/rejected": -339.15997314453125, "loss": 0.4468, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.04108821228146553, "rewards/margins": 1.3123092651367188, "rewards/rejected": -1.3533976078033447, "step": 775 }, { "epoch": 0.6123650637880275, "grad_norm": 4.48534631729126, "learning_rate": 7.770800627943486e-06, "logits/chosen": -2.961935520172119, "logits/rejected": -3.0330467224121094, "logps/chosen": -370.4855041503906, "logps/rejected": -328.0148010253906, "loss": 0.5256, "rewards/accuracies": 0.7791666984558105, "rewards/chosen": 0.09827003628015518, "rewards/margins": 1.1447842121124268, "rewards/rejected": -1.0465141534805298, "step": 780 }, { "epoch": 0.6162904808635917, "grad_norm": 4.550291061401367, "learning_rate": 7.692307692307694e-06, "logits/chosen": -3.0217397212982178, "logits/rejected": -3.0674948692321777, "logps/chosen": -317.39971923828125, "logps/rejected": -297.3795471191406, "loss": 0.5762, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -0.1348334103822708, "rewards/margins": 0.9545001983642578, "rewards/rejected": -1.0893336534500122, "step": 785 }, { "epoch": 0.620215897939156, "grad_norm": 3.8520846366882324, "learning_rate": 7.6138147566719e-06, "logits/chosen": -3.0688512325286865, "logits/rejected": -3.068896770477295, "logps/chosen": -332.6806640625, "logps/rejected": -327.0592956542969, "loss": 0.4392, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005734431557357311, "rewards/margins": 1.1970218420028687, "rewards/rejected": -1.2027562856674194, "step": 790 }, { "epoch": 0.6241413150147204, "grad_norm": 5.522543907165527, "learning_rate": 7.535321821036108e-06, "logits/chosen": -3.0281357765197754, "logits/rejected": -3.069322109222412, "logps/chosen": -318.053466796875, "logps/rejected": -313.0615234375, "loss": 0.5247, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -0.1159566193819046, "rewards/margins": 0.977625846862793, "rewards/rejected": -1.0935826301574707, "step": 795 }, { "epoch": 0.6280667320902846, "grad_norm": 5.020391941070557, "learning_rate": 7.4568288854003145e-06, "logits/chosen": -2.9681613445281982, "logits/rejected": -2.948774814605713, "logps/chosen": -342.89019775390625, "logps/rejected": -317.9969482421875, "loss": 0.4847, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -0.22335605323314667, "rewards/margins": 1.1023824214935303, "rewards/rejected": -1.3257384300231934, "step": 800 }, { "epoch": 0.6280667320902846, "eval_logits/chosen": -3.0195696353912354, "eval_logits/rejected": -3.047680139541626, "eval_logps/chosen": -330.66943359375, "eval_logps/rejected": -324.49725341796875, "eval_loss": 0.489461213350296, "eval_rewards/accuracies": 0.7534999847412109, "eval_rewards/chosen": -0.13059695065021515, "eval_rewards/margins": 1.0763192176818848, "eval_rewards/rejected": -1.2069162130355835, "eval_runtime": 170.5252, "eval_samples_per_second": 11.728, "eval_steps_per_second": 5.864, "step": 800 }, { "epoch": 0.6319921491658489, "grad_norm": 3.8706929683685303, "learning_rate": 7.378335949764521e-06, "logits/chosen": -3.020859479904175, "logits/rejected": -3.0139949321746826, "logps/chosen": -333.51654052734375, "logps/rejected": -324.34991455078125, "loss": 0.5385, "rewards/accuracies": 0.720833420753479, "rewards/chosen": -0.24091720581054688, "rewards/margins": 0.9881976246833801, "rewards/rejected": -1.2291150093078613, "step": 805 }, { "epoch": 0.6359175662414132, "grad_norm": 3.939385175704956, "learning_rate": 7.299843014128729e-06, "logits/chosen": -2.999584674835205, "logits/rejected": -2.953639507293701, "logps/chosen": -311.6091003417969, "logps/rejected": -367.5775451660156, "loss": 0.4001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0669659823179245, "rewards/margins": 1.300567388534546, "rewards/rejected": -1.3675333261489868, "step": 810 }, { "epoch": 0.6398429833169774, "grad_norm": 3.8747594356536865, "learning_rate": 7.2213500784929355e-06, "logits/chosen": -2.9922289848327637, "logits/rejected": -3.0901122093200684, "logps/chosen": -351.9647216796875, "logps/rejected": -324.533935546875, "loss": 0.4751, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.19494260847568512, "rewards/margins": 1.1178690195083618, "rewards/rejected": -1.3128114938735962, "step": 815 }, { "epoch": 0.6437684003925417, "grad_norm": 4.751869201660156, "learning_rate": 7.1428571428571436e-06, "logits/chosen": -3.0304911136627197, "logits/rejected": -3.0640957355499268, "logps/chosen": -350.48040771484375, "logps/rejected": -304.4465026855469, "loss": 0.4689, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.17158253490924835, "rewards/margins": 1.1375932693481445, "rewards/rejected": -1.309175968170166, "step": 820 }, { "epoch": 0.647693817468106, "grad_norm": 3.528496503829956, "learning_rate": 7.06436420722135e-06, "logits/chosen": -2.954141139984131, "logits/rejected": -3.0600366592407227, "logps/chosen": -344.02496337890625, "logps/rejected": -313.1765441894531, "loss": 0.4165, "rewards/accuracies": 0.7958333492279053, "rewards/chosen": 0.023122036829590797, "rewards/margins": 1.3192641735076904, "rewards/rejected": -1.2961422204971313, "step": 825 }, { "epoch": 0.6516192345436702, "grad_norm": 4.554298400878906, "learning_rate": 6.985871271585558e-06, "logits/chosen": -2.985323190689087, "logits/rejected": -3.0694005489349365, "logps/chosen": -347.07855224609375, "logps/rejected": -321.8019714355469, "loss": 0.4487, "rewards/accuracies": 0.7833333015441895, "rewards/chosen": 0.15593689680099487, "rewards/margins": 1.35723078250885, "rewards/rejected": -1.2012939453125, "step": 830 }, { "epoch": 0.6555446516192346, "grad_norm": 4.962828159332275, "learning_rate": 6.9073783359497645e-06, "logits/chosen": -2.939275026321411, "logits/rejected": -3.06274151802063, "logps/chosen": -318.2911376953125, "logps/rejected": -321.3827209472656, "loss": 0.453, "rewards/accuracies": 0.75, "rewards/chosen": 0.16927500069141388, "rewards/margins": 1.2757118940353394, "rewards/rejected": -1.106436848640442, "step": 835 }, { "epoch": 0.6594700686947988, "grad_norm": 3.8393807411193848, "learning_rate": 6.828885400313973e-06, "logits/chosen": -2.948727607727051, "logits/rejected": -2.9849319458007812, "logps/chosen": -278.67950439453125, "logps/rejected": -289.4703063964844, "loss": 0.5496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.003879111958667636, "rewards/margins": 0.9358898997306824, "rewards/rejected": -0.9397690892219543, "step": 840 }, { "epoch": 0.6633954857703631, "grad_norm": 4.3185577392578125, "learning_rate": 6.750392464678179e-06, "logits/chosen": -2.9654245376586914, "logits/rejected": -3.0434441566467285, "logps/chosen": -327.6491394042969, "logps/rejected": -321.876953125, "loss": 0.4393, "rewards/accuracies": 0.8083333969116211, "rewards/chosen": -0.07706048339605331, "rewards/margins": 1.2577455043792725, "rewards/rejected": -1.3348058462142944, "step": 845 }, { "epoch": 0.6673209028459274, "grad_norm": 4.293339252471924, "learning_rate": 6.671899529042387e-06, "logits/chosen": -3.1013782024383545, "logits/rejected": -3.0577914714813232, "logps/chosen": -321.20611572265625, "logps/rejected": -330.765380859375, "loss": 0.4861, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.0720919817686081, "rewards/margins": 1.2457802295684814, "rewards/rejected": -1.3178722858428955, "step": 850 }, { "epoch": 0.6712463199214916, "grad_norm": 4.7395734786987305, "learning_rate": 6.5934065934065935e-06, "logits/chosen": -2.967613697052002, "logits/rejected": -3.026918411254883, "logps/chosen": -309.9462890625, "logps/rejected": -322.9964904785156, "loss": 0.5118, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -0.34576496481895447, "rewards/margins": 1.138620138168335, "rewards/rejected": -1.4843851327896118, "step": 855 }, { "epoch": 0.6751717369970559, "grad_norm": 4.396761417388916, "learning_rate": 6.514913657770802e-06, "logits/chosen": -2.872307300567627, "logits/rejected": -2.9442195892333984, "logps/chosen": -344.57275390625, "logps/rejected": -361.8569030761719, "loss": 0.4233, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.27561578154563904, "rewards/margins": 1.3650743961334229, "rewards/rejected": -1.6406902074813843, "step": 860 }, { "epoch": 0.6790971540726202, "grad_norm": 5.042901039123535, "learning_rate": 6.436420722135008e-06, "logits/chosen": -2.9460458755493164, "logits/rejected": -2.9742355346679688, "logps/chosen": -318.40972900390625, "logps/rejected": -338.845703125, "loss": 0.5214, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -0.464036762714386, "rewards/margins": 1.0700992345809937, "rewards/rejected": -1.5341359376907349, "step": 865 }, { "epoch": 0.6830225711481845, "grad_norm": 5.222243785858154, "learning_rate": 6.357927786499215e-06, "logits/chosen": -2.948620080947876, "logits/rejected": -3.0497138500213623, "logps/chosen": -319.1834716796875, "logps/rejected": -319.8209228515625, "loss": 0.4982, "rewards/accuracies": 0.7708333730697632, "rewards/chosen": -0.6082950830459595, "rewards/margins": 1.108607530593872, "rewards/rejected": -1.7169023752212524, "step": 870 }, { "epoch": 0.6869479882237488, "grad_norm": 4.216038703918457, "learning_rate": 6.279434850863423e-06, "logits/chosen": -2.9881601333618164, "logits/rejected": -2.9443399906158447, "logps/chosen": -347.36181640625, "logps/rejected": -356.3033447265625, "loss": 0.4577, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.43880724906921387, "rewards/margins": 1.2593494653701782, "rewards/rejected": -1.698156714439392, "step": 875 }, { "epoch": 0.6908734052993131, "grad_norm": 4.496317386627197, "learning_rate": 6.20094191522763e-06, "logits/chosen": -2.937516450881958, "logits/rejected": -2.9816195964813232, "logps/chosen": -338.54888916015625, "logps/rejected": -316.7308654785156, "loss": 0.4974, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": -0.38299983739852905, "rewards/margins": 1.1357471942901611, "rewards/rejected": -1.518747091293335, "step": 880 }, { "epoch": 0.6947988223748773, "grad_norm": 5.088541507720947, "learning_rate": 6.122448979591837e-06, "logits/chosen": -2.9346837997436523, "logits/rejected": -3.0072033405303955, "logps/chosen": -323.017578125, "logps/rejected": -321.0302429199219, "loss": 0.5165, "rewards/accuracies": 0.7333334684371948, "rewards/chosen": -0.16942360997200012, "rewards/margins": 1.0610682964324951, "rewards/rejected": -1.2304918766021729, "step": 885 }, { "epoch": 0.6987242394504416, "grad_norm": 5.229645729064941, "learning_rate": 6.043956043956044e-06, "logits/chosen": -3.0896613597869873, "logits/rejected": -3.1020538806915283, "logps/chosen": -340.7657775878906, "logps/rejected": -308.33831787109375, "loss": 0.4848, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.14006583392620087, "rewards/margins": 1.063689947128296, "rewards/rejected": -1.2037558555603027, "step": 890 }, { "epoch": 0.7026496565260059, "grad_norm": 4.50402307510376, "learning_rate": 5.965463108320252e-06, "logits/chosen": -2.930692195892334, "logits/rejected": -3.0338551998138428, "logps/chosen": -322.0299072265625, "logps/rejected": -337.92059326171875, "loss": 0.4844, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.06006438657641411, "rewards/margins": 1.1535618305206299, "rewards/rejected": -1.2136261463165283, "step": 895 }, { "epoch": 0.7065750736015701, "grad_norm": 3.8728554248809814, "learning_rate": 5.886970172684459e-06, "logits/chosen": -2.997563123703003, "logits/rejected": -3.0621156692504883, "logps/chosen": -320.8427429199219, "logps/rejected": -323.98663330078125, "loss": 0.5245, "rewards/accuracies": 0.7291667461395264, "rewards/chosen": -0.022658739238977432, "rewards/margins": 1.0391814708709717, "rewards/rejected": -1.061840295791626, "step": 900 }, { "epoch": 0.7065750736015701, "eval_logits/chosen": -3.025604724884033, "eval_logits/rejected": -3.0542104244232178, "eval_logps/chosen": -331.0598449707031, "eval_logps/rejected": -325.0303955078125, "eval_loss": 0.4869418144226074, "eval_rewards/accuracies": 0.7524999976158142, "eval_rewards/chosen": -0.16963602602481842, "eval_rewards/margins": 1.0905920267105103, "eval_rewards/rejected": -1.2602282762527466, "eval_runtime": 171.3633, "eval_samples_per_second": 11.671, "eval_steps_per_second": 5.836, "step": 900 }, { "epoch": 0.7105004906771345, "grad_norm": 4.844978332519531, "learning_rate": 5.808477237048666e-06, "logits/chosen": -3.054149627685547, "logits/rejected": -3.120988368988037, "logps/chosen": -354.59063720703125, "logps/rejected": -327.55731201171875, "loss": 0.5112, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.17106744647026062, "rewards/margins": 1.1445057392120361, "rewards/rejected": -1.3155733346939087, "step": 905 }, { "epoch": 0.7144259077526988, "grad_norm": 3.322969675064087, "learning_rate": 5.729984301412873e-06, "logits/chosen": -2.979074716567993, "logits/rejected": -3.056349277496338, "logps/chosen": -341.1482238769531, "logps/rejected": -330.65411376953125, "loss": 0.4561, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.04363600164651871, "rewards/margins": 1.232508659362793, "rewards/rejected": -1.2761447429656982, "step": 910 }, { "epoch": 0.718351324828263, "grad_norm": 5.024198532104492, "learning_rate": 5.651491365777081e-06, "logits/chosen": -3.0509443283081055, "logits/rejected": -2.996936321258545, "logps/chosen": -318.218017578125, "logps/rejected": -317.7236022949219, "loss": 0.4975, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.0626036748290062, "rewards/margins": 1.1322224140167236, "rewards/rejected": -1.1948261260986328, "step": 915 }, { "epoch": 0.7222767419038273, "grad_norm": 4.653200149536133, "learning_rate": 5.572998430141288e-06, "logits/chosen": -3.0015101432800293, "logits/rejected": -3.004070520401001, "logps/chosen": -353.28912353515625, "logps/rejected": -330.3291320800781, "loss": 0.4619, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.03766501322388649, "rewards/margins": 1.2282439470291138, "rewards/rejected": -1.2659088373184204, "step": 920 }, { "epoch": 0.7262021589793916, "grad_norm": 4.00246524810791, "learning_rate": 5.494505494505495e-06, "logits/chosen": -2.997545003890991, "logits/rejected": -3.094526767730713, "logps/chosen": -299.0417785644531, "logps/rejected": -316.5038146972656, "loss": 0.4866, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": -0.06449007242918015, "rewards/margins": 1.0651543140411377, "rewards/rejected": -1.1296443939208984, "step": 925 }, { "epoch": 0.7301275760549558, "grad_norm": 4.44005823135376, "learning_rate": 5.4160125588697024e-06, "logits/chosen": -3.0456299781799316, "logits/rejected": -3.024275541305542, "logps/chosen": -305.1586608886719, "logps/rejected": -291.7957763671875, "loss": 0.4836, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": -0.02189583331346512, "rewards/margins": 0.9891737103462219, "rewards/rejected": -1.011069655418396, "step": 930 }, { "epoch": 0.7340529931305201, "grad_norm": 5.462418556213379, "learning_rate": 5.33751962323391e-06, "logits/chosen": -2.901125907897949, "logits/rejected": -2.9909684658050537, "logps/chosen": -326.17791748046875, "logps/rejected": -334.52166748046875, "loss": 0.5115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.09602449834346771, "rewards/margins": 0.9841717481613159, "rewards/rejected": -1.0801963806152344, "step": 935 }, { "epoch": 0.7379784102060843, "grad_norm": 4.357146739959717, "learning_rate": 5.259026687598117e-06, "logits/chosen": -3.0265889167785645, "logits/rejected": -3.0476319789886475, "logps/chosen": -298.66705322265625, "logps/rejected": -311.0827331542969, "loss": 0.4936, "rewards/accuracies": 0.7625000476837158, "rewards/chosen": -0.18625633418560028, "rewards/margins": 0.9413064122200012, "rewards/rejected": -1.1275627613067627, "step": 940 }, { "epoch": 0.7419038272816487, "grad_norm": 5.111855983734131, "learning_rate": 5.180533751962323e-06, "logits/chosen": -3.0872814655303955, "logits/rejected": -3.0954253673553467, "logps/chosen": -331.3386535644531, "logps/rejected": -317.2918701171875, "loss": 0.5034, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.17156757414340973, "rewards/margins": 1.000931978225708, "rewards/rejected": -1.172499418258667, "step": 945 }, { "epoch": 0.745829244357213, "grad_norm": 5.880111217498779, "learning_rate": 5.1020408163265315e-06, "logits/chosen": -2.9598050117492676, "logits/rejected": -3.0074758529663086, "logps/chosen": -311.2569274902344, "logps/rejected": -295.6765441894531, "loss": 0.5429, "rewards/accuracies": 0.6750000715255737, "rewards/chosen": -0.17758509516716003, "rewards/margins": 0.9686153531074524, "rewards/rejected": -1.14620041847229, "step": 950 }, { "epoch": 0.7497546614327772, "grad_norm": 4.473018169403076, "learning_rate": 5.023547880690738e-06, "logits/chosen": -2.981584072113037, "logits/rejected": -3.0066146850585938, "logps/chosen": -322.27655029296875, "logps/rejected": -333.5732421875, "loss": 0.4551, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.10117676109075546, "rewards/margins": 1.1731947660446167, "rewards/rejected": -1.2743713855743408, "step": 955 }, { "epoch": 0.7536800785083415, "grad_norm": 5.427559852600098, "learning_rate": 4.945054945054946e-06, "logits/chosen": -3.038059949874878, "logits/rejected": -3.0491480827331543, "logps/chosen": -304.99285888671875, "logps/rejected": -331.29669189453125, "loss": 0.5048, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": -0.1531282365322113, "rewards/margins": 1.0113624334335327, "rewards/rejected": -1.1644906997680664, "step": 960 }, { "epoch": 0.7576054955839058, "grad_norm": 5.369307994842529, "learning_rate": 4.866562009419153e-06, "logits/chosen": -3.041508197784424, "logits/rejected": -3.017305374145508, "logps/chosen": -309.01409912109375, "logps/rejected": -324.8525390625, "loss": 0.468, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.14672012627124786, "rewards/margins": 1.1099258661270142, "rewards/rejected": -1.256645917892456, "step": 965 }, { "epoch": 0.76153091265947, "grad_norm": 4.683850288391113, "learning_rate": 4.7880690737833605e-06, "logits/chosen": -3.0513949394226074, "logits/rejected": -3.1184747219085693, "logps/chosen": -349.2635192871094, "logps/rejected": -324.3399963378906, "loss": 0.54, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.11181743443012238, "rewards/margins": 0.9319450259208679, "rewards/rejected": -1.0437625646591187, "step": 970 }, { "epoch": 0.7654563297350343, "grad_norm": 4.057793617248535, "learning_rate": 4.709576138147567e-06, "logits/chosen": -3.0879323482513428, "logits/rejected": -3.1031229496002197, "logps/chosen": -345.81207275390625, "logps/rejected": -354.1678466796875, "loss": 0.4916, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": 0.02626100741326809, "rewards/margins": 1.1445354223251343, "rewards/rejected": -1.118274450302124, "step": 975 }, { "epoch": 0.7693817468105987, "grad_norm": 3.788588285446167, "learning_rate": 4.631083202511774e-06, "logits/chosen": -3.0559887886047363, "logits/rejected": -2.9645016193389893, "logps/chosen": -329.69366455078125, "logps/rejected": -323.6266174316406, "loss": 0.5673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006841002497822046, "rewards/margins": 1.0048836469650269, "rewards/rejected": -0.9980427026748657, "step": 980 }, { "epoch": 0.7733071638861629, "grad_norm": 4.364587306976318, "learning_rate": 4.5525902668759815e-06, "logits/chosen": -2.923006296157837, "logits/rejected": -3.013233184814453, "logps/chosen": -316.62188720703125, "logps/rejected": -303.6081848144531, "loss": 0.5261, "rewards/accuracies": 0.7083333730697632, "rewards/chosen": -0.10443178564310074, "rewards/margins": 1.0743087530136108, "rewards/rejected": -1.1787405014038086, "step": 985 }, { "epoch": 0.7772325809617272, "grad_norm": 5.203098773956299, "learning_rate": 4.474097331240189e-06, "logits/chosen": -3.0155367851257324, "logits/rejected": -3.095081090927124, "logps/chosen": -320.33074951171875, "logps/rejected": -339.04791259765625, "loss": 0.5602, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04893790930509567, "rewards/margins": 1.0242021083831787, "rewards/rejected": -0.9752641916275024, "step": 990 }, { "epoch": 0.7811579980372915, "grad_norm": 5.797107219696045, "learning_rate": 4.395604395604396e-06, "logits/chosen": -2.9568419456481934, "logits/rejected": -3.061304807662964, "logps/chosen": -323.64459228515625, "logps/rejected": -306.8099060058594, "loss": 0.4912, "rewards/accuracies": 0.7458333969116211, "rewards/chosen": -0.03198995441198349, "rewards/margins": 1.1657757759094238, "rewards/rejected": -1.1977657079696655, "step": 995 }, { "epoch": 0.7850834151128557, "grad_norm": 3.8195958137512207, "learning_rate": 4.317111459968603e-06, "logits/chosen": -2.9896445274353027, "logits/rejected": -2.980637550354004, "logps/chosen": -349.469482421875, "logps/rejected": -325.8355712890625, "loss": 0.5145, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": 0.16838806867599487, "rewards/margins": 1.0479358434677124, "rewards/rejected": -0.8795478940010071, "step": 1000 }, { "epoch": 0.7850834151128557, "eval_logits/chosen": -3.02506422996521, "eval_logits/rejected": -3.0537216663360596, "eval_logps/chosen": -328.9156188964844, "eval_logps/rejected": -322.50213623046875, "eval_loss": 0.48850810527801514, "eval_rewards/accuracies": 0.7524999976158142, "eval_rewards/chosen": 0.0447828434407711, "eval_rewards/margins": 1.0521847009658813, "eval_rewards/rejected": -1.0074018239974976, "eval_runtime": 170.8761, "eval_samples_per_second": 11.704, "eval_steps_per_second": 5.852, "step": 1000 }, { "epoch": 0.78900883218842, "grad_norm": 3.5479438304901123, "learning_rate": 4.2386185243328105e-06, "logits/chosen": -2.979447841644287, "logits/rejected": -3.07000994682312, "logps/chosen": -329.8793640136719, "logps/rejected": -326.6648864746094, "loss": 0.5044, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15325433015823364, "rewards/margins": 1.0215200185775757, "rewards/rejected": -0.8682657480239868, "step": 1005 }, { "epoch": 0.7929342492639843, "grad_norm": 5.247315883636475, "learning_rate": 4.160125588697018e-06, "logits/chosen": -2.996269702911377, "logits/rejected": -3.06060528755188, "logps/chosen": -315.26666259765625, "logps/rejected": -309.08868408203125, "loss": 0.5036, "rewards/accuracies": 0.7750000953674316, "rewards/chosen": 0.1521468460559845, "rewards/margins": 1.0418260097503662, "rewards/rejected": -0.8896790742874146, "step": 1010 }, { "epoch": 0.7968596663395485, "grad_norm": 4.7451324462890625, "learning_rate": 4.081632653061225e-06, "logits/chosen": -3.060560464859009, "logits/rejected": -3.1156129837036133, "logps/chosen": -307.20135498046875, "logps/rejected": -289.0655212402344, "loss": 0.5071, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.054080985486507416, "rewards/margins": 1.019335150718689, "rewards/rejected": -1.0734161138534546, "step": 1015 }, { "epoch": 0.8007850834151129, "grad_norm": 4.9453325271606445, "learning_rate": 4.003139717425432e-06, "logits/chosen": -3.077300548553467, "logits/rejected": -3.1319777965545654, "logps/chosen": -320.95050048828125, "logps/rejected": -289.74591064453125, "loss": 0.5736, "rewards/accuracies": 0.6958333849906921, "rewards/chosen": 0.06706535816192627, "rewards/margins": 0.8430485725402832, "rewards/rejected": -0.7759832143783569, "step": 1020 }, { "epoch": 0.8047105004906772, "grad_norm": 5.086551189422607, "learning_rate": 3.9246467817896395e-06, "logits/chosen": -2.9904251098632812, "logits/rejected": -3.015242099761963, "logps/chosen": -348.0818176269531, "logps/rejected": -313.29937744140625, "loss": 0.5274, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15665733814239502, "rewards/margins": 1.0903061628341675, "rewards/rejected": -0.933648943901062, "step": 1025 }, { "epoch": 0.8086359175662414, "grad_norm": 3.6982407569885254, "learning_rate": 3.846153846153847e-06, "logits/chosen": -2.8871021270751953, "logits/rejected": -3.033609390258789, "logps/chosen": -319.4637145996094, "logps/rejected": -308.5705871582031, "loss": 0.4841, "rewards/accuracies": 0.720833420753479, "rewards/chosen": 0.15460513532161713, "rewards/margins": 1.0347096920013428, "rewards/rejected": -0.8801045417785645, "step": 1030 }, { "epoch": 0.8125613346418057, "grad_norm": 4.721177101135254, "learning_rate": 3.767660910518054e-06, "logits/chosen": -3.060762882232666, "logits/rejected": -3.0055036544799805, "logps/chosen": -320.2731628417969, "logps/rejected": -321.6552734375, "loss": 0.5038, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": 0.1908079981803894, "rewards/margins": 0.9346101880073547, "rewards/rejected": -0.7438021302223206, "step": 1035 }, { "epoch": 0.81648675171737, "grad_norm": 3.8394100666046143, "learning_rate": 3.6891679748822605e-06, "logits/chosen": -3.0200486183166504, "logits/rejected": -3.0659079551696777, "logps/chosen": -309.3419494628906, "logps/rejected": -303.4842834472656, "loss": 0.4625, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19585837423801422, "rewards/margins": 1.1063092947006226, "rewards/rejected": -0.9104509353637695, "step": 1040 }, { "epoch": 0.8204121687929342, "grad_norm": 4.434594631195068, "learning_rate": 3.6106750392464677e-06, "logits/chosen": -3.118605136871338, "logits/rejected": -3.0567848682403564, "logps/chosen": -320.66400146484375, "logps/rejected": -327.2590637207031, "loss": 0.5165, "rewards/accuracies": 0.7208333611488342, "rewards/chosen": 0.15826813876628876, "rewards/margins": 0.8408550024032593, "rewards/rejected": -0.6825869083404541, "step": 1045 }, { "epoch": 0.8243375858684985, "grad_norm": 5.180295467376709, "learning_rate": 3.532182103610675e-06, "logits/chosen": -3.04954195022583, "logits/rejected": -3.0940709114074707, "logps/chosen": -333.8227233886719, "logps/rejected": -317.74530029296875, "loss": 0.5001, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.304600328207016, "rewards/margins": 1.0906543731689453, "rewards/rejected": -0.7860540151596069, "step": 1050 }, { "epoch": 0.8282630029440629, "grad_norm": 3.821493148803711, "learning_rate": 3.4536891679748822e-06, "logits/chosen": -2.9625072479248047, "logits/rejected": -3.038464069366455, "logps/chosen": -317.5421142578125, "logps/rejected": -318.5979309082031, "loss": 0.4845, "rewards/accuracies": 0.7541667222976685, "rewards/chosen": 0.11151299625635147, "rewards/margins": 1.0481318235397339, "rewards/rejected": -0.9366186857223511, "step": 1055 }, { "epoch": 0.8321884200196271, "grad_norm": 5.562465190887451, "learning_rate": 3.3751962323390895e-06, "logits/chosen": -3.0714111328125, "logits/rejected": -3.0893895626068115, "logps/chosen": -363.3538513183594, "logps/rejected": -348.7355651855469, "loss": 0.487, "rewards/accuracies": 0.7458333969116211, "rewards/chosen": 0.14830578863620758, "rewards/margins": 1.0616153478622437, "rewards/rejected": -0.9133096933364868, "step": 1060 }, { "epoch": 0.8361138370951914, "grad_norm": 4.553463459014893, "learning_rate": 3.2967032967032968e-06, "logits/chosen": -2.939422845840454, "logits/rejected": -2.907299518585205, "logps/chosen": -330.71014404296875, "logps/rejected": -317.1188049316406, "loss": 0.5002, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": 0.015831544995307922, "rewards/margins": 1.0325143337249756, "rewards/rejected": -1.016682744026184, "step": 1065 }, { "epoch": 0.8400392541707556, "grad_norm": 3.9285788536071777, "learning_rate": 3.218210361067504e-06, "logits/chosen": -2.965064287185669, "logits/rejected": -3.0384631156921387, "logps/chosen": -334.41192626953125, "logps/rejected": -332.02899169921875, "loss": 0.4383, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.18648472428321838, "rewards/margins": 1.2396255731582642, "rewards/rejected": -1.0531408786773682, "step": 1070 }, { "epoch": 0.8439646712463199, "grad_norm": 3.849515676498413, "learning_rate": 3.1397174254317113e-06, "logits/chosen": -3.0192911624908447, "logits/rejected": -3.064319610595703, "logps/chosen": -291.82257080078125, "logps/rejected": -276.48486328125, "loss": 0.5327, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": -0.07870586216449738, "rewards/margins": 0.9807574152946472, "rewards/rejected": -1.059463381767273, "step": 1075 }, { "epoch": 0.8478900883218842, "grad_norm": 4.2341814041137695, "learning_rate": 3.0612244897959185e-06, "logits/chosen": -3.021660566329956, "logits/rejected": -3.0418386459350586, "logps/chosen": -313.7025451660156, "logps/rejected": -316.4256591796875, "loss": 0.5029, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.049659062176942825, "rewards/margins": 0.9659037590026855, "rewards/rejected": -0.9162446856498718, "step": 1080 }, { "epoch": 0.8518155053974484, "grad_norm": 3.874643564224243, "learning_rate": 2.982731554160126e-06, "logits/chosen": -3.0091512203216553, "logits/rejected": -3.06247615814209, "logps/chosen": -333.6044616699219, "logps/rejected": -305.1171875, "loss": 0.5009, "rewards/accuracies": 0.7291666865348816, "rewards/chosen": 0.03525074943900108, "rewards/margins": 1.0619693994522095, "rewards/rejected": -1.0267184972763062, "step": 1085 }, { "epoch": 0.8557409224730128, "grad_norm": 4.802616119384766, "learning_rate": 2.904238618524333e-06, "logits/chosen": -3.0582773685455322, "logits/rejected": -3.0825817584991455, "logps/chosen": -356.7900695800781, "logps/rejected": -343.6094665527344, "loss": 0.4828, "rewards/accuracies": 0.7583333849906921, "rewards/chosen": 0.11294318735599518, "rewards/margins": 1.103849172592163, "rewards/rejected": -0.9909059405326843, "step": 1090 }, { "epoch": 0.8596663395485771, "grad_norm": 4.6176838874816895, "learning_rate": 2.8257456828885403e-06, "logits/chosen": -2.992724895477295, "logits/rejected": -3.021177053451538, "logps/chosen": -326.8610534667969, "logps/rejected": -327.75445556640625, "loss": 0.3941, "rewards/accuracies": 0.8166667222976685, "rewards/chosen": 0.09250589460134506, "rewards/margins": 1.3396714925765991, "rewards/rejected": -1.247165560722351, "step": 1095 }, { "epoch": 0.8635917566241413, "grad_norm": 4.353200912475586, "learning_rate": 2.7472527472527476e-06, "logits/chosen": -3.025132656097412, "logits/rejected": -3.075371503829956, "logps/chosen": -323.29315185546875, "logps/rejected": -304.34423828125, "loss": 0.456, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": 0.09489820152521133, "rewards/margins": 1.147526741027832, "rewards/rejected": -1.0526283979415894, "step": 1100 }, { "epoch": 0.8635917566241413, "eval_logits/chosen": -3.0234742164611816, "eval_logits/rejected": -3.051736354827881, "eval_logps/chosen": -328.9091491699219, "eval_logps/rejected": -322.8738098144531, "eval_loss": 0.49041956663131714, "eval_rewards/accuracies": 0.7534999847412109, "eval_rewards/chosen": 0.04543456435203552, "eval_rewards/margins": 1.0900031328201294, "eval_rewards/rejected": -1.044568657875061, "eval_runtime": 170.9675, "eval_samples_per_second": 11.698, "eval_steps_per_second": 5.849, "step": 1100 }, { "epoch": 0.8675171736997056, "grad_norm": 3.9487624168395996, "learning_rate": 2.668759811616955e-06, "logits/chosen": -2.9902524948120117, "logits/rejected": -3.033686876296997, "logps/chosen": -304.41265869140625, "logps/rejected": -300.32049560546875, "loss": 0.4872, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.16270777583122253, "rewards/margins": 1.078561544418335, "rewards/rejected": -0.9158536195755005, "step": 1105 }, { "epoch": 0.8714425907752699, "grad_norm": 5.130923748016357, "learning_rate": 2.5902668759811617e-06, "logits/chosen": -2.932274580001831, "logits/rejected": -2.995884656906128, "logps/chosen": -318.7715148925781, "logps/rejected": -321.5521240234375, "loss": 0.5181, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.003421901259571314, "rewards/margins": 1.0905206203460693, "rewards/rejected": -1.0939425230026245, "step": 1110 }, { "epoch": 0.8753680078508341, "grad_norm": 5.341976642608643, "learning_rate": 2.511773940345369e-06, "logits/chosen": -2.9918906688690186, "logits/rejected": -3.070976734161377, "logps/chosen": -328.22149658203125, "logps/rejected": -295.317138671875, "loss": 0.5221, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": 0.2609195113182068, "rewards/margins": 1.2272056341171265, "rewards/rejected": -0.9662860631942749, "step": 1115 }, { "epoch": 0.8792934249263984, "grad_norm": 3.9076426029205322, "learning_rate": 2.4332810047095766e-06, "logits/chosen": -3.0119221210479736, "logits/rejected": -3.0435667037963867, "logps/chosen": -345.94036865234375, "logps/rejected": -323.3532409667969, "loss": 0.4533, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": 0.23655609786510468, "rewards/margins": 1.205311894416809, "rewards/rejected": -0.9687557220458984, "step": 1120 }, { "epoch": 0.8832188420019627, "grad_norm": 4.00649356842041, "learning_rate": 2.3547880690737835e-06, "logits/chosen": -3.0578651428222656, "logits/rejected": -3.1050515174865723, "logps/chosen": -293.8743896484375, "logps/rejected": -334.2582702636719, "loss": 0.5158, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": 0.27813708782196045, "rewards/margins": 1.1314489841461182, "rewards/rejected": -0.8533117175102234, "step": 1125 }, { "epoch": 0.887144259077527, "grad_norm": 5.170398712158203, "learning_rate": 2.2762951334379907e-06, "logits/chosen": -2.9842798709869385, "logits/rejected": -3.0326454639434814, "logps/chosen": -319.64111328125, "logps/rejected": -334.63641357421875, "loss": 0.5767, "rewards/accuracies": 0.6958334445953369, "rewards/chosen": 0.04911806434392929, "rewards/margins": 1.029329538345337, "rewards/rejected": -0.9802114367485046, "step": 1130 }, { "epoch": 0.8910696761530913, "grad_norm": 4.750176906585693, "learning_rate": 2.197802197802198e-06, "logits/chosen": -3.024641275405884, "logits/rejected": -3.0461010932922363, "logps/chosen": -344.88226318359375, "logps/rejected": -349.8857727050781, "loss": 0.4812, "rewards/accuracies": 0.7500000596046448, "rewards/chosen": 0.08117427676916122, "rewards/margins": 1.2805150747299194, "rewards/rejected": -1.1993409395217896, "step": 1135 }, { "epoch": 0.8949950932286556, "grad_norm": 4.3012471199035645, "learning_rate": 2.1193092621664052e-06, "logits/chosen": -2.9693052768707275, "logits/rejected": -3.012446165084839, "logps/chosen": -332.40740966796875, "logps/rejected": -347.2129211425781, "loss": 0.4836, "rewards/accuracies": 0.783333420753479, "rewards/chosen": 0.11985665559768677, "rewards/margins": 1.132505178451538, "rewards/rejected": -1.012648582458496, "step": 1140 }, { "epoch": 0.8989205103042198, "grad_norm": 4.1170196533203125, "learning_rate": 2.0408163265306125e-06, "logits/chosen": -2.9982786178588867, "logits/rejected": -2.985097646713257, "logps/chosen": -325.58453369140625, "logps/rejected": -318.0592346191406, "loss": 0.4191, "rewards/accuracies": 0.8125001192092896, "rewards/chosen": 0.2217942774295807, "rewards/margins": 1.308205485343933, "rewards/rejected": -1.0864112377166748, "step": 1145 }, { "epoch": 0.9028459273797841, "grad_norm": 6.188891887664795, "learning_rate": 1.9623233908948198e-06, "logits/chosen": -2.999929189682007, "logits/rejected": -3.010659694671631, "logps/chosen": -335.7912902832031, "logps/rejected": -308.9612731933594, "loss": 0.4935, "rewards/accuracies": 0.75, "rewards/chosen": -0.025047356262803078, "rewards/margins": 1.0598541498184204, "rewards/rejected": -1.0849015712738037, "step": 1150 }, { "epoch": 0.9067713444553483, "grad_norm": 4.374786376953125, "learning_rate": 1.883830455259027e-06, "logits/chosen": -2.9940290451049805, "logits/rejected": -3.0808780193328857, "logps/chosen": -332.26287841796875, "logps/rejected": -301.36590576171875, "loss": 0.5517, "rewards/accuracies": 0.7166666984558105, "rewards/chosen": 0.07976453751325607, "rewards/margins": 0.9496763348579407, "rewards/rejected": -0.8699118494987488, "step": 1155 }, { "epoch": 0.9106967615309126, "grad_norm": 5.303534030914307, "learning_rate": 1.8053375196232339e-06, "logits/chosen": -2.9458823204040527, "logits/rejected": -3.05169939994812, "logps/chosen": -350.2687072753906, "logps/rejected": -344.89892578125, "loss": 0.529, "rewards/accuracies": 0.7041667103767395, "rewards/chosen": -0.006747332401573658, "rewards/margins": 1.0622098445892334, "rewards/rejected": -1.0689570903778076, "step": 1160 }, { "epoch": 0.914622178606477, "grad_norm": 4.187100887298584, "learning_rate": 1.7268445839874411e-06, "logits/chosen": -3.0305941104888916, "logits/rejected": -3.0799167156219482, "logps/chosen": -311.58660888671875, "logps/rejected": -297.4324035644531, "loss": 0.5227, "rewards/accuracies": 0.73333340883255, "rewards/chosen": 0.021794170141220093, "rewards/margins": 0.9995294809341431, "rewards/rejected": -0.977735161781311, "step": 1165 }, { "epoch": 0.9185475956820413, "grad_norm": 3.8420519828796387, "learning_rate": 1.6483516483516484e-06, "logits/chosen": -3.073319911956787, "logits/rejected": -3.1020355224609375, "logps/chosen": -323.134521484375, "logps/rejected": -314.55340576171875, "loss": 0.453, "rewards/accuracies": 0.7666667699813843, "rewards/chosen": 0.005049190018326044, "rewards/margins": 1.1697251796722412, "rewards/rejected": -1.1646759510040283, "step": 1170 }, { "epoch": 0.9224730127576055, "grad_norm": 4.952281951904297, "learning_rate": 1.5698587127158556e-06, "logits/chosen": -2.9573421478271484, "logits/rejected": -3.016396999359131, "logps/chosen": -336.82427978515625, "logps/rejected": -305.33697509765625, "loss": 0.4634, "rewards/accuracies": 0.7416666746139526, "rewards/chosen": 0.049565743654966354, "rewards/margins": 1.13016676902771, "rewards/rejected": -1.0806009769439697, "step": 1175 }, { "epoch": 0.9263984298331698, "grad_norm": 4.881412029266357, "learning_rate": 1.491365777080063e-06, "logits/chosen": -3.013920783996582, "logits/rejected": -3.0628743171691895, "logps/chosen": -325.85760498046875, "logps/rejected": -325.6579284667969, "loss": 0.5157, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": -0.006006541661918163, "rewards/margins": 1.0770210027694702, "rewards/rejected": -1.0830276012420654, "step": 1180 }, { "epoch": 0.930323846908734, "grad_norm": 4.166913986206055, "learning_rate": 1.4128728414442702e-06, "logits/chosen": -3.0066945552825928, "logits/rejected": -3.021183490753174, "logps/chosen": -298.1296081542969, "logps/rejected": -306.4847106933594, "loss": 0.4665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030468011274933815, "rewards/margins": 1.1519229412078857, "rewards/rejected": -1.1214549541473389, "step": 1185 }, { "epoch": 0.9342492639842983, "grad_norm": 5.559418678283691, "learning_rate": 1.3343799058084774e-06, "logits/chosen": -3.032975912094116, "logits/rejected": -3.018131732940674, "logps/chosen": -299.36419677734375, "logps/rejected": -299.8133239746094, "loss": 0.5393, "rewards/accuracies": 0.6875000596046448, "rewards/chosen": 0.018437325954437256, "rewards/margins": 0.9440910220146179, "rewards/rejected": -0.9256537556648254, "step": 1190 }, { "epoch": 0.9381746810598626, "grad_norm": 5.020077228546143, "learning_rate": 1.2558869701726845e-06, "logits/chosen": -2.986281156539917, "logits/rejected": -3.06579852104187, "logps/chosen": -341.192626953125, "logps/rejected": -319.23809814453125, "loss": 0.5241, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.07408008724451065, "rewards/margins": 1.0439367294311523, "rewards/rejected": -1.1180168390274048, "step": 1195 }, { "epoch": 0.9421000981354269, "grad_norm": 4.814427375793457, "learning_rate": 1.1773940345368917e-06, "logits/chosen": -3.0177111625671387, "logits/rejected": -3.044379711151123, "logps/chosen": -326.84661865234375, "logps/rejected": -306.0335388183594, "loss": 0.4989, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0778479278087616, "rewards/margins": 1.1605087518692017, "rewards/rejected": -1.2383568286895752, "step": 1200 }, { "epoch": 0.9421000981354269, "eval_logits/chosen": -3.0254786014556885, "eval_logits/rejected": -3.0536904335021973, "eval_logps/chosen": -329.79644775390625, "eval_logps/rejected": -323.7355651855469, "eval_loss": 0.4862891137599945, "eval_rewards/accuracies": 0.7605000138282776, "eval_rewards/chosen": -0.043296121060848236, "eval_rewards/margins": 1.0874476432800293, "eval_rewards/rejected": -1.1307436227798462, "eval_runtime": 170.5812, "eval_samples_per_second": 11.725, "eval_steps_per_second": 5.862, "step": 1200 }, { "epoch": 0.9460255152109912, "grad_norm": 4.624739646911621, "learning_rate": 1.098901098901099e-06, "logits/chosen": -3.055946111679077, "logits/rejected": -3.1179323196411133, "logps/chosen": -363.6905822753906, "logps/rejected": -346.05767822265625, "loss": 0.4542, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.008908344432711601, "rewards/margins": 1.0937221050262451, "rewards/rejected": -1.102630376815796, "step": 1205 }, { "epoch": 0.9499509322865555, "grad_norm": 3.9496641159057617, "learning_rate": 1.0204081632653063e-06, "logits/chosen": -2.9469313621520996, "logits/rejected": -3.045012950897217, "logps/chosen": -328.9994201660156, "logps/rejected": -339.7306213378906, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.06163526326417923, "rewards/margins": 1.2660752534866333, "rewards/rejected": -1.327710509300232, "step": 1210 }, { "epoch": 0.9538763493621197, "grad_norm": 4.550204753875732, "learning_rate": 9.419152276295135e-07, "logits/chosen": -3.0455386638641357, "logits/rejected": -3.007603168487549, "logps/chosen": -321.6459655761719, "logps/rejected": -317.13592529296875, "loss": 0.518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07271427661180496, "rewards/margins": 1.1276568174362183, "rewards/rejected": -1.2003710269927979, "step": 1215 }, { "epoch": 0.957801766437684, "grad_norm": 3.7910659313201904, "learning_rate": 8.634222919937206e-07, "logits/chosen": -2.887434720993042, "logits/rejected": -3.0296366214752197, "logps/chosen": -328.447021484375, "logps/rejected": -312.25146484375, "loss": 0.5169, "rewards/accuracies": 0.7375000715255737, "rewards/chosen": -0.016386663541197777, "rewards/margins": 1.084718942642212, "rewards/rejected": -1.1011055707931519, "step": 1220 }, { "epoch": 0.9617271835132483, "grad_norm": 4.135540008544922, "learning_rate": 7.849293563579278e-07, "logits/chosen": -3.051018714904785, "logits/rejected": -3.042524814605713, "logps/chosen": -303.7222900390625, "logps/rejected": -311.6209411621094, "loss": 0.4563, "rewards/accuracies": 0.7625001072883606, "rewards/chosen": 0.08631271123886108, "rewards/margins": 1.19536554813385, "rewards/rejected": -1.1090528964996338, "step": 1225 }, { "epoch": 0.9656526005888125, "grad_norm": 4.80025577545166, "learning_rate": 7.064364207221351e-07, "logits/chosen": -3.0100882053375244, "logits/rejected": -3.033210039138794, "logps/chosen": -297.29388427734375, "logps/rejected": -312.75390625, "loss": 0.5043, "rewards/accuracies": 0.7416667342185974, "rewards/chosen": -0.07621364295482635, "rewards/margins": 0.9946663975715637, "rewards/rejected": -1.0708800554275513, "step": 1230 }, { "epoch": 0.9695780176643768, "grad_norm": 4.171872138977051, "learning_rate": 6.279434850863422e-07, "logits/chosen": -2.984192371368408, "logits/rejected": -2.9866414070129395, "logps/chosen": -315.4156799316406, "logps/rejected": -311.60174560546875, "loss": 0.5081, "rewards/accuracies": 0.7291667461395264, "rewards/chosen": -0.02722536399960518, "rewards/margins": 1.0488277673721313, "rewards/rejected": -1.0760531425476074, "step": 1235 }, { "epoch": 0.9735034347399412, "grad_norm": 4.678730487823486, "learning_rate": 5.494505494505495e-07, "logits/chosen": -3.045713424682617, "logits/rejected": -3.0048978328704834, "logps/chosen": -355.97918701171875, "logps/rejected": -362.16143798828125, "loss": 0.4872, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0551709420979023, "rewards/margins": 1.1289502382278442, "rewards/rejected": -1.184121012687683, "step": 1240 }, { "epoch": 0.9774288518155054, "grad_norm": 5.555654048919678, "learning_rate": 4.7095761381475676e-07, "logits/chosen": -3.045968532562256, "logits/rejected": -3.028806686401367, "logps/chosen": -362.6111755371094, "logps/rejected": -342.56658935546875, "loss": 0.4904, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": -0.12567153573036194, "rewards/margins": 1.1269028186798096, "rewards/rejected": -1.2525743246078491, "step": 1245 }, { "epoch": 0.9813542688910697, "grad_norm": 4.789409637451172, "learning_rate": 3.924646781789639e-07, "logits/chosen": -2.9453186988830566, "logits/rejected": -3.032778739929199, "logps/chosen": -335.04803466796875, "logps/rejected": -338.70782470703125, "loss": 0.5291, "rewards/accuracies": 0.7208333611488342, "rewards/chosen": -0.0959320068359375, "rewards/margins": 1.0104596614837646, "rewards/rejected": -1.1063916683197021, "step": 1250 }, { "epoch": 0.985279685966634, "grad_norm": 4.851970672607422, "learning_rate": 3.139717425431711e-07, "logits/chosen": -3.0085787773132324, "logits/rejected": -3.033092975616455, "logps/chosen": -294.70330810546875, "logps/rejected": -298.2593994140625, "loss": 0.4952, "rewards/accuracies": 0.720833420753479, "rewards/chosen": -0.07039856910705566, "rewards/margins": 1.0396531820297241, "rewards/rejected": -1.1100517511367798, "step": 1255 }, { "epoch": 0.9892051030421982, "grad_norm": 4.407826900482178, "learning_rate": 2.3547880690737838e-07, "logits/chosen": -2.992248296737671, "logits/rejected": -3.090275287628174, "logps/chosen": -361.5509338378906, "logps/rejected": -348.03094482421875, "loss": 0.4522, "rewards/accuracies": 0.7833333611488342, "rewards/chosen": -0.02963084913790226, "rewards/margins": 1.1609312295913696, "rewards/rejected": -1.1905620098114014, "step": 1260 }, { "epoch": 0.9931305201177625, "grad_norm": 5.248498916625977, "learning_rate": 1.5698587127158556e-07, "logits/chosen": -3.013667345046997, "logits/rejected": -2.995178461074829, "logps/chosen": -306.033447265625, "logps/rejected": -311.38470458984375, "loss": 0.5122, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.04827199503779411, "rewards/margins": 1.0230185985565186, "rewards/rejected": -1.0712906122207642, "step": 1265 }, { "epoch": 0.9970559371933267, "grad_norm": 4.212546348571777, "learning_rate": 7.849293563579278e-08, "logits/chosen": -3.0183610916137695, "logits/rejected": -3.0684821605682373, "logps/chosen": -313.4974060058594, "logps/rejected": -312.05755615234375, "loss": 0.5292, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12905262410640717, "rewards/margins": 0.9748676419258118, "rewards/rejected": -1.1039202213287354, "step": 1270 }, { "epoch": 1.0, "step": 1274, "total_flos": 0.0, "train_loss": 0.5089355802610868, "train_runtime": 12172.1578, "train_samples_per_second": 5.023, "train_steps_per_second": 0.105 } ], "logging_steps": 5, "max_steps": 1274, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }