{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.115606936416185, "eval_steps": 50, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038535645472061657, "grad_norm": 448.5186462402344, "learning_rate": 6.41025641025641e-08, "logits/chosen": 1.4453125, "logits/rejected": 0.7858073115348816, "logps/chosen": -2268.0, "logps/rejected": -1634.6666259765625, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007707129094412331, "grad_norm": 418.93499755859375, "learning_rate": 1.282051282051282e-07, "logits/chosen": 1.7473958730697632, "logits/rejected": 1.1223958730697632, "logps/chosen": -2445.333251953125, "logps/rejected": -1826.6666259765625, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.011560693641618497, "grad_norm": 405.8653259277344, "learning_rate": 1.9230769230769234e-07, "logits/chosen": 1.6393228769302368, "logits/rejected": 0.9677734375, "logps/chosen": -2356.0, "logps/rejected": -1770.6666259765625, "loss": 0.7788, "rewards/accuracies": 0.2152777910232544, "rewards/chosen": 0.0389404296875, "rewards/margins": -0.03106689453125, "rewards/rejected": 0.070068359375, "step": 3 }, { "epoch": 0.015414258188824663, "grad_norm": 423.0823974609375, "learning_rate": 2.564102564102564e-07, "logits/chosen": 1.4075521230697632, "logits/rejected": 0.6988932490348816, "logps/chosen": -2170.666748046875, "logps/rejected": -1557.3333740234375, "loss": 0.7227, "rewards/accuracies": 0.173611119389534, "rewards/chosen": -0.008382161147892475, "rewards/margins": 0.025054931640625, "rewards/rejected": -0.033447265625, "step": 4 }, { "epoch": 0.019267822736030827, "grad_norm": 381.6596374511719, "learning_rate": 3.205128205128205e-07, "logits/chosen": 1.20703125, "logits/rejected": 0.5538737177848816, "logps/chosen": -2094.666748046875, "logps/rejected": -1480.0, "loss": 0.6769, "rewards/accuracies": 0.25, "rewards/chosen": 0.0388997383415699, "rewards/margins": 0.12237548828125, "rewards/rejected": -0.0834554061293602, "step": 5 }, { "epoch": 0.023121387283236993, "grad_norm": 463.4579772949219, "learning_rate": 3.846153846153847e-07, "logits/chosen": 1.34375, "logits/rejected": 0.666015625, "logps/chosen": -2296.0, "logps/rejected": -1642.6666259765625, "loss": 0.7853, "rewards/accuracies": 0.1805555671453476, "rewards/chosen": -0.02089436911046505, "rewards/margins": -0.0598042793571949, "rewards/rejected": 0.0390116386115551, "step": 6 }, { "epoch": 0.02697495183044316, "grad_norm": 438.0790710449219, "learning_rate": 4.4871794871794876e-07, "logits/chosen": 1.8216146230697632, "logits/rejected": 1.1456705331802368, "logps/chosen": -2376.0, "logps/rejected": -1713.3333740234375, "loss": 0.6877, "rewards/accuracies": 0.2638888955116272, "rewards/chosen": 0.1390787810087204, "rewards/margins": 0.1389973908662796, "rewards/rejected": 0.0, "step": 7 }, { "epoch": 0.030828516377649325, "grad_norm": 372.55230712890625, "learning_rate": 5.128205128205128e-07, "logits/chosen": 1.3411458730697632, "logits/rejected": 0.6773274540901184, "logps/chosen": -2212.0, "logps/rejected": -1584.0, "loss": 0.6407, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.2947591245174408, "rewards/margins": 0.2540690004825592, "rewards/rejected": 0.0403238944709301, "step": 8 }, { "epoch": 0.03468208092485549, "grad_norm": 433.6789855957031, "learning_rate": 5.76923076923077e-07, "logits/chosen": 1.6067708730697632, "logits/rejected": 0.986328125, "logps/chosen": -2416.0, "logps/rejected": -1789.3333740234375, "loss": 0.6505, "rewards/accuracies": 0.354166716337204, "rewards/chosen": 0.1943155974149704, "rewards/margins": 0.2057902067899704, "rewards/rejected": -0.011128743179142475, "step": 9 }, { "epoch": 0.038535645472061654, "grad_norm": 317.9736633300781, "learning_rate": 6.41025641025641e-07, "logits/chosen": 1.330078125, "logits/rejected": 0.6175944209098816, "logps/chosen": -2160.0, "logps/rejected": -1560.0, "loss": 0.5536, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.435546875, "rewards/margins": 0.4632161557674408, "rewards/rejected": -0.02777099609375, "step": 10 }, { "epoch": 0.04238921001926782, "grad_norm": 304.54364013671875, "learning_rate": 7.051282051282052e-07, "logits/chosen": 1.56640625, "logits/rejected": 0.9544270634651184, "logps/chosen": -2069.333251953125, "logps/rejected": -1509.3333740234375, "loss": 0.5421, "rewards/accuracies": 0.5347222685813904, "rewards/chosen": 0.4866536557674408, "rewards/margins": 0.4855143129825592, "rewards/rejected": 0.0013529459247365594, "step": 11 }, { "epoch": 0.046242774566473986, "grad_norm": 286.419189453125, "learning_rate": 7.692307692307694e-07, "logits/chosen": 1.6197916269302368, "logits/rejected": 1.044921875, "logps/chosen": -2424.0, "logps/rejected": -1832.0, "loss": 0.472, "rewards/accuracies": 0.5972222685813904, "rewards/chosen": 0.6129557490348816, "rewards/margins": 0.6751301884651184, "rewards/rejected": -0.06182861328125, "step": 12 }, { "epoch": 0.05009633911368015, "grad_norm": 141.82571411132812, "learning_rate": 8.333333333333333e-07, "logits/chosen": 1.5598958730697632, "logits/rejected": 0.9134114384651184, "logps/chosen": -2264.0, "logps/rejected": -1661.3333740234375, "loss": 0.2929, "rewards/accuracies": 0.8611111640930176, "rewards/chosen": 1.4231771230697632, "rewards/margins": 1.4596353769302368, "rewards/rejected": -0.0360921211540699, "step": 13 }, { "epoch": 0.05394990366088632, "grad_norm": 129.6219482421875, "learning_rate": 8.974358974358975e-07, "logits/chosen": 1.6653646230697632, "logits/rejected": 0.9921875, "logps/chosen": -2305.333251953125, "logps/rejected": -1693.3333740234375, "loss": 0.259, "rewards/accuracies": 0.895833432674408, "rewards/chosen": 1.6705728769302368, "rewards/margins": 1.7395833730697632, "rewards/rejected": -0.0674845352768898, "step": 14 }, { "epoch": 0.057803468208092484, "grad_norm": 109.51737976074219, "learning_rate": 9.615384615384617e-07, "logits/chosen": 1.3333333730697632, "logits/rejected": 0.7392578125, "logps/chosen": -2308.0, "logps/rejected": -1712.0, "loss": 0.2452, "rewards/accuracies": 0.868055522441864, "rewards/chosen": 1.7565103769302368, "rewards/margins": 1.7994791269302368, "rewards/rejected": -0.0431721992790699, "step": 15 }, { "epoch": 0.06165703275529865, "grad_norm": 100.95893096923828, "learning_rate": 1.0256410256410257e-06, "logits/chosen": 1.720703125, "logits/rejected": 1.0779622793197632, "logps/chosen": -2328.0, "logps/rejected": -1733.3333740234375, "loss": 0.2138, "rewards/accuracies": 0.9097222685813904, "rewards/chosen": 1.9830728769302368, "rewards/margins": 2.0377604961395264, "rewards/rejected": -0.0556437186896801, "step": 16 }, { "epoch": 0.06551059730250482, "grad_norm": 53.36378479003906, "learning_rate": 1.0897435897435899e-06, "logits/chosen": 1.7734375, "logits/rejected": 1.1705728769302368, "logps/chosen": -2284.0, "logps/rejected": -1690.6666259765625, "loss": 0.1169, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": 2.9791667461395264, "rewards/margins": 3.0885417461395264, "rewards/rejected": -0.1070963516831398, "step": 17 }, { "epoch": 0.06936416184971098, "grad_norm": 18.18455696105957, "learning_rate": 1.153846153846154e-06, "logits/chosen": 1.8046875, "logits/rejected": 1.1145833730697632, "logps/chosen": -2082.666748046875, "logps/rejected": -1552.0, "loss": 0.0644, "rewards/accuracies": 0.9722222685813904, "rewards/chosen": 4.596354007720947, "rewards/margins": 4.731770992279053, "rewards/rejected": -0.13323974609375, "step": 18 }, { "epoch": 0.07321772639691715, "grad_norm": 13.227095603942871, "learning_rate": 1.217948717948718e-06, "logits/chosen": 2.0169270038604736, "logits/rejected": 1.4205728769302368, "logps/chosen": -2277.333251953125, "logps/rejected": -1724.0, "loss": 0.0441, "rewards/accuracies": 0.9791667461395264, "rewards/chosen": 5.34375, "rewards/margins": 5.541666507720947, "rewards/rejected": -0.2017008513212204, "step": 19 }, { "epoch": 0.07707129094412331, "grad_norm": 10.507977485656738, "learning_rate": 1.282051282051282e-06, "logits/chosen": 1.7981771230697632, "logits/rejected": 1.1018880605697632, "logps/chosen": -2196.0, "logps/rejected": -1606.6666259765625, "loss": 0.036, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 6.208333492279053, "rewards/margins": 6.364583492279053, "rewards/rejected": -0.15087890625, "step": 20 }, { "epoch": 0.08092485549132948, "grad_norm": 12.47641658782959, "learning_rate": 1.3461538461538462e-06, "logits/chosen": 1.7942708730697632, "logits/rejected": 1.1002603769302368, "logps/chosen": -2218.666748046875, "logps/rejected": -1641.3333740234375, "loss": 0.0321, "rewards/accuracies": 0.9861111640930176, "rewards/chosen": 6.333333492279053, "rewards/margins": 6.510416507720947, "rewards/rejected": -0.169677734375, "step": 21 }, { "epoch": 0.08477842003853564, "grad_norm": 11.549690246582031, "learning_rate": 1.4102564102564104e-06, "logits/chosen": 2.0143229961395264, "logits/rejected": 1.326171875, "logps/chosen": -2433.333251953125, "logps/rejected": -1828.0, "loss": 0.0285, "rewards/accuracies": 0.9861111640930176, "rewards/chosen": 7.140625, "rewards/margins": 7.401041507720947, "rewards/rejected": -0.2650553286075592, "step": 22 }, { "epoch": 0.08863198458574181, "grad_norm": 8.92558479309082, "learning_rate": 1.4743589743589745e-06, "logits/chosen": 1.9908853769302368, "logits/rejected": 1.4466146230697632, "logps/chosen": -2345.333251953125, "logps/rejected": -1792.0, "loss": 0.0225, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 6.932291507720947, "rewards/margins": 7.182291507720947, "rewards/rejected": -0.2403767853975296, "step": 23 }, { "epoch": 0.09248554913294797, "grad_norm": 1.44755220413208, "learning_rate": 1.5384615384615387e-06, "logits/chosen": 2.3619792461395264, "logits/rejected": 1.7864583730697632, "logps/chosen": -2117.333251953125, "logps/rejected": -1616.0, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 9.59375, "rewards/margins": 10.177083015441895, "rewards/rejected": -0.591796875, "step": 24 }, { "epoch": 0.09633911368015415, "grad_norm": 1.0887795686721802, "learning_rate": 1.602564102564103e-06, "logits/chosen": 2.5520832538604736, "logits/rejected": 1.9375, "logps/chosen": -2352.0, "logps/rejected": -1850.6666259765625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 12.229166984558105, "rewards/margins": 13.09375, "rewards/rejected": -0.8736979365348816, "step": 25 }, { "epoch": 0.1001926782273603, "grad_norm": 1.4453604221343994, "learning_rate": 1.6666666666666667e-06, "logits/chosen": 2.6276042461395264, "logits/rejected": 2.0950520038604736, "logps/chosen": -1996.0, "logps/rejected": -1533.3333740234375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 11.760416984558105, "rewards/margins": 12.677083015441895, "rewards/rejected": -0.9244791865348816, "step": 26 }, { "epoch": 0.10404624277456648, "grad_norm": 0.8091738224029541, "learning_rate": 1.7307692307692308e-06, "logits/chosen": 2.7916667461395264, "logits/rejected": 2.1692707538604736, "logps/chosen": -2254.666748046875, "logps/rejected": -1789.3333740234375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 12.4375, "rewards/margins": 13.21875, "rewards/rejected": -0.7900390625, "step": 27 }, { "epoch": 0.10789980732177264, "grad_norm": 0.4383913278579712, "learning_rate": 1.794871794871795e-06, "logits/chosen": 2.7265625, "logits/rejected": 2.2135417461395264, "logps/chosen": -2296.0, "logps/rejected": -1784.0, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 14.875, "rewards/margins": 15.666666984558105, "rewards/rejected": -0.8157551884651184, "step": 28 }, { "epoch": 0.11175337186897881, "grad_norm": 1.0030765533447266, "learning_rate": 1.8589743589743592e-06, "logits/chosen": 2.734375, "logits/rejected": 2.1393229961395264, "logps/chosen": -2024.0, "logps/rejected": -1558.0, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 13.479166984558105, "rewards/margins": 14.552083015441895, "rewards/rejected": -1.0748697519302368, "step": 29 }, { "epoch": 0.11560693641618497, "grad_norm": 0.2180957943201065, "learning_rate": 1.9230769230769234e-06, "logits/chosen": 2.9401042461395264, "logits/rejected": 2.3658854961395264, "logps/chosen": -2174.666748046875, "logps/rejected": -1697.3333740234375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 14.96875, "rewards/margins": 15.8125, "rewards/rejected": -0.8196614384651184, "step": 30 }, { "epoch": 0.11946050096339114, "grad_norm": 0.45384857058525085, "learning_rate": 1.987179487179487e-06, "logits/chosen": 3.0807292461395264, "logits/rejected": 2.6145832538604736, "logps/chosen": -2309.333251953125, "logps/rejected": -1833.3333740234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 14.572916984558105, "rewards/margins": 15.9375, "rewards/rejected": -1.3541666269302368, "step": 31 }, { "epoch": 0.1233140655105973, "grad_norm": 0.2732933759689331, "learning_rate": 2.0512820512820513e-06, "logits/chosen": 2.8411457538604736, "logits/rejected": 2.3411457538604736, "logps/chosen": -2002.6666259765625, "logps/rejected": -1565.3333740234375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 14.1875, "rewards/margins": 15.177083015441895, "rewards/rejected": -0.9830729365348816, "step": 32 }, { "epoch": 0.12716763005780346, "grad_norm": 0.4784167408943176, "learning_rate": 2.1153846153846155e-06, "logits/chosen": 2.8645832538604736, "logits/rejected": 2.3177082538604736, "logps/chosen": -2181.333251953125, "logps/rejected": -1694.6666259765625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 16.5625, "rewards/margins": 18.30208396911621, "rewards/rejected": -1.7395833730697632, "step": 33 }, { "epoch": 0.13102119460500963, "grad_norm": 0.15778791904449463, "learning_rate": 2.1794871794871797e-06, "logits/chosen": 3.1145832538604736, "logits/rejected": 2.6432292461395264, "logps/chosen": -2212.0, "logps/rejected": -1764.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 16.58333396911621, "rewards/margins": 18.53125, "rewards/rejected": -1.9348958730697632, "step": 34 }, { "epoch": 0.1348747591522158, "grad_norm": 0.33640778064727783, "learning_rate": 2.243589743589744e-06, "logits/chosen": 3.4817707538604736, "logits/rejected": 3.1067707538604736, "logps/chosen": -2046.6666259765625, "logps/rejected": -1670.6666259765625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 14.822916984558105, "rewards/margins": 17.32291603088379, "rewards/rejected": -2.5, "step": 35 }, { "epoch": 0.13872832369942195, "grad_norm": 0.15328630805015564, "learning_rate": 2.307692307692308e-06, "logits/chosen": 3.2864582538604736, "logits/rejected": 2.8932292461395264, "logps/chosen": -1974.6666259765625, "logps/rejected": -1586.6666259765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 16.82291603088379, "rewards/margins": 19.58333396911621, "rewards/rejected": -2.7604167461395264, "step": 36 }, { "epoch": 0.14258188824662812, "grad_norm": 0.047410909086465836, "learning_rate": 2.371794871794872e-06, "logits/chosen": 3.5286457538604736, "logits/rejected": 3.1979167461395264, "logps/chosen": -2182.666748046875, "logps/rejected": -1753.3333740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.35416603088379, "rewards/margins": 22.47916603088379, "rewards/rejected": -3.0911457538604736, "step": 37 }, { "epoch": 0.1464354527938343, "grad_norm": 0.032907333225011826, "learning_rate": 2.435897435897436e-06, "logits/chosen": 3.5911457538604736, "logits/rejected": 3.2239582538604736, "logps/chosen": -2046.6666259765625, "logps/rejected": -1652.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 18.5, "rewards/margins": 21.54166603088379, "rewards/rejected": -3.0481770038604736, "step": 38 }, { "epoch": 0.15028901734104047, "grad_norm": 0.012092908844351768, "learning_rate": 2.5e-06, "logits/chosen": 3.796875, "logits/rejected": 3.5520832538604736, "logps/chosen": -2168.0, "logps/rejected": -1776.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.375, "rewards/margins": 22.58333396911621, "rewards/rejected": -3.2213542461395264, "step": 39 }, { "epoch": 0.15414258188824662, "grad_norm": 0.019754642620682716, "learning_rate": 2.564102564102564e-06, "logits/chosen": 3.84375, "logits/rejected": 3.6666667461395264, "logps/chosen": -2005.3333740234375, "logps/rejected": -1618.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.13541603088379, "rewards/margins": 22.64583396911621, "rewards/rejected": -3.5078125, "step": 40 }, { "epoch": 0.1579961464354528, "grad_norm": 0.010096232406795025, "learning_rate": 2.6282051282051286e-06, "logits/chosen": 3.9791667461395264, "logits/rejected": 3.7135417461395264, "logps/chosen": -2234.666748046875, "logps/rejected": -1844.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.58333396911621, "rewards/margins": 23.625, "rewards/rejected": -4.075520992279053, "step": 41 }, { "epoch": 0.16184971098265896, "grad_norm": 0.09703537821769714, "learning_rate": 2.6923076923076923e-06, "logits/chosen": 3.7109375, "logits/rejected": 3.4739582538604736, "logps/chosen": -2141.333251953125, "logps/rejected": -1774.6666259765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 18.97916603088379, "rewards/margins": 23.35416603088379, "rewards/rejected": -4.3515625, "step": 42 }, { "epoch": 0.16570327552986513, "grad_norm": 0.07086902856826782, "learning_rate": 2.756410256410257e-06, "logits/chosen": 3.6770832538604736, "logits/rejected": 3.4348957538604736, "logps/chosen": -2282.666748046875, "logps/rejected": -1856.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 20.9375, "rewards/margins": 25.0, "rewards/rejected": -4.080729007720947, "step": 43 }, { "epoch": 0.16955684007707128, "grad_norm": 0.034193553030490875, "learning_rate": 2.8205128205128207e-06, "logits/chosen": 3.9453125, "logits/rejected": 3.7057292461395264, "logps/chosen": -2293.333251953125, "logps/rejected": -1908.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.98958396911621, "rewards/margins": 24.3125, "rewards/rejected": -4.330729007720947, "step": 44 }, { "epoch": 0.17341040462427745, "grad_norm": 0.006385047920048237, "learning_rate": 2.8846153846153845e-06, "logits/chosen": 3.9140625, "logits/rejected": 3.7109375, "logps/chosen": -2188.0, "logps/rejected": -1782.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.3125, "rewards/margins": 24.97916603088379, "rewards/rejected": -4.622395992279053, "step": 45 }, { "epoch": 0.17726396917148363, "grad_norm": 0.10817603021860123, "learning_rate": 2.948717948717949e-06, "logits/chosen": 3.953125, "logits/rejected": 3.6536457538604736, "logps/chosen": -2026.6666259765625, "logps/rejected": -1685.3333740234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 17.92708396911621, "rewards/margins": 22.45833396911621, "rewards/rejected": -4.536458492279053, "step": 46 }, { "epoch": 0.1811175337186898, "grad_norm": 0.22870993614196777, "learning_rate": 3.012820512820513e-06, "logits/chosen": 4.1640625, "logits/rejected": 3.9973957538604736, "logps/chosen": -2280.0, "logps/rejected": -1897.3333740234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 20.1875, "rewards/margins": 25.3125, "rewards/rejected": -5.130208492279053, "step": 47 }, { "epoch": 0.18497109826589594, "grad_norm": 0.017764927819371223, "learning_rate": 3.0769230769230774e-06, "logits/chosen": 4.1484375, "logits/rejected": 3.9557292461395264, "logps/chosen": -2266.666748046875, "logps/rejected": -1896.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.83333396911621, "rewards/margins": 24.79166603088379, "rewards/rejected": -4.989583492279053, "step": 48 }, { "epoch": 0.18882466281310212, "grad_norm": 0.016674794256687164, "learning_rate": 3.141025641025641e-06, "logits/chosen": 4.1171875, "logits/rejected": 4.0546875, "logps/chosen": -1901.3333740234375, "logps/rejected": -1554.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 18.0, "rewards/margins": 22.5625, "rewards/rejected": -4.604166507720947, "step": 49 }, { "epoch": 0.1926782273603083, "grad_norm": 0.15108929574489594, "learning_rate": 3.205128205128206e-06, "logits/chosen": 4.033854007720947, "logits/rejected": 3.8411457538604736, "logps/chosen": -2014.6666259765625, "logps/rejected": -1633.3333740234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 19.79166603088379, "rewards/margins": 25.10416603088379, "rewards/rejected": -5.328125, "step": 50 }, { "epoch": 0.1926782273603083, "eval_logits/chosen": 3.982088327407837, "eval_logits/rejected": 3.8206934928894043, "eval_logps/chosen": -2109.170654296875, "eval_logps/rejected": -1737.1707763671875, "eval_loss": 0.0009570037364028394, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 19.470273971557617, "eval_rewards/margins": 24.602134704589844, "eval_rewards/rejected": -5.122522830963135, "eval_runtime": 348.065, "eval_samples_per_second": 5.651, "eval_steps_per_second": 0.236, "step": 50 }, { "epoch": 0.19653179190751446, "grad_norm": 0.01975693739950657, "learning_rate": 3.2692307692307696e-06, "logits/chosen": 3.9713542461395264, "logits/rejected": 3.7265625, "logps/chosen": -2062.666748046875, "logps/rejected": -1696.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.97916603088379, "rewards/margins": 25.29166603088379, "rewards/rejected": -5.317708492279053, "step": 51 }, { "epoch": 0.2003853564547206, "grad_norm": 0.003589821746572852, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 4.1171875, "logits/rejected": 3.9635417461395264, "logps/chosen": -2125.333251953125, "logps/rejected": -1753.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.6875, "rewards/margins": 25.14583396911621, "rewards/rejected": -5.455729007720947, "step": 52 }, { "epoch": 0.20423892100192678, "grad_norm": 0.03871523216366768, "learning_rate": 3.397435897435898e-06, "logits/chosen": 3.9114582538604736, "logits/rejected": 3.6640625, "logps/chosen": -1968.0, "logps/rejected": -1612.0, "loss": 0.0049, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.97916603088379, "rewards/margins": 24.08333396911621, "rewards/rejected": -5.125, "step": 53 }, { "epoch": 0.20809248554913296, "grad_norm": 0.008748271502554417, "learning_rate": 3.4615384615384617e-06, "logits/chosen": 3.8671875, "logits/rejected": 3.6328125, "logps/chosen": -2134.666748046875, "logps/rejected": -1769.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.39583396911621, "rewards/margins": 25.72916603088379, "rewards/rejected": -5.375, "step": 54 }, { "epoch": 0.2119460500963391, "grad_norm": 0.0759611427783966, "learning_rate": 3.5256410256410263e-06, "logits/chosen": 3.8463542461395264, "logits/rejected": 3.6015625, "logps/chosen": -1994.6666259765625, "logps/rejected": -1634.6666259765625, "loss": 0.0049, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.03125, "rewards/margins": 24.41666603088379, "rewards/rejected": -5.385416507720947, "step": 55 }, { "epoch": 0.21579961464354527, "grad_norm": 0.1303359717130661, "learning_rate": 3.58974358974359e-06, "logits/chosen": 4.1328125, "logits/rejected": 4.026041507720947, "logps/chosen": -2118.666748046875, "logps/rejected": -1722.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.95833396911621, "rewards/margins": 25.6875, "rewards/rejected": -5.71875, "step": 56 }, { "epoch": 0.21965317919075145, "grad_norm": 0.006375041790306568, "learning_rate": 3.653846153846154e-06, "logits/chosen": 3.8958332538604736, "logits/rejected": 3.7369792461395264, "logps/chosen": -2112.0, "logps/rejected": -1742.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.52083396911621, "rewards/margins": 26.52083396911621, "rewards/rejected": -6.010416507720947, "step": 57 }, { "epoch": 0.22350674373795762, "grad_norm": 0.029440419748425484, "learning_rate": 3.7179487179487184e-06, "logits/chosen": 4.002604007720947, "logits/rejected": 3.8880207538604736, "logps/chosen": -2077.333251953125, "logps/rejected": -1717.3333740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 25.08333396911621, "rewards/rejected": -5.609375, "step": 58 }, { "epoch": 0.22736030828516376, "grad_norm": 0.04970398545265198, "learning_rate": 3.782051282051282e-06, "logits/chosen": 3.9895832538604736, "logits/rejected": 3.7708332538604736, "logps/chosen": -2126.666748046875, "logps/rejected": -1762.6666259765625, "loss": 0.0049, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.60416603088379, "rewards/margins": 25.58333396911621, "rewards/rejected": -5.947916507720947, "step": 59 }, { "epoch": 0.23121387283236994, "grad_norm": 0.019775498658418655, "learning_rate": 3.846153846153847e-06, "logits/chosen": 4.080729007720947, "logits/rejected": 3.9505207538604736, "logps/chosen": -2014.6666259765625, "logps/rejected": -1677.3333740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 25.29166603088379, "rewards/rejected": -6.119791507720947, "step": 60 }, { "epoch": 0.2350674373795761, "grad_norm": 0.006104462780058384, "learning_rate": 3.910256410256411e-06, "logits/chosen": 3.875, "logits/rejected": 3.7447917461395264, "logps/chosen": -1852.0, "logps/rejected": -1517.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.79166603088379, "rewards/margins": 24.16666603088379, "rewards/rejected": -5.375, "step": 61 }, { "epoch": 0.23892100192678228, "grad_norm": 0.14444145560264587, "learning_rate": 3.974358974358974e-06, "logits/chosen": 4.125, "logits/rejected": 4.020833492279053, "logps/chosen": -2354.666748046875, "logps/rejected": -2006.6666259765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 20.71875, "rewards/margins": 27.10416603088379, "rewards/rejected": -6.390625, "step": 62 }, { "epoch": 0.24277456647398843, "grad_norm": 0.007748492527753115, "learning_rate": 4.0384615384615385e-06, "logits/chosen": 4.098958492279053, "logits/rejected": 4.080729007720947, "logps/chosen": -1901.3333740234375, "logps/rejected": -1544.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5625, "rewards/margins": 25.6875, "rewards/rejected": -6.166666507720947, "step": 63 }, { "epoch": 0.2466281310211946, "grad_norm": 0.0066373045556247234, "learning_rate": 4.102564102564103e-06, "logits/chosen": 4.044270992279053, "logits/rejected": 3.9635417461395264, "logps/chosen": -2042.6666259765625, "logps/rejected": -1696.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.72916603088379, "rewards/margins": 26.45833396911621, "rewards/rejected": -6.739583492279053, "step": 64 }, { "epoch": 0.2504816955684008, "grad_norm": 0.011538499034941196, "learning_rate": 4.166666666666667e-06, "logits/chosen": 4.010416507720947, "logits/rejected": 3.859375, "logps/chosen": -2330.666748046875, "logps/rejected": -1916.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 23.33333396911621, "rewards/margins": 29.64583396911621, "rewards/rejected": -6.307291507720947, "step": 65 }, { "epoch": 0.2543352601156069, "grad_norm": 0.002117054769769311, "learning_rate": 4.230769230769231e-06, "logits/chosen": 3.9661457538604736, "logits/rejected": 3.8098957538604736, "logps/chosen": -2078.666748046875, "logps/rejected": -1737.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.625, "rewards/margins": 26.52083396911621, "rewards/rejected": -6.875, "step": 66 }, { "epoch": 0.2581888246628131, "grad_norm": 0.0023708210792392492, "learning_rate": 4.294871794871795e-06, "logits/chosen": 4.145833492279053, "logits/rejected": 4.130208492279053, "logps/chosen": -1994.6666259765625, "logps/rejected": -1656.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.20833396911621, "rewards/margins": 26.16666603088379, "rewards/rejected": -6.953125, "step": 67 }, { "epoch": 0.26204238921001927, "grad_norm": 0.008680014871060848, "learning_rate": 4.358974358974359e-06, "logits/chosen": 4.184895992279053, "logits/rejected": 4.036458492279053, "logps/chosen": -2061.333251953125, "logps/rejected": -1720.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 27.1875, "rewards/rejected": -7.317708492279053, "step": 68 }, { "epoch": 0.2658959537572254, "grad_norm": 0.003541042795404792, "learning_rate": 4.423076923076924e-06, "logits/chosen": 4.052083492279053, "logits/rejected": 4.010416507720947, "logps/chosen": -1946.6666259765625, "logps/rejected": -1613.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.4375, "rewards/margins": 25.5625, "rewards/rejected": -6.125, "step": 69 }, { "epoch": 0.2697495183044316, "grad_norm": 0.04448345676064491, "learning_rate": 4.487179487179488e-06, "logits/chosen": 4.067708492279053, "logits/rejected": 3.9635417461395264, "logps/chosen": -2218.666748046875, "logps/rejected": -1870.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 27.27083396911621, "rewards/rejected": -7.109375, "step": 70 }, { "epoch": 0.27360308285163776, "grad_norm": 0.002983363810926676, "learning_rate": 4.551282051282052e-06, "logits/chosen": 3.8854167461395264, "logits/rejected": 3.7682292461395264, "logps/chosen": -1994.6666259765625, "logps/rejected": -1661.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.91666603088379, "rewards/margins": 26.97916603088379, "rewards/rejected": -7.067708492279053, "step": 71 }, { "epoch": 0.2774566473988439, "grad_norm": 0.001534629613161087, "learning_rate": 4.615384615384616e-06, "logits/chosen": 4.2265625, "logits/rejected": 4.067708492279053, "logps/chosen": -2369.333251953125, "logps/rejected": -1988.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.22916603088379, "rewards/margins": 30.22916603088379, "rewards/rejected": -8.020833015441895, "step": 72 }, { "epoch": 0.2813102119460501, "grad_norm": 0.002654832089319825, "learning_rate": 4.6794871794871795e-06, "logits/chosen": 4.028645992279053, "logits/rejected": 3.9244792461395264, "logps/chosen": -1996.0, "logps/rejected": -1637.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.15625, "rewards/margins": 27.45833396911621, "rewards/rejected": -7.28125, "step": 73 }, { "epoch": 0.28516377649325625, "grad_norm": 0.003633022541180253, "learning_rate": 4.743589743589744e-06, "logits/chosen": 4.169270992279053, "logits/rejected": 4.190104007720947, "logps/chosen": -1802.6666259765625, "logps/rejected": -1486.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.78125, "rewards/margins": 26.16666603088379, "rewards/rejected": -7.375, "step": 74 }, { "epoch": 0.28901734104046245, "grad_norm": 0.014494485221803188, "learning_rate": 4.807692307692308e-06, "logits/chosen": 3.9244792461395264, "logits/rejected": 3.7682292461395264, "logps/chosen": -2072.0, "logps/rejected": -1714.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 21.20833396911621, "rewards/margins": 28.9375, "rewards/rejected": -7.729166507720947, "step": 75 }, { "epoch": 0.2928709055876686, "grad_norm": 0.2149343341588974, "learning_rate": 4.871794871794872e-06, "logits/chosen": 4.192708492279053, "logits/rejected": 4.2265625, "logps/chosen": -1848.0, "logps/rejected": -1526.6666259765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 18.22916603088379, "rewards/margins": 25.16666603088379, "rewards/rejected": -6.9375, "step": 76 }, { "epoch": 0.29672447013487474, "grad_norm": 0.019362634047865868, "learning_rate": 4.935897435897436e-06, "logits/chosen": 3.9609375, "logits/rejected": 3.84375, "logps/chosen": -2040.0, "logps/rejected": -1734.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.65625, "rewards/margins": 26.33333396911621, "rewards/rejected": -7.661458492279053, "step": 77 }, { "epoch": 0.30057803468208094, "grad_norm": 0.00887187011539936, "learning_rate": 5e-06, "logits/chosen": 4.044270992279053, "logits/rejected": 3.9427082538604736, "logps/chosen": -2185.333251953125, "logps/rejected": -1864.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 28.27083396911621, "rewards/rejected": -8.098958015441895, "step": 78 }, { "epoch": 0.3044315992292871, "grad_norm": 0.2066686451435089, "learning_rate": 4.999974750389921e-06, "logits/chosen": 3.9557292461395264, "logits/rejected": 3.84375, "logps/chosen": -2113.333251953125, "logps/rejected": -1776.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.83333396911621, "rewards/margins": 28.4375, "rewards/rejected": -7.5625, "step": 79 }, { "epoch": 0.30828516377649323, "grad_norm": 0.007379862014204264, "learning_rate": 4.999899002069717e-06, "logits/chosen": 4.0859375, "logits/rejected": 4.0, "logps/chosen": -1998.6666259765625, "logps/rejected": -1621.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.02083396911621, "rewards/margins": 28.95833396911621, "rewards/rejected": -7.890625, "step": 80 }, { "epoch": 0.31213872832369943, "grad_norm": 0.005021043587476015, "learning_rate": 4.999772756569482e-06, "logits/chosen": 4.065104007720947, "logits/rejected": 3.8958332538604736, "logps/chosen": -2114.666748046875, "logps/rejected": -1796.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.35416603088379, "rewards/margins": 28.77083396911621, "rewards/rejected": -8.416666984558105, "step": 81 }, { "epoch": 0.3159922928709056, "grad_norm": 0.3324376046657562, "learning_rate": 4.9995960164393334e-06, "logits/chosen": 4.151041507720947, "logits/rejected": 4.088541507720947, "logps/chosen": -2066.666748046875, "logps/rejected": -1741.3333740234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 27.25, "rewards/rejected": -8.041666984558105, "step": 82 }, { "epoch": 0.3198458574181118, "grad_norm": 0.0030700168572366238, "learning_rate": 4.999368785249369e-06, "logits/chosen": 3.9479167461395264, "logits/rejected": 3.7421875, "logps/chosen": -2148.0, "logps/rejected": -1829.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 27.54166603088379, "rewards/rejected": -8.390625, "step": 83 }, { "epoch": 0.3236994219653179, "grad_norm": 0.08104661107063293, "learning_rate": 4.999091067589587e-06, "logits/chosen": 4.197916507720947, "logits/rejected": 4.0703125, "logps/chosen": -2106.666748046875, "logps/rejected": -1752.0, "loss": 0.0049, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.73958396911621, "rewards/margins": 28.91666603088379, "rewards/rejected": -8.192708015441895, "step": 84 }, { "epoch": 0.32755298651252407, "grad_norm": 0.03488411009311676, "learning_rate": 4.998762869069797e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.15625, "logps/chosen": -2121.333251953125, "logps/rejected": -1752.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.125, "rewards/margins": 29.97916603088379, "rewards/rejected": -8.802083015441895, "step": 85 }, { "epoch": 0.33140655105973027, "grad_norm": 0.002621225779876113, "learning_rate": 4.998384196319508e-06, "logits/chosen": 4.260416507720947, "logits/rejected": 4.169270992279053, "logps/chosen": -2080.0, "logps/rejected": -1733.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.85416603088379, "rewards/margins": 30.6875, "rewards/rejected": -8.838541984558105, "step": 86 }, { "epoch": 0.3352601156069364, "grad_norm": 0.02308177761733532, "learning_rate": 4.99795505698779e-06, "logits/chosen": 4.166666507720947, "logits/rejected": 3.9375, "logps/chosen": -2192.0, "logps/rejected": -1866.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 21.0, "rewards/margins": 30.08333396911621, "rewards/rejected": -9.0625, "step": 87 }, { "epoch": 0.33911368015414256, "grad_norm": 0.0020183336455374956, "learning_rate": 4.997475459743124e-06, "logits/chosen": 4.041666507720947, "logits/rejected": 3.8125, "logps/chosen": -2074.666748046875, "logps/rejected": -1772.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.25, "rewards/margins": 28.125, "rewards/rejected": -8.895833015441895, "step": 88 }, { "epoch": 0.34296724470134876, "grad_norm": 0.02345126122236252, "learning_rate": 4.996945414273225e-06, "logits/chosen": 4.283854007720947, "logits/rejected": 4.078125, "logps/chosen": -2378.666748046875, "logps/rejected": -2030.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.35416603088379, "rewards/margins": 31.54166603088379, "rewards/rejected": -10.177083015441895, "step": 89 }, { "epoch": 0.3468208092485549, "grad_norm": 0.00278104399330914, "learning_rate": 4.996364931284847e-06, "logits/chosen": 4.138020992279053, "logits/rejected": 4.106770992279053, "logps/chosen": -2005.3333740234375, "logps/rejected": -1657.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.66666603088379, "rewards/margins": 28.89583396911621, "rewards/rejected": -8.203125, "step": 90 }, { "epoch": 0.35067437379576105, "grad_norm": 0.0059468126855790615, "learning_rate": 4.995734022503565e-06, "logits/chosen": 4.026041507720947, "logits/rejected": 3.7057292461395264, "logps/chosen": -2257.333251953125, "logps/rejected": -1906.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.58333396911621, "rewards/margins": 30.8125, "rewards/rejected": -9.260416984558105, "step": 91 }, { "epoch": 0.35452793834296725, "grad_norm": 0.006139659788459539, "learning_rate": 4.9950527006735375e-06, "logits/chosen": 4.03125, "logits/rejected": 3.8307292461395264, "logps/chosen": -2029.3333740234375, "logps/rejected": -1717.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 28.70833396911621, "rewards/rejected": -8.567708015441895, "step": 92 }, { "epoch": 0.3583815028901734, "grad_norm": 0.0008786112302914262, "learning_rate": 4.994320979557256e-06, "logits/chosen": 4.34375, "logits/rejected": 4.161458492279053, "logps/chosen": -2157.333251953125, "logps/rejected": -1837.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.0, "rewards/margins": 31.89583396911621, "rewards/rejected": -9.854166984558105, "step": 93 }, { "epoch": 0.3622350674373796, "grad_norm": 0.004149068612605333, "learning_rate": 4.993538873935256e-06, "logits/chosen": 4.145833492279053, "logits/rejected": 3.9869792461395264, "logps/chosen": -2213.333251953125, "logps/rejected": -1896.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.39583396911621, "rewards/margins": 30.85416603088379, "rewards/rejected": -9.5, "step": 94 }, { "epoch": 0.36608863198458574, "grad_norm": 0.0028043068014085293, "learning_rate": 4.992706399605829e-06, "logits/chosen": 4.252604007720947, "logits/rejected": 4.244791507720947, "logps/chosen": -2006.6666259765625, "logps/rejected": -1678.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.625, "rewards/margins": 29.58333396911621, "rewards/rejected": -8.958333015441895, "step": 95 }, { "epoch": 0.3699421965317919, "grad_norm": 0.0021775844506919384, "learning_rate": 4.991823573384695e-06, "logits/chosen": 4.2265625, "logits/rejected": 4.046875, "logps/chosen": -2296.0, "logps/rejected": -1968.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.5, "rewards/margins": 31.10416603088379, "rewards/rejected": -9.609375, "step": 96 }, { "epoch": 0.3737957610789981, "grad_norm": 0.0022722152061760426, "learning_rate": 4.990890413104671e-06, "logits/chosen": 4.138020992279053, "logits/rejected": 4.0234375, "logps/chosen": -2085.333251953125, "logps/rejected": -1770.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.4375, "rewards/margins": 29.64583396911621, "rewards/rejected": -9.21875, "step": 97 }, { "epoch": 0.37764932562620424, "grad_norm": 0.0055904267355799675, "learning_rate": 4.989906937615302e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.239583492279053, "logps/chosen": -2138.666748046875, "logps/rejected": -1829.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.45833396911621, "rewards/margins": 30.58333396911621, "rewards/rejected": -10.114583015441895, "step": 98 }, { "epoch": 0.3815028901734104, "grad_norm": 0.0493633933365345, "learning_rate": 4.988873166782485e-06, "logits/chosen": 4.171875, "logits/rejected": 4.046875, "logps/chosen": -2190.666748046875, "logps/rejected": -1861.3333740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 21.125, "rewards/margins": 30.77083396911621, "rewards/rejected": -9.6875, "step": 99 }, { "epoch": 0.3853564547206166, "grad_norm": 0.004305478185415268, "learning_rate": 4.98778912148807e-06, "logits/chosen": 3.921875, "logits/rejected": 3.6614582538604736, "logps/chosen": -1960.0, "logps/rejected": -1645.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.6875, "rewards/margins": 29.77083396911621, "rewards/rejected": -9.104166984558105, "step": 100 }, { "epoch": 0.3853564547206166, "eval_logits/chosen": 4.14253044128418, "eval_logits/rejected": 3.96875, "eval_logps/chosen": -2095.51220703125, "eval_logps/rejected": -1781.1707763671875, "eval_loss": 0.0009135735454037786, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.794206619262695, "eval_rewards/margins": 30.300304412841797, "eval_rewards/rejected": -9.511052131652832, "eval_runtime": 348.1385, "eval_samples_per_second": 5.65, "eval_steps_per_second": 0.236, "step": 100 }, { "epoch": 0.3892100192678227, "grad_norm": 0.0011047407751902938, "learning_rate": 4.986654823629435e-06, "logits/chosen": 4.161458492279053, "logits/rejected": 4.002604007720947, "logps/chosen": -2213.333251953125, "logps/rejected": -1870.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.14583396911621, "rewards/margins": 30.70833396911621, "rewards/rejected": -9.5625, "step": 101 }, { "epoch": 0.3930635838150289, "grad_norm": 0.11183677613735199, "learning_rate": 4.985470296119038e-06, "logits/chosen": 4.015625, "logits/rejected": 3.7708332538604736, "logps/chosen": -2048.0, "logps/rejected": -1741.3333740234375, "loss": 0.005, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.95833396911621, "rewards/margins": 29.625, "rewards/rejected": -9.65625, "step": 102 }, { "epoch": 0.3969171483622351, "grad_norm": 0.000864846573676914, "learning_rate": 4.984235562883971e-06, "logits/chosen": 3.9453125, "logits/rejected": 3.6145832538604736, "logps/chosen": -2084.0, "logps/rejected": -1772.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.0, "rewards/margins": 30.64583396911621, "rewards/rejected": -9.645833015441895, "step": 103 }, { "epoch": 0.4007707129094412, "grad_norm": 0.0015598663594573736, "learning_rate": 4.982950648865457e-06, "logits/chosen": 4.255208492279053, "logits/rejected": 4.09375, "logps/chosen": -2100.0, "logps/rejected": -1765.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.0, "rewards/margins": 32.04166793823242, "rewards/rejected": -10.010416984558105, "step": 104 }, { "epoch": 0.4046242774566474, "grad_norm": 0.014870800077915192, "learning_rate": 4.981615580018358e-06, "logits/chosen": 4.098958492279053, "logits/rejected": 3.890625, "logps/chosen": -1890.6666259765625, "logps/rejected": -1569.3333740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 20.45833396911621, "rewards/margins": 29.54166603088379, "rewards/rejected": -9.072916984558105, "step": 105 }, { "epoch": 0.40847784200385356, "grad_norm": 0.002607164904475212, "learning_rate": 4.980230383310649e-06, "logits/chosen": 4.119791507720947, "logits/rejected": 3.8697917461395264, "logps/chosen": -2110.666748046875, "logps/rejected": -1801.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.33333396911621, "rewards/margins": 30.22916603088379, "rewards/rejected": -9.875, "step": 106 }, { "epoch": 0.4123314065510597, "grad_norm": 0.6502520442008972, "learning_rate": 4.978795086722873e-06, "logits/chosen": 4.192708492279053, "logits/rejected": 4.03125, "logps/chosen": -2044.0, "logps/rejected": -1750.6666259765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 19.83333396911621, "rewards/margins": 29.08333396911621, "rewards/rejected": -9.239583015441895, "step": 107 }, { "epoch": 0.4161849710982659, "grad_norm": 0.001943534123711288, "learning_rate": 4.977309719247571e-06, "logits/chosen": 4.28125, "logits/rejected": 4.200520992279053, "logps/chosen": -2058.666748046875, "logps/rejected": -1752.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.41666603088379, "rewards/margins": 30.39583396911621, "rewards/rejected": -9.958333015441895, "step": 108 }, { "epoch": 0.42003853564547206, "grad_norm": 0.016159335151314735, "learning_rate": 4.9757743108887045e-06, "logits/chosen": 4.177083492279053, "logits/rejected": 3.9869792461395264, "logps/chosen": -2061.333251953125, "logps/rejected": -1750.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.89583396911621, "rewards/margins": 32.0, "rewards/rejected": -10.135416984558105, "step": 109 }, { "epoch": 0.4238921001926782, "grad_norm": 0.008836313150823116, "learning_rate": 4.974188892661041e-06, "logits/chosen": 4.244791507720947, "logits/rejected": 4.234375, "logps/chosen": -1944.0, "logps/rejected": -1666.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.95833396911621, "rewards/margins": 27.4375, "rewards/rejected": -9.458333015441895, "step": 110 }, { "epoch": 0.4277456647398844, "grad_norm": 0.03773793950676918, "learning_rate": 4.972553496589537e-06, "logits/chosen": 4.252604007720947, "logits/rejected": 4.072916507720947, "logps/chosen": -2196.0, "logps/rejected": -1870.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.14583396911621, "rewards/margins": 31.33333396911621, "rewards/rejected": -10.197916984558105, "step": 111 }, { "epoch": 0.43159922928709055, "grad_norm": 0.004504084587097168, "learning_rate": 4.970868155708681e-06, "logits/chosen": 4.3203125, "logits/rejected": 4.229166507720947, "logps/chosen": -2177.333251953125, "logps/rejected": -1881.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.83333396911621, "rewards/margins": 31.27083396911621, "rewards/rejected": -10.427083015441895, "step": 112 }, { "epoch": 0.43545279383429675, "grad_norm": 0.003044608049094677, "learning_rate": 4.969132904061834e-06, "logits/chosen": 3.9921875, "logits/rejected": 4.010416507720947, "logps/chosen": -1908.0, "logps/rejected": -1604.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5, "rewards/margins": 29.27083396911621, "rewards/rejected": -9.791666984558105, "step": 113 }, { "epoch": 0.4393063583815029, "grad_norm": 0.00097280228510499, "learning_rate": 4.967347776700538e-06, "logits/chosen": 4.0703125, "logits/rejected": 4.002604007720947, "logps/chosen": -1897.3333740234375, "logps/rejected": -1614.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.89583396911621, "rewards/margins": 28.5625, "rewards/rejected": -9.666666984558105, "step": 114 }, { "epoch": 0.44315992292870904, "grad_norm": 0.01929352805018425, "learning_rate": 4.965512809683808e-06, "logits/chosen": 4.171875, "logits/rejected": 4.1328125, "logps/chosen": -2274.666748046875, "logps/rejected": -1962.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.77083396911621, "rewards/margins": 32.14583206176758, "rewards/rejected": -11.385416984558105, "step": 115 }, { "epoch": 0.44701348747591524, "grad_norm": 0.0003989486722275615, "learning_rate": 4.963628040077406e-06, "logits/chosen": 4.072916507720947, "logits/rejected": 4.0390625, "logps/chosen": -2213.333251953125, "logps/rejected": -1910.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 30.875, "rewards/rejected": -10.713541984558105, "step": 116 }, { "epoch": 0.4508670520231214, "grad_norm": 0.004931002389639616, "learning_rate": 4.9616935059530915e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.322916507720947, "logps/chosen": -2146.666748046875, "logps/rejected": -1821.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.27083396911621, "rewards/margins": 32.0, "rewards/rejected": -10.71875, "step": 117 }, { "epoch": 0.45472061657032753, "grad_norm": 0.004491392523050308, "learning_rate": 4.959709246387847e-06, "logits/chosen": 4.0546875, "logits/rejected": 4.143229007720947, "logps/chosen": -1741.3333740234375, "logps/rejected": -1470.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.125, "rewards/margins": 27.70833396911621, "rewards/rejected": -9.59375, "step": 118 }, { "epoch": 0.45857418111753373, "grad_norm": 0.0019086259417235851, "learning_rate": 4.957675301463099e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.059895992279053, "logps/chosen": -2314.666748046875, "logps/rejected": -2010.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.59375, "rewards/margins": 31.35416603088379, "rewards/rejected": -11.770833015441895, "step": 119 }, { "epoch": 0.4624277456647399, "grad_norm": 0.0012846403988078237, "learning_rate": 4.955591712263901e-06, "logits/chosen": 4.1875, "logits/rejected": 4.075520992279053, "logps/chosen": -2120.0, "logps/rejected": -1821.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.72916603088379, "rewards/margins": 31.75, "rewards/rejected": -11.03125, "step": 120 }, { "epoch": 0.4662813102119461, "grad_norm": 0.0004999793600291014, "learning_rate": 4.953458520878104e-06, "logits/chosen": 4.158854007720947, "logits/rejected": 4.098958492279053, "logps/chosen": -2056.0, "logps/rejected": -1745.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.04166603088379, "rewards/margins": 30.9375, "rewards/rejected": -10.895833015441895, "step": 121 }, { "epoch": 0.4701348747591522, "grad_norm": 0.00440808804705739, "learning_rate": 4.951275770395508e-06, "logits/chosen": 4.067708492279053, "logits/rejected": 4.080729007720947, "logps/chosen": -1872.0, "logps/rejected": -1574.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.69791603088379, "rewards/margins": 29.91666603088379, "rewards/rejected": -10.239583015441895, "step": 122 }, { "epoch": 0.47398843930635837, "grad_norm": 0.034791793674230576, "learning_rate": 4.9490435049069925e-06, "logits/chosen": 4.200520992279053, "logits/rejected": 4.221354007720947, "logps/chosen": -2086.666748046875, "logps/rejected": -1798.6666259765625, "loss": 0.0096, "rewards/accuracies": 0.9861111640930176, "rewards/chosen": 19.02083396911621, "rewards/margins": 30.04166603088379, "rewards/rejected": -11.052083015441895, "step": 123 }, { "epoch": 0.47784200385356457, "grad_norm": 0.0021993895061314106, "learning_rate": 4.946761769503624e-06, "logits/chosen": 4.190104007720947, "logits/rejected": 4.338541507720947, "logps/chosen": -1886.6666259765625, "logps/rejected": -1573.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.22916603088379, "rewards/margins": 30.6875, "rewards/rejected": -10.489583015441895, "step": 124 }, { "epoch": 0.4816955684007707, "grad_norm": 0.004487167112529278, "learning_rate": 4.944430610275747e-06, "logits/chosen": 4.1875, "logits/rejected": 4.1875, "logps/chosen": -2076.0, "logps/rejected": -1773.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5, "rewards/margins": 30.97916603088379, "rewards/rejected": -11.46875, "step": 125 }, { "epoch": 0.48554913294797686, "grad_norm": 0.0003160021733492613, "learning_rate": 4.942050074312048e-06, "logits/chosen": 4.203125, "logits/rejected": 4.197916507720947, "logps/chosen": -2100.0, "logps/rejected": -1812.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5, "rewards/margins": 32.02083206176758, "rewards/rejected": -11.520833015441895, "step": 126 }, { "epoch": 0.48940269749518306, "grad_norm": 0.0012640366330742836, "learning_rate": 4.939620209698613e-06, "logits/chosen": 4.1484375, "logits/rejected": 4.0, "logps/chosen": -2329.333251953125, "logps/rejected": -2026.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.46875, "rewards/margins": 34.41666793823242, "rewards/rejected": -12.895833015441895, "step": 127 }, { "epoch": 0.4932562620423892, "grad_norm": 0.0009563867351971567, "learning_rate": 4.9371410655179495e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.328125, "logps/chosen": -2054.666748046875, "logps/rejected": -1774.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.52083396911621, "rewards/margins": 32.4375, "rewards/rejected": -11.927083015441895, "step": 128 }, { "epoch": 0.49710982658959535, "grad_norm": 0.0027785792481154203, "learning_rate": 4.934612691847995e-06, "logits/chosen": 4.005208492279053, "logits/rejected": 4.0, "logps/chosen": -1880.0, "logps/rejected": -1572.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.02083396911621, "rewards/margins": 32.02083206176758, "rewards/rejected": -10.979166984558105, "step": 129 }, { "epoch": 0.5009633911368016, "grad_norm": 0.02179774083197117, "learning_rate": 4.932035139761111e-06, "logits/chosen": 4.2421875, "logits/rejected": 4.2421875, "logps/chosen": -2057.333251953125, "logps/rejected": -1794.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.39583396911621, "rewards/margins": 31.35416603088379, "rewards/rejected": -11.989583015441895, "step": 130 }, { "epoch": 0.5048169556840078, "grad_norm": 0.0005111052305437624, "learning_rate": 4.929408461323044e-06, "logits/chosen": 4.21875, "logits/rejected": 4.28125, "logps/chosen": -1985.3333740234375, "logps/rejected": -1696.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.90625, "rewards/margins": 30.77083396911621, "rewards/rejected": -10.8125, "step": 131 }, { "epoch": 0.5086705202312138, "grad_norm": 0.0005501223495230079, "learning_rate": 4.926732709591879e-06, "logits/chosen": 4.130208492279053, "logits/rejected": 4.205729007720947, "logps/chosen": -2141.333251953125, "logps/rejected": -1828.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.19791603088379, "rewards/margins": 31.20833396911621, "rewards/rejected": -11.010416984558105, "step": 132 }, { "epoch": 0.51252408477842, "grad_norm": 0.0010068649426102638, "learning_rate": 4.924007938616967e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.265625, "logps/chosen": -1874.6666259765625, "logps/rejected": -1598.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5625, "rewards/margins": 30.25, "rewards/rejected": -10.6875, "step": 133 }, { "epoch": 0.5163776493256262, "grad_norm": 0.0034830172080546618, "learning_rate": 4.921234203437832e-06, "logits/chosen": 4.036458492279053, "logits/rejected": 3.9947917461395264, "logps/chosen": -2061.333251953125, "logps/rejected": -1780.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.58333396911621, "rewards/margins": 33.3125, "rewards/rejected": -11.729166984558105, "step": 134 }, { "epoch": 0.5202312138728323, "grad_norm": 0.0010813827393576503, "learning_rate": 4.918411560083058e-06, "logits/chosen": 4.244791507720947, "logits/rejected": 4.377604007720947, "logps/chosen": -2006.6666259765625, "logps/rejected": -1694.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.89583396911621, "rewards/margins": 33.1875, "rewards/rejected": -12.270833015441895, "step": 135 }, { "epoch": 0.5240847784200385, "grad_norm": 0.0029865028336644173, "learning_rate": 4.915540065569163e-06, "logits/chosen": 4.231770992279053, "logits/rejected": 4.255208492279053, "logps/chosen": -2132.0, "logps/rejected": -1861.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.9375, "rewards/margins": 31.25, "rewards/rejected": -12.385416984558105, "step": 136 }, { "epoch": 0.5279383429672447, "grad_norm": 0.039576079696416855, "learning_rate": 4.9126197778994374e-06, "logits/chosen": 4.239583492279053, "logits/rejected": 4.338541507720947, "logps/chosen": -2094.666748046875, "logps/rejected": -1818.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.10416603088379, "rewards/margins": 31.0, "rewards/rejected": -11.90625, "step": 137 }, { "epoch": 0.5317919075144508, "grad_norm": 0.0007458980544470251, "learning_rate": 4.909650756062782e-06, "logits/chosen": 3.9791667461395264, "logits/rejected": 4.052083492279053, "logps/chosen": -1902.6666259765625, "logps/rejected": -1578.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.72916603088379, "rewards/margins": 32.25, "rewards/rejected": -10.53125, "step": 138 }, { "epoch": 0.535645472061657, "grad_norm": 0.0009885648032650352, "learning_rate": 4.906633060032514e-06, "logits/chosen": 4.260416507720947, "logits/rejected": 4.369791507720947, "logps/chosen": -1982.6666259765625, "logps/rejected": -1700.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.35416603088379, "rewards/margins": 29.875, "rewards/rejected": -11.541666984558105, "step": 139 }, { "epoch": 0.5394990366088632, "grad_norm": 0.0022061257623136044, "learning_rate": 4.903566750765148e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.494791507720947, "logps/chosen": -1850.6666259765625, "logps/rejected": -1558.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.59375, "rewards/margins": 30.29166603088379, "rewards/rejected": -11.6875, "step": 140 }, { "epoch": 0.5433526011560693, "grad_norm": 0.0011118401307612658, "learning_rate": 4.900451890199179e-06, "logits/chosen": 4.1953125, "logits/rejected": 4.328125, "logps/chosen": -2180.0, "logps/rejected": -1885.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.02083396911621, "rewards/margins": 31.85416603088379, "rewards/rejected": -11.791666984558105, "step": 141 }, { "epoch": 0.5472061657032755, "grad_norm": 0.027036940678954124, "learning_rate": 4.8972885412538155e-06, "logits/chosen": 4.174479007720947, "logits/rejected": 4.21875, "logps/chosen": -2025.3333740234375, "logps/rejected": -1744.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.5, "rewards/margins": 30.3125, "rewards/rejected": -10.8125, "step": 142 }, { "epoch": 0.5510597302504817, "grad_norm": 0.05268075317144394, "learning_rate": 4.894076767827721e-06, "logits/chosen": 4.1015625, "logits/rejected": 4.138020992279053, "logps/chosen": -1929.3333740234375, "logps/rejected": -1632.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 20.54166603088379, "rewards/margins": 32.02083206176758, "rewards/rejected": -11.510416984558105, "step": 143 }, { "epoch": 0.5549132947976878, "grad_norm": 0.002933803014457226, "learning_rate": 4.890816634797716e-06, "logits/chosen": 4.135416507720947, "logits/rejected": 4.1328125, "logps/chosen": -2206.666748046875, "logps/rejected": -1897.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.0, "rewards/margins": 35.125, "rewards/rejected": -13.114583015441895, "step": 144 }, { "epoch": 0.558766859344894, "grad_norm": 0.0009696971974335611, "learning_rate": 4.88750820801747e-06, "logits/chosen": 4.34375, "logits/rejected": 4.427083492279053, "logps/chosen": -1997.3333740234375, "logps/rejected": -1694.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.25, "rewards/margins": 32.41666793823242, "rewards/rejected": -12.197916984558105, "step": 145 }, { "epoch": 0.5626204238921002, "grad_norm": 0.0017577955732122064, "learning_rate": 4.884151554316175e-06, "logits/chosen": 4.276041507720947, "logits/rejected": 4.239583492279053, "logps/chosen": -2128.0, "logps/rejected": -1846.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0625, "rewards/margins": 32.95833206176758, "rewards/rejected": -12.9375, "step": 146 }, { "epoch": 0.5664739884393064, "grad_norm": 0.0012356030056253076, "learning_rate": 4.880746741497187e-06, "logits/chosen": 4.119791507720947, "logits/rejected": 4.143229007720947, "logps/chosen": -2137.333251953125, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.33333396911621, "rewards/margins": 31.3125, "rewards/rejected": -11.958333015441895, "step": 147 }, { "epoch": 0.5703275529865125, "grad_norm": 0.0005208005313761532, "learning_rate": 4.8772938383366615e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.28125, "logps/chosen": -2356.0, "logps/rejected": -2040.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.375, "rewards/margins": 33.52083206176758, "rewards/rejected": -12.125, "step": 148 }, { "epoch": 0.5741811175337187, "grad_norm": 0.014800318516790867, "learning_rate": 4.873792914582166e-06, "logits/chosen": 3.9270832538604736, "logits/rejected": 4.0859375, "logps/chosen": -1782.6666259765625, "logps/rejected": -1502.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.45833396911621, "rewards/margins": 30.6875, "rewards/rejected": -11.21875, "step": 149 }, { "epoch": 0.5780346820809249, "grad_norm": 9.917033457895741e-05, "learning_rate": 4.87024404095127e-06, "logits/chosen": 4.057291507720947, "logits/rejected": 4.145833492279053, "logps/chosen": -2221.333251953125, "logps/rejected": -1930.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.89583396911621, "rewards/margins": 33.72916793823242, "rewards/rejected": -12.833333015441895, "step": 150 }, { "epoch": 0.5780346820809249, "eval_logits/chosen": 4.148246765136719, "eval_logits/rejected": 4.175685882568359, "eval_logps/chosen": -2098.43896484375, "eval_logps/rejected": -1809.6585693359375, "eval_loss": 0.000809903081972152, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.569358825683594, "eval_rewards/margins": 32.91615676879883, "eval_rewards/rejected": -12.346036911010742, "eval_runtime": 348.2805, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.235, "step": 150 }, { "epoch": 0.581888246628131, "grad_norm": 0.0021192687563598156, "learning_rate": 4.866647289130112e-06, "logits/chosen": 4.109375, "logits/rejected": 4.140625, "logps/chosen": -2221.333251953125, "logps/rejected": -1949.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0625, "rewards/margins": 33.5625, "rewards/rejected": -13.520833015441895, "step": 151 }, { "epoch": 0.5857418111753372, "grad_norm": 0.0031571441795676947, "learning_rate": 4.863002731771957e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.341145992279053, "logps/chosen": -2116.0, "logps/rejected": -1844.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.79166603088379, "rewards/margins": 31.70833396911621, "rewards/rejected": -12.947916984558105, "step": 152 }, { "epoch": 0.5895953757225434, "grad_norm": 0.006539942696690559, "learning_rate": 4.8593104424957275e-06, "logits/chosen": 4.088541507720947, "logits/rejected": 4.1015625, "logps/chosen": -2056.0, "logps/rejected": -1778.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.71875, "rewards/margins": 30.5625, "rewards/rejected": -11.854166984558105, "step": 153 }, { "epoch": 0.5934489402697495, "grad_norm": 0.00046424902393482625, "learning_rate": 4.855570495884514e-06, "logits/chosen": 4.177083492279053, "logits/rejected": 4.296875, "logps/chosen": -2026.6666259765625, "logps/rejected": -1713.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.02083396911621, "rewards/margins": 32.41666793823242, "rewards/rejected": -11.416666984558105, "step": 154 }, { "epoch": 0.5973025048169557, "grad_norm": 0.004140717908740044, "learning_rate": 4.851782967484073e-06, "logits/chosen": 4.2890625, "logits/rejected": 4.309895992279053, "logps/chosen": -2253.333251953125, "logps/rejected": -1972.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 32.39583206176758, "rewards/rejected": -12.927083015441895, "step": 155 }, { "epoch": 0.6011560693641619, "grad_norm": 0.0004713100497610867, "learning_rate": 4.847947933801296e-06, "logits/chosen": 4.203125, "logits/rejected": 4.2890625, "logps/chosen": -2104.0, "logps/rejected": -1810.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.125, "rewards/margins": 34.04166793823242, "rewards/rejected": -12.916666984558105, "step": 156 }, { "epoch": 0.605009633911368, "grad_norm": 0.00046476826537400484, "learning_rate": 4.844065472302666e-06, "logits/chosen": 4.184895992279053, "logits/rejected": 4.2890625, "logps/chosen": -1880.0, "logps/rejected": -1605.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.60416603088379, "rewards/margins": 30.27083396911621, "rewards/rejected": -11.677083015441895, "step": 157 }, { "epoch": 0.6088631984585742, "grad_norm": 0.007054036017507315, "learning_rate": 4.840135661412696e-06, "logits/chosen": 4.171875, "logits/rejected": 4.244791507720947, "logps/chosen": -1950.6666259765625, "logps/rejected": -1672.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.46875, "rewards/margins": 31.64583396911621, "rewards/rejected": -12.177083015441895, "step": 158 }, { "epoch": 0.6127167630057804, "grad_norm": 0.0004586298018693924, "learning_rate": 4.836158580512339e-06, "logits/chosen": 4.130208492279053, "logits/rejected": 4.203125, "logps/chosen": -2098.666748046875, "logps/rejected": -1810.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.52083396911621, "rewards/margins": 32.10416793823242, "rewards/rejected": -12.541666984558105, "step": 159 }, { "epoch": 0.6165703275529865, "grad_norm": 0.0029204734601080418, "learning_rate": 4.832134309937388e-06, "logits/chosen": 4.096354007720947, "logits/rejected": 4.169270992279053, "logps/chosen": -1996.0, "logps/rejected": -1718.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.875, "rewards/margins": 32.27083206176758, "rewards/rejected": -12.4375, "step": 160 }, { "epoch": 0.6204238921001927, "grad_norm": 0.014253883622586727, "learning_rate": 4.8280629309768556e-06, "logits/chosen": 4.1484375, "logits/rejected": 4.182291507720947, "logps/chosen": -2096.0, "logps/rejected": -1816.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.35416603088379, "rewards/margins": 33.20833206176758, "rewards/rejected": -12.864583015441895, "step": 161 }, { "epoch": 0.6242774566473989, "grad_norm": 0.0008097507525235415, "learning_rate": 4.823944525871324e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.401041507720947, "logps/chosen": -2260.0, "logps/rejected": -1984.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.22916603088379, "rewards/margins": 33.16666793823242, "rewards/rejected": -12.947916984558105, "step": 162 }, { "epoch": 0.628131021194605, "grad_norm": 0.027163458988070488, "learning_rate": 4.819779177811294e-06, "logits/chosen": 4.2109375, "logits/rejected": 4.432291507720947, "logps/chosen": -1956.0, "logps/rejected": -1674.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.29166603088379, "rewards/margins": 31.16666603088379, "rewards/rejected": -11.854166984558105, "step": 163 }, { "epoch": 0.6319845857418112, "grad_norm": 0.008401485159993172, "learning_rate": 4.815566970935497e-06, "logits/chosen": 4.15625, "logits/rejected": 4.364583492279053, "logps/chosen": -2040.0, "logps/rejected": -1738.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0, "rewards/margins": 32.0625, "rewards/rejected": -12.03125, "step": 164 }, { "epoch": 0.6358381502890174, "grad_norm": 0.0007931821164675057, "learning_rate": 4.8113079903291955e-06, "logits/chosen": 4.1171875, "logits/rejected": 4.15625, "logps/chosen": -2164.0, "logps/rejected": -1892.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 32.33333206176758, "rewards/rejected": -12.822916984558105, "step": 165 }, { "epoch": 0.6396917148362236, "grad_norm": 0.005584372207522392, "learning_rate": 4.807002322022471e-06, "logits/chosen": 3.9635417461395264, "logits/rejected": 3.8645832538604736, "logps/chosen": -1992.0, "logps/rejected": -1709.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.20833396911621, "rewards/margins": 32.39583206176758, "rewards/rejected": -12.125, "step": 166 }, { "epoch": 0.6435452793834296, "grad_norm": 0.0011656780261546373, "learning_rate": 4.80265005298848e-06, "logits/chosen": 4.239583492279053, "logits/rejected": 4.489583492279053, "logps/chosen": -1893.3333740234375, "logps/rejected": -1590.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.22916603088379, "rewards/margins": 30.3125, "rewards/rejected": -12.072916984558105, "step": 167 }, { "epoch": 0.6473988439306358, "grad_norm": 0.0009605508530512452, "learning_rate": 4.7982512711416995e-06, "logits/chosen": 4.111979007720947, "logits/rejected": 4.1796875, "logps/chosen": -2052.0, "logps/rejected": -1781.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.27083396911621, "rewards/margins": 32.35416793823242, "rewards/rejected": -13.041666984558105, "step": 168 }, { "epoch": 0.651252408477842, "grad_norm": 0.000541891495231539, "learning_rate": 4.793806065336151e-06, "logits/chosen": 4.229166507720947, "logits/rejected": 4.2109375, "logps/chosen": -2317.333251953125, "logps/rejected": -2037.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.16666603088379, "rewards/margins": 34.70833206176758, "rewards/rejected": -13.5, "step": 169 }, { "epoch": 0.6551059730250481, "grad_norm": 0.014681258238852024, "learning_rate": 4.789314525363604e-06, "logits/chosen": 3.9869792461395264, "logits/rejected": 3.9947917461395264, "logps/chosen": -2080.0, "logps/rejected": -1794.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 32.64583206176758, "rewards/rejected": -12.53125, "step": 170 }, { "epoch": 0.6589595375722543, "grad_norm": 0.0013073825975880027, "learning_rate": 4.784776741951766e-06, "logits/chosen": 3.9869792461395264, "logits/rejected": 4.075520992279053, "logps/chosen": -1956.0, "logps/rejected": -1673.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.1875, "rewards/margins": 32.89583206176758, "rewards/rejected": -12.708333015441895, "step": 171 }, { "epoch": 0.6628131021194605, "grad_norm": 0.0005131351063027978, "learning_rate": 4.780192806762445e-06, "logits/chosen": 4.072916507720947, "logits/rejected": 4.013020992279053, "logps/chosen": -2378.666748046875, "logps/rejected": -2081.333251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.8125, "rewards/margins": 36.29166793823242, "rewards/rejected": -14.427083015441895, "step": 172 }, { "epoch": 0.6666666666666666, "grad_norm": 0.0006757262162864208, "learning_rate": 4.775562812389704e-06, "logits/chosen": 4.130208492279053, "logits/rejected": 4.161458492279053, "logps/chosen": -2181.333251953125, "logps/rejected": -1898.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.25, "rewards/margins": 35.5, "rewards/rejected": -13.208333015441895, "step": 173 }, { "epoch": 0.6705202312138728, "grad_norm": 0.0019099900964647532, "learning_rate": 4.770886852357983e-06, "logits/chosen": 4.0390625, "logits/rejected": 4.0859375, "logps/chosen": -2118.666748046875, "logps/rejected": -1816.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.91666603088379, "rewards/margins": 35.125, "rewards/rejected": -13.145833015441895, "step": 174 }, { "epoch": 0.674373795761079, "grad_norm": 0.0010234953369945288, "learning_rate": 4.766165021120217e-06, "logits/chosen": 4.158854007720947, "logits/rejected": 4.1875, "logps/chosen": -2004.0, "logps/rejected": -1706.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.70833396911621, "rewards/margins": 33.0625, "rewards/rejected": -12.34375, "step": 175 }, { "epoch": 0.6782273603082851, "grad_norm": 0.0011535166995599866, "learning_rate": 4.7613974140559245e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.4375, "logps/chosen": -1858.6666259765625, "logps/rejected": -1573.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.64583396911621, "rewards/margins": 31.6875, "rewards/rejected": -12.0, "step": 176 }, { "epoch": 0.6820809248554913, "grad_norm": 0.005667518824338913, "learning_rate": 4.75658412746928e-06, "logits/chosen": 4.052083492279053, "logits/rejected": 4.200520992279053, "logps/chosen": -1930.6666259765625, "logps/rejected": -1656.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.9375, "rewards/margins": 32.58333206176758, "rewards/rejected": -12.645833015441895, "step": 177 }, { "epoch": 0.6859344894026975, "grad_norm": 0.001189857255667448, "learning_rate": 4.751725258587172e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.395833492279053, "logps/chosen": -2240.0, "logps/rejected": -1929.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.0, "rewards/margins": 35.79166793823242, "rewards/rejected": -13.770833015441895, "step": 178 }, { "epoch": 0.6897880539499036, "grad_norm": 0.0123676722869277, "learning_rate": 4.746820905557236e-06, "logits/chosen": 3.984375, "logits/rejected": 4.018229007720947, "logps/chosen": -2058.666748046875, "logps/rejected": -1760.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.14583396911621, "rewards/margins": 33.91666793823242, "rewards/rejected": -12.78125, "step": 179 }, { "epoch": 0.6936416184971098, "grad_norm": 0.0005225472850725055, "learning_rate": 4.7418711674458735e-06, "logits/chosen": 4.1015625, "logits/rejected": 4.28125, "logps/chosen": -1814.6666259765625, "logps/rejected": -1552.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.0625, "rewards/margins": 31.39583396911621, "rewards/rejected": -12.333333015441895, "step": 180 }, { "epoch": 0.697495183044316, "grad_norm": 0.007103869691491127, "learning_rate": 4.7368761442362495e-06, "logits/chosen": 4.044270992279053, "logits/rejected": 4.143229007720947, "logps/chosen": -2098.666748046875, "logps/rejected": -1821.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.71875, "rewards/margins": 32.91666793823242, "rewards/rejected": -13.197916984558105, "step": 181 }, { "epoch": 0.7013487475915221, "grad_norm": 0.0005761756910942495, "learning_rate": 4.731835936826276e-06, "logits/chosen": 4.338541507720947, "logits/rejected": 4.598958492279053, "logps/chosen": -2006.6666259765625, "logps/rejected": -1717.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.90625, "rewards/margins": 33.4375, "rewards/rejected": -13.541666984558105, "step": 182 }, { "epoch": 0.7052023121387283, "grad_norm": 0.00032268877839669585, "learning_rate": 4.726750647026569e-06, "logits/chosen": 4.1875, "logits/rejected": 4.307291507720947, "logps/chosen": -1968.0, "logps/rejected": -1708.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.5, "rewards/margins": 31.39583396911621, "rewards/rejected": -12.90625, "step": 183 }, { "epoch": 0.7090558766859345, "grad_norm": 0.00048800164950080216, "learning_rate": 4.721620377558398e-06, "logits/chosen": 4.0859375, "logits/rejected": 4.171875, "logps/chosen": -1976.0, "logps/rejected": -1698.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.97916603088379, "rewards/margins": 32.33333206176758, "rewards/rejected": -12.395833015441895, "step": 184 }, { "epoch": 0.7129094412331407, "grad_norm": 0.02520829811692238, "learning_rate": 4.716445232051604e-06, "logits/chosen": 4.145833492279053, "logits/rejected": 4.364583492279053, "logps/chosen": -1992.0, "logps/rejected": -1674.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.52083396911621, "rewards/margins": 33.60416793823242, "rewards/rejected": -12.052083015441895, "step": 185 }, { "epoch": 0.7167630057803468, "grad_norm": 0.03613102063536644, "learning_rate": 4.711225315042513e-06, "logits/chosen": 4.231770992279053, "logits/rejected": 4.294270992279053, "logps/chosen": -2078.666748046875, "logps/rejected": -1800.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.41666603088379, "rewards/margins": 34.625, "rewards/rejected": -14.25, "step": 186 }, { "epoch": 0.720616570327553, "grad_norm": 0.0005551240174099803, "learning_rate": 4.705960731971821e-06, "logits/chosen": 4.109375, "logits/rejected": 4.114583492279053, "logps/chosen": -2046.6666259765625, "logps/rejected": -1785.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.96875, "rewards/margins": 33.14583206176758, "rewards/rejected": -13.104166984558105, "step": 187 }, { "epoch": 0.7244701348747592, "grad_norm": 0.0005413247854448855, "learning_rate": 4.700651589182461e-06, "logits/chosen": 4.1640625, "logits/rejected": 4.255208492279053, "logps/chosen": -1942.6666259765625, "logps/rejected": -1676.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.33333396911621, "rewards/margins": 32.125, "rewards/rejected": -12.84375, "step": 188 }, { "epoch": 0.7283236994219653, "grad_norm": 0.0005890597240068018, "learning_rate": 4.695297993917465e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.265625, "logps/chosen": -2222.666748046875, "logps/rejected": -1964.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.14583396911621, "rewards/margins": 33.91666793823242, "rewards/rejected": -11.741862297058105, "step": 189 }, { "epoch": 0.7321772639691715, "grad_norm": 0.03633887320756912, "learning_rate": 4.689900054317785e-06, "logits/chosen": 4.236979007720947, "logits/rejected": 4.307291507720947, "logps/chosen": -2109.333251953125, "logps/rejected": -1840.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.22916603088379, "rewards/margins": 34.20833206176758, "rewards/rejected": -13.96875, "step": 190 }, { "epoch": 0.7360308285163777, "grad_norm": 0.0006353289936669171, "learning_rate": 4.684457879420117e-06, "logits/chosen": 4.25, "logits/rejected": 4.268229007720947, "logps/chosen": -2264.0, "logps/rejected": -1988.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.35416603088379, "rewards/margins": 34.375, "rewards/rejected": -14.020833015441895, "step": 191 }, { "epoch": 0.7398843930635838, "grad_norm": 0.0008391065639443696, "learning_rate": 4.678971579154698e-06, "logits/chosen": 4.1015625, "logits/rejected": 4.122395992279053, "logps/chosen": -2237.333251953125, "logps/rejected": -1957.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5625, "rewards/margins": 34.625, "rewards/rejected": -14.052083015441895, "step": 192 }, { "epoch": 0.74373795761079, "grad_norm": 0.0006799135589972138, "learning_rate": 4.6734412643430795e-06, "logits/chosen": 4.221354007720947, "logits/rejected": 4.369791507720947, "logps/chosen": -2100.0, "logps/rejected": -1825.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.04166603088379, "rewards/margins": 32.97916793823242, "rewards/rejected": -13.875, "step": 193 }, { "epoch": 0.7475915221579962, "grad_norm": 0.0022318533156067133, "learning_rate": 4.6678670466958985e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.328125, "logps/chosen": -2280.0, "logps/rejected": -1982.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.16666603088379, "rewards/margins": 34.45833206176758, "rewards/rejected": -14.270833015441895, "step": 194 }, { "epoch": 0.7514450867052023, "grad_norm": 0.0007786553469486535, "learning_rate": 4.66224903881061e-06, "logits/chosen": 3.8932292461395264, "logits/rejected": 3.9114582538604736, "logps/chosen": -2006.6666259765625, "logps/rejected": -1726.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.16666603088379, "rewards/margins": 33.95833206176758, "rewards/rejected": -13.791666984558105, "step": 195 }, { "epoch": 0.7552986512524085, "grad_norm": 0.0006435930263251066, "learning_rate": 4.656587354169223e-06, "logits/chosen": 4.270833492279053, "logits/rejected": 4.447916507720947, "logps/chosen": -2038.6666259765625, "logps/rejected": -1744.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.42708396911621, "rewards/margins": 32.1875, "rewards/rejected": -13.6875, "step": 196 }, { "epoch": 0.7591522157996147, "grad_norm": 0.0002568648778833449, "learning_rate": 4.650882107136e-06, "logits/chosen": 4.161458492279053, "logits/rejected": 4.359375, "logps/chosen": -1989.3333740234375, "logps/rejected": -1725.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.125, "rewards/margins": 32.22916793823242, "rewards/rejected": -13.083333015441895, "step": 197 }, { "epoch": 0.7630057803468208, "grad_norm": 0.002467144513502717, "learning_rate": 4.64513341295515e-06, "logits/chosen": 4.1640625, "logits/rejected": 4.309895992279053, "logps/chosen": -2021.3333740234375, "logps/rejected": -1776.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.14583396911621, "rewards/margins": 31.60416603088379, "rewards/rejected": -13.40625, "step": 198 }, { "epoch": 0.766859344894027, "grad_norm": 0.0020542533602565527, "learning_rate": 4.639341387748506e-06, "logits/chosen": 4.265625, "logits/rejected": 4.244791507720947, "logps/chosen": -2100.0, "logps/rejected": -1844.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.625, "rewards/margins": 34.0, "rewards/rejected": -14.3125, "step": 199 }, { "epoch": 0.7707129094412332, "grad_norm": 0.00031060780747793615, "learning_rate": 4.633506148513167e-06, "logits/chosen": 4.072916507720947, "logits/rejected": 4.057291507720947, "logps/chosen": -2105.333251953125, "logps/rejected": -1773.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 23.625, "rewards/margins": 37.4375, "rewards/rejected": -13.8125, "step": 200 }, { "epoch": 0.7707129094412332, "eval_logits/chosen": 4.146151065826416, "eval_logits/rejected": 4.221227169036865, "eval_logps/chosen": -2100.58544921875, "eval_logps/rejected": -1823.4146728515625, "eval_loss": 0.0008077605743892491, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.33612823486328, "eval_rewards/margins": 34.022865295410156, "eval_rewards/rejected": -13.697408676147461, "eval_runtime": 348.2448, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.235, "step": 200 }, { "epoch": 0.7745664739884393, "grad_norm": 0.011130915954709053, "learning_rate": 4.627627813119147e-06, "logits/chosen": 4.177083492279053, "logits/rejected": 4.359375, "logps/chosen": -2012.0, "logps/rejected": -1713.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.5, "rewards/margins": 33.75, "rewards/rejected": -14.25, "step": 201 }, { "epoch": 0.7784200385356455, "grad_norm": 0.35845765471458435, "learning_rate": 4.621706500306987e-06, "logits/chosen": 4.125, "logits/rejected": 4.138020992279053, "logps/chosen": -2144.0, "logps/rejected": -1861.3333740234375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 19.82291603088379, "rewards/margins": 34.39583206176758, "rewards/rejected": -14.583333015441895, "step": 202 }, { "epoch": 0.7822736030828517, "grad_norm": 0.00043684348929673433, "learning_rate": 4.615742329685358e-06, "logits/chosen": 4.171875, "logits/rejected": 4.244791507720947, "logps/chosen": -2154.666748046875, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.60416603088379, "rewards/margins": 34.97916793823242, "rewards/rejected": -14.375, "step": 203 }, { "epoch": 0.7861271676300579, "grad_norm": 0.0012516637798398733, "learning_rate": 4.609735421728647e-06, "logits/chosen": 4.296875, "logits/rejected": 4.322916507720947, "logps/chosen": -2164.0, "logps/rejected": -1884.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.04166603088379, "rewards/margins": 35.375, "rewards/rejected": -14.322916984558105, "step": 204 }, { "epoch": 0.789980732177264, "grad_norm": 0.013026461005210876, "learning_rate": 4.6036858977745215e-06, "logits/chosen": 3.8802082538604736, "logits/rejected": 4.015625, "logps/chosen": -1798.6666259765625, "logps/rejected": -1534.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.10416603088379, "rewards/margins": 29.75, "rewards/rejected": -11.677083015441895, "step": 205 }, { "epoch": 0.7938342967244701, "grad_norm": 0.0014575921231880784, "learning_rate": 4.597593880021476e-06, "logits/chosen": 3.9661457538604736, "logits/rejected": 4.114583492279053, "logps/chosen": -1836.0, "logps/rejected": -1562.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.09375, "rewards/margins": 32.10416793823242, "rewards/rejected": -13.020833015441895, "step": 206 }, { "epoch": 0.7976878612716763, "grad_norm": 0.0010703422594815493, "learning_rate": 4.591459491526371e-06, "logits/chosen": 4.0390625, "logits/rejected": 4.057291507720947, "logps/chosen": -2026.6666259765625, "logps/rejected": -1760.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.0, "rewards/margins": 35.14583206176758, "rewards/rejected": -14.104166984558105, "step": 207 }, { "epoch": 0.8015414258188824, "grad_norm": 0.0025984381791204214, "learning_rate": 4.58528285620194e-06, "logits/chosen": 4.177083492279053, "logits/rejected": 4.276041507720947, "logps/chosen": -2000.0, "logps/rejected": -1722.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.08333396911621, "rewards/margins": 33.85416793823242, "rewards/rejected": -13.71875, "step": 208 }, { "epoch": 0.8053949903660886, "grad_norm": 0.2219504415988922, "learning_rate": 4.579064098814289e-06, "logits/chosen": 4.0078125, "logits/rejected": 4.0703125, "logps/chosen": -2044.0, "logps/rejected": -1789.3333740234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 20.375, "rewards/margins": 34.5, "rewards/rejected": -14.177083015441895, "step": 209 }, { "epoch": 0.8092485549132948, "grad_norm": 0.029551025480031967, "learning_rate": 4.572803344980378e-06, "logits/chosen": 4.25, "logits/rejected": 4.330729007720947, "logps/chosen": -2088.0, "logps/rejected": -1814.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.33333396911621, "rewards/margins": 35.27083206176758, "rewards/rejected": -14.9375, "step": 210 }, { "epoch": 0.8131021194605009, "grad_norm": 0.0008567037293687463, "learning_rate": 4.566500721165482e-06, "logits/chosen": 4.263020992279053, "logits/rejected": 4.479166507720947, "logps/chosen": -1988.0, "logps/rejected": -1726.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.91666603088379, "rewards/margins": 34.35416793823242, "rewards/rejected": -14.385416984558105, "step": 211 }, { "epoch": 0.8169556840077071, "grad_norm": 0.0007925010868348181, "learning_rate": 4.560156354680636e-06, "logits/chosen": 4.026041507720947, "logits/rejected": 4.0859375, "logps/chosen": -2013.3333740234375, "logps/rejected": -1712.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.95833396911621, "rewards/margins": 33.75, "rewards/rejected": -13.864583015441895, "step": 212 }, { "epoch": 0.8208092485549133, "grad_norm": 0.0006680373917333782, "learning_rate": 4.553770373680062e-06, "logits/chosen": 4.239583492279053, "logits/rejected": 4.453125, "logps/chosen": -2045.3333740234375, "logps/rejected": -1753.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.72916603088379, "rewards/margins": 34.0, "rewards/rejected": -13.21875, "step": 213 }, { "epoch": 0.8246628131021194, "grad_norm": 0.005400534253567457, "learning_rate": 4.547342907158587e-06, "logits/chosen": 4.21875, "logits/rejected": 4.286458492279053, "logps/chosen": -2185.333251953125, "logps/rejected": -1917.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.95833396911621, "rewards/margins": 35.75, "rewards/rejected": -14.822916984558105, "step": 214 }, { "epoch": 0.8285163776493256, "grad_norm": 0.0009045708575285971, "learning_rate": 4.540874084949027e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.453125, "logps/chosen": -2144.0, "logps/rejected": -1874.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.26041603088379, "rewards/margins": 35.58333206176758, "rewards/rejected": -15.385416984558105, "step": 215 }, { "epoch": 0.8323699421965318, "grad_norm": 0.0003154251317027956, "learning_rate": 4.5343640377195766e-06, "logits/chosen": 4.223958492279053, "logits/rejected": 4.453125, "logps/chosen": -1870.6666259765625, "logps/rejected": -1593.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.375, "rewards/margins": 34.14583206176758, "rewards/rejected": -13.71875, "step": 216 }, { "epoch": 0.8362235067437379, "grad_norm": 0.002044543856754899, "learning_rate": 4.527812896971154e-06, "logits/chosen": 4.309895992279053, "logits/rejected": 4.458333492279053, "logps/chosen": -2149.333251953125, "logps/rejected": -1881.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 33.95833206176758, "rewards/rejected": -14.4375, "step": 217 }, { "epoch": 0.8400770712909441, "grad_norm": 0.0003632612933870405, "learning_rate": 4.521220795034763e-06, "logits/chosen": 4.213541507720947, "logits/rejected": 4.2890625, "logps/chosen": -2073.333251953125, "logps/rejected": -1825.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.04166603088379, "rewards/margins": 35.08333206176758, "rewards/rejected": -15.03125, "step": 218 }, { "epoch": 0.8439306358381503, "grad_norm": 0.0004218002432025969, "learning_rate": 4.514587865068806e-06, "logits/chosen": 4.216145992279053, "logits/rejected": 4.364583492279053, "logps/chosen": -2161.333251953125, "logps/rejected": -1906.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 35.04166793823242, "rewards/rejected": -15.208333015441895, "step": 219 }, { "epoch": 0.8477842003853564, "grad_norm": 0.0008845488191582263, "learning_rate": 4.507914241056396e-06, "logits/chosen": 4.158854007720947, "logits/rejected": 4.359375, "logps/chosen": -2049.333251953125, "logps/rejected": -1776.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.72916603088379, "rewards/margins": 35.97916793823242, "rewards/rejected": -14.260416984558105, "step": 220 }, { "epoch": 0.8516377649325626, "grad_norm": 0.0007526836125180125, "learning_rate": 4.501200057802659e-06, "logits/chosen": 4.328125, "logits/rejected": 4.572916507720947, "logps/chosen": -2122.666748046875, "logps/rejected": -1870.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.78125, "rewards/margins": 34.0, "rewards/rejected": -15.260416984558105, "step": 221 }, { "epoch": 0.8554913294797688, "grad_norm": 0.012577052228152752, "learning_rate": 4.494445450932003e-06, "logits/chosen": 4.057291507720947, "logits/rejected": 4.122395992279053, "logps/chosen": -1990.6666259765625, "logps/rejected": -1706.6666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 23.08333396911621, "rewards/margins": 37.0, "rewards/rejected": -13.958333015441895, "step": 222 }, { "epoch": 0.859344894026975, "grad_norm": 0.056867413222789764, "learning_rate": 4.487650556885378e-06, "logits/chosen": 4.388020992279053, "logits/rejected": 4.606770992279053, "logps/chosen": -2128.0, "logps/rejected": -1860.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.48958396911621, "rewards/margins": 35.79166793823242, "rewards/rejected": -15.270833015441895, "step": 223 }, { "epoch": 0.8631984585741811, "grad_norm": 0.014765867963433266, "learning_rate": 4.480815512917525e-06, "logits/chosen": 4.317708492279053, "logits/rejected": 4.520833492279053, "logps/chosen": -2206.666748046875, "logps/rejected": -1946.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.91666603088379, "rewards/margins": 35.60416793823242, "rewards/rejected": -14.708333015441895, "step": 224 }, { "epoch": 0.8670520231213873, "grad_norm": 0.0007118711364455521, "learning_rate": 4.473940457094199e-06, "logits/chosen": 4.265625, "logits/rejected": 4.427083492279053, "logps/chosen": -2089.333251953125, "logps/rejected": -1805.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.64583396911621, "rewards/margins": 34.125, "rewards/rejected": -14.46875, "step": 225 }, { "epoch": 0.8709055876685935, "grad_norm": 0.0013994709588587284, "learning_rate": 4.467025528289384e-06, "logits/chosen": 4.080729007720947, "logits/rejected": 4.3046875, "logps/chosen": -1926.6666259765625, "logps/rejected": -1668.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 32.75, "rewards/rejected": -13.552083015441895, "step": 226 }, { "epoch": 0.8747591522157996, "grad_norm": 0.013033472001552582, "learning_rate": 4.4600708661824855e-06, "logits/chosen": 4.390625, "logits/rejected": 4.604166507720947, "logps/chosen": -1932.0, "logps/rejected": -1682.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.58333396911621, "rewards/margins": 32.47916793823242, "rewards/rejected": -14.833333015441895, "step": 227 }, { "epoch": 0.8786127167630058, "grad_norm": 0.028947103768587112, "learning_rate": 4.453076611255507e-06, "logits/chosen": 4.130208492279053, "logits/rejected": 4.291666507720947, "logps/chosen": -2120.0, "logps/rejected": -1832.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.75, "rewards/margins": 36.0625, "rewards/rejected": -15.239583015441895, "step": 228 }, { "epoch": 0.882466281310212, "grad_norm": 0.0005405103438533843, "learning_rate": 4.4460429047902174e-06, "logits/chosen": 4.28125, "logits/rejected": 4.34375, "logps/chosen": -2058.666748046875, "logps/rejected": -1816.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.41666603088379, "rewards/margins": 35.14583206176758, "rewards/rejected": -15.75, "step": 229 }, { "epoch": 0.8863198458574181, "grad_norm": 0.0005442643305286765, "learning_rate": 4.438969888865293e-06, "logits/chosen": 4.25, "logits/rejected": 4.408854007720947, "logps/chosen": -2152.0, "logps/rejected": -1893.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.10416603088379, "rewards/margins": 37.375, "rewards/rejected": -16.29166603088379, "step": 230 }, { "epoch": 0.8901734104046243, "grad_norm": 0.001009560888633132, "learning_rate": 4.431857706353449e-06, "logits/chosen": 4.447916507720947, "logits/rejected": 4.692708492279053, "logps/chosen": -2194.666748046875, "logps/rejected": -1965.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.30208396911621, "rewards/margins": 34.0, "rewards/rejected": -15.729166984558105, "step": 231 }, { "epoch": 0.8940269749518305, "grad_norm": 0.00028289612964726985, "learning_rate": 4.424706500918553e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.427083492279053, "logps/chosen": -2210.666748046875, "logps/rejected": -1914.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.3125, "rewards/margins": 37.66666793823242, "rewards/rejected": -15.34375, "step": 232 }, { "epoch": 0.8978805394990366, "grad_norm": 0.00025388237554579973, "learning_rate": 4.417516417012725e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.541666507720947, "logps/chosen": -2150.666748046875, "logps/rejected": -1889.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.19791603088379, "rewards/margins": 34.41666793823242, "rewards/rejected": -15.197916984558105, "step": 233 }, { "epoch": 0.9017341040462428, "grad_norm": 0.0002300368796568364, "learning_rate": 4.4102875998734176e-06, "logits/chosen": 4.34375, "logits/rejected": 4.536458492279053, "logps/chosen": -2252.0, "logps/rejected": -1997.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.33333396911621, "rewards/margins": 37.91666793823242, "rewards/rejected": -16.59375, "step": 234 }, { "epoch": 0.905587668593449, "grad_norm": 0.00038044314715079963, "learning_rate": 4.403020195520481e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.46875, "logps/chosen": -2017.3333740234375, "logps/rejected": -1733.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.75, "rewards/margins": 35.1875, "rewards/rejected": -14.40625, "step": 235 }, { "epoch": 0.9094412331406551, "grad_norm": 0.00011744195217033848, "learning_rate": 4.395714350753216e-06, "logits/chosen": 4.390625, "logits/rejected": 4.609375, "logps/chosen": -2065.333251953125, "logps/rejected": -1817.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.01041603088379, "rewards/margins": 34.91666793823242, "rewards/rejected": -15.927083015441895, "step": 236 }, { "epoch": 0.9132947976878613, "grad_norm": 0.0004573737387545407, "learning_rate": 4.388370213147409e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.5625, "logps/chosen": -1921.3333740234375, "logps/rejected": -1677.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.80208396911621, "rewards/margins": 33.47916793823242, "rewards/rejected": -14.666666984558105, "step": 237 }, { "epoch": 0.9171483622350675, "grad_norm": 0.0009191853459924459, "learning_rate": 4.380987931052347e-06, "logits/chosen": 4.25, "logits/rejected": 4.515625, "logps/chosen": -2081.333251953125, "logps/rejected": -1810.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.77083396911621, "rewards/margins": 37.16666793823242, "rewards/rejected": -15.364583015441895, "step": 238 }, { "epoch": 0.9210019267822736, "grad_norm": 0.003251289715990424, "learning_rate": 4.373567653587828e-06, "logits/chosen": 4.122395992279053, "logits/rejected": 4.359375, "logps/chosen": -1804.0, "logps/rejected": -1552.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.54166603088379, "rewards/margins": 33.54166793823242, "rewards/rejected": -14.0, "step": 239 }, { "epoch": 0.9248554913294798, "grad_norm": 8.846312994137406e-05, "learning_rate": 4.36610953064114e-06, "logits/chosen": 4.421875, "logits/rejected": 4.703125, "logps/chosen": -2121.333251953125, "logps/rejected": -1846.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0625, "rewards/margins": 35.3125, "rewards/rejected": -15.25, "step": 240 }, { "epoch": 0.928709055876686, "grad_norm": 0.0004799103771802038, "learning_rate": 4.35861371286404e-06, "logits/chosen": 4.223958492279053, "logits/rejected": 4.471354007720947, "logps/chosen": -1905.3333740234375, "logps/rejected": -1637.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.95833396911621, "rewards/margins": 34.20833206176758, "rewards/rejected": -15.260416984558105, "step": 241 }, { "epoch": 0.9325626204238922, "grad_norm": 0.0012691016308963299, "learning_rate": 4.351080351669711e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.484375, "logps/chosen": -2149.333251953125, "logps/rejected": -1882.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.64583396911621, "rewards/margins": 35.64583206176758, "rewards/rejected": -15.958333015441895, "step": 242 }, { "epoch": 0.9364161849710982, "grad_norm": 0.00014443202235270292, "learning_rate": 4.343509599229697e-06, "logits/chosen": 4.184895992279053, "logits/rejected": 4.236979007720947, "logps/chosen": -2160.0, "logps/rejected": -1946.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.20833396911621, "rewards/margins": 34.97916793823242, "rewards/rejected": -15.677083015441895, "step": 243 }, { "epoch": 0.9402697495183044, "grad_norm": 0.0009293487528339028, "learning_rate": 4.335901608470837e-06, "logits/chosen": 4.205729007720947, "logits/rejected": 4.364583492279053, "logps/chosen": -1980.0, "logps/rejected": -1746.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.22916603088379, "rewards/margins": 34.22916793823242, "rewards/rejected": -15.010416984558105, "step": 244 }, { "epoch": 0.9441233140655106, "grad_norm": 0.00019258857355453074, "learning_rate": 4.328256533072171e-06, "logits/chosen": 4.244791507720947, "logits/rejected": 4.479166507720947, "logps/chosen": -2061.333251953125, "logps/rejected": -1814.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.35416603088379, "rewards/margins": 34.77083206176758, "rewards/rejected": -15.395833015441895, "step": 245 }, { "epoch": 0.9479768786127167, "grad_norm": 0.00040573260048404336, "learning_rate": 4.3205745274618365e-06, "logits/chosen": 4.283854007720947, "logits/rejected": 4.489583492279053, "logps/chosen": -2078.666748046875, "logps/rejected": -1800.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.33333396911621, "rewards/margins": 35.0625, "rewards/rejected": -14.6875, "step": 246 }, { "epoch": 0.9518304431599229, "grad_norm": 0.0001072615195880644, "learning_rate": 4.312855746813951e-06, "logits/chosen": 4.135416507720947, "logits/rejected": 4.333333492279053, "logps/chosen": -1953.3333740234375, "logps/rejected": -1689.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.08333396911621, "rewards/margins": 35.95833206176758, "rewards/rejected": -14.90625, "step": 247 }, { "epoch": 0.9556840077071291, "grad_norm": 0.000267416617134586, "learning_rate": 4.305100347045476e-06, "logits/chosen": 4.296875, "logits/rejected": 4.505208492279053, "logps/chosen": -2069.333251953125, "logps/rejected": -1805.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.28125, "rewards/margins": 36.02083206176758, "rewards/rejected": -15.75, "step": 248 }, { "epoch": 0.9595375722543352, "grad_norm": 0.00014109206676948816, "learning_rate": 4.297308484813067e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.614583492279053, "logps/chosen": -1976.0, "logps/rejected": -1712.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.9375, "rewards/margins": 34.22916793823242, "rewards/rejected": -15.291666984558105, "step": 249 }, { "epoch": 0.9633911368015414, "grad_norm": 0.0015265446854755282, "learning_rate": 4.289480317509911e-06, "logits/chosen": 4.25, "logits/rejected": 4.424479007720947, "logps/chosen": -2109.333251953125, "logps/rejected": -1833.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.33333396911621, "rewards/margins": 36.125, "rewards/rejected": -15.739583015441895, "step": 250 }, { "epoch": 0.9633911368015414, "eval_logits/chosen": 4.28715705871582, "eval_logits/rejected": 4.495998382568359, "eval_logps/chosen": -2100.8779296875, "eval_logps/rejected": -1841.8536376953125, "eval_loss": 0.0007315205875784159, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.260671615600586, "eval_rewards/margins": 35.86433029174805, "eval_rewards/rejected": -15.60442066192627, "eval_runtime": 348.3437, "eval_samples_per_second": 5.647, "eval_steps_per_second": 0.235, "step": 250 }, { "epoch": 0.9672447013487476, "grad_norm": 8.86472116690129e-05, "learning_rate": 4.281616003262547e-06, "logits/chosen": 4.21875, "logits/rejected": 4.247395992279053, "logps/chosen": -2134.666748046875, "logps/rejected": -1862.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.25, "rewards/margins": 37.875, "rewards/rejected": -15.625, "step": 251 }, { "epoch": 0.9710982658959537, "grad_norm": 0.004651631228625774, "learning_rate": 4.273715700927666e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.546875, "logps/chosen": -2173.333251953125, "logps/rejected": -1925.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.88541603088379, "rewards/margins": 34.83333206176758, "rewards/rejected": -15.989583015441895, "step": 252 }, { "epoch": 0.9749518304431599, "grad_norm": 0.0007221353007480502, "learning_rate": 4.265779570088914e-06, "logits/chosen": 4.390625, "logits/rejected": 4.65625, "logps/chosen": -2029.3333740234375, "logps/rejected": -1766.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.60416603088379, "rewards/margins": 35.29166793823242, "rewards/rejected": -15.697916984558105, "step": 253 }, { "epoch": 0.9788053949903661, "grad_norm": 0.0018224004888907075, "learning_rate": 4.257807771053658e-06, "logits/chosen": 4.171875, "logits/rejected": 4.302083492279053, "logps/chosen": -2194.666748046875, "logps/rejected": -1938.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.47916603088379, "rewards/margins": 36.54166793823242, "rewards/rejected": -16.03125, "step": 254 }, { "epoch": 0.9826589595375722, "grad_norm": 0.00047694490058347583, "learning_rate": 4.249800464849751e-06, "logits/chosen": 4.223958492279053, "logits/rejected": 4.453125, "logps/chosen": -1994.6666259765625, "logps/rejected": -1730.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.26041603088379, "rewards/margins": 34.52083206176758, "rewards/rejected": -15.239583015441895, "step": 255 }, { "epoch": 0.9865125240847784, "grad_norm": 9.856942051555961e-05, "learning_rate": 4.24175781322228e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.6328125, "logps/chosen": -1948.0, "logps/rejected": -1688.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.95833396911621, "rewards/margins": 35.70833206176758, "rewards/rejected": -15.75, "step": 256 }, { "epoch": 0.9903660886319846, "grad_norm": 0.02929985150694847, "learning_rate": 4.2336799786303e-06, "logits/chosen": 4.432291507720947, "logits/rejected": 4.713541507720947, "logps/chosen": -2192.0, "logps/rejected": -1908.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.70833396911621, "rewards/margins": 35.85416793823242, "rewards/rejected": -16.17708396911621, "step": 257 }, { "epoch": 0.9942196531791907, "grad_norm": 0.0001623107527848333, "learning_rate": 4.22556712424355e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.645833492279053, "logps/chosen": -2168.0, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.0625, "rewards/margins": 36.08333206176758, "rewards/rejected": -15.010416984558105, "step": 258 }, { "epoch": 0.9980732177263969, "grad_norm": 9.923698962666094e-05, "learning_rate": 4.217419413939158e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.588541507720947, "logps/chosen": -2029.3333740234375, "logps/rejected": -1774.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.16666603088379, "rewards/margins": 35.89583206176758, "rewards/rejected": -15.666666984558105, "step": 259 }, { "epoch": 1.0, "grad_norm": 9.923698962666094e-05, "learning_rate": 4.2092370122983295e-06, "logits/chosen": 4.46875, "logits/rejected": 4.739583492279053, "logps/chosen": -1928.0, "logps/rejected": -1664.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.25, "rewards/margins": 35.54166793823242, "rewards/rejected": -15.3125, "step": 260 }, { "epoch": 1.0038535645472062, "grad_norm": 0.0002012676268350333, "learning_rate": 4.201020084603027e-06, "logits/chosen": 4.458333492279053, "logits/rejected": 4.729166507720947, "logps/chosen": -2196.0, "logps/rejected": -1961.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.17708396911621, "rewards/margins": 34.77083206176758, "rewards/rejected": -16.60416603088379, "step": 261 }, { "epoch": 1.0077071290944124, "grad_norm": 0.00042612780816853046, "learning_rate": 4.192768796832625e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.5, "logps/chosen": -2126.666748046875, "logps/rejected": -1877.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.85416603088379, "rewards/margins": 37.27083206176758, "rewards/rejected": -16.41666603088379, "step": 262 }, { "epoch": 1.0115606936416186, "grad_norm": 0.0005720489425584674, "learning_rate": 4.184483315660565e-06, "logits/chosen": 4.463541507720947, "logits/rejected": 4.755208492279053, "logps/chosen": -2145.333251953125, "logps/rejected": -1901.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.84375, "rewards/margins": 35.16666793823242, "rewards/rejected": -16.35416603088379, "step": 263 }, { "epoch": 1.0154142581888246, "grad_norm": 0.0006988844252191484, "learning_rate": 4.176163808450978e-06, "logits/chosen": 4.234375, "logits/rejected": 4.5, "logps/chosen": -1944.0, "logps/rejected": -1680.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.04166603088379, "rewards/margins": 35.125, "rewards/rejected": -15.083333015441895, "step": 264 }, { "epoch": 1.0192678227360308, "grad_norm": 0.00013966260303277522, "learning_rate": 4.167810443255319e-06, "logits/chosen": 4.416666507720947, "logits/rejected": 4.65625, "logps/chosen": -2176.0, "logps/rejected": -1904.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.85416603088379, "rewards/margins": 37.0625, "rewards/rejected": -16.19791603088379, "step": 265 }, { "epoch": 1.023121387283237, "grad_norm": 0.026800798252224922, "learning_rate": 4.159423388808956e-06, "logits/chosen": 4.309895992279053, "logits/rejected": 4.536458492279053, "logps/chosen": -2174.666748046875, "logps/rejected": -1929.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.375, "rewards/margins": 35.54166793823242, "rewards/rejected": -16.19791603088379, "step": 266 }, { "epoch": 1.0269749518304432, "grad_norm": 0.001391033991239965, "learning_rate": 4.151002814527774e-06, "logits/chosen": 4.447916507720947, "logits/rejected": 4.651041507720947, "logps/chosen": -2061.333251953125, "logps/rejected": -1822.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.4375, "rewards/margins": 35.75, "rewards/rejected": -16.3125, "step": 267 }, { "epoch": 1.0308285163776494, "grad_norm": 0.0007444653892889619, "learning_rate": 4.1425488905047485e-06, "logits/chosen": 4.255208492279053, "logits/rejected": 4.546875, "logps/chosen": -1998.6666259765625, "logps/rejected": -1726.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.97916603088379, "rewards/margins": 35.875, "rewards/rejected": -15.916666984558105, "step": 268 }, { "epoch": 1.0346820809248556, "grad_norm": 9.462093294132501e-05, "learning_rate": 4.134061787506504e-06, "logits/chosen": 4.432291507720947, "logits/rejected": 4.734375, "logps/chosen": -1973.3333740234375, "logps/rejected": -1720.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.0, "rewards/margins": 34.41666793823242, "rewards/rejected": -15.427083015441895, "step": 269 }, { "epoch": 1.0385356454720616, "grad_norm": 0.000779222697019577, "learning_rate": 4.125541676969876e-06, "logits/chosen": 4.34375, "logits/rejected": 4.682291507720947, "logps/chosen": -1948.0, "logps/rejected": -1670.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.78125, "rewards/margins": 34.10416793823242, "rewards/rejected": -15.270833015441895, "step": 270 }, { "epoch": 1.0423892100192678, "grad_norm": 0.0011096380185335875, "learning_rate": 4.116988730998439e-06, "logits/chosen": 4.479166507720947, "logits/rejected": 4.765625, "logps/chosen": -1994.6666259765625, "logps/rejected": -1769.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.125, "rewards/margins": 34.60416793823242, "rewards/rejected": -15.489583015441895, "step": 271 }, { "epoch": 1.046242774566474, "grad_norm": 0.028453350067138672, "learning_rate": 4.108403122359034e-06, "logits/chosen": 4.1875, "logits/rejected": 4.2890625, "logps/chosen": -1950.6666259765625, "logps/rejected": -1720.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.34375, "rewards/margins": 34.6875, "rewards/rejected": -15.322916984558105, "step": 272 }, { "epoch": 1.0500963391136802, "grad_norm": 0.0014503924176096916, "learning_rate": 4.099785024478276e-06, "logits/chosen": 4.25, "logits/rejected": 4.440104007720947, "logps/chosen": -1994.6666259765625, "logps/rejected": -1726.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.01041603088379, "rewards/margins": 34.6875, "rewards/rejected": -15.677083015441895, "step": 273 }, { "epoch": 1.0539499036608864, "grad_norm": 0.031464193016290665, "learning_rate": 4.091134611439056e-06, "logits/chosen": 4.203125, "logits/rejected": 4.510416507720947, "logps/chosen": -1961.3333740234375, "logps/rejected": -1721.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 16.75, "rewards/margins": 31.6875, "rewards/rejected": -14.947916984558105, "step": 274 }, { "epoch": 1.0578034682080926, "grad_norm": 0.0009349633473902941, "learning_rate": 4.082452057977018e-06, "logits/chosen": 4.270833492279053, "logits/rejected": 4.442708492279053, "logps/chosen": -2070.666748046875, "logps/rejected": -1818.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.6875, "rewards/margins": 36.0, "rewards/rejected": -16.3125, "step": 275 }, { "epoch": 1.0616570327552985, "grad_norm": 0.00042548601049929857, "learning_rate": 4.073737539477033e-06, "logits/chosen": 4.520833492279053, "logits/rejected": 4.859375, "logps/chosen": -2201.333251953125, "logps/rejected": -1933.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.20833396911621, "rewards/margins": 36.16666793823242, "rewards/rejected": -15.989583015441895, "step": 276 }, { "epoch": 1.0655105973025047, "grad_norm": 0.0002301628264831379, "learning_rate": 4.064991231969656e-06, "logits/chosen": 4.416666507720947, "logits/rejected": 4.760416507720947, "logps/chosen": -1965.3333740234375, "logps/rejected": -1704.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.39583396911621, "rewards/margins": 35.875, "rewards/rejected": -16.48958396911621, "step": 277 }, { "epoch": 1.069364161849711, "grad_norm": 0.0007679308182559907, "learning_rate": 4.056213312127573e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.338541507720947, "logps/chosen": -2309.333251953125, "logps/rejected": -2058.666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.88541603088379, "rewards/margins": 38.89583206176758, "rewards/rejected": -16.98958396911621, "step": 278 }, { "epoch": 1.0732177263969171, "grad_norm": 0.0013241744600236416, "learning_rate": 4.047403957262024e-06, "logits/chosen": 4.252604007720947, "logits/rejected": 4.442708492279053, "logps/chosen": -2092.0, "logps/rejected": -1826.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.16666603088379, "rewards/margins": 39.22916793823242, "rewards/rejected": -17.09375, "step": 279 }, { "epoch": 1.0770712909441233, "grad_norm": 0.0005100194830447435, "learning_rate": 4.03856334531923e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.526041507720947, "logps/chosen": -2117.333251953125, "logps/rejected": -1853.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 22.35416603088379, "rewards/margins": 39.20833206176758, "rewards/rejected": -16.89583396911621, "step": 280 }, { "epoch": 1.0809248554913296, "grad_norm": 0.04262573644518852, "learning_rate": 4.029691654876794e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.651041507720947, "logps/chosen": -1937.3333740234375, "logps/rejected": -1693.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 34.27083206176758, "rewards/rejected": -15.0625, "step": 281 }, { "epoch": 1.0847784200385355, "grad_norm": 0.0001316546549787745, "learning_rate": 4.020789065140097e-06, "logits/chosen": 4.377604007720947, "logits/rejected": 4.700520992279053, "logps/chosen": -2161.333251953125, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.60416603088379, "rewards/margins": 36.125, "rewards/rejected": -15.5625, "step": 282 }, { "epoch": 1.0886319845857417, "grad_norm": 0.0008041548426263034, "learning_rate": 4.011855755938674e-06, "logits/chosen": 4.169270992279053, "logits/rejected": 4.401041507720947, "logps/chosen": -2009.3333740234375, "logps/rejected": -1754.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5625, "rewards/margins": 34.97916793823242, "rewards/rejected": -15.375, "step": 283 }, { "epoch": 1.092485549132948, "grad_norm": 0.0009880390716716647, "learning_rate": 4.002891907722584e-06, "logits/chosen": 4.2421875, "logits/rejected": 4.4609375, "logps/chosen": -1960.0, "logps/rejected": -1720.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 34.83333206176758, "rewards/rejected": -15.625, "step": 284 }, { "epoch": 1.0963391136801541, "grad_norm": 0.01791134849190712, "learning_rate": 3.993897701558764e-06, "logits/chosen": 4.453125, "logits/rejected": 4.848958492279053, "logps/chosen": -1926.6666259765625, "logps/rejected": -1665.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 17.91666603088379, "rewards/margins": 33.77083206176758, "rewards/rejected": -15.854166984558105, "step": 285 }, { "epoch": 1.1001926782273603, "grad_norm": 0.0007872717105783522, "learning_rate": 3.984873319127375e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.609375, "logps/chosen": -2126.666748046875, "logps/rejected": -1864.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0625, "rewards/margins": 37.41666793823242, "rewards/rejected": -17.3125, "step": 286 }, { "epoch": 1.1040462427745665, "grad_norm": 0.001516925753094256, "learning_rate": 3.975818942718125e-06, "logits/chosen": 4.317708492279053, "logits/rejected": 4.661458492279053, "logps/chosen": -1900.0, "logps/rejected": -1657.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.625, "rewards/margins": 33.5, "rewards/rejected": -14.885416984558105, "step": 287 }, { "epoch": 1.1078998073217727, "grad_norm": 0.0005156663828529418, "learning_rate": 3.9667347552265945e-06, "logits/chosen": 4.143229007720947, "logits/rejected": 4.1875, "logps/chosen": -2168.0, "logps/rejected": -1884.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.4375, "rewards/margins": 38.66666793823242, "rewards/rejected": -16.29166603088379, "step": 288 }, { "epoch": 1.1117533718689787, "grad_norm": 0.0008751638815738261, "learning_rate": 3.957620940150537e-06, "logits/chosen": 4.247395992279053, "logits/rejected": 4.473958492279053, "logps/chosen": -2025.3333740234375, "logps/rejected": -1762.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.72916603088379, "rewards/margins": 37.95833206176758, "rewards/rejected": -16.21875, "step": 289 }, { "epoch": 1.115606936416185, "grad_norm": 0.00016570820298511535, "learning_rate": 3.948477681586173e-06, "logits/chosen": 4.21875, "logits/rejected": 4.494791507720947, "logps/chosen": -1797.3333740234375, "logps/rejected": -1553.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.69791603088379, "rewards/margins": 32.41666793823242, "rewards/rejected": -13.708333015441895, "step": 290 }, { "epoch": 1.1194605009633911, "grad_norm": 0.0238595400005579, "learning_rate": 3.939305164224474e-06, "logits/chosen": 4.252604007720947, "logits/rejected": 4.401041507720947, "logps/chosen": -2209.333251953125, "logps/rejected": -1957.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.25, "rewards/margins": 38.20833206176758, "rewards/rejected": -16.91666603088379, "step": 291 }, { "epoch": 1.1233140655105973, "grad_norm": 0.0004754903493449092, "learning_rate": 3.93010357334743e-06, "logits/chosen": 4.2421875, "logits/rejected": 4.40625, "logps/chosen": -2193.333251953125, "logps/rejected": -1912.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.4375, "rewards/margins": 39.54166793823242, "rewards/rejected": -17.11458396911621, "step": 292 }, { "epoch": 1.1271676300578035, "grad_norm": 0.00018809924949891865, "learning_rate": 3.920873094824305e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.622395992279053, "logps/chosen": -2228.0, "logps/rejected": -1962.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.52083396911621, "rewards/margins": 37.45833206176758, "rewards/rejected": -16.92708396911621, "step": 293 }, { "epoch": 1.1310211946050097, "grad_norm": 0.0012840500567108393, "learning_rate": 3.911613915107888e-06, "logits/chosen": 4.1875, "logits/rejected": 4.393229007720947, "logps/chosen": -2029.3333740234375, "logps/rejected": -1794.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.02083396911621, "rewards/margins": 35.6875, "rewards/rejected": -15.708333015441895, "step": 294 }, { "epoch": 1.1348747591522157, "grad_norm": 0.00034955269074998796, "learning_rate": 3.902326221230719e-06, "logits/chosen": 4.325520992279053, "logits/rejected": 4.6875, "logps/chosen": -1949.3333740234375, "logps/rejected": -1704.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.33333396911621, "rewards/margins": 34.64583206176758, "rewards/rejected": -15.333333015441895, "step": 295 }, { "epoch": 1.138728323699422, "grad_norm": 0.0007619110401719809, "learning_rate": 3.893010200801319e-06, "logits/chosen": 4.119791507720947, "logits/rejected": 4.4375, "logps/chosen": -2001.3333740234375, "logps/rejected": -1728.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.95833396911621, "rewards/margins": 35.20833206176758, "rewards/rejected": -15.270833015441895, "step": 296 }, { "epoch": 1.142581888246628, "grad_norm": 5.930203406023793e-05, "learning_rate": 3.883666042000392e-06, "logits/chosen": 4.125, "logits/rejected": 4.479166507720947, "logps/chosen": -1868.0, "logps/rejected": -1598.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.32291603088379, "rewards/margins": 35.29166793823242, "rewards/rejected": -14.958333015441895, "step": 297 }, { "epoch": 1.1464354527938343, "grad_norm": 0.00019131222506985068, "learning_rate": 3.874293933577034e-06, "logits/chosen": 4.505208492279053, "logits/rejected": 4.885416507720947, "logps/chosen": -2042.6666259765625, "logps/rejected": -1789.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.625, "rewards/margins": 33.60416793823242, "rewards/rejected": -12.981770515441895, "step": 298 }, { "epoch": 1.1502890173410405, "grad_norm": 0.0012673679739236832, "learning_rate": 3.86489406484491e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.567708492279053, "logps/chosen": -2341.333251953125, "logps/rejected": -2064.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.91666603088379, "rewards/margins": 40.20833206176758, "rewards/rejected": -17.3125, "step": 299 }, { "epoch": 1.1541425818882467, "grad_norm": 1.22669280244736e-05, "learning_rate": 3.855466625678435e-06, "logits/chosen": 4.151041507720947, "logits/rejected": 4.4296875, "logps/chosen": -2021.3333740234375, "logps/rejected": -1768.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.15625, "rewards/margins": 34.39583206176758, "rewards/rejected": -15.291666984558105, "step": 300 }, { "epoch": 1.1541425818882467, "eval_logits/chosen": 4.2862043380737305, "eval_logits/rejected": 4.5182929039001465, "eval_logps/chosen": -2102.829345703125, "eval_logps/rejected": -1847.51220703125, "eval_loss": 0.0007195669459179044, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.144817352294922, "eval_rewards/margins": 36.32316970825195, "eval_rewards/rejected": -16.17378044128418, "eval_runtime": 348.173, "eval_samples_per_second": 5.649, "eval_steps_per_second": 0.236, "step": 300 }, { "epoch": 1.157996146435453, "grad_norm": 0.001302621210925281, "learning_rate": 3.846011806508942e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.484375, "logps/chosen": -2086.666748046875, "logps/rejected": -1854.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.57291603088379, "rewards/margins": 35.70833206176758, "rewards/rejected": -16.09375, "step": 301 }, { "epoch": 1.1618497109826589, "grad_norm": 0.00020072223560418934, "learning_rate": 3.8365297983208285e-06, "logits/chosen": 4.236979007720947, "logits/rejected": 4.515625, "logps/chosen": -2060.0, "logps/rejected": -1817.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5, "rewards/margins": 35.75, "rewards/rejected": -16.20833396911621, "step": 302 }, { "epoch": 1.165703275529865, "grad_norm": 0.00022804598847869784, "learning_rate": 3.8270207926477e-06, "logits/chosen": 4.234375, "logits/rejected": 4.479166507720947, "logps/chosen": -2236.0, "logps/rejected": -1957.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.54166603088379, "rewards/margins": 38.08333206176758, "rewards/rejected": -17.54166603088379, "step": 303 }, { "epoch": 1.1695568400770713, "grad_norm": 0.000749854079913348, "learning_rate": 3.817484981568507e-06, "logits/chosen": 4.192708492279053, "logits/rejected": 4.453125, "logps/chosen": -2097.333251953125, "logps/rejected": -1850.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.72916603088379, "rewards/margins": 35.125, "rewards/rejected": -16.375, "step": 304 }, { "epoch": 1.1734104046242775, "grad_norm": 0.32064732909202576, "learning_rate": 3.807922557703658e-06, "logits/chosen": 4.119791507720947, "logits/rejected": 4.348958492279053, "logps/chosen": -1965.3333740234375, "logps/rejected": -1737.3333740234375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 18.39583396911621, "rewards/margins": 34.0, "rewards/rejected": -15.625, "step": 305 }, { "epoch": 1.1772639691714837, "grad_norm": 0.0007139483350329101, "learning_rate": 3.798333714211132e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.703125, "logps/chosen": -2142.666748046875, "logps/rejected": -1904.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.9375, "rewards/margins": 38.375, "rewards/rejected": -17.39583396911621, "step": 306 }, { "epoch": 1.1811175337186899, "grad_norm": 0.00024680374190211296, "learning_rate": 3.7887186447825772e-06, "logits/chosen": 4.200520992279053, "logits/rejected": 4.3359375, "logps/chosen": -2169.333251953125, "logps/rejected": -1897.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.375, "rewards/margins": 37.41666793823242, "rewards/rejected": -16.04166603088379, "step": 307 }, { "epoch": 1.1849710982658959, "grad_norm": 0.0005567274056375027, "learning_rate": 3.7790775436393954e-06, "logits/chosen": 4.276041507720947, "logits/rejected": 4.434895992279053, "logps/chosen": -2174.666748046875, "logps/rejected": -1920.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5, "rewards/margins": 36.89583206176758, "rewards/rejected": -16.41666603088379, "step": 308 }, { "epoch": 1.188824662813102, "grad_norm": 0.00015132366388570517, "learning_rate": 3.769410605528824e-06, "logits/chosen": 4.453125, "logits/rejected": 4.75, "logps/chosen": -2137.333251953125, "logps/rejected": -1884.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.25, "rewards/margins": 34.375, "rewards/rejected": -15.104166984558105, "step": 309 }, { "epoch": 1.1926782273603083, "grad_norm": 0.010731222108006477, "learning_rate": 3.7597180257199956e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.533854007720947, "logps/chosen": -2264.0, "logps/rejected": -1981.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.875, "rewards/margins": 38.625, "rewards/rejected": -16.67708396911621, "step": 310 }, { "epoch": 1.1965317919075145, "grad_norm": 0.00013122966629453003, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 4.390625, "logits/rejected": 4.630208492279053, "logps/chosen": -2248.0, "logps/rejected": -2009.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.86458396911621, "rewards/margins": 36.5, "rewards/rejected": -16.64583396911621, "step": 311 }, { "epoch": 1.2003853564547207, "grad_norm": 0.00032931292662397027, "learning_rate": 3.7402567246699257e-06, "logits/chosen": 4.161458492279053, "logits/rejected": 4.299479007720947, "logps/chosen": -2064.0, "logps/rejected": -1793.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.9375, "rewards/margins": 37.66666793823242, "rewards/rejected": -15.697916984558105, "step": 312 }, { "epoch": 1.2042389210019269, "grad_norm": 0.0011779814958572388, "learning_rate": 3.7304883965408944e-06, "logits/chosen": 4.1953125, "logits/rejected": 4.3125, "logps/chosen": -2046.6666259765625, "logps/rejected": -1833.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.67708396911621, "rewards/margins": 33.91666793823242, "rewards/rejected": -15.260416984558105, "step": 313 }, { "epoch": 1.208092485549133, "grad_norm": 0.00024276028852909803, "learning_rate": 3.720695212930086e-06, "logits/chosen": 4.221354007720947, "logits/rejected": 4.5, "logps/chosen": -2089.333251953125, "logps/rejected": -1838.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.5625, "rewards/margins": 35.08333206176758, "rewards/rejected": -15.583333015441895, "step": 314 }, { "epoch": 1.211946050096339, "grad_norm": 0.00027044350281357765, "learning_rate": 3.710877371656757e-06, "logits/chosen": 4.427083492279053, "logits/rejected": 4.677083492279053, "logps/chosen": -2226.666748046875, "logps/rejected": -1989.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.71875, "rewards/margins": 35.97916793823242, "rewards/rejected": -16.32291603088379, "step": 315 }, { "epoch": 1.2157996146435452, "grad_norm": 0.0021469658240675926, "learning_rate": 3.7010350710382377e-06, "logits/chosen": 4.520833492279053, "logits/rejected": 4.838541507720947, "logps/chosen": -2157.333251953125, "logps/rejected": -1912.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.52083396911621, "rewards/margins": 35.3125, "rewards/rejected": -15.802083015441895, "step": 316 }, { "epoch": 1.2196531791907514, "grad_norm": 0.0002596381527837366, "learning_rate": 3.6911685098859295e-06, "logits/chosen": 4.34375, "logits/rejected": 4.78125, "logps/chosen": -2116.0, "logps/rejected": -1850.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 24.625, "rewards/margins": 36.95833206176758, "rewards/rejected": -12.291666984558105, "step": 317 }, { "epoch": 1.2235067437379576, "grad_norm": 0.0003486131317913532, "learning_rate": 3.68127788750129e-06, "logits/chosen": 4.239583492279053, "logits/rejected": 4.682291507720947, "logps/chosen": -1924.0, "logps/rejected": -1678.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.8125, "rewards/margins": 35.41666793823242, "rewards/rejected": -15.625, "step": 318 }, { "epoch": 1.2273603082851638, "grad_norm": 0.0002175725530833006, "learning_rate": 3.6713634036718077e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.8125, "logps/chosen": -1965.3333740234375, "logps/rejected": -1717.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.27083396911621, "rewards/margins": 34.9375, "rewards/rejected": -15.708333015441895, "step": 319 }, { "epoch": 1.2312138728323698, "grad_norm": 0.0011432063765823841, "learning_rate": 3.661425258666962e-06, "logits/chosen": 4.315104007720947, "logits/rejected": 4.497395992279053, "logps/chosen": -2197.333251953125, "logps/rejected": -1965.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.8125, "rewards/margins": 37.1875, "rewards/rejected": -17.375, "step": 320 }, { "epoch": 1.235067437379576, "grad_norm": 9.950863750418648e-05, "learning_rate": 3.6514636532341825e-06, "logits/chosen": 4.473958492279053, "logits/rejected": 4.822916507720947, "logps/chosen": -2033.3333740234375, "logps/rejected": -1745.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.3125, "rewards/margins": 36.52083206176758, "rewards/rejected": -15.197916984558105, "step": 321 }, { "epoch": 1.2389210019267822, "grad_norm": 4.507841731538065e-05, "learning_rate": 3.6414787885947907e-06, "logits/chosen": 4.473958492279053, "logits/rejected": 4.776041507720947, "logps/chosen": -2177.333251953125, "logps/rejected": -1901.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.14583396911621, "rewards/margins": 36.22916793823242, "rewards/rejected": -15.104166984558105, "step": 322 }, { "epoch": 1.2427745664739884, "grad_norm": 0.020042747259140015, "learning_rate": 3.6314708664399378e-06, "logits/chosen": 4.2421875, "logits/rejected": 4.703125, "logps/chosen": -1888.0, "logps/rejected": -1636.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 35.52083206176758, "rewards/rejected": -15.40625, "step": 323 }, { "epoch": 1.2466281310211946, "grad_norm": 9.116792352870107e-05, "learning_rate": 3.621440088926531e-06, "logits/chosen": 4.265625, "logits/rejected": 4.708333492279053, "logps/chosen": -1821.3333740234375, "logps/rejected": -1564.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.64583396911621, "rewards/margins": 34.35416793823242, "rewards/rejected": -12.645833015441895, "step": 324 }, { "epoch": 1.2504816955684008, "grad_norm": 5.05089046782814e-05, "learning_rate": 3.6113866586731455e-06, "logits/chosen": 4.40625, "logits/rejected": 4.755208492279053, "logps/chosen": -2030.6666259765625, "logps/rejected": -1780.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.61458396911621, "rewards/margins": 34.4375, "rewards/rejected": -15.875, "step": 325 }, { "epoch": 1.254335260115607, "grad_norm": 0.027630647644400597, "learning_rate": 3.601310778755937e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.791666507720947, "logps/chosen": -1892.0, "logps/rejected": -1632.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 34.9375, "rewards/rejected": -15.083333015441895, "step": 326 }, { "epoch": 1.2581888246628132, "grad_norm": 0.0004291742225177586, "learning_rate": 3.5912126527045368e-06, "logits/chosen": 4.442708492279053, "logits/rejected": 4.645833492279053, "logps/chosen": -2260.0, "logps/rejected": -2022.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.70833396911621, "rewards/margins": 37.625, "rewards/rejected": -16.91666603088379, "step": 327 }, { "epoch": 1.2620423892100192, "grad_norm": 0.0001043154697981663, "learning_rate": 3.581092484497941e-06, "logits/chosen": 4.359375, "logits/rejected": 4.609375, "logps/chosen": -2216.0, "logps/rejected": -1942.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.70833396911621, "rewards/margins": 40.29166793823242, "rewards/rejected": -17.57291603088379, "step": 328 }, { "epoch": 1.2658959537572254, "grad_norm": 0.0005115741514600813, "learning_rate": 3.5709504785603906e-06, "logits/chosen": 4.234375, "logits/rejected": 4.614583492279053, "logps/chosen": -2121.333251953125, "logps/rejected": -1856.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.66666603088379, "rewards/margins": 36.20833206176758, "rewards/rejected": -15.5625, "step": 329 }, { "epoch": 1.2697495183044316, "grad_norm": 0.00015755351341795176, "learning_rate": 3.560786839757242e-06, "logits/chosen": 4.359375, "logits/rejected": 4.604166507720947, "logps/chosen": -2076.0, "logps/rejected": -1833.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.44791603088379, "rewards/margins": 37.6875, "rewards/rejected": -17.19791603088379, "step": 330 }, { "epoch": 1.2736030828516378, "grad_norm": 0.00031994516029953957, "learning_rate": 3.5506017733908277e-06, "logits/chosen": 4.447916507720947, "logits/rejected": 4.875, "logps/chosen": -2013.3333740234375, "logps/rejected": -1745.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.35416603088379, "rewards/margins": 36.375, "rewards/rejected": -15.979166984558105, "step": 331 }, { "epoch": 1.2774566473988438, "grad_norm": 6.085784480092116e-05, "learning_rate": 3.540395485196313e-06, "logits/chosen": 4.380208492279053, "logits/rejected": 4.598958492279053, "logps/chosen": -2396.0, "logps/rejected": -2138.666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.375, "rewards/margins": 38.125, "rewards/rejected": -16.77083396911621, "step": 332 }, { "epoch": 1.28131021194605, "grad_norm": 0.000174942149897106, "learning_rate": 3.5301681813375343e-06, "logits/chosen": 4.0546875, "logits/rejected": 4.247395992279053, "logps/chosen": -2185.333251953125, "logps/rejected": -1941.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.4375, "rewards/margins": 38.16666793823242, "rewards/rejected": -16.72916603088379, "step": 333 }, { "epoch": 1.2851637764932562, "grad_norm": 0.0004059112397953868, "learning_rate": 3.5199200684028395e-06, "logits/chosen": 4.3984375, "logits/rejected": 4.635416507720947, "logps/chosen": -2141.333251953125, "logps/rejected": -1908.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.84375, "rewards/margins": 35.375, "rewards/rejected": -16.51041603088379, "step": 334 }, { "epoch": 1.2890173410404624, "grad_norm": 0.0017011824529618025, "learning_rate": 3.509651353400913e-06, "logits/chosen": 4.231770992279053, "logits/rejected": 4.640625, "logps/chosen": -2006.6666259765625, "logps/rejected": -1730.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.90625, "rewards/margins": 38.20833206176758, "rewards/rejected": -16.30208396911621, "step": 335 }, { "epoch": 1.2928709055876686, "grad_norm": 0.0004998824442736804, "learning_rate": 3.4993622437565955e-06, "logits/chosen": 4.197916507720947, "logits/rejected": 4.380208492279053, "logps/chosen": -2309.333251953125, "logps/rejected": -2056.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.39583396911621, "rewards/margins": 36.45833206176758, "rewards/rejected": -16.0, "step": 336 }, { "epoch": 1.2967244701348748, "grad_norm": 0.0003009653592016548, "learning_rate": 3.4890529473066927e-06, "logits/chosen": 4.494791507720947, "logits/rejected": 4.760416507720947, "logps/chosen": -2270.666748046875, "logps/rejected": -1997.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.20833396911621, "rewards/margins": 37.29166793823242, "rewards/rejected": -16.07291603088379, "step": 337 }, { "epoch": 1.300578034682081, "grad_norm": 0.0004917384940199554, "learning_rate": 3.4787236722957747e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.494791507720947, "logps/chosen": -2062.666748046875, "logps/rejected": -1826.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.02083396911621, "rewards/margins": 36.4375, "rewards/rejected": -16.42708396911621, "step": 338 }, { "epoch": 1.3044315992292872, "grad_norm": 0.0004920103237964213, "learning_rate": 3.4683746273719754e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.609375, "logps/chosen": -2437.333251953125, "logps/rejected": -2172.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.16666603088379, "rewards/margins": 39.45833206176758, "rewards/rejected": -17.28125, "step": 339 }, { "epoch": 1.3082851637764932, "grad_norm": 0.00105193757917732, "learning_rate": 3.458006021582776e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.484375, "logps/chosen": -2016.0, "logps/rejected": -1764.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.41666603088379, "rewards/margins": 36.125, "rewards/rejected": -15.739583015441895, "step": 340 }, { "epoch": 1.3121387283236994, "grad_norm": 0.00036215176805853844, "learning_rate": 3.447618064370777e-06, "logits/chosen": 4.390625, "logits/rejected": 4.802083492279053, "logps/chosen": -1916.0, "logps/rejected": -1693.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.58333396911621, "rewards/margins": 32.77083206176758, "rewards/rejected": -15.177083015441895, "step": 341 }, { "epoch": 1.3159922928709056, "grad_norm": 0.00013438148016575724, "learning_rate": 3.437210965569475e-06, "logits/chosen": 4.614583492279053, "logits/rejected": 5.067708492279053, "logps/chosen": -1997.3333740234375, "logps/rejected": -1758.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.1875, "rewards/margins": 34.27083206176758, "rewards/rejected": -16.125, "step": 342 }, { "epoch": 1.3198458574181118, "grad_norm": 0.005069986917078495, "learning_rate": 3.4267849353990178e-06, "logits/chosen": 4.276041507720947, "logits/rejected": 4.729166507720947, "logps/chosen": -1878.6666259765625, "logps/rejected": -1632.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.75, "rewards/margins": 34.08333206176758, "rewards/rejected": -15.28125, "step": 343 }, { "epoch": 1.323699421965318, "grad_norm": 0.000644951534923166, "learning_rate": 3.416340184461965e-06, "logits/chosen": 4.40625, "logits/rejected": 4.729166507720947, "logps/chosen": -1909.3333740234375, "logps/rejected": -1677.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.8125, "rewards/margins": 32.83333206176758, "rewards/rejected": -15.041666984558105, "step": 344 }, { "epoch": 1.327552986512524, "grad_norm": 0.00027751049492508173, "learning_rate": 3.4058769237390254e-06, "logits/chosen": 4.328125, "logits/rejected": 4.677083492279053, "logps/chosen": -2014.6666259765625, "logps/rejected": -1764.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 35.70833206176758, "rewards/rejected": -15.875, "step": 345 }, { "epoch": 1.3314065510597302, "grad_norm": 0.0007482718792743981, "learning_rate": 3.395395364584802e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.927083492279053, "logps/chosen": -2085.333251953125, "logps/rejected": -1824.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.44791603088379, "rewards/margins": 35.45833206176758, "rewards/rejected": -15.9375, "step": 346 }, { "epoch": 1.3352601156069364, "grad_norm": 0.03056366741657257, "learning_rate": 3.384895718723521e-06, "logits/chosen": 4.361979007720947, "logits/rejected": 4.588541507720947, "logps/chosen": -2182.666748046875, "logps/rejected": -1941.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.52083396911621, "rewards/margins": 37.16666793823242, "rewards/rejected": -16.53125, "step": 347 }, { "epoch": 1.3391136801541426, "grad_norm": 0.0005928495083935559, "learning_rate": 3.3743781982447533e-06, "logits/chosen": 4.328125, "logits/rejected": 4.458333492279053, "logps/chosen": -2370.666748046875, "logps/rejected": -2118.666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.85416603088379, "rewards/margins": 39.0, "rewards/rejected": -17.17708396911621, "step": 348 }, { "epoch": 1.3429672447013488, "grad_norm": 6.958826270420104e-05, "learning_rate": 3.3638430155991307e-06, "logits/chosen": 4.40625, "logits/rejected": 4.776041507720947, "logps/chosen": -2140.0, "logps/rejected": -1897.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.48958396911621, "rewards/margins": 36.20833206176758, "rewards/rejected": -16.6875, "step": 349 }, { "epoch": 1.346820809248555, "grad_norm": 0.0003207935078535229, "learning_rate": 3.3532903835940578e-06, "logits/chosen": 4.427083492279053, "logits/rejected": 4.734375, "logps/chosen": -2122.666748046875, "logps/rejected": -1881.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.82291603088379, "rewards/margins": 35.29166793823242, "rewards/rejected": -16.4375, "step": 350 }, { "epoch": 1.346820809248555, "eval_logits/chosen": 4.323361396789551, "eval_logits/rejected": 4.64253044128418, "eval_logps/chosen": -2098.146240234375, "eval_logps/rejected": -1847.0244140625, "eval_loss": 0.0007292190566658974, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.488567352294922, "eval_rewards/margins": 36.567073822021484, "eval_rewards/rejected": -16.071645736694336, "eval_runtime": 348.0472, "eval_samples_per_second": 5.652, "eval_steps_per_second": 0.236, "step": 350 }, { "epoch": 1.3506743737957612, "grad_norm": 0.0010065939277410507, "learning_rate": 3.3427205153894088e-06, "logits/chosen": 4.416666507720947, "logits/rejected": 4.958333492279053, "logps/chosen": -1832.0, "logps/rejected": -1594.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.66666603088379, "rewards/margins": 32.39583206176758, "rewards/rejected": -14.78125, "step": 351 }, { "epoch": 1.3545279383429674, "grad_norm": 0.0024720907676965, "learning_rate": 3.3321336244932233e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.671875, "logps/chosen": -1933.3333740234375, "logps/rejected": -1682.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 35.0, "rewards/rejected": -15.104166984558105, "step": 352 }, { "epoch": 1.3583815028901733, "grad_norm": 0.023016855120658875, "learning_rate": 3.3215299247573974e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.572916507720947, "logps/chosen": -2116.0, "logps/rejected": -1850.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.625, "rewards/margins": 36.1875, "rewards/rejected": -15.541666984558105, "step": 353 }, { "epoch": 1.3622350674373795, "grad_norm": 0.0002819003420881927, "learning_rate": 3.3109096303733564e-06, "logits/chosen": 4.315104007720947, "logits/rejected": 4.611979007720947, "logps/chosen": -1981.3333740234375, "logps/rejected": -1705.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.22916603088379, "rewards/margins": 35.5, "rewards/rejected": -15.270833015441895, "step": 354 }, { "epoch": 1.3660886319845857, "grad_norm": 0.0005204101908020675, "learning_rate": 3.300272955867734e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.557291507720947, "logps/chosen": -2133.333251953125, "logps/rejected": -1877.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.83333396911621, "rewards/margins": 37.0625, "rewards/rejected": -16.16666603088379, "step": 355 }, { "epoch": 1.369942196531792, "grad_norm": 0.00031020076130516827, "learning_rate": 3.2896201160980364e-06, "logits/chosen": 4.239583492279053, "logits/rejected": 4.489583492279053, "logps/chosen": -1973.3333740234375, "logps/rejected": -1748.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.9375, "rewards/margins": 33.89583206176758, "rewards/rejected": -15.03125, "step": 356 }, { "epoch": 1.3737957610789981, "grad_norm": 0.0002177243586629629, "learning_rate": 3.2789513262483053e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.65625, "logps/chosen": -2360.0, "logps/rejected": -2069.333251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.64583396911621, "rewards/margins": 37.91666793823242, "rewards/rejected": -16.26041603088379, "step": 357 }, { "epoch": 1.3776493256262041, "grad_norm": 1.9773253370658495e-05, "learning_rate": 3.268266801824768e-06, "logits/chosen": 4.109375, "logits/rejected": 4.401041507720947, "logps/chosen": -2000.0, "logps/rejected": -1753.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.20833396911621, "rewards/margins": 34.83333206176758, "rewards/rejected": -14.625, "step": 358 }, { "epoch": 1.3815028901734103, "grad_norm": 0.00022747440380044281, "learning_rate": 3.2575667586514847e-06, "logits/chosen": 4.203125, "logits/rejected": 4.546875, "logps/chosen": -2066.666748046875, "logps/rejected": -1816.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.8125, "rewards/margins": 35.83333206176758, "rewards/rejected": -16.01041603088379, "step": 359 }, { "epoch": 1.3853564547206165, "grad_norm": 0.001409685704857111, "learning_rate": 3.2468514128659884e-06, "logits/chosen": 4.432291507720947, "logits/rejected": 4.8125, "logps/chosen": -2190.666748046875, "logps/rejected": -1930.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.4375, "rewards/margins": 36.79166793823242, "rewards/rejected": -16.33333396911621, "step": 360 }, { "epoch": 1.3892100192678227, "grad_norm": 0.0007508211419917643, "learning_rate": 3.236120980914923e-06, "logits/chosen": 4.4375, "logits/rejected": 4.869791507720947, "logps/chosen": -2053.333251953125, "logps/rejected": -1813.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.86458396911621, "rewards/margins": 35.875, "rewards/rejected": -16.02083396911621, "step": 361 }, { "epoch": 1.393063583815029, "grad_norm": 0.00038101067184470594, "learning_rate": 3.225375679549666e-06, "logits/chosen": 4.1484375, "logits/rejected": 4.361979007720947, "logps/chosen": -2112.0, "logps/rejected": -1868.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.77083396911621, "rewards/margins": 38.25, "rewards/rejected": -16.47916603088379, "step": 362 }, { "epoch": 1.3969171483622351, "grad_norm": 7.290680514415726e-05, "learning_rate": 3.2146157258219534e-06, "logits/chosen": 4.385416507720947, "logits/rejected": 4.713541507720947, "logps/chosen": -2116.0, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.8125, "rewards/margins": 34.97916793823242, "rewards/rejected": -16.15625, "step": 363 }, { "epoch": 1.4007707129094413, "grad_norm": 0.0002785023534670472, "learning_rate": 3.203841337079494e-06, "logits/chosen": 4.385416507720947, "logits/rejected": 4.703125, "logps/chosen": -2072.0, "logps/rejected": -1825.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.3125, "rewards/margins": 36.75, "rewards/rejected": -16.41666603088379, "step": 364 }, { "epoch": 1.4046242774566475, "grad_norm": 0.03271384909749031, "learning_rate": 3.1930527309615796e-06, "logits/chosen": 4.3125, "logits/rejected": 4.536458492279053, "logps/chosen": -2041.3333740234375, "logps/rejected": -1790.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.71875, "rewards/margins": 38.64583206176758, "rewards/rejected": -16.875, "step": 365 }, { "epoch": 1.4084778420038535, "grad_norm": 0.0015993170673027635, "learning_rate": 3.1822501253946875e-06, "logits/chosen": 4.1796875, "logits/rejected": 4.46875, "logps/chosen": -2226.666748046875, "logps/rejected": -1954.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5, "rewards/margins": 37.04166793823242, "rewards/rejected": -16.5625, "step": 366 }, { "epoch": 1.4123314065510597, "grad_norm": 0.029254142194986343, "learning_rate": 3.171433738588082e-06, "logits/chosen": 4.25, "logits/rejected": 4.458333492279053, "logps/chosen": -2041.3333740234375, "logps/rejected": -1820.0, "loss": 0.0096, "rewards/accuracies": 0.9861111640930176, "rewards/chosen": 19.0, "rewards/margins": 35.95833206176758, "rewards/rejected": -16.92708396911621, "step": 367 }, { "epoch": 1.416184971098266, "grad_norm": 0.020064212381839752, "learning_rate": 3.1606037890294013e-06, "logits/chosen": 4.255208492279053, "logits/rejected": 4.666666507720947, "logps/chosen": -1998.6666259765625, "logps/rejected": -1754.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.41666603088379, "rewards/margins": 33.6875, "rewards/rejected": -14.239583015441895, "step": 368 }, { "epoch": 1.420038535645472, "grad_norm": 0.002557218074798584, "learning_rate": 3.1497604954802485e-06, "logits/chosen": 4.15625, "logits/rejected": 4.486979007720947, "logps/chosen": -2037.3333740234375, "logps/rejected": -1776.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 36.0, "rewards/rejected": -16.05208396911621, "step": 369 }, { "epoch": 1.423892100192678, "grad_norm": 0.0008516180096194148, "learning_rate": 3.1389040769717718e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.635416507720947, "logps/chosen": -2112.0, "logps/rejected": -1842.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 22.4375, "rewards/margins": 39.125, "rewards/rejected": -16.65625, "step": 370 }, { "epoch": 1.4277456647398843, "grad_norm": 0.028436506167054176, "learning_rate": 3.128034752800237e-06, "logits/chosen": 4.252604007720947, "logits/rejected": 4.609375, "logps/chosen": -2004.0, "logps/rejected": -1749.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.88541603088379, "rewards/margins": 36.04166793823242, "rewards/rejected": -16.13541603088379, "step": 371 }, { "epoch": 1.4315992292870905, "grad_norm": 0.00014424383698496968, "learning_rate": 3.1171527425226027e-06, "logits/chosen": 4.40625, "logits/rejected": 4.744791507720947, "logps/chosen": -2138.666748046875, "logps/rejected": -1877.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.83333396911621, "rewards/margins": 37.3125, "rewards/rejected": -16.52083396911621, "step": 372 }, { "epoch": 1.4354527938342967, "grad_norm": 0.0004149235028307885, "learning_rate": 3.106258265952082e-06, "logits/chosen": 4.317708492279053, "logits/rejected": 4.723958492279053, "logps/chosen": -1945.3333740234375, "logps/rejected": -1698.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.02083396911621, "rewards/margins": 34.20833206176758, "rewards/rejected": -15.177083015441895, "step": 373 }, { "epoch": 1.439306358381503, "grad_norm": 0.0002695361035875976, "learning_rate": 3.0953515431537027e-06, "logits/chosen": 4.260416507720947, "logits/rejected": 4.479166507720947, "logps/chosen": -2150.666748046875, "logps/rejected": -1888.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.45833396911621, "rewards/margins": 39.29166793823242, "rewards/rejected": -16.83333396911621, "step": 374 }, { "epoch": 1.443159922928709, "grad_norm": 0.0004806209180969745, "learning_rate": 3.084432794439865e-06, "logits/chosen": 4.221354007720947, "logits/rejected": 4.739583492279053, "logps/chosen": -1841.3333740234375, "logps/rejected": -1568.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 33.27083206176758, "rewards/rejected": -14.020833015441895, "step": 375 }, { "epoch": 1.4470134874759153, "grad_norm": 0.00017019506776705384, "learning_rate": 3.0735022403658847e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.807291507720947, "logps/chosen": -2001.3333740234375, "logps/rejected": -1738.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 36.20833206176758, "rewards/rejected": -16.32291603088379, "step": 376 }, { "epoch": 1.4508670520231215, "grad_norm": 0.0004560339730232954, "learning_rate": 3.0625601017255453e-06, "logits/chosen": 4.489583492279053, "logits/rejected": 4.979166507720947, "logps/chosen": -2013.3333740234375, "logps/rejected": -1766.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.52083396911621, "rewards/margins": 34.14583206176758, "rewards/rejected": -15.645833015441895, "step": 377 }, { "epoch": 1.4547206165703275, "grad_norm": 0.0004979541408829391, "learning_rate": 3.0516065995466336e-06, "logits/chosen": 4.473958492279053, "logits/rejected": 4.78125, "logps/chosen": -2020.0, "logps/rejected": -1765.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.20833396911621, "rewards/margins": 36.41666793823242, "rewards/rejected": -16.25, "step": 378 }, { "epoch": 1.4585741811175337, "grad_norm": 3.761165862670168e-05, "learning_rate": 3.040641955086478e-06, "logits/chosen": 4.247395992279053, "logits/rejected": 4.723958492279053, "logps/chosen": -1932.0, "logps/rejected": -1696.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.89583396911621, "rewards/margins": 34.9375, "rewards/rejected": -16.09375, "step": 379 }, { "epoch": 1.4624277456647399, "grad_norm": 0.0007200956461019814, "learning_rate": 3.0296663898274766e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.505208492279053, "logps/chosen": -2121.333251953125, "logps/rejected": -1861.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.52083396911621, "rewards/margins": 37.95833206176758, "rewards/rejected": -16.41666603088379, "step": 380 }, { "epoch": 1.466281310211946, "grad_norm": 0.0017461583483964205, "learning_rate": 3.0186801254726213e-06, "logits/chosen": 4.28125, "logits/rejected": 4.677083492279053, "logps/chosen": -2028.0, "logps/rejected": -1760.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.72916603088379, "rewards/margins": 35.72916793823242, "rewards/rejected": -16.0, "step": 381 }, { "epoch": 1.4701348747591523, "grad_norm": 0.0003617673646658659, "learning_rate": 3.007683383941027e-06, "logits/chosen": 4.494791507720947, "logits/rejected": 4.770833492279053, "logps/chosen": -2185.333251953125, "logps/rejected": -1940.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.83333396911621, "rewards/margins": 37.02083206176758, "rewards/rejected": -17.13541603088379, "step": 382 }, { "epoch": 1.4739884393063583, "grad_norm": 0.00022845232160761952, "learning_rate": 2.9966763873634424e-06, "logits/chosen": 4.208333492279053, "logits/rejected": 4.526041507720947, "logps/chosen": -2038.6666259765625, "logps/rejected": -1778.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.47916603088379, "rewards/margins": 37.22916793823242, "rewards/rejected": -15.71875, "step": 383 }, { "epoch": 1.4778420038535645, "grad_norm": 0.00024514508550055325, "learning_rate": 2.985659358077765e-06, "logits/chosen": 4.338541507720947, "logits/rejected": 4.65625, "logps/chosen": -2108.0, "logps/rejected": -1853.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.66666603088379, "rewards/margins": 37.41666793823242, "rewards/rejected": -16.84375, "step": 384 }, { "epoch": 1.4816955684007707, "grad_norm": 0.00015097780851647258, "learning_rate": 2.974632518624548e-06, "logits/chosen": 4.341145992279053, "logits/rejected": 4.932291507720947, "logps/chosen": -1965.3333740234375, "logps/rejected": -1718.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.01041603088379, "rewards/margins": 35.0625, "rewards/rejected": -16.13541603088379, "step": 385 }, { "epoch": 1.4855491329479769, "grad_norm": 2.926447086792905e-05, "learning_rate": 2.9635960917425114e-06, "logits/chosen": 4.091145992279053, "logits/rejected": 4.325520992279053, "logps/chosen": -1912.0, "logps/rejected": -1657.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.02083396911621, "rewards/margins": 38.33333206176758, "rewards/rejected": -16.28125, "step": 386 }, { "epoch": 1.489402697495183, "grad_norm": 0.0007821715553291142, "learning_rate": 2.9525503003640336e-06, "logits/chosen": 4.364583492279053, "logits/rejected": 4.723958492279053, "logps/chosen": -2213.333251953125, "logps/rejected": -1952.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.52083396911621, "rewards/margins": 38.29166793823242, "rewards/rejected": -16.78125, "step": 387 }, { "epoch": 1.4932562620423893, "grad_norm": 0.0002179664297727868, "learning_rate": 2.941495367610656e-06, "logits/chosen": 4.395833492279053, "logits/rejected": 4.786458492279053, "logps/chosen": -2118.666748046875, "logps/rejected": -1888.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.875, "rewards/margins": 36.25, "rewards/rejected": -17.39583396911621, "step": 388 }, { "epoch": 1.4971098265895955, "grad_norm": 0.0006210917490534484, "learning_rate": 2.9304315167885706e-06, "logits/chosen": 4.317708492279053, "logits/rejected": 4.744791507720947, "logps/chosen": -1968.0, "logps/rejected": -1716.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.4375, "rewards/margins": 35.04166793823242, "rewards/rejected": -15.583333015441895, "step": 389 }, { "epoch": 1.5009633911368017, "grad_norm": 4.099030775250867e-05, "learning_rate": 2.9193589713841132e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.838541507720947, "logps/chosen": -1966.6666259765625, "logps/rejected": -1700.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.66666603088379, "rewards/margins": 35.16666793823242, "rewards/rejected": -15.510416984558105, "step": 390 }, { "epoch": 1.5048169556840079, "grad_norm": 0.02906755544245243, "learning_rate": 2.9082779550592478e-06, "logits/chosen": 4.453125, "logits/rejected": 4.90625, "logps/chosen": -2197.333251953125, "logps/rejected": -1914.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.625, "rewards/margins": 37.33333206176758, "rewards/rejected": -16.73958396911621, "step": 391 }, { "epoch": 1.5086705202312138, "grad_norm": 0.00040009996155276895, "learning_rate": 2.897188691647047e-06, "logits/chosen": 4.171875, "logits/rejected": 4.546875, "logps/chosen": -2058.666748046875, "logps/rejected": -1813.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.3125, "rewards/margins": 35.66666793823242, "rewards/rejected": -15.385416984558105, "step": 392 }, { "epoch": 1.51252408477842, "grad_norm": 0.00025818386347964406, "learning_rate": 2.8860914051471722e-06, "logits/chosen": 4.328125, "logits/rejected": 4.598958492279053, "logps/chosen": -2176.0, "logps/rejected": -1922.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.83333396911621, "rewards/margins": 37.625, "rewards/rejected": -16.82291603088379, "step": 393 }, { "epoch": 1.5163776493256262, "grad_norm": 0.001543400576338172, "learning_rate": 2.874986319721349e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.536458492279053, "logps/chosen": -2044.0, "logps/rejected": -1810.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 35.91666793823242, "rewards/rejected": -16.0, "step": 394 }, { "epoch": 1.5202312138728322, "grad_norm": 0.00034155408502556384, "learning_rate": 2.86387365968884e-06, "logits/chosen": 4.359375, "logits/rejected": 4.713541507720947, "logps/chosen": -2178.666748046875, "logps/rejected": -1933.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.63541603088379, "rewards/margins": 35.35416793823242, "rewards/rejected": -16.65625, "step": 395 }, { "epoch": 1.5240847784200384, "grad_norm": 0.0002657130535226315, "learning_rate": 2.852753649521911e-06, "logits/chosen": 4.390625, "logits/rejected": 4.859375, "logps/chosen": -1965.3333740234375, "logps/rejected": -1734.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.64583396911621, "rewards/margins": 34.33333206176758, "rewards/rejected": -15.677083015441895, "step": 396 }, { "epoch": 1.5279383429672446, "grad_norm": 0.0022116873878985643, "learning_rate": 2.8416265138412985e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.71875, "logps/chosen": -2116.0, "logps/rejected": -1869.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.6875, "rewards/margins": 37.625, "rewards/rejected": -16.89583396911621, "step": 397 }, { "epoch": 1.5317919075144508, "grad_norm": 0.02541920728981495, "learning_rate": 2.830492477411671e-06, "logits/chosen": 4.377604007720947, "logits/rejected": 4.713541507720947, "logps/chosen": -2368.0, "logps/rejected": -2124.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.8125, "rewards/margins": 38.16666793823242, "rewards/rejected": -17.3125, "step": 398 }, { "epoch": 1.535645472061657, "grad_norm": 0.000323989283060655, "learning_rate": 2.8193517651370934e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.640625, "logps/chosen": -2109.333251953125, "logps/rejected": -1854.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.08333396911621, "rewards/margins": 37.95833206176758, "rewards/rejected": -16.875, "step": 399 }, { "epoch": 1.5394990366088632, "grad_norm": 0.00020920176757499576, "learning_rate": 2.8082046020564772e-06, "logits/chosen": 4.3125, "logits/rejected": 4.666666507720947, "logps/chosen": -1980.0, "logps/rejected": -1726.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.97916603088379, "rewards/margins": 36.27083206176758, "rewards/rejected": -16.28125, "step": 400 }, { "epoch": 1.5394990366088632, "eval_logits/chosen": 4.318597793579102, "eval_logits/rejected": 4.64939022064209, "eval_logps/chosen": -2099.0244140625, "eval_logps/rejected": -1849.756103515625, "eval_loss": 0.0007181625696830451, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.449695587158203, "eval_rewards/margins": 36.84146499633789, "eval_rewards/rejected": -16.4001522064209, "eval_runtime": 348.129, "eval_samples_per_second": 5.65, "eval_steps_per_second": 0.236, "step": 400 }, { "epoch": 1.5433526011560694, "grad_norm": 4.337758582551032e-05, "learning_rate": 2.797051213339041e-06, "logits/chosen": 4.489583492279053, "logits/rejected": 4.864583492279053, "logps/chosen": -2040.0, "logps/rejected": -1804.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.83333396911621, "rewards/margins": 36.33333206176758, "rewards/rejected": -17.44791603088379, "step": 401 }, { "epoch": 1.5472061657032756, "grad_norm": 0.0023176458198577166, "learning_rate": 2.785891824279755e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.8125, "logps/chosen": -2004.0, "logps/rejected": -1740.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.39583396911621, "rewards/margins": 36.8125, "rewards/rejected": -16.39583396911621, "step": 402 }, { "epoch": 1.5510597302504818, "grad_norm": 0.0006734149646945298, "learning_rate": 2.774726660294799e-06, "logits/chosen": 4.296875, "logits/rejected": 4.578125, "logps/chosen": -2278.666748046875, "logps/rejected": -2009.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 23.04166603088379, "rewards/margins": 39.79166793823242, "rewards/rejected": -16.77083396911621, "step": 403 }, { "epoch": 1.5549132947976878, "grad_norm": 7.131211896194145e-05, "learning_rate": 2.763555946917002e-06, "logits/chosen": 4.390625, "logits/rejected": 4.776041507720947, "logps/chosen": -2040.0, "logps/rejected": -1770.6666259765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 21.79166603088379, "rewards/margins": 37.89583206176758, "rewards/rejected": -16.05208396911621, "step": 404 }, { "epoch": 1.558766859344894, "grad_norm": 0.18365070223808289, "learning_rate": 2.7523799097912905e-06, "logits/chosen": 4.203125, "logits/rejected": 4.505208492279053, "logps/chosen": -2098.666748046875, "logps/rejected": -1856.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.83333396911621, "rewards/margins": 36.95833206176758, "rewards/rejected": -17.13541603088379, "step": 405 }, { "epoch": 1.5626204238921002, "grad_norm": 0.00022877859009895474, "learning_rate": 2.741198774670128e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.807291507720947, "logps/chosen": -1861.3333740234375, "logps/rejected": -1602.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.15625, "rewards/margins": 34.4375, "rewards/rejected": -15.28125, "step": 406 }, { "epoch": 1.5664739884393064, "grad_norm": 0.003441161708906293, "learning_rate": 2.7300127674089555e-06, "logits/chosen": 4.270833492279053, "logits/rejected": 4.494791507720947, "logps/chosen": -2116.0, "logps/rejected": -1849.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.77083396911621, "rewards/margins": 37.91666793823242, "rewards/rejected": -16.13541603088379, "step": 407 }, { "epoch": 1.5703275529865124, "grad_norm": 0.0003719261731021106, "learning_rate": 2.7188221139616303e-06, "logits/chosen": 4.489583492279053, "logits/rejected": 4.947916507720947, "logps/chosen": -2117.333251953125, "logps/rejected": -1850.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 36.10416793823242, "rewards/rejected": -16.23958396911621, "step": 408 }, { "epoch": 1.5741811175337186, "grad_norm": 8.334410813404247e-05, "learning_rate": 2.7076270403758624e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.739583492279053, "logps/chosen": -2065.333251953125, "logps/rejected": -1806.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.5, "rewards/margins": 37.33333206176758, "rewards/rejected": -15.8125, "step": 409 }, { "epoch": 1.5780346820809248, "grad_norm": 0.00016456421872135252, "learning_rate": 2.696427772788645e-06, "logits/chosen": 4.192708492279053, "logits/rejected": 4.677083492279053, "logps/chosen": -1888.0, "logps/rejected": -1610.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.125, "rewards/margins": 36.375, "rewards/rejected": -15.239583015441895, "step": 410 }, { "epoch": 1.581888246628131, "grad_norm": 0.004210240673273802, "learning_rate": 2.68522453742169e-06, "logits/chosen": 4.453125, "logits/rejected": 4.890625, "logps/chosen": -2058.666748046875, "logps/rejected": -1826.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.14583396911621, "rewards/margins": 35.08333206176758, "rewards/rejected": -15.927083015441895, "step": 411 }, { "epoch": 1.5857418111753372, "grad_norm": 0.022662866860628128, "learning_rate": 2.674017560576858e-06, "logits/chosen": 4.421875, "logits/rejected": 4.828125, "logps/chosen": -2149.333251953125, "logps/rejected": -1898.6666259765625, "loss": 0.0049, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.76041603088379, "rewards/margins": 34.625, "rewards/rejected": -15.84375, "step": 412 }, { "epoch": 1.5895953757225434, "grad_norm": 0.023732855916023254, "learning_rate": 2.662807068631585e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.536458492279053, "logps/chosen": -2186.666748046875, "logps/rejected": -1960.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.60416603088379, "rewards/margins": 38.5, "rewards/rejected": -17.90625, "step": 413 }, { "epoch": 1.5934489402697496, "grad_norm": 0.0013039689511060715, "learning_rate": 2.6515932880343103e-06, "logits/chosen": 4.208333492279053, "logits/rejected": 4.463541507720947, "logps/chosen": -2208.0, "logps/rejected": -1957.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.66666603088379, "rewards/margins": 37.45833206176758, "rewards/rejected": -16.82291603088379, "step": 414 }, { "epoch": 1.5973025048169558, "grad_norm": 0.0008117995457723737, "learning_rate": 2.640376445299905e-06, "logits/chosen": 4.557291507720947, "logits/rejected": 5.145833492279053, "logps/chosen": -1890.6666259765625, "logps/rejected": -1638.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.91666603088379, "rewards/margins": 34.5, "rewards/rejected": -16.57291603088379, "step": 415 }, { "epoch": 1.601156069364162, "grad_norm": 9.624979429645464e-05, "learning_rate": 2.6291567670050934e-06, "logits/chosen": 4.1953125, "logits/rejected": 4.630208492279053, "logps/chosen": -1813.3333740234375, "logps/rejected": -1550.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.41666603088379, "rewards/margins": 35.27083206176758, "rewards/rejected": -14.84375, "step": 416 }, { "epoch": 1.605009633911368, "grad_norm": 0.00043110878323204815, "learning_rate": 2.6179344797838775e-06, "logits/chosen": 4.197916507720947, "logits/rejected": 4.557291507720947, "logps/chosen": -1960.0, "logps/rejected": -1686.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5625, "rewards/margins": 35.91666793823242, "rewards/rejected": -15.354166984558105, "step": 417 }, { "epoch": 1.6088631984585742, "grad_norm": 0.0011149642523378134, "learning_rate": 2.606709810322957e-06, "logits/chosen": 4.385416507720947, "logits/rejected": 4.6875, "logps/chosen": -2150.666748046875, "logps/rejected": -1914.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.53125, "rewards/margins": 39.22916793823242, "rewards/rejected": -18.69791603088379, "step": 418 }, { "epoch": 1.6127167630057804, "grad_norm": 9.33109040488489e-05, "learning_rate": 2.5954829853571552e-06, "logits/chosen": 4.401041507720947, "logits/rejected": 4.671875, "logps/chosen": -2132.0, "logps/rejected": -1896.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.0625, "rewards/margins": 35.75, "rewards/rejected": -16.66666603088379, "step": 419 }, { "epoch": 1.6165703275529864, "grad_norm": 0.0005716350860893726, "learning_rate": 2.5842542316648333e-06, "logits/chosen": 4.234375, "logits/rejected": 4.458333492279053, "logps/chosen": -2112.0, "logps/rejected": -1892.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.98958396911621, "rewards/margins": 36.375, "rewards/rejected": -17.38541603088379, "step": 420 }, { "epoch": 1.6204238921001926, "grad_norm": 6.314170605037361e-05, "learning_rate": 2.573023776063315e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.734375, "logps/chosen": -1882.6666259765625, "logps/rejected": -1641.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.76041603088379, "rewards/margins": 33.41666793823242, "rewards/rejected": -14.666666984558105, "step": 421 }, { "epoch": 1.6242774566473988, "grad_norm": 0.003175926161929965, "learning_rate": 2.561791845404298e-06, "logits/chosen": 4.372395992279053, "logits/rejected": 4.8125, "logps/chosen": -1884.0, "logps/rejected": -1634.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.10416603088379, "rewards/margins": 35.04166793823242, "rewards/rejected": -15.927083015441895, "step": 422 }, { "epoch": 1.628131021194605, "grad_norm": 0.00018841511337086558, "learning_rate": 2.550558666569279e-06, "logits/chosen": 4.375, "logits/rejected": 4.630208492279053, "logps/chosen": -2160.0, "logps/rejected": -1929.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.41666603088379, "rewards/margins": 38.41666793823242, "rewards/rejected": -17.90625, "step": 423 }, { "epoch": 1.6319845857418112, "grad_norm": 6.853613740531728e-05, "learning_rate": 2.5393244664649665e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.744791507720947, "logps/chosen": -2213.333251953125, "logps/rejected": -1956.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.625, "rewards/margins": 41.20833206176758, "rewards/rejected": -18.59375, "step": 424 }, { "epoch": 1.6358381502890174, "grad_norm": 4.78974288853351e-05, "learning_rate": 2.528089472018698e-06, "logits/chosen": 4.432291507720947, "logits/rejected": 4.817708492279053, "logps/chosen": -2172.0, "logps/rejected": -1933.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.86458396911621, "rewards/margins": 35.97916793823242, "rewards/rejected": -17.17708396911621, "step": 425 }, { "epoch": 1.6396917148362236, "grad_norm": 0.0005072590429335833, "learning_rate": 2.5168539101738576e-06, "logits/chosen": 4.283854007720947, "logits/rejected": 4.625, "logps/chosen": -2109.333251953125, "logps/rejected": -1880.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.73958396911621, "rewards/margins": 37.16666793823242, "rewards/rejected": -17.4375, "step": 426 }, { "epoch": 1.6435452793834298, "grad_norm": 0.0009426700416952372, "learning_rate": 2.5056180078852883e-06, "logits/chosen": 4.213541507720947, "logits/rejected": 4.75, "logps/chosen": -2022.6666259765625, "logps/rejected": -1770.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.29166603088379, "rewards/margins": 36.375, "rewards/rejected": -16.08333396911621, "step": 427 }, { "epoch": 1.647398843930636, "grad_norm": 7.960922084748745e-05, "learning_rate": 2.4943819921147125e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.833333492279053, "logps/chosen": -2046.6666259765625, "logps/rejected": -1797.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.0, "rewards/margins": 37.58333206176758, "rewards/rejected": -17.53125, "step": 428 }, { "epoch": 1.6512524084778422, "grad_norm": 0.00024510343791916966, "learning_rate": 2.4831460898261428e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.734375, "logps/chosen": -1925.3333740234375, "logps/rejected": -1693.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.42708396911621, "rewards/margins": 35.45833206176758, "rewards/rejected": -16.05208396911621, "step": 429 }, { "epoch": 1.6551059730250481, "grad_norm": 0.0001600939140189439, "learning_rate": 2.4719105279813022e-06, "logits/chosen": 4.270833492279053, "logits/rejected": 4.666666507720947, "logps/chosen": -2114.666748046875, "logps/rejected": -1853.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.39583396911621, "rewards/margins": 38.66666793823242, "rewards/rejected": -17.26041603088379, "step": 430 }, { "epoch": 1.6589595375722543, "grad_norm": 0.012807044200599194, "learning_rate": 2.460675533535034e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.645833492279053, "logps/chosen": -2085.333251953125, "logps/rejected": -1820.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.92708396911621, "rewards/margins": 36.45833206176758, "rewards/rejected": -16.47916603088379, "step": 431 }, { "epoch": 1.6628131021194605, "grad_norm": 0.00026522116968408227, "learning_rate": 2.449441333430722e-06, "logits/chosen": 4.315104007720947, "logits/rejected": 4.700520992279053, "logps/chosen": -1862.6666259765625, "logps/rejected": -1660.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.67708396911621, "rewards/margins": 34.60416793823242, "rewards/rejected": -15.96875, "step": 432 }, { "epoch": 1.6666666666666665, "grad_norm": 7.895232556620613e-05, "learning_rate": 2.438208154595703e-06, "logits/chosen": 4.489583492279053, "logits/rejected": 4.895833492279053, "logps/chosen": -2073.333251953125, "logps/rejected": -1820.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.29166603088379, "rewards/margins": 34.97916793823242, "rewards/rejected": -16.67708396911621, "step": 433 }, { "epoch": 1.6705202312138727, "grad_norm": 7.718052802374586e-05, "learning_rate": 2.4269762239366855e-06, "logits/chosen": 4.3203125, "logits/rejected": 4.692708492279053, "logps/chosen": -2056.0, "logps/rejected": -1822.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.5, "rewards/margins": 36.0625, "rewards/rejected": -16.5, "step": 434 }, { "epoch": 1.674373795761079, "grad_norm": 0.028694886714220047, "learning_rate": 2.415745768335167e-06, "logits/chosen": 4.46875, "logits/rejected": 4.890625, "logps/chosen": -2252.0, "logps/rejected": -1976.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 37.58333206176758, "rewards/rejected": -18.10416603088379, "step": 435 }, { "epoch": 1.6782273603082851, "grad_norm": 0.0012596540618687868, "learning_rate": 2.4045170146428456e-06, "logits/chosen": 4.283854007720947, "logits/rejected": 4.713541507720947, "logps/chosen": -2044.0, "logps/rejected": -1813.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.20833396911621, "rewards/margins": 35.5625, "rewards/rejected": -16.40625, "step": 436 }, { "epoch": 1.6820809248554913, "grad_norm": 0.003078378504142165, "learning_rate": 2.3932901896770435e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.640625, "logps/chosen": -1906.6666259765625, "logps/rejected": -1649.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.10416603088379, "rewards/margins": 35.58333206176758, "rewards/rejected": -16.5625, "step": 437 }, { "epoch": 1.6859344894026975, "grad_norm": 0.0001601027324795723, "learning_rate": 2.3820655202161237e-06, "logits/chosen": 4.421875, "logits/rejected": 4.869791507720947, "logps/chosen": -2044.0, "logps/rejected": -1805.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.41666603088379, "rewards/margins": 35.22916793823242, "rewards/rejected": -16.80208396911621, "step": 438 }, { "epoch": 1.6897880539499037, "grad_norm": 0.0008686166838742793, "learning_rate": 2.370843232994907e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.833333492279053, "logps/chosen": -1978.6666259765625, "logps/rejected": -1722.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.59375, "rewards/margins": 35.125, "rewards/rejected": -16.55208396911621, "step": 439 }, { "epoch": 1.69364161849711, "grad_norm": 0.000319930724799633, "learning_rate": 2.359623554700096e-06, "logits/chosen": 4.265625, "logits/rejected": 4.572916507720947, "logps/chosen": -2014.6666259765625, "logps/rejected": -1782.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.21875, "rewards/margins": 37.04166793823242, "rewards/rejected": -16.88541603088379, "step": 440 }, { "epoch": 1.6974951830443161, "grad_norm": 0.0013946524122729897, "learning_rate": 2.3484067119656905e-06, "logits/chosen": 4.307291507720947, "logits/rejected": 4.729166507720947, "logps/chosen": -2000.0, "logps/rejected": -1757.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.20833396911621, "rewards/margins": 35.27083206176758, "rewards/rejected": -17.10416603088379, "step": 441 }, { "epoch": 1.701348747591522, "grad_norm": 0.0002593209792394191, "learning_rate": 2.337192931368416e-06, "logits/chosen": 4.364583492279053, "logits/rejected": 4.729166507720947, "logps/chosen": -2238.666748046875, "logps/rejected": -1977.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.54166603088379, "rewards/margins": 39.3125, "rewards/rejected": -17.76041603088379, "step": 442 }, { "epoch": 1.7052023121387283, "grad_norm": 0.0009471942903473973, "learning_rate": 2.3259824394231427e-06, "logits/chosen": 4.229166507720947, "logits/rejected": 4.572916507720947, "logps/chosen": -2162.666748046875, "logps/rejected": -1897.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.875, "rewards/margins": 37.5, "rewards/rejected": -16.625, "step": 443 }, { "epoch": 1.7090558766859345, "grad_norm": 0.0003259051591157913, "learning_rate": 2.31477546257831e-06, "logits/chosen": 4.395833492279053, "logits/rejected": 4.802083492279053, "logps/chosen": -2130.666748046875, "logps/rejected": -1878.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.47916603088379, "rewards/margins": 37.79166793823242, "rewards/rejected": -17.29166603088379, "step": 444 }, { "epoch": 1.7129094412331407, "grad_norm": 0.0012897133128717542, "learning_rate": 2.3035722272113555e-06, "logits/chosen": 4.291666507720947, "logits/rejected": 4.677083492279053, "logps/chosen": -2090.666748046875, "logps/rejected": -1806.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.75, "rewards/margins": 39.54166793823242, "rewards/rejected": -16.79166603088379, "step": 445 }, { "epoch": 1.7167630057803467, "grad_norm": 8.148100459948182e-05, "learning_rate": 2.2923729596241376e-06, "logits/chosen": 4.614583492279053, "logits/rejected": 5.177083492279053, "logps/chosen": -2110.666748046875, "logps/rejected": -1842.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.95833396911621, "rewards/margins": 37.27083206176758, "rewards/rejected": -17.30208396911621, "step": 446 }, { "epoch": 1.7206165703275529, "grad_norm": 0.0006660564104095101, "learning_rate": 2.28117788603837e-06, "logits/chosen": 4.247395992279053, "logits/rejected": 4.505208492279053, "logps/chosen": -2168.0, "logps/rejected": -1926.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.04166603088379, "rewards/margins": 39.20833206176758, "rewards/rejected": -18.16666603088379, "step": 447 }, { "epoch": 1.724470134874759, "grad_norm": 0.0006667288835160434, "learning_rate": 2.2699872325910458e-06, "logits/chosen": 4.247395992279053, "logits/rejected": 4.580729007720947, "logps/chosen": -2045.3333740234375, "logps/rejected": -1834.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.20833396911621, "rewards/margins": 33.83333206176758, "rewards/rejected": -16.625, "step": 448 }, { "epoch": 1.7283236994219653, "grad_norm": 5.2325303840916604e-05, "learning_rate": 2.258801225329873e-06, "logits/chosen": 4.28125, "logits/rejected": 4.484375, "logps/chosen": -2265.333251953125, "logps/rejected": -2017.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.52083396911621, "rewards/margins": 38.70833206176758, "rewards/rejected": -18.14583396911621, "step": 449 }, { "epoch": 1.7321772639691715, "grad_norm": 6.071604730095714e-05, "learning_rate": 2.24762009020871e-06, "logits/chosen": 4.442708492279053, "logits/rejected": 4.78125, "logps/chosen": -2405.333251953125, "logps/rejected": -2150.666748046875, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 22.29166603088379, "rewards/margins": 41.625, "rewards/rejected": -19.20833396911621, "step": 450 }, { "epoch": 1.7321772639691715, "eval_logits/chosen": 4.3176445960998535, "eval_logits/rejected": 4.66501522064209, "eval_logps/chosen": -2101.56103515625, "eval_logps/rejected": -1855.0244140625, "eval_loss": 0.0007180569809861481, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.27134132385254, "eval_rewards/margins": 37.227134704589844, "eval_rewards/rejected": -16.964176177978516, "eval_runtime": 348.154, "eval_samples_per_second": 5.65, "eval_steps_per_second": 0.236, "step": 450 }, { "epoch": 1.7360308285163777, "grad_norm": 0.020920194685459137, "learning_rate": 2.2364440530829977e-06, "logits/chosen": 4.296875, "logits/rejected": 4.598958492279053, "logps/chosen": -2208.0, "logps/rejected": -1950.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.625, "rewards/margins": 37.91666793823242, "rewards/rejected": -18.27083396911621, "step": 451 }, { "epoch": 1.739884393063584, "grad_norm": 4.300917498767376e-05, "learning_rate": 2.2252733397052016e-06, "logits/chosen": 4.346354007720947, "logits/rejected": 4.609375, "logps/chosen": -2178.666748046875, "logps/rejected": -1925.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.1875, "rewards/margins": 39.20833206176758, "rewards/rejected": -17.97916603088379, "step": 452 }, { "epoch": 1.74373795761079, "grad_norm": 0.0001491853763582185, "learning_rate": 2.214108175720246e-06, "logits/chosen": 4.364583492279053, "logits/rejected": 4.692708492279053, "logps/chosen": -2261.333251953125, "logps/rejected": -2013.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.02083396911621, "rewards/margins": 39.375, "rewards/rejected": -18.33333396911621, "step": 453 }, { "epoch": 1.7475915221579963, "grad_norm": 0.0009159508626908064, "learning_rate": 2.20294878666096e-06, "logits/chosen": 4.356770992279053, "logits/rejected": 4.671875, "logps/chosen": -2138.666748046875, "logps/rejected": -1889.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.33333396911621, "rewards/margins": 36.125, "rewards/rejected": -16.85416603088379, "step": 454 }, { "epoch": 1.7514450867052023, "grad_norm": 6.22665902483277e-05, "learning_rate": 2.191795397943523e-06, "logits/chosen": 4.111979007720947, "logits/rejected": 4.489583492279053, "logps/chosen": -2197.333251953125, "logps/rejected": -1920.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.75, "rewards/margins": 39.79166793823242, "rewards/rejected": -16.98958396911621, "step": 455 }, { "epoch": 1.7552986512524085, "grad_norm": 0.00011398660717532039, "learning_rate": 2.1806482348629065e-06, "logits/chosen": 4.328125, "logits/rejected": 4.5703125, "logps/chosen": -2249.333251953125, "logps/rejected": -1993.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.27083396911621, "rewards/margins": 39.16666793823242, "rewards/rejected": -17.88541603088379, "step": 456 }, { "epoch": 1.7591522157996147, "grad_norm": 1.045942735800054e-05, "learning_rate": 2.1695075225883293e-06, "logits/chosen": 4.34375, "logits/rejected": 4.744791507720947, "logps/chosen": -2070.666748046875, "logps/rejected": -1841.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.875, "rewards/margins": 34.14583206176758, "rewards/rejected": -16.34375, "step": 457 }, { "epoch": 1.7630057803468207, "grad_norm": 0.00021350568567868322, "learning_rate": 2.158373486158703e-06, "logits/chosen": 4.395833492279053, "logits/rejected": 4.869791507720947, "logps/chosen": -2132.0, "logps/rejected": -1873.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.72916603088379, "rewards/margins": 38.41666793823242, "rewards/rejected": -17.67708396911621, "step": 458 }, { "epoch": 1.7668593448940269, "grad_norm": 0.00016899702313821763, "learning_rate": 2.1472463504780893e-06, "logits/chosen": 4.552083492279053, "logits/rejected": 4.963541507720947, "logps/chosen": -2029.3333740234375, "logps/rejected": -1800.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.90625, "rewards/margins": 35.0, "rewards/rejected": -17.07291603088379, "step": 459 }, { "epoch": 1.770712909441233, "grad_norm": 0.000739628856536001, "learning_rate": 2.1361263403111605e-06, "logits/chosen": 4.283854007720947, "logits/rejected": 4.661458492279053, "logps/chosen": -2182.666748046875, "logps/rejected": -1934.6666259765625, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 23.45833396911621, "rewards/margins": 37.375, "rewards/rejected": -13.889323234558105, "step": 460 }, { "epoch": 1.7745664739884393, "grad_norm": 0.05123860388994217, "learning_rate": 2.125013680278651e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.697916507720947, "logps/chosen": -1938.6666259765625, "logps/rejected": -1702.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.51041603088379, "rewards/margins": 34.20833206176758, "rewards/rejected": -15.708333015441895, "step": 461 }, { "epoch": 1.7784200385356455, "grad_norm": 0.05256764963269234, "learning_rate": 2.1139085948528286e-06, "logits/chosen": 4.2109375, "logits/rejected": 4.5625, "logps/chosen": -2104.0, "logps/rejected": -1845.3333740234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 21.29166603088379, "rewards/margins": 37.91666793823242, "rewards/rejected": -16.6875, "step": 462 }, { "epoch": 1.7822736030828517, "grad_norm": 0.0008022841066122055, "learning_rate": 2.1028113083529543e-06, "logits/chosen": 4.125, "logits/rejected": 4.322916507720947, "logps/chosen": -2177.333251953125, "logps/rejected": -1922.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.95833396911621, "rewards/margins": 39.875, "rewards/rejected": -17.88541603088379, "step": 463 }, { "epoch": 1.7861271676300579, "grad_norm": 0.0001675303210504353, "learning_rate": 2.091722044940753e-06, "logits/chosen": 4.346354007720947, "logits/rejected": 4.752604007720947, "logps/chosen": -2062.666748046875, "logps/rejected": -1834.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.38541603088379, "rewards/margins": 33.875, "rewards/rejected": -16.47916603088379, "step": 464 }, { "epoch": 1.789980732177264, "grad_norm": 0.0015041643055155873, "learning_rate": 2.080641028615888e-06, "logits/chosen": 4.395833492279053, "logits/rejected": 4.984375, "logps/chosen": -2048.0, "logps/rejected": -1765.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.91666603088379, "rewards/margins": 37.25, "rewards/rejected": -17.32291603088379, "step": 465 }, { "epoch": 1.7938342967244703, "grad_norm": 0.00018625195662025362, "learning_rate": 2.06956848321143e-06, "logits/chosen": 4.28125, "logits/rejected": 4.6328125, "logps/chosen": -2089.333251953125, "logps/rejected": -1832.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.9375, "rewards/margins": 37.83333206176758, "rewards/rejected": -16.88541603088379, "step": 466 }, { "epoch": 1.7976878612716765, "grad_norm": 0.00014119746629148722, "learning_rate": 2.0585046323893448e-06, "logits/chosen": 4.236979007720947, "logits/rejected": 4.661458492279053, "logps/chosen": -1962.6666259765625, "logps/rejected": -1682.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.95833396911621, "rewards/margins": 36.79166793823242, "rewards/rejected": -15.895833015441895, "step": 467 }, { "epoch": 1.8015414258188824, "grad_norm": 0.00040941318729892373, "learning_rate": 2.0474496996359676e-06, "logits/chosen": 4.348958492279053, "logits/rejected": 4.854166507720947, "logps/chosen": -1958.6666259765625, "logps/rejected": -1697.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.52083396911621, "rewards/margins": 37.95833206176758, "rewards/rejected": -16.41666603088379, "step": 468 }, { "epoch": 1.8053949903660886, "grad_norm": 0.0007083431119099259, "learning_rate": 2.036403908257489e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.8125, "logps/chosen": -2309.333251953125, "logps/rejected": -2044.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.20833396911621, "rewards/margins": 36.33333206176758, "rewards/rejected": -17.14583396911621, "step": 469 }, { "epoch": 1.8092485549132948, "grad_norm": 0.000241468966123648, "learning_rate": 2.0253674813754522e-06, "logits/chosen": 4.328125, "logits/rejected": 4.729166507720947, "logps/chosen": -2018.6666259765625, "logps/rejected": -1785.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.125, "rewards/margins": 35.70833206176758, "rewards/rejected": -16.61458396911621, "step": 470 }, { "epoch": 1.8131021194605008, "grad_norm": 0.0012441333383321762, "learning_rate": 2.0143406419222354e-06, "logits/chosen": 4.2578125, "logits/rejected": 4.807291507720947, "logps/chosen": -1897.3333740234375, "logps/rejected": -1657.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.60416603088379, "rewards/margins": 35.70833206176758, "rewards/rejected": -16.14583396911621, "step": 471 }, { "epoch": 1.816955684007707, "grad_norm": 0.00011901009565917775, "learning_rate": 2.003323612636558e-06, "logits/chosen": 4.104166507720947, "logits/rejected": 4.341145992279053, "logps/chosen": -2116.0, "logps/rejected": -1862.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.52083396911621, "rewards/margins": 38.375, "rewards/rejected": -16.83333396911621, "step": 472 }, { "epoch": 1.8208092485549132, "grad_norm": 0.00032537244260311127, "learning_rate": 1.992316616058974e-06, "logits/chosen": 4.080729007720947, "logits/rejected": 4.3671875, "logps/chosen": -2000.0, "logps/rejected": -1746.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.35416603088379, "rewards/margins": 38.16666793823242, "rewards/rejected": -16.70833396911621, "step": 473 }, { "epoch": 1.8246628131021194, "grad_norm": 0.0001536967174615711, "learning_rate": 1.98131987452738e-06, "logits/chosen": 4.328125, "logits/rejected": 4.916666507720947, "logps/chosen": -1988.0, "logps/rejected": -1725.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.6875, "rewards/margins": 34.75, "rewards/rejected": -16.03125, "step": 474 }, { "epoch": 1.8285163776493256, "grad_norm": 0.003546386957168579, "learning_rate": 1.970333610172525e-06, "logits/chosen": 4.3828125, "logits/rejected": 4.760416507720947, "logps/chosen": -2078.666748046875, "logps/rejected": -1813.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.58333396911621, "rewards/margins": 36.16666793823242, "rewards/rejected": -16.625, "step": 475 }, { "epoch": 1.8323699421965318, "grad_norm": 3.9332619053311646e-05, "learning_rate": 1.9593580449135217e-06, "logits/chosen": 4.557291507720947, "logits/rejected": 4.989583492279053, "logps/chosen": -2212.0, "logps/rejected": -1982.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.63541603088379, "rewards/margins": 38.41666793823242, "rewards/rejected": -18.85416603088379, "step": 476 }, { "epoch": 1.836223506743738, "grad_norm": 0.00030418651294894516, "learning_rate": 1.948393400453367e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.828125, "logps/chosen": -1969.3333740234375, "logps/rejected": -1733.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.125, "rewards/margins": 36.0, "rewards/rejected": -16.85416603088379, "step": 477 }, { "epoch": 1.8400770712909442, "grad_norm": 0.00010887546523008496, "learning_rate": 1.937439898274455e-06, "logits/chosen": 4.255208492279053, "logits/rejected": 4.609375, "logps/chosen": -2006.6666259765625, "logps/rejected": -1785.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.71875, "rewards/margins": 34.6875, "rewards/rejected": -16.96875, "step": 478 }, { "epoch": 1.8439306358381504, "grad_norm": 0.0009900054428726435, "learning_rate": 1.9264977596341157e-06, "logits/chosen": 4.286458492279053, "logits/rejected": 4.567708492279053, "logps/chosen": -2120.0, "logps/rejected": -1876.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.77083396911621, "rewards/margins": 37.04166793823242, "rewards/rejected": -17.27083396911621, "step": 479 }, { "epoch": 1.8477842003853564, "grad_norm": 0.0004568180884234607, "learning_rate": 1.9155672055601364e-06, "logits/chosen": 4.489583492279053, "logits/rejected": 4.75, "logps/chosen": -2248.0, "logps/rejected": -1993.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.58333396911621, "rewards/margins": 40.625, "rewards/rejected": -19.04166603088379, "step": 480 }, { "epoch": 1.8516377649325626, "grad_norm": 0.0003510061651468277, "learning_rate": 1.904648456846297e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.666666507720947, "logps/chosen": -1884.0, "logps/rejected": -1641.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.92708396911621, "rewards/margins": 34.3125, "rewards/rejected": -15.375, "step": 481 }, { "epoch": 1.8554913294797688, "grad_norm": 0.00016259752737823874, "learning_rate": 1.893741734047919e-06, "logits/chosen": 4.244791507720947, "logits/rejected": 4.494791507720947, "logps/chosen": -2166.666748046875, "logps/rejected": -1948.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.1875, "rewards/margins": 36.58333206176758, "rewards/rejected": -17.40625, "step": 482 }, { "epoch": 1.859344894026975, "grad_norm": 0.0003130500263068825, "learning_rate": 1.882847257477398e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.609375, "logps/chosen": -2156.0, "logps/rejected": -1930.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.10416603088379, "rewards/margins": 37.625, "rewards/rejected": -17.48958396911621, "step": 483 }, { "epoch": 1.863198458574181, "grad_norm": 0.00012112512195017189, "learning_rate": 1.8719652471997637e-06, "logits/chosen": 4.484375, "logits/rejected": 4.890625, "logps/chosen": -2121.333251953125, "logps/rejected": -1870.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.15625, "rewards/margins": 35.02083206176758, "rewards/rejected": -16.85416603088379, "step": 484 }, { "epoch": 1.8670520231213872, "grad_norm": 0.00019226575386710465, "learning_rate": 1.86109592302823e-06, "logits/chosen": 4.3359375, "logits/rejected": 4.682291507720947, "logps/chosen": -1966.6666259765625, "logps/rejected": -1761.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.16666603088379, "rewards/margins": 34.58333206176758, "rewards/rejected": -16.47916603088379, "step": 485 }, { "epoch": 1.8709055876685934, "grad_norm": 0.04141245037317276, "learning_rate": 1.8502395045197522e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.807291507720947, "logps/chosen": -1902.6666259765625, "logps/rejected": -1672.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 18.01041603088379, "rewards/margins": 34.33333206176758, "rewards/rejected": -16.34375, "step": 486 }, { "epoch": 1.8747591522157996, "grad_norm": 0.00012722906831186265, "learning_rate": 1.8393962109705995e-06, "logits/chosen": 4.192708492279053, "logits/rejected": 4.515625, "logps/chosen": -2166.666748046875, "logps/rejected": -1918.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.91666603088379, "rewards/margins": 36.75, "rewards/rejected": -14.8125, "step": 487 }, { "epoch": 1.8786127167630058, "grad_norm": 0.0003962251066695899, "learning_rate": 1.8285662614119185e-06, "logits/chosen": 4.364583492279053, "logits/rejected": 4.739583492279053, "logps/chosen": -2000.0, "logps/rejected": -1781.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.07291603088379, "rewards/margins": 34.60416793823242, "rewards/rejected": -16.57291603088379, "step": 488 }, { "epoch": 1.882466281310212, "grad_norm": 3.733192352228798e-05, "learning_rate": 1.8177498746053129e-06, "logits/chosen": 4.104166507720947, "logits/rejected": 4.34375, "logps/chosen": -2130.666748046875, "logps/rejected": -1888.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.77083396911621, "rewards/margins": 38.79166793823242, "rewards/rejected": -17.0, "step": 489 }, { "epoch": 1.8863198458574182, "grad_norm": 0.017817115411162376, "learning_rate": 1.8069472690384221e-06, "logits/chosen": 4.234375, "logits/rejected": 4.5625, "logps/chosen": -2057.333251953125, "logps/rejected": -1810.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.8125, "rewards/margins": 37.75, "rewards/rejected": -16.83333396911621, "step": 490 }, { "epoch": 1.8901734104046244, "grad_norm": 0.000405539118219167, "learning_rate": 1.796158662920507e-06, "logits/chosen": 4.356770992279053, "logits/rejected": 4.7890625, "logps/chosen": -1990.6666259765625, "logps/rejected": -1732.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.58333396911621, "rewards/margins": 37.04166793823242, "rewards/rejected": -17.4375, "step": 491 }, { "epoch": 1.8940269749518306, "grad_norm": 0.00019496023014653474, "learning_rate": 1.7853842741780474e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.640625, "logps/chosen": -2102.666748046875, "logps/rejected": -1845.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.04166603088379, "rewards/margins": 38.29166793823242, "rewards/rejected": -17.25, "step": 492 }, { "epoch": 1.8978805394990366, "grad_norm": 0.0009353151544928551, "learning_rate": 1.7746243204503342e-06, "logits/chosen": 4.2421875, "logits/rejected": 4.515625, "logps/chosen": -2038.6666259765625, "logps/rejected": -1804.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.79166603088379, "rewards/margins": 38.08333206176758, "rewards/rejected": -17.23958396911621, "step": 493 }, { "epoch": 1.9017341040462428, "grad_norm": 0.00022226295550353825, "learning_rate": 1.7638790190850777e-06, "logits/chosen": 4.421875, "logits/rejected": 4.765625, "logps/chosen": -1988.0, "logps/rejected": -1768.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.39583396911621, "rewards/margins": 35.14583206176758, "rewards/rejected": -16.79166603088379, "step": 494 }, { "epoch": 1.905587668593449, "grad_norm": 0.000608772155828774, "learning_rate": 1.7531485871340122e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.666666507720947, "logps/chosen": -1941.3333740234375, "logps/rejected": -1686.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 36.29166793823242, "rewards/rejected": -16.41666603088379, "step": 495 }, { "epoch": 1.909441233140655, "grad_norm": 0.00011534040095284581, "learning_rate": 1.742433241348516e-06, "logits/chosen": 4.260416507720947, "logits/rejected": 4.6875, "logps/chosen": -1930.6666259765625, "logps/rejected": -1682.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.04166603088379, "rewards/margins": 37.22916793823242, "rewards/rejected": -16.21875, "step": 496 }, { "epoch": 1.9132947976878611, "grad_norm": 0.000622666790150106, "learning_rate": 1.7317331981752327e-06, "logits/chosen": 4.1875, "logits/rejected": 4.744791507720947, "logps/chosen": -1906.6666259765625, "logps/rejected": -1650.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.07291603088379, "rewards/margins": 34.8125, "rewards/rejected": -15.677083015441895, "step": 497 }, { "epoch": 1.9171483622350673, "grad_norm": 0.002169637242332101, "learning_rate": 1.7210486737516947e-06, "logits/chosen": 4.390625, "logits/rejected": 4.723958492279053, "logps/chosen": -2184.0, "logps/rejected": -1930.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.20833396911621, "rewards/margins": 39.5, "rewards/rejected": -19.27083396911621, "step": 498 }, { "epoch": 1.9210019267822736, "grad_norm": 0.000621818529907614, "learning_rate": 1.7103798839019647e-06, "logits/chosen": 4.411458492279053, "logits/rejected": 4.71875, "logps/chosen": -2124.0, "logps/rejected": -1881.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.6875, "rewards/margins": 37.20833206176758, "rewards/rejected": -18.52083396911621, "step": 499 }, { "epoch": 1.9248554913294798, "grad_norm": 0.00032108768937177956, "learning_rate": 1.6997270441322676e-06, "logits/chosen": 4.359375, "logits/rejected": 4.619791507720947, "logps/chosen": -2078.666748046875, "logps/rejected": -1874.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.375, "rewards/margins": 36.39583206176758, "rewards/rejected": -18.03125, "step": 500 }, { "epoch": 1.9248554913294798, "eval_logits/chosen": 4.330411434173584, "eval_logits/rejected": 4.682164669036865, "eval_logps/chosen": -2102.146240234375, "eval_logps/rejected": -1858.43896484375, "eval_loss": 0.0007186427828855813, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.154726028442383, "eval_rewards/margins": 37.41006088256836, "eval_rewards/rejected": -17.246952056884766, "eval_runtime": 347.9779, "eval_samples_per_second": 5.653, "eval_steps_per_second": 0.236, "step": 500 }, { "epoch": 1.928709055876686, "grad_norm": 6.621197098866105e-05, "learning_rate": 1.6890903696266447e-06, "logits/chosen": 4.151041507720947, "logits/rejected": 4.369791507720947, "logps/chosen": -2242.666748046875, "logps/rejected": -1996.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.41666603088379, "rewards/margins": 41.20833206176758, "rewards/rejected": -18.8125, "step": 501 }, { "epoch": 1.9325626204238922, "grad_norm": 0.00015971431275829673, "learning_rate": 1.6784700752426037e-06, "logits/chosen": 4.28125, "logits/rejected": 4.520833492279053, "logps/chosen": -2173.333251953125, "logps/rejected": -1926.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.25, "rewards/margins": 41.5, "rewards/rejected": -19.20833396911621, "step": 502 }, { "epoch": 1.9364161849710984, "grad_norm": 0.0013756364351138473, "learning_rate": 1.6678663755067765e-06, "logits/chosen": 4.265625, "logits/rejected": 4.78125, "logps/chosen": -1968.0, "logps/rejected": -1690.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.54166603088379, "rewards/margins": 37.58333206176758, "rewards/rejected": -17.0625, "step": 503 }, { "epoch": 1.9402697495183046, "grad_norm": 0.00046545075019821525, "learning_rate": 1.6572794846105919e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.723958492279053, "logps/chosen": -1990.6666259765625, "logps/rejected": -1745.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.82291603088379, "rewards/margins": 37.83333206176758, "rewards/rejected": -16.98958396911621, "step": 504 }, { "epoch": 1.9441233140655108, "grad_norm": 0.00043211979209445417, "learning_rate": 1.6467096164059433e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.869791507720947, "logps/chosen": -1884.0, "logps/rejected": -1613.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.54166603088379, "rewards/margins": 35.14583206176758, "rewards/rejected": -15.604166984558105, "step": 505 }, { "epoch": 1.9479768786127167, "grad_norm": 0.000806916446890682, "learning_rate": 1.63615698440087e-06, "logits/chosen": 4.28125, "logits/rejected": 4.588541507720947, "logps/chosen": -2116.0, "logps/rejected": -1869.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.5625, "rewards/margins": 37.04166793823242, "rewards/rejected": -16.53125, "step": 506 }, { "epoch": 1.951830443159923, "grad_norm": 0.0022516001481562853, "learning_rate": 1.6256218017552484e-06, "logits/chosen": 4.203125, "logits/rejected": 4.463541507720947, "logps/chosen": -2194.666748046875, "logps/rejected": -1917.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 22.10416603088379, "rewards/margins": 39.875, "rewards/rejected": -17.8125, "step": 507 }, { "epoch": 1.9556840077071291, "grad_norm": 0.0008361614309251308, "learning_rate": 1.6151042812764798e-06, "logits/chosen": 4.205729007720947, "logits/rejected": 4.5, "logps/chosen": -2040.0, "logps/rejected": -1801.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.16666603088379, "rewards/margins": 37.33333206176758, "rewards/rejected": -17.14583396911621, "step": 508 }, { "epoch": 1.9595375722543351, "grad_norm": 0.0008503508288413286, "learning_rate": 1.6046046354151987e-06, "logits/chosen": 4.416666507720947, "logits/rejected": 4.755208492279053, "logps/chosen": -2209.333251953125, "logps/rejected": -1969.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.08333396911621, "rewards/margins": 37.41666793823242, "rewards/rejected": -17.32291603088379, "step": 509 }, { "epoch": 1.9633911368015413, "grad_norm": 0.0009199672495014966, "learning_rate": 1.594123076260975e-06, "logits/chosen": 4.302083492279053, "logits/rejected": 4.640625, "logps/chosen": -2100.0, "logps/rejected": -1852.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.625, "rewards/margins": 39.08333206176758, "rewards/rejected": -18.4375, "step": 510 }, { "epoch": 1.9672447013487475, "grad_norm": 0.0004664993903134018, "learning_rate": 1.5836598155380362e-06, "logits/chosen": 4.5625, "logits/rejected": 5.03125, "logps/chosen": -2198.666748046875, "logps/rejected": -1936.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.01041603088379, "rewards/margins": 39.16666793823242, "rewards/rejected": -19.125, "step": 511 }, { "epoch": 1.9710982658959537, "grad_norm": 0.0003700663219206035, "learning_rate": 1.5732150646009824e-06, "logits/chosen": 4.458333492279053, "logits/rejected": 4.96875, "logps/chosen": -2000.0, "logps/rejected": -1746.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.6875, "rewards/margins": 35.04166793823242, "rewards/rejected": -16.30208396911621, "step": 512 }, { "epoch": 1.97495183044316, "grad_norm": 0.00027191892149858177, "learning_rate": 1.5627890344305256e-06, "logits/chosen": 4.3125, "logits/rejected": 4.661458492279053, "logps/chosen": -2230.666748046875, "logps/rejected": -1989.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 36.75, "rewards/rejected": -16.88541603088379, "step": 513 }, { "epoch": 1.9788053949903661, "grad_norm": 0.00035341514740139246, "learning_rate": 1.5523819356292235e-06, "logits/chosen": 4.5, "logits/rejected": 4.760416507720947, "logps/chosen": -2093.333251953125, "logps/rejected": -1849.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.19791603088379, "rewards/margins": 36.33333206176758, "rewards/rejected": -18.125, "step": 514 }, { "epoch": 1.9826589595375723, "grad_norm": 5.553813753067516e-05, "learning_rate": 1.5419939784172245e-06, "logits/chosen": 4.25, "logits/rejected": 4.619791507720947, "logps/chosen": -2129.333251953125, "logps/rejected": -1861.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.54166603088379, "rewards/margins": 38.625, "rewards/rejected": -17.05208396911621, "step": 515 }, { "epoch": 1.9865125240847785, "grad_norm": 0.0004361188330221921, "learning_rate": 1.531625372628025e-06, "logits/chosen": 4.372395992279053, "logits/rejected": 4.817708492279053, "logps/chosen": -1990.6666259765625, "logps/rejected": -1768.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.5, "rewards/margins": 33.6875, "rewards/rejected": -16.21875, "step": 516 }, { "epoch": 1.9903660886319847, "grad_norm": 0.0009759567328728735, "learning_rate": 1.5212763277042263e-06, "logits/chosen": 4.5, "logits/rejected": 4.989583492279053, "logps/chosen": -2289.333251953125, "logps/rejected": -2029.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.54166603088379, "rewards/margins": 36.79166793823242, "rewards/rejected": -17.27083396911621, "step": 517 }, { "epoch": 1.9942196531791907, "grad_norm": 5.371678344090469e-05, "learning_rate": 1.5109470526933083e-06, "logits/chosen": 4.364583492279053, "logits/rejected": 4.596354007720947, "logps/chosen": -2101.333251953125, "logps/rejected": -1866.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.8125, "rewards/margins": 37.4375, "rewards/rejected": -17.65625, "step": 518 }, { "epoch": 1.998073217726397, "grad_norm": 0.02715446799993515, "learning_rate": 1.500637756243405e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.416666507720947, "logps/chosen": -2034.6666259765625, "logps/rejected": -1796.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.20833396911621, "rewards/margins": 37.47916793823242, "rewards/rejected": -17.3125, "step": 519 }, { "epoch": 2.0, "grad_norm": 0.0012922444147989154, "learning_rate": 1.490348646599087e-06, "logits/chosen": 4.541666507720947, "logits/rejected": 5.010416507720947, "logps/chosen": -2064.0, "logps/rejected": -1818.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.02083396911621, "rewards/margins": 34.16666793823242, "rewards/rejected": -17.20833396911621, "step": 520 }, { "epoch": 2.003853564547206, "grad_norm": 0.0007311231456696987, "learning_rate": 1.480079931597161e-06, "logits/chosen": 4.322916507720947, "logits/rejected": 4.677083492279053, "logps/chosen": -2061.333251953125, "logps/rejected": -1821.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.85416603088379, "rewards/margins": 36.79166793823242, "rewards/rejected": -16.90625, "step": 521 }, { "epoch": 2.0077071290944124, "grad_norm": 0.00010546141129452735, "learning_rate": 1.469831818662467e-06, "logits/chosen": 4.458333492279053, "logits/rejected": 4.838541507720947, "logps/chosen": -2000.0, "logps/rejected": -1769.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.77083396911621, "rewards/margins": 35.79166793823242, "rewards/rejected": -16.97916603088379, "step": 522 }, { "epoch": 2.0115606936416186, "grad_norm": 0.0004044270608574152, "learning_rate": 1.4596045148036878e-06, "logits/chosen": 4.247395992279053, "logits/rejected": 4.645833492279053, "logps/chosen": -1902.6666259765625, "logps/rejected": -1670.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.07291603088379, "rewards/margins": 35.41666793823242, "rewards/rejected": -16.30208396911621, "step": 523 }, { "epoch": 2.015414258188825, "grad_norm": 0.0015955432318150997, "learning_rate": 1.4493982266091727e-06, "logits/chosen": 4.328125, "logits/rejected": 4.666666507720947, "logps/chosen": -2141.333251953125, "logps/rejected": -1929.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.26041603088379, "rewards/margins": 36.02083206176758, "rewards/rejected": -16.83333396911621, "step": 524 }, { "epoch": 2.019267822736031, "grad_norm": 0.0012335004284977913, "learning_rate": 1.439213160242759e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.65625, "logps/chosen": -2014.6666259765625, "logps/rejected": -1754.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.14583396911621, "rewards/margins": 37.0, "rewards/rejected": -16.86458396911621, "step": 525 }, { "epoch": 2.023121387283237, "grad_norm": 0.00014680824824608862, "learning_rate": 1.4290495214396103e-06, "logits/chosen": 4.40625, "logits/rejected": 4.880208492279053, "logps/chosen": -1929.3333740234375, "logps/rejected": -1676.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.94791603088379, "rewards/margins": 35.6875, "rewards/rejected": -16.78125, "step": 526 }, { "epoch": 2.026974951830443, "grad_norm": 5.2134852012386546e-05, "learning_rate": 1.4189075155020599e-06, "logits/chosen": 4.338541507720947, "logits/rejected": 4.697916507720947, "logps/chosen": -2112.0, "logps/rejected": -1868.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.54166603088379, "rewards/margins": 38.5, "rewards/rejected": -18.91666603088379, "step": 527 }, { "epoch": 2.030828516377649, "grad_norm": 7.683436706429347e-05, "learning_rate": 1.4087873472954638e-06, "logits/chosen": 4.143229007720947, "logits/rejected": 4.484375, "logps/chosen": -2146.666748046875, "logps/rejected": -1873.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.33333396911621, "rewards/margins": 37.60416793823242, "rewards/rejected": -17.25, "step": 528 }, { "epoch": 2.0346820809248554, "grad_norm": 0.00013955016038380563, "learning_rate": 1.3986892212440637e-06, "logits/chosen": 4.244791507720947, "logits/rejected": 4.604166507720947, "logps/chosen": -2049.333251953125, "logps/rejected": -1802.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.47916603088379, "rewards/margins": 37.375, "rewards/rejected": -16.9375, "step": 529 }, { "epoch": 2.0385356454720616, "grad_norm": 5.735335350036621, "learning_rate": 1.3886133413268551e-06, "logits/chosen": 4.270833492279053, "logits/rejected": 4.697916507720947, "logps/chosen": -2129.333251953125, "logps/rejected": -1886.6666259765625, "loss": 0.0061, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 21.39583396911621, "rewards/margins": 38.25, "rewards/rejected": -16.91666603088379, "step": 530 }, { "epoch": 2.0423892100192678, "grad_norm": 0.00012583394709508866, "learning_rate": 1.37855991107347e-06, "logits/chosen": 4.369791507720947, "logits/rejected": 4.848958492279053, "logps/chosen": -2046.6666259765625, "logps/rejected": -1792.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.91666603088379, "rewards/margins": 36.64583206176758, "rewards/rejected": -16.6875, "step": 531 }, { "epoch": 2.046242774566474, "grad_norm": 0.027468297630548477, "learning_rate": 1.3685291335600626e-06, "logits/chosen": 4.380208492279053, "logits/rejected": 4.59375, "logps/chosen": -2269.333251953125, "logps/rejected": -2025.3333740234375, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.02083396911621, "rewards/margins": 38.16666793823242, "rewards/rejected": -18.14583396911621, "step": 532 }, { "epoch": 2.05009633911368, "grad_norm": 2.948888868559152e-05, "learning_rate": 1.3585212114052104e-06, "logits/chosen": 4.401041507720947, "logits/rejected": 4.635416507720947, "logps/chosen": -2265.333251953125, "logps/rejected": -2029.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.625, "rewards/margins": 38.375, "rewards/rejected": -18.79166603088379, "step": 533 }, { "epoch": 2.0539499036608864, "grad_norm": 0.0002872212207876146, "learning_rate": 1.3485363467658186e-06, "logits/chosen": 4.049479007720947, "logits/rejected": 4.354166507720947, "logps/chosen": -2084.0, "logps/rejected": -1830.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.16666603088379, "rewards/margins": 37.66666793823242, "rewards/rejected": -17.51041603088379, "step": 534 }, { "epoch": 2.0578034682080926, "grad_norm": 0.013165987096726894, "learning_rate": 1.3385747413330391e-06, "logits/chosen": 4.315104007720947, "logits/rejected": 4.791666507720947, "logps/chosen": -1901.3333740234375, "logps/rejected": -1656.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 19.47916603088379, "rewards/margins": 35.89583206176758, "rewards/rejected": -16.42708396911621, "step": 535 }, { "epoch": 2.0616570327552988, "grad_norm": 0.00020183733431622386, "learning_rate": 1.3286365963281933e-06, "logits/chosen": 4.53125, "logits/rejected": 5.020833492279053, "logps/chosen": -2146.666748046875, "logps/rejected": -1894.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.72916603088379, "rewards/margins": 37.79166793823242, "rewards/rejected": -18.08333396911621, "step": 536 }, { "epoch": 2.065510597302505, "grad_norm": 0.0002859625965356827, "learning_rate": 1.3187221124987107e-06, "logits/chosen": 4.236979007720947, "logits/rejected": 4.744791507720947, "logps/chosen": -1914.6666259765625, "logps/rejected": -1657.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.41666603088379, "rewards/margins": 34.72916793823242, "rewards/rejected": -15.270833015441895, "step": 537 }, { "epoch": 2.069364161849711, "grad_norm": 0.05169886350631714, "learning_rate": 1.308831490114072e-06, "logits/chosen": 4.276041507720947, "logits/rejected": 4.625, "logps/chosen": -2202.666748046875, "logps/rejected": -1980.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 19.48958396911621, "rewards/margins": 38.3125, "rewards/rejected": -18.77083396911621, "step": 538 }, { "epoch": 2.0732177263969174, "grad_norm": 0.00011470437311800197, "learning_rate": 1.2989649289617638e-06, "logits/chosen": 4.182291507720947, "logits/rejected": 4.421875, "logps/chosen": -2037.3333740234375, "logps/rejected": -1825.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.20833396911621, "rewards/margins": 34.97916793823242, "rewards/rejected": -17.72916603088379, "step": 539 }, { "epoch": 2.077071290944123, "grad_norm": 0.0008594996761530638, "learning_rate": 1.289122628343244e-06, "logits/chosen": 4.184895992279053, "logits/rejected": 4.578125, "logps/chosen": -2022.6666259765625, "logps/rejected": -1772.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.32291603088379, "rewards/margins": 36.16666793823242, "rewards/rejected": -15.78125, "step": 540 }, { "epoch": 2.0809248554913293, "grad_norm": 0.00019885599613189697, "learning_rate": 1.2793047870699146e-06, "logits/chosen": 4.354166507720947, "logits/rejected": 4.78125, "logps/chosen": -2116.0, "logps/rejected": -1869.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.60416603088379, "rewards/margins": 34.625, "rewards/rejected": -17.03125, "step": 541 }, { "epoch": 2.0847784200385355, "grad_norm": 0.001316198380663991, "learning_rate": 1.269511603459106e-06, "logits/chosen": 4.484375, "logits/rejected": 4.901041507720947, "logps/chosen": -2249.333251953125, "logps/rejected": -2010.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.75, "rewards/margins": 36.5, "rewards/rejected": -17.79166603088379, "step": 542 }, { "epoch": 2.0886319845857417, "grad_norm": 0.0008642252651043236, "learning_rate": 1.2597432753300753e-06, "logits/chosen": 4.208333492279053, "logits/rejected": 4.606770992279053, "logps/chosen": -2070.666748046875, "logps/rejected": -1826.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.92708396911621, "rewards/margins": 37.10416793823242, "rewards/rejected": -17.1875, "step": 543 }, { "epoch": 2.092485549132948, "grad_norm": 0.0006710060406476259, "learning_rate": 1.2500000000000007e-06, "logits/chosen": 4.296875, "logits/rejected": 4.708333492279053, "logps/chosen": -1966.6666259765625, "logps/rejected": -1730.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.88541603088379, "rewards/margins": 35.3125, "rewards/rejected": -16.35416603088379, "step": 544 }, { "epoch": 2.096339113680154, "grad_norm": 0.00020137008687015623, "learning_rate": 1.2402819742800044e-06, "logits/chosen": 4.453125, "logits/rejected": 4.885416507720947, "logps/chosen": -2113.333251953125, "logps/rejected": -1849.3333740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.89583396911621, "rewards/margins": 37.5, "rewards/rejected": -17.58333396911621, "step": 545 }, { "epoch": 2.1001926782273603, "grad_norm": 0.0010984725086018443, "learning_rate": 1.2305893944711773e-06, "logits/chosen": 4.432291507720947, "logits/rejected": 4.932291507720947, "logps/chosen": -1989.3333740234375, "logps/rejected": -1750.6666259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 19.27083396911621, "rewards/margins": 37.3125, "rewards/rejected": -18.08333396911621, "step": 546 }, { "epoch": 2.1040462427745665, "grad_norm": 6.509053491754457e-05, "learning_rate": 1.2209224563606045e-06, "logits/chosen": 4.278645992279053, "logits/rejected": 4.5546875, "logps/chosen": -2109.333251953125, "logps/rejected": -1880.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 20.44791603088379, "rewards/margins": 37.875, "rewards/rejected": -17.40625, "step": 547 }, { "epoch": 2.1078998073217727, "grad_norm": 0.00013614218914881349, "learning_rate": 1.211281355217424e-06, "logits/chosen": 4.510416507720947, "logits/rejected": 4.942708492279053, "logps/chosen": -2044.0, "logps/rejected": -1812.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 18.51041603088379, "rewards/margins": 35.25, "rewards/rejected": -16.66666603088379, "step": 548 }, { "epoch": 2.111753371868979, "grad_norm": 0.04498204216361046, "learning_rate": 1.201666285788869e-06, "logits/chosen": 4.065104007720947, "logits/rejected": 4.510416507720947, "logps/chosen": -1888.0, "logps/rejected": -1644.0, "loss": 0.0048, "rewards/accuracies": 0.9930555820465088, "rewards/chosen": 20.0, "rewards/margins": 35.91666793823242, "rewards/rejected": -15.895833015441895, "step": 549 }, { "epoch": 2.115606936416185, "grad_norm": 0.0023263785988092422, "learning_rate": 1.1920774422963422e-06, "logits/chosen": 4.333333492279053, "logits/rejected": 4.666666507720947, "logps/chosen": -2156.0, "logps/rejected": -1884.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 21.53125, "rewards/margins": 39.5, "rewards/rejected": -17.97916603088379, "step": 550 }, { "epoch": 2.115606936416185, "eval_logits/chosen": 4.345274448394775, "eval_logits/rejected": 4.693216323852539, "eval_logps/chosen": -2102.9267578125, "eval_logps/rejected": -1860.6829833984375, "eval_loss": 0.0007177366642281413, "eval_rewards/accuracies": 0.9989836812019348, "eval_rewards/chosen": 20.086891174316406, "eval_rewards/margins": 37.51829147338867, "eval_rewards/rejected": -17.438262939453125, "eval_runtime": 348.1482, "eval_samples_per_second": 5.65, "eval_steps_per_second": 0.236, "step": 550 } ], "logging_steps": 1, "max_steps": 777, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }